-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSnowflake-EDA.sql
More file actions
371 lines (282 loc) · 14.2 KB
/
Snowflake-EDA.sql
File metadata and controls
371 lines (282 loc) · 14.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
SELECT * FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA LIMIT 100;
-------=======================================================================================================================================================
---// Hypothesis for EDA:
--1. Univariate Analysis
--2. Does patients length of stay (LOS) changes wrt HOSPITAL_TYPE_CODE?
--3. Does patients length of stay (LOS) changes wrt CITY_CODE_HOSPITAL?
--4. Does patients length of stay (LOS) changes wrt HOSPITAL_REGION_CODE?
--5. DEPARTMENT of admission and its impact on LOS
--6. DEPARTMENT X HOSPITAL_REGION_CODE and its impact on LOS
--7. Does more AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL impact a patients LOS
--8. TYPE_OF_ADMISSION and its impact on LOS
--9. SEVERITY_OF_ILLNESS and its impact on LOS
--10. AGE and its impact on LOS
--11. ADMISSION_DEPOSIT and its relation to LOS
--12. Does more visitors come with patients who have more severe illness?
--13. Is there any differences in the LOS for different WARD_TYPE & WARD_FACILITY_CODE in each DEPARTMENT
--14. Does BED_GRADE affects LOS of patients?
--15. Does more visitors come when younger patients got admitted than the older patients?
--16. What type of illness & admission does majority of patients who are less than 30 years of age have and which department most of them are getting admitted to?
--17. Are patients below 40 years pay more ADMISSION_DEPOSIT when they get admitted to the hospital?
-------=======================================================================================================================================================
-- // 1. Univariate Analysis //
-- Unique number of hospital codes and their distribution
SELECT HOSPITAL_CODE, COUNT(*) AS CNT, COUNT(DISTINCT CASE_ID) AS CNT_CASES
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1
ORDER BY 2 DESC;
-- Unique number of hospital type code and their distribution
SELECT HOSPITAL_TYPE_CODE, COUNT(DISTINCT HOSPITAL_CODE) AS CNT_HOSPITAL_CODE,COUNT(*) AS CNT, COUNT(DISTINCT CASE_ID) AS CNT_CASES
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1
ORDER BY 2 DESC;
-- Unique number of city code hospital and their distribution
SELECT CITY_CODE_HOSPITAL, COUNT(DISTINCT HOSPITAL_CODE) AS CNT_HOSPITAL_CODE,
COUNT(DISTINCT HOSPITAL_TYPE_CODE) AS CNT_HOSPITAL_TYPE_CODE,
COUNT(*) AS CNT, COUNT(DISTINCT CASE_ID) AS CNT_CASES
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1
ORDER BY 2 DESC;
-- Unique number of hospital region code and their distribution
SELECT HOSPITAL_REGION_CODE, COUNT(DISTINCT HOSPITAL_CODE) AS CNT_HOSPITAL_CODE,
COUNT(DISTINCT HOSPITAL_TYPE_CODE) AS CNT_HOSPITAL_TYPE_CODE,
COUNT(*) AS CNT, COUNT(DISTINCT CASE_ID) AS CNT_CASES
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1
ORDER BY 2 DESC;
-- Unique number of department and their distribution
SELECT DEPARTMENT, COUNT(DISTINCT HOSPITAL_CODE) AS CNT_HOSPITAL_CODE,
COUNT(DISTINCT HOSPITAL_TYPE_CODE) AS CNT_HOSPITAL_TYPE_CODE,
COUNT(*) AS CNT, COUNT(DISTINCT CASE_ID) AS CNT_CASES
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1
ORDER BY 2 DESC;
-- Unique number of Bed grade and their distribution
SELECT BED_GRADE, COUNT(DISTINCT HOSPITAL_CODE) AS CNT_HOSPITAL_CODE,
COUNT(DISTINCT HOSPITAL_TYPE_CODE) AS CNT_HOSPITAL_TYPE_CODE,
COUNT(*) AS CNT, COUNT(DISTINCT CASE_ID) AS CNT_CASES
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1
ORDER BY 2 DESC;
-- Unique number of admission type and their distribution
SELECT TYPE_OF_ADMISSION, COUNT(DISTINCT HOSPITAL_CODE) AS CNT_HOSPITAL_CODE,
COUNT(DISTINCT HOSPITAL_TYPE_CODE) AS CNT_HOSPITAL_TYPE_CODE,
COUNT(*) AS CNT, COUNT(DISTINCT CASE_ID) AS CNT_CASES
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1
ORDER BY 2 DESC;
-- Unique number of severity of illness and their distribution
SELECT SEVERITY_OF_ILLNESS, COUNT(DISTINCT HOSPITAL_CODE) AS CNT_HOSPITAL_CODE,
COUNT(DISTINCT HOSPITAL_TYPE_CODE) AS CNT_HOSPITAL_TYPE_CODE,
COUNT(*) AS CNT, COUNT(DISTINCT CASE_ID) AS CNT_CASES
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1
ORDER BY 2 DESC;
-- Unique number of Age groups and their distribution
SELECT AGE, COUNT(DISTINCT HOSPITAL_CODE) AS CNT_HOSPITAL_CODE,
COUNT(DISTINCT HOSPITAL_TYPE_CODE) AS CNT_HOSPITAL_TYPE_CODE,
COUNT(*) AS CNT, COUNT(DISTINCT CASE_ID) AS CNT_CASES
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1
ORDER BY 2 DESC;
----========================================================================
-- // 2/3/4. Does patients length of stay (LOS) changes wrt HOSPITAL_TYPE_CODE/CITY_CODE_HOSPITAL/HOSPITAL_REGION_CODE? //
SELECT HOSPITAL_TYPE_CODE,
MIN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MIN_LOS,
MAX(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MAX_LOS,
AVG(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS AVG_LOS,
MEDIAN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MEDIAN_LOS
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1;
SELECT CITY_CODE_HOSPITAL,
MIN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MIN_LOS,
MAX(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MAX_LOS,
AVG(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS AVG_LOS,
MEDIAN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MEDIAN_LOS
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1;
SELECT HOSPITAL_REGION_CODE,
MIN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MIN_LOS,
MAX(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MAX_LOS,
AVG(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS AVG_LOS,
MEDIAN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MEDIAN_LOS
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1;
----========================================================================
-- // 5. DEPARTMENT of admission and its impact on LOS //
SELECT DEPARTMENT,
MIN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MIN_LOS,
MAX(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MAX_LOS,
AVG(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS AVG_LOS,
MEDIAN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MEDIAN_LOS
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1;
----========================================================================
-- // 6. DEPARTMENT X HOSPITAL_REGION_CODE and its impact on LOS //
SELECT DEPARTMENT,HOSPITAL_REGION_CODE,
MIN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MIN_LOS,
MAX(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MAX_LOS,
AVG(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS AVG_LOS,
MEDIAN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MEDIAN_LOS
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1,2;
----========================================================================
-- // 7. Does more AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL impact a patients LOS // (Insight)
SELECT AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,
MIN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MIN_LOS,
MAX(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MAX_LOS,
AVG(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS AVG_LOS,
MEDIAN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MEDIAN_LOS,
COUNT(DISTINCT CASE_ID) AS CNT_CASES
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1;
----========================================================================
-- // 8. TYPE_OF_ADMISSION and its impact on LOS // (Insight)
SELECT TYPE_OF_ADMISSION,
MIN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MIN_LOS,
MAX(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MAX_LOS,
AVG(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS AVG_LOS,
MEDIAN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MEDIAN_LOS,
COUNT(DISTINCT CASE_ID) AS CNT_CASES
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1;
----========================================================================
-- // 9. SEVERITY_OF_ILLNESS and its impact on LOS // (Insight)
SELECT SEVERITY_OF_ILLNESS,
MIN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MIN_LOS,
MAX(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MAX_LOS,
AVG(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS AVG_LOS,
MEDIAN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MEDIAN_LOS,
COUNT(DISTINCT CASE_ID) AS CNT_CASES
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1;
----========================================================================
-- // 10. AGE and its impact on LOS // (Insight)
SELECT AGE,
MIN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MIN_LOS,
MAX(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MAX_LOS,
AVG(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS AVG_LOS,
MEDIAN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MEDIAN_LOS,
COUNT(DISTINCT CASE_ID) AS CNT_CASES
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1;
----========================================================================
-- // 11. ADMISSION_DEPOSIT and its relation to LOS //
SELECT DISTINCT ADMISSION_DEPOSIT FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA; --- Ranges from 1800 to 11000
WITH BASE AS (
SELECT ADMISSION_DEPOSIT,
CASE WHEN ADMISSION_DEPOSIT <= 3000 THEN '1. Less than 3K'
WHEN ADMISSION_DEPOSIT > 7000 THEN '3. Greater than 7K'
ELSE '2. Between 3K to 7K' END AS DEPOSIT_BUCKET,
ADMISSION_DATE,
DISCHARGE_DATE,
CASE_ID
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
)
-- SELECT * FROM BASE;
SELECT DEPOSIT_BUCKET,
MIN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MIN_LOS,
MAX(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MAX_LOS,
AVG(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS AVG_LOS,
MEDIAN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MEDIAN_LOS,
COUNT(DISTINCT CASE_ID) AS CNT_CASES
FROM BASE
GROUP BY 1
ORDER BY 1;
----========================================================================
-- // 12. Does more visitors come with patients who have more severe illness? //
SELECT SEVERITY_OF_ILLNESS,
COUNT(DISTINCT CASE_ID) AS CNT_CASES,
MIN(VISITORS_WITH_PATIENT) AS MIN_VISITORS,
MAX(VISITORS_WITH_PATIENT) AS MAX_VISITORS,
AVG(VISITORS_WITH_PATIENT) AS AVG_VISITORS,
MEDIAN(VISITORS_WITH_PATIENT) AS MEDIAN_VISITORS
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1;
----========================================================================
-- // 13. Is there any differences in the LOS for different WARD_TYPE & WARD_FACILITY_CODE in each DEPARTMENT // (Insight)
SELECT DEPARTMENT,WARD_TYPE,WARD_FACILITY_CODE,
COUNT(DISTINCT CASE_ID) AS CNT_CASES,
MIN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MIN_LOS,
MAX(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MAX_LOS,
AVG(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS AVG_LOS,
MEDIAN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MEDIAN_LOS
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1,2,3
ORDER BY 1,2,3;
SELECT DEPARTMENT,SEVERITY_OF_ILLNESS,
COUNT(DISTINCT CASE_ID) AS CNT_CASES,
MIN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MIN_LOS,
MAX(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MAX_LOS,
AVG(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS AVG_LOS,
MEDIAN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MEDIAN_LOS
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1,2
ORDER BY 1,2;
----========================================================================
-- // 14. Does BED_GRADE affects LOS of patients? // (Insight)
SELECT BED_GRADE,
COUNT(DISTINCT CASE_ID) AS CNT_CASES,
MIN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MIN_LOS,
MAX(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MAX_LOS,
AVG(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS AVG_LOS,
MEDIAN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MEDIAN_LOS
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1;
SELECT SEVERITY_OF_ILLNESS,BED_GRADE,
COUNT(DISTINCT CASE_ID) AS CNT_CASES,
MIN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MIN_LOS,
MAX(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MAX_LOS,
AVG(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS AVG_LOS,
MEDIAN(DATEDIFF(day,ADMISSION_DATE,DISCHARGE_DATE)) AS MEDIAN_LOS
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1,2
ORDER BY 1,2;
----========================================================================
-- // 15. Does more visitors come when younger patients got admitted than the older patients? // (Insight)
SELECT AGE,
COUNT(DISTINCT CASE_ID) AS CNT_CASES,
MIN(VISITORS_WITH_PATIENT) AS MIN_VISITORS,
MAX(VISITORS_WITH_PATIENT) AS MAX_VISITORS,
AVG(VISITORS_WITH_PATIENT) AS AVG_VISITORS,
MEDIAN(VISITORS_WITH_PATIENT) AS MEDIAN_VISITORS
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
GROUP BY 1
ORDER BY 1;
----========================================================================
-- // 16. What type of illness & admission does majority of patients who are less than 30 years of age have and which department most of them are getting admitted to? //
WITH BASE AS (
SELECT *
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
WHERE AGE IN ('0-10','20-Nov','21-30')
),
ILLNESS_N_ADMISSION AS (
SELECT TYPE_OF_ADMISSION, SEVERITY_OF_ILLNESS,
COUNT(DISTINCT CASE_ID) AS CNT_CASES
FROM BASE
GROUP BY 1,2
ORDER BY 1,2
),
DEPARTMENT AS (
SELECT DEPARTMENT, COUNT(DISTINCT CASE_ID) AS CNT_CASES
FROM BASE
GROUP BY 1
)
-- SELECT * FROM BASE; -- 43,417 cases
-- SELECT * FROM ILLNESS_N_ADMISSION;
SELECT * FROM DEPARTMENT;
----========================================================================
-- // 17. Are patients below 40 years pay more ADMISSION_DEPOSIT when they get admitted to the hospital? // (Insight)
WITH BASE AS (
SELECT *,
CASE WHEN AGE IN ('0-10','20-Nov','21-30','31-40') THEN 1 ELSE 0 END AS BELOW_40_IND
FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
)
SELECT BELOW_40_IND,
MIN(ADMISSION_DEPOSIT) AS MIN_DEP,
MAX(ADMISSION_DEPOSIT) AS MAX_DEP,
AVG(ADMISSION_DEPOSIT) AS AVG_DEP
FROM BASE
GROUP BY 1;
----========================================================================
----========================================================================