Skip to content

Commit 52ab47c

Browse files
Sam FengSam Feng
Sam Feng
authored and
Sam Feng
committed
Add to_sql method and more query types
1 parent 3bc491c commit 52ab47c

File tree

3 files changed

+125
-32
lines changed

3 files changed

+125
-32
lines changed

.DS_Store

6 KB
Binary file not shown.

etl.ipynb

+86-31
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
},
1313
{
1414
"cell_type": "code",
15-
"execution_count": 41,
15+
"execution_count": 30,
1616
"metadata": {
1717
"editable": true
1818
},
@@ -27,7 +27,7 @@
2727
},
2828
{
2929
"cell_type": "code",
30-
"execution_count": 42,
30+
"execution_count": 31,
3131
"metadata": {
3232
"editable": true
3333
},
@@ -39,7 +39,7 @@
3939
},
4040
{
4141
"cell_type": "code",
42-
"execution_count": 43,
42+
"execution_count": 32,
4343
"metadata": {
4444
"editable": true
4545
},
@@ -72,7 +72,7 @@
7272
},
7373
{
7474
"cell_type": "code",
75-
"execution_count": 44,
75+
"execution_count": 33,
7676
"metadata": {
7777
"editable": true
7878
},
@@ -83,7 +83,7 @@
8383
},
8484
{
8585
"cell_type": "code",
86-
"execution_count": 45,
86+
"execution_count": 34,
8787
"metadata": {
8888
"editable": true
8989
},
@@ -94,7 +94,7 @@
9494
},
9595
{
9696
"cell_type": "code",
97-
"execution_count": 46,
97+
"execution_count": 35,
9898
"metadata": {
9999
"editable": true
100100
},
@@ -161,7 +161,7 @@
161161
"0 Keepin It Real (Skit) 0 "
162162
]
163163
},
164-
"execution_count": 46,
164+
"execution_count": 35,
165165
"metadata": {},
166166
"output_type": "execute_result"
167167
}
@@ -187,7 +187,7 @@
187187
},
188188
{
189189
"cell_type": "code",
190-
"execution_count": 47,
190+
"execution_count": 36,
191191
"metadata": {
192192
"editable": true
193193
},
@@ -202,7 +202,7 @@
202202
" 114.78159]"
203203
]
204204
},
205-
"execution_count": 47,
205+
"execution_count": 36,
206206
"metadata": {},
207207
"output_type": "execute_result"
208208
}
@@ -224,7 +224,7 @@
224224
},
225225
{
226226
"cell_type": "code",
227-
"execution_count": 48,
227+
"execution_count": 37,
228228
"metadata": {
229229
"editable": true
230230
},
@@ -235,7 +235,7 @@
235235
"'\\nINSERT INTO songs (song_id, title, artist_id, year, duration)\\nVALUES (%s, %s, %s, %s, %s) ON CONFLICT DO NOTHING;\\n'"
236236
]
237237
},
238-
"execution_count": 48,
238+
"execution_count": 37,
239239
"metadata": {},
240240
"output_type": "execute_result"
241241
}
@@ -246,7 +246,7 @@
246246
},
247247
{
248248
"cell_type": "code",
249-
"execution_count": 49,
249+
"execution_count": 38,
250250
"metadata": {
251251
"editable": true
252252
},
@@ -281,7 +281,7 @@
281281
},
282282
{
283283
"cell_type": "code",
284-
"execution_count": 50,
284+
"execution_count": 39,
285285
"metadata": {
286286
"editable": true
287287
},
@@ -292,7 +292,7 @@
292292
"['ARD0S291187B9B7BF5', 'Rated R', 'Ohio', nan, nan]"
293293
]
294294
},
295-
"execution_count": 50,
295+
"execution_count": 39,
296296
"metadata": {},
297297
"output_type": "execute_result"
298298
}
@@ -314,7 +314,7 @@
314314
},
315315
{
316316
"cell_type": "code",
317-
"execution_count": 51,
317+
"execution_count": 40,
318318
"metadata": {
319319
"editable": true
320320
},
@@ -350,7 +350,7 @@
350350
},
351351
{
352352
"cell_type": "code",
353-
"execution_count": 52,
353+
"execution_count": 41,
354354
"metadata": {
355355
"editable": true
356356
},
@@ -361,7 +361,7 @@
361361
},
362362
{
363363
"cell_type": "code",
364-
"execution_count": 53,
364+
"execution_count": 42,
365365
"metadata": {
366366
"editable": true
367367
},
@@ -372,7 +372,7 @@
372372
},
373373
{
374374
"cell_type": "code",
375-
"execution_count": 54,
375+
"execution_count": 43,
376376
"metadata": {
377377
"editable": true
378378
},
@@ -565,7 +565,7 @@
565565
"4 24 "
566566
]
567567
},
568-
"execution_count": 54,
568+
"execution_count": 43,
569569
"metadata": {},
570570
"output_type": "execute_result"
571571
}
@@ -595,7 +595,7 @@
595595
},
596596
{
597597
"cell_type": "code",
598-
"execution_count": 55,
598+
"execution_count": 44,
599599
"metadata": {
600600
"editable": true
601601
},
@@ -788,7 +788,7 @@
788788
"4 24 "
789789
]
790790
},
791-
"execution_count": 55,
791+
"execution_count": 44,
792792
"metadata": {},
793793
"output_type": "execute_result"
794794
}
@@ -800,7 +800,7 @@
800800
},
801801
{
802802
"cell_type": "code",
803-
"execution_count": 56,
803+
"execution_count": 45,
804804
"metadata": {
805805
"editable": true
806806
},
@@ -816,7 +816,7 @@
816816
"Name: ts, dtype: datetime64[ns]"
817817
]
818818
},
819-
"execution_count": 56,
819+
"execution_count": 45,
820820
"metadata": {},
821821
"output_type": "execute_result"
822822
}
@@ -830,7 +830,7 @@
830830
},
831831
{
832832
"cell_type": "code",
833-
"execution_count": 57,
833+
"execution_count": 46,
834834
"metadata": {
835835
"editable": true
836836
},
@@ -842,7 +842,7 @@
842842
},
843843
{
844844
"cell_type": "code",
845-
"execution_count": 58,
845+
"execution_count": 47,
846846
"metadata": {
847847
"editable": true
848848
},
@@ -941,7 +941,7 @@
941941
"4 2018-11-29 00:07:13.796 0 29 48 11 2018 3"
942942
]
943943
},
944-
"execution_count": 58,
944+
"execution_count": 47,
945945
"metadata": {},
946946
"output_type": "execute_result"
947947
}
@@ -963,7 +963,7 @@
963963
},
964964
{
965965
"cell_type": "code",
966-
"execution_count": 59,
966+
"execution_count": 48,
967967
"metadata": {
968968
"editable": true
969969
},
@@ -996,13 +996,16 @@
996996
},
997997
{
998998
"cell_type": "code",
999-
"execution_count": 60,
999+
"execution_count": 55,
10001000
"metadata": {
10011001
"editable": true
10021002
},
10031003
"outputs": [],
10041004
"source": [
1005-
"user_df = df[[\"userId\", \"firstName\", \"lastName\", \"gender\", \"level\"]]\n"
1005+
"user_df = df[[\"userId\", \"firstName\", \"lastName\", \"gender\", \"level\"]]\n",
1006+
"user_df.head()\n",
1007+
"user_df = user_df.rename(columns={\"userId\": \"user_id\", \"firstName\": \"first_name\", \"lastName\":\"last_name\"})\n",
1008+
"user_df = user_df.drop_duplicates(subset=['user_id'])"
10061009
]
10071010
},
10081011
{
@@ -1017,7 +1020,59 @@
10171020
},
10181021
{
10191022
"cell_type": "code",
1020-
"execution_count": 61,
1023+
"execution_count": 50,
1024+
"metadata": {
1025+
"editable": true
1026+
},
1027+
"outputs": [],
1028+
"source": [
1029+
"from sqlalchemy import create_engine\n",
1030+
"# conn = psycopg2.connect(\"host=127.0.0.1 dbname=sparkifydb user=student password=student\")\n",
1031+
"engine = create_engine('postgresql+psycopg2://student:student@localhost/sparkifydb')"
1032+
]
1033+
},
1034+
{
1035+
"cell_type": "code",
1036+
"execution_count": 56,
1037+
"metadata": {
1038+
"editable": true
1039+
},
1040+
"outputs": [],
1041+
"source": [
1042+
"delete_user_table = (\"\"\"\n",
1043+
"DELETE FROM users\n",
1044+
";\n",
1045+
"\"\"\")\n",
1046+
"cur.execute(delete_user_table)"
1047+
]
1048+
},
1049+
{
1050+
"cell_type": "code",
1051+
"execution_count": 57,
1052+
"metadata": {
1053+
"editable": true
1054+
},
1055+
"outputs": [],
1056+
"source": [
1057+
"user_df.to_sql('users', con = engine, if_exists = 'append', chunksize = 1000, index = False)"
1058+
]
1059+
},
1060+
{
1061+
"cell_type": "code",
1062+
"execution_count": 27,
1063+
"metadata": {
1064+
"editable": true
1065+
},
1066+
"outputs": [],
1067+
"source": [
1068+
"# rows = []\n",
1069+
"# for row in user_df.iterrows():\n",
1070+
"# rows.append(row)"
1071+
]
1072+
},
1073+
{
1074+
"cell_type": "code",
1075+
"execution_count": 29,
10211076
"metadata": {
10221077
"editable": true
10231078
},
@@ -1099,7 +1154,7 @@
10991154
},
11001155
{
11011156
"cell_type": "code",
1102-
"execution_count": 63,
1157+
"execution_count": 59,
11031158
"metadata": {
11041159
"editable": true
11051160
},

sql_queries.py

+39-1
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,45 @@
104104
;
105105
""")
106106

107+
108+
# DATA QUALITY CHECK
109+
# Data-quality check: returns each songplay_id that appears in more than one
# row, together with its row count. An empty result means no duplicates.
# NOTE(review): the sibling checks read from `users` and `songs`; confirm that
# the fact table is really named `songplay_table` -- its CREATE statement is
# not visible here.
check_duplicates_songplay_table = ("""
SELECT songplay_id, COUNT(*)
FROM songplay_table
GROUP BY songplay_id
HAVING COUNT(*) > 1
;
""")

# Backward-compatible alias: the original identifier misspelled "duplicates".
check_deupliates_songplay_table = check_duplicates_songplay_table
116+
117+
118+
# Data-quality check: returns each user_id that appears in more than one row
# of `users`, together with its row count. An empty result means no duplicates.
check_duplicates_user_table = ("""
SELECT user_id, COUNT(*)
FROM users
GROUP BY user_id
HAVING COUNT(*) > 1
;
""")

# Backward-compatible alias: the original identifier misspelled "duplicates".
check_deupliates_user_table = check_duplicates_user_table
125+
126+
# Data-quality check: returns each song_id that appears in more than one row
# of `songs`, together with its row count. An empty result means no duplicates.
check_duplicates_song_table = ("""
SELECT song_id, COUNT(*)
FROM songs
GROUP BY song_id
HAVING COUNT(*) > 1
;
""")

# Backward-compatible alias: the original identifier misspelled "duplicates".
check_deupliates_song_table = check_duplicates_song_table
133+
134+
135+
# DELETE
136+
# Statement that removes every row from `users`; the table itself is kept.
# Assembled line by line; the leading and trailing empty items supply the
# newlines that open and close the statement text.
delete_user_table = "\n".join([
    "",
    "DELETE FROM users",
    ";",
    "",
])
140+
141+
142+
143+
107144
# QUERY LISTS
108145

109146
# Batches of query strings defined earlier in this module, one list per
# kind of statement (create, drop, delete).
create_table_queries = [
    songplay_table_create,
    user_table_create,
    song_table_create,
    artist_table_create,
    time_table_create,
]
drop_table_queries = [
    songplay_table_drop,
    user_table_drop,
    song_table_drop,
    artist_table_drop,
    time_table_drop,
]
delete_table_queries = [
    delete_user_table,
]

0 commit comments

Comments
 (0)