Skip to content

Commit 7fd046e

Browse files
committed
final update
1 parent 69b915f commit 7fd046e

8 files changed

+6437
-0
lines changed

1.1-Data-Engineering-Explore.ipynb

+388
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,388 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "b8e4608a",
6+
"metadata": {},
7+
"source": [
8+
"### Data Engineering"
9+
]
10+
},
11+
{
12+
"cell_type": "code",
13+
"execution_count": 1,
14+
"id": "8ff0ab9d",
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"import pandas as pd\n",
19+
"import json as json"
20+
]
21+
},
22+
{
23+
"cell_type": "code",
24+
"execution_count": 2,
25+
"id": "fa9d1a84",
26+
"metadata": {},
27+
"outputs": [],
28+
"source": [
29+
"# read in the data\n",
30+
"with open(\"data/aac-data/data/9_5.json\", 'r') as f:\n",
31+
" data = json.load(f)"
32+
]
33+
},
34+
{
35+
"cell_type": "code",
36+
"execution_count": 3,
37+
"id": "04edfa92",
38+
"metadata": {},
39+
"outputs": [],
40+
"source": [
41+
"df = pd.DataFrame(data)"
42+
]
43+
},
44+
{
45+
"cell_type": "code",
46+
"execution_count": 4,
47+
"id": "96741da9",
48+
"metadata": {},
49+
"outputs": [
50+
{
51+
"data": {
52+
"text/html": [
53+
"<div>\n",
54+
"<style scoped>\n",
55+
" .dataframe tbody tr th:only-of-type {\n",
56+
" vertical-align: middle;\n",
57+
" }\n",
58+
"\n",
59+
" .dataframe tbody tr th {\n",
60+
" vertical-align: top;\n",
61+
" }\n",
62+
"\n",
63+
" .dataframe thead th {\n",
64+
" text-align: right;\n",
65+
" }\n",
66+
"</style>\n",
67+
"<table border=\"1\" class=\"dataframe\">\n",
68+
" <thead>\n",
69+
" <tr style=\"text-align: right;\">\n",
70+
" <th></th>\n",
71+
" <th>Organism</th>\n",
72+
" <th>Uniprot_AC</th>\n",
73+
" <th>position</th>\n",
74+
" <th>standard_seq</th>\n",
75+
" <th>PTM_Catalogy</th>\n",
76+
" <th>PTM_Type</th>\n",
77+
" <th>ref</th>\n",
78+
" <th>cell_line</th>\n",
79+
" <th>Identification_Strategy</th>\n",
80+
" <th>Identification_Technique</th>\n",
81+
" </tr>\n",
82+
" </thead>\n",
83+
" <tbody>\n",
84+
" <tr>\n",
85+
" <th>0</th>\n",
86+
" <td>Homo sapiens</td>\n",
87+
" <td>Q9Y6C9</td>\n",
88+
" <td>79</td>\n",
89+
" <td>LFSYAQHIASIDGRRGLFTGLTPRLCSGVLGTVVHGKVLQHYQESD...</td>\n",
90+
" <td>Oxidation PTM</td>\n",
91+
" <td>S-sulfenylation</td>\n",
92+
" <td>28355876</td>\n",
93+
" <td>RKO cells</td>\n",
94+
" <td>Direct capture</td>\n",
95+
" <td>Dyn-2, TD, PYD, PRD and BTD</td>\n",
96+
" </tr>\n",
97+
" <tr>\n",
98+
" <th>1</th>\n",
99+
" <td>Homo sapiens</td>\n",
100+
" <td>Q9Y5M8</td>\n",
101+
" <td>100</td>\n",
102+
" <td>SGKTLLFVRLLTGLYRDTQTSITDSCAVYRVNNNRGNSLTLIDLPG...</td>\n",
103+
" <td>Oxidation PTM</td>\n",
104+
" <td>S-sulfenylation</td>\n",
105+
" <td>25175731</td>\n",
106+
" <td>RKO and A431 cells</td>\n",
107+
" <td>Direct capture</td>\n",
108+
" <td>Dyn-2</td>\n",
109+
" </tr>\n",
110+
" <tr>\n",
111+
" <th>2</th>\n",
112+
" <td>Homo sapiens</td>\n",
113+
" <td>Q9Y5M8</td>\n",
114+
" <td>73</td>\n",
115+
" <td>LLTLVFWKLIRSRRSSQRAVLLVGLCDSGKTLLFVRLLTGLYRDTQ...</td>\n",
116+
" <td>Oxidation PTM</td>\n",
117+
" <td>S-sulfenylation</td>\n",
118+
" <td>25175731</td>\n",
119+
" <td>RKO and A431 cells</td>\n",
120+
" <td>Direct capture</td>\n",
121+
" <td>Dyn-2</td>\n",
122+
" </tr>\n",
123+
" <tr>\n",
124+
" <th>3</th>\n",
125+
" <td>Homo sapiens</td>\n",
126+
" <td>Q9Y5M8</td>\n",
127+
" <td>179</td>\n",
128+
" <td>DVAEFLYQVLIDSMGLKNTPSFLIACNKQDIAMAKSAKLIQQQLEK...</td>\n",
129+
" <td>Oxidation PTM</td>\n",
130+
" <td>S-sulfenylation</td>\n",
131+
" <td>28355876</td>\n",
132+
" <td>RKO cells</td>\n",
133+
" <td>Direct capture</td>\n",
134+
" <td>Dyn-2, TD, PYD, PRD and BTD</td>\n",
135+
" </tr>\n",
136+
" <tr>\n",
137+
" <th>4</th>\n",
138+
" <td>Homo sapiens</td>\n",
139+
" <td>Q9Y277</td>\n",
140+
" <td>36</td>\n",
141+
" <td>GKAAKDVFNKGYGFGMVKIDLKTKSCSGVEFSTSGHAYTDTGKASG...</td>\n",
142+
" <td>Oxidation PTM</td>\n",
143+
" <td>S-sulfenylation</td>\n",
144+
" <td>25175731</td>\n",
145+
" <td>RKO and A431 cells</td>\n",
146+
" <td>Direct capture</td>\n",
147+
" <td>Dyn-2</td>\n",
148+
" </tr>\n",
149+
" <tr>\n",
150+
" <th>...</th>\n",
151+
" <td>...</td>\n",
152+
" <td>...</td>\n",
153+
" <td>...</td>\n",
154+
" <td>...</td>\n",
155+
" <td>...</td>\n",
156+
" <td>...</td>\n",
157+
" <td>...</td>\n",
158+
" <td>...</td>\n",
159+
" <td>...</td>\n",
160+
" <td>...</td>\n",
161+
" </tr>\n",
162+
" <tr>\n",
163+
" <th>3433</th>\n",
164+
" <td>Homo sapiens</td>\n",
165+
" <td>Q9UJW0</td>\n",
166+
" <td>258</td>\n",
167+
" <td>YTRPVNLTEVTTLQQRLLQPDFQPVCASQLYPRHKHLLIKRSLRCR...</td>\n",
168+
" <td>Oxidation PTM</td>\n",
169+
" <td>S-sulfenylation</td>\n",
170+
" <td>31246462</td>\n",
171+
" <td>HEK293T cells</td>\n",
172+
" <td>Direct capture</td>\n",
173+
" <td>SAM-TCO</td>\n",
174+
" </tr>\n",
175+
" <tr>\n",
176+
" <th>3434</th>\n",
177+
" <td>Homo sapiens</td>\n",
178+
" <td>P52564</td>\n",
179+
" <td>38</td>\n",
180+
" <td>LKIPKEAFEQPQTSSTPPRDLDSKACISIGNQNFEVKADDLEPIME...</td>\n",
181+
" <td>Oxidation PTM</td>\n",
182+
" <td>S-sulfenylation</td>\n",
183+
" <td>31246462</td>\n",
184+
" <td>HEK293T cells</td>\n",
185+
" <td>Direct capture</td>\n",
186+
" <td>SAM-TCO</td>\n",
187+
" </tr>\n",
188+
" <tr>\n",
189+
" <th>3435</th>\n",
190+
" <td>Homo sapiens</td>\n",
191+
" <td>Q6PRD1</td>\n",
192+
" <td>1536</td>\n",
193+
" <td>KGSFGEMGEQTVKAVQKLSQQQESVCPRESTVPGHSSPCLDNSSSK...</td>\n",
194+
" <td>Oxidation PTM</td>\n",
195+
" <td>S-sulfenylation</td>\n",
196+
" <td>31246462</td>\n",
197+
" <td>HEK293T cells</td>\n",
198+
" <td>Direct capture</td>\n",
199+
" <td>SAM-TCO</td>\n",
200+
" </tr>\n",
201+
" <tr>\n",
202+
" <th>3436</th>\n",
203+
" <td>Homo sapiens</td>\n",
204+
" <td>Q01658</td>\n",
205+
" <td>94</td>\n",
206+
" <td>EHVIQALESLGFGSYISEVKEVLQECKTVALKRRKASSRLENLGIP...</td>\n",
207+
" <td>Oxidation PTM</td>\n",
208+
" <td>S-sulfenylation</td>\n",
209+
" <td>31246462</td>\n",
210+
" <td>HEK293T cells</td>\n",
211+
" <td>Direct capture</td>\n",
212+
" <td>SAM-TCO</td>\n",
213+
" </tr>\n",
214+
" <tr>\n",
215+
" <th>3437</th>\n",
216+
" <td>Homo sapiens</td>\n",
217+
" <td>Q96H55</td>\n",
218+
" <td>275</td>\n",
219+
" <td>RLQWHLPEGAAFSWLPNPERSLEEDCFEVTREAMLHLGIDTPTQNN...</td>\n",
220+
" <td>Oxidation PTM</td>\n",
221+
" <td>S-sulfenylation</td>\n",
222+
" <td>31246462</td>\n",
223+
" <td>HEK293T cells</td>\n",
224+
" <td>Direct capture</td>\n",
225+
" <td>SAM-TCO</td>\n",
226+
" </tr>\n",
227+
" </tbody>\n",
228+
"</table>\n",
229+
"<p>3438 rows × 10 columns</p>\n",
230+
"</div>"
231+
],
232+
"text/plain": [
233+
" Organism Uniprot_AC position \\\n",
234+
"0 Homo sapiens Q9Y6C9 79 \n",
235+
"1 Homo sapiens Q9Y5M8 100 \n",
236+
"2 Homo sapiens Q9Y5M8 73 \n",
237+
"3 Homo sapiens Q9Y5M8 179 \n",
238+
"4 Homo sapiens Q9Y277 36 \n",
239+
"... ... ... ... \n",
240+
"3433 Homo sapiens Q9UJW0 258 \n",
241+
"3434 Homo sapiens P52564 38 \n",
242+
"3435 Homo sapiens Q6PRD1 1536 \n",
243+
"3436 Homo sapiens Q01658 94 \n",
244+
"3437 Homo sapiens Q96H55 275 \n",
245+
"\n",
246+
" standard_seq PTM_Catalogy \\\n",
247+
"0 LFSYAQHIASIDGRRGLFTGLTPRLCSGVLGTVVHGKVLQHYQESD... Oxidation PTM \n",
248+
"1 SGKTLLFVRLLTGLYRDTQTSITDSCAVYRVNNNRGNSLTLIDLPG... Oxidation PTM \n",
249+
"2 LLTLVFWKLIRSRRSSQRAVLLVGLCDSGKTLLFVRLLTGLYRDTQ... Oxidation PTM \n",
250+
"3 DVAEFLYQVLIDSMGLKNTPSFLIACNKQDIAMAKSAKLIQQQLEK... Oxidation PTM \n",
251+
"4 GKAAKDVFNKGYGFGMVKIDLKTKSCSGVEFSTSGHAYTDTGKASG... Oxidation PTM \n",
252+
"... ... ... \n",
253+
"3433 YTRPVNLTEVTTLQQRLLQPDFQPVCASQLYPRHKHLLIKRSLRCR... Oxidation PTM \n",
254+
"3434 LKIPKEAFEQPQTSSTPPRDLDSKACISIGNQNFEVKADDLEPIME... Oxidation PTM \n",
255+
"3435 KGSFGEMGEQTVKAVQKLSQQQESVCPRESTVPGHSSPCLDNSSSK... Oxidation PTM \n",
256+
"3436 EHVIQALESLGFGSYISEVKEVLQECKTVALKRRKASSRLENLGIP... Oxidation PTM \n",
257+
"3437 RLQWHLPEGAAFSWLPNPERSLEEDCFEVTREAMLHLGIDTPTQNN... Oxidation PTM \n",
258+
"\n",
259+
" PTM_Type ref cell_line Identification_Strategy \\\n",
260+
"0 S-sulfenylation 28355876 RKO cells Direct capture \n",
261+
"1 S-sulfenylation 25175731 RKO and A431 cells Direct capture \n",
262+
"2 S-sulfenylation 25175731 RKO and A431 cells Direct capture \n",
263+
"3 S-sulfenylation 28355876 RKO cells Direct capture \n",
264+
"4 S-sulfenylation 25175731 RKO and A431 cells Direct capture \n",
265+
"... ... ... ... ... \n",
266+
"3433 S-sulfenylation 31246462 HEK293T cells Direct capture \n",
267+
"3434 S-sulfenylation 31246462 HEK293T cells Direct capture \n",
268+
"3435 S-sulfenylation 31246462 HEK293T cells Direct capture \n",
269+
"3436 S-sulfenylation 31246462 HEK293T cells Direct capture \n",
270+
"3437 S-sulfenylation 31246462 HEK293T cells Direct capture \n",
271+
"\n",
272+
" Identification_Technique \n",
273+
"0 Dyn-2, TD, PYD, PRD and BTD \n",
274+
"1 Dyn-2 \n",
275+
"2 Dyn-2 \n",
276+
"3 Dyn-2, TD, PYD, PRD and BTD \n",
277+
"4 Dyn-2 \n",
278+
"... ... \n",
279+
"3433 SAM-TCO \n",
280+
"3434 SAM-TCO \n",
281+
"3435 SAM-TCO \n",
282+
"3436 SAM-TCO \n",
283+
"3437 SAM-TCO \n",
284+
"\n",
285+
"[3438 rows x 10 columns]"
286+
]
287+
},
288+
"execution_count": 4,
289+
"metadata": {},
290+
"output_type": "execute_result"
291+
}
292+
],
293+
"source": [
294+
"df"
295+
]
296+
},
297+
{
298+
"cell_type": "code",
299+
"execution_count": 5,
300+
"id": "e4791db8",
301+
"metadata": {},
302+
"outputs": [
303+
{
304+
"data": {
305+
"text/plain": [
306+
"(3438, 10)"
307+
]
308+
},
309+
"execution_count": 5,
310+
"metadata": {},
311+
"output_type": "execute_result"
312+
}
313+
],
314+
"source": [
315+
"df.shape"
316+
]
317+
},
318+
{
319+
"cell_type": "code",
320+
"execution_count": null,
321+
"id": "8e0611c5",
322+
"metadata": {},
323+
"outputs": [],
324+
"source": []
325+
},
326+
{
327+
"cell_type": "code",
328+
"execution_count": null,
329+
"id": "501e6c88",
330+
"metadata": {},
331+
"outputs": [],
332+
"source": []
333+
},
334+
{
335+
"cell_type": "code",
336+
"execution_count": null,
337+
"id": "a6c00081",
338+
"metadata": {},
339+
"outputs": [],
340+
"source": []
341+
},
342+
{
343+
"cell_type": "code",
344+
"execution_count": null,
345+
"id": "edfb68d4",
346+
"metadata": {},
347+
"outputs": [],
348+
"source": []
349+
},
350+
{
351+
"cell_type": "code",
352+
"execution_count": null,
353+
"id": "e7b070a5",
354+
"metadata": {},
355+
"outputs": [],
356+
"source": []
357+
},
358+
{
359+
"cell_type": "code",
360+
"execution_count": null,
361+
"id": "f1a2bf31",
362+
"metadata": {},
363+
"outputs": [],
364+
"source": []
365+
}
366+
],
367+
"metadata": {
368+
"kernelspec": {
369+
"display_name": "Python 3 (ipykernel)",
370+
"language": "python",
371+
"name": "python3"
372+
},
373+
"language_info": {
374+
"codemirror_mode": {
375+
"name": "ipython",
376+
"version": 3
377+
},
378+
"file_extension": ".py",
379+
"mimetype": "text/x-python",
380+
"name": "python",
381+
"nbconvert_exporter": "python",
382+
"pygments_lexer": "ipython3",
383+
"version": "3.9.13"
384+
}
385+
},
386+
"nbformat": 4,
387+
"nbformat_minor": 5
388+
}

0 commit comments

Comments
 (0)