From 41f4badc47fce672eb9f11501f6bb82486ee4f2d Mon Sep 17 00:00:00 2001 From: Kirill Tsukanov Date: Tue, 7 Mar 2023 18:23:11 +0000 Subject: [PATCH] Add code for v2.1 --- baseline-expression.ipynb | 464 +++++--------------------------------- 1 file changed, 58 insertions(+), 406 deletions(-) diff --git a/baseline-expression.ipynb b/baseline-expression.ipynb index 5bd8745..daf143d 100644 --- a/baseline-expression.ipynb +++ b/baseline-expression.ipynb @@ -18,7 +18,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "59 packages can be upgraded. Run 'apt list --upgradable' to see them.\n", + "43 packages can be upgraded. Run 'apt list --upgradable' to see them.\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n" @@ -8020,7 +8020,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": null, "metadata": { "scrolled": true }, @@ -8074,12 +8074,7 @@ "[1] 21500\n", "[1] 22000\n", "[1] 22500\n", - "[1] 23000\n", - "[1] 23500\n", - "[1] 24000\n", - "[1] 24500\n", - "[1] 25000\n", - "[1] 25500\n" + "[1] 23000\n" ] } ], @@ -8103,230 +8098,9 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Brodmann (1909) area 24breastcaudate nucleuscerebellar hemispherecerebellumcerebral cortexcoronary arterycortex of kidneyectocervixendocervix...transformed skin fibroblastadrenal glandtransverse colonurinary bladderuterusvaginaamygdalaaortaatrium auricular regionblood
ENSG000000015610.1229230.1229230.122923-0.464703-0.464703-0.4647032.076508-1.183898-3.417923-1.183898...-5.651947-1.183898-0.464703-1.183898-3.417923-5.651947-0.4647031.4297430.122923-5.651947
ENSG00000004142-0.0278490.400274-1.195017-0.3347990.1189180.920024-0.0278492.588215-0.178998-0.495546...1.3913515.0121160.261550-0.495546-0.334799-0.027849-1.584679-0.3347991.161170-3.844282
ENSG000000047660.7134140.713414-0.8922832.7363522.7363520.713414-0.892283-0.892283-0.892283-0.892283...-0.8922830.7134140.713414-0.8922830.7134140.713414-0.892283-0.892283-3.637239-3.637239
ENSG000000049480.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
ENSG00000005073-0.503866-0.503866-0.503866-0.503866-0.503866-0.503866-0.503866-0.5038662.3966222.907462...1.330439-0.5038661.3304391.7455323.2726061.330439-0.503866-0.503866-0.503866-0.503866
\n", - "

5 rows × 53 columns

\n", - "
" - ], - "text/plain": [ - " Brodmann (1909) area 24 breast caudate nucleus \\\n", - "ENSG00000001561 0.122923 0.122923 0.122923 \n", - "ENSG00000004142 -0.027849 0.400274 -1.195017 \n", - "ENSG00000004766 0.713414 0.713414 -0.892283 \n", - "ENSG00000004948 0.000000 0.000000 0.000000 \n", - "ENSG00000005073 -0.503866 -0.503866 -0.503866 \n", - "\n", - " cerebellar hemisphere cerebellum cerebral cortex \\\n", - "ENSG00000001561 -0.464703 -0.464703 -0.464703 \n", - "ENSG00000004142 -0.334799 0.118918 0.920024 \n", - "ENSG00000004766 2.736352 2.736352 0.713414 \n", - "ENSG00000004948 0.000000 0.000000 0.000000 \n", - "ENSG00000005073 -0.503866 -0.503866 -0.503866 \n", - "\n", - " coronary artery cortex of kidney ectocervix endocervix \\\n", - "ENSG00000001561 2.076508 -1.183898 -3.417923 -1.183898 \n", - "ENSG00000004142 -0.027849 2.588215 -0.178998 -0.495546 \n", - "ENSG00000004766 -0.892283 -0.892283 -0.892283 -0.892283 \n", - "ENSG00000004948 0.000000 0.000000 0.000000 0.000000 \n", - "ENSG00000005073 -0.503866 -0.503866 2.396622 2.907462 \n", - "\n", - " ... transformed skin fibroblast adrenal gland \\\n", - "ENSG00000001561 ... -5.651947 -1.183898 \n", - "ENSG00000004142 ... 1.391351 5.012116 \n", - "ENSG00000004766 ... -0.892283 0.713414 \n", - "ENSG00000004948 ... 0.000000 0.000000 \n", - "ENSG00000005073 ... 
1.330439 -0.503866 \n", - "\n", - " transverse colon urinary bladder uterus vagina \\\n", - "ENSG00000001561 -0.464703 -1.183898 -3.417923 -5.651947 \n", - "ENSG00000004142 0.261550 -0.495546 -0.334799 -0.027849 \n", - "ENSG00000004766 0.713414 -0.892283 0.713414 0.713414 \n", - "ENSG00000004948 0.000000 0.000000 0.000000 0.000000 \n", - "ENSG00000005073 1.330439 1.745532 3.272606 1.330439 \n", - "\n", - " amygdala aorta atrium auricular region blood \n", - "ENSG00000001561 -0.464703 1.429743 0.122923 -5.651947 \n", - "ENSG00000004142 -1.584679 -0.334799 1.161170 -3.844282 \n", - "ENSG00000004766 -0.892283 -0.892283 -3.637239 -3.637239 \n", - "ENSG00000004948 0.000000 0.000000 0.000000 0.000000 \n", - "ENSG00000005073 -0.503866 -0.503866 -0.503866 -0.503866 \n", - "\n", - "[5 rows x 53 columns]" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "adatiss = (\n", " # Read Adatiss output\n", @@ -8346,7 +8120,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -8366,7 +8140,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -8375,163 +8149,9 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ginihpaSpecificityhpaDistributionadatissScores
geneProductId
ENSG000000015610.299Low tissue specificityDetected in many[{'bodyPartLevel': 'tissue', 'bodyPartName': '...
ENSG000000041420.174Low tissue specificityDetected in all[{'bodyPartLevel': 'tissue', 'bodyPartName': '...
ENSG000000047660.169Low tissue specificityDetected in many[{'bodyPartLevel': 'tissue', 'bodyPartName': '...
ENSG000000049480.947Tissue enrichedDetected in single[{'bodyPartLevel': 'tissue', 'bodyPartName': '...
ENSG000000050730.861Tissue enhancedDetected in some[{'bodyPartLevel': 'tissue', 'bodyPartName': '...
...............
ENSG000002815230.792Tissue enhancedDetected in some[{'bodyPartLevel': 'tissue', 'bodyPartName': '...
ENSG000002815310.163Low tissue specificityDetected in some[{'bodyPartLevel': 'tissue', 'bodyPartName': '...
ENSG000002815480.298Tissue enrichedDetected in single[{'bodyPartLevel': 'tissue', 'bodyPartName': '...
ENSG000002816760.655Tissue enhancedDetected in someNaN
ENSG000002819120.257Low tissue specificityDetected in some[{'bodyPartLevel': 'tissue', 'bodyPartName': '...
\n", - "

28619 rows × 4 columns

\n", - "
" - ], - "text/plain": [ - " gini hpaSpecificity hpaDistribution \\\n", - "geneProductId \n", - "ENSG00000001561 0.299 Low tissue specificity Detected in many \n", - "ENSG00000004142 0.174 Low tissue specificity Detected in all \n", - "ENSG00000004766 0.169 Low tissue specificity Detected in many \n", - "ENSG00000004948 0.947 Tissue enriched Detected in single \n", - "ENSG00000005073 0.861 Tissue enhanced Detected in some \n", - "... ... ... ... \n", - "ENSG00000281523 0.792 Tissue enhanced Detected in some \n", - "ENSG00000281531 0.163 Low tissue specificity Detected in some \n", - "ENSG00000281548 0.298 Tissue enriched Detected in single \n", - "ENSG00000281676 0.655 Tissue enhanced Detected in some \n", - "ENSG00000281912 0.257 Low tissue specificity Detected in some \n", - "\n", - " adatissScores \n", - "geneProductId \n", - "ENSG00000001561 [{'bodyPartLevel': 'tissue', 'bodyPartName': '... \n", - "ENSG00000004142 [{'bodyPartLevel': 'tissue', 'bodyPartName': '... \n", - "ENSG00000004766 [{'bodyPartLevel': 'tissue', 'bodyPartName': '... \n", - "ENSG00000004948 [{'bodyPartLevel': 'tissue', 'bodyPartName': '... \n", - "ENSG00000005073 [{'bodyPartLevel': 'tissue', 'bodyPartName': '... \n", - "... ... \n", - "ENSG00000281523 [{'bodyPartLevel': 'tissue', 'bodyPartName': '... \n", - "ENSG00000281531 [{'bodyPartLevel': 'tissue', 'bodyPartName': '... \n", - "ENSG00000281548 [{'bodyPartLevel': 'tissue', 'bodyPartName': '... \n", - "ENSG00000281676 NaN \n", - "ENSG00000281912 [{'bodyPartLevel': 'tissue', 'bodyPartName': '... \n", - "\n", - "[28619 rows x 4 columns]" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "a_out" ] @@ -8540,23 +8160,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# 10. Produce the final output" + "# 10. Produce and verify the output\n", + "\n", + "## 10.1. 
Pack and output the data" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\"ensemblGeneId\": \"ENSG00000001561\", \"expression\": [{\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0006101\", \"bodyPartName\": \"Brodmann (1909) area 24\", \"fpkm\": 6.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0000310\", \"bodyPartName\": \"breast\", \"fpkm\": 6.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0001873\", \"bodyPartName\": \"caudate nucleus\", \"fpkm\": 6.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0002245\", \"bodyPartName\": \"cerebellar hemisphere\", \"fpkm\": 5.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0002037\", \"bodyPartName\": \"cerebellum\", \"fpkm\": 5.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0000956\", \"bodyPartName\": \"cerebral cortex\", \"fpkm\": 5.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0001621\", \"bodyPartName\": \"coronary artery\", \"fpkm\": 11.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0001225\", \"bodyPartName\": \"cortex of kidney\", \"fpkm\": 4.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0012249\", \"bodyPartName\": \"ectocervix\", \"fpkm\": 2.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0000458\", \"bodyPartName\": \"endocervix\", \"fpkm\": 4.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0007650\", \"bodyPartName\": \"esophagogastric junction\", \"fpkm\": 6.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0013540\", \"bodyPartName\": \"Brodmann (1909) area 9\", \"fpkm\": 7.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0002469\", \"bodyPartName\": \"esophagus mucosa\", \"fpkm\": 0.9}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0004648\", \"bodyPartName\": \"esophagus muscularis mucosa\", \"fpkm\": 
7.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0003889\", \"bodyPartName\": \"fallopian tube\", \"fpkm\": 3.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0005448\", \"bodyPartName\": \"greater omentum\", \"fpkm\": 8.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0002084\", \"bodyPartName\": \"heart left ventricle\", \"fpkm\": 5.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0001954\", \"bodyPartName\": \"hippocampus proper\", \"fpkm\": 8.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0001898\", \"bodyPartName\": \"hypothalamus\", \"fpkm\": 8.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0002107\", \"bodyPartName\": \"liver\", \"fpkm\": 2.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0004264\", \"bodyPartName\": \"lower leg skin\", \"fpkm\": 2.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0002048\", \"bodyPartName\": \"lung\", \"fpkm\": 9.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0006469\", \"bodyPartName\": \"C1 segment of cervical spinal cord\", \"fpkm\": 19.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0001830\", \"bodyPartName\": \"minor salivary gland\", \"fpkm\": 6.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0001882\", \"bodyPartName\": \"nucleus accumbens\", \"fpkm\": 6.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0000992\", \"bodyPartName\": \"ovary\", \"fpkm\": 1.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0001264\", \"bodyPartName\": \"pancreas\", \"fpkm\": 4.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0000007\", \"bodyPartName\": \"pituitary gland\", \"fpkm\": 8.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0002367\", \"bodyPartName\": \"prostate gland\", \"fpkm\": 4.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0001874\", \"bodyPartName\": \"putamen\", \"fpkm\": 6.0}, {\"bodyPartLevel\": \"tissue\", 
\"bodyPartId\": \"UBERON:0001159\", \"bodyPartName\": \"sigmoid colon\", \"fpkm\": 5.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0001134\", \"bodyPartName\": \"skeletal muscle tissue\", \"fpkm\": 7.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0003454\", \"bodyPartName\": \"small intestine Peyer's patch\", \"fpkm\": 5.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"CL:0000945\", \"bodyPartName\": \"EBV-transformed lymphocyte\", \"fpkm\": 10.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0002106\", \"bodyPartName\": \"spleen\", \"fpkm\": 5.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0000945\", \"bodyPartName\": \"stomach\", \"fpkm\": 6.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0002190\", \"bodyPartName\": \"subcutaneous adipose tissue\", \"fpkm\": 7.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0002038\", \"bodyPartName\": \"substantia nigra\", \"fpkm\": 9.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0036149\", \"bodyPartName\": \"suprapubic skin\", \"fpkm\": 1.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0000473\", \"bodyPartName\": \"testis\", \"fpkm\": 5.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0002046\", \"bodyPartName\": \"thyroid gland\", \"fpkm\": 5.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0007610\", \"bodyPartName\": \"tibial artery\", \"fpkm\": 12.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0001323\", \"bodyPartName\": \"tibial nerve\", \"fpkm\": 7.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"CL:0002620\", \"bodyPartName\": \"transformed skin fibroblast\", \"fpkm\": 1.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0002369\", \"bodyPartName\": \"adrenal gland\", \"fpkm\": 4.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0001157\", \"bodyPartName\": \"transverse colon\", \"fpkm\": 5.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": 
\"UBERON:0001255\", \"bodyPartName\": \"urinary bladder\", \"fpkm\": 4.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0000995\", \"bodyPartName\": \"uterus\", \"fpkm\": 2.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0000996\", \"bodyPartName\": \"vagina\", \"fpkm\": 1.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0001876\", \"bodyPartName\": \"amygdala\", \"fpkm\": 5.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0000947\", \"bodyPartName\": \"aorta\", \"fpkm\": 9.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0006618\", \"bodyPartName\": \"atrium auricular region\", \"fpkm\": 6.0}, {\"bodyPartLevel\": \"tissue\", \"bodyPartId\": \"UBERON:0000178\", \"bodyPartName\": \"blood\", \"fpkm\": 0.9}], \"expressionSpecificity\": {\"gini\": 0.299, \"hpaSpecificity\": \"Low tissue specificity\", \"hpaDistribution\": \"Detected in many\", \"adatissScores\": [{\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"Brodmann (1909) area 24\", \"bodyPartId\": \"UBERON:0006101\", \"adatissScore\": 0.123}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"breast\", \"bodyPartId\": \"UBERON:0000310\", \"adatissScore\": 0.123}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"caudate nucleus\", \"bodyPartId\": \"UBERON:0001873\", \"adatissScore\": 0.123}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"cerebellar hemisphere\", \"bodyPartId\": \"UBERON:0002245\", \"adatissScore\": -0.465}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"cerebellum\", \"bodyPartId\": \"UBERON:0002037\", \"adatissScore\": -0.465}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"cerebral cortex\", \"bodyPartId\": \"UBERON:0000956\", \"adatissScore\": -0.465}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"coronary artery\", \"bodyPartId\": \"UBERON:0001621\", \"adatissScore\": 2.077}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"cortex of kidney\", \"bodyPartId\": \"UBERON:0001225\", \"adatissScore\": 
-1.184}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"ectocervix\", \"bodyPartId\": \"UBERON:0012249\", \"adatissScore\": -3.418}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"endocervix\", \"bodyPartId\": \"UBERON:0000458\", \"adatissScore\": -1.184}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"esophagogastric junction\", \"bodyPartId\": \"UBERON:0007650\", \"adatissScore\": 0.123}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"Brodmann (1909) area 9\", \"bodyPartId\": \"UBERON:0013540\", \"adatissScore\": 0.62}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"esophagus mucosa\", \"bodyPartId\": \"UBERON:0002469\", \"adatissScore\": -5.652}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"esophagus muscularis mucosa\", \"bodyPartId\": \"UBERON:0004648\", \"adatissScore\": 0.62}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"fallopian tube\", \"bodyPartId\": \"UBERON:0003889\", \"adatissScore\": -2.111}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"greater omentum\", \"bodyPartId\": \"UBERON:0005448\", \"adatissScore\": 1.05}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"heart left ventricle\", \"bodyPartId\": \"UBERON:0002084\", \"adatissScore\": -0.465}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"hippocampus proper\", \"bodyPartId\": \"UBERON:0001954\", \"adatissScore\": 1.05}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"hypothalamus\", \"bodyPartId\": \"UBERON:0001898\", \"adatissScore\": 1.05}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"liver\", \"bodyPartId\": \"UBERON:0002107\", \"adatissScore\": -3.418}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"lower leg skin\", \"bodyPartId\": \"UBERON:0004264\", \"adatissScore\": -3.418}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"lung\", \"bodyPartId\": \"UBERON:0002048\", \"adatissScore\": 1.43}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"C1 segment of cervical spinal cord\", \"bodyPartId\": \"UBERON:0006469\", 
\"adatissScore\": 3.838}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"minor salivary gland\", \"bodyPartId\": \"UBERON:0001830\", \"adatissScore\": 0.123}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"nucleus accumbens\", \"bodyPartId\": \"UBERON:0001882\", \"adatissScore\": 0.123}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"ovary\", \"bodyPartId\": \"UBERON:0000992\", \"adatissScore\": -5.652}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"pancreas\", \"bodyPartId\": \"UBERON:0001264\", \"adatissScore\": -1.184}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"pituitary gland\", \"bodyPartId\": \"UBERON:0000007\", \"adatissScore\": 1.05}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"prostate gland\", \"bodyPartId\": \"UBERON:0002367\", \"adatissScore\": -1.184}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"putamen\", \"bodyPartId\": \"UBERON:0001874\", \"adatissScore\": 0.123}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"sigmoid colon\", \"bodyPartId\": \"UBERON:0001159\", \"adatissScore\": -0.465}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"skeletal muscle tissue\", \"bodyPartId\": \"UBERON:0001134\", \"adatissScore\": 0.62}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"small intestine Peyer's patch\", \"bodyPartId\": \"UBERON:0003454\", \"adatissScore\": -0.465}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"EBV-transformed lymphocyte\", \"bodyPartId\": \"CL:0000945\", \"adatissScore\": 1.769}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"spleen\", \"bodyPartId\": \"UBERON:0002106\", \"adatissScore\": -0.465}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"stomach\", \"bodyPartId\": \"UBERON:0000945\", \"adatissScore\": 0.123}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"subcutaneous adipose tissue\", \"bodyPartId\": \"UBERON:0002190\", \"adatissScore\": 0.62}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"substantia nigra\", \"bodyPartId\": 
\"UBERON:0002038\", \"adatissScore\": 1.43}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"suprapubic skin\", \"bodyPartId\": \"UBERON:0036149\", \"adatissScore\": -5.652}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"testis\", \"bodyPartId\": \"UBERON:0000473\", \"adatissScore\": -0.465}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"thyroid gland\", \"bodyPartId\": \"UBERON:0002046\", \"adatissScore\": -0.465}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"tibial artery\", \"bodyPartId\": \"UBERON:0007610\", \"adatissScore\": 2.357}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"tibial nerve\", \"bodyPartId\": \"UBERON:0001323\", \"adatissScore\": 0.62}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"transformed skin fibroblast\", \"bodyPartId\": \"CL:0002620\", \"adatissScore\": -5.652}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"adrenal gland\", \"bodyPartId\": \"UBERON:0002369\", \"adatissScore\": -1.184}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"transverse colon\", \"bodyPartId\": \"UBERON:0001157\", \"adatissScore\": -0.465}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"urinary bladder\", \"bodyPartId\": \"UBERON:0001255\", \"adatissScore\": -1.184}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"uterus\", \"bodyPartId\": \"UBERON:0000995\", \"adatissScore\": -3.418}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"vagina\", \"bodyPartId\": \"UBERON:0000996\", \"adatissScore\": -5.652}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"amygdala\", \"bodyPartId\": \"UBERON:0001876\", \"adatissScore\": -0.465}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"aorta\", \"bodyPartId\": \"UBERON:0000947\", \"adatissScore\": 1.43}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"atrium auricular region\", \"bodyPartId\": \"UBERON:0006618\", \"adatissScore\": 0.123}, {\"bodyPartLevel\": \"tissue\", \"bodyPartName\": \"blood\", \"bodyPartId\": \"UBERON:0000178\", \"adatissScore\": 
-5.652}]}}\n" - ] - } - ], + "outputs": [], "source": [ + "def remove_adatiss_if_none(d):\n", + "    \"\"\"Return d without the 'adatissScores' key when its value is NaN; otherwise return d unchanged.\"\"\"\n", + "    scores = d.get('adatissScores')\n", + "    if isinstance(scores, float) and scores != scores:  # NaN is the only float that is not equal to itself\n", + "        return {k: v for k, v in d.items() if k != 'adatissScores'}\n", + "    return d\n", + "\n", "# Create a list to store JSON objects\n", "json_list = []\n", "\n", @@ -8575,26 +8196,57 @@ " for key, value\n", " in d_out.loc[idx].to_dict().items()\n", " ],\n", - " \"expressionSpecificity\": a_out.loc[idx].to_dict()\n", + " \"expressionSpecificity\": remove_adatiss_if_none(a_out.loc[idx].to_dict())\n", " }\n", " # Append the dictionary to the list\n", - " json_list.append(row_dict)\n", - "\n", - "print(json.dumps(json_list[0]))" + " json_list.append(row_dict)" ] }, { "cell_type": "code", - "execution_count": 59, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "with gzip.open(\"expression_v2.jsonl.gz\", \"wt\", compresslevel=9) as f:\n", + "with gzip.open(\"expression_v2.1.jsonl.gz\", \"wt\", compresslevel=9) as f:\n", " for obj in json_list:\n", " f.write(json.dumps(obj))\n", " f.write('\\n')" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 10.2. Output one record" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(json.dumps(json_list[0]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 10.3. Ingest the data back and print Spark schema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = spark.read.format(\"json\").option(\"compression\", \"gzip\").load(\"expression_v2.1.jsonl.gz\")\n", + "df.printSchema()" + ] + }, { "cell_type": "markdown", "metadata": {},