diff --git a/petagraph/code/preprocessing/ASP2019.ipynb b/petagraph/code/preprocessing/ASP2019.ipynb
deleted file mode 100644
index 8846b67..0000000
--- a/petagraph/code/preprocessing/ASP2019.ipynb
+++ /dev/null
@@ -1,2712 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import numpy as np\n",
- "from collections import Counter\n",
- "#import matplotlib.pyplot as plt\n",
- "import warnings\n",
- "warnings.filterwarnings('ignore')\n",
- "pd.set_option('display.max_columns', None)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# The end of this workflow differs from the original scHeart.ipynb notebook located in /Users/stearb/Dropbox/CHOP/R03/code/scHeart: we are using Jonathan Silverstein's (JS's) workflow for the Neo4j CSV creation, meaning the files produced by this workflow will be the inputs into JS's workflow. We therefore only need to create 2 files, a nodes.tsv and an edges.tsv (instead of the ~6 files: CUIs, CUI-CUIs, Code-CUIs, Terms, etc.)\n",
- "\n",
- "## The guide for how to create these new nodes and edges files can be found in the Data Distillery's [GitHub repository](https://github.com/dbmi-pitt/UBKG/tree/main/user%20guide)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [],
- "source": [
- "#umls_dir = '/Users/stearb/Desktop/hubmap-kg/new_build_csv_data/'\n",
- "umls_dir = '/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryDistributions/DataDistillery10Sept2023/DataDistillery10September2023/'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "# !jupyter nbconvert --to script scHeart_JS.ipynb"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "## Flatten/melt the average gene expression data so we have 1 gene and 1 tissue per row\n",
- "#df = pd.read_csv('/Users/stearb/Desktop/R03_local/data/scHeart/asp_average_gene_expression_14_celltypes.csv')\n",
- "#df.rename(columns={'Unnamed: 0':'Gene'},inplace=True)\n",
- "#print(df.shape)\n",
- "\n",
- "#df2 = pd.melt(df, id_vars=\"Gene\",value_name=\"Average Exp\")\n",
- "#df2['Average Exp'].plot(kind='hist',range=[0,10],bins=50)\n",
- "#df2.to_csv('/Users/stearb/Desktop/R03_local/data/scHeart/asp_average_gene_expression_14_celltypes_LONG.csv')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "#cui_codes_temp = pd.read_csv('/Users/stearb/Desktop/R03_local/data/scHeart/temp_files/CUI-CODEs.csv')\n",
- "#cui_codes_temp"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2932\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " avg_log2FC | \n",
- " p_val_adj | \n",
- " cluster | \n",
- " gene | \n",
- " cluster_celltype_name | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 5.513025 | \n",
- " 0.0 | \n",
- " 7 | \n",
- " BPGM | \n",
- " Erythrocytes | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 5.509262 | \n",
- " 0.0 | \n",
- " 7 | \n",
- " MT1H | \n",
- " Erythrocytes | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 4.710317 | \n",
- " 0.0 | \n",
- " 7 | \n",
- " GYPB | \n",
- " Erythrocytes | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " avg_log2FC p_val_adj cluster gene cluster_celltype_name\n",
- "0 5.513025 0.0 7 BPGM Erythrocytes\n",
- "1 5.509262 0.0 7 MT1H Erythrocytes\n",
- "2 4.710317 0.0 7 GYPB Erythrocytes"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Read the labelled cell-type marker table, leaving out the three columns that are not kept.\n",
- "z = pd.read_csv(\n",
- "    '/Users/stearb/Desktop/DESKTOP_TRANSFER/R03_local/data/scHeart/asp_cell_type_markers_zeros_labelled.csv'\n",
- ").drop(columns=['p_val','pct.1','pct.2'])\n",
- "print(len(z))\n",
- "z.head(3) # this data has already been filtered to remove p-vals greater than .06"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2816\n"
- ]
- }
- ],
- "source": [
- "#### Attach the HGNC ID of every marker gene by joining on the gene symbol\n",
- "hgnc_master = pd.read_csv(\n",
- "    '/Users/stearb/Desktop/DESKTOP_TRANSFER/R03_local/data/use_config/HELPER_FILES/hgnc_master.txt',\n",
- "    sep='\\t',\n",
- ")\n",
- "# Two-column lookup table; 'symbol' is renamed so that it shares the join column name with z.\n",
- "gene_map = hgnc_master[['hgnc_id','symbol']].rename(columns={'symbol':'gene'})\n",
- "\n",
- "# Inner join (the pandas default): rows of z whose gene has no match in gene_map are dropped.\n",
- "z2 = z.merge(gene_map, on='gene')\n",
- "\n",
- "print(len(z2))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [],
- "source": [
- "# We lost 116 rows when merging in the hgnc ids."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Here are the rows we lost.\n",
- "#z[~z['gene'].isin(gene_map['gene'])].sample(3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryDistributions/DataDistillery10Sept2023/DataDistillery10September2023/'"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "umls_dir"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "0\n"
- ]
- }
- ],
- "source": [
- "#### Merge in HGNC CUIs\n",
- "\n",
- "# First, get the CUI - HGNC CODE mappings straight from the CSVs\n",
- "UMLS_CUI_CODEs = pd.read_csv(umls_dir+'CUI-CODEs.csv')\n",
- "\n",
- "# Keep only the rows whose code starts with 'HGNC' and give the two columns descriptive names.\n",
- "umls_genes = UMLS_CUI_CODEs[UMLS_CUI_CODEs[':END_ID'].str.startswith('HGNC')].rename(\n",
- "    columns={':START_ID':'CUI_hgnc',':END_ID':'hgnc_id'})\n",
- "\n",
- "# z2['hgnc_id'] holds values of the form 'HGNC:1093', so the join key is rebuilt in that form.\n",
- "# Keeping only the bare number after the colon left the two key columns with no value in common,\n",
- "# and the inner merge below returned 0 rows.\n",
- "umls_genes['hgnc_id'] = ['HGNC:' + i.split(':')[1] for i in umls_genes['hgnc_id']]\n",
- "\n",
- "z3 = pd.merge(z2,umls_genes,on='hgnc_id')\n",
- "\n",
- "print(len(z3))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "7285 3513\n",
- "12399 546\n",
- "12404 1836\n",
- "12405 1952\n",
- "12415 3444\n",
- " ... \n",
- "17234239 9634\n",
- "17234240 11445\n",
- "17234241 4883\n",
- "17234242 14007\n",
- "17234243 10001\n",
- "Name: hgnc_id, Length: 43178, dtype: object"
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "umls_genes['hgnc_id']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [],
- "source": [
- "z3 = z2"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Build an identifier-friendly copy of each cell type name by applying the substitutions in order:\n",
- "# spaces and underscores become dashes, '/' becomes 'slash' and '&' becomes 'and'.\n",
- "_names = list(z3['cluster_celltype_name'])\n",
- "for _old, _new in [(' ','-'), ('_','-'), ('/','slash'), ('&','and')]:\n",
- "    _names = [name.replace(_old, _new) for name in _names]\n",
- "z3['cluster_celltype_name_NO_SPACE'] = _names\n",
- "\n",
- "# Same treatment for the HGNC ID: the colon is swapped for a dash.\n",
- "z3['hgnc_id_dashes'] = [hgnc.replace(':','-') for hgnc in z3['hgnc_id']]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " avg_log2FC | \n",
- " p_val_adj | \n",
- " cluster | \n",
- " gene | \n",
- " cluster_celltype_name | \n",
- " hgnc_id | \n",
- " cluster_celltype_name_NO_SPACE | \n",
- " hgnc_id_dashes | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: [avg_log2FC, p_val_adj, cluster, gene, cluster_celltype_name, hgnc_id, cluster_celltype_name_NO_SPACE, hgnc_id_dashes]\n",
- "Index: []"
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "z3[z3['cluster_celltype_name_NO_SPACE'].str.contains('&')]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " avg_log2FC | \n",
- " p_val_adj | \n",
- " cluster | \n",
- " gene | \n",
- " cluster_celltype_name | \n",
- " hgnc_id | \n",
- " cluster_celltype_name_NO_SPACE | \n",
- " hgnc_id_dashes | \n",
- " SAB | \n",
- " CODE | \n",
- " CODEID | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 5.513025 | \n",
- " 0.000000 | \n",
- " 7 | \n",
- " BPGM | \n",
- " Erythrocytes | \n",
- " HGNC:1093 | \n",
- " Erythrocytes | \n",
- " HGNC-1093 | \n",
- " SCHEART | \n",
- " Erythrocytes-HGNC-1093 | \n",
- " SCHEART Erythrocytes-HGNC-1093 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " -2.470632 | \n",
- " 0.010812 | \n",
- " 6 | \n",
- " BPGM | \n",
- " Epicardium-derived cells | \n",
- " HGNC:1093 | \n",
- " Epicardium-derived-cells | \n",
- " HGNC-1093 | \n",
- " SCHEART | \n",
- " Epicardium-derived-cells-HGNC-1093 | \n",
- " SCHEART Epicardium-derived-cells-HGNC-1093 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " -1.961892 | \n",
- " 0.000016 | \n",
- " 14 | \n",
- " BPGM | \n",
- " Ventricular cardiomyocytes | \n",
- " HGNC:1093 | \n",
- " Ventricular-cardiomyocytes | \n",
- " HGNC-1093 | \n",
- " SCHEART | \n",
- " Ventricular-cardiomyocytes-HGNC-1093 | \n",
- " SCHEART Ventricular-cardiomyocytes-HGNC-1093 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " avg_log2FC p_val_adj cluster gene cluster_celltype_name \n",
- "0 5.513025 0.000000 7 BPGM Erythrocytes \\\n",
- "1 -2.470632 0.010812 6 BPGM Epicardium-derived cells \n",
- "2 -1.961892 0.000016 14 BPGM Ventricular cardiomyocytes \n",
- "\n",
- " hgnc_id cluster_celltype_name_NO_SPACE hgnc_id_dashes SAB \n",
- "0 HGNC:1093 Erythrocytes HGNC-1093 SCHEART \\\n",
- "1 HGNC:1093 Epicardium-derived-cells HGNC-1093 SCHEART \n",
- "2 HGNC:1093 Ventricular-cardiomyocytes HGNC-1093 SCHEART \n",
- "\n",
- " CODE \n",
- "0 Erythrocytes-HGNC-1093 \\\n",
- "1 Epicardium-derived-cells-HGNC-1093 \n",
- "2 Ventricular-cardiomyocytes-HGNC-1093 \n",
- "\n",
- " CODEID \n",
- "0 SCHEART Erythrocytes-HGNC-1093 \n",
- "1 SCHEART Epicardium-derived-cells-HGNC-1093 \n",
- "2 SCHEART Ventricular-cardiomyocytes-HGNC-1093 "
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "### Create the scHeart SAB\n",
- "#z3['SAB'] = 'scHeart_PMID_31835037'\n",
- "\n",
- "z3['SAB'] = 'SCHEART'\n",
- "\n",
- "# Create scHeart CODE\n",
- "\n",
- "# Every (cell type, HGNC ID) pair has to be unique, because the pair is what identifies a code node.\n",
- "assert not z3.duplicated(subset=['cluster_celltype_name_NO_SPACE','hgnc_id']).any()\n",
- "\n",
- "# CODE = <cell type>-<HGNC ID with dashes>\n",
- "z3['CODE'] = z3['cluster_celltype_name_NO_SPACE'] + '-' + z3['hgnc_id_dashes']\n",
- "\n",
- "# Create scHeart CODEID = <SAB> <CODE>\n",
- "z3['CODEID'] = z3['SAB'] + ' ' + z3['CODE']\n",
- "\n",
- "# Create scHeart CUIs\n",
- "#z3['CUI:ID'] = 'CUI_SAB'+z3['CODEID']\n",
- "\n",
- "#z3.drop(['cluster','gene','hgnc_id'],axis=1,inplace=True)\n",
- "\n",
- "z3.head(3)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Create cell type nodes"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Now create the celltype nodes\n",
- "z3.rename(columns={'cluster_celltype_name_NO_SPACE':'celltype_CODE'},inplace=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Replace '/' with 'slash' and then '&' with 'and' in every celltype_CODE value.\n",
- "for _old, _new in [('/','slash'), ('&','and')]:\n",
- "    z3['celltype_CODE'] = [code.replace(_old, _new) for code in z3['celltype_CODE']]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [],
- "source": [
- "z3['celltype_SAB'] = 'author_defined_cluster'\n",
- "z3['celltype_CODEID'] = z3['celltype_SAB'] + ' ' + z3['celltype_CODE']\n",
- "#z3['celltype_CUI'] = 'CUI_SAB ' + z3['celltype_CODEID']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " avg_log2FC | \n",
- " p_val_adj | \n",
- " cluster | \n",
- " gene | \n",
- " cluster_celltype_name | \n",
- " hgnc_id | \n",
- " celltype_CODE | \n",
- " hgnc_id_dashes | \n",
- " SAB | \n",
- " CODE | \n",
- " CODEID | \n",
- " celltype_SAB | \n",
- " celltype_CODEID | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: [avg_log2FC, p_val_adj, cluster, gene, cluster_celltype_name, hgnc_id, celltype_CODE, hgnc_id_dashes, SAB, CODE, CODEID, celltype_SAB, celltype_CODEID]\n",
- "Index: []"
- ]
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "z3[z3['celltype_CODE'].str.contains('&')]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " avg_log2FC | \n",
- " p_val_adj | \n",
- " cluster | \n",
- " gene | \n",
- " cluster_celltype_name | \n",
- " hgnc_id | \n",
- " celltype_CODE | \n",
- " hgnc_id_dashes | \n",
- " SAB | \n",
- " CODE | \n",
- " CODEID | \n",
- " celltype_SAB | \n",
- " celltype_CODEID | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 5.513025 | \n",
- " 0.000000 | \n",
- " 7 | \n",
- " BPGM | \n",
- " Erythrocytes | \n",
- " HGNC:1093 | \n",
- " Erythrocytes | \n",
- " HGNC-1093 | \n",
- " SCHEART | \n",
- " Erythrocytes-HGNC-1093 | \n",
- " SCHEART Erythrocytes-HGNC-1093 | \n",
- " author_defined_cluster | \n",
- " author_defined_cluster Erythrocytes | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " -2.470632 | \n",
- " 0.010812 | \n",
- " 6 | \n",
- " BPGM | \n",
- " Epicardium-derived cells | \n",
- " HGNC:1093 | \n",
- " Epicardium-derived-cells | \n",
- " HGNC-1093 | \n",
- " SCHEART | \n",
- " Epicardium-derived-cells-HGNC-1093 | \n",
- " SCHEART Epicardium-derived-cells-HGNC-1093 | \n",
- " author_defined_cluster | \n",
- " author_defined_cluster Epicardium-derived-cells | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " -1.961892 | \n",
- " 0.000016 | \n",
- " 14 | \n",
- " BPGM | \n",
- " Ventricular cardiomyocytes | \n",
- " HGNC:1093 | \n",
- " Ventricular-cardiomyocytes | \n",
- " HGNC-1093 | \n",
- " SCHEART | \n",
- " Ventricular-cardiomyocytes-HGNC-1093 | \n",
- " SCHEART Ventricular-cardiomyocytes-HGNC-1093 | \n",
- " author_defined_cluster | \n",
- " author_defined_cluster Ventricular-cardiomyocytes | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " avg_log2FC p_val_adj cluster gene cluster_celltype_name \n",
- "0 5.513025 0.000000 7 BPGM Erythrocytes \\\n",
- "1 -2.470632 0.010812 6 BPGM Epicardium-derived cells \n",
- "2 -1.961892 0.000016 14 BPGM Ventricular cardiomyocytes \n",
- "\n",
- " hgnc_id celltype_CODE hgnc_id_dashes SAB \n",
- "0 HGNC:1093 Erythrocytes HGNC-1093 SCHEART \\\n",
- "1 HGNC:1093 Epicardium-derived-cells HGNC-1093 SCHEART \n",
- "2 HGNC:1093 Ventricular-cardiomyocytes HGNC-1093 SCHEART \n",
- "\n",
- " CODE \n",
- "0 Erythrocytes-HGNC-1093 \\\n",
- "1 Epicardium-derived-cells-HGNC-1093 \n",
- "2 Ventricular-cardiomyocytes-HGNC-1093 \n",
- "\n",
- " CODEID celltype_SAB \n",
- "0 SCHEART Erythrocytes-HGNC-1093 author_defined_cluster \\\n",
- "1 SCHEART Epicardium-derived-cells-HGNC-1093 author_defined_cluster \n",
- "2 SCHEART Ventricular-cardiomyocytes-HGNC-1093 author_defined_cluster \n",
- "\n",
- " celltype_CODEID \n",
- "0 author_defined_cluster Erythrocytes \n",
- "1 author_defined_cluster Epicardium-derived-cells \n",
- "2 author_defined_cluster Ventricular-cardiomyocytes "
- ]
- },
- "execution_count": 33,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "z3.head(3)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Load in main dataset\n",
- "The data_w_bins.csv file is the most up-to-date"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "metadata": {},
- "outputs": [],
- "source": [
- "#fulldata = pd.read_csv('/Users/stearb/Desktop/R03_local/data/scHeart/data_w_bins.csv')\n",
- "#fulldata['CODEID'] = fulldata['SAB'] + fulldata['preCODE']\n",
- "#fulldata['CUI:ID'] = CUIbase64(fulldata['CODEID'])\n",
- "#fulldata.head(3)\n",
- "\n",
- "fulldata = z3"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Remove rows with an adjusted p-value of .06 or greater"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "0\n"
- ]
- }
- ],
- "source": [
- "# Keep only the rows whose adjusted p-value is below .06 and report how many rows were removed.\n",
- "pre=len(fulldata)\n",
- "\n",
- "# .copy() gives fulldata its own data. Later cells add columns to it, and without the copy those\n",
- "# would be assignments on a slice of the frame it was filtered from (SettingWithCopyWarning).\n",
- "fulldata = fulldata[fulldata['p_val_adj'] < .06].copy()\n",
- "\n",
- "post=len(fulldata)\n",
- "\n",
- "print(pre-post)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {
- "collapsed": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array(['author_defined_cluster Erythrocytes',\n",
- " 'author_defined_cluster Epicardium-derived-cells',\n",
- " 'author_defined_cluster Ventricular-cardiomyocytes',\n",
- " 'author_defined_cluster Smooth-muscle-cells-slash-fibroblast-like)',\n",
- " 'author_defined_cluster Fibroblast-like-(related-to-cardiac-skeleton-connective-tissue)',\n",
- " 'author_defined_cluster Capillary-endothelium',\n",
- " 'author_defined_cluster Endothelium-slash-pericytes-slash-adventitia',\n",
- " 'author_defined_cluster Immune-cells',\n",
- " 'author_defined_cluster Fibroblast-like-(related-to-smaller-vascular-development)',\n",
- " 'author_defined_cluster Cardiac-neural-crest-cells-and-Schwann-progenitor-cells',\n",
- " 'author_defined_cluster Fibroblast-like-(related-to-larger-vascular-development)',\n",
- " 'author_defined_cluster Epicardial-cells',\n",
- " 'author_defined_cluster Myoz2-enriched-cardiomyocytes',\n",
- " 'author_defined_cluster Atrial-cardiomyocytes'], dtype=object)"
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "fulldata['celltype_CODEID'].unique() "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Create edges file"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " subject | \n",
- " predicate | \n",
- " object | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " SCHEART Erythrocytes-HGNC-1093 | \n",
- " RO:0002206 | \n",
- " HGNC HGNC:1093 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " SCHEART Epicardium-derived-cells-HGNC-1093 | \n",
- " RO:0002206 | \n",
- " HGNC HGNC:1093 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " subject predicate object\n",
- "0 SCHEART Erythrocytes-HGNC-1093 RO:0002206 HGNC HGNC:1093\n",
- "1 SCHEART Epicardium-derived-cells-HGNC-1093 RO:0002206 HGNC HGNC:1093"
- ]
- },
- "execution_count": 36,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Edge set 1: one edge per row between the SCHEART code and the HGNC code of its gene.\n",
- "# NOTE(review): the subject column holds the SCHEART CODEID and the object column the HGNC code;\n",
- "# confirm that this is the intended direction for the predicate.\n",
- "fulldata['hgnc_codeID'] = 'HGNC ' + fulldata['hgnc_id']\n",
- "fulldata['hgnc_singleCell_predicate'] = 'RO:0002206' # 'expressed in' , OLD rel: 'gene_has_single_cell_expression'\n",
- "\n",
- "edges1 = fulldata[['CODEID','hgnc_singleCell_predicate','hgnc_codeID']].rename(\n",
- "    columns={'CODEID':'subject','hgnc_singleCell_predicate':'predicate','hgnc_codeID':'object'})\n",
- "edges1.head(2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " subject | \n",
- " predicate | \n",
- " object | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: [subject, predicate, object]\n",
- "Index: []"
- ]
- },
- "execution_count": 37,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "edges1[edges1['object'].str.contains('&')]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Edge set 2: one edge per row between the SCHEART code and its cell type code.\n",
- "fulldata['celltype_singleCell_predicate'] = 'RO:0002206' # 'expressed in' 'cell_type_has_single_cell_expression'\n",
- "\n",
- "edges2 = fulldata[['CODEID','celltype_singleCell_predicate','celltype_CODEID']].rename(\n",
- "    columns={'CODEID':'subject','celltype_singleCell_predicate':'predicate','celltype_CODEID':'object'})"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " subject | \n",
- " predicate | \n",
- " object | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " SCHEART Erythrocytes-HGNC-1093 | \n",
- " RO:0002206 | \n",
- " author_defined_cluster Erythrocytes | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " SCHEART Epicardium-derived-cells-HGNC-1093 | \n",
- " RO:0002206 | \n",
- " author_defined_cluster Epicardium-derived-cells | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " SCHEART Ventricular-cardiomyocytes-HGNC-1093 | \n",
- " RO:0002206 | \n",
- " author_defined_cluster Ventricular-cardiomyocytes | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " SCHEART Erythrocytes-HGNC-7400 | \n",
- " RO:0002206 | \n",
- " author_defined_cluster Erythrocytes | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " SCHEART Erythrocytes-HGNC-4703 | \n",
- " RO:0002206 | \n",
- " author_defined_cluster Erythrocytes | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 2811 | \n",
- " SCHEART Fibroblast-like-(related-to-larger-vas... | \n",
- " RO:0002206 | \n",
- " author_defined_cluster Fibroblast-like-(relate... | \n",
- "
\n",
- " \n",
- " 2812 | \n",
- " SCHEART Fibroblast-like-(related-to-larger-vas... | \n",
- " RO:0002206 | \n",
- " author_defined_cluster Fibroblast-like-(relate... | \n",
- "
\n",
- " \n",
- " 2813 | \n",
- " SCHEART Fibroblast-like-(related-to-larger-vas... | \n",
- " RO:0002206 | \n",
- " author_defined_cluster Fibroblast-like-(relate... | \n",
- "
\n",
- " \n",
- " 2814 | \n",
- " SCHEART Fibroblast-like-(related-to-larger-vas... | \n",
- " RO:0002206 | \n",
- " author_defined_cluster Fibroblast-like-(relate... | \n",
- "
\n",
- " \n",
- " 2815 | \n",
- " SCHEART Fibroblast-like-(related-to-larger-vas... | \n",
- " RO:0002206 | \n",
- " author_defined_cluster Fibroblast-like-(relate... | \n",
- "
\n",
- " \n",
- "
\n",
- "
2816 rows × 3 columns
\n",
- "
"
- ],
- "text/plain": [
- " subject predicate \n",
- "0 SCHEART Erythrocytes-HGNC-1093 RO:0002206 \\\n",
- "1 SCHEART Epicardium-derived-cells-HGNC-1093 RO:0002206 \n",
- "2 SCHEART Ventricular-cardiomyocytes-HGNC-1093 RO:0002206 \n",
- "3 SCHEART Erythrocytes-HGNC-7400 RO:0002206 \n",
- "4 SCHEART Erythrocytes-HGNC-4703 RO:0002206 \n",
- "... ... ... \n",
- "2811 SCHEART Fibroblast-like-(related-to-larger-vas... RO:0002206 \n",
- "2812 SCHEART Fibroblast-like-(related-to-larger-vas... RO:0002206 \n",
- "2813 SCHEART Fibroblast-like-(related-to-larger-vas... RO:0002206 \n",
- "2814 SCHEART Fibroblast-like-(related-to-larger-vas... RO:0002206 \n",
- "2815 SCHEART Fibroblast-like-(related-to-larger-vas... RO:0002206 \n",
- "\n",
- " object \n",
- "0 author_defined_cluster Erythrocytes \n",
- "1 author_defined_cluster Epicardium-derived-cells \n",
- "2 author_defined_cluster Ventricular-cardiomyocytes \n",
- "3 author_defined_cluster Erythrocytes \n",
- "4 author_defined_cluster Erythrocytes \n",
- "... ... \n",
- "2811 author_defined_cluster Fibroblast-like-(relate... \n",
- "2812 author_defined_cluster Fibroblast-like-(relate... \n",
- "2813 author_defined_cluster Fibroblast-like-(relate... \n",
- "2814 author_defined_cluster Fibroblast-like-(relate... \n",
- "2815 author_defined_cluster Fibroblast-like-(relate... \n",
- "\n",
- "[2816 rows x 3 columns]"
- ]
- },
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "edges2"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {},
- "outputs": [],
- "source": [
- "edges = pd.concat([edges1,edges2])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " subject | \n",
- " predicate | \n",
- " object | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: [subject, predicate, object]\n",
- "Index: []"
- ]
- },
- "execution_count": 41,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "edges2[edges2['object'].str.contains('&')]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {},
- "outputs": [],
- "source": [
- "assert len(edges1) == 2816\n",
- "assert len(edges2) == 2816"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Scroll down to code to create bins for p-vals and log2fc for single cell expression Terms (past the CUI/CUI-CUI, CODEs files)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## SUIs and CODE-SUIs will both be created in 2 steps.\n",
- "First create the p-value and log2fc Terms and then create the threshold Terms"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Create bins for p-values and log2fc\n",
- "The list of bins is taken directly from the GTEx notebook, so this data uses exactly the same bins."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Split p-vals up into bins\n",
- "\n",
- "# Define pval Bins:\n",
- "#### EXACT SAME BINS from GTEx notebook. ######\n",
- "bins = [0,1e-12,1e-11,1e-10,1e-9,1e-8,1e-7,1e-6,1e-5,1e-4,1e-3,.005,.01,.02,.03,.04,.05,.06]\n",
- "\n",
- "# Bin pvals\n",
- "fulldata['pvalue_bins'] = pd.cut(fulldata['p_val_adj'], bins)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Now bin the log2FC column"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Bin edges for log2FC: the negative edges, 0, their mirror image on the positive side, then 6 and 7.\n",
- "log2fc_bins_neg = [-5,-4,-3,-2.5,-2,-1.75,-1.5,-1.25,-1,-.75,-.5,-.25,-.2,-.15,-.1,-.05]\n",
- "log2fc_bins_pos = [-edge for edge in reversed(log2fc_bins_neg)] # sign flipped, ascending order\n",
- "log2fc_bins = [*log2fc_bins_neg, 0, *log2fc_bins_pos, 6, 7]\n",
- "fulldata['log2fc_bins'] = pd.cut(fulldata['avg_log2FC'], bins=log2fc_bins)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 45,
- "metadata": {},
- "outputs": [],
- "source": [
- "# p-values that are exactly 0 need to be added to the (0.0,1e-12] bin by hand. The lower bound of\n",
- "# that bin is not inclusive, so pd.cut leaves those rows without a bin.\n",
- "# Missing bins are detected with pd.isna() rather than an identity test against np.nan, which only\n",
- "# matches when the missing value happens to be the very same np.nan object.\n",
- "\n",
- "fulldata['pval_bins'] = [i if not pd.isna(i) else '(0.0,1e-12]' for i in fulldata['pvalue_bins']]\n",
- "fulldata.drop('pvalue_bins',axis=1,inplace=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 46,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Remove [] and () characters from intervals before creating SUIs\n",
- "# Every row must have a log2fc bin. This has to be checked before the conversion below: str() turns\n",
- "# a missing bin into the text 'nan', the [1:-1] slice reduces that to 'a', and isna() no longer sees it.\n",
- "assert fulldata['log2fc_bins'].notna().all(), 'avg_log2FC value outside the range of log2fc_bins'\n",
- "\n",
- "# Remove [] and () characters from intervals before creating SUIs\n",
- "fulldata['log2fc_bins'] = [str(i)[1:-1] for i in fulldata['log2fc_bins']]\n",
- "fulldata['pval_bins'] = [str(i)[1:-1] for i in fulldata['pval_bins']]\n",
- "\n",
- "# Drop the space that follows the comma in the interval text\n",
- "fulldata['log2fc_bins'] = fulldata['log2fc_bins'].str.replace(' ','')\n",
- "fulldata['pval_bins'] = fulldata['pval_bins'].str.replace(' ','')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "metadata": {},
- "outputs": [],
- "source": [
- "assert fulldata.isna().sum().sum() == 0\n",
- "assert len(fulldata[fulldata['log2fc_bins'].isna()]) == 0"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "metadata": {
- "collapsed": true
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " avg_log2FC | \n",
- " p_val_adj | \n",
- " cluster | \n",
- " gene | \n",
- " cluster_celltype_name | \n",
- " hgnc_id | \n",
- " celltype_CODE | \n",
- " hgnc_id_dashes | \n",
- " SAB | \n",
- " CODE | \n",
- " CODEID | \n",
- " celltype_SAB | \n",
- " celltype_CODEID | \n",
- " hgnc_codeID | \n",
- " hgnc_singleCell_predicate | \n",
- " celltype_singleCell_predicate | \n",
- " log2fc_bins | \n",
- " pval_bins | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 5.513025 | \n",
- " 0.000000e+00 | \n",
- " 7 | \n",
- " BPGM | \n",
- " Erythrocytes | \n",
- " HGNC:1093 | \n",
- " Erythrocytes | \n",
- " HGNC-1093 | \n",
- " SCHEART | \n",
- " Erythrocytes-HGNC-1093 | \n",
- " SCHEART Erythrocytes-HGNC-1093 | \n",
- " author_defined_cluster | \n",
- " author_defined_cluster Erythrocytes | \n",
- " HGNC HGNC:1093 | \n",
- " RO:0002206 | \n",
- " RO:0002206 | \n",
- " 5.0,6.0 | \n",
- " 0.0,1e-12 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " -2.470632 | \n",
- " 1.081173e-02 | \n",
- " 6 | \n",
- " BPGM | \n",
- " Epicardium-derived cells | \n",
- " HGNC:1093 | \n",
- " Epicardium-derived-cells | \n",
- " HGNC-1093 | \n",
- " SCHEART | \n",
- " Epicardium-derived-cells-HGNC-1093 | \n",
- " SCHEART Epicardium-derived-cells-HGNC-1093 | \n",
- " author_defined_cluster | \n",
- " author_defined_cluster Epicardium-derived-cells | \n",
- " HGNC HGNC:1093 | \n",
- " RO:0002206 | \n",
- " RO:0002206 | \n",
- " -2.5,-2.0 | \n",
- " 0.01,0.02 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " -1.961892 | \n",
- " 1.570796e-05 | \n",
- " 14 | \n",
- " BPGM | \n",
- " Ventricular cardiomyocytes | \n",
- " HGNC:1093 | \n",
- " Ventricular-cardiomyocytes | \n",
- " HGNC-1093 | \n",
- " SCHEART | \n",
- " Ventricular-cardiomyocytes-HGNC-1093 | \n",
- " SCHEART Ventricular-cardiomyocytes-HGNC-1093 | \n",
- " author_defined_cluster | \n",
- " author_defined_cluster Ventricular-cardiomyocytes | \n",
- " HGNC HGNC:1093 | \n",
- " RO:0002206 | \n",
- " RO:0002206 | \n",
- " -2.0,-1.75 | \n",
- " 1e-05,0.0001 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 5.509262 | \n",
- " 0.000000e+00 | \n",
- " 7 | \n",
- " MT1H | \n",
- " Erythrocytes | \n",
- " HGNC:7400 | \n",
- " Erythrocytes | \n",
- " HGNC-7400 | \n",
- " SCHEART | \n",
- " Erythrocytes-HGNC-7400 | \n",
- " SCHEART Erythrocytes-HGNC-7400 | \n",
- " author_defined_cluster | \n",
- " author_defined_cluster Erythrocytes | \n",
- " HGNC HGNC:7400 | \n",
- " RO:0002206 | \n",
- " RO:0002206 | \n",
- " 5.0,6.0 | \n",
- " 0.0,1e-12 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 4.710317 | \n",
- " 0.000000e+00 | \n",
- " 7 | \n",
- " GYPB | \n",
- " Erythrocytes | \n",
- " HGNC:4703 | \n",
- " Erythrocytes | \n",
- " HGNC-4703 | \n",
- " SCHEART | \n",
- " Erythrocytes-HGNC-4703 | \n",
- " SCHEART Erythrocytes-HGNC-4703 | \n",
- " author_defined_cluster | \n",
- " author_defined_cluster Erythrocytes | \n",
- " HGNC HGNC:4703 | \n",
- " RO:0002206 | \n",
- " RO:0002206 | \n",
- " 4.0,5.0 | \n",
- " 0.0,1e-12 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 2811 | \n",
- " 1.037628 | \n",
- " 1.397046e-43 | \n",
- " 9 | \n",
- " EPS8 | \n",
- " Fibroblast-like (related to larger vascular de... | \n",
- " HGNC:3420 | \n",
- " Fibroblast-like-(related-to-larger-vascular-de... | \n",
- " HGNC-3420 | \n",
- " SCHEART | \n",
- " Fibroblast-like-(related-to-larger-vascular-de... | \n",
- " SCHEART Fibroblast-like-(related-to-larger-vas... | \n",
- " author_defined_cluster | \n",
- " author_defined_cluster Fibroblast-like-(relate... | \n",
- " HGNC HGNC:3420 | \n",
- " RO:0002206 | \n",
- " RO:0002206 | \n",
- " 1.0,1.25 | \n",
- " 0.0,1e-12 | \n",
- "
\n",
- " \n",
- " 2812 | \n",
- " 1.006935 | \n",
- " 5.950515e-42 | \n",
- " 9 | \n",
- " SEMA5A | \n",
- " Fibroblast-like (related to larger vascular de... | \n",
- " HGNC:10736 | \n",
- " Fibroblast-like-(related-to-larger-vascular-de... | \n",
- " HGNC-10736 | \n",
- " SCHEART | \n",
- " Fibroblast-like-(related-to-larger-vascular-de... | \n",
- " SCHEART Fibroblast-like-(related-to-larger-vas... | \n",
- " author_defined_cluster | \n",
- " author_defined_cluster Fibroblast-like-(relate... | \n",
- " HGNC HGNC:10736 | \n",
- " RO:0002206 | \n",
- " RO:0002206 | \n",
- " 1.0,1.25 | \n",
- " 0.0,1e-12 | \n",
- "
\n",
- " \n",
- " 2813 | \n",
- " 1.072414 | \n",
- " 8.713596e-40 | \n",
- " 9 | \n",
- " MFGE8 | \n",
- " Fibroblast-like (related to larger vascular de... | \n",
- " HGNC:7036 | \n",
- " Fibroblast-like-(related-to-larger-vascular-de... | \n",
- " HGNC-7036 | \n",
- " SCHEART | \n",
- " Fibroblast-like-(related-to-larger-vascular-de... | \n",
- " SCHEART Fibroblast-like-(related-to-larger-vas... | \n",
- " author_defined_cluster | \n",
- " author_defined_cluster Fibroblast-like-(relate... | \n",
- " HGNC HGNC:7036 | \n",
- " RO:0002206 | \n",
- " RO:0002206 | \n",
- " 1.0,1.25 | \n",
- " 0.0,1e-12 | \n",
- "
\n",
- " \n",
- " 2814 | \n",
- " 1.082802 | \n",
- " 1.299664e-38 | \n",
- " 9 | \n",
- " ZEB2 | \n",
- " Fibroblast-like (related to larger vascular de... | \n",
- " HGNC:14881 | \n",
- " Fibroblast-like-(related-to-larger-vascular-de... | \n",
- " HGNC-14881 | \n",
- " SCHEART | \n",
- " Fibroblast-like-(related-to-larger-vascular-de... | \n",
- " SCHEART Fibroblast-like-(related-to-larger-vas... | \n",
- " author_defined_cluster | \n",
- " author_defined_cluster Fibroblast-like-(relate... | \n",
- " HGNC HGNC:14881 | \n",
- " RO:0002206 | \n",
- " RO:0002206 | \n",
- " 1.0,1.25 | \n",
- " 0.0,1e-12 | \n",
- "
\n",
- " \n",
- " 2815 | \n",
- " 1.015079 | \n",
- " 8.359097e-35 | \n",
- " 9 | \n",
- " PHGDH | \n",
- " Fibroblast-like (related to larger vascular de... | \n",
- " HGNC:8923 | \n",
- " Fibroblast-like-(related-to-larger-vascular-de... | \n",
- " HGNC-8923 | \n",
- " SCHEART | \n",
- " Fibroblast-like-(related-to-larger-vascular-de... | \n",
- " SCHEART Fibroblast-like-(related-to-larger-vas... | \n",
- " author_defined_cluster | \n",
- " author_defined_cluster Fibroblast-like-(relate... | \n",
- " HGNC HGNC:8923 | \n",
- " RO:0002206 | \n",
- " RO:0002206 | \n",
- " 1.0,1.25 | \n",
- " 0.0,1e-12 | \n",
- "
\n",
- " \n",
- "
\n",
- "
2816 rows × 18 columns
\n",
- "
"
- ],
- "text/plain": [
- " avg_log2FC p_val_adj cluster gene \n",
- "0 5.513025 0.000000e+00 7 BPGM \\\n",
- "1 -2.470632 1.081173e-02 6 BPGM \n",
- "2 -1.961892 1.570796e-05 14 BPGM \n",
- "3 5.509262 0.000000e+00 7 MT1H \n",
- "4 4.710317 0.000000e+00 7 GYPB \n",
- "... ... ... ... ... \n",
- "2811 1.037628 1.397046e-43 9 EPS8 \n",
- "2812 1.006935 5.950515e-42 9 SEMA5A \n",
- "2813 1.072414 8.713596e-40 9 MFGE8 \n",
- "2814 1.082802 1.299664e-38 9 ZEB2 \n",
- "2815 1.015079 8.359097e-35 9 PHGDH \n",
- "\n",
- " cluster_celltype_name hgnc_id \n",
- "0 Erythrocytes HGNC:1093 \\\n",
- "1 Epicardium-derived cells HGNC:1093 \n",
- "2 Ventricular cardiomyocytes HGNC:1093 \n",
- "3 Erythrocytes HGNC:7400 \n",
- "4 Erythrocytes HGNC:4703 \n",
- "... ... ... \n",
- "2811 Fibroblast-like (related to larger vascular de... HGNC:3420 \n",
- "2812 Fibroblast-like (related to larger vascular de... HGNC:10736 \n",
- "2813 Fibroblast-like (related to larger vascular de... HGNC:7036 \n",
- "2814 Fibroblast-like (related to larger vascular de... HGNC:14881 \n",
- "2815 Fibroblast-like (related to larger vascular de... HGNC:8923 \n",
- "\n",
- " celltype_CODE hgnc_id_dashes \n",
- "0 Erythrocytes HGNC-1093 \\\n",
- "1 Epicardium-derived-cells HGNC-1093 \n",
- "2 Ventricular-cardiomyocytes HGNC-1093 \n",
- "3 Erythrocytes HGNC-7400 \n",
- "4 Erythrocytes HGNC-4703 \n",
- "... ... ... \n",
- "2811 Fibroblast-like-(related-to-larger-vascular-de... HGNC-3420 \n",
- "2812 Fibroblast-like-(related-to-larger-vascular-de... HGNC-10736 \n",
- "2813 Fibroblast-like-(related-to-larger-vascular-de... HGNC-7036 \n",
- "2814 Fibroblast-like-(related-to-larger-vascular-de... HGNC-14881 \n",
- "2815 Fibroblast-like-(related-to-larger-vascular-de... HGNC-8923 \n",
- "\n",
- " SAB CODE \n",
- "0 SCHEART Erythrocytes-HGNC-1093 \\\n",
- "1 SCHEART Epicardium-derived-cells-HGNC-1093 \n",
- "2 SCHEART Ventricular-cardiomyocytes-HGNC-1093 \n",
- "3 SCHEART Erythrocytes-HGNC-7400 \n",
- "4 SCHEART Erythrocytes-HGNC-4703 \n",
- "... ... ... \n",
- "2811 SCHEART Fibroblast-like-(related-to-larger-vascular-de... \n",
- "2812 SCHEART Fibroblast-like-(related-to-larger-vascular-de... \n",
- "2813 SCHEART Fibroblast-like-(related-to-larger-vascular-de... \n",
- "2814 SCHEART Fibroblast-like-(related-to-larger-vascular-de... \n",
- "2815 SCHEART Fibroblast-like-(related-to-larger-vascular-de... \n",
- "\n",
- " CODEID \n",
- "0 SCHEART Erythrocytes-HGNC-1093 \\\n",
- "1 SCHEART Epicardium-derived-cells-HGNC-1093 \n",
- "2 SCHEART Ventricular-cardiomyocytes-HGNC-1093 \n",
- "3 SCHEART Erythrocytes-HGNC-7400 \n",
- "4 SCHEART Erythrocytes-HGNC-4703 \n",
- "... ... \n",
- "2811 SCHEART Fibroblast-like-(related-to-larger-vas... \n",
- "2812 SCHEART Fibroblast-like-(related-to-larger-vas... \n",
- "2813 SCHEART Fibroblast-like-(related-to-larger-vas... \n",
- "2814 SCHEART Fibroblast-like-(related-to-larger-vas... \n",
- "2815 SCHEART Fibroblast-like-(related-to-larger-vas... \n",
- "\n",
- " celltype_SAB \n",
- "0 author_defined_cluster \\\n",
- "1 author_defined_cluster \n",
- "2 author_defined_cluster \n",
- "3 author_defined_cluster \n",
- "4 author_defined_cluster \n",
- "... ... \n",
- "2811 author_defined_cluster \n",
- "2812 author_defined_cluster \n",
- "2813 author_defined_cluster \n",
- "2814 author_defined_cluster \n",
- "2815 author_defined_cluster \n",
- "\n",
- " celltype_CODEID hgnc_codeID \n",
- "0 author_defined_cluster Erythrocytes HGNC HGNC:1093 \\\n",
- "1 author_defined_cluster Epicardium-derived-cells HGNC HGNC:1093 \n",
- "2 author_defined_cluster Ventricular-cardiomyocytes HGNC HGNC:1093 \n",
- "3 author_defined_cluster Erythrocytes HGNC HGNC:7400 \n",
- "4 author_defined_cluster Erythrocytes HGNC HGNC:4703 \n",
- "... ... ... \n",
- "2811 author_defined_cluster Fibroblast-like-(relate... HGNC HGNC:3420 \n",
- "2812 author_defined_cluster Fibroblast-like-(relate... HGNC HGNC:10736 \n",
- "2813 author_defined_cluster Fibroblast-like-(relate... HGNC HGNC:7036 \n",
- "2814 author_defined_cluster Fibroblast-like-(relate... HGNC HGNC:14881 \n",
- "2815 author_defined_cluster Fibroblast-like-(relate... HGNC HGNC:8923 \n",
- "\n",
- " hgnc_singleCell_predicate celltype_singleCell_predicate log2fc_bins \n",
- "0 RO:0002206 RO:0002206 5.0,6.0 \\\n",
- "1 RO:0002206 RO:0002206 -2.5,-2.0 \n",
- "2 RO:0002206 RO:0002206 -2.0,-1.75 \n",
- "3 RO:0002206 RO:0002206 5.0,6.0 \n",
- "4 RO:0002206 RO:0002206 4.0,5.0 \n",
- "... ... ... ... \n",
- "2811 RO:0002206 RO:0002206 1.0,1.25 \n",
- "2812 RO:0002206 RO:0002206 1.0,1.25 \n",
- "2813 RO:0002206 RO:0002206 1.0,1.25 \n",
- "2814 RO:0002206 RO:0002206 1.0,1.25 \n",
- "2815 RO:0002206 RO:0002206 1.0,1.25 \n",
- "\n",
- " pval_bins \n",
- "0 0.0,1e-12 \n",
- "1 0.01,0.02 \n",
- "2 1e-05,0.0001 \n",
- "3 0.0,1e-12 \n",
- "4 0.0,1e-12 \n",
- "... ... \n",
- "2811 0.0,1e-12 \n",
- "2812 0.0,1e-12 \n",
- "2813 0.0,1e-12 \n",
- "2814 0.0,1e-12 \n",
- "2815 0.0,1e-12 \n",
- "\n",
- "[2816 rows x 18 columns]"
- ]
- },
- "execution_count": 48,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "fulldata"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "metadata": {},
- "outputs": [],
- "source": [
- "fulldata['celltype_label'] = np.nan\n",
- "fulldata['singlecell_label'] = np.nan"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Now create nodes file\n",
- "Need to create a 'single-cell' nodes file and a 'cell type' nodes file"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Cell-type nodes: keyed by the cell-type CODEID. They carry no log2fc/pval bin columns.\n",
- "nodes_celltypes = fulldata[['celltype_CODEID', 'celltype_label']].rename(\n",
- "    columns={'celltype_CODEID': 'node_id', 'celltype_label': 'node_label'})\n",
- "\n",
- "# Single-cell nodes: keyed by the SCHEART CODEID.\n",
- "# Open question kept from the original notes: should log2fc_bins / pval_bins become\n",
- "# additional lower/upper threshold columns (4 float columns total)?\n",
- "nodes_singlecell = fulldata[['CODEID', 'singlecell_label']].rename(\n",
- "    columns={'CODEID': 'node_id', 'singlecell_label': 'node_label'})\n",
- "\n",
- "# Stack both node tables into the shared (node_id, node_label) layout.\n",
- "nodes = pd.concat([nodes_celltypes, nodes_singlecell])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Create log2FC edge file"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Object CODEID of each log2FC edge: the bin string prefixed with the LOG2FCBINS SAB.\n",
- "fulldata['CODEID_log2FC'] = ['LOG2FCBINS ' + bin_range for bin_range in fulldata['log2fc_bins']]\n",
- "\n",
- "# Take an explicit copy: a later cell adds a 'predicate' column to this frame, and assigning\n",
- "# to a bare slice of fulldata is a chained assignment (SettingWithCopyWarning, which the\n",
- "# notebook-wide warnings filter would otherwise hide).\n",
- "edges_scHeart_log2FC = fulldata[['CODEID', 'CODEID_log2FC']].copy()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "metadata": {},
- "outputs": [],
- "source": [
- "edges_scHeart_log2FC['predicate'] = 'log2FC'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "metadata": {},
- "outputs": [],
- "source": [
- "edges_scHeart_log2FC = edges_scHeart_log2FC[['CODEID','predicate','CODEID_log2FC']]\n",
- "edges_scHeart_log2FC.columns = ['subject','predicate','object']\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "metadata": {},
- "outputs": [],
- "source": [
- "edges = pd.concat([edges,edges_scHeart_log2FC])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 55,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " subject | \n",
- " predicate | \n",
- " object | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 1016 | \n",
- " SCHEART Fibroblast-like-(related-to-smaller-va... | \n",
- " RO:0002206 | \n",
- " HGNC HGNC:9630 | \n",
- "
\n",
- " \n",
- " 2635 | \n",
- " SCHEART Cardiac-neural-crest-cells-and-Schwann... | \n",
- " RO:0002206 | \n",
- " HGNC HGNC:1704 | \n",
- "
\n",
- " \n",
- " 507 | \n",
- " SCHEART Fibroblast-like-(related-to-larger-vas... | \n",
- " RO:0002206 | \n",
- " author_defined_cluster Fibroblast-like-(relate... | \n",
- "
\n",
- " \n",
- " 205 | \n",
- " SCHEART Erythrocytes-HGNC-695 | \n",
- " RO:0002206 | \n",
- " author_defined_cluster Erythrocytes | \n",
- "
\n",
- " \n",
- " 2640 | \n",
- " SCHEART Cardiac-neural-crest-cells-and-Schwann... | \n",
- " log2FC | \n",
- " LOG2FCBINS 1.0,1.25 | \n",
- "
\n",
- " \n",
- " 2427 | \n",
- " SCHEART Capillary-endothelium-HGNC-3176 | \n",
- " RO:0002206 | \n",
- " author_defined_cluster Capillary-endothelium | \n",
- "
\n",
- " \n",
- " 2320 | \n",
- " SCHEART Immune-cells-HGNC-1765 | \n",
- " log2FC | \n",
- " LOG2FCBINS 1.0,1.25 | \n",
- "
\n",
- " \n",
- " 2425 | \n",
- " SCHEART Capillary-endothelium-HGNC-1759 | \n",
- " RO:0002206 | \n",
- " HGNC HGNC:1759 | \n",
- "
\n",
- " \n",
- " 1172 | \n",
- " SCHEART Erythrocytes-HGNC-21689 | \n",
- " RO:0002206 | \n",
- " HGNC HGNC:21689 | \n",
- "
\n",
- " \n",
- " 2604 | \n",
- " SCHEART Cardiac-neural-crest-cells-and-Schwann... | \n",
- " log2FC | \n",
- " LOG2FCBINS 2.0,2.5 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " subject predicate \n",
- "1016 SCHEART Fibroblast-like-(related-to-smaller-va... RO:0002206 \\\n",
- "2635 SCHEART Cardiac-neural-crest-cells-and-Schwann... RO:0002206 \n",
- "507 SCHEART Fibroblast-like-(related-to-larger-vas... RO:0002206 \n",
- "205 SCHEART Erythrocytes-HGNC-695 RO:0002206 \n",
- "2640 SCHEART Cardiac-neural-crest-cells-and-Schwann... log2FC \n",
- "2427 SCHEART Capillary-endothelium-HGNC-3176 RO:0002206 \n",
- "2320 SCHEART Immune-cells-HGNC-1765 log2FC \n",
- "2425 SCHEART Capillary-endothelium-HGNC-1759 RO:0002206 \n",
- "1172 SCHEART Erythrocytes-HGNC-21689 RO:0002206 \n",
- "2604 SCHEART Cardiac-neural-crest-cells-and-Schwann... log2FC \n",
- "\n",
- " object \n",
- "1016 HGNC HGNC:9630 \n",
- "2635 HGNC HGNC:1704 \n",
- "507 author_defined_cluster Fibroblast-like-(relate... \n",
- "205 author_defined_cluster Erythrocytes \n",
- "2640 LOG2FCBINS 1.0,1.25 \n",
- "2427 author_defined_cluster Capillary-endothelium \n",
- "2320 LOG2FCBINS 1.0,1.25 \n",
- "2425 HGNC HGNC:1759 \n",
- "1172 HGNC HGNC:21689 \n",
- "2604 LOG2FCBINS 2.0,2.5 "
- ]
- },
- "execution_count": 55,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "edges.sample(10)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Save edges"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 58,
- "metadata": {},
- "outputs": [],
- "source": [
- "edges.to_csv('/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryFiles/scHeart/OWLNETS_edgelist.txt',\n",
- " sep='\\t',index=False)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Need nodes for the avg log2FC bins but NOT for the p-value bins (GTEx ingest includes P_VALUE_BINS)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 59,
- "metadata": {},
- "outputs": [],
- "source": [
- "nodes_log2fc = pd.DataFrame(edges_scHeart_log2FC['object'].drop_duplicates())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 60,
- "metadata": {},
- "outputs": [],
- "source": [
- "nodes_log2fc['node_label'] = np.nan"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 61,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " node_id | \n",
- " node_label | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " LOG2FCBINS 5.0,6.0 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " LOG2FCBINS -2.5,-2.0 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " LOG2FCBINS -2.0,-1.75 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " node_id node_label\n",
- "0 LOG2FCBINS 5.0,6.0 NaN\n",
- "1 LOG2FCBINS -2.5,-2.0 NaN\n",
- "2 LOG2FCBINS -2.0,-1.75 NaN"
- ]
- },
- "execution_count": 61,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "nodes_log2fc.columns = ['node_id','node_label']\n",
- "nodes_log2fc.head(3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 62,
- "metadata": {},
- "outputs": [],
- "source": [
- "nodes = pd.concat([nodes,nodes_log2fc])\n",
- "\n",
- "nodes = nodes.drop_duplicates(subset=['node_id'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 63,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'author_defined_cluster': 14, 'SCHEART': 2816, 'LOG2FCBINS': 17}"
- ]
- },
- "execution_count": 63,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "dict(Counter([i[0] for i in nodes['node_id'].str.split(' ')]))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 64,
- "metadata": {},
- "outputs": [],
- "source": [
- "# The other node-metadata columns can be NaN; they are created here only as empty placeholders.\n",
- "nodes['node_synonyms'] = np.nan\n",
- "nodes['node_namespace'] = np.nan\n",
- "nodes['node_dbxrefs'] = np.nan\n",
- "nodes['node_definition'] = np.nan\n",
- "\n",
- "# Sanity check on the de-duplicated node count: 14 author_defined_cluster + 2816 SCHEART\n",
- "# + 17 LOG2FCBINS node_ids (the per-SAB counts displayed by the Counter cell) = 2847.\n",
- "assert len(nodes) == 2847"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 66,
- "metadata": {},
- "outputs": [],
- "source": [
- "def fill_missing_cols(df):\n",
- "    \"\"\"Return ``df`` with every missing node-metadata column appended as all-NaN.\n",
- "\n",
- "    Parameters\n",
- "    ----------\n",
- "    df : pandas.DataFrame\n",
- "        Node table; must contain a 'node_id' column.\n",
- "\n",
- "    Returns\n",
- "    -------\n",
- "    pandas.DataFrame\n",
- "        A new frame holding the columns of ``df`` followed by the missing expected\n",
- "        columns, filled with NaN and sharing the index of ``df``. ``df`` is not modified.\n",
- "\n",
- "    Raises\n",
- "    ------\n",
- "    ValueError\n",
- "        If ``df`` has no 'node_id' column.\n",
- "    \"\"\"\n",
- "    if 'node_id' not in df.columns:\n",
- "        raise ValueError('Must have at least a \"node_id\" column.')\n",
- "\n",
- "    # An ordered list rather than a set: iterating a set of strings can yield a different\n",
- "    # order in every interpreter session, which made the column order of the saved file\n",
- "    # vary from run to run.\n",
- "    expected_cols = ['node_label', 'node_synonyms', 'node_dbxrefs',\n",
- "                     'node_definition', 'node_namespace', 'value',\n",
- "                     'lowerbound', 'upperbound', 'unit']\n",
- "\n",
- "    missing_cols = [col for col in expected_cols if col not in df.columns]\n",
- "    nan_cols_df = pd.DataFrame(np.full([len(df), len(missing_cols)], np.nan),\n",
- "                               index=df.index, columns=missing_cols)\n",
- "    return pd.concat([df, nan_cols_df], axis=1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 68,
- "metadata": {},
- "outputs": [],
- "source": [
- "nodes = fill_missing_cols(nodes)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 70,
- "metadata": {},
- "outputs": [],
- "source": [
- "nodes.to_csv('/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryFiles/scHeart/OWLNETS_node_metadata.txt',\n",
- " sep='\\t',index=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 57,
- "metadata": {},
- "outputs": [],
- "source": [
- "a=set(edges[edges['object'].str.startswith('scHeart_PMID_31835037 ')]['object'].drop_duplicates())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 58,
- "metadata": {},
- "outputs": [],
- "source": [
- "b=set(edges[edges['subject'].str.startswith('scHeart_PMID_31835037 ')]['subject'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 66,
- "metadata": {},
- "outputs": [],
- "source": [
- "c=set(nodes[nodes['node_id'].str.startswith('scHeart_PMID_31835037')]['node_id'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 67,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 67,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "a == b == c"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "metadata": {},
- "outputs": [],
- "source": [
- "# SABs: author_defined_cluster, scHeart_PMID_31835037, LOG2FC_BINS"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 56,
- "metadata": {},
- "outputs": [],
- "source": [
- "pd.options.display.max_colwidth = 100"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Cypher checks\n",
- "\n",
- "# match (cc:Code {SAB:'scHeart_PMID_31835037'}) return count(distinct cc) # 2816\n",
- "# match (cc:Code {SAB:'author_defined_cluster'}) return count(distinct cc) # 14\n",
- "# match (cc:Code {SAB:'LOG2FC_BINS'}) return cc.CODE # 17"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 55,
- "metadata": {},
- "outputs": [],
- "source": [
- "celltypes_in_kg = [\"Erythrocytes\",\n",
- "\"Epicardium-derived-cells\",\n",
- "\"Ventricular-cardiomyocytes\",\n",
- "\"Fibroblast-like-(related-to-cardiac-skeleton-connective-tissue)\",\n",
- "\"Capillary-endothelium\",\n",
- "\"Immune-cells\",\n",
- "\"Fibroblast-like-(related-to-smaller-vascular-development)\",\n",
- "\"Cardiac-neural-crest-cells-&-Schwann-progenitor-cells\",\n",
- "\"Fibroblast-like-(related-to-larger-vascular-development)\",\n",
- "\"Epicardial-cells\",\n",
- "\"Myoz2-enriched-cardiomyocytes\",\n",
- "\"Atrial-cardiomyocytes\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 65,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Cell-type node_ids have the form '<SAB> <CODE>' with SAB 'author_defined_cluster'\n",
- "# (they come from celltype_CODEID). The earlier 'author-' prefix matched no rows, so the\n",
- "# list was always empty.\n",
- "all_cts = [node_id.split(' ')[1]\n",
- "           for node_id in nodes['node_id']\n",
- "           if node_id.startswith('author_defined_cluster ')]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [],
- "source": [
- "#set(all_cts) - set(celltypes_in_kg)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 75,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "14"
- ]
- },
- "execution_count": 75,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(np.unique([i[1] for i in edges[edges['subject'].str.startswith('author-')]['subject'].str.split(' ')]))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Create p-value edges (TODO: still needs to be completed)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 68,
- "metadata": {
- "collapsed": true
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " subject | \n",
- " predicate | \n",
- " object | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " scHeart_PMID:31835037 Erythrocytes_HGNC:1093 | \n",
- " p_value | \n",
- " P_VALUE_BINS 0.0,1e-12 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " scHeart_PMID:31835037 Epicardium-derived_cells... | \n",
- " p_value | \n",
- " P_VALUE_BINS 0.01,0.02 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " scHeart_PMID:31835037 Ventricular_cardiomyocyt... | \n",
- " p_value | \n",
- " P_VALUE_BINS 1e-05,0.0001 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " scHeart_PMID:31835037 Erythrocytes_HGNC:7400 | \n",
- " p_value | \n",
- " P_VALUE_BINS 0.0,1e-12 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " scHeart_PMID:31835037 Erythrocytes_HGNC:4703 | \n",
- " p_value | \n",
- " P_VALUE_BINS 0.0,1e-12 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 2811 | \n",
- " scHeart_PMID:31835037 Fibroblast-like_(related... | \n",
- " p_value | \n",
- " P_VALUE_BINS 0.0,1e-12 | \n",
- "
\n",
- " \n",
- " 2812 | \n",
- " scHeart_PMID:31835037 Fibroblast-like_(related... | \n",
- " p_value | \n",
- " P_VALUE_BINS 0.0,1e-12 | \n",
- "
\n",
- " \n",
- " 2813 | \n",
- " scHeart_PMID:31835037 Fibroblast-like_(related... | \n",
- " p_value | \n",
- " P_VALUE_BINS 0.0,1e-12 | \n",
- "
\n",
- " \n",
- " 2814 | \n",
- " scHeart_PMID:31835037 Fibroblast-like_(related... | \n",
- " p_value | \n",
- " P_VALUE_BINS 0.0,1e-12 | \n",
- "
\n",
- " \n",
- " 2815 | \n",
- " scHeart_PMID:31835037 Fibroblast-like_(related... | \n",
- " p_value | \n",
- " P_VALUE_BINS 0.0,1e-12 | \n",
- "
\n",
- " \n",
- "
\n",
- "
2816 rows × 3 columns
\n",
- "
"
- ],
- "text/plain": [
- " subject predicate \\\n",
- "0 scHeart_PMID:31835037 Erythrocytes_HGNC:1093 p_value \n",
- "1 scHeart_PMID:31835037 Epicardium-derived_cells... p_value \n",
- "2 scHeart_PMID:31835037 Ventricular_cardiomyocyt... p_value \n",
- "3 scHeart_PMID:31835037 Erythrocytes_HGNC:7400 p_value \n",
- "4 scHeart_PMID:31835037 Erythrocytes_HGNC:4703 p_value \n",
- "... ... ... \n",
- "2811 scHeart_PMID:31835037 Fibroblast-like_(related... p_value \n",
- "2812 scHeart_PMID:31835037 Fibroblast-like_(related... p_value \n",
- "2813 scHeart_PMID:31835037 Fibroblast-like_(related... p_value \n",
- "2814 scHeart_PMID:31835037 Fibroblast-like_(related... p_value \n",
- "2815 scHeart_PMID:31835037 Fibroblast-like_(related... p_value \n",
- "\n",
- " object \n",
- "0 P_VALUE_BINS 0.0,1e-12 \n",
- "1 P_VALUE_BINS 0.01,0.02 \n",
- "2 P_VALUE_BINS 1e-05,0.0001 \n",
- "3 P_VALUE_BINS 0.0,1e-12 \n",
- "4 P_VALUE_BINS 0.0,1e-12 \n",
- "... ... \n",
- "2811 P_VALUE_BINS 0.0,1e-12 \n",
- "2812 P_VALUE_BINS 0.0,1e-12 \n",
- "2813 P_VALUE_BINS 0.0,1e-12 \n",
- "2814 P_VALUE_BINS 0.0,1e-12 \n",
- "2815 P_VALUE_BINS 0.0,1e-12 \n",
- "\n",
- "[2816 rows x 3 columns]"
- ]
- },
- "execution_count": 68,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "'''\n",
- "fulldata['pval_bins_CODEID'] = ['P_VALUE_BINS '+i for i in fulldata['pval_bins']]\n",
- "\n",
- "edges_scHeart_pvals = fulldata[['CODEID','pval_bins_CODEID']]\n",
- "edges_scHeart_pvals['predicate'] = 'p_value'\n",
- "edges_scHeart_pvals = edges_scHeart_pvals[['CODEID','predicate','pval_bins_CODEID']]\n",
- "edges_scHeart_pvals.columns = ['subject','predicate','object']\n",
- "edges_scHeart_pvals'''"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " node_id | \n",
- " node_label | \n",
- " node_synonyms | \n",
- " node_namespace | \n",
- " node_dbxrefs | \n",
- " node_definition | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 2495 | \n",
- " scHeart-PMID-31835037 Atrial-cardiomyocytes-HG... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " node_id node_label \\\n",
- "2495 scHeart-PMID-31835037 Atrial-cardiomyocytes-HG... NaN \n",
- "\n",
- " node_synonyms node_namespace node_dbxrefs node_definition \n",
- "2495 NaN NaN NaN NaN "
- ]
- },
- "execution_count": 54,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "nodes[nodes['node_id'].str.contains('Atrial-cardiomyocytes-HGNC-10446')]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Create the Threshold Terms (they will be strings)\n",
- "\n",
- "Thresholds:\n",
- "- < .05\n",
- "- < .01\n",
- "- < .001\n",
- "- < .0001\n",
- "- < 1e-10"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Assign Threshold values "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 78,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " pval_threshold_1e-10 | \n",
- " pval_threshold_1e-4 | \n",
- " pval_threshold_1e-3 | \n",
- " pval_threshold_1e-2 | \n",
- " pval_threshold_.05 | \n",
- " p_val_adj | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " < 1e-10 | \n",
- " < 0.0001 | \n",
- " < 0.001 | \n",
- " < 0.01 | \n",
- " < 0.05 | \n",
- " 0.000000 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " < 0.05 | \n",
- " 0.010812 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " NaN | \n",
- " < 0.0001 | \n",
- " < 0.001 | \n",
- " < 0.01 | \n",
- " < 0.05 | \n",
- " 0.000016 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " pval_threshold_1e-10 pval_threshold_1e-4 pval_threshold_1e-3 \\\n",
- "0 < 1e-10 < 0.0001 < 0.001 \n",
- "1 NaN NaN NaN \n",
- "2 NaN < 0.0001 < 0.001 \n",
- "\n",
- " pval_threshold_1e-2 pval_threshold_.05 p_val_adj \n",
- "0 < 0.01 < 0.05 0.000000 \n",
- "1 NaN < 0.05 0.010812 \n",
- "2 < 0.01 < 0.05 0.000016 "
- ]
- },
- "execution_count": 78,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Flag each row with every significance threshold its adjusted p-value falls below.\n",
- "# Maps column name -> (cutoff, label); rows at or above a cutoff get NaN in that column.\n",
- "pval_thresholds = {\n",
- "    'pval_threshold_1e-10': (1e-10, '< 1e-10'),\n",
- "    'pval_threshold_1e-4': (1e-4, '< 0.0001'),\n",
- "    'pval_threshold_1e-3': (1e-3, '< 0.001'),\n",
- "    # Bug fix: this column used to compare against 1e-3, so it merely duplicated the\n",
- "    # '< 0.001' flags under a '< 0.01' label.\n",
- "    'pval_threshold_1e-2': (1e-2, '< 0.01'),\n",
- "    'pval_threshold_.05': (.05, '< 0.05'),\n",
- "}\n",
- "\n",
- "for col_name, (cutoff, label) in pval_thresholds.items():\n",
- "    fulldata[col_name] = [label if p < cutoff else np.nan for p in fulldata['p_val_adj']]\n",
- "\n",
- "# Threshold columns in declaration order, followed by the raw adjusted p-value for reference.\n",
- "thresh_df = fulldata[list(pval_thresholds) + ['p_val_adj']]\n",
- "\n",
- "thresh_df.head(3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 81,
- "metadata": {},
- "outputs": [],
- "source": [
- "pd.set_option('display.max_columns', None)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.11"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}