diff --git a/petagraph/code/preprocessing/ASP2019.ipynb b/petagraph/code/preprocessing/ASP2019.ipynb deleted file mode 100644 index 8846b67..0000000 --- a/petagraph/code/preprocessing/ASP2019.ipynb +++ /dev/null @@ -1,2712 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "from collections import Counter\n", - "#import matplotlib.pyplot as plt\n", - "import warnings\n", - "warnings.filterwarnings('ignore')\n", - "pd.set_option('display.max_columns', None)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# The end of this workflow is different from the original scHeart.ipynb notebook located in /Users/stearb/Dropbox/CHOP/R03/code/scHeart, we are using Jonathan Silversteins workflow for the Neo4j CSV creation (meaning the files produced by this workflow will be the inputs into JS's workflow) ...so we only need to create 2 files, a nodes.tsv and an edges.tsv (instead of the ~6 files, CUIs, CUI-CUIs, Code-CUIs, Terms, etc.)\n", - "\n", - "## The guide for how to create these new nodes and edges files can be found in the Data Distillerys [github](https://github.com/dbmi-pitt/UBKG/tree/main/user%20guide)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "#umls_dir = '/Users/stearb/Desktop/hubmap-kg/new_build_csv_data/'\n", - "umls_dir = '/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryDistributions/DataDistillery10Sept2023/DataDistillery10September2023/'" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# !jupyter nbconvert --to script scHeart_JS.ipynb" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "## Flatten/melt the average gene expression data so we have 1 gene and 1 tissue per row\n", - "#df = 
pd.read_csv('/Users/stearb/Desktop/R03_local/data/scHeart/asp_average_gene_expression_14_celltypes.csv')\n", - "#df.rename(columns={'Unnamed: 0':'Gene'},inplace=True)\n", - "#print(df.shape)\n", - "\n", - "#df2 = pd.melt(df, id_vars=\"Gene\",value_name=\"Average Exp\")\n", - "#df2['Average Exp'].plot(kind='hist',range=[0,10],bins=50)\n", - "#df2.to_csv('/Users/stearb/Desktop/R03_local/data/scHeart/asp_average_gene_expression_14_celltypes_LONG.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "#cui_codes_temp = pd.read_csv('/Users/stearb/Desktop/R03_local/data/scHeart/temp_files/CUI-CODEs.csv')\n", - "#cui_codes_temp" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2932\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
avg_log2FCp_val_adjclustergenecluster_celltype_name
05.5130250.07BPGMErythrocytes
15.5092620.07MT1HErythrocytes
24.7103170.07GYPBErythrocytes
\n", - "
" - ], - "text/plain": [ - " avg_log2FC p_val_adj cluster gene cluster_celltype_name\n", - "0 5.513025 0.0 7 BPGM Erythrocytes\n", - "1 5.509262 0.0 7 MT1H Erythrocytes\n", - "2 4.710317 0.0 7 GYPB Erythrocytes" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "z = pd.read_csv('/Users/stearb/Desktop/DESKTOP_TRANSFER/R03_local/data/scHeart/asp_cell_type_markers_zeros_labelled.csv')\n", - "z.drop(['p_val','pct.1','pct.2'],axis=1,inplace=True)\n", - "print(len(z))\n", - "z.head(3) # this data has already been filtered to remove p-vals greater than .06" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2816\n" - ] - } - ], - "source": [ - "#### Merge in HGNC IDs\n", - "hgnc_master = pd.read_csv('/Users/stearb/Desktop/DESKTOP_TRANSFER/R03_local/data/use_config/HELPER_FILES/hgnc_master.txt',sep='\\t')\n", - "gene_map = hgnc_master[['hgnc_id','symbol']].rename(columns={'symbol':'gene'})\n", - "\n", - "z2 = pd.merge(z,gene_map,on='gene')\n", - "\n", - "print(len(z2))" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "# We lost 116 rows when merging in the hgnc ids." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "# Here are the rows we lost.\n", - "#z[~z['gene'].isin(gene_map['gene'])].sample(3)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryDistributions/DataDistillery10Sept2023/DataDistillery10September2023/'" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "umls_dir" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0\n" - ] - } - ], - "source": [ - "#### Merge in HGNC CUIs\n", - "\n", - "# First, GET CUI - HGNC CODE MAPPINGS STRAIGHT FROM CSVs\n", - "UMLS_CUI_CODEs = pd.read_csv(umls_dir+'CUI-CODEs.csv')\n", - "\n", - "umls_genes = UMLS_CUI_CODEs[UMLS_CUI_CODEs[':END_ID'].str.startswith('HGNC')].rename(\n", - " columns={':START_ID':'CUI_hgnc',':END_ID':'hgnc_id'})\n", - "\n", - "umls_genes['hgnc_id'] = [i.split(':')[1] for i in umls_genes['hgnc_id']]\n", - "\n", - "z3 = pd.merge(z2,umls_genes,on='hgnc_id')\n", - "\n", - "print(len(z3))" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "7285 3513\n", - "12399 546\n", - "12404 1836\n", - "12405 1952\n", - "12415 3444\n", - " ... 
\n", - "17234239 9634\n", - "17234240 11445\n", - "17234241 4883\n", - "17234242 14007\n", - "17234243 10001\n", - "Name: hgnc_id, Length: 43178, dtype: object" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "umls_genes['hgnc_id']" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "z3 = z2" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "z3['cluster_celltype_name_NO_SPACE'] = [i.replace(' ','-') for i in z3['cluster_celltype_name'] ]\n", - "z3['cluster_celltype_name_NO_SPACE'] = [i.replace('_','-') for i in z3['cluster_celltype_name_NO_SPACE'] ]\n", - "\n", - "z3['cluster_celltype_name_NO_SPACE'] = [i.replace('/','slash') for i in z3['cluster_celltype_name_NO_SPACE'] ]\n", - "z3['cluster_celltype_name_NO_SPACE'] = [i.replace('&','and') for i in z3['cluster_celltype_name_NO_SPACE'] ]\n", - "\n", - "z3['hgnc_id_dashes'] = [ i.replace(':','-') for i in z3['hgnc_id'] ]" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
avg_log2FCp_val_adjclustergenecluster_celltype_namehgnc_idcluster_celltype_name_NO_SPACEhgnc_id_dashes
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [avg_log2FC, p_val_adj, cluster, gene, cluster_celltype_name, hgnc_id, cluster_celltype_name_NO_SPACE, hgnc_id_dashes]\n", - "Index: []" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "z3[z3['cluster_celltype_name_NO_SPACE'].str.contains('&')]" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
avg_log2FCp_val_adjclustergenecluster_celltype_namehgnc_idcluster_celltype_name_NO_SPACEhgnc_id_dashesSABCODECODEID
05.5130250.0000007BPGMErythrocytesHGNC:1093ErythrocytesHGNC-1093SCHEARTErythrocytes-HGNC-1093SCHEART Erythrocytes-HGNC-1093
1-2.4706320.0108126BPGMEpicardium-derived cellsHGNC:1093Epicardium-derived-cellsHGNC-1093SCHEARTEpicardium-derived-cells-HGNC-1093SCHEART Epicardium-derived-cells-HGNC-1093
2-1.9618920.00001614BPGMVentricular cardiomyocytesHGNC:1093Ventricular-cardiomyocytesHGNC-1093SCHEARTVentricular-cardiomyocytes-HGNC-1093SCHEART Ventricular-cardiomyocytes-HGNC-1093
\n", - "
" - ], - "text/plain": [ - " avg_log2FC p_val_adj cluster gene cluster_celltype_name \n", - "0 5.513025 0.000000 7 BPGM Erythrocytes \\\n", - "1 -2.470632 0.010812 6 BPGM Epicardium-derived cells \n", - "2 -1.961892 0.000016 14 BPGM Ventricular cardiomyocytes \n", - "\n", - " hgnc_id cluster_celltype_name_NO_SPACE hgnc_id_dashes SAB \n", - "0 HGNC:1093 Erythrocytes HGNC-1093 SCHEART \\\n", - "1 HGNC:1093 Epicardium-derived-cells HGNC-1093 SCHEART \n", - "2 HGNC:1093 Ventricular-cardiomyocytes HGNC-1093 SCHEART \n", - "\n", - " CODE \n", - "0 Erythrocytes-HGNC-1093 \\\n", - "1 Epicardium-derived-cells-HGNC-1093 \n", - "2 Ventricular-cardiomyocytes-HGNC-1093 \n", - "\n", - " CODEID \n", - "0 SCHEART Erythrocytes-HGNC-1093 \n", - "1 SCHEART Epicardium-derived-cells-HGNC-1093 \n", - "2 SCHEART Ventricular-cardiomyocytes-HGNC-1093 " - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "### Create the scHeart SAB\n", - "#z3['SAB'] = 'scHeart_PMID_31835037'\n", - "\n", - "z3['SAB'] = 'SCHEART'\n", - "\n", - "# Create scHeart CODE\n", - "\n", - "# assert that the celltype and hgncid col will uniquely identify each code node\n", - "assert len(z3) == len(z3[['cluster_celltype_name_NO_SPACE','hgnc_id']].drop_duplicates()) \n", - "\n", - "z3['CODE'] = z3['cluster_celltype_name_NO_SPACE'] + '-' + z3['hgnc_id_dashes']\n", - "\n", - "# Create scHeart CODEID\n", - "z3['CODEID'] = z3['SAB'] + ' ' + z3['CODE']\n", - "\n", - "# Create scHeart CUIs\n", - "#z3['CUI:ID'] = 'CUI_SAB'+z3['CODEID']\n", - "\n", - "#z3.drop(['cluster','gene','hgnc_id'],axis=1,inplace=True)\n", - "\n", - "z3.head(3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Create cell type nodes" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "# Now create the celltype nodes\n", - "z3.rename(columns={'cluster_celltype_name_NO_SPACE':'celltype_CODE'},inplace=True)" - 
] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "z3['celltype_CODE'] = [i.replace('/','slash') for i in z3['celltype_CODE'] ]\n", - "z3['celltype_CODE'] = [i.replace('&','and') for i in z3['celltype_CODE'] ]" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "z3['celltype_SAB'] = 'author_defined_cluster'\n", - "z3['celltype_CODEID'] = z3['celltype_SAB'] + ' ' + z3['celltype_CODE']\n", - "#z3['celltype_CUI'] = 'CUI_SAB ' + z3['celltype_CODEID']" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
avg_log2FCp_val_adjclustergenecluster_celltype_namehgnc_idcelltype_CODEhgnc_id_dashesSABCODECODEIDcelltype_SABcelltype_CODEID
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [avg_log2FC, p_val_adj, cluster, gene, cluster_celltype_name, hgnc_id, celltype_CODE, hgnc_id_dashes, SAB, CODE, CODEID, celltype_SAB, celltype_CODEID]\n", - "Index: []" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "z3[z3['celltype_CODE'].str.contains('&')]" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
avg_log2FCp_val_adjclustergenecluster_celltype_namehgnc_idcelltype_CODEhgnc_id_dashesSABCODECODEIDcelltype_SABcelltype_CODEID
05.5130250.0000007BPGMErythrocytesHGNC:1093ErythrocytesHGNC-1093SCHEARTErythrocytes-HGNC-1093SCHEART Erythrocytes-HGNC-1093author_defined_clusterauthor_defined_cluster Erythrocytes
1-2.4706320.0108126BPGMEpicardium-derived cellsHGNC:1093Epicardium-derived-cellsHGNC-1093SCHEARTEpicardium-derived-cells-HGNC-1093SCHEART Epicardium-derived-cells-HGNC-1093author_defined_clusterauthor_defined_cluster Epicardium-derived-cells
2-1.9618920.00001614BPGMVentricular cardiomyocytesHGNC:1093Ventricular-cardiomyocytesHGNC-1093SCHEARTVentricular-cardiomyocytes-HGNC-1093SCHEART Ventricular-cardiomyocytes-HGNC-1093author_defined_clusterauthor_defined_cluster Ventricular-cardiomyocytes
\n", - "
" - ], - "text/plain": [ - " avg_log2FC p_val_adj cluster gene cluster_celltype_name \n", - "0 5.513025 0.000000 7 BPGM Erythrocytes \\\n", - "1 -2.470632 0.010812 6 BPGM Epicardium-derived cells \n", - "2 -1.961892 0.000016 14 BPGM Ventricular cardiomyocytes \n", - "\n", - " hgnc_id celltype_CODE hgnc_id_dashes SAB \n", - "0 HGNC:1093 Erythrocytes HGNC-1093 SCHEART \\\n", - "1 HGNC:1093 Epicardium-derived-cells HGNC-1093 SCHEART \n", - "2 HGNC:1093 Ventricular-cardiomyocytes HGNC-1093 SCHEART \n", - "\n", - " CODE \n", - "0 Erythrocytes-HGNC-1093 \\\n", - "1 Epicardium-derived-cells-HGNC-1093 \n", - "2 Ventricular-cardiomyocytes-HGNC-1093 \n", - "\n", - " CODEID celltype_SAB \n", - "0 SCHEART Erythrocytes-HGNC-1093 author_defined_cluster \\\n", - "1 SCHEART Epicardium-derived-cells-HGNC-1093 author_defined_cluster \n", - "2 SCHEART Ventricular-cardiomyocytes-HGNC-1093 author_defined_cluster \n", - "\n", - " celltype_CODEID \n", - "0 author_defined_cluster Erythrocytes \n", - "1 author_defined_cluster Epicardium-derived-cells \n", - "2 author_defined_cluster Ventricular-cardiomyocytes " - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "z3.head(3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load in main dataset\n", - "The data_w_bins.csv file is the most up-to-date" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "#fulldata = pd.read_csv('/Users/stearb/Desktop/R03_local/data/scHeart/data_w_bins.csv')\n", - "#fulldata['CODEID'] = fulldata['SAB'] + fulldata['preCODE']\n", - "#fulldata['CUI:ID'] = CUIbase64(fulldata['CODEID'])\n", - "#fulldata.head(3)\n", - "\n", - "fulldata = z3" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Remove rows w/ p val greater than .06 " - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "name": "stdout", 
- "output_type": "stream", - "text": [ - "0\n" - ] - } - ], - "source": [ - "pre=len(fulldata)\n", - "\n", - "fulldata = fulldata[fulldata['p_val_adj'] < .06]\n", - "\n", - "post=len(fulldata)\n", - "\n", - "print(pre-post)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['author_defined_cluster Erythrocytes',\n", - " 'author_defined_cluster Epicardium-derived-cells',\n", - " 'author_defined_cluster Ventricular-cardiomyocytes',\n", - " 'author_defined_cluster Smooth-muscle-cells-slash-fibroblast-like)',\n", - " 'author_defined_cluster Fibroblast-like-(related-to-cardiac-skeleton-connective-tissue)',\n", - " 'author_defined_cluster Capillary-endothelium',\n", - " 'author_defined_cluster Endothelium-slash-pericytes-slash-adventitia',\n", - " 'author_defined_cluster Immune-cells',\n", - " 'author_defined_cluster Fibroblast-like-(related-to-smaller-vascular-development)',\n", - " 'author_defined_cluster Cardiac-neural-crest-cells-and-Schwann-progenitor-cells',\n", - " 'author_defined_cluster Fibroblast-like-(related-to-larger-vascular-development)',\n", - " 'author_defined_cluster Epicardial-cells',\n", - " 'author_defined_cluster Myoz2-enriched-cardiomyocytes',\n", - " 'author_defined_cluster Atrial-cardiomyocytes'], dtype=object)" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fulldata['celltype_CODEID'].unique() " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Create edges file" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
subjectpredicateobject
0SCHEART Erythrocytes-HGNC-1093RO:0002206HGNC HGNC:1093
1SCHEART Epicardium-derived-cells-HGNC-1093RO:0002206HGNC HGNC:1093
\n", - "
" - ], - "text/plain": [ - " subject predicate object\n", - "0 SCHEART Erythrocytes-HGNC-1093 RO:0002206 HGNC HGNC:1093\n", - "1 SCHEART Epicardium-derived-cells-HGNC-1093 RO:0002206 HGNC HGNC:1093" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fulldata['hgnc_codeID'] = 'HGNC '+ fulldata['hgnc_id']\n", - "fulldata['hgnc_singleCell_predicate'] = 'RO:0002206' # 'expressed in' , OLD rel: 'gene_has_single_cell_expression'\n", - "\n", - "# hgnc to single cell\n", - "edges1 = fulldata[['CODEID','hgnc_singleCell_predicate','hgnc_codeID']]\n", - "edges1.columns = ['subject','predicate','object']\n", - "edges1.head(2)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
subjectpredicateobject
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [subject, predicate, object]\n", - "Index: []" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "edges1[edges1['object'].str.contains('&')]" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "fulldata['celltype_singleCell_predicate'] = 'RO:0002206' # 'expressed in' 'cell_type_has_single_cell_expression'\n", - "\n", - "# cell type to singe cell\n", - "edges2 = fulldata[['CODEID','celltype_singleCell_predicate','celltype_CODEID']]\n", - "edges2.columns = ['subject','predicate','object']" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
subjectpredicateobject
0SCHEART Erythrocytes-HGNC-1093RO:0002206author_defined_cluster Erythrocytes
1SCHEART Epicardium-derived-cells-HGNC-1093RO:0002206author_defined_cluster Epicardium-derived-cells
2SCHEART Ventricular-cardiomyocytes-HGNC-1093RO:0002206author_defined_cluster Ventricular-cardiomyocytes
3SCHEART Erythrocytes-HGNC-7400RO:0002206author_defined_cluster Erythrocytes
4SCHEART Erythrocytes-HGNC-4703RO:0002206author_defined_cluster Erythrocytes
............
2811SCHEART Fibroblast-like-(related-to-larger-vas...RO:0002206author_defined_cluster Fibroblast-like-(relate...
2812SCHEART Fibroblast-like-(related-to-larger-vas...RO:0002206author_defined_cluster Fibroblast-like-(relate...
2813SCHEART Fibroblast-like-(related-to-larger-vas...RO:0002206author_defined_cluster Fibroblast-like-(relate...
2814SCHEART Fibroblast-like-(related-to-larger-vas...RO:0002206author_defined_cluster Fibroblast-like-(relate...
2815SCHEART Fibroblast-like-(related-to-larger-vas...RO:0002206author_defined_cluster Fibroblast-like-(relate...
\n", - "

2816 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " subject predicate \n", - "0 SCHEART Erythrocytes-HGNC-1093 RO:0002206 \\\n", - "1 SCHEART Epicardium-derived-cells-HGNC-1093 RO:0002206 \n", - "2 SCHEART Ventricular-cardiomyocytes-HGNC-1093 RO:0002206 \n", - "3 SCHEART Erythrocytes-HGNC-7400 RO:0002206 \n", - "4 SCHEART Erythrocytes-HGNC-4703 RO:0002206 \n", - "... ... ... \n", - "2811 SCHEART Fibroblast-like-(related-to-larger-vas... RO:0002206 \n", - "2812 SCHEART Fibroblast-like-(related-to-larger-vas... RO:0002206 \n", - "2813 SCHEART Fibroblast-like-(related-to-larger-vas... RO:0002206 \n", - "2814 SCHEART Fibroblast-like-(related-to-larger-vas... RO:0002206 \n", - "2815 SCHEART Fibroblast-like-(related-to-larger-vas... RO:0002206 \n", - "\n", - " object \n", - "0 author_defined_cluster Erythrocytes \n", - "1 author_defined_cluster Epicardium-derived-cells \n", - "2 author_defined_cluster Ventricular-cardiomyocytes \n", - "3 author_defined_cluster Erythrocytes \n", - "4 author_defined_cluster Erythrocytes \n", - "... ... \n", - "2811 author_defined_cluster Fibroblast-like-(relate... \n", - "2812 author_defined_cluster Fibroblast-like-(relate... \n", - "2813 author_defined_cluster Fibroblast-like-(relate... \n", - "2814 author_defined_cluster Fibroblast-like-(relate... \n", - "2815 author_defined_cluster Fibroblast-like-(relate... \n", - "\n", - "[2816 rows x 3 columns]" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "edges2" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "edges = pd.concat([edges1,edges2])" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
subjectpredicateobject
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [subject, predicate, object]\n", - "Index: []" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "edges2[edges2['object'].str.contains('&')]" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [], - "source": [ - "assert len(edges1) == 2816\n", - "assert len(edges2) == 2816" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Scroll down to code to create bins for p-vals and log2fc for single cell expression Terms (past the CUI/CUI-CUI, CODEs files)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## SUIs and CODE-SUIs will both be created in 2 steps.\n", - "First create the p-value and log2fc Terms and then create the threshold Terms" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create bins for p-values and log2fc\n", - "I took the bins list straight from gtex, so we have the exact same bins for this data." - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [], - "source": [ - "# Split p-vals up into bins\n", - "\n", - "# Define pval Bins:\n", - "#### EXACT SAME BINS from GTEx notebook. 
######\n", - "bins = [0,1e-12,1e-11,1e-10,1e-9,1e-8,1e-7,1e-6,1e-5,1e-4,1e-3,.005,.01,.02,.03,.04,.05,.06]\n", - "\n", - "# Bin pvals\n", - "fulldata['pvalue_bins'] = pd.cut(fulldata['p_val_adj'], bins)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Now bin the log2FC column" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [], - "source": [ - "log2fc_bins_neg = [-5,-4,-3,-2.5,-2,-1.75,-1.5,-1.25,-1,-.75,-.5,-.25,-.2,-.15,-.1,-.05]\n", - "log2fc_bins_pos = [i*-1 for i in log2fc_bins_neg][::-1] # and reverse it.\n", - "log2fc_bins = log2fc_bins_neg + [0] + log2fc_bins_pos + [6,7]\n", - "fulldata['log2fc_bins'] = pd.cut(fulldata['avg_log2FC'], log2fc_bins)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [], - "source": [ - "# pvalues that are 0 need to be addded to the (0.0,1e-12] bin. The lowerbound for this bin is not inclusive so\n", - "# 0's are not automatically added to it.\n", - "\n", - "fulldata['pval_bins'] = [i if i is not np.nan else '(0.0,1e-12]' for i in fulldata['pvalue_bins']]\n", - "fulldata.drop('pvalue_bins',axis=1,inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "# Remove [] and () characters from intervals before creating SUIs\n", - "fulldata['log2fc_bins'] = [str(i)[1:-1] for i in fulldata['log2fc_bins']]\n", - "fulldata['pval_bins'] = [str(i)[1:-1] for i in fulldata['pval_bins']]\n", - "\n", - "fulldata['log2fc_bins'] = fulldata['log2fc_bins'].str.replace(' ','')\n", - "fulldata['pval_bins'] = fulldata['pval_bins'].str.replace(' ','')" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [], - "source": [ - "assert fulldata.isna().sum().sum() == 0\n", - "assert len(fulldata[fulldata['log2fc_bins'].isna()]) == 0" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": { - "collapsed": true - }, - 
"outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
avg_log2FCp_val_adjclustergenecluster_celltype_namehgnc_idcelltype_CODEhgnc_id_dashesSABCODECODEIDcelltype_SABcelltype_CODEIDhgnc_codeIDhgnc_singleCell_predicatecelltype_singleCell_predicatelog2fc_binspval_bins
05.5130250.000000e+007BPGMErythrocytesHGNC:1093ErythrocytesHGNC-1093SCHEARTErythrocytes-HGNC-1093SCHEART Erythrocytes-HGNC-1093author_defined_clusterauthor_defined_cluster ErythrocytesHGNC HGNC:1093RO:0002206RO:00022065.0,6.00.0,1e-12
1-2.4706321.081173e-026BPGMEpicardium-derived cellsHGNC:1093Epicardium-derived-cellsHGNC-1093SCHEARTEpicardium-derived-cells-HGNC-1093SCHEART Epicardium-derived-cells-HGNC-1093author_defined_clusterauthor_defined_cluster Epicardium-derived-cellsHGNC HGNC:1093RO:0002206RO:0002206-2.5,-2.00.01,0.02
2-1.9618921.570796e-0514BPGMVentricular cardiomyocytesHGNC:1093Ventricular-cardiomyocytesHGNC-1093SCHEARTVentricular-cardiomyocytes-HGNC-1093SCHEART Ventricular-cardiomyocytes-HGNC-1093author_defined_clusterauthor_defined_cluster Ventricular-cardiomyocytesHGNC HGNC:1093RO:0002206RO:0002206-2.0,-1.751e-05,0.0001
35.5092620.000000e+007MT1HErythrocytesHGNC:7400ErythrocytesHGNC-7400SCHEARTErythrocytes-HGNC-7400SCHEART Erythrocytes-HGNC-7400author_defined_clusterauthor_defined_cluster ErythrocytesHGNC HGNC:7400RO:0002206RO:00022065.0,6.00.0,1e-12
44.7103170.000000e+007GYPBErythrocytesHGNC:4703ErythrocytesHGNC-4703SCHEARTErythrocytes-HGNC-4703SCHEART Erythrocytes-HGNC-4703author_defined_clusterauthor_defined_cluster ErythrocytesHGNC HGNC:4703RO:0002206RO:00022064.0,5.00.0,1e-12
.........................................................
28111.0376281.397046e-439EPS8Fibroblast-like (related to larger vascular de...HGNC:3420Fibroblast-like-(related-to-larger-vascular-de...HGNC-3420SCHEARTFibroblast-like-(related-to-larger-vascular-de...SCHEART Fibroblast-like-(related-to-larger-vas...author_defined_clusterauthor_defined_cluster Fibroblast-like-(relate...HGNC HGNC:3420RO:0002206RO:00022061.0,1.250.0,1e-12
28121.0069355.950515e-429SEMA5AFibroblast-like (related to larger vascular de...HGNC:10736Fibroblast-like-(related-to-larger-vascular-de...HGNC-10736SCHEARTFibroblast-like-(related-to-larger-vascular-de...SCHEART Fibroblast-like-(related-to-larger-vas...author_defined_clusterauthor_defined_cluster Fibroblast-like-(relate...HGNC HGNC:10736RO:0002206RO:00022061.0,1.250.0,1e-12
28131.0724148.713596e-409MFGE8Fibroblast-like (related to larger vascular de...HGNC:7036Fibroblast-like-(related-to-larger-vascular-de...HGNC-7036SCHEARTFibroblast-like-(related-to-larger-vascular-de...SCHEART Fibroblast-like-(related-to-larger-vas...author_defined_clusterauthor_defined_cluster Fibroblast-like-(relate...HGNC HGNC:7036RO:0002206RO:00022061.0,1.250.0,1e-12
28141.0828021.299664e-389ZEB2Fibroblast-like (related to larger vascular de...HGNC:14881Fibroblast-like-(related-to-larger-vascular-de...HGNC-14881SCHEARTFibroblast-like-(related-to-larger-vascular-de...SCHEART Fibroblast-like-(related-to-larger-vas...author_defined_clusterauthor_defined_cluster Fibroblast-like-(relate...HGNC HGNC:14881RO:0002206RO:00022061.0,1.250.0,1e-12
28151.0150798.359097e-359PHGDHFibroblast-like (related to larger vascular de...HGNC:8923Fibroblast-like-(related-to-larger-vascular-de...HGNC-8923SCHEARTFibroblast-like-(related-to-larger-vascular-de...SCHEART Fibroblast-like-(related-to-larger-vas...author_defined_clusterauthor_defined_cluster Fibroblast-like-(relate...HGNC HGNC:8923RO:0002206RO:00022061.0,1.250.0,1e-12
\n", - "

2816 rows × 18 columns

\n", - "
" - ], - "text/plain": [ - " avg_log2FC p_val_adj cluster gene \n", - "0 5.513025 0.000000e+00 7 BPGM \\\n", - "1 -2.470632 1.081173e-02 6 BPGM \n", - "2 -1.961892 1.570796e-05 14 BPGM \n", - "3 5.509262 0.000000e+00 7 MT1H \n", - "4 4.710317 0.000000e+00 7 GYPB \n", - "... ... ... ... ... \n", - "2811 1.037628 1.397046e-43 9 EPS8 \n", - "2812 1.006935 5.950515e-42 9 SEMA5A \n", - "2813 1.072414 8.713596e-40 9 MFGE8 \n", - "2814 1.082802 1.299664e-38 9 ZEB2 \n", - "2815 1.015079 8.359097e-35 9 PHGDH \n", - "\n", - " cluster_celltype_name hgnc_id \n", - "0 Erythrocytes HGNC:1093 \\\n", - "1 Epicardium-derived cells HGNC:1093 \n", - "2 Ventricular cardiomyocytes HGNC:1093 \n", - "3 Erythrocytes HGNC:7400 \n", - "4 Erythrocytes HGNC:4703 \n", - "... ... ... \n", - "2811 Fibroblast-like (related to larger vascular de... HGNC:3420 \n", - "2812 Fibroblast-like (related to larger vascular de... HGNC:10736 \n", - "2813 Fibroblast-like (related to larger vascular de... HGNC:7036 \n", - "2814 Fibroblast-like (related to larger vascular de... HGNC:14881 \n", - "2815 Fibroblast-like (related to larger vascular de... HGNC:8923 \n", - "\n", - " celltype_CODE hgnc_id_dashes \n", - "0 Erythrocytes HGNC-1093 \\\n", - "1 Epicardium-derived-cells HGNC-1093 \n", - "2 Ventricular-cardiomyocytes HGNC-1093 \n", - "3 Erythrocytes HGNC-7400 \n", - "4 Erythrocytes HGNC-4703 \n", - "... ... ... \n", - "2811 Fibroblast-like-(related-to-larger-vascular-de... HGNC-3420 \n", - "2812 Fibroblast-like-(related-to-larger-vascular-de... HGNC-10736 \n", - "2813 Fibroblast-like-(related-to-larger-vascular-de... HGNC-7036 \n", - "2814 Fibroblast-like-(related-to-larger-vascular-de... HGNC-14881 \n", - "2815 Fibroblast-like-(related-to-larger-vascular-de... 
HGNC-8923 \n", - "\n", - " SAB CODE \n", - "0 SCHEART Erythrocytes-HGNC-1093 \\\n", - "1 SCHEART Epicardium-derived-cells-HGNC-1093 \n", - "2 SCHEART Ventricular-cardiomyocytes-HGNC-1093 \n", - "3 SCHEART Erythrocytes-HGNC-7400 \n", - "4 SCHEART Erythrocytes-HGNC-4703 \n", - "... ... ... \n", - "2811 SCHEART Fibroblast-like-(related-to-larger-vascular-de... \n", - "2812 SCHEART Fibroblast-like-(related-to-larger-vascular-de... \n", - "2813 SCHEART Fibroblast-like-(related-to-larger-vascular-de... \n", - "2814 SCHEART Fibroblast-like-(related-to-larger-vascular-de... \n", - "2815 SCHEART Fibroblast-like-(related-to-larger-vascular-de... \n", - "\n", - " CODEID \n", - "0 SCHEART Erythrocytes-HGNC-1093 \\\n", - "1 SCHEART Epicardium-derived-cells-HGNC-1093 \n", - "2 SCHEART Ventricular-cardiomyocytes-HGNC-1093 \n", - "3 SCHEART Erythrocytes-HGNC-7400 \n", - "4 SCHEART Erythrocytes-HGNC-4703 \n", - "... ... \n", - "2811 SCHEART Fibroblast-like-(related-to-larger-vas... \n", - "2812 SCHEART Fibroblast-like-(related-to-larger-vas... \n", - "2813 SCHEART Fibroblast-like-(related-to-larger-vas... \n", - "2814 SCHEART Fibroblast-like-(related-to-larger-vas... \n", - "2815 SCHEART Fibroblast-like-(related-to-larger-vas... \n", - "\n", - " celltype_SAB \n", - "0 author_defined_cluster \\\n", - "1 author_defined_cluster \n", - "2 author_defined_cluster \n", - "3 author_defined_cluster \n", - "4 author_defined_cluster \n", - "... ... 
\n", - "2811 author_defined_cluster \n", - "2812 author_defined_cluster \n", - "2813 author_defined_cluster \n", - "2814 author_defined_cluster \n", - "2815 author_defined_cluster \n", - "\n", - " celltype_CODEID hgnc_codeID \n", - "0 author_defined_cluster Erythrocytes HGNC HGNC:1093 \\\n", - "1 author_defined_cluster Epicardium-derived-cells HGNC HGNC:1093 \n", - "2 author_defined_cluster Ventricular-cardiomyocytes HGNC HGNC:1093 \n", - "3 author_defined_cluster Erythrocytes HGNC HGNC:7400 \n", - "4 author_defined_cluster Erythrocytes HGNC HGNC:4703 \n", - "... ... ... \n", - "2811 author_defined_cluster Fibroblast-like-(relate... HGNC HGNC:3420 \n", - "2812 author_defined_cluster Fibroblast-like-(relate... HGNC HGNC:10736 \n", - "2813 author_defined_cluster Fibroblast-like-(relate... HGNC HGNC:7036 \n", - "2814 author_defined_cluster Fibroblast-like-(relate... HGNC HGNC:14881 \n", - "2815 author_defined_cluster Fibroblast-like-(relate... HGNC HGNC:8923 \n", - "\n", - " hgnc_singleCell_predicate celltype_singleCell_predicate log2fc_bins \n", - "0 RO:0002206 RO:0002206 5.0,6.0 \\\n", - "1 RO:0002206 RO:0002206 -2.5,-2.0 \n", - "2 RO:0002206 RO:0002206 -2.0,-1.75 \n", - "3 RO:0002206 RO:0002206 5.0,6.0 \n", - "4 RO:0002206 RO:0002206 4.0,5.0 \n", - "... ... ... ... \n", - "2811 RO:0002206 RO:0002206 1.0,1.25 \n", - "2812 RO:0002206 RO:0002206 1.0,1.25 \n", - "2813 RO:0002206 RO:0002206 1.0,1.25 \n", - "2814 RO:0002206 RO:0002206 1.0,1.25 \n", - "2815 RO:0002206 RO:0002206 1.0,1.25 \n", - "\n", - " pval_bins \n", - "0 0.0,1e-12 \n", - "1 0.01,0.02 \n", - "2 1e-05,0.0001 \n", - "3 0.0,1e-12 \n", - "4 0.0,1e-12 \n", - "... ... 
\n", - "2811 0.0,1e-12 \n", - "2812 0.0,1e-12 \n", - "2813 0.0,1e-12 \n", - "2814 0.0,1e-12 \n", - "2815 0.0,1e-12 \n", - "\n", - "[2816 rows x 18 columns]" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fulldata" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [], - "source": [ - "fulldata['celltype_label'] = np.nan\n", - "fulldata['singlecell_label'] = np.nan" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Now create nodes file\n", - "Need to create a 'single-cell' nodes file and a 'cell type' nodes file" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [], - "source": [ - "# cell type nodes file\n", - "nodes_celltypes = fulldata[['celltype_CODEID','celltype_label']]\n", - "\n", - "# Celltype nodes dont have log2fc or pvals so create nan cols so we can concatenate to the singlecell nodes file\n", - "#nodes_celltypes['log2fc_bins'] = np.nan\n", - "#nodes_celltypes['pval_bins'] = np.nan\n", - "\n", - "# single cell nodes file\n", - "nodes_singlecell = fulldata[['CODEID','singlecell_label']] #\n", - "\n", - "# log2fc_bins pval_bins ---> additional columns for upper and lower threshold? 
(4 float columns total)\n", - "\n", - "nodes_celltypes.columns = ['node_id','node_label']\n", - "nodes_singlecell.columns = ['node_id','node_label']\n", - "\n", - "nodes = pd.concat([nodes_celltypes,nodes_singlecell])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create log2FC edge file" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [], - "source": [ - "fulldata['CODEID_log2FC'] = ['LOG2FCBINS ' + i for i in fulldata['log2fc_bins']]\n", - "edges_scHeart_log2FC = fulldata[['CODEID','CODEID_log2FC']]" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [], - "source": [ - "edges_scHeart_log2FC['predicate'] = 'log2FC'" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [], - "source": [ - "edges_scHeart_log2FC = edges_scHeart_log2FC[['CODEID','predicate','CODEID_log2FC']]\n", - "edges_scHeart_log2FC.columns = ['subject','predicate','object']\n" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [], - "source": [ - "edges = pd.concat([edges,edges_scHeart_log2FC])" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
subjectpredicateobject
1016SCHEART Fibroblast-like-(related-to-smaller-va...RO:0002206HGNC HGNC:9630
2635SCHEART Cardiac-neural-crest-cells-and-Schwann...RO:0002206HGNC HGNC:1704
507SCHEART Fibroblast-like-(related-to-larger-vas...RO:0002206author_defined_cluster Fibroblast-like-(relate...
205SCHEART Erythrocytes-HGNC-695RO:0002206author_defined_cluster Erythrocytes
2640SCHEART Cardiac-neural-crest-cells-and-Schwann...log2FCLOG2FCBINS 1.0,1.25
2427SCHEART Capillary-endothelium-HGNC-3176RO:0002206author_defined_cluster Capillary-endothelium
2320SCHEART Immune-cells-HGNC-1765log2FCLOG2FCBINS 1.0,1.25
2425SCHEART Capillary-endothelium-HGNC-1759RO:0002206HGNC HGNC:1759
1172SCHEART Erythrocytes-HGNC-21689RO:0002206HGNC HGNC:21689
2604SCHEART Cardiac-neural-crest-cells-and-Schwann...log2FCLOG2FCBINS 2.0,2.5
\n", - "
" - ], - "text/plain": [ - " subject predicate \n", - "1016 SCHEART Fibroblast-like-(related-to-smaller-va... RO:0002206 \\\n", - "2635 SCHEART Cardiac-neural-crest-cells-and-Schwann... RO:0002206 \n", - "507 SCHEART Fibroblast-like-(related-to-larger-vas... RO:0002206 \n", - "205 SCHEART Erythrocytes-HGNC-695 RO:0002206 \n", - "2640 SCHEART Cardiac-neural-crest-cells-and-Schwann... log2FC \n", - "2427 SCHEART Capillary-endothelium-HGNC-3176 RO:0002206 \n", - "2320 SCHEART Immune-cells-HGNC-1765 log2FC \n", - "2425 SCHEART Capillary-endothelium-HGNC-1759 RO:0002206 \n", - "1172 SCHEART Erythrocytes-HGNC-21689 RO:0002206 \n", - "2604 SCHEART Cardiac-neural-crest-cells-and-Schwann... log2FC \n", - "\n", - " object \n", - "1016 HGNC HGNC:9630 \n", - "2635 HGNC HGNC:1704 \n", - "507 author_defined_cluster Fibroblast-like-(relate... \n", - "205 author_defined_cluster Erythrocytes \n", - "2640 LOG2FCBINS 1.0,1.25 \n", - "2427 author_defined_cluster Capillary-endothelium \n", - "2320 LOG2FCBINS 1.0,1.25 \n", - "2425 HGNC HGNC:1759 \n", - "1172 HGNC HGNC:21689 \n", - "2604 LOG2FCBINS 2.0,2.5 " - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "edges.sample(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Save edges" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [], - "source": [ - "edges.to_csv('/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryFiles/scHeart/OWLNETS_edgelist.txt',\n", - " sep='\\t',index=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Need nodes for the avg log2FC bins but NOT for the p-value bins (the GTEx ingest already includes P_VALUE_BINS)" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [], - "source": [ - "nodes_log2fc = pd.DataFrame(edges_scHeart_log2FC['object'].drop_duplicates())" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, 
- "outputs": [], - "source": [ - "nodes_log2fc['node_label'] = np.nan" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
node_idnode_label
0LOG2FCBINS 5.0,6.0NaN
1LOG2FCBINS -2.5,-2.0NaN
2LOG2FCBINS -2.0,-1.75NaN
\n", - "
" - ], - "text/plain": [ - " node_id node_label\n", - "0 LOG2FCBINS 5.0,6.0 NaN\n", - "1 LOG2FCBINS -2.5,-2.0 NaN\n", - "2 LOG2FCBINS -2.0,-1.75 NaN" - ] - }, - "execution_count": 61, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "nodes_log2fc.columns = ['node_id','node_label']\n", - "nodes_log2fc.head(3)" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": {}, - "outputs": [], - "source": [ - "nodes = pd.concat([nodes,nodes_log2fc])\n", - "\n", - "nodes = nodes.drop_duplicates(subset=['node_id'])" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'author_defined_cluster': 14, 'SCHEART': 2816, 'LOG2FCBINS': 17}" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dict(Counter([i[0] for i in nodes['node_id'].str.split(' ')]))" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [], - "source": [ - "# other cols can be NaN\n", - "nodes['node_synonyms'] = np.nan\n", - "nodes['node_namespace'] = np.nan\n", - "nodes['node_dbxrefs'] = np.nan\n", - "nodes['node_definition'] = np.nan\n", - "\n", - "assert len(nodes) == 2847" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [], - "source": [ - "def fill_missing_cols(df):\n", - " \n", - " if 'node_id' not in df.columns:\n", - " raise ValueError('Must have at least a \"node_id\" column.')\n", - " \n", - " all_cols = set([ 'node_label', 'node_synonyms', 'node_dbxrefs',\n", - " 'node_definition','node_namespace','value','lowerbound','upperbound','unit'])\n", - " \n", - " missing_cols = list(all_cols - set(df.columns))\n", - " nan_cols_df = pd.DataFrame(np.full([len(df), len(missing_cols)], np.nan),columns=missing_cols)\n", - " nan_cols_df.index = df.index\n", - " return pd.concat([df,nan_cols_df],axis=1)" - ] - }, - { - "cell_type": "code", - 
"execution_count": 68, - "metadata": {}, - "outputs": [], - "source": [ - "nodes = fill_missing_cols(nodes)" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [], - "source": [ - "nodes.to_csv('/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryFiles/scHeart/OWLNETS_node_metadata.txt',\n", - " sep='\\t',index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [], - "source": [ - "a=set(edges[edges['object'].str.startswith('scHeart_PMID_31835037 ')]['object'].drop_duplicates())" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [], - "source": [ - "b=set(edges[edges['subject'].str.startswith('scHeart_PMID_31835037 ')]['subject'])" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [], - "source": [ - "c=set(nodes[nodes['node_id'].str.startswith('scHeart_PMID_31835037')]['node_id'])" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "a == b == c" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [], - "source": [ - "# SABs: author_defiined_cluster, scHeart_PMID_31835037, LOG2FC_BINS" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [], - "source": [ - "pd.options.display.max_colwidth = 100" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Cypher checks\n", - "\n", - "# match (cc:Code {SAB:'scHeart_PMID_31835037'}) return count(distinct cc) # 2816\n", - "# match (cc:Code {SAB:'author_defined_cluster'}) return count(distinct cc) # 14\n", - "# match (cc:Code {SAB:'LOG2FC_BINS'}) return cc.CODE # 17" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [], - "source": [ - "celltypes_in_kg = [\"Erythrocytes\",\n", - "\"Epicardium-derived-cells\",\n", - "\"Ventricular-cardiomyocytes\",\n", - "\"Fibroblast-like-(related-to-cardiac-skeleton-connective-tissue)\",\n", - "\"Capillary-endothelium\",\n", - "\"Immune-cells\",\n", - "\"Fibroblast-like-(related-to-smaller-vascular-development)\",\n", - "\"Cardiac-neural-crest-cells-&-Schwann-progenitor-cells\",\n", - "\"Fibroblast-like-(related-to-larger-vascular-development)\",\n", - "\"Epicardial-cells\",\n", - "\"Myoz2-enriched-cardiomyocytes\",\n", - "\"Atrial-cardiomyocytes\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [], - "source": [ - "all_cts = [i[1] for i in nodes[nodes['node_id'].str.startswith('author-')]['node_id'].str.split(' ')]" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "#set(all_cts) - set(celltypes_in_kg)" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "14" - ] - }, - "execution_count": 75, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - 
"len(np.unique([i[1] for i in edges[edges['subject'].str.startswith('author-')]['subject'].str.split(' ')]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create pval edges -- still need to complete" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": { - "collapsed": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
subjectpredicateobject
0scHeart_PMID:31835037 Erythrocytes_HGNC:1093p_valueP_VALUE_BINS 0.0,1e-12
1scHeart_PMID:31835037 Epicardium-derived_cells...p_valueP_VALUE_BINS 0.01,0.02
2scHeart_PMID:31835037 Ventricular_cardiomyocyt...p_valueP_VALUE_BINS 1e-05,0.0001
3scHeart_PMID:31835037 Erythrocytes_HGNC:7400p_valueP_VALUE_BINS 0.0,1e-12
4scHeart_PMID:31835037 Erythrocytes_HGNC:4703p_valueP_VALUE_BINS 0.0,1e-12
............
2811scHeart_PMID:31835037 Fibroblast-like_(related...p_valueP_VALUE_BINS 0.0,1e-12
2812scHeart_PMID:31835037 Fibroblast-like_(related...p_valueP_VALUE_BINS 0.0,1e-12
2813scHeart_PMID:31835037 Fibroblast-like_(related...p_valueP_VALUE_BINS 0.0,1e-12
2814scHeart_PMID:31835037 Fibroblast-like_(related...p_valueP_VALUE_BINS 0.0,1e-12
2815scHeart_PMID:31835037 Fibroblast-like_(related...p_valueP_VALUE_BINS 0.0,1e-12
\n", - "

2816 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " subject predicate \\\n", - "0 scHeart_PMID:31835037 Erythrocytes_HGNC:1093 p_value \n", - "1 scHeart_PMID:31835037 Epicardium-derived_cells... p_value \n", - "2 scHeart_PMID:31835037 Ventricular_cardiomyocyt... p_value \n", - "3 scHeart_PMID:31835037 Erythrocytes_HGNC:7400 p_value \n", - "4 scHeart_PMID:31835037 Erythrocytes_HGNC:4703 p_value \n", - "... ... ... \n", - "2811 scHeart_PMID:31835037 Fibroblast-like_(related... p_value \n", - "2812 scHeart_PMID:31835037 Fibroblast-like_(related... p_value \n", - "2813 scHeart_PMID:31835037 Fibroblast-like_(related... p_value \n", - "2814 scHeart_PMID:31835037 Fibroblast-like_(related... p_value \n", - "2815 scHeart_PMID:31835037 Fibroblast-like_(related... p_value \n", - "\n", - " object \n", - "0 P_VALUE_BINS 0.0,1e-12 \n", - "1 P_VALUE_BINS 0.01,0.02 \n", - "2 P_VALUE_BINS 1e-05,0.0001 \n", - "3 P_VALUE_BINS 0.0,1e-12 \n", - "4 P_VALUE_BINS 0.0,1e-12 \n", - "... ... \n", - "2811 P_VALUE_BINS 0.0,1e-12 \n", - "2812 P_VALUE_BINS 0.0,1e-12 \n", - "2813 P_VALUE_BINS 0.0,1e-12 \n", - "2814 P_VALUE_BINS 0.0,1e-12 \n", - "2815 P_VALUE_BINS 0.0,1e-12 \n", - "\n", - "[2816 rows x 3 columns]" - ] - }, - "execution_count": 68, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "'''\n", - "fulldata['pval_bins_CODEID'] = ['P_VALUE_BINS '+i for i in fulldata['pval_bins']]\n", - "\n", - "edges_scHeart_pvals = fulldata[['CODEID','pval_bins_CODEID']]\n", - "edges_scHeart_pvals['predicate'] = 'p_value'\n", - "edges_scHeart_pvals = edges_scHeart_pvals[['CODEID','predicate','pval_bins_CODEID']]\n", - "edges_scHeart_pvals.columns = ['subject','predicate','object']\n", - "edges_scHeart_pvals'''" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
node_idnode_labelnode_synonymsnode_namespacenode_dbxrefsnode_definition
2495scHeart-PMID-31835037 Atrial-cardiomyocytes-HG...NaNNaNNaNNaNNaN
\n", - "
" - ], - "text/plain": [ - " node_id node_label \\\n", - "2495 scHeart-PMID-31835037 Atrial-cardiomyocytes-HG... NaN \n", - "\n", - " node_synonyms node_namespace node_dbxrefs node_definition \n", - "2495 NaN NaN NaN NaN " - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "nodes[nodes['node_id'].str.contains('Atrial-cardiomyocytes-HGNC-10446')]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Create the Threshold Terms (they will be strings)\n", - "\n", - "Thresholds:\n", - "- < .05\n", - "- < .01\n", - "- < .001\n", - "- < .0001\n", - "- < 1e-10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Assign threshold values" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pval_threshold_1e-10pval_threshold_1e-4pval_threshold_1e-3pval_threshold_1e-2pval_threshold_.05p_val_adj
0< 1e-10< 0.0001< 0.001< 0.01< 0.050.000000
1NaNNaNNaNNaN< 0.050.010812
2NaN< 0.0001< 0.001< 0.01< 0.050.000016
\n", - "
" - ], - "text/plain": [ - " pval_threshold_1e-10 pval_threshold_1e-4 pval_threshold_1e-3 \\\n", - "0 < 1e-10 < 0.0001 < 0.001 \n", - "1 NaN NaN NaN \n", - "2 NaN < 0.0001 < 0.001 \n", - "\n", - " pval_threshold_1e-2 pval_threshold_.05 p_val_adj \n", - "0 < 0.01 < 0.05 0.000000 \n", - "1 NaN < 0.05 0.010812 \n", - "2 < 0.01 < 0.05 0.000016 " - ] - }, - "execution_count": 78, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fulldata['pval_threshold_1e-10'] = ['< 1e-10' if i < 1e-10 else np.nan for i in fulldata['p_val_adj']]\n", - "\n", - "fulldata['pval_threshold_1e-4'] = ['< 0.0001' if i < 1e-4 else np.nan for i in fulldata['p_val_adj']]\n", - "\n", - "fulldata['pval_threshold_1e-3'] = ['< 0.001' if i < 1e-3 else np.nan for i in fulldata['p_val_adj']]\n", - "\n", - "fulldata['pval_threshold_1e-2'] = ['< 0.01' if i < 1e-2 else np.nan for i in fulldata['p_val_adj']]\n", - "\n", - "fulldata['pval_threshold_.05'] = ['< 0.05' if i < .05 else np.nan for i in fulldata['p_val_adj']]\n", - "\n", - "\n", - "thresh_df = fulldata[['pval_threshold_1e-10','pval_threshold_1e-4',\n", - " 'pval_threshold_1e-3','pval_threshold_1e-2','pval_threshold_.05','p_val_adj']]\n", - "\n", - "thresh_df.head(3)" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [], - "source": [ - "pd.set_option('display.max_columns', None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - 
"name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}