diff --git a/src/analysis/opencravat_latest_benchmarking-Consequence_80_20.ipynb b/src/analysis/opencravat_latest_benchmarking-Consequence_80_20.ipynb
index 9ba3f2e..97f6d71 100644
--- a/src/analysis/opencravat_latest_benchmarking-Consequence_80_20.ipynb
+++ b/src/analysis/opencravat_latest_benchmarking-Consequence_80_20.ipynb
@@ -25,7 +25,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 3,
"id": "2b03269e",
"metadata": {},
"outputs": [
@@ -33,249 +33,842 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2024-02-28 16:40:55.203574: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE4.1 SSE4.2 AVX AVX2 FMA\n",
+ "2024-06-27 17:31:47.599689: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE4.1 SSE4.2 AVX AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "pd.set_option('display.max_rows', None)\n",
+ "import yaml\n",
+ "import warnings\n",
+ "warnings.simplefilter(\"ignore\")\n",
+ "#from joblib import load, dump\n",
+ "import argparse\n",
+ "#import shap\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import functools\n",
+ "print = functools.partial(print, flush=True)\n",
+ "from sklearn.preprocessing import label_binarize, MinMaxScaler\n",
+ "from tensorflow import keras\n",
+ "from sklearn.metrics import (\n",
+ " roc_curve,precision_score,\n",
+ " precision_recall_curve,roc_auc_score,\n",
+ " f1_score,accuracy_score, confusion_matrix, ConfusionMatrixDisplay,\n",
+ " confusion_matrix,\n",
+ " average_precision_score,\n",
+ " recall_score\n",
+ ")\n",
+ "import pickle\n",
+ "from sklearn.utils import class_weight\n",
+ "import shap\n",
+ "# from keras_sequential_ascii import keras2ascii\n",
+ "# from nnv import NNV\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "880823ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "warnings.simplefilter(\"ignore\", category=DeprecationWarning)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "22e98d97",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Parse the two YAML files under ../../configs (yaml.safe_load, not yaml.load).\n",
+ "# The handle names fh / fh1 are kept as-is because they stay in the notebook namespace.\n",
+ "with open(\"../../configs/col_config.yaml\") as fh:\n",
+ "    config_dict = yaml.safe_load(fh)\n",
+ "\n",
+ "# Another cell calls var_dict.values(), so this file is expected to hold a mapping.\n",
+ "with open(\"../../configs/var_class.yaml\") as fh1:\n",
+ "    var_dict = yaml.safe_load(fh1)\n",
+ "# Both handles are closed on leaving their with-blocks.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'3 prime UTR',\n",
+ " '5 prime UTR',\n",
+ " 'complex substitution',\n",
+ " 'exon loss variant',\n",
+ " 'frameshift elongation',\n",
+ " 'frameshift truncation',\n",
+ " 'inframe deletion',\n",
+ " 'inframe insertion',\n",
+ " 'intergenic',\n",
+ " 'intron',\n",
+ " 'missense',\n",
+ " 'other',\n",
+ " 'other RNA',\n",
+ " 'splice site',\n",
+ " 'start lost',\n",
+ " 'start retained',\n",
+ " 'stop gained',\n",
+ " 'stop lost',\n",
+ " 'stop retained',\n",
+ " 'synonymous'}"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "set(var_dict.values())\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "3a4ff3de",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#amis = pd.read_csv(\"/Users/tarunmamidi/Downloads/AlphaMissense_hg38.tsv\", low_memory=False, skiprows=3, sep='\\t')\n",
+ "#amis.head()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "4bcc801e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024-06-27 17:31:55.583994: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE4.1 SSE4.2 AVX AVX2 FMA\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model: \"sequential\"\n",
+ "_________________________________________________________________\n",
+ " Layer (type) Output Shape Param # \n",
+ "=================================================================\n",
+ " dense (Dense) (None, 239) 57360 \n",
+ " \n",
+ " dense_l0 (Dense) (None, 161) 38640 \n",
+ " \n",
+ " dropout (Dropout) (None, 161) 0 \n",
+ " \n",
+ " dense_last (Dense) (None, 1) 162 \n",
+ " \n",
+ "=================================================================\n",
+ "Total params: 96,162\n",
+ "Trainable params: 96,162\n",
+ "Non-trainable params: 0\n",
+ "_________________________________________________________________\n"
+ ]
+ }
+ ],
+ "source": [
+ "clf = keras.models.load_model(filepath='../../model/Neural_network/')  # load the saved model from its directory\n",
+ "clf.load_weights(filepath=\"../../model/weights.h5\")  # then load the weights stored in the separate .h5 file\n",
+ "clf.summary()  # prints the layer table captured in this cell's output\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#keras2ascii(clf)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# layersList = [\n",
+ "# {\"title\":\"Input\\n(239 n)\\n(elu)\", \"units\": 239, \"color\": \"green\", \"edges_color\":\"darkBlue\", \"edges_width\":2},\n",
+ "# {\"title\":\"Dense\\n(161 n)\\n(elu)\", \"units\": 161, \"edges_color\":\"darkBlue\", \"edges_width\":2,\"color\": \"orange\"},\n",
+ "# #{\"title\":\"Dropout\", \"units\": 161, \"edges_color\":\"red\", \"edges_width\":2},\n",
+ "# {\"title\":\"output\\n(1 n)\\n(sigmoid)\", \"units\": 1,\"color\": \"red\"},\n",
+ "# ]\n",
+ "\n",
+ "# NNV(layersList).render(save_to_file=\"DITTO.png\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "276a7133",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(842659, 255)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " transcript | \n",
+ " gene | \n",
+ " consequence | \n",
+ " protein_hgvs | \n",
+ " cdna_hgvs | \n",
+ " chrom | \n",
+ " pos | \n",
+ " ref_base | \n",
+ " alt_base | \n",
+ " clingen.disease | \n",
+ " ... | \n",
+ " mutationtaster.prediction_Automatic Polymorphism | \n",
+ " mutationtaster.prediction_Damaging | \n",
+ " mutationtaster.prediction_Polymorphism | \n",
+ " mutationtaster.model_complex_aae | \n",
+ " mutationtaster.model_simple_aae | \n",
+ " mutationtaster.model_without_aae | \n",
+ " prec.stat_lof-tolerant | \n",
+ " prec.stat_recessive | \n",
+ " sift.confidence_High | \n",
+ " sift.confidence_Low | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ENST00000350721 | \n",
+ " ATR | \n",
+ " synonymous_variant | \n",
+ " p.Asp2494= | \n",
+ " c.7482T>C | \n",
+ " chr3 | \n",
+ " 142458979 | \n",
+ " A | \n",
+ " G | \n",
+ " NaN | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ENST00000661310 | \n",
+ " ATR | \n",
+ " synonymous_variant | \n",
+ " p.Asp2430= | \n",
+ " c.7290T>C | \n",
+ " chr3 | \n",
+ " 142458979 | \n",
+ " A | \n",
+ " G | \n",
+ " NaN | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ENST00000310018 | \n",
+ " ATP6V0A4 | \n",
+ " missense_variant | \n",
+ " p.Asp679Tyr | \n",
+ " c.2035G>T | \n",
+ " chr7 | \n",
+ " 138722001 | \n",
+ " C | \n",
+ " A | \n",
+ " NaN | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " ENST00000353492 | \n",
+ " ATP6V0A4 | \n",
+ " missense_variant | \n",
+ " p.Asp679Tyr | \n",
+ " c.2035G>T | \n",
+ " chr7 | \n",
+ " 138722001 | \n",
+ " C | \n",
+ " A | \n",
+ " NaN | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " ENST00000393054 | \n",
+ " ATP6V0A4 | \n",
+ " missense_variant | \n",
+ " p.Asp679Tyr | \n",
+ " c.2035G>T | \n",
+ " chr7 | \n",
+ " 138722001 | \n",
+ " C | \n",
+ " A | \n",
+ " NaN | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 255 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " transcript gene consequence protein_hgvs cdna_hgvs \n",
+ "0 ENST00000350721 ATR synonymous_variant p.Asp2494= c.7482T>C \\\n",
+ "1 ENST00000661310 ATR synonymous_variant p.Asp2430= c.7290T>C \n",
+ "2 ENST00000310018 ATP6V0A4 missense_variant p.Asp679Tyr c.2035G>T \n",
+ "3 ENST00000353492 ATP6V0A4 missense_variant p.Asp679Tyr c.2035G>T \n",
+ "4 ENST00000393054 ATP6V0A4 missense_variant p.Asp679Tyr c.2035G>T \n",
+ "\n",
+ " chrom pos ref_base alt_base clingen.disease ... \n",
+ "0 chr3 142458979 A G NaN ... \\\n",
+ "1 chr3 142458979 A G NaN ... \n",
+ "2 chr7 138722001 C A NaN ... \n",
+ "3 chr7 138722001 C A NaN ... \n",
+ "4 chr7 138722001 C A NaN ... \n",
+ "\n",
+ " mutationtaster.prediction_Automatic Polymorphism \n",
+ "0 0 \\\n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ " mutationtaster.prediction_Damaging mutationtaster.prediction_Polymorphism \n",
+ "0 0 0 \\\n",
+ "1 0 0 \n",
+ "2 0 1 \n",
+ "3 0 1 \n",
+ "4 0 1 \n",
+ "\n",
+ " mutationtaster.model_complex_aae mutationtaster.model_simple_aae \n",
+ "0 0 0 \\\n",
+ "1 0 0 \n",
+ "2 0 1 \n",
+ "3 0 1 \n",
+ "4 0 1 \n",
+ "\n",
+ " mutationtaster.model_without_aae prec.stat_lof-tolerant \n",
+ "0 0 0 \\\n",
+ "1 0 0 \n",
+ "2 0 0 \n",
+ "3 0 0 \n",
+ "4 0 0 \n",
+ "\n",
+ " prec.stat_recessive sift.confidence_High sift.confidence_Low \n",
+ "0 1 1 0 \n",
+ "1 1 0 0 \n",
+ "2 0 1 0 \n",
+ "3 0 1 0 \n",
+ "4 0 1 0 \n",
+ "\n",
+ "[5 rows x 255 columns]"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_train = pd.read_csv(\"../../data/train_class_data_80.csv.gz\")  # plain string: the old f-prefix had no placeholders; compression is inferred from .gz\n",
+ "print(X_train.shape)\n",
+ "X_train.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "401a4e11",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['transcript',\n",
+ " 'gene',\n",
+ " 'consequence',\n",
+ " 'protein_hgvs',\n",
+ " 'cdna_hgvs',\n",
+ " 'chrom',\n",
+ " 'pos',\n",
+ " 'ref_base',\n",
+ " 'alt_base',\n",
+ " 'clingen.disease',\n",
+ " 'clingen.classification',\n",
+ " 'ncbigene.entrez',\n",
+ " 'omim.omim_id',\n",
+ " 'uniprot.acc',\n",
+ " 'dbsnp.rsid',\n",
+ " 'class',\n",
+ " 'aloft.tolerant',\n",
+ " 'aloft.recessive',\n",
+ " 'aloft.dominant',\n",
+ " 'cadd.phred',\n",
+ " 'chasmplus.score',\n",
+ " 'chasmplus.pval',\n",
+ " 'civic.molecular_profile_score',\n",
+ " 'cosmic.variant_count',\n",
+ " 'cosmic_gene.occurrences',\n",
+ " 'cscape.score',\n",
+ " 'cancer_genome_interpreter.resistant',\n",
+ " 'cancer_genome_interpreter.responsive',\n",
+ " 'clinpred.score',\n",
+ " 'dann.score',\n",
+ " 'dann_coding.dann_coding_score',\n",
+ " 'dgi.score',\n",
+ " 'ess_gene.indispensability_score',\n",
+ " 'exac_gene.exac_pli',\n",
+ " 'exac_gene.exac_pnull',\n",
+ " 'exac_gene.exac_del_score',\n",
+ " 'exac_gene.exac_dup_score',\n",
+ " 'exac_gene.exac_cnv_score',\n",
+ " 'fathmm.fathmm_score',\n",
+ " 'fathmm_xf_coding.fathmm_xf_coding_score',\n",
+ " 'funseq2.score',\n",
+ " 'gerp.gerp_rs',\n",
+ " 'ghis.ghis',\n",
+ " 'gwas_catalog.pval',\n",
+ " 'genehancer.score',\n",
+ " 'linsight.value',\n",
+ " 'lrt.lrt_score',\n",
+ " 'lrt.lrt_omega',\n",
+ " 'loftool.loftool_score',\n",
+ " 'mavedb.score',\n",
+ " 'metalr.score',\n",
+ " 'metasvm.score',\n",
+ " 'mutpred1.mutpred_general_score',\n",
+ " 'mutpred_indel.score',\n",
+ " 'mutation_assessor.score',\n",
+ " 'mutationtaster.score',\n",
+ " 'ndex_chd.numhit',\n",
+ " 'ndex.numhit',\n",
+ " 'ndex_signor.numhit',\n",
+ " 'prec.prec',\n",
+ " 'provean.score',\n",
+ " 'pangalodb.sensitivity',\n",
+ " 'pangalodb.specificity',\n",
+ " 'phdsnpg.score',\n",
+ " 'phastcons.phastcons100_vert',\n",
+ " 'phastcons.phastcons30_mamm',\n",
+ " 'phastcons.phastcons17way_primate',\n",
+ " 'phylop.phylop100_vert',\n",
+ " 'phylop.phylop30_mamm',\n",
+ " 'phylop.phylop17_primate',\n",
+ " 'polyphen2.hdiv_rank',\n",
+ " 'polyphen2.hvar_rank',\n",
+ " 'revel.score',\n",
+ " 'rvis.rvis_evs',\n",
+ " 'sift.score',\n",
+ " 'sift.med',\n",
+ " 'sift.seqs',\n",
+ " 'segway.mean_score',\n",
+ " 'siphy.logodds_rank',\n",
+ " 'spliceai.ds_ag',\n",
+ " 'spliceai.ds_al',\n",
+ " 'spliceai.ds_dg',\n",
+ " 'spliceai.ds_dl',\n",
+ " 'spliceai.dp_ag',\n",
+ " 'spliceai.dp_al',\n",
+ " 'spliceai.dp_dg',\n",
+ " 'spliceai.dp_dl',\n",
+ " 'varity_r.varity_r_loo',\n",
+ " 'varity_r.varity_er_loo',\n",
+ " 'vest.score',\n",
+ " 'dbscsnv.ada_score',\n",
+ " 'dbscsnv.rf_score',\n",
+ " 'gnomad.af',\n",
+ " 'gnomad_gene.oe_lof',\n",
+ " 'gnomad_gene.oe_mis',\n",
+ " 'gnomad_gene.oe_syn',\n",
+ " 'gnomad_gene.lof_z',\n",
+ " 'gnomad_gene.mis_z',\n",
+ " 'gnomad_gene.syn_z',\n",
+ " 'gnomad_gene.pLI',\n",
+ " 'gnomad_gene.pRec',\n",
+ " 'gnomad_gene.pNull',\n",
+ " 'gnomad3.af',\n",
+ " 'phi.phi',\n",
+ " 'Adipose_Subcutaneous',\n",
+ " 'Adipose_Visceral_Omentum',\n",
+ " 'Adrenal_Gland',\n",
+ " 'Artery_Aorta',\n",
+ " 'Artery_Coronary',\n",
+ " 'Artery_Tibial',\n",
+ " 'Brain_Amygdala',\n",
+ " 'Brain_Anterior_cingulate_cortex_BA24',\n",
+ " 'Brain_Caudate_basal_ganglia',\n",
+ " 'Brain_Cerebellar_Hemisphere',\n",
+ " 'Brain_Cerebellum',\n",
+ " 'Brain_Cortex',\n",
+ " 'Brain_Frontal_Cortex_BA9',\n",
+ " 'Brain_Hippocampus',\n",
+ " 'Brain_Hypothalamus',\n",
+ " 'Brain_Nucleus_accumbens_basal_ganglia',\n",
+ " 'Brain_Putamen_basal_ganglia',\n",
+ " 'Brain_Spinal_cord_cervical_c-1',\n",
+ " 'Brain_Substantia_nigra',\n",
+ " 'Breast_Mammary_Tissue',\n",
+ " 'Cells_EBV-transformed_lymphocytes',\n",
+ " 'Cells_Transformed_fibroblasts',\n",
+ " 'Colon_Sigmoid',\n",
+ " 'Colon_Transverse',\n",
+ " 'Esophagus_Gastroesophageal_Junction',\n",
+ " 'Esophagus_Mucosa',\n",
+ " 'Esophagus_Muscularis',\n",
+ " 'Heart_Atrial_Appendage',\n",
+ " 'Heart_Left_Ventricle',\n",
+ " 'Liver',\n",
+ " 'Lung',\n",
+ " 'Minor_Salivary_Gland',\n",
+ " 'Muscle_Skeletal',\n",
+ " 'Nerve_Tibial',\n",
+ " 'Ovary',\n",
+ " 'Pancreas',\n",
+ " 'Pituitary',\n",
+ " 'Prostate',\n",
+ " 'Skin_Not_Sun_Exposed_Suprapubic',\n",
+ " 'Skin_Sun_Exposed_Lower_leg',\n",
+ " 'Small_Intestine_Terminal_Ileum',\n",
+ " 'Spleen',\n",
+ " 'Stomach',\n",
+ " 'Testis',\n",
+ " 'Thyroid',\n",
+ " 'Uterus',\n",
+ " 'Vagina',\n",
+ " 'Whole_Blood',\n",
+ " 'activator',\n",
+ " 'adduct',\n",
+ " 'agonist',\n",
+ " 'allosteric modulator',\n",
+ " 'antagonist',\n",
+ " 'antibody',\n",
+ " 'binder',\n",
+ " 'blocker',\n",
+ " 'chaperone',\n",
+ " 'cofactor',\n",
+ " 'inducer',\n",
+ " 'inhibitor',\n",
+ " 'ligand',\n",
+ " 'modulator',\n",
+ " 'negative modulator',\n",
+ " 'positive modulator',\n",
+ " 'potentiator',\n",
+ " 'product of',\n",
+ " 'stimulator',\n",
+ " 'substrate',\n",
+ " 'vaccine',\n",
+ " 'AD',\n",
+ " 'AR',\n",
+ " 'AR ',\n",
+ " 'BG',\n",
+ " 'Digenic',\n",
+ " 'XL',\n",
+ " '2kb_downstream_variant',\n",
+ " '2kb_upstream_variant',\n",
+ " '3_prime_UTR_variant',\n",
+ " '5_prime_UTR_variant',\n",
+ " 'NMD_transcript_variant',\n",
+ " 'NSD_transcript',\n",
+ " 'complex_substitution',\n",
+ " 'exon_loss_variant',\n",
+ " 'frameshift_elongation',\n",
+ " 'frameshift_truncation',\n",
+ " 'inframe_deletion',\n",
+ " 'inframe_insertion',\n",
+ " 'intron_variant',\n",
+ " 'lnc_RNA',\n",
+ " 'miRNA',\n",
+ " 'misc_RNA',\n",
+ " 'missense_variant',\n",
+ " 'polymorphic_pseudogene',\n",
+ " 'processed_transcript',\n",
+ " 'rRNA',\n",
+ " 'ribozyme',\n",
+ " 'scaRNA',\n",
+ " 'snRNA',\n",
+ " 'snoRNA',\n",
+ " 'splice_site_variant',\n",
+ " 'start_lost',\n",
+ " 'start_retained_variant',\n",
+ " 'stop_gained',\n",
+ " 'stop_lost',\n",
+ " 'stop_retained_variant',\n",
+ " 'synonymous_variant',\n",
+ " 'transcript_ablation',\n",
+ " 'LINE',\n",
+ " 'LTR',\n",
+ " 'Low_complexity',\n",
+ " 'SINE',\n",
+ " 'Satellite',\n",
+ " 'Simple_repeat',\n",
+ " 'Enhancer',\n",
+ " 'Promoter',\n",
+ " 'germline',\n",
+ " 'somatic',\n",
+ " 'Oncogene',\n",
+ " 'TSG',\n",
+ " 'fusion',\n",
+ " 'coding_Yes',\n",
+ " 'aloft.pred_Dominant',\n",
+ " 'aloft.pred_Recessive',\n",
+ " 'aloft.pred_Tolerant',\n",
+ " 'aloft.conf_High',\n",
+ " 'aloft.conf_Low',\n",
+ " 'ccre_screen._group_CTCF-only',\n",
+ " 'ccre_screen._group_DNase-H3K4me3',\n",
+ " 'ccre_screen._group_PLS',\n",
+ " 'ccre_screen._group_dELS',\n",
+ " 'ccre_screen._group_pELS',\n",
+ " 'ccre_screen.bound_Yes',\n",
+ " 'ensembl_regulatory_build.region_CTCF_binding_site',\n",
+ " 'ensembl_regulatory_build.region_TF_binding_site',\n",
+ " 'ensembl_regulatory_build.region_enhancer',\n",
+ " 'ensembl_regulatory_build.region_open_chromatin_region',\n",
+ " 'ensembl_regulatory_build.region_promoter',\n",
+ " 'ensembl_regulatory_build.region_promoter_flanking_region',\n",
+ " 'exac_gene.exac_cnv_flag_N',\n",
+ " 'exac_gene.exac_cnv_flag_Y',\n",
+ " 'mutationtaster.prediction_Automatic Disease Causing',\n",
+ " 'mutationtaster.prediction_Automatic Polymorphism',\n",
+ " 'mutationtaster.prediction_Damaging',\n",
+ " 'mutationtaster.prediction_Polymorphism',\n",
+ " 'mutationtaster.model_complex_aae',\n",
+ " 'mutationtaster.model_simple_aae',\n",
+ " 'mutationtaster.model_without_aae',\n",
+ " 'prec.stat_lof-tolerant',\n",
+ " 'prec.stat_recessive',\n",
+ " 'sift.confidence_High',\n",
+ " 'sift.confidence_Low']"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
- "import pandas as pd\n",
- "pd.set_option('display.max_rows', None)\n",
- "import yaml\n",
- "import warnings\n",
- "warnings.simplefilter(\"ignore\")\n",
- "#from joblib import load, dump\n",
- "import argparse\n",
- "#import shap\n",
- "import numpy as np\n",
- "import matplotlib.pyplot as plt\n",
- "import functools\n",
- "print = functools.partial(print, flush=True)\n",
- "from sklearn.preprocessing import label_binarize, MinMaxScaler\n",
- "from tensorflow import keras\n",
- "from sklearn.metrics import (\n",
- " roc_curve,precision_score,\n",
- " precision_recall_curve,roc_auc_score,\n",
- " f1_score,accuracy_score, confusion_matrix, ConfusionMatrixDisplay,\n",
- " confusion_matrix,\n",
- " average_precision_score,\n",
- " recall_score\n",
- ")\n",
- "import pickle\n",
- "from sklearn.utils import class_weight\n",
- "import shap\n",
- "# from keras_sequential_ascii import keras2ascii\n",
- "# from nnv import NNV\n"
+ "X_train.columns.to_list()"
]
},
{
"cell_type": "code",
- "execution_count": 3,
- "id": "880823ee",
+ "execution_count": 5,
+ "id": "9f6eef91",
"metadata": {},
"outputs": [],
"source": [
- "warnings.simplefilter(\"ignore\", category=DeprecationWarning)\n"
+ "X_train.loc[:, ['chrom', 'pos', 'ref_base', 'alt_base', 'class']].drop_duplicates().to_csv(\"../../data/training_variants.csv\", index=False)  # one row per distinct variant/class combination"
]
},
{
"cell_type": "code",
- "execution_count": 4,
- "id": "22e98d97",
+ "execution_count": 7,
+ "id": "0a8776c9",
"metadata": {},
"outputs": [],
"source": [
- "with open(\n",
- " \"../../configs/col_config.yaml\"\n",
- " ) as fh:\n",
- " config_dict = yaml.safe_load(fh)\n",
- "\n",
- "with open(\n",
- " \"../../configs/var_class.yaml\"\n",
- " ) as fh1:\n",
- " var_dict = yaml.safe_load(fh1)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'3 prime UTR',\n",
- " '5 prime UTR',\n",
- " 'complex substitution',\n",
- " 'exon loss variant',\n",
- " 'frameshift elongation',\n",
- " 'frameshift truncation',\n",
- " 'inframe deletion',\n",
- " 'inframe insertion',\n",
- " 'intergenic',\n",
- " 'intron',\n",
- " 'missense',\n",
- " 'other',\n",
- " 'other RNA',\n",
- " 'splice site',\n",
- " 'start lost',\n",
- " 'start retained',\n",
- " 'stop gained',\n",
- " 'stop lost',\n",
- " 'stop retained',\n",
- " 'synonymous'}"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "set(var_dict.values())\n"
+ "#X_train.chrom.value_counts()"
]
},
{
"cell_type": "code",
- "execution_count": 11,
- "id": "3a4ff3de",
+ "execution_count": 8,
+ "id": "eeae3ed6",
"metadata": {},
"outputs": [],
"source": [
- "#amis = pd.read_csv(\"/Users/tarunmamidi/Downloads/AlphaMissense_hg38.tsv\", low_memory=False, skiprows=3, sep='\\t')\n",
- "#amis.head()\n"
+ "#X_train[X_train.chrom == \"chrY\"]"
]
},
{
"cell_type": "code",
"execution_count": 5,
- "id": "4bcc801e",
+ "id": "bd9d4901",
"metadata": {},
"outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2024-02-28 16:41:02.474929: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE4.1 SSE4.2 AVX AVX2 FMA\n",
- "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
- ]
- },
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Model: \"sequential\"\n",
- "_________________________________________________________________\n",
- " Layer (type) Output Shape Param # \n",
- "=================================================================\n",
- " dense (Dense) (None, 239) 57360 \n",
- " \n",
- " dense_l0 (Dense) (None, 161) 38640 \n",
- " \n",
- " dropout (Dropout) (None, 161) 0 \n",
- " \n",
- " dense_last (Dense) (None, 1) 162 \n",
- " \n",
- "=================================================================\n",
- "Total params: 96,162\n",
- "Trainable params: 96,162\n",
- "Non-trainable params: 0\n",
- "_________________________________________________________________\n"
+ "1/1 [==============================] - 0s 137ms/step\n",
+ "[0.94781121]\n"
]
}
],
"source": [
- "clf = keras.models.load_model('../../model/Neural_network/')\n",
- "clf.load_weights(\"../../model/weights.h5\")\n",
- "clf.summary()\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [],
- "source": [
- "#keras2ascii(clf)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [],
- "source": [
- "# layersList = [\n",
- "# {\"title\":\"Input\\n(239 n)\\n(elu)\", \"units\": 239, \"color\": \"green\", \"edges_color\":\"darkBlue\", \"edges_width\":2},\n",
- "# {\"title\":\"Dense\\n(161 n)\\n(elu)\", \"units\": 161, \"edges_color\":\"darkBlue\", \"edges_width\":2,\"color\": \"orange\"},\n",
- "# #{\"title\":\"Dropout\", \"units\": 161, \"edges_color\":\"red\", \"edges_width\":2},\n",
- "# {\"title\":\"output\\n(1 n)\\n(sigmoid)\", \"units\": 1,\"color\": \"red\"},\n",
- "# ]\n",
- "\n",
- "# NNV(layersList).render(save_to_file=\"DITTO.png\")\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "276a7133",
- "metadata": {},
- "outputs": [],
- "source": [
- "#X_train = pd.read_csv(f\"../../data/train_class_data_80.csv.gz\")\n",
- "#train.head()"
+ "# NOTE(review): pickle.load can execute arbitrary code -- only load a background.pkl from a trusted source.\n",
+ "# The with-block closes the file even if unpickling raises (the manual open/close did not).\n",
+ "with open(\"../../data/background.pkl\", \"rb\") as pkl_file:\n",
+ "    background = pickle.load(pkl_file)\n",
+ "\n",
+ "# Build the SHAP explainer from the model's predict function and the unpickled background data.\n",
+ "explainer = shap.KernelExplainer(clf.predict, background)\n",
+ "print(explainer.expected_value)\n",
+ "del background\n"
]
},
{
"cell_type": "code",
- "execution_count": 7,
- "id": "0a8776c9",
+ "execution_count": 10,
+ "id": "475cd807",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(208167, 255)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "(208167, 255)"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "#X_train.chrom.value_counts()"
+ "X_test = pd.read_csv(filepath_or_buffer=\"../../data/test_class_data_20.csv.gz\")  # presumably the held-out 20% split, going by the file name\n",
+ "X_test.shape"
]
},
{
"cell_type": "code",
"execution_count": 8,
- "id": "eeae3ed6",
+ "id": "c0ae79e4",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(1050826, 255)\n"
+ ]
+ }
+ ],
"source": [
- "#X_train[X_train.chrom == \"chrY\"]"
+ "X_test = pd.concat([X_test, X_train], ignore_index=True)  # NOTE(review): folds the training rows into X_test and appends again on every re-run -- confirm this is intended\n",
+ "print(X_test.shape)\n"
]
},
{
"cell_type": "code",
- "execution_count": 9,
- "id": "bd9d4901",
+ "execution_count": 11,
+ "id": "fa7f5c32",
"metadata": {},
"outputs": [
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "1/1 [==============================] - 0s 272ms/step\n",
- "[0.94781121]\n"
- ]
+ "data": {
+ "text/plain": [
+ "class\n",
+ "low_impact 147809\n",
+ "high_impact 60358\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
- "pkl_file = open(\n",
- " \"../../data/background.pkl\",\n",
- " \"rb\",\n",
- " )\n",
- "background = pickle.load(pkl_file)\n",
- "pkl_file.close()\n",
- "explainer = shap.KernelExplainer(clf.predict, background)\n",
- "print(explainer.expected_value)\n",
- "del background\n"
+ "X_test['class'].value_counts()"
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 7,
"id": "9f23ed49",
"metadata": {},
"outputs": [],
@@ -293,7 +886,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 6,
"id": "c802b77d",
"metadata": {},
"outputs": [
@@ -527,7 +1120,7 @@
"[5 rows x 239 columns]"
]
},
- "execution_count": 7,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -538,7 +1131,7 @@
},
{
"cell_type": "code",
- "execution_count": 49,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -704,7 +1297,7 @@
"4 8831.0 NaN Q96PV0 rs781201249 "
]
},
- "execution_count": 49,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -741,7 +1334,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 8,
"id": "baf8c805",
"metadata": {},
"outputs": [],
@@ -782,7 +1375,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 8,
"id": "f1f0f9bd",
"metadata": {},
"outputs": [
@@ -920,22 +1513,22 @@
""
],
"text/plain": [
- " transcript gene consequence \\\n",
- "0 ENST00000293748 SYNGAP1 NMD_transcript_variant,synonymous_variant \n",
+ " transcript gene consequence \n",
+ "0 ENST00000293748 SYNGAP1 NMD_transcript_variant,synonymous_variant \\\n",
"1 ENST00000418600 SYNGAP1 synonymous_variant \n",
"2 ENST00000428982 SYNGAP1 synonymous_variant \n",
"3 ENST00000449372 SYNGAP1 synonymous_variant \n",
"4 ENST00000628646 SYNGAP1 synonymous_variant \n",
"\n",
- " protein_hgvs cdna_hgvs chrom pos ref_base alt_base \\\n",
- "0 p.Pro1051= c.3153T>G chr6 33443750 T G \n",
+ " protein_hgvs cdna_hgvs chrom pos ref_base alt_base \n",
+ "0 p.Pro1051= c.3153T>G chr6 33443750 T G \\\n",
"1 p.Pro1066= c.3198T>G chr6 33443750 T G \n",
"2 p.Pro1007= c.3021T>G chr6 33443750 T G \n",
"3 p.Pro1052= c.3156T>G chr6 33443750 T G \n",
"4 p.Pro1066= c.3198T>G chr6 33443750 T G \n",
"\n",
- " clingen.disease clingen.classification \\\n",
- "0 complex neurodevelopmental disorder Definitive \n",
+ " clingen.disease clingen.classification \n",
+ "0 complex neurodevelopmental disorder Definitive \\\n",
"1 complex neurodevelopmental disorder Definitive \n",
"2 complex neurodevelopmental disorder Definitive \n",
"3 complex neurodevelopmental disorder Definitive \n",
@@ -949,7 +1542,7 @@
"4 8831.0 NaN Q96PV0 rs781201249 "
]
},
- "execution_count": 9,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -1232,7 +1825,7 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 9,
"id": "6fc8e643",
"metadata": {},
"outputs": [
@@ -1250,7 +1843,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 10,
"id": "45412118",
"metadata": {},
"outputs": [
@@ -1266,7 +1859,7 @@
" [1.0000000e+00]], dtype=float32)"
]
},
- "execution_count": 14,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -1347,7 +1940,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -1361,7 +1954,7 @@
"Name: spliceai, dtype: float64"
]
},
- "execution_count": 15,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -1671,7 +2264,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 12,
"id": "dcc03dc4",
"metadata": {},
"outputs": [
@@ -1844,7 +2437,7 @@
"4 8831.0 NaN Q96PV0 rs781201249 0.0 "
]
},
- "execution_count": 16,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -1856,7 +2449,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 13,
"id": "a04829c0",
"metadata": {},
"outputs": [
@@ -1866,7 +2459,7 @@
"(208167, 257)"
]
},
- "execution_count": 17,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -1878,7 +2471,7 @@
},
{
"cell_type": "code",
- "execution_count": 61,
+ "execution_count": 18,
"id": "bd40b6e3",
"metadata": {},
"outputs": [
@@ -1914,6 +2507,7 @@
" alt_base | \n",
" clingen.disease | \n",
" ... | \n",
+ " mutationtaster.prediction_Damaging | \n",
" mutationtaster.prediction_Polymorphism | \n",
" mutationtaster.model_complex_aae | \n",
" mutationtaster.model_simple_aae | \n",
@@ -1923,7 +2517,6 @@
" sift.confidence_High | \n",
" sift.confidence_Low | \n",
" class | \n",
- " spliceai | \n",
" \n",
" \n",
" \n",
@@ -1946,10 +2539,10 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
+ " 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
- " 0.0004 | \n",
" \n",
" \n",
" 1 | \n",
@@ -1970,10 +2563,10 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
+ " 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
- " 0.0004 | \n",
"
\n",
" \n",
" 2 | \n",
@@ -1994,10 +2587,10 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
+ " 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
- " 0.0004 | \n",
"
\n",
" \n",
" 3 | \n",
@@ -2018,10 +2611,10 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
+ " 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
- " 0.0004 | \n",
"
\n",
" \n",
" 4 | \n",
@@ -2042,76 +2635,440 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
+ " 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
- " 0.0004 | \n",
"
\n",
" \n",
"\n",
- "5 rows × 257 columns
\n",
+ "5 rows × 256 columns
\n",
+ ""
+ ],
+ "text/plain": [
+ " transcript gene consequence \n",
+ "0 ENST00000293748 SYNGAP1 NMD_transcript_variant,synonymous_variant \\\n",
+ "1 ENST00000418600 SYNGAP1 synonymous_variant \n",
+ "2 ENST00000428982 SYNGAP1 synonymous_variant \n",
+ "3 ENST00000449372 SYNGAP1 synonymous_variant \n",
+ "4 ENST00000628646 SYNGAP1 synonymous_variant \n",
+ "\n",
+ " protein_hgvs cdna_hgvs chrom pos ref_base alt_base \n",
+ "0 p.Pro1051= c.3153T>G chr6 33443750 T G \\\n",
+ "1 p.Pro1066= c.3198T>G chr6 33443750 T G \n",
+ "2 p.Pro1007= c.3021T>G chr6 33443750 T G \n",
+ "3 p.Pro1052= c.3156T>G chr6 33443750 T G \n",
+ "4 p.Pro1066= c.3198T>G chr6 33443750 T G \n",
+ "\n",
+ " clingen.disease ... \n",
+ "0 complex neurodevelopmental disorder ... \\\n",
+ "1 complex neurodevelopmental disorder ... \n",
+ "2 complex neurodevelopmental disorder ... \n",
+ "3 complex neurodevelopmental disorder ... \n",
+ "4 complex neurodevelopmental disorder ... \n",
+ "\n",
+ " mutationtaster.prediction_Damaging mutationtaster.prediction_Polymorphism \n",
+ "0 0 0 \\\n",
+ "1 0 0 \n",
+ "2 0 0 \n",
+ "3 0 0 \n",
+ "4 0 0 \n",
+ "\n",
+ " mutationtaster.model_complex_aae mutationtaster.model_simple_aae \n",
+ "0 0 0 \\\n",
+ "1 0 0 \n",
+ "2 0 0 \n",
+ "3 0 0 \n",
+ "4 0 0 \n",
+ "\n",
+ " mutationtaster.model_without_aae prec.stat_lof-tolerant \n",
+ "0 0 0 \\\n",
+ "1 0 0 \n",
+ "2 0 0 \n",
+ "3 0 0 \n",
+ "4 0 0 \n",
+ "\n",
+ " prec.stat_recessive sift.confidence_High sift.confidence_Low class \n",
+ "0 0 1 0 0 \n",
+ "1 0 1 0 0 \n",
+ "2 0 1 0 0 \n",
+ "3 0 1 0 0 \n",
+ "4 0 1 0 0 \n",
+ "\n",
+ "[5 rows x 256 columns]"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "var.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "a651a2bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "var[['chrom', 'pos', 'ref_base', 'alt_base','class','DITTO']].sort_values(by='DITTO', ascending=False).drop_duplicates(['chrom', 'pos', 'ref_base', 'alt_base']).to_csv(\"../../data/testing_variants.csv\", index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "92f60d7b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "4739"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(var.gene.unique())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "e9e88795",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " DITTO | \n",
+ " class | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
"
"
],
"text/plain": [
- " transcript gene consequence \n",
- "0 ENST00000293748 SYNGAP1 NMD_transcript_variant,synonymous_variant \\\n",
- "1 ENST00000418600 SYNGAP1 synonymous_variant \n",
- "2 ENST00000428982 SYNGAP1 synonymous_variant \n",
- "3 ENST00000449372 SYNGAP1 synonymous_variant \n",
- "4 ENST00000628646 SYNGAP1 synonymous_variant \n",
- "\n",
- " protein_hgvs cdna_hgvs chrom pos ref_base alt_base \n",
- "0 p.Pro1051= c.3153T>G chr6 33443750 T G \\\n",
- "1 p.Pro1066= c.3198T>G chr6 33443750 T G \n",
- "2 p.Pro1007= c.3021T>G chr6 33443750 T G \n",
- "3 p.Pro1052= c.3156T>G chr6 33443750 T G \n",
- "4 p.Pro1066= c.3198T>G chr6 33443750 T G \n",
- "\n",
- " clingen.disease ... \n",
- "0 complex neurodevelopmental disorder ... \\\n",
- "1 complex neurodevelopmental disorder ... \n",
- "2 complex neurodevelopmental disorder ... \n",
- "3 complex neurodevelopmental disorder ... \n",
- "4 complex neurodevelopmental disorder ... \n",
- "\n",
- " mutationtaster.prediction_Polymorphism mutationtaster.model_complex_aae \n",
- "0 0 0 \\\n",
- "1 0 0 \n",
- "2 0 0 \n",
- "3 0 0 \n",
- "4 0 0 \n",
- "\n",
- " mutationtaster.model_simple_aae mutationtaster.model_without_aae \n",
- "0 0 0 \\\n",
- "1 0 0 \n",
- "2 0 0 \n",
- "3 0 0 \n",
- "4 0 0 \n",
- "\n",
- " prec.stat_lof-tolerant prec.stat_recessive sift.confidence_High \n",
- "0 0 0 1 \\\n",
- "1 0 0 1 \n",
- "2 0 0 1 \n",
- "3 0 0 1 \n",
- "4 0 0 1 \n",
- "\n",
- " sift.confidence_Low class spliceai \n",
- "0 0 0 0.0004 \n",
- "1 0 0 0.0004 \n",
- "2 0 0 0.0004 \n",
- "3 0 0 0.0004 \n",
- "4 0 0 0.0004 \n",
- "\n",
- "[5 rows x 257 columns]"
+ " DITTO class\n",
+ "0 0.0 0\n",
+ "1 0.0 0\n",
+ "2 0.0 0\n",
+ "3 0.0 0\n",
+ "4 0.0 0"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "var[['DITTO','class']].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "97df8911",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(60358, 257)"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "var[var['class'] == 1].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "f6d36c30",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "