From b4b172c8f75250f5e870895eacdc9373c6b53fcf Mon Sep 17 00:00:00 2001 From: Goosang Yu Date: Thu, 10 Aug 2023 00:04:59 +0900 Subject: [PATCH 1/5] =?UTF-8?q?=E2=9C=85=20update=20module=20import?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- genet/predict/PredUtils.py | 11 ++++++++++- genet/predict/functional.py | 32 +------------------------------- 2 files changed, 11 insertions(+), 32 deletions(-) diff --git a/genet/predict/PredUtils.py b/genet/predict/PredUtils.py index 3f207a5..2fcf3f5 100644 --- a/genet/predict/PredUtils.py +++ b/genet/predict/PredUtils.py @@ -22,4 +22,13 @@ def preprocess_seq(data, seq_length): print("[Input Error] Non-ATGC character " + data[l]) sys.exit() - return seq_onehot \ No newline at end of file + return seq_onehot + +def reverse_complement(sSeq): + dict_sBases = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'U': 'U', 'n': '', + '.': '.', '*': '*', 'a': 't', 'c': 'g', 'g': 'c', 't': 'a'} + list_sSeq = list(sSeq) # Turns the sequence in to a gigantic list + list_sSeq = [dict_sBases[sBase] for sBase in list_sSeq] + return ''.join(list_sSeq)[::-1] + +# def END: reverse_complement \ No newline at end of file diff --git a/genet/predict/functional.py b/genet/predict/functional.py index be65d2f..20e2d04 100644 --- a/genet/predict/functional.py +++ b/genet/predict/functional.py @@ -1,6 +1,7 @@ # from genet.utils import * import genet import genet.utils +from genet.predict.PredUtils import * import torch import torch.nn.functional as F @@ -116,28 +117,6 @@ def Model_Finaltest(sess, TEST_X, model): -def preprocess_seq(data, seq_length): - - seq_onehot = np.zeros((len(data), 1, seq_length, 4), dtype=float) - - for l in range(len(data)): - for i in range(seq_length): - try: - data[l][i] - except Exception: - print(data[l], i, seq_length, len(data)) - - if data[l][i] in "Aa": seq_onehot[l, 0, i, 0] = 1 - elif data[l][i] in "Cc": seq_onehot[l, 0, i, 1] = 1 - elif data[l][i] in "Gg": seq_onehot[l, 0, i, 2] = 1 - elif data[l][i] in "Tt": seq_onehot[l, 0, i, 3] = 1 - elif data[l][i] in "Xx": pass - elif data[l][i] in "Nn.": pass - else: - print("[Input Error] Non-ATGC character " + data[l]) - sys.exit() - - return seq_onehot def spcas9_score_tf2(list_target30:list, gpu_env=0): '''Tensorflow2 version function @@ -245,15 +224,6 @@ def spcas9_score(list_target30:list, gpu_env=0): return list_score -def reverse_complement(sSeq): - dict_sBases = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'U': 'U', 'n': '', - '.': '.', '*': '*', 'a': 't', 'c': 'g', 'g': 'c', 't': 'a'} - list_sSeq = list(sSeq) # Turns the sequence in to a gigantic list - list_sSeq = [dict_sBases[sBase] for sBase in list_sSeq] - return ''.join(list_sSeq)[::-1] - -# def END: reverse_complement - def set_alt_position_window(sStrand, sAltKey, nAltIndex, nIndexStart, nIndexEnd, nAltLen): if sStrand == '+': From 3e0c6ac764e1b8dcc0114178acdc5792977f3962 Mon Sep 17 00:00:00 2001 From: Goosang Yu Date: Thu, 10 Aug 2023 00:05:27 +0900 Subject: [PATCH 2/5] =?UTF-8?q?=E2=9C=85=20test=20note?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_genet_prd.ipynb | 1576 ++++++++++++++++++------------------------ test_models.ipynb | 2 +- 2 files changed, 685 insertions(+), 893 deletions(-) diff --git a/test_genet_prd.ipynb b/test_genet_prd.ipynb index 70fca99..767b7ad 100644 --- a/test_genet_prd.ipynb +++ b/test_genet_prd.ipynb @@ -2,312 +2,603 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 15, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "The model DeepSpCas9 is not installed. Download checkpoint files.\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading: 0KB [00:00, ?KB/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "File downloaded successfully: c:\\Users\\home\\Documents\\GitHub\\genet\\genet\\models\\DeepSpCas9/__init__.py\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading: 681KB [00:00, 1495.85KB/s] \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "File downloaded successfully: c:\\Users\\home\\Documents\\GitHub\\genet\\genet\\models\\DeepSpCas9/PreTrain-Final-3-5-7-100-70-40-0.001-550-80-60.data-00000-of-00001\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading: 1KB [00:00, ?KB/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "File downloaded successfully: c:\\Users\\home\\Documents\\GitHub\\genet\\genet\\models\\DeepSpCas9/PreTrain-Final-3-5-7-100-70-40-0.001-550-80-60.index\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading: 29KB [00:00, 240.87KB/s] \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "File downloaded successfully: c:\\Users\\home\\Documents\\GitHub\\genet\\genet\\models\\DeepSpCas9/PreTrain-Final-3-5-7-100-70-40-0.001-550-80-60.meta\n" - ] - }, - { - "ename": "OSError", - "evalue": "SavedModel file does not exist at: c:\\Users\\home\\Documents\\GitHub\\genet\\genet\\models\\DeepSpCas9\\{saved_model.pbtxt|saved_model.pb}", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[1], line 10\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mgenet\u001b[39;00m \u001b[39mimport\u001b[39;00m predict_dev \u001b[39mas\u001b[39;00m prd\n\u001b[0;32m 3\u001b[0m list_target30 \u001b[39m=\u001b[39m [\n\u001b[0;32m 4\u001b[0m \u001b[39m'\u001b[39m\u001b[39mTCACCTTCGTTTTTTTCCTTCTGCAGGAGG\u001b[39m\u001b[39m'\u001b[39m,\n\u001b[0;32m 5\u001b[0m \u001b[39m'\u001b[39m\u001b[39mCCTTCGTTTTTTTCCTTCTGCAGGAGGACA\u001b[39m\u001b[39m'\u001b[39m,\n\u001b[0;32m 6\u001b[0m \u001b[39m'\u001b[39m\u001b[39mCTTTCAAGAACTCTTCCACCTCCATGGTGT\u001b[39m\u001b[39m'\u001b[39m,\n\u001b[0;32m 7\u001b[0m ]\n\u001b[1;32m---> 10\u001b[0m df \u001b[39m=\u001b[39m prd\u001b[39m.\u001b[39;49mspcas9_score_tf2(list_target30)\n\u001b[0;32m 11\u001b[0m df\n", - "File \u001b[1;32mc:\\Users\\home\\Documents\\GitHub\\genet\\genet\\predict_dev\\functional_dev.py:55\u001b[0m, in \u001b[0;36mspcas9_score_tf2\u001b[1;34m(list_target30, gpu_env)\u001b[0m\n\u001b[0;32m 53\u001b[0m model_save \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m/\u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m'\u001b[39m \u001b[39m%\u001b[39m (model_dir, best_model)\n\u001b[0;32m 54\u001b[0m \u001b[39m# final_model = tf.keras.models.load_model(model_save,compile=False)\u001b[39;00m\n\u001b[1;32m---> 55\u001b[0m final_model \u001b[39m=\u001b[39m tf\u001b[39m.\u001b[39;49msaved_model\u001b[39m.\u001b[39;49mload(model_dir)\n\u001b[0;32m 57\u001b[0m output \u001b[39m=\u001b[39m final_model\u001b[39m.\u001b[39msignatures[\u001b[39m'\u001b[39m\u001b[39mserving_default\u001b[39m\u001b[39m'\u001b[39m](input_1\u001b[39m=\u001b[39mtf\u001b[39m.\u001b[39mconstant(input_data))[\u001b[39m'\u001b[39m\u001b[39mdense_2\u001b[39m\u001b[39m'\u001b[39m]\n\u001b[0;32m 60\u001b[0m dataset_ \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39mDataFrame()\n", - "File \u001b[1;32mc:\\Users\\home\\anaconda3\\envs\\genet\\lib\\site-packages\\tensorflow\\python\\saved_model\\load.py:936\u001b[0m, in \u001b[0;36mload\u001b[1;34m(export_dir, tags, options)\u001b[0m\n\u001b[0;32m 845\u001b[0m \u001b[39m@tf_export\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39msaved_model.load\u001b[39m\u001b[39m\"\u001b[39m, v1\u001b[39m=\u001b[39m[\u001b[39m\"\u001b[39m\u001b[39msaved_model.load_v2\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[0;32m 846\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mload\u001b[39m(export_dir, tags\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, options\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m):\n\u001b[0;32m 847\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Load a SavedModel from `export_dir`.\u001b[39;00m\n\u001b[0;32m 848\u001b[0m \n\u001b[0;32m 849\u001b[0m \u001b[39m Signatures associated with the SavedModel are available as functions:\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 934\u001b[0m \u001b[39m ValueError: If `tags` don't match a MetaGraph in the SavedModel.\u001b[39;00m\n\u001b[0;32m 935\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 936\u001b[0m result \u001b[39m=\u001b[39m load_internal(export_dir, tags, options)[\u001b[39m\"\u001b[39m\u001b[39mroot\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[0;32m 937\u001b[0m \u001b[39mreturn\u001b[39;00m result\n", - "File \u001b[1;32mc:\\Users\\home\\anaconda3\\envs\\genet\\lib\\site-packages\\tensorflow\\python\\saved_model\\load.py:949\u001b[0m, in \u001b[0;36mload_internal\u001b[1;34m(export_dir, tags, options, loader_cls, filters)\u001b[0m\n\u001b[0;32m 944\u001b[0m \u001b[39mif\u001b[39;00m tags \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(tags, \u001b[39mset\u001b[39m):\n\u001b[0;32m 945\u001b[0m \u001b[39m# Supports e.g. tags=SERVING and tags=[SERVING]. Sets aren't considered\u001b[39;00m\n\u001b[0;32m 946\u001b[0m \u001b[39m# sequences for nest.flatten, so we put those through as-is.\u001b[39;00m\n\u001b[0;32m 947\u001b[0m tags \u001b[39m=\u001b[39m nest\u001b[39m.\u001b[39mflatten(tags)\n\u001b[0;32m 948\u001b[0m saved_model_proto, debug_info \u001b[39m=\u001b[39m (\n\u001b[1;32m--> 949\u001b[0m loader_impl\u001b[39m.\u001b[39;49mparse_saved_model_with_debug_info(export_dir))\n\u001b[0;32m 951\u001b[0m \u001b[39mif\u001b[39;00m (\u001b[39mlen\u001b[39m(saved_model_proto\u001b[39m.\u001b[39mmeta_graphs) \u001b[39m==\u001b[39m \u001b[39m1\u001b[39m \u001b[39mand\u001b[39;00m\n\u001b[0;32m 952\u001b[0m saved_model_proto\u001b[39m.\u001b[39mmeta_graphs[\u001b[39m0\u001b[39m]\u001b[39m.\u001b[39mHasField(\u001b[39m\"\u001b[39m\u001b[39mobject_graph_def\u001b[39m\u001b[39m\"\u001b[39m)):\n\u001b[0;32m 953\u001b[0m metrics\u001b[39m.\u001b[39mIncrementReadApi(_LOAD_V2_LABEL)\n", - "File \u001b[1;32mc:\\Users\\home\\anaconda3\\envs\\genet\\lib\\site-packages\\tensorflow\\python\\saved_model\\loader_impl.py:57\u001b[0m, in \u001b[0;36mparse_saved_model_with_debug_info\u001b[1;34m(export_dir)\u001b[0m\n\u001b[0;32m 44\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mparse_saved_model_with_debug_info\u001b[39m(export_dir):\n\u001b[0;32m 45\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Reads the savedmodel as well as the graph debug info.\u001b[39;00m\n\u001b[0;32m 46\u001b[0m \n\u001b[0;32m 47\u001b[0m \u001b[39m Args:\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 55\u001b[0m \u001b[39m parsed. Missing graph debug info file is fine.\u001b[39;00m\n\u001b[0;32m 56\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m---> 57\u001b[0m saved_model \u001b[39m=\u001b[39m parse_saved_model(export_dir)\n\u001b[0;32m 59\u001b[0m debug_info_path \u001b[39m=\u001b[39m file_io\u001b[39m.\u001b[39mjoin(\n\u001b[0;32m 60\u001b[0m saved_model_utils\u001b[39m.\u001b[39mget_debug_dir(export_dir),\n\u001b[0;32m 61\u001b[0m constants\u001b[39m.\u001b[39mDEBUG_INFO_FILENAME_PB)\n\u001b[0;32m 62\u001b[0m debug_info \u001b[39m=\u001b[39m graph_debug_info_pb2\u001b[39m.\u001b[39mGraphDebugInfo()\n", - "File \u001b[1;32mc:\\Users\\home\\anaconda3\\envs\\genet\\lib\\site-packages\\tensorflow\\python\\saved_model\\loader_impl.py:115\u001b[0m, in \u001b[0;36mparse_saved_model\u001b[1;34m(export_dir)\u001b[0m\n\u001b[0;32m 113\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mIOError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mCannot parse file \u001b[39m\u001b[39m{\u001b[39;00mpath_to_pbtxt\u001b[39m}\u001b[39;00m\u001b[39m: \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mstr\u001b[39m(e)\u001b[39m}\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 114\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m--> 115\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mIOError\u001b[39;00m(\n\u001b[0;32m 116\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mSavedModel file does not exist at: \u001b[39m\u001b[39m{\u001b[39;00mexport_dir\u001b[39m}\u001b[39;00m\u001b[39m{\u001b[39;00mos\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39msep\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m 117\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{{\u001b[39;00m\u001b[39m{\u001b[39;00mconstants\u001b[39m.\u001b[39mSAVED_MODEL_FILENAME_PBTXT\u001b[39m}\u001b[39;00m\u001b[39m|\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 118\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mconstants\u001b[39m.\u001b[39mSAVED_MODEL_FILENAME_PB\u001b[39m}\u001b[39;00m\u001b[39m}}\u001b[39;00m\u001b[39m\"\u001b[39m)\n", - "\u001b[1;31mOSError\u001b[0m: SavedModel file does not exist at: c:\\Users\\home\\Documents\\GitHub\\genet\\genet\\models\\DeepSpCas9\\{saved_model.pbtxt|saved_model.pb}" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TargetSpacerStrandStartEnd
0CCTCCGAGAGCCGCTTCAACACCCTGGCCGCGAGAGCCGCTTCAACACCC+1040
1GCCGCTTCAACACCCTGGCCGAGTTGGTTCCTTCAACACCCTGGCCGAGT+1949
2CCGAGTTGGTTCATCATCATTCAACGGTGGGTTGGTTCATCATCATTCAA+3767
3AGTTGGTTCATCATCATTCAACGGTGGCCGGGTTCATCATCATTCAACGG+4070
4TCATCATCATTCAACGGTGGCCGACGGGCTCATCATTCAACGGTGGCCGA+4777
5CATCATCATTCAACGGTGGCCGACGGGCTCATCATTCAACGGTGGCCGAC+4878
6AAAGCGCAACAAGCCCACTGTCTATGGTGTCGCAACAAGCCCACTGTCTA+104134
7GGTGTGTCCCCCAACTACGACAAGTGGGAGTGTCCCCCAACTACGACAAG+129159
8GTGTGTCCCCCAACTACGACAAGTGGGAGAGTCCCCCAACTACGACAAGT+130160
9CCCCCAACTACGACAAGTGGGAGATGGAACCAACTACGACAAGTGGGAGA+136166
10ACGACAAGTGGGAGATGGAACGCACGGACACAAGTGGGAGATGGAACGCA+145175
11CGGACATCACCATGAAGCACAAGCTGGGCGCATCACCATGAAGCACAAGC+169199
12GGACATCACCATGAAGCACAAGCTGGGCGGATCACCATGAAGCACAAGCT+170200
13CATCACCATGAAGCACAAGCTGGGCGGGGGACCATGAAGCACAAGCTGGG+173203
14ATCACCATGAAGCACAAGCTGGGCGGGGGCCCATGAAGCACAAGCTGGGC+174204
15TCACCATGAAGCACAAGCTGGGCGGGGGCCCATGAAGCACAAGCTGGGCG+175205
16CACCATGAAGCACAAGCTGGGCGGGGGCCAATGAAGCACAAGCTGGGCGG+176206
17GCACAAGCTGGGCGGGGGCCAGTACGGGGAAAGCTGGGCGGGGGCCAGTA+185215
18CACAAGCTGGGCGGGGGCCAGTACGGGGAGAGCTGGGCGGGGGCCAGTAC+186216
19ACAAGCTGGGCGGGGGCCAGTACGGGGAGGGCTGGGCGGGGGCCAGTACG+187217
20AGCTGGGCGGGGGCCAGTACGGGGAGGTGTGGGCGGGGGCCAGTACGGGG+190220
21GGGGCCAGTACGGGGAGGTGTACGAGGGCGCCAGTACGGGGAGGTGTACG+199229
22GGGCCAGTACGGGGAGGTGTACGAGGGCGTCAGTACGGGGAGGTGTACGA+200230
23TACGGGGAGGTGTACGAGGGCGTGTGGAAGGGGAGGTGTACGAGGGCGTG+207237
24GCGTGTGGAAGAAATACAGCCTGACGGTGGGTGGAAGAAATACAGCCTGA+226256
25TGTGGAAGAAATACAGCCTGACGGTGGCCGGAAGAAATACAGCCTGACGG+229259
26CCAGGGTGTTGAAGCGGCTCTCGGAGGAGAGGTGTTGAAGCGGCTCTCGG-737
27CGGCCAGGGTGTTGAAGCGGCTCTCGGAGGCAGGGTGTTGAAGCGGCTCT-1040
28ACCAACTCGGCCAGGGTGTTGAAGCGGCTCACTCGGCCAGGGTGTTGAAG-1747
29AATGATGATGAACCAACTCGGCCAGGGTGTATGATGAACCAACTCGGCCA-2858
30GAATGATGATGAACCAACTCGGCCAGGGTGGATGATGAACCAACTCGGCC-2959
31CCGTTGAATGATGATGAACCAACTCGGCCATGAATGATGATGAACCAACT-3464
32AATGGAGCGTGGTGATGAGCCCGTCGGCCAGAGCGTGGTGATGAGCCCGT-6494
33GCTTTGGGGCTGGATAATGGAGCGTGGTGATGGGGCTGGATAATGGAGCG-79109
34TTGTTGCGCTTTGGGGCTGGATAATGGAGCTGCGCTTTGGGGCTGGATAA-86116
35AGTGGGCTTGTTGCGCTTTGGGGCTGGATAGGCTTGTTGCGCTTTGGGGC-93123
36AGACAGTGGGCTTGTTGCGCTTTGGGGCTGAGTGGGCTTGTTGCGCTTTG-97127
37TAGACAGTGGGCTTGTTGCGCTTTGGGGCTCAGTGGGCTTGTTGCGCTTT-98128
38ATAGACAGTGGGCTTGTTGCGCTTTGGGGCACAGTGGGCTTGTTGCGCTT-99129
39GTTGGGGGACACACCATAGACAGTGGGCTTGGGGACACACCATAGACAGT-114144
40AGTTGGGGGACACACCATAGACAGTGGGCTGGGGGACACACCATAGACAG-115145
41CCATCTCCCACTTGTCGTAGTTGGGGGACACTCCCACTTGTCGTAGTTGG-133163
42TCCATCTCCCACTTGTCGTAGTTGGGGGACTCTCCCACTTGTCGTAGTTG-134164
43TTCCATCTCCCACTTGTCGTAGTTGGGGGAATCTCCCACTTGTCGTAGTT-135165
44GTTCCATCTCCCACTTGTCGTAGTTGGGGGCATCTCCCACTTGTCGTAGT-136166
45GGCCCCCGCCCAGCTTGTGCTTCATGGTGACCCGCCCAGCTTGTGCTTCA-175205
46ACGCCCTCGTACACCTCCCCGTACTGGCCCCCTCGTACACCTCCCCGTAC-200230
47TTCAAGGTCTTCACGGCCACCGTCAGGCTGAGGTCTTCACGGCCACCGTC-242272
\n", + "
" + ], + "text/plain": [ + " Target Spacer Strand Start End\n", + "0 CCTCCGAGAGCCGCTTCAACACCCTGGCCG CGAGAGCCGCTTCAACACCC + 10 40\n", + "1 GCCGCTTCAACACCCTGGCCGAGTTGGTTC CTTCAACACCCTGGCCGAGT + 19 49\n", + "2 CCGAGTTGGTTCATCATCATTCAACGGTGG GTTGGTTCATCATCATTCAA + 37 67\n", + "3 AGTTGGTTCATCATCATTCAACGGTGGCCG GGTTCATCATCATTCAACGG + 40 70\n", + "4 TCATCATCATTCAACGGTGGCCGACGGGCT CATCATTCAACGGTGGCCGA + 47 77\n", + "5 CATCATCATTCAACGGTGGCCGACGGGCTC ATCATTCAACGGTGGCCGAC + 48 78\n", + "6 AAAGCGCAACAAGCCCACTGTCTATGGTGT CGCAACAAGCCCACTGTCTA + 104 134\n", + "7 GGTGTGTCCCCCAACTACGACAAGTGGGAG TGTCCCCCAACTACGACAAG + 129 159\n", + "8 GTGTGTCCCCCAACTACGACAAGTGGGAGA GTCCCCCAACTACGACAAGT + 130 160\n", + "9 CCCCCAACTACGACAAGTGGGAGATGGAAC CAACTACGACAAGTGGGAGA + 136 166\n", + "10 ACGACAAGTGGGAGATGGAACGCACGGACA CAAGTGGGAGATGGAACGCA + 145 175\n", + "11 CGGACATCACCATGAAGCACAAGCTGGGCG CATCACCATGAAGCACAAGC + 169 199\n", + "12 GGACATCACCATGAAGCACAAGCTGGGCGG ATCACCATGAAGCACAAGCT + 170 200\n", + "13 CATCACCATGAAGCACAAGCTGGGCGGGGG ACCATGAAGCACAAGCTGGG + 173 203\n", + "14 ATCACCATGAAGCACAAGCTGGGCGGGGGC CCATGAAGCACAAGCTGGGC + 174 204\n", + "15 TCACCATGAAGCACAAGCTGGGCGGGGGCC CATGAAGCACAAGCTGGGCG + 175 205\n", + "16 CACCATGAAGCACAAGCTGGGCGGGGGCCA ATGAAGCACAAGCTGGGCGG + 176 206\n", + "17 GCACAAGCTGGGCGGGGGCCAGTACGGGGA AAGCTGGGCGGGGGCCAGTA + 185 215\n", + "18 CACAAGCTGGGCGGGGGCCAGTACGGGGAG AGCTGGGCGGGGGCCAGTAC + 186 216\n", + "19 ACAAGCTGGGCGGGGGCCAGTACGGGGAGG GCTGGGCGGGGGCCAGTACG + 187 217\n", + "20 AGCTGGGCGGGGGCCAGTACGGGGAGGTGT GGGCGGGGGCCAGTACGGGG + 190 220\n", + "21 GGGGCCAGTACGGGGAGGTGTACGAGGGCG CCAGTACGGGGAGGTGTACG + 199 229\n", + "22 GGGCCAGTACGGGGAGGTGTACGAGGGCGT CAGTACGGGGAGGTGTACGA + 200 230\n", + "23 TACGGGGAGGTGTACGAGGGCGTGTGGAAG GGGAGGTGTACGAGGGCGTG + 207 237\n", + "24 GCGTGTGGAAGAAATACAGCCTGACGGTGG GTGGAAGAAATACAGCCTGA + 226 256\n", + "25 TGTGGAAGAAATACAGCCTGACGGTGGCCG GAAGAAATACAGCCTGACGG + 229 259\n", + "26 CCAGGGTGTTGAAGCGGCTCTCGGAGGAGA GGTGTTGAAGCGGCTCTCGG - 7 37\n", + "27 CGGCCAGGGTGTTGAAGCGGCTCTCGGAGG CAGGGTGTTGAAGCGGCTCT - 10 40\n", + "28 ACCAACTCGGCCAGGGTGTTGAAGCGGCTC ACTCGGCCAGGGTGTTGAAG - 17 47\n", + "29 AATGATGATGAACCAACTCGGCCAGGGTGT ATGATGAACCAACTCGGCCA - 28 58\n", + "30 GAATGATGATGAACCAACTCGGCCAGGGTG GATGATGAACCAACTCGGCC - 29 59\n", + "31 CCGTTGAATGATGATGAACCAACTCGGCCA TGAATGATGATGAACCAACT - 34 64\n", + "32 AATGGAGCGTGGTGATGAGCCCGTCGGCCA GAGCGTGGTGATGAGCCCGT - 64 94\n", + "33 GCTTTGGGGCTGGATAATGGAGCGTGGTGA TGGGGCTGGATAATGGAGCG - 79 109\n", + "34 TTGTTGCGCTTTGGGGCTGGATAATGGAGC TGCGCTTTGGGGCTGGATAA - 86 116\n", + "35 AGTGGGCTTGTTGCGCTTTGGGGCTGGATA GGCTTGTTGCGCTTTGGGGC - 93 123\n", + "36 AGACAGTGGGCTTGTTGCGCTTTGGGGCTG AGTGGGCTTGTTGCGCTTTG - 97 127\n", + "37 TAGACAGTGGGCTTGTTGCGCTTTGGGGCT CAGTGGGCTTGTTGCGCTTT - 98 128\n", + "38 ATAGACAGTGGGCTTGTTGCGCTTTGGGGC ACAGTGGGCTTGTTGCGCTT - 99 129\n", + "39 GTTGGGGGACACACCATAGACAGTGGGCTT GGGGACACACCATAGACAGT - 114 144\n", + "40 AGTTGGGGGACACACCATAGACAGTGGGCT GGGGGACACACCATAGACAG - 115 145\n", + "41 CCATCTCCCACTTGTCGTAGTTGGGGGACA CTCCCACTTGTCGTAGTTGG - 133 163\n", + "42 TCCATCTCCCACTTGTCGTAGTTGGGGGAC TCTCCCACTTGTCGTAGTTG - 134 164\n", + "43 TTCCATCTCCCACTTGTCGTAGTTGGGGGA ATCTCCCACTTGTCGTAGTT - 135 165\n", + "44 GTTCCATCTCCCACTTGTCGTAGTTGGGGG CATCTCCCACTTGTCGTAGT - 136 166\n", + "45 GGCCCCCGCCCAGCTTGTGCTTCATGGTGA CCCGCCCAGCTTGTGCTTCA - 175 205\n", + "46 ACGCCCTCGTACACCTCCCCGTACTGGCCC CCTCGTACACCTCCCCGTAC - 200 230\n", + "47 TTCAAGGTCTTCACGGCCACCGTCAGGCTG AGGTCTTCACGGCCACCGTC - 242 272" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "from genet import predict_dev as prd\n", + "'''\n", + "if 'NRCH' in pe_system: # for NRCH-PE PAM\n", + " dict_sRE = {'+': '[ACGT][ACGT]G[ACGT]|[ACGT][CG]A[ACGT]|[ACGT][AG]CC|[ATCG]ATG', \n", + " '-': '[ACGT]C[ACGT][ACGT]|[ACGT]T[CG][ACGT]|G[GT]T[ACGT]|ATT[ACGT]|CAT[ACGT]|GGC[ACGT]|GTA[ACGT]'} \n", + "else:\n", + " dict_sRE = {'+': '[ACGT]GG[ACGT]', '-': '[ACGT]CC[ACGT]'} # for Original-PE PAM\n", + "\n", + "for sStrand in ['+', '-']:\n", + "\n", + " sRE = dict_sRE[sStrand]\n", + "'''\n", + "\n", + "import regex\n", + "import pandas as pd\n", + "from genet.models import LoadModel\n", + "\n", + "\n", + "def reverse_complement(sSeq):\n", + " dict_sBases = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'U': 'U', 'n': '',\n", + " '.': '.', '*': '*', 'a': 't', 'c': 'g', 'g': 'c', 't': 'a'}\n", + " list_sSeq = list(sSeq) # Turns the sequence in to a gigantic list\n", + " list_sSeq = [dict_sBases[sBase] for sBase in list_sSeq]\n", + " return ''.join(list_sSeq)[::-1]\n", + "\n", + "# def END: reverse_complement\n", "\n", - "list_target30 = [\n", - " 'TCACCTTCGTTTTTTTCCTTCTGCAGGAGG',\n", - " 'CCTTCGTTTTTTTCCTTCTGCAGGAGGACA',\n", - " 'CTTTCAAGAACTCTTCCACCTCCATGGTGT',\n", - " ]\n", + "model = LoadModel('DeepSpCas9', 'SpCas9')\n", + "\n", + "'''\n", + "dict_pattern = {\n", + " '+': '[ATGC]{25}GG[ATGC]{3}',\n", + " '-': '[ATGC]{3}CC[ATGC]{25}',\n", + "}\n", + "'''\n", + "\n", + "dict_re = model.info['regex']\n", + "\n", + "seq_input = 'ctctacgtctcctccgagagccgcttcaacaccctggccgagttggttcatcatcattcaacggtggccgacgggctcatcaccacgctccattatccagccccaaagcgcaacaagcccactgtctatggtgtgtcccccaactacgacaagtgggagatggaacgcacggacatcaccatgaagcacaagctgggcgggggccagtacggggaggtgtacgagggcgtgtggaagaaatacagcctgacggtggccgtgaagaccttgaag'\n", + "seq_input = seq_input.upper()\n", + "\n", + "seq_target, seq_guide, seq_strand, pos_start, pos_end = [], [], [], [], []\n", + "\n", + "for strand in ['+', '-']:\n", + " ptn = dict_re[strand]\n", + "\n", + " for re_idx in regex.finditer(ptn, seq_input, overlapped=True):\n", + " \n", + " if strand == '+': match = re_idx.group()\n", + " else : match = reverse_complement(re_idx.group())\n", + " \n", + " seq_target.append(match)\n", + " seq_guide.append(match[4:24])\n", + " seq_strand.append(strand)\n", + " pos_start.append(re_idx.start())\n", + " pos_end.append(re_idx.end())\n", + " \n", "\n", + "df_out = pd.DataFrame({'Target': seq_target,\n", + " 'Spacer': seq_guide,\n", + " 'Strand': seq_strand,\n", + " 'Start': pos_start,\n", + " 'End': pos_end})\n", "\n", - "df = prd.spcas9_score_tf2(list_target30)\n", - "df\n" + "df_out\n", + " " ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Type of model \n" - ] - }, - { - "ename": "ValueError", - "evalue": "Expected `model` argument to be a `Model` instance. Received: model=", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[1], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mgenet\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mpredict_dev\u001b[39;00m \u001b[39mimport\u001b[39;00m spcas9_save\n\u001b[1;32m----> 3\u001b[0m spcas9_save(\u001b[39m'\u001b[39;49m\u001b[39m./\u001b[39;49m\u001b[39m'\u001b[39;49m)\n", - "File \u001b[1;32mc:\\Users\\home\\Documents\\GitHub\\genet\\genet\\predict_dev\\functional_dev.py:179\u001b[0m, in \u001b[0;36mspcas9_save\u001b[1;34m(save_dir)\u001b[0m\n\u001b[0;32m 176\u001b[0m saver\u001b[39m.\u001b[39mrestore(sess, model_save)\n\u001b[0;32m 178\u001b[0m \u001b[39m# TensorFlow 2의 Keras 모델로 변환\u001b[39;00m\n\u001b[1;32m--> 179\u001b[0m keras_model \u001b[39m=\u001b[39m tf\u001b[39m.\u001b[39;49mkeras\u001b[39m.\u001b[39;49mmodels\u001b[39m.\u001b[39;49mclone_model(Deep_xCas9)\n\u001b[0;32m 180\u001b[0m keras_model\u001b[39m.\u001b[39mset_weights(sess\u001b[39m.\u001b[39mrun(tf\u001b[39m.\u001b[39mcompat\u001b[39m.\u001b[39mv1\u001b[39m.\u001b[39mtrainable_variables()))\n\u001b[0;32m 182\u001b[0m \u001b[39m# 모델을 .h5 파일로 저장\u001b[39;00m\n", - "File \u001b[1;32mc:\\Users\\home\\anaconda3\\envs\\genet\\lib\\site-packages\\keras\\models.py:456\u001b[0m, in \u001b[0;36mclone_model\u001b[1;34m(model, input_tensors, clone_function)\u001b[0m\n\u001b[0;32m 453\u001b[0m \u001b[39mreturn\u001b[39;00m _clone_sequential_model(\n\u001b[0;32m 454\u001b[0m model, input_tensors\u001b[39m=\u001b[39minput_tensors, layer_fn\u001b[39m=\u001b[39mclone_function)\n\u001b[0;32m 455\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m--> 456\u001b[0m \u001b[39mreturn\u001b[39;00m _clone_functional_model(\n\u001b[0;32m 457\u001b[0m model, input_tensors\u001b[39m=\u001b[39;49minput_tensors, layer_fn\u001b[39m=\u001b[39;49mclone_function)\n", - "File \u001b[1;32mc:\\Users\\home\\anaconda3\\envs\\genet\\lib\\site-packages\\keras\\models.py:159\u001b[0m, in \u001b[0;36m_clone_functional_model\u001b[1;34m(model, input_tensors, layer_fn)\u001b[0m\n\u001b[0;32m 129\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"Clone a functional `Model` instance.\u001b[39;00m\n\u001b[0;32m 130\u001b[0m \n\u001b[0;32m 131\u001b[0m \u001b[39mModel cloning is similar to calling a model on new inputs,\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 156\u001b[0m \u001b[39m argument value.\u001b[39;00m\n\u001b[0;32m 157\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m 158\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(model, Model):\n\u001b[1;32m--> 159\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m'\u001b[39m\u001b[39mExpected `model` argument \u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m 160\u001b[0m \u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39mto be a `Model` instance. Received: model=\u001b[39m\u001b[39m{\u001b[39;00mmodel\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m)\n\u001b[0;32m 161\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(model, Sequential):\n\u001b[0;32m 162\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m'\u001b[39m\u001b[39mExpected `model` argument \u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m 163\u001b[0m \u001b[39m'\u001b[39m\u001b[39mto be a functional `Model` instance, \u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m 164\u001b[0m \u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39mgot a `Sequential` instance instead: \u001b[39m\u001b[39m{\u001b[39;00mmodel\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m)\n", - "\u001b[1;31mValueError\u001b[0m: Expected `model` argument to be a `Model` instance. Received: model=" - ] - } - ], + "outputs": [], "source": [ - "from genet.predict_dev import spcas9_save\n", + "import logging\n", "\n", - "spcas9_save('./')\n", - "\n" + "class LogStreamHandler(logging.StreamHandler):\n", + " def __init__(self, stream=None):\n", + " super().__init__(stream)\n", + " self.setFormatter(logging.Formatter(\n", + " '%(levelname)-5s @ %(asctime)s:\\n\\t %(message)s \\n',\n", + " datefmt='%a, %d %b %Y %H:%M:%S',\n", + " ))\n", + " self.setLevel(logging.INFO)\n", + "\n", + "class StatusFormatter(logging.Formatter):\n", + " def format(self, record):\n", + " record.percent_complete = ''\n", + " if record.args and 'percent_complete' in record.args:\n", + " record.percent_complete = '{0:.2f}% '.format(record.args['percent_complete'])\n", + " self.last_percent_complete = record.percent_complete\n", + " elif hasattr(self, 'last_percent_complete'): # if we don't have a percent complete, use the last one\n", + " record.percent_complete = self.last_percent_complete\n", + " return super().format(record)\n", + "\n", + "class StatusHandler(logging.FileHandler):\n", + " def __init__(self, filename):\n", + " super().__init__(filename, 'w')\n", + " self.setFormatter(StatusFormatter('%(percent_complete)s%(message)s'))\n", + "\n", + " def emit(self, record):\n", + " \"\"\"Overwrite the existing file and write the new log.\"\"\"\n", + " if self.stream is None: # log file is empty\n", + " self.stream = self._open()\n", + " else: # log file is not empty, overwrite\n", + " self.stream.seek(0)\n", + " logging.StreamHandler.emit(self, record)\n", + " self.stream.truncate()\n" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The model DeepSpCas9 is not installed. Download checkpoint files.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading: 0KB [00:00, ?KB/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "File downloaded successfully: e:\\github_project\\genet\\genet\\models\\DeepSpCas9/__init__.py\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading: 681KB [00:00, 837.64KB/s] \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "File downloaded successfully: e:\\github_project\\genet\\genet\\models\\DeepSpCas9/PreTrain-Final-3-5-7-100-70-40-0.001-550-80-60.data-00000-of-00001\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading: 1KB [00:00, 496.78KB/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "File downloaded successfully: e:\\github_project\\genet\\genet\\models\\DeepSpCas9/PreTrain-Final-3-5-7-100-70-40-0.001-550-80-60.index\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading: 29KB [00:00, 221.38KB/s] " - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "File downloaded successfully: e:\\github_project\\genet\\genet\\models\\DeepSpCas9/PreTrain-Final-3-5-7-100-70-40-0.001-550-80-60.meta\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "from genet.models import LoadModel\n", - "\n", - "model = LoadModel('SpCas9')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "seq_wt = 'ATGACAATAAAAGACAACACCCTTGCCTTGTGGAGTTTTCAAAGCTCCCAGAAACTGAGAAGAACTATAACCTGCAAATGTCAACTGAAACCTTAAAGTGAGTATTTAATTGAGCTGAAGT'\n", - "seq_ed = 'ATGACAATAAAAGACAACACCCTTGCCTTGTGGAGTTTTCAAAGCTCCCAGAAACTGAGACGAACTATAACCTGCAAATGTCAACTGAAACCTTAAAGTGAGTATTTAATTGAGCTGAAGT'\n", - "alt_type = 'sub1'\n", - "\n", - "df_pe = prd.pe_score(seq_wt, seq_ed, alt_type)\n", - "df_pe.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_test = df_pe['Edited74_On'].str.replace('x', '')\n", - "df_test" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "selected = ['WT74_On', 'PBSlen', 'RTlen', 'RT-PBSlen', 'Edit_pos', 'Edit_len', 'RHA_len', 'PE2max_score']\n", - "df_pe_summary = df_pe[selected]\n", - "df_pe_summary.insert(1, 'RT-PBSseq', list_rtpbs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# genet ToDo\n", - "\n", - "# 1. output dataframe 간단하게 표현하기\n", - "# 2. GetGene 불러와서 Base editing용 gRNA 디자인해주기" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "\n", - "class LogStreamHandler(logging.StreamHandler):\n", - " def __init__(self, stream=None):\n", - " super().__init__(stream)\n", - " self.setFormatter(logging.Formatter(\n", - " '%(levelname)-5s @ %(asctime)s:\\n\\t %(message)s \\n',\n", - " datefmt='%a, %d %b %Y %H:%M:%S',\n", - " ))\n", - " self.setLevel(logging.INFO)\n", - "\n", - "class StatusFormatter(logging.Formatter):\n", - " def format(self, record):\n", - " record.percent_complete = ''\n", - " if record.args and 'percent_complete' in record.args:\n", - " record.percent_complete = '{0:.2f}% '.format(record.args['percent_complete'])\n", - " self.last_percent_complete = record.percent_complete\n", - " elif hasattr(self, 'last_percent_complete'): # if we don't have a percent complete, use the last one\n", - " record.percent_complete = self.last_percent_complete\n", - " return super().format(record)\n", - "\n", - "class StatusHandler(logging.FileHandler):\n", - " def __init__(self, filename):\n", - " super().__init__(filename, 'w')\n", - " self.setFormatter(StatusFormatter('%(percent_complete)s%(message)s'))\n", - "\n", - " def emit(self, record):\n", - " \"\"\"Overwrite the existing file and write the new log.\"\"\"\n", - " if self.stream is None: # log file is empty\n", - " self.stream = self._open()\n", - " else: # log file is not empty, overwrite\n", - " self.stream.seek(0)\n", - " logging.StreamHandler.emit(self, record)\n", - " self.stream.truncate()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -417,655 +708,156 @@ " self.console_handler.setFormatter(self.formatter)\n", " self.logger.addHandler(self.console_handler)\n", "\n", - " self.info('DeepPrime: pegRNA activity prediction models\\n\\t version: %s' % genet.__version__)\n", - "\n", - "\n", - " return None\n", - "\n", - " # def set_logging: END\n", - "\n", - "\n", - " def check_input(self):\n", - " \n", - " if self.pbs_min < 1:\n", - " self.error('sID:%s\\nPlease set PBS max length at least 1nt' % self.sID)\n", - " raise ValueError('Please check your input: pbs_min')\n", - " \n", - " if self.pbs_max > 17:\n", - " self.error('sID:%s\\nPlease set PBS max length upto 17nt' % self.sID)\n", - " raise ValueError('Please check your input: pbs_max')\n", - " \n", - " if self.rtt_max > 40:\n", - " self.error('sID:%s\\nPlease set RTT max length upto 40nt' % self.sID)\n", - " raise ValueError('Please check your input: rtt_max')\n", - "\n", - " if self.edit_type not in ['sub', 'ins', 'del']:\n", - " self.error('sID:%s\\n\\t Please select proper edit type.\\n\\t Available edit tyle: sub, ins, del' % self.sID)\n", - " raise ValueError('Please check your input: edit_type')\n", - "\n", - " if self.edit_len > 3:\n", - " self.error('sID:%s\\n\\t Please set edit length upto 3nt. Available edit length range: 1~3nt' % self.sID)\n", - " raise ValueError('Please check your input: edit_len')\n", - " \n", - " if self.edit_len < 1:\n", - " self.error('sID:%s\\n\\t Please set edit length at least 1nt. Available edit length range: 1~3nt' % self.sID)\n", - " raise ValueError('Please check your input: edit_len')\n", - "\n", - " self.info('Input information\\n\\t ID: %s\\n\\t Refseq: %s\\n\\t EDseq :%s' % (self.sID, self.Ref_seq, self.ED_seq))\n", - "\n", - " return None\n", - " \n", - " # def check_input: END\n", - "\n", - "\n", - " def do_something(self):\n", - " self.logger.info('Something happened.')\n", - "\n", - " return None\n", - "\n", - " # def do_something: END\n", - " \n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO @ Tue, 02 May 2023 19:22:44:\n", - "\t DeepPrime: pegRNA activity prediction models\n", - "\t version: 0.5.2 \n", - "\n", - "INFO @ Tue, 02 May 2023 19:22:44:\n", - "\t Input information\n", - "\t ID: Sample_1\n", - "\t Refseq: ATGACAATAAAAGACAACACCCTTGCCTTGTGGAGTTTTCAAAGCTCCCAGAAACTGAGAAGAACTATAACCTGCAAATGTCAACTGAAACCTTAAAGTGAGTATTTAATTGAGCTGAAGT\n", - "\t EDseq :ATGACAATAAAAGACAACACCCTTGCCTTGTGGAGTTTTCAAAGCTCCCAGAAACTGAGACGAACTATAACCTGCAAATGTCAACTGAAACCTTAAAGTGAGTATTTAATTGAGCTGAAGT \n", - "\n", - "INFO @ Tue, 02 May 2023 19:22:44:\n", - "\t Created an instance of DeepPrime \n", - "\n" - ] - } - ], - "source": [ - "ins_a = DeepPrime(sID='Sample_1',\n", - " Ref_seq='ATGACAATAAAAGACAACACCCTTGCCTTGTGGAGTTTTCAAAGCTCCCAGAAACTGAGAAGAACTATAACCTGCAAATGTCAACTGAAACCTTAAAGTGAGTATTTAATTGAGCTGAAGT', \n", - " ED_seq='ATGACAATAAAAGACAACACCCTTGCCTTGTGGAGTTTTCAAAGCTCCCAGAAACTGAGACGAACTATAACCTGCAAATGTCAACTGAAACCTTAAAGTGAGTATTTAATTGAGCTGAAGT',\n", - " edit_type='sub',\n", - " edit_len=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO @ Tue, 02 May 2023 19:22:45:\n", - "\t Something happened. \n", - "\n" - ] - } - ], - "source": [ - "ins_a.do_something()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO @ Tue, 02 May 2023 19:22:46:\n", - "\t Something happened. \n", - "\n" - ] - } - ], - "source": [ - "ins_a.do_something()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "ins_b = DeepPrime(sID='Sample_2',\n", - " Ref_seq='ATGACAATAAAAGACAACACCCTTGCCTTGTGGAGTTTTCAAAGCTCCCAGAAACTGAGAAGAACTATAACCTGCAAATGTCAACTGAAACCTTAAAGTGAGTATTTAATTGAGCTGAAGT', \n", - " ED_seq='ATGACAATAAAAGACAACACCCTTGCCTTGTGGAGTTTTCAAAGCTCCCAGAAACTGAGACGAACTATAACCTGCAAATGTCAACTGAAACCTTAAAGTGAGTATTTAATTGAGCTGAAGT',\n", - " edit_type='sub',\n", - " edit_len=1,\n", - " silence=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "ins_b.do_something()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO @ Tue, 02 May 2023 19:22:54:\n", - "\t DeepPrime: pegRNA activity prediction models\n", - "\t version: 0.5.2 \n", - "\n", - "ERROR @ Tue, 02 May 2023 19:22:54:\n", - "\t sID:Sample_3\n", - "\t Please set edit length upto 3nt. Available edit length range: 1~3nt \n", - "\n", - "INFO @ Tue, 02 May 2023 19:22:54:\n", - "\t Input information\n", - "\t ID: Sample_3\n", - "\t Refseq: ATGACAATAAAAGACAACACCCTTGCCTTGTGGAGTTTTCAAAGCTCCCAGAAACTGAGAAGAACTATAACCTGCAAATGTCAACTGAAACCTTAAAGTGAGTATTTAATTGAGCTGAAGT\n", - "\t EDseq :ATGACAATAAAAGACAACACCCTTGCCTTGTGGAGTTTTCAAAGCTCCCAGAAACTGAGACGAACTATAACCTGCAAATGTCAACTGAAACCTTAAAGTGAGTATTTAATTGAGCTGAAGT \n", - "\n", - "INFO @ Tue, 02 May 2023 19:22:54:\n", - "\t Created an instance of DeepPrime \n", - "\n" - ] - } - ], - "source": [ - "ins_c = DeepPrime(sID='Sample_3',\n", - " Ref_seq='ATGACAATAAAAGACAACACCCTTGCCTTGTGGAGTTTTCAAAGCTCCCAGAAACTGAGAAGAACTATAACCTGCAAATGTCAACTGAAACCTTAAAGTGAGTATTTAATTGAGCTGAAGT', \n", - " ED_seq='ATGACAATAAAAGACAACACCCTTGCCTTGTGGAGTTTTCAAAGCTCCCAGAAACTGAGACGAACTATAACCTGCAAATGTCAACTGAAACCTTAAAGTGAGTATTTAATTGAGCTGAAGT',\n", - " edit_type='sub',\n", - " edit_len=4,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO @ Tue, 02 May 2023 19:17:52:\n", - "\t Something happened. \n", - "\n" - ] - } - ], - "source": [ - "ins_c.do_something()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "Error!@@!@!@", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32me:\\github_project\\genet\\test_genet_prd.ipynb Cell 20\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39msys\u001b[39;00m\u001b[39m,\u001b[39m \u001b[39mlogging\u001b[39;00m\n\u001b[0;32m 4\u001b[0m logger \u001b[39m=\u001b[39m logging\u001b[39m.\u001b[39mgetLogger(\u001b[39m'\u001b[39m\u001b[39mexample\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m----> 7\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m'\u001b[39m\u001b[39mError!@@!@!@\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m 9\u001b[0m logger\u001b[39m.\u001b[39merror(\u001b[39m'\u001b[39m\u001b[39mError\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m 13\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mError occured\u001b[39m\u001b[39m'\u001b[39m)\n", - "\u001b[1;31mValueError\u001b[0m: Error!@@!@!@" - ] - } - ], - "source": [ - "import sys, logging\n", - "\n", - "\n", - "logger = logging.getLogger('example')\n", - "\n", - "\n", - "raise ValueError('Error!@@!@!@')\n", - "\n", - "logger.error('Error')\n", - "\n", - "\n", - "\n", - "print('Error occured')\n", - "sys.exit(1)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The model DeepSpCas9 is not installed. Download checkpoint files.\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading: 0KB [00:00, ?KB/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "File downloaded successfully: e:\\github_project\\genet\\genet\\models\\DeepSpCas9/__init__.py\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading: 681KB [00:01, 469.31KB/s] \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "File downloaded successfully: e:\\github_project\\genet\\genet\\models\\DeepSpCas9/PreTrain-Final-3-5-7-100-70-40-0.001-550-80-60.data-00000-of-00001\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading: 1KB [00:00, 332.93KB/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "File downloaded successfully: e:\\github_project\\genet\\genet\\models\\DeepSpCas9/PreTrain-Final-3-5-7-100-70-40-0.001-550-80-60.index\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading: 29KB [00:00, 198.62KB/s] \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "File downloaded successfully: e:\\github_project\\genet\\genet\\models\\DeepSpCas9/PreTrain-Final-3-5-7-100-70-40-0.001-550-80-60.meta\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "e:\\github_project\\genet\\genet\\predict\\DeepSpCas9.py:100: UserWarning: `tf.layers.dropout` is deprecated and will be removed in a future version. Please use `tf.keras.layers.Dropout` instead.\n", - " out_layer = tf.compat.v1.layers.dropout(tf.compat.v1.nn.relu(out_layer), 0.3, self.is_training)\n", - "c:\\Users\\gsyu\\miniconda3\\envs\\genet\\lib\\site-packages\\keras\\legacy_tf_layers\\core.py:413: UserWarning: `layer.apply` is deprecated and will be removed in a future version. Please use `layer.__call__` method instead.\n", - " return layer.apply(inputs, training=training)\n", - "e:\\github_project\\genet\\genet\\predict\\DeepSpCas9.py:133: UserWarning: `tf.layers.dropout` is deprecated and will be removed in a future version. Please use `tf.keras.layers.Dropout` instead.\n", - " L_fcl1_drop = tf.compat.v1.layers.dropout(L_fcl1, 0.3, self.is_training)\n", - "e:\\github_project\\genet\\genet\\predict\\DeepSpCas9.py:140: UserWarning: `tf.layers.dropout` is deprecated and will be removed in a future version. Please use `tf.keras.layers.Dropout` instead.\n", - " L_fcl2_drop = tf.compat.v1.layers.dropout(L_fcl2, 0.3, self.is_training)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
SequenceSpCas9
0TCACCTTCGTTTTTTTCCTTCTGCAGGAGG2.801168
1CCTTCGTTTTTTTCCTTCTGCAGGAGGACA2.253283
2CTTTCAAGAACTCTTCCACCTCCATGGTGT53.431831
\n", - "
" - ], - "text/plain": [ - " Sequence SpCas9\n", - "0 TCACCTTCGTTTTTTTCCTTCTGCAGGAGG 2.801168\n", - "1 CCTTCGTTTTTTTCCTTCTGCAGGAGGACA 2.253283\n", - "2 CTTTCAAGAACTCTTCCACCTCCATGGTGT 53.431831" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from genet.predict import SpCas9\n", - "\n", - "spcas = SpCas9()\n", + " self.info('DeepPrime: pegRNA activity prediction models\\n\\t version: %s' % genet.__version__)\n", "\n", - "list_target30 = [\n", - " 'TCACCTTCGTTTTTTTCCTTCTGCAGGAGG',\n", - " 'CCTTCGTTTTTTTCCTTCTGCAGGAGGACA',\n", - " 'CTTTCAAGAACTCTTCCACCTCCATGGTGT',\n", - " ]\n", "\n", - "df_out = spcas.predict(list_target30)\n", + " return None\n", "\n", - "df_out" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
SequenceSpCas9
0TCACCTTCGTTTTTTTCCTTCTGCAGGAGG2.801168
1CCTTCGTTTTTTTCCTTCTGCAGGAGGACA2.253283
2CTTTCAAGAACTCTTCCACCTCCATGGTGT53.431831
\n", - "
" - ], - "text/plain": [ - " Sequence SpCas9\n", - "0 TCACCTTCGTTTTTTTCCTTCTGCAGGAGG 2.801168\n", - "1 CCTTCGTTTTTTTCCTTCTGCAGGAGGACA 2.253283\n", - "2 CTTTCAAGAACTCTTCCACCTCCATGGTGT 53.431831" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from genet.predict import SpCas9\n", + " # def set_logging: END\n", + "\n", + "\n", + " def check_input(self):\n", + " \n", + " if self.pbs_min < 1:\n", + " self.error('sID:%s\\nPlease set PBS max length at least 1nt' % self.sID)\n", + " raise ValueError('Please check your input: pbs_min')\n", + " \n", + " if self.pbs_max > 17:\n", + " self.error('sID:%s\\nPlease set PBS max length upto 17nt' % self.sID)\n", + " raise ValueError('Please check your input: pbs_max')\n", + " \n", + " if self.rtt_max > 40:\n", + " self.error('sID:%s\\nPlease set RTT max length upto 40nt' % self.sID)\n", + " raise ValueError('Please check your input: rtt_max')\n", + "\n", + " if self.edit_type not in ['sub', 'ins', 'del']:\n", + " self.error('sID:%s\\n\\t Please select proper edit type.\\n\\t Available edit tyle: sub, ins, del' % self.sID)\n", + " raise ValueError('Please check your input: edit_type')\n", + "\n", + " if self.edit_len > 3:\n", + " self.error('sID:%s\\n\\t Please set edit length upto 3nt. Available edit length range: 1~3nt' % self.sID)\n", + " raise ValueError('Please check your input: edit_len')\n", + " \n", + " if self.edit_len < 1:\n", + " self.error('sID:%s\\n\\t Please set edit length at least 1nt. Available edit length range: 1~3nt' % self.sID)\n", + " raise ValueError('Please check your input: edit_len')\n", + "\n", + " self.info('Input information\\n\\t ID: %s\\n\\t Refseq: %s\\n\\t EDseq :%s' % (self.sID, self.Ref_seq, self.ED_seq))\n", + "\n", + " return None\n", + " \n", + " # def check_input: END\n", "\n", - "spcas = SpCas9()\n", "\n", - "list_target30 = [\n", - " 'TCACCTTCGTTTTTTTCCTTCTGCAGGAGG',\n", - " 'CCTTCGTTTTTTTCCTTCTGCAGGAGGACA',\n", - " 'CTTTCAAGAACTCTTCCACCTCCATGGTGT',\n", - " ]\n", + " def do_something(self):\n", + " self.logger.info('Something happened.')\n", "\n", - "df_out = spcas.predict_dev(list_target30)\n", + " return None\n", "\n", - "df_out" + " # def do_something: END\n", + " \n", + "\n" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
SequenceSpCas9
0TCACCTTCGTTTTTTTCCTTCTGCAGGAGG2.801168
1CCTTCGTTTTTTTCCTTCTGCAGGAGGACA2.253283
2CTTTCAAGAACTCTTCCACCTCCATGGTGT53.431831
\n", - "
" - ], - "text/plain": [ - " Sequence SpCas9\n", - "0 TCACCTTCGTTTTTTTCCTTCTGCAGGAGG 2.801168\n", - "1 CCTTCGTTTTTTTCCTTCTGCAGGAGGACA 2.253283\n", - "2 CTTTCAAGAACTCTTCCACCTCCATGGTGT 53.431831" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO @ Tue, 02 May 2023 19:22:44:\n", + "\t DeepPrime: pegRNA activity prediction models\n", + "\t version: 0.5.2 \n", + "\n", + "INFO @ Tue, 02 May 2023 19:22:44:\n", + "\t Input information\n", + "\t ID: Sample_1\n", + "\t Refseq: ATGACAATAAAAGACAACACCCTTGCCTTGTGGAGTTTTCAAAGCTCCCAGAAACTGAGAAGAACTATAACCTGCAAATGTCAACTGAAACCTTAAAGTGAGTATTTAATTGAGCTGAAGT\n", + "\t EDseq :ATGACAATAAAAGACAACACCCTTGCCTTGTGGAGTTTTCAAAGCTCCCAGAAACTGAGACGAACTATAACCTGCAAATGTCAACTGAAACCTTAAAGTGAGTATTTAATTGAGCTGAAGT \n", + "\n", + "INFO @ Tue, 02 May 2023 19:22:44:\n", + "\t Created an instance of DeepPrime \n", + "\n" + ] } ], "source": [ - "from genet.predict import SpCas9\n", - "\n", - "# dev 중인 것\n", - "\n", - "spcas = SpCas9()\n", - "\n", - "list_target30 = [\n", - " 'TCACCTTCGTTTTTTTCCTTCTGCAGGAGG',\n", - " 'CCTTCGTTTTTTTCCTTCTGCAGGAGGACA',\n", - " 'CTTTCAAGAACTCTTCCACCTCCATGGTGT',\n", - " ]\n", - "\n", - "df_out = spcas.predict_dev(list_target30)\n", - "\n", - "df_out" + "ins_a = DeepPrime(sID='Sample_1',\n", + " Ref_seq='ATGACAATAAAAGACAACACCCTTGCCTTGTGGAGTTTTCAAAGCTCCCAGAAACTGAGAAGAACTATAACCTGCAAATGTCAACTGAAACCTTAAAGTGAGTATTTAATTGAGCTGAAGT', \n", + " ED_seq='ATGACAATAAAAGACAACACCCTTGCCTTGTGGAGTTTTCAAAGCTCCCAGAAACTGAGACGAACTATAACCTGCAAATGTCAACTGAAACCTTAAAGTGAGTATTTAATTGAGCTGAAGT',\n", + " edit_type='sub',\n", + " edit_len=1)" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\gsyu\\miniconda3\\envs\\genet\\lib\\site-packages\\tqdm\\auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" + "INFO @ Tue, 02 May 2023 19:22:45:\n", + "\t Something happened. \n", + "\n" ] } ], "source": [ - "from genet.predict import CasVariant\n", - "\n", - "cas_ng = CasVariant('SpCas9-NG')" + "ins_a.do_something()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "ins_b = DeepPrime(sID='Sample_2',\n", + " Ref_seq='ATGACAATAAAAGACAACACCCTTGCCTTGTGGAGTTTTCAAAGCTCCCAGAAACTGAGAAGAACTATAACCTGCAAATGTCAACTGAAACCTTAAAGTGAGTATTTAATTGAGCTGAAGT', \n", + " ED_seq='ATGACAATAAAAGACAACACCCTTGCCTTGTGGAGTTTTCAAAGCTCCCAGAAACTGAGACGAACTATAACCTGCAAATGTCAACTGAAACCTTAAAGTGAGTATTTAATTGAGCTGAAGT',\n", + " edit_type='sub',\n", + " edit_len=1,\n", + " silence=True)" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 8, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
SequenceSpCas9-NG
0TCACCTTCGTTTTTTTCCTTCTGCAGGAGG0.618299
1CCTTCGTTTTTTTCCTTCTGCAGGAGGACA1.134845
2CTTTCAAGAACTCTTCCACCTCCATGGTGT36.743580
\n", - "
" - ], - "text/plain": [ - " Sequence SpCas9-NG\n", - "0 TCACCTTCGTTTTTTTCCTTCTGCAGGAGG 0.618299\n", - "1 CCTTCGTTTTTTTCCTTCTGCAGGAGGACA 1.134845\n", - "2 CTTTCAAGAACTCTTCCACCTCCATGGTGT 36.743580" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" + "ename": "ValueError", + "evalue": "Error!@@!@!@", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32me:\\github_project\\genet\\test_genet_prd.ipynb Cell 20\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39msys\u001b[39;00m\u001b[39m,\u001b[39m \u001b[39mlogging\u001b[39;00m\n\u001b[0;32m 4\u001b[0m logger \u001b[39m=\u001b[39m logging\u001b[39m.\u001b[39mgetLogger(\u001b[39m'\u001b[39m\u001b[39mexample\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m----> 7\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m'\u001b[39m\u001b[39mError!@@!@!@\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m 9\u001b[0m logger\u001b[39m.\u001b[39merror(\u001b[39m'\u001b[39m\u001b[39mError\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m 13\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mError occured\u001b[39m\u001b[39m'\u001b[39m)\n", + "\u001b[1;31mValueError\u001b[0m: Error!@@!@!@" + ] } ], "source": [ - "list_target30 = [\n", - " 'TCACCTTCGTTTTTTTCCTTCTGCAGGAGG',\n", - " 'CCTTCGTTTTTTTCCTTCTGCAGGAGGACA',\n", - " 'CTTTCAAGAACTCTTCCACCTCCATGGTGT',\n", - " ]\n", + "import sys, logging\n", + "\n", + "\n", + "logger = logging.getLogger('example')\n", "\n", - "df_out = cas_ng.predict(list_target30)\n", - "df_out\n" + "\n", + "raise ValueError('Error!@@!@!@')\n", + "\n", + "logger.error('Error')\n", + "\n", + "\n", + "\n", + "print('Error occured')\n", + "sys.exit(1)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -1084,7 +876,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.8.13" }, "orig_nbformat": 4, "vscode": { diff --git a/test_models.ipynb b/test_models.ipynb index c025a44..a30d30c 100644 --- a/test_models.ipynb +++ b/test_models.ipynb @@ -655,7 +655,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.8.12" }, "orig_nbformat": 4 }, From 1cfe71ba1c895ac7a7e155fae9e181acba701f34 Mon Sep 17 00:00:00 2001 From: Goosang Yu Date: Thu, 10 Aug 2023 00:06:54 +0900 Subject: [PATCH 3/5] =?UTF-8?q?=E2=9C=85=20update=20search=20method=20in?= =?UTF-8?q?=20SpCas9=20model?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- genet/models/constants.py | 8 +++- genet/predict/DeepSpCas9.py | 59 +++++++++++++++++++++++++---- genet/predict/DeepSpCas9Variants.py | 58 ++++++++++++++++++++++++---- 3 files changed, 107 insertions(+), 18 deletions(-) diff --git a/genet/models/constants.py b/genet/models/constants.py index b6b4be9..55af537 100644 --- a/genet/models/constants.py +++ b/genet/models/constants.py @@ -10,14 +10,18 @@ 'SpCas9': { 'type': 'DeepSpCas9', 'repo': 'Goosang-Yu/genet-models/main/genet_models', - 'path': 'DeepSpCas9' + 'path': 'DeepSpCas9', + 'regex': {'+': '[ATGC]{25}GG[ATGC]{3}', + '-': '[ATGC]{3}CC[ATGC]{25}',}, }, # DeepSpCas9variants 'SpCas9-NG': { 'type': 'DeepSpCas9variants', 'repo': 'Goosang-Yu/genet-models/main/genet_models', - 'path': 'DeepSpCas9variants/PAM_variant_NG' + 'path': 'DeepSpCas9variants/PAM_variant_NG', + 'regex': {'+': '[ATGC]{25}G[ATGC]{4}', + '-': '[ATGC]{4}C[ATGC]{25}',}, }, 'SpCas9-NRCH': { 'type': 'DeepSpCas9variants', diff --git a/genet/predict/DeepSpCas9.py b/genet/predict/DeepSpCas9.py index 53bfa7b..e5bae73 100644 --- a/genet/predict/DeepSpCas9.py +++ b/genet/predict/DeepSpCas9.py @@ -1,4 +1,4 @@ -import os, sys +import os, sys, regex import numpy as np import pandas as pd @@ -26,10 +26,9 @@ def __init__(self, gpu_env=0): 'CCTTCGTTTTTTTCCTTCTGCAGGAGGACA', 'CTTTCAAGAACTCTTCCACCTCCATGGTGT', ] - - >>> list_out = spcas9_score(list_target30) + >>> deepspcas9 = genet.predict.SpCas9() - >>> list_out = [2.80322408676147, 2.25273704528808, 53.4233360290527] + >>> spcas_score = deepspcas9(list_target30) ''' # TensorFlow config @@ -37,8 +36,8 @@ def __init__(self, gpu_env=0): self.conf.gpu_options.allow_growth = True os.environ['CUDA_VISIBLE_DEVICES'] = '%d' % gpu_env - model_info = LoadModel('DeepSpCas9', 'SpCas9') - model_dir = model_info.model_dir + self.model = LoadModel('DeepSpCas9', 'SpCas9') + model_dir = self.model.model_dir best_model = 'PreTrain-Final-3-5-7-100-70-40-0.001-550-80-60' self.model_save = '%s/%s' % (model_dir, best_model) @@ -59,12 +58,12 @@ def predict(self, list_target30: list) -> pd.DataFrame: with tf.compat.v1.Session(config=self.conf) as sess: sess.run(tf.compat.v1.global_variables_initializer()) - model = DeepCas9(self.params[0], self.params[1], 80, 60, self.params[2]) + interpreter = DeepCas9(self.params[0], self.params[1], 80, 60, self.params[2]) saver = tf.compat.v1.train.Saver() saver.restore(sess, self.model_save) - list_score = Model_Finaltest(sess, seq_processed, model) + list_score = Model_Finaltest(sess, seq_processed, interpreter) df_out = pd.DataFrame() df_out['Target'] = list_target30 @@ -73,6 +72,50 @@ def predict(self, list_target30: list) -> pd.DataFrame: return df_out + def search(self, seq: str) -> pd.DataFrame: + '''주어진 sequence 내에 가능한 모든 target sequence를 찾고, + 그 정보와 예측 점수를 계산하는 method + ''' + + self.seq = seq.upper() + dict_re = self.model.info['regex'] + + seq_target, seq_guide, seq_strand, pos_start, pos_end = [], [], [], [], [] + + for strand in ['+', '-']: + ptn = dict_re[strand] + + for re_idx in regex.finditer(ptn, self.seq, overlapped=True): + if strand == '+': match = re_idx.group() + else : match = reverse_complement(re_idx.group()) + + seq_target.append(match) + seq_guide.append(match[4:24]) + seq_strand.append(strand) + pos_start.append(re_idx.start()) + pos_end.append(re_idx.end()) + + + seq_processed = preprocess_seq(seq_target, 30) + + with tf.compat.v1.Session(config=self.conf) as sess: + sess.run(tf.compat.v1.global_variables_initializer()) + interpreter = DeepCas9(self.params[0], self.params[1], 80, 60, self.params[2]) + + saver = tf.compat.v1.train.Saver() + saver.restore(sess, self.model_save) + + list_score = Model_Finaltest(sess, seq_processed, interpreter) + + df_out = pd.DataFrame({'Target': seq_target, + 'Spacer': seq_guide, + 'Strand': seq_strand, + 'Start' : pos_start, + 'End' : pos_end, + 'SpCas9': list_score}) + + return df_out + def Model_Finaltest(sess, TEST_X, model): test_batch = 500 diff --git a/genet/predict/DeepSpCas9Variants.py b/genet/predict/DeepSpCas9Variants.py index 931e2fb..7d1bf3f 100644 --- a/genet/predict/DeepSpCas9Variants.py +++ b/genet/predict/DeepSpCas9Variants.py @@ -1,8 +1,8 @@ # Reference: https://blog.naver.com/PostView.naver?blogId=seodaewoo&logNo=222043145688&parentCategoryNo=&categoryNo=62&viewDate=&isShowPopularPosts=false&from=postView - -import tensorflow as tf +import regex import numpy as np import pandas as pd +import tensorflow as tf from genet.predict.PredUtils import * from genet.models import LoadModel @@ -20,13 +20,11 @@ def __init__(self, effector:str): 'CCTTCGTTTTTTTCCTTCTGCAGGAGGACA', 'CTTTCAAGAACTCTTCCACCTCCATGGTGT', ] - \n ''' - self.effector = effector - - self.model_info = LoadModel('DeepSpCas9variants', effector) - self.model_dir = self.model_info.model_dir + self.effector = effector + self.model = LoadModel('DeepSpCas9variants', effector) + self.model_dir = self.model.model_dir def predict(self, list_target30: list) -> pd.DataFrame: @@ -71,7 +69,51 @@ def predict(self, list_target30: list) -> pd.DataFrame: df_out[self.effector] = list_out - return df_out + return df_out + + def search(self, seq: str) -> pd.DataFrame: + '''주어진 sequence 내에 가능한 모든 target sequence를 찾고, + 그 정보와 예측 점수를 계산하는 method + ''' + + self.seq = seq.upper() + dict_re = self.model.info['regex'] + + seq_target, seq_guide, seq_strand, pos_start, pos_end = [], [], [], [], [] + + for strand in ['+', '-']: + ptn = dict_re[strand] + + for re_idx in regex.finditer(ptn, self.seq, overlapped=True): + if strand == '+': match = re_idx.group() + else : match = reverse_complement(re_idx.group()) + + seq_target.append(match) + seq_guide.append(match[4:24]) + seq_strand.append(strand) + pos_start.append(re_idx.start()) + pos_end.append(re_idx.end()) + + + seq_processed = preprocess_seq(seq_target, 30) + + with tf.compat.v1.Session(config=self.conf) as sess: + sess.run(tf.compat.v1.global_variables_initializer()) + interpreter = DeepCas9(self.params[0], self.params[1], 80, 60, self.params[2]) + + saver = tf.compat.v1.train.Saver() + saver.restore(sess, self.model_save) + + list_score = Model_Finaltest(sess, seq_processed, interpreter) + + df_out = pd.DataFrame({'Target': seq_target, + 'Spacer': seq_guide, + 'Strand': seq_strand, + 'Start' : pos_start, + 'End' : pos_end, + 'SpCas9': list_score}) + + return df_out From dd4f7d21072c50a3bc3c57832168d599cd76f6b7 Mon Sep 17 00:00:00 2001 From: Goosang Yu Date: Thu, 10 Aug 2023 00:07:38 +0900 Subject: [PATCH 4/5] =?UTF-8?q?=E2=9C=85=20change=20obj=20name=20of=20Load?= =?UTF-8?q?Model?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- genet/models/functional.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/genet/models/functional.py b/genet/models/functional.py index fb25006..b07e915 100644 --- a/genet/models/functional.py +++ b/genet/models/functional.py @@ -27,13 +27,13 @@ def __init__(self, model:str, effector:str, cell_type=None): # 이 모델이 genet에서 지원하는 것인지 확인하기 try: - self.model_info = models.constants.dict_model_info[model_type] + self.info = models.constants.dict_model_info[model_type] except: print('[Warning] Not available model in GenET!') sys.exit() # model_dir: - self.model_dir = inspect.getfile(models).replace('__init__.py', '') + self.model_info['path'] + self.model_dir = inspect.getfile(models).replace('__init__.py', '') + self.info['path'] # 만약 모델이 아직 다운로드 되지 않았다면, 다운로드 하기. if not os.path.exists(self.model_dir): @@ -42,9 +42,9 @@ def __init__(self, model:str, effector:str, cell_type=None): dict_files = models.constants.dict_model_requests self.download_from_github( - repo = self.model_info['repo'], - path = self.model_info['path'], - files = dict_files[self.model_info['type']], + repo = self.info['repo'], + path = self.info['path'], + files = dict_files[self.info['type']], save_dir = self.model_dir, ) From 16bdf57bb8fec04aa27a04620a37c4280632846b Mon Sep 17 00:00:00 2001 From: Goosang Yu Date: Thu, 10 Aug 2023 00:07:57 +0900 Subject: [PATCH 5/5] =?UTF-8?q?=F0=9F=93=9D=20Update=20readme?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/README.md b/README.md index 547b5ca..d0f7ce0 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,26 @@ df_out = spcas.predict(list_target) | 1 | CCTTCGTTTTTTTCCTTCTGCAGGAGGACA | CGTTTTTTTCCTTCTGCAGG | 2.253288 | | 2 | CTTTCAAGAACTCTTCCACCTCCATGGTGT | CAAGAACTCTTCCACCTCCA | 53.43182 | +Alternatively, you can identify all possible SpCas9 target sites within an extensive gene sequence and obtain predictive scores. +```python +from genet.predict import SpCas9 + +# Put the whole sequence context that you want to find Cas9 target site. +gene = 'ttcagctctacgtctcctccgagagccgcttcaacaccctggccgagttggttcatcatcattcaacggtggccgacgggctcatcaccacgctccattatccagccccaaagcgcaacaagcccactgtctatggtgtgtcccccaactacgacaagtgggagatggaacgcacggacatcaccatgaagcacaagctgggcgggggccagtacggggaggtgtacgagggcgtgtggaagaaatacagcctgacggtggccgtgaagaccttgaaggtagg' + +spcas = SpCas9() +df_out = spcas.search(gene) + +>>> df_out.head() +``` +| | Target | Spacer | Strand | Start | End | SpCas9 | +| - | ------------------------------ | -------------------- | ------ | ----- | --- | -------- | +| 0 | CCTCCGAGAGCCGCTTCAACACCCTGGCCG | CGAGAGCCGCTTCAACACCC | + | 15 | 45 | 67.39446 | +| 1 | GCCGCTTCAACACCCTGGCCGAGTTGGTTC | CTTCAACACCCTGGCCGAGT | + | 24 | 54 | 27.06508 | +| 2 | CCGAGTTGGTTCATCATCATTCAACGGTGG | GTTGGTTCATCATCATTCAA | + | 42 | 72 | 34.11356 | +| 3 | AGTTGGTTCATCATCATTCAACGGTGGCCG | GGTTCATCATCATTCAACGG | + | 45 | 75 | 76.43662 | +| 4 | TCATCATCATTCAACGGTGGCCGACGGGCT | CATCATTCAACGGTGGCCGA | + | 52 | 82 | 29.63767 | + ## Tutorial 2: Predict SpCas9variants activity (by DeepSpCas9variants) DeepSpCas9 is a prediction model developed to evaluate to indel frequency introduced by sgRNAs at specific target sites mediated by the SpCas9 PAM variants ([Kim et al. Nat.Biotechnol. 2020](https://doi.org/10.1038/s41587-020-0537-9)). The model was developed on tensorflow (version >= 2.6). Any dependent packages will be installed along with the GenET package.