base kegg

mpg-age-bioinformatics · Oct 18, 2024 · dff1ab7 · dff1ab7
1 parent 761d2b0
commit dff1ab7
Show file tree

Hide file tree

Showing 2 changed files with 244 additions and 24 deletions.
diff --git a/routes/apps/_kegg.py b/routes/apps/_kegg.py
@@ -5,6 +5,18 @@
 from io import StringIO
 from datetime import datetime
 import os
+import glob
+import time
+
+import Bio
+from Bio import SeqIO
+from Bio.KEGG.REST import *
+from Bio.KEGG.KGML import KGML_parser
+from Bio.Graphics.KGML_vis import KGMLCanvas
+from Bio.Graphics.ColorSpiral import ColorSpiral
+#from IPython.display import Image, HTML
+import tempfile
+
 
 PYFLASKI_VERSION=os.environ['PYFLASKI_VERSION']
 PYFLASKI_VERSION=str(PYFLASKI_VERSION)
@@ -18,27 +30,177 @@ def _read_compound_pathway(path_to_files=path_to_files):
         return df.to_json()
     return pd.read_json(StringIO(_read_compound_pathway()))
 
-def read_pathway(cache, path_to_files=path_to_files):
+def read_pathway_organism(cache, path_to_files=path_to_files):
+    # Load pathway_organisms.tsv (tab-separated, no header; columns pathway_id,
+    # pathway_name, organisms) as a DataFrame. The inner reader returns JSON and
+    # is memoized through cache.memoize(60*60*2); the JSON is parsed back into a
+    # DataFrame on every call.
     @cache.memoize(60*60*2) 
-    def _read_pathway(path_to_files=path_to_files):
-        df=pd.read_csv(f"{path_to_files}/pathway_list.tsv", sep="\t", names=['pathway_id', 'pathway_name'])
+    def _read_pathway_organism(path_to_files=path_to_files):
+        df=pd.read_csv(f"{path_to_files}/pathway_organisms.tsv", sep="\t", names=['pathway_id', 'pathway_name', 'organisms'])
         return df.to_json()
-    return pd.read_json(StringIO(_read_pathway()))
+    return pd.read_json(StringIO(_read_pathway_organism()))
 
 def compound_options(cache):
+    # Build one option dict per row of the compound/pathway table:
+    # label "<compound_id>: <compound_name>", value <compound_id>.
     compound_pathway_data=read_compound_pathway(cache)
     return [{'label': f"{cid}: {cname}", 'value': cid} for cid, cname in zip(compound_pathway_data['compound_id'], compound_pathway_data['compound_name'])]
 
 def pathway_options(cache, compound_list):
+    # Build pathway options (label "<pathway_id>: <pathway_name>", value
+    # <pathway_id>) for the pathways listed for any compound in compound_list.
+    # Returns None when no pathway values are found for the selection.
     compound_pathway_data=read_compound_pathway(cache)
-    pathway_data=read_pathway(cache)
+    pathway_organism_data=read_pathway_organism(cache)
 
     cp_row=compound_pathway_data[compound_pathway_data['compound_id'].isin(compound_list)]
     pathways_values = cp_row['pathways'].tolist()
     if not pathways_values or pathways_values==[None]:
         return None
+    # Each 'pathways' cell is a comma-separated string; split, strip and
+    # de-duplicate the ids across all selected compounds.
     pathways_list = list(set([path.strip() for sublist in pathways_values for path in sublist.split(',')]))
-    pw_rows = pathway_data[pathway_data['pathway_id'].isin(pathways_list)]
+    pw_rows = pathway_organism_data[pathway_organism_data['pathway_id'].isin(pathways_list)]
 
     return [{'label': f"{pid}: {pname}", 'value': pid} for pid, pname in zip(pw_rows['pathway_id'], pw_rows['pathway_name'])]
 
+def organism_options(cache, pathway_id):
+    """Build dropdown options for the organisms listed for a pathway.
+
+    Args:
+        cache: cache object forwarded to ``read_pathway_organism``.
+        pathway_id: value matched against the ``pathway_id`` column.
+
+    Returns:
+        A list of ``{'label': org, 'value': org}`` dicts, one per entry of the
+        comma-separated ``organisms`` field of the first matching row, or
+        ``None`` when no row matches or the field does not hold a string.
+    """
+    pod=read_pathway_organism(cache)
+    # Filter once and reuse the result instead of repeating the lookup.
+    organisms=pod.loc[pod['pathway_id'] == pathway_id, 'organisms']
+    if organisms.empty:
+        return None
+
+    org_value=organisms.values[0]
+    # A row without an organisms column may be read back as None/NaN, which
+    # cannot be split; treat it the same as "no organism found".
+    if not isinstance(org_value, str):
+        return None
+
+    return [{'label': org, 'value': org} for org in org_value.split(',')]
+
+
+
+
+def network_pdf(selected_compound, pathway_id, organism_id):
+    """Render a KEGG pathway map to a PDF with the selected compounds highlighted.
+
+    Args:
+        selected_compound: container of compound ids; a pathway compound whose
+            name (the text after its last ":") is ``in`` it gets a red
+            background.
+        pathway_id: pathway id in which "map" is replaced by ``organism_id``
+            to form the name requested from KEGG.
+        organism_id: string substituted for "map" in ``pathway_id``.
+
+    Returns:
+        Path of the generated ``/tmp/kegg-*.pdf`` file, or ``None`` if the
+        pathway could not be fetched, parsed or drawn.
+    """
+    import logging
+
+    # Clean up previous kegg files: remove all of them once more than 20 have
+    # piled up, otherwise only those older than 30 minutes.
+    kegg_files = glob.glob("/tmp/kegg-*.pdf")
+    remove_all = len(kegg_files) > 20
+    for file in kegg_files:
+        try:
+            if remove_all or (time.time() - os.path.getmtime(file)) > 1800:
+                os.remove(file)
+        except OSError:
+            # Best-effort cleanup: the file may already be gone (e.g. removed
+            # by a concurrent request) and that must not fail this request.
+            pass
+
+    try:
+        pathname = pathway_id.replace("map", organism_id)
+        pathway=KGML_parser.read(kegg_get(pathname, "kgml"))
+        canvas = KGMLCanvas(pathway, import_imagemap=True)
+
+        for compound in pathway.compounds:
+            c=compound.name.split(":")[-1]
+            if c in selected_compound:
+                compound.graphics[0].bgcolor="#FF0000"
+
+        # Only the unique name is needed: close the handle right away and let
+        # the canvas write to the path (delete=False keeps the file on close).
+        with tempfile.NamedTemporaryFile(delete=False, prefix="kegg-", suffix=".pdf", dir="/tmp") as handle:
+            temp_pdf = handle.name
+        canvas.draw(temp_pdf)
+        return temp_pdf
+    except Exception:
+        # Callers treat None as "generation failed"; record the cause instead
+        # of discarding it silently.
+        logging.getLogger(__name__).exception("Failed to generate KEGG network pdf")
+        return None
+
+
+
+
+####### Generate/organize kegg data for faster use #######
+### Generate pathway_organisms.tsv with pathway_id, pathway_name, available_organisms
+
+# from Bio.KEGG import REST
+# import csv
+
+# # Fetch all pathways
+# pathway_list = REST.kegg_list('pathway').read()
+
+# # Write pathways to a TSV file without a header
+# with open('pathway_organisms.tsv', 'w', newline='') as outfile:
+#     tsv_writer = csv.writer(outfile, delimiter='\t')
+#     for line in pathway_list.splitlines():
+#         pathway_id, pathway_name = line.split('\t')
+#         tsv_writer.writerow([pathway_id, pathway_name])
+
+# # Fetch the list of all available organisms
+# organism_list = REST.kegg_list('organism').read()
+
+# # Extract the organism codes (e.g., 'hsa', 'ptr', 'pps', etc.)
+# organism_codes = [line.split('\t')[1] for line in organism_list.splitlines()]
+
+
+# # Function to get pathways for a given organism
+# def get_pathways_from_org(organism_code):
+#     pathway_list = REST.kegg_list('pathway', organism_code).read()
+#     return [line.split('\t')[0][3:] for line in pathway_list.splitlines()]
+
+# # Read TSV file, check pathways, and append organism code
+# def update_tsv_with_organism(tsv_file, organism_code):
+#     # Get the list of pathways for the organism
+#     organism_pathways = get_pathways_from_org(organism_code)
+
+#     # Read the contents of the TSV file into memory
+#     updated_rows = []
+#     with open(tsv_file, 'r') as infile:
+#         tsv_reader = csv.reader(infile, delimiter='\t')
+
+#         # Process each line of the TSV
+#         for row in tsv_reader:
+#             # Extract the pathway ID (e.g., 'map01100')
+#             pathway_id = row[0][3:]  # Remove 'map' to get the numeric part
+
+#             # Check if this pathway exists in the organism's pathways
+#             if pathway_id in organism_pathways:
+#                 # Append the organism code to the row
+#                 if len(row) < 3:
+#                     row.append(organism_code)
+#                 else:
+#                     row[2] += f",{organism_code}"  # If third column exists, append to it
+
+#             # Add the updated row to the list
+#             updated_rows.append(row)
+
+#     # Overwrite the original file with the updated data
+#     with open(tsv_file, 'w', newline='') as outfile:
+#         tsv_writer = csv.writer(outfile, delimiter='\t')
+#         tsv_writer.writerows(updated_rows)
+
+# # Update pathway_organisms.tsv file with organisms
+# tsv_file = 'pathway_organisms.tsv'
+# for org in organism_codes:
+#     update_tsv_with_organism(tsv_file, org)
+
+### Generate compound_pathways.tsv with compound_id, compound_name, available_pathways
+
+# # Function to get pathways associated with a given compound using KEGG REST API
+# def get_pathways_for_compound(compound_id):
+#     # Fetch pathways linked to the compound using the KEGG REST API
+#     link_url = f"https://rest.kegg.jp/link/pathway/{compound_id}"
+#     response = requests.get(link_url)
+
+#     # Check if the response is empty (i.e., no linked pathways found)
+#     if response.status_code != 200 or not response.text.strip():
+#         return None
+
+#     # Parse the linked pathways and extract pathway IDs (e.g., map00190)
+#     pathway_ids = [line.split("\t")[1].split(":")[1] for line in response.text.strip().splitlines()]
+
+#     # Return the comma-separated list of pathway IDs (e.g., map00190, map00195, etc.)
+#     return ",".join(pathway_ids)
+
+# # Function to append pathway list to the compounds and save the result to a TSV file
+# def append_pathways_to_compounds(output_file):
+#     # Fetch the list of all compounds from KEGG
+#     request = REST.kegg_list("compound")
+#     compound_data = request.read()
+
+#     # Get the compound lines
+#     compound_lines = compound_data.splitlines()[18800:]
+
+#     # Process each compound and append data to the file one by one
+#     for index, line in enumerate(compound_lines):
+#         compound_id, compound_name = line.split("\t")
+
+#         # Get the associated pathways for this compound
+#         pathways = get_pathways_for_compound(compound_id)
+#         if not pathways:
+#             pathways = "NA"
+
+#         mode = "w" if index == 0 else "a"
+
+#         # Open the output file in the appropriate mode
+#         with open(output_file, mode) as f:
+#             # Write the compound ID, compound name, and pathway list to the file
+#             f.write(f"{compound_id}\t{compound_name}\t{pathways}\n")
+
+# # Generate the TSV file and append data one by one
+# output_file = "compound_pathways.tsv"
+# append_pathways_to_compounds(output_file)
+
+# print(f"TSV file '{output_file}' generated successfully.")
diff --git a/routes/apps/kegg.py b/routes/apps/kegg.py
@@ -1,7 +1,7 @@
 from myapp import app, PAGE_PREFIX, PRIVATE_ROUTES
 from flask_login import current_user
 from flask_caching import Cache
-from flask import session
+from flask import session, send_from_directory, abort, send_file
 import dash
 from dash import dcc, html
 from dash.dependencies import Input, Output, State, MATCH, ALL
@@ -27,9 +27,13 @@
 import plotly.express as px
 # from plotly.io import write_image
 import plotly.graph_objects as go
-from ._kegg import compound_options,pathway_options
+from ._kegg import compound_options, pathway_options, organism_options, network_pdf
 from dash import dash_table
 
+import io
+from Bio.KEGG.KGML.KGML_parser import read
+from Bio.Graphics.KGML_vis import KGMLCanvas
+
 
 FONT_AWESOME = "https://use.fontawesome.com/releases/v5.7.2/css/all.css"
 
@@ -54,6 +58,23 @@
         ],
         'CACHE_REDIS_SENTINEL_MASTER': os.environ.get('CACHE_REDIS_SENTINEL_MASTER')
     })
+
+# Route to serve generated kegg-*.pdf files located directly in the /tmp directory
+@dashapp.server.route(f"{PAGE_PREFIX}/kegg/tmp/<path:filename>")
+def serve_file_from_tmp(filename):
+    """Serve a ``kegg-*.pdf`` file that lives directly in ``/tmp``.
+
+    Args:
+        filename: path portion of the URL after ``/kegg/tmp/``.
+
+    Returns:
+        The PDF response; aborts with 403 when the name does not start with
+        "kegg-", does not end with ".pdf", or resolves outside ``/tmp``; aborts
+        with 404 when no such file exists.
+    """
+    tmp_dir = "/tmp"
+
+    if not filename.startswith("kegg-") or not filename.endswith(".pdf"):
+        return abort(403, description="Forbidden: Only PDF files with kegg initials are allowed")
+
+    file_path = os.path.abspath(os.path.join(tmp_dir, filename))
+
+    # The <path:...> converter accepts slashes, so a name such as
+    # "kegg-/../../x.pdf" passes the prefix/suffix check yet resolves outside
+    # /tmp. Only files sitting directly in /tmp may be served.
+    if os.path.dirname(file_path) != tmp_dir:
+        return abort(403, description="Forbidden: Path outside of the tmp directory")
+
+    # Check that the target exists and is a regular file before sending it
+    if os.path.isfile(file_path):
+        return send_file(file_path, mimetype='application/pdf')
+    return abort(404, description="File not found")
+
 
 dashapp.layout=html.Div( 
     [ 
@@ -67,11 +88,6 @@
 card_input_style={"width":"100%","height":"35px"}
 card_body_style={ "padding":"2px", "padding-top":"4px"}
 
-
-
-
-
-
 @dashapp.callback(
     Output('protected-content', 'children'),
     Input('session-id', 'data')
@@ -91,8 +107,6 @@ def make_loading(children,i):
             children=children,
         )
 
-
-
     # HTML content from here
     protected_content=html.Div(
         [
@@ -130,7 +144,13 @@ def make_loading(children,i):
                     ),               
                     dbc.Col(
                         [
-                          dcc.Markdown("Based on GTEx Analysis Release V8 - https://gtexportal.org", style={"margin-top":"15px"}),
+                          dcc.Loading(
+                              id="loading-output-1",
+                              type="default",
+                              children=[ html.Div(id="my-output")],
+                              style={"margin-top":"50%"} 
+                          ),  
+                          dcc.Markdown("Based on KEGG (Kyoto Encyclopedia of Genes and Genomes) - https://www.genome.jp/kegg/", style={"margin-top":"15px", "margin-left":"15px"}),
                         ],
                         xs=12,sm=12,md=6,lg=8,xl=9,
                         style={"margin-bottom":"50px"}
@@ -147,18 +167,13 @@ def make_loading(children,i):
     )
     return protected_content
 
-
+# Callback to load compound list on session load
 @dashapp.callback(
     Output(component_id='opt-compound', component_property='options'),
-    #Output(component_id='opt-pathway', component_property='options'),
-    #Output(component_id='opt-organism', component_property='options'),
     Input('session-id', 'data')
     )
 def update_menus(session_id):
+    # session_id only triggers the callback; its value is not read here.
-    compound=compound_options(cache)
-
-    return compound
-
+    return compound_options(cache)
 
 # Callback to update opt-pathway based on selected compounds
 @dashapp.callback(
@@ -172,4 +187,47 @@ def update_pathways(selected_compounds):
     pw_options=pathway_options(cache, selected_compounds)
     if pw_options is None:
         return [], "No Pathway Found for Selected Compound" 
-    return pw_options, "Select Pathway"
+    return pw_options, "Select Pathway"
+
+# Callback to update opt-organism based on selected pathway
+@dashapp.callback(
+    Output('opt-organism', 'options'),
+    Output('opt-organism', 'placeholder'),
+    Input('opt-pathway', 'value')
+)
+def update_organisms(selected_pathway):
+    """Return the (options, placeholder) pair for the organism dropdown.
+
+    An empty option list with an explanatory placeholder is returned when no
+    pathway is selected or ``organism_options`` yields ``None`` for it.
+    """
+    if selected_pathway is None:
+        return [], "No Pathway Selected"
+
+    choices=organism_options(cache, selected_pathway)
+    if choices is not None:
+        return choices, "Select Organism"
+    return [], "No Organism Found for Selected Pathway"
+
+
+# Callback on submit
+@dashapp.callback(
+    Output('my-output','children'),
+    Input('session-id', 'data'),
+    Input('submit-button-state', 'n_clicks'),
+    State("opt-compound", "value"),
+    State("opt-pathway", "value"),
+    State("opt-organism", "value"),
+    State('download_name','value'),
+)
+def update_output(session_id, n_clicks, compound, pathway, organism, download_name):
+    """Render the pathway PDF for the current selection into 'my-output'.
+
+    Returns an empty Div until the submit button has been clicked, a Markdown
+    notice when the selection is incomplete or ``network_pdf`` returns
+    ``None``, and otherwise an Iframe pointing at the generated PDF.
+    ``session_id`` and ``download_name`` are received but not read.
+    """
+    def _notice(text):
+        # Shared wrapper for the two user-facing messages below.
+        return html.Div([dcc.Markdown(text, style={"margin-top":"15px","margin-left":"15px"})])
+
+    if not n_clicks:
+        return html.Div([])
+
+    selection_complete = bool(compound) and pathway is not None and organism is not None
+    if not selection_complete:
+        return _notice("*** Please select at least a compound, pathway and organism!")
+
+    pdf_path=network_pdf(compound,pathway,organism)
+    if pdf_path is None:
+        return _notice("*** Failed to generate network pdf!")
+
+    # pdf_path is an absolute path, so appending it to ".../kegg" forms the URL.
+    viewer=html.Iframe(src=f"{PAGE_PREFIX}/kegg{pdf_path}", style={"width": "100%", "height": "700px"})
+    return html.Div([
+        viewer,
+    ])
+