Skip to content

Commit

Permalink
base kegg
Browse files Browse the repository at this point in the history
  • Loading branch information
hamin committed Oct 18, 2024
1 parent 761d2b0 commit dff1ab7
Show file tree
Hide file tree
Showing 2 changed files with 244 additions and 24 deletions.
174 changes: 168 additions & 6 deletions routes/apps/_kegg.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,18 @@
from io import StringIO
from datetime import datetime
import os
import glob
import time

import Bio
from Bio import SeqIO
from Bio.KEGG.REST import *
from Bio.KEGG.KGML import KGML_parser
from Bio.Graphics.KGML_vis import KGMLCanvas
from Bio.Graphics.ColorSpiral import ColorSpiral
#from IPython.display import Image, HTML
import tempfile


# Version tag of the running deployment, read from the environment.
# A missing variable raises KeyError at import time.
PYFLASKI_VERSION = str(os.environ['PYFLASKI_VERSION'])
Expand All @@ -18,27 +30,177 @@ def _read_compound_pathway(path_to_files=path_to_files):
return df.to_json()
return pd.read_json(StringIO(_read_compound_pathway()))

def read_pathway(cache, path_to_files=path_to_files):
    """Load ``pathway_list.tsv`` as a DataFrame, memoized for two hours.

    Pre-commit loader, kept so that any remaining caller still resolves.

    Args:
        cache: Object exposing a ``memoize(timeout)`` decorator.
        path_to_files: Directory containing ``pathway_list.tsv``.

    Returns:
        DataFrame with columns ``pathway_id`` and ``pathway_name``.
    """
    @cache.memoize(60*60*2)
    def _read_pathway(path_to_files=path_to_files):
        df = pd.read_csv(
            f"{path_to_files}/pathway_list.tsv",
            sep="\t",
            names=['pathway_id', 'pathway_name'],
        )
        # The memoized value is the JSON string, not the DataFrame itself.
        return df.to_json()

    return pd.read_json(StringIO(_read_pathway()))


def read_pathway_organism(cache, path_to_files=path_to_files):
    """Load ``pathway_organisms.tsv`` as a DataFrame, memoized for two hours.

    Args:
        cache: Object exposing a ``memoize(timeout)`` decorator.
        path_to_files: Directory containing ``pathway_organisms.tsv``.

    Returns:
        DataFrame with columns ``pathway_id``, ``pathway_name`` and
        ``organisms``.
    """
    @cache.memoize(60*60*2)
    def _read_pathway_organism(path_to_files=path_to_files):
        df = pd.read_csv(
            f"{path_to_files}/pathway_organisms.tsv",
            sep="\t",
            names=['pathway_id', 'pathway_name', 'organisms'],
        )
        # The memoized value is the JSON string, not the DataFrame itself.
        return df.to_json()

    return pd.read_json(StringIO(_read_pathway_organism()))

def compound_options(cache):
    """Build the dropdown options for every compound in the cached table.

    Args:
        cache: Cache object forwarded to ``read_compound_pathway``.

    Returns:
        List of ``{'label': '<id>: <name>', 'value': <id>}`` dicts, one per
        row of the compound table.
    """
    table = read_compound_pathway(cache)
    options = []
    for cid, cname in zip(table['compound_id'], table['compound_name']):
        options.append({'label': f"{cid}: {cname}", 'value': cid})
    return options

def pathway_options(cache, compound_list):
    """Build the pathway dropdown options for the selected compounds.

    Args:
        cache: Cache object forwarded to the table loaders.
        compound_list: Compound ids whose pathways should be offered.

    Returns:
        List of ``{'label': '<id>: <name>', 'value': <id>}`` dicts for every
        pathway linked to at least one selected compound, or ``None`` when
        none of the selected compounds has any pathway recorded.
    """
    compound_pathway_data = read_compound_pathway(cache)
    pathway_organism_data = read_pathway_organism(cache)

    cp_row = compound_pathway_data[compound_pathway_data['compound_id'].isin(compound_list)]

    # A compound without pathways carries a missing value (None/NaN) instead
    # of a string; skip those entries rather than calling .split on them,
    # which used to crash whenever such a compound was selected together
    # with others.
    pathway_ids = {
        path.strip()
        for entry in cp_row['pathways'].tolist()
        if isinstance(entry, str)
        for path in entry.split(',')
        if path.strip()
    }
    if not pathway_ids:
        return None

    pw_rows = pathway_organism_data[pathway_organism_data['pathway_id'].isin(list(pathway_ids))]

    return [
        {'label': f"{pid}: {pname}", 'value': pid}
        for pid, pname in zip(pw_rows['pathway_id'], pw_rows['pathway_name'])
    ]

def organism_options(cache, pathway_id):
    """Build the organism dropdown options for one pathway.

    Args:
        cache: Cache object forwarded to ``read_pathway_organism``.
        pathway_id: Pathway id to look up in the ``pathway_id`` column.

    Returns:
        List of ``{'label': <org>, 'value': <org>}`` dicts taken from the
        comma-separated ``organisms`` cell of the first matching row, or
        ``None`` when the pathway is unknown or lists no organisms.
    """
    pod = read_pathway_organism(cache)
    matches = pod.loc[pod['pathway_id'] == pathway_id, 'organisms']
    if matches.empty:
        return None

    org_value = matches.values[0]
    # A row without organisms holds a missing value (None or NaN), not a
    # string; NaN would slip past an `is None` check and fail on .split.
    if not isinstance(org_value, str):
        return None

    organisms = [org.strip() for org in org_value.split(',') if org.strip()]
    if not organisms:
        return None

    return [{'label': org, 'value': org} for org in organisms]




def _purge_stale_kegg_pdfs(max_files=20, max_age_seconds=1800):
    """Delete previously generated ``/tmp/kegg-*.pdf`` files (best effort).

    All files are removed when more than ``max_files`` exist; otherwise only
    those last modified more than ``max_age_seconds`` ago.
    """
    kegg_files = glob.glob("/tmp/kegg-*.pdf")
    purge_all = len(kegg_files) > max_files
    now = time.time()
    for pdf in kegg_files:
        try:
            if purge_all or (now - os.path.getmtime(pdf)) > max_age_seconds:
                os.remove(pdf)
        except OSError:
            # Another request may have deleted the file in the meantime;
            # cleanup must never prevent a new PDF from being generated.
            continue


def network_pdf(selected_compound, pathway_id, organism_id):
    """Render a KEGG pathway map to a PDF with selected compounds highlighted.

    Args:
        selected_compound: Container of compound ids to highlight in red.
        pathway_id: Reference pathway id; every ``"map"`` substring is
            replaced by ``organism_id`` to form the organism-specific id.
        organism_id: KEGG organism code.

    Returns:
        Path of the generated ``/tmp/kegg-*.pdf`` file, or ``None`` when the
        pathway could not be fetched, parsed or drawn.
    """
    import logging

    _purge_stale_kegg_pdfs()

    temp_pdf = None
    try:
        pathname = pathway_id.replace("map", organism_id)
        pathway = KGML_parser.read(kegg_get(pathname, "kgml"))
        canvas = KGMLCanvas(pathway, import_imagemap=True)

        for compound in pathway.compounds:
            # Compare on the part after the last ':' of the entry name.
            compound_id = compound.name.split(":")[-1]
            if compound_id in selected_compound:
                compound.graphics[0].bgcolor = "#FF0000"

        # mkstemp returns an open descriptor; close it right away so the
        # handle is not leaked (only the file name is needed for drawing).
        fd, temp_pdf = tempfile.mkstemp(prefix="kegg-", suffix=".pdf", dir="/tmp")
        os.close(fd)
        canvas.draw(temp_pdf)
        return temp_pdf
    except Exception:
        # Callers rely on None to signal failure, so keep that contract but
        # leave a trace instead of swallowing the error silently.
        logging.getLogger(__name__).exception(
            "Failed to generate KEGG pdf for pathway %s / organism %s",
            pathway_id, organism_id,
        )
        if temp_pdf is not None:
            try:
                os.remove(temp_pdf)
            except OSError:
                pass
        return None




####### Generate/organize kegg data for faster use #######
### Generate pathway_organisms.tsv with pathway_id, pathway_name, available_organisms

# from Bio.KEGG import REST
# import csv

# # Fetch all pathways
# pathway_list = REST.kegg_list('pathway').read()

# # Write pathways to a TSV file without a header
# with open('pathway_organisms.tsv', 'w', newline='') as outfile:
# tsv_writer = csv.writer(outfile, delimiter='\t')
# for line in pathway_list.splitlines():
# pathway_id, pathway_name = line.split('\t')
# tsv_writer.writerow([pathway_id, pathway_name])

# # Fetch the list of all available organisms
# organism_list = REST.kegg_list('organism').read()

# # Extract the organism codes (e.g., 'hsa', 'ptr', 'pps', etc.)
# organism_codes = [line.split('\t')[1] for line in organism_list.splitlines()]


# # Function to get pathways for a given organism
# def get_pathways_from_org(organism_code):
# pathway_list = REST.kegg_list('pathway', organism_code).read()
# return [line.split('\t')[0][3:] for line in pathway_list.splitlines()]

# # Read TSV file, check pathways, and append organism code
# def update_tsv_with_organism(tsv_file, organism_code):
# # Get the list of pathways for the organism
# organism_pathways = get_pathways_from_org(organism_code)

# # Read the contents of the TSV file into memory
# updated_rows = []
# with open(tsv_file, 'r') as infile:
# tsv_reader = csv.reader(infile, delimiter='\t')

# # Process each line of the TSV
# for row in tsv_reader:
# # Extract the pathway ID (e.g., 'map01100')
# pathway_id = row[0][3:] # Remove 'map' to get the numeric part

# # Check if this pathway exists in the organism's pathways
# if pathway_id in organism_pathways:
# # Append the organism code to the row
# if len(row) < 3:
# row.append(organism_code)
# else:
# row[2] += f",{organism_code}" # If third column exists, append to it

# # Add the updated row to the list
# updated_rows.append(row)

# # Overwrite the original file with the updated data
# with open(tsv_file, 'w', newline='') as outfile:
# tsv_writer = csv.writer(outfile, delimiter='\t')
# tsv_writer.writerows(updated_rows)

# # Update pathway_organisms.tsv file with organisms
# tsv_file = 'pathway_organisms.tsv'
# for org in organism_codes:
# update_tsv_with_organism(tsv_file, org)

### Generate compound_pathways.tsv with compound_id, compound_name, available_pathways

# # Function to get pathways associated with a given compound using KEGG REST API
# def get_pathways_for_compound(compound_id):
# # Fetch pathways linked to the compound using the KEGG REST API
# link_url = f"https://rest.kegg.jp/link/pathway/{compound_id}"
# response = requests.get(link_url)

# # Check if the response is empty (i.e., no linked pathways found)
# if response.status_code != 200 or not response.text.strip():
# return None

# # Parse the linked pathways and extract pathway IDs (e.g., map00190)
# pathway_ids = [line.split("\t")[1].split(":")[1] for line in response.text.strip().splitlines()]

# # Return the comma-separated list of pathway IDs (e.g., map00190, map00195, etc.)
# return ",".join(pathway_ids)

# # Function to append pathway list to the compounds and save the result to a TSV file
# def append_pathways_to_compounds(output_file):
# # Fetch the list of all compounds from KEGG
# request = REST.kegg_list("compound")
# compound_data = request.read()

# # Get the compound lines
# compound_lines = compound_data.splitlines()[18800:]

# # Process each compound and append data to the file one by one
# for index, line in enumerate(compound_lines):
# compound_id, compound_name = line.split("\t")

# # Get the associated pathways for this compound
# pathways = get_pathways_for_compound(compound_id)
# if not pathways:
# pathways = "NA"

# mode = "w" if index == 0 else "a"

# # Open the output file in the appropriate mode
# with open(output_file, mode) as f:
# # Write the compound ID, compound name, and pathway list to the file
# f.write(f"{compound_id}\t{compound_name}\t{pathways}\n")

# # Generate the TSV file and append data one by one
# output_file = "compound_pathways.tsv"
# append_pathways_to_compounds(output_file)

# print(f"TSV file '{output_file}' generated successfully.")
94 changes: 76 additions & 18 deletions routes/apps/kegg.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from myapp import app, PAGE_PREFIX, PRIVATE_ROUTES
from flask_login import current_user
from flask_caching import Cache
from flask import session
from flask import session, send_from_directory, abort, send_file
import dash
from dash import dcc, html
from dash.dependencies import Input, Output, State, MATCH, ALL
Expand All @@ -27,9 +27,13 @@
import plotly.express as px
# from plotly.io import write_image
import plotly.graph_objects as go
from ._kegg import compound_options,pathway_options
from ._kegg import compound_options, pathway_options, organism_options, network_pdf
from dash import dash_table

import io
from Bio.KEGG.KGML.KGML_parser import read
from Bio.Graphics.KGML_vis import KGMLCanvas


FONT_AWESOME = "https://use.fontawesome.com/releases/v5.7.2/css/all.css"

Expand All @@ -54,6 +58,23 @@
],
'CACHE_REDIS_SENTINEL_MASTER': os.environ.get('CACHE_REDIS_SENTINEL_MASTER')
})

# Route to serve generated KEGG PDFs located directly in the /tmp directory
@dashapp.server.route(f"{PAGE_PREFIX}/kegg/tmp/<path:filename>")
def serve_file_from_tmp(filename):
    """Serve a ``kegg-*.pdf`` file that lives directly inside ``/tmp``.

    Args:
        filename: Path component captured from the URL (may contain ``/``
            because of the ``path`` converter).

    Returns:
        The PDF response; aborts with 403 for names that are not
        ``kegg-*.pdf`` or that resolve outside ``/tmp``, and with 404 when
        the file does not exist.
    """
    tmp_dir = os.path.abspath("/tmp")

    if not filename.startswith("kegg-") or not filename.endswith(".pdf"):
        return abort(403, description="Forbidden: Only PDF files with kegg initials are allowed")

    file_path = os.path.abspath(os.path.join(tmp_dir, filename))

    # The prefix/suffix test alone is not enough: a name such as
    # "kegg-x/../../etc/foo.pdf" passes it but resolves outside /tmp.
    # Only files sitting directly in /tmp may be served.
    if os.path.dirname(file_path) != tmp_dir:
        return abort(403, description="Forbidden: Invalid file path")

    if os.path.isfile(file_path):
        return send_file(file_path, mimetype='application/pdf')
    return abort(404, description="File not found")


dashapp.layout=html.Div(
[
Expand All @@ -67,11 +88,6 @@
card_input_style={"width":"100%","height":"35px"}
card_body_style={ "padding":"2px", "padding-top":"4px"}






@dashapp.callback(
Output('protected-content', 'children'),
Input('session-id', 'data')
Expand All @@ -91,8 +107,6 @@ def make_loading(children,i):
children=children,
)



# HTML content from here
protected_content=html.Div(
[
Expand Down Expand Up @@ -130,7 +144,13 @@ def make_loading(children,i):
),
dbc.Col(
[
dcc.Markdown("Based on GTEx Analysis Release V8 - https://gtexportal.org", style={"margin-top":"15px"}),
dcc.Loading(
id="loading-output-1",
type="default",
children=[ html.Div(id="my-output")],
style={"margin-top":"50%"}
),
dcc.Markdown("Based on KEGG (Kyoto Encyclopedia of Genes and Genomes) - https://www.genome.jp/kegg/", style={"margin-top":"15px", "margin-left":"15px"}),
],
xs=12,sm=12,md=6,lg=8,xl=9,
style={"margin-bottom":"50px"}
Expand All @@ -147,18 +167,13 @@ def make_loading(children,i):
)
return protected_content


# Callback to load compound list on session load
@dashapp.callback(
    Output(component_id='opt-compound', component_property='options'),
    #Output(component_id='opt-pathway', component_property='options'),
    #Output(component_id='opt-organism', component_property='options'),
    Input('session-id', 'data')
)
def update_menus(session_id):
    """Populate the compound dropdown when the session id becomes available.

    Args:
        session_id: Value of the ``session-id`` store; used only as trigger.

    Returns:
        The option list produced by ``compound_options``.
    """
    return compound_options(cache)

# Callback to update opt-pathway based on selected compounds
@dashapp.callback(
Expand All @@ -172,4 +187,47 @@ def update_pathways(selected_compounds):
pw_options=pathway_options(cache, selected_compounds)
if pw_options is None:
return [], "No Pathway Found for Selected Compound"
return pw_options, "Select Pathway"
return pw_options, "Select Pathway"

# Callback to update opt-organism based on selected pathway
@dashapp.callback(
    Output('opt-organism', 'options'),
    Output('opt-organism', 'placeholder'),
    Input('opt-pathway', 'value')
)
def update_organisms(selected_pathway):
    """Refresh the organism dropdown for the currently selected pathway.

    Args:
        selected_pathway: Value of the pathway dropdown, or ``None``.

    Returns:
        Tuple ``(options, placeholder)`` for the organism dropdown; the
        options are empty when no pathway is selected or no organism is
        found for it.
    """
    choices = []
    placeholder = "No Pathway Selected"
    if selected_pathway is not None:
        found = organism_options(cache, selected_pathway)
        if found is None:
            placeholder = "No Organism Found for Selected Pathway"
        else:
            choices = found
            placeholder = "Select Organism"
    return choices, placeholder


# Callback on submit
@dashapp.callback(
    Output('my-output','children'),
    Input('session-id', 'data'),
    Input('submit-button-state', 'n_clicks'),
    State("opt-compound", "value"),
    State("opt-pathway", "value"),
    State("opt-organism", "value"),
    State('download_name','value'),
)
def update_output(session_id, n_clicks, compound, pathway, organism, download_name):
    """Render the pathway PDF for the submitted selection inside an iframe.

    Args:
        session_id: Value of the ``session-id`` store (not used here).
        n_clicks: Click count of the submit button; falsy before first click.
        compound: Selected compound ids.
        pathway: Selected pathway id.
        organism: Selected organism code.
        download_name: Value of the ``download_name`` input (not used here).

    Returns:
        An empty ``Div`` before the first submit, a ``Div`` with a message
        when the selection is incomplete or PDF generation failed, otherwise
        a ``Div`` wrapping an ``Iframe`` that points at the generated PDF.
    """
    message_style = {"margin-top":"15px","margin-left":"15px"}

    # Nothing to show until the user has pressed submit at least once.
    if not n_clicks:
        return html.Div([])

    selection_complete = bool(compound) and pathway is not None and organism is not None
    if not selection_complete:
        warning = dcc.Markdown("*** Please select at least a compound, pathway and organism!", style=message_style)
        return html.Div([warning])

    pdf_path = network_pdf(compound, pathway, organism)
    if pdf_path is None:
        failure = dcc.Markdown("*** Failed to generate network pdf!", style=message_style)
        return html.Div([failure])

    # pdf_path is an absolute path, so it is appended directly after the
    # "/kegg" URL segment.
    viewer = html.Iframe(src=f"{PAGE_PREFIX}/kegg{pdf_path}", style={"width": "100%", "height": "700px"})
    return html.Div([
        viewer,
    ])

0 comments on commit dff1ab7

Please sign in to comment.