Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
djcomlab committed Apr 5, 2017
2 parents 3630ef8 + 42c2cd9 commit a2a9cdd
Showing 7 changed files with 337 additions and 55 deletions.
2 changes: 1 addition & 1 deletion isatools/io/mtbls.md
Original file line number Diff line number Diff line change
@@ -35,4 +35,4 @@ The slicer produces:
- The list of URLs for Raw data files/study files to be downloaded by another tool.

The slicer needs to produce:
- A reduced ISA file after the slicing.
- A collection of data files that match the query.
174 changes: 167 additions & 7 deletions isatools/io/mtbls.py
Original file line number Diff line number Diff line change
@@ -7,7 +7,9 @@
import glob
from isatools.convert import isatab2json
from isatools import isatab
from isatools.model.v1 import OntologyAnnotation
from isatools.model.v1 import OntologyAnnotation, Process, ParameterValue
import networkx as nx
import pandas as pd

MTBLS_FTP_SERVER = 'ftp.ebi.ac.uk'
MTBLS_BASE_DIR = '/pub/databases/metabolights/studies/public'
@@ -126,7 +128,6 @@ def slice_data_files(dir, factor_selection=None):
}
}
"""
from isatools import isatab
results = list()
# first collect matching samples
for table_file in glob.iglob(os.path.join(dir, '[a|s]_*')):
@@ -246,7 +247,9 @@ def get_factors_summary(mtbls_study_id):
This function generates a factors summary for a MetaboLights study
:param mtbls_study_id: Accession number of the MetaboLights study
:return: A list of dicts summarising the set of factor names associated with each data file
:return: A list of dicts summarising the set of factor names and values associated with each sample
Note: it only returns a summary of factors with variable values.
Example usage:
factor_summary = get_factors_summary('MTBLS1')
@@ -258,10 +261,9 @@ def get_factors_summary(mtbls_study_id):
},
{
"name": "ADG10003u_162",
"Metabolic syndrome":
"diabetes mellitus",
"Metabolic syndrome": "diabetes mellitus",
"Gender": "Female"
},
},
]
@@ -282,4 +284,162 @@ def get_factors_summary(mtbls_study_id):
fv_value = fv.value.term
sample_and_fvs[fv.factor_name.name] = fv_value
samples_and_fvs.append(sample_and_fvs)
return samples_and_fvs
df = pd.DataFrame(samples_and_fvs)
nunique = df.apply(pd.Series.nunique)
cols_to_drop = nunique[nunique == 1].index
df = df.drop(cols_to_drop, axis=1)
return df.to_dict(orient='records')


def get_characteristics_summary(mtbls_study_id):
    """
    This function generates a characteristics summary for a MetaboLights study

    :param mtbls_study_id: Accession number of the MetaboLights study
    :return: A list of dicts, one per sample, each holding the sample "name"
        plus the characteristics (read from the sources the sample derives
        from) whose values are not identical for every sample

    Note: it only returns a summary of characteristics with variable values.
    The "name" entry is always kept, even when it is the only column left or
    when the study holds a single sample.

    Example usage:
        characteristics_summary = get_characteristics_summary('MTBLS5')
        [
            {
                "name": "6089if_9",
                "Variant": "Synechocystis sp. PCC 6803.sll0171.ko"
            },
            {
                "name": "6089if_43",
                "Variant": "Synechocystis sp. PCC 6803.WT.none"
            },
        ]
    """
    ISA = load(mtbls_study_id=mtbls_study_id)
    samples_and_characs = []
    for study in ISA.studies:
        for sample in study.materials['samples']:
            sample_and_characs = {"name": sample.name}
            for source in sample.derives_from:
                for c in source.characteristics:
                    if isinstance(c.value, OntologyAnnotation):
                        c_value = c.value.term
                    elif isinstance(c.value, (str, int, float)):
                        c_value = c.value
                    else:
                        # Unsupported value type (e.g. None): skip it rather
                        # than recording a stale value left over from the
                        # previous characteristic.
                        continue
                    sample_and_characs[c.category.term] = c_value
            samples_and_characs.append(sample_and_characs)
    if not samples_and_characs:
        return []
    df = pd.DataFrame(samples_and_characs)
    # Drop characteristics that take a single value across all samples, but
    # never the "name" column: it identifies the record and must survive even
    # for a one-sample study.
    cols_to_drop = [col for col in df.columns
                    if col != "name" and df[col].nunique() == 1]
    df = df.drop(cols_to_drop, axis=1)
    return df.to_dict(orient='records')


# Parameter values (PVs) do not appear to vary within MetaboLights studies, so this function is disabled for now
# def get_parameter_value_summary(mtbls_study_id):
# """
# This function generates a parameter values summary for a MetaboLights study
#
# :param mtbls_study_id: Accession number of the MetaboLights study
# :return: A list of dicts summarising the set of parameters and values associated with each sample
#
# Note: it only returns a summary of parameter values with variable values.
#
# """
# ISA = load(mtbls_study_id=mtbls_study_id)
# all_samples = []
# for study in ISA.studies:
# all_samples.extend(study.materials['samples'])
# samples_and_pvs = []
# for sample in all_samples:
# sample_and_pvs = {
# "name": sample.name
# }
# for study in ISA.studies:
# s_processes_linked_to_sample = [x for x in nx.algorithms.ancestors(study.graph, sample) if
# isinstance(x, Process)]
# for process in s_processes_linked_to_sample:
# for pv in process.parameter_values:
# if isinstance(pv, ParameterValue):
# if isinstance(pv.value, (str, int, float)):
# pv_value = pv.value
# elif isinstance(pv.value, OntologyAnnotation):
# pv_value = pv.value.term
# sample_and_pvs[pv.category.parameter_name.term] = pv_value
# for assay in study.assays:
# for sample in assay.materials['samples']:
# a_processes_linked_to_sample = [x for x in nx.algorithms.descendants(assay.graph, sample) if
# isinstance(x, Process)]
# for process in a_processes_linked_to_sample:
# for pv in process.parameter_values:
# if isinstance(pv, ParameterValue):
# if isinstance(pv.value, (str, int, float)):
# pv_value = pv.value
# elif isinstance(pv.value, OntologyAnnotation):
# pv_value = pv.value.term
# sample_and_pvs[pv.category.parameter_name.term] = pv_value
# samples_and_pvs.append(sample_and_pvs)
# df = pd.DataFrame(samples_and_pvs)
# nunique = df.apply(pd.Series.nunique)
# cols_to_drop = nunique[nunique == 1].index
# df = df.drop(cols_to_drop, axis=1)
# return df.to_dict(orient='records')


def get_study_variable_summary(mtbls_study_id):
    """
    This function generates a combined summary of factor values and source
    characteristics for a MetaboLights study

    :param mtbls_study_id: Accession number of the MetaboLights study
    :return: A list of dicts, one per sample, each holding the sample "name"
        plus the factor values and source characteristics whose values are
        not identical for every sample

    Note: it only returns a summary of variables with variable values. The
    "name" entry is always kept. When a factor and a characteristic share the
    same name, the characteristic value is the one reported (it is written
    last). Parameter values are deliberately not included.
    """
    ISA = load(mtbls_study_id=mtbls_study_id)
    samples_and_variables = []
    for study in ISA.studies:
        for sample in study.materials['samples']:
            sample_and_vars = {"name": sample.name}
            for fv in sample.factor_values:
                if isinstance(fv.value, OntologyAnnotation):
                    fv_value = fv.value.term
                elif isinstance(fv.value, (str, int, float)):
                    fv_value = fv.value
                else:
                    # Unsupported value type: skip instead of reusing the
                    # value of the previous factor.
                    continue
                sample_and_vars[fv.factor_name.name] = fv_value
            for source in sample.derives_from:
                for c in source.characteristics:
                    if isinstance(c.value, OntologyAnnotation):
                        c_value = c.value.term
                    elif isinstance(c.value, (str, int, float)):
                        c_value = c.value
                    else:
                        # Same guard as for factor values above.
                        continue
                    sample_and_vars[c.category.term] = c_value
            samples_and_variables.append(sample_and_vars)
    if not samples_and_variables:
        return []
    df = pd.DataFrame(samples_and_variables)
    # Drop variables that take a single value across all samples, but never
    # the "name" column: it identifies the record and must survive even for a
    # one-sample study.
    cols_to_drop = [col for col in df.columns
                    if col != "name" and df[col].nunique() == 1]
    df = df.drop(cols_to_drop, axis=1)
    return df.to_dict(orient='records')
11 changes: 8 additions & 3 deletions isatools/isatab.py
Original file line number Diff line number Diff line change
@@ -906,6 +906,9 @@ def _read_tab_section(f, sec_key, next_sec_key=None):
return memf

def _build_section_df(f):
# find tab dimension
print('Max width = {}'.format(max([len(line.split('\t')) for line in f])))
f.seek(0)
try:
df = pd.read_csv(f, sep='\t').T # Load and transpose ISA file section
except CParserError:
@@ -3059,16 +3062,18 @@ def pairwise(iterable):
return zip(a, b)


def read_tfile(tfile_path, index_col=None, factor_filter=None):
    """
    Read a tab-delimited ISA table file into a pandas DataFrame

    :param tfile_path: Path to the tab-delimited table file
    :param index_col: Passed through to pandas.read_csv as index_col
    :param factor_filter: Optional (factor name, value) pair. When given,
        only the rows whose 'Factor Value[<factor name>]' column equals
        value are returned
    :return: A DataFrame with missing cells replaced by '' and the raw header
        row (as read by the csv module) attached as the isatab_header
        attribute
    :raises KeyError: if factor_filter names a factor that has no
        'Factor Value[...]' column in the file
    """
    with open(tfile_path, encoding='utf-8') as tfile_fp:
        reader = csv.reader(tfile_fp, delimiter='\t')
        header = list(next(reader))
        tfile_fp.seek(0)  # rewind so pandas parses the header row as well
        tfile_df = pd.read_csv(tfile_fp, sep='\t', index_col=index_col, memory_map=True, comment='#').fillna('')
    tfile_df.isatab_header = header
    if not factor_filter:
        return tfile_df
    factor_column = 'Factor Value[{}]'.format(factor_filter[0])
    filtered_df = tfile_df[tfile_df[factor_column] == factor_filter[1]]
    # Boolean indexing builds a new DataFrame that does not carry ad-hoc
    # attributes, so the header has to be attached again for callers that
    # read .isatab_header.
    filtered_df.isatab_header = header
    return filtered_df


def get_multiple_index(file_index, key):
40 changes: 33 additions & 7 deletions isatools/magetab.py
Original file line number Diff line number Diff line change
@@ -352,20 +352,31 @@ def dump(inv_obj, output_path):
return inv_obj


# Load a MAGE-TAB IDF file: the IDF is cast to an investigation-style buffer,
# the SDRF it references is split into study and assay tables, those tables are
# written to disk, and the result of isatab.load() on the buffer is returned.
# NOTE(review): this span comes from a commit diff view, so superseded lines
# (for example the first `def load` line and the `idf_FP` pair) appear next to
# their replacements.
def load(FP):
def load(FP): # loads IDF file
# first cast to IDF
idf_FP = cast_idf_to_inv(FP)
df = pd.read_csv(idf_FP, names=range(0, 128), sep='\t', engine='python').dropna(axis=1, how='all')
inv_fp = cast_idf_to_inv(FP)
df = pd.read_csv(inv_fp, names=range(0, 128), sep='\t', engine='python').dropna(axis=1, how='all')
df = df.T # transpose
df.reset_index(inplace=True) # Reset index so it is accessible as column
df.columns = df.iloc[0] # If all was OK, promote this row to the column headers
# second set output s_ and a_ files
# The SDRF file name is read from the "Comment[SDRF File]" column and resolved
# relative to the directory of the IDF file (FP.name).
sdrf_file = df["Comment[SDRF File]"].iloc[1]
study_df, assay_df = split_tables(sdrf_path=os.path.join(os.path.dirname(FP.name), sdrf_file))
# Replace the DataFrame column labels with the header stored on isatab_header.
study_df.columns = study_df.isatab_header
print("s_" + os.path.basename(sdrf_file))
assay_df.columns = assay_df.isatab_header
print("a_" + os.path.basename(sdrf_file))
# write out ISA files
# NOTE(review): hard-coded absolute path; the writes below fail wherever this
# directory does not exist.
tmp = "/Users/dj/PycharmProjects/isa-api/tests/data/tmp"
# Rewind the investigation buffer, which pd.read_csv consumed above.
inv_fp.seek(0)
# print("Writing i_investigation.txt to {}".format(tmp))
# NOTE(review): the format arguments are swapped in both "Writing" messages:
# {0} receives the directory and {1} the file name.
print("Writing s_{0} to {1}".format(tmp, os.path.basename(sdrf_file)))
with open(os.path.join(tmp, "s_" + os.path.basename(sdrf_file)), "w") as s_fp:
study_df.to_csv(path_or_buf=s_fp, mode='a', sep='\t', encoding='utf-8', index=False,)
print("Writing a_{0} to {1}".format(tmp, os.path.basename(sdrf_file)))
with open(os.path.join(tmp, "a_" + os.path.basename(sdrf_file)), "w") as a_fp:
assay_df.to_csv(path_or_buf=a_fp, mode='a', sep='\t', encoding='utf-8', index=False,)
# NOTE(review): tmp_inv_fp is opened but never used; isatab.load() is given the
# in-memory inv_fp instead, and nothing in this function writes
# i_investigation.txt into tmp, so this open() depends on a file created
# elsewhere. Confirm which of the two was meant to be loaded.
with open(os.path.join(tmp, "i_investigation.txt")) as tmp_inv_fp:
ISA = isatab.load(inv_fp)
return ISA


inv_to_idf_map = {
@@ -424,16 +435,31 @@ def cast_idf_to_inv(FP):
# Cast relevant sections from IDF file into comments
# insert Additional Investigation file labels
idf_FP = StringIO()
idf_dict = {}
for line in FP:
if line.startswith(tuple(inv_to_idf_map.values())) or line.startswith("Comment["):
for k, v in inv_to_idf_map.items():
line = line.replace(v, k) # note k-v is reversed
else:
first_token = line[:line.index('\t')]
line = line.replace(first_token, "Comment[{}]".format(first_token))
idf_FP.write(line)
idf_FP.seek(0)
# idf_FP.write(line)
idf_dict[line[:line.index('\t')]] = line
# idf_FP.seek(0)
idf_FP.name = FP.name
with open(os.path.join(os.path.dirname(__file__), 'resources', 'tab_templates', 'i_template.txt')) as i_template_FP:
for line in i_template_FP:
try:
try:
line = idf_dict[line[:line.index('\t')]]
except ValueError:
line = idf_dict[line[:line.index('\n')]]
except KeyError:
pass
idf_FP.write(line)
for key in [x for x in idf_dict.keys() if x.startswith("Comment[")]:
idf_FP.write(idf_dict[key])
idf_FP.seek(0)
return idf_FP


Loading

0 comments on commit a2a9cdd

Please sign in to comment.