Skip to content

Commit

Permalink
Implement writing assay file from SampleTab towards #190
Browse files Browse the repository at this point in the history
  • Loading branch information
djcomlab committed Mar 1, 2017
1 parent 8a6c6b7 commit 3588cfc
Show file tree
Hide file tree
Showing 3 changed files with 222 additions and 32 deletions.
6 changes: 1 addition & 5 deletions isatools/isatab.py
Original file line number Diff line number Diff line change
Expand Up @@ -837,11 +837,7 @@ def write_value_columns(df_dict, label, x):
df_dict[label][-1] = x.value
df_dict[label + ".Unit"][-1] = x.unit
elif isinstance(x.value, OntologyAnnotation):
try:
df_dict[label][-1] = x.value.term
except KeyError:
print(df_dict.keys())
raise KeyError
df_dict[label][-1] = x.value.term
df_dict[label + ".Term Source REF"][-1] = x.value.term_source.name if x.value.term_source else ""
df_dict[label + ".Term Accession Number"][-1] = x.value.term_accession
else:
Expand Down
6 changes: 3 additions & 3 deletions isatools/model/v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,11 +394,11 @@ def __init__(self, id_='', filename="", identifier="", title="", description=""
'other_material': list()
}
if not (sources is None):
self.materials['sources'].append(sources)
self.materials['sources'] = sources
if not (samples is None):
self.materials['samples'].append(samples)
self.materials['samples'] = samples
if not (other_material is None):
self.materials['other_material'].append(other_material)
self.materials['other_material'] = other_material

if process_sequence is None:
self.process_sequence = list()
Expand Down
242 changes: 218 additions & 24 deletions isatools/sampletab.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,33 @@
from isatools.model.v1 import *


def read_sampletab_msi(fp):

def _peek(f):
position = f.tell()
l = f.readline()
f.seek(position)
return l
def _peek(f):
position = f.tell()
l = f.readline()
f.seek(position)
return l

def _read_tab_section(f, sec_key, next_sec_key=None):

def _read_tab_section(f, sec_key, next_sec_key=None):
line = f.readline()
normed_line = line.rstrip()
if normed_line[0] == '"':
normed_line = normed_line[1:]
if normed_line[len(normed_line) - 1] == '"':
normed_line = normed_line[:len(normed_line) - 1]
if not normed_line == sec_key:
raise IOError("Expected: " + sec_key + " section, but got: " + normed_line)
memf = io.StringIO()
while not _peek(f=f).rstrip() == next_sec_key:
line = f.readline()
normed_line = line.rstrip()
if normed_line[0] == '"':
normed_line = normed_line[1:]
if normed_line[len(normed_line)-1] == '"':
normed_line = normed_line[:len(normed_line)-1]
if not normed_line == sec_key:
raise IOError("Expected: " + sec_key + " section, but got: " + normed_line)
memf = io.StringIO()
while not _peek(f=f).rstrip() == next_sec_key:
line = f.readline()
if not line:
break
memf.write(line.rstrip() + '\n')
memf.seek(0)
return memf
if not line:
break
memf.write(line.rstrip() + '\n')
memf.seek(0)
return memf


def read_sampletab_msi(fp):

def _build_msi_df(f):
import numpy as np
Expand All @@ -50,6 +51,77 @@ def _build_msi_df(f):
return msi_df


def get_value(object_column, column_group, object_series, ontology_source_map, unit_categories):
    """Resolve one cell of a SampleTab table into a (value, unit) pair.

    Inspects the column labels immediately to the right of *object_column* to
    decide how to interpret the cell:

    * ``Term Source REF`` + ``Term Source ID`` follow -> the value is wrapped
      in an :class:`OntologyAnnotation`; unit is ``None``.
    * ``Unit`` + ``Term Source REF`` + ``Term Source ID`` follow -> the raw
      cell value is returned together with an ontology-annotated unit.
    * Otherwise -> the raw cell value is returned with unit ``None``.

    :param object_column: label of the column holding the value
    :param column_group: ordered collection of column labels for this table
    :param object_series: row data, indexable by column label
    :param ontology_source_map: map of term source name -> OntologySource
    :param unit_categories: cache of unit term -> OntologyAnnotation;
        updated in place so repeated units share one object
    :return: tuple ``(value, unit)`` where ``unit`` is ``None`` unless a
        Unit column group follows *object_column*
    """
    cell_value = object_series[object_column]

    column_index = list(column_group).index(object_column)

    try:
        offset_1r_col = column_group[column_index + 1]
        offset_2r_col = column_group[column_index + 2]
    except IndexError:
        # No qualifier columns to the right; plain value
        return cell_value, None

    if offset_1r_col.startswith('Term Source REF') and offset_2r_col.startswith('Term Source ID'):

        value = OntologyAnnotation(term=str(cell_value))

        term_source_value = object_series[offset_1r_col]

        # NOTE: equality (!=), not identity (is not) — string identity on
        # literals is an implementation detail and unreliable
        if term_source_value != '':

            try:
                value.term_source = ontology_source_map[term_source_value]
            except KeyError:
                # Best-effort: keep the annotation, report the unknown source
                print('term source: ', term_source_value, ' not found')

        term_accession_value = str(object_series[offset_2r_col])

        if term_accession_value != '':
            value.term_accession = term_accession_value

        return value, None

    try:
        offset_3r_col = column_group[column_index + 3]
    except IndexError:
        return cell_value, None

    if offset_1r_col.startswith('Unit') and offset_2r_col.startswith('Term Source REF') \
            and offset_3r_col.startswith('Term Source ID'):

        category_key = object_series[offset_1r_col]

        try:
            unit_term_value = unit_categories[category_key]
        except KeyError:
            # First time this unit is seen; create and cache it
            unit_term_value = OntologyAnnotation(term=category_key)
            unit_categories[category_key] = unit_term_value

        unit_term_source_value = object_series[offset_2r_col]

        if unit_term_source_value != '':

            try:
                unit_term_value.term_source = ontology_source_map[unit_term_source_value]
            except KeyError:
                print('term source: ', unit_term_source_value, ' not found')

        term_accession_value = object_series[offset_3r_col]

        if term_accession_value != '':
            unit_term_value.term_accession = term_accession_value

        return cell_value, unit_term_value

    return cell_value, None


def load(FP):

msi_df = read_sampletab_msi(FP)
Expand Down Expand Up @@ -99,4 +171,126 @@ def load(FP):
Comment(name="Organization Role.{}".format(i), value=row["Organization Role"])
])

return investigation
# Read in SCD section into DataFrame first
scd_df = pd.read_csv(_read_tab_section(
f=FP,
sec_key='[SCD]'
), sep='\t').fillna('')

sources = {}
samples = {}
characteristic_categories = {}
unit_categories = {}
processes = {}

try:
samples = dict(map(lambda x: (x, Sample(comments=[Comment(name="Sample Accession", value=x)])),
scd_df["Sample Accession"].drop_duplicates()))
except KeyError:
pass

study = Study(filename='s_{}.txt'.format(investigation.identifier), samples=list(samples.values()))
study.protocols = [Protocol(name='sample collection', protocol_type=OntologyAnnotation(term='sample collection'))]
investigation.studies = [study]

for _, object_series in scd_df.drop_duplicates().iterrows():

node_key = object_series["Sample Accession"]
sample = samples[node_key]

if sample is not None:

sample.name = object_series["Sample Name"]
sample.comments.append(Comment(name="Sample Description", value=object_series['Sample Description']))
sample.comments.append(Comment(name="Child Of", value=object_series['Child Of']))

# create Material to Characteristic Material Type
if len(object_series["Material"]) > 0:
category_key = "Material Type"
try:
category = characteristic_categories[category_key]
except KeyError:
category = OntologyAnnotation(term=category_key)
characteristic_categories[category_key] = category
characteristic = Characteristic(category=category)
v, u = get_value("Material", scd_df.columns, object_series, ontology_source_map,
unit_categories)
characteristic.value = v
characteristic.unit = u
sample.characteristics.append(characteristic)

if len(object_series["Organism"]) > 0:
category_key = "Organism"
try:
category = characteristic_categories[category_key]
except KeyError:
category = OntologyAnnotation(term=category_key)
characteristic_categories[category_key] = category
characteristic = Characteristic(category=category)
v, u = get_value("Organism", scd_df.columns, object_series, ontology_source_map,
unit_categories)
characteristic.value = v
characteristic.unit = u
sample.characteristics.append(characteristic)

if len(object_series["Sex"]) > 0:
category_key = "Sex"
try:
category = characteristic_categories[category_key]
except KeyError:
category = OntologyAnnotation(term=category_key)
characteristic_categories[category_key] = category
characteristic = Characteristic(category=category)
v, u = get_value("Sex", scd_df.columns, object_series, ontology_source_map,
unit_categories)
characteristic.value = v
characteristic.unit = u
sample.characteristics.append(characteristic)

for charac_column in [c for c in scd_df.columns if c.startswith('Characteristic[')]:
category_key = charac_column[15:-1]

try:
category = characteristic_categories[category_key]
except KeyError:
category = OntologyAnnotation(term=category_key)
characteristic_categories[category_key] = category

characteristic = Characteristic(category=category)

v, u = get_value(charac_column, scd_df.columns, object_series, ontology_source_map,
unit_categories)

characteristic.value = v
characteristic.unit = u

sample.characteristics.append(characteristic)

if len(object_series["Derived From"]) > 0:
try:
source = sources[object_series["Derived From"]]
except KeyError:
source_sample = samples[object_series["Derived From"]]
source = Source(name=source_sample.name,
characteristics=source_sample.characteristics,
comments=source_sample.comments)
sources[source.name] = source
sample.derives_from.append(source)
process_key = ":".join([source.name, 'sample collection'])
try:
process = processes[process_key]
except KeyError:
process = Process(executes_protocol=study.protocols[0])
processes.update(dict([(process_key, process)]))
if source.name not in [x.name for x in process.inputs]:
process.inputs.append(source)
if sample.name not in [x.name for x in process.outputs]:
process.outputs.append(sample)

samples[sample.name] = sample

study.materials['sources'] = list(sources.values())
study.materials['samples'] = [x for x in list(samples.values()) if x not in list(sources.values())]
study.process_sequence = list(processes.values())

return investigation

0 comments on commit 3588cfc

Please sign in to comment.