Skip to content

Commit

Permalink
Implement writing assay file from SampleTab towards #190
Browse files Browse the repository at this point in the history
  • Loading branch information
djcomlab committed Mar 1, 2017
1 parent 8a6c6b7 commit 3588cfc
Show file tree
Hide file tree
Showing 3 changed files with 222 additions and 32 deletions.
6 changes: 1 addition & 5 deletions isatools/isatab.py
Original file line number Diff line number Diff line change
Expand Up @@ -837,11 +837,7 @@ def write_value_columns(df_dict, label, x):
df_dict[label][-1] = x.value
df_dict[label + ".Unit"][-1] = x.unit
elif isinstance(x.value, OntologyAnnotation):
try:
df_dict[label][-1] = x.value.term
except KeyError:
print(df_dict.keys())
raise KeyError
df_dict[label][-1] = x.value.term
df_dict[label + ".Term Source REF"][-1] = x.value.term_source.name if x.value.term_source else ""
df_dict[label + ".Term Accession Number"][-1] = x.value.term_accession
else:
Expand Down
6 changes: 3 additions & 3 deletions isatools/model/v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,11 +394,11 @@ def __init__(self, id_='', filename="", identifier="", title="", description=""
'other_material': list()
}
if not (sources is None):
self.materials['sources'].append(sources)
self.materials['sources'] = sources
if not (samples is None):
self.materials['samples'].append(samples)
self.materials['samples'] = samples
if not (other_material is None):
self.materials['other_material'].append(other_material)
self.materials['other_material'] = other_material

if process_sequence is None:
self.process_sequence = list()
Expand Down
242 changes: 218 additions & 24 deletions isatools/sampletab.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,33 @@
from isatools.model.v1 import *


def read_sampletab_msi(fp):

def _peek(f):
position = f.tell()
l = f.readline()
f.seek(position)
return l
def _peek(f):
position = f.tell()
l = f.readline()
f.seek(position)
return l

def _read_tab_section(f, sec_key, next_sec_key=None):

def _read_tab_section(f, sec_key, next_sec_key=None):
line = f.readline()
normed_line = line.rstrip()
if normed_line[0] == '"':
normed_line = normed_line[1:]
if normed_line[len(normed_line) - 1] == '"':
normed_line = normed_line[:len(normed_line) - 1]
if not normed_line == sec_key:
raise IOError("Expected: " + sec_key + " section, but got: " + normed_line)
memf = io.StringIO()
while not _peek(f=f).rstrip() == next_sec_key:
line = f.readline()
normed_line = line.rstrip()
if normed_line[0] == '"':
normed_line = normed_line[1:]
if normed_line[len(normed_line)-1] == '"':
normed_line = normed_line[:len(normed_line)-1]
if not normed_line == sec_key:
raise IOError("Expected: " + sec_key + " section, but got: " + normed_line)
memf = io.StringIO()
while not _peek(f=f).rstrip() == next_sec_key:
line = f.readline()
if not line:
break
memf.write(line.rstrip() + '\n')
memf.seek(0)
return memf
if not line:
break
memf.write(line.rstrip() + '\n')
memf.seek(0)
return memf


def read_sampletab_msi(fp):

def _build_msi_df(f):
import numpy as np
Expand All @@ -50,6 +51,77 @@ def _build_msi_df(f):
return msi_df


def get_value(object_column, column_group, object_series, ontology_source_map, unit_categories):
    """Resolve one cell of a SampleTab table into a (value, unit) pair.

    Inspects the column labels immediately to the right of *object_column* to
    decide how to interpret the cell:

    * ``Term Source REF`` + ``Term Source ID`` follow -> the value is wrapped
      in an :class:`OntologyAnnotation`; unit is ``None``.
    * ``Unit`` + ``Term Source REF`` + ``Term Source ID`` follow -> the raw
      cell value is returned together with an ontology-annotated unit.
    * Otherwise -> the raw cell value is returned with unit ``None``.

    :param object_column: label of the column holding the value
    :param column_group: ordered collection of column labels for this table
    :param object_series: row data, indexable by column label
    :param ontology_source_map: map of term source name -> OntologySource
    :param unit_categories: cache of unit term -> OntologyAnnotation;
        updated in place so repeated units share one object
    :return: tuple ``(value, unit)`` where ``unit`` is ``None`` unless a
        Unit column group follows *object_column*
    """
    cell_value = object_series[object_column]

    column_index = list(column_group).index(object_column)

    try:
        offset_1r_col = column_group[column_index + 1]
        offset_2r_col = column_group[column_index + 2]
    except IndexError:
        # No qualifier columns to the right; plain value
        return cell_value, None

    if offset_1r_col.startswith('Term Source REF') and offset_2r_col.startswith('Term Source ID'):

        value = OntologyAnnotation(term=str(cell_value))

        term_source_value = object_series[offset_1r_col]

        # NOTE: equality (!=), not identity (is not) — string identity on
        # literals is an implementation detail and unreliable
        if term_source_value != '':

            try:
                value.term_source = ontology_source_map[term_source_value]
            except KeyError:
                # Best-effort: keep the annotation, report the unknown source
                print('term source: ', term_source_value, ' not found')

        term_accession_value = str(object_series[offset_2r_col])

        if term_accession_value != '':
            value.term_accession = term_accession_value

        return value, None

    try:
        offset_3r_col = column_group[column_index + 3]
    except IndexError:
        return cell_value, None

    if offset_1r_col.startswith('Unit') and offset_2r_col.startswith('Term Source REF') \
            and offset_3r_col.startswith('Term Source ID'):

        category_key = object_series[offset_1r_col]

        try:
            unit_term_value = unit_categories[category_key]
        except KeyError:
            # First time this unit is seen; create and cache it
            unit_term_value = OntologyAnnotation(term=category_key)
            unit_categories[category_key] = unit_term_value

        unit_term_source_value = object_series[offset_2r_col]

        if unit_term_source_value != '':

            try:
                unit_term_value.term_source = ontology_source_map[unit_term_source_value]
            except KeyError:
                print('term source: ', unit_term_source_value, ' not found')

        term_accession_value = object_series[offset_3r_col]

        if term_accession_value != '':
            unit_term_value.term_accession = term_accession_value

        return cell_value, unit_term_value

    return cell_value, None


def load(FP):

msi_df = read_sampletab_msi(FP)
Expand Down Expand Up @@ -99,4 +171,126 @@ def load(FP):
Comment(name="Organization Role.{}".format(i), value=row["Organization Role"])
])

return investigation
# Read in SCD section into DataFrame first
scd_df = pd.read_csv(_read_tab_section(
f=FP,
sec_key='[SCD]'
), sep='\t').fillna('')

sources = {}
samples = {}
characteristic_categories = {}
unit_categories = {}
processes = {}

try:
samples = dict(map(lambda x: (x, Sample(comments=[Comment(name="Sample Accession", value=x)])),
scd_df["Sample Accession"].drop_duplicates()))
except KeyError:
pass

study = Study(filename='s_{}.txt'.format(investigation.identifier), samples=list(samples.values()))
study.protocols = [Protocol(name='sample collection', protocol_type=OntologyAnnotation(term='sample collection'))]
investigation.studies = [study]

for _, object_series in scd_df.drop_duplicates().iterrows():

node_key = object_series["Sample Accession"]
sample = samples[node_key]

if sample is not None:

sample.name = object_series["Sample Name"]
sample.comments.append(Comment(name="Sample Description", value=object_series['Sample Description']))
sample.comments.append(Comment(name="Child Of", value=object_series['Child Of']))

# create Material to Characteristic Material Type
if len(object_series["Material"]) > 0:
category_key = "Material Type"
try:
category = characteristic_categories[category_key]
except KeyError:
category = OntologyAnnotation(term=category_key)
characteristic_categories[category_key] = category
characteristic = Characteristic(category=category)
v, u = get_value("Material", scd_df.columns, object_series, ontology_source_map,
unit_categories)
characteristic.value = v
characteristic.unit = u
sample.characteristics.append(characteristic)

if len(object_series["Organism"]) > 0:
category_key = "Organism"
try:
category = characteristic_categories[category_key]
except KeyError:
category = OntologyAnnotation(term=category_key)
characteristic_categories[category_key] = category
characteristic = Characteristic(category=category)
v, u = get_value("Organism", scd_df.columns, object_series, ontology_source_map,
unit_categories)
characteristic.value = v
characteristic.unit = u
sample.characteristics.append(characteristic)

if len(object_series["Sex"]) > 0:
category_key = "Sex"
try:
category = characteristic_categories[category_key]
except KeyError:
category = OntologyAnnotation(term=category_key)
characteristic_categories[category_key] = category
characteristic = Characteristic(category=category)
v, u = get_value("Sex", scd_df.columns, object_series, ontology_source_map,
unit_categories)
characteristic.value = v
characteristic.unit = u
sample.characteristics.append(characteristic)

for charac_column in [c for c in scd_df.columns if c.startswith('Characteristic[')]:
category_key = charac_column[15:-1]

try:
category = characteristic_categories[category_key]
except KeyError:
category = OntologyAnnotation(term=category_key)
characteristic_categories[category_key] = category

characteristic = Characteristic(category=category)

v, u = get_value(charac_column, scd_df.columns, object_series, ontology_source_map,
unit_categories)

characteristic.value = v
characteristic.unit = u

sample.characteristics.append(characteristic)

if len(object_series["Derived From"]) > 0:
try:
source = sources[object_series["Derived From"]]
except KeyError:
source_sample = samples[object_series["Derived From"]]
source = Source(name=source_sample.name,
characteristics=source_sample.characteristics,
comments=source_sample.comments)
sources[source.name] = source
sample.derives_from.append(source)
process_key = ":".join([source.name, 'sample collection'])
try:
process = processes[process_key]
except KeyError:
process = Process(executes_protocol=study.protocols[0])
processes.update(dict([(process_key, process)]))
if source.name not in [x.name for x in process.inputs]:
process.inputs.append(source)
if sample.name not in [x.name for x in process.outputs]:
process.outputs.append(sample)

samples[sample.name] = sample

study.materials['sources'] = list(sources.values())
study.materials['samples'] = [x for x in list(samples.values()) if x not in list(sources.values())]
study.process_sequence = list(processes.values())

return investigation

0 comments on commit 3588cfc

Please sign in to comment.