Add assay/study information validation
* Add study/assay uniqueness validation (#45)
* Check if investigation refers to studies (#17)
* Fix Reader usage in isatab2dot app
* Modify assay info and assay/study path usage
* Add validation of minimal assay/study information (#17)
* Rearrange isatab2isatab and isatab2validation apps
Mathias Kuhring authored and mkuhring committed May 15, 2019
1 parent 0738002 commit 665363d
Showing 19 changed files with 450 additions and 352 deletions.
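The validator changes that implement the new uniqueness and minimal-information checks (#45, #17) are among the files not rendered below. As a rough illustration only, this is a hypothetical sketch of a study/assay uniqueness check over file references, not the actual InvestigationValidator code; the attribute names (investigation.studies, study.info.path, assay.path) match the models used in the diffs below:

from collections import Counter
import warnings


def check_unique_study_assay_paths(investigation):
    """Hypothetical sketch of a study/assay uniqueness check; the real
    InvestigationValidator implementation is not shown in this commit view."""
    paths = [str(study.info.path) for study in investigation.studies if study.info.path]
    paths += [
        str(assay.path)
        for study in investigation.studies
        for assay in study.assays
        if assay.path
    ]
    for path, count in Counter(paths).items():
        if count > 1:
            warnings.warn("Duplicate study/assay file reference: {}".format(path))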
13 changes: 3 additions & 10 deletions altamisa/apps/isatab2dot.py
@@ -61,24 +61,17 @@ def run(args):

for s, study_info in enumerate(investigation.studies):
with open(os.path.join(path, study_info.info.path), "rt") as inputf:
study = StudyReader.from_stream(
investigation, study_info, "S{}".format(s + 1), inputf
).read()
study = StudyReader.from_stream("S{}".format(s + 1), inputf).read()
print(" /* study {} */".format(study_info.info.path), file=args.output_file)
print(" subgraph clusterStudy{} {{".format(s), file=args.output_file)
print(' label = "Study: {}"'.format(study_info.info.path), file=args.output_file)
print_dot(study, args.output_file)
print(" }", file=args.output_file)

for a, assay_info in enumerate(study_info.assays.values()):
for a, assay_info in enumerate(study_info.assays):
with open(os.path.join(path, assay_info.path), "rt") as inputf:
assay = AssayReader.from_stream(
investigation,
study_info,
assay_info,
"S{}".format(s + 1),
"A{}".format(a + 1),
inputf,
"S{}".format(s + 1), "A{}".format(a + 1), inputf
).read()
print(" /* assay {} */".format(assay_info.path), file=args.output_file)
print(" subgraph clusterAssayS{}A{} {{".format(s, a), file=args.output_file)
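Two changes are visible in this hunk: StudyReader.from_stream and AssayReader.from_stream no longer take the investigation/study/assay info objects, only a reader identifier and the input stream, and study_info.assays is iterated directly instead of via .values(). A minimal sketch of the new calling convention (file names are placeholders; the import path is assumed from the package layout):

from altamisa.isatab import AssayReader, StudyReader  # import path assumed

with open("s_study.txt", "rt") as inputf:  # placeholder study file
    study = StudyReader.from_stream("S1", inputf).read()

with open("a_assay.txt", "rt") as inputf:  # placeholder assay file
    assay = AssayReader.from_stream("S1", "A1", inputf).read()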
102 changes: 61 additions & 41 deletions altamisa/apps/isatab2isatab.py
@@ -24,70 +24,90 @@
def run(args):
# Collect warnings
with warnings.catch_warnings(record=True) as records:
run_warnings_caught(args)

# Check if input and output directory are different
path_in = os.path.normpath(os.path.dirname(args.input_investigation_file.name))
path_out = os.path.normpath(os.path.dirname(args.output_investigation_file.name))
if path_in == path_out:
tpl = "Can't output ISA-tab files to same directory as as input: {} == {}"
msg = tpl.format(path_in, path_out)
raise IsaException(msg)

# Read investigation
investigation = InvestigationReader.from_stream(args.input_investigation_file).read()

# Read studies and assays
studies = {}
assays = {}
for s, study_info in enumerate(investigation.studies):
# Print warnings
if not args.no_warnings:
for record in records:
warnings.showwarning(
record.message, record.category, record.filename, record.lineno, record.line
)


def run_warnings_caught(args):
# Check if input and output directory are different
path_in = os.path.normpath(os.path.dirname(args.input_investigation_file.name))
path_out = os.path.normpath(os.path.dirname(args.output_investigation_file.name))
if path_in == path_out:
tpl = "Can't output ISA-tab files to same directory as as input: {} == {}"
msg = tpl.format(path_in, path_out)
raise IsaException(msg)

investigation, studies, assays = run_reading(args, path_in)
run_writing(args, path_out, investigation, studies, assays)


def run_reading(args, path_in):
# Read investigation
investigation = InvestigationReader.from_stream(args.input_investigation_file).read()

# Validate investigation
InvestigationValidator(investigation).validate()

# Read studies and assays
studies = {}
assays = {}
for s, study_info in enumerate(investigation.studies):
if study_info.info.path:
with open(os.path.join(path_in, study_info.info.path), "rt") as inputf:
studies[s] = StudyReader.from_stream("S{}".format(s + 1), inputf).read()
if study_info.assays:
assays[s] = {}
for a, assay_info in enumerate(study_info.assays.values()):
if study_info.assays:
assays[s] = {}
for a, assay_info in enumerate(study_info.assays):
if assay_info.path:
with open(os.path.join(path_in, assay_info.path), "rt") as inputf:
assays[s][a] = AssayReader.from_stream(
"S{}".format(s + 1), "A{}".format(a + 1), inputf
).read()

# Validate investigation
InvestigationValidator(investigation).validate()

# Validate studies and assays
for s, study_info in enumerate(investigation.studies):
# Validate studies and assays
for s, study_info in enumerate(investigation.studies):
if study_info.info.path:
StudyValidator(investigation, study_info, studies[s]).validate()
for a, assay_info in enumerate(study_info.assays.values()):
for a, assay_info in enumerate(study_info.assays):
if assay_info.path:
AssayValidator(investigation, study_info, assay_info, assays[s][a]).validate()

# Write investigation
InvestigationWriter.from_stream(
investigation, args.output_investigation_file, quote=args.quotes
).write()
return investigation, studies, assays


# Write studies and assays
for s, study_info in enumerate(investigation.studies):
if args.output_investigation_file.name == "<stdout>":
def run_writing(args, path_out, investigation, studies, assays):
# Write investigation
InvestigationWriter.from_stream(
investigation, args.output_investigation_file, quote=args.quotes
).write()

# Write studies and assays
for s, study_info in enumerate(investigation.studies):
if args.output_investigation_file.name == "<stdout>":
if study_info.info.path:
StudyWriter.from_stream(
studies[s], args.output_investigation_file, quote=args.quotes
).write()
for a, assay_info in enumerate(study_info.assays.values()):
for a, assay_info in enumerate(study_info.assays):
if assay_info.path:
AssayWriter.from_stream(
assays[s][a], args.output_investigation_file, quote=args.quotes
).write()
else:
else:
if study_info.info.path:
with open(os.path.join(path_out, study_info.info.path), "wt") as outputf:
StudyWriter.from_stream(studies[s], outputf, quote=args.quotes).write()
for a, assay_info in enumerate(study_info.assays.values()):
for a, assay_info in enumerate(study_info.assays):
if assay_info.path:
with open(os.path.join(path_out, assay_info.path), "wt") as outputf:
AssayWriter.from_stream(assays[s][a], outputf, quote=args.quotes).write()

# Print warnings
if not args.no_warnings:
for record in records:
warnings.showwarning(
record.message, record.category, record.filename, record.lineno, record.line
)


def main(argv=None):
parser = argparse.ArgumentParser()
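Taken together, the new run_reading() validates the investigation before any study or assay file is opened and skips entries without a file. A condensed sketch of that reading flow, assuming the same imports as the app and an `args`/`path_in` pair as set up in run_warnings_caught():

investigation = InvestigationReader.from_stream(args.input_investigation_file).read()
InvestigationValidator(investigation).validate()  # now runs before studies/assays are read

studies, assays = {}, {}
for s, study_info in enumerate(investigation.studies):
    if study_info.info.path:  # the study file reference is optional now
        with open(os.path.join(path_in, study_info.info.path), "rt") as inputf:
            studies[s] = StudyReader.from_stream("S{}".format(s + 1), inputf).read()
    if study_info.assays:
        assays[s] = {}
        for a, assay_info in enumerate(study_info.assays):  # tuple, not a dict
            if assay_info.path:  # the assay file reference is optional, too
                with open(os.path.join(path_in, assay_info.path), "rt") as inputf:
                    assays[s][a] = AssayReader.from_stream(
                        "S{}".format(s + 1), "A{}".format(a + 1), inputf
                    ).read()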
54 changes: 31 additions & 23 deletions altamisa/apps/isatab2validation.py
@@ -18,46 +18,54 @@


def run(args):
# Show all warnings of same type and content
if args.show_duplicate_warnings:
warnings.simplefilter("always")

# Collect warnings
with warnings.catch_warnings(record=True) as records:
run_warnings_caught(args)

# Read investigation
investigation = InvestigationReader.from_stream(args.input_investigation_file).read()
args.input_investigation_file.close()
# Print warnings
for record in records:
warnings.showwarning(
record.message, record.category, record.filename, record.lineno, record.line
)


def run_warnings_caught(args):
# Read investigation
investigation = InvestigationReader.from_stream(args.input_investigation_file).read()
args.input_investigation_file.close()

# Validate investigation
InvestigationValidator(investigation).validate()

# Read studies and assays
path_in = os.path.normpath(os.path.dirname(args.input_investigation_file.name))
studies = {}
assays = {}
for s, study_info in enumerate(investigation.studies):
# Read studies and assays
path_in = os.path.normpath(os.path.dirname(args.input_investigation_file.name))
studies = {}
assays = {}
for s, study_info in enumerate(investigation.studies):
if study_info.info.path:
with open(os.path.join(path_in, study_info.info.path), "rt") as inputf:
studies[s] = StudyReader.from_stream("S{}".format(s + 1), inputf).read()
if study_info.assays:
assays[s] = {}
for a, assay_info in enumerate(study_info.assays.values()):
if study_info.assays:
assays[s] = {}
for a, assay_info in enumerate(study_info.assays):
if assay_info.path:
with open(os.path.join(path_in, assay_info.path), "rt") as inputf:
assays[s][a] = AssayReader.from_stream(
"S{}".format(s + 1), "A{}".format(a + 1), inputf
).read()

# Validate investigation
InvestigationValidator(investigation).validate()

# Validate studies and assays
for s, study_info in enumerate(investigation.studies):
# Validate studies and assays
for s, study_info in enumerate(investigation.studies):
if study_info.info.path:
StudyValidator(investigation, study_info, studies[s]).validate()
for a, assay_info in enumerate(study_info.assays.values()):
for a, assay_info in enumerate(study_info.assays):
if assay_info.path:
AssayValidator(investigation, study_info, assay_info, assays[s][a]).validate()

# Print warnings
for record in records:
warnings.showwarning(
record.message, record.category, record.filename, record.lineno, record.line
)


def main(argv=None):
parser = argparse.ArgumentParser()
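Both apps now follow the same pattern: the work happens inside warnings.catch_warnings(record=True) in a dedicated run_warnings_caught(), and the recorded warnings are replayed afterwards (isatab2isatab additionally honours a no_warnings flag). The new show_duplicate_warnings option switches the warnings filter to "always", since Python's default filter reports identical warnings only once. A generic sketch of that pattern, with a `work` callable standing in for run_warnings_caught:

import warnings


def run_with_collected_warnings(work, show_duplicate_warnings=False):
    """Generic sketch of the warning handling used by the apps above."""
    if show_duplicate_warnings:
        # "always" replays every occurrence instead of collapsing duplicates
        warnings.simplefilter("always")
    with warnings.catch_warnings(record=True) as records:
        work()  # e.g. parse the ISA-tab files and run the validators
    for record in records:
        warnings.showwarning(
            record.message, record.category, record.filename, record.lineno, record.line
        )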
4 changes: 2 additions & 2 deletions altamisa/isatab/models.py
@@ -261,8 +261,8 @@ class StudyInfo(NamedTuple):
publications: Tuple[PublicationInfo]
#: Study factors by name
factors: Dict[str, FactorInfo]
#: Study assays by name
assays: Dict[str, AssayInfo]
#: Study assays
assays: Tuple[AssayInfo]
#: Study protocols by name
protocols: Dict[str, ProtocolInfo]
#: Study contact list
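Because StudyInfo.assays is now a plain tuple, and an AssayInfo may have no file, client code iterates it directly and guards on the optional path instead of going through .values(), as the app changes above already show. A small helper illustrating the new access pattern (the helper itself is illustrative, not part of altamisa):

def list_assay_files(study_info):
    """Print the assay file declared for each assay of a study."""
    # StudyInfo.assays is a Tuple[AssayInfo] now (it used to be a dict keyed by
    # file name), so iterate it directly; the path may be None.
    for a, assay_info in enumerate(study_info.assays):
        if assay_info.path:
            print("A{}: {}".format(a + 1, assay_info.path))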
3 changes: 2 additions & 1 deletion altamisa/isatab/parse_assay_study.py
@@ -182,7 +182,8 @@ def _assign_column_headers(self): # noqa: C901
if not is_secondary:
prev = header

def _raise_seen_before(self, name, col_no):
@staticmethod
def _raise_seen_before(name, col_no):
tpl = 'Seen "{}" header for same entity in col {}'
msg = tpl.format(name, col_no)
raise ParseIsatabException(msg)
36 changes: 12 additions & 24 deletions altamisa/isatab/parse_investigation.py
@@ -340,7 +340,6 @@ def _read_contacts(self) -> Iterator[models.ContactInfo]:
)

def _read_studies(self) -> Iterator[models.StudyInfo]:
# TODO: is it legal to have no study in the investigation?
while self._line:
# Read STUDY header
line = self._read_next_line()
@@ -355,7 +354,9 @@ def _read_studies(self) -> Iterator[models.StudyInfo]:
# From this, parse the basic information from the study
comments = _parse_comments(section, comment_keys)
basic_info = models.BasicInfo(
Path(section[investigation_headers.STUDY_FILE_NAME]),
Path(section[investigation_headers.STUDY_FILE_NAME])
if section[investigation_headers.STUDY_FILE_NAME]
else None,
section[investigation_headers.STUDY_IDENTIFIER],
section[investigation_headers.STUDY_TITLE],
section[investigation_headers.STUDY_DESCRIPTION],
@@ -369,7 +370,7 @@ def _read_studies(self) -> Iterator[models.StudyInfo]:
design_descriptors = tuple(self._read_study_design_descriptors())
publications = tuple(self._read_study_publications())
factors = {f.name: f for f in self._read_study_factors()}
assays = {a.path.name: a for a in self._read_study_assays()}
assays = tuple(self._read_study_assays())
protocols = {p.name: p for p in self._read_study_protocols()}
contacts = tuple(self._read_study_contacts())
# Create study object
@@ -470,25 +471,8 @@ def _read_study_assays(self) -> Iterator[models.AssayInfo]:
tech_plat,
),
) in enumerate(columns):
if not file_ and any(
if any(
(
meas_type,
meas_type_term_acc,
meas_type_term_src,
tech_type,
tech_type_term_acc,
tech_type_term_src,
tech_plat,
)
):
# don't allow assay columns without assay file
tpl = (
"Found assay with no {} in {}; found: "
'"{}", "{}", "{}", "{}", "{}", "{}", "{}", "{}"'
)
msg = tpl.format(
investigation_headers.STUDY_ASSAY_FILE_NAME,
investigation_headers.STUDY_ASSAYS,
file_,
meas_type,
meas_type_term_acc,
Expand All @@ -498,13 +482,17 @@ def _read_study_assays(self) -> Iterator[models.AssayInfo]:
tech_type_term_src,
tech_plat,
)
raise ParseIsatabException(msg)
elif file_: # if at least a file exists --> AssayInfo
):
meas = models.OntologyTermRef(meas_type, meas_type_term_acc, meas_type_term_src)
tech = models.OntologyTermRef(tech_type, tech_type_term_acc, tech_type_term_src)
comments = _parse_comments(section, comment_keys, i)
yield models.AssayInfo(
meas, tech, tech_plat, Path(file_), comments, list(section.keys())
meas,
tech,
tech_plat,
Path(file_) if file_ else None,
comments,
list(section.keys()),
)
# else, i.e. if all assay fields are empty --> Nothing

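The recurring idiom in this hunk is that an empty STUDY FILE NAME or STUDY ASSAY FILE NAME cell now yields None instead of an empty Path; the `if study_info.info.path:` and `if assay_info.path:` guards in the apps rely on this. Likewise, an assay section with metadata but no file name no longer aborts parsing with ParseIsatabException; it now yields an AssayInfo with path None, while a completely empty assay column still yields nothing. The parser inlines the conditional expression; a standalone illustration (the helper name is made up, the file name is a placeholder):

from pathlib import Path


def path_or_none(file_name):
    """Empty or missing file name cells become None rather than Path("")."""
    return Path(file_name) if file_name else None


assert path_or_none("") is None
assert path_or_none("a_assay.txt") == Path("a_assay.txt")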
(Diffs for the remaining 13 changed files are not shown here.)
