Merge branch 'master' into factor_value_lists

bihealth · Jan 16, 2024 · 09c300f · 09c300f
2 parents 818b394 + 2572456
commit 09c300f
Show file tree

Hide file tree

Showing 5 changed files with 90 additions and 11 deletions.
diff --git a/altamisa/isatab/parse_assay_study.py b/altamisa/isatab/parse_assay_study.py
@@ -817,7 +817,7 @@ def from_stream(cls, study_id: str, input_file: TextIO, filename: Optional[str]
     def __init__(self, study_id: str, input_file: TextIO, filename: Optional[str]):
         self.study_id = study_id
         self.input_file = input_file
-        self.filename = filename or getattr(input_file, "name", "<no file>")
+        self._filename = filename or getattr(input_file, "name", "<no file>")
         self.unique_rows = set()
         self.duplicate_rows = []
         self._reader = csv.reader(input_file, delimiter="\t", quotechar='"')
@@ -848,6 +848,9 @@ def _read_next_line(self):
                 self.unique_rows.add("\t".join(self._line))
         except StopIteration:
             self._line = None
+        except UnicodeDecodeError as e:  # pragma: no cover
+            msg = f"Invalid encoding of study file '{self._filename}' (use Unicode/UTF-8)."
+            raise ParseIsatabException(msg) from e
         return prev_line
 
     def read(self):
@@ -856,7 +859,7 @@ def read(self):
 
         :returns: Nodes per row of the study file
         """
-        builder = _StudyRowBuilder(self.header, self.filename, self.study_id)
+        builder = _StudyRowBuilder(self.header, self._filename, self.study_id)
         while True:
             line = self._read_next_line()
             if line:
@@ -939,7 +942,7 @@ def __init__(self, study_id: str, assay_id: str, input_file: TextIO, filename: O
         self.study_id = study_id
         self.assay_id = assay_id
         self.input_file = input_file
-        self.filename = filename or getattr(input_file, "name", "<no file>")
+        self._filename = filename or getattr(input_file, "name", "<no file>")
         self.unique_rows = set()
         self.duplicate_rows = []
         self._reader = csv.reader(input_file, delimiter="\t", quotechar='"')
@@ -970,6 +973,9 @@ def _read_next_line(self):
                 self.unique_rows.add("\t".join(self._line))
         except StopIteration:
             self._line = None
+        except UnicodeDecodeError as e:  # pragma: no cover
+            msg = f"Invalid encoding of assay file '{self._filename}' (use Unicode/UTF-8)."
+            raise ParseIsatabException(msg) from e
         return prev_line
 
     def read(self):
@@ -978,7 +984,7 @@ def read(self):
 
         :return: Nodes per row of the assay file
         """
-        builder = _AssayRowBuilder(self.header, self.filename, self.study_id, self.assay_id)
+        builder = _AssayRowBuilder(self.header, self._filename, self.study_id, self.assay_id)
         while True:
             line = self._read_next_line()
             if line:

diff --git a/altamisa/isatab/parse_investigation.py b/altamisa/isatab/parse_investigation.py
@@ -128,6 +128,9 @@ def _read_next_line(self) -> Optional[List[str]]:
                 self._line = list_strip(next(self._reader))
         except StopIteration:
             self._line = None
+        except UnicodeDecodeError as e:  # pragma: no cover
+            msg = f"Invalid encoding of investigation file '{self._filename}' (use Unicode/UTF-8)."
+            raise ParseIsatabException(msg) from e
         return prev_line
 
     def _next_line_startswith_comment(self):
@@ -366,7 +369,7 @@ def _read_studies(self) -> Iterator[models.StudyInfo]:
             line = self._read_next_line()
             if not line or not line[0] == investigation_headers.STUDY:  # pragma: no cover
                 tpl = "Expected {} but got {}"
-                msg = tpl.format(investigation_headers.INVESTIGATION, line)
+                msg = tpl.format(investigation_headers.STUDY, line)
                 raise ParseIsatabException(msg)
             # Read the other lines in this section.
             section, comment_keys = self._read_single_column_section(

diff --git a/requirements/test.txt b/requirements/test.txt
@@ -18,3 +18,5 @@ flake8 >=3.5.0
 isort
 
 pyright
+
+syrupy
diff --git a/tests/__snapshots__/test_apps.ambr b/tests/__snapshots__/test_apps.ambr
@@ -0,0 +1,65 @@
+# serializer version: 1
+# name: test_isatab2isatab
+  list([
+    '''
+      Investigation with only one study contains metadata:
+      	ID:	i_minimal
+      	Title:	Minimal Investigation
+      	Path:	i_minimal.txt
+      	Submission Date:	
+      	Public Release Date:	None
+      	Prefer recording metadata in the study section.
+    ''',
+    '''
+      Assay without platform:
+      Path:	a_minimal.txt
+      Measurement Type:	exome sequencing assay
+      Technology Type:	nucleotide sequencing
+      Technology Platform:	
+    ''',
+    'No reference headers available for section INVESTIGATION PUBLICATIONS. Applying default order.',
+    'No reference headers available for section INVESTIGATION CONTACTS. Applying default order.',
+    'No reference headers available for section STUDY DESIGN DESCRIPTORS. Applying default order.',
+    'No reference headers available for section STUDY PUBLICATIONS. Applying default order.',
+    'No reference headers available for section STUDY FACTORS. Applying default order.',
+    'No reference headers available for section STUDY CONTACTS. Applying default order.',
+  ])
+# ---
+# name: test_isatab2isatab_input_is_output
+  '<Result IsaException("Can\'t output ISA-tab files to same directory as as input: /home/runner/work/altamisa/tests/data/i_minimal == /home/runner/work/altamisa/tests/data/i_minimal")>'
+# ---
+# name: test_isatab_validate
+  list([
+    'Incomplete ontology source; found: , Incomplete 1, 1, Incomplete 1, ()',
+    'Incomplete ontology source; found: Incomplete 2, , 2, Incomplete 2, ()',
+    'Ontology source name including whitespace(s); found: Incomplete 2, , 2, Incomplete 2, ()',
+    'Incomplete ontology source; found: Incomplete 3, Incomplete 3, , Incomplete 3, ()',
+    'Ontology source name including whitespace(s); found: Incomplete 3, Incomplete 3, , Incomplete 3, ()',
+    'Incomplete ontology source; found: Incomplete 4, Incomplete 4, 4, , ()',
+    'Ontology source name including whitespace(s); found: Incomplete 4, Incomplete 4, 4, , ()',
+    '''
+      Investigation with only one study contains metadata:
+      	ID:	i_warnings
+      	Title:	Investigation with Warnings
+      	Path:	i_warnings.txt
+      	Submission Date:	
+      	Public Release Date:	None
+      	Prefer recording metadata in the study section.
+    ''',
+    'Invalid mail address: invalid_mail',
+    'Invalid phone/fax number: CALL-ME',
+    'Invalid phone/fax number: FAX-ME',
+    'Invalid pubmed_id string: not-pubmed',
+    'Invalid doi string: not-a-doi',
+    '''
+      Assay without platform:
+      Path:	a_warnings.txt
+      Measurement Type:	exome sequencing assay
+      Technology Type:	nucleotide sequencing
+      Technology Platform:	
+    ''',
+    'Assay path used more than once: a_warnings.txt',
+    "Found samples in assay 'a_warnings.txt' but not in parent study 's_warnings.txt':\\n0815-N2",
+    "Found samples in assay 'a_warnings.txt' but not in parent study 's_warnings.txt':\\n0815-N2",
+  ])
+# ---
diff --git a/tests/test_apps.py b/tests/test_apps.py
@@ -4,6 +4,7 @@
 import os.path
 
 import pytest
+from syrupy.assertion import SnapshotAssertion
 from typer.testing import CliRunner
 
 from altamisa.apps import isatab2dot, isatab2isatab, isatab_validate
@@ -12,18 +13,18 @@
 runner = CliRunner()
 
 
-def test_isatab_validate():
+def test_isatab_validate(snapshot: SnapshotAssertion):
     i_file = os.path.join(os.path.dirname(__file__), "data", "i_warnings", "i_warnings.txt")
     argv = ["--input-investigation-file", i_file, "--show-duplicate-warnings"]
 
     with pytest.warns(IsaWarning) as record:
         result = runner.invoke(isatab_validate.app, argv)
         assert result.exit_code == 0
 
-    assert 17 == len(record)
+    assert snapshot == [str(r.message) for r in record]
 
 
-def test_isatab2isatab(tmpdir):
+def test_isatab2isatab(tmpdir, snapshot: SnapshotAssertion):
     i_file = os.path.join(os.path.dirname(__file__), "data", "i_minimal", "i_minimal.txt")
     argv = [
         "--input-investigation-file",
@@ -38,10 +39,10 @@ def test_isatab2isatab(tmpdir):
         result = runner.invoke(isatab2isatab.app, argv)
         assert result.exit_code == 0
 
-    assert 8 == len(record)
+    assert snapshot == [str(r.message) for r in record]
 
 
-def test_isatab2isatab_input_is_output(tmpdir):
+def test_isatab2isatab_input_is_output(tmpdir, snapshot: SnapshotAssertion):
     i_file = os.path.join(os.path.dirname(__file__), "data", "i_minimal", "i_minimal.txt")
     argv = [
         "--input-investigation-file",
@@ -54,7 +55,9 @@ def test_isatab2isatab_input_is_output(tmpdir):
 
     result = runner.invoke(isatab2isatab.app, argv)
     assert result.exit_code == 1
-    assert "Can't output ISA-tab files to same directory as as input" in str(result)
+    assert snapshot == str(result).replace(
+        os.path.dirname(__file__), "/home/runner/work/altamisa/tests"
+    )
 
 
 def test_isatab2dot(tmpdir):