Add support for UBAM upload and optional referenceArn (#23)
duongwilAWS authored Nov 10, 2023
1 parent 850602b commit b1d50f2
Showing 8 changed files with 77 additions and 16 deletions.
18 changes: 18 additions & 0 deletions CONTRIBUTING.md
@@ -40,6 +40,24 @@ GitHub provides additional document on [forking a repository](https://help.githu
[creating a pull request](https://help.github.com/articles/creating-a-pull-request/).


### Set-up
When working on the Omics Transfer Manager, install the required dependencies and service models. In particular, make sure you have the latest Omics service model if your installed copy is out of date.

Install the Omics Transfer Manager dependencies with the `pip` command (use `pip3` if that is how Python 3's pip is invoked on your system).

The Omics Transfer Manager's dependencies require Python 3.7 or later.

The Omics Transfer Manager uses the `poetry` library for dependency management and packaging.

```
pip install botocore
pip install mypy-boto3-omics
pip install poetry
poetry install
```

After running `poetry install`, the Omics Transfer Manager should be ready to use.
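
As a quick sanity check after installation, you can confirm that the key dependencies resolve in the active environment. This snippet is only an illustrative sketch, not part of the project's tooling:

```
# Sanity check: confirm the dependencies installed above import cleanly.
import botocore
import mypy_boto3_omics  # type stubs for the Omics client

print("botocore", botocore.__version__)
```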

## Finding contributions to work on
Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.

12 changes: 10 additions & 2 deletions README.md
@@ -4,6 +4,14 @@ Tools for working with the Amazon Omics Service.

## Using the Omics Transfer Manager

### Installation
Amazon Omics Tools is available on PyPI. To install it, run:

```
pip install amazon-omics-tools
```

### Basic Usage
The `TransferManager` class makes it easy to download files for an Omics reference or read set. By default the files are saved to the current directory, or you can specify a custom location with the `directory` parameter.
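
The complete example is in the collapsed lines of this hunk; the sketch below shows the basic shape of a download, with placeholder IDs and a `download_read_set` call whose exact signature is assumed from the surrounding documentation:

```python
import boto3

from omics.transfer.manager import TransferManager

SEQUENCE_STORE_ID = "<my-sequence-store-id>"  # placeholder

client = boto3.client("omics")
manager = TransferManager(client)

# Download all files for a read set into a custom directory
# (omit `directory` to save them in the current directory).
manager.download_read_set(SEQUENCE_STORE_ID, "<my-read-set-id>", directory="my-sequence-data")
```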

@@ -57,7 +65,7 @@ For paired end reads, you can define `fileobjs` as a list of files.
read_set_id = manager.upload_read_set(
"my-sequence-data/read-set-file.bam",
SEQUENCE_STORE_ID,
ReadSetFileType.BAM,
"BAM",
"name",
"subject-id",
"sample-id",
@@ -68,7 +76,7 @@
read_set_id = manager.upload_read_set(
["my-sequence-data/read-set-file_1.fastq.gz", "my-sequence-data/read-set-file_2.fastq.gz"],
SEQUENCE_STORE_ID,
ReadSetFileType.FASTQ,
"FASTQ",
"name",
"subject-id",
"sample-id",
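
With this commit, `reference_arn` becomes optional for unlinked file types. A hedged sketch of an unlinked UBAM upload, mirroring the README examples above (the `.ubam` path is a placeholder):

```python
# UBAM, like FASTQ, does not require a linked reference, so reference_arn can be omitted.
read_set_id = manager.upload_read_set(
    "my-sequence-data/read-set-file.ubam",
    SEQUENCE_STORE_ID,
    "UBAM",
    "name",
    "subject-id",
    "sample-id",
)
```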
1 change: 1 addition & 0 deletions omics/common/omics_file_types.py
@@ -73,3 +73,4 @@ class ReadSetFileType(ExtendedEnum):
FASTQ = "FASTQ"
BAM = "BAM"
CRAM = "CRAM"
UBAM = "UBAM"
6 changes: 3 additions & 3 deletions omics/transfer/__init__.py
@@ -3,7 +3,7 @@
from s3transfer.futures import TransferFuture
from s3transfer.subscribers import BaseSubscriber

from omics.common.omics_file_types import OmicsFileType, ReadSetFileType
from omics.common.omics_file_types import OmicsFileType


class OmicsTransferSubscriber(BaseSubscriber):
@@ -60,12 +60,12 @@ class ReadSetUpload:
def __init__(
self,
store_id: str,
file_type: ReadSetFileType,
file_type: str,
name: str,
subject_id: str,
sample_id: str,
reference_arn: str,
fileobj: Union[IO[Any], str],
reference_arn: Optional[str] = None,
generated_from: Optional[str] = None,
description: Optional[str] = None,
tags: Optional[Dict[str, str]] = None,
11 changes: 7 additions & 4 deletions omics/transfer/manager.py
@@ -25,7 +25,6 @@
from omics.common.omics_file_types import (
OmicsFileType,
ReadSetFileName,
ReadSetFileType,
ReferenceFileName,
)
from omics.transfer import (
@@ -52,6 +51,7 @@
"FASTQ": "fastq",
"BAM": "bam",
"CRAM": "cram",
"UBAM": "bam",
}

# Map of file type to index file extension.
@@ -353,11 +353,11 @@ def upload_read_set(
self,
fileobjs: Union[IO[Any], str, List[Union[IO[Any], str]]],
sequence_store_id: str,
file_type: ReadSetFileType,
file_type: str,
name: str,
subject_id: str,
sample_id: str,
reference_arn: str,
reference_arn: Optional[str] = None,
generated_from: Optional[str] = None,
description: Optional[str] = None,
tags: Optional[Dict[str, str]] = None,
@@ -387,9 +387,12 @@ def upload_read_set(
if len(fileobjs) > 2:
raise AttributeError("at most two files can be uploaded to a read set")

if len(fileobjs) > 1 and file_type is not ReadSetFileType.FASTQ:
if len(fileobjs) > 1 and file_type != "FASTQ":
raise AttributeError("paired end read files only supported for FASTQ")

if (reference_arn is None) and (file_type not in ["FASTQ", "UBAM"]):
raise AttributeError("Unlinked read set file types must specify a reference ARN")

transfer_coordinator = self._get_future_coordinator()
transfer_futures = []
for fileobj in fileobjs:
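
The new check above makes unlinked uploads fail fast when a reference ARN is required. A hedged sketch of what a caller now sees for a BAM upload without one, reusing the `manager` and `SEQUENCE_STORE_ID` names from the README examples:

```python
# BAM and CRAM read sets must be linked to a reference; omitting reference_arn
# now raises AttributeError before any upload work is scheduled.
try:
    manager.upload_read_set(
        "my-sequence-data/read-set-file.bam",
        SEQUENCE_STORE_ID,
        "BAM",
        "name",
        "subject-id",
        "sample-id",
    )
except AttributeError as err:
    print(err)  # Unlinked read set file types must specify a reference ARN
```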
5 changes: 4 additions & 1 deletion omics/transfer/read_set_upload.py
@@ -38,7 +38,7 @@ def _main(
"""
args = {
"sequenceStoreId": create_args.store_id,
"sourceFileType": create_args.file_type.value,
"sourceFileType": create_args.file_type,
"subjectId": create_args.subject_id,
"sampleId": create_args.sample_id,
"generatedFrom": create_args.generated_from,
Expand All @@ -53,6 +53,9 @@ def _main(
)
upload_id = response["uploadId"]

if not args.get("referenceArn") and args["sourceFileType"] not in ("FASTQ", "UBAM"):
raise AttributeError("Unlinked read set file types must specify a reference ARN")

# Add a cleanup if the multipart upload fails at any point.
self._transfer_coordinator.add_failure_cleanup(
client.abort_multipart_read_set_upload,
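
For orientation, a rough sketch of the bare client calls this task wraps: the `abort_multipart_read_set_upload` cleanup appears in the diff above, while the create call and its exact parameter set are assumptions based on the `args` dict:

```python
import boto3

client = boto3.client("omics")

# Start the multipart read set upload (argument names mirror the args dict built above).
response = client.create_multipart_read_set_upload(
    sequenceStoreId="<sequence-store-id>",
    sourceFileType="UBAM",
    name="name",
    subjectId="subject-id",
    sampleId="sample-id",
)
upload_id = response["uploadId"]

# On failure, the transfer coordinator aborts the upload so no partial read set is left behind.
client.abort_multipart_read_set_upload(
    sequenceStoreId="<sequence-store-id>",
    uploadId=upload_id,
)
```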
5 changes: 2 additions & 3 deletions tests/transfer/functional/test_manager.py
@@ -9,7 +9,6 @@
from omics.common.omics_file_types import (
OmicsFileType,
ReadSetFileName,
ReadSetFileType,
ReferenceFileName,
)
from omics.transfer.config import TransferConfig
@@ -214,7 +213,7 @@ def test_upload_read_set(self):
read_set_id = self.manager.upload_read_set(
io.BytesIO(os.urandom(MIB_BYTES * 250)),
TEST_CONSTANTS["sequence_store_id"],
ReadSetFileType.CRAM,
"CRAM",
"name",
"subjectId",
"sampleId",
@@ -359,7 +358,7 @@ def test_upload_read_set(self):
read_set_id = self.manager.upload_read_set(
io.BytesIO(os.urandom(MIB_BYTES)),
TEST_CONSTANTS["sequence_store_id"],
ReadSetFileType.CRAM,
"CRAM",
"name",
"subjectId",
"sampleId",
35 changes: 32 additions & 3 deletions tests/transfer/unit/test_manager.py
@@ -7,7 +7,6 @@

from omics.common.omics_file_types import (
ReadSetFileName,
ReadSetFileType,
ReferenceFileName,
)
from omics.transfer.manager import TransferManager, _format_local_filename
@@ -129,6 +128,11 @@ def test_format_fastq_index_local_filename(self):
filename = _format_local_filename("test-filename", ReadSetFileName.INDEX, "FASTQ")
self.assertEqual(filename, "test-filename.index")

# UBAM should not have an index file, but we include this test for consistency.
def test_format_ubam_index_local_filename(self):
filename = _format_local_filename("test-filename", ReadSetFileName.INDEX, "UBAM")
self.assertEqual(filename, "test-filename.index")

def test_format_gz_local_filename(self):
filename = _format_local_filename("test-filename", ReadSetFileName.SOURCE1, "FASTQ", True)
self.assertEqual(filename, "test-filename_1.fastq")
@@ -193,12 +197,37 @@ def test_upload_too_many_files_throws_exception(self):
def test_upload_paired_with_wrong_file_type_throws_exception(self):
with self.assertRaises(AttributeError):
self.run_simple_upload(
[io.BytesIO(b"content1"), io.BytesIO(b"content2")], ReadSetFileType.BAM
[io.BytesIO(b"content1"), io.BytesIO(b"content2")], "BAM"
).result()
self.stubber.assert_no_pending_responses()

def test_upload_no_reference_with_BAM_file_type_exception(self):
with self.assertRaises(AttributeError):
self.transfer_manager.upload_read_set(
io.BytesIO(b"some file content1"),
TEST_CONSTANTS["sequence_store_id"],
"BAM",
"name",
"subjectId",
"sampleId",
).result()

self.stubber.assert_no_pending_responses()

def test_upload_no_reference_with_CRAM_file_type_exception(self):
with self.assertRaises(AttributeError):
self.transfer_manager.upload_read_set(
io.BytesIO(b"some file content1"),
TEST_CONSTANTS["sequence_store_id"],
"CRAM",
"name",
"subjectId",
"sampleId",
).result()

self.stubber.assert_no_pending_responses()

def run_simple_upload(
self, files: any, file_type: ReadSetFileType = ReadSetFileType.FASTQ
self, files: any, file_type: str = "FASTQ"
) -> TransferFuture:
return self.transfer_manager.upload_read_set(
files,
