From b1d50f2eac330ef6cb7d46c7f6942812bb9976ed Mon Sep 17 00:00:00 2001
From: duongwilAWS <143557586+duongwilAWS@users.noreply.github.com>
Date: Fri, 10 Nov 2023 08:47:55 -0800
Subject: [PATCH] Add support for UBAM upload and optional referenceArn (#23)

---
 CONTRIBUTING.md                           | 18 ++++++++++++
 README.md                                 | 12 ++++++--
 omics/common/omics_file_types.py          |  1 +
 omics/transfer/__init__.py                |  6 ++--
 omics/transfer/manager.py                 | 11 ++++---
 omics/transfer/read_set_upload.py         |  5 +++-
 tests/transfer/functional/test_manager.py |  5 ++--
 tests/transfer/unit/test_manager.py       | 35 +++++++++++++++++++++--
 8 files changed, 77 insertions(+), 16 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index c4b6a1c..cca064a 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -40,6 +40,24 @@ GitHub provides additional document on [forking a repository](https://help.githu
 [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
 
 
+### Set-up
+Before working on the Omics Transfer Manager, install its required dependencies and service models. This includes installing the latest Omics service model if the one you have installed is out of date.
+
+To install the Omics Transfer Manager dependencies, use the `pip` command (or `pip3` if `pip` on your system does not point to a Python 3 installation).
+
+The Omics Transfer Manager and its dependencies require Python 3.7 or later.
+
+Omics Transfer Manager uses the poetry library for dependency management and packaging.
+
+```
+pip install boto3
+pip install mypy-boto3-omics
+pip install poetry
+poetry install
+```
+
+After running `poetry install`, the Omics Transfer Manager is ready to use.
+
 ## Finding contributions to work on
 Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
 
diff --git a/README.md b/README.md
index fb01913..ce440da 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,14 @@ Tools for working with the Amazon Omics Service.
 
 ## Using the Omics Transfer Manager
 
+### Installation
+
+Amazon Omics Tools is available through PyPI. To install, run:
+
+```bash
+pip install amazon-omics-tools
+```
+
 ### Basic Usage
 The `TransferManager` class makes it easy to download files for an Omics reference or read set.  By default the files are saved to the current directory, or you can specify a custom location with the `directory` parameter.
 
@@ -57,7 +65,7 @@ For paired end reads, you can define `fileobjs` as a list of files.
 read_set_id = manager.upload_read_set(
     "my-sequence-data/read-set-file.bam",
     SEQUENCE_STORE_ID,
-    ReadSetFileType.BAM,
+    "BAM",
     "name",
     "subject-id",
     "sample-id",
@@ -68,7 +76,7 @@ read_set_id = manager.upload_read_set(
 read_set_id = manager.upload_read_set(
     ["my-sequence-data/read-set-file_1.fastq.gz", "my-sequence-data/read-set-file_2.fastq.gz"],
     SEQUENCE_STORE_ID,
-    ReadSetFileType.FASTQ,
+    "FASTQ",
     "name",
     "subject-id",
     "sample-id",
diff --git a/omics/common/omics_file_types.py b/omics/common/omics_file_types.py
index 941f5ac..4296486 100644
--- a/omics/common/omics_file_types.py
+++ b/omics/common/omics_file_types.py
@@ -73,3 +73,4 @@ class ReadSetFileType(ExtendedEnum):
     FASTQ = "FASTQ"
     BAM = "BAM"
     CRAM = "CRAM"
+    UBAM = "UBAM"
diff --git a/omics/transfer/__init__.py b/omics/transfer/__init__.py
index f0b2479..c586f4b 100644
--- a/omics/transfer/__init__.py
+++ b/omics/transfer/__init__.py
@@ -3,7 +3,7 @@
 from s3transfer.futures import TransferFuture
 from s3transfer.subscribers import BaseSubscriber
 
-from omics.common.omics_file_types import OmicsFileType, ReadSetFileType
+from omics.common.omics_file_types import OmicsFileType
 
 
 class OmicsTransferSubscriber(BaseSubscriber):
@@ -60,12 +60,12 @@ class ReadSetUpload:
     def __init__(
         self,
         store_id: str,
-        file_type: ReadSetFileType,
+        file_type: str,
         name: str,
         subject_id: str,
         sample_id: str,
-        reference_arn: str,
         fileobj: Union[IO[Any], str],
+        reference_arn: Optional[str] = None,
         generated_from: Optional[str] = None,
         description: Optional[str] = None,
         tags: Optional[Dict[str, str]] = None,
diff --git a/omics/transfer/manager.py b/omics/transfer/manager.py
index 5fbcb34..c4c4576 100644
--- a/omics/transfer/manager.py
+++ b/omics/transfer/manager.py
@@ -25,7 +25,6 @@
 from omics.common.omics_file_types import (
     OmicsFileType,
     ReadSetFileName,
-    ReadSetFileType,
     ReferenceFileName,
 )
 from omics.transfer import (
@@ -52,6 +51,7 @@
     "FASTQ": "fastq",
     "BAM": "bam",
     "CRAM": "cram",
+    "UBAM": "bam",
 }
 
 # Map of file type to index file extension.
@@ -353,11 +353,11 @@ def upload_read_set(
         self,
         fileobjs: Union[IO[Any], str, List[Union[IO[Any], str]]],
         sequence_store_id: str,
-        file_type: ReadSetFileType,
+        file_type: str,
         name: str,
         subject_id: str,
         sample_id: str,
-        reference_arn: str,
+        reference_arn: Optional[str] = None,
         generated_from: Optional[str] = None,
         description: Optional[str] = None,
         tags: Optional[Dict[str, str]] = None,
@@ -387,9 +387,12 @@ def upload_read_set(
         if len(fileobjs) > 2:
             raise AttributeError("at most two files can be uploaded to a read set")
 
-        if len(fileobjs) > 1 and file_type is not ReadSetFileType.FASTQ:
+        if len(fileobjs) > 1 and file_type != "FASTQ":
             raise AttributeError("paired end read files only supported for FASTQ")
 
+        if (reference_arn is None) and (file_type not in ["FASTQ", "UBAM"]):
+            raise AttributeError("Unlinked read set file types must specify a reference ARN")
+
         transfer_coordinator = self._get_future_coordinator()
         transfer_futures = []
         for fileobj in fileobjs:
diff --git a/omics/transfer/read_set_upload.py b/omics/transfer/read_set_upload.py
index eeafe67..5b9676d 100644
--- a/omics/transfer/read_set_upload.py
+++ b/omics/transfer/read_set_upload.py
@@ -38,7 +38,7 @@ def _main(
         """
         args = {
             "sequenceStoreId": create_args.store_id,
-            "sourceFileType": create_args.file_type.value,
+            "sourceFileType": create_args.file_type,
             "subjectId": create_args.subject_id,
             "sampleId": create_args.sample_id,
             "generatedFrom": create_args.generated_from,
@@ -53,6 +53,9 @@ def _main(
         )
         upload_id = response["uploadId"]
 
+        if not args.get("referenceArn") and args["sourceFileType"] not in ("FASTQ", "UBAM"):
+            raise AttributeError("Unlinked read set file types must specify a reference ARN")
+
         # Add a cleanup if the multipart upload fails at any point.
         self._transfer_coordinator.add_failure_cleanup(
             client.abort_multipart_read_set_upload,
diff --git a/tests/transfer/functional/test_manager.py b/tests/transfer/functional/test_manager.py
index af0796b..fdf283c 100644
--- a/tests/transfer/functional/test_manager.py
+++ b/tests/transfer/functional/test_manager.py
@@ -9,7 +9,6 @@
 from omics.common.omics_file_types import (
     OmicsFileType,
     ReadSetFileName,
-    ReadSetFileType,
     ReferenceFileName,
 )
 from omics.transfer.config import TransferConfig
@@ -214,7 +213,7 @@ def test_upload_read_set(self):
         read_set_id = self.manager.upload_read_set(
             io.BytesIO(os.urandom(MIB_BYTES * 250)),
             TEST_CONSTANTS["sequence_store_id"],
-            ReadSetFileType.CRAM,
+            "CRAM",
             "name",
             "subjectId",
             "sampleId",
@@ -359,7 +358,7 @@ def test_upload_read_set(self):
         read_set_id = self.manager.upload_read_set(
             io.BytesIO(os.urandom(MIB_BYTES)),
             TEST_CONSTANTS["sequence_store_id"],
-            ReadSetFileType.CRAM,
+            "CRAM",
             "name",
             "subjectId",
             "sampleId",
diff --git a/tests/transfer/unit/test_manager.py b/tests/transfer/unit/test_manager.py
index f6ff577..1f181d4 100644
--- a/tests/transfer/unit/test_manager.py
+++ b/tests/transfer/unit/test_manager.py
@@ -7,7 +7,6 @@
 
 from omics.common.omics_file_types import (
     ReadSetFileName,
-    ReadSetFileType,
     ReferenceFileName,
 )
 from omics.transfer.manager import TransferManager, _format_local_filename
@@ -129,6 +128,11 @@ def test_format_fastq_index_local_filename(self):
         filename = _format_local_filename("test-filename", ReadSetFileName.INDEX, "FASTQ")
         self.assertEqual(filename, "test-filename.index")
 
+    # UBAM should not have an .index file but we include this for consistency.
+    def test_format_ubam_index_local_filename(self):
+        local_name = _format_local_filename("test-filename", ReadSetFileName.INDEX, "UBAM")
+        self.assertEqual(local_name, "test-filename.index")
+
     def test_format_gz_local_filename(self):
         filename = _format_local_filename("test-filename", ReadSetFileName.SOURCE1, "FASTQ", True)
         self.assertEqual(filename, "test-filename_1.fastq")
@@ -193,12 +197,37 @@ def test_upload_too_many_files_throws_exception(self):
     def test_upload_paired_with_wrong_file_type_throws_exception(self):
         with self.assertRaises(AttributeError):
             self.run_simple_upload(
-                [io.BytesIO(b"content1"), io.BytesIO(b"content2")], ReadSetFileType.BAM
+                [io.BytesIO(b"content1"), io.BytesIO(b"content2")], "BAM"
             ).result()
         self.stubber.assert_no_pending_responses()
 
+    def test_upload_no_reference_with_BAM_file_type_exception(self):
+        with self.assertRaises(AttributeError):
+            self.transfer_manager.upload_read_set(
+                io.BytesIO(b"some file content1"),
+                TEST_CONSTANTS["sequence_store_id"],
+                "BAM",
+                "name",
+                "subjectId",
+                "sampleId",
+            ).result()
+
+        self.stubber.assert_no_pending_responses()
+
+    def test_upload_no_reference_with_CRAM_file_type_exception(self):
+        with self.assertRaises(AttributeError):
+            self.transfer_manager.upload_read_set(
+                io.BytesIO(b"some file content1"),
+                TEST_CONSTANTS["sequence_store_id"],
+                "CRAM",
+                "name",
+                "subjectId",
+                "sampleId",
+            ).result()
+        self.stubber.assert_no_pending_responses()
+
     def run_simple_upload(
-        self, files: any, file_type: ReadSetFileType = ReadSetFileType.FASTQ
+        self, files: any, file_type: str = "FASTQ"
     ) -> TransferFuture:
         return self.transfer_manager.upload_read_set(
             files,