Skip to content

Commit

Permalink
Add UBAM support and replace ReadSetFileType with str
Browse files Browse the repository at this point in the history
  • Loading branch information
duongwilAWS committed Oct 26, 2023
1 parent 318636e commit c6807bf
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 26 deletions.
21 changes: 19 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,23 @@ Tools for working with the Amazon Omics Service.

## Using the Omics Transfer Manager

### Set-up
Before using the Omics Transfer Manager, install its required dependencies and service models. This includes installing the latest Omics service model if the version you have installed is not up to date.

To install Omics Transfer Manager dependencies, use the pip command. If using Python3, use the pip3 command.

The Omics Transfer Manager and its dependencies require Python 3.7 or later.

Omics Transfer Manager uses the poetry library for dependency management and packaging.

```
pip install botocore3
pip install poetry
poetry install
```

After running `poetry install`, the Omics Transfer Manager should be ready for usage.

### Basic Usage
The `TransferManager` class makes it easy to download files for an Omics reference or read set. By default the files are saved to the current directory, or you can specify a custom location with the `directory` parameter.

Expand Down Expand Up @@ -57,7 +74,7 @@ For paired end reads, you can define `fileobjs` as a list of files.
read_set_id = manager.upload_read_set(
"my-sequence-data/read-set-file.bam",
SEQUENCE_STORE_ID,
ReadSetFileType.BAM,
"BAM",
"name",
"subject-id",
"sample-id",
Expand All @@ -68,7 +85,7 @@ read_set_id = manager.upload_read_set(
read_set_id = manager.upload_read_set(
["my-sequence-data/read-set-file_1.fastq.gz", "my-sequence-data/read-set-file_2.fastq.gz"],
SEQUENCE_STORE_ID,
ReadSetFileType.FASTQ,
"FASTQ",
"name",
"subject-id",
"sample-id",
Expand Down
1 change: 1 addition & 0 deletions omics/common/omics_file_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,4 @@ class ReadSetFileType(ExtendedEnum):
FASTQ = "FASTQ"
BAM = "BAM"
CRAM = "CRAM"
UBAM = "UBAM"
26 changes: 13 additions & 13 deletions omics/transfer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from s3transfer.futures import TransferFuture
from s3transfer.subscribers import BaseSubscriber

from omics.common.omics_file_types import OmicsFileType, ReadSetFileType
from omics.common.omics_file_types import OmicsFileType


class OmicsTransferSubscriber(BaseSubscriber):
Expand Down Expand Up @@ -58,18 +58,18 @@ class ReadSetUpload:
"""Details of an Omics read set upload."""

def __init__(
self,
store_id: str,
file_type: ReadSetFileType,
name: str,
subject_id: str,
sample_id: str,
reference_arn: str,
fileobj: Union[IO[Any], str],
generated_from: Optional[str] = None,
description: Optional[str] = None,
tags: Optional[Dict[str, str]] = None,
subscribers: Optional[List[BaseSubscriber]] = None,
self,
store_id: str,
file_type: str,
name: str,
subject_id: str,
sample_id: str,
fileobj: Union[IO[Any], str],
reference_arn: Optional[str] = None,
generated_from: Optional[str] = None,
description: Optional[str] = None,
tags: Optional[Dict[str, str]] = None,
subscribers: Optional[List[BaseSubscriber]] = None,
):
"""Details of a read set upload.
Expand Down
12 changes: 8 additions & 4 deletions omics/transfer/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
from omics.common.omics_file_types import (
OmicsFileType,
ReadSetFileName,
ReadSetFileType,
ReferenceFileName,
)
from omics.transfer import (
Expand All @@ -52,6 +51,7 @@
"FASTQ": "fastq",
"BAM": "bam",
"CRAM": "cram",
"UBAM": "bam",
}

# Map of file type to index file extension.
Expand Down Expand Up @@ -353,11 +353,11 @@ def upload_read_set(
self,
fileobjs: Union[IO[Any], str, List[Union[IO[Any], str]]],
sequence_store_id: str,
file_type: ReadSetFileType,
file_type: str,
name: str,
subject_id: str,
sample_id: str,
reference_arn: str,
reference_arn: Optional[str] = None,
generated_from: Optional[str] = None,
description: Optional[str] = None,
tags: Optional[Dict[str, str]] = None,
Expand Down Expand Up @@ -387,9 +387,13 @@ def upload_read_set(
if len(fileobjs) > 2:
raise AttributeError("at most two files can be uploaded to a read set")

if len(fileobjs) > 1 and file_type is not ReadSetFileType.FASTQ:
if len(fileobjs) > 1 and file_type != "FASTQ":
raise AttributeError("paired end read files only supported for FASTQ")

if not reference_arn and (file_type != "FASTQ" or file_type != "UBAM"):
raise AttributeError("Unlinked readset file types must specify a referenceArn")


transfer_coordinator = self._get_future_coordinator()
transfer_futures = []
for fileobj in fileobjs:
Expand Down
5 changes: 4 additions & 1 deletion omics/transfer/read_set_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def _main(
"""
args = {
"sequenceStoreId": create_args.store_id,
"sourceFileType": create_args.file_type.value,
"sourceFileType": create_args.file_type,
"subjectId": create_args.subject_id,
"sampleId": create_args.sample_id,
"generatedFrom": create_args.generated_from,
Expand All @@ -53,6 +53,9 @@ def _main(
)
upload_id = response["uploadId"]

if (args["referenceArn"] == "" and args["sourceFileType" != "FASTQ" or "UBAM"]):
raise AttributeError("Unlinked readset file types must specify a referenceArn")

# Add a cleanup if the multipart upload fails at any point.
self._transfer_coordinator.add_failure_cleanup(
client.abort_multipart_read_set_upload,
Expand Down
5 changes: 2 additions & 3 deletions tests/transfer/functional/test_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from omics.common.omics_file_types import (
OmicsFileType,
ReadSetFileName,
ReadSetFileType,
ReferenceFileName,
)
from omics.transfer.config import TransferConfig
Expand Down Expand Up @@ -214,7 +213,7 @@ def test_upload_read_set(self):
read_set_id = self.manager.upload_read_set(
io.BytesIO(os.urandom(MIB_BYTES * 250)),
TEST_CONSTANTS["sequence_store_id"],
ReadSetFileType.CRAM,
"CRAM",
"name",
"subjectId",
"sampleId",
Expand Down Expand Up @@ -359,7 +358,7 @@ def test_upload_read_set(self):
read_set_id = self.manager.upload_read_set(
io.BytesIO(os.urandom(MIB_BYTES)),
TEST_CONSTANTS["sequence_store_id"],
ReadSetFileType.CRAM,
"CRAM",
"name",
"subjectId",
"sampleId",
Expand Down
35 changes: 32 additions & 3 deletions tests/transfer/unit/test_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

from omics.common.omics_file_types import (
ReadSetFileName,
ReadSetFileType,
ReferenceFileName,
)
from omics.transfer.manager import TransferManager, _format_local_filename
Expand Down Expand Up @@ -129,6 +128,11 @@ def test_format_fastq_index_local_filename(self):
filename = _format_local_filename("test-filename", ReadSetFileName.INDEX, "FASTQ")
self.assertEqual(filename, "test-filename.index")

# UBAM should not have an .index file, but we include this test for consistency.
def test_format_ubam_index_local_filename(self):
    """The INDEX file of a UBAM read set is formatted with the ".index" suffix."""
    formatted = _format_local_filename(
        "test-filename",
        ReadSetFileName.INDEX,
        "UBAM",
    )
    self.assertEqual(formatted, "test-filename.index")

def test_format_gz_local_filename(self):
filename = _format_local_filename("test-filename", ReadSetFileName.SOURCE1, "FASTQ", True)
self.assertEqual(filename, "test-filename_1.fastq")
Expand Down Expand Up @@ -193,12 +197,37 @@ def test_upload_too_many_files_throws_exception(self):
def test_upload_paired_with_wrong_file_type_throws_exception(self):
with self.assertRaises(AttributeError):
self.run_simple_upload(
[io.BytesIO(b"content1"), io.BytesIO(b"content2")], ReadSetFileType.BAM
[io.BytesIO(b"content1"), io.BytesIO(b"content2")], "BAM"
).result()
self.stubber.assert_no_pending_responses()

def test_upload_no_reference_with_BAM_file_type_exception(self):
    """Uploading a BAM read set without a reference ARN raises AttributeError.

    The call goes through ``self.transfer_manager``. The previous
    ``self.self.transfer_manager`` raised AttributeError on the attribute
    lookup itself, so the assertion passed without ever invoking
    ``upload_read_set``.
    """
    with self.assertRaises(AttributeError):
        # reference_arn is intentionally omitted.
        self.transfer_manager.upload_read_set(
            io.BytesIO(b"some file content1"),
            TEST_CONSTANTS["sequence_store_id"],
            "BAM",
            "name",
            "subjectId",
            "sampleId",
        ).result()

    self.stubber.assert_no_pending_responses()


def test_upload_no_reference_with_CRAM_file_type_exception(self):
    """Uploading a CRAM read set without a reference ARN raises AttributeError.

    Renamed from ``..._with_BAM_...``: the body exercises "CRAM", and the
    duplicate name shadowed the BAM test defined just before it. The call
    also goes through ``self.transfer_manager`` rather than
    ``self.self.transfer_manager``, whose attribute lookup raised
    AttributeError and made the assertion pass vacuously.
    """
    with self.assertRaises(AttributeError):
        # reference_arn is intentionally omitted.
        self.transfer_manager.upload_read_set(
            io.BytesIO(b"some file content1"),
            TEST_CONSTANTS["sequence_store_id"],
            "CRAM",
            "name",
            "subjectId",
            "sampleId",
        ).result()

    self.stubber.assert_no_pending_responses()

def run_simple_upload(
self, files: any, file_type: ReadSetFileType = ReadSetFileType.FASTQ
self, files: any, file_type: str = "FASTQ"
) -> TransferFuture:
return self.transfer_manager.upload_read_set(
files,
Expand Down

0 comments on commit c6807bf

Please sign in to comment.