Add support for UBAM upload and optional referenceArn (#23)
duongwilAWS authored Nov 10, 2023
1 parent 850602b commit b1d50f2
Showing 8 changed files with 77 additions and 16 deletions.
18 changes: 18 additions & 0 deletions CONTRIBUTING.md
@@ -40,6 +40,24 @@ GitHub provides additional document on [forking a repository](https://help.githu
[creating a pull request](https://help.github.com/articles/creating-a-pull-request/).


### Set-up
When working on the Omics Transfer Manager, install the required dependencies and service models. In particular, make sure you have the latest Omics service model if your installed copy is out of date.

Install the Omics Transfer Manager dependencies with the `pip` command (use `pip3` if that is how Python 3's pip is invoked on your system).

The Omics Transfer Manager's dependencies require Python 3.7 or later.

The Omics Transfer Manager uses the `poetry` library for dependency management and packaging.

```
pip install botocore
pip install mypy-boto3-omics
pip install poetry
poetry install
```

After running `poetry install`, the Omics Transfer Manager should be ready to use.
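
As a quick sanity check after installation, you can confirm that the key dependencies resolve in the active environment. This snippet is only an illustrative sketch, not part of the project's tooling:

```
# Sanity check: confirm the dependencies installed above import cleanly.
import botocore
import mypy_boto3_omics  # type stubs for the Omics client

print("botocore", botocore.__version__)
```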

## Finding contributions to work on
Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.

12 changes: 10 additions & 2 deletions README.md
@@ -4,6 +4,14 @@ Tools for working with the Amazon Omics Service.

## Using the Omics Transfer Manager

### Installation
Amazon Omics Tools is available on PyPI. To install it, run:

```
pip install amazon-omics-tools
```

### Basic Usage
The `TransferManager` class makes it easy to download files for an Omics reference or read set. By default the files are saved to the current directory, or you can specify a custom location with the `directory` parameter.
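
The complete example is in the collapsed lines of this hunk; the sketch below shows the basic shape of a download, with placeholder IDs and a `download_read_set` call whose exact signature is assumed from the surrounding documentation:

```python
import boto3

from omics.transfer.manager import TransferManager

SEQUENCE_STORE_ID = "<my-sequence-store-id>"  # placeholder

client = boto3.client("omics")
manager = TransferManager(client)

# Download all files for a read set into a custom directory
# (omit `directory` to save them in the current directory).
manager.download_read_set(SEQUENCE_STORE_ID, "<my-read-set-id>", directory="my-sequence-data")
```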

@@ -57,7 +65,7 @@ For paired end reads, you can define `fileobjs` as a list of files.
read_set_id = manager.upload_read_set(
"my-sequence-data/read-set-file.bam",
SEQUENCE_STORE_ID,
ReadSetFileType.BAM,
"BAM",
"name",
"subject-id",
"sample-id",
@@ -68,7 +76,7 @@
read_set_id = manager.upload_read_set(
["my-sequence-data/read-set-file_1.fastq.gz", "my-sequence-data/read-set-file_2.fastq.gz"],
SEQUENCE_STORE_ID,
ReadSetFileType.FASTQ,
"FASTQ",
"name",
"subject-id",
"sample-id",
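
With this commit, `reference_arn` becomes optional for unlinked file types. A hedged sketch of an unlinked UBAM upload, mirroring the README examples above (the `.ubam` path is a placeholder):

```python
# UBAM, like FASTQ, does not require a linked reference, so reference_arn can be omitted.
read_set_id = manager.upload_read_set(
    "my-sequence-data/read-set-file.ubam",
    SEQUENCE_STORE_ID,
    "UBAM",
    "name",
    "subject-id",
    "sample-id",
)
```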
1 change: 1 addition & 0 deletions omics/common/omics_file_types.py
@@ -73,3 +73,4 @@ class ReadSetFileType(ExtendedEnum):
FASTQ = "FASTQ"
BAM = "BAM"
CRAM = "CRAM"
UBAM = "UBAM"
6 changes: 3 additions & 3 deletions omics/transfer/__init__.py
@@ -3,7 +3,7 @@
from s3transfer.futures import TransferFuture
from s3transfer.subscribers import BaseSubscriber

from omics.common.omics_file_types import OmicsFileType, ReadSetFileType
from omics.common.omics_file_types import OmicsFileType


class OmicsTransferSubscriber(BaseSubscriber):
@@ -60,12 +60,12 @@ class ReadSetUpload:
def __init__(
self,
store_id: str,
file_type: ReadSetFileType,
file_type: str,
name: str,
subject_id: str,
sample_id: str,
reference_arn: str,
fileobj: Union[IO[Any], str],
reference_arn: Optional[str] = None,
generated_from: Optional[str] = None,
description: Optional[str] = None,
tags: Optional[Dict[str, str]] = None,
11 changes: 7 additions & 4 deletions omics/transfer/manager.py
@@ -25,7 +25,6 @@
from omics.common.omics_file_types import (
OmicsFileType,
ReadSetFileName,
ReadSetFileType,
ReferenceFileName,
)
from omics.transfer import (
@@ -52,6 +51,7 @@
"FASTQ": "fastq",
"BAM": "bam",
"CRAM": "cram",
"UBAM": "bam",
}

# Map of file type to index file extension.
@@ -353,11 +353,11 @@ def upload_read_set(
self,
fileobjs: Union[IO[Any], str, List[Union[IO[Any], str]]],
sequence_store_id: str,
file_type: ReadSetFileType,
file_type: str,
name: str,
subject_id: str,
sample_id: str,
reference_arn: str,
reference_arn: Optional[str] = None,
generated_from: Optional[str] = None,
description: Optional[str] = None,
tags: Optional[Dict[str, str]] = None,
@@ -387,9 +387,12 @@ def upload_read_set(
if len(fileobjs) > 2:
raise AttributeError("at most two files can be uploaded to a read set")

if len(fileobjs) > 1 and file_type is not ReadSetFileType.FASTQ:
if len(fileobjs) > 1 and file_type != "FASTQ":
raise AttributeError("paired end read files only supported for FASTQ")

if (reference_arn is None) and (file_type not in ["FASTQ", "UBAM"]):
raise AttributeError("Unlinked read set file types must specify a reference ARN")

transfer_coordinator = self._get_future_coordinator()
transfer_futures = []
for fileobj in fileobjs:
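
The new check above makes unlinked uploads fail fast when a reference ARN is required. A hedged sketch of what a caller now sees for a BAM upload without one, reusing the `manager` and `SEQUENCE_STORE_ID` names from the README examples:

```python
# BAM and CRAM read sets must be linked to a reference; omitting reference_arn
# now raises AttributeError before any upload work is scheduled.
try:
    manager.upload_read_set(
        "my-sequence-data/read-set-file.bam",
        SEQUENCE_STORE_ID,
        "BAM",
        "name",
        "subject-id",
        "sample-id",
    )
except AttributeError as err:
    print(err)  # Unlinked read set file types must specify a reference ARN
```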
5 changes: 4 additions & 1 deletion omics/transfer/read_set_upload.py
@@ -38,7 +38,7 @@ def _main(
"""
args = {
"sequenceStoreId": create_args.store_id,
"sourceFileType": create_args.file_type.value,
"sourceFileType": create_args.file_type,
"subjectId": create_args.subject_id,
"sampleId": create_args.sample_id,
"generatedFrom": create_args.generated_from,
Expand All @@ -53,6 +53,9 @@ def _main(
)
upload_id = response["uploadId"]

if not args.get("referenceArn") and args["sourceFileType"] not in ("FASTQ", "UBAM"):
raise AttributeError("Unlinked read set file types must specify a reference ARN")

# Add a cleanup if the multipart upload fails at any point.
self._transfer_coordinator.add_failure_cleanup(
client.abort_multipart_read_set_upload,
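
For orientation, a rough sketch of the bare client calls this task wraps: the `abort_multipart_read_set_upload` cleanup appears in the diff above, while the create call and its exact parameter set are assumptions based on the `args` dict:

```python
import boto3

client = boto3.client("omics")

# Start the multipart read set upload (argument names mirror the args dict built above).
response = client.create_multipart_read_set_upload(
    sequenceStoreId="<sequence-store-id>",
    sourceFileType="UBAM",
    name="name",
    subjectId="subject-id",
    sampleId="sample-id",
)
upload_id = response["uploadId"]

# On failure, the transfer coordinator aborts the upload so no partial read set is left behind.
client.abort_multipart_read_set_upload(
    sequenceStoreId="<sequence-store-id>",
    uploadId=upload_id,
)
```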
5 changes: 2 additions & 3 deletions tests/transfer/functional/test_manager.py
@@ -9,7 +9,6 @@
from omics.common.omics_file_types import (
OmicsFileType,
ReadSetFileName,
ReadSetFileType,
ReferenceFileName,
)
from omics.transfer.config import TransferConfig
@@ -214,7 +213,7 @@ def test_upload_read_set(self):
read_set_id = self.manager.upload_read_set(
io.BytesIO(os.urandom(MIB_BYTES * 250)),
TEST_CONSTANTS["sequence_store_id"],
ReadSetFileType.CRAM,
"CRAM",
"name",
"subjectId",
"sampleId",
@@ -359,7 +358,7 @@ def test_upload_read_set(self):
read_set_id = self.manager.upload_read_set(
io.BytesIO(os.urandom(MIB_BYTES)),
TEST_CONSTANTS["sequence_store_id"],
ReadSetFileType.CRAM,
"CRAM",
"name",
"subjectId",
"sampleId",
35 changes: 32 additions & 3 deletions tests/transfer/unit/test_manager.py
@@ -7,7 +7,6 @@

from omics.common.omics_file_types import (
ReadSetFileName,
ReadSetFileType,
ReferenceFileName,
)
from omics.transfer.manager import TransferManager, _format_local_filename
@@ -129,6 +128,11 @@ def test_format_fastq_index_local_filename(self):
filename = _format_local_filename("test-filename", ReadSetFileName.INDEX, "FASTQ")
self.assertEqual(filename, "test-filename.index")

# UBAM should not have an index file, but we include this test for consistency.
def test_format_ubam_index_local_filename(self):
filename = _format_local_filename("test-filename", ReadSetFileName.INDEX, "UBAM")
self.assertEqual(filename, "test-filename.index")

def test_format_gz_local_filename(self):
filename = _format_local_filename("test-filename", ReadSetFileName.SOURCE1, "FASTQ", True)
self.assertEqual(filename, "test-filename_1.fastq")
@@ -193,12 +197,37 @@ def test_upload_too_many_files_throws_exception(self):
def test_upload_paired_with_wrong_file_type_throws_exception(self):
with self.assertRaises(AttributeError):
self.run_simple_upload(
[io.BytesIO(b"content1"), io.BytesIO(b"content2")], ReadSetFileType.BAM
[io.BytesIO(b"content1"), io.BytesIO(b"content2")], "BAM"
).result()
self.stubber.assert_no_pending_responses()

def test_upload_no_reference_with_BAM_file_type_exception(self):
with self.assertRaises(AttributeError):
self.transfer_manager.upload_read_set(
io.BytesIO(b"some file content1"),
TEST_CONSTANTS["sequence_store_id"],
"BAM",
"name",
"subjectId",
"sampleId",
).result()

self.stubber.assert_no_pending_responses()

def test_upload_no_reference_with_CRAM_file_type_exception(self):
with self.assertRaises(AttributeError):
self.transfer_manager.upload_read_set(
io.BytesIO(b"some file content1"),
TEST_CONSTANTS["sequence_store_id"],
"CRAM",
"name",
"subjectId",
"sampleId",
).result()

self.stubber.assert_no_pending_responses()

def run_simple_upload(
self, files: any, file_type: ReadSetFileType = ReadSetFileType.FASTQ
self, files: any, file_type: str = "FASTQ"
) -> TransferFuture:
return self.transfer_manager.upload_read_set(
files,
