Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CZID-8457] Create a file - part 2 #65

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions entities/api/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def download_link(
) -> typing.Optional[SignedURL]:
if not self.path: # type: ignore
return None
key = self.path.lstrip("/") # type: ignore
key = self.path # type: ignore
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While I'm at it, I'm removing all lstrip calls and making sure the file paths generated in the seed script don't start with /

bucket_name = self.namespace # type: ignore
url = s3_client.generate_presigned_url(
ClientMethod="get_object", Params={"Bucket": bucket_name, "Key": key}, ExpiresIn=expiration
Expand All @@ -62,7 +62,7 @@ async def mark_upload_complete(

validator = get_validator(file.file_format)
try:
file_size = validator.validate(s3_client, file.namespace, file.path.lstrip("/"))
file_size = validator.validate(s3_client, file.namespace, file.path)
except: # noqa
file.status = db.FileStatus.FAILED
else:
Expand Down Expand Up @@ -122,7 +122,7 @@ async def create_file(
await session.commit()

# Create a signed URL
response = s3_client.generate_presigned_post(Bucket=file.namespace, Key=file.path.lstrip("/"), ExpiresIn=expiration)
response = s3_client.generate_presigned_post(Bucket=file.namespace, Key=file.path, ExpiresIn=expiration)
return SignedURL(
url=response["url"], fields=response["fields"], protocol="https", method="POST", expiration=expiration
)
59 changes: 56 additions & 3 deletions entities/api/tests/test_file_writes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from platformics.database.connect import SyncDB
from test_infra import factories as fa
from mypy_boto3_s3.client import S3Client
from database.models import File
from database.models import File, SequencingRead
import sqlalchemy as sa


Expand All @@ -27,7 +27,7 @@ async def test_file_validation(
file = session.execute(sa.select(File)).scalars().one()

valid_fastq_file = "test_infra/fixtures/test1.fastq"
moto_client.put_object(Bucket=file.namespace, Key=file.path.lstrip("/"), Body=open(valid_fastq_file, "rb"))
moto_client.put_object(Bucket=file.namespace, Key=file.path, Body=open(valid_fastq_file, "rb"))

# Mark upload complete
query = f"""
Expand Down Expand Up @@ -64,7 +64,7 @@ async def test_invalid_fastq(
session.commit()
file = session.execute(sa.select(File)).scalars().one()

moto_client.put_object(Bucket=file.namespace, Key=file.path.lstrip("/"), Body="this is not a fastq file")
moto_client.put_object(Bucket=file.namespace, Key=file.path, Body="this is not a fastq file")

# Mark upload complete
query = f"""
Expand All @@ -80,3 +80,56 @@ async def test_invalid_fastq(
res = await gql_client.query(query, member_projects=[project1_id])
fileinfo = res["data"]["markUploadComplete"]
assert fileinfo["status"] == "FAILED"


# Test creating files
@pytest.mark.asyncio
@pytest.mark.parametrize(
"member_projects,project_id,entity_field",
[
([456], 123, "sequence_file"), # Can't create file for entity you don't have access to
([123], 123, "does_not_exist"), # Can't create file for entity that isn't connected to a valid file type
([123], 123, "sequence_file"), # Can create file for entity you have access to
],
)
async def test_create_file(
member_projects: list[int],
project_id: int,
entity_field: str,
sync_db: SyncDB,
gql_client: GQLTestClient,
) -> None:
user_id = 12345

# Create mock data
with sync_db.session() as session:
fa.SessionStorage.set_session(session)
fa.SequencingReadFactory.create(owner_user_id=user_id, collection_id=project_id)
fa.FileFactory.update_file_ids()
session.commit()

sequencing_read = session.execute(sa.select(SequencingRead)).scalars().one()
entity_id = sequencing_read.entity_id

# Try creating a file
mutation = f"""
mutation MyQuery {{
createFile(entityId: "{entity_id}", entityFieldName: "{entity_field}",
fileName: "test.fastq", fileSize: 123, fileFormat: "fastq") {{
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have (/want/need?) to get the file size info as part of the 'create' call?

Copy link
Contributor Author

@robertaboukhalil robertaboukhalil Sep 22, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure actually, I don't think we need the file size to be stored in the DB for the app to work. The only use case I can think of is: we want basic stats about how much certain uploaded files take up on S3 and querying the DB would be much faster than querying S3 (but not guaranteed to be accurate).

url
expiration
method
protocol
fields
}}
}}
"""
output = await gql_client.query(mutation, member_projects=member_projects)

    # If we don't have access to this entity, or we try to link an entity with a made-up file type, we should get an error
if project_id not in member_projects or entity_field == "does_not_exist":
assert output["data"] is None
assert output["errors"] is not None
return

assert output["data"]["createFile"]["url"] == "https://local-bucket.s3.amazonaws.com/"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we return a fully formed signed URL instead of all the parts of one?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried putting it all in a URL but it didn't work for some reason, I had to send the fields in the body of the POST request

10 changes: 8 additions & 2 deletions entities/test_infra/factories.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import factory
import faker
import sqlalchemy as sa
from database.models import File, FileStatus, Sample, SequencingRead
from factory import Faker, fuzzy
Expand All @@ -11,6 +12,12 @@
Faker.add_provider(EnumProvider)


def generate_relative_file_path(obj) -> str: # type: ignore
fake = faker.Faker()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need to use factory.Faker() here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I tried that but I can't find a way to evaluate the object afterwards:

Screenshot 2023-09-22 at 11 21 47 AM

And if I don't evaluate it, it will try to store the object itself in the DB, which fails

# Can't use absolute=True param because that requires newer version of faker than faker-biology supports
return fake.file_path(depth=3, extension=obj.file_format).lstrip("/")


# TODO, this is a lame singleton to prevent this library from
# requiring an active SA session at import-time. We should try
# to refactor it out when we know more about factoryboy
Expand Down Expand Up @@ -50,8 +57,7 @@ class Meta:
status = factory.Faker("enum", enum_cls=FileStatus)
protocol = fuzzy.FuzzyChoice(["S3", "GCP"])
namespace = fuzzy.FuzzyChoice(["local-bucket", "remote-bucket"])
# path = factory.LazyAttribute(lambda o: {factory.Faker("file_path", depth=3, extension=o.file_format)})
path = factory.Faker("file_path", depth=3)
path = factory.LazyAttribute(lambda o: generate_relative_file_path(o))
file_format = fuzzy.FuzzyChoice(["fasta", "fastq", "bam"])
compression_type = fuzzy.FuzzyChoice(["gz", "bz2", "xz"])
size = fuzzy.FuzzyInteger(1024, 1024 * 1024 * 1024) # Between 1k and 1G
Expand Down