Skip to content

Commit

Permalink
Look for Dataverse renamed files on upload
Browse files Browse the repository at this point in the history
Dataverse 'ingests' certain file types. These file types get renamed.
During upload, when Waterbutler tries to find the correct metadata to return,
it will 500 since it was not looking for the renamed file.
  • Loading branch information
AddisonSchiller committed Nov 13, 2017
1 parent 473191c commit 3ada43b
Show file tree
Hide file tree
Showing 7 changed files with 192 additions and 1 deletion.
9 changes: 9 additions & 0 deletions tests/providers/dataverse/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def settings():
'name': 'A look at wizards',
}


@pytest.fixture
def native_file_metadata():
with open(os.path.join(os.path.dirname(__file__), 'fixtures/root_provider.json'), 'r') as fp:
Expand Down Expand Up @@ -65,12 +66,20 @@ def dataset_metadata_object():
'Dataset Test Version'
)


@pytest.fixture
def file_metadata_object():
    """Provide a ``DataverseFileMetadata`` for the plain-text sample file.

    Reads ``fixtures/root_provider.json`` (located next to this module) and
    wraps its ``native_file_metadata`` -> ``datafile`` entry, passing
    ``'latest'`` as the second constructor argument.
    """
    fixture_path = os.path.join(os.path.dirname(__file__), 'fixtures/root_provider.json')
    with open(fixture_path, 'r') as handle:
        payload = json.load(handle)

    datafile = payload['native_file_metadata']['datafile']
    return DataverseFileMetadata(datafile, 'latest')


@pytest.fixture
def csv_file_metadata_object():
    """Provide a ``DataverseFileMetadata`` for the sample file ingested from CSV.

    Reads ``fixtures/root_provider.json`` (located next to this module) and
    wraps its ``csv_native_file_metadata`` -> ``datafile`` entry, passing
    ``'latest'`` as the second constructor argument.
    """
    fixture_path = os.path.join(os.path.dirname(__file__), 'fixtures/root_provider.json')
    with open(fixture_path, 'r') as handle:
        payload = json.load(handle)

    datafile = payload['csv_native_file_metadata']['datafile']
    return DataverseFileMetadata(datafile, 'latest')


@pytest.fixture
def revision_metadata_object():
    """Provide a ``DataverseRevision`` built from a fixed test label."""
    revision_label = 'Test Dataset Verision'
    return DataverseRevision(revision_label)
16 changes: 16 additions & 0 deletions tests/providers/dataverse/fixtures/root_provider.json
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,22 @@
"label":"thefile.txt",
"version":1
},
"csv_native_file_metadata":{
"datafile":{
"contentType":"text/tab-separated-values",
"description":"",
"filename":"%2Fusr%2Flocal%2Fglassfish4%2Fglassfish%2Fdomains%2Fdomain1%2Ffiles%2F10.5072%2FFK2%2F232XYH%2F14c7a73d734-8383551cc713",
"id":20,
"md5":"6b50249f91258397fc5cb7d5a4127e15",
"name":"thefile.tab",
"originalFormatLabel":"Comma Separated Values",
"originalFileFormat": "text/csv"
},
"datasetVersionId":5,
"description":"",
"label":"thefile.tab",
"version":1
},
"checksum_mismatch_dataset_metadata":{
"data":{
"createTime":"2015-04-02T13:21:59Z",
Expand Down
27 changes: 27 additions & 0 deletions tests/providers/dataverse/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
from tests.providers.dataverse.fixtures import (
dataset_metadata_object,
revision_metadata_object,
csv_file_metadata_object,
file_metadata_object
)


class TestDatasetMetadata:

def test_dataset_metadata(self, dataset_metadata_object):
Expand Down Expand Up @@ -45,6 +47,7 @@ def test_file_metadata(self, file_metadata_object):
assert not file_metadata_object.created_utc
assert file_metadata_object.content_type == 'text/plain; charset=US-ASCII'
assert file_metadata_object.etag == 'latest::20'
assert file_metadata_object.original_name == 'thefile.txt'
assert file_metadata_object.extra == {
'fileId': '20',
'datasetVersion': 'latest',
Expand All @@ -53,3 +56,27 @@ def test_file_metadata(self, file_metadata_object):
'md5': '6b50249f91258397fc5cb7d5a4127e15',
},
}

def test_csv_file_metadata(self, csv_file_metadata_object):
    """Check every exposed field of the metadata for the file ingested from CSV.

    The stored name keeps the ``.tab`` extension while ``original_name`` is
    expected to report the ``.csv`` name.
    """
    meta = csv_file_metadata_object

    # What kind of entity this is
    assert meta.is_file
    assert not meta.is_folder
    assert meta.provider == 'dataverse'
    assert meta.kind == 'file'

    # Identity and location
    assert meta.file_id == '20'
    assert meta.name == 'thefile.tab'
    assert meta.path == '/20'
    assert meta.materialized_path == '/thefile.tab'

    # Fields expected to be empty for this fixture
    assert not meta.size
    assert not meta.modified
    assert not meta.created_utc

    assert meta.content_type == 'text/tab-separated-values'
    assert meta.etag == 'latest::20'
    assert meta.original_name == 'thefile.csv'

    expected_extra = {
        'fileId': '20',
        'datasetVersion': 'latest',
        'hasPublishedVersion': False,
        'hashes': {
            'md5': '6b50249f91258397fc5cb7d5a4127e15',
        },
    }
    assert meta.extra == expected_extra
55 changes: 55 additions & 0 deletions tests/providers/dataverse/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import pytest

from waterbutler.providers.dataverse import utils as dv_utils


@pytest.fixture
def format_dict():
    """Provide sample raw-metadata records keyed by the expected original extension.

    Each record carries the ``originalFileFormat``, ``originalFormatLabel`` and
    ``contentType`` keys.  A fresh inner dict is built for every extension, so
    tests may mutate the records freely.
    """
    ingested_content_type = 'text/tab-separated-values'
    samples = (
        (
            'xlsx',
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            'MS Excel (XLSX)',
        ),
        ('RData', 'application/x-rlang-transport', 'R Data'),
        ('sav', 'application/x-spss-sav', 'SPSS SAV'),
        ('dta', 'application/x-stata', 'Stata Binary'),
        ('por', 'application/x-spss-por', 'SPSS Portable'),
        ('csv', 'text/csv', 'Comma Separated Values'),
    )
    return {
        ext: {
            'originalFileFormat': file_format,
            'originalFormatLabel': label,
            'contentType': ingested_content_type,
        }
        for ext, file_format, label in samples
    }


class TestUtils:
    """Tests for ``dv_utils.original_ext_from_raw_metadata``."""

    def test_original_ext_from_raw_metadata(self, format_dict):
        """Every sample record resolves to the extension it is keyed under."""
        for expected_ext, raw_metadata in format_dict.items():
            assert expected_ext == dv_utils.original_ext_from_raw_metadata(raw_metadata)

    def test_original_ext_from_raw_metadata_none_case(self, format_dict):
        """A record whose format label is not recognised resolves to ``None``."""
        for raw_metadata in format_dict.values():
            raw_metadata['originalFormatLabel'] = 'blarg'
            assert dv_utils.original_ext_from_raw_metadata(raw_metadata) is None
15 changes: 15 additions & 0 deletions waterbutler/providers/dataverse/metadata.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from waterbutler.core import metadata
from waterbutler.providers.dataverse import utils as dv_utils


class BaseDataverseMetadata(metadata.BaseMetadata):
Expand Down Expand Up @@ -26,6 +27,20 @@ def file_id(self):
def name(self):
return self.raw.get('name', None) or self.raw.get('filename', None)

@property
def original_name(self):
    """Return the name this file had before Dataverse 'ingested' it.

    Dataverse 'ingests' some file types, which changes their extension.  This
    property asks ``dv_utils.original_ext_from_raw_metadata`` for the original
    extension described by the raw metadata and, when one is found, swaps it in
    for the last extension of ``self.name``.

    Returns ``self.name`` unchanged when no original extension can be
    determined, or when the name itself is empty or ``None``.  A name without
    any ``.`` simply gets the original extension appended.
    """
    ext = dv_utils.original_ext_from_raw_metadata(self.raw)
    current_name = self.name
    if ext is None or not current_name:
        return current_name

    # ``rpartition`` reports an empty separator when the name contains no '.'.
    # Keep the whole name in that case: slicing with ``rfind``'s -1 result
    # would silently drop the last character of the name.
    stem, dot, _ = current_name.rpartition('.')
    if not dot:
        stem = current_name
    return '{}.{}'.format(stem, ext)

@property
def path(self):
    """Return the path produced by ``build_path`` for this entry's ``file_id``."""
    file_id = self.file_id
    return self.build_path(file_id)
Expand Down
3 changes: 2 additions & 1 deletion waterbutler/providers/dataverse/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,8 @@ async def upload(self, stream, path, **kwargs):
# Find appropriate version of file
metadata = await self._get_data('latest')
files = metadata if isinstance(metadata, list) else []
file_metadata = next(file for file in files if file.name == path.name)
file_metadata = next(file for file in files if file.name == path.name or
file.original_name == path.name)

if stream.writers['md5'].hexdigest != file_metadata.extra['hashes']['md5']:
raise exceptions.UploadChecksumMismatchError()
Expand Down
68 changes: 68 additions & 0 deletions waterbutler/providers/dataverse/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Formats that Dataverse 'ingests', keyed by the file's original extension.  Each
# entry holds the original format, the original label, and the content type that
# ``original_ext_from_raw_metadata`` compares against a file's raw metadata.
ORIGINAL_FORMATS = {
    extension: {
        'original_format': original_format,
        'original_label': original_label,
        'content_type': 'text/tab-separated-values',
    }
    for extension, original_format, original_label in (
        (
            'xlsx',
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            'MS Excel (XLSX)',
        ),
        # Rdata can come in a few different forms, so just list all of them here
        ('RData', 'application/x-rlang-transport', 'R Data'),
        ('rdata', 'application/x-rlang-transport', 'R Data'),
        ('Rdata', 'application/x-rlang-transport', 'R Data'),
        ('sav', 'application/x-spss-sav', 'SPSS SAV'),
        ('dta', 'application/x-stata', 'Stata Binary'),
        ('por', 'application/x-spss-por', 'SPSS Portable'),
        ('csv', 'text/csv', 'Comma Separated Values'),
    )
}


def original_ext_from_raw_metadata(data):
    """Work out a file's original extension from its raw metadata.

    Reads ``originalFormatLabel``, ``originalFileFormat`` and ``contentType``
    from ``data`` and returns the first key of ``ORIGINAL_FORMATS`` whose entry
    matches all three values.  Returns ``None`` when any of the three values is
    missing or empty, or when no entry matches.
    """
    observed = {
        'original_label': data.get('originalFormatLabel'),
        'original_format': data.get('originalFileFormat'),
        'content_type': data.get('contentType'),
    }

    # Without all three pieces of information there is nothing to match against.
    if not all(observed.values()):
        return None

    for ext, expected in ORIGINAL_FORMATS.items():
        if all(value == expected[field] for field, value in observed.items()):
            return ext

    return None

0 comments on commit 3ada43b

Please sign in to comment.