@pytest.fixture
def csv_file_metadata_object():
    """Return file metadata built from the CSV-ingested datafile fixture.

    Loads ``fixtures/root_provider.json`` (relative to this module), picks the
    ``csv_native_file_metadata`` -> ``datafile`` entry and wraps it in a
    ``DataverseFileMetadata``, passing ``'latest'`` as the second argument.
    """
    fixture_path = os.path.join(os.path.dirname(__file__), 'fixtures/root_provider.json')
    with open(fixture_path, 'r') as handle:
        payload = json.load(handle)

    datafile = payload['csv_native_file_metadata']['datafile']
    return DataverseFileMetadata(datafile, 'latest')
def test_csv_file_metadata(self, csv_file_metadata_object):
    """Check every public field of metadata built from the CSV datafile fixture.

    The fixture describes a file whose current name is ``thefile.tab`` but whose
    raw metadata records a ``text/csv`` original format, so ``original_name``
    is expected to come back as ``thefile.csv``.
    """
    meta = csv_file_metadata_object
    expected_extra = {
        'fileId': '20',
        'datasetVersion': 'latest',
        'hasPublishedVersion': False,
        'hashes': {
            'md5': '6b50249f91258397fc5cb7d5a4127e15',
        },
    }

    # Entity kind
    assert meta.is_file
    assert not meta.is_folder
    assert meta.provider == 'dataverse'
    assert meta.kind == 'file'

    # Identity and location
    assert meta.file_id == '20'
    assert meta.name == 'thefile.tab'
    assert meta.path == '/20'
    assert meta.materialized_path == '/thefile.tab'

    # Fields the fixture leaves unset
    assert not meta.size
    assert not meta.modified
    assert not meta.created_utc

    # Content description
    assert meta.content_type == 'text/tab-separated-values'
    assert meta.etag == 'latest::20'
    assert meta.original_name == 'thefile.csv'
    assert meta.extra == expected_extra
@pytest.fixture
def format_dict():
    """Map each expected original extension to raw metadata that should yield it.

    Every entry carries the three raw-metadata keys the lookup reads:
    ``originalFileFormat``, ``originalFormatLabel`` and ``contentType``.
    """
    tabular = 'text/tab-separated-values'
    originals = (
        ('xlsx', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
         'MS Excel (XLSX)'),
        ('RData', 'application/x-rlang-transport', 'R Data'),
        ('sav', 'application/x-spss-sav', 'SPSS SAV'),
        ('dta', 'application/x-stata', 'Stata Binary'),
        ('por', 'application/x-spss-por', 'SPSS Portable'),
        ('csv', 'text/csv', 'Comma Separated Values'),
    )
    return {
        ext: {
            'originalFileFormat': mime,
            'originalFormatLabel': label,
            'contentType': tabular,
        }
        for ext, mime, label in originals
    }


class TestUtils:
    """Tests for ``dv_utils.original_ext_from_raw_metadata``."""

    def test_original_ext_from_raw_metadata(self, format_dict):
        # Each fixture key is the extension its metadata is expected to map to.
        for expected_ext, raw in format_dict.items():
            assert expected_ext == dv_utils.original_ext_from_raw_metadata(raw)

    def test_original_ext_from_raw_metadata_none_case(self, format_dict):
        # An unrecognised label must defeat the match even though the
        # original format and content type are still valid.
        for raw in format_dict.values():
            raw['originalFormatLabel'] = 'blarg'
            assert dv_utils.original_ext_from_raw_metadata(raw) is None
@property
def original_name(self):
    """Best-effort name of the file as it was before Dataverse ingested it.

    Dataverse 'ingests' some file types, which changes their extension. This
    property asks ``dv_utils.original_ext_from_raw_metadata`` which extension
    the raw metadata implies and swaps it in for the current one.

    :rtype: str or None
    :returns: ``self.name`` unchanged when no original extension can be
        determined or when there is no name at all; otherwise the name with
        everything after its last ``.`` replaced by the original extension
        (the extension is appended when the name contains no ``.``).
    """
    ext = dv_utils.original_ext_from_raw_metadata(self.raw)
    current = self.name
    if ext is None or current is None:
        return current

    # rpartition reports whether a dot was actually found. Slicing with
    # ``name[:name.rfind('.')]`` silently chopped the last character off
    # extensionless names, because rfind() returns -1 when there is no dot.
    stem, dot, _ = current.rpartition('.')
    if not dot:
        stem = current
    return '{}.{}'.format(stem, ext)
# Known Dataverse ingest conversions, keyed by the file's original extension.
# Each entry lists the raw-metadata values that identify that original format.
ORIGINAL_FORMATS = {
    'xlsx': {
        'original_format': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'original_label': 'MS Excel (XLSX)',
        'content_type': 'text/tab-separated-values',
    },
    # Rdata can come in a few different forms, so just list all of them here.
    # NOTE: the three spellings carry identical values, so the lookup below
    # cannot tell them apart and reports whichever one it reaches first
    # ('RData' with insertion-ordered dicts).
    'RData': {
        'original_format': 'application/x-rlang-transport',
        'original_label': 'R Data',
        'content_type': 'text/tab-separated-values',
    },
    'rdata': {
        'original_format': 'application/x-rlang-transport',
        'original_label': 'R Data',
        'content_type': 'text/tab-separated-values',
    },
    'Rdata': {
        'original_format': 'application/x-rlang-transport',
        'original_label': 'R Data',
        'content_type': 'text/tab-separated-values',
    },
    'sav': {
        'original_format': 'application/x-spss-sav',
        'original_label': 'SPSS SAV',
        'content_type': 'text/tab-separated-values',
    },
    'dta': {
        'original_format': 'application/x-stata',
        'original_label': 'Stata Binary',
        'content_type': 'text/tab-separated-values',
    },
    'por': {
        'original_format': 'application/x-spss-por',
        'original_label': 'SPSS Portable',
        'content_type': 'text/tab-separated-values',
    },
    'csv': {
        'original_format': 'text/csv',
        'original_label': 'Comma Separated Values',
        'content_type': 'text/tab-separated-values',
    },
}


def original_ext_from_raw_metadata(data):
    """Use the raw metadata to figure out the original extension.

    :param dict data: raw file metadata; the keys ``originalFormatLabel``,
        ``originalFileFormat`` and ``contentType`` are read from it.
    :rtype: str or None
    :returns: the first key of ``ORIGINAL_FORMATS`` whose label, original format
        and content type all equal the values in ``data``; ``None`` when any of
        the three values is missing or empty, or when no entry matches.
    """
    observed = (
        data.get('originalFormatLabel', None),
        data.get('originalFileFormat', None),
        data.get('contentType', None),
    )

    # All three values are required; a partial match is never reported.
    if not all(observed):
        return None

    for ext, spec in ORIGINAL_FORMATS.items():
        expected = (spec['original_label'], spec['original_format'], spec['content_type'])
        if observed == expected:
            return ext

    return None