diff --git a/fuji_server/client/ex_evaluate.py b/fuji_server/client/ex_evaluate.py
index 562d5d3e..0c387651 100644
--- a/fuji_server/client/ex_evaluate.py
+++ b/fuji_server/client/ex_evaluate.py
@@ -152,7 +152,7 @@
 #testpids=['https://data.gov.lv/dati/lv/dataset/maksatnespejas-procesi']
 testpids=['http://doi.org/10.17882/42182']
 #testpids = muchotestpids
-testpids =['10.15152/QDB.121']
+testpids =['https://datadoi.ee/handle/33/48']
 startpid=''
 def effectivehandlers(logger):
     handlers = logger.handlers
diff --git a/fuji_server/controllers/fair_check.py b/fuji_server/controllers/fair_check.py
index 8b06d8fc..b6ebccfb 100644
--- a/fuji_server/controllers/fair_check.py
+++ b/fuji_server/controllers/fair_check.py
@@ -35,6 +35,7 @@
 import idutils
 import lxml
 import rdflib
+from rdflib.exceptions import ParserError
 from rdflib.namespace import RDF
 from rdflib.namespace import DCTERMS
 from rdflib.namespace import DC
@@ -94,7 +95,7 @@ class FAIRCheck:
     FILES_LIMIT = None
     LOG_SUCCESS = 25
     VALID_RESOURCE_TYPES = []
-    FUJI_VERSION = 'v1.0.5c'
+    FUJI_VERSION = 'v1.0.5d'

     def __init__(self, uid, test_debug=False, oaipmh=None, use_datacite=True):
         uid_bytes = uid.encode('utf-8')
@@ -137,6 +138,7 @@ def __init__(self, uid, test_debug=False, oaipmh=None, use_datacite=True):
         self.logger.setLevel(logging.INFO)  # set to debug in testing environment
         self.logger.addHandler(self.logStreamHandler)
         self.count = 0
+        self.embedded_retrieved = False
         FAIRCheck.load_predata()
         self.extruct = None
         self.extruct_result = None
@@ -147,7 +149,7 @@ def __init__(self, uid, test_debug=False, oaipmh=None, use_datacite=True):
     def load_predata(cls):
         cls.FILES_LIMIT = Preprocessor.data_files_limit
         if not cls.METRICS:
-            cls.METRICS = Preprocessor.get_custom_metrics(['metric_name', 'total_score','metric_tests'])
+            cls.METRICS = Preprocessor.get_custom_metrics(['metric_name', 'total_score','metric_tests','metric_number'])
         if not cls.SPDX_LICENSES:
             # cls.SPDX_LICENSES, cls.SPDX_LICENSE_NAMES, cls.SPDX_LICENSE_URLS = Preprocessor.get_licenses()
             cls.SPDX_LICENSES, cls.SPDX_LICENSE_NAMES = Preprocessor.get_licenses()
@@ -184,18 +186,21 @@ def uri_validator(u):  # TODO integrate into request_helper.py
     def retrieve_metadata(self, extruct_metadata):
+        embedded_exists={}
         if isinstance(extruct_metadata, dict):
             embedded_exists = {k: v for k, v in extruct_metadata.items() if v}
             self.extruct = embedded_exists.copy()
+        '''
         if embedded_exists:  # retrieve metadata from landing page
             self.logger.info(
-                'FsF-F2-01M : Formats of structured metadata embedded in HTML markup - {}'.format(
+                'FsF-F2-01M : Formats of structured metadata embedded in HTML markup detected by extruct - {}'.format(
                     list(embedded_exists.keys())))
-            self.retrieve_metadata_embedded(embedded_exists)
+            #self.retrieve_metadata_embedded(embedded_exists)
         else:
             self.logger.warning('FsF-F2-01M : NO structured metadata embedded in HTML')
-
+        '''
         if self.reference_elements:  # this will be always true as we need datacite client id
+            self.retrieve_metadata_embedded(embedded_exists)
             self.retrieve_metadata_external()

         # ========= clean merged metadata, delete all entries which are None or ''
@@ -217,7 +222,7 @@ def retrieve_metadata(self, extruct_metadata):
                 if mv == '' or mv is None:
                     del self.metadata_merged[mk]

-        self.logger.info('FsF-F2-01M : Type of object described by the metadata - {}'.format(self.metadata_merged.get('object_type')))
+        self.logger.info('FsF-F2-01M : Type of object described by the metadata -: {}'.format(self.metadata_merged.get('object_type')))

         # detect api and standards
         self.retrieve_apis_standards()
@@ -230,7 +235,7 @@ def retrieve_apis_standards(self):
         if self.landing_url is not None:
             self.logger.info('FsF-R1.3-01M : Retrieving API and Standards')
             client_id = self.metadata_merged.get('datacite_client')
-            self.logger.info('FsF-R1.3-01M : re3data/datacite client id - {}'.format(client_id))
+            self.logger.info('FsF-R1.3-01M : re3data/datacite client id -: {}'.format(client_id))
             if self.oaipmh_endpoint:
                 self.logger.info('{} : OAI-PMH endpoint provided as part of the request.'.format('FsF-R1.3-01M'))
@@ -243,11 +248,11 @@ def retrieve_apis_standards(self):
                 self.oaipmh_endpoint = repoHelper.getRe3MetadataAPIs().get('OAI-PMH')
                 self.sparql_endpoint = repoHelper.getRe3MetadataAPIs().get('SPARQL')
                 self.community_standards.extend(repoHelper.getRe3MetadataStandards())
-                self.logger.info('{} : Metadata standards listed in re3data record - {}'.format('FsF-R1.3-01M', self.community_standards ))
+                self.logger.info('{} : Metadata standards listed in re3data record -: {}'.format('FsF-R1.3-01M', self.community_standards ))
             # retrieve metadata standards info from oai-pmh
             if self.oaipmh_endpoint:
-                self.logger.info('{} : Use OAI-PMH endpoint to retrieve standards used by the repository - {}'.format('FsF-R1.3-01M',self.oaipmh_endpoint))
+                self.logger.info('{} : Use OAI-PMH endpoint to retrieve standards used by the repository -: {}'.format('FsF-R1.3-01M',self.oaipmh_endpoint))
                 if (self.uri_validator(self.oaipmh_endpoint)):
                     oai_provider = OAIMetadataProvider(endpoint=self.oaipmh_endpoint, logger=self.logger,metric_id='FsF-R1.3-01M')
                     self.community_standards_uri = oai_provider.getMetadataStandards()
@@ -255,7 +260,7 @@ def retrieve_apis_standards(self):
                     stds = None
                     if self.community_standards_uri:
                         stds = list(self.community_standards_uri.keys())
-                        self.logger.log(self.LOG_SUCCESS,'{} : Found disciplinary standards that are listed in OAI-PMH endpoint - {}'.format('FsF-R1.3-01M',stds ))
+                        self.logger.log(self.LOG_SUCCESS,'{} : Found disciplinary standards that are listed in OAI-PMH endpoint -: {}'.format('FsF-R1.3-01M',stds ))
                 else:
                     self.logger.info('{} : Invalid endpoint'.format('FsF-R1.3-01M'))
             else:
@@ -267,7 +272,12 @@ def retrieve_metadata_embedded(self, extruct_metadata):
         isPid = False
         if self.pid_scheme:
             isPid = True
+        self.embedded_retrieved = True
+        self.logger.info('FsF-F2-01M : Starting to identify EMBEDDED metadata at -: ' + str(self.landing_url))
+
         # ========= retrieve embedded rdfa and microdata metadata ========
+        self.logger.info('FsF-F2-01M : Trying to retrieve Microdata metadata from html page')
+
         micro_meta = extruct_metadata.get('microdata')
         microdata_collector = MetaDataCollectorMicroData(loggerinst=self.logger, sourcemetadata=micro_meta,
                                                          mapping=Mapper.MICRODATA_MAPPING)
@@ -280,14 +290,18 @@ def retrieve_metadata_embedded(self, extruct_metadata):
             if i in self.reference_elements:
                 self.metadata_merged[i] = micro_dict[i]
                 self.reference_elements.remove(i)
-            self.logger.log(self.LOG_SUCCESS, 'FsF-F2-01M : Found microdata metadata: '+str(micro_dict.keys()))
+            self.logger.log(self.LOG_SUCCESS, 'FsF-F2-01M : Found microdata metadata -: '+str(micro_dict.keys()))

         #================== RDFa
+        self.logger.info('FsF-F2-01M : Trying to retrieve RDFa metadata from html page')
+
         RDFA_ns = rdflib.Namespace("http://www.w3.org/ns/rdfa#")
         rdfasource = MetaDataCollector.Sources.RDFA.value
         rdfagraph = None
         errors=[]
         try:
+            rdflib_logger = logging.getLogger('rdflib')
+            rdflib_logger.setLevel(logging.ERROR)
             rdfagraph = rdflib.Graph()
             rdfagraph.parse(data=self.landing_html, format='rdfa')
             rdfa_collector = MetaDataCollectorRdf(loggerinst=self.logger, target_url=self.landing_url, source=rdfasource,
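The rdflib_logger lines added in the hunk above raise only the threshold of rdflib's own logger, so that RDFa parser noise from pages without RDFa does not leak into the assessment log. A minimal standalone sketch of that technique (assumes an rdflib version that ships the 'rdfa' parser plugin; html_source is a hypothetical string holding the landing page HTML):

    import logging
    import rdflib

    # quieten only the third-party logger, not the root logger
    logging.getLogger('rdflib').setLevel(logging.ERROR)

    graph = rdflib.Graph()
    try:
        graph.parse(data=html_source, format='rdfa')
        print(len(graph), 'triples extracted')
    except Exception as e:
        print('probably no RDFa embedded in HTML:', e)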
@@ -301,45 +315,38 @@ def retrieve_metadata_embedded(self, extruct_metadata):
                 if i in self.reference_elements:
                     self.metadata_merged[i] = rdfa_dict[i]
                     self.reference_elements.remove(i)
-            self.logger.log(self.LOG_SUCCESS, 'FsF-F2-01M : Found RDFa metadata: '+str(rdfa_dict.keys()))
-        except:
-            self.logger.info('FsF-F2-01M : RDFa metadata parsing exception, probably no RDFa embedded in HTML')
+            self.logger.log(self.LOG_SUCCESS, 'FsF-F2-01M : Found RDFa metadata -: '+str(rdfa_dict.keys()))
+        except Exception as e:
+            self.logger.info('FsF-F2-01M : RDFa metadata parsing exception, probably no RDFa embedded in HTML -:'+str(e))

         # ========= retrieve schema.org (embedded, or from via content-negotiation if pid provided) =========
         ext_meta = extruct_metadata.get('json-ld')
-        if self.use_datacite is True:
-            target_url = self.pid_url
-        else:
-            target_url = self.landing_url
+        self.logger.info('FsF-F2-01M : Trying to retrieve schema.org JSON-LD metadata from html page')
         schemaorg_collector = MetaDataCollectorSchemaOrg(loggerinst=self.logger, sourcemetadata=ext_meta,
-                                                         mapping=Mapper.SCHEMAORG_MAPPING, pidurl=target_url)
+                                                         mapping=Mapper.SCHEMAORG_MAPPING, pidurl=None)
         source_schemaorg, schemaorg_dict = schemaorg_collector.parse_metadata()
         schemaorg_dict = self.exclude_null(schemaorg_dict)
         if schemaorg_dict:
             self.namespace_uri.extend(schemaorg_collector.namespaces)
-            #not_null_sco = [k for k, v in schemaorg_dict.items() if v is not None]
-            if source_schemaorg == MetaDataCollector.Sources.SCHEMAORG_EMBED.value:
-                self.metadata_sources.append((source_schemaorg,'embedded'))
-            else:
-                self.metadata_sources.append((source_schemaorg, 'negotiated'))
+            self.metadata_sources.append((source_schemaorg,'embedded'))
             if schemaorg_dict.get('related_resources'):
                 self.related_resources.extend(schemaorg_dict.get('related_resources'))
             if schemaorg_dict.get('object_content_identifier'):
-                self.logger.info('FsF-F3-01M : Found data links in Schema.org metadata : ' + str(schemaorg_dict.get('object_content_identifier')))
+                self.logger.info('FsF-F3-01M : Found data links in Schema.org metadata -: ' + str(schemaorg_dict.get('object_content_identifier')))
             # add object type for future reference
             for i in schemaorg_dict.keys():
                 if i in self.reference_elements:
                     self.metadata_merged[i] = schemaorg_dict[i]
                     self.reference_elements.remove(i)
-            self.logger.log(self.LOG_SUCCESS, 'FsF-F2-01M : Found Schema.org metadata: '+str(schemaorg_dict.keys()))
+            self.logger.log(self.LOG_SUCCESS, 'FsF-F2-01M : Found schema.org JSON-LD metadata in html page -: '+str(schemaorg_dict.keys()))
         else:
-            self.logger.info('FsF-F2-01M : Schema.org metadata UNAVAILABLE')
+            self.logger.info('FsF-F2-01M : schema.org JSON-LD metadata in html page UNAVAILABLE')
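With pidurl=None the embedded schema.org collector now works purely on the JSON-LD blocks that extruct harvested from the landing page, while the content-negotiation variant moves to retrieve_metadata_external() below. A rough sketch of what such extruct output looks like (extruct usage as commonly documented; html_source is hypothetical):

    import extruct

    data = extruct.extract(html_source, syntaxes=['microdata', 'rdfa', 'json-ld', 'opengraph'])
    ext_meta = data.get('json-ld')      # list of JSON-LD blocks, the collector's sourcemetadata
    micro_meta = data.get('microdata')  # fed to MetaDataCollectorMicroData above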
         # ========= retrieve dublin core embedded in html page =========
         if self.reference_elements:
-            self.logger.info('FsF-F2-01M : Checking for DublinCore metadata')
+            self.logger.info('FsF-F2-01M : Trying to retrieve Dublin Core metadata from html page')
             dc_collector = MetaDataCollectorDublinCore(loggerinst=self.logger, sourcemetadata=self.landing_html,
                                                        mapping=Mapper.DC_MAPPING)
             source_dc, dc_dict = dc_collector.parse_metadata()
@@ -354,11 +361,13 @@ def retrieve_metadata_embedded(self, extruct_metadata):
                 if d in self.reference_elements:
                     self.metadata_merged[d] = dc_dict[d]
                     self.reference_elements.remove(d)
-            self.logger.log(self.LOG_SUCCESS, 'FsF-F2-01M : Found DublinCore metadata: '+str(dc_dict.keys()))
+            self.logger.log(self.LOG_SUCCESS, 'FsF-F2-01M : Found DublinCore metadata -: '+str(dc_dict.keys()))
         else:
             self.logger.info('FsF-F2-01M : DublinCore metadata UNAVAILABLE')

         # ======== retrieve OpenGraph metadata
+        self.logger.info('FsF-F2-01M : Trying to retrieve OpenGraph metadata from html page')
+
         ext_meta = extruct_metadata.get('opengraph')
         opengraph_collector = MetaDataCollectorOpenGraph(loggerinst=self.logger, sourcemetadata=ext_meta,
                                                          mapping=Mapper.OG_MAPPING)
@@ -371,15 +380,16 @@ def retrieve_metadata_embedded(self, extruct_metadata):
             if i in self.reference_elements:
                 self.metadata_merged[i] = opengraph_dict[i]
                 self.reference_elements.remove(i)
-            self.logger.log(self.LOG_SUCCESS, 'FsF-F2-01M : Found OpenGraph metadata: ' + str(opengraph_dict.keys()))
+            self.logger.log(self.LOG_SUCCESS, 'FsF-F2-01M : Found OpenGraph metadata -: ' + str(opengraph_dict.keys()))
         else:
-            self.logger.info('FsF-F2-01M : Schema.org metadata UNAVAILABLE')
+            self.logger.info('FsF-F2-01M : OpenGraph metadata UNAVAILABLE')

         #========= retrieve signposting data links
+        self.logger.info('FsF-F2-01M : Trying to identify Typed Links in html page')

         data_sign_links = self.get_signposting_links('item')
         if data_sign_links:
-            self.logger.info('FsF-F3-01M : Found data links in response header (signposting) : ' + str(len(data_sign_links)))
+            self.logger.info('FsF-F3-01M : Found data links in response header (signposting) -: ' + str(len(data_sign_links)))
             if self.metadata_merged.get('object_content_identifier') is None:
                 self.metadata_merged['object_content_identifier'] = data_sign_links
@@ -387,7 +397,7 @@ def retrieve_metadata_embedded(self, extruct_metadata):
         data_meta_links = self.get_html_typed_links(rel='item')
         if data_meta_links:
-            self.logger.info('FsF-F3-01M : Found data links in HTML head (link rel=item) : ' + str(len(data_meta_links)))
+            self.logger.info('FsF-F3-01M : Found data links in HTML head (link rel=item) -: ' + str(len(data_meta_links)))
             if self.metadata_merged.get('object_content_identifier') is None:
                 self.metadata_merged['object_content_identifier'] = data_meta_links
         # self.metadata_sources.append((MetaDataCollector.Sources.TYPED_LINK.value,'linked'))
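Signposting 'item' links advertise data files either in the HTTP Link response header or as <link rel="item"> elements in the HTML head. A bare-bones sketch of reading the header variant (requests is an illustrative assumption here, not necessarily what the helper methods above use; landing_url is hypothetical):

    import requests

    response = requests.head(landing_url, allow_redirects=True)
    # requests parses the Link header into a dict keyed by the rel attribute
    item_link = response.links.get('item')
    if item_link:
        print(item_link['url'])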
@@ -452,10 +462,10 @@ def get_guessed_xml_link(self):
                 response=urllib.urlopen(guessed_link)
                 if response.getheader('Content-Type') in ['text/xml','application/rdf+xml']:
                     datalink={'source':'guessed','url': guessed_link, 'type': response.getheader('Content-Type'), 'rel': 'alternate'}
-                    self.logger.log(self.LOG_SUCCESS, 'FsF-F2-01M : Found XML content at: '+guessed_link)
+                    self.logger.log(self.LOG_SUCCESS, 'FsF-F2-01M : Found XML content at -: '+guessed_link)
             except:
-                self.logger.info('FsF-F2-01M : Guessed XML retrieval failed for: '+guessed_link)
+                self.logger.info('FsF-F2-01M : Guessed XML retrieval failed for -: '+guessed_link)
         return datalink

     def retrieve_metadata_external(self):
@@ -463,8 +473,16 @@ def retrieve_metadata_external(self):
         test_typed_links = False
         test_signposting = False
         test_embedded = False
+        self.logger.info('FsF-F2-01M : Starting to identify EXTERNAL metadata through content negotiation or typed links')
+
         # ========= retrieve xml metadata namespaces by content negotiation ========
         if self.landing_url is not None:
+
+            if self.use_datacite is True:
+                target_url = self.pid_url
+            else:
+                target_url = self.landing_url
+
+            self.logger.info('FsF-F2-01M : Trying to retrieve XML metadata through content negotiation')
             negotiated_xml_collector = MetaDataCollectorXML(loggerinst=self.logger,target_url=self.landing_url, link_type='negotiated')
             source_neg_xml, metadata_neg_dict = negotiated_xml_collector.parse_metadata()
@@ -473,9 +491,35 @@ def retrieve_metadata_external(self):
             test_content_negotiation = True
         #TODO: Finish this ...

+            # ========= retrieve json-ld/schema.org metadata namespaces by content negotiation ========
+            self.logger.info('FsF-F2-01M : Trying to retrieve schema.org JSON-LD metadata through content negotiation')
+
+            schemaorg_collector = MetaDataCollectorSchemaOrg(loggerinst=self.logger, sourcemetadata=None,
+                                                             mapping=Mapper.SCHEMAORG_MAPPING, pidurl=target_url)
+            source_schemaorg, schemaorg_dict = schemaorg_collector.parse_metadata()
+            schemaorg_dict = self.exclude_null(schemaorg_dict)
+            if schemaorg_dict:
+                self.namespace_uri.extend(schemaorg_collector.namespaces)
+                self.metadata_sources.append((source_schemaorg, 'negotiated'))
+                if schemaorg_dict.get('related_resources'):
+                    self.related_resources.extend(schemaorg_dict.get('related_resources'))
+                if schemaorg_dict.get('object_content_identifier'):
+                    self.logger.info('FsF-F3-01M : Found data links in Schema.org metadata -: ' + str(
+                        schemaorg_dict.get('object_content_identifier')))
+                # add object type for future reference
+                for i in schemaorg_dict.keys():
+                    if i in self.reference_elements:
+                        self.metadata_merged[i] = schemaorg_dict[i]
+                        self.reference_elements.remove(i)
+                self.logger.log(self.LOG_SUCCESS,
+                                'FsF-F2-01M : Found Schema.org metadata through content negotiation-: ' + str(schemaorg_dict.keys()))
+            else:
+                self.logger.info('FsF-F2-01M : Schema.org metadata through content negotiation UNAVAILABLE')
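Content negotiation asks the PID URL (or the landing URL, depending on the use_datacite switch set above) for a machine-readable serialization instead of HTML. A bare-bones sketch of the idea behind these collectors (requests and the Accept value are illustrative assumptions):

    import requests

    response = requests.get(target_url, headers={'Accept': 'application/ld+json'})
    if response.status_code == 200 and 'json' in str(response.headers.get('Content-Type')):
        schemaorg_doc = response.json()  # schema.org JSON-LD, if the server honours the Accept header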
         # ========= retrieve rdf metadata namespaces by content negotiation ========
         self.logger.info('FsF-F2-01M : Trying to retrieve RDF metadata through content negotiation')
         source = MetaDataCollector.Sources.LINKED_DATA.value
+        #TODO: handle this the same way as with datacite based content negotiation->use the use_datacite switch
         if self.pid_scheme == 'purl':
             targeturl = self.pid_url
         else:
@@ -487,19 +531,19 @@ def retrieve_metadata_external(self):
         source_rdf, rdf_dict = neg_rdf_collector.parse_metadata()
         # in case F-UJi was redirected and the landing page content negotiation doesnt return anything try the origin URL
         if not rdf_dict:
-            if self.origin_url is not None:
+            if self.origin_url is not None and self.origin_url != targeturl:
                 neg_rdf_collector.target_url = self.origin_url
                 source_rdf, rdf_dict = neg_rdf_collector.parse_metadata()
         self.namespace_uri.extend(neg_rdf_collector.getNamespaces())
         rdf_dict = self.exclude_null(rdf_dict)
         if rdf_dict:
             if rdf_dict.get('object_content_identifier'):
-                self.logger.info('FsF-F3-01M : Found data links in RDF metadata : ' + str(
+                self.logger.info('FsF-F3-01M : Found data links in RDF metadata -: ' + str(
                     len(rdf_dict.get('object_content_identifier'))))
             test_content_negotiation = True
             self.logger.log(self.LOG_SUCCESS,
-                            'FsF-F2-01M : Found Linked Data metadata: {}'.format(str(rdf_dict.keys())))
+                            'FsF-F2-01M : Found Linked Data metadata -: {}'.format(str(rdf_dict.keys())))
             self.metadata_sources.append((source_rdf,'negotiated'))

             for r in rdf_dict.keys():
@@ -527,9 +571,9 @@ def retrieve_metadata_external(self):
                 test_content_negotiation = True
                 # not_null_dcite = [k for k, v in dcitejsn_dict.items() if v is not None]
                 self.metadata_sources.append((source_dcitejsn,'negotiated'))
-                self.logger.log(self.LOG_SUCCESS,'FsF-F2-01M : Found Datacite metadata: {}'.format(str(dcitejsn_dict.keys())))
+                self.logger.log(self.LOG_SUCCESS,'FsF-F2-01M : Found Datacite metadata -: {}'.format(str(dcitejsn_dict.keys())))
                 if dcitejsn_dict.get('object_content_identifier'):
-                    self.logger.info('FsF-F3-01M : Found data links in Datacite metadata : ' + str(
+                    self.logger.info('FsF-F3-01M : Found data links in Datacite metadata -: ' + str(
                         dcitejsn_dict.get('object_content_identifier')))
                 if dcitejsn_dict.get('related_resources'):
                     self.related_resources.extend(dcitejsn_dict.get('related_resources'))
@@ -569,12 +613,12 @@ def retrieve_metadata_external(self):
         typed_rdf_collector = None
         for metadata_link in typed_metadata_links:
             if metadata_link['type'] in ['application/rdf+xml','text/n3','text/ttl','application/ld+json']:
-                self.logger.info('FsF-F2-01M : Found e.g. Typed Links in HTML Header linking to RDF Metadata ('+str(metadata_link['type']+')'))
+                self.logger.info('FsF-F2-01M : Found e.g. Typed Links in HTML Header linking to RDF Metadata -: ('+str(metadata_link['type'])+' '+str(metadata_link['url'])+')')
                 found_metadata_link=True
                 source = MetaDataCollector.Sources.RDF_TYPED_LINKS.value
                 typed_rdf_collector = MetaDataCollectorRdf(loggerinst=self.logger, target_url=metadata_link['url'], source=source )
             elif metadata_link['type'] in ['text/xml','application/x-ddi-l+xml','application/x-ddametadata+xml']:
-                self.logger.info('FsF-F2-01M : Found e.g. Typed Links in HTML Header linking to XML Metadata (' + str(
+                self.logger.info('FsF-F2-01M : Found e.g. Typed Links in HTML Header linking to XML Metadata -: (' + str(
                     metadata_link['type'] + ')'))
                 typed_rdf_collector = MetaDataCollectorXML(loggerinst=self.logger, target_url=metadata_link['url'],
                                                            link_type=metadata_link.get('source'))
@@ -585,7 +629,7 @@ def retrieve_metadata_external(self):
             rdf_dict = self.exclude_null(rdf_dict)
             if rdf_dict:
                 test_typed_links = True
-                self.logger.log(self.LOG_SUCCESS,'FsF-F2-01M : Found Linked Data metadata: {}'.format(str(rdf_dict.keys())))
+                self.logger.log(self.LOG_SUCCESS,'FsF-F2-01M : Found Linked Data metadata -: {}'.format(str(rdf_dict.keys())))
                 self.metadata_sources.append((source_rdf,'linked'))

                 for r in rdf_dict.keys():
@@ -596,7 +640,7 @@ def retrieve_metadata_external(self):
             self.logger.info('FsF-F2-01M : Linked Data metadata UNAVAILABLE')

         if self.reference_elements:
-            self.logger.debug('Reference metadata elements NOT FOUND - {}'.format(self.reference_elements))
+            self.logger.debug('FsF-F2-01M : Reference metadata elements NOT FOUND -: {}'.format(self.reference_elements))
         else:
             self.logger.debug('FsF-F2-01M : ALL reference metadata elements available')
@@ -721,7 +765,10 @@ def get_log_messages_dict(self):
                 m = log_message.split(":", 1)
                 metric = m[0].strip()
                 message_n_level = m[1].strip().split("|",1)
-                level = message_n_level[1]
+                if len(message_n_level) >1:
+                    level = message_n_level[1]
+                else:
+                    level ='INFO'
                 message = message_n_level[0]
                 if metric not in logger_messages:
                     logger_messages[metric] =[]
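The ' -: ' separator that this changeset adds to nearly every log message exists so that get_log_messages_dict() can split a raw log line into metric, message and level without tripping over colons inside the message. A condensed, standalone rendition of the fixed parsing logic (the sample line is illustrative):

    def parse_log_line(log_message):
        metric, rest = (part.strip() for part in log_message.split(':', 1))
        message_n_level = rest.split('|', 1)
        # new fallback: lines without an explicit |LEVEL suffix default to INFO
        level = message_n_level[1] if len(message_n_level) > 1 else 'INFO'
        return metric, message_n_level[0], level

    parse_log_line('FsF-F1-02D : Persistent identifier scheme -: doi|SUCCESS')
    # -> ('FsF-F1-02D', 'Persistent identifier scheme -: doi', 'SUCCESS')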
diff --git a/fuji_server/evaluators/fair_evaluator.py b/fuji_server/evaluators/fair_evaluator.py
index 34c1ba65..c0ae6091 100644
--- a/fuji_server/evaluators/fair_evaluator.py
+++ b/fuji_server/evaluators/fair_evaluator.py
@@ -33,6 +33,7 @@ def __init__(self, fuji_instance):
         self.fuji=fuji_instance
         self.metric_identifier = None
         self.metrics = None
+        self.metric_number = None
         self.result = None
         self.metric_tests = dict()
         self.isDebug=self.fuji.isDebug
@@ -47,6 +48,7 @@ def set_metric(self, metric_identifier, metrics):
         self.total_score = int(self.metrics.get(metric_identifier).get('total_score'))
         self.score = FAIRResultCommonScore(total=self.total_score)
         self.metric_name = self.metrics.get(metric_identifier).get('metric_name')
+        self.metric_number = self.metrics.get(metric_identifier).get('metric_number')
         self.initializeEvaluationCriteria()
diff --git a/fuji_server/evaluators/fair_evaluator_community_metadata.py b/fuji_server/evaluators/fair_evaluator_community_metadata.py
index aebbd97c..29e3a179 100644
--- a/fuji_server/evaluators/fair_evaluator_community_metadata.py
+++ b/fuji_server/evaluators/fair_evaluator_community_metadata.py
@@ -30,7 +30,7 @@ class FAIREvaluatorCommunityMetadata(FAIREvaluator):
     def evaluate(self):
-        self.result = CommunityEndorsedStandard(id=self.fuji.count, metric_identifier=self.metric_identifier,
+        self.result = CommunityEndorsedStandard(id=self.metric_number, metric_identifier=self.metric_identifier,
                                                 metric_name=self.metric_name)
         standards_detected: List[CommunityEndorsedStandardOutputInner] = []
@@ -39,7 +39,7 @@ def evaluate(self):
         # ============== retrieve community standards by collected namespace uris
         if len(self.fuji.namespace_uri) > 0:
             no_match = []
-            self.logger.info('FsF-R1.3-01M : Namespaces included in the metadata - {}'.format(self.fuji.namespace_uri))
+            self.logger.info('FsF-R1.3-01M : Namespaces included in the metadata -: {}'.format(self.fuji.namespace_uri))
             for std_ns in self.fuji.namespace_uri:
                 std_ns_temp = self.fuji.lookup_metadatastandard_by_uri(std_ns)
                 # if std_ns_temp in FAIRCheck.COMMUNITY_METADATA_STANDARDS_URIS:
@@ -48,11 +48,11 @@ def evaluate(self):
                     std_name = self.fuji.COMMUNITY_METADATA_STANDARDS_URIS.get(std_ns_temp).get('title')
                     if subject and all(elem == "Multidisciplinary" for elem in subject):
                         self.logger.info(
-                            'FsF-R1.3-01M : Skipped non-disciplinary standard found through namespaces - {}'.format(
+                            'FsF-R1.3-01M : Skipped non-disciplinary standard found through namespaces -: {}'.format(
                                 std_ns))
                     else:
                         self.logger.log(self.fuji.LOG_SUCCESS,
-                                        'FsF-R1.3-01M : Found disciplinary standard through namespaces - {}'.format(
+                                        'FsF-R1.3-01M : Found disciplinary standard through namespaces -: {}'.format(
                                             std_ns))
                         nsout = CommunityEndorsedStandardOutputInner()
                         nsout.metadata_standard = std_name  # use here original standard uri detected
@@ -63,7 +63,7 @@ def evaluate(self):
                     no_match.append(std_ns)
             if len(no_match) > 0:
                 self.logger.info(
-                    'FsF-R1.3-01M : The following standards found through namespaces are excluded as they are not listed in RDA metadata catalog - {}'.format(
+                    'FsF-R1.3-01M : The following standards found through namespaces are excluded as they are not listed in RDA metadata catalog -: {}'.format(
                         no_match))
         if standards_detected:
             self.setEvaluationCriteriumScore('FsF-R1.3-01M-1a', 1, 'pass')
@@ -78,11 +78,11 @@ def evaluate(self):
             if standard_found:
                 subject = self.fuji.COMMUNITY_STANDARDS.get(standard_found).get('subject_areas')
                 if subject and all(elem == "Multidisciplinary" for elem in subject):
-                    self.logger.info('FsF-R1.3-01M : Skipped non-disciplinary standard - {}'.format(s))
+                    self.logger.info('FsF-R1.3-01M : Skipped non-disciplinary standard -: {}'.format(s))
                 else:
                     self.setEvaluationCriteriumScore('FsF-R1.3-01M-1b', 1, 'pass')
                     self.logger.log(self.fuji.LOG_SUCCESS,
-                                    'FsF-R1.3-01M : Found disciplinary standard through re3data - {}'.format(
+                                    'FsF-R1.3-01M : Found disciplinary standard through re3data -: {}'.format(
                                         s))
                     out = CommunityEndorsedStandardOutputInner()
                     out.metadata_standard = s
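The new metric_number attribute is what the evaluators below now pass as the result id instead of the volatile per-run counter self.fuji.count; it comes from the metrics YAML via the extended Preprocessor.get_custom_metrics() call earlier in this changeset. A sketch of the assumed shape of a metrics entry feeding set_metric() (field values are illustrative only):

    metrics = {
        'FsF-R1.3-01M': {'metric_name': 'Community-endorsed metadata standard',
                         'total_score': 1, 'metric_tests': {}, 'metric_number': 12},
    }
    metric_number = metrics.get('FsF-R1.3-01M').get('metric_number')  # stable across runs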
diff --git a/fuji_server/evaluators/fair_evaluator_content_included.py b/fuji_server/evaluators/fair_evaluator_content_included.py
index dfe64678..2b671891 100644
--- a/fuji_server/evaluators/fair_evaluator_content_included.py
+++ b/fuji_server/evaluators/fair_evaluator_content_included.py
@@ -29,14 +29,14 @@ class FAIREvaluatorContentIncluded(FAIREvaluator):
     def evaluate(self):
-        self.result = IdentifierIncluded(id=self.fuji.count, metric_identifier=self.metric_identifier, metric_name=self.metric_name)
+        self.result = IdentifierIncluded(id=self.metric_number, metric_identifier=self.metric_identifier, metric_name=self.metric_name)
         self.output = IdentifierIncludedOutput()

         id_object = self.fuji.metadata_merged.get('object_identifier')
         self.output.object_identifier_included = id_object
         contents = self.fuji.metadata_merged.get('object_content_identifier')
         if id_object is not None:
-            self.logger.info('FsF-F3-01M : Object identifier specified {}'.format(id_object))
+            self.logger.info('FsF-F3-01M : Object identifier specified -: {}'.format(id_object))
         score = 0
         content_list = []
         if contents:
@@ -44,11 +44,11 @@ def evaluate(self):
                 contents = [contents]
             contents = [c for c in contents if c]
             number_of_contents = len(contents)
-            self.logger.log(self.fuji.LOG_SUCCESS,'FsF-F3-01M : Number of object content identifier found - {}'.format(number_of_contents))
+            self.logger.log(self.fuji.LOG_SUCCESS,'FsF-F3-01M : Number of object content identifier found -: {}'.format(number_of_contents))
             if number_of_contents >= self.fuji.FILES_LIMIT:
                 self.logger.info(
-                    'FsF-F3-01M : The total number of object (content) specified is above threshold, so use the first {} content identifiers'.format(
+                    'FsF-F3-01M : The total number of object (content) specified is above threshold, so use the first -: {} content identifiers'.format(
                         self.fuji.FILES_LIMIT))
                 contents = contents[:self.fuji.FILES_LIMIT]
@@ -65,11 +65,11 @@ def evaluate(self):
                         content_link['header_content_type'] = str(content_link['header_content_type']).split(';')[0]
                         content_link['header_content_length'] = response.getheader('Content-Length')
                         if content_link['header_content_type'] != content_link.get('type'):
-                            self.logger.warning('FsF-F3-01M : Content type given in metadata (' + str(content_link.get(
-                                'type')) + ') differs from content type given in Header response (' + str(
+                            self.logger.warning('FsF-F3-01M : Content type given in metadata differs from content type given in Header response -: (' + str(content_link.get(
+                                'type')) + ') vs. (' + str(
                                 content_link['header_content_type']) + ')')
                             self.logger.info(
-                                'FsF-F3-01M : Replacing metadata content type with content type from Header response: ' + str(
+                                'FsF-F3-01M : Replacing metadata content type with content type from Header response -: ' + str(
                                     content_link['header_content_type']))
                             content_link['type'] = content_link['header_content_type']
                     # will pass even if the url cannot be accessed which is OK
@@ -79,8 +79,7 @@ def evaluate(self):
                         did_output_content.content_identifier_active = False
                         #content_list.append(did_output_content)
                 except urllib.error.HTTPError as e:
-                    self.logger.warning(
-                        'FsF-F3-01M : Content identifier {0} inaccessible, HTTPError code {1} '.format(
+                    self.logger.warning('FsF-F3-01M : Content identifier inaccessible -: {0} , HTTPError code {1} '.format(
                         content_link.get('url'), e.code))
                 except urllib.error.URLError as e:
                     self.logger.exception(e.reason)
@@ -91,7 +90,7 @@ def evaluate(self):
                     did_output_content.content_identifier_active = True
                     content_list.append(did_output_content)
                 else:
-                    self.logger.warning('FsF-F3-01M : Object (content) url is empty - {}'.format(content_link))
+                    self.logger.warning('FsF-F3-01M : Object (content) url is empty -: {}'.format(content_link))
         else:
             self.logger.warning('FsF-F3-01M : Data (content) identifier is missing.')
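The loop above probes each advertised data link and reconciles the Content-Type/Content-Length claimed in the metadata with what the server actually reports. A trimmed-down sketch of that header check (urllib response API as used above; url and metadata_type are hypothetical):

    import urllib.request

    response = urllib.request.urlopen(url, timeout=10)
    header_type = str(response.getheader('Content-Type')).split(';')[0]  # drop charset suffix
    header_length = response.getheader('Content-Length')
    if header_type != metadata_type:
        metadata_type = header_type  # trust the server response over the metadata claim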
diff --git a/fuji_server/evaluators/fair_evaluator_data_access_level.py b/fuji_server/evaluators/fair_evaluator_data_access_level.py
index 0b4112cd..e3ea9b5b 100644
--- a/fuji_server/evaluators/fair_evaluator_data_access_level.py
+++ b/fuji_server/evaluators/fair_evaluator_data_access_level.py
@@ -36,7 +36,7 @@ def evaluate(self):
         #2) Eprints AccessRights Vocabulary: check for http://purl.org/eprint/accessRights/
         #3) EU publications access rights check for http://publications.europa.eu/resource/authority/access-right/NON_PUBLIC
         #4) Openaire Guidelines info:eu-repo/semantics/openAccess
-        self.result = DataAccessLevel(self.fuji.count, metric_identifier=self.metric_identifier, metric_name=self.metric_name)
+        self.result = DataAccessLevel(self.metric_number, metric_identifier=self.metric_identifier, metric_name=self.metric_name)
         self.output = DataAccessOutput()
         licence_evaluator = FAIREvaluatorLicense(self.fuji)
         #rights_regex = r'((\/licenses|purl.org\/coar\/access_right|purl\.org\/eprint\/accessRights|europa\.eu\/resource\/authority\/access-right)\/{1}(\S*))'
@@ -52,11 +52,15 @@ def evaluate(self):
             #access_rights can be None or []
             if access_rights:
                 self.logger.info('FsF-A1-01M : Found access rights information in dedicated metadata element')
+
                 if isinstance(access_rights, str):
                     access_rights = [access_rights]
                 for access_right in access_rights:
-                    self.logger.info('FsF-A1-01M : Access information specified - {}'.format(access_right))
+                    #TODO: remove new lines also from other logger messages or handle this elsewhere
+                    access_right = re.sub(r"[\r\n]+", ' ', access_right)
+                    self.logger.info('FsF-A1-01M : Access information specified -: {}'.format(access_right.replace('\n', ' ')))
                     if not licence_evaluator.isLicense(value=access_right, metric_id=self.metric_identifier):  # exclude license-based text from access_rights
+
                         rights_match = re.search(rights_regex, access_right, re.IGNORECASE)
                         if rights_match is not None:
                             last_group = len(rights_match.groups())
@@ -65,14 +69,25 @@ def evaluate(self):
                                 if re.search(right_code, filtered_rights, re.IGNORECASE):
                                     access_level = right_status
                                     access_details['access_condition'] = rights_match[1]  # overwrite existing condition
-                                    self.logger.info('FsF-A1-01M : Access level recognized as ' + str(right_status))
+                                    self.logger.info('FsF-A1-01M : Standardized actionable access level recognized as -:' + str(right_status))
                                     break
                             break
                         else:
-                            self.logger.info('FsF-A1-01M : Not a standardized access level')
+                            self.logger.info('FsF-A1-01M : Not a standardized, actionable access level')
                     else:
-                        self.logger.warning('FsF-A1-01M : Access condition looks like license, therefore the following is ignored - {}'.format(access_right))
+                        self.logger.warning('FsF-A1-01M : Access condition looks like license, therefore the following is ignored -: {}'.format(access_right))
                         exclude.append(access_right)
+
+                if not access_level:
+                    lower_case_access_dict = dict((k.lower(), v) for k, v in Mapper.ACCESS_RIGHT_CODES.value.items())
+                    for access_right in access_rights:
+                        if access_right.lower() in lower_case_access_dict:
+                            self.logger.info('FsF-A1-01M : Non-actionable (term only) standard access level recognized as -:' + str(
+                                lower_case_access_dict.get(access_right.lower())))
+                            access_level = lower_case_access_dict.get(access_right.lower())
+                            access_details['access_condition'] = access_right
+                            break
+
                 if not access_details and access_rights:
                     access_rights = set(access_rights) - set(exclude)
                     if access_rights :
@@ -97,7 +112,7 @@ def evaluate(self):
             if access_level == 'embargoed':
                 available_date = self.fuji.metadata_merged.get('publication_date')
                 if available_date:
-                    self.logger.info('FsF-A1-01M : Embargoed access, available date - {}'.format(available_date))
+                    self.logger.info('FsF-A1-01M : Embargoed access, available date -: {}'.format(available_date))
                     access_details['available_date'] = available_date
                 else:
                     self.logger.warning('FsF-A1-01M : Embargoed access, available date NOT found')
@@ -112,7 +127,7 @@ def evaluate(self):
         if access_level:  # must be one of ['public', 'embargoed', 'restricted', 'closed_metadataonly']
             self.output.access_level = access_level
             self.setEvaluationCriteriumScore('FsF-A1-01M-1', 1, 'pass')
-            self.logger.log(self.fuji.LOG_SUCCESS, 'FsF-A1-01M : Access level to data could successfully be determined: '+access_level)
+            self.logger.log(self.fuji.LOG_SUCCESS, 'FsF-A1-01M : Access level to data could successfully be determined -: '+access_level)
         else:
             self.logger.warning('FsF-A1-01M : Unable to determine the access level')
         self.output.access_details = access_details
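The new fallback above catches access levels given as a bare vocabulary term (e.g. 'restricted') rather than as an actionable URI. A standalone sketch of this case-insensitive term lookup (the mapping content is illustrative; F-UJI keeps the real one in Mapper.ACCESS_RIGHT_CODES):

    # assumed excerpt of the term-to-level mapping
    ACCESS_RIGHT_CODES = {'OpenAccess': 'public', 'Restricted': 'restricted',
                          'Embargoed': 'embargoed'}

    lower_case_access_dict = {k.lower(): v for k, v in ACCESS_RIGHT_CODES.items()}
    access_level = lower_case_access_dict.get('RESTRICTED'.lower())  # -> 'restricted'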
diff --git a/fuji_server/evaluators/fair_evaluator_data_content_metadata.py b/fuji_server/evaluators/fair_evaluator_data_content_metadata.py
index fc93ce95..9c0afd1f 100644
--- a/fuji_server/evaluators/fair_evaluator_data_content_metadata.py
+++ b/fuji_server/evaluators/fair_evaluator_data_content_metadata.py
@@ -34,7 +34,7 @@ class FAIREvaluatorDataContentMetadata(FAIREvaluator):
     def evaluate(self):
-        self.result = DataContentMetadata(id=self.fuji.count, metric_identifier=self.metric_identifier, metric_name=self.metric_name)
+        self.result = DataContentMetadata(id=self.metric_number, metric_identifier=self.metric_identifier, metric_name=self.metric_name)
         self.output = DataContentMetadataOutput()
         data_content_descriptors = []
         test_data_content_text = None
@@ -42,7 +42,7 @@ def evaluate(self):
         test_status = 'fail'
         score = 0

-        self.logger.info('FsF-R1-01MD : Object landing page accessible status - {}'.format(self.fuji.isMetadataAccessible))
+        self.logger.info('FsF-R1-01MD : Object landing page accessible status -: {}'.format(self.fuji.isMetadataAccessible))
         # 1. check resource type
         #TODO: resource type collection might be classified as 'dataset'
         # http://doi.org/10.1007/s10531-013-0468-6
@@ -53,12 +53,12 @@ def evaluate(self):
             if str(resource_type).startswith('http'):
                 resource_type = resource_type.split('/')[-1]
             if resource_type in self.fuji.VALID_RESOURCE_TYPES or resource_type in self.fuji.SCHEMA_ORG_CONTEXT:
-                self.logger.log(self.fuji.LOG_SUCCESS,'FsF-R1-01MD : Resource type specified - {}'.format(resource_type))
+                self.logger.log(self.fuji.LOG_SUCCESS,'FsF-R1-01MD : Resource type specified -: {}'.format(resource_type))
                 self.output.object_type = resource_type
                 self.setEvaluationCriteriumScore('FsF-R1-01MD-1', 1, 'pass')
                 score += 1
             else:
-                self.logger.warning('FsF-R1-01MD : No valid resource type specified: '+str(resource_type))
+                self.logger.warning('FsF-R1-01MD : No valid resource type specified -: '+str(resource_type))
         else:
             self.logger.warning('FsF-R1-01MD : NO resource type specified ')
@@ -67,10 +67,10 @@ def evaluate(self):
         not_empty_content_uris = [d['url'] for d in self.fuji.content_identifier if 'url' in d]
         content_length = len(not_empty_content_uris)
         if content_length > 0:
-            self.logger.info('FsF-R1-01MD : Number of data content URI(s) specified - {}'.format(content_length))
+            self.logger.info('FsF-R1-01MD : Number of data content URI(s) specified -: {}'.format(content_length))
             test_data_content_url = not_empty_content_uris[-1]
             self.logger.info(
-                'FsF-R1-01MD : Selected content file to be analyzed - {}'.format(test_data_content_url))
+                'FsF-R1-01MD : Selected content file to be analyzed -: {}'.format(test_data_content_url))
             try:
                 # Use Tika to parse the file
                 response_body=[]
@@ -91,8 +91,7 @@ def evaluate(self):
                         # avoiding large file sizes to test with TIKA.. truncate after 1 Mb
                         tika_content_size = tika_content_size + len(chunk)
                         if time.time() > (start + timeout) or tika_content_size >= max_download_size:
-                            self.logger.warning(
-                                'FsF-R1-01MD : File too large.., skipped download after ' + str(
+                            self.logger.warning('FsF-R1-01MD : File too large.., skipped download after -:' + str(
                                 timeout) + ' sec or receiving > ' + str(max_download_size) + '- {}'.format(
                                 test_data_content_url))
                             tika_content_size = 0
                             break
             except urllib.error.HTTPError as e:
-                self.logger.warning(
-                    'FsF-F3-01M : Content identifier {0} inaccessible, HTTPError code {1} '.format(
+                self.logger.warning('FsF-F3-01M : Content identifier inaccessible -: {0}, HTTPError code {1} '.format(
                     test_data_content_url, e.code))
             except urllib.error.URLError as e:
                 self.logger.exception(e.reason)
             except Exception as e:
-                self.logger.warning('FsF-F3-01M : Could not access the resource'+str(e))
+                self.logger.warning('FsF-F3-01M : Could not access the resource -:'+str(e))
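The download loop above streams the selected data file in chunks and aborts once a timeout or the 1 Mb Tika cap is reached, so oversized files never hit the parser. A self-contained sketch of that pattern (requests, the chunk size and both caps are illustrative choices, not necessarily what F-UJI's request helper uses; url is hypothetical):

    import time
    import requests

    start, timeout, max_download_size = time.time(), 30, 1000000
    response_body, size = [], 0
    with requests.get(url, stream=True) as r:
        for chunk in r.iter_content(chunk_size=8192):
            response_body.append(chunk)
            size += len(chunk)
            if time.time() > start + timeout or size >= max_download_size:
                break  # partial content is still good enough for type sniffing
    response_content = b''.join(response_body)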
             response_content = b''.join(response_body)

             status = 'tika error'
@@ -116,11 +114,9 @@ def evaluate(self):
                 status = parsedFile.get("status")
                 tika_content_types = parsedFile.get("metadata").get('Content-Type')
                 parsed_content = parsedFile.get("content")
-                self.logger.info(
-                    '{0} : Successfully parsed data object file using TIKA'.format(self.metric_identifier))
+                self.logger.info('{0} : Successfully parsed data object file using TIKA'.format(self.metric_identifier))
             except Exception as e:
-                self.logger.warning(
-                    '{0} : File parsing using TIKA failed: {1}'.format(self.metric_identifier, e))
+                self.logger.warning('{0} : File parsing using TIKA failed -: {1}'.format(self.metric_identifier, e))
                 # in case TIKA request fails use response header info
                 tika_content_types = str(r.headers.get('content-type')).split(';')[0]
@@ -131,23 +127,20 @@ def evaluate(self):
                 self.fuji.tika_content_types_list.append(content_types_str)
                 # Extract the text content from the parsed file and convert to string
-                self.logger.info(
-                    '{0} : File request status code {1}'.format(self.metric_identifier, status))
+                self.logger.info('{0} : File request status code -: {1}'.format(self.metric_identifier, status))
                 test_data_content_text = str(parsed_content)
                 # Escape any slash
                 # test_data_content_text = parsed_content.replace('\\', '\\\\').replace('"', '\\"')
                 if test_data_content_text:
                     #parsed_files = parsedFile.get("metadata").get('resourceName')
-                    self.logger.info('FsF-R1-01MD : Succesfully parsed data file(s) - {}'.format(test_data_content_url))
+                    self.logger.info('FsF-R1-01MD : Successfully parsed data file(s) -: {}'.format(test_data_content_url))
                 #else:
                 #    self.logger.warning('FsF-R1-01MD : Data file not accessible {}'.format(r.status_code))
             except Exception as e:
-                self.logger.warning(
-                    '{0} : Could not retrieve/parse content object - {1}'.format(self.metric_identifier, e))
+                self.logger.warning('{0} : Could not retrieve/parse content object -: {1}'.format(self.metric_identifier, e))
                 #traceback.print_exc()
         else:
-            self.logger.warning(
-                'FsF-R1-01MD : NO data object content available/accessible to perform file descriptors (type and size) tests')
+            self.logger.warning('FsF-R1-01MD : NO data object content available/accessible to perform file descriptors (type and size) tests')

         # 3. check file type and size descriptors of parsed data file only (ref:test_data_content_url)
         if test_data_content_url:
@@ -177,7 +170,7 @@ def evaluate(self):
                             matches_content = True
                             matches_type = True
                         else:
-                            self.logger.warning('{0} : Could not verify content type from downloaded file (expected: {1}, found: {2})'.format(self.metric_identifier, data_object.get('type'), str(self.fuji.tika_content_types_list) ))
+                            self.logger.warning('{0} : Could not verify content type from downloaded file -: (expected: {1}, found: {2})'.format(self.metric_identifier, data_object.get('type'), str(self.fuji.tika_content_types_list) ))
                     elif d == 'size':
                         if tika_content_size == 0:
                             self.logger.warning('{0} : Could not verify content size (received: 0 bytes) from downloaded file'.format(self.metric_identifier))
@@ -185,7 +178,7 @@ def evaluate(self):
                             matches_content = True
                             matches_size = True
                         else:
-                            self.logger.warning('{0} : Could not verify content size from downloaded file (expected: {1}, found: {2})'.format(self.metric_identifier, str(data_object.get('size')), str(tika_content_size) ))
+                            self.logger.warning('{0} : Could not verify content size from downloaded file -: (expected: {1}, found: {2})'.format(self.metric_identifier, str(data_object.get('size')), str(tika_content_size) ))

                     data_content_filetype_inner = DataContentMetadataOutputInner()
                     data_content_filetype_inner.descriptor = descriptor
@@ -193,7 +186,7 @@ def evaluate(self):
                     data_content_filetype_inner.matches_content = matches_content
                     data_content_descriptors.append(data_content_filetype_inner)
                 else:
-                    self.logger.warning('{0} : NO {1} info available'.format(self.metric_identifier, type))
+                    self.logger.warning('{0} : NO info available about {1} -: '.format(self.metric_identifier, type))
                 ### scoring for file descriptors match
                 if matches_type and matches_size:
                     score += 1
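Tika parsing is what turns the downloaded bytes into a status code, a sniffed Content-Type and extractable text for the descriptor checks above. A minimal usage sketch of the same tika-python call (assumes the tika package and a reachable Tika server; response_content is the byte string assembled earlier):

    from tika import parser

    parsedFile = parser.from_buffer(response_content)
    status = parsedFile.get('status')                             # e.g. 200
    content_type = parsedFile.get('metadata', {}).get('Content-Type')
    text = str(parsedFile.get('content'))                         # extracted text, may be 'None'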
@@ -207,8 +200,7 @@ def evaluate(self):
                 self.logger.log(self.fuji.LOG_SUCCESS, 'FsF-R1-01MD : Found measured variables or observations (aka parameters) as content descriptor')
                 if not test_data_content_text:
-                    self.logger.warning(
-                        'FsF-R1-01MD : Could not verify measured variables found in data object content, content parsing failed')
+                    self.logger.warning('FsF-R1-01MD : Could not verify measured variables found in data object content, content parsing failed')
                 for variable in self.fuji.metadata_merged['measured_variable']:
                     variable_metadata_inner = DataContentMetadataOutputInner()
                     variable_metadata_inner.descriptor = 'measured_variable'
@@ -224,11 +216,9 @@ def evaluate(self):
                         is_variable_scored = True
                     data_content_descriptors.append(variable_metadata_inner)
             else:
-                self.logger.warning(
-                    'FsF-R1-01MD : NO measured variables found in metadata, skip \'measured_variable\' test.')
+                self.logger.warning('FsF-R1-01MD : NO measured variables found in metadata, skip \'measured_variable\' test.')
             if not is_variable_scored:
-                self.logger.warning(
-                    'FsF-R1-01MD : Measured variables given in metadata do not match data object content')
+                self.logger.warning('FsF-R1-01MD : Measured variables given in metadata do not match data object content')

         if score >= self.total_score / 2:  # more than half of total score, consider the test as pass
             test_status = 'pass'
diff --git a/fuji_server/evaluators/fair_evaluator_data_provenance.py b/fuji_server/evaluators/fair_evaluator_data_provenance.py
index 7705f520..77c414b6 100644
--- a/fuji_server/evaluators/fair_evaluator_data_provenance.py
+++ b/fuji_server/evaluators/fair_evaluator_data_provenance.py
@@ -31,7 +31,7 @@ class FAIREvaluatorDataProvenance(FAIREvaluator):
     def evaluate(self):
-        self.result = DataProvenance(id=self.fuji.count, metric_identifier=self.metric_identifier,
+        self.result = DataProvenance(id=self.metric_number, metric_identifier=self.metric_identifier,
                                      metric_name=self.metric_name)
         self.output = DataProvenanceOutput()
         score = 0
diff --git a/fuji_server/evaluators/fair_evaluator_file_format.py b/fuji_server/evaluators/fair_evaluator_file_format.py
index db7158cc..6dde689c 100644
--- a/fuji_server/evaluators/fair_evaluator_file_format.py
+++ b/fuji_server/evaluators/fair_evaluator_file_format.py
@@ -35,7 +35,7 @@ def evaluate(self):
         text_format_regex = r'(^text)[\/]|[\/\+](xml|text|json)'

-        self.result = DataFileFormat(id=self.fuji.count, metric_identifier=self.metric_identifier,
+        self.result = DataFileFormat(id=self.metric_number, metric_identifier=self.metric_identifier,
                                      metric_name=self.metric_name)
         self.output = DataFileFormatOutput()
         data_file_list = []
@@ -47,7 +47,7 @@ def evaluate(self):
         for c in contents:
             if c.get('type'):
                 unique_types.append(c.get('type'))
-        self.logger.info('FsF-R1.3-02D : File format(s) specified - {}'.format(list(set(unique_types))))
+        self.logger.info('FsF-R1.3-02D : File format(s) specified -: {}'.format(list(set(unique_types))))

         mime_url_pair = {}
         if len(self.fuji.content_identifier) > 0:
@@ -58,7 +58,7 @@ def evaluate(self):
                 if data_file.get('url') is not None:
                     if mime_type is None or mime_type in ['application/octet-stream']:
                         self.logger.info(
-                            'FsF-R1.3-02D : Guessing the type of a file based on its filename or URL - {}'.format(
+                            'FsF-R1.3-02D : Guessing the type of a file based on its filename or URL -: {}'.format(
                                 data_file.get('url')))
                         # if mime type not given try to guess it based on the file name
                         guessed_mime_type = mimetypes.guess_type(data_file.get('url'))
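When the metadata only claims application/octet-stream, the evaluator falls back to the standard library's extension-based lookup. For illustration (the URL is hypothetical):

    import mimetypes

    guessed_mime_type = mimetypes.guess_type('https://example.org/files/table.csv')
    # -> ('text/csv', None); unknown extensions yield (None, None)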
@@ -74,7 +74,7 @@ def evaluate(self):
                         self.fuji.tika_content_types_list = [n for n in self.fuji.tika_content_types_list if n not in self.fuji.ARCHIVE_MIMETYPES]
                         self.logger.info(
-                            'FsF-R1.3-02D : Extracted file formats for selected data object (see FsF-R1-01MD) - {}'.format(self.fuji.tika_content_types_list))
+                            'FsF-R1.3-02D : Extracted file formats for selected data object (see FsF-R1-01MD) -: {}'.format(self.fuji.tika_content_types_list))
                         for t in self.fuji.tika_content_types_list:
                             mime_url_pair[t] = data_file.get('url')
                     else:
@@ -128,8 +128,7 @@ def evaluate(self):
             self.logger.log(self.fuji.LOG_SUCCESS, 'FsF-R1.3-02D : Could identify a file format commonly used by the scientific community')
             self.result.test_status = 'pass'
         else:
-            self.logger.warning(
-                'FsF-R1.3-02D : Could not perform file format checks as data content identifier(s) unavailable/inaccesible')
+            self.logger.warning('FsF-R1.3-02D : Could not perform file format checks as data content identifier(s) unavailable/inaccessible')
             self.result.test_status = 'fail'
         self.output = data_file_list
diff --git a/fuji_server/evaluators/fair_evaluator_formal_metadata.py b/fuji_server/evaluators/fair_evaluator_formal_metadata.py
index 8ac66509..f56d20f5 100644
--- a/fuji_server/evaluators/fair_evaluator_formal_metadata.py
+++ b/fuji_server/evaluators/fair_evaluator_formal_metadata.py
@@ -32,7 +32,7 @@ class FAIREvaluatorFormalMetadata(FAIREvaluator):
     def evaluate(self):
-        self.result = FormalMetadata(id=self.fuji.count, metric_identifier=self.metric_identifier,
+        self.result = FormalMetadata(id=self.metric_number, metric_identifier=self.metric_identifier,
                                      metric_name=self.metric_name)

         outputs = []
@@ -123,29 +123,26 @@ def evaluate(self):
         # self.pid_url = 'https://meta.icos-cp.eu/objects/9ri1elaogsTv9LQFLNTfDNXm' #test uri
         if self.fuji.sparql_endpoint:
             self.logger.info(
-                '{0} : SPARQL endpoint found - {1}'.format(self.metric_identifier, self.fuji.sparql_endpoint))
+                '{0} : SPARQL endpoint found -: {1}'.format(self.metric_identifier, self.fuji.sparql_endpoint))
             sparql_provider = SPARQLMetadataProvider(endpoint=self.fuji.sparql_endpoint, logger=self.logger,
                                                      metric_id=self.metric_identifier)
             query = "CONSTRUCT {{?dataURI ?property ?value}} where {{ VALUES ?dataURI {{ <{}> }} ?dataURI ?property ?value }}".format(
                 self.fuji.pid_url)
-            self.logger.info('{0} : Executing SPARQL - {1}'.format(self.metric_identifier, query))
+            self.logger.info('{0} : Executing SPARQL -: {1}'.format(self.metric_identifier, query))
             rdfgraph, contenttype = sparql_provider.getMetadata(query)
             if rdfgraph:
                 outputs.append(
                     FormalMetadataOutputInner(serialization_format=contenttype, source='sparql_endpoint',
                                               is_metadata_found=True))
                 score += 1
-                self.logger.log(self.fuji.LOG_SUCCESS,
-                                '{0} : Found RDF content through SPARQL endpoint'.format(
+                self.logger.log(self.fuji.LOG_SUCCESS,'{0} : Found RDF content through SPARQL endpoint'.format(
                     self.metric_identifier))
                 self.setEvaluationCriteriumScore('FsF-I1-01M-2', 1, 'pass')
                 self.fuji.namespace_uri.extend(sparql_provider.getNamespaces())
             else:
-                self.logger.warning(
-                    '{0} : NO RDF metadata retrieved through the sparql endpoint'.format(self.metric_identifier))
+                self.logger.warning('{0} : NO RDF metadata retrieved through the sparql endpoint'.format(self.metric_identifier))
         else:
-            self.logger.warning(
-                '{0} : NO SPARQL endpoint found through re3data based on the object URI provided'.format(
+            self.logger.warning('{0} : NO SPARQL endpoint found through re3data based on the object URI provided'.format(
                 self.metric_identifier))

         if score > 0:
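The doubled braces in the query template above are literal braces escaped for str.format(). Rendered for a hypothetical PID URL, the CONSTRUCT query sent to the endpoint reads:

    query_template = ("CONSTRUCT {{?dataURI ?property ?value}} where "
                      "{{ VALUES ?dataURI {{ <{}> }} ?dataURI ?property ?value }}")
    query = query_template.format('https://doi.org/10.1234/example')
    # CONSTRUCT {?dataURI ?property ?value}
    # where { VALUES ?dataURI { <https://doi.org/10.1234/example> } ?dataURI ?property ?value }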
diff --git a/fuji_server/evaluators/fair_evaluator_license.py b/fuji_server/evaluators/fair_evaluator_license.py
index 1298d008..62bd13fe 100644
--- a/fuji_server/evaluators/fair_evaluator_license.py
+++ b/fuji_server/evaluators/fair_evaluator_license.py
@@ -46,7 +46,7 @@ def isLicense (self, value, metric_id):
         return islicense

     def lookup_license_by_url(self, u, metric_id):
-        self.logger.info('{0} : Verify URL through SPDX registry - {1}'.format(metric_id, u))
+        self.logger.info('{0} : Verify URL through SPDX registry -: {1}'.format(metric_id, u))
         html_url = None
         isOsiApproved = False
         for item in self.fuji.SPDX_LICENSES:
@@ -54,7 +54,7 @@ def lookup_license_by_url(self, u, metric_id):
             # if any(u in v.lower() for v in item.values()):
             seeAlso = item['seeAlso']
             if any(u in v for v in seeAlso):
-                self.logger.info('{0} : Found SPDX license representation - {1}'.format(metric_id, item['detailsUrl']))
+                self.logger.info('{0} : Found SPDX license representation -: {1}'.format(metric_id, item['detailsUrl']))
                 # html_url = '.html'.join(item['detailsUrl'].rsplit('.json', 1))
                 html_url = item['detailsUrl'].replace(".json", ".html")
                 isOsiApproved = item['isOsiApproved']
@@ -65,14 +65,14 @@ def lookup_license_by_name(self, lvalue, metric_id):
         # TODO - find simpler way to run fuzzy-based search over dict/json (e.g., regex)
         html_url = None
         isOsiApproved = False
-        self.logger.info('{0} : Verify name through SPDX registry - {1}'.format(metric_id, lvalue))
+        self.logger.info('{0} : Verify name through SPDX registry -: {1}'.format(metric_id, lvalue))
         # Levenshtein distance similarity ratio between two license name
         sim = [Levenshtein.ratio(lvalue.lower(), i) for i in self.fuji.SPDX_LICENSE_NAMES]
         if max(sim) > 0.85:
             index_max = max(range(len(sim)), key=sim.__getitem__)
             sim_license = self.fuji.SPDX_LICENSE_NAMES[index_max]
             found = next((item for item in self.fuji.SPDX_LICENSES if item['name'] == sim_license), None)
-            self.logger.info('{0}: Found SPDX license representation - {1}'.format(metric_id,found['detailsUrl']))
+            self.logger.info('{0}: Found SPDX license representation -: {1}'.format(metric_id,found['detailsUrl']))
             # html_url = '.html'.join(found['detailsUrl'].rsplit('.json', 1))
             html_url = found['detailsUrl'].replace(".json", ".html")
             isOsiApproved = found['isOsiApproved']
@@ -80,7 +80,7 @@ def lookup_license_by_name(self, lvalue, metric_id):

     def evaluate(self):
-        self.result = License(id=self.fuji.count, metric_identifier=self.metric_identifier, metric_name=self.metric_name)
+        self.result = License(id=self.metric_number, metric_identifier=self.metric_identifier, metric_name=self.metric_name)
         licenses_list = []
         specified_licenses = self.fuji.metadata_merged.get('license')
         self.score.earned = 0
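lookup_license_by_name() resolves free-text license names to SPDX entries via a Levenshtein similarity ratio with a 0.85 cut-off. A standalone sketch of that fuzzy match (python-Levenshtein; the candidate list is illustrative, F-UJI uses its preloaded SPDX_LICENSE_NAMES):

    import Levenshtein

    spdx_license_names = ['mit license', 'apache license 2.0',
                          'creative commons attribution 4.0 international']
    lvalue = 'MIT Licence'  # note the British spelling
    sim = [Levenshtein.ratio(lvalue.lower(), name) for name in spdx_license_names]
    if max(sim) > 0.85:  # 'mit licence' vs 'mit license' scores roughly 0.91
        best = spdx_license_names[max(range(len(sim)), key=sim.__getitem__)]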
diff --git a/fuji_server/evaluators/fair_evaluator_metadata_preservation.py b/fuji_server/evaluators/fair_evaluator_metadata_preservation.py
index 6f4ae2c7..54b56a78 100644
--- a/fuji_server/evaluators/fair_evaluator_metadata_preservation.py
+++ b/fuji_server/evaluators/fair_evaluator_metadata_preservation.py
@@ -30,7 +30,7 @@ class FAIREvaluatorMetadataPreserved(FAIREvaluator):
     def evaluate(self):
         registry_bound_pid = ['doi']
-        self.result = MetadataPreserved(id=self.fuji.count, metric_identifier=self.metric_identifier,
+        self.result = MetadataPreserved(id=self.metric_number, metric_identifier=self.metric_identifier,
                                         metric_name=self.metric_name)
         outputs = []
         test_status = 'fail'
@@ -44,8 +44,7 @@ def evaluate(self):
                 self.logger.log(self.fuji.LOG_SUCCESS,
                                 '{0} : Metadata registry bound PID system used: ' + self.fuji.pid_scheme.format(self.metric_identifier))
             else:
-                self.logger.warning(
-                    '{0} : NO metadata registry bound PID system used'.format(self.metric_identifier))
+                self.logger.warning('{0} : NO metadata registry bound PID system used'.format(self.metric_identifier))
             self.score.earned = score
             self.result.score = self.score
             self.result.output = outputs
diff --git a/fuji_server/evaluators/fair_evaluator_minimal_metadata.py b/fuji_server/evaluators/fair_evaluator_minimal_metadata.py
index 72a5bb88..cf6a28b3 100644
--- a/fuji_server/evaluators/fair_evaluator_minimal_metadata.py
+++ b/fuji_server/evaluators/fair_evaluator_minimal_metadata.py
@@ -32,7 +32,7 @@ def evaluate(self):
             self.logger.warning('FsF-F2-01M : Metadata checks probably unreliable: landing page URL could not be determined')
         self.fuji.retrieve_metadata(self.fuji.extruct_result)

-        self.result = CoreMetadata(id=self.fuji.count, metric_identifier=self.metric_identifier, metric_name=self.metric_name)
+        self.result = CoreMetadata(id=self.metric_number, metric_identifier=self.metric_identifier, metric_name=self.metric_name)

         metadata_required = Mapper.REQUIRED_CORE_METADATA.value
         metadata_found = {k: v for k, v in self.fuji.metadata_merged.items() if k in metadata_required}
@@ -41,9 +41,9 @@ def evaluate(self):
         partial_elements = ['creator', 'title', 'object_identifier', 'publication_date','publisher','object_type']
         # TODO: check the number of metadata elements which metadata_found has in common with metadata_required
         # set(a) & set(b)
-        self.logger.info('FsF-F2-01M : Testing for required core descriptive metadata elements {}'.format(metadata_required))
+        self.logger.info('FsF-F2-01M : Testing for required core descriptive metadata elements -: {}'.format(metadata_required))
         if set(metadata_found) == set(metadata_required):
-            self.logger.log(self.fuji.LOG_SUCCESS,'FsF-F2-01M : Found required core descriptive metadata elements {}'.format(metadata_required))
+            self.logger.log(self.fuji.LOG_SUCCESS,'FsF-F2-01M : Found required core descriptive metadata elements -: {}'.format(metadata_required))
             metadata_status = 'all metadata'
             self.score.earned = self.total_score
             self.setEvaluationCriteriumScore('FsF-F2-01M-3', 1, 'pass')
@@ -51,18 +51,17 @@ def evaluate(self):
             test_status = 'pass'
         else:
             core_missing = list(set(metadata_required) - set(metadata_found))
-            self.logger.warning('FsF-F2-01M : Not all required core descriptive metadata elements exist, missing: {}'.format(str(core_missing)))
-            self.logger.info('FsF-F2-01M : Testing for required core citation metadata elements {}'.format(partial_elements))
+            self.logger.warning('FsF-F2-01M : Not all required core descriptive metadata elements exist, missing -: {}'.format(str(core_missing)))
+            self.logger.info('FsF-F2-01M : Testing for required core citation metadata elements -: {}'.format(partial_elements))
             if set(partial_elements).issubset(metadata_found):
-                self.logger.log(self.fuji.LOG_SUCCESS,'FsF-F2-01M : Found required core citation metadata elements {}'.format(partial_elements))
+                self.logger.log(self.fuji.LOG_SUCCESS,'FsF-F2-01M : Found required core citation metadata elements -: {}'.format(partial_elements))
                 metadata_status = 'partial metadata'
                 self.setEvaluationCriteriumScore('FsF-F2-01M-2', 1, 'pass')
                 self.score.earned = self.total_score - 1
                 test_status = 'pass'
             else:
                 partial_missing = list(set(partial_elements) - set(metadata_found))
-                self.logger.warning(
-                    'FsF-F2-01M : Not all required citation metadata elements exist, missing: '+str(partial_missing))
+                self.logger.warning('FsF-F2-01M : Not all required citation metadata elements exist, missing -: '+str(partial_missing))
                 metadata_status = 'insufficient metadata'  # status should follow enumeration in yaml
                 self.score.earned = 0
                 test_status = 'fail'
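The completeness test above is plain set algebra over element names. Worked through for a hypothetical harvest (the element lists are illustrative; F-UJI takes the required set from Mapper.REQUIRED_CORE_METADATA):

    metadata_required = {'creator', 'title', 'object_identifier', 'publication_date',
                         'publisher', 'object_type', 'summary', 'keywords'}
    metadata_found = {'creator', 'title', 'object_identifier', 'publication_date',
                      'publisher', 'object_type'}
    core_missing = metadata_required - metadata_found   # {'summary', 'keywords'}
    partial_elements = {'creator', 'title', 'object_identifier', 'publication_date',
                        'publisher', 'object_type'}
    partial_elements <= metadata_found                   # True -> 'partial metadata'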
diff --git a/fuji_server/evaluators/fair_evaluator_persistent_identifier.py b/fuji_server/evaluators/fair_evaluator_persistent_identifier.py
index 92849a6f..c94baa4f 100644
--- a/fuji_server/evaluators/fair_evaluator_persistent_identifier.py
+++ b/fuji_server/evaluators/fair_evaluator_persistent_identifier.py
@@ -33,7 +33,7 @@ class FAIREvaluatorPersistentIdentifier(FAIREvaluator):
     def evaluate(self):
-        self.result = Persistence(id=self.fuji.count, metric_identifier=self.metric_identifier, metric_name=self.metric_name)
+        self.result = Persistence(id=self.metric_number, metric_identifier=self.metric_identifier, metric_name=self.metric_name)
         self.output = PersistenceOutput()
         # ======= CHECK IDENTIFIER PERSISTENCE =======
         self.logger.info('FsF-F1-02D : PID schemes-based assessment supported by the assessment service - {}'.format(
@@ -49,7 +49,7 @@ def evaluate(self):
             # ======= RETRIEVE METADATA FROM LANDING PAGE =======
             requestHelper = RequestHelper(check_url, self.logger)
             requestHelper.setAcceptType(AcceptTypes.html)  # request
-            neg_source, self.fuji.extruct_result = requestHelper.content_negotiate('FsF-F1-02D')
+            neg_source, self.fuji.extruct_result = requestHelper.content_negotiate('FsF-F1-02D', ignore_html = False)
             r = requestHelper.getHTTPResponse()

             if r:
@@ -103,17 +103,15 @@ def evaluate(self):
                         self.fuji.isMetadataAccessible = True
                 elif r.status_code in [401, 402, 403]:
                     self.fuji.isMetadataAccessible = False
-                    self.logger.warning("Resource inaccessible, identifier returned http status code: {code}".format(code=r.status_code))
+                    self.logger.warning("Resource inaccessible, identifier returned http status code -: {code}".format(code=r.status_code))
                 else:
                     self.fuji.isMetadataAccessible = False
-                    self.logger.warning("Resource inaccessible, identifier returned http status code: {code}".format(code=r.status_code))
+                    self.logger.warning("Resource inaccessible, identifier returned http status code -: {code}".format(code=r.status_code))
             else:
                 self.fuji.isMetadataAccessible = False
-                self.logger.warning(
-                    "FsF-F1-02D :Resource inaccessible, no response received from: {}".format(check_url))
+                self.logger.warning("FsF-F1-02D :Resource inaccessible, no response received from -: {}".format(check_url))
         else:
-            self.logger.warning(
-                "FsF-F1-02D :Resource inaccessible, could not identify an actionable representation for the given identfier: {}".format(self.fuji.id))
+            self.logger.warning("FsF-F1-02D :Resource inaccessible, could not identify an actionable representation for the given identifier -: {}".format(self.fuji.id))

         if self.fuji.pid_scheme is not None:
             # short_pid = id.normalize_pid(self.id, scheme=pid_scheme)
@@ -132,11 +130,11 @@ def evaluate(self):
             #print(self.metric_tests)
-            self.logger.log(self.fuji.LOG_SUCCESS,'FsF-F1-02D : Persistence identifier scheme - {}'.format(self.fuji.pid_scheme))
+            self.logger.log(self.fuji.LOG_SUCCESS,'FsF-F1-02D : Persistent identifier scheme -: {}'.format(self.fuji.pid_scheme))
             #self.logger.info('FsF-F1-02D : Persistence identifier scheme - {}'.format(self.fuji.pid_scheme))
         else:
             self.score.earned = 0
-            self.logger.warning('FsF-F1-02D : Not a persistent identifier scheme - {}'.format(self.fuji.id_scheme))
+            self.logger.warning('FsF-F1-02D : Not a persistent identifier scheme -: {}'.format(self.fuji.id_scheme))

         self.result.score = self.score
         self.result.metric_tests = self.metric_tests
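FsF-F1-02D leans on the idutils library (imported in fair_check.py earlier in this changeset) to classify identifier schemes. A quick illustration of the kind of check that yields pid_scheme (the identifier is one of the test PIDs from ex_evaluate.py):

    import idutils

    schemes = idutils.detect_identifier_schemes('http://doi.org/10.17882/42182')
    # e.g. ['doi', 'url']; a registry-bound scheme such as 'doi' counts as persistent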
diff --git a/fuji_server/evaluators/fair_evaluator_related_resources.py b/fuji_server/evaluators/fair_evaluator_related_resources.py
index a76547a0..e656f53c 100644
--- a/fuji_server/evaluators/fair_evaluator_related_resources.py
+++ b/fuji_server/evaluators/fair_evaluator_related_resources.py
@@ -29,10 +29,10 @@ class FAIREvaluatorRelatedResources(FAIREvaluator):

     def evaluate(self):
-        self.result = RelatedResource(id=self.fuji.count, metric_identifier=self.metric_identifier, metric_name=self.metric_name)
+        self.result = RelatedResource(id=self.metric_number, metric_identifier=self.metric_identifier, metric_name=self.metric_name)
         self.output = RelatedResourceOutput()

-        self.logger.info('{0} : Total number of related resources extracted - {1}'.format(self.metric_identifier,
+        self.logger.info('{0} : Total number of related resources extracted -: {1}'.format(self.metric_identifier,
                                                                                           len(self.fuji.related_resources)))

         # if self.metadata_merged.get('related_resources'):
@@ -40,7 +40,7 @@ def evaluate(self):
             # QC check: exclude potential incorrect relation
             self.fuji.related_resources = [item for item in self.fuji.related_resources if item.get('related_resource') != self.fuji.pid_url]
-            self.logger.log(self.fuji.LOG_SUCCESS, '{0} : Number of related resources after QC step - {1}'.format(self.metric_identifier, len(
+            self.logger.log(self.fuji.LOG_SUCCESS, '{0} : Number of related resources after QC step -: {1}'.format(self.metric_identifier, len(
                 self.fuji.related_resources)))

         if self.fuji.related_resources:
             # TODO include source of relation
diff --git a/fuji_server/evaluators/fair_evaluator_searchable.py b/fuji_server/evaluators/fair_evaluator_searchable.py
index e46c22cb..8bafaa51 100644
--- a/fuji_server/evaluators/fair_evaluator_searchable.py
+++ b/fuji_server/evaluators/fair_evaluator_searchable.py
@@ -30,13 +30,13 @@ class FAIREvaluatorSearchable(FAIREvaluator):

     def evaluate(self):
-        self.result = Searchable(id=self.fuji.count, metric_identifier=self.metric_identifier, metric_name=self.metric_name)
+        self.result = Searchable(id=self.metric_number, metric_identifier=self.metric_identifier, metric_name=self.metric_name)
         self.output = SearchableOutput()
         search_mechanisms = []
         sources_registry = [MetaDataCollector.Sources.DATACITE_JSON.value]
         all = str([e.value for e in MetaDataCollector.Sources]).strip('[]')
-        self.logger.info('FsF-F4-01M : Supported tests of metadata retrieval/extraction - {}'.format(all))
+        self.logger.info('FsF-F4-01M : Supported tests of metadata retrieval/extraction -: {}'.format(all))
         search_engines_support = [MetaDataCollector.Sources.SCHEMAORG_NEGOTIATE.value,
                                   MetaDataCollector.Sources.SCHEMAORG_EMBED.value,
                                   MetaDataCollector.Sources.DUBLINCORE.value,
@@ -49,7 +49,7 @@ def evaluate(self):
                 OutputSearchMechanisms(mechanism='structured data', mechanism_info=search_engine_support_match))
             self.logger.info('FsF-F4-01M : Metadata found through - structured data')
         else:
-            self.logger.warning('FsF-F4-01M : Metadata is NOT found through - {}'.format(search_engines_support))
+            self.logger.warning('FsF-F4-01M : Metadata is NOT found through -: {}'.format(search_engines_support))
         #TODO: replace this metadata format based test by real lookup at registries
         registry_support_match = list(set(dict(self.fuji.metadata_sources).keys()).intersection(sources_registry))
         if registry_support_match:
@@ -58,8 +58,7 @@ def evaluate(self):
                 OutputSearchMechanisms(mechanism='metadata registry', mechanism_info=registry_support_match))
             self.logger.info('FsF-F4-01M : Metadata found through - metadata registry')
         else:
-            self.logger.warning(
-                'FsF-F4-01M : Metadata is NOT found through registries considered by the assessment service - {}'.format(
+            self.logger.warning('FsF-F4-01M : Metadata is NOT found through registries considered by the assessment service -: {}'.format(
                 sources_registry))
         length = len(search_mechanisms)
         if length > 0:
diff --git a/fuji_server/evaluators/fair_evaluator_semantic_vocabulary.py b/fuji_server/evaluators/fair_evaluator_semantic_vocabulary.py
index 0069ffb3..1409e47f 100644
--- a/fuji_server/evaluators/fair_evaluator_semantic_vocabulary.py
+++ b/fuji_server/evaluators/fair_evaluator_semantic_vocabulary.py
@@ -30,14 +30,14 @@ class FAIREvaluatorSemanticVocabulary(FAIREvaluator):

     def evaluate(self):
-        self.result = SemanticVocabulary(id=self.fuji.count, metric_identifier=self.metric_identifier,
+        self.result = SemanticVocabulary(id=self.metric_number, metric_identifier=self.metric_identifier,
                                          metric_name=self.metric_name)

         # remove duplicates
         if self.fuji.namespace_uri:
             self.fuji.namespace_uri = list(set(self.fuji.namespace_uri))
             self.fuji.namespace_uri = [x.strip() for x in self.fuji.namespace_uri]
-        self.logger.info('{0} : Number of vocabulary namespaces extracted from all RDF-based metadata - {1}'.format(
+        self.logger.info('{0} : Number of vocabulary namespaces extracted from all RDF-based metadata -: {1}'.format(
            self.metric_identifier, len(self.fuji.namespace_uri)))

         # exclude white list
@@ -49,7 +49,7 @@ def evaluate(self):
         self.fuji.namespace_uri[:] = [x for x in self.fuji.namespace_uri if x not in excluded]
         if excluded:
             self.logger.info(
-                '{0} : Default vocabulary namespace(s) excluded - {1}'.format(self.metric_identifier, excluded))
+                '{0} : Default vocabulary namespace(s) excluded -: {1}'.format(self.metric_identifier, excluded))

         outputs = []
         score = 0
@@ -59,11 +59,11 @@ def evaluate(self):
             lod_namespaces = [d['namespace'] for d in self.fuji.VOCAB_NAMESPACES if 'namespace' in d]
             exists = list(set(lod_namespaces) & set(self.fuji.namespace_uri))
             self.logger.info(
-                '{0} : Check the remaining namespace(s) exists in LOD - {1}'.format(self.metric_identifier, exists))
+                '{0} : Check if the remaining namespace(s) exist in LOD -: {1}'.format(self.metric_identifier, exists))
             if exists:
                 score = self.total_score
                 self.setEvaluationCriteriumScore('FsF-I1-02M-1', 1, 'pass')
-                self.logger.log(self.fuji.LOG_SUCCESS, '{0} : Namespace matches found - {1}'.format(self.metric_identifier, exists))
+                self.logger.log(self.fuji.LOG_SUCCESS, '{0} : Namespace matches found -: {1}'.format(self.metric_identifier, exists))
                 for e in exists:
                     outputs.append(SemanticVocabularyOutputInner(namespace=e, is_namespace_active=True))
             else:
@@ -71,12 +71,10 @@ def evaluate(self):
             not_exists = [x for x in self.fuji.namespace_uri if x not in exists]
             if not_exists:
-                self.logger.warning(
-                    '{0} : Vocabulary namespace (s) specified but no match is found in LOD reference list - {1}'.format(
+                self.logger.warning('{0} : Vocabulary namespace(s) specified but no match is found in LOD reference list -: {1}'.format(
                     self.metric_identifier, not_exists))
         else:
-            self.logger.warning(
-                '{0} : NO namespaces of semantic vocabularies found in the metadata'.format(self.metric_identifier))
+            self.logger.warning('{0} : NO namespaces of semantic vocabularies found in the metadata'.format(self.metric_identifier))

         if score > 0:
             test_status = 'pass'
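The FsF-I1-02M check above boils down to a set intersection between the namespaces harvested from the metadata and a reference list of LOD vocabularies. A standalone sketch of that matching step follows, with a tiny made-up reference list standing in for F-UJI's VOCAB_NAMESPACES.

# Minimal sketch of the LOD namespace matching used for FsF-I1-02M.
# The reference list below is made up; F-UJI loads its own VOCAB_NAMESPACES.
vocab_namespaces = [
    {'namespace': 'http://purl.org/dc/terms/'},
    {'namespace': 'http://www.w3.org/ns/dcat#'},
]
extracted = ['http://www.w3.org/ns/dcat#', 'http://example.org/internal#']

lod = {d['namespace'] for d in vocab_namespaces if 'namespace' in d}
matches = sorted(lod & {ns.strip() for ns in extracted})
unmatched = sorted(set(extracted) - lod)

print(matches)    # ['http://www.w3.org/ns/dcat#'] -> the metric test would pass
print(unmatched)  # ['http://example.org/internal#']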
diff --git a/fuji_server/evaluators/fair_evaluator_standardised_protocol_data.py b/fuji_server/evaluators/fair_evaluator_standardised_protocol_data.py
index 36ef56a2..18ebfd25 100644
--- a/fuji_server/evaluators/fair_evaluator_standardised_protocol_data.py
+++ b/fuji_server/evaluators/fair_evaluator_standardised_protocol_data.py
@@ -32,7 +32,7 @@ class FAIREvaluatorStandardisedProtocolData(FAIREvaluator):

     def evaluate(self):
-        self.result = StandardisedProtocolData(id=self.fuji.count, metric_identifier=self.metric_identifier,
+        self.result = StandardisedProtocolData(id=self.metric_number, metric_identifier=self.metric_identifier,
                                                metric_name=self.metric_name)
         metadata_output = data_output = None
         metadata_required = Mapper.REQUIRED_CORE_METADATA.value
diff --git a/fuji_server/evaluators/fair_evaluator_standardised_protocol_metadata.py b/fuji_server/evaluators/fair_evaluator_standardised_protocol_metadata.py
index e0c1ba44..8d864c1d 100644
--- a/fuji_server/evaluators/fair_evaluator_standardised_protocol_metadata.py
+++ b/fuji_server/evaluators/fair_evaluator_standardised_protocol_metadata.py
@@ -32,7 +32,7 @@ class FAIREvaluatorStandardisedProtocolMetadata(FAIREvaluator):

     def evaluate(self):
-        self.result = StandardisedProtocolMetadata(id=self.fuji.count, metric_identifier=self.metric_identifier,
+        self.result = StandardisedProtocolMetadata(id=self.metric_number, metric_identifier=self.metric_identifier,
                                                    metric_name=self.metric_name)
         metadata_output = data_output = None
         metadata_required = Mapper.REQUIRED_CORE_METADATA.value
@@ -48,7 +48,7 @@ def evaluate(self):
             self.logger.warning('FsF-A1-02M : No metadata given or found, therefore the protocol of given PID was not assessed. See: FsF-F2-01M')
         else:
             if metadata_url_scheme in self.fuji.STANDARD_PROTOCOLS:
-                self.logger.log(self.fuji.LOG_SUCCESS, 'FsF-A1-02M : Standard protocol for access to metadata found: ' + str(metadata_url_scheme))
+                self.logger.log(self.fuji.LOG_SUCCESS, 'FsF-A1-02M : Standard protocol for access to metadata found -: ' + str(metadata_url_scheme))
                 metadata_output = {metadata_url_scheme: self.fuji.STANDARD_PROTOCOLS.get(metadata_url_scheme)}
                 test_status = 'pass'
diff --git a/fuji_server/evaluators/fair_evaluator_unique_identifier.py b/fuji_server/evaluators/fair_evaluator_unique_identifier.py
index cc061518..5e036e9b 100644
--- a/fuji_server/evaluators/fair_evaluator_unique_identifier.py
+++ b/fuji_server/evaluators/fair_evaluator_unique_identifier.py
@@ -33,7 +33,7 @@ class FAIREvaluatorUniqueIdentifier(FAIREvaluator):

     def evaluate(self):
         # ======= CHECK IDENTIFIER UNIQUENESS =======
-        self.result = Uniqueness(id=self.fuji.count, metric_identifier=self.metric_identifier, metric_name=self.metric_name)
+        self.result = Uniqueness(id=self.metric_number, metric_identifier=self.metric_identifier, metric_name=self.metric_name)
         self.output = UniquenessOutput()
         schemes = [i[0] for i in idutils.PID_SCHEMES]
         self.logger.info('FsF-F1-01D : Using idutils schemes')
diff --git a/fuji_server/helper/metadata_collector_datacite.py b/fuji_server/helper/metadata_collector_datacite.py
index 79159f69..41ab8863 100644
--- a/fuji_server/helper/metadata_collector_datacite.py
+++ b/fuji_server/helper/metadata_collector_datacite.py
@@ -38,7 +38,7 @@ def __init__(self, mapping, pid_url=None, loggerinst=None):

     def parse_metadata(self):
         source_name = None
         dcite_metadata = {}
-        self.logger.info('FsF-F2-01M : Extract datacite metadata')
+        self.logger.info('FsF-F2-01M : Trying to retrieve datacite metadata')
         requestHelper = RequestHelper(self.pid_url, self.logger)
         requestHelper.setAcceptType(AcceptTypes.datacite_json)
         neg_source,ext_meta = requestHelper.content_negotiate('FsF-F2-01M')
@@ -58,7 +58,7 @@ def parse_metadata(self):
                         dcite_metadata['creator'] = names

                 if dcite_metadata.get('related_resources'):
-                    self.logger.info('FsF-I3-01M : {0} related resource(s) extracted from {1}'.format(
+                    self.logger.info('FsF-I3-01M : {0} related resource(s) extracted from -: {1}'.format(
                         len(dcite_metadata['related_resources']), source_name))
                     temp_rels = []
@@ -75,5 +75,5 @@ def parse_metadata(self):
                     flat = ', '.join(map(str, value))
                     dcite_metadata[key] = flat
         except Exception as e:
-            self.logger.exception('Failed to extract Datacite Json - {}'.format(e))
+            self.logger.exception('Failed to extract Datacite Json -: {}'.format(e))
         return source_name, dcite_metadata
diff --git a/fuji_server/helper/metadata_collector_dublincore.py b/fuji_server/helper/metadata_collector_dublincore.py
index 075f5560..e83ce81f 100644
--- a/fuji_server/helper/metadata_collector_dublincore.py
+++ b/fuji_server/helper/metadata_collector_dublincore.py
@@ -37,7 +37,7 @@ def parse_metadata(self):
         source = None
         if self.source_metadata is not None:
             try:
-                self.logger.info('FsF-F2-01M : Extract DublinCore metadata from html page')
+                #self.logger.info('FsF-F2-01M : Trying to extract DublinCore metadata from html page')
                 # get core metadata from dublin core meta tags:
                 # < meta name = "DCTERMS.element" content = "Value" / >
                 # meta_dc_matches = re.findall(']*)name=\"(DC|DCTERMS)?\.([a-z]+)\"(.*?)content=\"(.*?)\"',self.landing_html)
@@ -56,7 +56,7 @@ def parse_metadata(self):
                             meta_dc_matches.append([dc_name_parts[1],dc_t,meta_tag.get('content')])
                 #meta_dc_matches = re.findall(exp, self.source_metadata)
             except Exception as e:
-                self.logger.exception('Parsing error, failed to extract DublinCore - {}'.format(e))
+                self.logger.exception('Parsing error, failed to extract DublinCore -: {}'.format(e))
         if len(meta_dc_matches) > 0:
             self.namespaces.append('http://purl.org/dc/elements/1.1/')
             source = self.getEnumSourceNames().DUBLINCORE.value
@@ -112,7 +112,7 @@ def parse_metadata(self):
                     dc_core_metadata[elem] = v
         if dc_core_metadata.get('related_resources'):
             count = len([d for d in dc_core_metadata.get('related_resources') if d.get('related_resource')])
-            self.logger.info('FsF-I3-01M : {0} related resource(s) extracted from {1}'.format(count, source))
+            self.logger.info('FsF-I3-01M : Number of related resource(s) extracted -: {0} from {1}'.format(count, source))
         else:
             self.logger.info('FsF-I3-01M : No related resource(s) found in DublinCore metadata')
diff --git a/fuji_server/helper/metadata_collector_microdata.py b/fuji_server/helper/metadata_collector_microdata.py
index d95bf764..a97e32b7 100644
--- a/fuji_server/helper/metadata_collector_microdata.py
+++ b/fuji_server/helper/metadata_collector_microdata.py
@@ -18,7 +18,7 @@ def parse_metadata(self):
         ext_meta = self.source_metadata[0]

         if ext_meta is not None:
-            self.logger.info('FsF-F2-01M : Extract metadata from {}'.format(self.source_name))
+            self.logger.info('FsF-F2-01M : Trying to extract Microdata metadata from -: {}'.format(self.source_name))
             # TODO check syntax - not ending with /, type and @type
             # TODO (important) extend mapping to detect other pids (link to related entities)?
             # TODO replace check_context_type list context comparison by regex
@@ -32,7 +32,7 @@ def parse_metadata(self):
                 self.logger.info('FsF-F2-01M : Failed to parse non schema.org type Microdata')
         except Exception as err:
             #print(err.with_traceback())
-            self.logger.info('FsF-F2-01M : Failed to parse Microdata - {}'.format(err))
+            self.logger.info('FsF-F2-01M : Failed to parse Microdata -: {}'.format(err))
         else:
             self.logger.info('FsF-F2-01M : Could not identify Microdata metadata')
diff --git a/fuji_server/helper/metadata_collector_opengraph.py b/fuji_server/helper/metadata_collector_opengraph.py
index 44864b85..af600314 100644
--- a/fuji_server/helper/metadata_collector_opengraph.py
+++ b/fuji_server/helper/metadata_collector_opengraph.py
@@ -15,15 +15,18 @@ def parse_metadata(self):
         og_metadata = {}
         ext_meta = None
         if self.source_metadata:
-            self.logger.info('FsF-F2-01M : Extract metadata from {}'.format(self.source_name))
+            #self.logger.info('FsF-F2-01M : Trying to extract OpenGraph metadata from html page')
             self.source_name = self.getEnumSourceNames().OPENGRAPH.value
             ext_meta = dict(self.source_metadata[0].get('properties'))
             if ext_meta is not None:
-                self.logger.info('FsF-F2-01M : Found OpenGraph metadata')
                 for map_key, map_value in self.metadata_mapping.value.items():
-                    og_metadata[map_key] = ext_meta.get(map_value)
-                self.namespaces.append('http://ogp.me/ns#')
+                    if ext_meta.get(map_value):
+                        og_metadata[map_key] = ext_meta.get(map_value)
+                if len(og_metadata) > 0:
+                    self.logger.info('FsF-F2-01M : Found OpenGraph metadata -: ' + str(og_metadata.keys()))
+                    self.namespaces.append('http://ogp.me/ns#')
+                #else:
+                #    self.logger.info('FsF-F2-01M : Non-metadata OpenGraph properties -: ' + str(ext_meta))
             else:
                 self.logger.info('FsF-F2-01M : Could not identify OpenGraph metadata')
         return self.source_name, og_metadata
\ No newline at end of file
diff --git a/fuji_server/helper/metadata_collector_rdf.py b/fuji_server/helper/metadata_collector_rdf.py
index 49cbad7d..704f4b26 100644
--- a/fuji_server/helper/metadata_collector_rdf.py
+++ b/fuji_server/helper/metadata_collector_rdf.py
@@ -45,7 +45,7 @@ def __init__(self, loggerinst, target_url, source, rdf_graph=None):

     def parse_metadata(self):
         #self.source_name = self.getEnumSourceNames().LINKED_DATA.value
-        self.logger.info('FsF-F2-01M : Extract metadata from {}'.format(self.source_name))
+        #self.logger.info('FsF-F2-01M : Trying to request RDF metadata from -: {}'.format(self.source_name))
         rdf_metadata = dict()
         if self.rdf_graph is None:
             #print(self.target_url)
@@ -66,7 +66,7 @@ def parse_metadata(self):
                     rdf_response = jsonldgraph.parse(data=json.dumps(rdf_response), format='json-ld')
                     rdf_response = jsonldgraph
                 except Exception as e:
-                    self.logger.info('FsF-F2-01M : Parsing error, failed to extract JSON-LD - {}'.format(e))
+                    self.logger.info('FsF-F2-01M : Parsing error, failed to extract JSON-LD -: {}'.format(e))
         else:
             neg_source, rdf_response = 'html', self.rdf_graph
@@ -86,7 +86,7 @@ def parse_metadata(self):
             for ns in rdf_response.namespaces():
                 self.namespaces.append(str(ns[1]))
         else:
-            self.logger.info('FsF-F2-01M : Expected RDF Graph but received - {0}'.format(self.content_type))
+            self.logger.info('FsF-F2-01M : Expected RDF Graph but received -: {0}'.format(self.content_type))
         return self.source_name, rdf_metadata

     def get_default_metadata(self,g):
@@ -100,12 +100,12 @@ def get_default_metadata(self,g):
                         meta[l] = str(v)
                         break
         except Exception as e:
-            self.logger.info('FsF-F2-01M : SPARQLing error - {}'.format(e))
+            self.logger.info('FsF-F2-01M : SPARQLing error -: {}'.format(e))
         if len(meta) <= 0:
             meta['object_type'] = 'Other'
             self.logger.info('FsF-F2-01M : Could not find metadata elements through generic SPARQL query on RDF')
         else:
-            self.logger.info('FsF-F2-01M : Found some metadata elements through generic SPARQL query on RDF: '+str(meta.keys()))
+            self.logger.info('FsF-F2-01M : Found some metadata elements through generic SPARQL query on RDF -: '+str(meta.keys()))
         return meta

     #TODO rename to: get_core_metadata
@@ -153,6 +153,7 @@ def get_ontology_metadata(self, graph):
         return ont_metadata

     def get_dcat_metadata(self, graph):
+        dcat_metadata = dict()
         DCAT = Namespace("http://www.w3.org/ns/dcat#")
@@ -193,7 +194,7 @@ def get_dcat_metadata(self, graph):
                     dcat_metadata['object_content_identifier'].append({'url':str(durl),'type':str(dtype), 'size':dsize})

             if dcat_metadata['object_content_identifier']:
-                self.logger.info('FsF-F3-01M : Found data links in DCAT.org metadata : ' + str(dcat_metadata['object_content_identifier']))
+                self.logger.info('FsF-F3-01M : Found data links in DCAT.org metadata -: ' + str(dcat_metadata['object_content_identifier']))
                 #TODO: add provenance metadata retrieval
         else:
             self.logger.info('FsF-F2-01M : Found DCAT content but could not correctly parse metadata')
diff --git a/fuji_server/helper/metadata_collector_schemaorg.py b/fuji_server/helper/metadata_collector_schemaorg.py
index 86bf20f9..480049a7 100644
--- a/fuji_server/helper/metadata_collector_schemaorg.py
+++ b/fuji_server/helper/metadata_collector_schemaorg.py
@@ -40,10 +40,7 @@ def parse_metadata(self, ls=None):
         if self.source_metadata:
             self.source_name = self.getEnumSourceNames().SCHEMAORG_EMBED.value
             ext_meta = self.source_metadata[0]
-        else:
-            #if self.is_pid:
-            # in case use_datacite id false use the landing page URL for content negotiation, otherwise the pid url
-
+        elif self.pid_url:
             self.source_name = self.getEnumSourceNames().SCHEMAORG_NEGOTIATE.value
             # TODO (IMPORTANT) PID agency may support Schema.org in JSON-LD
             # TODO (IMPORTANT) validate schema.org
@@ -53,7 +50,7 @@ def parse_metadata(self, ls=None):
             neg_source,ext_meta = requestHelper.content_negotiate('FsF-F2-01M')

         if ext_meta is not None:
-            self.logger.info('FsF-F2-01M : Extract metadata from {}'.format(self.source_name))
+            self.logger.info('FsF-F2-01M : Trying to extract schema.org JSON-LD metadata from -: {}'.format(self.source_name))
             # TODO check syntax - not ending with /, type and @type
             # TODO (important) extend mapping to detect other pids (link to related entities)?
             check_context_type = ["Dataset", "Collection"]
@@ -64,9 +61,9 @@ def parse_metadata(self, ls=None):
                 if str(ext_meta['@type']).lower() not in self.SCHEMA_ORG_CONTEXT:
                     self.logger.info('FsF-F2-01M : Found JSON-LD but seems not to be a schema.org object based on the given context type')
                 elif ext_meta['@type'] not in check_context_type:
-                    self.logger.info('FsF-F2-01M : Found JSON-LD but seems not to be a research data object')
+                    self.logger.info('FsF-F2-01M : Found schema.org JSON-LD but seems not to be a research data object')
                 else:
-                    self.logger.info('FsF-F2-01M : Found JSON-LD which seems to be valid, based on the given context type')
+                    self.logger.info('FsF-F2-01M : Found schema.org JSON-LD which seems to be valid, based on the given context type')
                     self.namespaces.append('http://schema.org/')
                 jsnld_metadata = jmespath.search(self.metadata_mapping.value, ext_meta)
@@ -85,7 +82,7 @@ def parse_metadata(self, ls=None):
                 #TODO instead of custom check there should a valdiator to evaluate the whole schema.org metadata
                 invalid_license = False
                 if jsnld_metadata.get('license'):
-                    self.logger.info('FsF-R1.1-01M : License metadata found (schema.org) - {}'.format(
+                    self.logger.info('FsF-R1.1-01M : License metadata found (schema.org) -: {}'.format(
                         jsnld_metadata.get('license')))
                     if isinstance(jsnld_metadata.get('license'), list):
@@ -111,11 +108,13 @@ def parse_metadata(self, ls=None):
                     relateds = [d for d in jsnld_metadata['related_resources'] if d['related_resource'] is not None]
                     if relateds:
                         jsnld_metadata['related_resources'] = relateds
-                        self.logger.info('FsF-I3-01M : {0} related resource(s) extracted from {1}'.format(len(jsnld_metadata['related_resources']), self.source_name))
+                        self.logger.info('FsF-I3-01M : {0} related resource(s) extracted from -: {1}'.format(len(jsnld_metadata['related_resources']), self.source_name))
                     else:
                         del jsnld_metadata['related_resources']
                         self.logger.info('FsF-I3-01M : No related resource(s) found in Schema.org metadata')
+
+                # TODO quick-fix, expand mapping expression instead
                 if jsnld_metadata.get('object_size'):
                     jsnld_metadata['object_size'] = str(jsnld_metadata['object_size'].get('value')) + ' ' + jsnld_metadata['object_size'].get('unitText')
@@ -125,7 +124,7 @@ def parse_metadata(self, ls=None):

             except Exception as err:
                 #print(err.with_traceback())
-                self.logger.info('FsF-F2-01M : Failed to parse JSON-LD schema.org - {}'.format(err))
+                self.logger.info('FsF-F2-01M : Failed to parse JSON-LD schema.org -: {}'.format(err))
         else:
             self.logger.info('FsF-F2-01M : Could not identify JSON-LD schema.org metadata')
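The schema.org collector above relies on jmespath.search with the long mapping expression from Mapper to flatten JSON-LD into F-UJI's internal fields. Here is a toy illustration of the same technique with a much smaller, made-up expression and made-up input; the real mapping is the one shown in the metadata_mapper.py hunk below.

import jmespath

# Made-up, simplified mapping expression; F-UJI's real SCHEMAORG_MAPPING is
# far longer and also handles @reverse relations and distributions.
expression = ('{title: name, license: license, '
              'related_resources: [{related_resource: isPartOf."@id", '
              "relation_type: 'isPartOf'}]}")

jsonld = {
    '@type': 'Dataset',
    'name': 'Example dataset',
    'license': 'https://creativecommons.org/licenses/by/4.0/',
    'isPartOf': {'@id': 'https://doi.org/10.1234/collection'},  # hypothetical DOI
}

print(jmespath.search(expression, jsonld))
# {'title': 'Example dataset', 'license': 'https://...', 'related_resources': [
#  {'related_resource': 'https://doi.org/10.1234/collection', 'relation_type': 'isPartOf'}]}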
diff --git a/fuji_server/helper/metadata_collector_xml.py b/fuji_server/helper/metadata_collector_xml.py
index b052d9f4..7110c36b 100644
--- a/fuji_server/helper/metadata_collector_xml.py
+++ b/fuji_server/helper/metadata_collector_xml.py
@@ -46,13 +46,13 @@ def parse_metadata(self):
         dc_core_metadata = None
         requestHelper = RequestHelper(self.target_url, self.logger)
         requestHelper.setAcceptType(AcceptTypes.xml)
-        self.logger.info('FsF-F2-01M : Trying to access metadata from: {}'.format(self.target_url))
+        #self.logger.info('FsF-F2-01M : Sending request to access metadata from -: {}'.format(self.target_url))
         neg_source, xml_response = requestHelper.content_negotiate('FsF-F2-01M')
         if requestHelper.getHTTPResponse() is not None:
-            self.logger.info('FsF-F2-01M : Extract metadata from {}'.format(source_name))
+            self.logger.info('FsF-F2-01M : Trying to extract/parse metadata from -: {}'.format(source_name))
             #dom = lxml.html.fromstring(self.landing_html.encode('utf8'))
             if neg_source != 'xml':
-                self.logger.info('FsF-F2-01M : Expected XML but content negotiation responded: '+str(neg_source))
+                self.logger.info('FsF-F2-01M : Expected XML but content negotiation responded -: '+str(neg_source))
             else:
                 tree = lxml.etree.XML(xml_response)
                 schema_locations = set(tree.xpath("//*/@xsi:schemaLocation", namespaces={'xsi': XSI}))
diff --git a/fuji_server/helper/metadata_mapper.py b/fuji_server/helper/metadata_mapper.py
index ef43e0fd..eab24ff2 100644
--- a/fuji_server/helper/metadata_mapper.py
+++ b/fuji_server/helper/metadata_mapper.py
@@ -89,7 +89,8 @@ class Mapper(Enum):
                        'access_level: conditionsOfAccess, ' \
                        'access_free: (isAccessibleForFree || free), ' \
                        'measured_variable: variableMeasured[*].name || variableMeasured , object_size: size,' \
-                       'related_resources: [{related_resource: isPartOf."@id" || isPartOf.url || isPartOf, relation_type: \'isPartOf\'}, {related_resource: "@reverse".isBasedOn."@id" || "@reverse".isBasedOn.url || isBasedOn , relation_type: \'isBasedOn\'} ], ' \
+                       'related_resources: [{related_resource: isPartOf."@id" || isPartOf.url || isPartOf, relation_type: \'isPartOf\'}, ' \
+                       '{related_resource: "@reverse".isBasedOn."@id" || "@reverse".isBasedOn.url || isBasedOn , relation_type: \'isBasedOn\'} ], ' \
                        'object_content_identifier: (distribution[*].{url: contentUrl, type: (encodingFormat || fileFormat), size: (contentSize || fileSize), profile: schemaVersion} || [distribution.{url: contentUrl, type: (encodingFormat || fileFormat), size: (contentSize || fileSize), profile: schemaVersion}])}'

 #                      'related_resources: [{related_resource: isPartOf, relation_type: \'isPartOf\'}, {related_resource: isBasedOn, relation_type: \'isBasedOn\'}], ' \
diff --git a/fuji_server/helper/metadata_provider_oai.py b/fuji_server/helper/metadata_provider_oai.py
index 0ae53b4f..ffd4bc4e 100644
--- a/fuji_server/helper/metadata_provider_oai.py
+++ b/fuji_server/helper/metadata_provider_oai.py
@@ -61,7 +61,7 @@ def getMetadataStandards(self):
                     if not any(s in metadata_schema for s in filter):
                         schemas[metadata_prefix] = [metadata_schema]
                     else:
-                        self.logger.info('{0} : Skipped domain-agnostic standard listed in OAI-PMH endpoint - {1}'.format(self.metric_id,metadata_prefix))
+                        self.logger.info('{0} : Skipped domain-agnostic standard listed in OAI-PMH endpoint -: {1}'.format(self.metric_id,metadata_prefix))
             except:
                 self.logger.info(
                     '{0} : Could not parse XML response retrieved from OAI-PMH endpoint'.format(self.metric_id))
diff --git a/fuji_server/helper/metadata_provider_sparql.py b/fuji_server/helper/metadata_provider_sparql.py
index 71ce08a3..81183d95 100644
--- a/fuji_server/helper/metadata_provider_sparql.py
+++ b/fuji_server/helper/metadata_provider_sparql.py
@@ -37,7 +37,7 @@ def getMetadata(self, queryString):
             response = wrapper.query()  #application/rdf+xml
             content_type = response.info()['content-type'].split(';')[0]
             if 'html' in content_type:
-                self.logger.warning('{0} : Looks like not a valid SPARQL endpoint, content type - {1} '.format(self.metric_id, content_type))
+                self.logger.warning('{0} : Does not look like a valid SPARQL endpoint, content type -: {1} '.format(self.metric_id, content_type))
             else:
                 rdf_graph = response.convert()  #rdflib.graph.ConjunctiveGraph
                 #print(rdf_graph.serialize(format='xml'))
@@ -46,15 +46,15 @@ def getMetadata(self, queryString):
             # a SPARQL Results Document in XML, JSON, or CSV/TSV format (for SPARQL Query forms SELECT and ASK); or
             # an RDF graph [RDF-CONCEPTS] serialized, for example, in the RDF/XML syntax [RDF-XML], or an equivalent RDF
             # graph serialization, for SPARQL Query forms DESCRIBE and CONSTRUCT
             if isinstance(rdf_graph, rdflib.graph.Graph) and len(rdf_graph) > 0:
-                self.logger.info('{0} : {1} of triples found in the graph, format - {2}'.format(self.metric_id, len(rdf_graph), content_type))
+                self.logger.info('{0} : Number of triples found in the graph -: {1}, format -: {2}'.format(self.metric_id, len(rdf_graph), content_type))
                 for n in rdf_graph.namespaces():
                     self.namespaces.append(str(n[1]))
             else:
                 self.logger.warning('{0} : SPARQL query returns NO result.'.format(self.metric_id))
         except HTTPError as err1:
-            self.logger.warning('{0} : HTTPError - {1}'.format(self.metric_id, err1))
+            self.logger.warning('{0} : HTTPError -: {1}'.format(self.metric_id, err1))
         except SPARQLExceptions.EndPointNotFound as err2:
-            self.logger.warning('{0} : SPARQLExceptions - {1}'.format(self.metric_id, err2))
+            self.logger.warning('{0} : SPARQLExceptions -: {1}'.format(self.metric_id, err2))
         return rdf_graph, content_type

     def getNamespaces(self):
diff --git a/fuji_server/helper/repository_helper.py b/fuji_server/helper/repository_helper.py
index 698c0c87..d635fd93 100644
--- a/fuji_server/helper/repository_helper.py
+++ b/fuji_server/helper/repository_helper.py
@@ -42,7 +42,7 @@ def __init__(self, client, pidscheme):
         self.repo_apis = {}
         self.repo_standards = []
         self.logger = Preprocessor.logger  #logging.getLogger(__name__)
-        print(__name__)
+        #print(__name__)

     def lookup_re3data(self):
         if self.client_id and self.pid_scheme:
             re3doi = RepositoryHelper.DATACITE_REPOSITORIES.get(self.client_id)  # {client_id,re3doi}
diff --git a/fuji_server/helper/request_helper.py b/fuji_server/helper/request_helper.py
index 920405c1..2ced0dab 100644
--- a/fuji_server/helper/request_helper.py
+++ b/fuji_server/helper/request_helper.py
@@ -97,7 +97,7 @@ def content_decode(self,content):
         a=1
         return True

-    def content_negotiate(self, metric_id=''):
+    def content_negotiate(self, metric_id='', ignore_html=True):
         #TODO: not necessarily to be done with the landing page e.g. http://purl.org/vocommons/voaf resolves to a version URL which responds HTML instead of RDF
         self.metric_id = metric_id
         source = 'html'
@@ -155,9 +155,13 @@ def content_negotiate(self, metric_id=''):
             for at in AcceptTypes:  #e.g., at.name = html, at.value = 'text/html, application/xhtml+xml'
                 if content_type in at.value:
                     if at.name == 'html':
-                        self.logger.info('%s : Found HTML page!' % metric_id)
-
-                        self.parse_response = self.parse_html(self.response_content.decode(self.response_charset))
+                        # since we already parse HTML from the landing page, ignore this response and do not parse again
+                        if not ignore_html:
+                            self.logger.info('%s : Found HTML page!' % metric_id)
+                            self.parse_response = self.parse_html(self.response_content.decode(self.response_charset))
+                        else:
+                            self.logger.info('%s : Ignoring HTML response' % metric_id)
+                            self.parse_response = None
                         source = 'html'
                         break
                     if at.name == 'xml':  # TODO other types (xml)
@@ -192,7 +196,7 @@ def content_negotiate(self, metric_id=''):
             else:
                 self.logger.warning('{0} : Content-type is NOT SPECIFIED'.format(metric_id))
         else:
-            self.logger.warning('{0} : NO successful response received, status code - {1}'.format(metric_id, str(status_code)))
+            self.logger.warning('{0} : NO successful response received, status code -: {1}'.format(metric_id, str(status_code)))
         #except requests.exceptions.SSLError as e:
         except urllib.error.HTTPError as e:
             #self.logger.warning('%s : SSL Error: Untrusted SSL certificate, failed to connect to %s ' % (metric_id, self.request_url))
@@ -201,12 +205,12 @@ def content_negotiate(self, metric_id=''):
             #except requests.exceptions.RequestException as e:
             #All exceptions that Requests explicitly raises inherit from requests.exceptions.RequestException
             #self.logger.warning('%s : Request Error: Failed to connect to %s ' % (metric_id, self.request_url))
-            self.logger.warning('%s : Content negotiation failed: accept=%s, status=%s ' % (metric_id, self.accept_type, str(e.code)))
+            self.logger.warning('%s : Content negotiation failed -: accept=%s, status=%s ' % (metric_id, self.accept_type, str(e.code)))
             #self.logger.exception("{} : RequestException: {}".format(metric_id, e))
             #traceback.print_exc()
             #self.logger.exception('%s : Failed to connect to %s ' % (metric_id, self.request_url))
         except urllib.error.URLError as e:
-            self.logger.warning("{} : RequestException: {} : {}".format(metric_id, e.reason, self.request_url))
+            self.logger.warning("{} : RequestException -: {} : {}".format(metric_id, e.reason, self.request_url))
             #self.logger.warning('%s : Content negotiation failed: accept=%s, status=%s ' % (metric_id, self.accept_type, str(e.code)))
         return source, self.parse_response
@@ -214,10 +218,11 @@ def parse_html(self, html_texts):
         # extract contents from the landing page using extruct, which returns a dict with
         # keys 'json-ld', 'microdata', 'microformat','opengraph','rdfa'
         try:
+            #print(html_texts.encode('utf8'))
             extracted = extruct.extract(html_texts.encode('utf8'))
-        except:
+        except Exception as e:
             extracted = None
-            self.logger.warning('%s : Failed to perform parsing on microdata or JSON %s' % (self.metric_id, self.request_url))
+            self.logger.warning('%s : Failed to parse HTML embedded microdata or JSON -: %s' % (self.metric_id, self.request_url + ' ' + str(e)))
         #filtered = {k: v for k, v in extracted.items() if v}
         return extracted
@@ -227,13 +232,13 @@ def parse_rdf(self, response, type):
         # https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html#rdflib.graph.Graph.parse
         graph = None
         try:
-            self.logger.info('%s : Try to parse RDF from %s' % (self.metric_id, self.request_url))
+            self.logger.info('%s : Try to parse RDF from -: %s' % (self.metric_id, self.request_url))
             graph = rdflib.Graph()
             graph.parse(data=response, format=type)
             #queries have to be done in specific metadata collector classes
         except:
             error = sys.exc_info()[0]
-            self.logger.warning('%s : Failed to parse RDF %s %s' % (self.metric_id, self.request_url, str(error)))
+            self.logger.warning('%s : Failed to parse RDF -: %s %s' % (self.metric_id, self.request_url, str(error)))
             self.logger.debug(error)
         return graph
@@ -254,7 +259,7 @@ def parse_xml(self, response, type):
         # TODO: implement a generic XML parsing which checks
         # domain specific document schema and performs a XSLT to get metadata elements
         # write some domain specific XSLTs and/or parsers
-        self.logger.info('%s : Try to parse XML from %s' % (self.metric_id, self.request_url))
+        self.logger.info('%s : Try to parse XML from -: %s' % (self.metric_id, self.request_url))
         self.logger.warning('%s : Domain specific XML parsing not yet implemented ' % (self.metric_id,))
         #print('Not yet implemented')
         return None
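With the new ignore_html keyword (default True), content_negotiate skips re-parsing HTML responses, since embedded markup is already extracted once from the landing page; only the PID evaluator opts back in with ignore_html=False. A hedged usage sketch of the two call patterns, under the assumption that RequestHelper and AcceptTypes are importable from the module changed above; the URL is made up.

import logging
from fuji_server.helper.request_helper import RequestHelper, AcceptTypes  # assumed import path

logger = logging.getLogger('fuji-example')

helper = RequestHelper('https://example.org/dataset/123', logger)  # example URL
helper.setAcceptType(AcceptTypes.html)

# Default behaviour: an HTML response is acknowledged but not parsed again.
source, parsed = helper.content_negotiate('FsF-F2-01M')

# PID check: parse the HTML landing page once, to feed the extruct results.
source, extruct_result = helper.content_negotiate('FsF-F1-02D', ignore_html=False)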
diff --git a/fuji_server/yaml/metrics_v0.4.yaml b/fuji_server/yaml/metrics_v0.4.yaml
index 2c2d5fce..04fb8420 100644
--- a/fuji_server/yaml/metrics_v0.4.yaml
+++ b/fuji_server/yaml/metrics_v0.4.yaml
@@ -2,6 +2,7 @@ metrics:
 ## ---------------- FINDABILITY ---------------- ##
 - metric_identifier: FsF-F1-01D
+  metric_number: 1
   metric_name: Data is assigned a globally unique identifier.
   description: A data object may be assigned with a globally unique identifier such that it can be referenced unambiguously by humans or machines. Globally unique means an identifier should be associated with only one resource at any time. Examples of unique identifiers of data are Internationalized Resource Identifier (IRI), Uniform Resource Identifier (URI) such as URL and URN, Digital Object Identifier (DOI), the Handle System, identifiers.org, w3id.org and Archival Resource Key (ARK). A data repository may assign a globally unique identifier to your data or metadata when you publish and make it available through their services.
   fair_principle: F1
@@ -17,6 +18,7 @@ metrics:
   total_score: 1

 - metric_identifier: FsF-F1-02D
+  metric_number: 2
   metric_short_name: Persistent Identifier
   metric_name: Data is assigned a persistent identifier.
   description: We make a distinction between the uniqueness and persistence of an identifier. An HTTP URL (the address of a given unique resource on the web) is globally unique, but may not be persistent as the URL of data may be not accessible (link rot problem) or the data available under the original URL may be changed (content drift problem). Identifiers based on the Handle System, DOI, ARK are both globally unique and persistent. They are maintained and governed such that they remain stable and resolvable for the long term. The persistent identifier (PID) of a data object may be resolved (point) to a landing page with metadata containing further information on how to access the data content, in some cases a downloadable artefact, or none if the data or repository is no longer maintained. Therefore, ensuring persistence is a shared responsibility between a PID service provider (e.g., datacite) and its clients (e.g., data repositories). For example, the DOI system guarantees the persistence of its identifiers through its social (e.g., policy) and technical infrastructures, whereas a data provider ensures the availability of the resource (e.g., landing page, downloadable artefact) associated with the identifier.
@@ -37,6 +39,7 @@ metrics:
   total_score: 1

 - metric_identifier: FsF-F2-01M
+  metric_number: 3
   metric_short_name: Descriptive Core Metadata
   metric_name: Metadata includes descriptive core elements (creator, title, data identifier, publisher, publication date, summary and keywords) to support data findability.
   description: Metadata is descriptive information about a data object. Since the metadata required differs depending on the users and their applications, this metric focuses on core metadata. The core metadata is the minimum descriptive information required to enable data finding, including citation which makes it easier to find data. We determine the required metadata based on common data citation guidelines (e.g., DataCite, ESIP, and IASSIST), and metadata recommendations for data discovery (e.g., EOSC Datasets Minimum Information (EDMI), DataCite Metadata Schema, W3C Recommendation Data on the Web Best Practices and Data Catalog Vocabulary). This metric focuses on domain-agnostic core metadata. Domain or discipline-specific metadata specifications are covered under metric FsF-R1.3-01M. A repository should adopt a schema that includes properties of core metadata, whereas data authors should take the responsibility of providing core metadata.
@@ -77,6 +80,7 @@ metrics:
       passed: false

 - metric_identifier: FsF-F3-01M
+  metric_number: 4
   metric_short_name: Inclusion of Data Identifier in Metadata
   metric_name: Metadata includes the identifier of the data it describes.
   description: The metadata should explicitly specify the identifier of the data such that users can discover and access the data through the metadata. If the identifier specified is persistent and points to a landing page, the data identifier and links to download the data content should be taken into account in the assessment.
@@ -93,6 +97,7 @@ metrics:
   total_score: 1

 - metric_identifier: FsF-F4-01M
+  metric_number: 5
   metric_short_name: Searchable Metadata
   metric_name: Metadata is offered in such a way that it can be retrieved programmatically.
   description: This metric refers to ways through which the metadata of data is exposed or provided in a standard and machine-readable format. Assessing this metric will require an understanding of the capabilities offered by the data repository used to host the data. Metadata may be available through multiple endpoints. For example, if data is hosted by a repository, the repository may disseminate its metadata through a metadata harvesting protocol (e.g., via OAI-PMH) and/or a web service. Metadata may also be embedded as structured data on a data page for use by web search engines such as Google and Bing or be available as linked (open) data.
@@ -112,6 +117,7 @@ metrics:
   total_score: 2

 - metric_identifier: FsF-A1-01M
+  metric_number: 6
   metric_short_name: Data Access Information
   metric_name: Metadata contains access level and access conditions of the data.
   description: This metric determines if the metadata includes the level of access to the data such as public, embargoed, restricted, or metadata-only access and its access conditions. Both access level and conditions are necessary information to potentially gain access to the data. It is recommended that data should be as open as possible and as closed as necessary. There are no access conditions for public data. Datasets should be released into the public domain (e.g., with an appropriate public-domain-equivalent license such as Creative Commons CC0 licence) and openly accessible without restrictions when possible. Embargoed access refers to data that will be made publicly accessible at a specific date which should be specified in the metadata. For example, a data author may release their data after having published their findings from the data. Therefore, access conditions such as the date the data will be released publically is essential. Restricted access refers to data that can be accessed under certain conditions (e.g. because of commercial, sensitive, or other confidentiality reasons or the data is only accessible via a subscription or a fee). Restricted data may be available to a particular group of users or after permission is granted. For restricted data, the metadata should include the conditions of access to the data such as point of contact or instructions to access the data. Metadata-only access refers to data that is not made publicly available and for which only metadata is publicly available.
@@ -128,6 +134,7 @@ metrics:
   total_score: 1

 - metric_identifier: FsF-A1-03D
+  metric_number: 8
   metric_short_name: Standardized Communication Protocol of Data
   metric_name: Data is accessible through a standardized communication protocol.
   description: Given an identifier of a dataset, the dataset should be retrievable using a standard communication protocol such as HTTP, HTTPS, FTP, TFTP, SFTP, FTAM and AtomPub. Avoid disseminating data using a proprietary protocol.
@@ -144,6 +151,7 @@ metrics:
   total_score: 1

 - metric_identifier: FsF-A1-02M
+  metric_number: 7
   metric_short_name: Standardized Communication Protocol of Metadata
   metric_name: Metadata is accessible through a standardized communication protocol.
   description: Given an identifier of a dataset, the metadata of the dataset should be retrievable using a standard communication protocol such as HTTP, HTTPS, FTP, TFTP, SFTP, FTAM and AtomPub. Avoid disseminating data using a proprietary protocol.
@@ -160,6 +168,7 @@ metrics:
   total_score: 1

 - metric_identifier: FsF-A2-01M
+  metric_number: 9
   metric_short_name: Metadata Preservation
   metric_name: Metadata remains available, even if the data is no longer available.
   description: This metric determines if the metadata will be preserved even when the data they represent are no longer available, replaced or lost.
@@ -176,6 +185,7 @@ metrics:
   total_score: 1

 - metric_identifier: FsF-I1-01M
+  metric_number: 10
   metric_short_name: Formal Representation of Metadata
   metric_name: Metadata is represented using a formal knowledge representation language.
   description: Knowledge representation is vital for machine-processing of the knowledge of a domain. Expressing the metadata of a data object using a formal knowledge representation will enable machines to process it in a meaningful way and enable more data exchange possibilities. Examples of knowledge representation languages are RDF, RDFS, and OWL. These languages may be serialized (written) in different formats. For instance, RDF/XML, RDFa, Notation3, Turtle, N-Triples and N-Quads, and JSON-LD are RDF serialization formats.
@@ -195,6 +205,7 @@ metrics:
   total_score: 2

 - metric_identifier: FsF-I1-02M
+  metric_number: 11
   metric_short_name: Metadata with Semantic Resources
   metric_name: Metadata uses semantic resources
   description: A metadata document or selected parts of the document may incorporate additional terms from semantic resources (also referred as semantic artefacts) so that the contents are unambiguous and can be processed automatically by machines. This enrichment facilitates enhanced data search and interoperability of data from different sources. Ontology, thesaurus, and taxonomy are kinds of semantic resources, and they come with varying degrees of expressiveness and computational complexity. Knowledge organization schemes such as thesaurus and taxonomy are semantically less formal than ontologies.
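FsF-A1-02M and FsF-A1-03D both reduce to checking the scheme of a resolvable URL against a whitelist of standard protocols, as the "metadata_url_scheme in self.fuji.STANDARD_PROTOCOLS" test in the evaluator hunks above does. A standalone sketch of that check follows; the whitelist entries here are illustrative, not F-UJI's actual STANDARD_PROTOCOLS table.

from urllib.parse import urlparse

# Illustrative protocol whitelist; F-UJI maintains its own STANDARD_PROTOCOLS map.
STANDARD_PROTOCOLS = {
    'http': 'Hypertext Transfer Protocol (HTTP)',
    'https': 'Hypertext Transfer Protocol Secure (HTTPS)',
    'ftp': 'File Transfer Protocol (FTP)',
}

def check_standard_protocol(url):
    # urlparse('https://example.org/x').scheme == 'https'
    scheme = urlparse(url).scheme
    if scheme in STANDARD_PROTOCOLS:
        return {scheme: STANDARD_PROTOCOLS[scheme]}
    return None

print(check_standard_protocol('https://example.org/dataset/123'))
# {'https': 'Hypertext Transfer Protocol Secure (HTTPS)'} -> test would pass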
@@ -211,6 +222,7 @@ metrics:
   total_score: 1

 - metric_identifier: FsF-I3-01M
+  metric_number: 12
   metric_short_name: Links to related entities
   metric_name: Metadata includes links between the data and its related entities.
   description: Linking data to its related entities will increase its potential for reuse. The linking information should be captured as part of the metadata. A dataset may be linked to its prior version, related datasets or resources (e.g. publication, physical sample, funder, repository, platform, site, or observing network registries). Links between data and its related entities should be expressed through relation types (e.g., DataCite Metadata Schema specifies relation types between research objects through the fields ‘RelatedIdentifier’ and ‘RelationType’), and preferably use persistent Identifiers for related entities (e.g., ORCID for contributors, DOI for publications, and ROR for institutions).
@@ -227,6 +239,7 @@ metrics:
   total_score: 1

 - metric_identifier: FsF-R1-01MD
+  metric_number: 13
   metric_short_name: Metadata of Data Content
   metric_name: Metadata specifies the content of the data.
   description: This metric evaluates if a description (properties) of the content of the data is specified in the metadata. The description should be an accurate reflection of the actual data deposited. Data content descriptors include but are not limited to resource type (e.g., data or a collection of data), variable(s) measured or observed, method, data format and size. Ideally, ontological vocabularies should be used to describe data content to support interdisciplinary reuse.
@@ -258,6 +271,7 @@ metrics:
   total_score: 4

 - metric_identifier: FsF-R1.1-01M
+  metric_number: 14
   metric_short_name: Data Usage License
   metric_name: Metadata includes license information under which data can be reused.
   description: This metric evaluates if data is associated with a license because otherwise users cannot reuse it in a clear legal context. We encourage the application of licenses for all kinds of data whether public, restricted or for specific users. Without an explicit license, users do not have a clear idea of what can be done with your data. Licenses can be of standard type (Creative Commons, Open Data Commons Open Database License) or bespoke licenses, and rights statements which indicate the conditions under which data can be reused. It is highly recommended to use a standard, machine-readable license such that it can be interpreted by machines and humans. In order to inform users about what rights they have to use a dataset, the license information should be specified as part of the dataset’s metadata.
@@ -277,6 +291,7 @@ metrics:
   total_score: 2

 - metric_identifier: FsF-R1.2-01M
+  metric_number: 15
   metric_short_name: Data Provenance
   metric_name: Metadata includes provenance information about data creation or generation.
   description: Data provenance (also known as lineage) represents a dataset’s history, including the people, entities, and processes involved in its creation, management and longer-term curation. It is essential to provide provenance information about your data to provide valuable context and to enable informed use and reuse. The levels of provenance information needed can vary depending on the data type (e.g., measurement, observation, derived data, or data product) and research domains. For that reason, it is difficult to define a set of finite provenance properties that will be adequate for all domains. Based on existing work, we suggest that the following provenance properties of data generation or collection are included in the metadata record as a minimum.
@@ -302,6 +317,7 @@ metrics:
   total_score: 2

 - metric_identifier: FsF-R1.3-01M
+  metric_number: 16
   metric_short_name: Community-Endorsed Metadata Standard
   metric_name: Metadata follows a standard recommended by the target research community of the data.
   description: In addition to core metadata required to support data discovery (covered under metric FsF-F2-01M), metadata to support data reusability should be made available following community-endorsed metadata standards. Some communities have well-established metadata standards (e.g., geospatial [ISO19115], biodiversity [DarwinCore, ABCD, EML], social science [DDI], astronomy [International Virtual Observatory Alliance Technical Specifications]) while others have limited standards or standards that are under development (e.g., engineering and linguistics). The use of community-endorsed metadata standards is usually encouraged and supported by domain and discipline-specific repositories.
@@ -321,6 +337,7 @@ metrics:
   total_score: 1

 - metric_identifier: FsF-R1.3-02D
+  metric_number: 17
   metric_short_name: Data File format
   metric_name: Data is available in a file format recommended by the target research community.
   description: File formats refer to methods for encoding digital information. For example, CSV for tabular data, NetCDF for multidimensional data and GeoTIFF for raster imagery. Data should be made available in a file format that is backed by the research community to enable data sharing and reuse. Consider for example, file formats that are widely used and supported by the most commonly used software and tools. These formats also should be suitable for long-term storage and archiving, which are usually recommended by a data repository. The formats not only give a higher certainty that your data can be read in the future, but they will also help to increase the reusability and interoperability. Using community-endorsed formats enables data to be loaded directly into the software and tools used for data analysis. It makes it possible to easily integrate your data with other data using the same preferred format. The use of preferred formats will also help to transform the format to a newer one, in case a preferred format gets outdated.
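Since metric_number is assigned by hand (note that FsF-A1-03D carries number 8 while the later FsF-A1-02M entry carries 7, so file order and numbering intentionally diverge), a small consistency check against the YAML can catch duplicates or gaps early. The following sketch assumes only the top-level metrics list layout visible in the hunks above.

import yaml  # PyYAML

# Load the metrics catalogue changed in this diff; path relative to the
# repository root, layout as shown in the hunks above.
with open('fuji_server/yaml/metrics_v0.4.yaml') as f:
    metrics = yaml.safe_load(f)['metrics']

numbers = [m['metric_number'] for m in metrics]
assert len(numbers) == len(set(numbers)), 'duplicate metric_number found'
assert sorted(numbers) == list(range(1, len(numbers) + 1)), 'gap in metric numbering'

# Map identifier -> number, e.g. {'FsF-F1-01D': 1, ..., 'FsF-R1.3-02D': 17}
by_id = {m['metric_identifier']: m['metric_number'] for m in metrics}
print(by_id)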