diff --git a/data-generation/real-data.js b/data-generation/real-data.js index 87e087c92..5e790b7ea 100644 --- a/data-generation/real-data.js +++ b/data-generation/real-data.js @@ -58,6 +58,9 @@ const dois = [ '10.1175/BAMS-D-19-0337.1', '10.1186/s12966-019-0834-1', '10.1515/itit-2019-0040', + '10.2777/28598', + '10.2788/50967', + '10.2788/52504', '10.4233/uuid:4bb38399-9267-428f-b10a-80b86e101f23', '10.5194/egusphere-egu21-4805', '10.5194/ems2022-105', diff --git a/docker-compose.yml b/docker-compose.yml index 2afcbef9f..a149b666c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -157,7 +157,7 @@ services: scrapers: build: ./scrapers - image: rsd/scrapers:1.6.0 + image: rsd/scrapers:1.7.0 environment: # it uses values from .env file - POSTGREST_URL diff --git a/frontend/components/mention/ImportMentions/ImportReportBody.tsx b/frontend/components/mention/ImportMentions/ImportReportBody.tsx index ea818ac0e..e9592271c 100644 --- a/frontend/components/mention/ImportMentions/ImportReportBody.tsx +++ b/frontend/components/mention/ImportMentions/ImportReportBody.tsx @@ -1,7 +1,8 @@ +// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center // SPDX-FileCopyrightText: 2023 Dusan Mijatovic (Netherlands eScience Center) // SPDX-FileCopyrightText: 2023 Dusan Mijatovic (dv4all) -// SPDX-FileCopyrightText: 2023 Netherlands eScience Center // SPDX-FileCopyrightText: 2023 dv4all +// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) // // SPDX-License-Identifier: Apache-2.0 @@ -82,7 +83,7 @@ export default function ImportReportBody({initialResults,onCancel,onImport}: Bul return 'DOI not found' case 'unsupportedRA': return 'Registration agent (RA) is not supported' - case 'alredyImported': + case 'alreadyImported': return 'This publication is already imported' default: return 'Unknown error' diff --git a/frontend/components/mention/ImportMentions/apiImportMentions.tsx b/frontend/components/mention/ImportMentions/apiImportMentions.tsx index 
a33f918a6..282ddf735 100644 --- a/frontend/components/mention/ImportMentions/apiImportMentions.tsx +++ b/frontend/components/mention/ImportMentions/apiImportMentions.tsx @@ -10,7 +10,7 @@ import {useState} from 'react' import {extractSearchTerm} from '~/components/software/edit/mentions/utils' import {SearchResult} from '.' import {getMentionsByDoiFromRsd} from '~/utils/editMentions' -import {getDoiRAList, getItemsFromCrossref, getItemsFromDatacite} from '~/utils/getDOI' +import {getDoiRAList, getItemsFromCrossref, getItemsFromDatacite, getItemsFromOpenAlex} from '~/utils/getDOI' import {MentionItemProps} from '~/types/Mention' import {createJsonHeaders, extractReturnMessage} from '~/utils/fetchHelpers' import useEditMentionReducer from '../useEditMentionReducer' @@ -53,7 +53,7 @@ export async function validateInputList(doiList: string[], mentions: MentionItem const found = mentions.find(mention => mention.doi?.toLowerCase() === doi) if (found) { // flag item with DOI alredy processed - mentionResultPerDoi.set(doi, {doi ,status: 'alredyImported', include: false}) + mentionResultPerDoi.set(doi, {doi ,status: 'alreadyImported', include: false}) return false } return true @@ -96,6 +96,7 @@ export async function validateInputList(doiList: string[], mentions: MentionItem // classify dois by RA const crossrefDois: string[] = [] const dataciteDois: string[] = [] + const openalexDois: string[] = [] doiRas.forEach(doiRa => { const doi = doiRa.DOI.toLowerCase() if (typeof doiRa?.RA === 'undefined') { @@ -105,6 +106,8 @@ export async function validateInputList(doiList: string[], mentions: MentionItem crossrefDois.push(doi) } else if (doiRa.RA === 'DataCite') { dataciteDois.push(doi) + } else if (doiRa.RA === 'OP') { + openalexDois.push(doi) } else { mentionResultPerDoi.set(doi, {doi, status: 'unsupportedRA', include: false}) } @@ -142,6 +145,20 @@ export async function validateInputList(doiList: string[], mentions: MentionItem } }) + const openalexMentions = await 
getItemsFromOpenAlex(openalexDois) + openalexMentions.forEach(mention => { + if (mention.doi !== null) { + const doi = mention.doi.toLowerCase() + mentionResultPerDoi.set(doi, { + doi, + status: 'valid', + source: 'OpenAlex', + include: true, + mention + }) + } + }) + // flag dois that are not updated doisNotInDatabase.forEach(doi => { if (!mentionResultPerDoi.has(doi)) { diff --git a/frontend/components/mention/ImportMentions/index.tsx b/frontend/components/mention/ImportMentions/index.tsx index 1c558eb03..1994a8b97 100644 --- a/frontend/components/mention/ImportMentions/index.tsx +++ b/frontend/components/mention/ImportMentions/index.tsx @@ -1,7 +1,7 @@ +// SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center // SPDX-FileCopyrightText: 2023 Dusan Mijatovic (Netherlands eScience Center) // SPDX-FileCopyrightText: 2023 Dusan Mijatovic (dv4all) -// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2023 Netherlands eScience Center // SPDX-FileCopyrightText: 2023 dv4all // // SPDX-License-Identifier: Apache-2.0 @@ -23,9 +23,9 @@ import {DoiBulkImportReport, addMentions, linkMentionToEntity} from './apiImport export type SearchResult = { doi: string - status: 'valid' | 'invalidDoi' | 'doiNotFound' |'unsupportedRA' | 'alredyImported' | 'unknown', + status: 'valid' | 'invalidDoi' | 'doiNotFound' |'unsupportedRA' | 'alreadyImported' | 'unknown', include: boolean - source?: 'RSD' | 'Crossref' | 'DataCite', + source?: 'RSD' | 'Crossref' | 'DataCite' | 'OpenAlex', mention?: MentionItemProps } diff --git a/frontend/utils/fetchHelpers.ts b/frontend/utils/fetchHelpers.ts index ca102f920..7a0221142 100644 --- a/frontend/utils/fetchHelpers.ts +++ b/frontend/utils/fetchHelpers.ts @@ -1,5 +1,7 @@ // SPDX-FileCopyrightText: 2022 Dusan Mijatovic (dv4all) // SPDX-FileCopyrightText: 2022 dv4all +// SPDX-FileCopyrightText: 2024 Ewan Cahen 
(Netherlands eScience Center) +// SPDX-FileCopyrightText: 2024 Netherlands eScience Center // // SPDX-License-Identifier: Apache-2.0 @@ -42,7 +44,7 @@ export async function extractReturnMessage(resp: Response, dataId?: string) { status: resp.status, message: ` ${resp.statusText}. - You might not have sufficient priveleges to edit this item. + You might not have sufficient privileges to edit this item. Please contact site administrators. ` } diff --git a/frontend/utils/getCrossref.ts b/frontend/utils/getCrossref.ts index 37515cd6a..f0dc511bd 100644 --- a/frontend/utils/getCrossref.ts +++ b/frontend/utils/getCrossref.ts @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2022 - 2023 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2022 - 2023 Netherlands eScience Center +// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center // SPDX-FileCopyrightText: 2022 Dusan Mijatovic (dv4all) // SPDX-FileCopyrightText: 2022 dv4all // @@ -142,7 +142,7 @@ export async function getCrossrefItemsByQuery(query: string) { } } -function crossrefToRsdType(type: string): MentionTypeKeys { +export function crossrefToRsdType(type: string): MentionTypeKeys { if (!type) return 'other' switch (type.trim().toLowerCase()) { case 'book': diff --git a/frontend/utils/getDOI.ts b/frontend/utils/getDOI.ts index 8444d7032..bfaccc6a4 100644 --- a/frontend/utils/getDOI.ts +++ b/frontend/utils/getDOI.ts @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: 2022 - 2023 Dusan Mijatovic (dv4all) // SPDX-FileCopyrightText: 2022 - 2023 dv4all -// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2023 Netherlands eScience Center +// SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center // // SPDX-License-Identifier: Apache-2.0 @@ -9,6 +9,7 @@ import {MentionItemProps} from 
'~/types/Mention' import {crossrefItemToMentionItem, getCrossrefItemByDoi} from './getCrossref' import {dataCiteGraphQLItemToMentionItem, getDataciteItemByDoiGraphQL, getDataciteItemsByDoiGraphQL} from './getDataCite' import logger from './logger' +import {getOpenalexItemByDoi, getOpenalexItemsByDoi, openalexItemToMentionItem} from '~/utils/getOpenalex' type DoiRA = { DOI: string, @@ -155,6 +156,30 @@ export async function getItemsFromDatacite(dois: string[]) { return mentions } +async function getItemFromOpenalex(doi: string) { + const resp = await getOpenalexItemByDoi(doi) + // debugger + if (resp.status === 200) { + const mention = openalexItemToMentionItem(resp.message) + return { + status: 200, + message: mention + } + } + // return error message + return resp +} + +export async function getItemsFromOpenAlex(dois: string[]): Promise { + if (dois.length === 0) { + return [] + } + + const response = await getOpenalexItemsByDoi(dois) + + return response.message.map((rawMention: any) => openalexItemToMentionItem(rawMention)) +} + export async function getMentionByDoi(doi: string) { // get RA first const doiRA = await getDoiRA(doi) @@ -167,6 +192,8 @@ export async function getMentionByDoi(doi: string) { case 'datacite': // get from datacite return getItemFromDatacite(doi) + case 'op': + return getItemFromOpenalex(doi) default: return { status: 400, diff --git a/frontend/utils/getOpenalex.ts b/frontend/utils/getOpenalex.ts new file mode 100644 index 000000000..3c03fc79f --- /dev/null +++ b/frontend/utils/getOpenalex.ts @@ -0,0 +1,94 @@ +// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2024 Netherlands eScience Center +// +// SPDX-License-Identifier: Apache-2.0 + +import logger from '~/utils/logger' +import {MentionItemProps} from '~/types/Mention' +import {crossrefToRsdType} from '~/utils/getCrossref' + +export async function getOpenalexItemByDoi(doi: string) { + try { + const url = 
`https://api.openalex.org/works/https://doi.org/${doi}` + + const resp = await fetch(url) + + if (resp.status === 200) { + const json = await resp.json() + return ({ + status: 200, + message: json + }) + } + else if (resp.status === 404) { + return { + status: 404, + message: 'DOI not found' + } + } + else { + return ({ + status: resp.status, + message: 'unexpected response from OpenAlex' + }) + } + } catch (e:any) { + logger(`getOpenalexItemByDoi: ${e?.message}`, 'error') + return { + status: 500, + message: e?.message + } + } +} + +export async function getOpenalexItemsByDoi(dois: string[]) { + try { + const url = `https://api.openalex.org/works?filter=doi:${dois.join('|')}` + + const resp = await fetch(url) + + if (resp.status === 200) { + const json = await resp.json() + return ({ + status: 200, + message: json.results + }) + } + else { + return ({ + status: resp.status, + message: 'unexpected response from OpenAlex' + }) + } + } catch (e:any) { + logger(`getOpenalexItemsByDoi: ${e?.message}`, 'error') + return { + status: 500, + message: e?.message + } + } +} + +export function openalexItemToMentionItem(json: any): MentionItemProps { + return ({ + id: null, + doi: json.doi.substring('https://doi.org/'.length), + url: json.doi, + title: json.title, + authors: extractAuthors(json), + publisher: null, + publication_year: json.publication_year, + journal: null, + page: null, + // url to external image + image_url: null, + // is_featured?: boolean + mention_type: crossrefToRsdType(json.type_crossref), + source: 'OpenAlex', + note: null + }) +} + +function extractAuthors(json: any): string { + return json.authorships.map((authorship: any) => authorship.raw_author_name as string).join(', ') +} diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataCiteReleaseRepository.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataCiteReleaseRepository.java index b56c28c83..a5f64ba65 100644 ---
a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataCiteReleaseRepository.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataCiteReleaseRepository.java @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2023 Netherlands eScience Center +// SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center // // SPDX-License-Identifier: Apache-2.0 @@ -10,6 +10,8 @@ import com.google.gson.JsonObject; import com.google.gson.JsonParser; import nl.esciencecenter.rsd.scraper.Utils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.Collection; @@ -17,9 +19,6 @@ import java.util.Map; import java.util.TreeMap; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - public class DataCiteReleaseRepository { private static final Logger LOGGER = LoggerFactory.getLogger(DataCiteReleaseRepository.class); @@ -41,7 +40,9 @@ public class DataCiteReleaseRepository { """; public Map> getVersionedDois(Collection conceptDois) { - if (conceptDois.isEmpty()) return Collections.EMPTY_MAP; + if (conceptDois.isEmpty()) { + return Collections.emptyMap(); + } String query = QUERY_UNFORMATTED.formatted(DataciteMentionRepository.joinCollection(conceptDois)); JsonObject body = new JsonObject(); diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainMentions.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainMentions.java index fc23581f8..2e85bd5c2 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainMentions.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainMentions.java @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2022 - 2023 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2022 - 2023 Netherlands eScience Center +// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen 
(Netherlands eScience Center) +// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center // // SPDX-License-Identifier: Apache-2.0 @@ -11,6 +11,8 @@ import com.google.gson.JsonParser; import nl.esciencecenter.rsd.scraper.Config; import nl.esciencecenter.rsd.scraper.Utils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.time.Instant; import java.util.ArrayList; @@ -20,9 +22,6 @@ import java.util.TreeMap; import java.util.stream.Collectors; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - public class MainMentions { private static final Logger LOGGER = LoggerFactory.getLogger(MainMentions.class); @@ -45,7 +44,7 @@ public static void main(String[] args) { String doisJoined = mentionsToScrape.stream() .map(mention -> mention.doi) - .map(doi -> Utils.urlEncode(doi)) + .map(Utils::urlEncode) .collect(Collectors.joining(",")); String jsonSources = null; try { @@ -88,9 +87,26 @@ public static void main(String[] args) { } } + String email = Config.crossrefContactEmail().orElse(null); + Collection europeanPublicationsOfficeDois = doiToSource.entrySet() + .stream() + .filter(doiSourceEntry -> doiSourceEntry.getValue().equals("OP")) + .map(Map.Entry::getKey) + .toList(); + try { + Collection openalexMentions = new OpenAlexCitations().mentionData(europeanPublicationsOfficeDois, email); + for (MentionRecord openalexMention : openalexMentions) { + mentionsFailedToScrape.remove(openalexMention.doi); + scrapedMentions.add(openalexMention); + } + } catch (Exception e) { + Utils.saveExceptionInDatabase("OpenAlex mention scraper", "mention", null, e); + } + Instant now = Instant.now(); for (MentionRecord mention : mentionsFailedToScrape.values()) { mention.scrapedAt = now; + LOGGER.info("Failed to scrape mention with DOI {}", mention.doi); } scrapedMentions.addAll(mentionsFailedToScrape.values()); diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitations.java
b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitations.java index 1447931d7..2958dac86 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitations.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitations.java @@ -18,16 +18,48 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.Objects; import java.util.Optional; import java.util.UUID; import java.util.function.Predicate; import java.util.stream.Collectors; import java.util.stream.StreamSupport; -public class OpenAlexCitations { +class OpenAlexCitations { static final String DOI_FILTER_URL_UNFORMATTED = "https://api.openalex.org/works?filter=doi:%s"; + public Collection mentionData(Collection dataciteDois, String email) throws IOException, InterruptedException { + String filter = dataciteDois.stream().filter(Objects::nonNull).collect(Collectors.joining("|")); + String worksUri = DOI_FILTER_URL_UNFORMATTED.formatted(Utils.urlEncode(filter)) + "&per-page=200"; + + HttpResponse response; + if (email == null || email.isBlank()) { + response = Utils.getAsHttpResponse(worksUri); + } else { + response = Utils.getAsHttpResponse(worksUri, "User-Agent", "mailto:" + email); + } + + JsonObject tree = JsonParser.parseString(response.body()).getAsJsonObject(); + JsonArray citationsArray = tree + .getAsJsonArray("results"); + + Collection mentions = new ArrayList<>(); + Instant now = Instant.now(); + for (JsonElement citation : citationsArray) { + MentionRecord citationAsMention; + try { + citationAsMention = parseCitationAsMention(citation, now); + } catch (RuntimeException e) { + Utils.saveExceptionInDatabase("OpenAlex mention scraper", "mention", null, e); + continue; + } + mentions.add(citationAsMention); + } + + return mentions; + } + public Collection citations(String doi, String email, UUID id) throws IOException, InterruptedException { String doiUrlEncoded = Utils.urlEncode(doi); @@ -182,7 
+214,7 @@ static URI extractUrlFromLocation(JsonArray locations) { JsonObject locationObject = location.getAsJsonObject(); String landingPageUrl = Utils.stringOrNull(locationObject.get("landing_page_url")); if (landingPageUrl != null) { - landingPageUrl = landingPageUrl.replaceAll("\\\\", "%5C"); + landingPageUrl = landingPageUrl.replace("\\", "%5C"); return URI.create(landingPageUrl); } diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestReleaseRepository.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestReleaseRepository.java index e00c54d4e..66e075036 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestReleaseRepository.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestReleaseRepository.java @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2023 Netherlands eScience Center +// SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center // // SPDX-License-Identifier: Apache-2.0 @@ -66,8 +66,10 @@ public void saveReleaseContent(Collection releaseData, Map softwareIds = conceptDoiToSoftwareIds.get(versionDoiToConceptDoi.get(versionDoi)); + for (Map.Entry entry : versionDoiToConceptDoi.entrySet()) { + String versionDoi = entry.getKey(); + String conceptDoi = entry.getValue(); + Collection softwareIds = conceptDoiToSoftwareIds.get(conceptDoi); for (UUID softwareId : softwareIds) { JsonObject couple = new JsonObject(); couple.addProperty("release_id", softwareId.toString()); diff --git a/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitationsTest.java b/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitationsTest.java new file mode 100644 index 000000000..1750e2de0 --- /dev/null +++ b/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitationsTest.java @@ -0,0 
+1,30 @@ +// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2024 Netherlands eScience Center +// +// SPDX-License-Identifier: Apache-2.0 + +package nl.esciencecenter.rsd.scraper.doi; + +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.net.URI; + +public class OpenAlexCitationsTest { + + @Test + void givenLocationWithBackSlashes_whenExtractedAsLocation_thenSlashesUrlEncoded() { + JsonArray array = new JsonArray(); + // Example of the structure: https://api.openalex.org/works/https://doi.org/10.2777/28598 + JsonObject location = new JsonObject(); + location.addProperty("landing_page_url", "https://www.example.com/path\\with\\slash"); + array.add(location); + + URI result = OpenAlexCitations.extractUrlFromLocation(array); + + Assertions.assertNotNull(result); + Assertions.assertEquals("https://www.example.com/path%5Cwith%5Cslash", result.toString()); + } +}