From da5779b8e9922f99ed5ed2e84ef1da7faf61cbb2 Mon Sep 17 00:00:00 2001 From: "LAP-TMX-VIRT\\thomas" Date: Mon, 30 Sep 2024 14:15:26 +0200 Subject: [PATCH 1/2] fix mime type ( human browsing <> api call) --- thesesfr/parser.js | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/thesesfr/parser.js b/thesesfr/parser.js index e0fe43fc..a44ea502 100755 --- a/thesesfr/parser.js +++ b/thesesfr/parser.js @@ -27,10 +27,15 @@ module.exports = new Parser(function analyseEC(parsedUrl, ec) { const apiDocumentRegex = /\/api\/v1\/document\/(([0-9]{4})([a-z]{2}[0-9a-z]{2})[0-9a-z]+)/ig; const apiProtectedDocRegex = /\/api\/v1\/document\/protected\/(([0-9]{4})([a-z]{2}[0-9a-z]{2})[0-9a-z]+)/ig; + const baseReferer ='https://www.theses.fr/'; + const baseRefererShort ='https://theses.fr/'; let match; - if ( + if (ec['User-Agent'] === 'node') { + //NOP + + } else if ( ((match = /^\/(([0-9]{4})([a-z]{2}[0-9a-z]{2})[0-9a-z]+)\/document$/i.exec(path)) !== null) || ((match = apiDocumentRegex.exec(path)) !== null) ) { @@ -69,6 +74,13 @@ module.exports = new Parser(function analyseEC(parsedUrl, ec) { result.unitid = match[1]; result.ppn = match[1]; + let myReferer = baseReferer + result.unitid; + let myRefererShort = baseRefererShort + result.unitid; + + if ((ec['Referer'] === myReferer) || (ec['Referer'] === myRefererShort)) { + result.mime = 'HTML'; + } + } else if ((match = apiOrganismeRegex.exec(path)) !== null) { // RECORD organism JSON // /api/v1/theses/organisme/159502497 @@ -77,6 +89,13 @@ module.exports = new Parser(function analyseEC(parsedUrl, ec) { result.unitid = match[1]; result.ppn = match[1]; + let myReferer = baseReferer+result.unitid; + let myRefererShort = baseRefererShort+result.unitid; + + if ((ec['Referer'] === myReferer) || (ec['Referer'] === myRefererShort)) { + result.mime = 'HTML'; + } + } else if ((match = /^\/([0-9]{8}[0-9X])$/i.exec(path)) !== null) { // /258987731 RECORD HTML undeterminable person or organism, will eventually set to BIO in middleware thesesfr-personne result.rtype = 'RECORD'; @@ -91,6 +110,13 @@ module.exports = new Parser(function analyseEC(parsedUrl, ec) { result.mime = 'JSON'; result.unitid = match[1]; + let myReferer = baseReferer+result.unitid; + let myRefererShort = baseRefererShort+result.unitid; + + if ((ec['Referer'] === myReferer) || (ec['Referer'] === myRefererShort)) { + result.mime = 'HTML'; + } + } else if ((match = /^\/(s[0-9]+)$/i.exec(path)) !== null) { // /s366354 ABStract notice d’une thèse en préparation HTML result.rtype = 'ABS'; @@ -106,6 +132,13 @@ module.exports = new Parser(function analyseEC(parsedUrl, ec) { result.publication_date = match[2]; result.institution_code = match[3]; + let myReferer = baseReferer+result.unitid; + let myRefererShort = baseRefererShort+result.unitid; + + if ((ec['Referer'] === myReferer) || (ec['Referer'] === myRefererShort)) { + result.mime = 'HTML'; + } + } else if ((match = /^\/(([0-9]{4})([a-z]{2}[0-9a-z]{2})[0-9a-z]+)$/i.exec(path)) !== null) { // /2023UPASP097 ABStract notice d’une thèse soutenue HTML result.rtype = 'ABS'; From 89f67cf406de22c39f8507b7461c89eacdf4b068 Mon Sep 17 00:00:00 2001 From: "LAP-TMX-VIRT\\thomas" Date: Mon, 30 Sep 2024 14:46:44 +0200 Subject: [PATCH 2/2] linter --- thesesfr/parser.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/thesesfr/parser.js b/thesesfr/parser.js index a44ea502..ab947cdf 100755 --- a/thesesfr/parser.js +++ b/thesesfr/parser.js @@ -33,8 +33,8 @@ module.exports = new Parser(function analyseEC(parsedUrl, ec) { let match; if (ec['User-Agent'] === 'node') { - //NOP - + //NOP + } else if ( ((match = /^\/(([0-9]{4})([a-z]{2}[0-9a-z]{2})[0-9a-z]+)\/document$/i.exec(path)) !== null) || ((match = apiDocumentRegex.exec(path)) !== null)