diff --git a/cloud/scrapers/focusarnhem.ts b/cloud/scrapers/focusarnhem.ts
new file mode 100644
index 00000000..ce49b98c
--- /dev/null
+++ b/cloud/scrapers/focusarnhem.ts
@@ -0,0 +1,219 @@
+import { DateTime } from 'luxon'
+import Xray from 'x-ray'
+
+import { logger as parentLogger } from '../powertools'
+import { Screening } from '../types'
+import xRayPuppeteer from '../xRayPuppeteer'
+import { guessYear } from './utils/guessYear'
+import { monthToNumber } from './utils/monthToNumber'
+import { splitTime } from './utils/splitTime'
+
+// Child logger that attaches `scraper: 'focusarnhem'` to every log entry.
+const logger = parentLogger.createChild({
+  persistentLogAttributes: {
+    scraper: 'focusarnhem',
+  },
+})
+
+// x-ray filters. Each one transforms string values and returns any other
+// value unchanged.
+
+// Strips leading and trailing whitespace.
+const trim = (value) => (typeof value === 'string' ? value.trim() : value)
+
+// Removes a leading "Expat Cinema: " prefix (case-insensitive).
+const cleanTitle = (value) =>
+  typeof value === 'string' ? value.replace(/^Expat Cinema: /gi, '') : value
+
+// Lower-cases the value.
+const toLowerCase = (value) =>
+  typeof value === 'string' ? value.toLowerCase() : value
+
+// Replaces every no-break space (U+00A0) with a regular space.
+const replaceNoBreakSpace = (value) =>
+  typeof value === 'string' ? value.replace(/\u00a0/g, ' ') : value
+
+// Collapses every run of whitespace into a single space.
+const normalizeWhitespace = (value) =>
+  typeof value === 'string' ? value.replace(/\s+/g, ' ') : value
+
+// x-ray instance with the filters above, using the puppeteer driver.
+const xray = Xray({
+  filters: {
+    cleanTitle,
+    replaceNoBreakSpace,
+    toLowerCase,
+    trim,
+    normalizeWhitespace,
+  },
+})
+  .driver(
+    xRayPuppeteer({
+      logger,
+      waitForOptions: { timeout: 60_000, waitUntil: 'networkidle2' },
+    }),
+  )
+  .concurrency(10)
+  .throttle(10, 300)
+
+// Shape of the data scraped from a single movie page.
+type XRayFromMoviePage = {
+  title: string
+  metadata: string
+  screenings: {
+    date: string
+    times: string[]
+  }[]
+}
+
+// Shape of one movie entry scraped from the overview page.
+type XRayFromMainPage = {
+  title: string
+  url: string
+}
+
+// True when the movie metadata contains "ondertiteling: english"
+// (case-insensitive).
+const hasEnglishSubtitles = (movie: XRayFromMoviePage) => {
+  return /ondertiteling: english/i.test(movie.metadata)
+}
+
+/**
+ * Converts a date label from a movie page into day, month and year.
+ *
+ * 'Vandaag' maps to the current date and 'Morgen' to the next day; any other
+ * label is read as '<weekday> <day> <month name>' (e.g. 'Woensdag 8 mei'),
+ * with the year supplied by guessYear.
+ *
+ * NOTE(review): DateTime.now() uses luxon's default zone - confirm the
+ * runtime is configured for the cinema's local time zone.
+ */
+const splitDate = (date: string) => {
+  if (date === 'Vandaag') {
+    const { day, month, year } = DateTime.now()
+    return { day, month, year }
+  } else if (date === 'Morgen') {
+    const { day, month, year } = DateTime.now().plus({ days: 1 })
+    return { day, month, year }
+  } else {
+    const [dayString, monthString] = date
+      .split(/\s+/) // 'Woensdag 8 mei' => ['Woensdag', '8', 'mei']
+      .slice(1) // ['Woensdag', '8', 'mei'] => ['8', 'mei']
+
+    const day = Number(dayString)
+    const month = monthToNumber(monthString)
+
+    const year = guessYear({
+      day,
+      month,
+    })
+
+    return { day, month, year }
+  }
+}
+
+/**
+ * Scrapes one movie page and returns one screening per scraped date/time
+ * combination.
+ *
+ * NOTE(review): DateTime.fromObject uses luxon's default zone - confirm it
+ * matches the cinema's local time zone in the runtime.
+ *
+ * @param url - URL of the movie page.
+ * @returns The screenings, or an empty array when the page metadata does not
+ *   mention English subtitles.
+ */
+const extractFromMoviePage = async (url: string): Promise<Screening[]> => {
+  logger.info('extracting', { url })
+
+  const movie: XRayFromMoviePage = await xray(url, {
+    title: 'h1 | trim | cleanTitle',
+    metadata: '#credits | normalizeWhitespace | trim',
+    screenings: xray('#movie-times li', [
+      {
+        date: '.date | trim',
+        times: ['a .text | trim'],
+      },
+    ]),
+  })
+
+  logger.info('extractFromMoviePage', { movie })
+
+  if (!hasEnglishSubtitles(movie)) {
+    logger.warn('extractFromMoviePage without english subtitles', {
+      url,
+      title: movie.title,
+    })
+    return []
+  }
+
+  const screenings: Screening[] = movie.screenings.flatMap(
+    ({ date, times }) => {
+      const { day, month, year } = splitDate(date)
+
+      return times.map((time) => {
+        const [hour, minute] = splitTime(time)
+
+        return {
+          title: movie.title,
+          url,
+          cinema: 'Focus Arnhem',
+          date: DateTime.fromObject({
+            day,
+            month,
+            year,
+            hour,
+            minute,
+          }).toJSDate(),
+        }
+      })
+    },
+  )
+
+  return screenings
+}
+
+/**
+ * Scrapes the Focus Expat Cinema overview page, then every movie page it
+ * links to, and returns all screenings as one flat list.
+ */
+const extractFromMainPage = async () => {
+  const url = 'https://www.focusarnhem.nl/special/focus-expat-cinema/'
+
+  const movies: XRayFromMainPage[] = await xray(
+    url,
+    '#special-films .movie-block',
+    [
+      {
+        title: '@title | trim | cleanTitle',
+        url: '@href',
+      },
+    ],
+  )
+
+  logger.info('movies', { movies })
+
+  const screenings = (
+    await Promise.all(movies.map(({ url }) => extractFromMoviePage(url)))
+  ).flat()
+
+  logger.info('extractFromMainPage', { screenings })
+
+  return screenings
+}
+
+// When run directly as a script, scrape and print the result as JSON.
+if (require.main === module) {
+  extractFromMainPage()
+    .then((x) => JSON.stringify(x, null, 2))
+    .then(console.log)
+
+  // extractFromMoviePage(
+  //   // 'https://www.focusarnhem.nl/agenda/un-metier-serieux/',
+  //   'https://www.focusarnhem.nl/agenda/expat-cinema-the-peasants/',
+  // )
+  //   .then((x) => JSON.stringify(x, null, 2))
+  //   .then(console.log)
+}
+
+export default extractFromMainPage
diff --git a/cloud/scrapers/index.ts b/cloud/scrapers/index.ts
index 65d3e2c0..fe540696 100644
--- a/cloud/scrapers/index.ts
+++ b/cloud/scrapers/index.ts
@@ -29,6 +29,7 @@ import eyefilm from './eyefilm'
 import filmhuisdenhaag from './filmhuisdenhaag'
 import filmhuislumen from './filmhuislumen'
 import filmkoepel from './filmkoepel'
+import focusarnhem from './focusarnhem'
 import forumgroningen from './forumgroningen'
 import hartlooper from './hartlooper'
 import ketelhuis from './ketelhuis'
@@ -61,6 +62,7 @@ const SCRAPERS = {
   filmhuisdenhaag,
   filmhuislumen,
   filmkoepel,
+  focusarnhem,
   forumgroningen,
   hartlooper,
   ketelhuis,
diff --git a/cloud/scrapers/lab1.ts b/cloud/scrapers/lab1.ts
index d9d3948d..19487542 100644
--- a/cloud/scrapers/lab1.ts
+++ b/cloud/scrapers/lab1.ts
@@ -19,7 +19,6 @@ const xray = Xray({
       typeof value === 'string' ? value.replace(/\s+/g, ' ') : value,
   },
 })
-  // .driver(xRayPuppeteer({ logger }))
   .concurrency(10)
   .throttle(10, 300)
 
diff --git a/cloud/scrapers/lab111.ts b/cloud/scrapers/lab111.ts
index a0c3fbfb..0f2ae16d 100644
--- a/cloud/scrapers/lab111.ts
+++ b/cloud/scrapers/lab111.ts
@@ -22,7 +22,7 @@ const xray = Xray({
       typeof value === 'string' ? value.replace(/\s+/g, ' ') : value,
   },
 })
-  .driver(xRayPuppeteer({ logger, waitForOptions: { timeout: 60000 } }))
+  .driver(xRayPuppeteer({ logger, waitForOptions: { timeout: 60_000 } }))
   .concurrency(10)
   .throttle(10, 300)
 
diff --git a/cloud/scrapers/lumiere.ts b/cloud/scrapers/lumiere.ts
index 03db5f3d..c7607211 100644
--- a/cloud/scrapers/lumiere.ts
+++ b/cloud/scrapers/lumiere.ts
@@ -35,7 +35,7 @@ const xray = Xray({
     trim,
   },
 })
-  .driver(xRayPuppeteer({ logger, waitForOptions: { timeout: 60000 } }))
+  .driver(xRayPuppeteer({ logger, waitForOptions: { timeout: 60_000 } }))
   .concurrency(10)
   .throttle(10, 300)
 
diff --git a/web/data/cinema.json b/web/data/cinema.json
index a7f1e10d..426c420a 100644
--- a/web/data/cinema.json
+++ b/web/data/cinema.json
@@ -184,5 +184,11 @@
     "city": "Eindhoven",
     "url": "https://www.natlab.nl/",
     "logo": "natlab.png"
+  },
+  {
+    "name": "Focus Arnhem",
+    "city": "Arnhem",
+    "url": "https://www.focusarnhem.nl/",
+    "logo": "focusarnhem.png"
   }
 ]
diff --git a/web/public/images/focusarnhem.png b/web/public/images/focusarnhem.png
new file mode 100644
index 00000000..1305f21b
Binary files /dev/null and b/web/public/images/focusarnhem.png differ