Skip to content

Commit

Permalink
focus arnhem
Browse files Browse the repository at this point in the history
  • Loading branch information
ckuijjer committed Apr 9, 2024
1 parent fc8397b commit 7fff1b3
Show file tree
Hide file tree
Showing 7 changed files with 189 additions and 3 deletions.
179 changes: 179 additions & 0 deletions cloud/scrapers/focusarnhem.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
import { DateTime } from 'luxon'
import Xray from 'x-ray'

import { logger as parentLogger } from '../powertools'
import { Screening } from '../types'
import xRayPuppeteer from '../xRayPuppeteer'
import { guessYear } from './utils/guessYear'
import { monthToNumber } from './utils/monthToNumber'
import { splitTime } from './utils/splitTime'

// Child logger for this scraper: every entry it writes carries the
// `scraper: 'focusarnhem'` attribute.
const logger = parentLogger.createChild({
  persistentLogAttributes: { scraper: 'focusarnhem' },
})

/**
 * x-ray filter: strips leading and trailing whitespace from string values.
 * Non-string values are passed through untouched.
 */
const trim = (value) => {
  if (typeof value !== 'string') {
    return value
  }
  return value.trim()
}

/**
 * x-ray filter: removes a leading "Expat Cinema: " prefix (matched
 * case-insensitively) from string values. Non-string values are passed
 * through untouched.
 */
const cleanTitle = (value) => {
  if (typeof value !== 'string') {
    return value
  }
  return value.replace(/^Expat Cinema: /gi, '')
}

/**
 * x-ray filter: lower-cases string values. Non-string values are passed
 * through untouched.
 */
const toLowerCase = (value) => {
  if (typeof value !== 'string') {
    return value
  }
  return value.toLowerCase()
}

/**
 * x-ray filter: replaces every no-break space (U+00A0) in a string value
 * with a regular space. Non-string values are passed through untouched.
 */
const replaceNoBreakSpace = (value) => {
  if (typeof value !== 'string') {
    return value
  }
  return value.replace(/\u00a0/g, ' ')
}

/**
 * x-ray filter: collapses every run of whitespace in a string value into a
 * single space. Leading/trailing runs become one space each (they are not
 * removed). Non-string values are passed through untouched.
 */
const normalizeWhitespace = (value) => {
  if (typeof value !== 'string') {
    return value
  }
  return value.replace(/\s+/g, ' ')
}

// Filters that can be referenced by name in selector strings
// (e.g. 'h1 | trim | cleanTitle').
const filters = {
  cleanTitle,
  replaceNoBreakSpace,
  toLowerCase,
  trim,
  normalizeWhitespace,
}

// Shared x-ray instance for this scraper. Pages are fetched through the
// Puppeteer-backed driver, configured with a 60 second timeout and
// `waitUntil: 'networkidle2'`.
const xray = Xray({ filters })
  .driver(
    xRayPuppeteer({
      logger,
      waitForOptions: { timeout: 60_000, waitUntil: 'networkidle2' },
    }),
  )
  .concurrency(10)
  .throttle(10, 300)

/**
 * Raw shape scraped from a single movie page, before it is converted into
 * Screening objects.
 */
type XRayFromMoviePage = {
  /** Movie title as scraped from the page heading. */
  title: string
  /** Text of the credits section; inspected for the subtitle language. */
  metadata: string
  /** One entry per date label, each with the start times listed for it. */
  screenings: Array<{
    date: string
    times: string[]
  }>
}

/** Raw shape scraped for each movie listed on the overview page. */
type XRayFromMainPage = {
  /** Movie title taken from the listing. */
  title: string
  /** Link to the movie's own page. */
  url: string
}

/**
 * Tells whether the scraped credits text of a movie mentions English
 * subtitles, i.e. contains "ondertiteling: english" (case-insensitive).
 *
 * @param movie - the raw movie page data
 * @returns true when the metadata mentions English subtitles
 */
const hasEnglishSubtitles = (movie: XRayFromMoviePage) =>
  /ondertiteling: english/i.test(movie.metadata)

/**
 * Converts the date label shown next to a screening into calendar
 * components.
 *
 * Two kinds of labels are handled:
 * - the relative labels 'Vandaag' (today) and 'Morgen' (tomorrow), matched
 *   case-insensitively;
 * - labels of the form '<weekday> <day> <month name>', e.g. 'Woensdag 8 mei'.
 *   These carry no year, so the year is derived with guessYear.
 *
 * @param date - the date label as scraped from the movie page
 * @returns the day, month and year the label refers to
 */
const splitDate = (date: string) => {
  const label = date.trim()
  const relativeLabel = label.toLowerCase()

  if (relativeLabel === 'vandaag' || relativeLabel === 'morgen') {
    // The explicit labels yield the cinema's local calendar date, so the
    // relative ones are resolved against the Dutch calendar as well instead
    // of whatever zone the process happens to run in (otherwise the result
    // can be off by one day around midnight).
    const today = DateTime.now().setZone('Europe/Amsterdam')
    const { day, month, year } =
      relativeLabel === 'morgen' ? today.plus({ days: 1 }) : today

    return { day, month, year }
  }

  const [dayString, monthString] = label
    .split(/\s+/) // 'Woensdag 8 mei' => ['Woensdag', '8', 'mei']
    .slice(1) // ['Woensdag', '8', 'mei'] => ['8', 'mei']

  const day = Number(dayString)
  const month = monthToNumber(monthString)

  const year = guessYear({
    day,
    month,
  })

  return { day, month, year }
}

/**
 * Scrapes a single movie page and turns its listed show times into
 * Screening objects.
 *
 * Movies whose credits do not mention English subtitles are skipped: a
 * warning is logged and an empty list is returned.
 *
 * @param url - address of the movie page
 * @returns one Screening per listed date/time combination
 */
const extractFromMoviePage = async (url: string): Promise<Screening[]> => {
  logger.info('extracting', { url })

  const movie: XRayFromMoviePage = await xray(url, {
    title: 'h1 | trim | cleanTitle',
    metadata: '#credits | normalizeWhitespace | trim',
    screenings: xray('#movie-times li', [
      {
        date: '.date | trim',
        times: ['a .text | trim'],
      },
    ]),
  })

  logger.info('extractFromMoviePage', { movie })

  if (!hasEnglishSubtitles(movie)) {
    logger.warn('extractFromMoviePage without english subtitles', {
      url,
      title: movie.title,
    })
    return []
  }

  const screenings: Screening[] = []

  for (const { date, times } of movie.screenings) {
    // The date label is shared by all times listed under it.
    const { day, month, year } = splitDate(date)

    for (const time of times) {
      const [hour, minute] = splitTime(time)

      // NOTE(review): no zone is passed here, so luxon interprets these
      // components in its default zone — confirm the runtime is set up for
      // the cinema's local time.
      const startsAt = DateTime.fromObject({
        day,
        month,
        year,
        hour,
        minute,
      })

      screenings.push({
        title: movie.title,
        url,
        cinema: 'Focus Arnhem',
        date: startsAt.toJSDate(),
      })
    }
  }

  return screenings
}

/**
 * Entry point of the scraper: reads the list of movies from the Focus Expat
 * Cinema overview page, scrapes every linked movie page in parallel and
 * returns all screenings found as a single flat list.
 *
 * @returns the screenings of all listed movies
 */
const extractFromMainPage = async () => {
  const movies: XRayFromMainPage[] = await xray(
    'https://www.focusarnhem.nl/special/focus-expat-cinema/',
    '#special-films .movie-block',
    [
      {
        title: '@title | trim | cleanTitle',
        url: '@href',
      },
    ],
  )

  logger.info('movies', { movies })

  // One list of screenings per movie page, in the same order as `movies`.
  const screeningsPerMovie = await Promise.all(
    movies.map((movie) => extractFromMoviePage(movie.url)),
  )
  const screenings = screeningsPerMovie.flat()

  logger.info('extractFromMainPage', { screenings })

  return screenings
}

// When this file is run directly (rather than imported), scrape everything
// and print the result as pretty-printed JSON.
if (require.main === module) {
  extractFromMainPage()
    .then((x) => JSON.stringify(x, null, 2))
    .then(console.log)
    .catch((error) => {
      // Handle the rejection explicitly so a failed run is logged with the
      // scraper's context and reported through a non-zero exit code.
      logger.error('extractFromMainPage failed', { error })
      process.exitCode = 1
    })

  // To debug a single movie page, swap the call above for:
  //
  // extractFromMoviePage(
  //   // 'https://www.focusarnhem.nl/agenda/un-metier-serieux/',
  //   'https://www.focusarnhem.nl/agenda/expat-cinema-the-peasants/',
  // )
  //   .then((x) => JSON.stringify(x, null, 2))
  //   .then(console.log)
}

export default extractFromMainPage
2 changes: 2 additions & 0 deletions cloud/scrapers/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import eyefilm from './eyefilm'
import filmhuisdenhaag from './filmhuisdenhaag'
import filmhuislumen from './filmhuislumen'
import filmkoepel from './filmkoepel'
import focusarnhem from './focusarnhem'
import forumgroningen from './forumgroningen'
import hartlooper from './hartlooper'
import ketelhuis from './ketelhuis'
Expand Down Expand Up @@ -61,6 +62,7 @@ const SCRAPERS = {
filmhuisdenhaag,
filmhuislumen,
filmkoepel,
focusarnhem,
forumgroningen,
hartlooper,
ketelhuis,
Expand Down
1 change: 0 additions & 1 deletion cloud/scrapers/lab1.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ const xray = Xray({
typeof value === 'string' ? value.replace(/\s+/g, ' ') : value,
},
})
// .driver(xRayPuppeteer({ logger }))
.concurrency(10)
.throttle(10, 300)

Expand Down
2 changes: 1 addition & 1 deletion cloud/scrapers/lab111.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ const xray = Xray({
typeof value === 'string' ? value.replace(/\s+/g, ' ') : value,
},
})
.driver(xRayPuppeteer({ logger, waitForOptions: { timeout: 60000 } }))
.driver(xRayPuppeteer({ logger, waitForOptions: { timeout: 60_000 } }))
.concurrency(10)
.throttle(10, 300)

Expand Down
2 changes: 1 addition & 1 deletion cloud/scrapers/lumiere.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ const xray = Xray({
trim,
},
})
.driver(xRayPuppeteer({ logger, waitForOptions: { timeout: 60000 } }))
.driver(xRayPuppeteer({ logger, waitForOptions: { timeout: 60_000 } }))
.concurrency(10)
.throttle(10, 300)

Expand Down
6 changes: 6 additions & 0 deletions web/data/cinema.json
Original file line number Diff line number Diff line change
Expand Up @@ -184,5 +184,11 @@
"city": "Eindhoven",
"url": "https://www.natlab.nl/",
"logo": "natlab.png"
},
{
"name": "Focus Arnhem",
"city": "Arnhem",
"url": "https://www.focusarnhem.nl/",
"logo": "focusarnhem.png"
}
]
Binary file added web/public/images/focusarnhem.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 7fff1b3

Please sign in to comment.