From 2718bad09662f2a09853c12b9e41fbfbd8f2065d Mon Sep 17 00:00:00 2001 From: Manuel Ruck Date: Sun, 7 Apr 2024 14:16:04 +0200 Subject: [PATCH] feat: use api instead of crawling bt website Signed-off-by: Manuel Ruck --- .../import-plenary-minutes/package.json | 2 + .../import-plenary-minutes/src/config.ts | 27 +++++ .../import-plenary-minutes/src/index.ts | 112 +++++++++++------- 3 files changed, 95 insertions(+), 46 deletions(-) create mode 100644 services/cron-jobs/import-plenary-minutes/src/config.ts diff --git a/services/cron-jobs/import-plenary-minutes/package.json b/services/cron-jobs/import-plenary-minutes/package.json index 20cb83930..2e042a68b 100644 --- a/services/cron-jobs/import-plenary-minutes/package.json +++ b/services/cron-jobs/import-plenary-minutes/package.json @@ -12,6 +12,7 @@ "start": "node ./build/index.js" }, "dependencies": { + "@democracy-deutschland/bt-dip-sdk": "1.3.0", "@democracy-deutschland/bundestagio-common": "workspace:*", "axios": "^1.6.0", "cheerio": "^1.0.0-rc.3", @@ -20,6 +21,7 @@ "devDependencies": { "@types/axios": "^0.14.0", "@types/cheerio": "^0.22.21", + "@types/node": "^20.12.5", "dotenv": "^16.0.0", "ts-node-dev": "^1.0.0-pre.49", "ts-unused-exports": "^8.0.0", diff --git a/services/cron-jobs/import-plenary-minutes/src/config.ts b/services/cron-jobs/import-plenary-minutes/src/config.ts new file mode 100644 index 000000000..520e017b0 --- /dev/null +++ b/services/cron-jobs/import-plenary-minutes/src/config.ts @@ -0,0 +1,27 @@ +const { + DB_URL = 'mongodb://localhost:27017/bundestagio', + IMPORT_PROCEDURES_START_CURSOR = '*', + IMPORT_PROCEDURES_FILTER_BEFORE = new Date().toISOString().slice(0, 10), + IMPORT_PROCEDURES_FILTER_AFTER = new Date(Number(new Date()) - 1000 * 60 * 60 * 24 * 7 * 4) + .toISOString() + .slice(0, 10), +} = process.env; + +let { IMPORT_PROCEDURES_CHUNK_SIZE = 100, IMPORT_PROCEDURES_CHUNK_ROUNDS = 5 } = process.env; + +IMPORT_PROCEDURES_CHUNK_SIZE = Number(IMPORT_PROCEDURES_CHUNK_SIZE); 
+IMPORT_PROCEDURES_CHUNK_ROUNDS = Number(IMPORT_PROCEDURES_CHUNK_ROUNDS); +const IMPORT_PROCEDURES_FILTER_TYPES = process.env.IMPORT_PROCEDURES_FILTER_TYPES + ? process.env.IMPORT_PROCEDURES_FILTER_TYPES.split(',') + : undefined; + +export const CONFIG = { + DIP_API_KEY: process.env.DIP_API_KEY || '', + DB_URL, + IMPORT_PROCEDURES_CHUNK_SIZE, + IMPORT_PROCEDURES_CHUNK_ROUNDS, + IMPORT_PROCEDURES_FILTER_BEFORE, + IMPORT_PROCEDURES_FILTER_AFTER, + IMPORT_PROCEDURES_FILTER_TYPES, + IMPORT_PROCEDURES_START_CURSOR, +} as const; diff --git a/services/cron-jobs/import-plenary-minutes/src/index.ts b/services/cron-jobs/import-plenary-minutes/src/index.ts index f9b4143d5..c802f2135 100644 --- a/services/cron-jobs/import-plenary-minutes/src/index.ts +++ b/services/cron-jobs/import-plenary-minutes/src/index.ts @@ -1,9 +1,16 @@ -import axios from 'axios'; +import axios, { AxiosResponse } from 'axios'; import cheerio from 'cheerio'; import moment from 'moment'; import { PlenaryMinuteModel, mongoConnect } from '@democracy-deutschland/bundestagio-common'; import { MetaData, PlenaryMinutesItem } from './types'; +import { Configuration, PlenarprotokolleApi, PlenarprotokollListResponse } from '@democracy-deutschland/bt-dip-sdk'; +import { CONFIG } from './config'; + +const config = new Configuration({ + apiKey: `ApiKey ${CONFIG.DIP_API_KEY}`, // Replace #YOUR_API_KEY# with your api key +}); +const api = new PlenarprotokolleApi(config, undefined, axios); const AxiosInstance = axios.create(); @@ -35,40 +42,50 @@ const getPlenaryMinutes = (plenaryMinutes: cheerio.Cheerio, period: number): Ple // Parse Title const title = cheerio(elem).find('strong').text().trim(); const regex = /Plenarprotokoll der (?<meeting>\d{1,3}).*?dem (?<date>.*?)$/gi; - const match = regex.exec(title)!.groups as { - meeting: string; - date: string; - }; - const m = moment(match.date, 'DD MMMM YYYY', 'de'); + try { + const match = regex.exec(title)?.groups as { + meeting: string; + date: string; + }; + const m = 
moment(match.date, 'DD MMMM YYYY', 'de'); - // Parse link - const xmlLink = cheerio(elem).find('.bt-link-dokument').attr('href'); + // Parse link + const xmlLink = cheerio(elem).find('.bt-link-dokument').attr('href'); - const plenaryMinutesItem: PlenaryMinutesItem = { - date: m.toDate(), - period, - meeting: parseInt(match.meeting), - xml: `https://www.bundestag.de${xmlLink}`, - }; - plenaryMinutesItems.push(plenaryMinutesItem); + const plenaryMinutesItem: PlenaryMinutesItem = { + date: m.toDate(), + period, + meeting: parseInt(match.meeting), + xml: `https://www.bundestag.de${xmlLink}`, + }; + plenaryMinutesItems.push(plenaryMinutesItem); + } catch (error) { + console.log('error', error, title); + } }); return plenaryMinutesItems; }; const parsePage = async (url: string, period: number) => { - return await AxiosInstance.get(url).then((response) => { - const html = response.data; - const $ = cheerio.load(html); - const meta: cheerio.Cheerio = $('.meta-slider'); - const plenaryMinutesTable: cheerio.Cheerio = $('.bt-table-data > tbody > tr'); - const metaData = getMeta(meta); - const plenaryMinutes = getPlenaryMinutes(plenaryMinutesTable, period); - return { - meta: metaData, - plenaryMinutes, - }; - }); + console.log('parsePage', url); + return await AxiosInstance.get(url) + .then((response) => { + const html = response.data; + const $ = cheerio.load(html); + const meta: cheerio.Cheerio = $('.meta-slider'); + const plenaryMinutesTable: cheerio.Cheerio = $('.bt-table-data > tbody > tr'); + const metaData = getMeta(meta); + const plenaryMinutes = getPlenaryMinutes(plenaryMinutesTable, period); + return { + meta: metaData, + plenaryMinutes, + }; + }) + .catch((error) => { + console.error('error', error); + throw error; + }); }; const getUrl = ({ offset, id }: { offset: number; id: string }) => @@ -79,22 +96,27 @@ const periods = [ { period: 20, id: '866354-866354' }, ]; -const start = async (period: number) => { - const periodId = periods.find((p) => p.period === 
period)!.id; - - let url: string | false = getUrl({ offset: 0, id: periodId }); - const data: PlenaryMinutesItem[] = []; +const start = async () => { + let cursor: string | undefined = undefined; + const plenarprotokollItems: PlenaryMinutesItem[] = []; + let hasNextPage = true; do { - const { meta, plenaryMinutes } = await parsePage(url, period); - data.push(...plenaryMinutes); - if (meta.nextOffset < meta.hits) { - url = getUrl({ offset: meta.nextOffset, id: periodId }); - } else { - url = false; + const { data } = await api.getPlenarprotokollList({ cursor }); + for (const plenarprotokoll of data.documents) { + const regex = /Protokoll der (?<meeting>\d+)\. Sitzung/gi; + const match = regex.exec(plenarprotokoll.titel); + const meetingNumber = match?.groups?.meeting; + plenarprotokollItems.push({ + date: new Date(plenarprotokoll.datum), + period: plenarprotokoll.wahlperiode, + meeting: Number(meetingNumber), + xml: plenarprotokoll.fundstelle.xml_url, + }); } - } while (url); + hasNextPage = cursor !== data.cursor; cursor = data.cursor; + } while (hasNextPage); await PlenaryMinuteModel.collection.bulkWrite( - data.map((item) => ({ + plenarprotokollItems.map((item) => ({ updateOne: { filter: { meeting: item.meeting, period: item.period }, update: { @@ -104,18 +126,16 @@ const start = async (period: number) => { }, })), ); - console.log(`found for period ${period}: `, data.length); }; (async () => { console.info('START'); - console.info('process.env', process.env.DB_URL); - if (!process.env.DB_URL) { + console.info('process.env', CONFIG.DB_URL); + if (!CONFIG.DB_URL) { throw new Error('you have to set environment variable: DB_URL'); } - await mongoConnect(process.env.DB_URL); + await mongoConnect(CONFIG.DB_URL); console.log('PlenaryMinutes', await PlenaryMinuteModel.countDocuments({})); - await start(19); - await start(20); + await start(); process.exit(0); })();