Commit 90777e5
fix docker image version & crawl whole period & configs
Signed-off-by: ManAnRuck <[email protected]>
ManAnRuck committed Jul 2, 2021
1 parent 518c01e commit 90777e5
Showing 7 changed files with 59 additions and 35 deletions.
4 changes: 2 additions & 2 deletions infra/kustomize/base/main/import-procedures.yaml
@@ -13,8 +13,8 @@ spec:
       restartPolicy: Never
       containers:
         - name: import-procedures
-          image: democracy/crawler:0.1.0
+          image: democracy/crawler:1.0.1
           envFrom:
             - configMapRef:
                 name: crawler-config
-          command: [ "yarn", "run", "start:import-procedures" ]
+          command: ["yarn", "run", "start:import-procedures"]
2 changes: 1 addition & 1 deletion infra/kustomize/overlays/alpha/configs/crawler.env
@@ -1,4 +1,4 @@
-MONGO_DB_URL=mongodb://democracy-mongo-srv:27017/bundestagio
+MONGO_DB_URL=mongodb://mongo-0.mongo,mongo-1.mongo,mongo-2.mongo/bundestagio?replicaSet=rs0
 DIP_GRAPHQL_ENDPOINT=http://bundestagio-dip-srv:3101/
 IMPORT_PROCEDURES_START_CURSOR=*
 IMPORT_PROCEDURES_CHUNK_SIZE=50
5 changes: 3 additions & 2 deletions infra/kustomize/overlays/internal/configs/crawler.env
@@ -1,6 +1,7 @@
-MONGO_DB_URL=mongodb://democracy-mongo-srv:27017/bundestagio
+MONGO_DB_URL=mongodb://mongo-0.mongo/bundestagio
 DIP_GRAPHQL_ENDPOINT=http://bundestagio-dip-srv:3101/
 IMPORT_PROCEDURES_START_CURSOR=*
+DEBUG=*
 IMPORT_PROCEDURES_CHUNK_SIZE=50
 IMPORT_PROCEDURES_CHUNK_ROUNDS=100000
-IMPORT_PROCEDURES_FILTER_AFTER=2021-01-01
+IMPORT_PROCEDURES_FILTER_AFTER=2017-09-24
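
The added DEBUG=* line enables every logging namespace of the npm debug package, which the crawler uses for its output (see import-procedures.ts below). A minimal standalone sketch of how that gating works; the namespace matches this repo, the message is illustrative:

import debug from "debug";

const log = debug("bundestag-io:import-procedures:log");
log.log = console.log.bind(console); // route output to stdout instead of stderr

// Emits only when the DEBUG env var matches the namespace, e.g. DEBUG=* or
// DEBUG=bundestag-io:*; with DEBUG unset this call is a no-op.
log("imported %d procedures", 50);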
3 changes: 2 additions & 1 deletion infra/kustomize/overlays/prod/configs/crawler.env
@@ -1,5 +1,6 @@
-MONGO_DB_URL=mongodb://democracy-mongo-srv:27017/bundestagio
+MONGO_DB_URL=mongodb://mongo-0.mongo,mongo-1.mongo,mongo-2.mongo/bundestagio?replicaSet=rs0
 DIP_GRAPHQL_ENDPOINT=http://bundestagio-dip-srv:3101/
 IMPORT_PROCEDURES_START_CURSOR=*
 IMPORT_PROCEDURES_CHUNK_SIZE=200
 IMPORT_PROCEDURES_CHUNK_ROUNDS=10000
+IMPORT_PROCEDURES_FILTER_AFTER=2017-09-24
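
Alpha and prod now point at the full three-member replica set, and prod gains the same IMPORT_PROCEDURES_FILTER_AFTER=2017-09-24 cutoff as internal: the date of the 2017 Bundestag election, so the crawler imports procedures for the whole current legislative period (the "crawl whole period" of the commit title). These values reach the crawler through its ./config module, which the code below imports but this diff does not show; a hypothetical sketch of what it might look like, with assumed defaults:

// config.ts, hypothetical reconstruction, not part of this commit.
// Maps the crawler.env values above onto typed constants.
const config = {
  MONGO_DB_URL: process.env.MONGO_DB_URL,
  DIP_GRAPHQL_ENDPOINT: process.env.DIP_GRAPHQL_ENDPOINT ?? "",
  IMPORT_PROCEDURES_START_CURSOR: process.env.IMPORT_PROCEDURES_START_CURSOR ?? "*",
  IMPORT_PROCEDURES_CHUNK_SIZE: Number(process.env.IMPORT_PROCEDURES_CHUNK_SIZE ?? 100),
  IMPORT_PROCEDURES_CHUNK_ROUNDS: Number(process.env.IMPORT_PROCEDURES_CHUNK_ROUNDS ?? 1),
  IMPORT_PROCEDURES_FILTER_AFTER: process.env.IMPORT_PROCEDURES_FILTER_AFTER,
  IMPORT_PROCEDURES_FILTER_BEFORE: process.env.IMPORT_PROCEDURES_FILTER_BEFORE,
};

export default config;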
2 changes: 1 addition & 1 deletion services/cron-jobs/crawler/package.json
@@ -1,6 +1,6 @@
 {
   "name": "crawler",
-  "version": "1.0.0",
+  "version": "1.0.2",
   "description": "Kubernetes cron-job to collect data from various sources for bundestag.io",
   "main": "index.ts",
   "repository": "https://github.com/demokratie-live/democracy-development/",
76 changes: 49 additions & 27 deletions services/cron-jobs/crawler/src/import-procedures.ts
@@ -1,18 +1,31 @@
 import { mongoConnect, mongoDisconnect } from "./mongoose";
-import config from './config';
-import { request, gql } from 'graphql-request'
-import {
-  ProcedureModel,
-} from "@democracy-deutschland/bundestagio-common";
-import debug from 'debug';
-const [log, error] = [debug('bundestag-io:import-procedures:log'), debug('bundestag-io:import-procedures:error')]
+import config from "./config";
+import { request, gql } from "graphql-request";
+import { ProcedureModel } from "@democracy-deutschland/bundestagio-common";
+import debug from "debug";
+const [log, error] = [
+  debug("bundestag-io:import-procedures:log"),
+  debug("bundestag-io:import-procedures:error"),
+];
 log.log = console.log.bind(console);
 
-const { DIP_GRAPHQL_ENDPOINT, IMPORT_PROCEDURES_START_CURSOR, IMPORT_PROCEDURES_CHUNK_SIZE, IMPORT_PROCEDURES_CHUNK_ROUNDS, IMPORT_PROCEDURES_FILTER_BEFORE, IMPORT_PROCEDURES_FILTER_AFTER } = config;
+const {
+  DIP_GRAPHQL_ENDPOINT,
+  IMPORT_PROCEDURES_START_CURSOR,
+  IMPORT_PROCEDURES_CHUNK_SIZE,
+  IMPORT_PROCEDURES_CHUNK_ROUNDS,
+  IMPORT_PROCEDURES_FILTER_BEFORE,
+  IMPORT_PROCEDURES_FILTER_AFTER,
+} = config;
 
 const procedureQuery = gql`
   query ($cursor: String, $offset: Int, $limit: Int, $filter: ProcedureFilter) {
-    procedures(cursor: $cursor, offset: $offset, limit: $limit, filter: $filter) {
+    procedures(
+      cursor: $cursor
+      offset: $offset
+      limit: $limit
+      filter: $filter
+    ) {
       edges {
         node {
           abstract
@@ -61,46 +74,55 @@ const procedureQuery = gql`
       }
     }
   }
-`
+`;
 
 export default async function importProcedures() {
   const variables = {
-    filter: { after: IMPORT_PROCEDURES_FILTER_AFTER, before: IMPORT_PROCEDURES_FILTER_BEFORE },
-    limit: IMPORT_PROCEDURES_CHUNK_SIZE, cursor: IMPORT_PROCEDURES_START_CURSOR
-  }
+    filter: {
+      after: IMPORT_PROCEDURES_FILTER_AFTER,
+      before: IMPORT_PROCEDURES_FILTER_BEFORE,
+    },
+    limit: IMPORT_PROCEDURES_CHUNK_SIZE,
+    cursor: IMPORT_PROCEDURES_START_CURSOR,
+  };
   log(`
     --------------------------------------
     Importing ${IMPORT_PROCEDURES_CHUNK_ROUNDS}*${IMPORT_PROCEDURES_CHUNK_SIZE} procedures.
    Between ${variables.filter.after} and ${variables.filter.before}.
    --------------------------------------
-  `)
-  for (const round of Array.from(Array(IMPORT_PROCEDURES_CHUNK_ROUNDS).keys())) {
-    log(`Round ${round} - Cursor ${variables.cursor}`)
-    const { procedures: { edges, pageInfo: { endCursor, hasNextPage } } } = await request(DIP_GRAPHQL_ENDPOINT, procedureQuery, variables );
+  `);
+  for (const round of Array.from(
+    Array(IMPORT_PROCEDURES_CHUNK_ROUNDS).keys()
+  )) {
+    log(`Round ${round} - Cursor ${variables.cursor}`);
+    const {
+      procedures: {
+        edges,
+        pageInfo: { endCursor, hasNextPage },
+      },
+    } = await request(DIP_GRAPHQL_ENDPOINT, procedureQuery, variables);
     await ProcedureModel.bulkWrite(
       edges.map((edge: { node: { procedureId: string } }) => ({
         updateOne: {
           filter: { procedureId: edge.node.procedureId },
           update: edge.node,
-          upsert: true
-        }
+          upsert: true,
+        },
       }))
-    )
-    if(!hasNextPage) break
-    variables.cursor = endCursor
+    );
+    if (!hasNextPage) break;
+    variables.cursor = endCursor;
   }
 }
 
-
 (async () => {
   try {
     await mongoConnect();
-    await importProcedures()
-    process.exit(0);
+    await importProcedures();
   } catch (err) {
-    error(err)
+    error(err);
+    throw err;
   } finally {
-    mongoDisconnect()
+    mongoDisconnect();
   }
 })();
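
Beyond formatting, the entrypoint changes behavior: process.exit(0) no longer cuts off the finally block, so mongoDisconnect() actually runs, and rethrowing in catch surfaces an unhandled rejection, which on current Node versions ends the process with a non-zero exit code so the Kubernetes job is marked failed. The import loop itself is plain cursor pagination; a stripped-down sketch of the contract it relies on, with hypothetical names:

// Generic cursor pagination: request a page, keep the server-issued
// cursor, stop when the server reports there is nothing further.
type Page<T> = {
  edges: { node: T }[];
  pageInfo: { endCursor: string; hasNextPage: boolean };
};

async function fetchAll<T>(
  fetchPage: (cursor: string) => Promise<Page<T>>,
  startCursor = "*" // mirrors IMPORT_PROCEDURES_START_CURSOR
): Promise<T[]> {
  const nodes: T[] = [];
  let cursor = startCursor;
  for (;;) {
    const { edges, pageInfo } = await fetchPage(cursor);
    nodes.push(...edges.map((e) => e.node));
    if (!pageInfo.hasNextPage) break; // last page reached
    cursor = pageInfo.endCursor; // resume where the server left off
  }
  return nodes;
}

Unlike this sketch, the real loop caps itself at IMPORT_PROCEDURES_CHUNK_ROUNDS rounds and upserts each chunk via bulkWrite before fetching the next page, keeping memory bounded.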
2 changes: 1 addition & 1 deletion services/cron-jobs/crawler/src/mongoose.ts
@@ -7,7 +7,7 @@ export const mongoConnect = async () => {
   mongoose.set("useFindAndModify", false);
   mongoose.set("debug", false);
 
-  connection = await mongoose.connect(config.MONGO_DB_URL !, {
+  connection = await mongoose.connect(config.MONGO_DB_URL!, {
     useNewUrlParser: true,
     useUnifiedTopology: true,
   });
