Skip to content

Commit

Permalink
Add experimental monitor mode to BasicCrawler
Browse files Browse the repository at this point in the history
Fixes apify#2680

Add a new Monitor class to track and display time estimation and concurrency status in the CLI output at regular intervals.

* **Monitor Class**:
  - Add `Monitor` class in `packages/core/src/monitor.ts`.
  - Include logic to gather and calculate the monitor data and write it to the output.
* **BasicCrawler Integration**:
  - Import `Monitor` class in `packages/basic-crawler/src/internals/basic-crawler.ts`.
  - Initialize and start the `Monitor` class in the `run` function.
  - Ensure monitor output and `log` output are written on separate lines.
  - Add `monitor` option to `BasicCrawlerOptions` interface.

---

For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/apify/crawlee/issues/2680?shareId=XXXX-XXXX-XXXX-XXXX).
  • Loading branch information
ImBIOS committed Oct 2, 2024
1 parent b0527f9 commit 41f1149
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 0 deletions.
12 changes: 12 additions & 0 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ import type { OptionsInit, Method } from 'got-scraping';
import ow, { ArgumentError } from 'ow';
import { getDomain } from 'tldts';
import type { SetRequired } from 'type-fest';
import { Monitor } from '@crawlee/core/src/monitor';

export interface BasicCrawlingContext<UserData extends Dictionary = Dictionary>
extends CrawlingContext<BasicCrawler, UserData> {
Expand Down Expand Up @@ -367,6 +368,13 @@ export interface CrawlerExperiments {
* - set `requestLocking` to `false` in the `experiments` option of the crawler
*/
requestLocking?: boolean;
/**
* Experimental CLI output monitor mode.
* If you encounter issues due to this change, please:
* - report it to us: https://github.com/apify/crawlee
* - set `monitor` to `false` in the `experiments` option of the crawler
*/
monitor?: boolean;
}

/**
Expand Down Expand Up @@ -904,11 +912,15 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
this.events.on(EventType.MIGRATING, boundPauseOnMigration);
this.events.on(EventType.ABORTING, boundPauseOnMigration);

const monitor = this.experiments.monitor ? new Monitor(this.stats, this.log) : null;
monitor?.start();

try {
await this.autoscaledPool!.run();
} finally {
await this.teardown();
await this.stats.stopCapturing();
monitor?.stop();

process.off('SIGINT', sigintHandler);
this.events.off(EventType.MIGRATING, boundPauseOnMigration);
Expand Down
51 changes: 51 additions & 0 deletions packages/core/src/monitor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import { log as defaultLog, Log } from './log';
import { Statistics } from './crawlers/statistics';
import os from 'os';

/**
 * Periodically logs a snapshot of crawl progress (elapsed time, finished and
 * failed request counts, a remaining-time estimate and system load) through a
 * child logger prefixed with `Monitor`.
 *
 * Call {@link Monitor.start} to begin reporting and {@link Monitor.stop} to end it.
 */
export class Monitor {
    private log: Log;
    private statistics: Statistics;
    private intervalId: NodeJS.Timeout | null = null;

    /**
     * @param statistics Source of the counters and timings that are reported.
     * @param log Parent logger; a child with the `Monitor` prefix is derived from it.
     */
    constructor(statistics: Statistics, log: Log = defaultLog) {
        this.statistics = statistics;
        this.log = log.child({ prefix: 'Monitor' });
    }

    /**
     * Starts periodic reporting. Calling it again restarts the timer with the
     * new interval instead of leaving the previous timer running.
     *
     * @param interval Delay between two reports, in milliseconds.
     */
    start(interval: number = 5000) {
        // Clear any timer from an earlier start() so it is not orphaned.
        this.stop();

        const timer = setInterval(() => {
            this.display();
        }, interval);
        // A purely informational timer must never keep the process alive on
        // its own, e.g. when stop() is not reached because of an earlier error.
        timer.unref();
        this.intervalId = timer;
    }

    /** Stops periodic reporting. Safe to call when the monitor is not running. */
    stop() {
        if (this.intervalId) {
            clearInterval(this.intervalId);
            this.intervalId = null;
        }
    }

    /** Collects the current numbers and writes one multi-line report to the log. */
    private display() {
        const stats = this.statistics.calculate();
        const { crawlerStartedAt, requestsFinished, requestsFailed, requestsRetries } = this.statistics.state;
        const now = new Date();

        // Without a recorded start time `new Date(null)` would resolve to the
        // Unix epoch and report decades of runtime, so fall back to zero.
        const elapsedSeconds = crawlerStartedAt == null
            ? 0
            : (now.getTime() - new Date(crawlerStartedAt).getTime()) / 1000;

        const cpuLoad = os.loadavg()[0] ?? 0;
        const memLoad = (os.totalmem() - os.freemem()) / os.totalmem();

        const total = stats.requestsTotal;
        const remainingSeconds = this.estimateRemainingTime(stats);
        // The estimate is NaN until an average duration is available.
        const remaining = Number.isFinite(remainingSeconds) ? `${remainingSeconds.toFixed(1)}s` : 'n/a';

        const lines = [
            '',
            `Start: ${crawlerStartedAt ?? 'n/a'}`,
            `Now: ${now} (running for ${elapsedSeconds}s)`,
            `Progress: ${requestsFinished} / ${total} (${this.formatPercent(requestsFinished, total)}%), `
                + `failed: ${requestsFailed} (${this.formatPercent(requestsFailed, total)}%)`,
            `Remaining: ${remaining} (${stats.requestsFinishedPerMinute} req/min)`,
            `Sys. load: ${cpuLoad.toFixed(2)} / ${(memLoad * 100).toFixed(2)}%`,
            // The value shown is the retry counter, so it is labelled as such;
            // concurrency figures are not available from Statistics here.
            `Retries: ${requestsRetries}`,
            '',
        ];

        this.log.info(lines.join('\n'));
    }

    /**
     * Formats `part / total` as a percentage with two decimals, returning
     * `0.00` instead of `NaN` while nothing has been counted yet.
     */
    private formatPercent(part: number, total: number): string {
        if (!(total > 0)) return '0.00';
        return ((part / total) * 100).toFixed(2);
    }

    /**
     * Estimates the remaining crawl time in seconds as
     * `remaining requests * average finished duration`.
     *
     * Returns `NaN` when no average duration is available yet; never returns a
     * negative number.
     *
     * NOTE(review): this assumes `requestsTotal` counts requests that are still
     * pending - confirm against `Statistics.calculate()`.
     */
    private estimateRemainingTime(stats: ReturnType<Statistics['calculate']>) {
        const remainingRequests = Math.max(0, stats.requestsTotal - this.statistics.state.requestsFinished);
        const avgDuration = stats.requestAvgFinishedDurationMillis;
        return (remainingRequests * avgDuration) / 1000;
    }
}

0 comments on commit 41f1149

Please sign in to comment.