forked from apify/crawlee
-
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add experimental monitor mode to BasicCrawler
Fixes apify#2680.

Add a new `Monitor` class that tracks and displays time estimates and concurrency status in the CLI output at regular intervals.

* **Monitor class**:
  - Add the `Monitor` class in `packages/core/src/monitor.ts`.
  - Include the logic that gathers and calculates the monitor data and writes it to the output.
* **BasicCrawler integration**:
  - Import the `Monitor` class in `packages/basic-crawler/src/internals/basic-crawler.ts`.
  - Initialize and start the `Monitor` class in the `run` function.
  - Ensure monitor output and `log` output are written on separate lines.
  - Add a `monitor` option to the `BasicCrawlerOptions` interface.

---

For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/apify/crawlee/issues/2680?shareId=XXXX-XXXX-XXXX-XXXX).
- Loading branch information
Showing
2 changed files
with
63 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
import { log as defaultLog, Log } from './log'; | ||
import { Statistics } from './crawlers/statistics'; | ||
import os from 'os'; | ||
|
||
/**
 * Periodically logs a snapshot of crawl progress and host load.
 *
 * Each snapshot is built from `Statistics.calculate()` and `Statistics.state`
 * and written through a child logger prefixed with `Monitor`. Call
 * {@link Monitor.start} to begin reporting and {@link Monitor.stop} to end it.
 */
export class Monitor {
    private readonly log: Log;
    private readonly statistics: Statistics;
    private intervalId: NodeJS.Timeout | null = null;

    /**
     * @param statistics Source of the counters shown in every snapshot.
     * @param log Parent logger; a child with the `Monitor` prefix is derived from it.
     */
    constructor(statistics: Statistics, log: Log = defaultLog) {
        this.statistics = statistics;
        this.log = log.child({ prefix: 'Monitor' });
    }

    /**
     * Starts logging a snapshot every `interval` milliseconds.
     *
     * Calling `start()` while already running replaces the existing timer
     * instead of leaving it running with no way to clear it.
     *
     * @param interval Delay between snapshots, in milliseconds.
     */
    start(interval: number = 5000): void {
        this.stop();
        this.intervalId = setInterval(() => {
            this.display();
        }, interval);
    }

    /** Stops the periodic logging. Safe to call when the monitor is not running. */
    stop(): void {
        if (this.intervalId !== null) {
            clearInterval(this.intervalId);
            this.intervalId = null;
        }
    }

    /** Gathers the current statistics and system load and logs them as one multi-line message. */
    private display(): void {
        const stats = this.statistics.calculate();
        const { crawlerStartedAt, requestsFinished, requestsFailed, requestsRetries } = this.statistics.state;
        // NOTE(review): `requestsTotal` is treated as the number of requests the crawl
        // will process in total — confirm that `Statistics.calculate()` defines it that way.
        const total = stats.requestsTotal;

        const now = new Date();
        // The start time is unset until the crawler has started; `new Date(null)` would
        // silently yield the epoch and report decades of running time.
        const startedAtMs = crawlerStartedAt == null ? Number.NaN : new Date(crawlerStartedAt).getTime();
        const hasStart = Number.isFinite(startedAtMs);
        const startLabel = hasStart ? `${crawlerStartedAt}` : 'n/a';
        const elapsedLabel = hasStart ? `${((now.getTime() - startedAtMs) / 1000).toFixed(1)}s` : 'n/a';

        const remainingSecs = this.estimateRemainingTime(stats);
        const remainingLabel = remainingSecs === null ? 'unknown' : `${remainingSecs.toFixed(1)}s`;

        // Default guards against an empty array under `noUncheckedIndexedAccess`.
        const [cpuLoad = 0] = os.loadavg();
        const totalMem = os.totalmem();
        const memLoad = totalMem > 0 ? (totalMem - os.freemem()) / totalMem : 0;

        // Built from joined lines so source indentation never leaks into the log output.
        // The leading empty entry puts the snapshot on its own line after the log prefix.
        const message = [
            '',
            `Start: ${startLabel}`,
            `Now: ${now} (running for ${elapsedLabel})`,
            `Progress: ${requestsFinished} / ${total} (${this.formatPercent(requestsFinished, total)}%), ` +
                `failed: ${requestsFailed} (${this.formatPercent(requestsFailed, total)}%)`,
            `Remaining: ${remainingLabel} (${stats.requestsFinishedPerMinute} req/min)`,
            `Sys. load: ${cpuLoad.toFixed(2)} / ${(memLoad * 100).toFixed(2)}%`,
            // This value is the retry counter, so it is labelled as such; no concurrency
            // figure is available from `Statistics` in this class.
            `Retries: ${requestsRetries}`,
            '',
        ].join('\n');

        this.log.info(message);
    }

    /**
     * Formats `part / total` as a percentage with two decimals.
     *
     * @returns `'0.00'` when `total` is not a positive number, so the log never shows `NaN`.
     */
    private formatPercent(part: number, total: number): string {
        return total > 0 ? ((part / total) * 100).toFixed(2) : '0.00';
    }

    /**
     * Estimates the time left as remaining requests multiplied by the average
     * duration of a finished request.
     *
     * @param stats Result of `Statistics.calculate()`.
     * @returns Estimated seconds remaining, `0` when nothing is left, or `null`
     * when no finite estimate can be computed (for example before any request has finished).
     */
    private estimateRemainingTime(stats: ReturnType<Statistics['calculate']>): number | null {
        const remainingRequests = Math.max(0, stats.requestsTotal - this.statistics.state.requestsFinished);
        if (!Number.isFinite(remainingRequests)) {
            return null;
        }
        if (remainingRequests === 0) {
            return 0;
        }

        const avgDuration = stats.requestAvgFinishedDurationMillis;
        if (!Number.isFinite(avgDuration) || avgDuration < 0) {
            return null;
        }
        return (remainingRequests * avgDuration) / 1000;
    }
}