Skip to content

Commit

Permalink
feat(prover_cli): Stuck status (#2441)
Browse files Browse the repository at this point in the history
This PR extends the `status batch` command so that it displays jobs that
are stuck at some point in the proving process, and adds tests covering
the new behavior.

- [x] PR title corresponds to the body of PR (we generate changelog
entries from PRs).
- [x] Tests for the changes have been added / updated.
- [ ] Documentation comments have been added / updated.
- [x] Code has been formatted via `zk fmt` and `zk lint`.

---------

Co-authored-by: Joaquin Carletti <[email protected]>
Co-authored-by: Ivan Litteri <[email protected]>
Co-authored-by: Ivan Litteri <[email protected]>
Co-authored-by: ilitteri <[email protected]>
Co-authored-by: EmilLuta <[email protected]>
  • Loading branch information
6 people authored Aug 19, 2024
1 parent e22cfb6 commit 232a817
Show file tree
Hide file tree
Showing 5 changed files with 461 additions and 96 deletions.
55 changes: 55 additions & 0 deletions core/lib/basic_types/src/prover_dal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,11 @@ pub struct ProverJobFriInfo {
pub picked_by: Option<String>,
}

/// Uniform read access to the status and attempt count of a job-info record.
///
/// NOTE(review): presumably consumed by the prover CLI to decide whether a job
/// is stuck (status combined with attempts vs. a max-attempts limit) — confirm
/// against the helpers in `commands/status/utils.rs`.
pub trait Stallable {
/// Returns the job's current status as an owned `WitnessJobStatus`.
fn get_status(&self) -> WitnessJobStatus;
/// Returns the job's attempt counter.
fn get_attempts(&self) -> u32;
}

#[derive(Debug, Clone)]
pub struct BasicWitnessGeneratorJobInfo {
pub l1_batch_number: L1BatchNumber,
Expand All @@ -277,6 +282,16 @@ pub struct BasicWitnessGeneratorJobInfo {
pub picked_by: Option<String>,
}

/// [`Stallable`] accessors for basic witness generator job records.
impl Stallable for BasicWitnessGeneratorJobInfo {
    /// Returns an owned copy of the `status` field.
    fn get_status(&self) -> WitnessJobStatus {
        Clone::clone(&self.status)
    }

    /// Returns the value of the `attempts` field.
    fn get_attempts(&self) -> u32 {
        self.attempts
    }
}

#[derive(Debug, Clone)]
pub struct LeafWitnessGeneratorJobInfo {
pub id: u32,
Expand All @@ -295,6 +310,16 @@ pub struct LeafWitnessGeneratorJobInfo {
pub picked_by: Option<String>,
}

/// [`Stallable`] accessors for leaf witness generator job records.
impl Stallable for LeafWitnessGeneratorJobInfo {
    /// Returns an owned copy of the `status` field.
    fn get_status(&self) -> WitnessJobStatus {
        Clone::clone(&self.status)
    }

    /// Returns the value of the `attempts` field.
    fn get_attempts(&self) -> u32 {
        self.attempts
    }
}

#[derive(Debug, Clone)]
pub struct NodeWitnessGeneratorJobInfo {
pub id: u32,
Expand All @@ -314,6 +339,16 @@ pub struct NodeWitnessGeneratorJobInfo {
pub picked_by: Option<String>,
}

/// [`Stallable`] accessors for node witness generator job records.
impl Stallable for NodeWitnessGeneratorJobInfo {
    /// Returns an owned copy of the `status` field.
    fn get_status(&self) -> WitnessJobStatus {
        Clone::clone(&self.status)
    }

    /// Returns the value of the `attempts` field.
    fn get_attempts(&self) -> u32 {
        self.attempts
    }
}

#[derive(Debug, Clone)]
pub struct RecursionTipWitnessGeneratorJobInfo {
pub l1_batch_number: L1BatchNumber,
Expand All @@ -329,6 +364,16 @@ pub struct RecursionTipWitnessGeneratorJobInfo {
pub picked_by: Option<String>,
}

/// [`Stallable`] accessors for recursion tip witness generator job records.
impl Stallable for RecursionTipWitnessGeneratorJobInfo {
    /// Returns an owned copy of the `status` field.
    fn get_status(&self) -> WitnessJobStatus {
        Clone::clone(&self.status)
    }

    /// Returns the value of the `attempts` field.
    fn get_attempts(&self) -> u32 {
        self.attempts
    }
}

#[derive(Debug, Clone)]
pub struct SchedulerWitnessGeneratorJobInfo {
pub l1_batch_number: L1BatchNumber,
Expand All @@ -344,6 +389,16 @@ pub struct SchedulerWitnessGeneratorJobInfo {
pub picked_by: Option<String>,
}

/// [`Stallable`] accessors for scheduler witness generator job records.
impl Stallable for SchedulerWitnessGeneratorJobInfo {
    /// Returns an owned copy of the `status` field.
    fn get_status(&self) -> WitnessJobStatus {
        Clone::clone(&self.status)
    }

    /// Returns the value of the `attempts` field.
    fn get_attempts(&self) -> u32 {
        self.attempts
    }
}

#[derive(Debug, EnumString, Display, Clone)]
pub enum ProofCompressionJobStatus {
#[strum(serialize = "queued")]
Expand Down
130 changes: 74 additions & 56 deletions prover/crates/bin/prover_cli/src/commands/status/batch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ use anyhow::Context as _;
use circuit_definitions::zkevm_circuits::scheduler::aux::BaseLayerCircuitType;
use clap::Args as ClapArgs;
use colored::*;
use zksync_config::configs::FriProverConfig;
use zksync_env_config::FromEnv;
use zksync_prover_dal::{Connection, ConnectionPool, Prover, ProverDal};
use zksync_types::{
basic_fri_types::AggregationRound,
Expand All @@ -16,8 +18,11 @@ use zksync_types::{
L1BatchNumber,
};

use super::utils::{BatchData, StageInfo, Status};
use crate::cli::ProverCLIConfig;
use super::utils::{get_prover_job_status, BatchData, StageInfo, Status};
use crate::{
cli::ProverCLIConfig,
commands::status::utils::{get_prover_jobs_status_from_vec, get_witness_generator_job_status},
};

#[derive(ClapArgs)]
pub struct Args {
Expand All @@ -36,7 +41,7 @@ pub(crate) async fn run(args: Args, config: ProverCLIConfig) -> anyhow::Result<(
format!("Batch {} Status", batch_data.batch_number).bold()
);

if let Status::Custom(msg) = batch_data.compressor.witness_generator_jobs_status() {
if let Status::Custom(msg) = batch_data.compressor.witness_generator_jobs_status(10) {
if msg.contains("Sent to server") {
println!("> Proof sent to server ✅");
continue;
Expand All @@ -45,7 +50,7 @@ pub(crate) async fn run(args: Args, config: ProverCLIConfig) -> anyhow::Result<(

let basic_witness_generator_status = batch_data
.basic_witness_generator
.witness_generator_jobs_status();
.witness_generator_jobs_status(10);
if matches!(basic_witness_generator_status, Status::JobsNotFound) {
println!("> No batch found. 🚫");
continue;
Expand Down Expand Up @@ -205,25 +210,21 @@ fn display_batch_status(batch_data: BatchData) {
}

fn display_status_for_stage(stage_info: StageInfo) {
let max_attempts = FriProverConfig::from_env()
.expect("Fail to read prover config.")
.max_attempts;
display_aggregation_round(&stage_info);
match stage_info.witness_generator_jobs_status() {
let status = stage_info.witness_generator_jobs_status(max_attempts);
match status {
Status::Custom(msg) => {
println!("{}: {} \n", stage_info.to_string().bold(), msg);
}
Status::Queued | Status::WaitingForProofs | Status::Stuck | Status::JobsNotFound => {
println!(
"{}: {}",
stage_info.to_string().bold(),
stage_info.witness_generator_jobs_status()
)
println!("{}: {}", stage_info.to_string().bold(), status)
}
Status::InProgress | Status::Successful => {
println!(
"{}: {}",
stage_info.to_string().bold(),
stage_info.witness_generator_jobs_status()
);
if let Some(job_status) = stage_info.prover_jobs_status() {
println!("{}: {}", stage_info.to_string().bold(), status);
if let Some(job_status) = stage_info.prover_jobs_status(max_attempts) {
println!("> {}: {}", "Prover Jobs".to_owned().bold(), job_status);
}
}
Expand All @@ -240,53 +241,51 @@ fn display_batch_info(batch_data: BatchData) {
}

fn display_info_for_stage(stage_info: StageInfo) {
let max_attempts = FriProverConfig::from_env()
.expect("Fail to read prover config.")
.max_attempts;
display_aggregation_round(&stage_info);
match stage_info.witness_generator_jobs_status() {
let status = stage_info.witness_generator_jobs_status(max_attempts);
match status {
Status::Custom(msg) => {
println!("{}: {}", stage_info.to_string().bold(), msg);
}
Status::Queued | Status::WaitingForProofs | Status::Stuck | Status::JobsNotFound => {
println!(
" > {}: {}",
stage_info.to_string().bold(),
stage_info.witness_generator_jobs_status()
)
Status::Queued | Status::WaitingForProofs | Status::JobsNotFound => {
println!(" > {}: {}", stage_info.to_string().bold(), status)
}
Status::InProgress => {
println!(
"v {}: {}",
stage_info.to_string().bold(),
stage_info.witness_generator_jobs_status()
);
Status::InProgress | Status::Stuck => {
println!("v {}: {}", stage_info.to_string().bold(), status);
match stage_info {
StageInfo::BasicWitnessGenerator {
prover_jobs_info, ..
} => {
display_prover_jobs_info(prover_jobs_info);
display_prover_jobs_info(prover_jobs_info, max_attempts);
}
StageInfo::LeafWitnessGenerator {
witness_generator_jobs_info,
prover_jobs_info,
} => {
display_leaf_witness_generator_jobs_info(witness_generator_jobs_info);
display_prover_jobs_info(prover_jobs_info);
display_leaf_witness_generator_jobs_info(
witness_generator_jobs_info,
max_attempts,
);
display_prover_jobs_info(prover_jobs_info, max_attempts);
}
StageInfo::NodeWitnessGenerator {
witness_generator_jobs_info,
prover_jobs_info,
} => {
display_node_witness_generator_jobs_info(witness_generator_jobs_info);
display_prover_jobs_info(prover_jobs_info);
display_node_witness_generator_jobs_info(
witness_generator_jobs_info,
max_attempts,
);
display_prover_jobs_info(prover_jobs_info, max_attempts);
}
_ => (),
}
}
Status::Successful => {
println!(
"> {}: {}",
stage_info.to_string().bold(),
stage_info.witness_generator_jobs_status()
);
println!("> {}: {}", stage_info.to_string().bold(), status);
match stage_info {
StageInfo::BasicWitnessGenerator {
prover_jobs_info, ..
Expand All @@ -296,55 +295,58 @@ fn display_info_for_stage(stage_info: StageInfo) {
}
| StageInfo::NodeWitnessGenerator {
prover_jobs_info, ..
} => display_prover_jobs_info(prover_jobs_info),
} => display_prover_jobs_info(prover_jobs_info, max_attempts),
_ => (),
}
}
}
}

fn display_leaf_witness_generator_jobs_info(
mut leaf_witness_generators_jobs_info: Vec<LeafWitnessGeneratorJobInfo>,
mut jobs_info: Vec<LeafWitnessGeneratorJobInfo>,
max_attempts: u32,
) {
leaf_witness_generators_jobs_info.sort_by_key(|job| job.circuit_id);
jobs_info.sort_by_key(|job| job.circuit_id);

leaf_witness_generators_jobs_info.iter().for_each(|job| {
jobs_info.iter().for_each(|job| {
println!(
" > {}: {}",
format!(
"{:?}",
BaseLayerCircuitType::from_numeric_value(job.circuit_id as u8)
)
.bold(),
Status::from(job.status.clone())
get_witness_generator_job_status(job, max_attempts)
)
});
}

fn display_node_witness_generator_jobs_info(
mut node_witness_generators_jobs_info: Vec<NodeWitnessGeneratorJobInfo>,
mut jobs_info: Vec<NodeWitnessGeneratorJobInfo>,
max_attempts: u32,
) {
node_witness_generators_jobs_info.sort_by_key(|job| job.circuit_id);
jobs_info.sort_by_key(|job| job.circuit_id);

node_witness_generators_jobs_info.iter().for_each(|job| {
jobs_info.iter().for_each(|job| {
println!(
" > {}: {}",
format!(
"{:?}",
BaseLayerCircuitType::from_numeric_value(job.circuit_id as u8)
)
.bold(),
Status::from(job.status.clone())
get_witness_generator_job_status(job, max_attempts)
)
});
}

fn display_prover_jobs_info(prover_jobs_info: Vec<ProverJobFriInfo>) {
let prover_jobs_status = Status::from(prover_jobs_info.clone());
fn display_prover_jobs_info(prover_jobs_info: Vec<ProverJobFriInfo>, max_attempts: u32) {
let prover_jobs_status = get_prover_jobs_status_from_vec(&prover_jobs_info, max_attempts);

if matches!(prover_jobs_status, Status::Successful)
|| matches!(prover_jobs_status, Status::JobsNotFound)
{
if matches!(
prover_jobs_status,
Status::Successful | Status::JobsNotFound
) {
println!(
"> {}: {prover_jobs_status}",
"Prover Jobs".to_owned().bold()
Expand All @@ -366,7 +368,7 @@ fn display_prover_jobs_info(prover_jobs_info: Vec<ProverJobFriInfo>) {
});

for (circuit_id, prover_jobs_info) in jobs_by_circuit_id {
let status = Status::from(prover_jobs_info.clone());
let status = get_prover_jobs_status_from_vec(&prover_jobs_info, max_attempts);
println!(
" > {}: {}",
format!(
Expand All @@ -376,8 +378,10 @@ fn display_prover_jobs_info(prover_jobs_info: Vec<ProverJobFriInfo>) {
.bold(),
status
);
if matches!(status, Status::InProgress) {
display_job_status_count(prover_jobs_info);
match status {
Status::InProgress => display_job_status_count(prover_jobs_info),
Status::Stuck => display_stuck_jobs(prover_jobs_info, max_attempts),
_ => (),
}
}
}
Expand All @@ -400,6 +404,20 @@ fn display_job_status_count(jobs: Vec<ProverJobFriInfo>) {
println!(" - Failed: {}", jobs_counts.failed);
}

/// Prints one line per stuck prover job in `jobs`.
///
/// A job is reported when `get_prover_job_status` (given `max_attempts`)
/// classifies it as `Status::Stuck`; the line shows the job's `id` and its
/// `attempts` count. Jobs with any other status produce no output.
fn display_stuck_jobs(jobs: Vec<ProverJobFriInfo>, max_attempts: u32) {
    for job in &jobs {
        // `get_prover_job_status` takes the job by value, hence the clone.
        let job_status = get_prover_job_status(job.clone(), max_attempts);
        if !matches!(job_status, Status::Stuck) {
            continue;
        }
        println!(
            " - Prover Job: {} stuck after {} attempts",
            job.id, job.attempts
        );
    }
}

fn display_aggregation_round(stage_info: &StageInfo) {
if let Some(aggregation_round) = stage_info.aggregation_round() {
println!(
Expand Down
Loading

0 comments on commit 232a817

Please sign in to comment.