merge

JetBrains-Research · Oct 21, 2024 · 169a48a · 169a48a
2 parents 3c26518 + 32677c5
commit 169a48a
Show file tree

Hide file tree

Showing 442 changed files with 240,249 additions and 219 deletions.
diff --git a/.gitignore b/.gitignore
@@ -18,10 +18,11 @@ run.sh
 **/.pytest_cache
 run_nagini.py
 /tmp
-results
 results/*
+!results/archive
 /log_tries/
 /log_tries/*
 .direnv
 scripts/processed
 scripts/processed/*
+*dump.txt
diff --git a/benches/HumanEval-Dafny b/benches/HumanEval-Dafny
diff --git a/benches/HumanEval/002_truncate/truncate.prompt b/benches/HumanEval/002_truncate/truncate.prompt
@@ -0,0 +1,9 @@
+def truncate_number(number: float) -> float:
+    """ Given a positive floating point number, it can be decomposed into
+    and integer part (largest integer smaller than given number) and decimals
+    (leftover part always smaller than 1).
+
+    Return the decimal part of the number.
+    >>> truncate_number(3.5)
+    0.5
+    """
diff --git a/gui/src/helpers.rs b/gui/src/helpers.rs
@@ -59,11 +59,6 @@ fn add_common_arguments<'a>(
     token: &str,
     settings: &Settings,
 ) -> &'a mut Command {
-    let bench_type = if settings.incremental_run {
-        String::from("validating")
-    } else {
-        settings.bench_type.to_string()
-    };
     if settings.do_filter {
         cmd.args(["--filter-by-ext", &settings.filter_by_ext]);
     }
@@ -72,10 +67,24 @@ fn add_common_arguments<'a>(
         .args(["--insert-conditions-mode", "llm-single-step"])
         .args(["--llm-profile", settings.llm_profile.as_grazie()])
         .args(["--grazie-token", token])
-        .args(["--bench-type", &bench_type])
+        .args(["--bench-type", &settings.bench_type.to_string()])
         .args(["--tries", &make_tries(&settings.tries)])
         .args(["--retries", &make_retries(&settings.retries)])
-        .args(["--verifier-timeout", &make_timeout(&settings.timeout)])
+        .args(["--verifier-timeout", &make_timeout(&settings.timeout)]);
+
+    if settings.incremental_run && settings.ignore_failed {
+        cmd.arg("--ignore-failed");
+    }
+    if settings.remove_conditions {
+        cmd.arg("--remove-conditions");
+    }
+    if settings.remove_implementations {
+        cmd.arg("--remove-implementations");
+    }
+    if settings.include_text_descriptions {
+        cmd.arg("--include-text-descriptions");
+    }
+    cmd
 }
 
 fn parse_output(output: Output) -> (String, String) {

diff --git a/gui/src/main.rs b/gui/src/main.rs
@@ -47,15 +47,18 @@ fn main() -> eframe::Result {
         "Verified codegen",
         options,
         Box::new(|cc| {
-            let settings = should_restore()
+            let mut settings: Settings = should_restore()
                 .then(|| {
                     cc.storage.and_then(|storage| {
                         let settings = storage.get_string("settings_json")?;
                         serde_json::from_str(&settings).ok()
                     })
                 })
                 .flatten()
-                .unwrap_or_default();
+                .unwrap_or_else(Settings::from_env);
+            if let Ok(token) = std::env::var("GRAZIE_JWT_TOKEN") {
+                settings.grazie_token = token;
+            }
             let state = AppState {
                 settings,
                 ..Default::default()
@@ -152,11 +155,12 @@ struct AppState {
     log: Arc<RwLock<Option<String>>>,
 }
 
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Default)]
 enum LLMProfile {
     GPT4o,
     GPT4Turbo,
     Claude3Opus,
+    #[default]
     Claude35Sonnet,
 }
 
@@ -191,7 +195,7 @@ impl Display for LLMProfile {
     }
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
 struct Settings {
     grazie_token: String,
     llm_profile: LLMProfile,
@@ -208,26 +212,25 @@ struct Settings {
     do_filter: bool,
     filter_by_ext: String,
     incremental_run: bool,
+    ignore_failed: bool,
+    remove_conditions: bool,
+    remove_implementations: bool,
+    include_text_descriptions: bool,
 }
 
-impl Default for Settings {
-    fn default() -> Self {
+impl Settings {
+    fn from_env() -> Self {
         Self {
             grazie_token: std::env::var("GRAZIE_JWT_TOKEN").unwrap_or_default(),
-            llm_profile: LLMProfile::GPT4o,
             verifier_command: std::env::var("VERIFIER_COMMAND").unwrap_or_default(),
             generate_command: String::from("verified-cogen"),
             use_poetry: std::env::var("USE_POETRY").unwrap_or_default() == "1",
             prompts_directory: std::env::var("PROMPTS_DIRECTORY").unwrap_or_default(),
             tries: String::from("1"),
             retries: String::from("0"),
-            bench_type: BenchMode::Invariants,
-            file_mode: FileMode::SingleFile,
             runs: String::from("1"),
             timeout: String::from("60"),
-            incremental_run: false,
-            do_filter: false,
-            filter_by_ext: String::new(),
+            ..Settings::default()
         }
     }
 }

diff --git a/gui/src/ui/run.rs b/gui/src/ui/run.rs
@@ -1,4 +1,4 @@
-use std::{fs::File, io::Read, path::PathBuf, sync::Arc};
+use std::{collections::HashMap, fs::File, io::Read, path::PathBuf, sync::Arc};
 
 use eframe::egui::{TextEdit, Ui};
 
@@ -111,20 +111,26 @@ impl AppState {
                                     false => APP_DIRS.cache_dir().join("total_cnt.json"),
                                 };
 
-                                File::open(results_path)
-                                    .expect("results are not where they should be")
-                                    .read_to_string(&mut results_contents)
-                                    .expect("failed read");
-
-                                if let Ok(mut results) = run_results.write() {
-                                    *results = Some(
-                                        serde_json::from_str(&results_contents)
-                                            .expect("results must contain a valid json"),
-                                    );
-                                    file_count.store(
-                                        cnt.expect("should be dir"),
-                                        std::sync::atomic::Ordering::SeqCst,
-                                    );
+                                if File::open(results_path)
+                                    .ok()
+                                    .and_then(|mut file| {
+                                        file.read_to_string(&mut results_contents).ok()
+                                    })
+                                    .is_some()
+                                {
+                                    if let Ok(mut results) = run_results.write() {
+                                        *results = Some({
+                                            let result: HashMap<String, f64> =
+                                                serde_json::from_str(&results_contents)
+                                                    .expect("results must contain a valid json");
+
+                                            result.into_iter().filter(|(_, v)| *v != -1.0).collect()
+                                        });
+                                        file_count.store(
+                                            cnt.expect("should be dir"),
+                                            std::sync::atomic::Ordering::SeqCst,
+                                        );
+                                    }
                                 }
                             }
                         }

diff --git a/gui/src/ui/settings.rs b/gui/src/ui/settings.rs
@@ -37,9 +37,12 @@ impl AppState {
                 self.settings.file_mode = FileMode::Directory;
             }
             ui.add_space(2.0);
-            if !self.settings.incremental_run {
-                ui.horizontal(|ui| {
-                    ui.label("Bench mode:");
+            ui.columns(2, |cols| {
+                let [left_ui, right_ui] = cols else {
+                    panic!("should have 2 columns")
+                };
+                left_ui.label("Bench mode:");
+                left_ui.horizontal(|ui| {
                     egui::ComboBox::from_id_source("bench-mode-select")
                         .selected_text(format!("{}", self.settings.bench_type.name()))
                         .show_ui(ui, |ui| {
@@ -51,8 +54,21 @@ impl AppState {
                                 );
                             }
                         });
+                    if self.settings.incremental_run {
+                        ui.checkbox(&mut self.settings.ignore_failed, "Ignore failed");
+                    }
                 });
-            }
+
+                right_ui.checkbox(&mut self.settings.remove_conditions, "Remove conditions");
+                right_ui.checkbox(
+                    &mut self.settings.remove_implementations,
+                    "Remove implementations",
+                );
+                right_ui.checkbox(
+                    &mut self.settings.include_text_descriptions,
+                    "Include text descriptions",
+                );
+            });
 
             ui.separator();
             ui.horizontal(|ui| {

diff --git a/prompts/humaneval-dafny-cot-onestep/ask_for_fixed.txt b/prompts/humaneval-dafny-cot-onestep/ask_for_fixed.txt
@@ -0,0 +1,4 @@
+The following errors occurred during verification:
+{error}
+
+Please fix the error by adding, removing or modifying the invariants and/or assertions and return the fixed program.
diff --git a/prompts/humaneval-dafny-cot-onestep/ask_for_fixed_had_errors.txt b/prompts/humaneval-dafny-cot-onestep/ask_for_fixed_had_errors.txt
@@ -0,0 +1,4 @@
+There are still some errors:
+{error}
+
+Could you please fix them?
diff --git a/prompts/humaneval-dafny-cot-onestep/rewrite/examples/001/answer.txt b/prompts/humaneval-dafny-cot-onestep/rewrite/examples/001/answer.txt
@@ -0,0 +1,69 @@
+Let's go over the provided program step by step.
+First, we'll identify what each function does and what its input and output are.
+
+1. The function `sum` takes a `seq<int>` as input and returns an `int` as output. Looking at it's body, we see that it takes the first element of the sequence, adds it to the sum of the rest of the elements, and returns the result.
+
+2. The lemma `sum_prop` states that given that length of a sequence is greater than 0, the sum of the sequence is equal to the sum of first `|s| - 1` elements plus the last element.
+
+3. The method `sum_loop` takes a `seq<int>` as input and returns an `int` as output. It has an accumulator variable `s` that starts at 0. It then loops through the sequence, adding each element to the accumulator. As we see, it uses a `while` loop, so one of the invariants dafny requires is that `0 <= i <= |s|`.
+
+Now, let's try to prove the `sum_loop` method. We'll start by adding the `0 <= i <= |s|` invariant to the while loop:
+```
+while (i < |numbers|)
+    invariant 0 <= i <= |numbers|
+```
+
+Next, we need to use the `sum_prop` lemma to prove that `s == sum(numbers[0..i])`. To do this, we'll need to prove that adding `numbers[i]` to `sum(numbers[0..i])` results in `sum(numbers[0..i + 1])`. That's exactly what `sum_prop(numbers[0..i + 1])` does. For dafny to be able to ensure that, we'll need to prove that `numbers[0..i + 1][..i] == numbers[0..i]`. This is an automatic proof that dafny can do, so we don't need to add any extra proof steps, let's just add an assertion:
+```
+assert numbers[0..i + 1][..i] == numbers[0..i];
+```
+
+Finally, we'll add the `s == sum(numbers[0..i])` invariant to the while loop:
+```
+while (i < |numbers|)
+    invariant 0 <= i <= |numbers|
+    invariant s = sum(numbers[0..i])
+```
+
+Now, we can use the `sum_prop` lemma to prove that `s == sum(numbers[0..i])`:
+
+```
+assert numbers[0..i + 1][..i] == numbers[0..i];
+sum_prop(numbers[0..i + 1]);
+assert sum(numbers[0..i + 1]) == sum(numbers[0..i]) + numbers[i];
+```
+
+Now, the proof is complete:
+
+<answer>
+function sum(s: seq<int>) : int {
+    if |s| == 0 then 0 else s[0] + sum(s[1..])
+}
+
+lemma sum_prop(s: seq<int>)
+    requires |s| > 0
+    ensures sum(s) == sum(s[..|s| - 1]) + s[ |s| - 1 ]
+{
+    if (|s| > 1) {
+        assert (s[1..][..|s[1..]| - 1]) == s[1..|s| - 1];
+    }
+}
+method sum_loop(numbers: seq<int>) returns (s: int)
+    ensures s == sum(numbers)
+{
+    s := 0;
+    var i := 0;
+    while (i < |numbers|)
+        invariant 0 <= i <= |numbers|
+        invariant s == sum(numbers[0..i])
+    {
+        s := s + numbers[i];
+
+        assert numbers[0..i + 1][..i] == numbers[0..i];
+        sum_prop(numbers[0..i + 1]);
+        assert sum(numbers[0..i + 1]) == sum(numbers[0..i]) + numbers[i];
+
+        i := i + 1;
+    }
+}
+</answer>
diff --git a/prompts/humaneval-dafny-cot-onestep/rewrite/examples/001/question.txt b/prompts/humaneval-dafny-cot-onestep/rewrite/examples/001/question.txt
@@ -0,0 +1,31 @@
+Rewrite the following dafny program, adding correct invariants into the loops and adding assertions in nessesary places.
+Do not change the code, only add invariants and assertions. Don't remove any helper functions, they are there to help you.
+Explain all your reasoning in detail before answering. Put the answer in <answer></answer> tags without markdown backticks.
+The program:
+
+```dafny
+function sum(s: seq<int>) : int {
+    if |s| == 0 then 0 else s[0] + sum(s[1..])
+}
+
+lemma sum_prop(s: seq<int>)
+    requires |s| > 0
+    ensures sum(s) == sum(s[..|s| - 1]) + s[ |s| - 1 ]
+{
+    if (|s| > 1) {
+        assert (s[1..][..|s[1..]| - 1]) == s[1..|s| - 1];
+    }
+}
+method sum_loop(numbers: seq<int>) returns (s: int)
+    ensures s == sum(numbers)
+{
+    s := 0;
+    var i := 0;
+    while (i < |numbers|)
+    {
+        s := s + numbers[i];
+
+        i := i + 1;
+    }
+}
+```
diff --git a/prompts/humaneval-dafny-cot-onestep/rewrite/question.txt b/prompts/humaneval-dafny-cot-onestep/rewrite/question.txt
@@ -0,0 +1,5 @@
+Rewrite the following dafny program, adding correct invariants into the loops and adding assertions in nessesary places.
+Do not change the code, only add invariants and assertions. Don't remove any helper functions, they are there to help you.
+Explain all your reasoning in detail before answering. Put the answer in <answer></answer> tags without markdown backticks.
+The program:
+{program}
diff --git a/prompts/humaneval-dafny-cot-onestep/sys.txt b/prompts/humaneval-dafny-cot-onestep/sys.txt
@@ -0,0 +1,3 @@
+You are an expert in Dafny.
+You will be given tasks dealing with Dafny programs including precise annotations.
+You respond only with code blocks.
diff --git a/prompts/humaneval-dafny-cot-onestep/timeout.txt b/prompts/humaneval-dafny-cot-onestep/timeout.txt
@@ -0,0 +1,3 @@
+The verifier timed out during the verification.
+This usually means that the provided invariants were too broad or were difficult to check.
+Could you please try to improve the invariants and try again?
diff --git a/prompts/humaneval-dafny-without-conditions-cot-onestep/ask_for_fixed.txt b/prompts/humaneval-dafny-without-conditions-cot-onestep/ask_for_fixed.txt
@@ -0,0 +1,4 @@
+The following errors occurred during verification:
+{error}
+
+Please fix the error by adding, removing or modifying the invariants and/or assertions and return the fixed program.
diff --git a/prompts/humaneval-dafny-without-conditions-cot-onestep/ask_for_fixed_had_errors.txt b/prompts/humaneval-dafny-without-conditions-cot-onestep/ask_for_fixed_had_errors.txt
@@ -0,0 +1,4 @@
+There are still some errors:
+{error}
+
+Could you please fix them?