From d64b7c50fadee64c1eebe4834689a5ac6fe36e83 Mon Sep 17 00:00:00 2001
From: GlebSolovev <lionyoung@mail.ru>
Date: Fri, 23 Feb 2024 11:46:23 +0100
Subject: [PATCH 01/29] Implement baseline benchmark as a test

---
 package.json                         |   3 +-
 src/test/coreTests/benchmark.test.ts | 143 +++++++++++++++++++++++++++
 src/test/resources/auto_benchmark.v  |  45 +++++++++
 3 files changed, 190 insertions(+), 1 deletion(-)
 create mode 100644 src/test/coreTests/benchmark.test.ts
 create mode 100644 src/test/resources/auto_benchmark.v

diff --git a/package.json b/package.json
index 3951e36c..5315104e 100644
--- a/package.json
+++ b/package.json
@@ -178,7 +178,8 @@
     "lint": "eslint \"src/**/*.{ts,js}\" --ext .ts",
     "format": "prettier --write \"src/**/*.{ts,js}\" && eslint \"src/**/*.{ts,js}\" --ext .ts --fix",
     "pretest": "npm run compile && npm run lint",
-    "test": "node ./out/test/runTest.js"
+    "test": "node ./out/test/runTest.js",
+    "benchmark": "npm run test -- -t='benchmark.test.js'"
   },
   "devDependencies": {
     "@types/cli-progress": "^3.11.3",
diff --git a/src/test/coreTests/benchmark.test.ts b/src/test/coreTests/benchmark.test.ts
new file mode 100644
index 00000000..9eb6d3e7
--- /dev/null
+++ b/src/test/coreTests/benchmark.test.ts
@@ -0,0 +1,143 @@
+import {
+    CompletionContext,
+    FailureGenerationResult,
+    FailureGenerationStatus,
+    ProcessEnvironment,
+    SourceFileEnvironment,
+    SuccessGenerationResult,
+    generateCompletion,
+} from "../../core/completionGenerator";
+import { inspectSourceFile } from "../../core/inspectSourceFile";
+import { ProofStep } from "../../coqParser/parsedTypes";
+import { CoqLspClient } from "../../coqLsp/coqLspClient";
+import { CoqLspConfig } from "../../coqLsp/coqLspConfig";
+import { CoqProofChecker } from "../../core/coqProofChecker";
+import { LLMServices, ModelsParams } from "../../llm/configurations";
+import { GrazieService } from "../../llm/llmServices/grazie/grazieService";
+import { OpenAiService } from "../../llm/llmServices/openai/openAiService";
+import { PredefinedProofsService } from "../../llm/llmServices/predefinedProofs/predefinedProofsService";
+import { Uri } from "../../utils/uri";
+
+import * as path from "path";
+import * as assert from "assert";
+
+suite("Benchmark Test", () => {
+    test("Complete with `auto`", async () => {
+        const filePath = path.join(getTestResourcesDir(), "auto_benchmark.v");
+        const modelsParams: ModelsParams = {
+            openAiParams: [],
+            grazieParams: [],
+            predefinedProofsModelParams: [
+                {
+                    tactics: ["auto."],
+                },
+            ],
+        };
+        await runTestBenchmark(filePath, modelsParams);
+        console.log("Benchmarking has finished!\n");
+    }).timeout(50000);
+});
+
+function getTestResourcesDir(): string {
+    const dirname: string = path.join(__dirname, "../../../");
+    return path.join(dirname, "src", "test", "resources");
+}
+
+async function prepareForCompletions(
+    modelsParams: ModelsParams,
+    shouldCompleteHole: (hole: ProofStep) => boolean,
+    filePath: string
+): Promise<[CompletionContext[], SourceFileEnvironment, ProcessEnvironment]> {
+    const fileUri = Uri.fromPath(filePath);
+    const coqLspServerConfig = CoqLspConfig.createServerConfig();
+    const coqLspClientConfig = CoqLspConfig.createClientConfig();
+
+    const client = new CoqLspClient(coqLspServerConfig, coqLspClientConfig);
+    await client.openTextDocument(fileUri);
+
+    const coqProofChecker = new CoqProofChecker(client);
+    const mockFileVersion = 1;
+    const [completionContexts, sourceFileEnvironment] = await inspectSourceFile(
+        mockFileVersion,
+        shouldCompleteHole,
+        fileUri,
+        client
+    );
+    const llmServices: LLMServices = {
+        openAiService: new OpenAiService(),
+        grazieService: new GrazieService(),
+        predefinedProofsService: new PredefinedProofsService(),
+    };
+    const processEnvironment: ProcessEnvironment = {
+        coqProofChecker: coqProofChecker,
+        modelsParams: modelsParams,
+        services: llmServices,
+    };
+
+    return [completionContexts, sourceFileEnvironment, processEnvironment];
+}
+
+async function performSingleCompletion(
+    completionContext: CompletionContext,
+    sourceFileEnvironment: SourceFileEnvironment,
+    processEnvironment: ProcessEnvironment
+): Promise<boolean> {
+    const completionPosition = completionContext.admitEndPosition;
+    console.log(
+        `hole position: ${completionPosition.line}:${completionPosition.character}`
+    );
+    const result = await generateCompletion(
+        completionContext,
+        sourceFileEnvironment,
+        processEnvironment
+    );
+    let message = "unknown";
+    let success = false;
+    if (result instanceof SuccessGenerationResult) {
+        message = `success: ${result.data}`;
+        success = true;
+    } else if (result instanceof FailureGenerationResult) {
+        switch (result.status) {
+            case FailureGenerationStatus.excededTimeout:
+                message = "timeout";
+                break;
+            case FailureGenerationStatus.exception:
+                message = `exception: ${result.message}`;
+                break;
+            case FailureGenerationStatus.searchFailed:
+                message = "no proofs for admit";
+                break;
+        }
+    }
+    console.log(message);
+    console.log("");
+    return success;
+}
+
+async function runTestBenchmark(
+    filePath: string,
+    modelsParams: ModelsParams
+): Promise<void> {
+    console.log(`run benchmarks for file: ${filePath}`);
+    const shouldCompleteHole = (_hole: ProofStep) => true;
+
+    const [completionContexts, sourceFileEnvironment, processEnvironment] =
+        await prepareForCompletions(modelsParams, shouldCompleteHole, filePath);
+
+    const totalCompletionsNumber = completionContexts.length;
+    let successfulCompletionsNumber = 0;
+    for (const completionContext of completionContexts) {
+        const success = await performSingleCompletion(
+            completionContext,
+            sourceFileEnvironment,
+            processEnvironment
+        );
+        if (success) {
+            successfulCompletionsNumber += 1;
+        }
+    }
+    console.log(
+        `BENCHMARK RESULT: ${successfulCompletionsNumber} / ${totalCompletionsNumber}`
+    );
+    assert.ok(successfulCompletionsNumber === totalCompletionsNumber);
+}
diff --git a/src/test/resources/auto_benchmark.v b/src/test/resources/auto_benchmark.v
new file mode 100644
index 00000000..cda4e14b
--- /dev/null
+++ b/src/test/resources/auto_benchmark.v
@@ -0,0 +1,45 @@
+Theorem test : forall (A : Type) (P : A -> Prop) (x : A), P x -> P x.
+Proof.
+    admit.
+Admitted.
+
+Theorem test2 : forall (A : Type) (P : A -> Prop) (x : A), P x -> P x.
+Proof.
+    intros A P x H.
+    admit.
+Admitted.
+
+Theorem test2nat1 : forall n : nat, n = 0 \/ n <> 0.
+Proof.
+  intros n.
+  destruct n.
+  - admit.
+  - right.
+  discriminate.
+Admitted.
+
+Theorem test2nat2 : forall n : nat, n = 0 \/ n <> 0.
+Proof.
+    intros n.
+    destruct n.
+    {
+        admit.
+    }
+    {
+        admit.
+    }
+Admitted.
+
+Theorem test_thr : forall n:nat, 0 + n = n.
+Proof.
+    intros n. Print plus.
+    admit.
+    (* reflexivity. *)
+Admitted.
+
+Lemma test_thr1 : forall n:nat, 0 + n + 0 = n.
+Proof.
+    intros n. Print plus.
+    admit.
+    (* reflexivity. *)
+Admitted.

From cd1cc64705cd7a5ea56e3cd4dcb09399db0dcc2c Mon Sep 17 00:00:00 2001
From: GlebSolovev <lionyoung@mail.ru>
Date: Fri, 23 Feb 2024 13:21:34 +0100
Subject: [PATCH 02/29] Improve file structure, support settings

---
 .../benchmark.test.ts                         | 119 ++++++++++++++----
 src/test/benchmark/benchmark_settings.json    |  13 ++
 .../dataset}/auto_benchmark.v                 |   0
 src/test/benchmark/dataset/mixed_benchmark.v  |   7 ++
 4 files changed, 113 insertions(+), 26 deletions(-)
 rename src/test/{coreTests => benchmark}/benchmark.test.ts (65%)
 create mode 100644 src/test/benchmark/benchmark_settings.json
 rename src/test/{resources => benchmark/dataset}/auto_benchmark.v (100%)
 create mode 100644 src/test/benchmark/dataset/mixed_benchmark.v

diff --git a/src/test/coreTests/benchmark.test.ts b/src/test/benchmark/benchmark.test.ts
similarity index 65%
rename from src/test/coreTests/benchmark.test.ts
rename to src/test/benchmark/benchmark.test.ts
index 9eb6d3e7..e7f01ac0 100644
--- a/src/test/coreTests/benchmark.test.ts
+++ b/src/test/benchmark/benchmark.test.ts
@@ -18,12 +18,28 @@ import { OpenAiService } from "../../llm/llmServices/openai/openAiService";
 import { PredefinedProofsService } from "../../llm/llmServices/predefinedProofs/predefinedProofsService";
 import { Uri } from "../../utils/uri";
 
+import {
+    OpenAiModelParams,
+    openAiModelParamsSchema,
+    GrazieModelParams,
+    grazieModelParamsSchema,
+    PredefinedProofsModelParams,
+    predefinedProofsModelParamsSchema,
+} from "../../llm/llmServices/modelParamsInterfaces";
+import { JSONSchemaType } from "ajv";
+import Ajv from "ajv/dist/core";
+
+import * as fs from "fs";
 import * as path from "path";
 import * as assert from "assert";
 
 suite("Benchmark Test", () => {
+    const benchmarkDir = getBenchmarkDir();
+    const settingsPath = path.join(benchmarkDir, "benchmark_settings.json");
+    const datasetDir = path.join(benchmarkDir, "dataset");
+
     test("Complete with `auto`", async () => {
-        const filePath = path.join(getTestResourcesDir(), "auto_benchmark.v");
+        const filePath = path.join(datasetDir, "auto_benchmark.v");
         const modelsParams: ModelsParams = {
             openAiParams: [],
             grazieParams: [],
@@ -34,13 +50,46 @@ suite("Benchmark Test", () => {
             ],
         };
         await runTestBenchmark(filePath, modelsParams);
-        console.log("Benchmarking has finished!\n");
+    }).timeout(50000);
+
+    test("Complete with models from settings", async () => {
+        const modelsParams: ModelsParams = parseBenchmarkSettings(settingsPath);
+        const sourcePath = path.join(datasetDir, "mixed_benchmark.v");
+        await runTestBenchmark(sourcePath, modelsParams);
     }).timeout(50000);
 });
 
-function getTestResourcesDir(): string {
+function getBenchmarkDir(): string {
     const dirname: string = path.join(__dirname, "../../../");
-    return path.join(dirname, "src", "test", "resources");
+    return path.join(dirname, "src", "test", "benchmark");
+}
+
+async function runTestBenchmark(
+    filePath: string,
+    modelsParams: ModelsParams
+): Promise<void> {
+    console.log(`run benchmarks for file: ${filePath}\n`);
+    const shouldCompleteHole = (_hole: ProofStep) => true;
+
+    const [completionContexts, sourceFileEnvironment, processEnvironment] =
+        await prepareForCompletions(modelsParams, shouldCompleteHole, filePath);
+
+    const totalCompletionsNumber = completionContexts.length;
+    let successfulCompletionsNumber = 0;
+    for (const completionContext of completionContexts) {
+        const success = await performSingleCompletion(
+            completionContext,
+            sourceFileEnvironment,
+            processEnvironment
+        );
+        if (success) {
+            successfulCompletionsNumber += 1;
+        }
+    }
+    console.log(
+        `BENCHMARK RESULT: ${successfulCompletionsNumber} / ${totalCompletionsNumber}`
+    );
+    assert.ok(successfulCompletionsNumber === totalCompletionsNumber);
 }
 
 async function prepareForCompletions(
@@ -114,30 +163,48 @@ async function performSingleCompletion(
     return success;
 }
 
-async function runTestBenchmark(
-    filePath: string,
-    modelsParams: ModelsParams
-): Promise<void> {
-    console.log(`run benchmarks for file: ${filePath}`);
-    const shouldCompleteHole = (_hole: ProofStep) => true;
+function parseBenchmarkSettings(settingsPath: string): ModelsParams {
+    const settingsText = fs.readFileSync(settingsPath, "utf-8");
+    const modelsParams = JSON.parse(settingsText);
 
-    const [completionContexts, sourceFileEnvironment, processEnvironment] =
-        await prepareForCompletions(modelsParams, shouldCompleteHole, filePath);
+    // TODO: find the way to validate schemas correctly in the strict mode
+    const ajv = new Ajv({ strictSchema: "log" });
 
-    const totalCompletionsNumber = completionContexts.length;
-    let successfulCompletionsNumber = 0;
-    for (const completionContext of completionContexts) {
-        const success = await performSingleCompletion(
-            completionContext,
-            sourceFileEnvironment,
-            processEnvironment
+    const openAiParams: OpenAiModelParams[] = modelsParams[
+        "coqpilot.openAiModelsParameters"
+    ].map((params: any) =>
+        validateAndParseJson(params, openAiModelParamsSchema, ajv)
+    );
+    const grazieParams: GrazieModelParams[] = modelsParams[
+        "coqpilot.grazieModelsParameters"
+    ].map((params: any) =>
+        validateAndParseJson(params, grazieModelParamsSchema, ajv)
+    );
+    const predefinedProofsParams: PredefinedProofsModelParams[] = modelsParams[
+        "coqpilot.predefinedProofsModelsParameters"
+    ].map((params: any) =>
+        validateAndParseJson(params, predefinedProofsModelParamsSchema, ajv)
+    );
+
+    return {
+        openAiParams: openAiParams,
+        grazieParams: grazieParams,
+        predefinedProofsModelParams: predefinedProofsParams,
+    };
+}
+
+function validateAndParseJson<T>(
+    json: any,
+    targetClassSchema: JSONSchemaType<T>,
+    jsonSchemaValidator: Ajv
+): T {
+    const instance: T = json as T;
+    const validate = jsonSchemaValidator.compile(targetClassSchema);
+    if (!validate(instance)) {
+        throw new Error(
+            `Unable to validate json against the class: ${JSON.stringify(validate.errors)}`
         );
-        if (success) {
-            successfulCompletionsNumber += 1;
-        }
     }
-    console.log(
-        `BENCHMARK RESULT: ${successfulCompletionsNumber} / ${totalCompletionsNumber}`
-    );
-    assert.ok(successfulCompletionsNumber === totalCompletionsNumber);
+
+    return instance;
 }
diff --git a/src/test/benchmark/benchmark_settings.json b/src/test/benchmark/benchmark_settings.json
new file mode 100644
index 00000000..202a40c2
--- /dev/null
+++ b/src/test/benchmark/benchmark_settings.json
@@ -0,0 +1,13 @@
+{
+    "coqpilot.openAiModelsParameters": [
+        
+    ],
+    "coqpilot.predefinedProofsModelsParameters": [
+        {
+            "tactics": [
+                "auto."
+            ]
+        }
+    ],
+    "coqpilot.grazieModelsParameters": []
+}
\ No newline at end of file
diff --git a/src/test/resources/auto_benchmark.v b/src/test/benchmark/dataset/auto_benchmark.v
similarity index 100%
rename from src/test/resources/auto_benchmark.v
rename to src/test/benchmark/dataset/auto_benchmark.v
diff --git a/src/test/benchmark/dataset/mixed_benchmark.v b/src/test/benchmark/dataset/mixed_benchmark.v
new file mode 100644
index 00000000..6de48105
--- /dev/null
+++ b/src/test/benchmark/dataset/mixed_benchmark.v
@@ -0,0 +1,7 @@
+Theorem test_thr : forall n:nat, 0 + n = n.
+Proof.
+    intros n. Print plus.
+    admit.
+    (* auto. *)
+Admitted.
+

From a22e889de3f7a5ce7191dc9f084cec8abbb5a09e Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Fri, 23 Feb 2024 15:21:33 +0100
Subject: [PATCH 03/29] Fix JsonSchema to allow strict in Ajv

---
 src/extension/coqPilot.ts                    |  7 ++---
 src/llm/llmServices/modelParamsInterfaces.ts | 27 ++++++++++++--------
 src/test/benchmark/benchmark.test.ts         |  8 +++---
 3 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/src/extension/coqPilot.ts b/src/extension/coqPilot.ts
index e358909a..8578ef2d 100644
--- a/src/extension/coqPilot.ts
+++ b/src/extension/coqPilot.ts
@@ -21,7 +21,8 @@ import {
     insertCompletion,
     highlightTextInEditor,
 } from "./documentEditor";
-import Ajv, { JSONSchemaType } from "ajv";
+import { JSONSchemaType } from "ajv";
+import Ajv2019 from "ajv/dist/2019";
 
 import {
     SuccessGenerationResult,
@@ -77,7 +78,7 @@ export class CoqPilot {
     private readonly vscodeExtensionContext: ExtensionContext;
     private readonly pluginId = "coqpilot";
 
-    private readonly jsonSchemaValidator: Ajv;
+    private readonly jsonSchemaValidator: Ajv2019;
 
     constructor(vscodeExtensionContext: ExtensionContext) {
         hideAuxFiles();
@@ -103,7 +104,7 @@ export class CoqPilot {
             this.performCompletionForAllAdmits.bind(this)
         );
 
-        this.jsonSchemaValidator = new Ajv();
+        this.jsonSchemaValidator = new Ajv2019({ strict: true });
 
         this.vscodeExtensionContext.subscriptions.push(this);
     }
diff --git a/src/llm/llmServices/modelParamsInterfaces.ts b/src/llm/llmServices/modelParamsInterfaces.ts
index 48cd57bf..09228789 100644
--- a/src/llm/llmServices/modelParamsInterfaces.ts
+++ b/src/llm/llmServices/modelParamsInterfaces.ts
@@ -30,15 +30,8 @@ export interface PredefinedProofsModelParams extends ModelParams {
 }
 
 export const openAiModelParamsSchema: JSONSchemaType<OpenAiModelParams> = {
+    $schema: "https://json-schema.org/draft/2019-09/schema",
     type: "object",
-    properties: {
-        prompt: { type: "string" },
-        maxTokens: { type: "number" },
-        temperature: { type: "number" },
-        model: { type: "string" },
-        apiKey: { type: "string" },
-        choices: { type: "number" },
-    },
     required: [
         "prompt",
         "maxTokens",
@@ -47,9 +40,19 @@ export const openAiModelParamsSchema: JSONSchemaType<OpenAiModelParams> = {
         "apiKey",
         "choices",
     ],
+    properties: {
+        prompt: { type: "string" },
+        maxTokens: { type: "number" },
+        temperature: { type: "number" },
+        model: { type: "string" },
+        apiKey: { type: "string" },
+        choices: { type: "number" },
+    },
 };
 
 export const grazieModelParamsSchema: JSONSchemaType<GrazieModelParams> = {
+    $schema: "https://json-schema.org/draft/2019-09/schema",
+    required: ["prompt", "model", "apiKey", "choices"],
     type: "object",
     properties: {
         prompt: { type: "string" },
@@ -57,17 +60,19 @@ export const grazieModelParamsSchema: JSONSchemaType<GrazieModelParams> = {
         apiKey: { type: "string" },
         choices: { type: "number" },
     },
-    required: ["prompt", "model", "apiKey", "choices"],
 };
 
 export const predefinedProofsModelParamsSchema: JSONSchemaType<PredefinedProofsModelParams> =
     {
+        $schema: "https://json-schema.org/draft/2019-09/schema",
         type: "object",
+        required: ["tactics"],
         properties: {
             tactics: {
                 type: "array",
-                items: { type: "string" },
+                items: {
+                    type: "string",
+                },
             },
         },
-        required: ["tactics"],
     };
diff --git a/src/test/benchmark/benchmark.test.ts b/src/test/benchmark/benchmark.test.ts
index e7f01ac0..9dfff578 100644
--- a/src/test/benchmark/benchmark.test.ts
+++ b/src/test/benchmark/benchmark.test.ts
@@ -27,7 +27,7 @@ import {
     predefinedProofsModelParamsSchema,
 } from "../../llm/llmServices/modelParamsInterfaces";
 import { JSONSchemaType } from "ajv";
-import Ajv from "ajv/dist/core";
+import Ajv2019 from "ajv/dist/2019";
 
 import * as fs from "fs";
 import * as path from "path";
@@ -167,8 +167,7 @@ function parseBenchmarkSettings(settingsPath: string): ModelsParams {
     const settingsText = fs.readFileSync(settingsPath, "utf-8");
     const modelsParams = JSON.parse(settingsText);
 
-    // TODO: find the way to validate schemas correctly in the strict mode
-    const ajv = new Ajv({ strictSchema: "log" });
+    const ajv = new Ajv2019({ strictSchema: true });
 
     const openAiParams: OpenAiModelParams[] = modelsParams[
         "coqpilot.openAiModelsParameters"
@@ -196,9 +195,10 @@ function parseBenchmarkSettings(settingsPath: string): ModelsParams {
 function validateAndParseJson<T>(
     json: any,
     targetClassSchema: JSONSchemaType<T>,
-    jsonSchemaValidator: Ajv
+    jsonSchemaValidator: Ajv2019
 ): T {
     const instance: T = json as T;
+    console.dir(instance, { depth: null });
     const validate = jsonSchemaValidator.compile(targetClassSchema);
     if (!validate(instance)) {
         throw new Error(

From 0a495870ed1f9a884fa7c874a097138bde0873b1 Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Fri, 23 Feb 2024 15:35:31 +0100
Subject: [PATCH 04/29] Add couple of gpt solvable examples

---
 src/test/benchmark/dataset/mixed_benchmark.v | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/test/benchmark/dataset/mixed_benchmark.v b/src/test/benchmark/dataset/mixed_benchmark.v
index 6de48105..5443ee22 100644
--- a/src/test/benchmark/dataset/mixed_benchmark.v
+++ b/src/test/benchmark/dataset/mixed_benchmark.v
@@ -1,7 +1,19 @@
+Theorem test_plus_1 : forall n:nat, 1 + n = S n.
+Proof.
+    auto.
+Qed.
+
 Theorem test_thr : forall n:nat, 0 + n = n.
 Proof.
-    intros n. Print plus.
     admit.
-    (* auto. *)
 Admitted.
 
+Theorem add_comm : forall n m:nat, n + m = m + n.
+Proof.
+    admit.
+Admitted.
+
+Theorem add_comm_1 : forall n:nat, n + 1 = 1 + n.
+Proof.
+    admit.
+Admitted.
\ No newline at end of file

From 1d02f2562fb3ea95d8f3211985aef5144b24002e Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Fri, 8 Mar 2024 23:20:30 +0100
Subject: [PATCH 05/29] Better extract diagnostic from coq-lsp Reference issue:
 PLAN-66

---
 src/coqLsp/coqLspClient.ts            | 22 ++++++++++++++--------
 src/test/core/coqProofChecker.test.ts |  7 ++++---
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/src/coqLsp/coqLspClient.ts b/src/coqLsp/coqLspClient.ts
index adb39ca7..4a7794e8 100644
--- a/src/coqLsp/coqLspClient.ts
+++ b/src/coqLsp/coqLspClient.ts
@@ -133,14 +133,20 @@ export class CoqLspClient implements CoqLspClientInterface {
         diagnostics: Diagnostic[],
         position: Position
     ): string | undefined {
-        return (
-            diagnostics
-                .filter((diag) => diag.range.start.line >= position.line)
-                .filter((diag) => diag.severity === 1) // 1 is error
-                .shift()
-                ?.message?.split("\n")
-                .shift() ?? undefined
-        );
+        const traceStartString = "Raised at";
+        const diagnosticMessageWithTrace = diagnostics
+            .filter((diag) => diag.range.start.line >= position.line)
+            .filter((diag) => diag.severity === 1) // 1 is error
+            .shift()?.message;
+
+        if (!diagnosticMessageWithTrace) {
+            return undefined;
+        } else if (!diagnosticMessageWithTrace.includes(traceStartString)) {
+            return diagnosticMessageWithTrace.split("\n").shift();
+        }
+        return diagnosticMessageWithTrace
+            .substring(0, diagnosticMessageWithTrace.indexOf(traceStartString))
+            .trim();
     }
 
     private async getFirstGoalAtPointUnsafe(
diff --git a/src/test/core/coqProofChecker.test.ts b/src/test/core/coqProofChecker.test.ts
index bfc11672..da58c92e 100644
--- a/src/test/core/coqProofChecker.test.ts
+++ b/src/test/core/coqProofChecker.test.ts
@@ -86,7 +86,8 @@ suite("Coq Proof Checker tests", () => {
             {
                 proof: "reflexivity.",
                 isValid: false,
-                diagnostic: "In environment",
+                diagnostic:
+                    'In environment\nn : nat\nUnable to unify "n" with\n "0 + n + 0".',
             },
             {
                 proof: "auto.",
@@ -160,7 +161,7 @@ suite("Coq Proof Checker tests", () => {
                     proof: "kek.",
                     isValid: false,
                     diagnostic:
-                        "The reference kek was not found in the current",
+                        "The reference kek was not found in the current\nenvironment.",
                 },
                 {
                     proof: "right. auto.",
@@ -180,7 +181,7 @@ suite("Coq Proof Checker tests", () => {
                     proof: "lol.",
                     isValid: false,
                     diagnostic:
-                        "The reference lol was not found in the current",
+                        "The reference lol was not found in the current\nenvironment.",
                 },
             ],
         ];

From e576147d72c905ba2276948240d7144f62746cfe Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Fri, 8 Mar 2024 23:42:31 +0100
Subject: [PATCH 06/29] Fix order of coqpilot settings Reference issue: PLAN-69

---
 package.json | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/package.json b/package.json
index 6bd31938..7c49f990 100644
--- a/package.json
+++ b/package.json
@@ -145,7 +145,8 @@
                 }
               }
             ],
-            "markdownDescription": "A list of parameters for open-ai models. Each object represents a single model's configuration. Each model will be fetched for completion independently in the order they are listed."
+            "markdownDescription": "A list of parameters for open-ai models. Each object represents a single model's configuration. Each model will be fetched for completion independently in the order they are listed.",
+            "order": 0
           },
           "coqpilot.grazieModelsParameters": {
             "type": "array",
@@ -210,7 +211,8 @@
               }
             },
             "default": [],
-            "markdownDescription": "Now only available in beta for JetBrains employees. A list of parameters for grazie models. Each object represents a single model's configuration. Each model will be fetched for completion independently in the order they are listed."
+            "markdownDescription": "Now only available in beta for JetBrains employees. A list of parameters for grazie models. Each object represents a single model's configuration. Each model will be fetched for completion independently in the order they are listed.",
+            "order": 2
           },
           "coqpilot.predefinedProofsModelsParameters": {
             "type": "array",
@@ -242,7 +244,8 @@
                 ]
               }
             ],
-            "markdownDescription": "A list where each object represents a single model configuration. Here each model is configured with a set of predefined proofs, which coqpilot should try when searching for completion."
+            "markdownDescription": "A list where each object represents a single model configuration. Here each model is configured with a set of predefined proofs, which coqpilot should try when searching for completion.",
+            "order": 1
           },
           "coqpilot.lmStudioModelsParameters": {
             "type": "array",
@@ -312,7 +315,8 @@
               }
             },
             "default": [],
-            "markdownDescription": "Configuration of models which fetch completions from locally running LLM inside the [LM studio](https://lmstudio.ai)."
+            "markdownDescription": "Configuration of models which fetch completions from locally running LLM inside the [LM studio](https://lmstudio.ai).",
+            "order": 3
           },
           "coqpilot.contextTheoremsRankerType": {
             "type": "string",
@@ -325,7 +329,8 @@
               "Theorems are selected randomly."
             ],
             "description": "Context of the LLM is limited. Usually not all theorems from the file may be used in the completion request. This parameter defines the way theorems are selected for the completion.",
-            "default": "distance"
+            "default": "distance",
+            "order": 4
           },
           "coqpilot.loggingVerbosity": {
             "type": "string",
@@ -338,7 +343,8 @@
               "All information is logged."
             ],
             "description": "The verbosity of the logs.",
-            "default": "info"
+            "default": "info",
+            "order": 5
           }
         }
       }

From 27ae94e1f526b4c457a43a2993aca8d235efcd05 Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Wed, 3 Apr 2024 11:15:26 +0200
Subject: [PATCH 07/29] Catch errors that occur when settings are incorrect
 Reference issue: PLAN-75

---
 src/extension/coqPilot.ts  | 35 +++++++++++++++++++++++++++++------
 src/llm/userModelParams.ts |  4 ++++
 2 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/src/extension/coqPilot.ts b/src/extension/coqPilot.ts
index f9e5a098..1aab8fbf 100644
--- a/src/extension/coqPilot.ts
+++ b/src/extension/coqPilot.ts
@@ -190,10 +190,19 @@ export class CoqPilot {
                 title: `${pluginId}: In progress`,
             },
             async () => {
-                await this.performSpecificCompletions(
-                    shouldCompleteHole,
-                    editor
-                );
+                try {
+                    await this.performSpecificCompletions(
+                        shouldCompleteHole,
+                        editor
+                    );
+                } catch (error) {
+                    if (error instanceof UserSettingsValidationError) {
+                        showMessageToUser(error.toString(), "error");
+                    } else if (error instanceof Error) {
+                        showMessageToUser(error.message, "error");
+                        console.error(error);
+                    }
+                }
             }
         );
     }
@@ -379,8 +388,9 @@ export class CoqPilot {
         const instance: T = json as T;
         const validate = this.jsonSchemaValidator.compile(targetClassSchema);
         if (!validate(instance)) {
-            throw new Error(
-                `Unable to validate json against the class: ${JSON.stringify(validate.errors)}`
+            throw new UserSettingsValidationError(
+                `Unable to validate json against the class: ${JSON.stringify(validate.errors)}`,
+                targetClassSchema.title ?? "Unknown"
             );
         }
 
@@ -403,3 +413,16 @@ export class CoqPilot {
         this.globalExtensionState.dispose();
     }
 }
+
+class UserSettingsValidationError extends Error {
+    constructor(
+        message: string,
+        public readonly settingsName: string
+    ) {
+        super(message);
+    }
+
+    toString(): string {
+        return `Unable to validate user settings for ${this.settingsName}. Please refer to the README for the correct settings format.`;
+    }
+}
diff --git a/src/llm/userModelParams.ts b/src/llm/userModelParams.ts
index 000d5e0a..dd78e41e 100644
--- a/src/llm/userModelParams.ts
+++ b/src/llm/userModelParams.ts
@@ -84,6 +84,7 @@ export const userModelParamsSchema: JSONSchemaType<UserModelParams> = {
 
 export const openAiUserModelParamsSchema: JSONSchemaType<OpenAiUserModelParams> =
     {
+        title: "openAiModelsParameters",
         type: "object",
         properties: {
             temperature: { type: "number" },
@@ -95,6 +96,7 @@ export const openAiUserModelParamsSchema: JSONSchemaType<OpenAiUserModelParams>
 
 export const grazieUserModelParamsSchema: JSONSchemaType<GrazieUserModelParams> =
     {
+        title: "grazieModelsParameters",
         type: "object",
         properties: {
             apiKey: { type: "string" },
@@ -105,6 +107,7 @@ export const grazieUserModelParamsSchema: JSONSchemaType<GrazieUserModelParams>
 
 export const predefinedProofsUserModelParamsSchema: JSONSchemaType<PredefinedProofsUserModelParams> =
     {
+        title: "predefinedProofsModelsParameters",
         type: "object",
         properties: {
             tactics: {
@@ -118,6 +121,7 @@ export const predefinedProofsUserModelParamsSchema: JSONSchemaType<PredefinedPro
 
 export const lmStudioUserModelParamsSchema: JSONSchemaType<LMStudioUserModelParams> =
     {
+        title: "lmStudioModelsParameters",
         type: "object",
         properties: {
             temperature: { type: "number" },

From 13de1e8600f261478f8eb9b786cc678ec6841ed5 Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Wed, 3 Apr 2024 12:21:36 +0200
Subject: [PATCH 08/29] Add a user settings guide and the table of contents to
 the README Reference issue: PLAN-76

---
 README.md                 | 82 +++++++++++++++++++++++++++++++++++++++
 src/extension/coqPilot.ts |  2 +-
 2 files changed, 83 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f6ec7495..e2defe44 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,24 @@ Now `coqpilot` is in early beta and seeks for feedbacks. Please feel free to ope
 
 ***Important:*** This version 2.0.0 is not yet tested very well. Please feel free to open an issue if you encounter any problem. Moreover, stay tuned for the soon release of the 2.0.1.
 
+# Table of Contents
+
+- 🚀 [Coqpilot Overview](#coqpilot)
+- 📋 [Requirements](#requirements)
+- 🔍 [Brief Technical Overview](#brief-technical-overview)
+- 💡 [Example Usage](#example-usage)
+- 🛠 [Installation](#installation)
+  - ▶️ [Coq-lsp Installation](#coq-lsp-installation)
+  - 🤖 [Building Locally](#building-locally)
+- ⚠️ [Important Information](#important)
+- ⚙️ [Extension Settings](#extension-settings)
+- 📐 [Guide to Model Configuration](#guide-to-model-configuration)
+  - 🎛 [How VsCode settings work](#how-vscode-settings-work)
+  - 🧰 [Model Configuration](#model-configuration)
+- 📌 [Contributed Commands](#contributed-commands)
+- 🚧 [Planned Features](#planned-features)
+- 📜 [Release Notes](#release-notes)
+
 ## Requirements
 
 * `coq-lsp` version `0.1.8+8.19.0` is currently required to run the extension.
@@ -109,6 +127,70 @@ This extension contributes the following settings:
 
 Each of these settings are modified in `settings.json` and contain an array of models from this service. Each model will be used for generation independantly. Multiple models for a single service could be defined. For example, you can define parameters for two open-ai gpt models. One would be using `gpt-3.5` and the other one `gpt-4`. CoqPilot will first try to generate proofs using the first model, and if it fails, it will try the second one. This way coqpilot iterates over all services (currently 4 of them) and for each service it iterates over all models. 
 
+## Guide to Model Configuration
+
+### How VsCode settings work
+
+A common way to change the settings, contributed by the extension, is to open the `settings.json` file, or click `Edit in settings.json` on some field in settings UI. Say, by default extension contributes field (setting) `A` with default state `a'`. When you click edit, this field is being copied to the `settings.json` file with the value `a'`: 
+```json
+{
+    "A": "a'"
+}
+```
+From that moment and until you completely remove this field from the `settings.json` file, this will be the source of truth for this setting. Once again, if you want to set the value of the setting `A` back to the default, you have to remove this field from the file completely.
+
+### Model configuration
+
+As mentioned in the previous section, at the moment, four services are supported. 
+
+By default, only `predefinedProofs` and `open-ai` services are enabled. The first one tries `auto.` tactic and the second one has one model -- `gpt-3.5`. Models for other services are defaulted with empty arrays. That denotes that we do not create any models from these services. 
+
+Each and every service is configured with an array of independent models. This was made to easily experiment with different models and their parameters. 
+
+The simplest service to configure is `predefinedProofs`: 
+```json
+{
+    "coqpilot.predefinedProofsModelsParameters": [
+        {
+            "modelName": "Any name",
+            "tactics": [
+                "reflexivity.",
+                "simpl. reflexivity.",
+                "auto."
+            ]
+        }
+    ]
+}
+```
+Model name here is only used for convinience inside code, so may be any string. 
+
+The most commonly used service is `open-ai` (`grazie` and `lmStudio` are configured very similarly). 
+```json
+{
+    "coqpilot.openAiModelsParameters": [
+        {
+            "temperature": 1,
+            "apiKey": "***your-api-key***",
+            "modelName": "gpt-3.5-turbo-0301",
+            "choices": 10,
+            "systemPrompt": "Generate proof...",
+            "newMessageMaxTokens": 2000,
+            "tokensLimit": 4096,
+            "multiroundProfile": {
+                "maxRoundsNumber": 1,
+                "proofFixChoices": 1,
+                "proofFixPrompt": "Unfortunately, the last proof is not correct..."
+            }
+        }
+    ],
+}
+```
+Don't forget to set up the `apiKey` field, by default it is set to `None`. Moreover, make sure that your open-ai key is valid and has enough credits to run the models. If you create a free version of the key, it will not work (it has some weird limitations like 5 requests per inf). You can check you key here: https://platform.openai.com/playground/chat. If the playground works, the key is probably valid.
+
+Multi-round profile setting configures the number of attempts to fix the proof if it is incorrect. If the proof is incorrect, the compiler message is sent to the LLM with a request to repair it. The number of round attempts for one proof is set by `maxRoundsNumber`. The number of choices for the proof fixing is set by `proofFixChoices`. By default, values are set to 1 and that means that **NO** attempts to fix the proof are made. That means that proof is only being generated once. That's equivalent to say that multi-round fixing is turned off. 0 is not a valid value for `maxRoundsNumber` nor for `proofFixChoices`.  
+
+Another thing to keep in mind: We are still in beta and changes in settings may occur pretty often. When that happens, and your re-defined settings (which are stored aside from the extension) are not updated (by hand), this can lead to exceptions. Keep in mind that if you get an error or plugin does not start after the update, you may want double check the settings. Easy way is remove the setting completely in `settings.json`, e.g. `openAiModelsParameters`, than go to the UI, click `Edit in settings.json` on the `openAiModelsParameters` field. It will fill up with updated default values. Afterwards you can re-define the settings as you want.
+
 ## Contributed Commands
 
 * `coqpilot.perform_completion_under_cursor`: Try to generate proof for the goal under the cursor.
diff --git a/src/extension/coqPilot.ts b/src/extension/coqPilot.ts
index 1aab8fbf..001a136a 100644
--- a/src/extension/coqPilot.ts
+++ b/src/extension/coqPilot.ts
@@ -423,6 +423,6 @@ class UserSettingsValidationError extends Error {
     }
 
     toString(): string {
-        return `Unable to validate user settings for ${this.settingsName}. Please refer to the README for the correct settings format.`;
+        return `Unable to validate user settings for ${this.settingsName}. Please refer to the README for the correct settings format: https://github.com/JetBrains-Research/coqpilot/blob/main/README.md#guide-to-model-configuration.`;
     }
 }

From 027d85544970d2fc63070515e8494e8f368275b7 Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Wed, 3 Apr 2024 12:26:12 +0200
Subject: [PATCH 09/29] Tiny update README

---
 README.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/README.md b/README.md
index e2defe44..f21e2385 100644
--- a/README.md
+++ b/README.md
@@ -6,8 +6,6 @@
 
 Now `coqpilot` is in early beta and seeks for feedbacks. Please feel free to open an issue regarding any problem you encounter or any feature you want to see in the future. 
 
-***Important:*** This version 2.0.0 is not yet tested very well. Please feel free to open an issue if you encounter any problem. Moreover, stay tuned for the soon release of the 2.0.1.
-
 # Table of Contents
 
 - 🚀 [Coqpilot Overview](#coqpilot)

From 7dfb1f1b81634ca82098208d41fa65c53f35ec22 Mon Sep 17 00:00:00 2001
From: GlebSolovev <lionyoung@mail.ru>
Date: Thu, 11 Apr 2024 22:11:40 +0200
Subject: [PATCH 10/29] Adopt single file benchmark to v2.0.0

---
 package-lock.json                            |   4 +-
 package.json                                 |   1 +
 src/test/benchmark/benchmark.test.ts         | 216 +++++++++++++++++++
 src/test/benchmark/benchmark_settings.json   |  13 ++
 src/test/benchmark/dataset/auto_benchmark.v  |  45 ++++
 src/test/benchmark/dataset/mixed_benchmark.v |  19 ++
 6 files changed, 296 insertions(+), 2 deletions(-)
 create mode 100644 src/test/benchmark/benchmark.test.ts
 create mode 100644 src/test/benchmark/benchmark_settings.json
 create mode 100644 src/test/benchmark/dataset/auto_benchmark.v
 create mode 100644 src/test/benchmark/dataset/mixed_benchmark.v

diff --git a/package-lock.json b/package-lock.json
index 2a470209..454f0e37 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "coqpilot",
-  "version": "1.9.1",
+  "version": "2.0.0",
   "lockfileVersion": 2,
   "requires": true,
   "packages": {
     "": {
       "name": "coqpilot",
-      "version": "1.9.1",
+      "version": "2.0.0",
       "dependencies": {
         "@codemirror/autocomplete": "^6.9.1",
         "ajv": "^8.12.0",
diff --git a/package.json b/package.json
index 6bd31938..d66b00bd 100644
--- a/package.json
+++ b/package.json
@@ -353,6 +353,7 @@
     "pretest": "npm run compile && npm run lint",
     "test": "node ./out/test/runTest.js",
     "test-ci": "npm test -- -g=\"--non-ci\" -i=true",
+    "benchmark": "npm run test -- -g='Benchmark Test'",
     "clean": "rm -rf out"
   },
   "devDependencies": {
diff --git a/src/test/benchmark/benchmark.test.ts b/src/test/benchmark/benchmark.test.ts
new file mode 100644
index 00000000..494de04a
--- /dev/null
+++ b/src/test/benchmark/benchmark.test.ts
@@ -0,0 +1,216 @@
+import * as assert from "assert";
+import * as path from "path";
+
+import { LLMServices } from "../../llm/llmServices";
+import { GrazieService } from "../../llm/llmServices/grazie/grazieService";
+import { LMStudioService } from "../../llm/llmServices/lmStudio/lmStudioService";
+import { OpenAiService } from "../../llm/llmServices/openai/openAiService";
+import { PredefinedProofsService } from "../../llm/llmServices/predefinedProofs/predefinedProofsService";
+import { UserModelsParams } from "../../llm/userModelParams";
+
+import { CoqLspClient } from "../../coqLsp/coqLspClient";
+import { CoqLspConfig } from "../../coqLsp/coqLspConfig";
+
+import {
+    CompletionContext,
+    FailureGenerationResult,
+    FailureGenerationStatus,
+    ProcessEnvironment,
+    SourceFileEnvironment,
+    SuccessGenerationResult,
+    generateCompletion,
+} from "../../core/completionGenerator";
+import { CoqProofChecker } from "../../core/coqProofChecker";
+import { inspectSourceFile } from "../../core/inspectSourceFile";
+
+import { ProofStep } from "../../coqParser/parsedTypes";
+import { Uri } from "../../utils/uri";
+
+suite("Benchmark Test", () => {
+    const benchmarkDir = getBenchmarkDir();
+
+    // const settingsPath = path.join(benchmarkDir, "benchmark_settings.json");
+    const datasetDir = path.join(benchmarkDir, "dataset");
+
+    test("Complete with `auto`", async () => {
+        // const filePath = path.join(datasetDir, "auto_benchmark.v");
+        const filePath = path.join(datasetDir, "CombRelations.v");
+        const modelsParams: UserModelsParams = {
+            openAiParams: [],
+            grazieParams: [],
+            predefinedProofsModelParams: [
+                {
+                    modelName: "test",
+                    choices: undefined,
+
+                    systemPrompt: undefined,
+
+                    newMessageMaxTokens: undefined,
+                    tokensLimit: undefined,
+
+                    multiroundProfile: undefined,
+                    tactics: ["auto."],
+                },
+            ],
+            lmStudioParams: [],
+        };
+        await runTestBenchmark(filePath, modelsParams);
+    }).timeout(50000);
+
+    // test("Complete with models from settings", async () => {
+    //     const modelsParams: ModelsParams = parseBenchmarkSettings(settingsPath);
+    //     const sourcePath = path.join(datasetDir, "mixed_benchmark.v");
+    //     await runTestBenchmark(sourcePath, modelsParams);
+    // }).timeout(50000);
+});
+
+function getBenchmarkDir(): string {
+    const dirname: string = path.join(__dirname, "../../../");
+    return path.join(dirname, "src", "test", "benchmark");
+}
+
+async function runTestBenchmark(
+    filePath: string,
+    modelsParams: UserModelsParams
+): Promise<void> {
+    console.log(`run benchmarks for file: ${filePath}\n`);
+    const shouldCompleteHole = (_hole: ProofStep) => true;
+
+    const [completionContexts, sourceFileEnvironment, processEnvironment] =
+        await prepareForCompletions(modelsParams, shouldCompleteHole, filePath);
+
+    const totalCompletionsNumber = completionContexts.length;
+    let successfulCompletionsNumber = 0;
+    for (const completionContext of completionContexts) {
+        const success = await performSingleCompletion(
+            completionContext,
+            sourceFileEnvironment,
+            processEnvironment
+        );
+        if (success) {
+            successfulCompletionsNumber += 1;
+        }
+    }
+    console.log(
+        `BENCHMARK RESULT: ${successfulCompletionsNumber} / ${totalCompletionsNumber}`
+    );
+    assert.ok(successfulCompletionsNumber === totalCompletionsNumber);
+}
+
+async function prepareForCompletions(
+    modelsParams: UserModelsParams,
+    shouldCompleteHole: (hole: ProofStep) => boolean,
+    filePath: string
+): Promise<[CompletionContext[], SourceFileEnvironment, ProcessEnvironment]> {
+    const fileUri = Uri.fromPath(filePath);
+    const coqLspServerConfig = CoqLspConfig.createServerConfig();
+    const coqLspClientConfig = CoqLspConfig.createClientConfig();
+
+    const client = new CoqLspClient(coqLspServerConfig, coqLspClientConfig);
+    await client.openTextDocument(fileUri);
+
+    const coqProofChecker = new CoqProofChecker(client);
+    const mockFileVersion = 1;
+    const [completionContexts, sourceFileEnvironment] = await inspectSourceFile(
+        mockFileVersion,
+        shouldCompleteHole,
+        fileUri,
+        client
+    );
+    const llmServices: LLMServices = {
+        openAiService: new OpenAiService(),
+        grazieService: new GrazieService(),
+        predefinedProofsService: new PredefinedProofsService(),
+        lmStudioService: new LMStudioService(),
+    };
+    const processEnvironment: ProcessEnvironment = {
+        coqProofChecker: coqProofChecker,
+        modelsParams: modelsParams,
+        services: llmServices,
+    };
+
+    return [completionContexts, sourceFileEnvironment, processEnvironment];
+}
+
+async function performSingleCompletion(
+    completionContext: CompletionContext,
+    sourceFileEnvironment: SourceFileEnvironment,
+    processEnvironment: ProcessEnvironment
+): Promise<boolean> {
+    const completionPosition = completionContext.admitEndPosition;
+    console.log(
+        `hole position: ${completionPosition.line}:${completionPosition.character}`
+    );
+    const result = await generateCompletion(
+        completionContext,
+        sourceFileEnvironment,
+        processEnvironment
+    );
+    let message = "unknown";
+    let success = false;
+    if (result instanceof SuccessGenerationResult) {
+        message = `success: ${result.data}`;
+        success = true;
+    } else if (result instanceof FailureGenerationResult) {
+        switch (result.status) {
+            case FailureGenerationStatus.excededTimeout:
+                message = "timeout";
+                break;
+            case FailureGenerationStatus.exception:
+                message = `exception: ${result.message}`;
+                break;
+            case FailureGenerationStatus.searchFailed:
+                message = "no proofs for admit";
+                break;
+        }
+    }
+    console.log(message);
+    console.log("");
+    return success;
+}
+
+// function parseBenchmarkSettings(settingsPath: string): ModelsParams {
+//     const settingsText = fs.readFileSync(settingsPath, "utf-8");
+//     const modelsParams = JSON.parse(settingsText);
+
+//     const ajv = new Ajv2019({ strictSchema: true });
+
+//     const openAiParams: OpenAiModelParams[] = modelsParams[
+//         "coqpilot.openAiModelsParameters"
+//     ].map((params: any) =>
+//         validateAndParseJson(params, openAiModelParamsSchema, ajv)
+//     );
+//     const grazieParams: GrazieModelParams[] = modelsParams[
+//         "coqpilot.grazieModelsParameters"
+//     ].map((params: any) =>
+//         validateAndParseJson(params, grazieModelParamsSchema, ajv)
+//     );
+//     const predefinedProofsParams: PredefinedProofsModelParams[] = modelsParams[
+//         "coqpilot.predefinedProofsModelsParameters"
+//     ].map((params: any) =>
+//         validateAndParseJson(params, predefinedProofsModelParamsSchema, ajv)
+//     );
+
+//     return {
+//         openAiParams: openAiParams,
+//         grazieParams: grazieParams,
+//         predefinedProofsModelParams: predefinedProofsParams,
+//     };
+// }
+
+// function validateAndParseJson<T>(
+//     json: any,
+//     targetClassSchema: JSONSchemaType<T>,
+//     jsonSchemaValidator: Ajv2019
+// ): T {
+//     const instance: T = json as T;
+//     console.dir(instance, { depth: null });
+//     const validate = jsonSchemaValidator.compile(targetClassSchema);
+//     if (!validate(instance)) {
+//         throw new Error(
+//             `Unable to validate json against the class: ${JSON.stringify(validate.errors)}`
+//         );
+//     }
+
+//     return instance;
+// }
diff --git a/src/test/benchmark/benchmark_settings.json b/src/test/benchmark/benchmark_settings.json
new file mode 100644
index 00000000..59727b36
--- /dev/null
+++ b/src/test/benchmark/benchmark_settings.json
@@ -0,0 +1,13 @@
+{
+    "coqpilot.openAiModelsParameters": [
+        
+    ],
+    "coqpilot.predefinedProofsModelsParameters": [
+        {
+            "tactics": [
+                "auto."
+            ]
+        }
+    ],
+    "coqpilot.grazieModelsParameters": []
+}
diff --git a/src/test/benchmark/dataset/auto_benchmark.v b/src/test/benchmark/dataset/auto_benchmark.v
new file mode 100644
index 00000000..cda4e14b
--- /dev/null
+++ b/src/test/benchmark/dataset/auto_benchmark.v
@@ -0,0 +1,45 @@
+Theorem test : forall (A : Type) (P : A -> Prop) (x : A), P x -> P x.
+Proof.
+    admit.
+Admitted.
+
+Theorem test2 : forall (A : Type) (P : A -> Prop) (x : A), P x -> P x.
+Proof.
+    intros A P x H.
+    admit.
+Admitted.
+
+Theorem test2nat1 : forall n : nat, n = 0 \/ n <> 0.
+Proof.
+  intros n.
+  destruct n.
+  - admit.
+  - right.
+  discriminate.
+Admitted.
+
+Theorem test2nat2 : forall n : nat, n = 0 \/ n <> 0.
+Proof.
+    intros n.
+    destruct n.
+    {
+        admit.
+    }
+    {
+        admit.
+    }
+Admitted.
+
+Theorem test_thr : forall n:nat, 0 + n = n.
+Proof.
+    intros n. Print plus.
+    admit.
+    (* reflexivity. *)
+Admitted.
+
+Lemma test_thr1 : forall n:nat, 0 + n + 0 = n.
+Proof.
+    intros n. Print plus.
+    admit.
+    (* reflexivity. *)
+Admitted.
diff --git a/src/test/benchmark/dataset/mixed_benchmark.v b/src/test/benchmark/dataset/mixed_benchmark.v
new file mode 100644
index 00000000..b16ff72e
--- /dev/null
+++ b/src/test/benchmark/dataset/mixed_benchmark.v
@@ -0,0 +1,19 @@
+Theorem test_plus_1 : forall n:nat, 1 + n = S n.
+Proof.
+    auto.
+Qed.
+
+Theorem test_thr : forall n:nat, 0 + n = n.
+Proof.
+    admit.
+Admitted.
+
+Theorem add_comm : forall n m:nat, n + m = m + n.
+Proof.
+    admit.
+Admitted.
+
+Theorem add_comm_1 : forall n:nat, n + 1 = 1 + n.
+Proof.
+    admit.
+Admitted.

From b2fe877e5e54c1ed9cf91e14098d2355b71f0510 Mon Sep 17 00:00:00 2001
From: GlebSolovev <lionyoung@mail.ru>
Date: Fri, 12 Apr 2024 06:03:19 +0200
Subject: [PATCH 11/29] Design & implement new benchmarking api

Support switching workspaces, traversing directories and proving theorems
---
 package-lock.json                           | 524 +++++++++++++++++++-
 package.json                                |   7 +-
 src/test/benchmark/benchmark.test.ts        | 216 --------
 src/test/benchmark/benchmark_settings.json  |  13 -
 src/test/benchmark/benchmarkingFramework.ts | 333 +++++++++++++
 src/test/benchmark/loggingUtils.ts          |  37 ++
 src/test/benchmark/presets.ts               |  21 +
 src/test/benchmark/runBenchmark.test.ts     | 127 +++++
 8 files changed, 1021 insertions(+), 257 deletions(-)
 delete mode 100644 src/test/benchmark/benchmark.test.ts
 delete mode 100644 src/test/benchmark/benchmark_settings.json
 create mode 100644 src/test/benchmark/benchmarkingFramework.ts
 create mode 100644 src/test/benchmark/loggingUtils.ts
 create mode 100644 src/test/benchmark/presets.ts
 create mode 100644 src/test/benchmark/runBenchmark.test.ts

diff --git a/package-lock.json b/package-lock.json
index 454f0e37..d22ba0b9 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -44,7 +44,7 @@
         "eslint": "^8.56.0",
         "eslint-config-prettier": "^9.1.0",
         "eslint-plugin-prettier": "^5.1.3",
-        "glob": "^8.1.0",
+        "glob": "^10.3.12",
         "mocha": "^10.2.0",
         "prettier": "^3.2.5",
         "typescript": "^5.3.3"
@@ -619,6 +619,102 @@
       "integrity": "sha512-6EwiSjwWYP7pTckG6I5eyFANjPhmPjUX9JRLUSfNPC7FX7zK9gyZAfUEaECL6ALTpGX5AjnBq3C9XmVWPitNpw==",
       "dev": true
     },
+    "node_modules/@isaacs/cliui": {
+      "version": "8.0.2",
+      "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz",
+      "integrity": "sha512-O8jcjabXaleOG9DQ0+ARXWZBTfnP4WNAqzuiJK7ll44AmxGKv/J2M4TPjxjY3znBCfvBXFzucm1twdyFybFqEA==",
+      "dev": true,
+      "dependencies": {
+        "string-width": "^5.1.2",
+        "string-width-cjs": "npm:string-width@^4.2.0",
+        "strip-ansi": "^7.0.1",
+        "strip-ansi-cjs": "npm:strip-ansi@^6.0.1",
+        "wrap-ansi": "^8.1.0",
+        "wrap-ansi-cjs": "npm:wrap-ansi@^7.0.0"
+      },
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@isaacs/cliui/node_modules/ansi-regex": {
+      "version": "6.0.1",
+      "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.0.1.tgz",
+      "integrity": "sha512-n5M855fKb2SsfMIiFFoVrABHJC8QtHwVx+mHWP3QcEqBHYienj5dHSgjbxtC0WEZXYt4wcD6zrQElDPhFuZgfA==",
+      "dev": true,
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/ansi-regex?sponsor=1"
+      }
+    },
+    "node_modules/@isaacs/cliui/node_modules/ansi-styles": {
+      "version": "6.2.1",
+      "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz",
+      "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==",
+      "dev": true,
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/ansi-styles?sponsor=1"
+      }
+    },
+    "node_modules/@isaacs/cliui/node_modules/emoji-regex": {
+      "version": "9.2.2",
+      "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-9.2.2.tgz",
+      "integrity": "sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==",
+      "dev": true
+    },
+    "node_modules/@isaacs/cliui/node_modules/string-width": {
+      "version": "5.1.2",
+      "resolved": "https://registry.npmjs.org/string-width/-/string-width-5.1.2.tgz",
+      "integrity": "sha512-HnLOCR3vjcY8beoNLtcjZ5/nxn2afmME6lhrDrebokqMap+XbeW8n9TXpPDOqdGK5qcI3oT0GKTW6wC7EMiVqA==",
+      "dev": true,
+      "dependencies": {
+        "eastasianwidth": "^0.2.0",
+        "emoji-regex": "^9.2.2",
+        "strip-ansi": "^7.0.1"
+      },
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/@isaacs/cliui/node_modules/strip-ansi": {
+      "version": "7.1.0",
+      "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.0.tgz",
+      "integrity": "sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==",
+      "dev": true,
+      "dependencies": {
+        "ansi-regex": "^6.0.1"
+      },
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/strip-ansi?sponsor=1"
+      }
+    },
+    "node_modules/@isaacs/cliui/node_modules/wrap-ansi": {
+      "version": "8.1.0",
+      "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-8.1.0.tgz",
+      "integrity": "sha512-si7QWI6zUMq56bESFvagtmzMdGOtoxfR+Sez11Mobfc7tm+VkUckk9bW2UeffTGVUbOksxmSw0AA2gs8g71NCQ==",
+      "dev": true,
+      "dependencies": {
+        "ansi-styles": "^6.1.0",
+        "string-width": "^5.0.1",
+        "strip-ansi": "^7.0.1"
+      },
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/wrap-ansi?sponsor=1"
+      }
+    },
     "node_modules/@jridgewell/gen-mapping": {
       "version": "0.3.5",
       "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.5.tgz",
@@ -723,6 +819,16 @@
         "node": ">= 8"
       }
     },
+    "node_modules/@pkgjs/parseargs": {
+      "version": "0.11.0",
+      "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz",
+      "integrity": "sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==",
+      "dev": true,
+      "optional": true,
+      "engines": {
+        "node": ">=14"
+      }
+    },
     "node_modules/@pkgr/core": {
       "version": "0.1.1",
       "resolved": "https://registry.npmjs.org/@pkgr/core/-/core-0.1.1.tgz",
@@ -1622,6 +1728,12 @@
         "error-stack-parser": "^2.1.4"
       }
     },
+    "node_modules/eastasianwidth": {
+      "version": "0.2.0",
+      "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz",
+      "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==",
+      "dev": true
+    },
     "node_modules/emoji-regex": {
       "version": "8.0.0",
       "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
@@ -2097,6 +2209,22 @@
         }
       }
     },
+    "node_modules/foreground-child": {
+      "version": "3.1.1",
+      "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.1.1.tgz",
+      "integrity": "sha512-TMKDUnIte6bfb5nWv7V/caI169OHgvwjb7V4WkeUvbQQdjr5rWKqHFiKWb/fcOwB+CzBT+qbWjvj+DVwRskpIg==",
+      "dev": true,
+      "dependencies": {
+        "cross-spawn": "^7.0.0",
+        "signal-exit": "^4.0.1"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      }
+    },
     "node_modules/form-data": {
       "version": "3.0.1",
       "resolved": "https://registry.npmjs.org/form-data/-/form-data-3.0.1.tgz",
@@ -2155,18 +2283,22 @@
       }
     },
     "node_modules/glob": {
-      "version": "8.1.0",
-      "resolved": "https://registry.npmjs.org/glob/-/glob-8.1.0.tgz",
-      "integrity": "sha512-r8hpEjiQEYlF2QU0df3dS+nxxSIreXQS1qRhMJM0Q5NDdR386C7jb7Hwwod8Fgiuex+k0GFjgft18yvxm5XoCQ==",
+      "version": "10.3.12",
+      "resolved": "https://registry.npmjs.org/glob/-/glob-10.3.12.tgz",
+      "integrity": "sha512-TCNv8vJ+xz4QiqTpfOJA7HvYv+tNIRHKfUWw/q+v2jdgN4ebz+KY9tGx5J4rHP0o84mNP+ApH66HRX8us3Khqg==",
+      "dev": true,
       "dependencies": {
-        "fs.realpath": "^1.0.0",
-        "inflight": "^1.0.4",
-        "inherits": "2",
-        "minimatch": "^5.0.1",
-        "once": "^1.3.0"
+        "foreground-child": "^3.1.0",
+        "jackspeak": "^2.3.6",
+        "minimatch": "^9.0.1",
+        "minipass": "^7.0.4",
+        "path-scurry": "^1.10.2"
+      },
+      "bin": {
+        "glob": "dist/esm/bin.mjs"
       },
       "engines": {
-        "node": ">=12"
+        "node": ">=16 || 14 >=14.17"
       },
       "funding": {
         "url": "https://github.com/sponsors/isaacs"
@@ -2188,19 +2320,24 @@
       "version": "2.0.1",
       "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
       "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==",
+      "dev": true,
       "dependencies": {
         "balanced-match": "^1.0.0"
       }
     },
     "node_modules/glob/node_modules/minimatch": {
-      "version": "5.1.6",
-      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-5.1.6.tgz",
-      "integrity": "sha512-lKwV/1brpG6mBUFHtb7NUmtABCb2WZZmm2wNiOA5hAb8VdCS4B3dtMWyvcoViccwAW/COERjXLt0zP1zXUN26g==",
+      "version": "9.0.4",
+      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.4.tgz",
+      "integrity": "sha512-KqWh+VchfxcMNRAJjj2tnsSJdNbHsVgnkBhTNrW7AjVo6OvLtxw8zfT9oLw1JSohlFzJ8jCoTgaoXvJ+kHt6fw==",
+      "dev": true,
       "dependencies": {
         "brace-expansion": "^2.0.1"
       },
       "engines": {
-        "node": ">=10"
+        "node": ">=16 || 14 >=14.17"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
       }
     },
     "node_modules/globals": {
@@ -2271,6 +2408,43 @@
         "readable-stream": "^3.6.0"
       }
     },
+    "node_modules/help-me/node_modules/brace-expansion": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
+      "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==",
+      "dependencies": {
+        "balanced-match": "^1.0.0"
+      }
+    },
+    "node_modules/help-me/node_modules/glob": {
+      "version": "8.1.0",
+      "resolved": "https://registry.npmjs.org/glob/-/glob-8.1.0.tgz",
+      "integrity": "sha512-r8hpEjiQEYlF2QU0df3dS+nxxSIreXQS1qRhMJM0Q5NDdR386C7jb7Hwwod8Fgiuex+k0GFjgft18yvxm5XoCQ==",
+      "dependencies": {
+        "fs.realpath": "^1.0.0",
+        "inflight": "^1.0.4",
+        "inherits": "2",
+        "minimatch": "^5.0.1",
+        "once": "^1.3.0"
+      },
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      }
+    },
+    "node_modules/help-me/node_modules/minimatch": {
+      "version": "5.1.6",
+      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-5.1.6.tgz",
+      "integrity": "sha512-lKwV/1brpG6mBUFHtb7NUmtABCb2WZZmm2wNiOA5hAb8VdCS4B3dtMWyvcoViccwAW/COERjXLt0zP1zXUN26g==",
+      "dependencies": {
+        "brace-expansion": "^2.0.1"
+      },
+      "engines": {
+        "node": ">=10"
+      }
+    },
     "node_modules/help-me/node_modules/readable-stream": {
       "version": "3.6.2",
       "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz",
@@ -2497,6 +2671,24 @@
       "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==",
       "dev": true
     },
+    "node_modules/jackspeak": {
+      "version": "2.3.6",
+      "resolved": "https://registry.npmjs.org/jackspeak/-/jackspeak-2.3.6.tgz",
+      "integrity": "sha512-N3yCS/NegsOBokc8GAdM8UcmfsKiSS8cipheD/nivzr700H+nsMOxJjQnvwOcRYVuFkdH0wGUvW2WbXGmrZGbQ==",
+      "dev": true,
+      "dependencies": {
+        "@isaacs/cliui": "^8.0.2"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      },
+      "optionalDependencies": {
+        "@pkgjs/parseargs": "^0.11.0"
+      }
+    },
     "node_modules/javascript-natural-sort": {
       "version": "0.7.1",
       "resolved": "https://registry.npmjs.org/javascript-natural-sort/-/javascript-natural-sort-0.7.1.tgz",
@@ -2726,6 +2918,15 @@
         "url": "https://github.com/sponsors/ljharb"
       }
     },
+    "node_modules/minipass": {
+      "version": "7.0.4",
+      "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.0.4.tgz",
+      "integrity": "sha512-jYofLM5Dam9279rdkWzqHozUo4ybjdZmCsDHePy5V/PbBcVMiSZR97gmAy45aqi8CK1lG2ECd356FU86avfwUQ==",
+      "dev": true,
+      "engines": {
+        "node": ">=16 || 14 >=14.17"
+      }
+    },
     "node_modules/mocha": {
       "version": "10.2.0",
       "resolved": "https://registry.npmjs.org/mocha/-/mocha-10.2.0.tgz",
@@ -5614,6 +5815,31 @@
         "node": ">=8"
       }
     },
+    "node_modules/path-scurry": {
+      "version": "1.10.2",
+      "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-1.10.2.tgz",
+      "integrity": "sha512-7xTavNy5RQXnsjANvVvMkEjvloOinkAjv/Z6Ildz9v2RinZ4SBKTWFOVRbaF8p0vpHnyjV/UwNDdKuUv6M5qcA==",
+      "dev": true,
+      "dependencies": {
+        "lru-cache": "^10.2.0",
+        "minipass": "^5.0.0 || ^6.0.2 || ^7.0.0"
+      },
+      "engines": {
+        "node": ">=16 || 14 >=14.17"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      }
+    },
+    "node_modules/path-scurry/node_modules/lru-cache": {
+      "version": "10.2.0",
+      "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.2.0.tgz",
+      "integrity": "sha512-2bIM8x+VAf6JT4bKAljS1qUWgMsqZRPGJS6FSahIMPVvctcNhyVp7AJu7quxOW9jwkryBReKZY5tY5JYv2n/7Q==",
+      "dev": true,
+      "engines": {
+        "node": "14 || >=16.14"
+      }
+    },
     "node_modules/path-type": {
       "version": "4.0.0",
       "resolved": "https://registry.npmjs.org/path-type/-/path-type-4.0.0.tgz",
@@ -6085,6 +6311,18 @@
         "node": ">=8"
       }
     },
+    "node_modules/signal-exit": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz",
+      "integrity": "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==",
+      "dev": true,
+      "engines": {
+        "node": ">=14"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      }
+    },
     "node_modules/slash": {
       "version": "3.0.0",
       "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz",
@@ -6146,6 +6384,21 @@
         "node": ">=8"
       }
     },
+    "node_modules/string-width-cjs": {
+      "name": "string-width",
+      "version": "4.2.3",
+      "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
+      "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
+      "dev": true,
+      "dependencies": {
+        "emoji-regex": "^8.0.0",
+        "is-fullwidth-code-point": "^3.0.0",
+        "strip-ansi": "^6.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
     "node_modules/strip-ansi": {
       "version": "6.0.1",
       "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
@@ -6157,6 +6410,19 @@
         "node": ">=8"
       }
     },
+    "node_modules/strip-ansi-cjs": {
+      "name": "strip-ansi",
+      "version": "6.0.1",
+      "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
+      "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
+      "dev": true,
+      "dependencies": {
+        "ansi-regex": "^5.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
     "node_modules/strip-json-comments": {
       "version": "3.1.1",
       "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz",
@@ -6459,6 +6725,24 @@
         "url": "https://github.com/chalk/wrap-ansi?sponsor=1"
       }
     },
+    "node_modules/wrap-ansi-cjs": {
+      "name": "wrap-ansi",
+      "version": "7.0.0",
+      "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",
+      "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==",
+      "dev": true,
+      "dependencies": {
+        "ansi-styles": "^4.0.0",
+        "string-width": "^4.1.0",
+        "strip-ansi": "^6.0.0"
+      },
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/wrap-ansi?sponsor=1"
+      }
+    },
     "node_modules/wrappy": {
       "version": "1.0.2",
       "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
@@ -6988,6 +7272,71 @@
       "integrity": "sha512-6EwiSjwWYP7pTckG6I5eyFANjPhmPjUX9JRLUSfNPC7FX7zK9gyZAfUEaECL6ALTpGX5AjnBq3C9XmVWPitNpw==",
       "dev": true
     },
+    "@isaacs/cliui": {
+      "version": "8.0.2",
+      "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz",
+      "integrity": "sha512-O8jcjabXaleOG9DQ0+ARXWZBTfnP4WNAqzuiJK7ll44AmxGKv/J2M4TPjxjY3znBCfvBXFzucm1twdyFybFqEA==",
+      "dev": true,
+      "requires": {
+        "string-width": "^5.1.2",
+        "string-width-cjs": "npm:string-width@^4.2.0",
+        "strip-ansi": "^7.0.1",
+        "strip-ansi-cjs": "npm:strip-ansi@^6.0.1",
+        "wrap-ansi": "^8.1.0",
+        "wrap-ansi-cjs": "npm:wrap-ansi@^7.0.0"
+      },
+      "dependencies": {
+        "ansi-regex": {
+          "version": "6.0.1",
+          "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.0.1.tgz",
+          "integrity": "sha512-n5M855fKb2SsfMIiFFoVrABHJC8QtHwVx+mHWP3QcEqBHYienj5dHSgjbxtC0WEZXYt4wcD6zrQElDPhFuZgfA==",
+          "dev": true
+        },
+        "ansi-styles": {
+          "version": "6.2.1",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz",
+          "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==",
+          "dev": true
+        },
+        "emoji-regex": {
+          "version": "9.2.2",
+          "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-9.2.2.tgz",
+          "integrity": "sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==",
+          "dev": true
+        },
+        "string-width": {
+          "version": "5.1.2",
+          "resolved": "https://registry.npmjs.org/string-width/-/string-width-5.1.2.tgz",
+          "integrity": "sha512-HnLOCR3vjcY8beoNLtcjZ5/nxn2afmME6lhrDrebokqMap+XbeW8n9TXpPDOqdGK5qcI3oT0GKTW6wC7EMiVqA==",
+          "dev": true,
+          "requires": {
+            "eastasianwidth": "^0.2.0",
+            "emoji-regex": "^9.2.2",
+            "strip-ansi": "^7.0.1"
+          }
+        },
+        "strip-ansi": {
+          "version": "7.1.0",
+          "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.0.tgz",
+          "integrity": "sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==",
+          "dev": true,
+          "requires": {
+            "ansi-regex": "^6.0.1"
+          }
+        },
+        "wrap-ansi": {
+          "version": "8.1.0",
+          "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-8.1.0.tgz",
+          "integrity": "sha512-si7QWI6zUMq56bESFvagtmzMdGOtoxfR+Sez11Mobfc7tm+VkUckk9bW2UeffTGVUbOksxmSw0AA2gs8g71NCQ==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^6.1.0",
+            "string-width": "^5.0.1",
+            "strip-ansi": "^7.0.1"
+          }
+        }
+      }
+    },
     "@jridgewell/gen-mapping": {
       "version": "0.3.5",
       "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.5.tgz",
@@ -7074,6 +7423,13 @@
         "fastq": "^1.6.0"
       }
     },
+    "@pkgjs/parseargs": {
+      "version": "0.11.0",
+      "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz",
+      "integrity": "sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==",
+      "dev": true,
+      "optional": true
+    },
     "@pkgr/core": {
       "version": "0.1.1",
       "resolved": "https://registry.npmjs.org/@pkgr/core/-/core-0.1.1.tgz",
@@ -7704,6 +8060,12 @@
         "error-stack-parser": "^2.1.4"
       }
     },
+    "eastasianwidth": {
+      "version": "0.2.0",
+      "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz",
+      "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==",
+      "dev": true
+    },
     "emoji-regex": {
       "version": "8.0.0",
       "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
@@ -8051,6 +8413,16 @@
       "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.5.tgz",
       "integrity": "sha512-vSFWUON1B+yAw1VN4xMfxgn5fTUiaOzAJCKBwIIgT/+7CuGy9+r+5gITvP62j3RmaD5Ph65UaERdOSRGUzZtgw=="
     },
+    "foreground-child": {
+      "version": "3.1.1",
+      "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.1.1.tgz",
+      "integrity": "sha512-TMKDUnIte6bfb5nWv7V/caI169OHgvwjb7V4WkeUvbQQdjr5rWKqHFiKWb/fcOwB+CzBT+qbWjvj+DVwRskpIg==",
+      "dev": true,
+      "requires": {
+        "cross-spawn": "^7.0.0",
+        "signal-exit": "^4.0.1"
+      }
+    },
     "form-data": {
       "version": "3.0.1",
       "resolved": "https://registry.npmjs.org/form-data/-/form-data-3.0.1.tgz",
@@ -8093,29 +8465,32 @@
       "integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg=="
     },
     "glob": {
-      "version": "8.1.0",
-      "resolved": "https://registry.npmjs.org/glob/-/glob-8.1.0.tgz",
-      "integrity": "sha512-r8hpEjiQEYlF2QU0df3dS+nxxSIreXQS1qRhMJM0Q5NDdR386C7jb7Hwwod8Fgiuex+k0GFjgft18yvxm5XoCQ==",
+      "version": "10.3.12",
+      "resolved": "https://registry.npmjs.org/glob/-/glob-10.3.12.tgz",
+      "integrity": "sha512-TCNv8vJ+xz4QiqTpfOJA7HvYv+tNIRHKfUWw/q+v2jdgN4ebz+KY9tGx5J4rHP0o84mNP+ApH66HRX8us3Khqg==",
+      "dev": true,
       "requires": {
-        "fs.realpath": "^1.0.0",
-        "inflight": "^1.0.4",
-        "inherits": "2",
-        "minimatch": "^5.0.1",
-        "once": "^1.3.0"
+        "foreground-child": "^3.1.0",
+        "jackspeak": "^2.3.6",
+        "minimatch": "^9.0.1",
+        "minipass": "^7.0.4",
+        "path-scurry": "^1.10.2"
       },
       "dependencies": {
         "brace-expansion": {
           "version": "2.0.1",
           "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
           "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==",
+          "dev": true,
           "requires": {
             "balanced-match": "^1.0.0"
           }
         },
         "minimatch": {
-          "version": "5.1.6",
-          "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-5.1.6.tgz",
-          "integrity": "sha512-lKwV/1brpG6mBUFHtb7NUmtABCb2WZZmm2wNiOA5hAb8VdCS4B3dtMWyvcoViccwAW/COERjXLt0zP1zXUN26g==",
+          "version": "9.0.4",
+          "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.4.tgz",
+          "integrity": "sha512-KqWh+VchfxcMNRAJjj2tnsSJdNbHsVgnkBhTNrW7AjVo6OvLtxw8zfT9oLw1JSohlFzJ8jCoTgaoXvJ+kHt6fw==",
+          "dev": true,
           "requires": {
             "brace-expansion": "^2.0.1"
           }
@@ -8181,6 +8556,34 @@
         "readable-stream": "^3.6.0"
       },
       "dependencies": {
+        "brace-expansion": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
+          "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==",
+          "requires": {
+            "balanced-match": "^1.0.0"
+          }
+        },
+        "glob": {
+          "version": "8.1.0",
+          "resolved": "https://registry.npmjs.org/glob/-/glob-8.1.0.tgz",
+          "integrity": "sha512-r8hpEjiQEYlF2QU0df3dS+nxxSIreXQS1qRhMJM0Q5NDdR386C7jb7Hwwod8Fgiuex+k0GFjgft18yvxm5XoCQ==",
+          "requires": {
+            "fs.realpath": "^1.0.0",
+            "inflight": "^1.0.4",
+            "inherits": "2",
+            "minimatch": "^5.0.1",
+            "once": "^1.3.0"
+          }
+        },
+        "minimatch": {
+          "version": "5.1.6",
+          "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-5.1.6.tgz",
+          "integrity": "sha512-lKwV/1brpG6mBUFHtb7NUmtABCb2WZZmm2wNiOA5hAb8VdCS4B3dtMWyvcoViccwAW/COERjXLt0zP1zXUN26g==",
+          "requires": {
+            "brace-expansion": "^2.0.1"
+          }
+        },
         "readable-stream": {
           "version": "3.6.2",
           "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz",
@@ -8344,6 +8747,16 @@
       "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==",
       "dev": true
     },
+    "jackspeak": {
+      "version": "2.3.6",
+      "resolved": "https://registry.npmjs.org/jackspeak/-/jackspeak-2.3.6.tgz",
+      "integrity": "sha512-N3yCS/NegsOBokc8GAdM8UcmfsKiSS8cipheD/nivzr700H+nsMOxJjQnvwOcRYVuFkdH0wGUvW2WbXGmrZGbQ==",
+      "dev": true,
+      "requires": {
+        "@isaacs/cliui": "^8.0.2",
+        "@pkgjs/parseargs": "^0.11.0"
+      }
+    },
     "javascript-natural-sort": {
       "version": "0.7.1",
       "resolved": "https://registry.npmjs.org/javascript-natural-sort/-/javascript-natural-sort-0.7.1.tgz",
@@ -8525,6 +8938,12 @@
       "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz",
       "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA=="
     },
+    "minipass": {
+      "version": "7.0.4",
+      "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.0.4.tgz",
+      "integrity": "sha512-jYofLM5Dam9279rdkWzqHozUo4ybjdZmCsDHePy5V/PbBcVMiSZR97gmAy45aqi8CK1lG2ECd356FU86avfwUQ==",
+      "dev": true
+    },
     "mocha": {
       "version": "10.2.0",
       "resolved": "https://registry.npmjs.org/mocha/-/mocha-10.2.0.tgz",
@@ -10420,6 +10839,24 @@
       "integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==",
       "dev": true
     },
+    "path-scurry": {
+      "version": "1.10.2",
+      "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-1.10.2.tgz",
+      "integrity": "sha512-7xTavNy5RQXnsjANvVvMkEjvloOinkAjv/Z6Ildz9v2RinZ4SBKTWFOVRbaF8p0vpHnyjV/UwNDdKuUv6M5qcA==",
+      "dev": true,
+      "requires": {
+        "lru-cache": "^10.2.0",
+        "minipass": "^5.0.0 || ^6.0.2 || ^7.0.0"
+      },
+      "dependencies": {
+        "lru-cache": {
+          "version": "10.2.0",
+          "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.2.0.tgz",
+          "integrity": "sha512-2bIM8x+VAf6JT4bKAljS1qUWgMsqZRPGJS6FSahIMPVvctcNhyVp7AJu7quxOW9jwkryBReKZY5tY5JYv2n/7Q==",
+          "dev": true
+        }
+      }
+    },
     "path-type": {
       "version": "4.0.0",
       "resolved": "https://registry.npmjs.org/path-type/-/path-type-4.0.0.tgz",
@@ -10753,6 +11190,12 @@
       "integrity": "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==",
       "dev": true
     },
+    "signal-exit": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz",
+      "integrity": "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==",
+      "dev": true
+    },
     "slash": {
       "version": "3.0.0",
       "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz",
@@ -10802,6 +11245,17 @@
         "strip-ansi": "^6.0.1"
       }
     },
+    "string-width-cjs": {
+      "version": "npm:string-width@4.2.3",
+      "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
+      "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
+      "dev": true,
+      "requires": {
+        "emoji-regex": "^8.0.0",
+        "is-fullwidth-code-point": "^3.0.0",
+        "strip-ansi": "^6.0.1"
+      }
+    },
     "strip-ansi": {
       "version": "6.0.1",
       "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
@@ -10810,6 +11264,15 @@
         "ansi-regex": "^5.0.1"
       }
     },
+    "strip-ansi-cjs": {
+      "version": "npm:strip-ansi@6.0.1",
+      "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
+      "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
+      "dev": true,
+      "requires": {
+        "ansi-regex": "^5.0.1"
+      }
+    },
     "strip-json-comments": {
       "version": "3.1.1",
       "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz",
@@ -11051,6 +11514,17 @@
         "strip-ansi": "^6.0.0"
       }
     },
+    "wrap-ansi-cjs": {
+      "version": "npm:wrap-ansi@7.0.0",
+      "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",
+      "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==",
+      "dev": true,
+      "requires": {
+        "ansi-styles": "^4.0.0",
+        "string-width": "^4.1.0",
+        "strip-ansi": "^6.0.0"
+      }
+    },
     "wrappy": {
       "version": "1.0.2",
       "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
diff --git a/package.json b/package.json
index d66b00bd..8c7d4fd3 100644
--- a/package.json
+++ b/package.json
@@ -353,8 +353,9 @@
     "pretest": "npm run compile && npm run lint",
     "test": "node ./out/test/runTest.js",
     "test-ci": "npm test -- -g=\"--non-ci\" -i=true",
-    "benchmark": "npm run test -- -g='Benchmark Test'",
-    "clean": "rm -rf out"
+    "clean": "rm -rf out",
+    "prebenchmark": "npm run clean",
+    "benchmark": "npm run test -- -g='Benchmark'"
   },
   "devDependencies": {
     "@trivago/prettier-plugin-sort-imports": "^4.3.0",
@@ -373,7 +374,7 @@
     "eslint": "^8.56.0",
     "eslint-config-prettier": "^9.1.0",
     "eslint-plugin-prettier": "^5.1.3",
-    "glob": "^8.1.0",
+    "glob": "^10.3.12",
     "mocha": "^10.2.0",
     "prettier": "^3.2.5",
     "typescript": "^5.3.3"
diff --git a/src/test/benchmark/benchmark.test.ts b/src/test/benchmark/benchmark.test.ts
deleted file mode 100644
index 494de04a..00000000
--- a/src/test/benchmark/benchmark.test.ts
+++ /dev/null
@@ -1,216 +0,0 @@
-import * as assert from "assert";
-import * as path from "path";
-
-import { LLMServices } from "../../llm/llmServices";
-import { GrazieService } from "../../llm/llmServices/grazie/grazieService";
-import { LMStudioService } from "../../llm/llmServices/lmStudio/lmStudioService";
-import { OpenAiService } from "../../llm/llmServices/openai/openAiService";
-import { PredefinedProofsService } from "../../llm/llmServices/predefinedProofs/predefinedProofsService";
-import { UserModelsParams } from "../../llm/userModelParams";
-
-import { CoqLspClient } from "../../coqLsp/coqLspClient";
-import { CoqLspConfig } from "../../coqLsp/coqLspConfig";
-
-import {
-    CompletionContext,
-    FailureGenerationResult,
-    FailureGenerationStatus,
-    ProcessEnvironment,
-    SourceFileEnvironment,
-    SuccessGenerationResult,
-    generateCompletion,
-} from "../../core/completionGenerator";
-import { CoqProofChecker } from "../../core/coqProofChecker";
-import { inspectSourceFile } from "../../core/inspectSourceFile";
-
-import { ProofStep } from "../../coqParser/parsedTypes";
-import { Uri } from "../../utils/uri";
-
-suite("Benchmark Test", () => {
-    const benchmarkDir = getBenchmarkDir();
-
-    // const settingsPath = path.join(benchmarkDir, "benchmark_settings.json");
-    const datasetDir = path.join(benchmarkDir, "dataset");
-
-    test("Complete with `auto`", async () => {
-        // const filePath = path.join(datasetDir, "auto_benchmark.v");
-        const filePath = path.join(datasetDir, "CombRelations.v");
-        const modelsParams: UserModelsParams = {
-            openAiParams: [],
-            grazieParams: [],
-            predefinedProofsModelParams: [
-                {
-                    modelName: "test",
-                    choices: undefined,
-
-                    systemPrompt: undefined,
-
-                    newMessageMaxTokens: undefined,
-                    tokensLimit: undefined,
-
-                    multiroundProfile: undefined,
-                    tactics: ["auto."],
-                },
-            ],
-            lmStudioParams: [],
-        };
-        await runTestBenchmark(filePath, modelsParams);
-    }).timeout(50000);
-
-    // test("Complete with models from settings", async () => {
-    //     const modelsParams: ModelsParams = parseBenchmarkSettings(settingsPath);
-    //     const sourcePath = path.join(datasetDir, "mixed_benchmark.v");
-    //     await runTestBenchmark(sourcePath, modelsParams);
-    // }).timeout(50000);
-});
-
-function getBenchmarkDir(): string {
-    const dirname: string = path.join(__dirname, "../../../");
-    return path.join(dirname, "src", "test", "benchmark");
-}
-
-async function runTestBenchmark(
-    filePath: string,
-    modelsParams: UserModelsParams
-): Promise<void> {
-    console.log(`run benchmarks for file: ${filePath}\n`);
-    const shouldCompleteHole = (_hole: ProofStep) => true;
-
-    const [completionContexts, sourceFileEnvironment, processEnvironment] =
-        await prepareForCompletions(modelsParams, shouldCompleteHole, filePath);
-
-    const totalCompletionsNumber = completionContexts.length;
-    let successfulCompletionsNumber = 0;
-    for (const completionContext of completionContexts) {
-        const success = await performSingleCompletion(
-            completionContext,
-            sourceFileEnvironment,
-            processEnvironment
-        );
-        if (success) {
-            successfulCompletionsNumber += 1;
-        }
-    }
-    console.log(
-        `BENCHMARK RESULT: ${successfulCompletionsNumber} / ${totalCompletionsNumber}`
-    );
-    assert.ok(successfulCompletionsNumber === totalCompletionsNumber);
-}
-
-async function prepareForCompletions(
-    modelsParams: UserModelsParams,
-    shouldCompleteHole: (hole: ProofStep) => boolean,
-    filePath: string
-): Promise<[CompletionContext[], SourceFileEnvironment, ProcessEnvironment]> {
-    const fileUri = Uri.fromPath(filePath);
-    const coqLspServerConfig = CoqLspConfig.createServerConfig();
-    const coqLspClientConfig = CoqLspConfig.createClientConfig();
-
-    const client = new CoqLspClient(coqLspServerConfig, coqLspClientConfig);
-    await client.openTextDocument(fileUri);
-
-    const coqProofChecker = new CoqProofChecker(client);
-    const mockFileVersion = 1;
-    const [completionContexts, sourceFileEnvironment] = await inspectSourceFile(
-        mockFileVersion,
-        shouldCompleteHole,
-        fileUri,
-        client
-    );
-    const llmServices: LLMServices = {
-        openAiService: new OpenAiService(),
-        grazieService: new GrazieService(),
-        predefinedProofsService: new PredefinedProofsService(),
-        lmStudioService: new LMStudioService(),
-    };
-    const processEnvironment: ProcessEnvironment = {
-        coqProofChecker: coqProofChecker,
-        modelsParams: modelsParams,
-        services: llmServices,
-    };
-
-    return [completionContexts, sourceFileEnvironment, processEnvironment];
-}
-
-async function performSingleCompletion(
-    completionContext: CompletionContext,
-    sourceFileEnvironment: SourceFileEnvironment,
-    processEnvironment: ProcessEnvironment
-): Promise<boolean> {
-    const completionPosition = completionContext.admitEndPosition;
-    console.log(
-        `hole position: ${completionPosition.line}:${completionPosition.character}`
-    );
-    const result = await generateCompletion(
-        completionContext,
-        sourceFileEnvironment,
-        processEnvironment
-    );
-    let message = "unknown";
-    let success = false;
-    if (result instanceof SuccessGenerationResult) {
-        message = `success: ${result.data}`;
-        success = true;
-    } else if (result instanceof FailureGenerationResult) {
-        switch (result.status) {
-            case FailureGenerationStatus.excededTimeout:
-                message = "timeout";
-                break;
-            case FailureGenerationStatus.exception:
-                message = `exception: ${result.message}`;
-                break;
-            case FailureGenerationStatus.searchFailed:
-                message = "no proofs for admit";
-                break;
-        }
-    }
-    console.log(message);
-    console.log("");
-    return success;
-}
-
-// function parseBenchmarkSettings(settingsPath: string): ModelsParams {
-//     const settingsText = fs.readFileSync(settingsPath, "utf-8");
-//     const modelsParams = JSON.parse(settingsText);
-
-//     const ajv = new Ajv2019({ strictSchema: true });
-
-//     const openAiParams: OpenAiModelParams[] = modelsParams[
-//         "coqpilot.openAiModelsParameters"
-//     ].map((params: any) =>
-//         validateAndParseJson(params, openAiModelParamsSchema, ajv)
-//     );
-//     const grazieParams: GrazieModelParams[] = modelsParams[
-//         "coqpilot.grazieModelsParameters"
-//     ].map((params: any) =>
-//         validateAndParseJson(params, grazieModelParamsSchema, ajv)
-//     );
-//     const predefinedProofsParams: PredefinedProofsModelParams[] = modelsParams[
-//         "coqpilot.predefinedProofsModelsParameters"
-//     ].map((params: any) =>
-//         validateAndParseJson(params, predefinedProofsModelParamsSchema, ajv)
-//     );
-
-//     return {
-//         openAiParams: openAiParams,
-//         grazieParams: grazieParams,
-//         predefinedProofsModelParams: predefinedProofsParams,
-//     };
-// }
-
-// function validateAndParseJson<T>(
-//     json: any,
-//     targetClassSchema: JSONSchemaType<T>,
-//     jsonSchemaValidator: Ajv2019
-// ): T {
-//     const instance: T = json as T;
-//     console.dir(instance, { depth: null });
-//     const validate = jsonSchemaValidator.compile(targetClassSchema);
-//     if (!validate(instance)) {
-//         throw new Error(
-//             `Unable to validate json against the class: ${JSON.stringify(validate.errors)}`
-//         );
-//     }
-
-//     return instance;
-// }
diff --git a/src/test/benchmark/benchmark_settings.json b/src/test/benchmark/benchmark_settings.json
deleted file mode 100644
index 59727b36..00000000
--- a/src/test/benchmark/benchmark_settings.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-    "coqpilot.openAiModelsParameters": [
-        
-    ],
-    "coqpilot.predefinedProofsModelsParameters": [
-        {
-            "tactics": [
-                "auto."
-            ]
-        }
-    ],
-    "coqpilot.grazieModelsParameters": []
-}
diff --git a/src/test/benchmark/benchmarkingFramework.ts b/src/test/benchmark/benchmarkingFramework.ts
new file mode 100644
index 00000000..12699c6a
--- /dev/null
+++ b/src/test/benchmark/benchmarkingFramework.ts
@@ -0,0 +1,333 @@
+import * as assert from "assert";
+
+import { LLMServices } from "../../llm/llmServices";
+import { GrazieService } from "../../llm/llmServices/grazie/grazieService";
+import { LMStudioService } from "../../llm/llmServices/lmStudio/lmStudioService";
+import { OpenAiService } from "../../llm/llmServices/openai/openAiService";
+import { PredefinedProofsService } from "../../llm/llmServices/predefinedProofs/predefinedProofsService";
+import { UserModelsParams } from "../../llm/userModelParams";
+
+import { CoqLspClient } from "../../coqLsp/coqLspClient";
+import { CoqLspConfig } from "../../coqLsp/coqLspConfig";
+import { Goal, PpString } from "../../coqLsp/coqLspTypes";
+
+import {
+    CompletionContext,
+    FailureGenerationResult,
+    FailureGenerationStatus,
+    ProcessEnvironment,
+    SourceFileEnvironment,
+    SuccessGenerationResult,
+    generateCompletion,
+} from "../../core/completionGenerator";
+import { CoqProofChecker } from "../../core/coqProofChecker";
+import { createSourceFileEnvironment } from "../../core/inspectSourceFile";
+
+import { ProofStep, Theorem } from "../../coqParser/parsedTypes";
+import { Uri } from "../../utils/uri";
+
+import { consoleLog, consoleLogLine } from "./loggingUtils";
+
+export async function runTestBenchmark(
+    filePath: string,
+    modelsParams: UserModelsParams,
+    workspaceRootPath?: string,
+    requireAllAdmitsCompleted: Boolean = false
+): Promise<BenchmarkReport> {
+    consoleLog(`run benchmarks for file: ${filePath}\n`, "blue");
+    const shouldCompleteHole = (_hole: ProofStep) => true;
+
+    const [completionTargets, sourceFileEnvironment, processEnvironment] =
+        await prepareForBenchmarkCompletions(
+            modelsParams,
+            shouldCompleteHole,
+            workspaceRootPath,
+            filePath
+        );
+
+    consoleLogLine("\n");
+    console.log("try to complete admits\n");
+    const admitTargetsResults = await benchmarkTargets(
+        completionTargets.admitTargets,
+        sourceFileEnvironment,
+        processEnvironment
+    );
+    console.log(`BENCHMARK RESULT, ADMITS COMPLETED: ${admitTargetsResults}\n`);
+    consoleLogLine("\n");
+
+    console.log("try to prove theorems\n");
+    const theoremTargetsResults = await benchmarkTargets(
+        completionTargets.theoremTargets,
+        sourceFileEnvironment,
+        processEnvironment
+    );
+    console.log(
+        `BENCHMARK RESULT, THEOREMS PROVED: ${theoremTargetsResults}\n`
+    );
+    consoleLogLine();
+
+    if (requireAllAdmitsCompleted) {
+        assert.ok(admitTargetsResults.allCompleted());
+    }
+
+    return {
+        admitsCompleted: admitTargetsResults,
+        theoremsProved: theoremTargetsResults,
+    };
+}
+
+export interface BenchmarkingCompletionContext extends CompletionContext {
+    parentTheorem: Theorem;
+}
+
+export interface BenchmarkingCompletionTargets {
+    admitTargets: BenchmarkingCompletionContext[];
+    theoremTargets: BenchmarkingCompletionContext[];
+}
+
+export class BenchmarkResult {
+    constructor(
+        public totalCompletionsNumber: number,
+        public successfulCompletionsNumber: number
+    ) {}
+
+    toString = (): string => {
+        return `${this.successfulCompletionsNumber} / ${this.totalCompletionsNumber}`;
+    };
+
+    allCompleted(): Boolean {
+        return this.totalCompletionsNumber === this.successfulCompletionsNumber;
+    }
+
+    add(other: BenchmarkResult) {
+        this.totalCompletionsNumber += other.totalCompletionsNumber;
+        this.successfulCompletionsNumber += other.successfulCompletionsNumber;
+    }
+}
+
+export interface BenchmarkReport {
+    admitsCompleted: BenchmarkResult;
+    theoremsProved: BenchmarkResult;
+}
+
+export async function benchmarkTargets(
+    targets: BenchmarkingCompletionContext[],
+    sourceFileEnvironment: SourceFileEnvironment,
+    processEnvironment: ProcessEnvironment
+): Promise<BenchmarkResult> {
+    const totalCompletionsNumber = targets.length;
+    let successfulCompletionsNumber = 0;
+    for (const completionContext of targets) {
+        const success = await benchmarkCompletionGeneration(
+            completionContext,
+            sourceFileEnvironment,
+            processEnvironment
+        );
+        if (success) {
+            successfulCompletionsNumber += 1;
+        }
+    }
+    return new BenchmarkResult(
+        totalCompletionsNumber,
+        successfulCompletionsNumber
+    );
+}
+
+async function benchmarkCompletionGeneration(
+    completionContext: BenchmarkingCompletionContext,
+    sourceFileEnvironment: SourceFileEnvironment,
+    processEnvironment: ProcessEnvironment
+): Promise<boolean> {
+    const completionPosition = completionContext.admitEndPosition;
+    console.log(
+        `Completion position: ${completionPosition.line}:${completionPosition.character}`
+    );
+    console.log(`Theorem name: \`${completionContext.parentTheorem.name}\``);
+    console.log(`Proof goal: \`${goalToString(completionContext.proofGoal)}\``);
+
+    const sourceFileEnvironmentWithFilteredContext: SourceFileEnvironment = {
+        ...sourceFileEnvironment,
+        fileTheorems: sourceFileEnvironment.fileTheorems.filter(
+            (thr) => completionContext.parentTheorem !== thr
+        ),
+    };
+
+    const result = await generateCompletion(
+        completionContext,
+        sourceFileEnvironmentWithFilteredContext,
+        processEnvironment
+    );
+    let message = "unknown";
+    let success = false;
+    if (result instanceof SuccessGenerationResult) {
+        message = `Success: ${result.data}`;
+        success = true;
+    } else if (result instanceof FailureGenerationResult) {
+        switch (result.status) {
+            case FailureGenerationStatus.excededTimeout:
+                message = "Timeout";
+                break;
+            case FailureGenerationStatus.exception:
+                message = `Exception: ${result.message}`;
+                break;
+            case FailureGenerationStatus.searchFailed:
+                message = "Proofs not found";
+                break;
+        }
+    }
+    consoleLog(message, success ? "green" : "red");
+    console.log("");
+    return success;
+}
+
+function goalToString(proofGoal: Goal<PpString>): string {
+    return `${proofGoal?.ty}`;
+}
+
+async function prepareForBenchmarkCompletions(
+    modelsParams: UserModelsParams,
+    shouldCompleteHole: (hole: ProofStep) => boolean,
+    workspaceRootPath: string | undefined,
+    filePath: string
+): Promise<
+    [BenchmarkingCompletionTargets, SourceFileEnvironment, ProcessEnvironment]
+> {
+    const fileUri = Uri.fromPath(filePath);
+
+    const client = createCoqLspClient(workspaceRootPath);
+    await client.openTextDocument(fileUri);
+
+    const coqProofChecker = new CoqProofChecker(client);
+    const mockFileVersion = 1;
+    const [completionTargets, sourceFileEnvironment] =
+        await extractCompletionTargets(
+            mockFileVersion,
+            shouldCompleteHole,
+            fileUri,
+            client
+        );
+    const llmServices: LLMServices = {
+        openAiService: new OpenAiService(),
+        grazieService: new GrazieService(),
+        predefinedProofsService: new PredefinedProofsService(),
+        lmStudioService: new LMStudioService(),
+    };
+    const processEnvironment: ProcessEnvironment = {
+        coqProofChecker: coqProofChecker,
+        modelsParams: modelsParams,
+        services: llmServices,
+    };
+
+    return [completionTargets, sourceFileEnvironment, processEnvironment];
+}
+
+function createCoqLspClient(workspaceRootPath?: string): CoqLspClient {
+    const coqLspServerConfig = CoqLspConfig.createServerConfig();
+    const coqLspClientConfig = CoqLspConfig.createClientConfig(
+        process.env.COQ_LSP_PATH || "coq-lsp",
+        workspaceRootPath
+    );
+    return new CoqLspClient(coqLspServerConfig, coqLspClientConfig);
+}
+
+async function extractCompletionTargets(
+    fileVersion: number,
+    shouldCompleteHole: (hole: ProofStep) => boolean,
+    fileUri: Uri,
+    client: CoqLspClient
+): Promise<[BenchmarkingCompletionTargets, SourceFileEnvironment]> {
+    const sourceFileEnvironment = await createSourceFileEnvironment(
+        fileVersion,
+        fileUri,
+        client
+    );
+    const completionTargets = await createCompletionTargets(
+        fileVersion,
+        shouldCompleteHole,
+        sourceFileEnvironment.fileTheorems,
+        fileUri,
+        client
+    );
+    const sourceFileEnvironmentWithCompleteProofs: SourceFileEnvironment = {
+        ...sourceFileEnvironment,
+        fileTheorems: sourceFileEnvironment.fileTheorems.filter(
+            (thr) => thr.proof && !thr.proof.is_incomplete
+        ),
+    };
+
+    return [completionTargets, sourceFileEnvironmentWithCompleteProofs];
+}
+
+interface ParentedProofStep {
+    parentTheorem: Theorem;
+    proofStep: ProofStep;
+}
+
+async function createCompletionTargets(
+    fileVersion: number,
+    shouldCompleteHole: (hole: ProofStep) => boolean,
+    fileTheorems: Theorem[],
+    fileUri: Uri,
+    client: CoqLspClient
+): Promise<BenchmarkingCompletionTargets> {
+    const theoremsWithProofs = fileTheorems.filter((thr) => thr.proof);
+    const admitHolesToComplete = theoremsWithProofs
+        .map((thr) =>
+            thr.proof!.holes.map((hole) => {
+                return {
+                    parentTheorem: thr,
+                    proofStep: hole,
+                };
+            })
+        )
+        .flat()
+        .filter((parentedProofStep) =>
+            shouldCompleteHole(parentedProofStep.proofStep)
+        );
+    const firstProofSteps = theoremsWithProofs.map((thr) => {
+        return {
+            parentTheorem: thr,
+            proofStep: thr.proof!.proof_steps[1],
+        };
+    });
+
+    return {
+        admitTargets: await resolveProofStepsToCompletionContexts(
+            admitHolesToComplete,
+            fileVersion,
+            fileUri,
+            client
+        ),
+        theoremTargets: await resolveProofStepsToCompletionContexts(
+            firstProofSteps,
+            fileVersion,
+            fileUri,
+            client
+        ),
+    };
+}
+
+async function resolveProofStepsToCompletionContexts(
+    parentedProofSteps: ParentedProofStep[],
+    fileVersion: number,
+    fileUri: Uri,
+    client: CoqLspClient
+): Promise<BenchmarkingCompletionContext[]> {
+    let completionContexts: BenchmarkingCompletionContext[] = [];
+    for (const parentedProofStep of parentedProofSteps) {
+        const goal = await client.getFirstGoalAtPoint(
+            parentedProofStep.proofStep.range.start,
+            fileUri,
+            fileVersion
+        );
+        if (!(goal instanceof Error)) {
+            completionContexts.push({
+                proofGoal: goal,
+                prefixEndPosition: parentedProofStep.proofStep.range.start,
+                admitEndPosition: parentedProofStep.proofStep.range.end,
+                parentTheorem: parentedProofStep.parentTheorem,
+            });
+        }
+    }
+    return completionContexts;
+}
diff --git a/src/test/benchmark/loggingUtils.ts b/src/test/benchmark/loggingUtils.ts
new file mode 100644
index 00000000..91b28fa7
--- /dev/null
+++ b/src/test/benchmark/loggingUtils.ts
@@ -0,0 +1,37 @@
+export type LogColor = "red" | "green" | "blue" | "magenta" | "reset";
+
+export function consoleLog(
+    message: string,
+    color: LogColor | undefined = undefined
+) {
+    if (!color) {
+        console.log(message);
+        return;
+    }
+    const resetCode = code("reset");
+    const colorCode = code(color);
+    console.log(`${colorCode}%s${resetCode}`, message);
+}
+
+export function code(color: LogColor): string {
+    if (color === "reset") {
+        return "\x1b[0m";
+    }
+    if (color === "red") {
+        return "\x1b[31m";
+    }
+    if (color === "green") {
+        return "\x1b[32m";
+    }
+    if (color === "blue") {
+        return "\x1b[34m";
+    }
+    if (color === "magenta") {
+        return "\x1b[35m";
+    }
+    throw Error(`unknown LogColor: ${color}`);
+}
+
+export function consoleLogLine(suffix: string = "") {
+    console.log(`----------------------------${suffix}`);
+}
diff --git a/src/test/benchmark/presets.ts b/src/test/benchmark/presets.ts
new file mode 100644
index 00000000..6d7bd1bc
--- /dev/null
+++ b/src/test/benchmark/presets.ts
@@ -0,0 +1,21 @@
+import { UserModelsParams } from "../../llm/userModelParams";
+
+export const onlyAutoModelsParams: UserModelsParams = {
+    openAiParams: [],
+    grazieParams: [],
+    predefinedProofsModelParams: [
+        {
+            modelName: "Predefined `auto`",
+            choices: undefined,
+
+            systemPrompt: undefined,
+
+            newMessageMaxTokens: undefined,
+            tokensLimit: undefined,
+
+            multiroundProfile: undefined,
+            tactics: ["auto."],
+        },
+    ],
+    lmStudioParams: [],
+};
diff --git a/src/test/benchmark/runBenchmark.test.ts b/src/test/benchmark/runBenchmark.test.ts
new file mode 100644
index 00000000..9d11497b
--- /dev/null
+++ b/src/test/benchmark/runBenchmark.test.ts
@@ -0,0 +1,127 @@
+import * as fs from "fs";
+import * as path from "path";
+
+import { UserModelsParams } from "../../llm/userModelParams";
+
+import { BenchmarkResult, runTestBenchmark } from "./benchmarkingFramework";
+import { code, consoleLogLine } from "./loggingUtils";
+import { onlyAutoModelsParams } from "./presets";
+
+interface Benchmark {
+    name: string;
+    items: DatasetItem[];
+    modelsParams: UserModelsParams;
+    requireAllAdmitsCompleted: Boolean;
+    timeoutMinutes: number;
+}
+
+class DatasetItem {
+    workspaceRootPath: string | undefined;
+
+    /* Paths should be relative to 'dataset' folder */
+    constructor(
+        public path: string,
+        workspaceRootPath: string | undefined = undefined
+    ) {
+        this.workspaceRootPath = workspaceRootPath;
+    }
+}
+
+const simpleAutoBenchmark: Benchmark = {
+    name: "Complete simple examples with `auto`",
+    items: [new DatasetItem("auto_benchmark.v")],
+    modelsParams: onlyAutoModelsParams,
+    requireAllAdmitsCompleted: true,
+    timeoutMinutes: 1,
+};
+
+const mixedAutoBenchmark: Benchmark = {
+    name: "Complete mixed examples (both simple & hard) with `auto`",
+    items: [new DatasetItem("mixed_benchmark.v")],
+    modelsParams: onlyAutoModelsParams,
+    requireAllAdmitsCompleted: false,
+    timeoutMinutes: 1,
+};
+
+const benchmarks: Benchmark[] = [simpleAutoBenchmark, mixedAutoBenchmark];
+
+suite("Benchmark", () => {
+    const benchmarkDir = getBenchmarkDir();
+    const datasetDir = path.join(benchmarkDir, "dataset");
+
+    for (const benchmark of benchmarks) {
+        test(benchmark.name, async () => {
+            const admitsCompletedInTotal = new BenchmarkResult(0, 0);
+            const theoremsProvedInTotal = new BenchmarkResult(0, 0);
+
+            for (const item of benchmark.items) {
+                const resolvedWorkspaceRootPath = item.workspaceRootPath
+                    ? path.join(datasetDir, item.workspaceRootPath)
+                    : undefined;
+
+                const resolvedItemPath = path.join(datasetDir, item.path);
+                const itemPathStats = getPathStats(resolvedItemPath);
+                if (!itemPathStats.isDirectory() && !itemPathStats.isFile()) {
+                    throw Error(`unsupported path type: ${item.path}`);
+                }
+                const resolvedFilePaths = itemPathStats.isDirectory()
+                    ? findSourceFiles(resolvedItemPath)
+                    : [resolvedItemPath];
+
+                for (const resolvedFilePath of resolvedFilePaths) {
+                    const { admitsCompleted, theoremsProved } =
+                        await runTestBenchmark(
+                            resolvedFilePath,
+                            benchmark.modelsParams,
+                            resolvedWorkspaceRootPath,
+                            benchmark.requireAllAdmitsCompleted
+                        );
+                    admitsCompletedInTotal.add(admitsCompleted);
+                    theoremsProvedInTotal.add(theoremsProved);
+                }
+            }
+
+            consoleLogLine();
+            consoleLogLine("\n");
+            console.log(
+                `${code("magenta")}BENCHMARK REPORT:${code("reset")} ${benchmark.name}`
+            );
+            console.log(
+                `- ADMITS COMPLETED IN TOTAL: ${admitsCompletedInTotal}`
+            );
+            console.log(
+                `- THEOREMS PROVED IN TOTAL: ${theoremsProvedInTotal}\n`
+            );
+        }).timeout(benchmark.timeoutMinutes * 60 * 1000);
+    }
+});
+
+function getBenchmarkDir(): string {
+    const dirname: string = path.join(__dirname, "../../../");
+    return path.join(dirname, "src", "test", "benchmark");
+}
+
+function getPathStats(path: string): fs.Stats {
+    return fs.lstatSync(path);
+}
+
+function findSourceFiles(directoryPath: string): string[] {
+    let sourceFilePaths: string[] = [];
+    function traverseDirectory(curDirectoryPath: string) {
+        fs.readdirSync(curDirectoryPath).forEach((child) => {
+            const childPath = path.join(curDirectoryPath, child);
+            if (fs.statSync(childPath).isDirectory()) {
+                traverseDirectory(childPath);
+            } else {
+                if (
+                    path.extname(childPath) === ".v" &&
+                    !childPath.endsWith("_cp_aux.v")
+                ) {
+                    sourceFilePaths.push(childPath);
+                }
+            }
+        });
+    }
+    traverseDirectory(directoryPath);
+    return sourceFilePaths;
+}

From 72def287187ab198540e6ffd744014dd3bf4de24 Mon Sep 17 00:00:00 2001
From: GlebSolovev <lionyoung@mail.ru>
Date: Tue, 16 Apr 2024 01:52:39 +0200
Subject: [PATCH 12/29] Move dataset directory

---
 .gitignore                                          |  5 ++++-
 .../benchmark/dataset => dataset}/auto_benchmark.v  |  0
 .../benchmark/dataset => dataset}/mixed_benchmark.v |  0
 src/test/benchmark/benchmark_settings.json          | 13 -------------
 src/test/benchmark/runBenchmark.test.ts             |  7 +++----
 5 files changed, 7 insertions(+), 18 deletions(-)
 rename {src/test/benchmark/dataset => dataset}/auto_benchmark.v (100%)
 rename {src/test/benchmark/dataset => dataset}/mixed_benchmark.v (100%)
 delete mode 100644 src/test/benchmark/benchmark_settings.json

diff --git a/.gitignore b/.gitignore
index 52869655..0c0b61c7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,4 +15,7 @@ src/test/resources/coqProj/.vscode
 **/*.aux
 src/test/resources/coqProj/.Makefile.coq.d
 src/test/resources/coqProj/Makefile.coq
-src/test/resources/coqProj/Makefile.coq.conf
\ No newline at end of file
+src/test/resources/coqProj/Makefile.coq.conf
+
+# Benchmark's dataset
+dataset/externalProjects/
\ No newline at end of file
diff --git a/src/test/benchmark/dataset/auto_benchmark.v b/dataset/auto_benchmark.v
similarity index 100%
rename from src/test/benchmark/dataset/auto_benchmark.v
rename to dataset/auto_benchmark.v
diff --git a/src/test/benchmark/dataset/mixed_benchmark.v b/dataset/mixed_benchmark.v
similarity index 100%
rename from src/test/benchmark/dataset/mixed_benchmark.v
rename to dataset/mixed_benchmark.v
diff --git a/src/test/benchmark/benchmark_settings.json b/src/test/benchmark/benchmark_settings.json
deleted file mode 100644
index 202a40c2..00000000
--- a/src/test/benchmark/benchmark_settings.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-    "coqpilot.openAiModelsParameters": [
-        
-    ],
-    "coqpilot.predefinedProofsModelsParameters": [
-        {
-            "tactics": [
-                "auto."
-            ]
-        }
-    ],
-    "coqpilot.grazieModelsParameters": []
-}
\ No newline at end of file
diff --git a/src/test/benchmark/runBenchmark.test.ts b/src/test/benchmark/runBenchmark.test.ts
index 9d11497b..0e14a99a 100644
--- a/src/test/benchmark/runBenchmark.test.ts
+++ b/src/test/benchmark/runBenchmark.test.ts
@@ -46,8 +46,7 @@ const mixedAutoBenchmark: Benchmark = {
 const benchmarks: Benchmark[] = [simpleAutoBenchmark, mixedAutoBenchmark];
 
 suite("Benchmark", () => {
-    const benchmarkDir = getBenchmarkDir();
-    const datasetDir = path.join(benchmarkDir, "dataset");
+    const datasetDir = getDatasetDir();
 
     for (const benchmark of benchmarks) {
         test(benchmark.name, async () => {
@@ -96,9 +95,9 @@ suite("Benchmark", () => {
     }
 });
 
-function getBenchmarkDir(): string {
+function getDatasetDir(): string {
     const dirname: string = path.join(__dirname, "../../../");
-    return path.join(dirname, "src", "test", "benchmark");
+    return path.join(dirname, "dataset");
 }
 
 function getPathStats(path: string): fs.Stats {

From f8330f5a5b9a81dca4bad3186d6bf6a0774210dc Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Wed, 17 Apr 2024 20:49:26 +0200
Subject: [PATCH 13/29] Figure out how to properly use nix-env + minors

---
 .gitignore                              |  6 ++-
 .gitmodules                             |  5 +++
 .vscodeignore                           |  3 +-
 README.md                               | 31 ++++++++++++++++
 src/coqLsp/coqLspConnector.ts           |  2 +
 src/logging/outputChannelEmulator.ts    | 49 +++++++++++++++++++++++++
 src/test/benchmark/dataset/imm          |  1 +
 src/test/benchmark/runBenchmark.test.ts | 14 ++++++-
 8 files changed, 108 insertions(+), 3 deletions(-)
 create mode 100644 .gitmodules
 create mode 100644 src/logging/outputChannelEmulator.ts
 create mode 160000 src/test/benchmark/dataset/imm

diff --git a/.gitignore b/.gitignore
index 52869655..da12b220 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,4 +15,8 @@ src/test/resources/coqProj/.vscode
 **/*.aux
 src/test/resources/coqProj/.Makefile.coq.d
 src/test/resources/coqProj/Makefile.coq
-src/test/resources/coqProj/Makefile.coq.conf
\ No newline at end of file
+src/test/resources/coqProj/Makefile.coq.conf
+
+# Ignore the generated build files in datatests inside benchmarks
+src/test/benchmark/dataset/**/result
+src/test/benchmark/dataset/**/.vscode/
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..f49d8b4b
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,5 @@
+[submodule "src/test/benchmark/dataset/imm"]
+	path = src/test/benchmark/dataset/imm
+	url = https://github.com/weakmemory/imm.git
+	brach = coq819
+	ignore = dirty
diff --git a/.vscodeignore b/.vscodeignore
index 0eef022f..37e3f0c4 100644
--- a/.vscodeignore
+++ b/.vscodeignore
@@ -9,4 +9,5 @@ vsc-extension-quickstart.md
 **/*.map
 **/*.ts
 logs/**
-_opam/**
\ No newline at end of file
+_opam/**
+./out/test/**
\ No newline at end of file
diff --git a/README.md b/README.md
index f6ec7495..70f08220 100644
--- a/README.md
+++ b/README.md
@@ -119,6 +119,37 @@ Each of these settings are modified in `settings.json` and contain an array of m
 
 - Add benchmarking options for various models: soon. 
 
+## Benchmark
+
+To run benchmarks on some project, apart from installing and building CoqPilot manually as described above, you will need to download the necessary projects that are used as datasets for the benchmarks. These projects are added as submodules to the repository. To download them, run the following commands:
+```bash
+git submodule init
+git submodule update
+```
+After that, you need to build the projects. And be careful, the actively maintained way to build this projects is `nix`. Moreover, when adding your own projects, make sure that they are built using `coq-8.19.0`. 
+
+First things first, the process of running the benchmark is not perfectly automated yet. We are working on it. For now, one project (one unit containing nix environment) shall be ran at a time. Let's say you are going to run the benchmark on the `imm` project. You will have to do the following: 
+
+1. Install nix, as specified in the [here](https://nixos.org/download.html). 
+
+2. Install needed caches: 
+    ```bash
+    nix-env -iA nixpkgs.cachix && cachix use coq && cachix use coq-community && cachix use math-comp
+    cachix use weakmemory
+    ```
+
+3. Go to the `imm` subdirectory, apply the nix environment (without it the project will NOT build) and build the project: 
+    ```bash
+    cd src/test/benchmark/dataset/imm 
+    nix-shell 
+    nix-build
+    ```
+4. Return to the project root not exiting the nix-shell. Run the benchmark: 
+    ```bash
+    cd ../../../../..
+    npm run benchmark
+    ```    
+
 ## Release Notes
 
 Release notes could be found in the [CHANGELOG.md](https://github.com/JetBrains-Research/coqpilot/blob/refactor/CHANGELOG.md) file.
diff --git a/src/coqLsp/coqLspConnector.ts b/src/coqLsp/coqLspConnector.ts
index c33da631..9d0fc0e2 100644
--- a/src/coqLsp/coqLspConnector.ts
+++ b/src/coqLsp/coqLspConnector.ts
@@ -2,6 +2,7 @@ import { Uri } from "vscode";
 import {
     LanguageClientOptions,
     RevealOutputChannelOn,
+    Trace,
 } from "vscode-languageclient";
 import { LanguageClient, ServerOptions } from "vscode-languageclient/node";
 
@@ -58,6 +59,7 @@ export class CoqLspConnector extends LanguageClient {
     }
 
     override async start(): Promise<void> {
+        super.setTrace(Trace.Verbose);
         await super
             .start()
             .then(this.logStatusUpdate.bind(this, "started"))
diff --git a/src/logging/outputChannelEmulator.ts b/src/logging/outputChannelEmulator.ts
new file mode 100644
index 00000000..ec97e2f7
--- /dev/null
+++ b/src/logging/outputChannelEmulator.ts
@@ -0,0 +1,49 @@
+import { appendFileSync, existsSync, writeFileSync } from "fs";
+import { OutputChannel, ViewColumn } from "vscode";
+
+export class OutputChannelEmulator implements OutputChannel {
+    name: string;
+
+    constructor(
+        private logFilePath: string,
+        name: string = "OutputChannelEmulator"
+    ) {
+        this.name = name;
+        if (!existsSync(this.logFilePath)) {
+            writeFileSync(this.logFilePath, "");
+        }
+    }
+
+    append(value: string): void {
+        appendFileSync(this.logFilePath, value);
+    }
+
+    appendLine(value: string): void {
+        appendFileSync(this.logFilePath, value + "\n");
+    }
+
+    replace(value: string): void {
+        writeFileSync(this.logFilePath, value);
+    }
+
+    clear(): void {
+        writeFileSync(this.logFilePath, "");
+    }
+
+    show(preserveFocus?: boolean | undefined): void;
+
+    show(
+        column?: ViewColumn | undefined,
+        preserveFocus?: boolean | undefined
+    ): void;
+
+    show(_column?: unknown, _preserveFocus?: unknown): void {
+        throw new Error("Method not implemented.");
+    }
+
+    hide(): void {
+        throw new Error("Method not implemented.");
+    }
+
+    dispose(): void {}
+}
diff --git a/src/test/benchmark/dataset/imm b/src/test/benchmark/dataset/imm
new file mode 160000
index 00000000..14025d11
--- /dev/null
+++ b/src/test/benchmark/dataset/imm
@@ -0,0 +1 @@
+Subproject commit 14025d11ee217b5397c544917c5623a80abbd28e
diff --git a/src/test/benchmark/runBenchmark.test.ts b/src/test/benchmark/runBenchmark.test.ts
index 9d11497b..a57e7dd2 100644
--- a/src/test/benchmark/runBenchmark.test.ts
+++ b/src/test/benchmark/runBenchmark.test.ts
@@ -43,7 +43,19 @@ const mixedAutoBenchmark: Benchmark = {
     timeoutMinutes: 1,
 };
 
-const benchmarks: Benchmark[] = [simpleAutoBenchmark, mixedAutoBenchmark];
+const immBenchmark: Benchmark = {
+    name: "Complete imm with auto",
+    items: [new DatasetItem("imm/src/basic/Events.v", "imm")],
+    modelsParams: onlyAutoModelsParams,
+    requireAllAdmitsCompleted: false,
+    timeoutMinutes: 60,
+};
+
+const benchmarks: Benchmark[] = [
+    simpleAutoBenchmark,
+    mixedAutoBenchmark,
+    immBenchmark,
+];
 
 suite("Benchmark", () => {
     const benchmarkDir = getBenchmarkDir();

From 20161f896edc72c045799ab1d72ab564afe6e883 Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Wed, 17 Apr 2024 21:14:22 +0200
Subject: [PATCH 14/29] Fix merge

---
 .gitmodules                                 | 6 +++---
 .vscodeignore                               | 3 ++-
 README.md                                   | 4 ++--
 {src/test/benchmark/dataset => dataset}/imm | 0
 4 files changed, 7 insertions(+), 6 deletions(-)
 rename {src/test/benchmark/dataset => dataset}/imm (100%)

diff --git a/.gitmodules b/.gitmodules
index f49d8b4b..b845a808 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,5 +1,5 @@
-[submodule "src/test/benchmark/dataset/imm"]
-	path = src/test/benchmark/dataset/imm
+[submodule "dataset/imm"]
+	path = dataset/imm
 	url = https://github.com/weakmemory/imm.git
-	brach = coq819
+	branch = coq819
 	ignore = dirty
diff --git a/.vscodeignore b/.vscodeignore
index 37e3f0c4..a12b1aff 100644
--- a/.vscodeignore
+++ b/.vscodeignore
@@ -10,4 +10,5 @@ vsc-extension-quickstart.md
 **/*.ts
 logs/**
 _opam/**
-./out/test/**
\ No newline at end of file
+./out/test/**
+dataset
\ No newline at end of file
diff --git a/README.md b/README.md
index 70f08220..460b7b57 100644
--- a/README.md
+++ b/README.md
@@ -140,13 +140,13 @@ First things first, the process of running the benchmark is not perfectly automa
 
 3. Go to the `imm` subdirectory, apply the nix environment (without it the project will NOT build) and build the project: 
     ```bash
-    cd src/test/benchmark/dataset/imm 
+    cd dataset/imm 
     nix-shell 
     nix-build
     ```
 4. Return to the project root not exiting the nix-shell. Run the benchmark: 
     ```bash
-    cd ../../../../..
+    cd ../../
     npm run benchmark
     ```    
 
diff --git a/src/test/benchmark/dataset/imm b/dataset/imm
similarity index 100%
rename from src/test/benchmark/dataset/imm
rename to dataset/imm

From 5c04ba5684208878b5eaabe5e979eded7e1325c1 Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Thu, 25 Apr 2024 15:02:47 +0200
Subject: [PATCH 15/29] Update benchmarking guide

---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index 460b7b57..a82abc7a 100644
--- a/README.md
+++ b/README.md
@@ -130,6 +130,13 @@ After that, you need to build the projects. And be careful, the actively maintai
 
 First things first, the process of running the benchmark is not perfectly automated yet. We are working on it. For now, one project (one unit containing nix environment) shall be ran at a time. Let's say you are going to run the benchmark on the `imm` project. You will have to do the following: 
 
+0. Go the the `imm` subdirectory and add a `_CoqProject` file in the root with the following: 
+    ```
+    -I result/lib/coq/8.19/user-contrib/imm
+    -R result/lib/coq/8.19/user-contrib/imm imm
+    ```
+    This is needed for the lsp-server to correctly resolve file dependencies.
+
 1. Install nix, as specified in the [here](https://nixos.org/download.html). 
 
 2. Install needed caches: 

From a22dc106fe85e95e32ddcc81035b8ee236ebf591 Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Thu, 25 Apr 2024 16:53:30 +0200
Subject: [PATCH 16/29] Prepare for small benchmarking

---
 src/test/benchmark/benchmarkingFramework.ts | 71 ++++++++++++++-------
 src/test/benchmark/runBenchmark.test.ts     | 66 ++++++++++++-------
 2 files changed, 92 insertions(+), 45 deletions(-)

diff --git a/src/test/benchmark/benchmarkingFramework.ts b/src/test/benchmark/benchmarkingFramework.ts
index 12699c6a..630a5c2e 100644
--- a/src/test/benchmark/benchmarkingFramework.ts
+++ b/src/test/benchmark/benchmarkingFramework.ts
@@ -31,6 +31,9 @@ import { consoleLog, consoleLogLine } from "./loggingUtils";
 export async function runTestBenchmark(
     filePath: string,
     modelsParams: UserModelsParams,
+    specificTheoremsForBenchmark: string[] | undefined,
+    benchmarkFullTheorems: Boolean = true,
+    benchmarkAdmits: Boolean = true,
     workspaceRootPath?: string,
     requireAllAdmitsCompleted: Boolean = false
 ): Promise<BenchmarkReport> {
@@ -44,30 +47,54 @@ export async function runTestBenchmark(
             workspaceRootPath,
             filePath
         );
+    const filteredCompletionTargets = {
+        admitTargets: completionTargets.admitTargets.filter(
+            (target) =>
+                specificTheoremsForBenchmark?.includes(
+                    target.parentTheorem.name
+                ) ?? true
+        ),
+        theoremTargets: completionTargets.theoremTargets.filter(
+            (target) =>
+                specificTheoremsForBenchmark?.includes(
+                    target.parentTheorem.name
+                ) ?? true
+        ),
+    };
 
     consoleLogLine("\n");
-    console.log("try to complete admits\n");
-    const admitTargetsResults = await benchmarkTargets(
-        completionTargets.admitTargets,
-        sourceFileEnvironment,
-        processEnvironment
-    );
-    console.log(`BENCHMARK RESULT, ADMITS COMPLETED: ${admitTargetsResults}\n`);
-    consoleLogLine("\n");
 
-    console.log("try to prove theorems\n");
-    const theoremTargetsResults = await benchmarkTargets(
-        completionTargets.theoremTargets,
-        sourceFileEnvironment,
-        processEnvironment
-    );
-    console.log(
-        `BENCHMARK RESULT, THEOREMS PROVED: ${theoremTargetsResults}\n`
-    );
-    consoleLogLine();
+    let admitTargetsResults: BenchmarkResult | undefined = undefined;
+    let theoremTargetsResults: BenchmarkResult | undefined = undefined;
+
+    if (benchmarkAdmits) {
+        console.log("try to complete admits\n");
+        admitTargetsResults = await benchmarkTargets(
+            filteredCompletionTargets.admitTargets,
+            sourceFileEnvironment,
+            processEnvironment
+        );
+        console.log(
+            `BENCHMARK RESULT, ADMITS COMPLETED: ${admitTargetsResults}\n`
+        );
+        consoleLogLine("\n");
 
-    if (requireAllAdmitsCompleted) {
-        assert.ok(admitTargetsResults.allCompleted());
+        if (requireAllAdmitsCompleted) {
+            assert.ok(admitTargetsResults.allCompleted());
+        }
+    }
+
+    if (benchmarkFullTheorems) {
+        console.log("try to prove theorems\n");
+        theoremTargetsResults = await benchmarkTargets(
+            filteredCompletionTargets.theoremTargets,
+            sourceFileEnvironment,
+            processEnvironment
+        );
+        console.log(
+            `BENCHMARK RESULT, THEOREMS PROVED: ${theoremTargetsResults}\n`
+        );
+        consoleLogLine();
     }
 
     return {
@@ -106,8 +133,8 @@ export class BenchmarkResult {
 }
 
 export interface BenchmarkReport {
-    admitsCompleted: BenchmarkResult;
-    theoremsProved: BenchmarkResult;
+    admitsCompleted?: BenchmarkResult;
+    theoremsProved?: BenchmarkResult;
 }
 
 export async function benchmarkTargets(
diff --git a/src/test/benchmark/runBenchmark.test.ts b/src/test/benchmark/runBenchmark.test.ts
index 2231e46f..a6d04f55 100644
--- a/src/test/benchmark/runBenchmark.test.ts
+++ b/src/test/benchmark/runBenchmark.test.ts
@@ -12,50 +12,63 @@ interface Benchmark {
     items: DatasetItem[];
     modelsParams: UserModelsParams;
     requireAllAdmitsCompleted: Boolean;
+    benchmarkFullTheorems: Boolean;
+    benchmarkAdmits: Boolean;
     timeoutMinutes: number;
 }
 
 class DatasetItem {
     workspaceRootPath: string | undefined;
+    specificTheoremForBenchmark: string[] | undefined;
 
     /* Paths should be relative to 'dataset' folder */
     constructor(
         public path: string,
+        specificTheoremForBenchmark: string[] | undefined = undefined,
         workspaceRootPath: string | undefined = undefined
     ) {
         this.workspaceRootPath = workspaceRootPath;
+        this.specificTheoremForBenchmark = specificTheoremForBenchmark;
     }
 }
 
-const simpleAutoBenchmark: Benchmark = {
-    name: "Complete simple examples with `auto`",
-    items: [new DatasetItem("auto_benchmark.v")],
-    modelsParams: onlyAutoModelsParams,
-    requireAllAdmitsCompleted: true,
-    timeoutMinutes: 1,
-};
-
-const mixedAutoBenchmark: Benchmark = {
-    name: "Complete mixed examples (both simple & hard) with `auto`",
-    items: [new DatasetItem("mixed_benchmark.v")],
-    modelsParams: onlyAutoModelsParams,
-    requireAllAdmitsCompleted: false,
-    timeoutMinutes: 1,
-};
+// const simpleAutoBenchmark: Benchmark = {
+//     name: "Complete simple examples with `auto`",
+//     items: [new DatasetItem("auto_benchmark.v")],
+//     modelsParams: onlyAutoModelsParams,
+//     requireAllAdmitsCompleted: true,
+//     benchmarkFullTheorems: true,
+//     benchmarkAdmits: true,
+//     timeoutMinutes: 1,
+// };
+
+// const mixedAutoBenchmark: Benchmark = {
+//     name: "Complete mixed examples (both simple & hard) with `auto`",
+//     items: [new DatasetItem("mixed_benchmark.v")],
+//     modelsParams: onlyAutoModelsParams,
+//     requireAllAdmitsCompleted: false,
+//     benchmarkFullTheorems: true,
+//     benchmarkAdmits: true,
+//     timeoutMinutes: 1,
+// };
 
 const immBenchmark: Benchmark = {
     name: "Complete imm with auto",
-    items: [new DatasetItem("imm/src/basic/Events.v", "imm")],
+    items: [
+        new DatasetItem(
+            "imm/src/basic/Execution_eco.v",
+            ["rf_rmw_ct_in_co"],
+            "imm"
+        ),
+    ],
     modelsParams: onlyAutoModelsParams,
     requireAllAdmitsCompleted: false,
+    benchmarkFullTheorems: true,
+    benchmarkAdmits: false,
     timeoutMinutes: 60,
 };
 
-const benchmarks: Benchmark[] = [
-    simpleAutoBenchmark,
-    mixedAutoBenchmark,
-    immBenchmark,
-];
+const benchmarks: Benchmark[] = [immBenchmark];
 
 suite("Benchmark", () => {
     const datasetDir = getDatasetDir();
@@ -84,11 +97,18 @@ suite("Benchmark", () => {
                         await runTestBenchmark(
                             resolvedFilePath,
                             benchmark.modelsParams,
+                            item.specificTheoremForBenchmark,
+                            benchmark.benchmarkFullTheorems,
+                            benchmark.benchmarkAdmits,
                             resolvedWorkspaceRootPath,
                             benchmark.requireAllAdmitsCompleted
                         );
-                    admitsCompletedInTotal.add(admitsCompleted);
-                    theoremsProvedInTotal.add(theoremsProved);
+                    admitsCompletedInTotal.add(
+                        admitsCompleted ?? new BenchmarkResult(0, 0)
+                    );
+                    theoremsProvedInTotal.add(
+                        theoremsProved ?? new BenchmarkResult(0, 0)
+                    );
                 }
             }
 

From 7aea67b9aa74d49d023e51d7ad5933441ed83d4f Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Fri, 26 Apr 2024 15:32:51 +0200
Subject: [PATCH 17/29] Hot fixes during benchmark testing

---
 .gitignore                              |  2 +-
 package-lock.json                       | 13 +++++++++++++
 package.json                            |  1 +
 src/coqLsp/coqLspClient.ts              |  2 +-
 src/llm/llmServices/grazie/grazieApi.ts |  1 -
 src/logging/outputChannelEmulator.ts    |  6 ++----
 6 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/.gitignore b/.gitignore
index b84fb3c8..e863dd48 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,7 @@ node_modules
 .vscode-test/
 *.vsix
 logs
-.DS_Store
+**/**/.DS_Store
 
 # Test resources 
 src/test/resources/coqProj/.vscode
diff --git a/package-lock.json b/package-lock.json
index 454f0e37..b56f78ef 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -35,6 +35,7 @@
         "@types/glob": "^8.1.0",
         "@types/mocha": "^10.0.1",
         "@types/node": "20.2.5",
+        "@types/qs": "^6.9.15",
         "@types/vscode": "^1.82.0",
         "@types/yargs": "^17.0.24",
         "@typescript-eslint/eslint-plugin": "^5.62.0",
@@ -830,6 +831,12 @@
         "form-data": "^3.0.0"
       }
     },
+    "node_modules/@types/qs": {
+      "version": "6.9.15",
+      "resolved": "https://registry.npmjs.org/@types/qs/-/qs-6.9.15.tgz",
+      "integrity": "sha512-uXHQKES6DQKKCLh441Xv/dwxOq1TVS3JPUMlEqoEglvlhR6Mxnlew/Xq/LRVHpLyk7iK3zODe1qYHIMltO7XGg==",
+      "dev": true
+    },
     "node_modules/@types/semver": {
       "version": "7.5.1",
       "resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.1.tgz",
@@ -7163,6 +7170,12 @@
         "form-data": "^3.0.0"
       }
     },
+    "@types/qs": {
+      "version": "6.9.15",
+      "resolved": "https://registry.npmjs.org/@types/qs/-/qs-6.9.15.tgz",
+      "integrity": "sha512-uXHQKES6DQKKCLh441Xv/dwxOq1TVS3JPUMlEqoEglvlhR6Mxnlew/Xq/LRVHpLyk7iK3zODe1qYHIMltO7XGg==",
+      "dev": true
+    },
     "@types/semver": {
       "version": "7.5.1",
       "resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.1.tgz",
diff --git a/package.json b/package.json
index 8700c4b5..c8fc3154 100644
--- a/package.json
+++ b/package.json
@@ -365,6 +365,7 @@
     "@types/glob": "^8.1.0",
     "@types/mocha": "^10.0.1",
     "@types/node": "20.2.5",
+    "@types/qs": "^6.9.15",
     "@types/vscode": "^1.82.0",
     "@types/yargs": "^17.0.24",
     "@typescript-eslint/eslint-plugin": "^5.62.0",
diff --git a/src/coqLsp/coqLspClient.ts b/src/coqLsp/coqLspClient.ts
index adb39ca7..b7389178 100644
--- a/src/coqLsp/coqLspClient.ts
+++ b/src/coqLsp/coqLspClient.ts
@@ -184,7 +184,7 @@ export class CoqLspClient implements CoqLspClientInterface {
         params: any,
         uri: Uri,
         lastDocumentEndPosition?: Position,
-        timeout: number = 50000
+        timeout: number = 300000
     ): Promise<DiagnosticMessage> {
         await this.client.sendNotification(requestType, params);
 
diff --git a/src/llm/llmServices/grazie/grazieApi.ts b/src/llm/llmServices/grazie/grazieApi.ts
index 2ae022ad..4fa586aa 100644
--- a/src/llm/llmServices/grazie/grazieApi.ts
+++ b/src/llm/llmServices/grazie/grazieApi.ts
@@ -50,7 +50,6 @@ export class GrazieApi {
         apiToken: string
     ): Promise<string> {
         const headers = this.createHeaders(apiToken);
-        headers["Content-Length"] = body.length;
         this.eventLogger?.log(
             "grazie-fetch-started",
             "Completion from Grazie requested",
diff --git a/src/logging/outputChannelEmulator.ts b/src/logging/outputChannelEmulator.ts
index ec97e2f7..c1e5a739 100644
--- a/src/logging/outputChannelEmulator.ts
+++ b/src/logging/outputChannelEmulator.ts
@@ -1,4 +1,4 @@
-import { appendFileSync, existsSync, writeFileSync } from "fs";
+import { appendFileSync, writeFileSync } from "fs";
 import { OutputChannel, ViewColumn } from "vscode";
 
 export class OutputChannelEmulator implements OutputChannel {
@@ -9,9 +9,7 @@ export class OutputChannelEmulator implements OutputChannel {
         name: string = "OutputChannelEmulator"
     ) {
         this.name = name;
-        if (!existsSync(this.logFilePath)) {
-            writeFileSync(this.logFilePath, "");
-        }
+        writeFileSync(this.logFilePath, "");
     }
 
     append(value: string): void {

From 88e111b3790a0a9e229705b7729958f71b7bb273 Mon Sep 17 00:00:00 2001
From: "Nikita.Khramov" <nikita.khramov@gmail.com>
Date: Mon, 29 Apr 2024 17:11:45 +0200
Subject: [PATCH 18/29] Added Experiments Report

---
 etc/docs/report.md | 603 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 603 insertions(+)
 create mode 100644 etc/docs/report.md

diff --git a/etc/docs/report.md b/etc/docs/report.md
new file mode 100644
index 00000000..f0f35e7b
--- /dev/null
+++ b/etc/docs/report.md
@@ -0,0 +1,603 @@
+# Experiments Report
+
+We have conducted several experiments using the [imm project](https://github.com/weakmemory/imm). In this report we tell about the process of data preparations, used methods, conducted experiments, and their detailed results.
+
+## Data Preparation
+
+ We measured the performance of our solution on three groups of theorems. Grouping was done on the basis of length of human-written proofs measured in proof steps. Also the groups sizes were chosen with respect to the distribution of proofs lengths in the imm project. We have considered the theorems with proofs length in the interval $[3; 20]$. Such theorems amount to 83% of proofs from the imm project. We have randomly chosen theorems for each group. As the result we got 45 thereoms divided into the following groups:
+
+| Group | Length Interval | Size |
+|----------|----------|----------|
+| A | 3 – 4   | 20 |
+| B | 5 – 8   | 14 |
+| C | 9 – 20  | 11 |
+
+The list of the chosen theorems divided by groups you can find in the table provided in the [Results](#results) section.
+
+## Methods
+
+In our experiments we compared different methods which can be used by CoqPilot:  Predefined tactic (`auto`, `intuition`, `easy`), OpenAI GPT-4, OpenAI GPT-3.5, LLaMA-2 13B Chat, Anthropic Claude. We have used following parameters for each of the models:
+
+### OpenAI GPT-4 
+
+```
+{
+    modelName: "openai-gpt-4",
+    choices: 20,
+
+    systemPrompt: "..." ,
+
+
+    newMessageMaxTokens: 2000,
+    tokensLimit: 4096,
+
+    multiroundProfile: ...,
+    apiKey: "...",
+}
+```
+
+### OpenAI GPT-3.5
+
+```
+{
+    modelName: "openai-chat-gpt",
+    choices: 20,
+
+    systemPrompt: "..." ,
+
+
+    newMessageMaxTokens: 800,
+    tokensLimit: 2500,
+
+    multiroundProfile: ...,
+    apiKey: "...",
+}
+```
+
+### LLaMA-2 13B Chat
+
+```
+{
+    modelName: "grazie-chat-llama-v2-13b",
+    choices: 20,
+
+    systemPrompt: "..." ,
+
+
+    newMessageMaxTokens: 160,
+    tokensLimit: 1160,
+
+    multiroundProfile: ...,
+    apiKey: "...",
+}
+
+```
+
+### Anthropic Claude
+
+```
+{
+    modelName: "anthropic-claude",
+    choices: 9,
+
+    systemPrompt: "..." ,
+
+
+    newMessageMaxTokens: 2000,
+    tokensLimit: 4096,
+
+    multiroundProfile: ...,
+    apiKey: "...",
+}
+```
+
+
+The `systemPrompt` used for all the models:
+
+ 
+
+```
+Generate proof of the theorem from user input in Coq. You should only generate proofs in Coq. Never add special comments to the proof. Your answer should be a valid Coq proof. It should start with 'Proof.' and end with 'Qed.'.
+```
+
+The `proofFixPrompt` used for all the models:
+
+```
+Unfortunately, the last proof is not correct. Here is the compiler's feedback: '${diagnostic}'. Please, fix the proof.
+```
+
+The `multiroundProfile` used for all the models:
+
+```
+{
+    maxRoundsNumber: 1,
+    proofFixChoices: 1,
+    proofFixPrompt: "...",
+}
+```
+
+## Results
+
+In the table below you can find the results of our experiments. For each of the theorems we show its group  and the methods which proved the theorem during our experiments.
+
+
+| Group | File | Theorem Name | Predefined tactic | OpenAI GPT-4 | OpenAI GPT-3.5 | LLaMA-2 13B Chat | Anthropic Claude | 
+|-------|------|--------------|-------------------|--------------|----------------|------------------|------------------|
+| A | [basic/Execution_eco.v](https://github.com/weakmemory/imm/tree/coq819/src/basic/Execution_eco.v) | `rf_rmw_ct_in_co`| &#x2717; | [&#x2713;](#rf_rmw_ct_in_co) | [&#x2713;](#rf_rmw_ct_in_co) | &#x2717; | &#x2717; |
+| A | [imm/imm_hb.v](https://github.com/weakmemory/imm/tree/coq819/src/imm/imm_hb.v) | `cr_hb_hb`| &#x2717; | [&#x2713;](#cr_hb_hb) | [&#x2713;](#cr_hb_hb) | &#x2717; | [&#x2713;](#cr_hb_hb) |
+| A | [basic/FinExecution.v](https://github.com/weakmemory/imm/tree/coq819/src/basic/FinExecution.v) | `fin_exec_same_events`| &#x2717; | [&#x2713;](#fin_exec_same_events) | [&#x2713;](#fin_exec_same_events) | &#x2717; | [&#x2713;](#fin_exec_same_events) |
+| A | [basic/Execution.v](https://github.com/weakmemory/imm/tree/coq819/src/basic/Execution.v) | `sb_trans`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| A | [basic/Execution.v](https://github.com/weakmemory/imm/tree/coq819/src/basic/Execution.v) | `sb_same_loc_W_trans`| &#x2717; | [&#x2713;](#sb_same_loc_w_trans) | [&#x2713;](#sb_same_loc_w_trans) | &#x2717; | &#x2717; |
+| A | [basic/Events.v](https://github.com/weakmemory/imm/tree/coq819/src/basic/Events.v) | `restr_eq_rel_same_loc`| [&#x2713;](#restr_eq_rel_same_loc) | [&#x2713;](#restr_eq_rel_same_loc) | [&#x2713;](#restr_eq_rel_same_loc) | [&#x2713;](#restr_eq_rel_same_loc) | [&#x2713;](#restr_eq_rel_same_loc) |
+| A | [basic/Events.v](https://github.com/weakmemory/imm/tree/coq819/src/basic/Events.v) | `same_loc_loceq`| &#x2717; | [&#x2713;](#same_loc_loceq) | [&#x2713;](#same_loc_loceq) | &#x2717; | &#x2717; |
+| A | [traversal/TraversalConfigAltOld.v](https://github.com/weakmemory/imm/tree/coq819/src/traversal/TraversalConfigAltOld.v) | `otc_I_ar_rfrmw_I_implied_helper_2`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| A | [simhelpers/CertCOhelper.v](https://github.com/weakmemory/imm/tree/coq819/src/simhelpers/CertCOhelper.v) | `wf_colD`| &#x2717; | [&#x2713;](#wf_cold) | [&#x2713;](#wf_cold) | &#x2717; | &#x2717; |
+| A | [imm/SubExecution.v](https://github.com/weakmemory/imm/tree/coq819/src/imm/SubExecution.v) | `sub_rfe`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| A | [imm/SubExecution.v](https://github.com/weakmemory/imm/tree/coq819/src/imm/SubExecution.v) | `sub_Rel`| &#x2717; | [&#x2713;](#sub_rel) | &#x2717; | &#x2717; | &#x2717; |
+| A | [imm/imm_s.v](https://github.com/weakmemory/imm/tree/coq819/src/imm/imm_s.v) | `wf_rfirmwsf`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| A | [traversal/TraversalConfig.v](https://github.com/weakmemory/imm/tree/coq819/src/traversal/TraversalConfig.v) | `issuableW`| &#x2717; | &#x2717; | [&#x2713;](#issuablew) | &#x2717; | [&#x2713;](#issuablew) |
+| A | [traversal/TraversalConfig.v](https://github.com/weakmemory/imm/tree/coq819/src/traversal/TraversalConfig.v) | `w_coverable_issued`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| A | [traversal/TraversalConfig.v](https://github.com/weakmemory/imm/tree/coq819/src/traversal/TraversalConfig.v) | `ar_rfrmw_rt_CI_in_I`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| A | [hardware/imm_sToARM.v](https://github.com/weakmemory/imm/tree/coq819/src/hardware/imm_sToARM.v) | `WF`| &#x2717; | [&#x2713;](#wf) | &#x2717; | &#x2717; | [&#x2713;](#wf) |
+| A | [travorder/TraversalOrder.v](https://github.com/weakmemory/imm/tree/coq819/src/travorder/TraversalOrder.v) | `FWBOBFWBOB`| &#x2717; | [&#x2713;](#fwbobfwbob) | [&#x2713;](#fwbobfwbob) | &#x2717; | &#x2717; |
+| A | [travorder/EventsTraversalOrder.v](https://github.com/weakmemory/imm/tree/coq819/src/travorder/EventsTraversalOrder.v) | `dom_rfe_acq_sb_issued`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| A | [travorder/EventsTraversalOrder.v](https://github.com/weakmemory/imm/tree/coq819/src/travorder/EventsTraversalOrder.v) | `rfrmw_coverable_in_I`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| A | [imm/imm_s_hb.v](https://github.com/weakmemory/imm/tree/coq819/src/imm/imm_s_hb.v) | `hb_trans`| &#x2717; | [&#x2713;](#hb_trans) | [&#x2713;](#hb_trans) | &#x2717; | &#x2717; |
+| B | [travorder/TraversalOrder.v](https://github.com/weakmemory/imm/tree/coq819/src/travorder/TraversalOrder.v) | `set_pair_cancel_action`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| B | [travorder/TraversalOrder.v](https://github.com/weakmemory/imm/tree/coq819/src/travorder/TraversalOrder.v) | `event_issue_finite_inj`| &#x2717; | [&#x2713;](#event_issue_finite_inj) | [&#x2713;](#event_issue_finite_inj) | &#x2717; | &#x2717; |
+| B | [basic/ProgToExecutionProperties.v](https://github.com/weakmemory/imm/tree/coq819/src/basic/ProgToExecutionProperties.v) | `ectrl_increasing`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| B | [imm/Sc_opt.v](https://github.com/weakmemory/imm/tree/coq819/src/imm/Sc_opt.v) | `scb_in_hb_eco`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | [&#x2713;](#scb_in_hb_eco) |
+| B | [travorder/TLSCoherency.v](https://github.com/weakmemory/imm/tree/coq819/src/travorder/TLSCoherency.v) | `tls_coherent_ext_union`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| B | [ldrf/LDRF_Fsc.v](https://github.com/weakmemory/imm/tree/coq819/src/ldrf/LDRF_Fsc.v) | `RFE1`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| B | [imm/imm_hb.v](https://github.com/weakmemory/imm/tree/coq819/src/imm/imm_hb.v) | `wf_swD`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| B | [imm/imm_s_ppo.v](https://github.com/weakmemory/imm/tree/coq819/src/imm/imm_s_ppo.v) | `ar_int_rfe_ct_rfrmw_rt_in_ar_int_rfe_ct`| &#x2717; | &#x2717; | [&#x2713;](#ar_int_rfe_ct_rfrmw_rt_in_ar_int_rfe_ct) | &#x2717; | &#x2717; |
+| B | [imm/CombRelations.v](https://github.com/weakmemory/imm/tree/coq819/src/imm/CombRelations.v) | `eco_furr_irr`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| B | [imm/CombRelations.v](https://github.com/weakmemory/imm/tree/coq819/src/imm/CombRelations.v) | `urr_hb_sc_hb_irr`| &#x2717; | [&#x2713;](#urr_hb_sc_hb_irr) | [&#x2713;](#urr_hb_sc_hb_irr) | &#x2717; | &#x2717; |
+| B | [travorder/EventsTraversalOrder.v](https://github.com/weakmemory/imm/tree/coq819/src/travorder/EventsTraversalOrder.v) | `W_ar_coverable_issuable_in_CI`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| B | [travorder/EventsTraversalOrder.v](https://github.com/weakmemory/imm/tree/coq819/src/travorder/EventsTraversalOrder.v) | `ar_coverable_in_CI`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| B | [hardware/Power_fences.v](https://github.com/weakmemory/imm/tree/coq819/src/hardware/Power_fences.v) | `lwsync_sync`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| B | [basic/Execution_eco.v](https://github.com/weakmemory/imm/tree/coq819/src/basic/Execution_eco.v) | `eco_refl`| &#x2717; | [&#x2713;](#eco_refl) | [&#x2713;](#eco_refl) | &#x2717; | [&#x2713;](#eco_refl) |
+| C | [imm/imm_common_more.v](https://github.com/weakmemory/imm/tree/coq819/src/imm/imm_common_more.v) | `W_sb_same_loc_detour`| &#x2717; | &#x2717; | [&#x2713;](#w_sb_same_loc_detour) | &#x2717; | &#x2717; |
+| C | [traversal/SimTraversalProperties.v](https://github.com/weakmemory/imm/tree/coq819/src/traversal/SimTraversalProperties.v) | `isim_trav_step_new_e_tid`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| C | [travorder/EventsTraversalOrder.v](https://github.com/weakmemory/imm/tree/coq819/src/travorder/EventsTraversalOrder.v) | `rf_rmw_issued`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| C | [basic/Execution.v](https://github.com/weakmemory/imm/tree/coq819/src/basic/Execution.v) | `rmw_non_init_lr`| &#x2717; | [&#x2713;](#rmw_non_init_lr) | [&#x2713;](#rmw_non_init_lr) | &#x2717; | &#x2717; |
+| C | [ocamlmm/OCamlToimm_s.v](https://github.com/weakmemory/imm/tree/coq819/src/ocamlmm/OCamlToimm_s.v) | `wr_mode`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| C | [ocamlmm/OCamlToimm_s.v](https://github.com/weakmemory/imm/tree/coq819/src/ocamlmm/OCamlToimm_s.v) | `sbnl_hb_scb`| &#x2717; | [&#x2713;](#sbnl_hb_scb) | &#x2717; | &#x2717; | &#x2717; |
+| C | [travorder/TLSCoherency.v](https://github.com/weakmemory/imm/tree/coq819/src/travorder/TLSCoherency.v) | `iord_exec_tls`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| C | [travorder/TlsViewRelHelpers.v](https://github.com/weakmemory/imm/tree/coq819/src/travorder/TlsViewRelHelpers.v) | `t_acq_n_sc_fence_step`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| C | [traversal/TraversalConfig.v](https://github.com/weakmemory/imm/tree/coq819/src/traversal/TraversalConfig.v) | `init_issued`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| C | [traversal/TraversalConfig.v](https://github.com/weakmemory/imm/tree/coq819/src/traversal/TraversalConfig.v) | `dom_detour_rfe_ppo_issuable`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| C | [travorder/TlsEventSets.v](https://github.com/weakmemory/imm/tree/coq819/src/travorder/TlsEventSets.v) | `issued_ta_issue`| &#x2717; | [&#x2713;](#issued_ta_issue) | [&#x2713;](#issued_ta_issue) | &#x2717; | &#x2717; |
+
+
+
+## Generated Proofs
+
+Here you can find the generated proofs for each of proved theorems during the experiments.
+
+### Group A
+
+#### Theorem name: 
+#### `rf_rmw_ct_in_co`
+
+
+#### OpenAI GPT-4 Proof:
+
+```
+rewrite (rf_rmw_in_co WF SC_PER_LOC) at 1.
+apply (ct_of_trans (co_trans WF)).
+```
+
+#### OpenAI GPT-3.5 Proof:
+
+```
+rewrite ct_begin at 1.
+rewrite !seqA.
+arewrite (rf ⨾ rmw ⊆ rf ⨾ ⦗R⦘ ⨾ (⦗W⦘ ⨾ rmw)).
+by rewrite (wf_rmwi WF) at 1; basic_solver.
+sin_rewrite rf_rmw_in_eco; try edone.
+rewrite <- ct_begin at 1.
+rewrite !seqA.
+by rewrite eco_trans; eauto using eco_irr.
+```
+
+
+#### Theorem name: 
+#### `cr_hb_hb`
+
+#### OpenAI GPT-4 Proof:
+
+    generalize hb_trans; basic_solver 10.
+
+#### OpenAI GPT-3.5 Proof:
+
+    split; [|basic_solver].
+    unfold hb; rewrite ct_end at 1; rels.
+
+#### Claude Proof:
+
+    generalize hb_trans; basic_solver 10.
+
+
+#### Theorem name: 
+#### `fin_exec_same_events`
+
+#### OpenAI GPT-4 Proof:
+
+    red in FIN.
+    red.
+    rewrite <- SAME.
+    apply FIN.
+
+#### OpenAI GPT-3.5 Proof:
+
+    unfold fin_exec in *.
+    rewrite <-SAME in *.
+    intuition.
+
+#### Claude Proof:
+
+    unfold fin_exec in *.
+    intros.
+    rewrite <- SAME.
+    apply FIN.
+
+
+
+#### Theorem name: 
+#### `sb_same_loc_W_trans`
+
+#### OpenAI GPT-4 Proof:
+
+    unfold Events.same_loc.
+    unfolder; ins; desf; eauto.
+    splits; auto.
+    generalize sb_trans; basic_solver 21.
+    congruence.
+
+#### OpenAI GPT-3.5 Proof:
+
+    unfold transitive; intros x y z SBLOC.
+    unfold sb, Events.same_loc in *.
+    unfolder in *; desf.
+    all: splits; auto.
+    all: eapply sb_trans; eauto.
+
+
+#### Theorem name: 
+#### `restr_eq_rel_same_loc`
+
+#### Predefined Tactic Proof:
+
+    auto.
+
+#### OpenAI GPT-4 Proof:
+
+    unfolder; split; ins; desf; splits; auto.
+
+#### OpenAI GPT-3.5 Proof:
+
+    unfold restr_eq_rel, same_loc; basic_solver.
+
+#### LLaMA Proof:
+
+    unfold same_loc; basic_solver.
+
+#### Claude Proof:
+
+    unfold same_loc; basic_solver.
+
+#### Theorem name: 
+#### `same_loc_loceq`
+
+#### OpenAI GPT-4 Proof:
+
+    unfold same_loc; unfolder; firstorder.
+
+#### OpenAI GPT-3.5 Proof:
+
+    unfold funeq, same_loc; firstorder.
+
+#### Theorem name: 
+#### `wf_colD`
+#### OpenAI GPT-4 Proof:
+    apply dom_helper_3.
+    unfold col; rewrite wf_coD; basic_solver.
+#### OpenAI GPT-3.5 Proof:
+    unfold col; split; [|basic_solver].
+    rewrite wf_coD.
+    unfold Events.same_loc; unfolder; ins; desf; eauto.
+
+#### Theorem name:  
+#### `sub_Rel`
+#### OpenAI GPT-4 Proof:
+    by rewrite (sub_lab SUB).
+
+
+#### Theorem name:  
+#### `issuableW`
+#### OpenAI GPT-3.5 Proof:
+    unfold issuable.
+    basic_solver.
+#### Claude Proof:
+    unfold issuable.
+    basic_solver.
+
+#### Theorem name:  
+#### `WF`
+#### OpenAI GPT-4 Proof:
+    by apply CON.
+#### Claude Proof:
+    apply CON.
+
+#### Theorem name:  
+#### `FWBOBFWBOB`
+#### OpenAI GPT-4 Proof:
+    iord_dom_solver.
+#### OpenAI GPT-3.5 Proof:
+    unfold FWBOB.
+    unfold rfi, co, bob.
+    clear.
+    basic_solver.
+
+
+#### Theorem name: 
+#### `hb_trans`
+#### OpenAI GPT-4 Proof:
+    unfold hb; apply transitive_ct.
+#### OpenAI GPT-3.5 Proof:
+    unfold hb.
+    apply transitive_ct.
+
+### Group B
+
+#### Theorem name: 
+#### `event_issue_finite_inj`
+
+#### OpenAI GPT-4 Proof:
+
+    exists [mkTL ta_issue y]; ins; desf.
+    red in IN0; destruct x as [[]]; ins; auto.
+
+#### OpenAI GPT-3.5 Proof:
+
+    exists [mkTL ta_issue y].
+    ins.
+    desf.
+    red in IN0.
+    destruct x as [[]]; ins; auto.
+
+
+#### Theorem name: 
+#### `scb_in_hb_eco`
+
+#### Claude Proof:
+
+    unfold imm.scb.
+    rewrite sb_in_hb, co_in_eco, fr_in_eco. 
+    arewrite (hb ∪ (hb \ same_loc) ⨾ hb ⨾ (hb \ same_loc) ∪ hb ∩ same_loc ⊆ hb).
+    {
+      generalize (@hb_trans G).
+      basic_solver.
+    }
+    arewrite (hb ∪ eco ∪ eco ⊆ hb ∪ eco).
+    basic_solver.
+
+
+#### Theorem name: 
+#### `ar_int_rfe_ct_rfrmw_rt_in_ar_int_rfe_ct`
+
+#### OpenAI GPT-3.5 Proof:
+
+    rewrite rtE.
+    relsf; unionL.
+    2: { basic_solver. }
+    rewrite <- ct_step.
+    rewrite !seqA.
+    arewrite (rf ⨾ rmw ⊆ rfi).
+    { rewrite (wf_rfD WF), (wf_rmwD WF); basic_solver. }
+    arewrite (rfi⁺ ⊆ rfe).
+    { rewrite (wf_rfiD WF), (wf_rfeD WF); basic_solver 10. }
+    arewrite (rfe ⊆ (rfe ∪ ar_int)⁺) at 1.
+    relsf.
+
+
+#### Theorem name: 
+#### `urr_hb_sc_hb_irr`
+
+#### OpenAI GPT-4 Proof:
+
+    unfold urr.
+    arewrite_id ⦗W_ l⦘.
+    arewrite_id ⦗F ∩₁ Sc⦘.
+    rels.
+    rewrite (crE sc).
+    relsf; repeat (splits; try apply irreflexive_union).
+    - generalize (@hb_trans G); ins; relsf.
+    - rewrite (crE hb); relsf.
+    apply irreflexive_union; splits.
+    apply hb_sc_hb_irr; auto.
+    red in COH; revert COH; basic_solver 20.
+    - rewrite (dom_r (wf_scD WF_SC)).
+    rewrite (crE sc); relsf; repeat (splits; try apply irreflexive_union).
+    rewrite (dom_r (wf_rfD WF)).
+    type_solver 16.
+    rewrite !crE; relsf; unionL; try done.
+    unfold sc; unfold scb; basic_solver 12.
+    unfold coh_sc; basic_solver 12.
+
+#### OpenAI GPT-3.5 Proof:
+
+    unfold urr.
+    rewrite (no_hb_to_init WF), (no_sc_to_init WF), (no_rf_to_init WF).
+    unfold imm_s_hb.hb.
+    relsf; unionL; [basic_solver 12|].
+    rewrite ct_end, <- ct_step; relsf.
+    rewrite rfi_union_rfe.
+    arewrite (rfi ⊆ hb); rewrite <- ct_step; seq_rewrite <- ct_end.
+    rewrite <- ct_step, rfe_ct, inclusion_t_rt, <- ct_step; relsf.
+    rewrite !seqA, <- ct_begin, !seqA.
+    arewrite (hb ⊆ hb^?) at 1.
+    hahn_frame_r.
+    rewrite !seqA.
+    arewrite (sc ⨾ hb ⊆ hb).
+    by apply (@hb_trans G).
+    rewrite <- ct_end.
+    rewrite (dom_l (wf_rfD WF)) at 1; type_solver.
+
+
+#### Theorem name: 
+#### `eco_refl`
+
+#### OpenAI GPT-4 Proof:
+
+    unfold eco, Execution.fr; ins.
+    relsf; unionL; eauto 8 using ct_step with hahn.
+    - arewrite_id ⦗W⦘ at 1; rels.
+    - arewrite_id ⦗R⦘ at 1; rels.
+
+#### OpenAI GPT-3.5 Proof:
+
+    unfold fri, rfi, rfe, fre, eco; relsf.
+    rewrite !crE; relsf; unionL; auto 8 with hahn.
+
+#### Claude Proof:
+
+    unfold eco.
+    relsf.
+    rewrite !crE, !seqA, !unionC; relsf.
+    rewrite (co_fr WF), (fr_co WF); auto.
+    rewrite (rf_fr WF); relsf.
+    setoid_rewrite fr_comm.
+    rewrite !seqA, !unionC; relsf.
+    rewrite union_semi_l, unionC; relsf.
+    rewrite union_abso. 
+    rewrite !unionA; relsf.
+    apply subset_refl.
+    repeat f_equal; auto.
+
+
+### Group C
+
+#### Theorem name: 
+#### `W_sb_same_loc_detour`
+
+#### OpenAI GPT-3.5 Proof:
+
+    unfold detour.
+    arewrite (⦗fun x : actid => ~ is_init x⦘ ⨾ ⦗W⦘ ⨾ sb ∩ same_loc ⨾ ⦗W⦘ ⊆ 
+             ⦗fun x : actid => ~ is_init x⦘ ⨾ ⦗W⦘ ⨾ sb ⨾ same_loc ⨾ ⦗W⦘).
+    basic_solver.
+    rewrite (wf_detourD WF).
+    rewrite (dom_r (wf_same_locD WF)).
+    rewrite (wf_sbE WF).
+    rewrite (sc_per_loc_Wf WF SC_PER_LOC).
+    basic_solver 42.
+
+
+#### Theorem name: 
+#### `rmw_non_init_lr`
+
+#### OpenAI GPT-3.5 Proof:
+
+    ​​split; [|basic_solver].
+    rewrite (wf_rmwD WF).
+    rewrite (wf_rmwE WF).
+    rewrite !seqA.
+    arewrite (⦗R⦘ ⨾ rmw ⨾ ⦗W⦘ ⊆ ⦗set_compl is_init⦘ ⨾ ⦗R⦘ ⨾ rmw ⨾ ⦗W⦘ ⨾ ⦗set_compl is_init⦘).
+    { arewrite (⦗R⦘ ⨾ rmw ⨾ ⦗W⦘ ⊆ sb^?).
+      { rewrite (rmw_in_sb WF). basic_solver. }
+      basic_solver. }
+    basic_solver.
+
+#### OpenAI GPT-4 Proof:
+
+    split; [|basic_solver].
+    rewrite (rmw_from_non_init WF).
+    rewrite (wf_rmwi WF); clear -WF.
+    rewrite (sb_immediate_adjacent WF).
+    clear; basic_solver.
+
+
+#### Theorem name: 
+#### `sbnl_hb_scb`
+
+#### OpenAI GPT-4 Proof:
+
+    unfold OCaml.hb.
+    arewrite (hb ⊆ sb ∪ sb ⨾ ((sb \ same_loc) ∪ rf ⨾ ⦗Sc⦘ ∪ co ∪ fr)^* ⨾ sb).
+    hahn_frame.
+    apply inclusion_t_ind.
+    2: by apply transitiveI; basic_solver.
+    rewrite <- !(union_ct rt_rw) at 1.
+    bynre.
+    rewrite seq_union_r.
+    apply inclusion_union_l.
+    rewrite seq_union_r.
+    apply inclusion_union_l.
+    - rewrite (dom_l (wf_sbE G)).
+      rewrite !seqA, (seq_eqvC (same_loc)).
+      arewrite (⦗E \₁ F ∩₁ R ∩₁ Sc⦘ ⨾ sb ⊆ ⦗E \₁ F⦘ ⨾ sb ⨾ ⦗Sc⦘) by (mode_solver 21).
+      sin_rewrite sb_rf_sc_sc.
+      basic_solver.
+    - rewrite (dom_l (wf_sbE G)).
+      rewrite !seqA, (seq_eqvC (same_loc)).
+      arewrite (⦗E \₁ F⦘ ⨾ sb ⊆ sb ⨾ ⦗F⦘ ⨾ sb) by (mode_solver 21).
+      sin_rewrite sb_rf_sc_sc.
+      basic_solver.
+    - rewrite (dom_l (wf_sbE G)).
+      rewrite !seqA, (seq_eqvC (same_loc)).
+      arewrite (⦗E \₁ F ∩₁ W⦘ ⨾ sb ⊆ ⦗E \₁ F⦘ ⨾ sb ⨾ ⦗Sc⦘) by (mode_solver 21).
+      arewrite (rf ⨾ ⦗Sc⦘ ⊆ sb ∪ co) at 1.
+      { rewrite WRLXF, FSUPPCO; basic_solver. }
+      rewrite rt_unionE.
+      bynb.
+      rewrite !seq_union_l, !seq_union_r.
+      apply inclusion_union_l.
+      rewrite unionC.
+      sin_rewrite sb_rf_sc_sc.
+      basic_solver.
+      arewrite (rf ⨾ ⦗Sc⦘ ⊆ sb ∪ co) at 1.
+      { rewrite RMWSC; basic_solver. }
+      rewrite rt_unionE.
+      bynb.
+      rewrite !seq_union_l, !seq_union_r.
+      apply inclusion_union_l.
+      rewrite RMWSC at 2 4.
+      basic_solver.
+      arewrite (co ⊆ rf^? ⨾ co ⨾ rf^?) by (rewrite fsupp_imm_t; eauto; basic_solver).
+      rewrite !seqA.
+      bynb.
+      rewrite !seq_union_l, !seq_union_r.
+      apply inclusion_union_l.
+      rewrite RMWSC at 2 4.
+      basic_solver.
+      rewrite RMWSC at 2 4.
+      basic_solver.
+
+#### Theorem name: 
+#### `issued_ta_issue`
+
+#### OpenAI GPT-3.5 Proof:
+
+    unfold issued.
+    unfolder; split; ins; desf.
+    {
+    destruct y; ins.
+    desf.
+    }
+    eexists (_, _).
+    splits; ins; eauto.
+
+#### OpenAI GPT-4 Proof:
+
+    unfold issued.
+    unfolder; split; ins; desf.
+    {
+    destruct y; ins; desf.
+    }
+    by eexists (_, _); splits; ins.
+
+
+
+<!-- TODO: reference to the the results in abstract? -->
\ No newline at end of file

From 824dfde4d8031148165df89ed9bb821a66e8e0ab Mon Sep 17 00:00:00 2001
From: "Nikita.Khramov" <nikita.khramov@gmail.com>
Date: Mon, 29 Apr 2024 17:47:57 +0200
Subject: [PATCH 19/29] Fixed Experiments Report

---
 etc/docs/{report.md => benchmarking_imm_45theorems_report.md} | 2 ++
 1 file changed, 2 insertions(+)
 rename etc/docs/{report.md => benchmarking_imm_45theorems_report.md} (99%)

diff --git a/etc/docs/report.md b/etc/docs/benchmarking_imm_45theorems_report.md
similarity index 99%
rename from etc/docs/report.md
rename to etc/docs/benchmarking_imm_45theorems_report.md
index f0f35e7b..5c480d90 100644
--- a/etc/docs/report.md
+++ b/etc/docs/benchmarking_imm_45theorems_report.md
@@ -116,6 +116,8 @@ The `multiroundProfile` used for all the models:
 }
 ```
 
+The temperature parameter in all models was set to `1`.
+
 ## Results
 
 In the table below you can find the results of our experiments. For each of the theorems we show its group  and the methods which proved the theorem during our experiments.

From 6a4bef001452f9b31d916fa5220d4443cf81bda5 Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Wed, 1 May 2024 16:52:26 +0200
Subject: [PATCH 20/29] Fix comment in proofChecker, fix completionGenerator
 for Proof using.

---
 src/core/completionGenerator.ts             |  2 +-
 src/core/coqProofChecker.ts                 | 11 +++--
 src/llm/llmServices/openai/openAiService.ts |  1 -
 src/test/benchmark/runBenchmark.test.ts     | 48 +++++++--------------
 4 files changed, 22 insertions(+), 40 deletions(-)

diff --git a/src/core/completionGenerator.ts b/src/core/completionGenerator.ts
index ebdb8db4..a321f02c 100644
--- a/src/core/completionGenerator.ts
+++ b/src/core/completionGenerator.ts
@@ -270,7 +270,7 @@ function prepareProofToCheck(proof: string) {
 
     // 2. Remove Proof. and Qed.
     preparedProof = preparedProof
-        .replace(/Proof using\./g, "")
+        .replace(/Proof using.*?\./g, "")
         .replace(/Proof\./g, "")
         .replace(/Qed\./g, "");
 
diff --git a/src/core/coqProofChecker.ts b/src/core/coqProofChecker.ts
index 09fd73dc..e4647932 100644
--- a/src/core/coqProofChecker.ts
+++ b/src/core/coqProofChecker.ts
@@ -111,8 +111,7 @@ export class CoqProofChecker implements CoqProofCheckerInterface {
             await this.coqLspClient.openTextDocument(auxFileUri);
             let auxFileVersion = 1;
 
-            // 3. Iterate over the proofs and issue getFirstGoalAtPoint request with
-            // pretac = proof
+            // 3. Iterate over the proofs and сheck them
             for (const proof of proofs) {
                 // 3.1. Check if the proof contains admit
                 if (this.checkIfProofContainsAdmit(proof)) {
@@ -125,10 +124,10 @@ export class CoqProofChecker implements CoqProofCheckerInterface {
                 }
 
                 auxFileVersion += 1;
-                // 4. Append the proof the end of the aux file
+                // 3.2. Append the proof the end of the aux file
                 appendFileSync(auxFileUri.fsPath, proof);
 
-                // 5. Issue update text request
+                // 3.3. Issue update text request
                 const diagnosticMessage =
                     await this.coqLspClient.updateTextDocument(
                         sourceFileContentPrefix,
@@ -137,14 +136,14 @@ export class CoqProofChecker implements CoqProofCheckerInterface {
                         auxFileVersion
                     );
 
-                // 6. Check diagnostics
+                // 3.4. Check diagnostics
                 results.push({
                     proof: proof,
                     isValid: diagnosticMessage === undefined,
                     diagnostic: diagnosticMessage,
                 });
 
-                // 7. Bring file to the previous state
+                // 3.5. Bring file to the previous state
                 writeFileSync(auxFileUri.fsPath, sourceFileContent);
             }
         } finally {
diff --git a/src/llm/llmServices/openai/openAiService.ts b/src/llm/llmServices/openai/openAiService.ts
index b26596fc..e41ccf4a 100644
--- a/src/llm/llmServices/openai/openAiService.ts
+++ b/src/llm/llmServices/openai/openAiService.ts
@@ -32,7 +32,6 @@ export class OpenAiService extends LLMService {
         params: ModelParams,
         choices: number
     ): Promise<string[]> {
-        // TODO: support retries
         if (choices <= 0) {
             return [];
         }
diff --git a/src/test/benchmark/runBenchmark.test.ts b/src/test/benchmark/runBenchmark.test.ts
index a6d04f55..f5b29e6a 100644
--- a/src/test/benchmark/runBenchmark.test.ts
+++ b/src/test/benchmark/runBenchmark.test.ts
@@ -32,43 +32,27 @@ class DatasetItem {
     }
 }
 
-// const simpleAutoBenchmark: Benchmark = {
-//     name: "Complete simple examples with `auto`",
-//     items: [new DatasetItem("auto_benchmark.v")],
-//     modelsParams: onlyAutoModelsParams,
-//     requireAllAdmitsCompleted: true,
-//     benchmarkFullTheorems: true,
-//     benchmarkAdmits: true,
-//     timeoutMinutes: 1,
-// };
-
-// const mixedAutoBenchmark: Benchmark = {
-//     name: "Complete mixed examples (both simple & hard) with `auto`",
-//     items: [new DatasetItem("mixed_benchmark.v")],
-//     modelsParams: onlyAutoModelsParams,
-//     requireAllAdmitsCompleted: false,
-//     benchmarkFullTheorems: true,
-//     benchmarkAdmits: true,
-//     timeoutMinutes: 1,
-// };
-
-const immBenchmark: Benchmark = {
-    name: "Complete imm with auto",
-    items: [
-        new DatasetItem(
-            "imm/src/basic/Execution_eco.v",
-            ["rf_rmw_ct_in_co"],
-            "imm"
-        ),
-    ],
+const simpleAutoBenchmark: Benchmark = {
+    name: "Complete simple examples with `auto`",
+    items: [new DatasetItem("auto_benchmark.v")],
+    modelsParams: onlyAutoModelsParams,
+    requireAllAdmitsCompleted: true,
+    benchmarkFullTheorems: true,
+    benchmarkAdmits: true,
+    timeoutMinutes: 1,
+};
+
+const mixedAutoBenchmark: Benchmark = {
+    name: "Complete mixed examples (both simple & hard) with `auto`",
+    items: [new DatasetItem("mixed_benchmark.v")],
     modelsParams: onlyAutoModelsParams,
     requireAllAdmitsCompleted: false,
     benchmarkFullTheorems: true,
-    benchmarkAdmits: false,
-    timeoutMinutes: 60,
+    benchmarkAdmits: true,
+    timeoutMinutes: 1,
 };
 
-const benchmarks: Benchmark[] = [immBenchmark];
+const benchmarks: Benchmark[] = [simpleAutoBenchmark, mixedAutoBenchmark];
 
 suite("Benchmark", () => {
     const datasetDir = getDatasetDir();

From 17ad4e52a8fcba66ec752a351c0925189fe5f42a Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Wed, 1 May 2024 16:55:03 +0200
Subject: [PATCH 21/29] fix: change name of the benchmarking report

---
 ...hmarking_imm_45theorems_report.md => benchmarking_report01.md} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename etc/docs/{benchmarking_imm_45theorems_report.md => benchmarking_report01.md} (100%)

diff --git a/etc/docs/benchmarking_imm_45theorems_report.md b/etc/docs/benchmarking_report01.md
similarity index 100%
rename from etc/docs/benchmarking_imm_45theorems_report.md
rename to etc/docs/benchmarking_report01.md

From e5a0250d78c31c8a266eea294413dec86e816e2c Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Wed, 1 May 2024 16:57:04 +0200
Subject: [PATCH 22/29] fix: Remove unused lib

---
 package-lock.json | 13 -------------
 package.json      |  1 -
 2 files changed, 14 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index b56f78ef..454f0e37 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -35,7 +35,6 @@
         "@types/glob": "^8.1.0",
         "@types/mocha": "^10.0.1",
         "@types/node": "20.2.5",
-        "@types/qs": "^6.9.15",
         "@types/vscode": "^1.82.0",
         "@types/yargs": "^17.0.24",
         "@typescript-eslint/eslint-plugin": "^5.62.0",
@@ -831,12 +830,6 @@
         "form-data": "^3.0.0"
       }
     },
-    "node_modules/@types/qs": {
-      "version": "6.9.15",
-      "resolved": "https://registry.npmjs.org/@types/qs/-/qs-6.9.15.tgz",
-      "integrity": "sha512-uXHQKES6DQKKCLh441Xv/dwxOq1TVS3JPUMlEqoEglvlhR6Mxnlew/Xq/LRVHpLyk7iK3zODe1qYHIMltO7XGg==",
-      "dev": true
-    },
     "node_modules/@types/semver": {
       "version": "7.5.1",
       "resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.1.tgz",
@@ -7170,12 +7163,6 @@
         "form-data": "^3.0.0"
       }
     },
-    "@types/qs": {
-      "version": "6.9.15",
-      "resolved": "https://registry.npmjs.org/@types/qs/-/qs-6.9.15.tgz",
-      "integrity": "sha512-uXHQKES6DQKKCLh441Xv/dwxOq1TVS3JPUMlEqoEglvlhR6Mxnlew/Xq/LRVHpLyk7iK3zODe1qYHIMltO7XGg==",
-      "dev": true
-    },
     "@types/semver": {
       "version": "7.5.1",
       "resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.1.tgz",
diff --git a/package.json b/package.json
index c8fc3154..8700c4b5 100644
--- a/package.json
+++ b/package.json
@@ -365,7 +365,6 @@
     "@types/glob": "^8.1.0",
     "@types/mocha": "^10.0.1",
     "@types/node": "20.2.5",
-    "@types/qs": "^6.9.15",
     "@types/vscode": "^1.82.0",
     "@types/yargs": "^17.0.24",
     "@typescript-eslint/eslint-plugin": "^5.62.0",

From 3776e7dbc9dc8950b8527cabf8cc76d2c8ea3c6c Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Wed, 1 May 2024 17:12:47 +0200
Subject: [PATCH 23/29] Prepare Changelog for v2.1.0 release

---
 CHANGELOG.md      | 12 ++++++++++++
 package-lock.json |  4 ++--
 package.json      |  2 +-
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 54e7901a..d55efdc3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,17 @@
 # Change Log
 
+### 2.1.0
+Major: 
+- Create a (still in development and improvement) benchmarking system. A guide on how to use it is in the README.
+- Conduct an experiment on the performance of different LLMs, using the developed infrastructure. Benchmarking report is located in the [docs folder](etc/docs/benchmarking_report.md).
+- Correctly handle and display settings which occur when the user settings are not correctly set.
+
+Minor: 
+- Set order of contributed settings.
+- Add a comprehensive user settings guide to the README. 
+- Fix issue with Grazie service not being able to correctly accept coq ligatures.
+- Fix issue that occured when generated proof contained `Proof using {...}.` construct. 
+
 ### 2.0.0
 - Added multiple strategies for ranking theorems from the working file. As LLM context window is limited, we sometimes should somehow choose a subset of theorems we want to provide as references to the LLM. Thus, we have made a few strategies for ranking theorems. Now there are only 2 of them, but there are more to come. Now we have a strategy that randomly picks theorems, and also the one that ranks them depending on the distance from the hole.
 - Now different holes are solved in parallel. This is a huge improvement in terms of performance.
diff --git a/package-lock.json b/package-lock.json
index 454f0e37..f90ba9cf 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "coqpilot",
-  "version": "2.0.0",
+  "version": "2.1.0",
   "lockfileVersion": 2,
   "requires": true,
   "packages": {
     "": {
       "name": "coqpilot",
-      "version": "2.0.0",
+      "version": "2.1.0",
       "dependencies": {
         "@codemirror/autocomplete": "^6.9.1",
         "ajv": "^8.12.0",
diff --git a/package.json b/package.json
index 8700c4b5..1d989ca0 100644
--- a/package.json
+++ b/package.json
@@ -8,7 +8,7 @@
     "url": "https://github.com/K-dizzled/coqpilot"
   },
   "publisher": "JetBrains-Research",
-  "version": "2.0.0",
+  "version": "2.1.0",
   "engines": {
     "vscode": "^1.82.0"
   },

From 4b8376d1644533277ee45e1676cdf6dd563bcc12 Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Wed, 1 May 2024 17:14:31 +0200
Subject: [PATCH 24/29] fix: fix path in changelog

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d55efdc3..86379362 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,7 @@
 ### 2.1.0
 Major: 
 - Create a (still in development and improvement) benchmarking system. A guide on how to use it is in the README.
-- Conduct an experiment on the performance of different LLMs, using the developed infrastructure. Benchmarking report is located in the [docs folder](etc/docs/benchmarking_report.md).
+- Conduct an experiment on the performance of different LLMs, using the developed infrastructure. Benchmarking report is located in the [docs folder](etc/docs/benchmarking_report01.md).
 - Correctly handle and display settings which occur when the user settings are not correctly set.
 
 Minor: 

From 47afdf53604183238c5e4c18c8f4925554c47ed3 Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Thu, 2 May 2024 23:17:56 +0200
Subject: [PATCH 25/29] fix: fix incorrect llm response format in 0-shot +
 proof-repair

---
 src/llm/llmServices/llmService.ts | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/llm/llmServices/llmService.ts b/src/llm/llmServices/llmService.ts
index 348a0d0a..59366fff 100644
--- a/src/llm/llmServices/llmService.ts
+++ b/src/llm/llmServices/llmService.ts
@@ -132,6 +132,12 @@ export abstract class GeneratedProof {
         llmService: LLMService,
         previousProofVersions?: ProofVersion[]
     ) {
+        // Sometimes, expecially when 0-shot prompting,
+        // Gpt wraps the proof in a tone of comments and other text.
+        // The code block is somewhere in the middle.
+        // This method extracts the code block from the message.
+        proof = this.parseProofFromMessage(proof);
+
         this.llmService = llmService;
         this.modelParams = modelParams;
 
@@ -167,8 +173,7 @@ export abstract class GeneratedProof {
 
     protected async generateNextVersion(
         chat: ChatHistory,
-        choices: number,
-        postprocessProof: (proof: string) => string = (proof) => proof
+        choices: number
     ): Promise<GeneratedProof[]> {
         if (!this.nextVersionCanBeGenerated() || choices <= 0) {
             return [];
@@ -180,7 +185,7 @@ export abstract class GeneratedProof {
         );
         return newProofs.map((proof: string) =>
             this.llmService.constructGeneratedProof(
-                postprocessProof(proof),
+                proof,
                 this.proofGenerationContext,
                 this.modelParams,
                 this.proofVersions
@@ -210,15 +215,11 @@ export abstract class GeneratedProof {
             this.modelParams
         );
 
-        return this.generateNextVersion(
-            chat,
-            choices,
-            this.parseFixedProofFromMessage.bind(this)
-        );
+        return this.generateNextVersion(chat, choices);
     }
 
-    parseFixedProofFromMessage(message: string): string {
-        const regex = /Proof\.(.*?)Qed\./s;
+    parseProofFromMessage(message: string): string {
+        const regex = /Proof(.*?)Qed\./s;
         const match = regex.exec(message);
         if (match) {
             return match[0];

From 7bb1c773588ad1d75003b916aa1c55c651a41566 Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Fri, 3 May 2024 15:19:02 +0200
Subject: [PATCH 26/29] Update benchmark report with experiments with
 firstorder

---
 .../{ => benchmark}/benchmarking_report01.md  | 31 ++++++++++++++-----
 1 file changed, 24 insertions(+), 7 deletions(-)
 rename etc/docs/{ => benchmark}/benchmarking_report01.md (95%)

diff --git a/etc/docs/benchmarking_report01.md b/etc/docs/benchmark/benchmarking_report01.md
similarity index 95%
rename from etc/docs/benchmarking_report01.md
rename to etc/docs/benchmark/benchmarking_report01.md
index 5c480d90..0d2cfafb 100644
--- a/etc/docs/benchmarking_report01.md
+++ b/etc/docs/benchmark/benchmarking_report01.md
@@ -127,20 +127,20 @@ In the table below you can find the results of our experiments. For each of the
 |-------|------|--------------|-------------------|--------------|----------------|------------------|------------------|
 | A | [basic/Execution_eco.v](https://github.com/weakmemory/imm/tree/coq819/src/basic/Execution_eco.v) | `rf_rmw_ct_in_co`| &#x2717; | [&#x2713;](#rf_rmw_ct_in_co) | [&#x2713;](#rf_rmw_ct_in_co) | &#x2717; | &#x2717; |
 | A | [imm/imm_hb.v](https://github.com/weakmemory/imm/tree/coq819/src/imm/imm_hb.v) | `cr_hb_hb`| &#x2717; | [&#x2713;](#cr_hb_hb) | [&#x2713;](#cr_hb_hb) | &#x2717; | [&#x2713;](#cr_hb_hb) |
-| A | [basic/FinExecution.v](https://github.com/weakmemory/imm/tree/coq819/src/basic/FinExecution.v) | `fin_exec_same_events`| &#x2717; | [&#x2713;](#fin_exec_same_events) | [&#x2713;](#fin_exec_same_events) | &#x2717; | [&#x2713;](#fin_exec_same_events) |
+| A | [basic/FinExecution.v](https://github.com/weakmemory/imm/tree/coq819/src/basic/FinExecution.v) | `fin_exec_same_events`| [&#x2713;](#fin_exec_same_events) | [&#x2713;](#fin_exec_same_events) | [&#x2713;](#fin_exec_same_events) | &#x2717; | [&#x2713;](#fin_exec_same_events) |
 | A | [basic/Execution.v](https://github.com/weakmemory/imm/tree/coq819/src/basic/Execution.v) | `sb_trans`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
 | A | [basic/Execution.v](https://github.com/weakmemory/imm/tree/coq819/src/basic/Execution.v) | `sb_same_loc_W_trans`| &#x2717; | [&#x2713;](#sb_same_loc_w_trans) | [&#x2713;](#sb_same_loc_w_trans) | &#x2717; | &#x2717; |
 | A | [basic/Events.v](https://github.com/weakmemory/imm/tree/coq819/src/basic/Events.v) | `restr_eq_rel_same_loc`| [&#x2713;](#restr_eq_rel_same_loc) | [&#x2713;](#restr_eq_rel_same_loc) | [&#x2713;](#restr_eq_rel_same_loc) | [&#x2713;](#restr_eq_rel_same_loc) | [&#x2713;](#restr_eq_rel_same_loc) |
-| A | [basic/Events.v](https://github.com/weakmemory/imm/tree/coq819/src/basic/Events.v) | `same_loc_loceq`| &#x2717; | [&#x2713;](#same_loc_loceq) | [&#x2713;](#same_loc_loceq) | &#x2717; | &#x2717; |
+| A | [basic/Events.v](https://github.com/weakmemory/imm/tree/coq819/src/basic/Events.v) | `same_loc_loceq`| [&#x2713;](#same_loc_loceq) | [&#x2713;](#same_loc_loceq) | [&#x2713;](#same_loc_loceq) | &#x2717; | &#x2717; |
 | A | [traversal/TraversalConfigAltOld.v](https://github.com/weakmemory/imm/tree/coq819/src/traversal/TraversalConfigAltOld.v) | `otc_I_ar_rfrmw_I_implied_helper_2`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
 | A | [simhelpers/CertCOhelper.v](https://github.com/weakmemory/imm/tree/coq819/src/simhelpers/CertCOhelper.v) | `wf_colD`| &#x2717; | [&#x2713;](#wf_cold) | [&#x2713;](#wf_cold) | &#x2717; | &#x2717; |
 | A | [imm/SubExecution.v](https://github.com/weakmemory/imm/tree/coq819/src/imm/SubExecution.v) | `sub_rfe`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
 | A | [imm/SubExecution.v](https://github.com/weakmemory/imm/tree/coq819/src/imm/SubExecution.v) | `sub_Rel`| &#x2717; | [&#x2713;](#sub_rel) | &#x2717; | &#x2717; | &#x2717; |
 | A | [imm/imm_s.v](https://github.com/weakmemory/imm/tree/coq819/src/imm/imm_s.v) | `wf_rfirmwsf`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
-| A | [traversal/TraversalConfig.v](https://github.com/weakmemory/imm/tree/coq819/src/traversal/TraversalConfig.v) | `issuableW`| &#x2717; | &#x2717; | [&#x2713;](#issuablew) | &#x2717; | [&#x2713;](#issuablew) |
+| A | [traversal/TraversalConfig.v](https://github.com/weakmemory/imm/tree/coq819/src/traversal/TraversalConfig.v) | `issuableW`| [&#x2713;](#issuablew) | &#x2717; | [&#x2713;](#issuablew) | &#x2717; | [&#x2713;](#issuablew) |
 | A | [traversal/TraversalConfig.v](https://github.com/weakmemory/imm/tree/coq819/src/traversal/TraversalConfig.v) | `w_coverable_issued`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
 | A | [traversal/TraversalConfig.v](https://github.com/weakmemory/imm/tree/coq819/src/traversal/TraversalConfig.v) | `ar_rfrmw_rt_CI_in_I`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
-| A | [hardware/imm_sToARM.v](https://github.com/weakmemory/imm/tree/coq819/src/hardware/imm_sToARM.v) | `WF`| &#x2717; | [&#x2713;](#wf) | &#x2717; | &#x2717; | [&#x2713;](#wf) |
+| A | [hardware/imm_sToARM.v](https://github.com/weakmemory/imm/tree/coq819/src/hardware/imm_sToARM.v) | `WF`| [&#x2713;](#wf) | [&#x2713;](#wf) | &#x2717; | &#x2717; | [&#x2713;](#wf) |
 | A | [travorder/TraversalOrder.v](https://github.com/weakmemory/imm/tree/coq819/src/travorder/TraversalOrder.v) | `FWBOBFWBOB`| &#x2717; | [&#x2713;](#fwbobfwbob) | [&#x2713;](#fwbobfwbob) | &#x2717; | &#x2717; |
 | A | [travorder/EventsTraversalOrder.v](https://github.com/weakmemory/imm/tree/coq819/src/travorder/EventsTraversalOrder.v) | `dom_rfe_acq_sb_issued`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
 | A | [travorder/EventsTraversalOrder.v](https://github.com/weakmemory/imm/tree/coq819/src/travorder/EventsTraversalOrder.v) | `rfrmw_coverable_in_I`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
@@ -149,7 +149,7 @@ In the table below you can find the results of our experiments. For each of the
 | B | [travorder/TraversalOrder.v](https://github.com/weakmemory/imm/tree/coq819/src/travorder/TraversalOrder.v) | `event_issue_finite_inj`| &#x2717; | [&#x2713;](#event_issue_finite_inj) | [&#x2713;](#event_issue_finite_inj) | &#x2717; | &#x2717; |
 | B | [basic/ProgToExecutionProperties.v](https://github.com/weakmemory/imm/tree/coq819/src/basic/ProgToExecutionProperties.v) | `ectrl_increasing`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
 | B | [imm/Sc_opt.v](https://github.com/weakmemory/imm/tree/coq819/src/imm/Sc_opt.v) | `scb_in_hb_eco`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | [&#x2713;](#scb_in_hb_eco) |
-| B | [travorder/TLSCoherency.v](https://github.com/weakmemory/imm/tree/coq819/src/travorder/TLSCoherency.v) | `tls_coherent_ext_union`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
+| B | [travorder/TLSCoherency.v](https://github.com/weakmemory/imm/tree/coq819/src/travorder/TLSCoherency.v) | `tls_coherent_ext_union`| [&#x2713;](#tls_coherent_ext_union) | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
 | B | [ldrf/LDRF_Fsc.v](https://github.com/weakmemory/imm/tree/coq819/src/ldrf/LDRF_Fsc.v) | `RFE1`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
 | B | [imm/imm_hb.v](https://github.com/weakmemory/imm/tree/coq819/src/imm/imm_hb.v) | `wf_swD`| &#x2717; | &#x2717; | &#x2717; | &#x2717; | &#x2717; |
 | B | [imm/imm_s_ppo.v](https://github.com/weakmemory/imm/tree/coq819/src/imm/imm_s_ppo.v) | `ar_int_rfe_ct_rfrmw_rt_in_ar_int_rfe_ct`| &#x2717; | &#x2717; | [&#x2713;](#ar_int_rfe_ct_rfrmw_rt_in_ar_int_rfe_ct) | &#x2717; | &#x2717; |
@@ -224,6 +224,10 @@ by rewrite eco_trans; eauto using eco_irr.
 #### Theorem name: 
 #### `fin_exec_same_events`
 
+#### Firstorder tactic Proof:
+
+    firstorder auto with *.
+
 #### OpenAI GPT-4 Proof:
 
     red in FIN.
@@ -269,9 +273,9 @@ by rewrite eco_trans; eauto using eco_irr.
 #### Theorem name: 
 #### `restr_eq_rel_same_loc`
 
-#### Predefined Tactic Proof:
+#### Firstorder tactic Proof:
 
-    auto.
+    firstorder auto with *.
 
 #### OpenAI GPT-4 Proof:
 
@@ -292,6 +296,10 @@ by rewrite eco_trans; eauto using eco_irr.
 #### Theorem name: 
 #### `same_loc_loceq`
 
+#### Firstorder tactic Proof:
+
+    firstorder auto with *.
+
 #### OpenAI GPT-4 Proof:
 
     unfold same_loc; unfolder; firstorder.
@@ -318,6 +326,8 @@ by rewrite eco_trans; eauto using eco_irr.
 
 #### Theorem name:  
 #### `issuableW`
+#### Firstorder tactic Proof:
+    firstorder auto with *.
 #### OpenAI GPT-3.5 Proof:
     unfold issuable.
     basic_solver.
@@ -327,6 +337,8 @@ by rewrite eco_trans; eauto using eco_irr.
 
 #### Theorem name:  
 #### `WF`
+#### Firstorder tactic Proof:
+    firstorder auto with *.
 #### OpenAI GPT-4 Proof:
     by apply CON.
 #### Claude Proof:
@@ -369,6 +381,11 @@ by rewrite eco_trans; eauto using eco_irr.
     red in IN0.
     destruct x as [[]]; ins; auto.
 
+#### Theorem name: 
+#### `tls_coherent_ext_union`
+
+#### Firstorder tactic Proof:
+    firstorder auto with *.
 
 #### Theorem name: 
 #### `scb_in_hb_eco`

From 634e7dfa5a23c15ab9c5a6cebeda5ffc04f829eb Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Fri, 3 May 2024 15:20:43 +0200
Subject: [PATCH 27/29] fix: fix typo in benchmark report

---
 etc/docs/benchmark/benchmarking_report01.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etc/docs/benchmark/benchmarking_report01.md b/etc/docs/benchmark/benchmarking_report01.md
index 0d2cfafb..f71cc1e6 100644
--- a/etc/docs/benchmark/benchmarking_report01.md
+++ b/etc/docs/benchmark/benchmarking_report01.md
@@ -16,7 +16,7 @@ The list of the chosen theorems divided by groups you can find in the table prov
 
 ## Methods
 
-In our experiments we compared different methods which can be used by CoqPilot:  Predefined tactic (`auto`, `intuition`, `easy`), OpenAI GPT-4, OpenAI GPT-3.5, LLaMA-2 13B Chat, Anthropic Claude. We have used following parameters for each of the models:
+In our experiments we compared different methods which can be used by CoqPilot:  Predefined tactic (`firstorder auto with *.`), OpenAI GPT-4, OpenAI GPT-3.5, LLaMA-2 13B Chat, Anthropic Claude. We have used following parameters for each of the models:
 
 ### OpenAI GPT-4 
 

From c5bb13720e1f4980b5ead83ef84728b9a51d01d9 Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Fri, 3 May 2024 17:36:41 +0200
Subject: [PATCH 28/29] feature: add JaccardIndex metric theorem ranker

---
 README.md                                     |  2 +-
 package.json                                  |  6 ++-
 .../JaccardIndexContextTheoremsRanker.ts      | 53 +++++++++++++++++++
 src/extension/coqPilot.ts                     |  3 ++
 4 files changed, 61 insertions(+), 3 deletions(-)
 create mode 100644 src/core/contextTheoremRanker/JaccardIndexContextTheoremsRanker.ts

diff --git a/README.md b/README.md
index a82abc7a..3be9387e 100644
--- a/README.md
+++ b/README.md
@@ -102,7 +102,7 @@ Comment: Such files are not visible in the vscode explorer, because plugin adds
 
 This extension contributes the following settings:
 
-* `coqpilot.contextTheoremsRankerType` : The type of theorems ranker that will be used to select theorems for proof generation (when context is smaller than taking all of them). Either randomly or by distance from the theorem, with the currently generated admit. 
+* `coqpilot.contextTheoremsRankerType` : The type of theorems ranker that will be used to select theorems for proof generation (when context is smaller than taking all of them). Either randomly, by Jacard index (similarity metric) or by distance from the theorem, with the currently observed admit. 
 * `coqpilot.loggingVerbosity` : Verbosity of the logs. Could be `info`, `debug`.
 
 * `coqpilot.openAiModelsParameters`, `coqpilot.predefinedProofsModelsParameters`, `coqpilot.grazieModelsParameters` and `coqpilot.lmStudioModelsParameters`:
diff --git a/package.json b/package.json
index 1d989ca0..a4981794 100644
--- a/package.json
+++ b/package.json
@@ -318,11 +318,13 @@
             "type": "string",
             "enum": [
               "distance",
-              "random"
+              "random",
+              "jaccardIndex"
             ],
             "markdownEnumDescriptions": [
               "Theorems are selected based on the distance to the current cursor position.",
-              "Theorems are selected randomly."
+              "Theorems are selected randomly.", 
+              "Theorems are selected based on the Jaccard index. This metric aims to pick the most similar theorems to the goal currenly being proved."
             ],
             "description": "Context of the LLM is limited. Usually not all theorems from the file may be used in the completion request. This parameter defines the way theorems are selected for the completion.",
             "default": "distance"
diff --git a/src/core/contextTheoremRanker/JaccardIndexContextTheoremsRanker.ts b/src/core/contextTheoremRanker/JaccardIndexContextTheoremsRanker.ts
new file mode 100644
index 00000000..a8126881
--- /dev/null
+++ b/src/core/contextTheoremRanker/JaccardIndexContextTheoremsRanker.ts
@@ -0,0 +1,53 @@
+import { Goal, Hyp, PpString } from "../../coqLsp/coqLspTypes";
+
+import { Theorem } from "../../coqParser/parsedTypes";
+import { CompletionContext } from "../completionGenerator";
+
+import { ContextTheoremsRanker } from "./contextTheoremsRanker";
+
+/**
+ * Ranks theorems based on how similar their statements are to
+ * the current goal context. Metric is calculated on the
+ * concatenated hypothesis and conclusion.
+ *
+ * J(A, B) = |A ∩ B| / |A ∪ B|
+ */
+export class JaccardIndexContextTheoremsRanker
+    implements ContextTheoremsRanker
+{
+    private hypToString(hyp: Hyp<PpString>): string {
+        return `${hyp.names.join(" ")} : ${hyp.ty}`;
+    }
+
+    private goalAsTheorem(proofGoal: Goal<PpString>): string {
+        const auxTheoremConcl = proofGoal?.ty;
+        const theoremIndeces = proofGoal?.hyps
+            .map((hyp) => `(${this.hypToString(hyp)})`)
+            .join(" ");
+        return `Theorem helper_theorem ${theoremIndeces} : ${auxTheoremConcl}.`;
+    }
+
+    rankContextTheorems(
+        theorems: Theorem[],
+        completionContext: CompletionContext
+    ): Theorem[] {
+        const goal = completionContext.proofGoal;
+        const goalTheorem = this.goalAsTheorem(goal);
+
+        const jaccardIndex = (theorem: Theorem): number => {
+            const theoremStatement = theorem.statement;
+            const completionTokens = goalTheorem.split(" ");
+            const theoremTokens = theoremStatement.split(" ");
+
+            const intersection = completionTokens.filter((token) =>
+                theoremTokens.includes(token)
+            );
+
+            const union = new Set([...completionTokens, ...theoremTokens]);
+
+            return intersection.length / union.size;
+        };
+
+        return theorems.sort((a, b) => jaccardIndex(b) - jaccardIndex(a));
+    }
+}
diff --git a/src/extension/coqPilot.ts b/src/extension/coqPilot.ts
index f9e5a098..a9e345bd 100644
--- a/src/extension/coqPilot.ts
+++ b/src/extension/coqPilot.ts
@@ -40,6 +40,7 @@ import {
     ProcessEnvironment,
     SourceFileEnvironment,
 } from "../core/completionGenerator";
+import { JaccardIndexContextTheoremsRanker } from "../core/contextTheoremRanker/JaccardIndexContextTheoremsRanker";
 import { ContextTheoremsRanker } from "../core/contextTheoremRanker/contextTheoremsRanker";
 import { DistanceContextTheoremsRanker } from "../core/contextTheoremRanker/distanceContextTheoremsRanker";
 import { RandomContextTheoremsRanker } from "../core/contextTheoremRanker/randomContextTheoremsRanker";
@@ -334,6 +335,8 @@ export class CoqPilot {
                 return new DistanceContextTheoremsRanker();
             case "random":
                 return new RandomContextTheoremsRanker();
+            case "jaccardIndex":
+                return new JaccardIndexContextTheoremsRanker();
             default:
                 throw new Error(
                     `Unknown context theorems ranker type: ${rankerType}`

From af0e80d8c2827f71501f01817513b846b4387f53 Mon Sep 17 00:00:00 2001
From: Andrei Kozyrev <kozyrev.andreiii2016@gmail.com>
Date: Sat, 11 May 2024 15:23:52 +0200
Subject: [PATCH 29/29] fix: update README after merge + update future plans

---
 README.md | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 64b297c1..9f6cb992 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,8 @@ Now `coqpilot` is in early beta and seeks for feedbacks. Please feel free to ope
   - 🎛 [How VsCode settings work](#how-vscode-settings-work)
   - 🧰 [Model Configuration](#model-configuration)
 - 📌 [Contributed Commands](#contributed-commands)
-- 🚧 [Planned Features](#planned-features)
+- 📊 [Benchmark](#benchmark)
+- 🔜 [Future Plans](#future-plans)
 - 📜 [Release Notes](#release-notes)
 
 ## Requirements
@@ -193,11 +194,7 @@ Another thing to keep in mind: We are still in beta and changes in settings may
 
 * `coqpilot.perform_completion_under_cursor`: Try to generate proof for the goal under the cursor.
 * `coqpilot.perform_completion_for_all_admits`: Try to prove all holes (admitted goals) in the current file.
-* `coqpilot.perform_completion_in_selection`: Try to prove holes (admitted goals) in selection.
-
-## Planned Features
-
-- Add benchmarking options for various models: soon. 
+* `coqpilot.perform_completion_in_selection`: Try to prove holes (admitted goals) in selection. 
 
 ## Benchmark
 
@@ -210,12 +207,12 @@ After that, you need to build the projects. And be careful, the actively maintai
 
 First things first, the process of running the benchmark is not perfectly automated yet. We are working on it. For now, one project (one unit containing nix environment) shall be ran at a time. Let's say you are going to run the benchmark on the `imm` project. You will have to do the following: 
 
-0. Go the the `imm` subdirectory and add a `_CoqProject` file in the root with the following: 
+<!-- 0. Go the the `imm` subdirectory and add a `_CoqProject` file in the root with the following: 
     ```
     -I result/lib/coq/8.19/user-contrib/imm
     -R result/lib/coq/8.19/user-contrib/imm imm
     ```
-    This is needed for the lsp-server to correctly resolve file dependencies.
+    This is needed for the lsp-server to correctly resolve file dependencies. -->
 
 1. Install nix, as specified in the [here](https://nixos.org/download.html). 
 
@@ -228,15 +225,21 @@ First things first, the process of running the benchmark is not perfectly automa
 3. Go to the `imm` subdirectory, apply the nix environment (without it the project will NOT build) and build the project: 
     ```bash
     cd dataset/imm 
-    nix-shell 
     nix-build
+    nix-shell 
     ```
-4. Return to the project root not exiting the nix-shell. Run the benchmark: 
+4. Make sure the `_CoqProject` was successfully generated in the root of your mroject. Return to the project root not exiting the nix-shell. Run the benchmark: 
     ```bash
     cd ../../
     npm run benchmark
     ```    
 
+## Future plans
+
+- Currently the user needs to manually enter the nix shell to get the correct environment for the benchmarks. We are working on automating this process.
+- Benchmarking system is evolving and will soon become more stable with smart scheduling (choice of models/services depending on availability and token limit counts) and automatically generated informative reports for the user.
+- Get rid of the overhead due to hacks with coq-lsp and the aux files.
+
 ## Release Notes
 
 Release notes could be found in the [CHANGELOG.md](https://github.com/JetBrains-Research/coqpilot/blob/refactor/CHANGELOG.md) file.