JetBrains-Research · K-dizzled · Oct 16, 2024 · May 27, 2024 · May 28, 2024 · May 28, 2024
diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
@@ -20,7 +20,7 @@ on:
 
 env: 
   coqlsp-path: "coq-lsp"
-  coqlsp-version: "0.1.8+8.19"
+  coqlsp-version: "0.1.9+8.19"
   artifact-name: ubuntu-latest-build
 
 jobs:
@@ -51,7 +51,7 @@ jobs:
         restore-keys: opam-${{ matrix.os }}-${{ matrix.ocaml-compiler }}-
 
     - name: Set-up OCaml ${{ matrix.ocaml-compiler }}
-      uses: ocaml/setup-ocaml@v2
+      uses: ocaml/setup-ocaml@v3.0.10
       with:
         ocaml-compiler: ${{ matrix.ocaml-compiler }}
         dune-cache: true
@@ -60,7 +60,7 @@ jobs:
       env:
           OPAMYES: true
       run: |
-        opam install coq-lsp.0.1.8+8.19
+        opam install coq-lsp.0.1.9+8.19
         eval $(opam env)
 
     - name: Install Node.js

diff --git a/.gitignore b/.gitignore
@@ -18,9 +18,15 @@ src/test/resources/coqProj/Makefile.coq
 src/test/resources/coqProj/Makefile.coq.conf
 src/test/resources/**/.vscode
 
-# Ignore the generated build files in datatests inside benchmarks
+## Benchmarking files
+
+# Ignore generated build files inside dataset
 dataset/**/result
 dataset/**/.vscode/
 
-# Ignore private files used while benchmarking
-src/test/benchmark/benchmarkPrivate/
+# Ignore logs & cache
+benchmarkLogs/
+.cache/
+
+# Ignore private files (outdated)
+src/test/benchmark/benchmarkPrivate/
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,20 @@
 # Changelog
 
+## 2.3.0
+
+**A major upgrade of the benchmarking system**: At the moment, only a little **new** functionality is provided; moreover, the ability to run benchmarks on Tactician/CoqHammer is temporarily unavailable. However, it will soon be restored and improved. Extensive work has been done to make the benchmarking system more flexible, secure, robust, self-contained, and easy to use. Experiments via our benchmarking framework have been made more accessible than ever. The configurability and reliability of the pipeline have been improved drastically. In a nutshell, the main features of the improved benchmarking system include:
+- Flexible DSL for input setup
+- Extensive configuration for fine-grained experiments
+- Support for single and multi-workspace runs
+- Dataset caching
+- Comprehensive logging
+- Fail-fast strategy
+- Additional metrics (tokens used, context theorems, proof generation stats)
+
+An extensive benchmarking guide is now available in [BENCHMARKING_FRAMEWORK_GUIDE.md](etc/docs/benchmark/BENCHMARKING_FRAMEWORK_GUIDE.md).
+
+We are looking forward to your feedback and suggestions for further improvements/new features.
+
 ## 2.2.7
 
 ### Internal changes: 

diff --git a/README.md b/README.md
@@ -28,7 +28,7 @@
 
 ## Requirements
 
-* `coq-lsp` version `0.1.8+8.19.0` is currently required to run the extension.
+* `coq-lsp` version `0.1.9+8.19` is currently required to run the extension.
 
 ## Brief technical overview
 
@@ -42,7 +42,7 @@ For each service, an array of models could be defined through the settings. Each
 
 When `CoqPilot` completion command is issued, it parses the currently opened file, extracts theorems that have complete proofs and processes them into a message history for the LLM. It helps LLM to keep the style and hallucinate less. 
 
-For each `admit.` present in the file, an independent completion process is issued. If a valid proof is found, it is substituted in the editor. `CoqPilot` also allows a multi-round fixing procedure for the proofs from the LLM. I.e. if the proof was incorrect, compiler message could be automatically sent to the LLM with a request to repair it. It can now be configured in the settings. One can set the amount of attempts for the consequtive proof fixing with compiler feedback.
+For each `admit.` present in the file, an independent completion process is issued. If a valid proof is found, it is substituted in the editor. `CoqPilot` also allows a multi-round fixing procedure for the proofs from the LLM. I.e. if the proof was incorrect, compiler message could be automatically sent to the LLM with a request to repair it. It can now be configured in the settings. One can set the number of attempts for the consecutive proof fixing with compiler feedback.
 
 As soon as at least one valid proof is found, it is substituted in the editor and the process is finished.
 
@@ -66,7 +66,7 @@ As soon as at least one valid proof is found, it is substituted in the editor an
 
 To run the extension, you must install a `coq-lsp` server. Depending on the system used in your project, you should install it using `opam` or `nix`. A well-configured `nix` project should have the `coq-lsp` server installed as a dependency. To install `coq-lsp` using `opam`, you can use the following commands: 
 ```bash
-opam pin add coq-lsp 0.1.8+8.19.0
+opam pin add coq-lsp 0.1.9+8.19
 opam install coq-lsp
 ```
 For more information on how to install `coq-lsp` please refer to [coq-lsp](https://github.com/ejgallego/coq-lsp). 
@@ -213,7 +213,16 @@ git submodule update
 ```
 After that, you need to build the projects. Be careful, the actively maintained way to build these projects is `nix`. Moreover, when adding your own projects, make sure that they are built using `coq-8.19.0`.
 
-First things first, the process of running the benchmark is not perfectly automated yet. We are working on it. For now, one project (one unit containing nix environment) shall be ran at a time. Let's say you are going to run the benchmark on the `imm` project. You will have to do the following: 
+### New framework (beta)
+
+The new benchmarking framework with extended capabilities is now available. 
+However, it is still in the testing phase, so some bugs and missing features may be present.
+
+To use it, follow the instructions in the [`BENCHMARKING_FRAMEWORK_GUIDE.md`](etc/docs/benchmark/BENCHMARKING_FRAMEWORK_GUIDE.md).
+
+### Legacy framework
+
+The process of running the benchmark is not perfectly automated and we are working on it. For now, one project (one unit containing nix environment) shall be run at a time. Let's say you are going to run the benchmark on the `imm` project. You will have to do the following: 
 
 <!-- 0. Go to the `imm` subdirectory and add a `_CoqProject` file in the root with the following: 
     ```

diff --git a/dataset/imm b/dataset/imm
diff --git a/dataset/auto_benchmark.v → .../standalone-source-files/auto_benchmark.v b/dataset/auto_benchmark.v → .../standalone-source-files/auto_benchmark.v
diff --git a/dataset/mixed_benchmark.v → ...standalone-source-files/mixed_benchmark.v b/dataset/mixed_benchmark.v → ...standalone-source-files/mixed_benchmark.v
diff --git a/dataset/teamCityExampleInput/items/0-auto-benchmark-v-test.json b/dataset/teamCityExampleInput/items/0-auto-benchmark-v-test.json
@@ -0,0 +1,23 @@
+{
+  "task": {
+    "goalToProve": "{\"info\":{\"evar\":[\"Ser_Evar\",1],\"name\":null},\"hyps\":[],\"ty\":\"forall (A : Type) (P : A -> Prop) (x : A), P x -> P x\"}",
+    "positionRange": {
+      "start": {
+        "line": 2,
+        "character": 4
+      },
+      "end": {
+        "line": 2,
+        "character": 10
+      }
+    },
+    "targetType": "ADMIT",
+    "relativeSourceFilePath": "auto_benchmark.v",
+    "sourceTheoremName": "test",
+    "relativeWorkspacePath": "standalone-source-files"
+  },
+  "targetModelIds": [
+    "invalid-proof",
+    "prove-with-auto"
+  ]
+}
diff --git a/dataset/teamCityExampleInput/items/1-auto-benchmark-v-test-thr.json b/dataset/teamCityExampleInput/items/1-auto-benchmark-v-test-thr.json
@@ -0,0 +1,23 @@
+{
+  "task": {
+    "goalToProve": "{\"info\":{\"evar\":[\"Ser_Evar\",36],\"name\":null},\"hyps\":[{\"names\":[\"n\"],\"def\":null,\"ty\":\"nat\"}],\"ty\":\"0 + n = n\"}",
+    "positionRange": {
+      "start": {
+        "line": 35,
+        "character": 4
+      },
+      "end": {
+        "line": 35,
+        "character": 10
+      }
+    },
+    "targetType": "ADMIT",
+    "relativeSourceFilePath": "auto_benchmark.v",
+    "sourceTheoremName": "test_thr",
+    "relativeWorkspacePath": "standalone-source-files"
+  },
+  "targetModelIds": [
+    "invalid-proof",
+    "prove-with-auto"
+  ]
+}
diff --git a/dataset/teamCityExampleInput/items/2-mixed-benchmark-v-add-comm.json b/dataset/teamCityExampleInput/items/2-mixed-benchmark-v-add-comm.json
@@ -0,0 +1,23 @@
+{
+  "task": {
+    "goalToProve": "{\"info\":{\"evar\":[\"Ser_Evar\",7],\"name\":null},\"hyps\":[],\"ty\":\"forall n m : nat, n + m = m + n\"}",
+    "positionRange": {
+      "start": {
+        "line": 12,
+        "character": 4
+      },
+      "end": {
+        "line": 12,
+        "character": 10
+      }
+    },
+    "targetType": "ADMIT",
+    "relativeSourceFilePath": "mixed_benchmark.v",
+    "sourceTheoremName": "add_comm",
+    "relativeWorkspacePath": "standalone-source-files"
+  },
+  "targetModelIds": [
+    "invalid-proof",
+    "prove-with-auto"
+  ]
+}
diff --git a/dataset/teamCityExampleInput/models/invalid-proof.json b/dataset/teamCityExampleInput/models/invalid-proof.json
@@ -0,0 +1,8 @@
+{
+  "ranker": "random",
+  "modelId": "invalid-proof",
+  "tactics": [
+    "a."
+  ],
+  "llmServiceIdentifier": 0
+}
diff --git a/dataset/teamCityExampleInput/models/prove-with-auto.json b/dataset/teamCityExampleInput/models/prove-with-auto.json
@@ -0,0 +1,8 @@
+{
+  "ranker": "random",
+  "modelId": "prove-with-auto",
+  "tactics": [
+    "auto."
+  ],
+  "llmServiceIdentifier": 0
+}
diff --git a/dataset/teamCityExampleInput/projects/standalone-source-files.json b/dataset/teamCityExampleInput/projects/standalone-source-files.json
@@ -0,0 +1,4 @@
+{
+  "relativeDirectoryPath": "standalone-source-files",
+  "requiresNixEnvironment": false
+}
+52 −0		.github/workflows/nix-action-8.18.yml
+52 −0		.github/workflows/nix-action-8.19.yml
+9 −7		.nix/config.nix
+1 −1		src/imm/SubExecution.v
+32 −13		src/travorder/TraversalOrder.v