GenBench · drndr · Jul 17, 2023 · Jul 19, 2023 · Jul 25, 2023 · Jul 25, 2023
diff --git a/src/genbench/tasks/nl_codesearch_clf/GenBench Evaluation Card.pdf b/src/genbench/tasks/nl_codesearch_clf/GenBench Evaluation Card.pdf
diff --git a/src/genbench/tasks/nl_codesearch_clf/__init__.py b/src/genbench/tasks/nl_codesearch_clf/__init__.py
@@ -0,0 +1,5 @@
+from genbench import TaskDict
+
+
+class NlCodesearchClf(TaskDict):
+    """Task dictionary for the `nl_codesearch_clf` task; adds nothing on top of `TaskDict`."""
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/__init__.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/__init__.py
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/config.jsonnet b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/config.jsonnet
@@ -0,0 +1,58 @@
+{
+    name: 'Natural Language Codesearch Classification (codesearchnet_adv)',
+
+    description: 'Natural Language Codesearch Classification (codesearchnet_adv) aims to measure the generalization capabilities of language models in code understanding. This subtask measures robustness against covariate shifts.',
+
+    keywords: [
+        'codesearch',
+        'natural language query',
+        'binary classification',
+        'python',
+        'robustness',
+        'covariate shift',
+    ],
+
+    authors: [
+        'Andor Diera',
+        'Abdelhalim Dahou',
+		'Lukas Galke',
+		'Fabian Karl',
+        'Florian Sihler',
+		'Ansgar Scherp',
+    ],
+
+    data_source: {
+        type: 'manual',
+        test: 'https://zenodo.org/record/8310891/files/test_adv.jsonl',
+        train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+    },
+
+	has_validation_set: false,
+    has_train_set: true,
+
+    task_type: 'multiple_choice',
+
+    evaluation_metrics: [
+        {
+            hf_id: 'accuracy',
+            git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+            best_score: 1.0,
+        },
+    ],
+
+    preparation_strategies: {
+        finetuning: {
+            objective: 'maximum_likelihood',
+        },
+
+        prompt_based_testing: {
+            prompt_builder: {
+                instruction_zero_shot: 'Given a code comment and a Python programming language code snippet, determine if the comment accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. The input format is defined as comment [CODESPLIT] code',
+                input_prefix: '',
+                output_prefix: '',
+                choices_prefix: '',
+                append_choices_to_input: false,
+            }
+        },
+    },
+}
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/doc.md b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Classification (codesearchnet_adv)
+
+## Abstract
+*Copy the abstract of the paper accompanying Natural Language Codesearch Classification (codesearchnet_adv) here.*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Classification (codesearchnet_adv).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Classification (codesearchnet_adv).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Classification (codesearchnet_adv) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/task.py
@@ -0,0 +1,46 @@
+import random
+from typing import Dict
+
+import datasets
+
+from genbench import Task
+
+
+class NlCodesearchClfCodesearchnetAdv(Task):
+    def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
+        """Create the dataset, adding a negative sample for each code comment/query.
+
+        For every comment-code pair in the "test" and "train" splits the original
+        pair is kept and a distractor is appended right after it. The distractor
+        combines the same comment with the code of a different, randomly chosen
+        pair of the same split and is labelled with target 0.
+
+        Returns:
+            A dictionary containing key-value pairs for the raw datasets.
+            The keys are strings representing the name of the dataset split
+            (e.g., "train", "validation", "test") and the values are
+            HuggingFace `datasets.Dataset` objects. The "test" and "train" splits
+            contain the original pairs and their distractors; any other split is
+            returned unchanged.
+
+        Raises:
+            ValueError: If a processed split holds no other pair to draw a distractor from.
+            IndexError: If the input of a sampled pair does not contain the "[CODESPLIT]" separator.
+        """
+        separator = "[CODESPLIT]"
+        # Load the raw datasets
+        raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+        output: Dict[str, datasets.Dataset] = {}
+        # Set random seed for consistency (this reseeds the global `random` module state)
+        random.seed(42)
+        for split, dataset in raw_datasets.items():
+            if split in ("test", "train"):
+                # Materialise the rows once: the candidate search below would otherwise
+                # re-read the whole dataset for every single item. Row order is kept, so
+                # the seeded sampling picks the same distractors as before.
+                rows = list(dataset)
+                new_dataset = datasets.Dataset.from_dict({})
+                for item in rows:
+                    # Add comment-code pair to new dataset
+                    new_dataset = new_dataset.add_item(item)
+                    # NOTE: still one pass over the rows per item (quadratic in the split size)
+                    other_items = [other_item for other_item in rows if other_item != item]
+                    if not other_items:
+                        raise ValueError(f"Split '{split}' has no other item to build a negative sample from.")
+                    # Randomly select other item
+                    (random_item,) = random.sample(other_items, 1)
+                    # Split at the first separator only, so code that itself contains
+                    # the separator text is not cut short
+                    comment = item["input"].split(separator, 1)[0]
+                    random_code = random_item["input"].split(separator, 1)[1]
+                    # Combine the comment of the original item with the code of the random item
+                    new_input = comment + separator + random_code
+                    new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+                    # Add negative sample comment-code pair to new dataset
+                    new_dataset = new_dataset.add_item(new_item)
+                output[split] = new_dataset
+            else:
+                output[split] = dataset
+        return output
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/__init__.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/__init__.py
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/config.jsonnet b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/config.jsonnet
@@ -0,0 +1,56 @@
+{
+    name: 'Natural Language Codesearch Classification (codesearchnet_go)',
+
+    description: 'Natural Language Codesearch Classification (codesearchnet_go) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization.',
+
+    keywords: [
+        'codesearch',
+        'natural language query',
+        'binary classification',
+        'go',
+        'cross-lingual'
+    ],
+
+    authors: [
+        'Andor Diera',
+        'Abdelhalim Dahou',
+		'Lukas Galke',
+		'Fabian Karl',
+        'Florian Sihler',
+		'Ansgar Scherp',
+    ],
+
+    data_source: {
+        type: 'manual',
+        test: 'https://zenodo.org/record/8310891/files/test_go.jsonl',
+        train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+    },
+
+    has_validation_set: false,
+    has_train_set: true,
+
+    task_type: 'multiple_choice',
+
+    evaluation_metrics: [
+        {
+            hf_id: 'accuracy',
+            git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+            best_score: 1.0,
+	    },
+    ],
+
+    preparation_strategies: {
+        finetuning: {
+            objective: 'maximum_likelihood',
+        },
+        prompt_based_testing: {
+            prompt_builder: {
+                instruction_zero_shot: 'Given a code comment and a Go programming language code snippet, determine if the comment accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. The input format is defined as comment [CODESPLIT] code',
+                input_prefix: '',
+                output_prefix: '',
+                choices_prefix: '',
+                append_choices_to_input: false,
+            }
+        },
+    },
+}
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/doc.md b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Classification (codesearchnet_go)
+
+## Abstract
+*Copy the abstract of the paper accompanying Natural Language Codesearch Classification (codesearchnet_go) here.*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Classification (codesearchnet_go).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Classification (codesearchnet_go).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Classification (codesearchnet_go) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/task.py
@@ -0,0 +1,46 @@
+import random
+from typing import Dict
+
+import datasets
+
+from genbench import Task
+
+
+class NlCodesearchClfCodesearchnetGo(Task):
+    def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
+        """Create the dataset, adding a negative sample for each code comment/query.
+
+        For every comment-code pair in the "test" split the original pair is kept
+        and a distractor is appended right after it. The distractor combines the
+        same comment with the code of a different, randomly chosen test pair and
+        is labelled with target 0.
+
+        Returns:
+            A dictionary containing key-value pairs for the raw datasets.
+            The keys are strings representing the name of the dataset split
+            (e.g., "train", "validation", "test") and the values are
+            HuggingFace `datasets.Dataset` objects. The "test" split contains the
+            original pairs and their distractors; every other split (including
+            "train") is returned unchanged.
+
+        Raises:
+            ValueError: If the test split holds no other pair to draw a distractor from.
+            IndexError: If the input of a sampled pair does not contain the "[CODESPLIT]" separator.
+        """
+        separator = "[CODESPLIT]"
+        # Load the raw datasets
+        raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+        output: Dict[str, datasets.Dataset] = {}
+        # Set random seed for consistency (this reseeds the global `random` module state)
+        random.seed(42)
+        for split, dataset in raw_datasets.items():
+            if split == "test":
+                # Materialise the rows once: the candidate search below would otherwise
+                # re-read the whole dataset for every single item. Row order is kept, so
+                # the seeded sampling picks the same distractors as before.
+                rows = list(dataset)
+                new_dataset = datasets.Dataset.from_dict({})
+                for item in rows:
+                    # Add comment-code pair to new dataset
+                    new_dataset = new_dataset.add_item(item)
+                    # NOTE: still one pass over the rows per item (quadratic in the split size)
+                    other_items = [other_item for other_item in rows if other_item != item]
+                    if not other_items:
+                        raise ValueError(f"Split '{split}' has no other item to build a negative sample from.")
+                    # Randomly select other item
+                    (random_item,) = random.sample(other_items, 1)
+                    # Split at the first separator only, so code that itself contains
+                    # the separator text is not cut short
+                    comment = item["input"].split(separator, 1)[0]
+                    random_code = random_item["input"].split(separator, 1)[1]
+                    # Combine the comment of the original item with the code of the random item
+                    new_input = comment + separator + random_code
+                    new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+                    # Add negative sample comment-code pair to new dataset
+                    new_dataset = new_dataset.add_item(new_item)
+                output[split] = new_dataset
+            else:
+                output[split] = dataset
+        return output
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/__init__.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/__init__.py
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/config.jsonnet b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/config.jsonnet
@@ -0,0 +1,56 @@
+{
+    name: 'Natural Language Codesearch Classification (codesearchnet_java)',
+
+    description: 'Natural Language Codesearch Classification (codesearchnet_java) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization.',
+
+    keywords: [
+        'codesearch',
+        'natural language query',
+        'binary classification',
+        'java',
+        'cross-lingual'
+    ],
+
+    authors: [
+        'Andor Diera',
+        'Abdelhalim Dahou',
+		'Lukas Galke',
+		'Fabian Karl',
+        'Florian Sihler',
+		'Ansgar Scherp',
+    ],
+
+    data_source: {
+        type: 'manual',
+        test: 'https://zenodo.org/record/8310891/files/test_java.jsonl',
+        train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
+    },
+
+    has_validation_set: false,
+    has_train_set: true,
+
+    task_type: 'multiple_choice',
+
+    evaluation_metrics: [
+        {
+            hf_id: 'accuracy',
+            git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
+            best_score: 1.0,
+        },
+    ],
+
+    preparation_strategies: {
+        finetuning: {
+            objective: 'maximum_likelihood',
+        },
+        prompt_based_testing: {
+            prompt_builder: {
+                instruction_zero_shot: 'Given a code comment and a Java programming language code snippet, determine if the comment accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. The input format is defined as comment [CODESPLIT] code',
+                input_prefix: '',
+                output_prefix: '',
+                choices_prefix: '',
+                append_choices_to_input: false,
+            }
+        },
+    },
+}
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/doc.md b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/doc.md
@@ -0,0 +1,19 @@
+# Natural Language Codesearch Classification (codesearchnet_java)
+
+## Abstract
+*Copy the abstract of the paper accompanying Natural Language Codesearch Classification (codesearchnet_java) here.*
+
+## Examples
+*Give some examples of the Natural Language Codesearch Classification (codesearchnet_java).*
+
+## Usage
+*Describe how to load your task and what is required for evaluation, if anything.*
+
+## Data Source
+*Describe the data source for this Natural Language Codesearch Classification (codesearchnet_java).*
+
+## Limitations and Bias
+*Note any known limitations or biases that the Natural Language Codesearch Classification (codesearchnet_java) has, with links and references if possible.*
+
+## GenBench Eval card
+*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/task.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/task.py
@@ -0,0 +1,46 @@
+import random
+from typing import Dict
+
+import datasets
+
+from genbench import Task
+
+
+class NlCodesearchClfCodesearchnetJava(Task):
+    def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
+        """Create the dataset, adding a negative sample for each code comment/query.
+
+        For every comment-code pair in the "test" split the original pair is kept
+        and a distractor is appended right after it. The distractor combines the
+        same comment with the code of a different, randomly chosen test pair and
+        is labelled with target 0.
+
+        Returns:
+            A dictionary containing key-value pairs for the raw datasets.
+            The keys are strings representing the name of the dataset split
+            (e.g., "train", "validation", "test") and the values are
+            HuggingFace `datasets.Dataset` objects. The "test" split contains the
+            original pairs and their distractors; every other split (including
+            "train") is returned unchanged.
+
+        Raises:
+            ValueError: If the test split holds no other pair to draw a distractor from.
+            IndexError: If the input of a sampled pair does not contain the "[CODESPLIT]" separator.
+        """
+        separator = "[CODESPLIT]"
+        # Load the raw datasets
+        raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
+        output: Dict[str, datasets.Dataset] = {}
+        # Set random seed for consistency (this reseeds the global `random` module state)
+        random.seed(42)
+        for split, dataset in raw_datasets.items():
+            if split == "test":
+                # Materialise the rows once: the candidate search below would otherwise
+                # re-read the whole dataset for every single item. Row order is kept, so
+                # the seeded sampling picks the same distractors as before.
+                rows = list(dataset)
+                new_dataset = datasets.Dataset.from_dict({})
+                for item in rows:
+                    # Add comment-code pair to new dataset
+                    new_dataset = new_dataset.add_item(item)
+                    # NOTE: still one pass over the rows per item (quadratic in the split size)
+                    other_items = [other_item for other_item in rows if other_item != item]
+                    if not other_items:
+                        raise ValueError(f"Split '{split}' has no other item to build a negative sample from.")
+                    # Randomly select other item
+                    (random_item,) = random.sample(other_items, 1)
+                    # Split at the first separator only, so code that itself contains
+                    # the separator text is not cut short
+                    comment = item["input"].split(separator, 1)[0]
+                    random_code = random_item["input"].split(separator, 1)[1]
+                    # Combine the comment of the original item with the code of the random item
+                    new_input = comment + separator + random_code
+                    new_item = {"input": new_input, "target": 0, "target_options": item["target_options"]}
+                    # Add negative sample comment-code pair to new dataset
+                    new_dataset = new_dataset.add_item(new_item)
+                output[split] = new_dataset
+            else:
+                output[split] = dataset
+        return output
diff --git a/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/__init__.py b/src/genbench/tasks/nl_codesearch_clf/codesearchnet_javascript/__init__.py