Skip to content
This repository has been archived by the owner on Jul 23, 2024. It is now read-only.

[Task Submission] Natural Language Codesearch Classification (nl_codesearch_clf ) #16

Closed
wants to merge 25 commits into from
Closed
Show file tree
Hide file tree
Changes from 24 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
5 changes: 5 additions & 0 deletions src/genbench/tasks/nl_codesearch_clf/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from genbench import TaskDict


class NlCodesearchClf(TaskDict):
    """Task dictionary grouping the nl_codesearch_clf subtasks.

    Adds nothing on top of `TaskDict`; all behaviour is inherited.
    """
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
{
name: 'Natural Language Codesearch Classification (codesearchnet_adv)',

description: 'Natural Language Codesearch Classification (codesearchnet_adv) aims to measure the generalization capabilities of language models in code understanding. This subtask measures robustness against covariate shifts.',

keywords: [
'codesearch',
'natural language query',
'binary classification',
'python',
'robustness',
'covariate shift',
],

authors: [
'Andor Diera',
'Abdelhalim Dahou',
'Lukas Galke',
'Fabian Karl',
'Florian Sihler',
'Ansgar Scherp',
],

data_source: {
type: 'manual',
test: 'https://zenodo.org/record/8310891/files/test_adv.jsonl',
train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
},

has_validation_set: false,
has_train_set: true,

task_type: 'multiple_choice',

evaluation_metrics: [
{
hf_id: 'accuracy',
git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
best_score: 1.0,
},
],

preparation_strategies: {
finetuning: {
objective: 'maximum_likelihood',
},

prompt_based_testing: {
prompt_builder: {
instruction_zero_shot: 'Given a code comment and a Python programming language code snippet, determine if the comment accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. The input format is defined as comment [CODESPLIT] code',
input_prefix: '',
output_prefix: '',
choices_prefix: '',
append_choices_to_input: false,
}
},
},
}
19 changes: 19 additions & 0 deletions src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/doc.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Natural Language Codesearch Classification (codesearchnet_adv)

## Abstract
*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Classification (codesearchnet_adv).*

## Examples
*Give some examples of the Natural Language Codesearch Classification (codesearchnet_adv).*

## Usage
*Describe how to load your task and what is required for evaluation, if anything.*

## Data Source
*Describe the data source for this Natural Language Codesearch Classification (codesearchnet_adv).*

## Limitations and Bias
*Note any known limitations or biases that the Natural Language Codesearch Classification (codesearchnet_adv) has, with links and references if possible.*

## GenBench Eval card
*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
46 changes: 46 additions & 0 deletions src/genbench/tasks/nl_codesearch_clf/codesearchnet_adv/task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import random
from typing import Dict

import datasets

from genbench import Task


_CODE_SEPARATOR = "[CODESPLIT]"


def _row_key(row: dict) -> str:
    """Return a hashable stand-in for a dataset row so that equal rows can be grouped."""
    # NOTE(review): assumes rows hold plain JSON-like values (str / int / list)
    # so that equal rows always produce equal reprs -- confirm against the data files.
    return repr(sorted(row.items()))


def _add_negative_samples(dataset: datasets.Dataset) -> datasets.Dataset:
    """Return a copy of `dataset` with one mismatched comment/code pair after every row.

    For each row, the comment (text before the separator) is paired with the
    code of a randomly chosen row that is not equal to it, and labelled 0.

    Raises:
        ValueError: if some row has no differing row to draw a distractor from
            (e.g. a single-row split).
    """
    # Materialise once: re-iterating the HF dataset for every row is quadratic.
    rows = list(dataset)
    keys = [_row_key(row) for row in rows]

    # Positions (ascending) of all rows sharing the same content, so that
    # duplicates of the current row are never used as its distractor.
    positions_by_key: Dict[str, list] = {}
    for position, key in enumerate(keys):
        positions_by_key.setdefault(key, []).append(position)

    augmented = []
    for row, key in zip(rows, keys):
        # Keep the original comment-code pair.
        augmented.append(row)

        same_as_row = positions_by_key[key]
        candidate_count = len(rows) - len(same_as_row)
        if candidate_count == 0:
            raise ValueError("Cannot draw a negative sample: the split has no row that differs from the current one.")

        # Sample a position within the (virtual) list of differing rows.
        # Same call and population size as sampling the filtered list itself,
        # which is intended to keep the seeded draw sequence unchanged.
        picked = random.sample(range(candidate_count), 1)[0]
        # Translate that position back to an index into `rows` by stepping
        # over every excluded row that sits at or before it.
        for position in same_as_row:
            if position > picked:
                break
            picked += 1

        comment = row["input"].split(_CODE_SEPARATOR)[0]
        other_code = rows[picked]["input"].split(_CODE_SEPARATOR)[1]
        augmented.append(
            {
                "input": comment + _CODE_SEPARATOR + other_code,
                "target": 0,
                "target_options": row["target_options"],
            }
        )

    # Build the dataset in one go instead of one add_item() call per row;
    # columns missing from the synthetic rows are filled with None.
    columns = list(augmented[0]) if augmented else []
    return datasets.Dataset.from_dict({column: [entry.get(column) for entry in augmented] for column in columns})


class NlCodesearchClfCodesearchnetAdv(Task):
    def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
        """Create the dataset adding a negative sample for each code comment/query

        Returns:
            A dictionary containing key-value pairs for the raw datasets.
            The keys are strings representing the name of the dataset split
            (e.g., "train", "validation", "test") and the values are
            HuggingFace `datasets.Dataset` objects.
            The "test" and "train" splits contain every original pair followed
            by one distractor pair (target 0); any other split is returned unchanged.

        Raises:
            ValueError: if an augmented split has a row with no differing row
                to build a distractor from.
        """
        # Load the raw datasets
        raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
        output: Dict[str, datasets.Dataset] = {}
        # Set random seed for consistency
        random.seed(42)
        for split, dataset in raw_datasets.items():
            if split in ("test", "train"):
                output[split] = _add_negative_samples(dataset)
            else:
                output[split] = dataset
        return output
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
{
name: 'Natural Language Codesearch Classification (codesearchnet_go)',

description: 'Natural Language Codesearch Classification (codesearchnet_go) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization.',

keywords: [
'codesearch',
'natural language query',
'binary classification',
'go',
'cross-lingual'
],

authors: [
'Andor Diera',
'Abdelhalim Dahou',
'Lukas Galke',
'Fabian Karl',
'Florian Sihler',
'Ansgar Scherp',
],

data_source: {
type: 'manual',
test: 'https://zenodo.org/record/8310891/files/test_go.jsonl',
train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
},

has_validation_set: false,
has_train_set: true,

task_type: 'multiple_choice',

evaluation_metrics: [
{
hf_id: 'accuracy',
git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
best_score: 1.0,
},
],

preparation_strategies: {
finetuning: {
objective: 'maximum_likelihood',
},
prompt_based_testing: {
prompt_builder: {
instruction_zero_shot: 'Given a code comment and a Go programming language code snippet, determine if the comment accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. The input format is defined as comment [CODESPLIT] code',
input_prefix: '',
output_prefix: '',
choices_prefix: '',
append_choices_to_input: false,
}
},
},
}
19 changes: 19 additions & 0 deletions src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/doc.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Natural Language Codesearch Classification (codesearchnet_go)

## Abstract
*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Classification (codesearchnet_go).*

## Examples
*Give some examples of the Natural Language Codesearch Classification (codesearchnet_go).*

## Usage
*Describe how to load your task and what is required for evaluation, if anything.*

## Data Source
*Describe the data source for this Natural Language Codesearch Classification (codesearchnet_go).*

## Limitations and Bias
*Note any known limitations or biases that the Natural Language Codesearch Classification (codesearchnet_go) has, with links and references if possible.*

## GenBench Eval card
*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
46 changes: 46 additions & 0 deletions src/genbench/tasks/nl_codesearch_clf/codesearchnet_go/task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import random
from typing import Dict

import datasets

from genbench import Task


_CODE_SEPARATOR = "[CODESPLIT]"


def _row_key(row: dict) -> str:
    """Return a hashable stand-in for a dataset row so that equal rows can be grouped."""
    # NOTE(review): assumes rows hold plain JSON-like values (str / int / list)
    # so that equal rows always produce equal reprs -- confirm against the data files.
    return repr(sorted(row.items()))


def _add_negative_samples(dataset: datasets.Dataset) -> datasets.Dataset:
    """Return a copy of `dataset` with one mismatched comment/code pair after every row.

    For each row, the comment (text before the separator) is paired with the
    code of a randomly chosen row that is not equal to it, and labelled 0.

    Raises:
        ValueError: if some row has no differing row to draw a distractor from
            (e.g. a single-row split).
    """
    # Materialise once: re-iterating the HF dataset for every row is quadratic.
    rows = list(dataset)
    keys = [_row_key(row) for row in rows]

    # Positions (ascending) of all rows sharing the same content, so that
    # duplicates of the current row are never used as its distractor.
    positions_by_key: Dict[str, list] = {}
    for position, key in enumerate(keys):
        positions_by_key.setdefault(key, []).append(position)

    augmented = []
    for row, key in zip(rows, keys):
        # Keep the original comment-code pair.
        augmented.append(row)

        same_as_row = positions_by_key[key]
        candidate_count = len(rows) - len(same_as_row)
        if candidate_count == 0:
            raise ValueError("Cannot draw a negative sample: the split has no row that differs from the current one.")

        # Sample a position within the (virtual) list of differing rows.
        # Same call and population size as sampling the filtered list itself,
        # which is intended to keep the seeded draw sequence unchanged.
        picked = random.sample(range(candidate_count), 1)[0]
        # Translate that position back to an index into `rows` by stepping
        # over every excluded row that sits at or before it.
        for position in same_as_row:
            if position > picked:
                break
            picked += 1

        comment = row["input"].split(_CODE_SEPARATOR)[0]
        other_code = rows[picked]["input"].split(_CODE_SEPARATOR)[1]
        augmented.append(
            {
                "input": comment + _CODE_SEPARATOR + other_code,
                "target": 0,
                "target_options": row["target_options"],
            }
        )

    # Build the dataset in one go instead of one add_item() call per row;
    # columns missing from the synthetic rows are filled with None.
    columns = list(augmented[0]) if augmented else []
    return datasets.Dataset.from_dict({column: [entry.get(column) for entry in augmented] for column in columns})


class NlCodesearchClfCodesearchnetGo(Task):
    def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
        """Create the dataset adding a negative sample for each code comment/query

        Returns:
            A dictionary containing key-value pairs for the raw datasets.
            The keys are strings representing the name of the dataset split
            (e.g., "train", "validation", "test") and the values are
            HuggingFace `datasets.Dataset` objects.
            The "test" split contains every original pair followed by one
            distractor pair (target 0); every other split, including "train",
            is returned unchanged.

        Raises:
            ValueError: if the test split has a row with no differing row
                to build a distractor from.
        """
        # Load the raw datasets
        raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
        output: Dict[str, datasets.Dataset] = {}
        # Set random seed for consistency
        random.seed(42)
        for split, dataset in raw_datasets.items():
            if split == "test":
                output[split] = _add_negative_samples(dataset)
            else:
                # NOTE(review): the train split is passed through without
                # negative samples -- confirm this is intended for fine-tuning.
                output[split] = dataset
        return output
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
{
name: 'Natural Language Codesearch Classification (codesearchnet_java)',

description: 'Natural Language Codesearch Classification (codesearchnet_java) aims to measure the generalization capabilities of language models in code understanding. This subtask measures cross-lingual generalization.',

keywords: [
'codesearch',
'natural language query',
'binary classification',
'java',
'cross-lingual'
],

authors: [
'Andor Diera',
'Abdelhalim Dahou',
'Lukas Galke',
'Fabian Karl',
'Florian Sihler',
'Ansgar Scherp',
],

data_source: {
type: 'manual',
test: 'https://zenodo.org/record/8310891/files/test_java.jsonl',
train:'https://zenodo.org/record/8310891/files/train_adv.jsonl',
},

has_validation_set: false,
has_train_set: true,

task_type: 'multiple_choice',

evaluation_metrics: [
{
hf_id: 'accuracy',
git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
best_score: 1.0,
},
],

preparation_strategies: {
finetuning: {
objective: 'maximum_likelihood',
},
prompt_based_testing: {
prompt_builder: {
instruction_zero_shot: 'Given a code comment and a Java programming language code snippet, determine if the comment accurately represents the function of the code. Respond with True if the code matches the comment and False if it does not. The input format is defined as comment [CODESPLIT] code',
input_prefix: '',
output_prefix: '',
choices_prefix: '',
append_choices_to_input: false,
}
},
},
}
19 changes: 19 additions & 0 deletions src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/doc.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Natural Language Codesearch Classification (codesearchnet_java)

## Abstract
*Copy the abstract of your accompanying paper for this task here Natural Language Codesearch Classification (codesearchnet_java).*

## Examples
*Give some examples of the Natural Language Codesearch Classification (codesearchnet_java).*

## Usage
*Describe how to load your task and what is required for evaluation, if anything.*

## Data Source
*Describe the data source for this Natural Language Codesearch Classification (codesearchnet_java).*

## Limitations and Bias
*Note any known limitations or biases that the Natural Language Codesearch Classification (codesearchnet_java) has, with links and references if possible.*

## GenBench Eval card
*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
46 changes: 46 additions & 0 deletions src/genbench/tasks/nl_codesearch_clf/codesearchnet_java/task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import random
from typing import Dict

import datasets

from genbench import Task


_CODE_SEPARATOR = "[CODESPLIT]"


def _row_key(row: dict) -> str:
    """Return a hashable stand-in for a dataset row so that equal rows can be grouped."""
    # NOTE(review): assumes rows hold plain JSON-like values (str / int / list)
    # so that equal rows always produce equal reprs -- confirm against the data files.
    return repr(sorted(row.items()))


def _add_negative_samples(dataset: datasets.Dataset) -> datasets.Dataset:
    """Return a copy of `dataset` with one mismatched comment/code pair after every row.

    For each row, the comment (text before the separator) is paired with the
    code of a randomly chosen row that is not equal to it, and labelled 0.

    Raises:
        ValueError: if some row has no differing row to draw a distractor from
            (e.g. a single-row split).
    """
    # Materialise once: re-iterating the HF dataset for every row is quadratic.
    rows = list(dataset)
    keys = [_row_key(row) for row in rows]

    # Positions (ascending) of all rows sharing the same content, so that
    # duplicates of the current row are never used as its distractor.
    positions_by_key: Dict[str, list] = {}
    for position, key in enumerate(keys):
        positions_by_key.setdefault(key, []).append(position)

    augmented = []
    for row, key in zip(rows, keys):
        # Keep the original comment-code pair.
        augmented.append(row)

        same_as_row = positions_by_key[key]
        candidate_count = len(rows) - len(same_as_row)
        if candidate_count == 0:
            raise ValueError("Cannot draw a negative sample: the split has no row that differs from the current one.")

        # Sample a position within the (virtual) list of differing rows.
        # Same call and population size as sampling the filtered list itself,
        # which is intended to keep the seeded draw sequence unchanged.
        picked = random.sample(range(candidate_count), 1)[0]
        # Translate that position back to an index into `rows` by stepping
        # over every excluded row that sits at or before it.
        for position in same_as_row:
            if position > picked:
                break
            picked += 1

        comment = row["input"].split(_CODE_SEPARATOR)[0]
        other_code = rows[picked]["input"].split(_CODE_SEPARATOR)[1]
        augmented.append(
            {
                "input": comment + _CODE_SEPARATOR + other_code,
                "target": 0,
                "target_options": row["target_options"],
            }
        )

    # Build the dataset in one go instead of one add_item() call per row;
    # columns missing from the synthetic rows are filled with None.
    columns = list(augmented[0]) if augmented else []
    return datasets.Dataset.from_dict({column: [entry.get(column) for entry in augmented] for column in columns})


class NlCodesearchClfCodesearchnetJava(Task):
    def get_dataset_raw(self) -> Dict[str, datasets.Dataset]:
        """Create the dataset adding a negative sample for each code comment/query

        Returns:
            A dictionary containing key-value pairs for the raw datasets.
            The keys are strings representing the name of the dataset split
            (e.g., "train", "validation", "test") and the values are
            HuggingFace `datasets.Dataset` objects.
            The "test" split contains every original pair followed by one
            distractor pair (target 0); every other split, including "train",
            is returned unchanged.

        Raises:
            ValueError: if the test split has a row with no differing row
                to build a distractor from.
        """
        # Load the raw datasets
        raw_datasets: Dict[str, datasets.Dataset] = self._load_data_source()
        output: Dict[str, datasets.Dataset] = {}
        # Set random seed for consistency
        random.seed(42)
        for split, dataset in raw_datasets.items():
            if split == "test":
                output[split] = _add_negative_samples(dataset)
            else:
                # NOTE(review): the train split is passed through without
                # negative samples -- confirm this is intended for fine-tuning.
                output[split] = dataset
        return output
Loading