Skip to content

Commit

Permalink
Merge branch 'main' of github.com:kjain14/OpenHands into kjain14-main
Browse files Browse the repository at this point in the history
  • Loading branch information
neubig committed Jan 20, 2025
2 parents 1b6e444 + 8848e60 commit 3355bae
Show file tree
Hide file tree
Showing 48 changed files with 10,069 additions and 0 deletions.
120 changes: 120 additions & 0 deletions evaluation/benchmarks/testgeneval/CodeBLEU/Evaluator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Adapted from https://github.com/EngineeringSoftware/teco/blob/main/src/CodeBLEU/Evaluator.py
import os
from pathlib import Path
from typing import List

import numpy as np
from CodeBLEU import bleu, dataflow_match, syntax_match, weighted_ngram_match
from tree_sitter import Language


class Evaluator:
    """
    Python interface for using CodeBLEU, based on calc_code_bleu.py.

    The score combines four components, each with its own weight:
    n-gram match (``alpha``), keyword-weighted n-gram match (``beta``),
    syntax match (``gamma``) and dataflow match (``theta``).
    """

    def __init__(
        self,
        lang: str,
        alpha: float = 0.25,
        beta: float = 0.25,
        gamma: float = 0.25,
        theta: float = 0.25,
    ):
        """
        Stores the component weights and loads the per-language resources.
        :param lang: language name; selects ``keywords/<lang>.txt`` and the
            grammar loaded from ``parser/my-languages.so`` (both are looked up
            relative to the directory containing this file).
        :param alpha: weight of the n-gram match (BLEU) score.
        :param beta: weight of the weighted n-gram match score.
        :param gamma: weight of the syntax match score.
        :param theta: weight of the dataflow match score.
        """
        self.lang = lang
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.theta = theta

        # Load keywords and tree-sitter parser
        this_dir: Path = Path(os.path.dirname(os.path.realpath(__file__)))
        keywords_path = this_dir / 'keywords' / f'{self.lang}.txt'
        # Use a context manager so the keywords file is closed deterministically.
        with open(keywords_path, 'r', encoding='utf-8') as keywords_file:
            self.keywords = [line.strip() for line in keywords_file]
        self.parser_language = Language(this_dir / 'parser' / 'my-languages.so', lang)

    @staticmethod
    def make_weights(reference_tokens, key_word_list):
        """
        Builds the per-token weights used by the weighted n-gram match.
        :param reference_tokens: the tokens of one reference.
        :param key_word_list: the keywords of the language.
        :return: a dict mapping each distinct token to 1 if it is a keyword,
            otherwise to 0.2.
        """
        # Build the lookup set once instead of scanning the list for every token.
        keywords = frozenset(key_word_list)
        return {
            token: 1 if token in keywords else 0.2 for token in reference_tokens
        }

    def corpus_code_bleu(
        self, refs_toks: List[List[List[str]]], hyps_toks: List[List[str]]
    ) -> float:
        """
        Calculates CodeBLEU for the given references and hypotheses (should be tokenized).

        The result is the weighted average of the component scores; a component
        that cannot be computed (syntax match raising ZeroDivisionError, or a
        NaN dataflow match score) is left out of both the sum and the total
        weight.
        :param refs_toks: the references, num_item * num_ref * num_tok.
        :param hyps_toks: the hypotheses, num_item * num_tok.
        :return: corpus-level CodeBLEU score;
            NOTE: not to be confused with averaged sentence-level CodeBLEU score.
        :raises AssertionError: if the numbers of items in refs_toks and
            hyps_toks differ.
        :raises ZeroDivisionError: if the accumulated weight of the usable
            components is zero.
        """
        assert len(refs_toks) == len(hyps_toks), (
            'refs_toks and hyps_toks must contain the same number of items'
        )

        # Group tokens (for syntax match & dataflow match)
        refs = [
            [' '.join(ref_toks) for ref_toks in reference] for reference in refs_toks
        ]
        hyps = [' '.join(hyp_toks) for hyp_toks in hyps_toks]

        # Accumulate working scores and weights
        cum_weighted_score = 0
        cum_weight = 0

        # Calculate ngram match (BLEU)
        ngram_match_score = bleu.corpus_bleu(refs_toks, hyps_toks)
        cum_weighted_score += self.alpha * ngram_match_score
        cum_weight += self.alpha

        # Calculate weighted ngram match
        refs_toks_with_weights = [
            [
                [reference_tokens, self.make_weights(reference_tokens, self.keywords)]
                for reference_tokens in reference
            ]
            for reference in refs_toks
        ]
        weighted_ngram_match_score = weighted_ngram_match.corpus_bleu(
            refs_toks_with_weights, hyps_toks
        )
        cum_weighted_score += self.beta * weighted_ngram_match_score
        cum_weight += self.beta

        # Calculate syntax match
        try:
            syntax_match_score = syntax_match.corpus_syntax_match(
                refs, hyps, self.lang, parser_language=self.parser_language
            )
        except ZeroDivisionError:
            # Syntax match not working, ignore this part (its weight is not
            # counted either, so the remaining components are renormalized).
            pass
        else:
            cum_weighted_score += self.gamma * syntax_match_score
            cum_weight += self.gamma

        # Calculate dataflow match
        dataflow_match_score = dataflow_match.corpus_dataflow_match(
            refs, hyps, self.lang, parser_language=self.parser_language
        )
        # Test NaN by value: an identity check against np.nan misses NaN values
        # that are not that exact object, which would turn the result into NaN.
        if not np.isnan(dataflow_match_score):
            cum_weighted_score += self.theta * dataflow_match_score
            cum_weight += self.theta
        # else, ignore this part

        return cum_weighted_score / cum_weight

    def sentence_code_bleu(
        self, refs_toks: List[List[str]], hyp_toks: List[str]
    ) -> float:
        """
        Calculates CodeBLEU for the given references and hypothesis (should be tokenized).

        Implemented as a corpus-level computation over a single item.
        :param refs_toks: the references, num_ref * num_tok.
        :param hyp_toks: the hypothesis, num_tok.
        :return: sentence-level CodeBLEU score.
        """
        return self.corpus_code_bleu([refs_toks], [hyp_toks])
Loading

0 comments on commit 3355bae

Please sign in to comment.