diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..edbcf84
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,135 @@
+# Temporary Files
+tmp.py
+
+# IDE
+.vscode/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/data/human-eval-enhanced-202307.jsonl.gz b/data/human-eval-enhanced-202307.jsonl.gz
new file mode 100644
index 0000000..3bc4e19
Binary files /dev/null and b/data/human-eval-enhanced-202307.jsonl.gz differ
diff --git a/data/tests.py b/data/tests.py
new file mode 100644
index 0000000..6899d5b
--- /dev/null
+++ b/data/tests.py
@@ -0,0 +1,325 @@
+# The tests can be run as a script with the command "python tests.py".
+# Alternatively, they can be run as a module using pytest.
+# If you wish to use pytest, you can install it with "pip3 install pytest",
+# then use the command "python -m pytest -vv tests.py".
+
+import gzip
+import jsonlines
+
+
+def g_unzip(in_file, out_file):
+    with open(in_file, "rb") as f_in, open(out_file, "w", encoding="utf8") as f_out:
+        decom_str = gzip.decompress(f_in.read()).decode("utf-8")
+        f_out.write(decom_str)
+
+
+def test_HumanEval_32_fix():
+    # the original prompt has a typo in "find_zero returns only only zero point"
+    with jsonlines.open("human-eval-v2-20210705.jsonl") as reader:
+        reader_list = list(reader)
+        original_prompt = reader_list[32]["prompt"]
+        assert "only only zero point" in original_prompt
+
+    # the fixed prompt is "find_zero returns only one zero point"
+    with jsonlines.open("human-eval-enhanced-202307.jsonl") as reader:
+        reader_list = list(reader)
+        fixed_prompt = reader_list[32]["prompt"]
+        assert "only only zero point" not in fixed_prompt
+        assert "only one zero point" in fixed_prompt
+
+    # make sure the function definition is correct
+    solution = reader_list[32]["canonical_solution"]
+    func_def_code = fixed_prompt + solution
+    exec(func_def_code, {})
+
+
+def test_HumanEval_38_fix():
+    # the original prompt doesn't have examples in the docstring
+    # which causes inconsistency in the dataset
+    with jsonlines.open("human-eval-v2-20210705.jsonl") as reader:
+        reader_list = list(reader)
+        original_prompt = reader_list[38]["prompt"]
+        assert ">>> decode_cyclic" not in original_prompt
+
+    # the fixed prompt has 2 examples in the docstring of decode_cyclic
+    # we didn't add examples for encode_cyclic to maintain consistency with other tasks like 32
+    with jsonlines.open("human-eval-enhanced-202307.jsonl") as reader:
+        reader_list = list(reader)
+        fixed_prompt = reader_list[38]["prompt"]
+        assert ">>> decode_cyclic('bca')\n 'abc'\n" in fixed_prompt
+        assert ">>> decode_cyclic('ab')\n 'ab'\n" in fixed_prompt
+        assert ">>> encode_cyclic" not in fixed_prompt
+
+    # make sure the added examples are correct
+    solution = reader_list[38]["canonical_solution"]
+    func_def_code = fixed_prompt + solution
+
+    # decode_cyclic(string) is equivalent to encode_cyclic(encode_cyclic(string))
+    exec(func_def_code + "\n\nassert encode_cyclic(encode_cyclic('bca')) == 'abc'", {})
+    exec(func_def_code + "\n\nassert encode_cyclic(encode_cyclic('ab')) == 'ab'", {})
+
+
+def test_HumanEval_41_fix():
+    # the original prompt doesn't have examples in the docstring
+    # which causes inconsistency in the dataset
+    with jsonlines.open("human-eval-v2-20210705.jsonl") as reader:
+        reader_list = list(reader)
+        original_prompt = reader_list[41]["prompt"]
+        assert ">>> car_race_collision" not in original_prompt
+
+    # the fixed prompt has 1 example
+    with jsonlines.open("human-eval-enhanced-202307.jsonl") as reader:
+        reader_list = list(reader)
+        fixed_prompt = reader_list[41]["prompt"]
+        assert ">>> car_race_collision(3)\n 9\n" in fixed_prompt
+
+    # make sure the added example is correct
+    solution = reader_list[41]["canonical_solution"]
+    func_def_code = fixed_prompt + solution
+    exec(func_def_code + "\n\nassert car_race_collision(3) == 9", {})
+
+
+def test_HumanEval_47_fix():
+    # the original prompt has a wrong example, median([-10, 4, 6, 1000, 10, 20]) should be 8.0 instead of 15.0
+    # reference https://github.com/openai/human-eval/issues/6
+    with jsonlines.open("human-eval-v2-20210705.jsonl") as reader:
+        reader_list = list(reader)
+        original_prompt = reader_list[47]["prompt"]
+        assert ">>> median([-10, 4, 6, 1000, 10, 20])\n 15.0\n" in original_prompt
+
+    # the fixed prompt has the correct example
+    with jsonlines.open("human-eval-enhanced-202307.jsonl") as reader:
+        reader_list = list(reader)
+        fixed_prompt = reader_list[47]["prompt"]
+        assert ">>> median([-10, 4, 6, 1000, 10, 20])\n 8.0\n" in fixed_prompt
+
+    # make sure the added example is correct
+    solution = reader_list[47]["canonical_solution"]
+    func_def_code = fixed_prompt + solution
+    exec(func_def_code + "\n\nassert median([-10, 4, 6, 1000, 10, 20]) == 8.0", {})
+
+
+def test_HumanEval_50_fix():
+    # the original prompt doesn't have examples in the docstring
+    # also the prompt is ambiguous in "encode_shift", not explicitly specifying lowercase or uppercase for input string
+    with jsonlines.open("human-eval-v2-20210705.jsonl") as reader:
+        reader_list = list(reader)
+        original_prompt = reader_list[50]["prompt"]
+        assert ">>> encode_shift" not in original_prompt
+
+    # the fixed prompt has 1 example in the docstring of decode_shift
+    # we didn't add examples for encode_shift to maintain consistency with other tasks like 32 and 38
+    with jsonlines.open("human-eval-enhanced-202307.jsonl") as reader:
+        reader_list = list(reader)
+        fixed_prompt = reader_list[50]["prompt"]
+        assert ">>> decode_shift('abc')\n 'vwx'\n" in fixed_prompt
+        assert ">>> encode_shift" not in fixed_prompt
+
+    # make sure the added example is correct
+    solution = reader_list[50]["canonical_solution"]
+    func_def_code = fixed_prompt + solution
+    exec(func_def_code + "\n\nassert decode_shift('abc') == 'vwx'", {})
+
+
+def test_HumanEval_57_fix():
+    # the original prompt is ambiguous, not explicitly specifying how to handle non-strictly increasing or decreasing input
+    # also the prompt has a typo in "Return True is list elements are monotonically increasing or decreasing."
+    with jsonlines.open("human-eval-v2-20210705.jsonl") as reader:
+        reader_list = list(reader)
+        original_prompt = reader_list[57]["prompt"]
+        assert "Return True is list elements are monotonically increasing or decreasing." in original_prompt
+
+    # the fixed prompt is
+    # "Return True if list elements are monotonically increasing or decreasing.
+    # Still return True when list elements are non-strictly monotonically increasing or decreasing."
+    with jsonlines.open("human-eval-enhanced-202307.jsonl") as reader:
+        reader_list = list(reader)
+        fixed_prompt = reader_list[57]["prompt"]
+        assert "Return True if list elements are monotonically increasing or decreasing." in fixed_prompt
+        assert "Still return True when list elements are non-strictly monotonically increasing or decreasing." in fixed_prompt
+
+    # make sure the function definition is correct
+    solution = reader_list[57]["canonical_solution"]
+    func_def_code = fixed_prompt + solution
+    exec(func_def_code + "\n\nassert monotonic([1, 2, 3]) is True", {})
+
+
+def test_HumanEval_64_fix():
+    # the original prompt has an unnecessary statement
+    with jsonlines.open("human-eval-v2-20210705.jsonl") as reader:
+        reader_list = list(reader)
+        original_prompt = reader_list[64]["prompt"]
+        assert "FIX = \"\"\"\nAdd more test cases.\n\"\"\"" in original_prompt
+
+    # the fixed prompt doesn't have the unnecessary statement
+    with jsonlines.open("human-eval-enhanced-202307.jsonl") as reader:
+        reader_list = list(reader)
+        fixed_prompt = reader_list[64]["prompt"]
+        assert "FIX = \"\"\"\nAdd more test cases.\n\"\"\"" not in fixed_prompt
+
+    # make sure the function definition is correct
+    solution = reader_list[64]["canonical_solution"]
+    func_def_code = fixed_prompt + solution
+    exec(func_def_code, {})
+
+
+def test_HumanEval_67_fix():
+    # the original prompt has a typo in "for examble"
+    with jsonlines.open("human-eval-v2-20210705.jsonl") as reader:
+        reader_list = list(reader)
+        original_prompt = reader_list[67]["prompt"]
+        assert "for examble" in original_prompt
+
+    # the fixed prompt is "for example"
+    with jsonlines.open("human-eval-enhanced-202307.jsonl") as reader:
+        reader_list = list(reader)
+        fixed_prompt = reader_list[67]["prompt"]
+        assert "for example" in fixed_prompt
+
+    # make sure the function definition is correct
+    solution = reader_list[67]["canonical_solution"]
+    func_def_code = fixed_prompt + solution
+    exec(func_def_code, {})
+
+
+def test_HumanEval_75_fix():
+    # the original prompt doesn't align with the canonical solution and the tests
+    # reference https://huggingface.co/datasets/openai_humaneval/discussions/2
+    with jsonlines.open("human-eval-v2-20210705.jsonl") as reader:
+        reader_list = list(reader)
+        original_prompt = reader_list[75]["prompt"]
+        assert "Knowing that (a) is less then 100. " in original_prompt
+
+    # the fixed prompt is "Each of the 3 prime numbers is less than 100."
+    with jsonlines.open("human-eval-enhanced-202307.jsonl") as reader:
+        reader_list = list(reader)
+        fixed_prompt = reader_list[75]["prompt"]
+        assert "Each of the 3 prime numbers is less than 100." in fixed_prompt
+
+    # make sure the function definition is correct
+    solution = reader_list[75]["canonical_solution"]
+    func_def_code = fixed_prompt + solution
+    exec(func_def_code, {})
+
+
+def test_HumanEval_83_fix():
+    # the original prompt doesn't have examples in the docstring
+    # which causes inconsistency in the dataset
+    with jsonlines.open("human-eval-v2-20210705.jsonl") as reader:
+        reader_list = list(reader)
+        original_prompt = reader_list[83]["prompt"]
+        assert ">>> starts_one_ends" not in original_prompt
+
+    # the fixed prompt has 1 example in the docstring of starts_one_ends
+    with jsonlines.open("human-eval-enhanced-202307.jsonl") as reader:
+        reader_list = list(reader)
+        fixed_prompt = reader_list[83]["prompt"]
+        assert ">>> starts_one_ends(2)\n 18\n" in fixed_prompt
+
+    # make sure the added example is correct
+    solution = reader_list[83]["canonical_solution"]
+    func_def_code = fixed_prompt + solution
+    exec(func_def_code + "\n\nassert starts_one_ends(2) == 18", {})
+
+
+def test_HumanEval_95_fix():
+    # the canonical solution is wrong, and the test cases don't cover that type of mistake
+    # reference https://github.com/openai/human-eval/issues/22
+    # the canonical solution is fixed by changing "break" to "continue" in the last else clause
+    # also, we changed 1 test case "assert candidate({"p":"pineapple", "A":"banana", "B":"banana"}) == False" to
+    # "assert candidate({"A":"banana", "B":"banana", "p":"pineapple"}) == False" to capture similar mistakes
+    with jsonlines.open("human-eval-enhanced-202307.jsonl") as reader:
+        reader_list = list(reader)
+        fixed_canonical_solution = reader_list[95]["canonical_solution"]
+        assert "else:\n break\n return state == \"upper\" or state == \"lower\" \n" not in fixed_canonical_solution
+        assert "else:\n continue\n return state == \"upper\" or state == \"lower\" \n" in fixed_canonical_solution
+
+    # make sure that the test case is changed
+    fixed_tests = reader_list[95]["test"]
+    assert "assert candidate({\"p\":\"pineapple\", \"A\":\"banana\", \"B\":\"banana\"}) == False" not in fixed_tests
+    assert "assert candidate({\"A\":\"banana\", \"B\":\"banana\", \"p\":\"pineapple\"}) == False" in fixed_tests
+
+    # make sure the fixed canonical solution is correct
+    prompt = reader_list[95]["prompt"]
+    func_def_code = prompt + fixed_canonical_solution + fixed_tests
+    exec(func_def_code + "\n\ncheck(check_dict_case)", {})
+
+
+def test_HumanEval_116_fix():
+    # the original prompt doesn't align with the canonical solution and the tests
+    # the prompt also has wrong examples
+    # reference https://huggingface.co/datasets/openai_humaneval/discussions/1
+    with jsonlines.open("human-eval-v2-20210705.jsonl") as reader:
+        reader_list = list(reader)
+        original_prompt = reader_list[116]["prompt"]
+        assert "sort an array of non-negative integers according" in original_prompt
+        assert ">>> sort_array([-2, -3, -4, -5, -6]) == [-6, -5, -4, -3, -2]" in original_prompt
+        assert ">>> sort_array([1, 0, 2, 3, 4]) [0, 1, 2, 3, 4]" in original_prompt
+
+    # the fixed prompt deleted "non-negative" before "integers"
+    # the 2nd example is corrected to ">>> sort_array([-2, -3, -4, -5, -6]) == [-4, -2, -6, -5, -3]"
+    # the 3rd example is corrected to ">>> sort_array([1, 0, 2, 3, 4]) == [0, 1, 2, 4, 3]"
+    with jsonlines.open("human-eval-enhanced-202307.jsonl") as reader:
+        reader_list = list(reader)
+        fixed_prompt = reader_list[116]["prompt"]
+        assert "sort an array of integers according" in fixed_prompt
+        assert ">>> sort_array([-2, -3, -4, -5, -6]) == [-4, -2, -6, -5, -3]" in fixed_prompt
+        assert ">>> sort_array([1, 0, 2, 3, 4]) == [0, 1, 2, 4, 3]" in fixed_prompt
+
+    # make sure the function definition and the fixed examples are correct
+    solution = reader_list[116]["canonical_solution"]
+    func_def_code = fixed_prompt + solution
+    exec(func_def_code + "\n\nassert sort_array([-2, -3, -4, -5, -6]) == [-4, -2, -6, -5, -3]", {})
+    exec(func_def_code + "\n\nassert sort_array([1, 0, 2, 3, 4]) == [0, 1, 2, 4, 3]", {})
+
+
+def test_HumanEval_163_fix():
+    # the canonical solution is wrong, and the test cases don't cover that type of mistake
+    # reference https://github.com/openai/human-eval/issues/20
+    # the canonical solution is fixed by removing the clamping of the range bounds to between 2 and 8
+    # also, we changed 1 test case "candidate(132, 2)" to "candidate(13, 2)" due to the length of the output
+    # and we corrected all other test cases
+    with jsonlines.open("human-eval-enhanced-202307.jsonl") as reader:
+        reader_list = list(reader)
+        fixed_canonical_solution = reader_list[163]["canonical_solution"]
+        assert "max(2, min(a, b))" not in fixed_canonical_solution
+        assert "min(8, max(a, b))" not in fixed_canonical_solution
+
+    # make sure that the test cases are changed
+    fixed_tests = reader_list[163]["test"]
+    assert "assert candidate(2, 10) == [2, 4, 6, 8]" not in fixed_tests
+    assert "assert candidate(10, 2) == [2, 4, 6, 8]" not in fixed_tests
+    assert "assert candidate(132, 2) == [2, 4, 6, 8]" not in fixed_tests
+    assert "assert candidate(17,89) == []" not in fixed_tests
+    assert "assert candidate(2, 10) == [2, 4, 6, 8, 10]" in fixed_tests
+    assert "assert candidate(10, 2) == [2, 4, 6, 8, 10]" in fixed_tests
+    assert "assert candidate(13, 2) == [2, 4, 6, 8, 10, 12]" in fixed_tests
+    assert "assert candidate(17, 89) == [18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88]" in fixed_tests
+
+    # make sure the fixed canonical solution is correct
+    prompt = reader_list[163]["prompt"]
+    func_def_code = prompt + fixed_canonical_solution + fixed_tests
+    exec(func_def_code + "\n\ncheck(generate_integers)", {})
+
+
+def main():
+    g_unzip("HumanEval.jsonl.gz", "human-eval-v2-20210705.jsonl")
+    g_unzip("human-eval-enhanced-202307.jsonl.gz", "human-eval-enhanced-202307.jsonl")
+    test_HumanEval_32_fix()
+    test_HumanEval_38_fix()
+    test_HumanEval_41_fix()
+    test_HumanEval_47_fix()
+    test_HumanEval_50_fix()
+    test_HumanEval_57_fix()
+    test_HumanEval_64_fix()
+    test_HumanEval_67_fix()
+    test_HumanEval_75_fix()
+    test_HumanEval_83_fix()
+    test_HumanEval_95_fix()
+    test_HumanEval_116_fix()
+    test_HumanEval_163_fix()
+    print("All Tests Passed!")
+
+
+if __name__ == "__main__":
+    main()
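Note on the test pattern (not part of the patch): the per-task tests above all rely on the same trick of concatenating a record's "prompt", "canonical_solution", and "test" fields and exec-ing the result, which lets each record's own check() validate the canonical solution. The sketch below generalizes that pattern to every record in the enhanced file. It is illustrative only; it assumes each record also carries the standard HumanEval "entry_point" field and that the gzipped file is in the working directory, so adjust names if the enhanced dataset differs.

# check_all.py - minimal sketch, assuming standard HumanEval record fields
# ("prompt", "canonical_solution", "test", "entry_point"); not included in this patch.
import gzip
import json


def check_all(path="human-eval-enhanced-202307.jsonl.gz"):
    # read the gzipped jsonl directly, one JSON object per line
    with gzip.open(path, "rt", encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]

    for record in records:
        # prompt + canonical_solution defines the reference function;
        # test defines check(candidate), which raises AssertionError on failure
        program = (
            record["prompt"]
            + record["canonical_solution"]
            + record["test"]
            + "\n\ncheck({})\n".format(record["entry_point"])
        )
        exec(program, {})

    print("{} records passed their own tests".format(len(records)))


if __name__ == "__main__":
    check_all()

This complements the targeted tests in data/tests.py: it catches prompt/solution/test mismatches across the whole file, though it does not verify the specific wording fixes themselves.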