diff --git a/scripts/evaluation/README.md b/scripts/evaluation/README.md index 077e7cf..d65cae7 100644 --- a/scripts/evaluation/README.md +++ b/scripts/evaluation/README.md @@ -50,3 +50,10 @@ For it to run you need the experiments finished and their output copied next to └── requirements.txt ``` +## Important + +This evaluation is based on, and meant for, data from the commit `2022-autumn-submission [3be8d0e]`! +While most things have not changed since then, there is one change: the "transformers" field was a plain string, +which was later changed to proper JSON. +So, what you have at the tag is also valid JSON, but the field is one big string instead of a list of objects. +For updates on this matter, see [this issue](https://github.com/ciselab/Guided-MT-Code2Vec/issues/21). diff --git a/scripts/evaluation/evaluation.ipynb b/scripts/evaluation/evaluation.ipynb index 1a36cf7..46055c1 100644 --- a/scripts/evaluation/evaluation.ipynb +++ b/scripts/evaluation/evaluation.ipynb @@ -89,10 +89,10 @@ "execution_count": null, "outputs": [], "source": [ - "#all_experiments = set([datapoint[\"experiment\"] for datapoint in datapoints])\n", - "#all_seeds = set([datapoint[\"seed\"] for datapoint in datapoints])\n", "all_metrics = [\"F1\",\"MRR\",\"EDITDIST\",\"PMRR\",\"REC\",\"PREC\"]\n", - "all_transformers = [] #TBD" + "all_transformers = extract.get_known_transformers()\n", + "all_experiments = set(df[\"experiment\"])\n", + "all_seeds = set(df[\"seed\"])" ], "metadata": { "collapsed": false, @@ -105,7 +105,9 @@ "cell_type": "code", "execution_count": null, "outputs": [], - "source": [], + "source": [ + "df.head(5)" + ], "metadata": { "collapsed": false, "pycharm": { diff --git a/scripts/evaluation/extract.py b/scripts/evaluation/extract.py index 2e3827e..e2b085c 100644 --- a/scripts/evaluation/extract.py +++ b/scripts/evaluation/extract.py @@ -5,6 +5,15 @@ import pandas as pd +known_transformers = [ + "RenameVariableTransformer", "IfFalseElseTransformer", 
def get_known_transformers() -> [str]:
    """Return the module-level list of transformer names known to this script.

    The list object itself is returned (not a copy), so mutating the result
    also changes ``known_transformers``.
    """
    return known_transformers


def count_transformers(datapoint):
    """Return how many transformers are recorded in ``datapoint["genotype"]``.

    The genotype is handled in two shapes:

    * a single string (older data, where missing quotes turned the whole
      genotype into one string): occurrences of the lowercase key
      ``transformer`` are counted;
    * anything else (e.g. a parsed list of entries): its length is the count.

    :param datapoint: mapping that has a ``"genotype"`` entry
    :return: the number of transformer entries found in the genotype
    """
    raw = datapoint["genotype"]
    if isinstance(raw, str):
        # The search is case-sensitive: class names such as
        # "RenameVariableTransformer" carry a capital "T" and are not counted,
        # only the lowercase "transformer" key of each entry is.
        return raw.count("transformer")
    return len(raw)


def extract_transformers_from_genotype(genotype, transformers: [str] = None) -> dict:
    """Count each transformer in *genotype*, whichever format it is stored in.

    :param genotype: either the raw genotype string or the parsed list of
        transformer entries
    :param transformers: transformer names to count; ``known_transformers``
        is used when omitted
    :return: a dictionary mapping each transformer name to its count
    """
    if transformers is None:
        # Looked up at call time rather than in the signature, so the function
        # can be defined before the module-level list exists.
        transformers = known_transformers
    if isinstance(genotype, str):
        return extract_transformers_from_str(genotype, transformers)
    return extract_genotype_from_json(genotype, transformers)
def extract_transformers_from_str(genotype: str, transformers: [str] = None) -> dict:
    """Count how often each transformer name occurs in a genotype string.

    Names are matched as literal, case-sensitive text (not as regular
    expressions), counting non-overlapping occurrences.

    :param genotype: the genotype of one datapoint in its raw string form
    :param transformers: transformer names to count; ``known_transformers``
        is used when omitted
    :return: a dictionary with one entry per requested name, in the order
        given, mapping the name to its number of occurrences

    >>> example = "[{ transformer: RenameVariableTransformer, seed: 1131100509 }{ transformer: AddNeutralElementTransformer, seed: -1887344816 }]"
    >>> extract_transformers_from_str(example, ["RenameVariableTransformer", "IfTrueTransformer"])
    {'RenameVariableTransformer': 1, 'IfTrueTransformer': 0}
    """
    if transformers is None:
        # Looked up at call time rather than in the signature, so the function
        # can be defined before the module-level list exists.
        transformers = known_transformers
    # str.count treats the name literally; using the name as a regex pattern
    # would let characters like "." or "(" change the result or raise.
    return {name: genotype.count(name) for name in transformers}
def extract_genotype_from_json(genotype: [dict], transformers: [str] = None) -> dict:
    """Count how often each transformer occurs in a parsed genotype.

    :param genotype: the transformers of one datapoint, a list of objects that
        each have a ``"transformer"`` entry (plus a seed)
    :param transformers: transformer names to count; ``known_transformers``
        is used when omitted
    :return: a dictionary with one entry per requested name, in the order
        given, mapping the name to its count; transformers that appear in the
        genotype but were not requested are ignored
    :raises KeyError: if a genotype entry has no ``"transformer"`` key

    >>> example = [{"transformer": "RenameVariableTransformer", "seed": 1131100509}, {"transformer": "AddNeutralElementTransformer", "seed": -1887344816}]
    >>> extract_genotype_from_json(example, transformers=["RenameVariableTransformer", "IfTrueTransformer"])
    {'RenameVariableTransformer': 1, 'IfTrueTransformer': 0}
    """
    # Function-scope import keeps this block self-contained.
    from collections import Counter

    if transformers is None:
        # Looked up at call time rather than in the signature, so the function
        # can be defined before the module-level list exists.
        transformers = known_transformers
    # One pass over the genotype; a Counter yields 0 for names it never saw.
    occurrences = Counter(entry["transformer"] for entry in genotype)
    return {name: occurrences[name] for name in transformers}