Skip to content

Commit

Permalink
Parsed Genotype and enriched dataframe #21
Browse files Browse the repository at this point in the history
  • Loading branch information
lapplislazuli committed Oct 13, 2022
1 parent 01b4c5e commit 3210e3e
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 7 deletions.
7 changes: 7 additions & 0 deletions scripts/evaluation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,10 @@ For it to run you need the experiments finished and their output copied next to
└── requirements.txt
```

## Important

This evaluation is based on, and intended for, data from the commit `2022-autumn-submission [3be8d0e]`!
While most things have not changed since then, there is one exception: the "transformers" field used to be a plain string,
and was later changed to proper JSON.
So the data at that tag is still valid JSON, but the field is one big string instead of a list of objects.
For updates on this matter see [this issue](https://github.com/ciselab/Guided-MT-Code2Vec/issues/21).
10 changes: 6 additions & 4 deletions scripts/evaluation/evaluation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,10 @@
"execution_count": null,
"outputs": [],
"source": [
"#all_experiments = set([datapoint[\"experiment\"] for datapoint in datapoints])\n",
"#all_seeds = set([datapoint[\"seed\"] for datapoint in datapoints])\n",
"all_metrics = [\"F1\",\"MRR\",\"EDITDIST\",\"PMRR\",\"REC\",\"PREC\"]\n",
"all_transformers = [] #TBD"
"all_transformers = extract.get_known_transformers()\n",
"all_experiments = set(df[\"experiment\"])\n",
"all_seeds = set(df[\"seed\"])"
],
"metadata": {
"collapsed": false,
Expand All @@ -105,7 +105,9 @@
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"source": [
"df.head(5)"
],
"metadata": {
"collapsed": false,
"pycharm": {
Expand Down
79 changes: 76 additions & 3 deletions scripts/evaluation/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,15 @@
import pandas as pd


# Names of the transformers that are counted per datapoint when the
# dataframe is built; also the default name list of the extract_* helpers.
known_transformers = [
    "RenameVariableTransformer",
    "IfFalseElseTransformer",
    "AddNeutralElementTransformer",
    "AddUnusedVariableTransformer",
    "LambdaIdentityTransformer",
    "IfTrueTransformer",
    "RandomParameterNameTransformer",
]


def get_known_transformers() -> [str]:
    """
    Returns the names of all transformers this module knows about.

    A fresh list is returned on every call, so callers (e.g. a notebook)
    can sort, extend or filter the result without altering the
    module-level `known_transformers`.
    :return: a new list with the known transformer names, in declaration order
    """
    return list(known_transformers)

def make_csv(path_to_data_dir: str, filename: str = "results.csv") -> None:
"""
Extracts all data from the given dirs jsonfiles,
Expand Down Expand Up @@ -43,6 +52,9 @@ def make_df(path_to_data_dir: str) -> pd.DataFrame:
datapoint["experiment"] = extract_experiment_from_path(path_to_data_dir, file)
datapoint["TRANSFORMATIONS"] = count_transformers(datapoint)
datapoint["generation"] = extract_generation_from_path(file)
transformers = extract_transformers_from_genotype(datapoint["genotype"], known_transformers)
datapoint = {**datapoint, **transformers}
del transformers

datapoints.append(datapoint)

Expand Down Expand Up @@ -72,9 +84,70 @@ def extract_generation_from_path(path: str) -> int:
def count_transformers(datapoint):
    """
    Counts how many transformers the genotype of a datapoint contains.

    The genotype comes in two shapes: there was an issue with the json where
    some quotes were missing, so older data has the genotype as one big
    string; otherwise it is a list with one object per transformer.
    :param datapoint: a dict with a "genotype" entry (either a str or a list)
    :return: the number of transformers in the genotype
    >>> count_transformers({"genotype": "[{ transformer: A, seed: 1 }{ transformer: B, seed: 2 }]"})
    2
    >>> count_transformers({"genotype": [{"transformer": "A", "seed": 1}]})
    1
    """
    raw = datapoint["genotype"]
    if isinstance(raw, str):
        # Each entry of the string form has one lowercase "transformer" key,
        # so counting that key counts the entries. The known transformer
        # names spell "Transformer" with a capital T and do not match.
        return raw.count("transformer")
    # List form: one element per applied transformer.
    return len(raw)


def extract_transformers_from_genotype(genotype, transformers: [str] = known_transformers) -> dict:
    """
    Counts the given transformers in a genotype, whatever form it has.

    String genotypes are handed to `extract_transformers_from_str`,
    everything else to `extract_genotype_from_json`.
    :param genotype: the genotype of one datapoint, either a str or a list of transformer+seed objects
    :param transformers: a list of all available transformers to check for
    :return: a dictionary with each transformers count
    """
    # isinstance (rather than an exact type comparison) also routes
    # str subclasses to the string parser.
    if isinstance(genotype, str):
        return extract_transformers_from_str(genotype, transformers)
    return extract_genotype_from_json(genotype, transformers)


def extract_transformers_from_str(genotype: str, transformers: [str] = None) -> dict:
    """
    Tries to count the given Transformers in the given Genotype-Str.
    Result is a dictionary with the count of each Transformer.

    Names are matched as literal substrings (not as regular expressions),
    so characters like "." or "(" in a name are taken verbatim.
    :param genotype: the genotype of one datapoint as one big string
    :param transformers: a list of all available transformers to check for,
        defaults to the module-level `known_transformers` when omitted
    :return: a dictionary with each transformers count
    >>> example_genotype = "[{ transformer: RenameVariableTransformer, seed: 1131100509 }{ transformer: AddNeutralElementTransformer, seed: -1887344816 }]"
    >>> extract_transformers_from_str(example_genotype, ["RenameVariableTransformer", "IfTrueTransformer"])
    {'RenameVariableTransformer': 1, 'IfTrueTransformer': 0}
    """
    if transformers is None:
        # Resolved at call time, so the default is never bound to a
        # mutable list in the signature.
        transformers = known_transformers
    return {name: genotype.count(name) for name in transformers}


def extract_genotype_from_json(genotype: [dict], transformers: [str] = None) -> dict:
    """
    Tries to count the given Transformers in the given json Genotype.
    Result is a dictionary with the count of each Transformer.

    Transformers that appear in the genotype but are not listed in
    `transformers` are ignored.
    :param genotype: the transformers of one datapoint, a list of transformer+seed objects
    :param transformers: a list of all available transformers to check for,
        defaults to the module-level `known_transformers` when omitted
    :return: a dictionary with each transformers count
    :raises KeyError: if an entry of the genotype has no "transformer" key
    >>> example_genotype_json = [{"transformer": "RenameVariableTransformer", "seed": 1131100509}, {"transformer": "AddNeutralElementTransformer", "seed": -1887344816}]
    >>> extract_genotype_from_json(example_genotype_json, transformers=["RenameVariableTransformer", "IfTrueTransformer"])
    {'RenameVariableTransformer': 1, 'IfTrueTransformer': 0}
    """
    if transformers is None:
        # Resolved at call time, so the default is never bound to a
        # mutable list in the signature.
        transformers = known_transformers
    # Start every requested name at 0 so absent transformers are still reported.
    result = dict.fromkeys(transformers, 0)
    # Single pass over the genotype instead of one list scan per name.
    for entry in genotype:
        name = entry["transformer"]
        if name in result:
            result[name] += 1
    return result


if __name__ == "__main__":
Expand Down

0 comments on commit 3210e3e

Please sign in to comment.