-
Ease-of-use Task Register for Autotune API

Principles
Requirement
@register_task(name="text-generation")
def eval_func(model, model_name=None):
    """Evaluate ``model`` on the text-generation task.

    Args:
        model: the (possibly quantized) model under evaluation; called
            directly on each batch of data.
        model_name: optional model identifier — unused by this task, kept
            for signature consistency with other registered tasks.

    Returns:
        The accumulated ``Accuracy`` metric over the evaluation dataset.
    """
    eval_dataset = init_dataset("xxx")  # "xxx" is a placeholder dataset name
    accuracy = Accuracy()
    for data, label in eval_dataset:
        output = model(data)
        accuracy.update(output, label)
    return accuracy

# Repo Architecture
Exampleslm_eval@register_task(name="lm_eval")
def eval_func(model, model_name, tasks=["lambada_openai"]):
from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
results = evaluate(
model="hf-causal",
model_args="pretrained=" + model_name + ",tokenizer=" + model_name + ",dtype=float32",
user_model=model,
batch_size=32,
tasks=tasks,
)
return results["accuracy"] lm_code_eval@register_task(name="lm_code_eval")
def eval_func(model, model_name, tasks=None):
from intel_extension_for_transformers.llm.evaluation.lm_code_eval import evaluate
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
results = evaluate(
model=user_model,
tokenizer=tokenizer,
tasks=",".join(tasks),
batch_size=args.batch_size,
)
return results["accuracy"] Usagemodel = autotune(
model,
conf,
example_inputs, # example_inputs for jit.trace
run_fn, # calibration function
task="lm_eval", # registered evaluation task
task_args={
"model_name": "facebook/opt-125m",
"tasks": ["lambada_openai", "hellaswag", "winogrande", "piqa", "wikitext"],
}
)
Beta Was this translation helpful? Give feedback.
Replies: 4 comments 4 replies
-
One general comment: given the current Pile calibration dataset, some Chinese evaluation tasks such as CEval/CMMLU will very likely perform poorly. If we consider supporting different types of evaluation (Chinese, Math, Code), we need a new and better (most likely mixed) calibration dataset.
Beta Was this translation helpful? Give feedback.
-
I am also considering adding task_name, like:
Beta Was this translation helpful? Give feedback.
-
May I know what's the purpose of |
Beta Was this translation helpful? Give feedback.
-
Decision:
Reasons:
|
Beta Was this translation helpful? Give feedback.
Decision:
Reasons: