Add evaluation support for using lm-eval harness (#1349)
## Add evaluation support for using lm-eval harness

* Added a new evaluator, `LMEvaluator`, that uses lm_eval to evaluate a Hugging Face model
* Added a Llama 2 example config that uses the new evaluator (see the sketch below)
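
A minimal sketch of running the new example config through Olive's Python entry point (the `olive.workflows.run` helper and the relative config path are assumptions; the exact invocation may differ by Olive version):

```python
# Hypothetical invocation of the new example config via Olive's workflow runner.
from olive.workflows import run as olive_run

# Runs the workflow defined in examples/llama2/llama2_lmeval.json, which evaluates
# meta-llama/Llama-2-7b-hf on hellaswag through the newly added LMEvaluator.
olive_run("examples/llama2/llama2_lmeval.json")
```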

## Checklist before requesting a review
- [ ] Add unit tests for this change.
- [x] Make sure all tests can pass.
- [ ] Update documents if necessary.
- [x] Lint and apply fixes to your code by running `lintrunner -a`
- [ ] Is this a user-facing change? If yes, give a description of this
change to be included in the release notes.
- [ ] Is this PR including examples changes? If yes, please remember to
update [example
documentation](https://github.com/microsoft/Olive/blob/main/docs/source/examples.md)
in a follow-up PR.

## (Optional) Issue link
shaahji authored Sep 11, 2024
1 parent 1ceb55d commit 71d3522
Showing 2 changed files with 100 additions and 7 deletions.
30 changes: 30 additions & 0 deletions examples/llama2/llama2_lmeval.json
@@ -0,0 +1,30 @@
{
"input_model": {
"type": "HfModel",
"generative": true,
"model_path": "meta-llama/Llama-2-7b-hf",
"load_kwargs": { "attn_implementation": "eager" }
},
"systems": {
"local_system": {
"type": "LocalSystem",
"accelerators": [ { "device": "gpu", "execution_providers": [ "CUDAExecutionProvider" ] } ]
}
},
"evaluators": {
"evaluator": {
"type": "LMEvaluator",
"model_class": "hf",
"tasks": [ "hellaswag" ],
"batch_size": 16,
"limit": 0.05,
"max_gen_toks": 10
}
},
"auto_optimizer_config": { "opt_level": 0, "disable_auto_optimizer": true, "precision": "fp16" },
"evaluator": "evaluator",
"host": "local_system",
"target": "local_system",
"cache_dir": "cache",
"output_dir": "output"
}
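
Roughly, the `"evaluators"` section above corresponds to the following direct lm_eval usage (a minimal sketch, assuming lm-eval-harness 0.4+ and a CUDA device); in the workflow, the registered `LMEvaluator` added below wires this up and passes the already-loaded model instead of a Hub name:

```python
# Sketch of what the evaluator config amounts to when driven through lm_eval directly.
import lm_eval

# "model_class": "hf" selects lm_eval's Hugging Face model wrapper.
lm = lm_eval.api.registry.get_model("hf")(
    pretrained="meta-llama/Llama-2-7b-hf",  # matches "input_model" above
    batch_size=16,                          # "batch_size"
    max_gen_toks=10,                        # "max_gen_toks"
    device="cuda",
)

# "limit": 0.05 evaluates roughly 5% of the hellaswag examples.
results = lm_eval.simple_evaluate(model=lm, tasks=["hellaswag"], limit=0.05)
print(results["results"]["hellaswag"])
```
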
77 changes: 70 additions & 7 deletions olive/evaluator/olive_evaluator.py
@@ -57,6 +57,9 @@ class OliveModelOutput(NamedTuple):


class OliveEvaluator(ABC):
def __init__(self, **kwargs):
super().__init__()

@abstractmethod
def evaluate(
self,
@@ -191,6 +194,10 @@ def compute_throughput(metric: Metric, latencies: Any) -> MetricResult:


class _OliveEvaluator(OliveEvaluator):
@staticmethod
def device_string_to_torch_device(device: Device):
return torch.device("cuda") if device == Device.GPU else torch.device(device)

@classmethod
def io_bind_enabled(cls, metric: Metric, inference_settings: Dict) -> bool:
if metric.user_config.io_bind:
@@ -715,11 +722,6 @@ def _evaluate_raw_latency(
@Registry.register(str(Framework.PYTORCH))
@Registry.register("PyTorchEvaluator")
class PyTorchEvaluator(_OliveEvaluator):

@staticmethod
def _device_string_to_torch_device(device: Device):
return torch.device("cuda") if device == Device.GPU else torch.device(device)

@torch.no_grad()
def _inference(
self,
@@ -734,7 +736,7 @@ def _inference(
preds = []
targets = []
logits = []
device = PyTorchEvaluator._device_string_to_torch_device(device)
device = _OliveEvaluator.device_string_to_torch_device(device)
run_kwargs = metric.get_run_kwargs()
if device:
session.to(device)
@@ -791,7 +793,7 @@ def _evaluate_raw_latency(
session = model.prepare_session(inference_settings=None, device=device)

input_data, _ = next(iter(dataloader))
torch_device = PyTorchEvaluator._device_string_to_torch_device(device)
torch_device = _OliveEvaluator.device_string_to_torch_device(device)
run_kwargs = metric.get_run_kwargs()

is_cuda = device == Device.GPU
@@ -1075,6 +1077,67 @@ def _prepare_dataloader(
return FileListCommonDataLoader(dataloader, model.io_config, batch_size=file_chunk_size)


@Registry.register("LMEvaluator")
class LMEvaluator(OliveEvaluator):
def __init__(self, model_class: str, tasks: List[str], **kwargs):
super().__init__(**kwargs)

self.model_class = model_class
self.tasks = tasks
self.limit = kwargs.get("limit")
self.batch_size = kwargs.get("batch_size", 1)
self.max_gen_toks = kwargs.get("max_gen_toks")

def evaluate(
self,
model: "OliveModelHandler",
metrics: List[Metric],
device: Device = Device.CPU,
execution_providers: Union[str, List[str]] = None,
) -> MetricResult:
import lm_eval

device = _OliveEvaluator.device_string_to_torch_device(device)
# device = torch.device("cuda:5")
tokenizer = model.get_hf_tokenizer()
nn_module = model.load_model().eval().to(device)

lmmodel = lm_eval.api.registry.get_model(self.model_class)(
pretrained=nn_module,
tokenizer=tokenizer,
batch_size=self.batch_size,
device=device,
max_gen_toks=self.max_gen_toks,
)

task_manager = lm_eval.tasks.TaskManager()

results = lm_eval.simple_evaluate(
model=lmmodel,
tasks=self.tasks,
task_manager=task_manager,
log_samples=False,
batch_size=self.batch_size,
device=device,
limit=self.limit,
)

metrics = {}
for task_name in sorted(results["results"].keys()):
metric_items = sorted(results["results"][task_name].items())

task_metrics = {}
for mf, v in metric_items:
if mf != "alias":
m, _ = mf.split(",", 1)
if not m.endswith("_stderr"):
task_metrics[m] = SubMetricResult(value=v, priority=-1, higher_is_better=True)

metrics[task_name] = MetricResult.parse_obj(task_metrics)

return flatten_metric_result(metrics)


class OliveEvaluatorConfig(NestedConfig):
_nested_field_name = "type_args"
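
For reference, a hedged sketch of exercising the new `LMEvaluator` outside a full workflow (the `HfModelHandler` and `Device` import paths are assumptions and may differ by Olive version; in the example config above the evaluator is built from the `"evaluators"` section instead):

```python
# Sketch only: drives LMEvaluator directly. Import paths below are assumptions.
from olive.evaluator.olive_evaluator import LMEvaluator
from olive.hardware.accelerator import Device
from olive.model import HfModelHandler

model = HfModelHandler(model_path="meta-llama/Llama-2-7b-hf")
evaluator = LMEvaluator(
    model_class="hf",     # lm_eval model wrapper to use
    tasks=["hellaswag"],
    batch_size=16,
    limit=0.05,           # evaluate ~5% of each task's examples
    max_gen_toks=10,
)

# metrics is unused by LMEvaluator.evaluate; the task list comes from the constructor.
result = evaluator.evaluate(model, metrics=[], device=Device.GPU)
print(result)
```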
