"""
A script to benchmark model loading of an ML model, typically an LLM, via HuggingFace
transformers. The default model is Google's flan-t5-xl, which is ~11GB in storage
with 3B parameters.
Example usage:
python3 benchmark-model-loadingpy -mp ./models/flan-t5-xl -tp ./models/flan-t5-xl
"""
import argparse
import logging
import time
from logging.config import fileConfig

from transformers import T5ForConditionalGeneration, T5Tokenizer
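
# Assumes a logging config file exists at ./conf/logging.conf that defines a
# "BenchmarkModelLoading" logger; adjust the path if your layout differs.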
log_conf_path = "./conf/logging.conf"
fileConfig(log_conf_path, disable_existing_loggers=True)
_logger = logging.getLogger("BenchmarkModelLoading")


def get_args():
    parser = argparse.ArgumentParser(
        description="Benchmark model loading with HuggingFace transformers"
    )
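    # The defaults below point at an Alluxio FUSE mount; override them with
    # -mp/-tp to benchmark loading from local disk or another filesystem.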
    parser.add_argument(
        "-mp",
        "--model_path",
        help="Path to the input model",
        default="/mnt/alluxio/fuse/models/flan-t5-xl",
    )
    parser.add_argument(
        "-tp",
        "--tokenizer_path",
        help="Path to the tokenizer",
        default="/mnt/alluxio/fuse/models/flan-t5-xl",
    )
    return parser.parse_args()


class BenchmarkModelLoadingRunner:
    def __init__(self, model_path, tokenizer_path):
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.tokenizer = None
        self.model = None

    def benchmark_model_loading(self):
        start_time = time.perf_counter()

        _logger.debug(f"Loading tokenizer from {self.tokenizer_path}...")
        self.tokenizer = T5Tokenizer.from_pretrained(self.tokenizer_path)

        _logger.debug(f"Loading model from {self.model_path}...")
        self.model = T5ForConditionalGeneration.from_pretrained(
            self.model_path, device_map="auto"
        )

        end_time = time.perf_counter()
        _logger.info(f"Model loaded in {end_time - start_time:0.4f} seconds")

    def test_model_inference(
        self,
        input_text="Q: Can Geoffrey Hinton have a conversation with George Washington? "
        + "Give the rationale before answering.",
    ):
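        # Note: the inference below assumes a CUDA-capable GPU is available; on a
        # CPU-only host, replace "cuda" with "cpu" (or drop the .to() call).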
        input_ids = self.tokenizer(
            input_text, return_tensors="pt"
        ).input_ids.to("cuda")
        outputs = self.model.generate(input_ids, max_length=100)

        _logger.info("Test model inference with the following question:")
        _logger.info(input_text)
        _logger.info(self.tokenizer.decode(outputs[0]))
if __name__ == "__main__":
args = get_args()
benchmark_model_loading_runner = BenchmarkModelLoadingRunner(
tokenizer_path=args.tokenizer_path, model_path=args.model_path
)
benchmark_model_loading_runner.benchmark_model_loading()
benchmark_model_loading_runner.test_model_inference()
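
# The script reads its logging setup from ./conf/logging.conf via fileConfig().
# As an illustrative sketch only (not the repository's actual config), a minimal
# fileConfig-style file could look like:
#
#   [loggers]
#   keys=root,BenchmarkModelLoading
#
#   [handlers]
#   keys=consoleHandler
#
#   [formatters]
#   keys=simpleFormatter
#
#   [logger_root]
#   level=INFO
#   handlers=consoleHandler
#
#   [logger_BenchmarkModelLoading]
#   level=DEBUG
#   handlers=consoleHandler
#   qualname=BenchmarkModelLoading
#   propagate=0
#
#   [handler_consoleHandler]
#   class=StreamHandler
#   level=DEBUG
#   formatter=simpleFormatter
#   args=(sys.stdout,)
#
#   [formatter_simpleFormatter]
#   format=%(asctime)s - %(name)s - %(levelname)s - %(message)s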