From d67b0737771c77bb222caaf0bdb46fa92daa1c4a Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 25 Apr 2023 14:27:18 -0400 Subject: [PATCH 01/16] Fix --- scripts/run_all_benchmark_breakdown.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_all_benchmark_breakdown.sh b/scripts/run_all_benchmark_breakdown.sh index 818ddc4..fbcf7cd 100755 --- a/scripts/run_all_benchmark_breakdown.sh +++ b/scripts/run_all_benchmark_breakdown.sh @@ -12,7 +12,7 @@ ./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 1 8190 11 0 v2_ ./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 8 8190 11 0 v2_ ./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 32 8190 11 0 v2_ -./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 256 8190 11 0 v2_# OOM? +./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 256 8190 11 0 v2_ # OOM? ./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 1 8190 29 1 v2_ 1 ./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 8 8190 29 1 v2_ 1 From c757f7cbf0ef533615492c48b7a50c47fe5a7041 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 25 Apr 2023 14:27:43 -0400 Subject: [PATCH 02/16] Text gen inference --- .dockerignore | 1 + Dockerfile | 18 +++++++++++++++++- transformers | 2 +- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/.dockerignore b/.dockerignore index 0be7d57..2a9f48c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,5 +2,6 @@ !src !scripts !transformers +!text-generation-inference !requirements.txt !Makefile diff --git a/Dockerfile b/Dockerfile index 1e35ee0..bf5fe58 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,11 +10,27 @@ RUN useradd -m -u $USER -s /bin/bash $USERNAME \ && chown $USERNAME /app # git-lfs is needed to interact with the huggingface hub +# ssl and gcc are needed for text-gen-inference RUN apt-get update \ - && apt-get install git-lfs \ + && apt-get install git-lfs libssl-dev gcc \ && rm -rf /var/lib/apt/lists/* \ && git lfs install + +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \ + && PROTOC_ZIP=protoc-21.12-linux-x86_64.zip \ + && curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP \ + && unzip -o $PROTOC_ZIP -d /usr/local bin/protoc \ + && unzip -o $PROTOC_ZIP -d /usr/local 'include/*' \ + && rm -f $PROTOC_ZIP \ + && chmod 777 /root/ && chmod 777 /root/.cargo + +ENV PATH="/root/.cargo/bin:$PATH" + +COPY --chown=$USERNAME text-generation-inference/ ./text-generation-inference + +RUN cd text-generation-inference && make install && make install-benchmark && cd .. + COPY --chown=$USERNAME ./requirements.txt ./ COPY --chown=$USERNAME transformers/ ./transformers diff --git a/transformers b/transformers index a2efad2..b50afe0 160000 --- a/transformers +++ b/transformers @@ -1 +1 @@ -Subproject commit a2efad2c96e6da982f102eea53918c7b8431da80 +Subproject commit b50afe022715ce94502dfda2679c559a7dad8595 From 81d322c62f86f04506d8a4a48309462d6b0f59ec Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 1 May 2023 10:41:01 -0400 Subject: [PATCH 03/16] env --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index bf5fe58..f7d36a7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,6 +37,8 @@ COPY --chown=$USERNAME transformers/ ./transformers # Stock version of pip doesn't work with editable transformers. 
RUN pip install --upgrade pip --no-cache-dir && pip install -r requirements.txt --no-cache-dir +ENV HUGGINGFACE_HUB_CACHE=/app/data/.hf_cache/ + COPY --chown=$USERNAME Makefile . COPY --chown=$USERNAME src/ ./src COPY --chown=$USERNAME scripts/ ./scripts From d591be1b24834cb72342b64b30097cf17633d8f0 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 1 May 2023 11:16:59 -0400 Subject: [PATCH 04/16] pipeline --- src/pipeline.py | 57 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/src/pipeline.py b/src/pipeline.py index 03f8c0d..63b8de6 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -431,9 +431,66 @@ def __init__(self, **kwargs): ) +class TextGenModelWrapper: + def __init__(self, model): + self.model = model + + def parameters(self): + return self.model.parameters() + + def __call__( + self, + input_ids, + past_key_values, + attention_mask, + position_ids, + return_dict, + use_cache, + ): + return self.model(input_ids, attention_mask, position_ids, past_key_values) + + +class TG_Pipeline(Pipeline): + def __init__(self, **kwargs): + if self.device != torch.device("cuda"): + raise ValueError(f"Textgen does not support device {self.device}") + + super().__init__(**kwargs) + + def _get_config( + self, + model_type: Optional[str], + pretrained_config: Optional[str], + config_args: Dict[str, Any], + ) -> Optional[PretrainedConfig]: + return None + + def _create_model(self) -> PreTrainedModel: + raise NotImplementedError() + + def _reload_model(self): + raise NotImplementedError() + + def _save_pretrained(self, pretrained_model: str): + raise NotImplementedError() + + def _load_pretrained(self, pretrained_model: str): + from text_generation_server import get_model + + pretrained_model, revision = parse_revision(pretrained_model) + return TextGenModelWrapper(get_model(pretrained_model, revision, False, False)) + + def _generate_hf(self, inputs: Dict, max_new_tokens: int, use_cache: bool): + raise NotImplementedError() + + def _allocate_mock_cache(self, past_key_length: int, batch_size: int): + raise NotImplementedError() + + _PIPELINE_CLASS_MAP = { "HF_Pipeline": HF_Pipeline, "DS_Pipeline": DS_Pipeline, + "TG_Pipeline": TG_Pipeline, } From 4e59ef265bfaf926d8bbaa07e4ecae0238616d28 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 1 May 2023 12:13:15 -0400 Subject: [PATCH 05/16] stuff --- scripts/run_benchmark_breakdown.sh | 4 +- scripts/run_textgen_benchmark_breakdown.sh | 56 ++++++++++++++++++++++ src/pipeline.py | 48 ++++++++++++++++--- src/utils.py | 7 ++- 4 files changed, 105 insertions(+), 10 deletions(-) create mode 100755 scripts/run_textgen_benchmark_breakdown.sh diff --git a/scripts/run_benchmark_breakdown.sh b/scripts/run_benchmark_breakdown.sh index 5781a13..e912a5d 100755 --- a/scripts/run_benchmark_breakdown.sh +++ b/scripts/run_benchmark_breakdown.sh @@ -56,7 +56,9 @@ run () { # run(step, runtime, attn) then echo "Skipping existing $FILE_NAME" else - $RUN $COMMON ${RUNTIME[$2]} ${ATTN[$3]} ${STEP[$1]} --save="$FILE_NAME" + CMD="$RUN $COMMON ${RUNTIME[$2]} ${ATTN[$3]} ${STEP[$1]} --save=$FILE_NAME" + echo "$CMD" + $CMD fi } diff --git a/scripts/run_textgen_benchmark_breakdown.sh b/scripts/run_textgen_benchmark_breakdown.sh new file mode 100755 index 0000000..60ed54f --- /dev/null +++ b/scripts/run_textgen_benchmark_breakdown.sh @@ -0,0 +1,56 @@ + +# Santacoder prefill. 
+# ./scripts/run_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 32 2040 5 0 +# Santacoder decode (fewer data points because slower) +# ./scripts/run_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 32 2040 11 1 +MODEL_NAME=${1:-"santacoder"} +MODEL_PATH=${2:-"bigcode/gpt_bigcode-santacoder"} +BATCH_SIZE=${3:-32} +MAX_NEW_TOKENS=${4:-2040} +# Prime number to see key length padding effect. +TOKEN_STEP=${5:-5} +STEP_ID=${6:-""} +FILE_PREFIX=${7:-""} +CYCLES=${8:-10} + +SAVE_DIR=data/benchmarks/v2 +#BATCH_SIZES="1 2 4 8 16 24 32 48 64 96 128 160 224 256" +RUN="python3 src/main.py --max_log_outputs=0 --dtype=float16 --device=cuda --custom_generate --breakdown_latency --ignore_oom" + + +RUNTIME=("") +RUNTIME_NAMES=("base") + +ATTN=( \ + "--pipeline_class=TG_Pipeline" \ + ) +ATTN_NAME=( \ + "textgen" \ + ) + + +STEP=("--no_prefill" "--no_cache") +STEP_NAME=("decode" "prefill") + +COMMON="--pretrained_model=$MODEL_PATH --tokenizer=$MODEL_PATH --cycles=$CYCLES --max_input_length=1 --max_new_tokens=$MAX_NEW_TOKENS --key_length_step=$TOKEN_STEP --batch_size=$BATCH_SIZE predict_last_token=True" + +run () { # run(step, runtime, attn) + FILE_NAME="$SAVE_DIR"/"$MODEL_NAME"_bs_"$BATCH_SIZE"_tok_"$MAX_NEW_TOKENS"_step_"$TOKEN_STEP"_"${STEP_NAME[$1]}"/"$FILE_PREFIX""${RUNTIME_NAMES[$2]}"_"${ATTN_NAME[$3]}".json + if [ -f "$FILE_NAME" ]; + then + echo "Skipping existing $FILE_NAME" + else + CMD="$RUN $COMMON ${RUNTIME[$2]} ${ATTN[$3]} ${STEP[$1]} --save=$FILE_NAME" + echo "$CMD" + $CMD + fi +} + +if [ "${STEP_ID}" -eq "0" ] +then + # Decode (default attn only) + run 0 0 0 +else + # Prefill + run 1 0 0 +fi diff --git a/src/pipeline.py b/src/pipeline.py index 63b8de6..b0adf37 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -21,6 +21,10 @@ GPTBigCodeConfig, ) +from transformers.modeling_outputs import ( + CausalLMOutputWithCrossAttentions, +) + logger = logging.getLogger(__name__) @@ -413,7 +417,7 @@ def __init__(self, **kwargs): super().__init__(**kwargs) - if self.device != torch.device("cuda"): + if self.device != torch.device("cuda:0"): raise ValueError(f"Deepspeed does not support device {self.device}") if self.dtype not in (torch.float32, torch.float16, torch.bfloat16): @@ -433,10 +437,21 @@ def __init__(self, **kwargs): class TextGenModelWrapper: def __init__(self, model): + from text_generation_server.models import CausalLM, FlashCausalLM + self.model = model + if isinstance(self.model, FlashCausalLM): + self._is_flash = True + elif isinstance(self.model, CausalLM): + self._is_flash = False + else: + raise NotImplementedError() def parameters(self): - return self.model.parameters() + return [] + + def eval(self): + pass def __call__( self, @@ -447,16 +462,35 @@ def __call__( return_dict, use_cache, ): - return self.model(input_ids, attention_mask, position_ids, past_key_values) + if self._is_flash: + raise NotImplementedError() + logits, past_key_values = self.model.forward( + input_ids, + position_ids, + cu_seqlens, + max_s, + past_key_values, + pre_allocate_past_size, + ) + else: + logits, past_key_values = self.model.forward(input_ids, attention_mask, position_ids, past_key_values) + return CausalLMOutputWithCrossAttentions( + loss=None, + logits=logits, + past_key_values=past_key_values, + hidden_states=None, + attentions=None, + cross_attentions=None, + ) class TG_Pipeline(Pipeline): def __init__(self, **kwargs): - if self.device != torch.device("cuda"): - raise ValueError(f"Textgen does not support device {self.device}") - super().__init__(**kwargs) + if 
self.device != torch.device("cuda:0"): + raise ValueError(f"Textgen does not support device {self.device}") + def _get_config( self, model_type: Optional[str], @@ -475,7 +509,7 @@ def _save_pretrained(self, pretrained_model: str): raise NotImplementedError() def _load_pretrained(self, pretrained_model: str): - from text_generation_server import get_model + from text_generation_server.models import get_model pretrained_model, revision = parse_revision(pretrained_model) return TextGenModelWrapper(get_model(pretrained_model, revision, False, False)) diff --git a/src/utils.py b/src/utils.py index 9abc913..bf5f227 100644 --- a/src/utils.py +++ b/src/utils.py @@ -149,8 +149,11 @@ def get_inputs_from_tokens(tokens, length, tokenizer): raise RuntimeError("Failed to generate stable input sequences") -def get_random_inputs(length, tokenizer, random_state): - return get_inputs_from_tokens(random_state.randint(0, tokenizer.vocab_size, length).tolist(), length, tokenizer) +def get_random_inputs(lengths, tokenizer, random_state): + return [ + get_inputs_from_tokens(random_state.randint(0, tokenizer.vocab_size, length).tolist(), length, tokenizer) + for length in lengths + ] def get_inputs_from_files(files: List[Path], lengths, tokenizer, random_state): From 2918c96a73df9439a4fab7d5dc886264d409815a Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 1 May 2023 14:58:46 -0400 Subject: [PATCH 06/16] New generate --- src/pipeline.py | 128 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 127 insertions(+), 1 deletion(-) diff --git a/src/pipeline.py b/src/pipeline.py index b0adf37..2b18a2a 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -512,7 +512,7 @@ def _load_pretrained(self, pretrained_model: str): from text_generation_server.models import get_model pretrained_model, revision = parse_revision(pretrained_model) - return TextGenModelWrapper(get_model(pretrained_model, revision, False, False)) + return get_model(pretrained_model, revision, False, False) def _generate_hf(self, inputs: Dict, max_new_tokens: int, use_cache: bool): raise NotImplementedError() @@ -520,6 +520,132 @@ def _generate_hf(self, inputs: Dict, max_new_tokens: int, use_cache: bool): def _allocate_mock_cache(self, past_key_length: int, batch_size: int): raise NotImplementedError() + def _generate_textgen( + self, + batch, + max_new_tokens: int, + use_cache: bool = True, + do_prefill: bool = True, + breakdown_latency: bool = False, + key_length_step: int = 1, + ignore_oom: bool = False, + pad_generated_tokens: float = 0, + ): + t0 = self._get_time(breakdown_latency) + # TODO: Implement + assert do_prefill + assert key_length_step == 1 + assert pad_generated_tokens == 0 + + batch_size = len(batch) + + input_length = max(batch.input_lengths) + output_length = input_length + max_new_tokens + + t1 = self._get_time(breakdown_latency) + last_time = t1 + generate_times = {} + with torch.inference_mode(): + for key_length in range(input_length, output_length, key_length_step): + try: + generated, batch = self.model.generate_token(batch) + t2 = self._get_time(breakdown_latency) + generate_times[key_length] = t2 - last_time + last_time = t2 + except torch.cuda.OutOfMemoryError: + if ignore_oom: + logger.warning(f"Out of memory at key length {None}") + break + else: + raise + output_text = [g.text for g in generated] + + metrics = {} + if breakdown_latency: + metrics[Metrics.LATENCY_GENERATE_START] = t1 - t0 + metrics[Metrics.LATENCY_GENERATE_BREAKDOWN] = generate_times + + return output_text, metrics + + def __call__( + 
self, + text: List[str], + max_new_tokens: int, + custom_generate: bool = False, + use_cache: bool = True, + do_prefill: bool = True, + breakdown_latency=False, + key_length_step: int = 1, + ignore_oom: bool = False, + pad_generated_tokens: float = 0, + ) -> Tuple[List[str], Dict[str, Any]]: + t0 = self._get_time() + inputs = self.tokenizer(text, return_tensors="pt", padding=True) + + from text_generation_server.pb import generate_pb2 + from text_generation_server.models.model import Model + + model: Model = self.model + + batch_pb = generate_pb2.Batch( + id=0, + requests=[ + generate_pb2.Request( + id=i, + inputs=input_, + truncate=99999, + parameters=generate_pb2.NextTokenChooserParameters( + temperature=1.0, + top_k=1, + top_p=1, + typical_p=1, + do_sample=False, + seed=0, + repetition_penalty=1.0, + watermark=False, + ), + stopping_parameters=generate_pb2.StoppingCriteriaParameters( + max_new_tokens=max_new_tokens, + stop_sequences=None, + ignore_eos_token=True, + ), + ) + for i, input_ in enumerate(inputs) + ], + size=len(inputs), + max_tokens=0, # Ignored + ) + batch = model.batch_type.from_pb(batch_pb, self.tokenizer, self.device) + batch_size = len(batch) + + # TODO: Implement + input_length = max(batch.input_lengths) + output_length = input_length + max_new_tokens + + output_text, generate_metrics = self._generate_textgen( + batch, + max_new_tokens, + use_cache, + do_prefill, + breakdown_latency, + key_length_step, + ignore_oom, + pad_generated_tokens, + ) + t1 = self._get_time(True) + + metrics = { + **generate_metrics, + Metrics.BATCH_SIZE: batch_size, + Metrics.INPUT_LENGTH: input_length, + Metrics.OUTPUT_LENGTH: output_length, + Metrics.TOKENS_SAMPLE: output_length - input_length, + Metrics.TOKENS_BATCH: batch_size * (output_length - input_length), + Metrics.LATENCY_E2E: t1 - t0, + } + + return output_text, metrics + _PIPELINE_CLASS_MAP = { "HF_Pipeline": HF_Pipeline, From a09a0e30b75ff4d27896bda7301d97a4db31cca6 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 1 May 2023 18:03:04 -0400 Subject: [PATCH 07/16] improvements --- .../run_all_textgen_benchmark_breakdown.sh | 20 ++++ src/pipeline.py | 96 +++++++++++++++---- 2 files changed, 100 insertions(+), 16 deletions(-) create mode 100755 scripts/run_all_textgen_benchmark_breakdown.sh diff --git a/scripts/run_all_textgen_benchmark_breakdown.sh b/scripts/run_all_textgen_benchmark_breakdown.sh new file mode 100755 index 0000000..ee7ca5d --- /dev/null +++ b/scripts/run_all_textgen_benchmark_breakdown.sh @@ -0,0 +1,20 @@ + +# Santacoder +./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 1 2040 5 0 v2_ +./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 32 2040 5 0 v2_ +./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 256 2040 5 0 v2_ + +./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 1 2040 11 1 v2_ +./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 32 2040 11 1 v2_ +./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 256 2040 11 1 v2_ + +# Large model +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/large-model 1 8190 11 0 v2_ +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/large-model 8 8190 11 0 v2_ +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/large-model 32 8190 11 0 v2_ +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/large-model 
256 8190 11 0 v2_ # OOM? + +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/large-model 1 8190 29 1 v2_ 1 +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/large-model 8 8190 29 1 v2_ 1 +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/large-model 32 8190 29 1 v2_ 1 +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/large-model 256 8190 29 1 v2_ 1 # OOM? diff --git a/src/pipeline.py b/src/pipeline.py index 2b18a2a..af7e8b3 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -75,7 +75,6 @@ def __init__( else: self.model = self._load_pretrained(pretrained_model) - self.model.eval() t3 = self._get_time() self.global_metrics[Metrics.INIT_TOKEN] = t1 - t0 self.global_metrics[Metrics.INIT_CONFIG] = t2 - t1 @@ -101,7 +100,7 @@ def _create_model(self) -> PreTrainedModel: self.global_metrics[Metrics.INIT_DEVICE] = t2 - t1 self.global_metrics[Metrics.INIT_WEIGHTS] = t3 - t2 - return model + return model.eval() def _reload_model(self): self._save_pretrained("tmp") @@ -136,7 +135,7 @@ def _load_pretrained(self, pretrained_model: str) -> PreTrainedModel: model = model.to(self.device) t2 = self._get_time() self.global_metrics[Metrics.INIT_DEVICE] = t2 - t1 - return model + return model.eval() def _get_config( self, @@ -386,8 +385,8 @@ def aggregate_metrics(self, metrics: List[Dict[str, Any]]): breakdown = all_metrics.pop(Metrics.LATENCY_GENERATE_BREAKDOWN, []) mean_metrics = {key: np.mean(value).item() for key, value in all_metrics.items() if len(value) > 0} - throughput = mean_metrics[Metrics.TOKENS_BATCH] / mean_metrics[Metrics.LATENCY_E2E] - model_throughput = mean_metrics[Metrics.TOKENS_BATCH] / mean_metrics[Metrics.LATENCY_MODEL] + throughput = mean_metrics.get(Metrics.TOKENS_BATCH, 0) / mean_metrics.get(Metrics.LATENCY_E2E, 1) + model_throughput = mean_metrics.get(Metrics.TOKENS_BATCH, 0) / mean_metrics.get(Metrics.LATENCY_MODEL, 1) if len(breakdown) > 0: mean_metrics[Metrics.LATENCY_GENERATE_BREAKDOWN] = { @@ -487,10 +486,13 @@ def __call__( class TG_Pipeline(Pipeline): def __init__(self, **kwargs): super().__init__(**kwargs) + # TODO: Ignoring dtype if self.device != torch.device("cuda:0"): raise ValueError(f"Textgen does not support device {self.device}") + self.config = self.model.model.transformer.config + def _get_config( self, model_type: Optional[str], @@ -512,7 +514,9 @@ def _load_pretrained(self, pretrained_model: str): from text_generation_server.models import get_model pretrained_model, revision = parse_revision(pretrained_model) - return get_model(pretrained_model, revision, False, False) + + with fast_init(self.device) if self.fast_init else contextlib.nullcontext(): + return get_model(pretrained_model, revision, False, False) def _generate_hf(self, inputs: Dict, max_new_tokens: int, use_cache: bool): raise NotImplementedError() @@ -520,6 +524,67 @@ def _generate_hf(self, inputs: Dict, max_new_tokens: int, use_cache: bool): def _allocate_mock_cache(self, past_key_length: int, batch_size: int): raise NotImplementedError() + def get_num_parameters(self) -> int: + return 0 + + def _update_generate_batch(self, batch, use_cache, do_prefill, key_length): + from text_generation_server.models.flash_causal_lm import FlashCausalLMBatch + + assert do_prefill or use_cache + + if isinstance(batch, FlashCausalLMBatch): + # Tested for flash santacoder only + assert max(batch.input_lengths) == batch.max_seqlen + seqlen_diff = key_length - batch.max_seqlen + assert seqlen_diff >= 0 + if batch.past_key_values is None: + mock_cache = 
use_cache and not do_prefill + else: + if not use_cache: + batch.past_key_values = None + mock_cache = use_cache and seqlen_diff > 0 + if mock_cache: + batch.past_key_values = [] + + for i, old_length in enumerate(batch.input_lengths): + length = old_length + seqlen_diff + batch.input_lengths[i] = length + batch.max_seqlen = max(batch.max_seqlen, length) + add_tokens = [self.tokenizer.pad_token_id] * seqlen_diff + batch.all_input_ids[i].extend(add_tokens) + batch.all_input_ids_tensor[i][old_length:length] = torch.tensor(add_tokens) + batch.cu_seqlens[(i + 1)] = batch.cu_seqlens[i] + length + + if use_cache and batch.past_key_values is not None: + # Decode + batch.input_ids[i] = batch.all_input_ids_tensor[i][length - 1 : length] + batch.position_ids[i] = length - 1 + if mock_cache: + batch.stopping_criterias[i].current_tokens = max(batch.stopping_criterias[i].current_tokens, 1) + batch.past_key_values.append( + torch.randn( + [self.config.n_layer, length, 2, 1, self.config.n_embd // self.config.n_head], + dtype=self.model.dtype, + device=self.device, + ) + ) + batch.past_key_values.append( + torch.zeros( + [self.config.n_layer, 1, 2, 1, self.config.n_embd // self.config.n_head], + dtype=self.model.dtype, + device=self.device, + ) + ) + else: + # Prefill + batch.input_ids[i] = batch.all_input_ids_tensor[i][:length] + batch.position_ids[i] = torch.arange(0, length, dtype=torch.int32, device=self.device) + + assert batch.max_seqlen == key_length + + else: + raise NotImplementedError() + def _generate_textgen( self, batch, @@ -532,13 +597,10 @@ def _generate_textgen( pad_generated_tokens: float = 0, ): t0 = self._get_time(breakdown_latency) - # TODO: Implement - assert do_prefill - assert key_length_step == 1 + assert do_prefill or use_cache + # TODO: Implement? 
assert pad_generated_tokens == 0 - batch_size = len(batch) - input_length = max(batch.input_lengths) output_length = input_length + max_new_tokens @@ -548,6 +610,9 @@ def _generate_textgen( with torch.inference_mode(): for key_length in range(input_length, output_length, key_length_step): try: + if key_length_step > 1 or not use_cache or not do_prefill: + self._update_generate_batch(batch, use_cache, do_prefill, key_length) + last_time = self._get_time(breakdown_latency) generated, batch = self.model.generate_token(batch) t2 = self._get_time(breakdown_latency) generate_times[key_length] = t2 - last_time @@ -558,7 +623,7 @@ def _generate_textgen( break else: raise - output_text = [g.text for g in generated] + output_text = ["" if g.generated_text is None else g.generated_text.text for g in generated] metrics = {} if breakdown_latency: @@ -580,7 +645,6 @@ def __call__( pad_generated_tokens: float = 0, ) -> Tuple[List[str], Dict[str, Any]]: t0 = self._get_time() - inputs = self.tokenizer(text, return_tensors="pt", padding=True) from text_generation_server.pb import generate_pb2 from text_generation_server.models.model import Model @@ -592,7 +656,7 @@ def __call__( requests=[ generate_pb2.Request( id=i, - inputs=input_, + inputs=t, truncate=99999, parameters=generate_pb2.NextTokenChooserParameters( temperature=1.0, @@ -610,9 +674,9 @@ def __call__( ignore_eos_token=True, ), ) - for i, input_ in enumerate(inputs) + for i, t in enumerate(text) ], - size=len(inputs), + size=len(text), max_tokens=0, # Ignored ) batch = model.batch_type.from_pb(batch_pb, self.tokenizer, self.device) From 9c90421d96ef222aa88e74d30ae2b94c25dace8e Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 3 May 2023 10:40:00 -0400 Subject: [PATCH 08/16] stuff --- scripts/run_all_benchmark_breakdown.sh | 16 ++++++++-------- scripts/run_all_textgen_benchmark_breakdown.sh | 16 ++++++++-------- scripts/run_textgen_benchmark_breakdown.sh | 2 +- src/pipeline.py | 18 +++++++++++++----- 4 files changed, 30 insertions(+), 22 deletions(-) diff --git a/scripts/run_all_benchmark_breakdown.sh b/scripts/run_all_benchmark_breakdown.sh index fbcf7cd..90f80a2 100755 --- a/scripts/run_all_benchmark_breakdown.sh +++ b/scripts/run_all_benchmark_breakdown.sh @@ -9,12 +9,12 @@ ./scripts/run_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 256 2040 11 1 v2_ # Large model -./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 1 8190 11 0 v2_ -./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 8 8190 11 0 v2_ -./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 32 8190 11 0 v2_ -./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 256 8190 11 0 v2_ # OOM? +./scripts/run_benchmark_breakdown.sh large_model ./data/bigcode_large-model 1 8190 11 0 v2_ +./scripts/run_benchmark_breakdown.sh large_model ./data/bigcode_large-model 8 8190 11 0 v2_ +./scripts/run_benchmark_breakdown.sh large_model ./data/bigcode_large-model 32 8190 11 0 v2_ +./scripts/run_benchmark_breakdown.sh large_model ./data/bigcode_large-model 256 8190 11 0 v2_ # OOM? -./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 1 8190 29 1 v2_ 1 -./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 8 8190 29 1 v2_ 1 -./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 32 8190 29 1 v2_ 1 -./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 256 8190 29 1 v2_ 1 # OOM? 
+./scripts/run_benchmark_breakdown.sh large_model ./data/bigcode_large-model 1 8190 29 1 v2_ 1 +./scripts/run_benchmark_breakdown.sh large_model ./data/bigcode_large-model 8 8190 29 1 v2_ 1 +./scripts/run_benchmark_breakdown.sh large_model ./data/bigcode_large-model 32 8190 29 1 v2_ 1 +./scripts/run_benchmark_breakdown.sh large_model ./data/bigcode_large-model 256 8190 29 1 v2_ 1 # OOM? diff --git a/scripts/run_all_textgen_benchmark_breakdown.sh b/scripts/run_all_textgen_benchmark_breakdown.sh index ee7ca5d..be8fb1c 100755 --- a/scripts/run_all_textgen_benchmark_breakdown.sh +++ b/scripts/run_all_textgen_benchmark_breakdown.sh @@ -9,12 +9,12 @@ ./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 256 2040 11 1 v2_ # Large model -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/large-model 1 8190 11 0 v2_ -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/large-model 8 8190 11 0 v2_ -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/large-model 32 8190 11 0 v2_ -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/large-model 256 8190 11 0 v2_ # OOM? +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 1 8190 11 0 v2_ +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 8 8190 11 0 v2_ +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 32 8190 11 0 v2_ +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 256 8190 11 0 v2_ # OOM? -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/large-model 1 8190 29 1 v2_ 1 -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/large-model 8 8190 29 1 v2_ 1 -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/large-model 32 8190 29 1 v2_ 1 -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/large-model 256 8190 29 1 v2_ 1 # OOM? +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 1 8190 29 1 v2_ 1 +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 8 8190 29 1 v2_ 1 +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 32 8190 29 1 v2_ 1 +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 256 8190 29 1 v2_ 1 # OOM? 
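For reference, the result files produced by these sweeps follow the FILE_NAME pattern built by run() in run_textgen_benchmark_breakdown.sh (the v2 layout from PATCH 05; PATCH 09 later drops FILE_PREFIX and reorganizes the directory name), and a configuration is skipped when its output already exists. A minimal Python sketch of that layout and guard, where the helper name result_path and the concrete values are illustrative only:

    from pathlib import Path

    # v2 layout: <SAVE_DIR>/<model>_bs_<batch>_tok_<new_tokens>_step_<token_step>_<phase>/<prefix><runtime>_<attn>.json
    def result_path(save_dir, model, batch_size, max_new_tokens, token_step, phase, prefix, runtime, attn):
        run_dir = Path(save_dir) / f"{model}_bs_{batch_size}_tok_{max_new_tokens}_step_{token_step}_{phase}"
        return run_dir / f"{prefix}{runtime}_{attn}.json"

    # Example: the santacoder decode run at batch size 32 from the sweep above.
    path = result_path("data/benchmarks/v2", "santacoder", 32, 2040, 5, "decode", "v2_", "base", "textgen")
    if path.is_file():
        print(f"Skipping existing {path}")  # same guard as the shell script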
diff --git a/scripts/run_textgen_benchmark_breakdown.sh b/scripts/run_textgen_benchmark_breakdown.sh index 60ed54f..310c9b9 100755 --- a/scripts/run_textgen_benchmark_breakdown.sh +++ b/scripts/run_textgen_benchmark_breakdown.sh @@ -15,7 +15,7 @@ CYCLES=${8:-10} SAVE_DIR=data/benchmarks/v2 #BATCH_SIZES="1 2 4 8 16 24 32 48 64 96 128 160 224 256" -RUN="python3 src/main.py --max_log_outputs=0 --dtype=float16 --device=cuda --custom_generate --breakdown_latency --ignore_oom" +RUN="python3 src/main.py --max_log_outputs=0 --dtype=float16 --device=cuda --custom_generate --breakdown_latency --ignore_oom --no_fast_init" RUNTIME=("") diff --git a/src/pipeline.py b/src/pipeline.py index af7e8b3..4d27c06 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -534,9 +534,11 @@ def _update_generate_batch(self, batch, use_cache, do_prefill, key_length): if isinstance(batch, FlashCausalLMBatch): # Tested for flash santacoder only + # TODO: Fix batch size 1 assert max(batch.input_lengths) == batch.max_seqlen seqlen_diff = key_length - batch.max_seqlen assert seqlen_diff >= 0 + kv_shape = [2, 1, self.config.n_embd // self.config.n_head] if batch.past_key_values is None: mock_cache = use_cache and not do_prefill else: @@ -544,7 +546,14 @@ def _update_generate_batch(self, batch, use_cache, do_prefill, key_length): batch.past_key_values = None mock_cache = use_cache and seqlen_diff > 0 if mock_cache: - batch.past_key_values = [] + if len(batch.input_lengths) > 1: + batch.past_key_values = [] + else: + batch.past_key_values = torch.randn( + [self.config.n_layer, batch.max_tokens, *kv_shape], + dtype=self.model.dtype, + device=self.device, + ) for i, old_length in enumerate(batch.input_lengths): length = old_length + seqlen_diff @@ -559,18 +568,18 @@ def _update_generate_batch(self, batch, use_cache, do_prefill, key_length): # Decode batch.input_ids[i] = batch.all_input_ids_tensor[i][length - 1 : length] batch.position_ids[i] = length - 1 - if mock_cache: + if mock_cache and len(batch.input_lengths) > 1: batch.stopping_criterias[i].current_tokens = max(batch.stopping_criterias[i].current_tokens, 1) batch.past_key_values.append( torch.randn( - [self.config.n_layer, length, 2, 1, self.config.n_embd // self.config.n_head], + [self.config.n_layer, length, *kv_shape], dtype=self.model.dtype, device=self.device, ) ) batch.past_key_values.append( torch.zeros( - [self.config.n_layer, 1, 2, 1, self.config.n_embd // self.config.n_head], + [self.config.n_layer, 1, *kv_shape], dtype=self.model.dtype, device=self.device, ) @@ -660,7 +669,6 @@ def __call__( truncate=99999, parameters=generate_pb2.NextTokenChooserParameters( temperature=1.0, - top_k=1, top_p=1, typical_p=1, do_sample=False, From c77d79f8030a03a52b030c4a45b377f0d663b117 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 19 May 2023 20:36:07 -0400 Subject: [PATCH 09/16] stuff --- .../run_all_textgen_benchmark_breakdown.sh | 28 ++++++------- scripts/run_textgen_benchmark_breakdown.sh | 42 ++++++++----------- src/main.py | 2 +- src/pipeline.py | 4 +- 4 files changed, 36 insertions(+), 40 deletions(-) diff --git a/scripts/run_all_textgen_benchmark_breakdown.sh b/scripts/run_all_textgen_benchmark_breakdown.sh index be8fb1c..0fb0eab 100755 --- a/scripts/run_all_textgen_benchmark_breakdown.sh +++ b/scripts/run_all_textgen_benchmark_breakdown.sh @@ -1,20 +1,20 @@ # Santacoder -./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 1 2040 5 0 v2_ -./scripts/run_textgen_benchmark_breakdown.sh santacoder 
bigcode/gpt_bigcode-santacoder 32 2040 5 0 v2_ -./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 256 2040 5 0 v2_ +./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 1 2040 5 0 +./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 32 2040 5 0 +./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 256 2040 5 0 -./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 1 2040 11 1 v2_ -./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 32 2040 11 1 v2_ -./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 256 2040 11 1 v2_ +./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 1 2040 11 1 +./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 32 2040 11 1 +./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 256 2040 11 1 # Large model -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 1 8190 11 0 v2_ -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 8 8190 11 0 v2_ -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 32 8190 11 0 v2_ -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 256 8190 11 0 v2_ # OOM? +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 1 8190 11 0 +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 8 8190 11 0 +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 32 8190 11 0 +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 256 8190 11 0 # OOM? -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 1 8190 29 1 v2_ 1 -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 8 8190 29 1 v2_ 1 -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 32 8190 29 1 v2_ 1 -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 256 8190 29 1 v2_ 1 # OOM? +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 1 8190 29 1 1 +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 8 8190 29 1 1 +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 32 8190 29 1 1 +./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 256 8190 29 1 1 # OOM? diff --git a/scripts/run_textgen_benchmark_breakdown.sh b/scripts/run_textgen_benchmark_breakdown.sh index 310c9b9..3d6827d 100755 --- a/scripts/run_textgen_benchmark_breakdown.sh +++ b/scripts/run_textgen_benchmark_breakdown.sh @@ -10,47 +10,41 @@ MAX_NEW_TOKENS=${4:-2040} # Prime number to see key length padding effect. 
TOKEN_STEP=${5:-5} STEP_ID=${6:-""} -FILE_PREFIX=${7:-""} -CYCLES=${8:-10} +CYCLES=${7:-10} -SAVE_DIR=data/benchmarks/v2 +SAVE_DIR=data/benchmarks/v3 #BATCH_SIZES="1 2 4 8 16 24 32 48 64 96 128 160 224 256" -RUN="python3 src/main.py --max_log_outputs=0 --dtype=float16 --device=cuda --custom_generate --breakdown_latency --ignore_oom --no_fast_init" +RUN="python3 src/main.py --pipeline_class=TG_Pipeline --max_log_outputs=0 --dtype=float16 --device=cuda --custom_generate --breakdown_latency --ignore_oom --no_fast_init " -RUNTIME=("") -RUNTIME_NAMES=("base") - -ATTN=( \ - "--pipeline_class=TG_Pipeline" \ - ) -ATTN_NAME=( \ - "textgen" \ - ) +IMPL=("flash" "santa" "causal" "vector" "bigcode") STEP=("--no_prefill" "--no_cache") STEP_NAME=("decode" "prefill") -COMMON="--pretrained_model=$MODEL_PATH --tokenizer=$MODEL_PATH --cycles=$CYCLES --max_input_length=1 --max_new_tokens=$MAX_NEW_TOKENS --key_length_step=$TOKEN_STEP --batch_size=$BATCH_SIZE predict_last_token=True" +COMMON="--pretrained_model=$MODEL_PATH --tokenizer=$MODEL_PATH --cycles=$CYCLES --max_input_length=1 --max_new_tokens=$MAX_NEW_TOKENS --key_length_step=$TOKEN_STEP --batch_size=$BATCH_SIZE" run () { # run(step, runtime, attn) - FILE_NAME="$SAVE_DIR"/"$MODEL_NAME"_bs_"$BATCH_SIZE"_tok_"$MAX_NEW_TOKENS"_step_"$TOKEN_STEP"_"${STEP_NAME[$1]}"/"$FILE_PREFIX""${RUNTIME_NAMES[$2]}"_"${ATTN_NAME[$3]}".json + FILE_NAME="$SAVE_DIR"/"$MODEL_NAME"_bs_"$BATCH_SIZE"_tok_"$MAX_NEW_TOKENS"_"${STEP_NAME[$1]}"_step_"$TOKEN_STEP"_"$CYCLES"/"${IMPL[$2]}".json if [ -f "$FILE_NAME" ]; then echo "Skipping existing $FILE_NAME" else - CMD="$RUN $COMMON ${RUNTIME[$2]} ${ATTN[$3]} ${STEP[$1]} --save=$FILE_NAME" + CMD="MODEL_TYPE=${IMPL[$2]} $RUN $COMMON ${STEP[$1]} --save=$FILE_NAME" echo "$CMD" $CMD fi } -if [ "${STEP_ID}" -eq "0" ] -then - # Decode (default attn only) - run 0 0 0 -else - # Prefill - run 1 0 0 -fi +for impl in {0..4} +do + if [ "${STEP_ID}" -eq "0" ] + then + # Decode (default attn only) + run 0 $impl + else + # Prefill + run 1 $impl + fi +done diff --git a/src/main.py b/src/main.py index e42b929..9547c20 100644 --- a/src/main.py +++ b/src/main.py @@ -179,10 +179,10 @@ def main(argv: Optional[List[str]] = None) -> None: benchmark_metrics[Metrics.MEMORY_RESERVED_MAX] = torch.cuda.max_memory_reserved() t3 = time.perf_counter() - benchmark_metrics[Metrics.RUNTIME_BENCHMARK] = t3 - t2 benchmark_metrics[Metrics.RUNTIME_TOTAL] = t3 - t0 if len(all_metrics) > 0: + benchmark_metrics[Metrics.RUNTIME_BENCHMARK] = t3 - t2 benchmark_metrics.update(pipeline.aggregate_metrics(all_metrics)) benchmark_metrics = Metrics.reorder_metrics(benchmark_metrics) diff --git a/src/pipeline.py b/src/pipeline.py index 4d27c06..60195af 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -516,7 +516,7 @@ def _load_pretrained(self, pretrained_model: str): pretrained_model, revision = parse_revision(pretrained_model) with fast_init(self.device) if self.fast_init else contextlib.nullcontext(): - return get_model(pretrained_model, revision, False, False) + return get_model(pretrained_model, revision, False, None) def _generate_hf(self, inputs: Dict, max_new_tokens: int, use_cache: bool): raise NotImplementedError() @@ -716,6 +716,8 @@ def __call__( Metrics.LATENCY_E2E: t1 - t0, } + output_text=[i+o for i, o in zip(text, output_text)] + return output_text, metrics From 2ad418ca1b331528f05b2eff8b8f30112f0c2071 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 23 May 2023 14:27:31 -0400 Subject: [PATCH 10/16] Use model fast forward --- 
scripts/run_all_benchmark_breakdown.sh | 16 ++++++++-------- scripts/run_textgen_benchmark_breakdown.sh | 13 +++++++------ src/parse_breakdown_results.py | 4 ++-- src/pipeline.py | 6 ++++-- 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/scripts/run_all_benchmark_breakdown.sh b/scripts/run_all_benchmark_breakdown.sh index 90f80a2..2fb7df2 100755 --- a/scripts/run_all_benchmark_breakdown.sh +++ b/scripts/run_all_benchmark_breakdown.sh @@ -9,12 +9,12 @@ ./scripts/run_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 256 2040 11 1 v2_ # Large model -./scripts/run_benchmark_breakdown.sh large_model ./data/bigcode_large-model 1 8190 11 0 v2_ -./scripts/run_benchmark_breakdown.sh large_model ./data/bigcode_large-model 8 8190 11 0 v2_ -./scripts/run_benchmark_breakdown.sh large_model ./data/bigcode_large-model 32 8190 11 0 v2_ -./scripts/run_benchmark_breakdown.sh large_model ./data/bigcode_large-model 256 8190 11 0 v2_ # OOM? +./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 1 8190 11 0 v2_ +./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 8 8190 11 0 v2_ +./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 32 8190 11 0 v2_ +./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 256 8190 11 0 v2_ # OOM? -./scripts/run_benchmark_breakdown.sh large_model ./data/bigcode_large-model 1 8190 29 1 v2_ 1 -./scripts/run_benchmark_breakdown.sh large_model ./data/bigcode_large-model 8 8190 29 1 v2_ 1 -./scripts/run_benchmark_breakdown.sh large_model ./data/bigcode_large-model 32 8190 29 1 v2_ 1 -./scripts/run_benchmark_breakdown.sh large_model ./data/bigcode_large-model 256 8190 29 1 v2_ 1 # OOM? +./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 1 8190 29 1 v2_ 1 +./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 8 8190 29 1 v2_ 1 +./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 32 8190 29 1 v2_ 1 +./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 256 8190 29 1 v2_ 1 # OOM? 
diff --git a/scripts/run_textgen_benchmark_breakdown.sh b/scripts/run_textgen_benchmark_breakdown.sh index 3d6827d..826d447 100755 --- a/scripts/run_textgen_benchmark_breakdown.sh +++ b/scripts/run_textgen_benchmark_breakdown.sh @@ -14,13 +14,13 @@ CYCLES=${7:-10} SAVE_DIR=data/benchmarks/v3 #BATCH_SIZES="1 2 4 8 16 24 32 48 64 96 128 160 224 256" -RUN="python3 src/main.py --pipeline_class=TG_Pipeline --max_log_outputs=0 --dtype=float16 --device=cuda --custom_generate --breakdown_latency --ignore_oom --no_fast_init " +RUN="python3 -m src.main --pipeline_class=TG_Pipeline --max_log_outputs=0 --dtype=float16 --device=cuda --custom_generate --breakdown_latency --ignore_oom --no_fast_init " -IMPL=("flash" "santa" "causal" "vector" "bigcode") +IMPL=("flash" "causal" "vector" "bigcode") -STEP=("--no_prefill" "--no_cache") +STEP=("" "--no_cache") STEP_NAME=("decode" "prefill") COMMON="--pretrained_model=$MODEL_PATH --tokenizer=$MODEL_PATH --cycles=$CYCLES --max_input_length=1 --max_new_tokens=$MAX_NEW_TOKENS --key_length_step=$TOKEN_STEP --batch_size=$BATCH_SIZE" @@ -31,13 +31,14 @@ run () { # run(step, runtime, attn) then echo "Skipping existing $FILE_NAME" else - CMD="MODEL_TYPE=${IMPL[$2]} $RUN $COMMON ${STEP[$1]} --save=$FILE_NAME" - echo "$CMD" + export MODEL_TYPE="${IMPL[$2]}" + CMD="$RUN $COMMON ${STEP[$1]} --save=$FILE_NAME" + echo "MODEL_TYPE=${IMPL[$2]} $CMD" $CMD fi } -for impl in {0..4} +for impl in {0..3} do if [ "${STEP_ID}" -eq "0" ] then diff --git a/src/parse_breakdown_results.py b/src/parse_breakdown_results.py index 4c281cf..d8380cc 100644 --- a/src/parse_breakdown_results.py +++ b/src/parse_breakdown_results.py @@ -62,8 +62,8 @@ def main(argv: Optional[List[str]] = None) -> None: dirname = args.input_dir.stem if title is None: try: - name, _, bs, _, _, _, _, step = dirname.rsplit("_", 7) - title = f"{name} {step}, bs = {bs}" + name, _, bs, _, _, _, _, step, cycles = dirname.rsplit("_", 8) + title = f"{name}, bs = {bs} (s={step}, c={cycles})" except ValueError: title = dirname diff --git a/src/pipeline.py b/src/pipeline.py index 60195af..fbf4992 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -619,8 +619,10 @@ def _generate_textgen( with torch.inference_mode(): for key_length in range(input_length, output_length, key_length_step): try: - if key_length_step > 1 or not use_cache or not do_prefill: - self._update_generate_batch(batch, use_cache, do_prefill, key_length) + if (key_length_step > 1 and key_length>key_length) or not use_cache or not do_prefill: + if not hasattr(self.model,"fast_forward"): + raise NotImplementedError() + self.model.fast_forward(batch, key_length, use_cache) last_time = self._get_time(breakdown_latency) generated, batch = self.model.generate_token(batch) t2 = self._get_time(breakdown_latency) From fd75d7c40f2627a01cdb32e7db84f334395d915f Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 24 May 2023 15:04:36 -0400 Subject: [PATCH 11/16] Cpu profile --- src/main.py | 10 +++++++--- src/pipeline.py | 6 +++--- src/profile.py | 20 ++++++++++++-------- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/src/main.py b/src/main.py index 9547c20..f349649 100644 --- a/src/main.py +++ b/src/main.py @@ -58,6 +58,7 @@ def get_arg_parser() -> ArgumentParser: parser.add_argument("--max_log_outputs", type=int) parser.add_argument("--breakdown_latency", "--bl", action="store_true") parser.add_argument("--profile", "-p", action="store_true") + parser.add_argument("--profile_cpu", "--pcpu", action="store_true") 
parser.add_argument("--profile_cycles", "--pc", type=int) parser.add_argument("--full_trace", "--pt", action="store_true") parser.add_argument("--show_op_names", "--pn", action="store_true") @@ -108,13 +109,16 @@ def main(argv: Optional[List[str]] = None) -> None: all_metrics = [] - if args.profile: + profile = args.profile or args.profile_cpu + + if profile: profiler = get_profiler( skip=args.skip + pre_warmup_cycles, warmup=warmup, cycles=post_warmup_cycles, full_trace=args.full_trace, show_op_names=args.show_op_names, + cpu=args.profile_cpu, ) else: profiler = contextlib.nullcontext() @@ -125,7 +129,7 @@ def main(argv: Optional[List[str]] = None) -> None: "Cycles (warmup)": args.skip + warmup, "Cycles (benchmark)": args.cycles, } - if args.profile: + if profile: benchmark_metrics["Cycles (profile)"] = post_warmup_cycles benchmark_metrics["Cycles (total)"] = args.skip + warmup + pre_warmup_cycles + post_warmup_cycles @@ -158,7 +162,7 @@ def main(argv: Optional[List[str]] = None) -> None: ignore_oom=args.ignore_oom, pad_generated_tokens=args.pad_generated_tokens, ) - if args.profile: + if profile: p.step() if step == 0: diff --git a/src/pipeline.py b/src/pipeline.py index fbf4992..3137ad4 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -619,8 +619,8 @@ def _generate_textgen( with torch.inference_mode(): for key_length in range(input_length, output_length, key_length_step): try: - if (key_length_step > 1 and key_length>key_length) or not use_cache or not do_prefill: - if not hasattr(self.model,"fast_forward"): + if (key_length_step > 1 and key_length > key_length) or not use_cache or not do_prefill: + if not hasattr(self.model, "fast_forward"): raise NotImplementedError() self.model.fast_forward(batch, key_length, use_cache) last_time = self._get_time(breakdown_latency) @@ -718,7 +718,7 @@ def __call__( Metrics.LATENCY_E2E: t1 - t0, } - output_text=[i+o for i, o in zip(text, output_text)] + output_text = [i + o for i, o in zip(text, output_text)] return output_text, metrics diff --git a/src/profile.py b/src/profile.py index 27486ba..b58422e 100644 --- a/src/profile.py +++ b/src/profile.py @@ -10,31 +10,33 @@ logger = logging.getLogger(__name__) -def get_trace_fn(full_trace: bool = False, show_op_names: bool = False, rank: int = -1): +def get_trace_fn(full_trace: bool = False, show_op_names: bool = False, rank: int = -1, cpu: bool = False): def trace_fn( p: torch.profiler.profile, ): averages = p.key_averages() + var_name = f"self_{'cpu' if cpu else 'cuda'}_time_total" if full_trace: # Show every GPU op. # Exclude CPU cuda ops to shorten the table. events = torch.autograd.profiler.EventList( - [evt for evt in p.profiler.function_events if evt.self_cuda_time_total > 0] + [evt for evt in p.profiler.function_events if getattr(evt, var_name) > 0] ) log_rank_n(events.table(row_limit=-1, max_src_column_width=1000), logger.info, rank) if show_op_names: # Show non-cropped names, in the same order as in the table. 
averages_sorted = torch.autograd.profiler.EventList( - sorted(averages, key=lambda evt: evt.self_cuda_time_total, reverse=True) + sorted(averages, key=lambda evt: getattr(evt, var_name), reverse=True) ) for entry in averages_sorted: log_rank_n(entry.key, logger.info, rank) # Try to avoid name cropping, still hard-coded to max 55 characters - log_rank_n( - averages.table(sort_by="self_cuda_time_total", row_limit=-1, max_src_column_width=1000), logger.info, rank - ) + log_rank_n(averages.table(sort_by=var_name, row_limit=-1, max_src_column_width=1000), logger.info, rank) + + # Store results for future use. + p.bc_profile_result = p.profiler.function_events return trace_fn @@ -45,6 +47,7 @@ def get_profiler( cycles: int, full_trace: bool = False, show_op_names: bool = False, + cpu=False, ) -> Union[torch.profiler.profile, contextlib.nullcontext]: schedule = torch.profiler.schedule( # Warmup is a must if measuring speed as it's when all the optimizations are performed @@ -57,6 +60,7 @@ def get_profiler( ) return torch.profiler.profile( schedule=schedule, - activities=[torch.profiler.ProfilerActivity.CUDA], - on_trace_ready=get_trace_fn(full_trace, show_op_names), + activities=[torch.profiler.ProfilerActivity.CPU if cpu else torch.profiler.ProfilerActivity.CUDA], + on_trace_ready=get_trace_fn(full_trace, show_op_names, cpu=cpu), + with_modules=True, ) From 1b00dcc97a2845b63a444b4f8ff2b065817d2e76 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 25 May 2023 15:51:52 -0400 Subject: [PATCH 12/16] fix --- scripts/run_textgen_benchmark_breakdown.sh | 4 ++-- src/pipeline.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/run_textgen_benchmark_breakdown.sh b/scripts/run_textgen_benchmark_breakdown.sh index 826d447..61d08eb 100755 --- a/scripts/run_textgen_benchmark_breakdown.sh +++ b/scripts/run_textgen_benchmark_breakdown.sh @@ -17,7 +17,7 @@ SAVE_DIR=data/benchmarks/v3 RUN="python3 -m src.main --pipeline_class=TG_Pipeline --max_log_outputs=0 --dtype=float16 --device=cuda --custom_generate --breakdown_latency --ignore_oom --no_fast_init " -IMPL=("flash" "causal" "vector" "bigcode") +IMPL=("flash" "causal" "vector" "bigcode" "bigcode2") STEP=("" "--no_cache") @@ -38,7 +38,7 @@ run () { # run(step, runtime, attn) fi } -for impl in {0..3} +for impl in {0..4} do if [ "${STEP_ID}" -eq "0" ] then diff --git a/src/pipeline.py b/src/pipeline.py index 3137ad4..c5f9982 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -619,10 +619,10 @@ def _generate_textgen( with torch.inference_mode(): for key_length in range(input_length, output_length, key_length_step): try: - if (key_length_step > 1 and key_length > key_length) or not use_cache or not do_prefill: + if (key_length_step > 1 and key_length > input_length) or not use_cache or not do_prefill: if not hasattr(self.model, "fast_forward"): raise NotImplementedError() - self.model.fast_forward(batch, key_length, use_cache) + self.model.fast_forward(batch, key_length, self.dtype if use_cache else None) last_time = self._get_time(breakdown_latency) generated, batch = self.model.generate_token(batch) t2 = self._get_time(breakdown_latency) From 07e3b221ea4d42da241da6d007d27d616e3ec6df Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 25 May 2023 15:53:34 -0400 Subject: [PATCH 13/16] update --- scripts/run_textgen_benchmark_breakdown.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_textgen_benchmark_breakdown.sh b/scripts/run_textgen_benchmark_breakdown.sh index 61d08eb..550e9d8 
100755 --- a/scripts/run_textgen_benchmark_breakdown.sh +++ b/scripts/run_textgen_benchmark_breakdown.sh @@ -12,7 +12,7 @@ TOKEN_STEP=${5:-5} STEP_ID=${6:-""} CYCLES=${7:-10} -SAVE_DIR=data/benchmarks/v3 +SAVE_DIR=data/benchmarks/v4 #BATCH_SIZES="1 2 4 8 16 24 32 48 64 96 128 160 224 256" RUN="python3 -m src.main --pipeline_class=TG_Pipeline --max_log_outputs=0 --dtype=float16 --device=cuda --custom_generate --breakdown_latency --ignore_oom --no_fast_init " From a624f60a09cde5e6289cd870669b1bd1e0a2f6e4 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 25 May 2023 19:22:36 -0400 Subject: [PATCH 14/16] Rolling average --- src/parse_breakdown_results.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/parse_breakdown_results.py b/src/parse_breakdown_results.py index d8380cc..af6a362 100644 --- a/src/parse_breakdown_results.py +++ b/src/parse_breakdown_results.py @@ -2,6 +2,8 @@ from argparse import ArgumentParser from pathlib import Path from typing import List, Optional +import matplotlib.pyplot as plt +import pandas as pd def get_arg_parser() -> ArgumentParser: @@ -10,6 +12,7 @@ def get_arg_parser() -> ArgumentParser: parser.add_argument("--title") parser.add_argument("--size", nargs=2, type=float) parser.add_argument("--save_dir", "--save", type=Path) + parser.add_argument("--rolling", "-r", type=int) return parser @@ -24,9 +27,7 @@ def read_data(input_file: Path): return data -def plot(data, title=None, size=None): - import matplotlib.pyplot as plt - +def plot(data, title=None, size=None, rolling=None): fig = plt.figure(figsize=size) ax = fig.add_subplot() @@ -34,10 +35,11 @@ def plot(data, title=None, size=None): cmap = cmap[::2] + cmap[1::2] for i, dat in enumerate(data): - latency_data = dat["Latency (generate breakdown)"] + latency_data = pd.Series({int(k): v * 1000 for k, v in dat["Latency (generate breakdown)"].items()}) + if rolling is not None: + latency_data = latency_data.rolling(rolling, center=True, min_periods=1).mean() ax.plot( - [int(k) for k in latency_data.keys()], - [v * 1000 for v in latency_data.values()], + latency_data, label=dat["Setting"], linewidth=1, color=cmap[i], @@ -67,7 +69,7 @@ def main(argv: Optional[List[str]] = None) -> None: except ValueError: title = dirname - fig = plot(data, title, args.size) + fig = plot(data, title, args.size, args.rolling) fig.show() if args.save_dir: save_path = (args.save_dir / dirname).with_suffix(".jpg") From fa981e956cea8f00cb44ebb4ae32ca990e5aaf2a Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 26 May 2023 09:41:14 -0400 Subject: [PATCH 15/16] fix --- src/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipeline.py b/src/pipeline.py index c5f9982..b5c33a1 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -630,7 +630,7 @@ def _generate_textgen( last_time = t2 except torch.cuda.OutOfMemoryError: if ignore_oom: - logger.warning(f"Out of memory at key length {None}") + logger.warning(f"Out of memory at key length {key_length}") break else: raise From e321062ab006597547b4cc1f7bb26c6e9a94ad0e Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Sun, 28 May 2023 21:06:13 -0400 Subject: [PATCH 16/16] fix --- scripts/run_all_textgen_benchmark_breakdown.sh | 16 ++++++++-------- scripts/run_textgen_benchmark_breakdown.sh | 7 +++---- src/pipeline.py | 2 +- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/scripts/run_all_textgen_benchmark_breakdown.sh b/scripts/run_all_textgen_benchmark_breakdown.sh index 0fb0eab..d5de265 
100755 --- a/scripts/run_all_textgen_benchmark_breakdown.sh +++ b/scripts/run_all_textgen_benchmark_breakdown.sh @@ -9,12 +9,12 @@ ./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 256 2040 11 1 # Large model -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 1 8190 11 0 -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 8 8190 11 0 -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 32 8190 11 0 -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 256 8190 11 0 # OOM? +./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 1 8190 11 0 +./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 8 8190 11 0 +./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 32 8190 11 0 +./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 256 8190 11 0 # OOM? -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 1 8190 29 1 1 -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 8 8190 29 1 1 -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 32 8190 29 1 1 -./scripts/run_textgen_benchmark_breakdown.sh large_model ./data/bigcode_large-model 256 8190 29 1 1 # OOM? +./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 1 8190 29 1 1 +./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 8 8190 29 1 1 +./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 32 8190 29 1 1 +./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 256 8190 29 1 1 # OOM? diff --git a/scripts/run_textgen_benchmark_breakdown.sh b/scripts/run_textgen_benchmark_breakdown.sh index 550e9d8..998344c 100755 --- a/scripts/run_textgen_benchmark_breakdown.sh +++ b/scripts/run_textgen_benchmark_breakdown.sh @@ -12,12 +12,11 @@ TOKEN_STEP=${5:-5} STEP_ID=${6:-""} CYCLES=${7:-10} -SAVE_DIR=data/benchmarks/v4 -#BATCH_SIZES="1 2 4 8 16 24 32 48 64 96 128 160 224 256" +SAVE_DIR=data/benchmarks/v5 RUN="python3 -m src.main --pipeline_class=TG_Pipeline --max_log_outputs=0 --dtype=float16 --device=cuda --custom_generate --breakdown_latency --ignore_oom --no_fast_init " -IMPL=("flash" "causal" "vector" "bigcode" "bigcode2") +IMPL=("flash" "causal" "vector" "bigcode" "bigcode2" "bigcode3") STEP=("" "--no_cache") @@ -38,7 +37,7 @@ run () { # run(step, runtime, attn) fi } -for impl in {0..4} +for impl in {0..5} do if [ "${STEP_ID}" -eq "0" ] then diff --git a/src/pipeline.py b/src/pipeline.py index b5c33a1..a9fb08f 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -491,7 +491,7 @@ def __init__(self, **kwargs): if self.device != torch.device("cuda:0"): raise ValueError(f"Textgen does not support device {self.device}") - self.config = self.model.model.transformer.config + self.config = getattr(self.model, "config", None) or self.model.model.transformer.config def _get_config( self,
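As a small, self-contained illustration of the smoothing added to parse_breakdown_results.py in PATCH 14: the per-key-length latencies from "Latency (generate breakdown)" are loaded into a pandas Series and, when --rolling/-r is given, averaged over a centered window with min_periods=1 so the edges keep partial windows instead of turning into NaN. The latency values below are made up for demonstration:

    import pandas as pd

    # Illustrative per-token latencies (seconds) keyed by key length, i.e. the
    # "Latency (generate breakdown)" entries written by --breakdown_latency runs.
    breakdown = {"100": 0.0091, "101": 0.0093, "102": 0.0120, "103": 0.0092, "104": 0.0094}

    # Same conversion as plot(): integer keys, milliseconds.
    latency_ms = pd.Series({int(k): v * 1000 for k, v in breakdown.items()})

    # Same call as plot(..., rolling=3): centered 3-point window, partial windows at the edges.
    print(latency_ms.rolling(3, center=True, min_periods=1).mean())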