[Tests] Fixed HF warning (#1478)
```
test_continuous_batching.py: 7 warnings
test_kv_cache_eviction.py: 2 warnings
test_llm_pipeline.py: 6 warnings
test_sampling.py: 11 warnings
  /venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:774: UserWarning: `return_dict_in_generate` is NOT set to `True`, but `output_scores` is. When `return_dict_in_generate` is not `True`, `output_scores` is ignored.
    warnings.warn(
```
ilya-lavrenov authored Jan 6, 2025
1 parent cb6b68e commit ded4c3d
Showing 2 changed files with 11 additions and 9 deletions.
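
The warning comes from the Hugging Face `GenerationConfig` validation: `output_scores=True` only has an effect when `return_dict_in_generate=True` is set on the same config, and the validation inspects the config itself, so the warnings above fired even though the helpers passed the flag to each `generate()` call. The change below therefore sets the flag once in the config produced by `convert_to_hf` and drops the per-call argument; the call sites then read `.sequences` from the structured output. A minimal, self-contained sketch of the behaviour, with a placeholder model and prompt that are not taken from this commit:

```python
# Minimal sketch of the warning and its fix; "gpt2" and the prompt are
# illustrative placeholders, not values used by this commit.
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

config = GenerationConfig(
    max_new_tokens=8,
    output_scores=True,            # requesting per-step scores ...
    return_dict_in_generate=True,  # ... only takes effect (warning-free) with this set too
)

inputs = tokenizer("Hello", return_tensors="pt")
outputs = model.generate(**inputs, generation_config=config)

# With return_dict_in_generate=True the result is a ModelOutput, not a plain
# tensor: token ids are in .sequences and per-step scores in .scores.
generated = outputs.sequences[0, inputs["input_ids"].shape[1]:]
print(tokenizer.decode(generated, skip_special_tokens=True))
print(len(outputs.scores))  # one score tensor per generated token
```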
tests/python_tests/common.py (8 changes: 4 additions & 4 deletions)
```diff
@@ -179,6 +179,8 @@ def convert_to_hf(
         return
 
     kwargs = {}
+    kwargs['return_dict_in_generate'] = True
+
     # generic parameters
     kwargs['max_length'] = generation_config.max_length
     # has higher priority than 'max_length'
@@ -253,8 +255,7 @@ def run_hugging_face(
             input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']
             prompt_len = 0 if generation_config.echo else input_ids.numel()
 
-            generate_outputs = opt_model.generate(input_ids=input_ids, attention_mask=attention_mask, generation_config=hf_generation_config,
-                                                  return_dict_in_generate=True, tokenizer=hf_tokenizer)
+            generate_outputs = opt_model.generate(input_ids=input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer)
             all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True)
 
             generation_result = GenerationResult()
@@ -268,8 +269,7 @@ def run_hugging_face(
         inputs = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True, padding_side='left')
         input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']
         hf_generation_config = convert_to_hf(opt_model.generation_config, generation_configs)
-        hf_encoded_outputs = opt_model.generate(input_ids, attention_mask=attention_mask, generation_config=hf_generation_config,
-                                                return_dict_in_generate=True, tokenizer=hf_tokenizer)
+        hf_encoded_outputs = opt_model.generate(input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer)
 
         generation_ids = []
         scores = []
```
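
Because the flag now lives in the converted config, every `generate()` call in `common.py` gets back a structured output rather than a plain tensor of token ids. A hedged sketch of the consumption pattern (the helper name is illustrative and is not part of `common.py`):

```python
# Illustrative helper, not part of common.py: decode only the newly generated
# tokens from a structured generate() output.
def decode_generated_only(hf_tokenizer, generate_outputs, prompt_len):
    # generate_outputs.sequences holds prompt + generated ids for each returned
    # sequence; slicing off prompt_len keeps just the generated part, mirroring
    # the all_text_batch line in the diff above.
    return hf_tokenizer.batch_decode(
        [sequence[prompt_len:] for sequence in generate_outputs.sequences],
        skip_special_tokens=True,
    )
```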
tests/python_tests/test_llm_pipeline.py (12 changes: 7 additions & 5 deletions)
```diff
@@ -53,6 +53,7 @@ def test_encoded_inputs(model_descr, inputs):
     hf_generation_config = convert_to_hf(opt_model.generation_config, ov_generation_config)
 
     input_ids, attention_mask = inputs
+    prompt_len = input_ids.shape[1]
 
     if attention_mask is not None:
         inputs_ov = ov_genai.TokenizedInputs(ov.Tensor(input_ids), ov.Tensor(attention_mask))
@@ -61,10 +62,10 @@ def test_encoded_inputs(model_descr, inputs):
         inputs_hf = dict(inputs=torch.tensor(input_ids))
         inputs_ov = ov.Tensor(input_ids)
 
-    hf_output = opt_model.generate(**inputs_hf, generation_config=hf_generation_config)
+    hf_output = opt_model.generate(**inputs_hf, generation_config=hf_generation_config).sequences[0]
     ov_output = ov_pipe.generate(inputs_ov, ov_generation_config)
 
-    hf_res = hf_output[0, input_ids.shape[1]:].numpy()
+    hf_res = hf_output[prompt_len:].numpy()
     ov_res = np.array(ov_output.tokens, dtype=np.int64)
     assert np.all(ov_res == hf_res)
 
@@ -132,9 +133,10 @@ def test_chat_scenario(model_descr, generation_config_kwargs: Dict):
 
         chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True)
         tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
+        prompt_len = tokenized['input_ids'].numel()
 
-        answer = opt_model.generate(**tokenized, generation_config=hf_generation_config)
-        answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
+        answer = opt_model.generate(**tokenized, generation_config=hf_generation_config).sequences[0]
+        answer_str = tokenizer.decode(answer[prompt_len:], skip_special_tokens=True)
         chat_history_hf.append({'role': 'assistant', 'content': answer_str})
 
         answer_ov = ov_pipe.generate(prompt, generation_config=ov_generation_config)
@@ -379,7 +381,7 @@ def test_perf_metrics(generation_config, prompt):
 
     # Check that load time is adequate.
     load_time = perf_metrics.get_load_time()
-    assert load_time > 0 and load_time < 2000.0
+    assert load_time > 0 and load_time < total_time
 
     # Check that num input and generated tokens are adequate.
     num_generated_tokens = perf_metrics.get_num_generated_tokens()
```
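
The pipeline tests follow the same pattern: record the prompt length before generation, take `.sequences[0]` from the structured output, and slice off the prompt before decoding or comparing; the perf check now bounds load time by the measured total time rather than the previous hard-coded 2000.0 bound. A condensed sketch of that comparison, with illustrative identifiers standing in for the test fixtures:

```python
import numpy as np

# Illustrative condensation of the updated test pattern; opt_model,
# hf_generation_config, input_ids and ov_output stand in for the fixtures
# used in test_llm_pipeline.py.
def check_against_hf(opt_model, hf_generation_config, input_ids, ov_output):
    prompt_len = input_ids.shape[1]
    # .sequences[0] is needed now that the config sets return_dict_in_generate=True
    hf_tokens = opt_model.generate(inputs=input_ids, generation_config=hf_generation_config).sequences[0]
    hf_res = hf_tokens[prompt_len:].numpy()
    ov_res = np.array(ov_output.tokens, dtype=np.int64)
    assert np.all(ov_res == hf_res)
```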
