From ded4c3d778db4a7d1ce351adc28260568d333e23 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Mon, 6 Jan 2025 11:36:03 +0400
Subject: [PATCH] [Tests] Fixed HF warning (#1478)

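Set `return_dict_in_generate=True` inside `convert_to_hf` so that it is
part of the HF generation config, instead of passing it as a separate
argument to each `generate()` call. Tests that call `generate()` with
the converted config now read the token IDs from `.sequences`.

Additionally, `test_perf_metrics` now compares the load time against
`total_time` instead of the fixed `2000.0` upper bound.

Warnings reported before this change:

```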
test_continuous_batching.py: 7 warnings
test_kv_cache_eviction.py: 2 warnings
test_llm_pipeline.py: 6 warnings
test_sampling.py: 11 warnings
  /venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:774: UserWarning: `return_dict_in_generate` is NOT set to `True`, but `output_scores` is. When `return_dict_in_generate` is not `True`, `output_scores` is ignored.
    warnings.warn(
```
---
 tests/python_tests/common.py            |  8 ++++----
 tests/python_tests/test_llm_pipeline.py | 12 +++++++-----
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py
index dc58d1ad2f..bb34c1dcd4 100644
--- a/tests/python_tests/common.py
+++ b/tests/python_tests/common.py
@@ -179,6 +179,8 @@ def convert_to_hf(
         return
 
     kwargs = {}
+    kwargs['return_dict_in_generate'] = True
+
     # generic parameters
     kwargs['max_length'] = generation_config.max_length
     # has higher priority than 'max_length'
@@ -253,8 +255,7 @@ def run_hugging_face(
             input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']
             prompt_len = 0 if generation_config.echo else input_ids.numel()
 
-            generate_outputs = opt_model.generate(input_ids=input_ids, attention_mask=attention_mask, generation_config=hf_generation_config,
-                                                  return_dict_in_generate=True, tokenizer=hf_tokenizer)
+            generate_outputs = opt_model.generate(input_ids=input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer)
             all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True)
 
             generation_result = GenerationResult()
@@ -268,8 +269,7 @@ def run_hugging_face(
         inputs = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True, padding_side='left')
         input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']
         hf_generation_config = convert_to_hf(opt_model.generation_config, generation_configs)
-        hf_encoded_outputs = opt_model.generate(input_ids, attention_mask=attention_mask, generation_config=hf_generation_config,
-                                                return_dict_in_generate=True, tokenizer=hf_tokenizer)
+        hf_encoded_outputs = opt_model.generate(input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer)
 
         generation_ids = []
         scores = []
diff --git a/tests/python_tests/test_llm_pipeline.py b/tests/python_tests/test_llm_pipeline.py
index 986b342c59..5278f4424f 100644
--- a/tests/python_tests/test_llm_pipeline.py
+++ b/tests/python_tests/test_llm_pipeline.py
@@ -53,6 +53,7 @@ def test_encoded_inputs(model_descr, inputs):
     hf_generation_config = convert_to_hf(opt_model.generation_config, ov_generation_config)
 
     input_ids, attention_mask = inputs
+    prompt_len = input_ids.shape[1]
 
     if attention_mask is not None:
         inputs_ov = ov_genai.TokenizedInputs(ov.Tensor(input_ids), ov.Tensor(attention_mask))
@@ -61,10 +62,10 @@ def test_encoded_inputs(model_descr, inputs):
         inputs_hf = dict(inputs=torch.tensor(input_ids))
         inputs_ov = ov.Tensor(input_ids)
 
-    hf_output = opt_model.generate(**inputs_hf, generation_config=hf_generation_config)
+    hf_output = opt_model.generate(**inputs_hf, generation_config=hf_generation_config).sequences[0]
     ov_output = ov_pipe.generate(inputs_ov, ov_generation_config)
 
-    hf_res = hf_output[0, input_ids.shape[1]:].numpy()
+    hf_res = hf_output[prompt_len:].numpy()
     ov_res = np.array(ov_output.tokens, dtype=np.int64)
     assert np.all(ov_res == hf_res)
 
@@ -132,9 +133,10 @@ def test_chat_scenario(model_descr, generation_config_kwargs: Dict):
 
         chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True)
         tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
+        prompt_len = tokenized['input_ids'].numel()
 
-        answer = opt_model.generate(**tokenized, generation_config=hf_generation_config)
-        answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
+        answer = opt_model.generate(**tokenized, generation_config=hf_generation_config).sequences[0]
+        answer_str = tokenizer.decode(answer[prompt_len:], skip_special_tokens=True)
         chat_history_hf.append({'role': 'assistant', 'content': answer_str})
 
         answer_ov = ov_pipe.generate(prompt, generation_config=ov_generation_config)
@@ -379,7 +381,7 @@ def test_perf_metrics(generation_config, prompt):
 
     # Check that load time is adequate.
     load_time = perf_metrics.get_load_time()
-    assert load_time > 0 and load_time < 2000.0
+    assert load_time > 0 and load_time < total_time
 
     # Check that num input and generated tokens are adequate.
     num_generated_tokens = perf_metrics.get_num_generated_tokens()