From b353929fabdd4620f2c44b664ffbc0a474a88923 Mon Sep 17 00:00:00 2001
From: Xiake Sun
Date: Thu, 9 Jan 2025 13:20:07 +0800
Subject: [PATCH 1/6] Update openvino tokenizers (#1506)

To test Llama3 fix:
https://github.com/openvinotoolkit/openvino_tokenizers/pull/357
---
 thirdparty/openvino_tokenizers | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers
index bcfd3eda25..d5f0abf827 160000
--- a/thirdparty/openvino_tokenizers
+++ b/thirdparty/openvino_tokenizers
@@ -1 +1 @@
-Subproject commit bcfd3eda25ae3ec423502a4074e35c774506c732
+Subproject commit d5f0abf8271f3cd8fc98d747b3e569fbeacca532

From ca0babefd952ac78bcb0008ced94beb380a73496 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Thu, 9 Jan 2025 09:21:27 +0400
Subject: [PATCH 2/6] Replaced chatglm2-6b with chatglm3-6b (#1505)

CVS-159975
---
 SUPPORTED_MODELS.md                       | 1 -
 tests/python_tests/models/real_models     | 1 -
 tests/python_tests/ov_genai_test_utils.py | 2 +-
 3 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/SUPPORTED_MODELS.md b/SUPPORTED_MODELS.md
index 6b45f47890..79333fa45c 100644
--- a/SUPPORTED_MODELS.md
+++ b/SUPPORTED_MODELS.md
@@ -14,7 +14,6 @@ ChatGLM
diff --git a/tests/python_tests/models/real_models b/tests/python_tests/models/real_models
index 420f8f53b6..5fd8fe0500 100644
--- a/tests/python_tests/models/real_models
+++ b/tests/python_tests/models/real_models
@@ -27,7 +27,6 @@ Salesforce/codegen-350M-multi
 Salesforce/codegen-350M-nl
 Salesforce/codegen2-1b
 # Salesforce/xgen-7b-8k-base: Transformers issue - Object of type method is not JSON serializable (https://huggingface.co/Salesforce/xgen-7b-8k-base/discussions/32)
-THUDM/chatglm2-6b
 THUDM/chatglm3-6b
 TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ
 TinyLlama/TinyLlama-1.1B-Chat-v0.6
diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py
index 66fb58f46d..ff55c3c378 100644
--- a/tests/python_tests/ov_genai_test_utils.py
+++ b/tests/python_tests/ov_genai_test_utils.py
@@ -26,7 +26,7 @@ def get_models_list():
         "facebook/opt-125m",
         "microsoft/phi-1_5",
         "microsoft/phi-2",
-        "THUDM/chatglm2-6b",
+        "THUDM/chatglm3-6b",
         "Qwen/Qwen2-0.5B-Instruct",
         "Qwen/Qwen-7B-Chat",
         "Qwen/Qwen1.5-7B-Chat",

From 2c6d67e039a22e32bc43b53533c3f5b27929eea6 Mon Sep 17 00:00:00 2001
From: Alexander Suvorov
Date: Thu, 9 Jan 2025 06:21:51 +0100
Subject: [PATCH 3/6] Whisper pipeline: refactor tests, disable `return_timestamps` check (#1496)

Ticket: 160055

---------

Co-authored-by: Ilya Lavrenov
---
 .github/workflows/windows.yml               |   6 +
 samples/export-requirements.txt             |   2 +-
 tests/python_tests/requirements.txt         |   2 +-
 tests/python_tests/test_whisper_pipeline.py | 434 ++++++++------------
 4 files changed, 169 insertions(+), 275 deletions(-)

diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 95a713d7a1..8f43af44ae 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -310,6 +310,12 @@ jobs:
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels
           python -m pip install ./tools/who_what_benchmark --find-links ${env:OV_INSTALL_DIR}/wheels
+
+          # will install transformers 4.46.3 version
+          # transformers 4.46.3 will enable return_timestamps tests
+          # this check enabled for windows only. Ticket: 160205.
+          python -m pip install git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
+
           python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k "not test_smoke"

   genai_python_lib_vlm:
diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt
index 2f71891b7b..af38558656 100644
--- a/samples/export-requirements.txt
+++ b/samples/export-requirements.txt
@@ -2,7 +2,7 @@
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
 openvino-tokenizers~=2025.0.0.0.dev
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
 numpy<2.0.0; sys_platform == 'darwin'
 einops==0.8.0  # For Qwen
 transformers_stream_generator==0.0.5  # For Qwen
diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt
index e23eaacc21..c851c71ee5 100644
--- a/tests/python_tests/requirements.txt
+++ b/tests/python_tests/requirements.txt
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 diffusers==0.32.1
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
 numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64"
 onnx==1.17.0
 pytest
diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py
index aa78666e32..c046d1ae2c 100644
--- a/tests/python_tests/test_whisper_pipeline.py
+++ b/tests/python_tests/test_whisper_pipeline.py
@@ -11,11 +11,13 @@ from optimum.intel.openvino import OVModelForSpeechSeq2Seq
 import gc
 import json
-import time
 import typing
 import numpy as np
 import os
 import pathlib
+import importlib.metadata as metadata
+from packaging.version import parse
+

 @pytest.fixture(scope="class", autouse=True)
 def run_gc_after_test():
@@ -27,36 +29,29 @@ def run_gc_after_test():
     gc.collect()


-def get_whisper_models_list(tiny_only=False, multilingual=False, en_only=False):
-    precommit_models = [
+def get_whisper_models_list(tiny_only=False):
+    model_ids = [
         "openai/whisper-tiny",
-        "openai/whisper-tiny.en",
         "distil-whisper/distil-small.en",
     ]
-    if multilingual:
-        precommit_models = ["openai/whisper-tiny"]
-    if en_only:
-        precommit_models = ["openai/whisper-tiny.en", "distil-whisper/distil-small.en"]
-    if tiny_only:
-        precommit_models = ["openai/whisper-tiny"]
-
-    nightly_models = []
-    if pytest.run_marker == "precommit":
-        model_ids = precommit_models
-    else:
-        model_ids = nightly_models
+
+    if tiny_only:
+        model_ids = ["openai/whisper-tiny"]

     if pytest.selected_model_ids:
-        model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')]
+        model_ids = [
+            model_id
+            for model_id in model_ids
+            if model_id in pytest.selected_model_ids.split(" ")
+        ]

-    prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
-    return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids]
+    prefix = pathlib.Path(os.getenv("GENAI_MODELS_PATH_PREFIX", ""))
+    return [(model_id, prefix / model_id.split("/")[1]) for model_id in model_ids]


 # used whisper models are relatively small
 # cache them in memory to speedup tests
-@functools.lru_cache(3)
+@functools.lru_cache()
 def read_whisper_model(params, **tokenizer_kwargs):
     model_id, path = params
@@ -90,6 +85,7 @@ def read_whisper_model(params, **tokenizer_kwargs):
         model_id,
         export=True,
         trust_remote_code=True,
+        stateful=False,
         compile=False,
         device="CPU",
         load_in_8bit=False,
@@ -114,30 +110,39 @@ def read_whisper_model(params, **tokenizer_kwargs):
     )


-def compare_genai_and_opt_pipelines(opt_pipe, genai_pipe, dataset_id):
-    ds = datasets.load_dataset(dataset_id, "clean", split="validation")
-    opt_infer_time = 0
-    genai_infer_time = 0
-
-    for ds_row in ds:
-        audio_sample = ds_row["audio"]
+def run_huggingface(
+    pipeline,
+    sample,
+    config: ov_genai.WhisperGenerationConfig | None = None,
+):
+    if not config:
+        config = ov_genai.WhisperGenerationConfig()
+
+    return pipeline(
+        sample,
+        max_new_tokens=min(config.max_new_tokens, 444),
+        return_timestamps=config.return_timestamps,
+        generate_kwargs={"language": config.language, "task": config.task},
+    )

-        streamer_result = []

-        start = time.time()
-        genai_result = genai_pipe.generate(
-            audio_sample["array"].tolist(), streamer=lambda x: streamer_result.append(x)
-        )
-        genai_infer_time += time.time() - start
+def run_genai(
+    pipeline: ov_genai.WhisperPipeline,
+    sample,
+    config: ov_genai.WhisperGenerationConfig | None = None,
+    streamer: typing.Callable[[str], bool] | None = None,
+):
+    if not config:
+        config = ov_genai.WhisperGenerationConfig()

-        start = time.time()
-        result = opt_pipe(audio_sample)
-        opt_infer_time += time.time() - start
+    genai_config = pipeline.get_generation_config()

-        assert genai_result.texts[0] == result["text"]
-        assert "".join(streamer_result) == result["text"]
+    genai_config.max_new_tokens = config.max_new_tokens
+    genai_config.return_timestamps = config.return_timestamps
+    genai_config.task = config.task
+    genai_config.language = f"<|{config.language}|>" if config.language else None

-    print(f"Inference time\nOpt: {opt_infer_time}\nGenAI: {genai_infer_time}")
+    return pipeline.generate(sample, genai_config, streamer=streamer)


 def get_samples_from_dataset(
@@ -166,13 +171,50 @@ def get_samples_from_dataset(
     return [x["audio"]["array"] for x in ds]


-@pytest.mark.parametrize("model_descr", get_whisper_models_list())
-@pytest.mark.parametrize("dataset_id", ["hf-internal-testing/librispeech_asr_dummy"])
-@pytest.mark.precommit
-def test_whisper_on_hf_dataset(model_descr, dataset_id):
-    model_id, path, opt_pipe, genai_pipe = read_whisper_model(model_descr)
+def run_pipeline_with_ref(
+    model_id: str,
+    tmp_path: str,
+    sample: np.ndarray | list[np.ndarray],
+    generation_config: ov_genai.WhisperGenerationConfig | None = None,
+    streamer: typing.Callable[[str], bool] | None = None,
+):
+    _, _, hf_pipe, genai_pipe = read_whisper_model((model_id, tmp_path))
+
+    if type(sample) is np.ndarray and len(sample.shape) == 1:
+        sample = np.expand_dims(sample, 0)
+
+    for _sample in sample:
+        genai_result = run_genai(genai_pipe, _sample, generation_config, streamer)
+        hf_result = run_huggingface(hf_pipe, _sample, generation_config)
+
+        compare_results(hf_result, genai_result)
+

-    compare_genai_and_opt_pipelines(opt_pipe, genai_pipe, dataset_id)
+def compare_results(hf_result, genai_result):
+    assert genai_result.texts[0] == hf_result["text"]
+
+    # transformers 4.47 updated return_timestamps implementation
+    # remove once genai implementation aligned with transformers. Ticket 160205.
+    transformers_version_greater_4_47 = parse(
+        metadata.version("transformers")
+    ) >= parse("4.47.0")
+
+    if transformers_version_greater_4_47:
+        return
+
+    if "chunks" not in hf_result and genai_result.chunks is None:
+        return
+
+    assert len(genai_result.chunks) == len(hf_result["chunks"])
+
+    for opt_chunk, genai_chunk in zip(hf_result["chunks"], genai_result.chunks):
+        assert opt_chunk["text"] == genai_chunk.text
+        assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2)
+        if opt_chunk["timestamp"][1]:
+            assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2)
+        else:
+            assert opt_chunk["timestamp"][1] == None
+            assert round(genai_chunk.end_ts, 2) == -1.0


 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
@@ -182,16 +224,11 @@ def test_whisper_on_hf_dataset(model_descr, dataset_id):
 )
 @pytest.mark.precommit
 def test_smoke(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(test_sample)
-
-    genai_result = pipe.generate(test_sample)
-
-    assert genai_result.texts[0] == expected["text"]
-
-    assert "chunks" not in expected
-    assert genai_result.chunks == None
+    run_pipeline_with_ref(
+        model_id=model_descr[0],
+        tmp_path=model_descr[1],
+        sample=test_sample,
+    )


 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
@@ -259,79 +296,55 @@ def test_whisper_constructors(model_descr, test_sample):
 def test_max_new_tokens(model_descr, test_sample):
     model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)

-    expected = opt_pipe(test_sample, max_new_tokens=10)["text"]
+    expected = opt_pipe(test_sample, max_new_tokens=10)

     genai_result = pipe.generate(test_sample, max_new_tokens=10)

-    assert genai_result.texts[0] == expected
-
-    genai_result = pipe.generate(test_sample)
-
-    assert genai_result.texts[0] != expected
+    compare_results(expected, genai_result)

     config = pipe.get_generation_config()
     config.max_new_tokens = 10
     genai_result = pipe.generate(test_sample, config)

-    assert genai_result.texts[0] == expected
+    compare_results(expected, genai_result)


 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
 @pytest.mark.parametrize(
-    "test_sample", get_samples_from_dataset(language="fr", length=3)
+    "test_samples",
+    [
+        (get_samples_from_dataset(language="fr", length=1), "fr"),
+        (get_samples_from_dataset(language="de", length=1), "de"),
+    ],
 )
 @pytest.mark.precommit
-def test_language_mode_fr(model_descr, test_sample):
-    model_id, path = model_descr
+def test_language_mode(model_descr, test_samples):
     model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
+    samples, language = test_samples

     expected = opt_pipe(
-        test_sample, max_new_tokens=30, generate_kwargs={"language": "fr"}
+        samples[0], max_new_tokens=30, generate_kwargs={"language": language}
     )

-    genai_result = pipe.generate(test_sample, max_new_tokens=30, language="<|fr|>")
-
-    assert genai_result.texts[0] == expected["text"]
-
-    config = pipe.get_generation_config()
-    config.max_new_tokens = 30
-    config.language = "<|fr|>"
-    genai_result = pipe.generate(test_sample, config)
-
-    assert genai_result.texts[0] == expected["text"]
-
-
-@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
-@pytest.mark.parametrize(
-    "test_sample", get_samples_from_dataset(language="de", length=3)
-)
-@pytest.mark.precommit
-def test_language_mode_de(model_descr, test_sample):
-    model_id, path = model_descr
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(
-        test_sample, max_new_tokens=30, generate_kwargs={"language": "de"}
+    genai_result = pipe.generate(
+        samples[0], max_new_tokens=30, language=f"<|{language}|>"
     )

-    genai_result = pipe.generate(test_sample, max_new_tokens=30, language="<|de|>")
-
-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)

     config = pipe.get_generation_config()
     config.max_new_tokens = 30
-    config.language = "<|de|>"
-    genai_result = pipe.generate(test_sample, config)
+    config.language = f"<|{language}|>"
+    genai_result = pipe.generate(samples[0], config)

-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)


 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
 @pytest.mark.parametrize(
-    "test_sample", get_samples_from_dataset(language="fr", length=3)
+    "test_sample", get_samples_from_dataset(language="fr", length=1)
 )
 @pytest.mark.precommit
 def test_task_mode(model_descr, test_sample):
-    model_id, path = model_descr
     model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)

     expected = opt_pipe(
@@ -344,7 +357,7 @@ def test_task_mode(model_descr, test_sample):
         test_sample, max_new_tokens=30, language="<|fr|>", task="translate"
     )

-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)

     config = pipe.get_generation_config()
     config.max_new_tokens = 30
@@ -352,27 +365,7 @@ def test_task_mode(model_descr, test_sample):
     config.task = "translate"
     genai_result = pipe.generate(test_sample, config)

-    assert genai_result.texts[0] == expected["text"]
-
-    expected = opt_pipe(
-        test_sample,
-        max_new_tokens=30,
-        generate_kwargs={"language": "ru", "task": "translate"},
-    )
-
-    genai_result = pipe.generate(
-        test_sample, max_new_tokens=30, language="<|ru|>", task="translate"
-    )
-
-    assert genai_result.texts[0] == expected["text"]
-
-    config = pipe.get_generation_config()
-    config.max_new_tokens = 30
-    config.language = "<|ru|>"
-    config.task = "translate"
-    genai_result = pipe.generate(test_sample, config)
-
-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)

     # seems to be equivalent to translate task
     expected = opt_pipe(
@@ -385,7 +378,7 @@ def test_task_mode(model_descr, test_sample):
         test_sample, max_new_tokens=30, language="<|en|>", task="transcribe"
     )

-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)

     config = pipe.get_generation_config()
     config.max_new_tokens = 30
@@ -393,21 +386,20 @@ def test_task_mode(model_descr, test_sample):
     config.task = "transcribe"
     genai_result = pipe.generate(test_sample, config)

-    assert genai_result.texts[0] == expected["text"]
+    compare_results(expected, genai_result)


 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
 @pytest.mark.parametrize(
     "test_sample",
     [
-        *get_samples_from_dataset(language="fr", length=2),
-        *get_samples_from_dataset(language="de", length=2),
-        *get_samples_from_dataset(language="es", length=2),
+        *get_samples_from_dataset(language="fr", length=1),
+        *get_samples_from_dataset(language="de", length=1),
+        *get_samples_from_dataset(language="es", length=1),
     ],
 )
 @pytest.mark.precommit
 def test_language_autodetect(model_descr, test_sample):
-    model_id, path = model_descr
     model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)

     input_features = opt_pipe.feature_extractor(test_sample)
@@ -415,188 +407,84 @@ def test_language_autodetect(model_descr, test_sample):
     # ensure detected language us not english
     assert language_id != pipe.get_generation_config().lang_to_id["<|en|>"]

-    expected = opt_pipe(
-        test_sample,
-        max_new_tokens=30,
+    run_pipeline_with_ref(
+        model_id=model_descr[0],
+        tmp_path=model_descr[1],
+        sample=test_sample,
+        generation_config=ov_genai.WhisperGenerationConfig(max_new_tokens=30),
     )

-    genai_result = pipe.generate(test_sample, max_new_tokens=30)
-
-    assert genai_result.texts[0] == expected["text"]
-

 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
-@pytest.mark.parametrize(
-    "test_sample",
-    [
-        *get_samples_from_dataset(language="en", length=10, long_form=True),
-    ],
-)
+@pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1))
 @pytest.mark.precommit
 def test_return_timestamps_short_form(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-    # long form audio not supported yet
-    test_sample = test_sample[: 16000 * 30]
-
-    expected = opt_pipe(
-        test_sample,
-        return_timestamps=True,
-    )
-
-    genai_result = pipe.generate(
-        test_sample.tolist(),
-        return_timestamps=True,
+    run_pipeline_with_ref(
+        model_id=model_descr[0],
+        tmp_path=model_descr[1],
+        sample=test_sample,
+        generation_config=ov_genai.WhisperGenerationConfig(return_timestamps=True),
     )

-    assert genai_result.texts[0] == expected["text"]
-
-    assert len(genai_result.chunks) == len(expected["chunks"])
-
-    for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks):
-        assert opt_chunk["text"] == genai_chunk.text
-        assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2)
-        assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2)
-

 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
-@pytest.mark.parametrize(
-    "test_sample",
-    [
-        *get_samples_from_dataset(language="en", length=10, long_form=True),
-    ],
-)
+@pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1))
 @pytest.mark.precommit
 def test_return_timestamps_max_new_tokens_short_form(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-    # long form audio not supported yet
-    test_sample = test_sample[: 16000 * 30]
-
-    expected = opt_pipe(
-        test_sample,
-        return_timestamps=True,
-        max_new_tokens=15,
-        generate_kwargs={"language": "en"},
-    )
-
-    genai_result = pipe.generate(
-        test_sample.tolist(),
-        max_new_tokens=15,
-        return_timestamps=True,
-        language="<|en|>",
+    run_pipeline_with_ref(
+        model_id=model_descr[0],
+        tmp_path=model_descr[1],
+        sample=test_sample,
+        generation_config=ov_genai.WhisperGenerationConfig(
+            return_timestamps=True, language="en", max_new_tokens=30
+        ),
     )

-    assert genai_result.texts[0] == expected["text"]
-
-    assert len(genai_result.chunks) == len(expected["chunks"])
-
-    for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks):
-        assert opt_chunk["text"] == genai_chunk.text
-        assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2)
-        if opt_chunk["timestamp"][1]:
-            assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2)
-        else:
-            assert opt_chunk["timestamp"][1] == None
-            assert round(genai_chunk.end_ts, 2) == -1.0
-

-@pytest.mark.parametrize("model_descr", get_whisper_models_list(multilingual=True))
+@pytest.mark.parametrize("model_descr", get_whisper_models_list())
 @pytest.mark.parametrize(
-    "test_sample",
-    [
-        *get_samples_from_dataset(language="en", length=10, long_form=True),
-        *get_samples_from_dataset(language="fr", length=10, long_form=True),
-    ],
+    "test_sample", get_samples_from_dataset(length=10, long_form=True)
 )
 @pytest.mark.precommit
-def test_longform_audio_return_timestamps_multilingual(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(
-        test_sample,
-        return_timestamps=True,
-    )
+def test_longform_audio(model_descr, test_sample):
+    _, _, hf_pipe, genai_pipe = read_whisper_model(model_descr)

     streamer_result = []

-    genai_result = pipe.generate(
+    genai_result = run_genai(
+        genai_pipe,
         test_sample,
-        return_timestamps=True,
+        config=ov_genai.WhisperGenerationConfig(return_timestamps=True),
         streamer=lambda x: streamer_result.append(x),
     )

-    assert genai_result.texts[0] == expected["text"]
-    assert "".join(streamer_result) == expected["text"]
-
-    assert len(genai_result.chunks) == len(expected["chunks"])
-
-    for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks):
-        assert opt_chunk["text"] == genai_chunk.text
-        assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2)
-        if opt_chunk["timestamp"][1]:
-            assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2)
-        else:
-            assert opt_chunk["timestamp"][1] == None
-            assert round(genai_chunk.end_ts, 2) == -1.0
-
-
-@pytest.mark.parametrize("model_descr", get_whisper_models_list(en_only=True))
-@pytest.mark.parametrize(
-    "test_sample",
-    [
-        *get_samples_from_dataset(language="en", length=10, long_form=True),
-    ],
-)
-@pytest.mark.precommit
-def test_longform_audio_return_timestamps_en(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(
-        test_sample,
-        return_timestamps=True,
-    )
-
-    streamer_result = []
-
-    genai_result = pipe.generate(
+    hf_result = run_huggingface(
+        hf_pipe,
         test_sample,
-        return_timestamps=True,
-        streamer=lambda x: streamer_result.append(x),
+        config=ov_genai.WhisperGenerationConfig(return_timestamps=True),
     )

-    assert genai_result.texts[0] == expected["text"]
-    assert "".join(streamer_result) == expected["text"]
-
-    assert len(genai_result.chunks) == len(expected["chunks"])
+    compare_results(hf_result, genai_result)

-    for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks):
-        assert opt_chunk["text"] == genai_chunk.text
-        assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2)
-        if opt_chunk["timestamp"][1]:
-            assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2)
-        else:
-            assert opt_chunk["timestamp"][1] == None
-            assert round(genai_chunk.end_ts, 2) == -1.0
+    assert "".join(streamer_result) == hf_result["text"]


-@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
-@pytest.mark.parametrize(
-    "test_sample",
-    [
-        *get_samples_from_dataset(language="en", length=3, long_form=True),
-        *get_samples_from_dataset(language="sp", length=3, long_form=True),
-    ],
-)
+@pytest.mark.parametrize("model_descr", get_whisper_models_list())
 @pytest.mark.precommit
-def test_longform_audio(model_descr, test_sample):
-    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
-
-    expected = opt_pipe(test_sample, return_timestamps=True)
-
-    genai_result = pipe.generate(test_sample)
+def test_shortform(model_descr):
+    samples = []
+    ds = datasets.load_dataset(
+        "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
+    )

-    assert genai_result.texts[0] == expected["text"]
+    for ds_row in ds:
+        samples.append(ds_row["audio"]["array"])

-    assert genai_result.chunks == None
+    run_pipeline_with_ref(
+        model_id=model_descr[0],
+        tmp_path=model_descr[1],
+        sample=samples,
+    )


 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))

From 5a82b84a643578c3b534e76088aa0f3125cad31e Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Thu, 9 Jan 2025 12:32:22 +0400
Subject: [PATCH 4/6] DOCS: unify package name usage across snippets in README.md (#1509)

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index c5cf799973..cea1e358bc 100644
--- a/README.md
+++ b/README.md
@@ -73,9 +73,9 @@ optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weigh
 ### Run generation using LLMPipeline API in Python

 ```python
-import openvino_genai as ov_genai
+import openvino_genai
 #Will run model on CPU, GPU or NPU are possible options
-pipe = ov_genai.LLMPipeline("./TinyLlama-1.1B-Chat-v1.0/", "CPU")
+pipe = openvino_genai.LLMPipeline("./TinyLlama-1.1B-Chat-v1.0/", "CPU")
 print(pipe.generate("The Sun is yellow because", max_new_tokens=100))
 ```
@@ -128,11 +128,11 @@ curl -O "https://storage.openvinotoolkit.org/test_data/images/dog.jpg"
 ```python
 import numpy as np
 import openvino as ov
-import openvino_genai as ov_genai
+import openvino_genai
 from PIL import Image

 # Choose GPU instead of CPU in the line below to run the model on Intel integrated or discrete GPU
-pipe = ov_genai.VLMPipeline("./InternVL2-1B", "CPU")
+pipe = openvino_genai.VLMPipeline("./InternVL2-1B", "CPU")
 pipe.start_chat()

 image = Image.open("dog.jpg")

From 2d5911b13b2bfab8a0433eaa38394bb9d064680f Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Thu, 9 Jan 2025 12:33:27 +0400
Subject: [PATCH 5/6] GHA: use latest OpenVINO master (#1511)

Fix to PA has been merged
https://github.com/openvinotoolkit/openvino/pull/28332
---
 .github/workflows/genai-tools.yml              | 2 +-
 .github/workflows/linux.yml                    | 2 +-
 .github/workflows/mac.yml                      | 2 +-
 .github/workflows/stable_diffusion_1_5_cpp.yml | 4 ++--
 .github/workflows/windows.yml                  | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/genai-tools.yml b/.github/workflows/genai-tools.yml
index bd6cb46362..333bee3e11 100644
--- a/.github/workflows/genai-tools.yml
+++ b/.github/workflows/genai-tools.yml
@@ -44,7 +44,7 @@ jobs:
       with:
         platform: ubuntu22
         commit_packages_to_provide: wheels
-        revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
+        revision: latest_available_commit

   llm_bench:
     name: 'LLM bench tests'
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 0d7a5b7bae..0a991e2a54 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -52,7 +52,7 @@ jobs:
       with:
         platform: ubuntu22
        commit_packages_to_provide: wheels
-        revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
+        revision: latest_available_commit

     - name: Clone docker tag from OpenVINO repo
       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 062b83fc27..7cb0ff98d3 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -17,7 +17,7 @@ concurrency:

 env:
   PYTHON_VERSION: '3.10'
-  OV_BRANCH: 345163f87953fb0dd8dd590257eb7fc84378da8e
+  OV_BRANCH: 'master'
   OV_TARBALL: ''

 jobs:
diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml
index 3b01697f26..e0bf5371b3 100644
--- a/.github/workflows/stable_diffusion_1_5_cpp.yml
+++ b/.github/workflows/stable_diffusion_1_5_cpp.yml
@@ -45,7 +45,7 @@ jobs:
       with:
        platform: ubuntu22
        commit_packages_to_provide: wheels
-        revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
+        revision: latest_available_commit

   openvino_download_windows:
     name: Download OpenVINO for Windows
@@ -71,7 +71,7 @@ jobs:
       with:
        platform: windows
        commit_packages_to_provide: wheels
-        revision: 345163f87953fb0dd8dd590257eb7fc84378da8e
+        revision: latest_available_commit

   stable_diffusion_1_5_cpp-linux:
     runs-on: ubuntu-22.04-8-cores
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 8f43af44ae..e65972110b 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -17,7 +17,7 @@ concurrency:

 env:
   PYTHON_VERSION: '3.11'
-  OV_BRANCH: 345163f87953fb0dd8dd590257eb7fc84378da8e
+  OV_BRANCH: 'master'
   OV_TARBALL: ''

 jobs:

From 7ef754c88e13f2970272628d59c9202e773ce5f1 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Thu, 9 Jan 2025 12:41:05 +0400
Subject: [PATCH 6/6] [GHA] Increase timeout for cpp-multinomial-greedy_causal_lm-ubuntu (#1510)

See
https://github.com/openvinotoolkit/openvino.genai/actions/runs/12676190622/job/35328859923?pr=1507

It fails from time to time by timeout. Let's increase it a bit to check
whether it will make GHA CI more stable

---------

Co-authored-by: Vladimir Zlobin
---
 .github/workflows/causal_lm_cpp.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index fb0c9c4b0b..b6abbefac0 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -53,17 +53,17 @@ jobs:
           wget https://huggingface.co/smangrul/tinyllama_lora_sql/resolve/main/adapter_model.safetensors?download=true -O adapter_model.safetensors
       - run: >
           . ./ov/setupvars.sh
-          && timeout 25s ./build/samples/cpp/multinomial_causal_lm/multinomial_causal_lm ./open_llama_3b_v2/ a
+          && timeout 35s ./build/samples/cpp/multinomial_causal_lm/multinomial_causal_lm ./open_llama_3b_v2/ a
         env:
           PYTHONPATH: "./build"
       - run: >
           . ./ov/setupvars.sh
-          && timeout 25s ./samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./open_llama_3b_v2/ b
+          && timeout 35s ./samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./open_llama_3b_v2/ b
         env:
           PYTHONPATH: "./build"
       - run: >
           . ./ov/setupvars.sh
-          && timeout 25s ./build/samples/cpp/text_generation/greedy_causal_lm ./open_llama_3b_v2/ "return 0"
+          && timeout 35s ./build/samples/cpp/text_generation/greedy_causal_lm ./open_llama_3b_v2/ "return 0"
           | diff <(timeout 25s samples/python/text_generation/greedy_causal_lm.py ./open_llama_3b_v2/ "return 0") -
         env:
           PYTHONPATH: "./build"