Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Long text support #33

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -157,4 +157,6 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/
.vs/
jupyternotebook/output
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,21 @@ summary = summarizer.summarize(transcript)
print(summary[0]["summary_text"])
```

If the transcript is longer than the specified maximum sequence length, the summarizer will split the transcript
into chunks and summarize each chunk individually. The chunk summaries are then concatenated to form the final summary.

The example below shows how to use the improved summarization pipeline with a long text:

```python
from whisperplus.pipelines.long_text_support_summarization import (
LongTextSupportSummarizationPipeline,
)

summarizer = LongTextSupportSummarizationPipeline(model_id="facebook/bart-large-cnn")
summary_text = summarizer.summarize(transcript)
print(summary_text)
```

### Speaker Diarization

```python
Expand Down
1,279 changes: 1,279 additions & 0 deletions jupyternotebook/whisperplusnotebook.ipynb
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you change the folder name to "notebook"?

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions requirements.txt
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add the test-related libraries to the dev-req.txt file?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I am back.

Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@ Requests==2.31.0
torch==2.1.0
torchaudio==2.1.0
transformers==4.35.2
pytest==7.4.0
pytest-cov==4.1.0
pytest-mock==3.10.0
Empty file added tests/__init__.py
Empty file.
61 changes: 61 additions & 0 deletions tests/test_long_text_support_summarization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from unittest.mock import MagicMock, patch

import pytest

from whisperplus import LongTextSupportSummarizationPipeline


class TestLongTextSupportSummarizationPipeline:
    """Tests for LongTextSupportSummarizationPipeline.

    NOTE(review): the ``summarizer`` fixture constructs the real pipeline, so
    these tests need network access (or a locally cached model) — confirm this
    is acceptable for CI.
    """

    @pytest.fixture
    def summarizer(self):
        # Real pipeline using the default model.
        return LongTextSupportSummarizationPipeline()

    @pytest.fixture
    def summarizer_mocker(self):
        # Build the pipeline with load_model stubbed to a no-op so the model
        # stays None, then swap in a mock that raises to exercise the failure
        # path.
        # NOTE(review): __init__ still calls AutoTokenizer.from_pretrained for
        # real — consider patching that too to keep this fixture offline.
        with patch(
                'whisperplus.pipelines.long_text_support_summarization.LongTextSupportSummarizationPipeline'
                '.load_model', new=lambda x: None):
            summarizer = LongTextSupportSummarizationPipeline()
            summarizer.load_model = MagicMock(side_effect=Exception("Model loading failed"))
            return summarizer

    def test_load_model(self, summarizer):
        # The model should be loaded successfully during construction.
        assert summarizer.model is not None

    def test_split_text_into_chunks(self, summarizer):
        # Text longer than the chunk size should be split into several chunks.
        text = "This is a test text. " * 50
        chunks = summarizer.split_text_into_chunks(text, 100)
        assert isinstance(chunks, list)
        assert len(chunks) > 1  # should be split into multiple chunks

    def test_summarize_long_text_chunking(self, summarizer):
        # summarize_long_text should join the chunk summaries into one string.
        long_text = "This is a long text. " * 1000
        summary = summarizer.summarize_long_text(long_text, 130, 30)
        assert isinstance(summary, str)
        assert len(summary) > 0

    def test_summarize_short_text(self, summarizer):
        # Short input takes the single-pass path and returns the raw HF
        # pipeline output (a list of dicts).
        short_text = "This is a short text."
        summary = summarizer.summarize(short_text)
        assert isinstance(summary, list)
        assert len(summary) > 0

    def test_summarize_long_text(self, summarizer):
        # Long input takes the chunked path and returns a plain string.
        long_text = "This is a long text. " * 1000  # build a sufficiently long text
        summary = summarizer.summarize(long_text)
        assert isinstance(summary, str)
        assert len(summary) > 0

    def test_model_loading_exception(self, summarizer_mocker):
        # The model attribute stays None when loading never succeeded.
        assert summarizer_mocker.model is None

        with pytest.raises(Exception) as exc_info:
            summarizer_mocker.load_model()
        assert "Model loading failed" in str(exc_info.value)
3 changes: 2 additions & 1 deletion whisperplus/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from whisperplus.pipelines.long_text_support_summarization import LongTextSupportSummarizationPipeline
from whisperplus.pipelines.summarization import TextSummarizationPipeline
from whisperplus.pipelines.whisper import SpeechToTextPipeline
from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
Expand All @@ -7,4 +8,4 @@
__version__ = '0.0.6'
__author__ = 'kadirnar'
__license__ = 'Apache License 2.0'
__all__ = ['']
__all__ = ['LongTextSupportSummarizationPipeline']
60 changes: 60 additions & 0 deletions whisperplus/pipelines/long_text_support_summarization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import logging

import torch
from transformers import AutoTokenizer, pipeline

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


class LongTextSupportSummarizationPipeline:
    """Summarization pipeline that transparently handles texts longer than the
    model's maximum sequence length.

    Short inputs are summarized in a single pass. Long inputs are split into
    token chunks, each chunk is summarized individually, and the chunk
    summaries are joined into one string.

    NOTE(review): ``summarize`` returns the raw HF pipeline output (a list of
    dicts) for short inputs but a plain ``str`` for long inputs. This
    inconsistency is kept for backward compatibility with existing callers.
    """

    def __init__(self, model_id: str = "facebook/bart-large-cnn"):
        """
        Args:
            model_id: Hugging Face model identifier of the summarization model.
        """
        logging.info("Initializing Text Summarization Pipeline")
        self.model_id = model_id
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        # Maximum number of tokens the model accepts in a single forward pass.
        self.model_max_length = self.tokenizer.model_max_length
        self.model = None
        self.load_model()

    def load_model(self):
        """Load the HF summarization pipeline onto the selected device.

        On failure the error is logged and ``self.model`` stays ``None``;
        ``summarize`` guards against that state explicitly.
        """
        try:
            logging.info("Loading model...")
            self.model = pipeline(
                "summarization", model=self.model_id, device=0 if self.device == "cuda" else -1)
            logging.info("Model loaded successfully.")
        except Exception as e:
            logging.error(f"Error loading model: {e}")

    def summarize(self, text: str, max_length: int = 130, min_length: int = 30):
        """Summarize ``text``, chunking automatically when it is too long.

        Args:
            text: The text to summarize.
            max_length: Maximum summary length in tokens (per chunk).
            min_length: Minimum summary length in tokens (per chunk).

        Returns:
            The raw HF pipeline output (list of dicts) for short inputs, or a
            single concatenated summary string for long inputs.

        Raises:
            RuntimeError: If the model failed to load.
        """
        if self.model is None:
            # Fail loudly instead of the opaque "'NoneType' object is not
            # callable" that calling self.model would otherwise produce.
            raise RuntimeError("Summarization model is not loaded; see earlier log output for the cause.")
        # Count tokens to decide between the single-pass and chunked paths.
        tokens = self.tokenizer.encode(text, truncation=False, return_tensors='pt')
        if tokens.size(1) > self.model_max_length:
            # Too long for one forward pass: summarize chunk by chunk.
            return self.summarize_long_text(text, max_length, min_length)
        return self.model(text, max_length=max_length, min_length=min_length, do_sample=False)

    def summarize_long_text(self, text, max_length, min_length):
        """Summarize ``text`` chunk by chunk and join the chunk summaries
        with spaces into a single string."""
        # Leave headroom for the special tokens ([CLS], [SEP], etc.) that the
        # model adds to every chunk.
        chunk_size = self.model_max_length - 50
        text_chunks = self.split_text_into_chunks(text, chunk_size)

        summaries = []
        for chunk in text_chunks:
            summary = self.model(chunk, max_length=max_length, min_length=min_length, do_sample=False)
            summaries.append(summary[0]['summary_text'])

        return ' '.join(summaries)

    def split_text_into_chunks(self, text, chunk_size):
        """Tokenize ``text`` and decode consecutive, non-overlapping slices of
        at most ``chunk_size`` tokens back into text chunks.

        Args:
            text: The text to split.
            chunk_size: Maximum number of tokens per chunk.

        Returns:
            list[str]: Decoded chunks; empty list for empty input.
        """
        tokens = self.tokenizer.encode(text)
        chunks = []
        chunk_start = 0
        while chunk_start < len(tokens):
            chunk_end = min(chunk_start + chunk_size, len(tokens))
            chunk = self.tokenizer.decode(
                tokens[chunk_start:chunk_end], skip_special_tokens=True, clean_up_tokenization_spaces=True)
            chunks.append(chunk)
            chunk_start += chunk_size
        return chunks
1 change: 1 addition & 0 deletions whisperplus/pipelines/summarization.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ class TextSummarizationPipeline:

def __init__(self, model_id: str = "facebook/bart-large-cnn"):
logging.info("Initializing Text Summarization Pipeline")
self.device = None
self.model_id = model_id
self.model = None
self.set_device()
Expand Down
Loading