diff --git a/ammico/test/test_text.py b/ammico/test/test_text.py index 129f1f1..da0d642 100644 --- a/ammico/test/test_text.py +++ b/ammico/test/test_text.py @@ -159,9 +159,11 @@ def test_truncate_text(accepted): test_obj.subdict["text"] = "I like cats and dogs." test_obj._truncate_text() assert test_obj.subdict["text"] == "I like cats and dogs." + assert "text_truncated" not in test_obj.subdict test_obj.subdict["text"] = 20000 * "m" test_obj._truncate_text() - assert test_obj.subdict["text"] == 5000 * "m" + assert test_obj.subdict["text_truncated"] == 5000 * "m" + assert test_obj.subdict["text"] == 20000 * "m" @pytest.mark.gcv @@ -173,6 +175,14 @@ def test_analyse_image(set_testdict, set_environ, accepted): set_testdict[item], analyse_text=True, accept_privacy=accepted ) test_obj.analyse_image() + testdict = {} + testdict["text"] = 20000 * "m" + test_obj = tt.TextDetector( + testdict, skip_extraction=True, analyse_text=True, accept_privacy=accepted + ) + test_obj.analyse_image() + assert test_obj.subdict["text_truncated"] == 5000 * "m" + assert test_obj.subdict["text"] == 20000 * "m" @pytest.mark.gcv diff --git a/ammico/text.py b/ammico/text.py index ca2516b..c6c83fc 100644 --- a/ammico/text.py +++ b/ammico/text.py @@ -263,7 +263,7 @@ def _truncate_text(self, max_length: int = 5000) -> str: """Truncate the text if it is too long for googletrans.""" if self.subdict["text"] and len(self.subdict["text"]) > max_length: print("Text is too long - truncating to {} characters.".format(max_length)) - self.subdict["text"] = self.subdict["text"][:max_length] + self.subdict["text_truncated"] = self.subdict["text"][:max_length] def analyse_image(self) -> dict: """Perform text extraction and analysis of the text. @@ -283,7 +283,7 @@ def analyse_image(self) -> dict: self._truncate_text() self.translate_text() self.remove_linebreaks() - if self.analyse_text: + if self.analyse_text and self.subdict["text_english"]: self._run_spacy() self.clean_text() self.text_summary() @@ -336,8 +336,13 @@ def translate_text(self): raise ValueError( "Privacy disclosure not accepted - skipping text translation." ) + text_to_translate = ( + self.subdict["text_truncated"] + if "text_truncated" in self.subdict + else self.subdict["text"] + ) try: - translated = self.translator.translate(self.subdict["text"]) + translated = self.translator.translate(text_to_translate) except Exception: print("Could not translate the text with error {}.".format(Exception)) translated = None diff --git a/pyproject.toml b/pyproject.toml index 13e795d..d1ebdef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "ammico" -version = "0.2.5" +version = "0.2.6" description = "AI Media and Misinformation Content Analysis Tool" readme = "README.md" maintainers = [