add english metaphors

cvjena · Nov 6, 2024 · 5e2f992 · 5e2f992
1 parent dba9aa1
commit 5e2f992
Show file tree

Hide file tree

Showing 8 changed files with 40 additions and 15 deletions.
diff --git a/src/freestylo/AlliterationAnnotation.py b/src/freestylo/AlliterationAnnotation.py
@@ -16,6 +16,7 @@
 #    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
 from freestylo.TextObject import TextObject
+from tqdm import tqdm
 
 """
 This class is used to find alliteration candidates in a text.
@@ -40,6 +41,8 @@ def __init__(self, text : TextObject, max_skip = 2, min_length=3, skip_tokens=["
         """
 
         self.text = text
+        self.type = "alliteration"
+        text.annotations.append(self)
         self.candidates = []
         self.max_skip = max_skip
         self.min_length = min_length
@@ -55,7 +58,7 @@ def find_candidates(self):
         open_candidates = {}
         i = 0
 
-        for i in range(len(tokens)):
+        for i in tqdm(range(len(tokens))):
             token = tokens[i]
             token_char = token[0].lower()
             # check if there is an  alliteration candidate with the current character
@@ -103,7 +106,7 @@ def serialize(self) -> list:
         for c in self.candidates:
             candidates.append({
                 "ids": c.ids,
-                "length": c.length,
+                "score": c.score,
                 "char": c.char})
         return candidates
 

diff --git a/src/freestylo/ChiasmusAnnotation.py b/src/freestylo/ChiasmusAnnotation.py
@@ -17,6 +17,7 @@
 
 from freestylo.TextObject import TextObject
 from freestylo.Configs import get_model_path
+from tqdm import tqdm
 import numpy as np
 
 class ChiasmusAnnotation:
@@ -54,10 +55,10 @@ def find_candidates(self):
         pos = self.text.pos
 
         outer_matches = []
-        for i in range(len(pos)):
+        for i in tqdm(range(len(pos))):
             outer_matches += self._find_matches(i, i + self.window_size)
 
-        for match in outer_matches:
+        for match in tqdm(outer_matches):
             A, A_ = match
             start_inner = A + 1
             inner_matches = self._find_matches(start_inner, A_)
@@ -156,13 +157,15 @@ def score_candidates(self):
         This method scores the chiasmus candidates.
         """
         features = []
-        for candidate in self.candidates:
+        for candidate in tqdm(self.candidates):
             features.append(self.get_features(candidate))
         if self.model is None:
             print("Load Chiasmus Model before scoring the candidates")
             return False
         features = np.stack(features)
+        print("   scoring....")
         scores = self.model.decision_function(features)
+        print("   Done scoring")
         for score, candidate in zip(scores, self.candidates):
             candidate.score = score
         return True

diff --git a/src/freestylo/Configs.py b/src/freestylo/Configs.py
@@ -23,6 +23,7 @@
 model_list = [
         "chiasmus_de.pkl",
         "metaphor_de.torch",
+        "metaphor_en.torch",
         "fasttext_mgh.bin.zip",
         ]
 

diff --git a/src/freestylo/EpiphoraAnnotation.py b/src/freestylo/EpiphoraAnnotation.py
@@ -17,6 +17,7 @@
 #    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
 from freestylo.TextObject import TextObject
+from tqdm import tqdm
 
 
 class EpiphoraAnnotation:
@@ -41,6 +42,8 @@ def __init__(self, text : TextObject, min_length=2, conj = ["and", "or", "but",
         """
 
         self.text = text
+        self.type = "epiphora"
+        text.annotations.append(self)
         self.candidates = []
         self.min_length = min_length
         self.conj = conj
@@ -58,8 +61,8 @@ def split_in_phrases(self):
 
         phrases = []
         current_start = 0
-        for i, token in enumerate(self.text.tokens):
-            if token in self.conj or self.text.pos[i] == self.punct_pos:
+        for i, token in tqdm(enumerate(self.text.tokens)):
+            if token in self.conj or self.text.pos[i] == self.punct_pos or self.text.pos[i] in ["CONJ", "CCONJ"]:
                 if i-current_start > 2:
                     phrases.append([current_start, i])
                     current_start = i+1
@@ -74,7 +77,7 @@ def find_candidates(self):
         candidates = []
         current_candidate = EpiphoraCandidate([], "")
         phrases = self.split_in_phrases()
-        for phrase in phrases:
+        for phrase in tqdm(phrases):
             word = self.text.tokens[phrase[1]-1]
             if word != current_candidate.word:
                 if len(current_candidate.ids) >= self.min_length:
@@ -97,7 +100,7 @@ def serialize(self) -> list:
         for c in self.candidates:
             candidates.append({
                 "ids": c.ids,
-                "length": c.length,
+                "score": c.score,
                 "word": c.word})
         return candidates
 

diff --git a/src/freestylo/MetaphorAnnotation.py b/src/freestylo/MetaphorAnnotation.py
@@ -20,6 +20,7 @@
 import freestylo.SimilarityNN as SimilarityNN
 from freestylo.TextObject import TextObject
 from freestylo.Configs import get_model_path
+from tqdm import tqdm
 
 
 
@@ -49,7 +50,7 @@ def find_candidates(self):
         This method finds metaphor candidates in the text.
         """
         pos = self.text.pos
-        for i in range(len(pos)-1):
+        for i in tqdm(range(len(pos)-1)):
             if pos[i] == "ADJ" and pos[i+1] == "NOUN":
                 self.candidates.append(MetaphorCandidate(i, i+1))
 
@@ -119,7 +120,9 @@ def score_candidates(self):
         adj_metaphor_tensor = self.model(adj_tensor)
         noun_metaphor_tensor = self.model(noun_tensor)
         #scores = 1-(torch.nn.CosineSimilarity()(adj_metaphor_tensor, noun_metaphor_tensor)+1)/2
+        print("   scoring...")
         scores = cosine_distance(adj_metaphor_tensor, noun_metaphor_tensor)
+        print("   done")
         for score, candidate in zip(scores, self.candidates):
             candidate.score = score.item()
 

diff --git a/src/freestylo/PolysyndetonAnnotation.py b/src/freestylo/PolysyndetonAnnotation.py
@@ -16,6 +16,7 @@
 #    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
 from freestylo.TextObject import TextObject
+from tqdm import tqdm
 
 
 class PolysyndetonAnnotation:
@@ -42,6 +43,8 @@ def __init__(self, text : TextObject, min_length=2, conj = ["and", "or", "but",
         """
 
         self.text = text
+        self.type = "polysyndeton"
+        text.annotations.append(self)
         self.candidates = []
         self.min_length = min_length
         self.conj = conj
@@ -62,7 +65,7 @@ def split_in_phrases(self):
         phrases = []
         current_sentence_start = 0
         current_phrase_start = 0
-        for i, token in enumerate(self.text.tokens):
+        for i, token in tqdm(enumerate(self.text.tokens)):
             if token in self.sentence_end_tokens:
                 phrases.append([current_phrase_start, i])
                 current_phrase_start = i+1
@@ -95,7 +98,7 @@ def find_candidates(self):
         """
         candidates = []
         sentences = self.split_in_phrases()
-        for sentence in sentences:
+        for sentence in tqdm(sentences):
             current_candidate = PolysyndetonCandidate([], "")
             current_word = ""
             for phrase in sentence:
@@ -109,7 +112,7 @@ def find_candidates(self):
 
         self.candidates = []
         for candidate in candidates:
-            if candidate.word in self.conj:
+            if candidate.word in self.conj or self.text.pos[candidate.ids[0][0]] in ["CONJ", "CCONJ"]:
                 self.candidates.append(candidate)
 
 

diff --git a/src/freestylo/TextObject.py b/src/freestylo/TextObject.py
@@ -48,7 +48,7 @@ def __init__(self, textfile=None, text=None, language=''):
 
         if textfile is not None:
             try:
-                with open(textfile, 'r') as f:
+                with open(textfile, 'r', errors="ignore") as f:
                     self.text = f.read()
             except FileNotFoundError:
                 print("File not found, no textfile loaded")
@@ -77,6 +77,7 @@ def serialize(self, filename):
         with open(filename, 'w') as f:
             annotations = {}
             for anno in self.annotations:
+                print("serializing annotation", anno.type)
                 annotations[anno.type] = anno.serialize()
             save_dict = {
                 'text': self.text,
@@ -87,6 +88,7 @@ def serialize(self, filename):
                 'token_offsets': self.token_offsets,
                 'annotations': annotations
             }
+            print("save")
             with open(filename, 'w') as f:
                 json.dump(save_dict, f, indent=4)
 

diff --git a/src/freestylo/TextPreprocessor.py b/src/freestylo/TextPreprocessor.py
@@ -24,7 +24,7 @@ class TextPreprocessor:
     This class is used to preprocess text.
     It uses the TextObject class to store the text and its annotations.
     """
-    def __init__(self, language='en'):
+    def __init__(self, language='en', max_length=None):
         """
         Constructor for the TextPreprocessor class.
 
@@ -42,6 +42,13 @@ def __init__(self, language='en'):
             from MGHPreprocessor import MGHPreprocessor
             self.nlp = MGHPreprocessor()
 
+        if max_length is not None:
+            try:
+                self.nlp.max_length = max_length
+            except:
+                print("Setting nlp max length not supported for middle high german, continue...")
+
+
 
     def load_spacy_nlp(self, model_name):
         """