Skip to content

Commit

Permalink
add english metaphors
Browse files Browse the repository at this point in the history
  • Loading branch information
fschncvg committed Nov 6, 2024
1 parent dba9aa1 commit 5e2f992
Show file tree
Hide file tree
Showing 8 changed files with 40 additions and 15 deletions.
7 changes: 5 additions & 2 deletions src/freestylo/AlliterationAnnotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>.

from freestylo.TextObject import TextObject
from tqdm import tqdm

"""
This class is used to find alliteration candidates in a text.
Expand All @@ -40,6 +41,8 @@ def __init__(self, text : TextObject, max_skip = 2, min_length=3, skip_tokens=["
"""

self.text = text
self.type = "alliteration"
text.annotations.append(self)
self.candidates = []
self.max_skip = max_skip
self.min_length = min_length
Expand All @@ -55,7 +58,7 @@ def find_candidates(self):
open_candidates = {}
i = 0

for i in range(len(tokens)):
for i in tqdm(range(len(tokens))):
token = tokens[i]
token_char = token[0].lower()
# check if there is an alliteration candidate with the current character
Expand Down Expand Up @@ -103,7 +106,7 @@ def serialize(self) -> list:
for c in self.candidates:
candidates.append({
"ids": c.ids,
"length": c.length,
"score": c.score,
"char": c.char})
return candidates

Expand Down
9 changes: 6 additions & 3 deletions src/freestylo/ChiasmusAnnotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from freestylo.TextObject import TextObject
from freestylo.Configs import get_model_path
from tqdm import tqdm
import numpy as np

class ChiasmusAnnotation:
Expand Down Expand Up @@ -54,10 +55,10 @@ def find_candidates(self):
pos = self.text.pos

outer_matches = []
for i in range(len(pos)):
for i in tqdm(range(len(pos))):
outer_matches += self._find_matches(i, i + self.window_size)

for match in outer_matches:
for match in tqdm(outer_matches):
A, A_ = match
start_inner = A + 1
inner_matches = self._find_matches(start_inner, A_)
Expand Down Expand Up @@ -156,13 +157,15 @@ def score_candidates(self):
This method scores the chiasmus candidates.
"""
features = []
for candidate in self.candidates:
for candidate in tqdm(self.candidates):
features.append(self.get_features(candidate))
if self.model is None:
print("Load Chiasmus Model before scoring the candidates")
return False
features = np.stack(features)
print(" scoring....")
scores = self.model.decision_function(features)
print(" Done scoring")
for score, candidate in zip(scores, self.candidates):
candidate.score = score
return True
Expand Down
1 change: 1 addition & 0 deletions src/freestylo/Configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
# File names of the models listed by this configuration module, one entry
# per model file.
# NOTE(review): presumably these are the names resolved by get_model_path —
# confirm against the rest of Configs.py.
model_list = [
    "chiasmus_de.pkl",       # pickled chiasmus model
    "metaphor_de.torch",     # torch metaphor model, "de" variant
    "metaphor_en.torch",     # torch metaphor model, "en" variant
    "fasttext_mgh.bin.zip",  # zipped fastText binary, "mgh" variant
]

Expand Down
11 changes: 7 additions & 4 deletions src/freestylo/EpiphoraAnnotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>.

from freestylo.TextObject import TextObject
from tqdm import tqdm


class EpiphoraAnnotation:
Expand All @@ -41,6 +42,8 @@ def __init__(self, text : TextObject, min_length=2, conj = ["and", "or", "but",
"""

self.text = text
self.type = "epiphora"
text.annotations.append(self)
self.candidates = []
self.min_length = min_length
self.conj = conj
Expand All @@ -58,8 +61,8 @@ def split_in_phrases(self):

phrases = []
current_start = 0
for i, token in enumerate(self.text.tokens):
if token in self.conj or self.text.pos[i] == self.punct_pos:
for i, token in tqdm(enumerate(self.text.tokens)):
if token in self.conj or self.text.pos[i] == self.punct_pos or self.text.pos[i] in ["CONJ", "CCONJ"]:
if i-current_start > 2:
phrases.append([current_start, i])
current_start = i+1
Expand All @@ -74,7 +77,7 @@ def find_candidates(self):
candidates = []
current_candidate = EpiphoraCandidate([], "")
phrases = self.split_in_phrases()
for phrase in phrases:
for phrase in tqdm(phrases):
word = self.text.tokens[phrase[1]-1]
if word != current_candidate.word:
if len(current_candidate.ids) >= self.min_length:
Expand All @@ -97,7 +100,7 @@ def serialize(self) -> list:
for c in self.candidates:
candidates.append({
"ids": c.ids,
"length": c.length,
"score": c.score,
"word": c.word})
return candidates

Expand Down
5 changes: 4 additions & 1 deletion src/freestylo/MetaphorAnnotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import freestylo.SimilarityNN as SimilarityNN
from freestylo.TextObject import TextObject
from freestylo.Configs import get_model_path
from tqdm import tqdm



Expand Down Expand Up @@ -49,7 +50,7 @@ def find_candidates(self):
This method finds metaphor candidates in the text.
"""
pos = self.text.pos
for i in range(len(pos)-1):
for i in tqdm(range(len(pos)-1)):
if pos[i] == "ADJ" and pos[i+1] == "NOUN":
self.candidates.append(MetaphorCandidate(i, i+1))

Expand Down Expand Up @@ -119,7 +120,9 @@ def score_candidates(self):
adj_metaphor_tensor = self.model(adj_tensor)
noun_metaphor_tensor = self.model(noun_tensor)
#scores = 1-(torch.nn.CosineSimilarity()(adj_metaphor_tensor, noun_metaphor_tensor)+1)/2
print(" scoring...")
scores = cosine_distance(adj_metaphor_tensor, noun_metaphor_tensor)
print(" done")
for score, candidate in zip(scores, self.candidates):
candidate.score = score.item()

Expand Down
9 changes: 6 additions & 3 deletions src/freestylo/PolysyndetonAnnotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>.

from freestylo.TextObject import TextObject
from tqdm import tqdm


class PolysyndetonAnnotation:
Expand All @@ -42,6 +43,8 @@ def __init__(self, text : TextObject, min_length=2, conj = ["and", "or", "but",
"""

self.text = text
self.type = "polysyndeton"
text.annotations.append(self)
self.candidates = []
self.min_length = min_length
self.conj = conj
Expand All @@ -62,7 +65,7 @@ def split_in_phrases(self):
phrases = []
current_sentence_start = 0
current_phrase_start = 0
for i, token in enumerate(self.text.tokens):
for i, token in tqdm(enumerate(self.text.tokens)):
if token in self.sentence_end_tokens:
phrases.append([current_phrase_start, i])
current_phrase_start = i+1
Expand Down Expand Up @@ -95,7 +98,7 @@ def find_candidates(self):
"""
candidates = []
sentences = self.split_in_phrases()
for sentence in sentences:
for sentence in tqdm(sentences):
current_candidate = PolysyndetonCandidate([], "")
current_word = ""
for phrase in sentence:
Expand All @@ -109,7 +112,7 @@ def find_candidates(self):

self.candidates = []
for candidate in candidates:
if candidate.word in self.conj:
if candidate.word in self.conj or self.text.pos[candidate.ids[0][0]] in ["CONJ", "CCONJ"]:
self.candidates.append(candidate)


Expand Down
4 changes: 3 additions & 1 deletion src/freestylo/TextObject.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__(self, textfile=None, text=None, language=''):

if textfile is not None:
try:
with open(textfile, 'r') as f:
with open(textfile, 'r', errors="ignore") as f:
self.text = f.read()
except FileNotFoundError:
print("File not found, no textfile loaded")
Expand Down Expand Up @@ -77,6 +77,7 @@ def serialize(self, filename):
with open(filename, 'w') as f:
annotations = {}
for anno in self.annotations:
print("serializing annotation", anno.type)
annotations[anno.type] = anno.serialize()
save_dict = {
'text': self.text,
Expand All @@ -87,6 +88,7 @@ def serialize(self, filename):
'token_offsets': self.token_offsets,
'annotations': annotations
}
print("save")
with open(filename, 'w') as f:
json.dump(save_dict, f, indent=4)

Expand Down
9 changes: 8 additions & 1 deletion src/freestylo/TextPreprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class TextPreprocessor:
This class is used to preprocess text.
It uses the TextObject class to store the text and its annotations.
"""
def __init__(self, language='en'):
def __init__(self, language='en', max_length=None):
"""
Constructor for the TextPreprocessor class.
Expand All @@ -42,6 +42,13 @@ def __init__(self, language='en'):
from MGHPreprocessor import MGHPreprocessor
self.nlp = MGHPreprocessor()

if max_length is not None:
try:
self.nlp.max_length = max_length
except:
print("Setting nlp max length not supported for middle high german, continue...")



def load_spacy_nlp(self, model_name):
"""
Expand Down

0 comments on commit 5e2f992

Please sign in to comment.