Skip to content

Commit

Permalink
edge-cases
Browse files Browse the repository at this point in the history
  • Loading branch information
melanchthon19 committed Feb 16, 2021
1 parent 95fd8bf commit 4716b1f
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 22 deletions.
4 changes: 2 additions & 2 deletions decimas_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import PyPDF2


class Scrap_Decimas():
class ScrapDecimas():
def __init__(self):
self.versos = []

Expand Down Expand Up @@ -80,7 +80,7 @@ def output_file(self, file_name, versos):
file.write(f'{verso}\n')


SD = Scrap_Decimas()
SD = ScrapDecimas()
SD.scrap_cervantes()
SD.scrap_diferentemente()
SD.scrap_violeta()
Expand Down
2 changes: 1 addition & 1 deletion phonetics.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
'y': 'i'}
}
punctuation = ['.', ',', ';', '?']
double_consonants = ['tr', 'dr', 'ns', 'pr', 'fr']
double_consonants = ['tr', 'dr', 'ns', 'pr', 'fr', 'br']

vowels = list(vowels_strong.keys()) + list(vowels_weak.keys()) + list(vowels_accented.keys())
alphabet = vowels + list(consonants.keys())
Expand Down
79 changes: 60 additions & 19 deletions silabeador.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def count_syllables_text(self, text):
for line in text:
syllables_text.append(self.count_syllables_sentence(line))

def count_syllables_sentence(self, sentence):
def count_syllables_sentence(self, sentence, debug=False):
'''
main function that takes a sentence as input and counts the number of metric syllables
input: string
Expand All @@ -70,7 +70,7 @@ def count_syllables_sentence(self, sentence):
self.structure = [self.phonemes2structure(word) for word in self.phonemes]

# dividing the structure in syllables (i.e. 'CFCFC' --> CF-CFC)
self.structure_syllables = [self.divide_structure_syllables(word) for word in self.structure]
self.structure_syllables = [self.divide_structure_syllables(word_structure) for word_structure in self.structure]
# dividing the word in syllables according to how the structure was previously divided (i.e. 'cazar' --> 'ca-zar')
self.word_syllables = [self.add_separator(structure, word) for structure, word in zip(self.structure_syllables, self.phonemes)]
# metric rules are considered when counting syllables in a sentence
Expand All @@ -81,22 +81,27 @@ def count_syllables_sentence(self, sentence):

self.number_syllables = self.count(self.word_syllables, self.metric_rule(self.last_word))

if debug and self.number_syllables != 8:
print(sentence)

if self.verbose == 1:
print(sentence, self.word_syllables, self.number_syllables)
print(f'{sentence} --> {self.word_syllables} [{self.number_syllables}]')

elif self.verbose == 2:# and self.number_syllables != 8: # add second condition for debugging purposes
print('sentence', self.sentence)
print('\nsentence', sentence)
print('phonemes', self.phonemes)
print('structure', self.structure)
print('structure syllables', self.structure_syllables)
print('word syllables', self.word_syllables)
print('number of metric syllables', self.number_syllables)

elif self.verbose == 3:
print('last word:', self.last_word['word'])
print('last word structure', self.last_word['structure'])
print('last word phonemes', self.last_word['phonemes'])
print('last word syllables', self.last_word['word_syllables'])
print('last word accent:', self.last_word['accent'])
print('is last word monosilabo:', self.last_word['monosilabo'])
print('number of metric syllables', self.number_syllables)

return self.number_syllables # amount of metric syllables in the sentence

Expand Down Expand Up @@ -138,6 +143,10 @@ def are_vowels(self, c1, c2):
return False

def word2phonemes(self, word):
"""
function that takes a word and translates it to phonemes
following the rules from char2phone dictionary
"""
# findall retrieves a list of characters only
phonemes = ''.join(re.findall('[^\W]*', word)) # TODO: deal with punctuation in the middle of sentence
for rule in self.char2phone.keys(): # certain rules must be applied first
Expand All @@ -148,20 +157,31 @@ def word2phonemes(self, word):
return phonemes

def phonemes2structure(self, phonemes):
    """
    Translate a sequence of phonemes into its list of structure symbols.

    The phonemes are first passed through ``self.reduce_double_syllables``
    (per the original notes, double consonants such as 'tr' or 'ns' are
    collapsed there into the single structure character 'T').  Every
    remaining symbol found in ``self.alphabet`` is then replaced by its
    entry in ``self.phonemes_dict``; any other symbol (e.g. an already
    reduced 'T') is kept unchanged.

    input: phonemes of one word (string)
    output: list with one structure symbol per reduced phoneme
    """
    phonemes_reduced = self.reduce_double_syllables(phonemes)
    structure = [
        # symbols outside the alphabet are passed through untouched
        self.phonemes_dict[phone] if phone in self.alphabet else phone
        for phone in phonemes_reduced
    ]
    return structure

def divide_structure_syllables(self, word):
sequence = ''.join(word)
pattern = re.compile(r"""(CDDC(?![FAD]))?
(CDAC(?![FAD]))?
def divide_structure_syllables(self, word_structure, debug=False):
sequence = ''.join(word_structure)
pattern = re.compile(r"""(CDDC(?![AFD]))?
(CDAC(?![AFD]))?
(CFDC)?
(CDD)?
(CDFC(?![AFD]))?
(CDF)?
(CDA)?
(CDC(?![AFD]))?
(DFC)?
(CCF)?
(CFC(?![AFD]))?
(CAC(?![AFD]))?
(TFC(?![AFD]))?
(CFT(?![AFD]))?
(CFC(?![AFD]))?
Expand All @@ -172,16 +192,23 @@ def divide_structure_syllables(self, word):
(TD)?
(TAC)?
(TA)?
(FC(?![FAD]))?
(FC(?![AFD]))?
(FC(?![AFD]))?
(FD)?
(CD(?![AFD]))?
(DC(?=[CT]))?
(CA)?
(F(?=C[FAD])?)?
(CA(?![CT]))?
(F(?=C[AFD])?)?
(A)?
(D)?""", re.VERBOSE)
(D)?
""", flags=re.VERBOSE)
match = re.findall(pattern, sequence)
structure_syllables = '-'.join([syllable for group in match for syllable in group if syllable])

if debug:
print('word structure', word_structure)
print('sequence', sequence)
print('match', match)
print('structure syllable', structure_syllables)
return structure_syllables

def reduce_double_syllables(self, word):
Expand All @@ -193,18 +220,27 @@ def reduce_double_syllables(self, word):

return ''.join(word)

def add_separator(self, structure, word, debug=False):
    """
    Insert hyphens into a word at the syllable boundaries of its structure.

    It takes a separated structure (i.e. 'F-F-CF') and its word ('aora')
    and returns the word with the same hyphens inserted ('a-o-ra').
    The structure symbol 'T' stands for a double consonant (i.e. 'ns', 'tr'),
    so it accounts for two characters of the word instead of one.

    input: structure (string with '-' between syllables), word (string)
           debug (bool, optional): when True, print the intermediate values
    output: the word with '-' between its syllables (string)
    """
    word_segmented = list(word)
    forward = 0  # extra offset accumulated by every 'T' seen so far

    # Walk the structure, not the word: hyphens already inserted keep the
    # word aligned with the structure index, and only 'T' shifts it further.
    # The last structure symbol is left out, as in the original loop bound
    # `len(structure) - 1`; nothing can be inserted after it.
    for position, symbol in enumerate(structure[:-1]):
        if symbol == '-':
            word_segmented.insert(position + forward, '-')
        elif symbol == 'T':
            forward += 1  # moving one forward because T is mapped to two characters

    separated = ''.join(word_segmented)

    if debug:
        print(structure, word, word_segmented, separated)

    return separated


Expand Down Expand Up @@ -287,8 +323,13 @@ def count(self, word_syllables, last_word_count):
# ph is a dictionary with information regarding vowels, alphabet, char2phone rules, etc.
ph = phonetics.phonetics
text = preprocess.read_txt('data/decima2.txt')
#text = preprocess.read_txt('cases.txt')
#text = preprocess.read_txt('data/decimas_data_small.txt')

silabeador = Silabeador(**ph)
silabeador.sinalefa = True # counting using sinalefa
silabeador.count_syllables_sentence(text[3])
#silabeador.count_syllables_text(text)
#silabeador.count_syllables_sentence(text[11])
silabeador.count_syllables_text(text)
#silabeador.count_syllables_sentence('pidió el cid alojamiento')
#silabeador.divide_structure_syllables(['C', 'F', 'C', 'F', 'D', 'C'], debug=True)
#silabeador.add_separator('CF-CA-F', 'maría', debug=True)

0 comments on commit 4716b1f

Please sign in to comment.