forked from RobMcH/CYK-Parser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Parser.py
190 lines (173 loc) · 8.75 KB
/
Parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import sys
import os.path
try:
from . import GrammarConverter
except (SystemError, ImportError):
import GrammarConverter
class Node:
"""
Used for storing information about a non-terminal symbol. A node can have a maximum of two
children because of the CNF of the grammar.
It is possible though that there are multiple parses of a sentence. In this case information
about an alternative child is stored in self.child1 or self.child2 (the parser will decide
where according to the ambiguous rule).
terminal_index is used to get the corresponding terminal symbol to a non-terminal symbol in the
major diagonal of the parse table (which corresponds to the input sentence).
"""
def __init__(self, symbol, y, x, i, terminal_index = None, y2 = None, x2 = None, i2 = None):
if terminal_index is not None:
self.terminal_index = terminal_index
self.child1 = [(y, x, i)]
self.child2 = [(y2, x2, i2)]
self.symbol = symbol
def __repr__(self):
"""
:return: the string representation of a Node object.
"""
return self.symbol
class Parser:
"""
A CYK parser which is able to parse any grammar in CNF. The grammar can be read from a file or
passed as a string. It either returns a string representation of the parse tree(s) or prints it
to standard out.
"""
def __init__(self, grammar, sentence):
"""
Creates a new parser object which will read in the grammar and transform it into CNF and
then parse the given sentence with that grammar.
:param grammar: the file path to the grammar/the string repr. of the grammar to read in
:param sentence: the file path to the sentence/the string repr. of the sentence to read in
"""
self.parse_table = None
self.prods = {}
self.grammar = None
if os.path.isfile(grammar):
self.grammar_from_file(grammar)
else:
self.grammar_from_string(grammar)
self.__call__(sentence)
def __call__(self, sentence, parse = False):
"""
Parse the given sentence (string or file) with the earlier given grammar.
:param sentence: the sentence to parse with self.grammar
"""
if os.path.isfile(sentence):
with open(sentence) as inp:
self.input = inp.readline().split()
if parse:
self.parse()
else:
self.input = sentence.split()
def grammar_from_file(self, grammar):
"""
Reads in a CFG from a given file, converts it to CNF and stores it in self.grammar.
:param grammar: the file in which the grammar is stored.
"""
self.grammar = GrammarConverter.convert_grammar(GrammarConverter.read_grammar(grammar))
def grammar_from_string(self, grammar):
"""
Reads in a CFG from a string, converts it to CNF and stores it in self.grammar.
:param grammar: the CFG in string representation.
:return:
"""
self.grammar = GrammarConverter.convert_grammar(
[x.replace("->", "").split() for x in grammar.split("\n")])
def parse(self):
"""
Does the actual parsing according to the CYK algorithm. The parse table is stored in
self.parse_table.
"""
length = len(self.input) + 1
self.parse_table = [[[] for x in range(length)] for y in range(length - 1)]
for i in range(1, length):
# Find out which non terminals can generate the terminals in the input string
# and put them into the parse table. One terminal could be generated by multiple
# non terminals, therefore the parse table will contain a list of non terminals.
for rule in self.grammar:
if f"'{self.input[i - 1]}'" == rule[1]:
self.parse_table[i - 1][i].append(Node(rule[0], None, None, None, i - 1))
for i in range(2, length):
for j in range(i - 2, -1, -1):
for k in range(j + 1, i):
for rule in self.grammar:
# Get every possible production of the nodes at a certain position in the
# parse table. Results are cached for better performance.
if (j, k) not in self.prods:
self.prods[j, k] = [node.symbol for node in self.parse_table[j][k]]
prod1 = self.prods[j, k]
if (k, i) not in self.prods:
self.prods[k, i] = [node.symbol for node in self.parse_table[k][i]]
prod2 = self.prods[k, i]
if rule[1] in prod1 and rule[2] in prod2:
node = Node(rule[0], j, k, prod1.index(rule[1]), None, k, i,
prod2.index(rule[2]))
self.parse_table[j][i].append(node)
if (j, i) not in self.prods:
self.prods[j, i] = []
self.prods[j, i].append(node.symbol)
# If the parse table has multiple symbols at a certain position one
# must check if the current rule can produce them and if so the node
# (of the current rule) is modified. In this way ambiguous sentences
# are handled.
if len(prod1) > 1:
for ind, symbol in enumerate(prod1):
if symbol == rule[1] and ind != prod1.index(rule[1]):
node.child1.append((j, k, ind))
if len(prod2) > 1:
for ind, symbol in enumerate(prod2):
if symbol == rule[2] and ind != prod2.index(rule[2]):
node.child2.append((k, i, ind))
def print_tree(self, output = True):
"""
Print the parse tree starting with the start symbol. Alternatively it returns the string
representation of the tree(s) instead of printing it.
"""
start_symbol = self.parse_table[0][len(self.input)][0]
if start_symbol.symbol == self.grammar[0][0]:
if output:
print("The given sentence is contained in the language produced by the given "
"grammar!")
print("\nPossible parse(s):")
trees = self.generate_trees(start_symbol)
if trees.find(f"]{start_symbol.symbol}") > -1:
# There are multiple parses. This formats the string properly.
trees = trees.replace(f"]{start_symbol.symbol}", f"]\n[{start_symbol.symbol}")
return print(trees) if output else trees
else:
print("The given sentence is not contained in the language produced by the given "
"grammar!")
def generate_trees(self, node):
"""
Generates the string representation of the parse tree(s).
:param node: the starting node.
:return: the parse tree in string form.
"""
tree, table = "[", self.parse_table
# For each node every child is checked, that guarantees that ambiguous
# sentences are handled correctly.
for child1 in node.child1:
child1_y, child1_x, child1_i = child1
for child2 in node.child2:
child2_y, child2_x, child2_i = child2
if child1_x is None and child1_y is None and child2_x is None and child2_y is None:
tree = f"{tree}{node.symbol} '{self.input[node.terminal_index]}'"
else:
tree = f"{tree}{node.symbol} "
if child1_y is not None and child1_x is not None:
tree = f"{tree}{self.generate_trees(table[child1_y][child1_x][child1_i])}"
if child2_y is not None and child2_x is not None:
tree = f"{tree}{self.generate_trees(table[child2_y][child2_x][child2_i])}"
tree += "]"
return tree
if __name__ == '__main__':
ARGUMENTS = sys.argv
if len(ARGUMENTS) > 3 or len(ARGUMENTS) == 2:
print("Usage: python3 Parser.py <grammar.file> <sentence.file>\n"
"or: python3 Parser.py <grammar as string> <sentence as string>")
elif len(ARGUMENTS) == 3:
GRAMMAR = "as path for the grammar" if os.path.isfile(ARGUMENTS[1]) else "as grammar"
SENTENCE = "as path for the sentence" if os.path.isfile(ARGUMENTS[2]) else "as sentence"
print(f"Using {ARGUMENTS[1]} {GRAMMAR} and {ARGUMENTS[2]} {SENTENCE}.")
PARSER = Parser(ARGUMENTS[1], ARGUMENTS[2])
PARSER.parse()
PARSER.print_tree()