diff --git a/dfa/construction.py b/dfa/construction.py index 0715ceb..97b4968 100644 --- a/dfa/construction.py +++ b/dfa/construction.py @@ -8,21 +8,20 @@ ) from nfa.nfa import ASCII_COUNT from utils import list_dict - - -dfa_list = [] +from utils import G def convert_to_dfa(nfa_start_node): + G.dfa_list = [] jump_table = list_dict(MAX_DFA_STATUS_NUM) ns = [nfa_start_node] n_closure = closure(ns) dfa = Dfa.nfas_to_dfa(n_closure) - dfa_list.append(dfa) + G.dfa_list.append(dfa) dfa_index = 0 - while dfa_index < len(dfa_list): - dfa = dfa_list[dfa_index] + while dfa_index < len(G.dfa_list): + dfa = G.dfa_list[dfa_index] for i in range(ASCII_COUNT): c = chr(i) nfa_move = move(dfa.nfa_sets, c) @@ -30,18 +29,18 @@ def convert_to_dfa(nfa_start_node): nfa_closure = closure(nfa_move) if nfa_closure is None: continue - new_dfa = convert_completed(dfa_list, nfa_closure) + new_dfa = convert_completed(G.dfa_list, nfa_closure) if new_dfa is None: new_dfa = Dfa.nfas_to_dfa(nfa_closure) - dfa_list.append(new_dfa) + G.dfa_list.append(new_dfa) next_state = new_dfa.status_num jump_table[dfa.status_num][c] = next_state if new_dfa.accepted: jump_table[new_dfa.status_num]['accepted'] = True dfa_index = dfa_index + 1 - + return jump_table - + def convert_completed(dfa_list, closure): for dfa in dfa_list: @@ -55,4 +54,4 @@ def log_dfa(dfa_list): for dfa in dfa_list: print('dfa num: ', dfa.status_num, dfa.accepted) for nfa in dfa.nfa_sets: - print(' nfa sets: ', nfa.status_num) \ No newline at end of file + print(' nfa sets: ', nfa.status_num) diff --git a/dfa/dfa.py b/dfa/dfa.py index 0103632..61df805 100644 --- a/dfa/dfa.py +++ b/dfa/dfa.py @@ -19,4 +19,4 @@ def nfas_to_dfa(cls, nfas): dfa.status_num = Dfa.STATUS_NUM Dfa.STATUS_NUM = Dfa.STATUS_NUM + 1 - return dfa \ No newline at end of file + return dfa diff --git a/dfa/dfa_group.py b/dfa/dfa_group.py index 361ddb7..8e0e1d3 100644 --- a/dfa/dfa_group.py +++ b/dfa/dfa_group.py @@ -21,4 +21,4 @@ def get(self, count): return self.group[count] def __len__(self): - return len(self.group) \ No newline at end of file + return len(self.group) diff --git a/dfa/minimize_dfa.py b/dfa/minimize_dfa.py index 5c74200..0dabee1 100644 --- a/dfa/minimize_dfa.py +++ b/dfa/minimize_dfa.py @@ -1,19 +1,16 @@ -from dfa.construction import dfa_list from dfa.dfa_group import DfaGroup from nfa.nfa import ASCII_COUNT from utils import list_dict - - -group_list = [] -on_partition = True +from utils import G def minimize_dfa(jump_table): + G.group_list = [] + G.on_partition = True partition_accepted() - global on_partition - while on_partition: - on_partition = False + while G.on_partition: + G.on_partition = False partition_on_num(jump_table) partition_on_char(jump_table) @@ -23,12 +20,12 @@ def minimize_dfa(jump_table): def partition_accepted(): group_na = [] group_a = [] - for dfa in dfa_list: + for dfa in G.dfa_list: if dfa.accepted: group_a.append(dfa) else: group_na.append(dfa) - + if len(group_a) > 0: append_group(group_a) if len(group_na) > 0: @@ -38,59 +35,59 @@ def partition_accepted(): def append_group(group_a): group = DfaGroup() group.group = group_a - group_list.append(group) + G.group_list.append(group) def partition_on_num(jump_table): - for group in group_list: - dfa_index = 1 - first_dfa = group.get(0) - next_dfa = group.get(dfa_index) - - while next_dfa is not None: - for i in range(10): + for group in G.group_list: + for i in range(10): + divide_group = dict() + for dfa in group.group: ch = str(i) - if partition(jump_table, group, first_dfa, next_dfa, ch): - global on_partition - on_partition = True - break - dfa_index = dfa_index + 1 - next_dfa = group.get(dfa_index) - - -def partition_on_char(jump_table): - for group in group_list: - dfa_index = 1 - first_dfa = group.get(0) - next_dfa = group.get(dfa_index) - - while next_dfa is not None: - for i in range(ASCII_COUNT): - ch = chr(i) - if not str.isdigit(ch) and partition(jump_table, group, first_dfa, next_dfa, ch): - global on_partition - on_partition = True - break - dfa_index = dfa_index + 1 - next_dfa = group.get(dfa_index) + partition(jump_table, dfa, divide_group, ch) + if len(divide_group) > 1: + G.on_partition = True + G.group_list.remove(group) + add_group_list(G.group_list, divide_group.items()) + return -def partition(jump_table, group, first, next, ch): - goto_first = jump_table[first.status_num].get(ch) - goto_next = jump_table[next.status_num].get(ch) +def add_group_list(group_list, divide_list): + for item in divide_list: + value_group = DfaGroup() + value_group.group = item[1] + group_list.append(value_group) - if dfa_in_group(goto_first) != dfa_in_group(goto_next): - new_group = DfaGroup() - group_list.append(new_group) - group.remove(next) - new_group.add(next) - return True - return False +def partition_on_char(jump_table): + for group in G.group_list: + for i in range(ASCII_COUNT): + divide_group = dict() # divide_group的key是group_num,value是一个group中转向编号为group_num的dfa列表 + for dfa in group.group: + ch = chr(i) + partition(jump_table, dfa, divide_group, ch) + if len(divide_group) > 1: # 字符ch将一个group划分成多个 + G.on_partition = True + G.group_list.remove(group) + add_group_list(G.group_list, divide_group.items()) + return + + +def partition(jump_table, dfa, divide_group, ch): + goto = jump_table[dfa.status_num].get(ch) + goto_group = dfa_in_group(goto) + if goto_group is None: + if divide_group.get(-1) is None: + divide_group[-1] = [] + divide_group[-1].append(dfa) + else: + if divide_group.get(goto_group.group_num) is None: + divide_group[goto_group.group_num] = [] + divide_group[goto_group.group_num].append(dfa) def dfa_in_group(status_num): - for group in group_list: + for group in G.group_list: for dfa in group.group: if dfa.status_num == status_num: return group @@ -99,7 +96,7 @@ def dfa_in_group(status_num): def create_mindfa_table(jump_table): trans_table = list_dict(ASCII_COUNT) - for dfa in dfa_list: + for dfa in G.dfa_list: from_dfa = dfa.status_num for i in range(ASCII_COUNT): ch = chr(i) @@ -125,4 +122,3 @@ def log_group(group_list): print('group num: ', group.group_num) for g in group.group: print(' dfa sets: ', g.status_num) - diff --git a/lex/lexer.py b/lex/lexer.py index 47df01c..b72ccc4 100644 --- a/lex/lexer.py +++ b/lex/lexer.py @@ -74,4 +74,4 @@ def handle_hex(self): return 1 def match(self, token): - return self.current_token == token \ No newline at end of file + return self.current_token == token diff --git a/nfa/construction.py b/nfa/construction.py index ce0cb13..96f5ae9 100644 --- a/nfa/construction.py +++ b/nfa/construction.py @@ -17,6 +17,7 @@ def pattern(pattern_string): + Nfa.STATUS_NUM = 0 global lexer lexer = Lexer(pattern_string) lexer.advance() @@ -90,12 +91,13 @@ def nfa_set_char(pair_out): def nfa_set_nega_char(pair_out): if not lexer.match(Token.CCL_START): return False - + neagtion = False lexer.advance() if lexer.match(Token.AT_BOL): neagtion = True - + lexer.advance() + start = pair_out.start_node = Nfa() start.next_1 = pair_out.end_node = Nfa() start.edge = CCL @@ -135,7 +137,7 @@ def dodash(input_set): def factor_conn(pair_out): if is_conn(lexer.current_token): factor(pair_out) - + while is_conn(lexer.current_token): pair = NfaPair() factor(pair) @@ -254,7 +256,7 @@ def group(pair_out): lexer.advance() elif lexer.match(Token.EOS): return False - else: + else: expr(pair_out) while True: @@ -268,10 +270,7 @@ def group(pair_out): lexer.advance() elif lexer.match(Token.EOS): return False - else: + else: expr(pair) pair_out.end_node.next_1 = pair.start_node pair_out.end_node = pair.end_node - - - \ No newline at end of file diff --git a/nfa/nfa.py b/nfa/nfa.py index cb82ed1..f9b33f8 100644 --- a/nfa/nfa.py +++ b/nfa/nfa.py @@ -51,7 +51,7 @@ def log_nfa(start_node): log('in: ', start_node.edge) if not next_1 and not next_2: - log('accept: ', start_node.status_num) + log('accept: ', start_node.status_num) start_node.visited = True if hasattr(start_node, 'input_set'): diff --git a/parse/parse.py b/parse/parse.py index 6403082..ba682b0 100644 --- a/parse/parse.py +++ b/parse/parse.py @@ -44,7 +44,7 @@ def closure(input_set): if next2 not in input_set: input_set.append(next2) nfa_stack.append(next2) - + return input_set @@ -61,4 +61,3 @@ def has_accepted_state(nfa_set): for nfa in nfa_set: if nfa.next_1 is None and nfa.next_2 is None: return True - diff --git a/parse/parse_dfa.py b/parse/parse_dfa.py index 784b35b..1a9a4d3 100644 --- a/parse/parse_dfa.py +++ b/parse/parse_dfa.py @@ -2,9 +2,13 @@ from nfa.construction import pattern from dfa.minimize_dfa import minimize_dfa from dfa.minimize_dfa import dfa_in_group +from dfa.dfa import Dfa +from dfa.dfa_group import DfaGroup def get_jump_table(pattern_string, minimize=True): + Dfa.STATUS_NUM = 0 + DfaGroup.GROUP_COUNT = 0 nfa_start_node = pattern(pattern_string) global jump_table jump_table = convert_to_dfa(nfa_start_node) @@ -18,7 +22,7 @@ def dfa_match(input_string, jump_table, minimize=True): if minimize: cur_status = dfa_in_group(0).group_num else: - cur_status = 0 + cur_status = 0 for i, c in enumerate(input_string): jump_dict = jump_table[cur_status] if jump_dict: @@ -30,4 +34,4 @@ def dfa_match(input_string, jump_table, minimize=True): if i == len(input_string) - 1 and jump_dict.get('accepted'): return True - return jump_table[cur_status].get('accepted') is not None \ No newline at end of file + return jump_table[cur_status].get('accepted') is not None diff --git a/regex.py b/regex.py index 9228ca2..63854bd 100644 --- a/regex.py +++ b/regex.py @@ -1,5 +1,5 @@ from parse.parse import match -from parse.parse_dfa import dfa_match +from parse.parse_dfa import dfa_match from nfa.construction import pattern from parse.parse_dfa import get_jump_table @@ -19,10 +19,10 @@ def match(self): return dfa_match(input_string, jump_table, self.minimize) else: nfa_machine = pattern(pattern_string) - return match(input_string, nfa_machine) + return match(input_string, nfa_machine) def replace(): pass def search(): - pass \ No newline at end of file + pass diff --git a/sample.py b/sample.py index 0074a92..23612d9 100644 --- a/sample.py +++ b/sample.py @@ -19,7 +19,7 @@ pattern = '([A-Z]+[0-9]*abcdefg)([0-9]*)(\*?|a+)(zx|bc*)([a-z]+|[0-9]*)(asd|fgh)(zxc)' -# NFA +# NFA regex = Regex(st, pattern) result = regex.match() log(result) diff --git a/test/test.py b/test.py similarity index 53% rename from test/test.py rename to test.py index 23b7d86..94220e3 100644 --- a/test/test.py +++ b/test.py @@ -1,12 +1,14 @@ import unittest from regex import Regex + class RegexMaterial(object): def __init__(self, str, pattern, result): self.str = str self.pattern = pattern self.result = result + testLists = [] testLists.append(RegexMaterial("a", "a", True)) testLists.append(RegexMaterial("a", "b+", False)) @@ -16,15 +18,28 @@ def __init__(self, str, pattern, result): testLists.append(RegexMaterial("abbbbb", "[^c]+", True)) testLists.append(RegexMaterial("ccccc", "[^c]+", False)) testLists.append(RegexMaterial("123", "[1-3]+", True)) +testLists.append(RegexMaterial("^", "[^1-3]+", True)) +testLists.append(RegexMaterial("fee", "fee|fie", True)) + class TestRegex(unittest.TestCase): def test(self): for t in testLists: - print("str is " + t.str + ", pattern is " + t.pattern + ", expected " + str(t.result)) + print("str is " + t.str + ", pattern is " + + t.pattern + ", expected " + str(t.result)) regex = Regex(t.str, t.pattern) self.assertEqual(regex.match(), t.result) - + for t in testLists: + print("str is " + t.str + ", pattern is " + + t.pattern + ", expected " + str(t.result)) + regex = Regex(t.str, t.pattern, 2, False) + self.assertEqual(regex.match(), t.result) + for t in testLists: + print("str is " + t.str + ", pattern is " + + t.pattern + ", expected " + str(t.result)) + regex = Regex(t.str, t.pattern, 2, True) + self.assertEqual(regex.match(), t.result) if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/utils.py b/utils.py index 6541e6a..91620c4 100644 --- a/utils.py +++ b/utils.py @@ -1,6 +1,12 @@ import time +class G: + dfa_list = [] + group_list = [] + on_partition = True + + def log(*args, **kwargs): format = '%H:%M:%S' value = time.localtime(int(time.time())) @@ -9,4 +15,4 @@ def log(*args, **kwargs): def list_dict(width): - return [dict() for i in range(width)] \ No newline at end of file + return [dict() for i in range(width)]