From 981b1b4fb10e41caaedd4d798167619f496a68ab Mon Sep 17 00:00:00 2001 From: hlib Date: Sun, 8 Mar 2020 21:23:42 +0100 Subject: [PATCH] #5: rename SplitContainer -> Identifier --- codeprep/api/text.py | 38 +++++------ codeprep/parse/subtokens.py | 10 +-- codeprep/prepconfig.py | 4 +- codeprep/tokentypes/containers.py | 2 +- codeprep/tokentypes/noneng.py | 10 +-- tests/parse/test_core.py | 102 ++++++++++++++-------------- tests/parse/test_subtokens.py | 14 ++-- tests/test_subword_separation.py | 18 ++--- tests/test_to_repr.py | 106 +++++++++++++++--------------- 9 files changed, 152 insertions(+), 152 deletions(-) diff --git a/codeprep/api/text.py b/codeprep/api/text.py index 4fd0561..e058977 100644 --- a/codeprep/api/text.py +++ b/codeprep/api/text.py @@ -76,10 +76,10 @@ def nosplit(text: str, extension: Optional[str] = None, no_spaces: bool = False, >>> prepped_tokens.metadata.n_subtokens_per_token [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] >>> list(map(lambda x: x.__name__, prepped_tokens.metadata.token_types)) - ['KeyWord', 'SplitContainer', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', 'NewLine', \ -'Tab', 'KeyWord', 'OpeningBracket', 'SplitContainer', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', \ + ['KeyWord', 'Identifier', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', 'NewLine', \ +'Tab', 'KeyWord', 'OpeningBracket', 'Identifier', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', \ 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', \ -'Tab', 'Tab', 'SplitContainer', 'OpeningBracket', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', \ +'Tab', 'Tab', 'Identifier', 'OpeningBracket', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', \ 'StringLiteral', 'StringLiteral', 'StringLiteral', 'ClosingBracket', 'Semicolon', 'NewLine', \ 'ClosingCurlyBracket', 'NewLine', \ 'ClosingCurlyBracket', 'SpecialToken'] @@ -137,9 +137,9 @@ def nosplit(text: str, extension: Optional[str] = None, no_spaces: bool = False, >>> prepped_tokens.metadata.n_subtokens_per_token [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] >>> list(map(lambda x: x.__name__, prepped_tokens.metadata.token_types)) - ['KeyWord', 'SplitContainer', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \ -'KeyWord', 'OpeningBracket', 'SplitContainer', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', 'OneLineComment', \ -'SplitContainer', 'OpeningBracket', 'StringLiteral', 'ClosingBracket', 'Semicolon', \ + ['KeyWord', 'Identifier', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \ +'KeyWord', 'OpeningBracket', 'Identifier', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', 'OneLineComment', \ +'Identifier', 'OpeningBracket', 'StringLiteral', 'ClosingBracket', 'Semicolon', \ 'ClosingCurlyBracket', \ 'ClosingCurlyBracket'] @@ -206,10 +206,10 @@ def chars(text: str, extension: Optional[str] = None, no_spaces: bool = False, n >>> prepped_tokens.metadata.n_subtokens_per_token [1, 30, 1, 1, 1, 1, 1, 4, 1, 1, 9, 1, 1, 1, 1, 6, 4, 1, 10, 1, 33, 1, 1, 1, 1, 1] >>> list(map(lambda x: x.__name__, prepped_tokens.metadata.token_types)) - ['KeyWord', 'SplitContainer', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \ -'KeyWord', 'OpeningBracket', 'SplitContainer', 'Operator', 'Operator', 'Number', 
'ClosingBracket', 'OpeningCurlyBracket', \ + ['KeyWord', 'Identifier', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \ +'KeyWord', 'OpeningBracket', 'Identifier', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', \ 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', \ -'SplitContainer', 'OpeningBracket', 'StringLiteral', 'ClosingBracket', 'Semicolon', \ +'Identifier', 'OpeningBracket', 'StringLiteral', 'ClosingBracket', 'Semicolon', \ 'ClosingCurlyBracket', 'ClosingCurlyBracket', 'SpecialToken'] @@ -271,9 +271,9 @@ def basic(text: str, extension: Optional[str] = None, [1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] >>> list(map(lambda x: x.__name__, prepped_tokens.metadata.token_types)) - ['KeyWord', 'SplitContainer', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \ -'KeyWord', 'OpeningBracket', 'SplitContainer', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', \ -'SplitContainer', 'OpeningBracket', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'ClosingBracket', 'Semicolon', \ + ['KeyWord', 'Identifier', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \ +'KeyWord', 'OpeningBracket', 'Identifier', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', \ +'Identifier', 'OpeningBracket', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'ClosingBracket', 'Semicolon', \ 'ClosingCurlyBracket', \ 'ClosingCurlyBracket', 'SpecialToken'] @@ -289,9 +289,9 @@ def basic(text: str, extension: Optional[str] = None, [1, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 5, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1] >>> list(map(lambda x: x.__name__, prepped_tokens.metadata.token_types)) - ['KeyWord', 'SplitContainer', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \ -'KeyWord', 'OpeningBracket', 'SplitContainer', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', \ -'SplitContainer', 'OpeningBracket', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'ClosingBracket', 'Semicolon', \ + ['KeyWord', 'Identifier', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \ +'KeyWord', 'OpeningBracket', 'Identifier', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', \ +'Identifier', 'OpeningBracket', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'ClosingBracket', 'Semicolon', \ 'ClosingCurlyBracket', \ 'ClosingCurlyBracket'] @@ -315,7 +315,7 @@ def basic(text: str, extension: Optional[str] = None, >>> prepped_tokens ['', 'moving', 'Vehiclesspeed', '', '=', '', '0', '.', '3', '4', '5', 'e', '+', '4', ''] >>> prepped_tokens.metadata - ([4, 1, 10], ['SplitContainer', 'Operator', 'Number']) + ([4, 1, 10], ['Identifier', 'Operator', 'Number']) >>> basic("movingVehiclesspeed = 0.345e+4", "java", ronin=True) @@ -388,9 +388,9 @@ def 
bpe(text: str, bpe_codes_id: str, extension: Optional[str] = None, no_spaces [1, 11, 1, 1, 1, 1, 1, 2, 1, 1, 5, 1, 1, 1, 1, 3, 2, 1, 2, 1, 18, 1, 1, 1, 1, 1] >>> list(map(lambda x: x.__name__, prepped_tokens.metadata.token_types)) - ['KeyWord', 'SplitContainer', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \ -'KeyWord', 'OpeningBracket', 'SplitContainer', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', \ -'SplitContainer', 'OpeningBracket', 'StringLiteral', 'ClosingBracket', 'Semicolon', \ + ['KeyWord', 'Identifier', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \ +'KeyWord', 'OpeningBracket', 'Identifier', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', \ +'Identifier', 'OpeningBracket', 'StringLiteral', 'ClosingBracket', 'Semicolon', \ 'ClosingCurlyBracket', \ 'ClosingCurlyBracket', 'SpecialToken'] diff --git a/codeprep/parse/subtokens.py b/codeprep/parse/subtokens.py index 29cb5c4..c25a82c 100644 --- a/codeprep/parse/subtokens.py +++ b/codeprep/parse/subtokens.py @@ -7,7 +7,7 @@ import regex from codeprep.noneng import is_non_eng -from codeprep.tokentypes.containers import SplitContainer +from codeprep.tokentypes.containers import Identifier from codeprep.tokentypes.noneng import NonEng from codeprep.tokentypes.numeric import Number from codeprep.tokentypes.rootclasses import ParsedToken @@ -15,12 +15,12 @@ from codeprep.tokentypes.word import Underscore, Word, NonCodeChar -def split_identifier(token: str) -> SplitContainer: +def split_identifier(token: str) -> Identifier: parts = [m[0] for m in regex.finditer('(_|[0-9]+|[[:upper:]]?[[:lower:]]+|[[:upper:]]+(?![[:lower:]])|[^ ])', token)] processable_tokens = [Word.from_(p) if p != '_' else Underscore() for p in parts] - split_container = SplitContainer(processable_tokens) + split_container = Identifier(processable_tokens) return NonEng(split_container) if is_non_eng(token) else split_container @@ -128,7 +128,7 @@ def to_parsed_token(token: str) -> ParsedToken: def split_string(token: str) -> List[ParsedToken]: """ >>> split_string(" var = 9.4\\t\\n") - [ (n_chars=4), SplitContainer[Word(('var', none))], \ + [ (n_chars=4), Identifier[Word(('var', none))], \ (n_chars=1), NonCodeChar(=), (n_chars=1), (9), \ NonCodeChar(.), (4), , ] """ @@ -145,7 +145,7 @@ def split_string(token: str) -> List[ParsedToken]: def split_into_words(token: str) -> List[ParsedToken]: """ >>> split_into_words(" var = 9.4\\t\\n") - [, SplitContainer[Word(('var', none))], NonCodeChar(=), (9), \ + [, Identifier[Word(('var', none))], NonCodeChar(=), (9), \ NonCodeChar(.), (4), , ] """ res = [] diff --git a/codeprep/prepconfig.py b/codeprep/prepconfig.py index c0ec736..248f15a 100644 --- a/codeprep/prepconfig.py +++ b/codeprep/prepconfig.py @@ -14,7 +14,7 @@ from codeprep.bpepkg.bpe_encode import BpeData, get_bpe_subwords from codeprep.preprocess.reprconfig import Splitter, ReprConfig -from codeprep.tokentypes.containers import SplitContainer, StringLiteral, OneLineComment, MultilineComment +from codeprep.tokentypes.containers import Identifier, StringLiteral, OneLineComment, MultilineComment from codeprep.tokentypes.noneng import NonEng from codeprep.tokentypes.numeric import Number from codeprep.tokentypes.whitespace import NewLine, Tab @@ -166,7 +166,7 @@ def get_word_splitter(self) -> Optional[Splitter]: def 
get_types_to_be_repr(self) -> List[Type]: res = [] if self.get_param_value(PrepParam.SPLIT) in ['1', '2', '3', '4', '5', '6', '7', '8', '9', 's']: - res.extend([SplitContainer, Word]) + res.extend([Identifier, Word]) if self.get_param_value(PrepParam.SPLIT) in ['2', '3', '4', '5', '6', '7', '8', '9', 's']: res.append(Number) if self.get_param_value(PrepParam.COM) == '0': diff --git a/codeprep/tokentypes/containers.py b/codeprep/tokentypes/containers.py index 9265b82..e237af8 100644 --- a/codeprep/tokentypes/containers.py +++ b/codeprep/tokentypes/containers.py @@ -45,7 +45,7 @@ def wrap_in_word_boundaries_if_necessary(res: List[str]) -> List[str]: return [placeholders['word_start']] + res + [placeholders['word_end']] -class SplitContainer(ProcessableTokenContainer): +class Identifier(ProcessableTokenContainer): def __init__(self, subtokens: List[ParsedSubtoken]): super().__init__(subtokens) diff --git a/codeprep/tokentypes/noneng.py b/codeprep/tokentypes/noneng.py index d6cc0fd..d8ca7db 100644 --- a/codeprep/tokentypes/noneng.py +++ b/codeprep/tokentypes/noneng.py @@ -8,14 +8,14 @@ from codeprep.preprocess.core import ReprConfig, torepr from codeprep.preprocess.result import PreprocessingResult from codeprep.preprocess.placeholders import placeholders -from codeprep.tokentypes.containers import SplitContainer +from codeprep.tokentypes.containers import Identifier from codeprep.tokentypes.rootclasses import ParsedToken class NonEng(ParsedToken): - def __init__(self, processable_token: SplitContainer): - if not isinstance(processable_token, SplitContainer): - raise ValueError(f"Only SplitContainer can be wrapped in {self.__class__}. Type passed: {type(processable_token)}") + def __init__(self, processable_token: Identifier): + if not isinstance(processable_token, Identifier): + raise ValueError(f"Only Identifier can be wrapped in {self.__class__}. 
Type passed: {type(processable_token)}") self.processable_token = processable_token @@ -25,7 +25,7 @@ def non_preprocessed_repr(self, repr_config: Optional[ReprConfig] = None) -> Pre def preprocessed_repr(self, repr_config: ReprConfig) -> PreprocessingResult: if repr_config.bpe_data: token = replace_non_ascii_seqs(str(self.processable_token), placeholders['non_ascii_seq']) - return torepr(SplitContainer.from_single_token(token), repr_config) + return torepr(Identifier.from_single_token(token), repr_config) else: return self._wrap_in_metadata_for_full_word([placeholders['non_eng']]) diff --git a/tests/parse/test_core.py b/tests/parse/test_core.py index 83c8af8..f6b80c2 100644 --- a/tests/parse/test_core.py +++ b/tests/parse/test_core.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from codeprep.parse.core import convert_text -from codeprep.tokentypes.containers import SplitContainer, StringLiteral, OneLineComment, MultilineComment +from codeprep.tokentypes.containers import Identifier, StringLiteral, OneLineComment, MultilineComment from codeprep.tokentypes.numeric import Number from codeprep.tokentypes.whitespace import Tab, NewLine, SpaceInString from codeprep.tokentypes.word import Word, Underscore, KeyWord, Operator, NonCodeChar, OpeningCurlyBracket, \ @@ -15,9 +15,9 @@ def test_longs(): expected_result = [KeyWord('long'), Operator('['), Operator(']'), - SplitContainer([Word.from_('lovely'), - Underscore(), - Word.from_('longs')]), + Identifier([Word.from_('lovely'), + Underscore(), + Word.from_('longs')]), Operator('='), OpeningCurlyBracket(), Number("0x34a35EL"), @@ -43,7 +43,7 @@ def test_ints(): expected_result = [KeyWord('int'), Operator('['), Operator(']'), - SplitContainer( + Identifier( [Underscore(), Word.from_('my'), Underscore(), @@ -80,7 +80,7 @@ def test_floats(): expected_result = [KeyWord('float'), Operator('['), Operator(']'), - SplitContainer.from_single_token('floats'), + Identifier.from_single_token('floats'), Operator('='), OpeningCurlyBracket(), Operator('-'), @@ -105,15 +105,15 @@ def test_floats(): def test_complex_identifiers(): text = '''BigAWESOMEString[] a2y = "abc".doSplit("\\"");''' - expected_result = [SplitContainer( + expected_result = [Identifier( [Word.from_('Big'), Word.from_('AWESOME'), Word.from_('String')], ), Operator('['), Operator(']'), - SplitContainer([Word.from_('a'), Word.from_('2'), Word.from_('y')]), + Identifier([Word.from_('a'), Word.from_('2'), Word.from_('y')]), Operator('='), - StringLiteral([NonCodeChar('"'), SplitContainer.from_single_token('abc'), NonCodeChar('"')], 5), + StringLiteral([NonCodeChar('"'), Identifier.from_single_token('abc'), NonCodeChar('"')], 5), Operator('.'), - SplitContainer([Word.from_('do'), Word.from_('Split')]), + Identifier([Word.from_('do'), Word.from_('Split')]), OpeningBracket(), StringLiteral([NonCodeChar('"'), NonCodeChar('\\'), NonCodeChar('"'), NonCodeChar('"')], 4), ClosingBracket(), @@ -129,11 +129,11 @@ def test_string_with_spaces(): text='''"hi dear world !"''' expected = [StringLiteral([ NonCodeChar('"'), - SplitContainer.from_single_token('hi'), + Identifier.from_single_token('hi'), SpaceInString(3), - SplitContainer.from_single_token('dear'), + Identifier.from_single_token('dear'), SpaceInString(5), - SplitContainer.from_single_token('world'), + Identifier.from_single_token('world'), SpaceInString(4), NonCodeChar('!'), NonCodeChar('"'), @@ -146,20 +146,20 @@ def test_string_with_spaces(): def test_spaces_in_strings(): text = '''BigAWESOMEString[] a2y = "a bc".doSplit("\\"");''' - 
expected_result = [SplitContainer( + expected_result = [Identifier( [Word.from_('Big'), Word.from_('AWESOME'), Word.from_('String')], ), Operator('['), Operator(']'), - SplitContainer([Word.from_('a'), Word.from_('2'), Word.from_('y')]), + Identifier([Word.from_('a'), Word.from_('2'), Word.from_('y')]), Operator('='), StringLiteral([NonCodeChar('"'), - SplitContainer.from_single_token('a'), + Identifier.from_single_token('a'), SpaceInString(n_chars=4), - SplitContainer.from_single_token('bc'), + Identifier.from_single_token('bc'), NonCodeChar('"')], 9), Operator('.'), - SplitContainer([Word.from_('do'), Word.from_('Split')]), + Identifier([Word.from_('do'), Word.from_('Split')]), OpeningBracket(), StringLiteral([NonCodeChar('"'), NonCodeChar('\\'), NonCodeChar('"'), NonCodeChar('"')], 4), ClosingBracket(), @@ -175,20 +175,20 @@ def test_one_line_comment(): text = '''// this code won't compile but the preprocessing still has to be done corrrectly''' expected_result = [OneLineComment([NonCodeChar('/'), NonCodeChar('/'), - SplitContainer.from_single_token('this'), - SplitContainer.from_single_token('code'), - SplitContainer.from_single_token('won'), NonCodeChar("'"), - SplitContainer.from_single_token('t'), - SplitContainer.from_single_token('compile'), - SplitContainer.from_single_token('but'), - SplitContainer.from_single_token('the'), - SplitContainer.from_single_token('preprocessing'), - SplitContainer.from_single_token('still'), - SplitContainer.from_single_token('has'), - SplitContainer.from_single_token('to'), - SplitContainer.from_single_token('be'), - SplitContainer.from_single_token('done'), - SplitContainer.from_single_token('corrrectly'), + Identifier.from_single_token('this'), + Identifier.from_single_token('code'), + Identifier.from_single_token('won'), NonCodeChar("'"), + Identifier.from_single_token('t'), + Identifier.from_single_token('compile'), + Identifier.from_single_token('but'), + Identifier.from_single_token('the'), + Identifier.from_single_token('preprocessing'), + Identifier.from_single_token('still'), + Identifier.from_single_token('has'), + Identifier.from_single_token('to'), + Identifier.from_single_token('be'), + Identifier.from_single_token('done'), + Identifier.from_single_token('corrrectly'), NewLine() ])] @@ -229,7 +229,7 @@ def test_special_characters(): {}[],.-:();&|\\'~%^ ''' - expected_result = [SplitContainer([Word.from_('abc'), Word.from_('1')]), + expected_result = [Identifier([Word.from_('abc'), Word.from_('1')]), NewLine(), Operator('~'), Operator('-'), @@ -348,9 +348,9 @@ def test_multi_line_comment(): _operations ''' - expected_result = [MultilineComment([NonCodeChar('/'), NonCodeChar('*'), SplitContainer.from_single_token('multi'), NonCodeChar('-'), - SplitContainer.from_single_token('line'), - SplitContainer([ + expected_result = [MultilineComment([NonCodeChar('/'), NonCodeChar('*'), Identifier.from_single_token('multi'), NonCodeChar('-'), + Identifier.from_single_token('line'), + Identifier([ Word.from_('My'), Word.from_('Comment'), Underscore() @@ -358,7 +358,7 @@ def test_multi_line_comment(): NewLine(), NonCodeChar('*'), NonCodeChar('/')]), Operator('/'), NewLine(), - SplitContainer([Underscore(), Word.from_('operations')]), + Identifier([Underscore(), Word.from_('operations')]), NewLine()] actual = [t for t in convert_text(text, 'java')] @@ -371,14 +371,14 @@ def test_capitals(): MyClass Class CONSTANT VAR_WITH_UNDERSCORES ''' - expected_result = [SplitContainer([Word.from_("My"), Word.from_("Class")]), - 
SplitContainer.from_single_token("Class"), - SplitContainer.from_single_token("CONSTANT"), - SplitContainer([Word.from_("VAR"), - Underscore(), - Word.from_("WITH"), - Underscore(), - Word.from_("UNDERSCORES")]), + expected_result = [Identifier([Word.from_("My"), Word.from_("Class")]), + Identifier.from_single_token("Class"), + Identifier.from_single_token("CONSTANT"), + Identifier([Word.from_("VAR"), + Underscore(), + Word.from_("WITH"), + Underscore(), + Word.from_("UNDERSCORES")]), NewLine()] actual = [t for t in convert_text(text, 'java')] @@ -389,13 +389,13 @@ def test_capitals(): def test_string_literal_single(): text = '''a = 'some_text'.split()''' - expected_result = [SplitContainer.from_single_token("a"), + expected_result = [Identifier.from_single_token("a"), Operator('='), StringLiteral([NonCodeChar("'")], 1), - StringLiteral([SplitContainer([Word.from_("some"), Underscore(), Word.from_("text")])], 9), + StringLiteral([Identifier([Word.from_("some"), Underscore(), Word.from_("text")])], 9), StringLiteral([NonCodeChar("'")], 1), Operator('.'), - SplitContainer.from_single_token("split"), + Identifier.from_single_token("split"), OpeningBracket(), ClosingBracket(), NewLine() @@ -409,13 +409,13 @@ def test_string_literal_single(): def test_string_literal_double(): text = '''a = "some_text".split()''' - expected_result = [SplitContainer.from_single_token("a"), + expected_result = [Identifier.from_single_token("a"), Operator('='), StringLiteral([NonCodeChar('"')], 1), - StringLiteral([SplitContainer([Word.from_("some"), Underscore(), Word.from_("text")])], 9), + StringLiteral([Identifier([Word.from_("some"), Underscore(), Word.from_("text")])], 9), StringLiteral([NonCodeChar('"')], 1), Operator('.'), - SplitContainer.from_single_token("split"), + Identifier.from_single_token("split"), OpeningBracket(), ClosingBracket(), NewLine() diff --git a/tests/parse/test_subtokens.py b/tests/parse/test_subtokens.py index b71a13b..1de733a 100644 --- a/tests/parse/test_subtokens.py +++ b/tests/parse/test_subtokens.py @@ -5,7 +5,7 @@ from codeprep.tokentypes.numeric import Number from codeprep.parse.matchers import split_into_words -from codeprep.tokentypes.containers import SplitContainer +from codeprep.tokentypes.containers import Identifier from codeprep.tokentypes.whitespace import NewLine, SpaceInString from codeprep.tokentypes.word import Word, Underscore from codeprep.parse.subtokens import split_string @@ -16,9 +16,9 @@ def test_split_into_tokens(): expected = [Number('123'), NewLine(), - SplitContainer([Word.from_('Ab'), Word.from_('2'), Word.from_('cd'), - Word.from_('34'), Word.from_('Ef'), Word.from_('000'), Word.from_('GG')]), - SplitContainer([Word.from_('j'), Underscore(), Word.from_('89'), Underscore(), Word.from_('J')])] + Identifier([Word.from_('Ab'), Word.from_('2'), Word.from_('cd'), + Word.from_('34'), Word.from_('Ef'), Word.from_('000'), Word.from_('GG')]), + Identifier([Word.from_('j'), Underscore(), Word.from_('89'), Underscore(), Word.from_('J')])] assert expected == actual @@ -28,9 +28,9 @@ def test_split_string(): expected = [Number('123'), NewLine(), - SplitContainer([Word.from_('Ab'), Word.from_('2'), Word.from_('cd'), - Word.from_('34'), Word.from_('Ef'), Word.from_('000'), Word.from_('GG')]), + Identifier([Word.from_('Ab'), Word.from_('2'), Word.from_('cd'), + Word.from_('34'), Word.from_('Ef'), Word.from_('000'), Word.from_('GG')]), SpaceInString(5), - SplitContainer([Word.from_('j'), Underscore(), Word.from_('89'), Underscore(), Word.from_('J')])] + 
Identifier([Word.from_('j'), Underscore(), Word.from_('89'), Underscore(), Word.from_('J')])] assert expected == actual \ No newline at end of file diff --git a/tests/test_subword_separation.py b/tests/test_subword_separation.py index 43d1525..6545166 100644 --- a/tests/test_subword_separation.py +++ b/tests/test_subword_separation.py @@ -6,7 +6,7 @@ # # from codeprep.bpepkg.bpe_encode import BpeData # from codeprep.parse.core import convert_text -# from codeprep.parse.model.containers import SplitContainer +# from codeprep.parse.model.containers import Identifier # from codeprep.parse.model.numeric import Number # from codeprep.parse.model.placeholders import placeholders # from codeprep.parse.model.word import Underscore, Word @@ -15,15 +15,15 @@ # # test_cases = { # "create": ( -# [SplitContainer.from_single_token("create")], +# [Identifier.from_single_token("create")], # ["create"], # ), # "Vector": ( -# [SplitContainer.from_single_token("Vector")], +# [Identifier.from_single_token("Vector")], # [placeholders["capital"], "vector"], # ), # "players": ( -# [SplitContainer.from_single_token("players")], +# [Identifier.from_single_token("players")], # [placeholders["word_start"], 'play', 'er', 's', placeholders["word_end"]] # ), # "0.345e+4": ( @@ -31,16 +31,16 @@ # [placeholders["word_start"], "0.", "3", "4", "5", "e+", "4", placeholders["word_end"]] # ), # "bestPlayers": ( -# [SplitContainer([Word.from_("best"), Word.from_("Players")])], +# [Identifier([Word.from_("best"), Word.from_("Players")])], # [placeholders["word_start"], "best", placeholders["capital"], 'play', "er", "s", placeholders["word_end"]] # ), # "test_BestPlayers": ( -# [SplitContainer([Word.from_("test"), Underscore(), Word.from_("Best"), Word.from_("Players")])], +# [Identifier([Word.from_("test"), Underscore(), Word.from_("Best"), Word.from_("Players")])], # [placeholders["word_start"], "test", '_', placeholders["capital"], # "best", placeholders["capital"], 'play', "er", "s", placeholders["word_end"]] # ), # "test_BestPlayers_modified": ( -# [SplitContainer( +# [Identifier( # [Word.from_("test"), Underscore(), Word.from_("Best"), Word.from_("Players"), Underscore(), # Word.from_("modified")] # )], @@ -50,13 +50,13 @@ # placeholders["word_end"]] # ), # "N_PLAYERS_NUM": ( -# [SplitContainer([Word.from_("N"), Underscore(), Word.from_("PLAYERS"), Underscore(), Word.from_("NUM")])], +# [Identifier([Word.from_("N"), Underscore(), Word.from_("PLAYERS"), Underscore(), Word.from_("NUM")])], # [placeholders["word_start"], placeholders["capitals"], "n", '_', # placeholders["capitals"], "play", "er", "s", '_', placeholders["capitals"], # "num", placeholders["word_end"]] # ), # "_players": ( -# [SplitContainer([Underscore(), (Word.from_("players"))])], +# [Identifier([Underscore(), (Word.from_("players"))])], # [placeholders['word_start'], '_', "play", "er", "s", placeholders['word_end']] # ), # } diff --git a/tests/test_to_repr.py b/tests/test_to_repr.py index 6d099a2..4649d8b 100644 --- a/tests/test_to_repr.py +++ b/tests/test_to_repr.py @@ -10,7 +10,7 @@ from codeprep.bpepkg.merge import MergeList, Merge from codeprep.preprocess.result import PreprocessingResult from codeprep.tokens import PreppedSubTokenSequence -from codeprep.tokentypes.containers import SplitContainer, OneLineComment, MultilineComment, StringLiteral +from codeprep.tokentypes.containers import Identifier, OneLineComment, MultilineComment, StringLiteral from codeprep.preprocess.metadata import PreppedTokenMetadata from codeprep.tokentypes.noneng import NonEng 
from codeprep.tokentypes.numeric import Number @@ -26,11 +26,11 @@ tokens = [ Number('1.1'), Operator("*"), - NonEng(SplitContainer([Word.from_("übersetzen")])), + NonEng(Identifier([Word.from_("übersetzen")])), StringLiteral([ NonCodeChar('"'), NonEng( - SplitContainer([ + Identifier([ Word.from_("A"), Word.from_("Wirklicä") ]) @@ -42,10 +42,10 @@ MultilineComment([NonCodeChar('/'), NonCodeChar('*')]), MultilineComment([ NonEng( - SplitContainer([Word.from_('ц')]), + Identifier([Word.from_('ц')]), ), NonEng( - SplitContainer([ + Identifier([ Word.from_("blanco"), Underscore(), Word.from_("english") @@ -56,7 +56,7 @@ NewLine(), Tab(), OneLineComment([NonCodeChar('/'), NonCodeChar('/'), NonEng( - SplitContainer([ + Identifier([ Word.from_("DIESELBE"), Word.from_("8") ]) @@ -98,11 +98,11 @@ def test_to_repr_0(): '/', '*', 'ц', 'blanco_english', '*', '/', '/', '/', "DIESELBE8", pl['olc_end'] ], PreppedTokenMetadata(n_subtokens_per_token=[1] * 16, - token_types=[Number, Operator, SplitContainer, - StringLiteral, StringLiteral, StringLiteral, - MultilineComment, MultilineComment, MultilineComment, - MultilineComment, MultilineComment, MultilineComment, - OneLineComment, OneLineComment, OneLineComment, OneLineComment])), + token_types=[Number, Operator, Identifier, + StringLiteral, StringLiteral, StringLiteral, + MultilineComment, MultilineComment, MultilineComment, + MultilineComment, MultilineComment, MultilineComment, + OneLineComment, OneLineComment, OneLineComment, OneLineComment])), {'"', "*", "/"}) assert result == expected_result @@ -128,10 +128,10 @@ def test_to_repr_0_max_str_length_7(): '/', '*', 'ц', 'blanco_english', '*', '/', '/', '/', "DIESELBE8", pl['olc_end'] ], PreppedTokenMetadata(n_subtokens_per_token=[1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - token_types=[Number, Operator, SplitContainer, StringLiteral, - MultilineComment, MultilineComment, MultilineComment, - MultilineComment, MultilineComment, MultilineComment, - OneLineComment, OneLineComment, OneLineComment, OneLineComment])), + token_types=[Number, Operator, Identifier, StringLiteral, + MultilineComment, MultilineComment, MultilineComment, + MultilineComment, MultilineComment, MultilineComment, + OneLineComment, OneLineComment, OneLineComment, OneLineComment])), {'"', "*", "/"}) assert result == expected_result @@ -157,11 +157,11 @@ def test_to_repr_0_max_str_length_B(): '/', '*', 'ц', 'blanco_english', '*', '/', '/', '/', "DIESELBE8", pl['olc_end'] ], PreppedTokenMetadata(n_subtokens_per_token=[1] * 16, - token_types=[Number, Operator, SplitContainer, - StringLiteral, StringLiteral, StringLiteral, - MultilineComment, MultilineComment, MultilineComment, - MultilineComment, MultilineComment, MultilineComment, - OneLineComment, OneLineComment, OneLineComment, OneLineComment])), + token_types=[Number, Operator, Identifier, + StringLiteral, StringLiteral, StringLiteral, + MultilineComment, MultilineComment, MultilineComment, + MultilineComment, MultilineComment, MultilineComment, + OneLineComment, OneLineComment, OneLineComment, OneLineComment])), {'"', "*", "/"}) assert result == expected_result @@ -187,10 +187,10 @@ def test_to_repr_F(): '/', '*', 'ц', 'blanco_english', '*', '/', '/', '/', "DIESELBE8", pl['olc_end'] ], PreppedTokenMetadata(n_subtokens_per_token=[1] * 14, - token_types=[Number, Operator, SplitContainer, StringLiteral, - MultilineComment, MultilineComment, MultilineComment, - MultilineComment, MultilineComment, MultilineComment, - OneLineComment, OneLineComment, OneLineComment, OneLineComment])), + 
token_types=[Number, Operator, Identifier, StringLiteral, + MultilineComment, MultilineComment, MultilineComment, + MultilineComment, MultilineComment, MultilineComment, + OneLineComment, OneLineComment, OneLineComment, OneLineComment])), {"*", "/"}) assert result == expected_result @@ -216,10 +216,10 @@ def test_to_repr_F_max_str_length_7(): '/', '*', 'ц', 'blanco_english', '*', '/', '/', '/', "DIESELBE8", pl['olc_end'] ], PreppedTokenMetadata(n_subtokens_per_token=[1] * 14, - token_types=[Number, Operator, SplitContainer, StringLiteral, - MultilineComment, MultilineComment, MultilineComment, - MultilineComment, MultilineComment, MultilineComment, - OneLineComment, OneLineComment, OneLineComment, OneLineComment])), {"*", "/"}) + token_types=[Number, Operator, Identifier, StringLiteral, + MultilineComment, MultilineComment, MultilineComment, + MultilineComment, MultilineComment, MultilineComment, + OneLineComment, OneLineComment, OneLineComment, OneLineComment])), {"*", "/"}) assert result == expected_result @@ -245,10 +245,10 @@ def test_to_repr_F_max_str_length_B(): '/', '*', 'ц', 'blanco_english', '*', '/', '/', '/', "DIESELBE8", pl['olc_end'] ], PreppedTokenMetadata(n_subtokens_per_token=[1] * 14, - token_types=[Number, Operator, SplitContainer, StringLiteral, - MultilineComment, MultilineComment, MultilineComment, - MultilineComment, MultilineComment, MultilineComment, - OneLineComment, OneLineComment, OneLineComment, OneLineComment])), + token_types=[Number, Operator, Identifier, StringLiteral, + MultilineComment, MultilineComment, MultilineComment, + MultilineComment, MultilineComment, MultilineComment, + OneLineComment, OneLineComment, OneLineComment, OneLineComment])), {"*", "/"}) assert result == expected_result @@ -342,40 +342,40 @@ def test_to_repr_with_enonlycontents1(): tokens = [ Number("1.1"), Operator("*"), - NonEng(SplitContainer([Word.from_("dinero")])), + NonEng(Identifier([Word.from_("dinero")])), StringLiteral([ NonCodeChar('"'), - NonEng(SplitContainer([Word.from_("ich")])), + NonEng(Identifier([Word.from_("ich")])), SpaceInString(), - NonEng(SplitContainer([Word.from_("weiss")])), + NonEng(Identifier([Word.from_("weiss")])), SpaceInString(), - NonEng(SplitContainer([Word.from_("nicht")])), + NonEng(Identifier([Word.from_("nicht")])), SpaceInString(), - NonEng(SplitContainer([Word.from_("was")])), + NonEng(Identifier([Word.from_("was")])), SpaceInString(), - NonEng(SplitContainer([Word.from_("soll")])), + NonEng(Identifier([Word.from_("soll")])), SpaceInString(), - NonEng(SplitContainer([Word.from_("es")])), + NonEng(Identifier([Word.from_("es")])), SpaceInString(), - NonEng(SplitContainer([Word.from_("bedeuten")])), + NonEng(Identifier([Word.from_("bedeuten")])), SpaceInString(), - NonEng(SplitContainer([Word.from_("dass")])), + NonEng(Identifier([Word.from_("dass")])), SpaceInString(), - NonEng(SplitContainer([Word.from_("ich")])), + NonEng(Identifier([Word.from_("ich")])), SpaceInString(), - NonEng(SplitContainer([Word.from_("so")])), + NonEng(Identifier([Word.from_("so")])), SpaceInString(), - NonEng(SplitContainer([Word.from_("traurig")])), + NonEng(Identifier([Word.from_("traurig")])), SpaceInString(), - NonEng(SplitContainer([Word.from_("bin")])), + NonEng(Identifier([Word.from_("bin")])), NonCodeChar('"'), ], 62), NewLine(), MultilineComment([NonCodeChar('/'), NonCodeChar('*')]), MultilineComment([ - NonEng(SplitContainer([Word.from_('ц')])), + NonEng(Identifier([Word.from_('ц')])), NonEng( - SplitContainer([ + Identifier([ Word.from_("blanco"), Underscore(), 
Word.from_("english") @@ -386,7 +386,7 @@ def test_to_repr_with_enonlycontents1(): NewLine(), Tab(), OneLineComment([NonCodeChar('/'), NonCodeChar('/'), NonEng( - SplitContainer([ + Identifier([ Word.from_("DIESELBE"), Word.from_("8") ]) @@ -448,7 +448,7 @@ def test_to_repr_with_non_eng(): '/', '/', pl['word_start'], pl['capitals'], 'dieselbe', "8", pl['word_end'], pl['olc_end'] ], PreppedTokenMetadata(n_subtokens_per_token=[5, 1, 1, 1, 6, 1, 1, 1, 1, 5, 1, 1, 1, 1, 5, 1], - token_types=[Number, Operator, SplitContainer] + token_types=[Number, Operator, Identifier] + [StringLiteral] * 3 + [MultilineComment] * 6 + [OneLineComment] * 4)), @@ -649,12 +649,12 @@ def test_1(): PrepParam.CASE: 'u' }) - tokens = [SplitContainer.from_single_token("Whi@le")] + tokens = [Identifier.from_single_token("Whi@le")] result = to_repr(prep_config, tokens, BpeData(merges_cache={'Whi@@le@': ['Whi@@le@']})) expected_result = PreprocessingResult(PreppedSubTokenSequence(["Whi@le" + placeholders['compound_word_end']], - PreppedTokenMetadata(n_subtokens_per_token=[1], token_types=[SplitContainer]), word_end_token_added=True), + PreppedTokenMetadata(n_subtokens_per_token=[1], token_types=[Identifier]), word_end_token_added=True), set()) assert result == expected_result @@ -670,14 +670,14 @@ def test_merges_no_cache(): PrepParam.CASE: 'u' }) - tokens = [SplitContainer.from_single_token("Whi@l@@e@")] + tokens = [Identifier.from_single_token("Whi@l@@e@")] result = to_repr(prep_config, tokens, BpeData(merges=MergeList().append(Merge(('W', 'h'), 10)), merges_cache={} )) expected_result = PreprocessingResult( PreppedSubTokenSequence(["Wh", "i", '@', "l", '@', '@', "e", '@', pl["compound_word_end"]], - PreppedTokenMetadata(n_subtokens_per_token=[9], token_types=[SplitContainer]), word_end_token_added=True), + PreppedTokenMetadata(n_subtokens_per_token=[9], token_types=[Identifier]), word_end_token_added=True), set()) assert result == expected_result