Skip to content

Commit

Permalink
#5: rename SplitContainer -> Identifier
Browse files (browse the repository at this point in the history)
  • Loading branch information
hlibbabii committed Mar 8, 2020
1 parent 2fc525e commit 981b1b4
Show file tree
Hide file tree
Showing 9 changed files with 152 additions and 152 deletions.
38 changes: 19 additions & 19 deletions codeprep/api/text.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -76,10 +76,10 @@ def nosplit(text: str, extension: Optional[str] = None, no_spaces: bool = False,
>>> prepped_tokens.metadata.n_subtokens_per_token
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
>>> list(map(lambda x: x.__name__, prepped_tokens.metadata.token_types))
['KeyWord', 'SplitContainer', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', 'NewLine', \
'Tab', 'KeyWord', 'OpeningBracket', 'SplitContainer', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', \
['KeyWord', 'Identifier', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', 'NewLine', \
'Tab', 'KeyWord', 'OpeningBracket', 'Identifier', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', \
'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', \
'Tab', 'Tab', 'SplitContainer', 'OpeningBracket', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', \
'Tab', 'Tab', 'Identifier', 'OpeningBracket', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', \
'StringLiteral', 'StringLiteral', 'StringLiteral', 'ClosingBracket', 'Semicolon', 'NewLine', \
'ClosingCurlyBracket', 'NewLine', \
'ClosingCurlyBracket', 'SpecialToken']
Expand Down Expand Up @@ -137,9 +137,9 @@ def nosplit(text: str, extension: Optional[str] = None, no_spaces: bool = False,
>>> prepped_tokens.metadata.n_subtokens_per_token
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
>>> list(map(lambda x: x.__name__, prepped_tokens.metadata.token_types))
['KeyWord', 'SplitContainer', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \
'KeyWord', 'OpeningBracket', 'SplitContainer', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', 'OneLineComment', \
'SplitContainer', 'OpeningBracket', 'StringLiteral', 'ClosingBracket', 'Semicolon', \
['KeyWord', 'Identifier', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \
'KeyWord', 'OpeningBracket', 'Identifier', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', 'OneLineComment', \
'Identifier', 'OpeningBracket', 'StringLiteral', 'ClosingBracket', 'Semicolon', \
'ClosingCurlyBracket', \
'ClosingCurlyBracket']
Expand Down Expand Up @@ -206,10 +206,10 @@ def chars(text: str, extension: Optional[str] = None, no_spaces: bool = False, n
>>> prepped_tokens.metadata.n_subtokens_per_token
[1, 30, 1, 1, 1, 1, 1, 4, 1, 1, 9, 1, 1, 1, 1, 6, 4, 1, 10, 1, 33, 1, 1, 1, 1, 1]
>>> list(map(lambda x: x.__name__, prepped_tokens.metadata.token_types))
['KeyWord', 'SplitContainer', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \
'KeyWord', 'OpeningBracket', 'SplitContainer', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', \
['KeyWord', 'Identifier', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \
'KeyWord', 'OpeningBracket', 'Identifier', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', \
'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', \
'SplitContainer', 'OpeningBracket', 'StringLiteral', 'ClosingBracket', 'Semicolon', \
'Identifier', 'OpeningBracket', 'StringLiteral', 'ClosingBracket', 'Semicolon', \
'ClosingCurlyBracket', 'ClosingCurlyBracket', 'SpecialToken']
Expand Down Expand Up @@ -271,9 +271,9 @@ def basic(text: str, extension: Optional[str] = None,
[1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
>>> list(map(lambda x: x.__name__, prepped_tokens.metadata.token_types))
['KeyWord', 'SplitContainer', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \
'KeyWord', 'OpeningBracket', 'SplitContainer', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', \
'SplitContainer', 'OpeningBracket', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'ClosingBracket', 'Semicolon', \
['KeyWord', 'Identifier', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \
'KeyWord', 'OpeningBracket', 'Identifier', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', \
'Identifier', 'OpeningBracket', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'ClosingBracket', 'Semicolon', \
'ClosingCurlyBracket', \
'ClosingCurlyBracket', 'SpecialToken']
Expand All @@ -289,9 +289,9 @@ def basic(text: str, extension: Optional[str] = None,
[1, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 5, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1]
>>> list(map(lambda x: x.__name__, prepped_tokens.metadata.token_types))
['KeyWord', 'SplitContainer', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \
'KeyWord', 'OpeningBracket', 'SplitContainer', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', \
'SplitContainer', 'OpeningBracket', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'ClosingBracket', 'Semicolon', \
['KeyWord', 'Identifier', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \
'KeyWord', 'OpeningBracket', 'Identifier', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', \
'Identifier', 'OpeningBracket', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'StringLiteral', 'ClosingBracket', 'Semicolon', \
'ClosingCurlyBracket', \
'ClosingCurlyBracket']
Expand All @@ -315,7 +315,7 @@ def basic(text: str, extension: Optional[str] = None,
>>> prepped_tokens
['<w>', 'moving', 'Vehiclesspeed', '</w>', '=', '<w>', '0', '.', '3', '4', '5', 'e', '+', '4', '</w>']
>>> prepped_tokens.metadata
([4, 1, 10], ['SplitContainer', 'Operator', 'Number'])
([4, 1, 10], ['Identifier', 'Operator', 'Number'])
>>> basic("movingVehiclesspeed = 0.345e+4", "java", ronin=True)
Expand Down Expand Up @@ -388,9 +388,9 @@ def bpe(text: str, bpe_codes_id: str, extension: Optional[str] = None, no_spaces
[1, 11, 1, 1, 1, 1, 1, 2, 1, 1, 5, 1, 1, 1, 1, 3, 2, 1, 2, 1, 18, 1, 1, 1, 1, 1]
>>> list(map(lambda x: x.__name__, prepped_tokens.metadata.token_types))
['KeyWord', 'SplitContainer', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \
'KeyWord', 'OpeningBracket', 'SplitContainer', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', \
'SplitContainer', 'OpeningBracket', 'StringLiteral', 'ClosingBracket', 'Semicolon', \
['KeyWord', 'Identifier', 'OpeningBracket', 'ClosingBracket', 'OpeningCurlyBracket', \
'KeyWord', 'OpeningBracket', 'Identifier', 'Operator', 'Operator', 'Number', 'ClosingBracket', 'OpeningCurlyBracket', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', 'OneLineComment', \
'Identifier', 'OpeningBracket', 'StringLiteral', 'ClosingBracket', 'Semicolon', \
'ClosingCurlyBracket', \
'ClosingCurlyBracket', 'SpecialToken']
Expand Down
10 changes: 5 additions & 5 deletions codeprep/parse/subtokens.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -7,20 +7,20 @@
import regex

from codeprep.noneng import is_non_eng
from codeprep.tokentypes.containers import SplitContainer
from codeprep.tokentypes.containers import Identifier
from codeprep.tokentypes.noneng import NonEng
from codeprep.tokentypes.numeric import Number
from codeprep.tokentypes.rootclasses import ParsedToken
from codeprep.tokentypes.whitespace import NewLine, Tab, SpaceInString
from codeprep.tokentypes.word import Underscore, Word, NonCodeChar


def split_identifier(token: str) -> SplitContainer:
def split_identifier(token: str) -> Identifier:
parts = [m[0] for m in
regex.finditer('(_|[0-9]+|[[:upper:]]?[[:lower:]]+|[[:upper:]]+(?![[:lower:]])|[^ ])', token)]

processable_tokens = [Word.from_(p) if p != '_' else Underscore() for p in parts]
split_container = SplitContainer(processable_tokens)
split_container = Identifier(processable_tokens)
return NonEng(split_container) if is_non_eng(token) else split_container


Expand Down Expand Up @@ -128,7 +128,7 @@ def to_parsed_token(token: str) -> ParsedToken:
def split_string(token: str) -> List[ParsedToken]:
"""
>>> split_string(" var = 9.4\\t\\n")
[<SpaceInString> (n_chars=4), SplitContainer[Word(('var', none))], \
[<SpaceInString> (n_chars=4), Identifier[Word(('var', none))], \
<SpaceInString> (n_chars=1), NonCodeChar(=), <SpaceInString> (n_chars=1), <Number>(9), \
NonCodeChar(.), <Number>(4), <Tab>, <NewLine>]
"""
Expand All @@ -145,7 +145,7 @@ def split_string(token: str) -> List[ParsedToken]:
def split_into_words(token: str) -> List[ParsedToken]:
"""
>>> split_into_words(" var = 9.4\\t\\n")
[<Tab>, SplitContainer[Word(('var', none))], NonCodeChar(=), <Number>(9), \
[<Tab>, Identifier[Word(('var', none))], NonCodeChar(=), <Number>(9), \
NonCodeChar(.), <Number>(4), <Tab>, <NewLine>]
"""
res = []
Expand Down
4 changes: 2 additions & 2 deletions codeprep/prepconfig.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,7 +14,7 @@

from codeprep.bpepkg.bpe_encode import BpeData, get_bpe_subwords
from codeprep.preprocess.reprconfig import Splitter, ReprConfig
from codeprep.tokentypes.containers import SplitContainer, StringLiteral, OneLineComment, MultilineComment
from codeprep.tokentypes.containers import Identifier, StringLiteral, OneLineComment, MultilineComment
from codeprep.tokentypes.noneng import NonEng
from codeprep.tokentypes.numeric import Number
from codeprep.tokentypes.whitespace import NewLine, Tab
Expand Down Expand Up @@ -166,7 +166,7 @@ def get_word_splitter(self) -> Optional[Splitter]:
def get_types_to_be_repr(self) -> List[Type]:
res = []
if self.get_param_value(PrepParam.SPLIT) in ['1', '2', '3', '4', '5', '6', '7', '8', '9', 's']:
res.extend([SplitContainer, Word])
res.extend([Identifier, Word])
if self.get_param_value(PrepParam.SPLIT) in ['2', '3', '4', '5', '6', '7', '8', '9', 's']:
res.append(Number)
if self.get_param_value(PrepParam.COM) == '0':
Expand Down
2 changes: 1 addition & 1 deletion codeprep/tokentypes/containers.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -45,7 +45,7 @@ def wrap_in_word_boundaries_if_necessary(res: List[str]) -> List[str]:
return [placeholders['word_start']] + res + [placeholders['word_end']]


class SplitContainer(ProcessableTokenContainer):
class Identifier(ProcessableTokenContainer):
def __init__(self, subtokens: List[ParsedSubtoken]):
super().__init__(subtokens)

Expand Down
10 changes: 5 additions & 5 deletions codeprep/tokentypes/noneng.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -8,14 +8,14 @@
from codeprep.preprocess.core import ReprConfig, torepr
from codeprep.preprocess.result import PreprocessingResult
from codeprep.preprocess.placeholders import placeholders
from codeprep.tokentypes.containers import SplitContainer
from codeprep.tokentypes.containers import Identifier
from codeprep.tokentypes.rootclasses import ParsedToken


class NonEng(ParsedToken):
def __init__(self, processable_token: SplitContainer):
if not isinstance(processable_token, SplitContainer):
raise ValueError(f"Only SplitContainer can be wrapped in {self.__class__}. Type passed: {type(processable_token)}")
def __init__(self, processable_token: Identifier):
if not isinstance(processable_token, Identifier):
raise ValueError(f"Only Identifier can be wrapped in {self.__class__}. Type passed: {type(processable_token)}")

self.processable_token = processable_token

Expand All @@ -25,7 +25,7 @@ def non_preprocessed_repr(self, repr_config: Optional[ReprConfig] = None) -> Pre
def preprocessed_repr(self, repr_config: ReprConfig) -> PreprocessingResult:
if repr_config.bpe_data:
token = replace_non_ascii_seqs(str(self.processable_token), placeholders['non_ascii_seq'])
return torepr(SplitContainer.from_single_token(token), repr_config)
return torepr(Identifier.from_single_token(token), repr_config)
else:
return self._wrap_in_metadata_for_full_word([placeholders['non_eng']])

Expand Down
Loading

0 comments on commit 981b1b4

Please sign in to comment.