diff --git a/config.py b/config.py
index 96faf38..75b911c 100644
--- a/config.py
+++ b/config.py
@@ -1 +1,2 @@
-SEED_URL = "http://premchand.co.in/story/pariksha"
\ No newline at end of file
+SEED_URL = "http://premchand.co.in/story/pariksha"
+SENTENCE_DELIMITER = r"(!|\?|।)"  # '।' is the Devanagari danda (full stop)
diff --git a/scraping.py b/scraping.py
index 829356b..87bbd53 100644
--- a/scraping.py
+++ b/scraping.py
@@ -5,6 +5,8 @@
 import os
 import re
+from config import SENTENCE_DELIMITER
 
+
 def clean_text(text):
     # takes text and gets rid of the english stuff from it
     return_text = re.sub(r'[A-Z]|[0-9]|[a-z]', "", text)
@@ -55,3 +57,20 @@ def create_relevant_data_files(data_dir_path):
         # file was already made
         pass
 
+
+def sentence_tokenize(text):
+    # simple sentence tokenization using a regex split; the capturing
+    # group in SENTENCE_DELIMITER makes re.split keep each delimiter
+    # as its own list element
+    tokenized_list = re.split(SENTENCE_DELIMITER, text)
+    tokenized_list_final = []
+
+    for index, token in enumerate(tokenized_list):
+        if re.match(SENTENCE_DELIMITER, token):
+            # skip the elements that are the delimiters themselves
+            continue
+        if index < len(tokenized_list) - 1:
+            if tokenized_list[index + 1] != "?" and token.strip():
+                # we don't want question sentences or empty tokens
+                tokenized_list_final.append(token.strip())
+    return tokenized_list_final
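
For reference, a rough sketch (not part of the patch) of how the new
sentence_tokenize is expected to behave; the sample Hindi sentence is
illustrative only, not taken from the scraped corpus:

    import re

    SENTENCE_DELIMITER = r"(!|\?|।)"  # as defined in config.py above

    sample = "वह घर गया। तुम कहाँ हो? बहुत खूब!"

    # the capturing group makes re.split keep each delimiter as its own element:
    print(re.split(SENTENCE_DELIMITER, sample))
    # ['वह घर गया', '।', ' तुम कहाँ हो', '?', ' बहुत खूब', '!', '']

    # sentence_tokenize then drops the delimiter elements and any sentence
    # followed by "?", so it would return:
    # ['वह घर गया', 'बहुत खूब']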