From 1b2cc951e58ac60133a5e214f85998397cfd9c73 Mon Sep 17 00:00:00 2001
From: Shashwat Singh
Date: Fri, 23 Jul 2021 23:54:39 +0530
Subject: [PATCH] [tokenization] introduce a function called sentence_tokenize
 and a setting called SENTENCE_DELIMITER

- SENTENCE_DELIMITER holds our list of sentence delimiters as a regex (in
  config.py)
- sentence_tokenize returns a list of tokenized sentences (split on
  SENTENCE_DELIMITER)

Note: sentence_tokenize also DELETES the question sentences
---
 config.py   |  3 ++-
 scraping.py | 22 ++++++++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/config.py b/config.py
index 96faf38..75b911c 100644
--- a/config.py
+++ b/config.py
@@ -1 +1,2 @@
-SEED_URL = "http://premchand.co.in/story/pariksha"
\ No newline at end of file
+SEED_URL = "http://premchand.co.in/story/pariksha"
+SENTENCE_DELIMITER = r"(!|\?|।)"
diff --git a/scraping.py b/scraping.py
index 829356b..87bbd53 100644
--- a/scraping.py
+++ b/scraping.py
@@ -5,6 +5,8 @@
 import os
 import re
+from config import SENTENCE_DELIMITER
 
+
 def clean_text(text):
     # takes text and gets rid of the english stuff from it
     return_text = re.sub(r'[A-Z]|[0-9]|[a-z]', "", text)
@@ -55,3 +57,23 @@ def create_relevant_data_files(data_dir_path):
         # file was already made
         pass
 
+
+def sentence_tokenize(text):
+    # simple sentence tokenization using regex
+
+    tokenized_list = re.split(SENTENCE_DELIMITER, text)
+    # re.split keeps every delimiter as a token of its own because
+    # SENTENCE_DELIMITER is a capturing group
+    tokenized_list_final = []
+
+    for index, token in enumerate(tokenized_list):
+        if re.match(SENTENCE_DELIMITER, token):
+            # skip the tokens that are the delimiters themselves
+            continue
+        if index < len(tokenized_list) - 1:
+            if tokenized_list[index + 1] != "?":
+                # we don't want question sentences, so only keep a
+                # sentence whose following delimiter is not "?"
+                tokenized_list_final.append(token)
+    return tokenized_list_final
+
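
Illustrative usage (not part of the patch): a quick REPL sketch of the intended
behaviour, assuming config.py and scraping.py are on the import path. The sample
sentence is made up and means "This is the first sentence. Is this a question?
This is the last sentence."

    >>> from scraping import sentence_tokenize
    >>> sentence_tokenize("यह पहला वाक्य है। क्या यह प्रश्न है? यह अंतिम वाक्य है।")
    ['यह पहला वाक्य है', ' यह अंतिम वाक्य है']

The middle question sentence is dropped, the delimiter tokens themselves are
removed, and the leading space on the second item shows that the kept tokens
are not stripped of surrounding whitespace.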