From 1b2cc951e58ac60133a5e214f85998397cfd9c73 Mon Sep 17 00:00:00 2001
From: Shashwat Singh
Date: Fri, 23 Jul 2021 23:54:39 +0530
Subject: [PATCH] [tokenization] introduce a function called sentence_tokenize
 and a setting called SENTENCE_DELIMITER

- SENTENCE_DELIMITER holds our list of sentence delimiters as a regex (in
  config.py)
- sentence_tokenize returns a list of tokenized sentences (split on
  SENTENCE_DELIMITER)

Note: sentence_tokenize also DELETES the question sentences
---
 config.py   |  3 ++-
 scraping.py | 22 ++++++++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/config.py b/config.py
index 96faf38..75b911c 100644
--- a/config.py
+++ b/config.py
@@ -1 +1,2 @@
-SEED_URL = "http://premchand.co.in/story/pariksha"
\ No newline at end of file
+SEED_URL = "http://premchand.co.in/story/pariksha"
+SENTENCE_DELIMITER = r"(!|\?|।)"
diff --git a/scraping.py b/scraping.py
index 829356b..87bbd53 100644
--- a/scraping.py
+++ b/scraping.py
@@ -5,6 +5,8 @@
 import os
 import re
+from config import SENTENCE_DELIMITER
 
+
 def clean_text(text):
     # takes text and gets rid of the english stuff from it
     return_text = re.sub(r'[A-Z]|[0-9]|[a-z]', "", text)
@@ -55,3 +57,23 @@ def create_relevant_data_files(data_dir_path):
         # file was already made
         pass
 
+
+def sentence_tokenize(text):
+    # simple sentence tokenization using regex
+
+    tokenized_list = re.split(SENTENCE_DELIMITER, text)
+    # re.split keeps every delimiter as a token of its own because
+    # SENTENCE_DELIMITER is a capturing group
+    tokenized_list_final = []
+
+    for index, token in enumerate(tokenized_list):
+        if re.match(SENTENCE_DELIMITER, token):
+            # skip the tokens that are the delimiters themselves
+            continue
+        if index < len(tokenized_list) - 1:
+            if tokenized_list[index + 1] != "?":
+                # we don't want question sentences, so only keep a
+                # sentence whose following delimiter is not "?"
+                tokenized_list_final.append(token)
+    return tokenized_list_final
+
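
Illustrative usage (not part of the patch): a quick REPL sketch of the intended
behaviour, assuming config.py and scraping.py are on the import path. The sample
sentence is made up and means "This is the first sentence. Is this a question?
This is the last sentence."

    >>> from scraping import sentence_tokenize
    >>> sentence_tokenize("यह पहला वाक्य है। क्या यह प्रश्न है? यह अंतिम वाक्य है।")
    ['यह पहला वाक्य है', ' यह अंतिम वाक्य है']

The middle question sentence is dropped, the delimiter tokens themselves are
removed, and the leading space on the second item shows that the kept tokens
are not stripped of surrounding whitespace.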