diff --git a/config.py b/config.py
index 96faf38..75b911c 100644
--- a/config.py
+++ b/config.py
@@ -1 +1,2 @@
-SEED_URL = "http://premchand.co.in/story/pariksha"
\ No newline at end of file
+SEED_URL = "http://premchand.co.in/story/pariksha"
+SENTENCE_DELIMITER = r"(!|\?|।)"  # '।' is the Devanagari danda (full stop)
diff --git a/scraping.py b/scraping.py
index 829356b..87bbd53 100644
--- a/scraping.py
+++ b/scraping.py
@@ -5,6 +5,8 @@
 import os
 import re
+from config import SENTENCE_DELIMITER
 
+
 def clean_text(text):
     # takes text and gets rid of the english stuff from it
     return_text = re.sub(r'[A-Z]|[0-9]|[a-z]', "", text)
@@ -55,3 +57,20 @@ def create_relevant_data_files(data_dir_path):
         # file was already made
         pass
 
+
+def sentence_tokenize(text):
+    # simple sentence tokenization using a regex split; the capturing
+    # group in SENTENCE_DELIMITER makes re.split keep each delimiter
+    # as its own list element
+    tokenized_list = re.split(SENTENCE_DELIMITER, text)
+    tokenized_list_final = []
+
+    for index, token in enumerate(tokenized_list):
+        if re.match(SENTENCE_DELIMITER, token):
+            # skip the elements that are the delimiters themselves
+            continue
+        if index < len(tokenized_list) - 1:
+            if tokenized_list[index + 1] != "?" and token.strip():
+                # we don't want question sentences or empty tokens
+                tokenized_list_final.append(token.strip())
+    return tokenized_list_final
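
For reference, a rough sketch (not part of the patch) of how the new
sentence_tokenize is expected to behave; the sample Hindi sentence is
illustrative only, not taken from the scraped corpus:

    import re

    SENTENCE_DELIMITER = r"(!|\?|।)"  # as defined in config.py above

    sample = "वह घर गया। तुम कहाँ हो? बहुत खूब!"

    # the capturing group makes re.split keep each delimiter as its own element:
    print(re.split(SENTENCE_DELIMITER, sample))
    # ['वह घर गया', '।', ' तुम कहाँ हो', '?', ' बहुत खूब', '!', '']

    # sentence_tokenize then drops the delimiter elements and any sentence
    # followed by "?", so it would return:
    # ['वह घर गया', 'बहुत खूब']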