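"""Preprocess a raw text dump into a 500k-token Tagalog corpus.

Pipeline: read the input file in 1 MB chunks, keep only sentences that are
mostly Tagalog according to langdetect, normalize the surviving text (strip
HTML, lowercase, remove symbols), tokenize it, and write up to 500,000 tokens
to preprocessed_tagalog_corpus.txt.
"""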
import re                      # regex for stripping symbols
import nltk                    # sentence and word tokenization
from bs4 import BeautifulSoup  # extracts text from HTML; harmless on plain text
from langdetect import detect, DetectorFactory, LangDetectException  # language detection

# Make langdetect deterministic across runs
DetectorFactory.seed = 0

# Download the tokenizer models (cached after the first run)
nltk.download('punkt')
nltk.download('punkt_tab')

# Read a file and split its text into fixed-size chunks
def read_and_split_text(file_path, chunk_size=1024 * 1024):  # default chunk size: 1 MB
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # Slice the text into chunk_size-character pieces
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    print(f"{file_path} is loaded.")
    return chunks
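
# For example, a 2.5 MB input yields three chunks: two of 1 MB and one of
# 0.5 MB (sizes are in characters, since the file is decoded as UTF-8).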

# Detect whether a single word is Tagalog. langdetect is unreliable on
# single words, so treat this as a rough heuristic.
def is_tagalog_word(word):
    try:
        return detect(word) == 'tl'  # 'tl' is the ISO 639-1 code for Tagalog
    except LangDetectException:  # raised on inputs with no detectable features
        return False

# Check whether at least `threshold` of a sentence's words are Tagalog
def is_mostly_tagalog(sentence, threshold=0.5):
    words = nltk.word_tokenize(sentence)
    if len(words) == 0:
        return False
    tagalog_word_count = sum(1 for word in words if is_tagalog_word(word))
    tagalog_percentage = tagalog_word_count / len(words)
    return tagalog_percentage >= threshold
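
# Example (hypothetical input): is_mostly_tagalog("Kumain ako ng kanin.")
# returns True only if langdetect tags at least half of the words as 'tl';
# single-word detection is noisy, so short sentences may be misclassified.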

# Normalize text: strip HTML, lowercase, drop non-letter characters,
# and collapse whitespace
def normalize_text(text):
    print("Normalizing text...")
    text = BeautifulSoup(text, "html.parser").get_text()
    text = text.lower()
    # Note: this also removes 'ñ', which appears in some Tagalog words
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
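
# For example, normalize_text("<p>Hello, Mundo! 123</p>") -> "hello mundo"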

# Count tokens in the processed corpus
def count_tokens(tokens):
    print("Counting tokens...")
    return len(tokens)

# Filter a batch down to mostly-Tagalog sentences, then normalize and tokenize
def process_batch(batch):
    sentences = nltk.sent_tokenize(batch)
    tagalog_sentences = []
    total_sentences = len(sentences)
    for idx, sent in enumerate(sentences):
        if is_mostly_tagalog(sent, threshold=0.5):  # adjust the threshold here
            tagalog_sentences.append(sent)
        # Calculate and print the progress percentage
        progress = (idx + 1) / total_sentences * 100
        print(f"Progress: {progress:.2f}%", end="\r")
    tagalog_text = ' '.join(tagalog_sentences)
    normalized_text = normalize_text(tagalog_text)
    # Tokenize into words; returning the raw string here would make the
    # caller's extend() accumulate individual characters instead of tokens
    tokens = nltk.word_tokenize(normalized_text)
    return tokens
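
# Example (hypothetical input): process_batch("Kumusta ka? See you tomorrow.")
# would keep "Kumusta ka?" only if its words are detected as Tagalog, and
# would then return tokens such as ['kumusta', 'ka'].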

# Example usage
if __name__ == "__main__":
    # Step 1: Read and split the text into batches
    file_path = "output.txt"
    chunks = read_and_split_text(file_path)
    all_tokens = []

    # Step 2: Process each batch
    for chunk in chunks:
        print("Processing Batch...")
        tokens = process_batch(chunk)
        all_tokens.extend(tokens)
        print("Batch Processed. Proceeding to the next batch.")

    # Step 3: Count the tokens
    token_count = count_tokens(all_tokens)
    print(f"Total tokens: {token_count}")

    # Step 4: Save the corpus, capped at 500,000 tokens
    with open("preprocessed_tagalog_corpus.txt", 'w', encoding='utf-8') as f:
        f.write(' '.join(all_tokens[:500_000]))
    if token_count < 500_000:
        print(f"Corpus has only {token_count} tokens, consider adding more Tagalog text.")