-
Notifications
You must be signed in to change notification settings - Fork 0
/
step_1_data_to_pandas_normal.py
41 lines (34 loc) · 1.26 KB
/
step_1_data_to_pandas_normal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import os
import re
import codecs
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
# Expected on-disk layout: original_datasets/<folder_name>/<label>/<document file>.
# Compiled once at import time; matches every character that is not an ASCII letter.
_NON_LETTERS = re.compile(r"[^a-zA-Z]")


def preprocess_text(raw_text, stop_words):
    """Return *raw_text* reduced to lowercase ASCII-letter tokens.

    Every character outside ``a-z``/``A-Z`` is replaced by a space, the result
    is lowercased and split on whitespace, and tokens found in *stop_words*
    are dropped. The surviving tokens are joined with single spaces.

    Args:
        raw_text: The document text to clean.
        stop_words: Container of lowercase tokens to remove (a set or
            frozenset keeps each membership test constant-time).

    Returns:
        The cleaned text; an empty string when no token survives.
    """
    tokens = _NON_LETTERS.sub(" ", raw_text).lower().split()
    return " ".join(token for token in tokens if token not in stop_words)


def list_visible_entries(directory):
    """Return the sorted entry names of *directory*, minus ``.DS_Store``.

    ``os.listdir`` gives no ordering guarantee, so the names are sorted to
    make the row order of the generated CSV reproducible.
    """
    return sorted(name for name in os.listdir(directory) if name != ".DS_Store")


def collect_rows(dataset_dir, stop_words):
    """Build one ``[file name, cleaned text, label]`` row per document.

    Each sub-directory of *dataset_dir* is treated as a label and each regular
    file inside it as one document of that label. Entries of *dataset_dir*
    that are not directories, entries of a label folder that are not regular
    files, and ``.DS_Store`` at either level are skipped.

    Args:
        dataset_dir: Path of the dataset's main folder.
        stop_words: Container of tokens passed on to ``preprocess_text``.

    Returns:
        A list of three-element lists, ordered by label and then file name.
    """
    rows = []
    for label in list_visible_entries(dataset_dir):
        label_dir = os.path.join(dataset_dir, label)
        if not os.path.isdir(label_dir):
            # A stray file next to the label folders is not a label.
            continue
        for file_name in list_visible_entries(label_dir):
            file_path = os.path.join(label_dir, file_name)
            if not os.path.isfile(file_path):
                continue
            # latin-1 maps every byte to a character, so decoding never fails.
            with open(file_path, "r", encoding='latin-1') as handle:
                raw_text = handle.read()
            rows.append([file_name, preprocess_text(raw_text, stop_words), label])
    return rows


def main():
    """Prompt for a dataset folder, clean its documents, and write a CSV.

    Reads ``original_datasets/<folder name>`` and writes
    ``pre_processed_df/pre_processed_<folder name>.csv`` with the columns
    ``0`` (file name), ``1`` (cleaned text) and ``2`` (label).
    """
    print("Please Enter the Exact Dataset Main Folder Name")
    folder_name = input()

    # Evaluate the stop-word list once (not once per token) and keep it in a
    # set so each membership test is a hash lookup rather than a list scan.
    stop_words = frozenset(stopwords.words('english'))

    rows = collect_rows("original_datasets/" + folder_name, stop_words)

    # Build the frame straight from the Python lists: routing the rows through
    # a NumPy string array would pad every cell to the longest document.
    # Explicit integer columns keep the "0,1,2" header even with zero rows.
    df = pd.DataFrame(rows, columns=range(3))
    print("===========DataFrame-Complete===========")

    output_path = 'pre_processed_df/pre_processed_' + folder_name + '.csv'
    # Create the output folder on first use instead of failing in to_csv.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)


if __name__ == "__main__":
    main()