custom_format_converter.py
# -*- coding: utf-8 -*-
import json
from functools import lru_cache
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
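
# shared NLTK components: a lowercasing tweet tokenizer, a WordNet lemmatizer and the
# English stopword list (requires the NLTK "punkt", "wordnet" and "stopwords" data packages)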
tokenizer = TweetTokenizer(preserve_case=False)
lemmatizer = WordNetLemmatizer()
stops = set(stopwords.words("english"))


@lru_cache(maxsize=1000000000)
def lemmatize(w: str):
    # caching the word-based lemmatizer to speed the process up
    return lemmatizer.lemmatize(w)


def read_amazon_format(path: str, sentence=True):
    """
    Reads an Amazon-dataset-like JSON-lines file, splits the reviews into sentences (or not),
    tokenizes, lemmatizes, filters and saves the result into a text file.
    :param path: path to the input file
    :param sentence: if True, write one sentence per line; otherwise write one review per line
    """
    with open(path + ("" if sentence else "-full_text") + ".txt", "w+", encoding="utf-8") as wf:
        for line in tqdm(open(path, "r", encoding="utf-8"), "normalizing texts read from [%s]" % path):
            # reading the review text
            text = json.loads(line.strip())["reviewText"].replace("\n", " ")
            # splitting into sentences and tokenizing
            sentences = sent_tokenize(text)
            tokenized_sentences = [tokenizer.tokenize(s) for s in sentences]
            # removing stopwords and non-alphabetic tokens, lemmatizing what remains
            lemmatized_sentences = [[lemmatize(word) for word in s if word not in stops and word.isalpha()]
                                    for s in tokenized_sentences]
            # one sentence per line in sentence mode; space-separated sentences ending
            # with a single newline per review otherwise
            for tokens in lemmatized_sentences:
                wf.write(" ".join(tokens) + ("\n" if sentence else " "))
            if not sentence:
                wf.write("\n")


if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1:
        path = sys.argv[1]
    else:
        path = "reviews_Cell_Phones_and_Accessories_5.json"

    read_amazon_format(path, sentence=True)
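
# Usage sketch (illustrative; the sample review below is made up, but each input line is
# expected to be a JSON object carrying a "reviewText" field, as read above):
#
#   $ python custom_format_converter.py reviews_Cell_Phones_and_Accessories_5.json
#
#   input line:    {"reviewText": "Great case. It fits the phone perfectly!"}
#   output lines:  great case
#                  fit phone perfectly
#
# (exact tokens may vary: tokenization, lemmas and the stopword list all come from the
# installed NLTK data)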