Skip to content

Commit 6926c6a

Browse files
committed
scripts: add extractor and filtering tools
1 parent cb957f5 commit 6926c6a

File tree

2 files changed

+89
-0
lines changed

2 files changed

+89
-0
lines changed

extractor.py

+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
"""Keep the high-language-score records of a single-member ``.tar.gz`` dump.

Usage::

    python extractor.py <archive.tar.gz>

The archive must contain exactly one member.  Its content is decoded as
UTF-8, split into one record per line, and every record that parses as a
Python literal with ``language_score`` above the threshold is written
unchanged to ``<archive>.txt`` (``.tar.gz`` replaced by ``.txt``).
"""
import sys
import time
import tarfile

from ast import literal_eval

# Records with a ``language_score`` at or below this value are dropped.
MIN_LANGUAGE_SCORE = 0.98

# Everything a malformed record can raise while being parsed and inspected:
# literal_eval failures, a missing key, or a value that is not
# subscriptable / not comparable with a float.
_RECORD_ERRORS = (
    ValueError,
    SyntaxError,
    TypeError,
    KeyError,
    MemoryError,
    RecursionError,
)


def extractor_output_path(filename):
    """Return the ``.txt`` output path derived from an archive path.

    Raises:
        ValueError: if *filename* contains no ``.tar.gz``.  The derived path
            would then equal the input path, and opening it for writing
            would truncate the archive itself.
    """
    out_filename = filename.replace(".tar.gz", ".txt")
    if out_filename == filename:
        raise ValueError(
            "expected a '.tar.gz' archive, got {!r}".format(filename)
        )
    return out_filename


def split_records(file_content):
    """Split the dump into candidate record strings, one per list item.

    Records written back-to-back without a newline (``}{'url'``) are
    separated first, so each returned string holds at most one record.
    """
    return file_content.replace("}{'url'", "}\n{'url'").split("\n")


def is_high_confidence(line, threshold=MIN_LANGUAGE_SCORE):
    """Return True if *line* is a record with ``language_score`` > threshold.

    Lines that are empty, not a valid Python literal, or lack a usable
    ``language_score`` are treated as non-matching rather than as errors.
    """
    try:
        data = literal_eval(line)
        return bool(data["language_score"] > threshold)
    except _RECORD_ERRORS:
        return False


def extract(filename, out_filename, threshold=MIN_LANGUAGE_SCORE):
    """Write the high-confidence records of *filename* to *out_filename*.

    Returns:
        The number of records written.

    Raises:
        ValueError: if the archive does not hold exactly one member, or that
            member is not a regular file.
    """
    with tarfile.open(filename, "r:gz") as archive:
        members = archive.getmembers()
        print("Extracting:", members)

        if len(members) != 1:
            raise ValueError(
                "expected exactly one archive member, found {}".format(
                    len(members)
                )
            )

        extracted_file = archive.extractfile(members[0])
        if extracted_file is None:
            # extractfile() returns None for members that are not regular
            # files or links (e.g. a directory entry).
            raise ValueError(
                "archive member {!r} is not a regular file".format(
                    members[0].name
                )
            )
        file_content = extracted_file.read().decode("utf-8")

    kept = 0
    # The output is opened only after the archive was read successfully, so
    # a bad archive does not leave an empty output file behind.
    # NOTE(review): the file uses the locale's default encoding, as before;
    # confirm whether an explicit encoding="utf-8" is wanted here.
    with open(out_filename, "wt") as out_f_p:
        for line in split_records(file_content):
            if is_high_confidence(line, threshold):
                out_f_p.write(line + "\n")
                kept += 1
    return kept


def main(argv=None):
    """Command-line entry point; *argv* defaults to ``sys.argv[1:]``."""
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit("usage: extractor.py <archive.tar.gz>")

    filename = args[0]
    out_filename = extractor_output_path(filename)

    start_time = time.time()
    extract(filename, out_filename)
    print("Extraction took:", round(time.time() - start_time, 2), "seconds.")


if __name__ == "__main__":
    main()

filtering.py

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
"""Write the ``raw_content`` of clean records to a ``_filtered.txt`` file.

Usage::

    python filtering.py <records.txt>

Each input line is parsed as a Python literal record.  A record is kept when
it carries the ``date_download``, ``digest`` and ``raw_content`` keys and its
``raw_content`` contains none of the forbidden markers; only the
``raw_content`` text is written to the output.
"""
import sys
import time

from ast import literal_eval

# A record missing any of these keys is skipped (its error is printed).
REQUIRED_KEYS = ("date_download", "digest", "raw_content")

# Substrings that disqualify a record: angle brackets, URL schemes, and the
# replacement character exactly as it appears in the original script
# (presumably U+FFFD, i.e. text that was decoded lossily -- confirm).
FORBIDDEN_MARKERS = ("<", ">", "http:", "https:", "�")

# Everything a malformed record can raise while being parsed and validated.
_RECORD_ERRORS = (
    ValueError,
    SyntaxError,
    TypeError,
    KeyError,
    MemoryError,
    RecursionError,
)


def filtered_output_path(filename):
    """Return the ``_filtered.txt`` output path derived from an input path.

    Raises:
        ValueError: if *filename* contains no ``.txt``.  The derived path
            would then equal the input path, and opening it for writing
            would truncate the input before it is read.
    """
    out_filename = filename.replace(".txt", "_filtered.txt")
    if out_filename == filename:
        raise ValueError("expected a '.txt' file, got {!r}".format(filename))
    return out_filename


def read_raw_content(line):
    """Parse one record line and return its ``raw_content`` string.

    Raises:
        ValueError, SyntaxError: if *line* is not a valid Python literal.
        KeyError: if a key from ``REQUIRED_KEYS`` is missing.
        TypeError: if the record is not a dict or ``raw_content`` is not a
            string.
    """
    data = literal_eval(line)
    if not isinstance(data, dict):
        raise TypeError(
            "record is a {}, not a dict".format(type(data).__name__)
        )
    for key in REQUIRED_KEYS:
        if key not in data:
            raise KeyError(key)

    raw_content = data["raw_content"]
    if not isinstance(raw_content, str):
        raise TypeError(
            "raw_content is a {}, not a str".format(
                type(raw_content).__name__
            )
        )
    return raw_content


def is_clean(raw_content):
    """Return True if *raw_content* contains none of the forbidden markers."""
    return not any(marker in raw_content for marker in FORBIDDEN_MARKERS)


def filter_file(filename, out_filename):
    """Copy the clean ``raw_content`` of *filename* into *out_filename*.

    Malformed records are reported on stdout and skipped; I/O errors are not
    swallowed.

    Returns:
        The number of records written.
    """
    kept = 0
    with open(filename, "rt") as f_p, open(out_filename, "wt") as out_f_p:
        for line in f_p:
            # Only parsing/validation is best-effort; the write below is
            # deliberately outside the try so disk errors surface.
            try:
                raw_content = read_raw_content(line)
            except _RECORD_ERRORS as e:
                print(e)
                continue

            if not is_clean(raw_content):
                continue

            out_f_p.write(raw_content + "\n")
            kept += 1
    return kept


def main(argv=None):
    """Command-line entry point; *argv* defaults to ``sys.argv[1:]``."""
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit("usage: filtering.py <records.txt>")

    filename = args[0]
    out_filename = filtered_output_path(filename)

    start_time = time.time()
    filter_file(filename, out_filename)
    print(
        "Filtering for", filename, "took:",
        round(time.time() - start_time, 2), "seconds.",
    )


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)