-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_dataset.py
32 lines (24 loc) · 1017 Bytes
/
create_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import pandas as pd
# Name of the sub-directory under ./data that holds the input files and
# receives the generated dataset.tsv.
dataset = 'yelp-2018'
train_filename = 'train.txt'
test_filename = 'test.txt'


def load_interactions(path):
    """Read an adjacency-list text file into a two-column frame of id pairs.

    Each line is expected to hold whitespace-separated integers: the first
    token is the row id and every following token is a column id paired with
    it.  Lines with fewer than two tokens (blank lines, or a row id with
    nothing after it) contribute no pairs.

    Args:
        path: Location of the text file to read.

    Returns:
        A DataFrame with int64 columns ``0`` (row id) and ``1`` (column id),
        one row per pair, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a token on a line with pairs is not an integer.
    """
    rows, cols = [], []
    with open(path, 'r') as f:
        for line in f:
            # split() with no argument collapses runs of whitespace and drops
            # the trailing newline, so a trailing or doubled space can neither
            # discard a whole line nor produce an empty token for int().
            tokens = line.split()
            if len(tokens) < 2:
                continue
            row_id = int(tokens[0])  # parsed once per line, not once per pair
            for token in tokens[1:]:
                rows.append(row_id)
                cols.append(int(token))
    # An explicit dtype keeps the columns integer even when no pairs were read.
    return pd.DataFrame({0: rows, 1: cols}, dtype='int64')


# Guarded so that importing this module does not touch the filesystem;
# running it as a script behaves as before.
if __name__ == '__main__':
    train = load_interactions('./data/{0}/{1}'.format(dataset, train_filename))
    test = load_interactions('./data/{0}/{1}'.format(dataset, test_filename))

    # Stack both splits, order by row id (column 0) and renumber the index.
    df = pd.concat([train, test], axis=0).sort_values(0).reset_index(drop=True)

    # Tab-separated output with neither header row nor index column.
    df.to_csv('./data/{0}/dataset.tsv'.format(dataset), sep='\t', header=False, index=False)