forked from rajnish-garg/Weka-File-Format
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ConvertToARFF2.py
95 lines (60 loc) · 2.42 KB
/
ConvertToARFF2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/python
#Convert a CSV file (whose attributes are assumed to be binary features) to ARFF format
#Written by: Ajit kumar, [email protected] ,5March2015
#No license required for any kind of reuse
#Code is adopted from github @ rajnish-garg/Weka-File-Format
#If using this script for your work, please refer this on your willingness
#input: a csv file with one row per sample; each row lists the sample's features and ends with the sample's class
#output: an arff file in sparse data notation
#Note: All classes are identified automatically by taking the last value of each row as the class;
# every attribute is treated as binary
import csv
import os
import pdb
import sys
# Variable initialization
filename = "/path/Data.csv"
# If this script and the csv are in the same folder, the bare file name can be given instead
#filename = "Data.csv"
# The .arff file gets the same base name as the csv and is saved in the same folder.
# os.path.splitext removes only the final extension; splitting on the first "."
# truncated paths such as "./Data.csv" or "/data.v2/Data.csv".
arffFilename = os.path.splitext(filename)[0] + ".arff"


def _open_csv(path):
    """Open *path* in the mode the csv module expects on the running interpreter."""
    if sys.version_info[0] < 3:
        # Python 2's csv module reads from files opened in binary mode.
        return open(path, 'rb')
    # Python 3's csv module reads text and wants newline handling left to it.
    return open(path, 'r', newline='')


def _iter_rows(path):
    """Yield each non-empty row of the csv at *path* with every cell stripped.

    Rows have the form [A1, A2, ..., An-1, classtype]; the csv must not have a
    header. Blank lines are skipped because they carry no class value. The
    file is closed once the rows are exhausted.
    """
    with _open_csv(path) as handle:
        for row in csv.reader(handle):
            if row:
                yield [cell.strip() for cell in row]


def _collect_vocabulary(path):
    """Return (items, labels): the sorted unique items and classes found in *path*.

    The last cell of each row is taken as the class and every other non-empty
    cell as an item. Sorting makes the attribute order reproducible between
    runs instead of depending on set iteration order.
    """
    items = set()
    labels = set()
    for row in _iter_rows(path):
        # Empty cells (e.g. from doubled commas) would become unnamed attributes.
        items.update(cell for cell in row[:-1] if cell)
        labels.add(row[-1])
    return sorted(items), sorted(labels)


def _write_header(out, items, labels):
    """Write the @relation line, the attribute declarations and the @data marker to *out*."""
    out.write('@relation Weka\n\n\n')
    # Every item is declared as a nominal attribute of binary type {false,true}.
    # NOTE: names are written verbatim; no ARFF quoting is applied.
    for item in items:
        out.write("@attribute " + item + " {false,true}\n")
    # The last attribute is the class, enumerating every label seen in the csv.
    out.write("@attribute class{" + ",".join(labels) + "}\n")
    out.write('\n\n@data\n\n')


def convert_csv_to_arff(csv_path, arff_path, echo=True):
    """Convert the csv at *csv_path* into a sparse-notation arff file at *arff_path*.

    The csv is read twice: once to collect all items and classes for the
    header, and once to write one data line per row. A data line lists only
    the attributes present in the row, as "<index> true", followed by the
    class attribute, e.g. "{ 0 true,2 true,3 yes}".

    Args:
        csv_path: path of the header-less input csv.
        arff_path: path of the arff file to create (overwritten if present).
        echo: when true, also print every data line that is written.
    """
    items, labels = _collect_vocabulary(csv_path)
    # Map each item to its attribute index once, instead of searching the
    # list for every cell of every row.
    column_of = {item: position for position, item in enumerate(items)}
    # The class attribute is declared after all items, so it takes the next index.
    class_column = len(items)

    with open(arff_path, 'w') as out:
        _write_header(out, items, labels)
        for row in _iter_rows(csv_path):
            # The set drops repeated items; sorting writes the indices in ascending order.
            columns = sorted({column_of[cell] for cell in row[:-1] if cell})
            entries = ["%d true" % column for column in columns]
            # The class value is stripped here too, so it matches the declared label.
            entries.append("%d %s" % (class_column, row[-1]))
            data_line = "{ " + ",".join(entries) + "}\n"
            if echo:
                print(data_line)
            out.write(data_line)


if __name__ == "__main__":
    convert_csv_to_arff(filename, arffFilename)