# pairwise-train-AOL-cleaned.py
import pickle
import subprocess
from sklearn.preprocessing import StandardScaler
# train a LeToR model (with AOL queries)
# depends on rankSVM (svm_rank)
# input: features, ground truth, query lengths (pre-generated by Zhuyun)
# output:
#   model_cwb.dat: the trained model
#   scaler.pkl: the training data's mean and variance, used to scale testing data
# constants and input files: read Zhuyun's features
nshard = 123
basedir = "./data/cent1-split-new/aol-train/" # TODO: Yubin, change aol-train to mqt-train to train with MQT queries.
feat_names = pickle.load(open(basedir + "feat_names.pkl", "rb"))
print("reading files...")
feat = pickle.load(open(basedir + "train_feat.pkl", "rb"))
truth = pickle.load(open(basedir + "train_truth.pkl", "rb"))
qlen = pickle.load(open(basedir + "train_qlen.pkl", "rb"))
# TODO: Yubin, append your own features to feat here!
# feat[q][s]: a list; the feature vector for [query q, shard s].
# e.g. feat[0][0].append(blockmax_score_00) adds your score for [query 0, shard 0];
# feat[0][0].append(0) if the score for [query 0, shard 0] doesn't exist.
# A commented-out sketch of that step is below.
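# Hedged sketch (my_scores is a hypothetical {(query, shard): score} dict,
# not one of the provided inputs):
# for q in range(len(feat)):
#     for s in range(nshard):
#         feat[q][s].append(my_scores.get((q, s), 0))  # 0 when no score exists
# feat_names.append("my_score")  # keep feat_names aligned with the new column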
# start training
# 1. filter out empty queries, and queries with fewer than 500 retrieved documents.
# 2. turn the feat 2-D list into a numpy matrix
# 3. scale the feature matrix (mean-variance scaler)
# 4. write the normalized matrix into rankSVM format
# 5. call rankSVM. model stored in "model_cwb.dat"
train_queries = range(1000)
# 1. filter out empty queries, and queries with fewer than 500 retrieved documents.
train_queries = [q for q in train_queries if qlen[q] > 0 and sum(truth[q]) >= 500]
# 2. turn the feat 2-D list into a numpy matrix
X_train = []
Y_train = []
mem_train = []
for q in train_queries:
    feat_q = [feat[q][s] for s in range(nshard)]
    # rank shards by ReDDE score (feature index 2), highest first
    redde_q_sorted = sorted([(val[2], i) for i, val in enumerate(feat[q])], reverse=True)
    for i, (_, s) in enumerate(redde_q_sorted):
        # keep the top-ranked shards plus every shard with relevant documents;
        # with the cutoff at nshard this keeps all shards (lower it to subsample).
        if i < nshard or truth[q][s] > 0:
            X_train.append(feat_q[s])
            Y_train.append(truth[q][s])
            mem_train.append((q, s))
# 3. scale the feature matrix (mean-variance scaler)
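# StandardScaler learns each feature's mean and standard deviation here
# (fit_transform); the same statistics must be reused on test data via transform.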
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
# 4. write the normalized matrix into rankSVM format
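# Each output line uses the SVMlight/svm_rank format
#   <target> qid:<qid> 1:<value> 2:<value> ...
# e.g. "3 qid:17 1:0.52 2:-1.10 ..." (values illustrative only); lines that
# share a qid must be consecutive in the file.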
print "writing into rankSVM input file train_cwb.feat..."
trainFeatFile = open("train_cwb.feat",'w')
for i, f in enumerate(X_train):
qid = mem_train[i][0] +1
res = ""
if Y_train[i]:
res += str(Y_train[i])
else:
res += "0"
res += " qid:"+str(qid) + " ";
for j in range(len(f)):
res += "{0}:{1} ".format(j + 1, f[j])
res += "\n"
trainFeatFile.write(res)
trainFeatFile.close()
# 5. call rankSVM. model stored in "model_cwb.dat"
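# svm_rank_learn flags (inherited from SVMlight): -c sets the trade-off between
# training error and margin, -t 0 selects a linear kernel; -g is the RBF-kernel
# gamma and has no effect with -t 0.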
print("training...")
subprocess.run(["/Users/zhuyund/Documents/11642-SearchEngines/svm_rank/svm_rank_learn",
                "-c", "1", "-g", "0.001", "-t", "0",
                "train_cwb.feat", "model_cwb.dat"], check=True)
print("finished training! model_cwb.dat, scaler.pkl")
# store the scaler; it will be used in the testing phase.
with open("scaler.pkl", "wb") as fh:
    pickle.dump(scaler, fh)
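# Hedged sketch of the test-time counterpart (X_test is hypothetical):
# with open("scaler.pkl", "rb") as fh:
#     scaler = pickle.load(fh)
# X_test = scaler.transform(X_test)  # transform, not fit_transform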
# print model weights
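# In a linear svm_rank model file, the last line stores the weight vector as a
# single support vector: "<alpha> <fid>:<weight> ... #". The parsing below
# assumes that layout.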
with open("model_cwb.dat") as f:
    wline = f.readlines()[-1]
items = wline.split(' ')
wf = []
for item in items[1:-1]:  # skip the leading alpha and the trailing "#"
    fid, w = item.split(':')
    wf.append((float(w), feat_names[int(fid) - 1]))
for w, name in sorted(wf, reverse=True):
    print(w, name)