forked from hengchao0248/ccf2016_sougou
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dm_nn_stack.py
89 lines (73 loc) · 2.95 KB
/
dm_nn_stack.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
'''Stacking step: train a small neural net on the "dm" Doc2Vec document vectors and save its class probabilities for Education/age/gender.'''
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.cross_validation import KFold
from gensim.models import Doc2Vec
from collections import OrderedDict
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD, Adam, RMSprop
from keras.utils import np_utils
import re
import cfg
#-----------------------myfunc-----------------------
def myAcc(y_true,y_pred):
    '''Return the accuracy of per-class scores against integer labels.

    Parameters
    ----------
    y_true : array-like
        One class index per sample. A column vector of shape (n, 1) is
        flattened so that it lines up with the predicted classes instead of
        broadcasting into an n-by-n comparison.
    y_pred : array-like, shape (n_samples, n_classes)
        Per-class scores; the predicted class is the argmax along axis 1.

    Returns
    -------
    The fraction of samples whose predicted class equals the label.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of prediction rows.
    '''
    predicted = np.argmax(np.asarray(y_pred), axis=1)
    labels = np.asarray(y_true).ravel()
    # Fail loudly on a size mismatch rather than letting the comparison
    # broadcast (or collapse to a single boolean) and report a bogus score.
    if labels.shape[0] != predicted.shape[0]:
        raise ValueError(
            'myAcc: got {} labels for {} predictions'.format(
                labels.shape[0], predicted.shape[0]))
    return np.mean(labels == predicted)
#-----------------------load dataset----------------------
# Only the id and the three label columns are read, capped at the first
# 200000 rows of the file.
df_all = pd.read_csv(cfg.data_path + 'all_v2.csv',encoding='utf8',usecols=['Id','Education','age','gender'],nrows=200000)
# One label array per target, keyed by its column name.
ys = {label: np.array(df_all[label]) for label in ['Education','age','gender']}
model = Doc2Vec.load(cfg.data_path + 'dm_d2v.model')
# Take one document vector per row that was actually read, so the feature
# matrix can never be longer than df_all (a hard-coded 200000 here would
# only surface as a length mismatch when the stacked columns are assigned).
# NOTE(review): assumes docvec index i corresponds to row i of all_v2.csv -
# confirm against the script that trained dm_d2v.model.
X_sp = np.array([model.docvecs[i] for i in range(len(df_all))])
#----------------------dmd2v stack for Education/age/gender---------------------------
def _build_classifier(input_dim, nb_classes):
    '''Return a compiled one-hidden-layer softmax classifier.

    The network is Dense(300) -> Dropout(0.1) -> tanh -> Dense(nb_classes)
    -> softmax, compiled with categorical cross-entropy and adadelta.
    '''
    net = Sequential()
    net.add(Dense(300,input_shape=(input_dim,)))
    net.add(Dropout(0.1))
    net.add(Activation('tanh'))
    net.add(Dense(nb_classes))
    net.add(Activation('softmax'))
    net.compile(loss='categorical_crossentropy',
                optimizer='adadelta',
                metrics=['accuracy'])
    return net


# One output row per input row; probability columns are appended per target.
df_stack = pd.DataFrame(index=range(len(df_all)))
TR = 100000  # rows [0, TR) are used for training, rows [TR, end) are held out
n = 5        # number of cross-validation folds
X = X_sp[:TR]
X_te = X_sp[TR:]
feat = 'dmd2v'
# The held-out features do not depend on the target or the fold, so the
# float32 copy handed to Keras as validation data is made once.
X_te_f32 = X_te.astype('float32')
for lb in ['Education','age','gender']:
    # NOTE(review): the class count is the number of distinct label values
    # over ALL rows; presumably labels are integers 0..num_class-1, as
    # to_categorical requires - confirm against the data.
    num_class = len(pd.value_counts(ys[lb]))
    y = ys[lb][:TR]
    y_te = ys[lb][TR:]
    # One-hot held-out labels are the same for every fold of this target.
    Y_te = np_utils.to_categorical(y_te, num_class)
    stack = np.zeros((X.shape[0],num_class))
    stack_te = np.zeros((X_te.shape[0],num_class))
    # Legacy sklearn.cross_validation.KFold: constructed from the sample
    # count and iterated directly to get (train_idx, valid_idx) pairs.
    for k,(tr,va) in enumerate(KFold(len(y),n_folds=n)):
        print('{} stack:{}/{}'.format(datetime.now(),k+1,n))
        X_train = X[tr].astype('float32')
        Y_train = np_utils.to_categorical(y[tr], num_class)
        # Rebinds the module-level name `model`, which until the first fold
        # refers to the loaded Doc2Vec model.
        model = _build_classifier(X_train.shape[1], num_class)
        # The held-out set is passed as validation data for per-epoch
        # monitoring only; nothing in this loop reads the fit history.
        model.fit(X_train, Y_train,shuffle=True,
                  batch_size=128, nb_epoch=35,
                  verbose=2, validation_data=(X_te_f32, Y_te))
        y_pred_va = model.predict_proba(X[va])
        y_pred_te = model.predict_proba(X_te)
        print('va acc:',myAcc(y[va],y_pred_va))
        print('te acc:',myAcc(y_te,y_pred_te))
        # Training rows get the prediction of the fold model that did not
        # train on them; held-out rows accumulate every fold's prediction.
        stack[va] += y_pred_va
        stack_te += y_pred_te
    # Average the n per-fold predictions for the held-out rows.
    stack_te /= n
    stack_all = np.vstack([stack,stack_te])
    for l in range(stack_all.shape[1]):
        df_stack['{}_{}_{}'.format(feat,lb,l)] = stack_all[:,l]
df_stack.to_csv(cfg.data_path + 'dmd2v_stack_20W.csv',encoding='utf8',index=None)
print(datetime.now(),'save dmd2v stack done!')