-
Notifications
You must be signed in to change notification settings - Fork 0
/
model_emb.py
66 lines (53 loc) · 1.97 KB
/
model_emb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# encoding: utf-8
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.metrics import accuracy_score
from nltk import word_tokenize
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from gensim.test.utils import datapath,get_tmpfile
def sent_embedding(sentence):
    """Return the element-wise mean of the embeddings of the items in *sentence*.

    :param sentence: iterable of words; each one is passed to
        ``get_embeddings`` (``preprocess`` passes a list of tokens).
    :return: the result of ``np.mean(..., axis=0)`` over the word vectors.
    """
    word_vectors = list(map(get_embeddings, sentence))
    return np.mean(word_vectors, axis=0)
# Lazily populated cache: the GloVe file is converted and parsed once per
# process instead of once per looked-up word.
_GLOVE_MODEL = None


def _load_glove_model():
    """Return the GloVe vectors as KeyedVectors, loading them on first use."""
    global _GLOVE_MODEL
    if _GLOVE_MODEL is None:
        glove_file = datapath('/home/dipesh/Downloads/glove.6B.50d.txt')
        tmp_file = get_tmpfile("glove_word2vec.txt")
        # Rewrite the GloVe text file in word2vec format so gensim can read it.
        glove2word2vec(glove_file, tmp_file)
        _GLOVE_MODEL = KeyedVectors.load_word2vec_format(tmp_file)
    return _GLOVE_MODEL


def get_embeddings(word):
    """Return the GloVe embedding vector for *word*.

    The model is loaded on the first call and reused afterwards.

    :param word: token to look up.
    :return: the vector stored for *word* in the loaded KeyedVectors.
    :raises KeyError: if *word* is not in the vocabulary.
    """
    # Index the KeyedVectors directly: the ``.wv`` alias is deprecated on
    # KeyedVectors in gensim 3.x and removed in gensim 4.
    return _load_glove_model()[word]
def preprocess(text):
    """Turn one raw text entry into its averaged word embedding.

    Intended to be applied to the text column one row at a time. The text is
    lower-cased, tokenized with ``word_tokenize``, reduced to purely
    alphabetic tokens, and handed to ``sent_embedding``.

    TODO: if no words are found, return an unk token so that the embedding
    of unk is used.

    :param text: raw text of a single row.
    :return: whatever ``sent_embedding`` returns for the kept tokens.
    """
    tokens = word_tokenize(text.lower())
    # Stop-word removal is currently disabled; only non-alphabetic tokens
    # are dropped.
    alphabetic_tokens = list(filter(str.isalpha, tokens))
    return sent_embedding(alphabetic_tokens)
# Train a one-class SVM on the averaged embeddings of the "football" rows of
# train.csv, report accuracy on a train/test split, and pickle the model.
df = pd.read_csv('train.csv', encoding="utf-8")

# .copy() gives an independent frame, so adding a column below does not write
# into a slice of df (avoids pandas' SettingWithCopyWarning).
dataframe = df[df['labels'] == 'football'].copy()
dataframe['features'] = dataframe['data'].apply(preprocess)
dataframe.to_csv('temp.csv')

# A text with no alphabetic tokens yields a NaN scalar instead of a vector;
# such rows cannot be stacked into the feature matrix, so drop them here.
has_vector = dataframe['features'].apply(lambda emb: np.ndim(emb) == 1)
dataframe = dataframe[has_vector]

# scikit-learn expects a 2-D (n_samples, n_features) array, not a Series
# whose elements are arrays.
X = np.vstack(dataframe['features'].tolist())
# OneClassSVM.predict returns +1 for inliers and -1 for outliers, so encode
# the target the same way; comparing against the string label never matches.
y = np.where(dataframe['labels'] == 'football', 1, -1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

pl = Pipeline([
    ('clf', svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)),
])
# OneClassSVM is unsupervised: y_train is accepted for API consistency only.
pl.fit(X_train, y_train)

preds = pl.predict(X_train)
print(" train accuracy: ", accuracy_score(y_train, preds))
preds_test = pl.predict(X_test)
print(" test accuracy: ", accuracy_score(y_test, preds_test))

with open('oneclass_football_emb.pickle', 'wb') as fo:
    pickle.dump(pl, fo)