forked from hritik25/Dynamic-CNN-for-Modelling-Sentences
-
Notifications
You must be signed in to change notification settings - Fork 0
/
vocabulary.py
23 lines (20 loc) · 838 Bytes
/
vocabulary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
from sklearn.feature_extraction.text import CountVectorizer
# Module-level bag-of-words vectorizer; generateVocab() below fits it in place.
vectorizer = CountVectorizer(
    input=u'content',
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    # A token is either a single comma or a run of word characters, so commas
    # and one-character words are kept as vocabulary entries.
    # Written as a plain raw string: the old `ur` prefix is a syntax error on
    # Python 3, while `r'...'` yields the same pattern on Python 2 and 3.
    token_pattern=r',|\b\w+\b',
    # Cap on vocabulary size (most frequent tokens are kept).
    # NOTE(review): 15448 is presumably tied to the target dataset — confirm.
    max_features=15448,
)
import pandas as pd
def generateVocab(filename):
    """
    Fit the module-level ``vectorizer`` on a text corpus and return its vocabulary.

    Args:
        filename: Name of the file which contains the text data. It is a
            tab-separated file read with ``pd.read_csv``; the sentences are
            under the column header 'sentence'.

    Returns:
        The ``vocabulary_`` attribute of the fitted vectorizer (in
        scikit-learn, a mapping from each token to its feature index).

    Raises:
        KeyError: If the file has no 'sentence' column.

    Note:
        ``fit`` is called on the shared module-level ``vectorizer`` on every
        call, so the vectorizer always reflects the most recent file.
    """
    df = pd.read_csv(filename, sep='\t')
    # Materialise the column as a plain list of documents for fitting.
    sentences = list(df['sentence'])
    vectorizer.fit(sentences)
    return vectorizer.vocabulary_