-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtfidflib.py
41 lines (31 loc) · 1.1 KB
/
tfidflib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# has some tfidf functions
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
def getscores(docs):
    """Compute per-document TF-IDF scores for every token in *docs*.

    Parameters
    ----------
    docs : list of str
        Raw text documents.

    Returns
    -------
    list of dict
        One dict per input document, mapping each token (as produced by
        ``nltk.word_tokenize``) to its tf-idf score. Tokens whose lowercase
        form is not in the fitted vocabulary are omitted.
    """
    vectorizer = CountVectorizer(min_df=1)
    transformer = TfidfTransformer(smooth_idf=False)
    # Count term occurrences, then weight by inverse document frequency.
    # The sparse count matrix can be passed directly to the transformer;
    # no dense conversion needed here.
    counts = vectorizer.fit_transform(docs)
    tfidf_lookup = transformer.fit_transform(counts).toarray()
    # Map each vocabulary word to its column index once, so token lookups
    # below are O(1) instead of the O(n) list.index() per token.
    # NOTE(review): get_feature_names() was removed in sklearn >= 1.2;
    # switch to get_feature_names_out() when upgrading.
    word_to_col = {w: i for i, w in enumerate(vectorizer.get_feature_names())}
    scores = list()
    for i in range(len(docs)):
        docscores = dict()
        for word in nltk.word_tokenize(docs[i]):
            # CountVectorizer lowercases its vocabulary, so normalize the
            # token before lookup. (The original tested the raw token for
            # membership, silently dropping every capitalized word.)
            col = word_to_col.get(word.lower())
            if col is not None:
                docscores[word] = tfidf_lookup[i][col]
        scores.append(docscores)
    return scores