-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathscore.py
36 lines (27 loc) · 930 Bytes
/
score.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import re
import nltk
from config import *
from nltk.tokenize import wordpunct_tokenize
from nltk.stem.lancaster import LancasterStemmer
# Shared Lancaster stemmer instance; tokenize() uses it to stem each token.
st = LancasterStemmer()
def clean(text):
    """Strip apostrophes and turn remaining punctuation into spaces.

    Apostrophes are deleted outright, so a contraction collapses into a
    single word. Every run of characters that is neither a word character
    nor whitespace is then replaced by one space.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # Drop apostrophes first so they do not split a word in two below.
    without_apostrophes = re.sub(r'[\']', '', text)
    # Collapse each run of punctuation/symbols into a single space.
    normalised = re.sub(r'[^\w\s]+', ' ', without_apostrophes)
    return normalised
def tokenize(text):
    """Return the list of stemmed tokens found in ``text``.

    The text is first passed through clean(), then split with NLTK's
    wordpunct_tokenize, and each resulting token is stemmed with the
    module-level Lancaster stemmer ``st``.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list with one stem per token, in order of appearance.
    """
    stems = []
    for token in wordpunct_tokenize(clean(text)):
        stems.append(st.stem(token))
    return stems
def find_keyword(word, text):
    """Tell whether a keyword occurs as a contiguous run inside ``text``.

    The keyword is run through tokenize() and turned into a tuple; ``text``
    is cut into n-grams of that same length and each one is compared with
    the keyword tuple.

    Args:
        word: The raw keyword string (may tokenize to several tokens).
        text: The sequence to search. Presumably a list of stemmed tokens
            as produced by tokenize() -- that is what score() passes in.

    Returns:
        True if some n-gram of ``text`` equals the tokenized keyword,
        otherwise False.
    """
    needle = tuple(tokenize(word))
    for gram in nltk.ngrams(text, len(needle)):
        if gram == needle:
            return True
    return False
def find_keywords(text):
    """Sum the scores of every configured keyword that occurs in ``text``.

    Iterates over the ``KEYWORDS`` mapping (keyword string -> score), which
    is expected to come from the ``config`` star-import, and adds up the
    score of each keyword for which find_keyword() reports a match.

    Args:
        text: The sequence to search, handed unchanged to find_keyword().

    Returns:
        The total score of all matching keywords; 0 when nothing matches.
    """
    # dict.items() is available on both Python 2 and Python 3, whereas
    # dict.iteritems() no longer exists on Python 3. The loop variable is
    # named ``weight`` so it does not shadow the module-level score().
    return sum(weight for word, weight in KEYWORDS.items()
               if find_keyword(word, text))
def score(title, abstract):
    """Compute the keyword score for a title/abstract pair.

    Both strings are tokenized and scored with find_keywords(). The title
    score is multiplied by 1.5 before the abstract score is added, so a
    keyword hit in the title counts one and a half times as much.

    Args:
        title: The title string.
        abstract: The abstract string.

    Returns:
        The title keyword score times 1.5, plus the abstract keyword score.
    """
    title_tokens = tokenize(title)
    abstract_tokens = tokenize(abstract)
    title_score = find_keywords(title_tokens)
    abstract_score = find_keywords(abstract_tokens)
    weighted_title = title_score * 1.5
    return weighted_title + abstract_score