From ad04e4fdf764af9df8317767fbb74291dd3e6c35 Mon Sep 17 00:00:00 2001 From: Mathis Antony Date: Sun, 14 Oct 2012 00:15:04 +0800 Subject: [PATCH 1/5] Cleaner choose_words function --- utils/score.py | 68 +++++++++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 39 deletions(-) diff --git a/utils/score.py b/utils/score.py index 9d5e87c..287545e 100644 --- a/utils/score.py +++ b/utils/score.py @@ -110,7 +110,7 @@ def score(data): """ Scores based on list of words or one string. """ - if isinstance(data, type('str')): + if isinstance(data, type('')): return score_wordlist_percentile(filter_words(unique_words(data))) elif isinstance(data, type([])): return score_wordlist_percentile(data) @@ -125,57 +125,47 @@ def test_on_textfile(fname): wl = filter_words(unique_words( open(fname, 'r').read())) return score(wl) * words_in_language() - -# some testing functions -#def get_list(a): -# return unique_words( open("../data/m.txt", 'r').read() ) -# -#def get_score(a): -# return test_on_textfile("../data/m.txt") - - -def choose_words(userid, nwords_to_send = 10): +def choose_words(email, nwords_to_send = 10): """ - Choose words for user to learn. + Choose nwords_to_send words for user to learn. If less words are available + only the available words will be sent. + Words chosen will be assumed to be learned by user and are added to the + user vocabulary in the db. User score in db is updated. """ # query database for known words of user - userwords = database.get_list(userid) + userwords = database.get_list(email) + + # create complete dict and remove known words + unknown_words = reference_wordlist.copy() + for w in userwords: + unknown_words.pop(w,0) + + # convert unknown words dict to sorted list + unknown_words = sorted(unknown_words, + key=lambda x: unknown_words.get(x).freq, reverse=True) + + # at best we can send all the unknown words + nwords_to_send = min(nwords_to_send, len(unknown_words)) # query database for user score - userscore = database.get_score(userid) + userscore = database.get_score(email) - target = int(percentile() * userscore * words_in_language()) - - # add a word not yet known to user to wordlist (ugly solution) - def add_word(target, wordlist): - tries = 0 - while tries < 1000: - candidate = int(target * (1.0 + random.random() \ - * (1 - percentile()))) - tries += 1 - if candidate > words_in_language() + 1: - continue - word = sorted_reference_wordlist[candidate] - if word not in wordlist: - return wordlist + [word] - - # can't find unknown words, returning whatever I have - return wordlist + [word] - - wordlist = [] - - for i in range(nwords_to_send): - wordlist = add_word(target, wordlist) + def add_word(): + target = int(percentile() * userscore * len(unknown_words)) + candidate = int(target * (1 + random.random() * (1 - percentile()))) + return unknown_words.pop(candidate) + + wordlist = [add_word() for i in range(nwords_to_send)] - database.store_user_words(userid, wordlist) + database.store_user_words(email, wordlist) newscore = score(wordlist + userwords) - database.set_score(userid, newscore) + database.set_score(email, newscore) return wordlist def score_user(email, text): """ - Score a new user based on text User is assumed to be in database. + Score a new user based on text. User is assumed to be in database. """ wordlist = filter_words(unique_words(text)) userscore = score(wordlist) From 86364dd69cddf1257f82346d929e45a8553d4ab4 Mon Sep 17 00:00:00 2001 From: Mathis Antony Date: Sun, 14 Oct 2012 00:35:10 +0800 Subject: [PATCH 2/5] cleanup --- utils/score.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utils/score.py b/utils/score.py index 287545e..35d93c0 100644 --- a/utils/score.py +++ b/utils/score.py @@ -169,10 +169,9 @@ def score_user(email, text): """ wordlist = filter_words(unique_words(text)) userscore = score(wordlist) - print(wordlist) database.store_user_words(email, wordlist) database.set_score(email, userscore) - + print 'User %s knows %i words' % (email, userscore * words_in_language()) return userscore def get_score(email): From dbb624b86e4c0cf5d1a24701f3f291770d8cdb46 Mon Sep 17 00:00:00 2001 From: Mathis Antony Date: Sun, 14 Oct 2012 00:48:17 +0800 Subject: [PATCH 3/5] fix in case percentile is set to 1 --- utils/score.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/score.py b/utils/score.py index 35d93c0..e8cf699 100644 --- a/utils/score.py +++ b/utils/score.py @@ -14,7 +14,7 @@ sorted_reference_wordlist = [] def percentile(): - return 0.8 + return 0.9 def words_in_language(): return len(reference_wordlist) def wordlist_filename(): @@ -153,6 +153,7 @@ def choose_words(email, nwords_to_send = 10): def add_word(): target = int(percentile() * userscore * len(unknown_words)) candidate = int(target * (1 + random.random() * (1 - percentile()))) + candidate = min(candidate, len(unknown_words) - 1) return unknown_words.pop(candidate) wordlist = [add_word() for i in range(nwords_to_send)] From 181511b1110ee0513fc76980c4e9769b19f5d56d Mon Sep 17 00:00:00 2001 From: Mathis Antony Date: Sun, 14 Oct 2012 00:50:34 +0800 Subject: [PATCH 4/5] to Mart with love --- utils/score.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/score.py b/utils/score.py index e8cf699..59addf4 100644 --- a/utils/score.py +++ b/utils/score.py @@ -25,7 +25,7 @@ class Word: """ Word class """ - def __init__(self, rank,freq): + def __init__(self, rank, freq): self.rank = rank self.freq = freq From 596764553a11d0e001629478f8b871e046fcf707 Mon Sep 17 00:00:00 2001 From: Mathis Antony Date: Sun, 14 Oct 2012 02:22:48 +0800 Subject: [PATCH 5/5] small --- utils/score.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/score.py b/utils/score.py index 59addf4..0d6dc5f 100644 --- a/utils/score.py +++ b/utils/score.py @@ -14,7 +14,7 @@ sorted_reference_wordlist = [] def percentile(): - return 0.9 + return 0.8 def words_in_language(): return len(reference_wordlist) def wordlist_filename():