
Commit

Try with smaller page size
This takes a very long time on the laptop I'm running it on, using CPython.

Things that can speed this up (a rough sketch of the first two follows this list):

- use fewer sessions
- ignore URLs ranked lower than 10
- use PyPy instead of CPython, as recommended by the PyClick README
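A minimal sketch of the first two ideas, not part of this commit. It assumes train_sessions is the list of PyClick search sessions built by map_to_pyclick_format, that each session keeps its results in the web_results attribute of PyClick's SearchSession, and that the sample size is purely illustrative:

import random

MAX_RANK = 10        # drop results ranked lower than 10 before training
SAMPLE_SIZE = 50000  # illustrative cap on the number of training sessions

# Train on a random subset of sessions instead of all of them
sampled_sessions = random.sample(train_sessions, min(SAMPLE_SIZE, len(train_sessions)))

for session in sampled_sessions:
    # Truncate each result page to the top MAX_RANK results
    session.web_results = session.web_results[:MAX_RANK]

# PyClick click models train directly on a list of search sessions
sdbn_click_model.train(sampled_sessions)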
MatMoore committed Jun 10, 2018
1 parent 5ba14f8 commit d137911
Showing 1 changed file with 32 additions and 2 deletions: estimate_with_pyclick.py
@@ -14,11 +14,13 @@
 import logging
 import pyclick.click_models.Evaluation
 from collections import OrderedDict, Counter
+import pandas as pd
+from evaluate_model import ModelTester, QueryDocumentRanker

 # Override constant for the page size: we show 20 results per page
 # so start by evaluating all of these.
 # TODO: Do we get worse results by including the bottom 10 links?
-pyclick.click_models.Evaluation.RANK_MAX = 20
+pyclick.click_models.Evaluation.RANK_MAX = 10

 sdbn_click_model = SDBN()
 dbn_click_model = DBN()
@@ -57,7 +59,7 @@ def map_to_pyclick_format(searches):
        unique_links = search.all_urls
        counter['ok_urls'] += 1

-        for url in unique_links[:20]:
+        for url in unique_links[:10]:
            if url in search.clicked_urls:
                result = SearchResult(url, 1)
            else:
@@ -114,6 +116,15 @@ def evaluate_fit(trained_model, test_sessions, test_queries):
    print("\tperplexity: %f; time: %i secs" % (perp_value, end - start))


+class ModelAdapter:
+    def __init__(self, model):
+        self.model = model
+
+    def relevance(self, query):
+        documents = self.model.params['attr']._container[query].keys()
+        return pd.Series(self.model.predict_relevance(query, document) for document in documents)
+
+
if __name__ == "__main__":
    logging.basicConfig(filename='estimate_with_pyclick.log',level=logging.INFO)

@@ -133,9 +144,28 @@ def evaluate_fit(trained_model, test_sessions, test_queries):
    train_model(sdbn_click_model, train_sessions, train_queries)
    evaluate_fit(sdbn_click_model, test_sessions, test_queries)

+    with open('sdbn_model.json', 'w') as f:
+        f.write(sdbn_click_model.to_json())
+
+    del sdbn_click_model
+
+    # Create a new ranking based on the trained model
+    # ranker = QueryDocumentRanker(ModelAdapter(sdbn_click_model))
+
+    # # How does the new ranker do against the saved-effort metrics?
+    # tester = ModelTester(ranker)
+    # evaluation = tester.evaluate(test)
+
+    # print(f'Median change in rank: {evaluation.change_in_rank.mean()}')
+    # print(f'Median saved clicks: {evaluation.saved_clicks.median()}')
+
    print('DBN')
    train_model(dbn_click_model, train_sessions, train_queries)
    evaluate_fit(dbn_click_model, test_sessions, test_queries)

+    with open('dbn_model.json', 'w') as f:
+        f.write(dbn_click_model.to_json())


+    # TODO: evaluate change in rank metrics
+    # and save the actual rankings to compare to the other implementation
