Skip to content

Commit

Permalink
Add support to search providers (fix holmes-app#79)
Browse files Browse the repository at this point in the history
To wrap up changes:

 * Default provider is just a bare-bones-search-in-mysql-using-like
 * Support ElasticSearch for a faster search
 * Create a console to setup, initialize and clear a search provider
 * Adjust *handlers*, models, tests, worker, etc. to changes
 * Create some config entries
 * Update docs (re holmes-app#19)

Thanks @luizgpsantos for the support 👍
  • Loading branch information
scorphus committed May 12, 2014
1 parent b28cea4 commit ac5ffb7
Show file tree
Hide file tree
Showing 20 changed files with 1,011 additions and 121 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
*.py[cod]

# PIDs
*.pid

# C extensions
*.so

Expand Down
72 changes: 72 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,75 @@ To test:
To test without migrations:

make redis_test unit


ElasticSearch
-------------

Holmes supports ElasticSearch for faster searches on big databases.

### Installing and running

After having ES properly installed and configured, *optionally* run:

```bash
make elasticsearch # to start the ES server as daemon
```

To shut it down later, run:

```bash
make kill_elasticsearch # to kill the ES server daemon
```

### Overriding default configuration (local.conf)

Bear in mind that, for testing purposes, overriding these variables is optional.

#### Optional configurations

To set it as the default search provider:

```conf
SEARCH_PROVIDER = 'holmes.search_providers.elastic.ElasticSearchProvider'
```

If -- and only if -- ES runs on a host and/or port other than localhost:9200, set one of or both the following variables accordingly:

```conf
ELASTIC_SEARCH_HOST = 'HOST' # hostname or IP address
ELASTIC_SEARCH_PORT = PORT # default is 9200
```

Should you need or want to use a different index name, just set it at your own will:

```conf
ELASTIC_SEARCH_INDEX = 'INDEX' # name of the index
```

### Setting up

Prior to running the API, setup the index and optionally index all the active reviews:

```bash
make elasticsearch_setup # to create the index
make elasticsearch_index # to index all active reviews (optional, may take too long)
```

### Testing

Tests **expect** elasticsearch to be **running** on the default port **9200**. The index name is `holmes-test`. So, to test:

```bash
make test # this creates the test index for you
```

or

```bash
make elasticsearch_drop_test # to delete the test index
make elasticsearch_setup_test # to create the test index
make unit # to run unit tests
```

**Happy contributing!**
32 changes: 31 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
test: redis_test drop_test data_test unit integration kill_run
test: elasticsearch redis_test drop_test data_test elasticsearch_drop_test elasticsearch_setup_test unit integration kill_run

unit:
@coverage run --branch `which nosetests` -vv --with-yanc -s tests/unit/
Expand Down Expand Up @@ -55,6 +55,36 @@ data db:
data_test:
@cd tests/ && alembic upgrade head

kill_elasticsearch:
-@pkill -F elasticsearch.pid

elasticsearch: kill_elasticsearch
elasticsearch -d -p elasticsearch.pid

elasticsearch_setup:
@python holmes/search_providers/elastic.py -vv -c ./holmes/config/local.conf --create

elasticsearch_drop:
@python holmes/search_providers/elastic.py -vv -c ./holmes/config/local.conf --delete

elasticsearch_index:
@python holmes/search_providers/elastic.py -vv -c ./holmes/config/local.conf --all-keys

elasticsearch_setup_test:
@python holmes/search_providers/elastic.py -vv -c ./holmes/config/local.conf --create --index holmes-test

elasticsearch_drop_test:
@python holmes/search_providers/elastic.py -vv -c ./holmes/config/local.conf --delete --index holmes-test

search_setup:
@holmes-search -vv -c ./holmes/config/local.conf --create

search_drop:
@holmes-search -vv -c ./holmes/config/local.conf --delete

search_index:
@holmes-search -vv -c ./holmes/config/local.conf --all-keys

migration:
@cd holmes/ && alembic revision -m "$(DESC)"

Expand Down
7 changes: 7 additions & 0 deletions holmes/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ def log(self, message, level=logging.info):
def load_error_handlers(self):
return load_classes(default=self.config.ERROR_HANDLERS)

def load_search_provider(self):
search_provider = load_classes(default=[self.config.SEARCH_PROVIDER])
if isinstance(search_provider, list) and len(search_provider) == 1:
return search_provider.pop()
else:
raise Exception('A search provider must be defined!')

def get_config_class(self):
return Config

Expand Down
6 changes: 6 additions & 0 deletions holmes/config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,12 @@

Config.define('ERROR_HANDLERS', [], 'List of classes to handle errors', 'General')

Config.define('SEARCH_PROVIDER', 'holmes.search_providers.noexternal.NoExternalSearchProvider', 'Class to handle searching', 'Models')

Config.define('ELASTIC_SEARCH_HOST', 'localhost', 'ElasticSearch host', 'ElasticSearchProvider')
Config.define('ELASTIC_SEARCH_PORT', 9200, 'ElasticSearch port', 'ElasticSearchProvider')
Config.define('ELASTIC_SEARCH_INDEX', 'holmes', 'ElasticSearch index name', 'ElasticSearchProvider')

# SENTRY ERROR HANDLER
Config.define('USE_SENTRY', False, 'If set to true errors will be sent to sentry.', 'Sentry')
Config.define('SENTRY_DSN_URL', '', 'URL to use as sentry DSN.', 'Sentry')
Expand Down
34 changes: 11 additions & 23 deletions holmes/handlers/domains.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,33 +123,21 @@ def get(self, domain_name):
self.set_status(404, 'Domain %s not found' % domain_name)
return

reviews = domain.get_active_reviews(
self.db,
url_starts_with=term,
reviews = yield self.application.search_provider.get_domain_active_reviews(
domain=domain,
current_page=current_page,
page_size=page_size
page_size=page_size,
page_filter=term,
)

if term:
review_count = None
else:
review_count = yield self.cache.get_active_review_count(domain)
if 'reviewsCount' not in reviews:
if not term:
reviews['reviewsCount'] = yield self.cache.get_active_review_count(domain)
else:
reviews['reviewsCount'] = None

result = {
'reviewsCount': review_count,
'pages': [],
}

for page in reviews:
result['pages'].append({
"url": page.url,
"uuid": str(page.uuid),
"violationCount": page.violations_count,
"completedAt": page.last_review_date,
"reviewId": str(page.last_review_uuid)
})

self.write_json(result)
self.write_json(reviews)
self.finish()


class DomainGroupedViolationsHandler(BaseHandler):
Expand Down
44 changes: 19 additions & 25 deletions holmes/handlers/violation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from functools import partial

from holmes.handlers import BaseHandler
from holmes.models import Review, Violation
from holmes.models import Review, Violation, Domain


class MostCommonViolationsHandler(BaseHandler):
Expand Down Expand Up @@ -36,44 +36,38 @@ def get(self, key_name):
domain_filter = self.get_argument('domain_filter', None)
page_filter = self.get_argument('page_filter', None)

domain = None
if domain_filter is not None:
domain = Domain.get_domain_by_name(domain_filter, self.db)
if not domain:
self.set_status(404, 'Domain %s not found' % domain_filter)
self.finish()
return

violations = self.application.violation_definitions
if key_name not in violations:
self.set_status(404, 'Invalid violation key %s' % key_name)
self.finish()
return

violation_title = violations[key_name]['title']
key_id = violations[key_name]['key'].id

reviews = Review.get_by_violation_key_name(
self.db,
key_id,
violation = yield self.application.search_provider.get_by_violation_key_name(
key_id=key_id,
current_page=current_page,
page_size=page_size,
domain_filter=domain_filter,
domain=domain,
page_filter=page_filter,
)

if domain_filter or page_filter:
reviews_count = None
else:
reviews_count = Review.count_by_violation_key_name(self.db, key_id)

reviews_data = []
for item in reviews:
reviews_data.append({
'uuid': item.review_uuid,
'page': {
'uuid': item.page_uuid,
'url': item.url,
'completedAt': item.completed_date
}
})
if 'reviewsCount' not in violation:
if not domain and not page_filter:
violation['reviewsCount'] = Review.count_by_violation_key_name(self.db, key_id)
else:
violation['reviewsCount'] = None

violation = {
'title': violation_title,
'reviews': reviews_data,
'reviewsCount': reviews_count
}
violation['title'] = violation_title

self.write_json(violation)
self.finish()
Expand Down
12 changes: 6 additions & 6 deletions holmes/models/domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def get_violations_per_day(self, db):

return result

def get_active_reviews(self, db, url_starts_with=None, current_page=1, page_size=10):
def get_active_reviews(self, db, page_filter=None, current_page=1, page_size=10):
from holmes.models import Page # Prevent circular dependency

lower_bound = (current_page - 1) * page_size
Expand All @@ -121,8 +121,8 @@ def get_active_reviews(self, db, url_starts_with=None, current_page=1, page_size
.filter(Page.last_review_date != None) \
.filter(Page.domain == self)

if url_starts_with:
items_query = items_query.filter(Page.url.like('%s/%s%%' % (self.url, url_starts_with)))
if page_filter:
items_query = items_query.filter(Page.url.like('%s/%s%%' % (self.url, page_filter)))

items = items_query.order_by('violations_count desc')[lower_bound:upper_bound]

Expand All @@ -132,14 +132,14 @@ def get_active_reviews(self, db, url_starts_with=None, current_page=1, page_size
def get_domain_by_name(self, domain_name, db):
return db.query(Domain).filter(Domain.name == domain_name).first()

def get_active_review_count(self, db, url_starts_with=None):
def get_active_review_count(self, db, page_filter=None):
from holmes.models import Review, Page

query = db.query(func.count(distinct(Review.page_id))) # See #111

if url_starts_with:
if page_filter:
query = query.join(Page, Page.id == Review.page_id) \
.filter(Page.url.like('%s%%' % url_starts_with))
.filter(Page.url.like('%s/%s%%' % (self.url, page_filter)))

query = query.filter(Review.is_active == True, Review.domain_id == self.id)

Expand Down
Loading

0 comments on commit ac5ffb7

Please sign in to comment.