From b12959d00d68668456f86cc43afe631be5f2b2b4 Mon Sep 17 00:00:00 2001
From: Madison Bahmer
Date: Fri, 19 Jan 2018 09:32:59 -0500
Subject: [PATCH 1/3] initial commit for 1.2.1

Fixes unit tests as seen in dev, fixes docker login command
---
 .travis.yml                     | 4 ++--
 README.md                       | 4 ++--
 crawler/tests/online.py         | 2 +-
 docker-compose.yml              | 8 ++++----
 docs/conf.py                    | 4 ++--
 docs/topics/advanced/docker.rst | 2 +-
 docs/topics/changelog.rst       | 9 +++++++++
 elk/docker-compose.elk.yml      | 8 ++++----
 travis/docker.sh                | 2 +-
 9 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index eff258c1..8cfc27ef 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -16,10 +16,10 @@ env:
     run_opts: ""
   - docker: 1
     dockerfile_name: Dockerfile
-    docker_tag_suffix: 1.2
+    docker_tag_suffix: 1.2.1
   - docker: 1
     dockerfile_name: Dockerfile.py2alpine
-    docker_tag_suffix: 1.2-alpine
+    docker_tag_suffix: 1.2.1-alpine
 
 install: true
diff --git a/README.md b/README.md
index 3cd60b1a..bf77043a 100644
--- a/README.md
+++ b/README.md
@@ -51,10 +51,10 @@ To set up a pre-canned Scrapy Cluster test environment, make sure you have the l
 
 ## Documentation
 
-Please check out the official [Scrapy Cluster 1.2 documentation](http://scrapy-cluster.readthedocs.org/en/latest/) for more information on how everything works!
+Please check out the official [Scrapy Cluster 1.2.1 documentation](http://scrapy-cluster.readthedocs.org/en/latest/) for more information on how everything works!
 
 ## Branches
 
-The `master` branch of this repository contains the latest stable release code for `Scrapy Cluster 1.2`.
+The `master` branch of this repository contains the latest stable release code for `Scrapy Cluster 1.2.1`.
 
 The `dev` branch contains bleeding edge code and is currently working towards [Scrapy Cluster 1.3](https://github.com/istresearch/scrapy-cluster/milestone/3). Please note that not everything may be documented, finished, tested, or finalized but we are happy to help guide those who are interested.
diff --git a/crawler/tests/online.py b/crawler/tests/online.py
index be3e9d03..d9287cd4 100644
--- a/crawler/tests/online.py
+++ b/crawler/tests/online.py
@@ -38,7 +38,7 @@ class TestLinkSpider(TestCase):
         "crawlid\":\"abc12345\",\"url\":\"istresearch.com\",\"expires\":0,\""\
         "ts\":1461549923.7956631184,\"priority\":1,\"deny_regex\":null,\""\
         "cookie\":null,\"attrs\":null,\"appid\":\"test\",\"spiderid\":\""\
-        "link\",\"useragent\":null,\"deny_extensions\":null,\"maxdepth\":0}"
+        "test-link\",\"useragent\":null,\"deny_extensions\":null,\"maxdepth\":0}"
 
     def setUp(self):
         self.settings = get_project_settings()
diff --git a/docker-compose.yml b/docker-compose.yml
index 261cc0be..b6855b16 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,27 +2,27 @@ version: '2'
 
 services:
   kafka_monitor:
-    image: istresearch/scrapy-cluster:kafka-monitor-1.2
+    image: istresearch/scrapy-cluster:kafka-monitor-1.2.1
     depends_on:
       - kafka
       - redis
     restart: always
   redis_monitor:
-    image: istresearch/scrapy-cluster:redis-monitor-1.2
+    image: istresearch/scrapy-cluster:redis-monitor-1.2.1
     depends_on:
       - kafka
       - redis
       - zookeeper
     restart: always
   crawler:
-    image: istresearch/scrapy-cluster:crawler-1.2
+    image: istresearch/scrapy-cluster:crawler-1.2.1
     depends_on:
       - kafka
       - redis
       - zookeeper
     restart: always
   rest:
-    image: istresearch/scrapy-cluster:rest-1.2
+    image: istresearch/scrapy-cluster:rest-1.2.1
     depends_on:
       - kafka
       - redis
diff --git a/docs/conf.py b/docs/conf.py
index d1c4f56b..6ecd90b3 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -56,9 +56,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '1.2'
+version = '1.2.1'
 # The full version, including alpha/beta/rc tags.
-release = '1.2'
+release = '1.2.1'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/topics/advanced/docker.rst b/docs/topics/advanced/docker.rst
index fca01884..ed9ca364 100644
--- a/docs/topics/advanced/docker.rst
+++ b/docs/topics/advanced/docker.rst
@@ -77,7 +77,7 @@ It is recommended you use docker compose to orchestrate your cluster with all of
 
 ::
 
-  image: istresearch/scrapy-cluster:kafka-monitor-1.2
+  image: istresearch/scrapy-cluster:kafka-monitor-1.2.1
   build:
     context: .
     dockerfile: docker/kafka-monitor/Dockerfile
diff --git a/docs/topics/changelog.rst b/docs/topics/changelog.rst
index 61cc2d74..d1dcbec4 100644
--- a/docs/topics/changelog.rst
+++ b/docs/topics/changelog.rst
@@ -5,6 +5,15 @@ Change Log
 
 This page serves to document any changes made between releases.
 
+Scrapy Cluster 1.2.1
+--------------------
+
+Date: 01/19/2018
+
+- Fixes unit test syntax for link spider
+
+- Fixes docker version upgrade on Travis for continuous integration tests
+
 Scrapy Cluster 1.2
 ------------------
diff --git a/elk/docker-compose.elk.yml b/elk/docker-compose.elk.yml
index 1badac39..53a74786 100644
--- a/elk/docker-compose.elk.yml
+++ b/elk/docker-compose.elk.yml
@@ -5,7 +5,7 @@ version: '2'
 
 services:
   kafka_monitor:
-    image: istresearch/scrapy-cluster:kafka-monitor-1.2
+    image: istresearch/scrapy-cluster:kafka-monitor-1.2.1
     volumes:
       - logs:/usr/src/app/logs
     environment:
@@ -16,7 +16,7 @@ services:
       - redis
     restart: always
   redis_monitor:
-    image: istresearch/scrapy-cluster:redis-monitor-1.2
+    image: istresearch/scrapy-cluster:redis-monitor-1.2.1
     volumes:
       - logs:/usr/src/app/logs
     environment:
@@ -28,7 +28,7 @@ services:
       - zookeeper
     restart: always
   crawler:
-    image: istresearch/scrapy-cluster:crawler-1.2
+    image: istresearch/scrapy-cluster:crawler-1.2.1
     volumes:
       - logs:/usr/src/app/logs
     environment:
@@ -40,7 +40,7 @@ services:
       - zookeeper
     restart: always
   rest:
-    image: istresearch/scrapy-cluster:rest-1.2
+    image: istresearch/scrapy-cluster:rest-1.2.1
     volumes:
       - logs:/usr/src/app/logs
     depends_on:
diff --git a/travis/docker.sh b/travis/docker.sh
index 38b419e0..ac5ae839 100755
--- a/travis/docker.sh
+++ b/travis/docker.sh
@@ -39,7 +39,7 @@ if [ "$TRAVIS_BRANCH" = "master" ] && [ "$TRAVIS_PULL_REQUEST" = "false" ] && [
   sudo docker rmi istresearch/scrapy-cluster:rest-test
 
   # log into docker
-  sudo docker login -e="$DOCKER_EMAIL" -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD"
+  sudo docker login -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD"
 
   # push new containers
   sudo docker push istresearch/scrapy-cluster

From 506429504b7bbc51fee6a59546b6229ac3a33c3b Mon Sep 17 00:00:00 2001
From: Madison Bahmer
Date: Fri, 19 Jan 2018 09:58:42 -0500
Subject: [PATCH 2/3] change istresearch.com to something more stable

Hopefully this fixes the crawler integration tests which have been failing,
tested locally and works
---
 crawler/tests/online.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crawler/tests/online.py b/crawler/tests/online.py
index d9287cd4..89b48a3a 100644
--- a/crawler/tests/online.py
+++ b/crawler/tests/online.py
@@ -35,7 +35,7 @@ class CustomSpider(LinkSpider):
 
 class TestLinkSpider(TestCase):
     example_feed = "{\"allowed_domains\":null,\"allow_regex\":null,\""\
-        "crawlid\":\"abc12345\",\"url\":\"istresearch.com\",\"expires\":0,\""\
+        "crawlid\":\"abc12345\",\"url\":\"http://dmoztools.net/\",\"expires\":0,\""\
         "ts\":1461549923.7956631184,\"priority\":1,\"deny_regex\":null,\""\
         "cookie\":null,\"attrs\":null,\"appid\":\"test\",\"spiderid\":\""\
         "test-link\",\"useragent\":null,\"deny_extensions\":null,\"maxdepth\":0}"
@@ -75,7 +75,7 @@ def test_crawler_process(self):
         d = runner.crawl(CustomSpider)
         d.addBoth(lambda _: reactor.stop())
         # add crawl to redis
-        key = "test-spider:istresearch.com:queue"
+        key = "test-spider:dmoztools.net:queue"
         self.redis_conn.zadd(key, self.example_feed, -99)
 
         # run the spider, give 20 seconds to see the url, crawl it,

From 5db560b32f8a966e9dbf218abe17af1da751cfe8 Mon Sep 17 00:00:00 2001
From: Madison Bahmer
Date: Fri, 19 Jan 2018 10:11:52 -0500
Subject: [PATCH 3/3] documentation updates for new url

URL changed from istresearch.com to dmoztools.net for better stability
---
 docs/topics/advanced/rediskeys.rst       |  2 +-
 docs/topics/introduction/quickstart.rst  |  6 +++---
 docs/topics/kafka-monitor/quickstart.rst | 10 +++++-----
 docs/topics/rest/api.rst                 |  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/topics/advanced/rediskeys.rst b/docs/topics/advanced/rediskeys.rst
index 77f74fa6..e5ca5d0a 100644
--- a/docs/topics/advanced/rediskeys.rst
+++ b/docs/topics/advanced/rediskeys.rst
@@ -69,6 +69,6 @@ If you run the integration tests, there may be temporary Redis keys created that
 
 - **cluster:test** - Used when testing the Kafka Monitor can act and set a key in Redis
 
-- **test-spider:istresearch.com:queue** - Used when testing the crawler installation can interact with Redis and Kafka
+- **test-spider:dmoztools.net:queue** - Used when testing the crawler installation can interact with Redis and Kafka
 
 - **stats:crawler::test-spider:** - Automatically created and destroyed during crawler testing by the stats collection mechanism settings.
diff --git a/docs/topics/introduction/quickstart.rst b/docs/topics/introduction/quickstart.rst
index cf63c8ea..652e4882 100644
--- a/docs/topics/introduction/quickstart.rst
+++ b/docs/topics/introduction/quickstart.rst
@@ -431,7 +431,7 @@ Which ever setup you chose, every process within should stay running for the rem
 
 ::
 
-    python kafka_monitor.py feed '{"url": "http://istresearch.com", "appid":"testapp", "crawlid":"abc123"}'
+    python kafka_monitor.py feed '{"url": "http://dmoztools.net", "appid":"testapp", "crawlid":"abc123"}'
 
 You will see the following output on the command line for that successful request:
 
@@ -439,7 +439,7 @@ You will see the following output on the command line for that successful reques
 2015-12-22 15:45:37,457 [kafka-monitor] INFO: Feeding JSON into demo.incoming
 {
-    "url": "http://istresearch.com",
+    "url": "http://dmoztools.net",
     "crawlid": "abc123",
     "appid": "testapp"
 }
@@ -460,7 +460,7 @@ Crawl Request:
 
 ::
 
-    python kafka_monitor.py feed '{"url": "http://dmoz.org", "appid":"testapp", "crawlid":"abc1234", "maxdepth":1}'
+    python kafka_monitor.py feed '{"url": "http://dmoztools.net", "appid":"testapp", "crawlid":"abc1234", "maxdepth":1}'
 
 Now send an ``info`` action request to see what is going on with the crawl:
diff --git a/docs/topics/kafka-monitor/quickstart.rst b/docs/topics/kafka-monitor/quickstart.rst
index 1392b60c..9708bf83 100644
--- a/docs/topics/kafka-monitor/quickstart.rst
+++ b/docs/topics/kafka-monitor/quickstart.rst
@@ -33,7 +33,7 @@ JSON Object feeder into your desired Kafka Topic. This takes a valid JSON object
 
 ::
 
-    $ python kafka_monitor.py feed '{"url": "http://istresearch.com", "appid":"testapp", "crawlid":"ABC123"}'
+    $ python kafka_monitor.py feed '{"url": "http://dmoztools.net", "appid":"testapp", "crawlid":"ABC123"}'
 
 The command line feed is very slow and should not be used in production. Instead,
 you should write your own continuously running application to feed Kafka the
 desired API requests that you require.
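The quickstart text above recommends writing your own continuously running application instead of the command-line feeder. A minimal sketch of such a feeder in Python, using the third-party kafka-python package; the broker address localhost:9092 is an assumption, while the demo.incoming topic and the request fields mirror the feed examples quoted above:

::

    # Minimal feeder sketch, assuming kafka-python is installed and a
    # broker is reachable at localhost:9092 (adjust for your cluster).
    import json

    from kafka import KafkaProducer

    producer = KafkaProducer(
        bootstrap_servers='localhost:9092',
        value_serializer=lambda obj: json.dumps(obj).encode('utf-8'),
    )

    def feed(url, appid, crawlid):
        # Same JSON structure the kafka_monitor.py feed examples send
        producer.send('demo.incoming', {
            'url': url,
            'appid': appid,
            'crawlid': crawlid,
        })

    feed('http://dmoztools.net', 'testapp', 'abc123')
    producer.flush()  # block until the request has been delivered

In a real deployment the loop would read crawl requests from your own application's queue or database rather than hard-coded values.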
@@ -89,10 +89,10 @@ Feed an item
 
 ::
 
-    $ python kafka_monitor.py feed '{"url": "http://istresearch.com", "appid":"testapp", "crawlid":"ABC123"}'
+    $ python kafka_monitor.py feed '{"url": "http://dmoztools.net", "appid":"testapp", "crawlid":"ABC123"}'
     2016-01-05 15:14:44,829 [kafka-monitor] INFO: Feeding JSON into demo.incoming
     {
-        "url": "http://istresearch.com",
+        "url": "http://dmoztools.net",
         "crawlid": "ABC123",
         "appid": "testapp"
     }
@@ -116,8 +116,8 @@ If you have a :ref:`Crawler <crawler>` running, you should see the html come thr
         "response_headers": {
 
         },
-        "response_url": "http://istresearch.com",
-        "url": "http://istresearch.com",
+        "response_url": "http://dmoztools.net",
+        "url": "http://dmoztools.net",
         "status_code": 200,
         "status_msg": "OK",
         "appid": "testapp",
diff --git a/docs/topics/rest/api.rst b/docs/topics/rest/api.rst
index cb2aab4b..524b17a6 100644
--- a/docs/topics/rest/api.rst
+++ b/docs/topics/rest/api.rst
@@ -156,7 +156,7 @@ Feed a crawl request
 
 ::
 
-    $ curl scdev:5343/feed -H "Content-Type: application/json" -d '{"url":"istresearch.com", "appid":"madisonTest", "crawlid":"abc123"}'
+    $ curl scdev:5343/feed -H "Content-Type: application/json" -d '{"url":"http://dmoztools.net", "appid":"madisonTest", "crawlid":"abc123"}'
 
 Feed a Stats request
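The feed request in the curl example above can also be issued from code. A small sketch using the Python requests package, assuming the same scdev:5343 endpoint; the hostname, port, and payload fields come from the documentation above, everything else is illustrative:

::

    # Sketch of the /feed call with the requests library; assumes the
    # rest service is reachable at scdev:5343 as in the curl example.
    import requests

    payload = {
        'url': 'http://dmoztools.net',
        'appid': 'madisonTest',
        'crawlid': 'abc123',
    }
    # json= sets the Content-Type: application/json header, matching
    # the -H flag in the curl command above
    resp = requests.post('http://scdev:5343/feed', json=payload)
    print(resp.status_code, resp.text)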