diff --git a/.travis.yml b/.travis.yml
index 849ce1fd5..e5fbd98b1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,8 +1,6 @@
-sudo: required
 language: python
 cache: pip
 python:
-  - 3.4
   - 3.5
   - 3.6
   - 3.7
@@ -11,29 +9,25 @@ services:
   - docker
   - mongodb
   - rabbitmq
-  - redis-server
+  - redis
   - mysql
-  #- elasticsearch
+  # - elasticsearch
   - postgresql
 addons:
   postgresql: "9.4"
   apt:
     packages:
     - rabbitmq-server
+env:
+  - IGNORE_COUCHDB=1
 before_install:
-  - echo "deb https://apache.bintray.com/couchdb-deb xenial main" | sudo tee -a /etc/apt/sources.list
-  - curl -L https://couchdb.apache.org/repo/bintray-pubkey.asc | sudo apt-key add -
-  - sudo apt-get update -qq
-  - sudo apt-get install -y couchdb
-  - sudo systemctl start couchdb
   - curl -O https://download.elastic.co/elasticsearch/release/org/elasticsearch/distribution/deb/elasticsearch/2.4.0/elasticsearch-2.4.0.deb && sudo dpkg -i --force-confnew elasticsearch-2.4.0.deb && sudo service elasticsearch restart
   - npm install express puppeteer
   - sudo docker pull scrapinghub/splash
   - sudo docker run -d --net=host scrapinghub/splash
 before_script:
-  - curl -X PUT http://127.0.0.1:5984/_users
-  - curl -X PUT http://127.0.0.1:5984/_replicator
   - psql -c "CREATE DATABASE pyspider_test_taskdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
   - psql -c "CREATE DATABASE pyspider_test_projectdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
   - psql -c "CREATE DATABASE pyspider_test_resultdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
diff --git a/README.md b/README.md
index bfe1aca8f..1dc169585 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ A Powerful Spider(Web Crawler) System in Python.
 
 - Write script in Python
 - Powerful WebUI with script editor, task monitor, project manager and result viewer
-- [MySQL](https://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
+- [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
 - [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue
 - Task priority, retry, periodical, recrawl by age, etc...
 - Distributed architecture, Crawl Javascript pages, Python 2.{6,7}, 3.{3,4,5,6} support, etc...
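Note on the new `IGNORE_COUCHDB=1` Travis variable: with the CouchDB apt install removed from `before_install`, CouchDB-backed tests need a way to opt out on CI. A minimal sketch (illustrative, not part of this diff) of how a test case could honour that flag, following the `IGNORE_MYSQL`/`IGNORE_ALL` guard pattern already used in `tests/test_run.py`; the class and test names here are hypothetical.

```python
import os
import unittest


# Hypothetical test case: skipped entirely when IGNORE_COUCHDB or IGNORE_ALL is set,
# mirroring the existing IGNORE_MYSQL guard in tests/test_run.py.
@unittest.skipIf(os.environ.get('IGNORE_COUCHDB') or os.environ.get('IGNORE_ALL'),
                 'no couchdb server for test.')
class TestCouchDBBackend(unittest.TestCase):

    def test_placeholder(self):
        self.assertTrue(True)


if __name__ == '__main__':
    unittest.main()
```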
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 3d18bc071..983fc566d 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -13,26 +13,15 @@ services:
     networks:
       - pyspider
     command: rabbitmq-server
-  couchdb:
-    image: couchdb:latest
-    container_name: couchdb
+  mysql:
+    image: mysql:latest
+    container_name: mysql
+    volumes:
+      - /tmp:/var/lib/mysql
     environment:
-      - COUCHDB_USER=user
-      - COUCHDB_PASSWORD=password
+      - MYSQL_ALLOW_EMPTY_PASSWORD=yes
     networks:
       - pyspider
-    ports:
-      - "5984:5984"
-  # OR we can replace couchdb with mysql
-  #mysql:
-  #  image: mysql:latest
-  #  container_name: mysql
-  #  volumes:
-  #    - /tmp:/var/lib/mysql
-  #  environment:
-  #    - MYSQL_ALLOW_EMPTY_PASSWORD=yes
-  #  networks:
-  #    - pyspider
   phantomjs:
     image: pyspider:latest
     container_name: phantomjs
diff --git a/pyspider/database/__init__.py b/pyspider/database/__init__.py
index 735ad1a34..04755b904 100644
--- a/pyspider/database/__init__.py
+++ b/pyspider/database/__init__.py
@@ -214,26 +214,8 @@ def _connect_couchdb(parsed, dbtype, url):
     params = {}
 
     # default to env, then url, then hard coded
-    params['username'] = os.environ.get('COUCHDB_USER') or parsed.username or 'user'
-    params['password'] = os.environ.get('COUCHDB_PASSWORD') or parsed.password or 'password'
-
-    # create necessary DBs + the admin user
-    res = requests.put(url + "_users")
-    if 'error' in res and res['error'] == 'unauthorized':
-        # user is already created. This will happen if CouchDB is running in docker
-        # and COUCHDB_USER and COUCHDB_PASSWORD are set
-        from requests.auth import HTTPBasicAuth
-        requests.put(url + "_users",
-                     auth=HTTPBasicAuth(params['username'], params['password']))
-        requests.put(url + "_replicator",
-                     auth=HTTPBasicAuth(params['username'], params['password']))
-        requests.put(url + '_node/_local/_config/admins/' + params['username'],
-                     data=params['password'],
-                     auth=HTTPBasicAuth(params['username'], params['password']))
-    else:
-        requests.put(url + "_replicator")
-        requests.put(url + '_node/_local/_config/admins/' + params['username'],
-                     data=params['password'])
+    params['username'] = os.environ.get('COUCHDB_USER') or parsed.username
+    params['password'] = os.environ.get('COUCHDB_PASSWORD') or parsed.password
 
     if dbtype == 'taskdb':
         from .couchdb.taskdb import TaskDB
diff --git a/pyspider/database/couchdb/couchdbbase.py b/pyspider/database/couchdb/couchdbbase.py
index 797953f7c..13eb7fb57 100644
--- a/pyspider/database/couchdb/couchdbbase.py
+++ b/pyspider/database/couchdb/couchdbbase.py
@@ -4,6 +4,12 @@ class SplitTableMixin(object):
     UPDATE_PROJECTS_TIME = 10 * 60
 
+    def __init__(self):
+        self.session = requests.session()
+        if self.username:
+            self.session.auth = HTTPBasicAuth(self.username, self.password)
+        self.session.headers.update({'Content-Type': 'application/json'})
+
     def _collection_name(self, project):
         if self.collection_prefix:
             return "%s_%s" % (self.collection_prefix, project)
@@ -32,10 +38,7 @@ def _list_project(self):
             prefix = ''
 
         url = self.base_url + "_all_dbs"
-        res = requests.get(url,
-                           data=json.dumps({}),
-                           headers={"Content-Type": "application/json"},
-                           auth=HTTPBasicAuth(self.username, self.password)).json()
+        res = self.session.get(url, json={}).json()
         for each in res:
             if each.startswith('_'):
                 continue
@@ -45,9 +48,7 @@ def _list_project(self):
 
     def create_database(self, name):
         url = self.base_url + name
-        res = requests.put(url,
-                           headers={"Content-Type": "application/json"},
-                           auth=HTTPBasicAuth(self.username, self.password)).json()
+        res = self.session.put(url).json()
         if 'error' in res and res['error'] == 'unauthorized':
             raise Exception("Supplied credentials are incorrect. Reason: {} for User: {} Password: {}".format(res['reason'], self.username, self.password))
         return res
@@ -55,9 +56,7 @@ def create_database(self, name):
 
     def get_doc(self, db_name, doc_id):
         url = self.base_url + db_name + "/" + doc_id
-        res = requests.get(url,
-                           headers={"Content-Type": "application/json"},
-                           auth=HTTPBasicAuth(self.username, self.password)).json()
+        res = self.session.get(url).json()
         if "error" in res and res["error"] == "not_found":
             return None
         return res
@@ -66,10 +65,7 @@ def get_doc(self, db_name, doc_id):
     def get_docs(self, db_name, selector):
         url = self.base_url + db_name + "/_find"
         selector['use_index'] = self.index
-        res = requests.post(url,
-                            data=json.dumps(selector),
-                            headers={"Content-Type": "application/json"},
-                            auth=HTTPBasicAuth(self.username, self.password)).json()
+        res = self.session.post(url, json=selector).json()
         if 'error' in res and res['error'] == 'not_found':
             return []
         return res['docs']
@@ -81,10 +77,7 @@ def get_all_docs(self, db_name):
 
     def insert_doc(self, db_name, doc_id, doc):
         url = self.base_url + db_name + "/" + doc_id
-        return requests.put(url,
-                            data=json.dumps(doc),
-                            headers={"Content-Type": "application/json"},
-                            auth=HTTPBasicAuth(self.username, self.password)).json()
+        return self.session.put(url, json=doc).json()
 
     def update_doc(self, db_name, doc_id, new_doc):
@@ -94,14 +87,9 @@ def update_doc(self, db_name, doc_id, new_doc):
         for key in new_doc:
             doc[key] = new_doc[key]
         url = self.base_url + db_name + "/" + doc_id
-        return requests.put(url,
-                            data=json.dumps(doc),
-                            headers={"Content-Type": "application/json"},
-                            auth=HTTPBasicAuth(self.username, self.password)).json()
+        return self.session.put(url, json=doc).json()
 
     def delete(self, url):
-        return requests.delete(url,
-                               headers={"Content-Type": "application/json"},
-                               auth=HTTPBasicAuth(self.username, self.password)).json()
+        return self.session.delete(url).json()
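The `couchdbbase.py` changes above replace per-call `headers=`/`auth=` keyword arguments with a single shared `requests.Session` configured once in `SplitTableMixin.__init__`. A standalone sketch of that pattern, assuming a CouchDB server at `http://localhost:5984/` and illustrative credentials (both are assumptions, not taken from the PR):

```python
import requests
from requests.auth import HTTPBasicAuth

BASE_URL = 'http://localhost:5984/'      # assumed server location
USERNAME, PASSWORD = 'user', 'password'  # illustrative credentials

session = requests.session()
if USERNAME:
    # Attached once; every request issued through the session reuses the
    # credentials, so call sites no longer pass auth= themselves.
    session.auth = HTTPBasicAuth(USERNAME, PASSWORD)
session.headers.update({'Content-Type': 'application/json'})

# Equivalent of the old requests.get(url, data=json.dumps({}), headers=..., auth=...)
all_dbs = session.get(BASE_URL + '_all_dbs', json={}).json()
print(all_dbs)
```

Besides removing the repetition, the shared session also pools the underlying HTTP connection across the many small CouchDB requests.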
diff --git a/pyspider/database/couchdb/projectdb.py b/pyspider/database/couchdb/projectdb.py
index 05c4fed74..17c1f6ff3 100644
--- a/pyspider/database/couchdb/projectdb.py
+++ b/pyspider/database/couchdb/projectdb.py
@@ -6,17 +6,19 @@ class ProjectDB(BaseProjectDB):
     __collection_name__ = 'projectdb'
 
-    def __init__(self, url, database='projectdb', username='username', password='password'):
+    def __init__(self, url, database='projectdb', username=None, password=None):
         self.username = username
         self.password = password
         self.url = url + self.__collection_name__ + "_" + database + "/"
         self.database = database
-        self.insert('', {})
+
+        self.session = requests.session()
+        if username:
+            self.session.auth = HTTPBasicAuth(self.username, self.password)
+        self.session.headers.update({'Content-Type': 'application/json'})
 
         # Create the db
-        res = requests.put(self.url,
-                           headers={"Content-Type": "application/json"},
-                           auth=HTTPBasicAuth(self.username, self.password)).json()
+        res = self.session.put(self.url).json()
         if 'error' in res and res['error'] == 'unauthorized':
             raise Exception(
                 "Supplied credentials are incorrect. Reason: {} for User: {} Password: {}".format(res['reason'],
@@ -29,9 +31,7 @@ def __init__(self, url, database='projectdb', username='username', password='pas
             },
             'name': self.__collection_name__ + "_" + database
         }
-        res = requests.post(self.url+"_index", data=json.dumps(payload),
-                            headers={"Content-Type": "application/json"},
-                            auth=HTTPBasicAuth(self.username, self.password)).json()
+        res = self.session.post(self.url + "_index", json=payload).json()
         self.index = res['id']
 
     def _default_fields(self, each):
@@ -51,10 +51,7 @@ def insert(self, name, obj={}):
         obj = dict(obj)
         obj['name'] = name
         obj['updatetime'] = time.time()
-        res = requests.put(url,
-                           data = json.dumps(obj),
-                           headers = {"Content-Type": "application/json"},
-                           auth=HTTPBasicAuth(self.username, self.password)).json()
+        res = self.session.put(url, json=obj).json()
         return res
 
     def update(self, name, obj={}, **kwargs):
@@ -78,10 +75,7 @@ def get_all(self, fields=None):
             "use_index": self.index
         }
         url = self.url + "_find"
-        res = requests.post(url,
-                            data=json.dumps(payload),
-                            headers={"Content-Type": "application/json"},
-                            auth=HTTPBasicAuth(self.username, self.password)).json()
+        res = self.session.post(url, json=payload).json()
         for doc in res['docs']:
             yield self._default_fields(doc)
 
@@ -95,10 +89,7 @@ def get(self, name, fields=None):
             "use_index": self.index
         }
         url = self.url + "_find"
-        res = requests.post(url,
-                            data=json.dumps(payload),
-                            headers={"Content-Type": "application/json"},
-                            auth=HTTPBasicAuth(self.username, self.password)).json()
+        res = self.session.post(url, json=payload).json()
         if len(res['docs']) == 0:
             return None
         return self._default_fields(res['docs'][0])
@@ -115,13 +106,7 @@ def drop(self, name):
         doc = self.get(name)
         payload = {"rev": doc["_rev"]}
         url = self.url + name
-        return requests.delete(url,
-                               params=payload,
-                               headers={"Content-Type": "application/json"},
-                               auth=HTTPBasicAuth(self.username, self.password)).json()
+        return self.session.delete(url, params=payload).json()
 
     def drop_database(self):
-        return requests.delete(self.url,
-                               headers={"Content-Type": "application/json"},
-                               auth=HTTPBasicAuth(self.username, self.password)).json()
-
+        return self.session.delete(self.url).json()
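With the hard-coded `'user'`/`'password'` fallback gone from `_connect_couchdb`, credentials are resolved from the environment first, then from the userinfo part of the connection URL, and otherwise left as `None` for an unauthenticated server. A small sketch of that resolution order (the URL and credentials are illustrative):

```python
import os
from urllib.parse import urlparse

# Example connection string; scheme and credentials are illustrative only.
parsed = urlparse('couchdb+projectdb://admin:secret@localhost:5984/')

# Environment wins, then the URL userinfo; there is no hard-coded default anymore.
username = os.environ.get('COUCHDB_USER') or parsed.username
password = os.environ.get('COUCHDB_PASSWORD') or parsed.password

print(username, password)  # -> admin secret, unless COUCHDB_USER/COUCHDB_PASSWORD are set
```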
diff --git a/pyspider/database/couchdb/resultdb.py b/pyspider/database/couchdb/resultdb.py
index 0426143e5..163a6c17b 100644
--- a/pyspider/database/couchdb/resultdb.py
+++ b/pyspider/database/couchdb/resultdb.py
@@ -1,5 +1,4 @@
-import time, json, requests
-from requests.auth import HTTPBasicAuth
+import time, json
 from pyspider.database.base.resultdb import ResultDB as BaseResultDB
 from .couchdbbase import SplitTableMixin
 
@@ -7,13 +6,14 @@ class ResultDB(SplitTableMixin, BaseResultDB):
     collection_prefix = ''
 
-    def __init__(self, url, database='resultdb', username='username', password='password'):
+    def __init__(self, url, database='resultdb', username=None, password=None):
         self.username = username
         self.password = password
-
         self.base_url = url
         self.url = url + database + "/"
         self.database = database
+
+        super().__init__()
         self.create_database(database)
         self.index = None
 
@@ -31,10 +31,7 @@ def _create_project(self, project):
             'name': collection_name
         }
 
-        res = requests.post(self.base_url + collection_name + "/_index",
-                            data=json.dumps(payload),
-                            headers={"Content-Type": "application/json"},
-                            auth=HTTPBasicAuth(self.username, self.password)).json()
+        res = self.session.post(self.base_url + collection_name + "/_index", json=payload).json()
         self.index = res['id']
         self._list_project()
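A note on the `super().__init__()` call added above: `ResultDB` mixes in `SplitTableMixin`, whose new `__init__` is what creates `self.session`, so it has to run after `self.username`/`self.password` are assigned and before `create_database()` sends its first request. A schematic sketch of that ordering (class bodies trimmed and base classes omitted; not the full pyspider code):

```python
import requests
from requests.auth import HTTPBasicAuth


class SplitTableMixin(object):
    def __init__(self):
        # Relies on self.username/self.password already being set by the subclass.
        self.session = requests.session()
        if self.username:
            self.session.auth = HTTPBasicAuth(self.username, self.password)
        self.session.headers.update({'Content-Type': 'application/json'})


class ResultDB(SplitTableMixin):
    def __init__(self, url, database='resultdb', username=None, password=None):
        self.username = username
        self.password = password
        self.base_url = url

        super().__init__()              # creates self.session ...
        self.create_database(database)  # ... which this first request needs

    def create_database(self, name):
        return self.session.put(self.base_url + name).json()
```

Calling `create_database()` before `super().__init__()` would fail with an `AttributeError` on `self.session`, which is also why the `taskdb.py` diff below moves that call after the `super()` line.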
diff --git a/pyspider/database/couchdb/taskdb.py b/pyspider/database/couchdb/taskdb.py
index 6c3008342..9110be82a 100644
--- a/pyspider/database/couchdb/taskdb.py
+++ b/pyspider/database/couchdb/taskdb.py
@@ -1,5 +1,4 @@
-import json, time, requests
-from requests.auth import HTTPBasicAuth
+import json, time
 from pyspider.database.base.taskdb import TaskDB as BaseTaskDB
 from .couchdbbase import SplitTableMixin
 
@@ -7,15 +6,17 @@ class TaskDB(SplitTableMixin, BaseTaskDB):
     collection_prefix = ''
 
-    def __init__(self, url, database='taskdb', username='username', password='password'):
+    def __init__(self, url, database='taskdb', username=None, password=None):
         self.username = username
         self.password = password
         self.base_url = url
         self.url = url + database + "/"
         self.database = database
-        self.create_database(database)
         self.index = None
 
+        super().__init__()
+
+        self.create_database(database)
         self.projects = set()
         self._list_project()
 
@@ -32,10 +33,7 @@ def _create_project(self, project):
             },
             'name': collection_name
         }
-        res = requests.post(self.base_url + collection_name + "/_index",
-                            data=json.dumps(payload),
-                            headers={"Content-Type": "application/json"},
-                            auth=HTTPBasicAuth(self.username, self.password)).json()
+        res = self.session.post(self.base_url + collection_name + "/_index", json=payload).json()
         self.index = res['id']
         self._list_project()
diff --git a/pyspider/libs/utils.py b/pyspider/libs/utils.py
index 1c653b17d..336021a03 100644
--- a/pyspider/libs/utils.py
+++ b/pyspider/libs/utils.py
@@ -432,9 +432,9 @@ def python_console(namespace=None):
 
 
 def check_port_open(port, addr='127.0.0.1'):
-    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    result = sock.connect_ex((addr, port))
-    if result == 0:
-        return True
-    else:
-        return False
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+        result = sock.connect_ex((addr, port))
+        if result == 0:
+            return True
+        else:
+            return False
diff --git a/requirements.txt b/requirements.txt
index b8750cb84..85e030fef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,11 +1,11 @@
 Flask==0.10
 Jinja2==2.7
-chardet==2.2.1
+chardet==3.0.4
 cssselect==0.9
 lxml==4.3.3
 pycurl==7.43.0.3
 pyquery==1.4.0
-requests==2.2
+requests==2.24.0
 tornado==4.5.3
 mysql-connector-python==8.0.16
 pika==1.1.0
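The `check_port_open` rewrite wraps the probe socket in a `with` block, so the descriptor is closed deterministically instead of waiting for garbage collection (socket objects support the context-manager protocol on Python 3, which is all this project now targets). A short usage sketch, with an arbitrary port chosen for illustration:

```python
from pyspider.libs.utils import check_port_open

# Probe a local port; 5984 is just an example (e.g. a CouchDB instance).
if check_port_open(5984):
    print('something is listening on 127.0.0.1:5984')
else:
    print('port 5984 is closed')
```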
diff --git a/setup.py b/setup.py
index e8cb37fd3..2512f4708 100644
--- a/setup.py
+++ b/setup.py
@@ -20,25 +20,21 @@
 install_requires = [
     'Flask==0.10',
     'Jinja2==2.7',
-    'chardet==2.2.1',
+    'chardet==3.0.4',
     'cssselect==0.9',
     "lxml==4.3.3",
     'pycurl==7.43.0.3',
-    'requests==2.2',
+    'requests==2.24.0',
     'Flask-Login==0.2.11',
     'u-msgpack-python==1.6',
     'click==3.3',
     'six==1.10.0',
-    'tblib==1.4.0'
+    'tblib==1.4.0',
+    'wsgidav==2.3.0',
+    'tornado>=3.2,<=4.5.3',
+    'pyquery',
 ]
 
-if sys.version_info >= (3, 0):  # 3.*
-    install_requires.extend([
-        'wsgidav==2.3.0',
-        'tornado>=3.2,<=4.5.3',
-        'pyquery',
-    ])
-
 extras_require_all = [
     'mysql-connector-python==8.0.16',
     'pymongo==3.9.0',
@@ -46,15 +42,11 @@
     'redis-py-cluster==1.3.6',
     'psycopg2==2.8.2',
     'elasticsearch==2.3.0',
+    'kombu==4.4.0',
+    'amqp==2.4.0',
+    'SQLAlchemy==1.3.10',
+    'pika==1.1.0'
 ]
 
-if sys.version_info >= (3, 0):  # 3.*
-    extras_require_all.extend([
-        'kombu==4.4.0',
-        'amqp==2.4.0',
-        'SQLAlchemy==1.3.10',
-        'pika==1.1.0'
-    ])
-
 setup(
     name='pyspider',
@@ -72,9 +64,6 @@
     classifiers=[
         'Development Status :: 4 - Beta',
-        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.3',
-        'Programming Language :: Python :: 3.4',
         'Programming Language :: Python :: 3.5',
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
@@ -100,7 +89,8 @@
         'all': extras_require_all,
         'test': [
             'coverage',
-            'httpbin<=0.5.0',
+            'Werkzeug==0.16.1',
+            'httpbin==0.7.0',
             'pyproxy==0.1.6',
             'easywebdav==1.2.0',
         ]
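With Python 2 support dropped, the `sys.version_info >= (3, 0)` guards in `setup.py` disappear and the packages they protected move into the base lists. A condensed, hypothetical view of the resulting layout (heavily abbreviated; the `setup.py` diff above remains the authoritative source):

```python
# Abbreviated illustration of the flattened dependency lists after this PR.
install_requires = [
    'requests==2.24.0',      # bumped from 2.2
    'chardet==3.0.4',        # bumped from 2.2.1
    'wsgidav==2.3.0',        # previously behind "if sys.version_info >= (3, 0)"
    'tornado>=3.2,<=4.5.3',  # previously behind the same guard
    'pyquery',
    # ... unchanged pins elided ...
]

extras_require_all = [
    'kombu==4.4.0', 'amqp==2.4.0', 'SQLAlchemy==1.3.10', 'pika==1.1.0',
    # ... plus the existing backend drivers ...
]
```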
diff --git a/tests/test_database.py b/tests/test_database.py
index c0c5f3164..f9d563a3b 100644
--- a/tests/test_database.py
+++ b/tests/test_database.py
@@ -697,11 +697,6 @@ class TestCouchDBProjectDB(ProjectDBCase, unittest.TestCase):
     @classmethod
     def setUpClass(self):
         # create a test admin user
-        import requests
-        requests.put('http://localhost:5984/_node/_local/_config/admins/test',
-                     data='"password"')
-        os.environ["COUCHDB_USER"] = "test"
-        os.environ["COUCHDB_PASSWORD"] = "password"
         self.projectdb = database.connect_database(
             'couchdb+projectdb://localhost:5984/'
         )
@@ -710,12 +705,6 @@ def setUpClass(self):
     @classmethod
     def tearDownClass(self):
         # remove the test admin user
-        import requests
-        from requests.auth import HTTPBasicAuth
-        requests.delete('http://localhost:5984/_node/_local/_config/admins/test',
-                        auth=HTTPBasicAuth('test', 'password'))
-        del os.environ["COUCHDB_USER"]
-        del os.environ["COUCHDB_PASSWORD"]
         self.projectdb.drop_database()
 
 
@@ -725,11 +714,6 @@ class TestCouchDBResultDB(ResultDBCase, unittest.TestCase):
     @classmethod
     def setUpClass(self):
         # create a test admin user
-        import requests
-        requests.put('http://localhost:5984/_node/_local/_config/admins/test',
-                     data='"password"')
-        os.environ["COUCHDB_USER"] = "test"
-        os.environ["COUCHDB_PASSWORD"] = "password"
         self.resultdb = database.connect_database(
             'couchdb+resultdb://localhost:5984/'
         )
@@ -738,12 +722,6 @@ def setUpClass(self):
     @classmethod
     def tearDownClass(self):
         # remove the test admin user
-        import requests
-        from requests.auth import HTTPBasicAuth
-        requests.delete('http://localhost:5984/_node/_local/_config/admins/test',
-                        auth=HTTPBasicAuth('test', 'password'))
-        del os.environ["COUCHDB_USER"]
-        del os.environ["COUCHDB_PASSWORD"]
         self.resultdb.drop_database()
 
     def test_create_project(self):
@@ -759,10 +737,6 @@ class TestCouchDBTaskDB(TaskDBCase, unittest.TestCase):
     def setUpClass(self):
         # create a test admin user
         import requests
-        requests.put('http://localhost:5984/_node/_local/_config/admins/test',
-                     data='"password"')
-        os.environ["COUCHDB_USER"] = "test"
-        os.environ["COUCHDB_PASSWORD"] = "password"
         self.taskdb = database.connect_database(
             'couchdb+taskdb://localhost:5984/'
         )
@@ -773,10 +747,6 @@ def tearDownClass(self):
         # remove the test admin user
         import requests
         from requests.auth import HTTPBasicAuth
-        requests.delete('http://localhost:5984/_node/_local/_config/admins/test',
-                        auth=HTTPBasicAuth('test', 'password'))
-        del os.environ["COUCHDB_USER"]
-        del os.environ["COUCHDB_PASSWORD"]
         self.taskdb.drop_database()
 
     def test_create_project(self):
diff --git a/tests/test_run.py b/tests/test_run.py
index 396dc34fa..490844ee4 100644
--- a/tests/test_run.py
+++ b/tests/test_run.py
@@ -156,14 +156,9 @@ def test_60_docker_mongodb(self):
     def test_60a_docker_couchdb(self):
         try:
             # create a test admin user
-            import requests
-            requests.put('http://localhost:5984/_node/_local/_config/admins/test',
-                         data='"password"')
             os.environ['COUCHDB_NAME'] = 'couchdb'
             os.environ['COUCHDB_PORT_5984_TCP_ADDR'] = 'localhost'
             os.environ['COUCHDB_PORT_5984_TCP_PORT'] = '5984'
-            os.environ["COUCHDB_USER"] = "test"
-            os.environ["COUCHDB_PASSWORD"] = "password"
             ctx = run.cli.make_context('test', [], None,
                                        obj=dict(testing_mode=True))
             ctx = run.cli.invoke(ctx)
@@ -172,15 +167,9 @@ def test_60a_docker_couchdb(self):
             self.assertIsNone(e)
         finally:
             # remove the test admin user
-            import requests
-            from requests.auth import HTTPBasicAuth
-            requests.delete('http://localhost:5984/_node/_local/_config/admins/test',
-                            auth=HTTPBasicAuth('test', 'password'))
            del os.environ['COUCHDB_NAME']
             del os.environ['COUCHDB_PORT_5984_TCP_ADDR']
             del os.environ['COUCHDB_PORT_5984_TCP_PORT']
-            del os.environ["COUCHDB_USER"]
-            del os.environ["COUCHDB_PASSWORD"]
 
 @unittest.skip('only available in docker')
 @unittest.skipIf(os.environ.get('IGNORE_MYSQL') or os.environ.get('IGNORE_ALL'), 'no mysql server for test.')
diff --git a/tox.ini b/tox.ini
index dd0526188..506758f08 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py26,py27,py33,py34,py35
+envlist = py35,py36,py37,py38
 [testenv]
 install_command = pip install --allow-all-external 'https://dev.mysql.com/get/Downloads/Connector-Python/mysql-connector-python-2.1.5.zip#md5=ce4a24cb1746c1c8f6189a97087f21c1' {opts} -e .[all,test] {packages}
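Since the test classes above no longer bootstrap a CouchDB admin user or export `COUCHDB_USER`/`COUCHDB_PASSWORD`, the databases are opened with whatever credentials the environment or URL provides. A sketch of the resulting connection flow (assumes a local CouchDB that accepts the request, either unauthenticated or via `COUCHDB_USER`/`COUCHDB_PASSWORD` exported beforehand; the drop at the end only mirrors `tearDownClass`):

```python
from pyspider import database

# Connects exactly as the tests now do: no admin-user setup, credentials come
# from the environment or from userinfo embedded in the URL, if any.
taskdb = database.connect_database('couchdb+taskdb://localhost:5984/')
print(type(taskdb).__name__)

taskdb.drop_database()  # clean up, as tearDownClass does
```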