diff --git a/docs/Deployment-demo.pyspider.org.md b/docs/Deployment-demo.pyspider.org.md
index 325e6d801..e01c77199 100644
--- a/docs/Deployment-demo.pyspider.org.md
+++ b/docs/Deployment-demo.pyspider.org.md
@@ -112,7 +112,7 @@ With the config, you can change the scale by `docker-compose scale phantomjs=2 p

 #### load balance

-phantomjs-lb, fetcher-lb, webui-lb are automaticlly configed haproxy, allow any number of upstreams.
+phantomjs-lb, fetcher-lb, webui-lb are automatically configured haproxy instances, allowing any number of upstreams.

 #### phantomjs

@@ -120,7 +120,7 @@ phantomjs have memory leak issue, memory limit applied, and it's recommended to

 #### fetcher

-fetcher is implemented with aync IO, it supportes 100 concurrent connections. If the upstream queue are not choked, one fetcher should be enough.
+fetcher is implemented with async IO and supports 100 concurrent connections. If the upstream queue is not choked, one fetcher should be enough.

 #### processor

diff --git a/docs/Frequently-Asked-Questions.md b/docs/Frequently-Asked-Questions.md
index 962d4e47d..ab87fc838 100644
--- a/docs/Frequently-Asked-Questions.md
+++ b/docs/Frequently-Asked-Questions.md
@@ -56,4 +56,4 @@ You can have only have one scheduler, and multiple fetcher/processor/result_work

 For example, the number between scheduler and fetcher indicate the queue size of scheduler to fetchers, when it's hitting 100 (default maximum queue size), fetcher might crashed, or you should considered adding more fetchers.

-The number `0+0` below fetcher indicate the queue size of new tasks and status packs between processors and schduler. You can put your mouse over the numbers to see the tips.
\ No newline at end of file
+The number `0+0` below fetcher indicates the queue size of new tasks and status packs between processors and scheduler. Hover your mouse over the numbers to see the tips.
\ No newline at end of file
diff --git a/docs/Quickstart.md b/docs/Quickstart.md
index 7bda9af42..c3ba7bc46 100644
--- a/docs/Quickstart.md
+++ b/docs/Quickstart.md
@@ -18,7 +18,7 @@ to install binary packages first.

 please install PhantomJS if needed: http://phantomjs.org/build.html

-note that PhantomJS will be enabled only if it is excutable in the `PATH` or in the System Environment
+note that PhantomJS will be enabled only if it is executable in the `PATH` or in the System Environment

 **Note:** `pyspider` command is running pyspider in `all` mode, which running components in threads or subprocesses. For production environment, please refer to [Deployment](Deployment).

diff --git a/docs/tutorial/Render-with-PhantomJS.md b/docs/tutorial/Render-with-PhantomJS.md
index cc0ad331f..15cf5e238 100644
--- a/docs/tutorial/Render-with-PhantomJS.md
+++ b/docs/tutorial/Render-with-PhantomJS.md
@@ -3,7 +3,7 @@ Level 3: Render with PhantomJS

 Sometimes web page is too complex to find out the API request. It's time to meet the power of [PhantomJS].

-To use PhantomJS, you should have PhantomJS [installed](http://phantomjs.org/download.html). If you are running pyspider with `all` mode, PhantomJS is enabled if excutable in the `PATH`.
+To use PhantomJS, you should have PhantomJS [installed](http://phantomjs.org/download.html). If you are running pyspider with `all` mode, PhantomJS is enabled if executable in the `PATH`.

 Make sure phantomjs is working by running
 ```
@@ -43,7 +43,7 @@ Running JavaScript on Page

 We will try to scrape images from [http://www.pinterest.com/categories/popular/](http://www.pinterest.com/categories/popular/) in this section.
 Only 25 images is shown at the beginning, more images would be loaded when you scroll to the bottom of the page.

-To scrape images as many as posible we can use a [`js_script` parameter](/apis/self.crawl/#enable-javascript-fetcher-need-support-by-fetcher) to set some function wrapped JavaScript codes to simulate the scroll action:
+To scrape as many images as possible, we can use a [`js_script` parameter](/apis/self.crawl/#enable-javascript-fetcher-need-support-by-fetcher) to set some function-wrapped JavaScript code to simulate the scroll action:

 ```
 class Handler(BaseHandler):
diff --git a/pyspider/database/__init__.py b/pyspider/database/__init__.py
index 04755b904..0b8c35af9 100644
--- a/pyspider/database/__init__.py
+++ b/pyspider/database/__init__.py
@@ -185,7 +185,7 @@ def _connect_sqlalchemy(parsed, dbtype,url, other_scheme):


 def _connect_elasticsearch(parsed, dbtype):
-    # in python 2.6 url like "http://host/?query", query will not been splitted
+    # in python 2.6, for a url like "http://host/?query", the query will not be split
     if parsed.path.startswith('/?'):
         index = parse_qs(parsed.path[2:])
     else:
diff --git a/pyspider/database/basedb.py b/pyspider/database/basedb.py
index ca71d6d2c..7a99d2c43 100644
--- a/pyspider/database/basedb.py
+++ b/pyspider/database/basedb.py
@@ -19,7 +19,7 @@ class BaseDB:
     '''
     BaseDB

-    dbcur should be overwirte
+    dbcur should be overwritten
     '''
     __tablename__ = None
     placeholder = '%s'
diff --git a/pyspider/fetcher/tornado_fetcher.py b/pyspider/fetcher/tornado_fetcher.py
index d64169351..3583767b8 100644
--- a/pyspider/fetcher/tornado_fetcher.py
+++ b/pyspider/fetcher/tornado_fetcher.py
@@ -265,7 +265,7 @@ def pack_tornado_request_parameters(self, url, task):
                 _t = track_headers.get('etag')
             if _t and 'If-None-Match' not in fetch['headers']:
                 fetch['headers']['If-None-Match'] = _t
-        # last modifed
+        # last modified
         if task_fetch.get('last_modified', task_fetch.get('last_modifed', True)):
             last_modified = task_fetch.get('last_modified', task_fetch.get('last_modifed', True))
             _t = None
diff --git a/pyspider/libs/base_handler.py b/pyspider/libs/base_handler.py
index d2ebe9584..f56a1243f 100644
--- a/pyspider/libs/base_handler.py
+++ b/pyspider/libs/base_handler.py
@@ -440,7 +440,7 @@ def _on_cronjob(self, response, task):

         # When triggered, a '_on_cronjob' task is sent from scheudler with 'tick' in
         # Response.save. Scheduler may at least send the trigger task every GCD of the
-        # inverval of the cronjobs. The method should check the tick for each cronjob
+        # interval of the cronjobs. The method should check the tick for each cronjob
         # function to confirm the execute interval.
         for cronjob in self._cron_jobs:
             if response.save['tick'] % cronjob.tick != 0:
@@ -449,7 +449,7 @@ def _on_cronjob(self, response, task):
                 continue
             self._run_func(function, response, task)

     def _on_get_info(self, response, task):
-        """Sending runtime infomation about this script."""
+        """Sending runtime information about this script."""
         for each in response.save or []:
             if each == 'min_tick':
                 self.save[each] = self._min_tick
diff --git a/pyspider/libs/response.py b/pyspider/libs/response.py
index 8975781b2..ee89ffb04 100644
--- a/pyspider/libs/response.py
+++ b/pyspider/libs/response.py
@@ -156,7 +156,7 @@ def etree(self):
             except LookupError:
                 # lxml would raise LookupError when encoding not supported
                 # try fromstring without encoding instead.
-                # on windows, unicode is not availabe as encoding for lxml
+                # on windows, unicode is not available as encoding for lxml
                 self._elements = lxml.html.fromstring(self.content)
         if isinstance(self._elements, lxml.etree._ElementTree):
             self._elements = self._elements.getroot()
diff --git a/pyspider/message_queue/rabbitmq.py b/pyspider/message_queue/rabbitmq.py
index 9e4e72595..f5323d2e0 100644
--- a/pyspider/message_queue/rabbitmq.py
+++ b/pyspider/message_queue/rabbitmq.py
@@ -68,11 +68,11 @@ def __init__(self, name, amqp_url='amqp://guest:guest@localhost:5672/%2F',
         amqp_url:   https://www.rabbitmq.com/uri-spec.html
         maxsize:    an integer that sets the upperbound limit on the number of
                     items that can be placed in the queue.
-        lazy_limit: as rabbitmq is shared between multipul instance, for a strict
+        lazy_limit: as rabbitmq is shared between multiple instances, for a strict
                     limit on the number of items in the queue. PikaQueue have to
                     update current queue size before every put operation. When
                     `lazy_limit` is enabled, PikaQueue will check queue size every
-                    max_size / 10 put operation for better performace.
+                    max_size / 10 put operations for better performance.
         """
         self.name = name
         self.amqp_url = amqp_url
@@ -201,11 +201,11 @@ def __init__(self, name, amqp_url='amqp://guest:guest@localhost:5672/%2F',
         amqp_url:   https://www.rabbitmq.com/uri-spec.html
         maxsize:    an integer that sets the upperbound limit on the number of
                     items that can be placed in the queue.
-        lazy_limit: as rabbitmq is shared between multipul instance, for a strict
+        lazy_limit: as rabbitmq is shared between multiple instances, for a strict
                     limit on the number of items in the queue. PikaQueue have to
                     update current queue size before every put operation. When
                     `lazy_limit` is enabled, PikaQueue will check queue size every
-                    max_size / 10 put operation for better performace.
+                    max_size / 10 put operations for better performance.
""" self.name = name self.amqp_url = amqp_url diff --git a/pyspider/processor/processor.py b/pyspider/processor/processor.py index ae0de1f46..e72cb8e69 100644 --- a/pyspider/processor/processor.py +++ b/pyspider/processor/processor.py @@ -21,7 +21,7 @@ class ProcessorResult(object): - """The result and logs producted by a callback""" + """The result and logs produced by a callback""" def __init__(self, result=None, follows=(), messages=(), logs=(), exception=None, extinfo=None, save=None): @@ -45,7 +45,7 @@ def logstr(self): """handler the log records to formatted string""" result = [] - formater = LogFormatter(color=False) + formatter = LogFormatter(color=False) for record in self.logs: if isinstance(record, six.string_types): result.append(pretty_unicode(record)) @@ -54,7 +54,7 @@ def logstr(self): a, b, tb = record.exc_info tb = hide_me(tb, globals()) record.exc_info = a, b, tb - result.append(pretty_unicode(formater.format(record))) + result.append(pretty_unicode(formatter.format(record))) result.append(u'\n') return u''.join(result) diff --git a/pyspider/result/result_worker.py b/pyspider/result/result_worker.py index 16935fa18..cfea3094e 100644 --- a/pyspider/result/result_worker.py +++ b/pyspider/result/result_worker.py @@ -38,7 +38,7 @@ def on_result(self, task, result): result=result ) else: - logger.warning('result UNKNOW -> %.30r' % result) + logger.warning('result UNKNOWN -> %.30r' % result) return def quit(self): @@ -83,5 +83,5 @@ def on_result(self, task, result): 'updatetime': time.time() })) else: - logger.warning('result UNKNOW -> %.30r' % result) + logger.warning('result UNKNOWN -> %.30r' % result) return diff --git a/pyspider/scheduler/scheduler.py b/pyspider/scheduler/scheduler.py index 084baff28..039eefb53 100644 --- a/pyspider/scheduler/scheduler.py +++ b/pyspider/scheduler/scheduler.py @@ -245,7 +245,7 @@ def _update_project(self, project): }, }) - # load task queue when project is running and delete task_queue when project is stoped + # load task queue when project is running and delete task_queue when project is stopped if project.active: if not project.task_loaded: self._load_tasks(project) @@ -989,7 +989,7 @@ def on_task_failed(self, task): def on_select_task(self, task): '''Called when a task is selected to fetch & process''' - # inject informations about project + # inject information about project logger.info('select %(project)s:%(taskid)s %(url)s', task) project_info = self.projects.get(task['project']) diff --git a/tests/test_message_queue.py b/tests/test_message_queue.py index d5e19559b..6ff84dec5 100644 --- a/tests/test_message_queue.py +++ b/tests/test_message_queue.py @@ -73,7 +73,7 @@ def setUpClass(self): self.q3 = connect_message_queue('test_queue_for_threading_test') -#@unittest.skipIf(six.PY3, 'pika not suport python 3') +#@unittest.skipIf(six.PY3, 'pika not support python 3') @unittest.skipIf(os.environ.get('IGNORE_RABBITMQ') or os.environ.get('IGNORE_ALL'), 'no rabbitmq server for test.') class TestPikaRabbitMQ(TestMessageQueue, unittest.TestCase):