From 67816491034be35a70ff04c9ef045d36304f2bea Mon Sep 17 00:00:00 2001 From: Pengyu Chen Date: Fri, 4 Oct 2019 03:31:08 +0100 Subject: [PATCH 1/7] Added: Supporting filter for resources --- shub/items.py | 9 +++++++-- shub/log.py | 10 ++++++++-- shub/requests.py | 9 +++++++-- shub/utils.py | 15 +++++++++++---- 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/shub/items.py b/shub/items.py index ad656b56..fbb7b870 100644 --- a/shub/items.py +++ b/shub/items.py @@ -30,6 +30,10 @@ by providing the -f flag: shub items -f 2/15 + +Additional filters may be applied to the query: + + shub items 12345/2/15 --filter '["foo","exists",[]]' """ SHORT_HELP = "Fetch items from Scrapy Cloud" @@ -40,8 +44,9 @@ @click.option('-f', '--follow', help='output new items as they are scraped', is_flag=True) @click.option('-n', '--tail', help='output last N items only', type=int) -def cli(job_id, follow, tail): +@click.option('--filter', 'filter_', help='filter to be applied to the query') +def cli(job_id, follow, tail, filter_): job = get_job(job_id) for item in job_resource_iter(job, job.items, output_json=True, - follow=follow, tail=tail): + follow=follow, tail=tail, filter_=filter_): click.echo(item) diff --git a/shub/log.py b/shub/log.py index 8b0df496..13de424c 100644 --- a/shub/log.py +++ b/shub/log.py @@ -32,6 +32,10 @@ providing the -f flag: shub log -f 2/15 + +Additional filters may be applied to the query: + + shub log 12345/2/15 --filter '["level",">=",["20"]]' # loglevel>=INFO """ SHORT_HELP = "Fetch log from Scrapy Cloud" @@ -42,9 +46,11 @@ @click.option('-f', '--follow', help='output new log entries as they are ' 'produced', is_flag=True) @click.option('-n', '--tail', help='output last N log entries only', type=int) -def cli(job_id, follow, tail): +@click.option('--filter', 'filter_', help='filter to be applied to the query') +def cli(job_id, follow, tail, filter_): job = get_job(job_id) - for item in job_resource_iter(job, job.logs, follow=follow, tail=tail): + for item in job_resource_iter(job, job.logs, follow=follow, tail=tail, + filter_=filter_): click.echo( u"{} {} {}".format( datetime.utcfromtimestamp(item['time']/1000), diff --git a/shub/requests.py b/shub/requests.py index 45425f45..5ac5dc37 100644 --- a/shub/requests.py +++ b/shub/requests.py @@ -30,6 +30,10 @@ by providing the -f flag: shub requests -f 2/15 + +Additional filters may be applied to the query: + + shub requests 12345/2/15 --filter '["url","icontains",["foo"]]' """ SHORT_HELP = "Fetch requests from Scrapy Cloud" @@ -40,8 +44,9 @@ @click.option('-f', '--follow', help='output new requests as they are made', is_flag=True) @click.option('-n', '--tail', help='output last N requests only', type=int) -def cli(job_id, follow, tail): +@click.option('--filter', 'filter_', help='filter to be applied to the query') +def cli(job_id, follow, tail, filter_): job = get_job(job_id) for item in job_resource_iter(job, job.requests, output_json=True, - follow=follow, tail=tail): + follow=follow, tail=tail, filter_=filter_): click.echo(item) diff --git a/shub/utils.py b/shub/utils.py index 6a988ed1..790e1c6c 100644 --- a/shub/utils.py +++ b/shub/utils.py @@ -531,7 +531,7 @@ def job_live(job, refresh_meta_after=60): def job_resource_iter(job, resource, output_json=False, follow=True, - tail=None): + tail=None, filter_=None): """ Given a python-hubstorage job and resource (e.g. job.items), return a generator that periodically checks the job resource and yields its items. @@ -549,17 +549,24 @@ def job_resource_iter(job, resource, output_json=False, follow=True, last_item_key = '{}/{}'.format(job.key, last_item) if not job_live(job): follow = False + # XXX: Some simple validations for the filter value? + api_params = { + # It's okay to have null-values included here since the underlying + # package would have it removed + 'startafter': last_item_key, + 'filter': filter_, + } resource_iter = resource.iter_json if output_json else resource.iter_values if not follow: - for item in resource_iter(startafter=last_item_key): + for item in resource_iter(**api_params): yield item return while True: # XXX: Always use iter_json until Kumo team fixes iter_values to also # return '_key' - for json_line in resource.iter_json(startafter=last_item_key): + for json_line in resource.iter_json(**api_params): item = json.loads(json_line) - last_item_key = item['_key'] + api_params['startafter'] = item['_key'] yield json_line if output_json else item if not job_live(job): break From 841637be0dde521f8590b96651f1bac5ff4d7326 Mon Sep 17 00:00:00 2001 From: Pengyu Chen Date: Fri, 4 Oct 2019 04:10:46 +0100 Subject: [PATCH 2/7] Added: Tests for job resource filters --- tests/test_jobresource.py | 12 ++++++++++++ tests/test_utils.py | 14 +++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/tests/test_jobresource.py b/tests/test_jobresource.py index 27bd63a7..2a354e0c 100644 --- a/tests/test_jobresource.py +++ b/tests/test_jobresource.py @@ -35,13 +35,24 @@ def _test_forwards_follow(self, cmd_mod): self.runner.invoke(cmd_mod.cli, ('1/2/3', '-f')) self.assertTrue(mock_jri.call_args[1]['follow']) + def _test_resource_filter(self, cmd_mod): + with mock.patch.object(cmd_mod, 'get_job'), \ + mock.patch.object(cmd_mod, 'job_resource_iter', autospec=True) \ + as mock_jri: + self.runner.invoke(cmd_mod.cli, ('1/2/3',)) + self.assertFalse(mock_jri.call_args[1]['filter_']) + self.runner.invoke(cmd_mod.cli, ('1/2/3', '--filter', '["foo"]')) + self.assertEqual(mock_jri.call_args[1]['filter_'], '["foo"]') + def test_items(self): self._test_prints_objects(items, 'items') self._test_forwards_follow(items) + self._test_resource_filter(items) def test_requests(self): self._test_prints_objects(requests, 'requests') self._test_forwards_follow(requests) + self._test_resource_filter(requests) def test_log(self): objects = [ @@ -57,6 +68,7 @@ def test_log(self): self.assertIn('1970-01-01 00:00:00 INFO message 1', result.output) self.assertIn('2015-12-23 12:41:11 CRITICAL message 2', result.output) self._test_forwards_follow(log) + self._test_resource_filter(log) def test_log_unicode(self): objects = [ diff --git a/tests/test_utils.py b/tests/test_utils.py index cbdc3f49..80d6e3fe 100755 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -229,6 +229,8 @@ def magic_iter(*args, **kwargs): Return two different iterators on the first two calls, set job's state to 'finished' after the second call. """ + if magic_iter.expect_filter: + self.assertEqual(kwargs['filter'], magic_iter.expect_filter) if magic_iter.stage == 0: if 'startafter' in kwargs: self.assertEqual(kwargs['startafter'], None) @@ -243,32 +245,38 @@ def magic_iter(*args, **kwargs): self.assertEqual(kwargs['startafter'], 'jobkey/996') return iter([]) - def jri_result(follow, tail=None): + def jri_result(follow, tail=None, filter_=None): return list(utils.job_resource_iter( job, job.resource, follow=follow, tail=tail, + filter_=filter_, output_json=True, )) job.resource.iter_json = magic_iter + tmp_filter = '["foo"]' magic_iter.stage = 0 + magic_iter.expect_filter = None self.assertEqual(jri_result(False), make_items([1, 2, 3])) self.assertFalse(mock_sleep.called) magic_iter.stage = 0 - self.assertEqual(jri_result(True), make_items([1, 2, 3, 4, 5, 6])) + magic_iter.expect_filter = tmp_filter + self.assertEqual(jri_result(True, filter_=tmp_filter), make_items([1, 2, 3, 4, 5, 6])) self.assertTrue(mock_sleep.called) magic_iter.stage = 0 + magic_iter.expect_filter = None job.metadata = {'state': 'finished'} self.assertEqual(jri_result(True), make_items([1, 2, 3])) magic_iter.stage = 2 + magic_iter.expect_filter = tmp_filter job.resource.stats.return_value = {'totals': {'input_values': 1000}} - self.assertEqual(jri_result(True, tail=3), []) + self.assertEqual(jri_result(True, tail=3, filter_=tmp_filter), []) @patch('shub.utils.requests.get', autospec=True) def test_latest_github_release(self, mock_get): From 27d3ff506ad552ecf701e76df115114cfc85e014 Mon Sep 17 00:00:00 2001 From: Pengyu Chen Date: Fri, 4 Oct 2019 04:35:42 +0100 Subject: [PATCH 3/7] Added: Supporting additional filter types (filterall/filterany) for items --- shub/items.py | 8 ++++++-- shub/utils.py | 4 ++-- tests/test_jobresource.py | 7 +++++-- tests/test_utils.py | 12 ++++++++---- 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/shub/items.py b/shub/items.py index fbb7b870..ac854aac 100644 --- a/shub/items.py +++ b/shub/items.py @@ -45,8 +45,12 @@ is_flag=True) @click.option('-n', '--tail', help='output last N items only', type=int) @click.option('--filter', 'filter_', help='filter to be applied to the query') -def cli(job_id, follow, tail, filter_): +@click.option('--filter_type', default='filter', + type=click.Choice(['filter', 'filterall', 'filterany']), + help='type of filter to be applied') +def cli(job_id, follow, tail, filter_, filter_type): job = get_job(job_id) for item in job_resource_iter(job, job.items, output_json=True, - follow=follow, tail=tail, filter_=filter_): + follow=follow, tail=tail, filter_=filter_, + filter_type=filter_type): click.echo(item) diff --git a/shub/utils.py b/shub/utils.py index 790e1c6c..8a3f1d1b 100644 --- a/shub/utils.py +++ b/shub/utils.py @@ -531,7 +531,7 @@ def job_live(job, refresh_meta_after=60): def job_resource_iter(job, resource, output_json=False, follow=True, - tail=None, filter_=None): + tail=None, filter_=None, filter_type=None): """ Given a python-hubstorage job and resource (e.g. job.items), return a generator that periodically checks the job resource and yields its items. @@ -554,7 +554,7 @@ def job_resource_iter(job, resource, output_json=False, follow=True, # It's okay to have null-values included here since the underlying # package would have it removed 'startafter': last_item_key, - 'filter': filter_, + filter_type or 'filter': filter_, } resource_iter = resource.iter_json if output_json else resource.iter_values if not follow: diff --git a/tests/test_jobresource.py b/tests/test_jobresource.py index 2a354e0c..667d9ad9 100644 --- a/tests/test_jobresource.py +++ b/tests/test_jobresource.py @@ -35,7 +35,7 @@ def _test_forwards_follow(self, cmd_mod): self.runner.invoke(cmd_mod.cli, ('1/2/3', '-f')) self.assertTrue(mock_jri.call_args[1]['follow']) - def _test_resource_filter(self, cmd_mod): + def _test_resource_filter(self, cmd_mod, test_filter_type=False): with mock.patch.object(cmd_mod, 'get_job'), \ mock.patch.object(cmd_mod, 'job_resource_iter', autospec=True) \ as mock_jri: @@ -43,11 +43,14 @@ def _test_resource_filter(self, cmd_mod): self.assertFalse(mock_jri.call_args[1]['filter_']) self.runner.invoke(cmd_mod.cli, ('1/2/3', '--filter', '["foo"]')) self.assertEqual(mock_jri.call_args[1]['filter_'], '["foo"]') + if test_filter_type: + self.runner.invoke(cmd_mod.cli, ('1/2/3', '--filter', '["foo"]', '--filter_type', 'filterall')) + self.assertEqual(mock_jri.call_args[1]['filter_type'], 'filterall') def test_items(self): self._test_prints_objects(items, 'items') self._test_forwards_follow(items) - self._test_resource_filter(items) + self._test_resource_filter(items, test_filter_type=True) def test_requests(self): self._test_prints_objects(requests, 'requests') diff --git a/tests/test_utils.py b/tests/test_utils.py index 80d6e3fe..ecc1b503 100755 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -229,8 +229,7 @@ def magic_iter(*args, **kwargs): Return two different iterators on the first two calls, set job's state to 'finished' after the second call. """ - if magic_iter.expect_filter: - self.assertEqual(kwargs['filter'], magic_iter.expect_filter) + self.assertEqual(kwargs.get(magic_iter.filter_type), magic_iter.expect_filter) if magic_iter.stage == 0: if 'startafter' in kwargs: self.assertEqual(kwargs['startafter'], None) @@ -245,13 +244,14 @@ def magic_iter(*args, **kwargs): self.assertEqual(kwargs['startafter'], 'jobkey/996') return iter([]) - def jri_result(follow, tail=None, filter_=None): + def jri_result(follow, tail=None, filter_=None, filter_type=None): return list(utils.job_resource_iter( job, job.resource, follow=follow, tail=tail, filter_=filter_, + filter_type=filter_type, output_json=True, )) @@ -259,24 +259,28 @@ def jri_result(follow, tail=None, filter_=None): tmp_filter = '["foo"]' magic_iter.stage = 0 + magic_iter.filter_type = 'filter' magic_iter.expect_filter = None self.assertEqual(jri_result(False), make_items([1, 2, 3])) self.assertFalse(mock_sleep.called) magic_iter.stage = 0 + magic_iter.filter_type = 'filter' magic_iter.expect_filter = tmp_filter self.assertEqual(jri_result(True, filter_=tmp_filter), make_items([1, 2, 3, 4, 5, 6])) self.assertTrue(mock_sleep.called) magic_iter.stage = 0 + magic_iter.filter_type = 'filter' magic_iter.expect_filter = None job.metadata = {'state': 'finished'} self.assertEqual(jri_result(True), make_items([1, 2, 3])) magic_iter.stage = 2 + magic_iter.filter_type = 'filterall' magic_iter.expect_filter = tmp_filter job.resource.stats.return_value = {'totals': {'input_values': 1000}} - self.assertEqual(jri_result(True, tail=3, filter_=tmp_filter), []) + self.assertEqual(jri_result(True, tail=3, filter_=tmp_filter, filter_type='filterall'), []) @patch('shub.utils.requests.get', autospec=True) def test_latest_github_release(self, mock_get): From 469b2cfefe0ea725879684674a954b79691b71fe Mon Sep 17 00:00:00 2001 From: Pengyu Chen Date: Thu, 24 Oct 2019 17:48:11 +0100 Subject: [PATCH 4/7] Enhanced (simplified) testing --- tests/test_utils.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index ecc1b503..d3685745 100755 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -229,7 +229,6 @@ def magic_iter(*args, **kwargs): Return two different iterators on the first two calls, set job's state to 'finished' after the second call. """ - self.assertEqual(kwargs.get(magic_iter.filter_type), magic_iter.expect_filter) if magic_iter.stage == 0: if 'startafter' in kwargs: self.assertEqual(kwargs['startafter'], None) @@ -255,32 +254,25 @@ def jri_result(follow, tail=None, filter_=None, filter_type=None): output_json=True, )) - job.resource.iter_json = magic_iter - tmp_filter = '["foo"]' + job.resource.iter_json = Mock(wraps=magic_iter) magic_iter.stage = 0 - magic_iter.filter_type = 'filter' - magic_iter.expect_filter = None self.assertEqual(jri_result(False), make_items([1, 2, 3])) self.assertFalse(mock_sleep.called) magic_iter.stage = 0 - magic_iter.filter_type = 'filter' - magic_iter.expect_filter = tmp_filter - self.assertEqual(jri_result(True, filter_=tmp_filter), make_items([1, 2, 3, 4, 5, 6])) + self.assertEqual(jri_result(True, filter_='["foo"]'), make_items([1, 2, 3, 4, 5, 6])) + self.assertEqual(job.resource.iter_json.call_args[1]['filter'], '["foo"]') self.assertTrue(mock_sleep.called) magic_iter.stage = 0 - magic_iter.filter_type = 'filter' - magic_iter.expect_filter = None job.metadata = {'state': 'finished'} self.assertEqual(jri_result(True), make_items([1, 2, 3])) magic_iter.stage = 2 - magic_iter.filter_type = 'filterall' - magic_iter.expect_filter = tmp_filter job.resource.stats.return_value = {'totals': {'input_values': 1000}} - self.assertEqual(jri_result(True, tail=3, filter_=tmp_filter, filter_type='filterall'), []) + self.assertEqual(jri_result(True, tail=3, filter_='["foo"]', filter_type='filterall'), []) + self.assertEqual(job.resource.iter_json.call_args[1]['filterall'], '["foo"]') @patch('shub.utils.requests.get', autospec=True) def test_latest_github_release(self, mock_get): From 66a604e12aa7d6f37c7c985d7d9ba035ccadba6a Mon Sep 17 00:00:00 2001 From: Pengyu Chen Date: Fri, 25 Oct 2019 00:41:15 +0100 Subject: [PATCH 5/7] Fixed build error due to recent homebrew update --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index e8941654..457b48c8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -40,7 +40,8 @@ branches: before_install: | if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then # From https://pythonhosted.org/CodeChat/.travis.yml.html - brew install pyenv-virtualenv + # Homebrew currently fails after updating. See also: https://discuss.circleci.com/t/brew-install-fails-while-updating/32992/4 + HOMEBREW_NO_AUTO_UPDATE=1 brew install pyenv-virtualenv eval "$(pyenv init -)" eval "$(pyenv virtualenv-init -)" # See https://github.com/travis-ci/travis-ci/issues/4834, but From e7116eb2a6026e137d6cd07666c46090007027bf Mon Sep 17 00:00:00 2001 From: Pengyu Chen Date: Fri, 25 Oct 2019 00:56:34 +0100 Subject: [PATCH 6/7] Specifying "2.7.9" (relatively newer) instead of "2.7" in travis config since pyenv may seriously install Python 2.7.0 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 457b48c8..ca82befa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,7 +15,7 @@ matrix: language: generic env: - TOX_ENV=py27 - - PYTHON_VERSION='2.7' + - PYTHON_VERSION='2.7.9' - os: osx language: generic env: From 7935ad5c8ad595de1d751cece67c6bfea9f67dbb Mon Sep 17 00:00:00 2001 From: Pengyu Chen Date: Fri, 25 Oct 2019 18:10:15 +0100 Subject: [PATCH 7/7] Requesting pip<19.3 for now due to incompatibility (pip._internal.main no longer callable) --- requirements.in | 6 ++++-- requirements.txt | 3 +++ setup.py | 2 +- shub/utils.py | 2 ++ tests/test_utils.py | 4 ++++ 5 files changed, 14 insertions(+), 3 deletions(-) diff --git a/requirements.in b/requirements.in index 3ac4583b..22932550 100644 --- a/requirements.in +++ b/requirements.in @@ -1,13 +1,15 @@ click docker-py -PyYAML -requests +#PyYAML +#requests retrying six tqdm scrapinghub>=2.0.3 +pip<19.3 + # address known vulnerabilities requests>=2.20.0 # CVE-2018-18074 pyyaml>=4.2b1 # CVE-2017-18342 diff --git a/requirements.txt b/requirements.txt index a2bba22e..68744217 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,3 +20,6 @@ six==1.10.0 tqdm==4.11.2 urllib3==1.25.3 # via requests websocket-client==0.37.0 # via docker-py + +# The following packages are considered to be unsafe in a requirements file: +# pip==19.2.3 diff --git a/setup.py b/setup.py index 01abbe29..b11d9fff 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ install_requires=[ 'click', 'docker-py', - 'pip', + 'pip<19.3', 'PyYAML', 'retrying', 'requests', diff --git a/shub/utils.py b/shub/utils.py index 6a988ed1..f7e9b7ef 100644 --- a/shub/utils.py +++ b/shub/utils.py @@ -637,6 +637,8 @@ def download_from_pypi(dest, pkg=None, reqfile=None, extra_args=None): no_wheel = ['--no-binary=:all:'] if pip_version >= LooseVersion('8'): cmd = 'download' + if pip_version >= LooseVersion('19.3'): + raise NotImplementedError('Expecting pip<19.3') with patch_sys_executable(): pip_main([cmd, '-d', dest, '--no-deps'] + no_wheel + extra_args + target) diff --git a/tests/test_utils.py b/tests/test_utils.py index cbdc3f49..72b88f63 100755 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -371,6 +371,10 @@ def _call(*args, **kwargs): pipargs = _call('tmpdir', reqfile='req.txt') self.assertEqual(pipargs.index('-r') + 1, pipargs.index('req.txt')) + # pip>=19.3 shall be unsupported for now + mock_pip.__version__ = '19.3' + self.assertRaises(NotImplementedError, _call, ['tmpdir'], {'pkg': 'shub'}) + # Replace deprecated commands in newer versions mock_pip.__version__ = '7.1.2.dev0' pipargs = _call('tmpdir', pkg='shub')