From 8ce8c3424be7094a2501a8f84ba2a94b0fb582b6 Mon Sep 17 00:00:00 2001 From: Valerio Cosentino Date: Mon, 11 May 2020 13:20:49 +0200 Subject: [PATCH] [elk] Handle private repos in git studies This code prevents the credentials of private git repos from being stored in the studies indexes (aoc and git branches). This change also requires improving the way the project attribute is assigned to a repository. In the case of a private repo, the corresponding data source in the projects.json is accessed, each repo is anonymized and the output is compared with the current repo (whose value is stored in the indexes). Tests have been updated accordingly. Signed-off-by: Valerio Cosentino --- grimoire_elk/enriched/enrich.py | 12 +- grimoire_elk/enriched/git.py | 11 +- .../enriched/graal_study_evolution.py | 10 +- grimoire_elk/raw/git.py | 1 + .../support-for-git-private-repos.yml | 12 ++ tests/base.py | 12 +- tests/data/git.json | 2 +- tests/test_git.py | 201 ++++++++++++++---- 8 files changed, 208 insertions(+), 53 deletions(-) create mode 100644 releases/unreleased/support-for-git-private-repos.yml diff --git a/grimoire_elk/enriched/enrich.py b/grimoire_elk/enriched/enrich.py index 7dfe9f952..3bdc6cd4f 100644 --- a/grimoire_elk/enriched/enrich.py +++ b/grimoire_elk/enriched/enrich.py @@ -546,7 +546,7 @@ def find_item_project(self, eitem): # elk.enrich_backend) if self.projects_json_repo: project = self.prjs_map[ds_name][self.projects_json_repo] - # if `projects_json_repo`, which shouldn't never happen, use the + # if `projects_json_repo` (e.g., AOC study), use the # method `get_project_repository` (defined in each enricher) else: repository = self.get_project_repository(eitem) @@ -561,6 +561,16 @@ def find_item_project(self, eitem): fltr = eitem['origin'] + ' --filter-raw=' + self.filter_raw if ds_name in self.prjs_map and fltr in self.prjs_map[ds_name]: project = self.prjs_map[ds_name][fltr] + elif ds_name in self.prjs_map: + # this code is executed to retrieve the project of 
private repositories (in particular Git ones) + # the URLs in the prjs_map are retrieved, anonymized and compared with the value + # returned by `get_project_repository` + repository = self.get_project_repository(eitem) + for r in self.prjs_map[ds_name]: + anonymized_repo = anonymize_url(r) + if repository == anonymized_repo: + project = self.prjs_map[ds_name][r] + break if project == UNKNOWN_PROJECT: return None diff --git a/grimoire_elk/enriched/git.py b/grimoire_elk/enriched/git.py index 17edd0ba8..0c91fa26f 100644 --- a/grimoire_elk/enriched/git.py +++ b/grimoire_elk/enriched/git.py @@ -568,9 +568,10 @@ def enrich_areas_of_code(self, ocean_backend, enrich_backend, no_incremental=Fal repos.extend(items) for repo in repos: - logger.info("{} Processing repo: {}".format(log_prefix, repo)) - in_conn.update_repo(repo) - out_conn.update_repo(repo) + anonymize_repo = anonymize_url(repo) + logger.info("{} Processing repo: {}".format(log_prefix, anonymize_repo)) + in_conn.update_repo(anonymize_repo) + out_conn.update_repo(anonymize_repo) areas_of_code(git_enrich=enrich_backend, in_conn=in_conn, out_conn=out_conn) # Create alias if output index exists and alias does not @@ -776,7 +777,7 @@ def delete_commit_branches(self, git_repo, enrich_backend): } } ] - """ % git_repo.uri + """ % anonymize_url(git_repo.uri) # reset references in enrich index es_query = """ @@ -856,7 +857,7 @@ def __process_commits_in_branch(self, enrich_backend, repo_origin, branch_name, logger.warning("[git] Change branch name from {} to {}".format(branch_name, digested_branch_name)) # update enrich index - fltr = self.__prepare_filter("hash", commits_str, repo_origin) + fltr = self.__prepare_filter("hash", commits_str, anonymize_url(repo_origin)) es_query = """ { diff --git a/grimoire_elk/enriched/graal_study_evolution.py b/grimoire_elk/enriched/graal_study_evolution.py index c840a15f5..b783bc301 100644 --- a/grimoire_elk/enriched/graal_study_evolution.py +++ 
b/grimoire_elk/enriched/graal_study_evolution.py @@ -20,7 +20,7 @@ # Nishchith Shetty # -from grimoirelab_toolkit.datetime import str_to_datetime +from grimoirelab_toolkit.datetime import str_to_datetime, unixtime_to_datetime def get_unique_repository(): @@ -178,9 +178,15 @@ def get_to_date(es_in, in_index, out_index, repository_url, interval): index=out_index, body=get_last_study_date(repository_url, interval))["aggregations"]["1"] - if last_study_date["value"] is not None: + if "value_as_string" in last_study_date and last_study_date["value_as_string"]: study_data_available = True to_date = str_to_datetime(last_study_date["value_as_string"]) + elif "value" in last_study_date and last_study_date["value"]: + study_data_available = True + try: + to_date = unixtime_to_datetime(last_study_date["value"]) + except Exception: + to_date = unixtime_to_datetime(last_study_date["value"] / 1000) if not study_data_available: first_item_date = es_in.search( diff --git a/grimoire_elk/raw/git.py b/grimoire_elk/raw/git.py index 35eb8927e..2abb2a3fe 100644 --- a/grimoire_elk/raw/git.py +++ b/grimoire_elk/raw/git.py @@ -64,6 +64,7 @@ class GitOcean(ElasticOcean): def _fix_item(self, item): item['origin'] = anonymize_url(item['origin']) + item['tag'] = anonymize_url(item['tag']) @classmethod def get_perceval_params_from_url(cls, url): diff --git a/releases/unreleased/support-for-git-private-repos.yml b/releases/unreleased/support-for-git-private-repos.yml new file mode 100644 index 000000000..e867587cf --- /dev/null +++ b/releases/unreleased/support-for-git-private-repos.yml @@ -0,0 +1,12 @@ +--- +title: Support for Git private repos +category: added +author: Valerio Cosentino +issue: 873 +notes: > Git private repos can now be handled by +ELK, which allows dealing with the credentials that +appear in the repo URLs passed via the projects.json. 
+These URLs are processed when storing/retrieving the +data in the raw, enriched and studies indexes to make +sure that the credentials are not included in the +indexes nor visible on the dashboards. diff --git a/tests/base.py b/tests/base.py index e39857f4b..e36b3ddda 100644 --- a/tests/base.py +++ b/tests/base.py @@ -261,7 +261,7 @@ def _test_refresh_project(self): total = refresh_projects(self.enrich_backend) return total - def _test_study(self, test_study): + def _test_study(self, test_study, projects_json_repo=None, projects_json=None, prjs_map=None): """Test the execution of a study""" # populate raw index @@ -280,6 +280,16 @@ def _test_study(self, test_study): elastic_enrich = get_elastic(self.es_con, self.enrich_index, clean, self.enrich_backend) self.enrich_backend.set_elastic(elastic_enrich) + + if projects_json: + self.enrich_backend.json_projects = projects_json + + if projects_json_repo: + self.enrich_backend.projects_json_repo = projects_json_repo + + if prjs_map: + self.enrich_backend.prjs_map = prjs_map + self.enrich_backend.enrich_items(self.ocean_backend) for study in self.enrich_backend.studies: diff --git a/tests/data/git.json b/tests/data/git.json index 3cdf979d0..8110d0182 100644 --- a/tests/data/git.json +++ b/tests/data/git.json @@ -438,7 +438,7 @@ } ] } - },{ +},{ "backend_name": "Git", "backend_version": "0.12.0", "perceval_version": "0.14.0", diff --git a/tests/test_git.py b/tests/test_git.py index 610f39b28..547d3c05d 100644 --- a/tests/test_git.py +++ b/tests/test_git.py @@ -56,30 +56,42 @@ def test_items_to_raw(self): """Test whether JSON items are properly inserted into ES""" result = self._test_items_to_raw() - self.assertEqual(result['items'], 9) - self.assertEqual(result['raw'], 9) + self.assertEqual(result['items'], 11) + self.assertEqual(result['raw'], 11) aliases = self.ocean_backend.elastic.list_aliases() self.assertListEqual(self.ocean_aliases, list(aliases.keys())) + url = self.es_con + "/" + self.ocean_index + 
"/_search?size=20" + response = self.ocean_backend.requests.get(url, verify=False).json() + + time.sleep(5) # HACK: Wait until git enrich index has been written + for hit in response['hits']['hits']: + item = hit['_source'] + self.assertNotIn('username:password', item['origin']) + self.assertNotIn('username:password', item['tag']) + def test_raw_to_enrich(self): """Test whether the raw index is properly enriched""" result = self._test_raw_to_enrich() - self.assertEqual(result['raw'], 9) - self.assertEqual(result['enrich'], 9) + self.assertEqual(result['raw'], 11) + self.assertEqual(result['enrich'], 11) enrich_backend = self.connectors[self.connector][2]() item = self.items[0] - item['origin'] = 'https://admin:admin@gittest' eitem = enrich_backend.get_rich_item(item) self.assertEqual(eitem['committer_name'], '') + self.assertNotIn('username:password', eitem['origin']) + self.assertNotIn('username:password', eitem['tag']) for item in self.items[1:]: eitem = enrich_backend.get_rich_item(item) self.assertNotEqual(eitem['committer_name'], 'Unknown') self.assertNotEqual(eitem['author_name'], 'Unknown') + self.assertNotIn('username:password', eitem['origin']) + self.assertNotIn('username:password', eitem['tag']) item = self.items[1] eitem = enrich_backend.get_rich_item(item) @@ -88,11 +100,9 @@ def test_raw_to_enrich(self): self.assertEqual(eitem['author_date_hour'], 14) self.assertEqual(eitem['utc_author_date_weekday'], 2) self.assertEqual(eitem['utc_author_date_hour'], 17) - self.assertEqual(eitem['author_uuid'], '8b8d552af706acff79df0f18f5295391c51acd79') self.assertEqual(eitem['author_domain'], 'gmail.com') self.assertEqual(eitem['author_name'], 'Eduardo Morais and Zhongpeng Lin') - self.assertEqual(eitem['commit_date'], '2012-08-14T14:32:15') self.assertEqual(eitem['commit_date_weekday'], 2) self.assertEqual(eitem['commit_date_hour'], 14) @@ -106,11 +116,9 @@ def test_raw_to_enrich(self): self.assertEqual(eitem['author_date_hour'], 22) 
self.assertEqual(eitem['utc_author_date_weekday'], 3) self.assertEqual(eitem['utc_author_date_hour'], 6) - self.assertEqual(eitem['author_uuid'], '8abda7ad626330d5065d4c3a93fb45029a32bdcb') self.assertEqual(eitem['author_domain'], 'gmail.com') self.assertEqual(eitem['author_name'], 'Zhongpeng Lin (林中鹏)') - self.assertEqual(eitem['commit_date'], '2014-02-11T22:10:39') self.assertEqual(eitem['commit_date_weekday'], 2) self.assertEqual(eitem['commit_date_hour'], 22) @@ -124,11 +132,11 @@ def test_raw_to_enrich_pair_programming(self): """Test whether the raw index is properly enriched with pair programming info""" result = self._test_raw_to_enrich(pair_programming=True) - self.assertEqual(result['raw'], 9) - self.assertEqual(result['enrich'], 11) + self.assertEqual(result['raw'], 11) + self.assertEqual(result['enrich'], 13) enrich_backend = self.connectors[self.connector][2](pair_programming=True) - url = self.es_con + "/" + self.enrich_index + "/_search" + url = self.es_con + "/" + self.enrich_index + "/_search?size=20" response = enrich_backend.requests.get(url, verify=False).json() time.sleep(5) # HACK: Wait until git enrich index has been written @@ -166,38 +174,41 @@ def test_enrich_repo_labels(self): for item in self.items: eitem = enrich_backend.get_rich_item(item) self.assertIn(REPO_LABELS, eitem) + self.assertNotIn('username:password', eitem['origin']) + self.assertNotIn('username:password', eitem['tag']) def test_raw_to_enrich_sorting_hat(self): """Test enrich with SortingHat""" result = self._test_raw_to_enrich(sortinghat=True) - self.assertEqual(result['raw'], 9) - self.assertEqual(result['enrich'], 9) + self.assertEqual(result['raw'], 11) + self.assertEqual(result['enrich'], 11) enrich_backend = self.connectors[self.connector][2]() enrich_backend.sortinghat = True item = self.items[0] eitem = enrich_backend.get_rich_item(item) - self.assertEqual(eitem['committer_name'], '') - self.assertEqual(eitem['Commit_name'], '-- UNDEFINED --') - 
self.assertEqual(eitem['Commit_user_name'], '-- UNDEFINED --') - self.assertEqual(eitem['Commit_org_name'], '-- UNDEFINED --') - self.assertEqual(eitem['Commit_multi_org_names'], ['-- UNDEFINED --']) - - self.assertEqual(eitem['author_name'], 'Eduardo Morais') - self.assertEqual(eitem['Author_name'], 'Eduardo Morais') - self.assertEqual(eitem['Author_user_name'], 'Unknown') - self.assertEqual(eitem['Author_multi_org_names'], ['Unknown']) + self.assertIn('Commit_org_name', eitem) + self.assertIn('Commit_user_name', eitem) + self.assertIn('Commit_name', eitem) + self.assertIn('committer_name', eitem) + self.assertIn('Author_name', eitem) + self.assertIn('author_name', eitem) + self.assertIn('Commit_multi_org_names', eitem) + self.assertIn('Author_user_name', eitem) + self.assertIn('Author_multi_org_names', eitem) + self.assertNotIn('username:password', eitem['origin']) + self.assertNotIn('username:password', eitem['tag']) def test_raw_to_enrich_projects(self): """Test enrich with Projects""" result = self._test_raw_to_enrich(projects=True) - self.assertEqual(result['raw'], 9) - self.assertEqual(result['enrich'], 9) + self.assertEqual(result['raw'], 11) + self.assertEqual(result['enrich'], 11) enrich_backend = self.connectors[self.connector][2]() - url = self.es_con + "/" + self.enrich_index + "/_search" + url = self.es_con + "/" + self.enrich_index + "/_search?size=20" response = enrich_backend.requests.get(url, verify=False).json() for hit in response['hits']['hits']: @@ -220,6 +231,8 @@ def test_raw_to_enrich_projects(self): self.assertIn('project_1', source) self.assertEqual(source['project'], 'Main') self.assertEqual(source['project_1'], 'Main') + self.assertNotIn('username:password', source['origin']) + self.assertNotIn('username:password', source['tag']) def test_refresh_identities(self): """Test refresh identities""" @@ -251,9 +264,13 @@ def test_demography_study(self): % anonymize_url(self.es_con)) time.sleep(5) # HACK: Wait until git enrich index has been 
written - for item in enrich_backend.fetch(): + items = [item for item in enrich_backend.fetch()] + self.assertEqual(len(items), 11) + for item in items: self.assertTrue('demography_min_date' in item.keys()) self.assertTrue('demography_max_date' in item.keys()) + self.assertNotIn('username:password', item['origin']) + self.assertNotIn('username:password', item['tag']) r = enrich_backend.elastic.requests.get(enrich_backend.elastic.index_url + "/_alias", headers=HEADER_JSON, verify=False) @@ -272,8 +289,13 @@ def test_extra_study(self): "ba298a6fb09558e68c5e4ec6ae23b1c89fe920ef/test_extra_study.txt") time.sleep(5) # HACK: Wait until git enrich index has been written - for item in enrich_backend.fetch(): - self.assertTrue('extra_secret_repo' in item.keys()) + items = [item for item in enrich_backend.fetch()] + self.assertEqual(len(items), 11) + for item in items: + if item['origin'] == '/tmp/perceval_mc84igfc/gittest': + self.assertIn('extra_secret_repo', item.keys()) + else: + self.assertNotIn('extra_secret_repo', item.keys()) def test_enrich_forecast_activity(self): """ Test that the forecast activity study works correctly """ @@ -291,7 +313,7 @@ def test_enrich_forecast_activity(self): '[enrich-forecast-activity] End study') time.sleep(5) # HACK: Wait until git enrich index has been written - url = self.es_con + "/git_study_forecast_activity/_search" + url = self.es_con + "/git_study_forecast_activity/_search?size=20" response = enrich_backend.requests.get(url, verify=False).json() for hit in response['hits']['hits']: source = hit['_source'] @@ -335,10 +357,26 @@ def test_onion_study(self): time.sleep(1) - url = self.es_con + "/git_onion-enriched/_count" + url = self.es_con + "/git_onion-enriched/_search?size=20" response = requests.get(url, verify=False).json() - - self.assertGreater(response['count'], 0) + hits = response['hits']['hits'] + self.assertEqual(len(hits), 12) + for hit in hits: + source = hit['_source'] + self.assertIn('timeframe', source) + 
self.assertIn('author_uuid', source) + self.assertIn('author_name', source) + self.assertIn('contributions', source) + self.assertIn('metadata__timestamp', source) + self.assertIn('project', source) + self.assertIn('author_org_name', source) + self.assertIn('cum_net_sum', source) + self.assertIn('percent_cum_net_sum', source) + self.assertIn('onion_role', source) + self.assertIn('quarter', source) + self.assertIn('metadata__enriched_on', source) + self.assertIn('data_source', source) + self.assertIn('grimoire_creation_date', source) delete_onion = self.es_con + "/git_onion-enriched" requests.delete(delete_onion, verify=False) @@ -346,8 +384,6 @@ def test_onion_study(self): def test_enrich_areas_of_code(self): """ Test that areas of code works correctly""" - study, ocean_backend, enrich_backend = self._test_study('enrich_areas_of_code') - projects_json_repo = "/tmp/perceval_mc84igfc/gittest" projects_json = { "project": { @@ -362,14 +398,18 @@ def test_enrich_areas_of_code(self): } } - enrich_backend.json_projects = projects_json - enrich_backend.projects_json_repo = projects_json_repo - enrich_backend.prjs_map = prjs_map + study, ocean_backend, enrich_backend = self._test_study('enrich_areas_of_code', + projects_json=projects_json, + prjs_map=prjs_map, + projects_json_repo=projects_json_repo) + study(ocean_backend, enrich_backend, in_index='test_git') time.sleep(5) # HACK: Wait until git area of code has been written - url = self.es_con + "/git_aoc-enriched/_search" + url = self.es_con + "/git_aoc-enriched/_search?size=20" response = enrich_backend.requests.get(url, verify=False).json() - for hit in response['hits']['hits']: + hits = response['hits']['hits'] + self.assertEqual(len(hits), 12) + for hit in hits: source = hit['_source'] self.assertIn('addedlines', source) self.assertIn('author_bot', source) @@ -408,9 +448,84 @@ def test_enrich_areas_of_code(self): self.assertIn('removedlines', source) self.assertIn('repository', source) self.assertIn('uuid', source) + 
self.assertEqual(source['origin'], '/tmp/perceval_mc84igfc/gittest') + self.assertEqual(source['repository'], '/tmp/perceval_mc84igfc/gittest') - delete_survival = self.es_con + "/git_aoc-enriched" - requests.delete(delete_survival, verify=False) + delete_aoc = self.es_con + "/git_aoc-enriched" + requests.delete(delete_aoc, verify=False) + + def test_enrich_areas_of_code_private_repo(self): + """ Test that areas of code works correctly for git private repos""" + + projects_json_repo = "https://username:password@github.com/acme/errors" + projects_json = { + "secret-repo": { + "git": [ + "https://username:password@github.com/acme/errors" + ] + } + } + prjs_map = { + "git": { + "https://username:password@github.com/acme/errors": "secret-repo" + } + } + + study, ocean_backend, enrich_backend = self._test_study('enrich_areas_of_code', + projects_json=projects_json, + prjs_map=prjs_map, + projects_json_repo=projects_json_repo) + + study(ocean_backend, enrich_backend, in_index='test_git') + time.sleep(5) # HACK: Wait until git area of code has been written + url = self.es_con + "/git_aoc-enriched_anonymized/_search?size=20" + response = enrich_backend.requests.get(url, verify=False).json() + hits = response['hits']['hits'] + self.assertEqual(len(hits), 2) + for hit in hits: + source = hit['_source'] + self.assertIn('addedlines', source) + self.assertIn('author_bot', source) + self.assertIn('author_domain', source) + self.assertIn('author_id', source) + self.assertIn('author_name', source) + self.assertIn('author_org_name', source) + self.assertIn('author_multi_org_names', source) + self.assertIn('author_user_name', source) + self.assertIn('author_uuid', source) + self.assertIn('committer', source) + self.assertIn('committer_date', source) + self.assertIn('date', source) + self.assertIn('eventtype', source) + self.assertIn('fileaction', source) + self.assertIn('filepath', source) + self.assertIn('files', source) + self.assertIn('filetype', source) + 
self.assertIn('file_name', source) + self.assertIn('file_ext', source) + self.assertIn('file_dir_name', source) + self.assertIn('file_path_list', source) + self.assertIn('git_author_domain', source) + self.assertIn('grimoire_creation_date', source) + self.assertIn('hash', source) + self.assertIn('id', source) + self.assertIn('message', source) + self.assertIn('metadata__enriched_on', source) + self.assertIn('metadata__timestamp', source) + self.assertIn('metadata__updated_on', source) + self.assertIn('origin', source) + self.assertIn('owner', source) + self.assertIn('perceval_uuid', source) + self.assertIn('project', source) + self.assertIn('project_1', source) + self.assertIn('removedlines', source) + self.assertIn('repository', source) + self.assertIn('uuid', source) + self.assertEqual(source['origin'], 'https://github.com/acme/errors') + self.assertEqual(source['repository'], 'https://github.com/acme/errors') + + delete_aoc = self.es_con + "/git_aoc-enriched_anonymized" + requests.delete(delete_aoc, verify=False) def test_perceval_params(self): """Test the extraction of perceval params from an URL"""