diff --git a/grimoire_elk/enriched/enrich.py b/grimoire_elk/enriched/enrich.py index 7dfe9f952..3bdc6cd4f 100644 --- a/grimoire_elk/enriched/enrich.py +++ b/grimoire_elk/enriched/enrich.py @@ -546,7 +546,7 @@ def find_item_project(self, eitem): # elk.enrich_backend) if self.projects_json_repo: project = self.prjs_map[ds_name][self.projects_json_repo] - # if `projects_json_repo`, which shouldn't never happen, use the + # if `projects_json_repo` (e.g., AOC study), use the # method `get_project_repository` (defined in each enricher) else: repository = self.get_project_repository(eitem) @@ -561,6 +561,16 @@ def find_item_project(self, eitem): fltr = eitem['origin'] + ' --filter-raw=' + self.filter_raw if ds_name in self.prjs_map and fltr in self.prjs_map[ds_name]: project = self.prjs_map[ds_name][fltr] + elif ds_name in self.prjs_map: + # this code is executed to retrieve the project of private repositories (in particular Git ones) + # the URLs in the prjs_map are retrieved, anonymized and compared with the value + # returned by `get_project_repository` + repository = self.get_project_repository(eitem) + for r in self.prjs_map[ds_name]: + anonymized_repo = anonymize_url(r) + if repository == anonymized_repo: + project = self.prjs_map[ds_name][r] + break if project == UNKNOWN_PROJECT: return None diff --git a/grimoire_elk/enriched/git.py b/grimoire_elk/enriched/git.py index 17edd0ba8..0c91fa26f 100644 --- a/grimoire_elk/enriched/git.py +++ b/grimoire_elk/enriched/git.py @@ -568,9 +568,10 @@ def enrich_areas_of_code(self, ocean_backend, enrich_backend, no_incremental=Fal repos.extend(items) for repo in repos: - logger.info("{} Processing repo: {}".format(log_prefix, repo)) - in_conn.update_repo(repo) - out_conn.update_repo(repo) + anonymize_repo = anonymize_url(repo) + logger.info("{} Processing repo: {}".format(log_prefix, anonymize_repo)) + in_conn.update_repo(anonymize_repo) + out_conn.update_repo(anonymize_repo) areas_of_code(git_enrich=enrich_backend, 
in_conn=in_conn, out_conn=out_conn) # Create alias if output index exists and alias does not @@ -776,7 +777,7 @@ def delete_commit_branches(self, git_repo, enrich_backend): } } ] - """ % git_repo.uri + """ % anonymize_url(git_repo.uri) # reset references in enrich index es_query = """ @@ -856,7 +857,7 @@ def __process_commits_in_branch(self, enrich_backend, repo_origin, branch_name, logger.warning("[git] Change branch name from {} to {}".format(branch_name, digested_branch_name)) # update enrich index - fltr = self.__prepare_filter("hash", commits_str, repo_origin) + fltr = self.__prepare_filter("hash", commits_str, anonymize_url(repo_origin)) es_query = """ { diff --git a/grimoire_elk/enriched/graal_study_evolution.py b/grimoire_elk/enriched/graal_study_evolution.py index c840a15f5..b783bc301 100644 --- a/grimoire_elk/enriched/graal_study_evolution.py +++ b/grimoire_elk/enriched/graal_study_evolution.py @@ -20,7 +20,7 @@ # Nishchith Shetty # -from grimoirelab_toolkit.datetime import str_to_datetime +from grimoirelab_toolkit.datetime import str_to_datetime, unixtime_to_datetime def get_unique_repository(): @@ -178,9 +178,15 @@ def get_to_date(es_in, in_index, out_index, repository_url, interval): index=out_index, body=get_last_study_date(repository_url, interval))["aggregations"]["1"] - if last_study_date["value"] is not None: + if "value_as_string" in last_study_date and last_study_date["value_as_string"]: study_data_available = True to_date = str_to_datetime(last_study_date["value_as_string"]) + elif "value" in last_study_date and last_study_date["value"]: + study_data_available = True + try: + to_date = unixtime_to_datetime(last_study_date["value"]) + except Exception: + to_date = unixtime_to_datetime(last_study_date["value"] / 1000) if not study_data_available: first_item_date = es_in.search( diff --git a/grimoire_elk/raw/git.py b/grimoire_elk/raw/git.py index 35eb8927e..2abb2a3fe 100644 --- a/grimoire_elk/raw/git.py +++ b/grimoire_elk/raw/git.py @@ -64,6 +64,7 
@@ class GitOcean(ElasticOcean): def _fix_item(self, item): item['origin'] = anonymize_url(item['origin']) + item['tag'] = anonymize_url(item['tag']) @classmethod def get_perceval_params_from_url(cls, url): diff --git a/releases/unreleased/support-for-git-private-repos.yml b/releases/unreleased/support-for-git-private-repos.yml new file mode 100644 index 000000000..e867587cf --- /dev/null +++ b/releases/unreleased/support-for-git-private-repos.yml @@ -0,0 +1,12 @@ +--- +title: Support for Git private repos +category: added +author: Valerio Cosentino +issue: 873 +notes: > Git private repos can now be handled by +ELK, which makes it possible to deal with the credentials that +appear in the repo URLs passed via the projects.json. +These URLs are processed when storing/retrieving the +data in the raw, enriched and studies indexes to make +sure that the credentials are not included in the +indexes nor visible on the dashboards. diff --git a/tests/base.py b/tests/base.py index e39857f4b..e36b3ddda 100644 --- a/tests/base.py +++ b/tests/base.py @@ -261,7 +261,7 @@ def _test_refresh_project(self): total = refresh_projects(self.enrich_backend) return total - def _test_study(self, test_study): + def _test_study(self, test_study, projects_json_repo=None, projects_json=None, prjs_map=None): """Test the execution of a study""" # populate raw index @@ -280,6 +280,16 @@ def _test_study(self, test_study): elastic_enrich = get_elastic(self.es_con, self.enrich_index, clean, self.enrich_backend) self.enrich_backend.set_elastic(elastic_enrich) + + if projects_json: + self.enrich_backend.json_projects = projects_json + + if projects_json_repo: + self.enrich_backend.projects_json_repo = projects_json_repo + + if prjs_map: + self.enrich_backend.prjs_map = prjs_map + self.enrich_backend.enrich_items(self.ocean_backend) for study in self.enrich_backend.studies: diff --git a/tests/data/git.json b/tests/data/git.json index 3cdf979d0..8110d0182 100644 --- a/tests/data/git.json +++ b/tests/data/git.json 
@@ -438,7 +438,7 @@ } ] } - },{ +},{ "backend_name": "Git", "backend_version": "0.12.0", "perceval_version": "0.14.0", diff --git a/tests/test_git.py b/tests/test_git.py index 610f39b28..547d3c05d 100644 --- a/tests/test_git.py +++ b/tests/test_git.py @@ -56,30 +56,42 @@ def test_items_to_raw(self): """Test whether JSON items are properly inserted into ES""" result = self._test_items_to_raw() - self.assertEqual(result['items'], 9) - self.assertEqual(result['raw'], 9) + self.assertEqual(result['items'], 11) + self.assertEqual(result['raw'], 11) aliases = self.ocean_backend.elastic.list_aliases() self.assertListEqual(self.ocean_aliases, list(aliases.keys())) + url = self.es_con + "/" + self.ocean_index + "/_search?size=20" + response = self.ocean_backend.requests.get(url, verify=False).json() + + time.sleep(5) # HACK: Wait until git enrich index has been written + for hit in response['hits']['hits']: + item = hit['_source'] + self.assertNotIn('username:password', item['origin']) + self.assertNotIn('username:password', item['tag']) + def test_raw_to_enrich(self): """Test whether the raw index is properly enriched""" result = self._test_raw_to_enrich() - self.assertEqual(result['raw'], 9) - self.assertEqual(result['enrich'], 9) + self.assertEqual(result['raw'], 11) + self.assertEqual(result['enrich'], 11) enrich_backend = self.connectors[self.connector][2]() item = self.items[0] - item['origin'] = 'https://admin:admin@gittest' eitem = enrich_backend.get_rich_item(item) self.assertEqual(eitem['committer_name'], '') + self.assertNotIn('username:password', eitem['origin']) + self.assertNotIn('username:password', eitem['tag']) for item in self.items[1:]: eitem = enrich_backend.get_rich_item(item) self.assertNotEqual(eitem['committer_name'], 'Unknown') self.assertNotEqual(eitem['author_name'], 'Unknown') + self.assertNotIn('username:password', eitem['origin']) + self.assertNotIn('username:password', eitem['tag']) item = self.items[1] eitem = 
enrich_backend.get_rich_item(item) @@ -88,11 +100,9 @@ def test_raw_to_enrich(self): self.assertEqual(eitem['author_date_hour'], 14) self.assertEqual(eitem['utc_author_date_weekday'], 2) self.assertEqual(eitem['utc_author_date_hour'], 17) - self.assertEqual(eitem['author_uuid'], '8b8d552af706acff79df0f18f5295391c51acd79') self.assertEqual(eitem['author_domain'], 'gmail.com') self.assertEqual(eitem['author_name'], 'Eduardo Morais and Zhongpeng Lin') - self.assertEqual(eitem['commit_date'], '2012-08-14T14:32:15') self.assertEqual(eitem['commit_date_weekday'], 2) self.assertEqual(eitem['commit_date_hour'], 14) @@ -106,11 +116,9 @@ def test_raw_to_enrich(self): self.assertEqual(eitem['author_date_hour'], 22) self.assertEqual(eitem['utc_author_date_weekday'], 3) self.assertEqual(eitem['utc_author_date_hour'], 6) - self.assertEqual(eitem['author_uuid'], '8abda7ad626330d5065d4c3a93fb45029a32bdcb') self.assertEqual(eitem['author_domain'], 'gmail.com') self.assertEqual(eitem['author_name'], 'Zhongpeng Lin (林中鹏)') - self.assertEqual(eitem['commit_date'], '2014-02-11T22:10:39') self.assertEqual(eitem['commit_date_weekday'], 2) self.assertEqual(eitem['commit_date_hour'], 22) @@ -124,11 +132,11 @@ def test_raw_to_enrich_pair_programming(self): """Test whether the raw index is properly enriched with pair programming info""" result = self._test_raw_to_enrich(pair_programming=True) - self.assertEqual(result['raw'], 9) - self.assertEqual(result['enrich'], 11) + self.assertEqual(result['raw'], 11) + self.assertEqual(result['enrich'], 13) enrich_backend = self.connectors[self.connector][2](pair_programming=True) - url = self.es_con + "/" + self.enrich_index + "/_search" + url = self.es_con + "/" + self.enrich_index + "/_search?size=20" response = enrich_backend.requests.get(url, verify=False).json() time.sleep(5) # HACK: Wait until git enrich index has been written @@ -166,38 +174,41 @@ def test_enrich_repo_labels(self): for item in self.items: eitem = 
enrich_backend.get_rich_item(item) self.assertIn(REPO_LABELS, eitem) + self.assertNotIn('username:password', eitem['origin']) + self.assertNotIn('username:password', eitem['tag']) def test_raw_to_enrich_sorting_hat(self): """Test enrich with SortingHat""" result = self._test_raw_to_enrich(sortinghat=True) - self.assertEqual(result['raw'], 9) - self.assertEqual(result['enrich'], 9) + self.assertEqual(result['raw'], 11) + self.assertEqual(result['enrich'], 11) enrich_backend = self.connectors[self.connector][2]() enrich_backend.sortinghat = True item = self.items[0] eitem = enrich_backend.get_rich_item(item) - self.assertEqual(eitem['committer_name'], '') - self.assertEqual(eitem['Commit_name'], '-- UNDEFINED --') - self.assertEqual(eitem['Commit_user_name'], '-- UNDEFINED --') - self.assertEqual(eitem['Commit_org_name'], '-- UNDEFINED --') - self.assertEqual(eitem['Commit_multi_org_names'], ['-- UNDEFINED --']) - - self.assertEqual(eitem['author_name'], 'Eduardo Morais') - self.assertEqual(eitem['Author_name'], 'Eduardo Morais') - self.assertEqual(eitem['Author_user_name'], 'Unknown') - self.assertEqual(eitem['Author_multi_org_names'], ['Unknown']) + self.assertIn('Commit_org_name', eitem) + self.assertIn('Commit_user_name', eitem) + self.assertIn('Commit_name', eitem) + self.assertIn('committer_name', eitem) + self.assertIn('Author_name', eitem) + self.assertIn('author_name', eitem) + self.assertIn('Commit_multi_org_names', eitem) + self.assertIn('Author_user_name', eitem) + self.assertIn('Author_multi_org_names', eitem) + self.assertNotIn('username:password', eitem['origin']) + self.assertNotIn('username:password', eitem['tag']) def test_raw_to_enrich_projects(self): """Test enrich with Projects""" result = self._test_raw_to_enrich(projects=True) - self.assertEqual(result['raw'], 9) - self.assertEqual(result['enrich'], 9) + self.assertEqual(result['raw'], 11) + self.assertEqual(result['enrich'], 11) enrich_backend = self.connectors[self.connector][2]() - url = 
self.es_con + "/" + self.enrich_index + "/_search" + url = self.es_con + "/" + self.enrich_index + "/_search?size=20" response = enrich_backend.requests.get(url, verify=False).json() for hit in response['hits']['hits']: @@ -220,6 +231,8 @@ def test_raw_to_enrich_projects(self): self.assertIn('project_1', source) self.assertEqual(source['project'], 'Main') self.assertEqual(source['project_1'], 'Main') + self.assertNotIn('username:password', source['origin']) + self.assertNotIn('username:password', source['tag']) def test_refresh_identities(self): """Test refresh identities""" @@ -251,9 +264,13 @@ def test_demography_study(self): % anonymize_url(self.es_con)) time.sleep(5) # HACK: Wait until git enrich index has been written - for item in enrich_backend.fetch(): + items = [item for item in enrich_backend.fetch()] + self.assertEqual(len(items), 11) + for item in items: self.assertTrue('demography_min_date' in item.keys()) self.assertTrue('demography_max_date' in item.keys()) + self.assertNotIn('username:password', item['origin']) + self.assertNotIn('username:password', item['tag']) r = enrich_backend.elastic.requests.get(enrich_backend.elastic.index_url + "/_alias", headers=HEADER_JSON, verify=False) @@ -272,8 +289,13 @@ def test_extra_study(self): "ba298a6fb09558e68c5e4ec6ae23b1c89fe920ef/test_extra_study.txt") time.sleep(5) # HACK: Wait until git enrich index has been written - for item in enrich_backend.fetch(): - self.assertTrue('extra_secret_repo' in item.keys()) + items = [item for item in enrich_backend.fetch()] + self.assertEqual(len(items), 11) + for item in items: + if item['origin'] == '/tmp/perceval_mc84igfc/gittest': + self.assertIn('extra_secret_repo', item.keys()) + else: + self.assertNotIn('extra_secret_repo', item.keys()) def test_enrich_forecast_activity(self): """ Test that the forecast activity study works correctly """ @@ -291,7 +313,7 @@ def test_enrich_forecast_activity(self): '[enrich-forecast-activity] End study') time.sleep(5) # HACK: Wait 
until git enrich index has been written - url = self.es_con + "/git_study_forecast_activity/_search" + url = self.es_con + "/git_study_forecast_activity/_search?size=20" response = enrich_backend.requests.get(url, verify=False).json() for hit in response['hits']['hits']: source = hit['_source'] @@ -335,10 +357,26 @@ def test_onion_study(self): time.sleep(1) - url = self.es_con + "/git_onion-enriched/_count" + url = self.es_con + "/git_onion-enriched/_search?size=20" response = requests.get(url, verify=False).json() - - self.assertGreater(response['count'], 0) + hits = response['hits']['hits'] + self.assertEqual(len(hits), 12) + for hit in hits: + source = hit['_source'] + self.assertIn('timeframe', source) + self.assertIn('author_uuid', source) + self.assertIn('author_name', source) + self.assertIn('contributions', source) + self.assertIn('metadata__timestamp', source) + self.assertIn('project', source) + self.assertIn('author_org_name', source) + self.assertIn('cum_net_sum', source) + self.assertIn('percent_cum_net_sum', source) + self.assertIn('onion_role', source) + self.assertIn('quarter', source) + self.assertIn('metadata__enriched_on', source) + self.assertIn('data_source', source) + self.assertIn('grimoire_creation_date', source) delete_onion = self.es_con + "/git_onion-enriched" requests.delete(delete_onion, verify=False) @@ -346,8 +384,6 @@ def test_onion_study(self): def test_enrich_areas_of_code(self): """ Test that areas of code works correctly""" - study, ocean_backend, enrich_backend = self._test_study('enrich_areas_of_code') - projects_json_repo = "/tmp/perceval_mc84igfc/gittest" projects_json = { "project": { @@ -362,14 +398,18 @@ def test_enrich_areas_of_code(self): } } - enrich_backend.json_projects = projects_json - enrich_backend.projects_json_repo = projects_json_repo - enrich_backend.prjs_map = prjs_map + study, ocean_backend, enrich_backend = self._test_study('enrich_areas_of_code', + projects_json=projects_json, + prjs_map=prjs_map, + 
projects_json_repo=projects_json_repo) + study(ocean_backend, enrich_backend, in_index='test_git') time.sleep(5) # HACK: Wait until git area of code has been written - url = self.es_con + "/git_aoc-enriched/_search" + url = self.es_con + "/git_aoc-enriched/_search?size=20" response = enrich_backend.requests.get(url, verify=False).json() - for hit in response['hits']['hits']: + hits = response['hits']['hits'] + self.assertEqual(len(hits), 12) + for hit in hits: source = hit['_source'] self.assertIn('addedlines', source) self.assertIn('author_bot', source) @@ -408,9 +448,84 @@ def test_enrich_areas_of_code(self): self.assertIn('removedlines', source) self.assertIn('repository', source) self.assertIn('uuid', source) + self.assertEqual(source['origin'], '/tmp/perceval_mc84igfc/gittest') + self.assertEqual(source['repository'], '/tmp/perceval_mc84igfc/gittest') - delete_survival = self.es_con + "/git_aoc-enriched" - requests.delete(delete_survival, verify=False) + delete_aoc = self.es_con + "/git_aoc-enriched" + requests.delete(delete_aoc, verify=False) + + def test_enrich_areas_of_code_private_repo(self): + """ Test that areas of code works correctly for git private repos""" + + projects_json_repo = "https://username:password@github.com/acme/errors" + projects_json = { + "secret-repo": { + "git": [ + "https://username:password@github.com/acme/errors" + ] + } + } + prjs_map = { + "git": { + "https://username:password@github.com/acme/errors": "secret-repo" + } + } + + study, ocean_backend, enrich_backend = self._test_study('enrich_areas_of_code', + projects_json=projects_json, + prjs_map=prjs_map, + projects_json_repo=projects_json_repo) + + study(ocean_backend, enrich_backend, in_index='test_git') + time.sleep(5) # HACK: Wait until git area of code has been written + url = self.es_con + "/git_aoc-enriched_anonymized/_search?size=20" + response = enrich_backend.requests.get(url, verify=False).json() + hits = response['hits']['hits'] + self.assertEqual(len(hits), 2) + for 
hit in hits: + source = hit['_source'] + self.assertIn('addedlines', source) + self.assertIn('author_bot', source) + self.assertIn('author_domain', source) + self.assertIn('author_id', source) + self.assertIn('author_name', source) + self.assertIn('author_org_name', source) + self.assertIn('author_multi_org_names', source) + self.assertIn('author_user_name', source) + self.assertIn('author_uuid', source) + self.assertIn('committer', source) + self.assertIn('committer_date', source) + self.assertIn('date', source) + self.assertIn('eventtype', source) + self.assertIn('fileaction', source) + self.assertIn('filepath', source) + self.assertIn('files', source) + self.assertIn('filetype', source) + self.assertIn('file_name', source) + self.assertIn('file_ext', source) + self.assertIn('file_dir_name', source) + self.assertIn('file_path_list', source) + self.assertIn('git_author_domain', source) + self.assertIn('grimoire_creation_date', source) + self.assertIn('hash', source) + self.assertIn('id', source) + self.assertIn('message', source) + self.assertIn('metadata__enriched_on', source) + self.assertIn('metadata__timestamp', source) + self.assertIn('metadata__updated_on', source) + self.assertIn('origin', source) + self.assertIn('owner', source) + self.assertIn('perceval_uuid', source) + self.assertIn('project', source) + self.assertIn('project_1', source) + self.assertIn('removedlines', source) + self.assertIn('repository', source) + self.assertIn('uuid', source) + self.assertEqual(source['origin'], 'https://github.com/acme/errors') + self.assertEqual(source['repository'], 'https://github.com/acme/errors') + + delete_aoc = self.es_con + "/git_aoc-enriched_anonymized" + requests.delete(delete_aoc, verify=False) def test_perceval_params(self): """Test the extraction of perceval params from an URL"""