From 9b14c1800bfaf2f8983f6d5ba6fdd547fde4e908 Mon Sep 17 00:00:00 2001
From: david-leifker <114954101+david-leifker@users.noreply.github.com>
Date: Thu, 13 Apr 2023 10:00:58 -0500
Subject: [PATCH] chore(security): removing unmaintained es7 upgrade process
 (#7790)

---
 contrib/elasticsearch/es7-upgrade/Dockerfile  |   5 -
 contrib/elasticsearch/es7-upgrade/transfer.py | 240 ------------------
 docs-website/sidebars.js                      |   1 -
 docs/advanced/es-7-upgrade.md                 |  38 ---
 docs/how/updating-datahub.md                  |   2 +
 5 files changed, 2 insertions(+), 284 deletions(-)
 delete mode 100644 contrib/elasticsearch/es7-upgrade/Dockerfile
 delete mode 100644 contrib/elasticsearch/es7-upgrade/transfer.py
 delete mode 100644 docs/advanced/es-7-upgrade.md

diff --git a/contrib/elasticsearch/es7-upgrade/Dockerfile b/contrib/elasticsearch/es7-upgrade/Dockerfile
deleted file mode 100644
index 19b624989ec51..0000000000000
--- a/contrib/elasticsearch/es7-upgrade/Dockerfile
+++ /dev/null
@@ -1,5 +0,0 @@
-FROM python:3.10
-COPY . .
-RUN pip install --upgrade pip
-RUN pip install elasticsearch
-ENTRYPOINT ["python", "transfer.py"]
diff --git a/contrib/elasticsearch/es7-upgrade/transfer.py b/contrib/elasticsearch/es7-upgrade/transfer.py
deleted file mode 100644
index 6e443336bc6c7..0000000000000
--- a/contrib/elasticsearch/es7-upgrade/transfer.py
+++ /dev/null
@@ -1,240 +0,0 @@
-# Copies indices (settings, mappings, and optionally data) from an Elasticsearch 5 cluster to an
-# Elasticsearch 7 cluster. Note that when copying data, the copy is routed through this machine:
-# all data is downloaded from the 5 cluster and then uploaded to the 7 cluster. This can be a
-# very slow process if you have a lot of data, so it is recommended only for small indices.
-
-# Requires Python 3+ and Elasticsearch's Python client (pip install elasticsearch).
-
-import argparse
-import elasticsearch
-import elasticsearch.helpers
-import ssl
-import time
-
-parser = argparse.ArgumentParser(description="Transfers ES indexes between clusters.")
-parser.add_argument('-s', '--source', required=True, help='Source cluster URL and port.')
-parser.add_argument('-d', '--dest', required=True, help='Destination cluster URL and port.')
-parser.add_argument('--disable-source-ssl', required=False, action='store_true', help='If set, disable source SSL.')
-parser.add_argument('--disable-dest-ssl', required=False, action='store_true', help='If set, disable destination SSL.')
-parser.add_argument('--cert-file', required=False, default=None, help='Cert file to use with SSL.')
-parser.add_argument('--key-file', required=False, default=None, help='Key file to use with SSL.')
-parser.add_argument('--ca-file', required=False, default=None, help='Certificate authority file to use for SSL.')
-parser.add_argument('--create-only', required=False, action='store_true', help='If set, only create the index (with settings/mappings/aliases).')
-parser.add_argument('-i', '--indices', required=False, default="*", help='Regular expression for indexes to copy.')
-parser.add_argument('--name-override', required=False, default=None, help='Destination index name override.')
-
-args = parser.parse_args()
-
-
-def create_ssl_context():
-    if args.cert_file is None:
-        raise ValueError('--cert-file is required with SSL.')
-    if args.key_file is None:
-        raise ValueError('--key-file is required with SSL.')
-    if args.ca_file is None:
-        raise ValueError('--ca-file is required with SSL.')
-
-    context = ssl.create_default_context(
-        ssl.Purpose.SERVER_AUTH,
-        cafile=args.ca_file
-    )
-    context.load_cert_chain(
-        certfile=args.cert_file,
-        keyfile=args.key_file
-    )
-
-    return context
-
-
-def create_client(host, ssl_context):
-    return elasticsearch.Elasticsearch(
-        [host],
-        ssl_context=ssl_context
-    )
-
-
-class EsClients:
-    def __init__(self, source_client, dest_client):
-        self.source_client = source_client
-        self.dest_client = dest_client
-
-
-def get_index_settings(client, pattern):
-    indices = elasticsearch.client.IndicesClient(client).get(pattern)
-    return indices
-
-
-def clean_settings(config):
-    # Settings set by the server that we can read, but not write.
-    del config['settings']['index']['provided_name']
-    del config['settings']['index']['version']
-    del config['settings']['index']['creation_date']
-    del config['settings']['index']['uuid']
-    return config
-
-
-def find_max_ngram_diff_helper(obj):
-    # Finds the greatest diff between min_gram and max_gram in the ngram settings and returns it.
-    # In Elasticsearch 7, an upper bound on this diff must be explicitly set.
-    if not isinstance(obj, dict):
-        return -1
-
-    diff = -1
-
-    if 'min_gram' in obj and 'max_gram' in obj:
-        diff = int(obj['max_gram']) - int(obj['min_gram'])
-
-    for value in obj.values():
-        t = find_max_ngram_diff_helper(value)
-        diff = max(t, diff)
-
-    return diff
-
-
-def find_max_ngram_diff(config):
-    settings = config['settings']
-    return find_max_ngram_diff_helper(settings)
-
-
-def update_for_seven(config):
-    # Updates settings and mappings for Elasticsearch 7.
-
-    # In 5 there should be only one value: the doc type. Unwrap it for 7; document types are deprecated.
-    config['mappings'] = next(iter(config['mappings'].values()))
-
-    # Need to set max_ngram_diff if any ngram diff is more than 1.
-    max_ngram = find_max_ngram_diff(config)
-    if max_ngram > 1:
-        config['settings']['index']['max_ngram_diff'] = max_ngram
-
-    # _all is deprecated and also false by default, so it is not even explicitly needed...
-    if '_all' in config['mappings']:
-        enabled = config['mappings']['_all']['enabled']
-        if enabled:
-            raise ValueError('_all is enabled')
-        del config['mappings']['_all']
-
-    return config
-
-
-def create_index(client, name, config, name_override=None):
-    # Creates the given index on the client.
-    name_override = name if name_override is None else name_override
-    indices_client = elasticsearch.client.IndicesClient(client)
-    if indices_client.exists(name_override):
-        print('WARNING: Index %s already exists!' % name_override)
-        return
-    indices_client.create(name_override, body=config)
-
-
-timing_samples = []
-
-
-# Adapted from the elasticsearch.helpers source code so that we can transform documents while copying.
-def reindex(
-        client,
-        source_index,
-        target_index,
-        query=None,
-        target_client=None,
-        chunk_size=500,
-        scroll="5m",
-        scan_kwargs={},
-        bulk_kwargs={},
-):
-    # Like the elasticsearch.helpers.reindex function, but with some custom logic. Namely, it
-    # allows the source/dest indices to be on different clusters, prints status updates, and
-    # deletes the _type field.
-
-    target_client = client if target_client is None else target_client
-    docs = elasticsearch.helpers.scan(client, query=query, index=source_index, scroll=scroll, **scan_kwargs)
-
-    start = time.time()
-    count = 0
-    count_at_last_update = 0
-    last_print = start
-    update_interval = 5
-
-    def _change_doc_index(hits, index):
-        for h in hits:
-            h["_index"] = index
-            if "fields" in h:
-                h.update(h.pop("fields"))
-
-            # TODO: Need to remove "_type" otherwise it complains about keyword becoming text? Is this legitimate?
-            if "_type" in h:
-                del h["_type"]
-
-            nonlocal count
-            nonlocal last_print
-            nonlocal count_at_last_update
-            count = count + 1
-
-            # Use a window of samples to average over.
-            if (time.time() - last_print) > update_interval:
-                timing_samples.append((count - count_at_last_update) / (time.time() - last_print))
-                if len(timing_samples) > 10:
-                    timing_samples.pop(0)
-                count_at_last_update = count
-                last_print = time.time()
-                print('Transferring %s docs/second. Total %s.' % (sum(timing_samples) / len(timing_samples), count))
-
-            yield h
-
-    kwargs = {"stats_only": True}
-    kwargs.update(bulk_kwargs)
-    return elasticsearch.helpers.bulk(
-        target_client,
-        _change_doc_index(docs, target_index),
-        chunk_size=chunk_size,
-        raise_on_error=False,
-        **kwargs
-    )
-
-
-def copy_index_data(clients, index, name_override):
-    # Copies all documents from the source index to the dest index.
-    name_override = index if name_override is None else name_override
-    print('Copying index %s' % index)
-    start = time.time()
-    res = reindex(
-        clients.source_client,
-        index,
-        name_override,
-        target_client=clients.dest_client
-    )
-    end = time.time()
-    print('Documents written %s. Errors %s.' % res)
-    print('Took %s seconds.' % (end - start))
-
-
-def main():
-    ssl_context = create_ssl_context() if not args.disable_source_ssl or not args.disable_dest_ssl else None
-    source_ssl_context = ssl_context if not args.disable_source_ssl else None
-    dest_ssl_context = ssl_context if not args.disable_dest_ssl else None
-    clients = EsClients(create_client(args.source, source_ssl_context), create_client(args.dest, dest_ssl_context))
-    indices = get_index_settings(clients.source_client, args.indices)
-
-    def by_index(item):
-        return item[0]
-
-    # Sort for repeatability, and to make it easy to restart partway through if the script fails.
-    sorted_indices = list(indices.items())
-    sorted_indices.sort(key=by_index)
-
-    for index, config in sorted_indices:
-        # Skip this "hidden" index that is listed for some reason.
-        if index == '.kibana':
-            continue
-
-        config = clean_settings(config)
-        config = update_for_seven(config)
-        print('Creating index %s' % (index if args.name_override is None else args.name_override))
-        create_index(clients.dest_client, index, config, args.name_override)
-
-    if args.create_only:
-        return
-
-    for index, config in sorted_indices:
-        # Skip .kibana here too; it was never created on the destination.
-        if index == '.kibana':
-            continue
-        copy_index_data(clients, index, args.name_override)
-
-
-main()
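
For readers who find this patch after the file is gone, here is a minimal sketch of the two ES 7 compatibility transforms that `update_for_seven` applied. The config literal below is invented for illustration and is not taken from the DataHub codebase:

```
# Toy ES 5 index config; names and values invented for illustration.
es5_config = {
    "settings": {
        "index": {
            "analysis": {
                "tokenizer": {
                    "my_ngram": {"type": "ngram", "min_gram": 1, "max_gram": 4}
                }
            }
        }
    },
    # ES 5 wraps mappings in a single document type ("doc" here).
    "mappings": {"doc": {"properties": {"name": {"type": "keyword"}}}},
}

# Transform 1: unwrap the single document type, since ES 7 deprecates mapping types.
es5_config["mappings"] = next(iter(es5_config["mappings"].values()))

# Transform 2: ES 7 rejects ngram settings where max_gram - min_gram exceeds
# index.max_ngram_diff (default 1), so record the largest diff explicitly.
es5_config["settings"]["index"]["max_ngram_diff"] = 4 - 1  # max_gram - min_gram above

print(es5_config["mappings"])  # {'properties': {'name': {'type': 'keyword'}}}
```
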
diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js
index 9e64cd161f106..575f1b30dc1a4 100644
--- a/docs-website/sidebars.js
+++ b/docs-website/sidebars.js
@@ -508,7 +508,6 @@ module.exports = {
       "docker/datahub-upgrade/README",
       "docs/advanced/no-code-modeling",
       "datahub-web-react/src/app/analytics/README",
-      "docs/advanced/es-7-upgrade",
       "docs/how/migrating-graph-service-implementation",
       "docs/advanced/field-path-spec-v2",
       "metadata-ingestion/adding-source",
diff --git a/docs/advanced/es-7-upgrade.md b/docs/advanced/es-7-upgrade.md
deleted file mode 100644
index 58e86e54d921b..0000000000000
--- a/docs/advanced/es-7-upgrade.md
+++ /dev/null
@@ -1,38 +0,0 @@
-# Elasticsearch upgrade from 5.6.8 to 7.9.3
-
-## Summary of changes
-Check out the list of breaking changes for [Elasticsearch 6](https://www.elastic.co/guide/en/elasticsearch/reference/6.8/breaking-changes-6.0.html) and [Elasticsearch 7](https://www.elastic.co/guide/en/elasticsearch/reference/7.x/breaking-changes-7.0.html). The following is a summary of the changes that impact DataHub.
-
-### Search index mapping & settings
-- Removal of mapping types (as mentioned [here](https://www.elastic.co/guide/en/elasticsearch/reference/current/removal-of-types.html))
-- The maximum allowed difference between `min_gram` and `max_gram` for NGramTokenizer and NGramTokenFilter must be specified via the `max_ngram_diff` property in index settings whenever the difference is greater than 1 (as mentioned [here](https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules.html))
-
-### Search query
-The following parameters are (or were) optional and hence automatically populated in the search query, so some tests that expect a certain search query to be sent to ES will change with the ES upgrade.
-- The `disable_coord` parameter of the `bool` and `common_terms` queries has been removed (as mentioned [here](https://www.elastic.co/guide/en/elasticsearch/reference/6.8/breaking-changes-6.0.html))
-- The `auto_generate_synonyms_phrase_query` parameter of the `match` query has been added with a default value of `true` (as mentioned [here](https://www.elastic.co/guide/en/elasticsearch/reference/7.x/query-dsl-match-query.html))
-
-### Java High Level Rest Client
-- In 7.9.3, a Java High Level REST Client instance is built from a REST low-level client builder, whereas in 5.6.8 it was built from the REST low-level client directly
-- Document APIs such as the Index API, Delete API, etc. no longer take the doc `type` as an input
-
-## Migration strategy
-
-As mentioned in the docs, indices created in Elasticsearch 5.x are not readable by Elasticsearch 7.x. Running the upgraded Elasticsearch container on the existing esdata volume will fail.
-
-For local development, our recommendation is to run the `docker/nuke.sh` script to remove the existing esdata volume before starting up the containers. Note that all data will be lost.
-
-To migrate without losing data, please refer to the Python script and Dockerfile in `contrib/elasticsearch/es7-upgrade`. The script takes the source and destination Elasticsearch cluster URLs and the SSL configuration (if applicable) as input. It ports the mappings and settings for all indices in the source cluster to the destination cluster, making the necessary changes stated above. Then it transfers all documents in the source cluster to the destination cluster.
-
-You can run the script in a Docker container as follows:
-```
-docker build -t migrate-es-7 .
-docker run migrate-es-7 -s SOURCE -d DEST [--disable-source-ssl]
-                        [--disable-dest-ssl] [--cert-file CERT_FILE]
-                        [--key-file KEY_FILE] [--ca-file CA_FILE] [--create-only]
-                        [-i INDICES] [--name-override NAME_OVERRIDE]
-```
-
-## Plan
-
-We will create an "elasticsearch-5-legacy" branch with the version of master prior to the Elasticsearch 7 upgrade. However, we will not be supporting this branch moving forward, and all future development will be done using Elasticsearch 7.9.3.
\ No newline at end of file
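
To make the mapping-type removal described in the deleted document concrete, here is a small sketch using the Python client rather than the Java High Level REST Client the document discusses; the shape of the change is the same. The host and index name are placeholders, and each call assumes the client library version matching its cluster:

```
import elasticsearch

es = elasticsearch.Elasticsearch(["http://localhost:9200"])  # placeholder host

# Elasticsearch 5: document APIs address a document by (index, type, id).
es.index(index="exampleindex", doc_type="doc", id="1", body={"name": "alice"})

# Elasticsearch 7: mapping types are gone, so the doc type argument is dropped
# and a document is addressed by (index, id) alone.
es.index(index="exampleindex", id="1", body={"name": "alice"})
```
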
diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md
index c4552d0df823a..9e12a15ca09c7 100644
--- a/docs/how/updating-datahub.md
+++ b/docs/how/updating-datahub.md
@@ -13,6 +13,8 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
 
 ### Potential Downtime
 
 ### Deprecations
+- The Docker image and script for updating from Elasticsearch 5 to 7 are no longer being maintained and will be removed from the `/contrib` section of
+the repository. Please refer to older releases if needed.
 
 ### Other notable Changes
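
With the maintained script removed, one possible data-only fallback is the stock `elasticsearch.helpers.reindex` helper that `transfer.py` was adapted from. A minimal sketch, assuming placeholder hosts and an example index name; note that unlike the deleted script it does not port settings or mappings, and it does not strip the ES 5 `_type` field from hits:

```
import elasticsearch
import elasticsearch.helpers

# Placeholder hosts; point these at the actual source and destination clusters.
source = elasticsearch.Elasticsearch(["https://source-cluster:9200"])
dest = elasticsearch.Elasticsearch(["https://dest-cluster:9200"])

# Create the destination index with ES 7-compatible settings/mappings first,
# then copy the documents across. "exampleindex" is a placeholder name.
successes, errors = elasticsearch.helpers.reindex(
    source, "exampleindex", "exampleindex", target_client=dest
)
print('Copied %s documents with %s errors.' % (successes, errors))
```
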