-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #343 from thepsalmist/chore/ES_reindex_api
chore: ES reindex implementation
- Loading branch information
Showing
2 changed files
with
323 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,195 @@ | ||
#!/usr/bin/env bash

# Elasticsearch Reindexing Script
#
# Reindexes one or more source indices into "<source>-<DEST_SUFFIX>"
# destination indices. The script:
#   1. Ensures Elasticsearch is running and reachable.
#   2. Verifies that every source index exists.
#   3. Checks that no destination index already exists.
#   4. Validates the optional query (-q).
#   5. Starts each reindex task asynchronously and prints its task ID.
#   6. Logs output for each step and exits with a distinct code on error.
#
# Bash is required: the script uses arrays and [[ ]], which plain POSIX sh
# (e.g. dash) does not provide.
#
# Usage:
#   bash run-elastic-reindex.sh [-h] [-e ES_HOST] -s SOURCE_INDEX... -d DEST_SUFFIX [-m MAX_DOCS] [-q QUERY]
#   Run with -h for a description of every option.
#
|
||
# Print the script's usage text to stdout.
# Takes no arguments and reads no globals.
display_help() {
  # The synopsis lists every option getopts accepts (including -m and -q) and
  # invokes bash explicitly, because the script relies on bash-only features.
  printf '%s\n' \
    "Elasticsearch Reindexing Script" \
    "" \
    "Usage: bash run-elastic-reindex.sh [-h] [-e ES_HOST] -s SOURCE_INDEX... -d DEST_SUFFIX [-m MAX_DOCS] [-q QUERY]" \
    "Example: bash run-elastic-reindex.sh -e http://localhost:9200 -s mc_search-000001 mc_search-000002 -d reindexed -m 1000" \
    "" \
    "Arguments:" \
    " -h Show help information." \
    " -e ES_HOST Optional. The URL of the Elasticsearch host (default: http://localhost:9200)." \
    " -s SOURCE_INDEX Required. One or more source indices to reindex from (space-separated)." \
    " -d DEST_SUFFIX Required. Suffix for the destination index names." \
    " -m MAX_DOCS Optional. The maximum number of documents to re-index. Must be a positive integer." \
    " -q QUERY Optional. The query to reindex a sub-set of documents." \
    ""
}
|
||
# Defaults; each of these may be overridden from the command line.
ES_HOST="http://localhost:9200"
OP_TYPE="create"   # Operation type for reindex, could be `create` or `index`
SOURCE_INDEXES=()  # Array to hold source indices
DEST_SUFFIX=""     # Suffix appended to every destination index name
MAX_DOCS=""        # Maximum number of documents to reindex
QUERY=""           # Optional query; initialised so an exported QUERY cannot leak in

# Parse the command-line options into the globals above.
# Globals:   ES_HOST, SOURCE_INDEXES, DEST_SUFFIX, MAX_DOCS, QUERY (written)
# Arguments: the script's command-line arguments
# Exits:     0 after printing help for -h; 1 on an unknown option or a
#            missing option argument.
parse_args() {
  local OPTIND=1 opt

  while getopts ":he:s:d:m:q:" opt; do
    case "$opt" in
      h)
        display_help
        exit 0
        ;;
      e)
        ES_HOST=$OPTARG
        ;;
      s)
        # getopts happily takes the next option as -s's argument
        # (e.g. "-s -d x"); treat that as a missing argument instead.
        if [[ "$OPTARG" == -* ]]; then
          echo "Option -s requires an argument." >&2
          display_help
          exit 1
        fi
        SOURCE_INDEXES+=("$OPTARG")
        # -s accepts several space-separated indices. Consume the following
        # words until the next option (or an empty word) by advancing OPTIND.
        # Unlike shifting and resetting OPTIND, this also handles the attached
        # form "-sINDEX" without re-parsing the same word forever.
        while (( OPTIND <= $# )) && [[ -n "${!OPTIND}" && "${!OPTIND}" != -* ]]; do
          SOURCE_INDEXES+=("${!OPTIND}")
          OPTIND=$((OPTIND + 1))
        done
        ;;
      d)
        DEST_SUFFIX=$OPTARG
        ;;
      m)
        MAX_DOCS=$OPTARG
        ;;
      q)
        QUERY=$OPTARG
        ;;
      \?)
        echo "Invalid option: -$OPTARG" >&2
        display_help
        exit 1
        ;;
      :)
        echo "Option -$OPTARG requires an argument." >&2
        display_help
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
|
||
# Reject incomplete or malformed arguments before contacting Elasticsearch.
# Both failures exit with status 5 (bad arguments).
if [ ${#SOURCE_INDEXES[@]} -eq 0 ] || [ -z "$DEST_SUFFIX" ]; then
  echo "Error: At least one SOURCE_INDEX and a DEST_SUFFIX are required." >&2
  display_help
  exit 5
fi

# MAX_DOCS is inserted unquoted into the reindex request body, so it must be a
# bare positive integer. A leading zero is rejected as well: it is not a valid
# JSON number.
if [ -n "$MAX_DOCS" ]; then
  case "$MAX_DOCS" in
    *[!0-9]* | 0*)
      echo "Error: MAX_DOCS must be a positive integer." >&2
      display_help
      exit 5
      ;;
  esac
fi
|
||
# Verify that Elasticsearch answers on $ES_HOST with HTTP 200.
# Globals: ES_HOST (read)
# Outputs: a confirmation on stdout, or an error on stderr
# Exits:   1 when the host does not answer with status 200
check_es_alive() {
  local status
  status=$(curl -s -o /dev/null -w "%{http_code}" "$ES_HOST")

  # Compare as strings: if curl prints nothing (e.g. it is not installed), a
  # numeric test would raise an error and wrongly fall through to "up".
  if [ "$status" != "200" ]; then
    echo "Error: Elasticsearch is not reachable. Status code: ${status:-unknown}" >&2
    exit 1
  fi

  echo "Elasticsearch is up and running."
}
|
||
# Verify that every index in SOURCE_INDEXES exists on the cluster.
# Globals: ES_HOST, SOURCE_INDEXES (read)
# Outputs: one confirmation line per index on stdout, or an error on stderr
# Exits:   2 at the first index whose lookup does not return HTTP 200
check_source_index_exists() {
  local index status

  for index in "${SOURCE_INDEXES[@]}"; do
    status=$(curl -s -o /dev/null -w "%{http_code}" "$ES_HOST/$index")

    # String comparison so that an empty response counts as a failure rather
    # than making the numeric test error out and report the index as present.
    if [ "$status" != "200" ]; then
      echo "Error: Source index '$index' does not exist." >&2
      exit 2
    fi

    echo "Source index '$index' exists."
  done
}
|
||
# Verify that none of the "<source>-<DEST_SUFFIX>" destination indices exist.
# Globals: ES_HOST, SOURCE_INDEXES, DEST_SUFFIX (read); DEST_INDEX (written)
# Outputs: one confirmation line per index on stdout, or an error on stderr
# Exits:   3 when a destination index exists, or when its absence cannot be
#          confirmed
check_dest_index_not_exists() {
  local index status

  for index in "${SOURCE_INDEXES[@]}"; do
    DEST_INDEX="${index}-${DEST_SUFFIX}"
    status=$(curl -s -o /dev/null -w "%{http_code}" "$ES_HOST/$DEST_INDEX")

    case "$status" in
      404)
        echo "Destination index '$DEST_INDEX' does not exist."
        ;;
      200)
        echo "Error: Destination index '$DEST_INDEX' already exists." >&2
        exit 3
        ;;
      *)
        # Only 404 proves absence. Connection failures (000), auth errors or
        # server errors must not be mistaken for "index does not exist".
        echo "Error: Could not verify that destination index '$DEST_INDEX' does not exist. Status code: ${status:-unknown}" >&2
        exit 3
        ;;
    esac
  done
}
|
||
# Validate the optional QUERY with the Elasticsearch _validate/query endpoint.
# Does nothing when QUERY is empty.
# Globals: ES_HOST, QUERY (read)
# Outputs: the raw response and a confirmation on stdout, or an error on stderr
# Exits:   6 when the response does not report "valid": true
validate_query() {
  local response is_valid error_message

  [ -n "$QUERY" ] || return 0

  response=$(curl -s -X GET "$ES_HOST/_validate/query" \
    -H 'Content-Type: application/json' \
    -d "{\"query\": $QUERY}")

  echo "Response :$response"

  # Portable ERE with POSIX character classes; the GNU-only \s and \| escapes
  # do not work with every grep. Whitespace around the colon is tolerated and
  # only the first match is used.
  is_valid=$(printf '%s\n' "$response" \
    | grep -Eo '"valid"[[:space:]]*:[[:space:]]*(true|false)' \
    | head -n 1 \
    | cut -d':' -f2 \
    | tr -d '[:space:]')

  if [ "$is_valid" != "true" ]; then
    # Best-effort extraction of the first (non-nested) "error" object.
    error_message=$(printf '%s\n' "$response" | grep -o '"error":{[^}]*}' | cut -d':' -f2-)
    echo "Error: The provided query is not valid. Details: $error_message" >&2
    exit 6
  fi

  echo "Query validated successfully."
}
|
||
# From ES https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-reindex.html#docs-reindex-from-multiple-sources
# Indexing multiple sources: each source index gets its own reindex request.
#
# Start one asynchronous reindex task per source index and print its task ID.
# Globals: ES_HOST, SOURCE_INDEXES, DEST_SUFFIX, OP_TYPE, QUERY, MAX_DOCS (read);
#          DEST_INDEX (written)
# Outputs: progress lines on stdout; on failure, an error and the raw
#          Elasticsearch response on stderr
# Exits:   4 at the first index for which no task ID is returned
start_reindex() {
  local index reindex_body reindex_url task_response task_id

  # If max_docs is provided, do not use slices to avoid complexities.
  # Using slices with max_docs can lead to unpredictable document counts
  # due to how slices operate independently, which may exceed the intended limit.
  if [ -n "$MAX_DOCS" ]; then
    reindex_url="$ES_HOST/_reindex?wait_for_completion=false"
  else
    reindex_url="$ES_HOST/_reindex?slices=auto&wait_for_completion=false"
  fi

  for index in "${SOURCE_INDEXES[@]}"; do
    DEST_INDEX="${index}-${DEST_SUFFIX}"
    echo "Starting reindex from '$index' to '$DEST_INDEX'..."

    # Assemble the request body piece by piece: source (+ optional query),
    # dest, then the optional max_docs.
    reindex_body="{\"source\": {\"index\": \"$index\""
    if [ -n "$QUERY" ]; then
      reindex_body="${reindex_body}, \"query\": $QUERY"
    fi
    reindex_body="${reindex_body}}, \"dest\": {\"index\": \"$DEST_INDEX\", \"op_type\": \"$OP_TYPE\"}"
    if [ -n "$MAX_DOCS" ]; then
      reindex_body="${reindex_body}, \"max_docs\": $MAX_DOCS"
    fi
    reindex_body="${reindex_body}}"

    echo "Reindex body: $reindex_body"

    task_response=$(curl -s -X POST "$reindex_url" \
      -H 'Content-Type: application/json' \
      -d "$reindex_body")

    # Extract the task ID ("<node>:<number>") from the response; cut keeps
    # everything after the first colon so the colon inside the ID survives.
    task_id=$(printf '%s\n' "$task_response" \
      | grep -o '"task"[[:space:]]*:[[:space:]]*"[^"]*"' \
      | cut -d':' -f2- \
      | tr -d '" ')

    if [ -z "$task_id" ]; then
      echo "Error: Failed to start reindexing task for index '$index'." >&2
      # Surface what Elasticsearch actually said so the failure can be diagnosed.
      echo "Response: ${task_response:-<empty>}" >&2
      exit 4
    fi

    echo "Reindexing task started for index '$index' with task ID: $task_id"
  done
}
|
||
# Run every stage in order. Each stage terminates the script itself, with its
# own exit code, when its check fails, so reaching the end means success.
for step in \
  check_es_alive \
  check_source_index_exists \
  check_dest_index_not_exists \
  validate_query \
  start_reindex; do
  "$step"
done

echo "Reindexing script executed successfully."
exit 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
## Elasticsearch Reindexing Guide | ||
|
||
The _reindex API in Elasticsearch allows you to copy documents from one index to another. This can be useful when you need to change the mappings of an index, upgrade Elasticsearch versions, or simply migrate data. | ||
This guide covers reindexing using two methods: | ||
|
||
1. Using the Kibana Dev Tools | ||
2. Using `curl` and Elasticsearch `reindex` API as per the script [here](../../bin/run-elastic-reindex.sh) | ||
|
||
### Reindexing with Kibana Dev Tools | ||
|
||
The Kibana Dev Tools provides an interactive environment to execute Elasticsearch queries and API commands | ||
|
||
#### Steps | ||
|
||
1. Open Kibana and navigate to Dev Tools > Console | ||
|
||
2. Use the following `POST` request to re-index documents from the source index to the destination index | ||
|
||
``` | ||
POST _reindex | ||
{ | ||
"source": { | ||
"index": "source_index_name" | ||
}, | ||
"dest": { | ||
"index": "dest_index_name", | ||
"op_type": "create" | ||
} | ||
} | ||
``` | ||
|
||
*** Replace `source_index_name` and `dest_index_name` with the names of your source and destination indices. | ||
|
||
*** Set `"op_type": "create"` to avoid overwriting existing documents in the destination index. To allow overwriting, use `"op_type": "index"`. | ||
|
||
3. Use the following GET request from the Kibana Dev Tools console to get the Re-indexing status | ||
|
||
``` | ||
GET _tasks/<task_id> | ||
``` | ||
|
||
### Reindexing with Curl & Elasticsearch Reindex API | ||
|
||
The Elasticsearch Reindex API provides for a REST endpoint to re-index documents. | ||
The bash script available [here](../../bin/run-elastic-reindex.sh) allows for re-indexing by performing the necessary checks and initiating the re-indexing process asynchronously. | ||
|
||
The script returns a `task ID` that can be used to monitor the reindexing status with the following curl command: | ||
|
||
``` | ||
curl -X GET "http://localhost:9200/_tasks/<task_id>" | ||
``` | ||
|
||
#### Reindexing Limited Number of Documents for Testing | ||
|
||
The Elasticsearch Reindex API provides for a `max_docs` argument to specify the maximum number of documents to reindex. | ||
|
||
``` | ||
{ | ||
"source": { | ||
"index": "mc_search-000002" | ||
}, | ||
"dest": { | ||
"index": "mc_search-000002-test", | ||
"op_type": "create" | ||
}, | ||
"max_docs": 10 | ||
} | ||
``` | ||
|
||
The [script](../../bin/run-elastic-reindex.sh) provides for an optional argument `-m` to specify the number of documents to re-index. | ||
|
||
#### Reindexing from multiple sources | ||
|
||
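Elasticsearch recommends reindexing one source index at a time when there are many indices to reindex from, rather than selecting them all in a single request, as referenced [here](https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-reindex.html#docs-reindex-from-multiple-sources). The script therefore issues a separate reindex request for each source index. | ||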
|
||
### Reindexing select documents with a query | ||
|
||
To reindex documents based on specific criteria, you can utilize the query parameter in the reindex request. This allows you to specify a query that filters the documents being reindexed. Here’s how you can structure your request: | ||
|
||
``` | ||
POST _reindex | ||
{ | ||
"source": { | ||
"index": "mc_search-000002", | ||
"query": { | ||
"match": { | ||
"canonical_domain": "mediacloud.org" | ||
} | ||
} | ||
}, | ||
"dest": { | ||
"index": "mc_search-000002-test" | ||
} | ||
} | ||
``` | ||
|
||
Example in the bash script [here](../../bin/run-elastic-reindex.sh) | ||
|
||
``` | ||
bin/run-elastic-reindex.sh -s mc_search-000003 mc_search-000004 -d reindexed -m 1000 -q '{ | ||
"match": { | ||
"canonical_domain": "okezone.com" | ||
} | ||
}' | ||
``` | ||
|
||
#### Slicing | ||
|
||
The Reindex API supports Sliced scroll to parallelize the [reindexing process](https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-reindex.html#docs-reindex-slice), thereby improving efficiency. | ||
We can perform slicing manually (providing the number of slices for each request) or automatically (letting Elasticsearch choose the number of slices to use). | ||
|
||
``` | ||
curl -s -X POST "$ES_HOST/_reindex?slices=auto&wait_for_completion=false" | ||
``` | ||
|
||
#### Throttling | ||
|
||
The Reindex API supports throttling during reindexing by setting the `requests_per_second` to throttle the rate at which `_reindex` issues batches of index operations. | ||
|
||
##### Rethrottling During Reindex | ||
|
||
Based on the cluster monitoring stats, you can adjust the throttling dynamically using the _rethrottle API. This allows us to manage the load to our cluster. | ||
|
||
``` | ||
POST _reindex/<task_id>/_rethrottle?requests_per_second=10 | ||
``` |