Skip to content

Commit

Permalink
Merge pull request #343 from thepsalmist/chore/ES_reindex_api
Browse files Browse the repository at this point in the history
chore: ES reindex implementation
  • Loading branch information
thepsalmist authored Nov 11, 2024
2 parents 161aa11 + eb92b85 commit 2320597
Show file tree
Hide file tree
Showing 2 changed files with 323 additions and 0 deletions.
195 changes: 195 additions & 0 deletions bin/run-elastic-reindex.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
#!/usr/bin/env bash
# Requires bash: this script relies on bash arrays (SOURCE_INDEXES), which POSIX sh implementations such as dash reject.

# Elasticsearch Reindexing Script
# This script performs a reindexing operation from a source index to a destination index in Elasticsearch,
# with the following checks:
# 1. Ensures Elasticsearch is running and reachable.
# 2. Verifies that the source index exists.
# 3. Checks that the destination index does not already exist.
# 4. Initiates the reindex task asynchronously and retrieves the task ID.
# 5. Logs output for each step and provides appropriate error handling.
#

# Usage:
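# Pass ES_HOST, SOURCE_INDEX, DEST_SUFFIX, MAX_DOCS and QUERY with the -e, -s, -d, -m and -q options (run with -h for details).
# Run this script with bash (it uses bash arrays), for example:
# bash run-elastic-reindex.sh -s SOURCE_INDEX... -d DEST_SUFFIX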
#

# Print the usage summary, an example invocation and the option reference to stdout.
# The usage line lists every option the getopts spec accepts (-h, -e, -s, -d, -m, -q),
# and the commands are shown with `bash` because the script depends on bash arrays.
display_help() {
  cat <<'EOF'
Elasticsearch Reindexing Script

Usage: bash run-elastic-reindex.sh [-h] [-e ES_HOST] -s SOURCE_INDEX... -d DEST_SUFFIX [-m MAX_DOCS] [-q QUERY]
Example: bash run-elastic-reindex.sh -e http://localhost:9200 -s mc_search-000001 mc_search-000002 -d reindexed -m 1000

Arguments:
 -h Show help information.
 -e ES_HOST Optional. The URL of the Elasticsearch host (default: http://localhost:9200).
 -s SOURCE_INDEX Required. One or more source indices to reindex from (space-separated).
 -d DEST_SUFFIX Required. Suffix for the destination index names.
 -m MAX_DOCS Optional. The maximum number of documents to re-index. Must be a positive integer.
 -q QUERY Optional. The query to reindex a sub-set of documents.

EOF
}

# Defaults for the script's settings; the option parser overrides them from the command line.
ES_HOST="http://localhost:9200" # -e: base URL of the Elasticsearch host
OP_TYPE="create"                # Operation type for reindex, could be `create` or `index`
SOURCE_INDEXES=()               # -s: array holding the source indices
DEST_SUFFIX=""                  # -d: suffix appended to each source index name to form the destination name
MAX_DOCS=""                     # -m: maximum number of documents to reindex
QUERY=""                        # -q: optional query; initialised so a QUERY inherited from the environment is never used

# Parse the command-line options into ES_HOST, SOURCE_INDEXES, DEST_SUFFIX, MAX_DOCS and QUERY.
# Arguments: the script's command-line arguments ("$@")
# Exits:     0 after printing help for -h; 1 on an unknown option or a missing option argument
#
# -s accepts several space-separated indices. The first one arrives in OPTARG; the
# following words are collected by walking OPTIND forward until the next word is
# empty or starts with "-". Nothing is shifted and OPTIND is never reset to 1, so the
# attached form (-sINDEX) cannot be re-parsed over and over.
parse_args() {
  local opt next
  local OPTIND=1 OPTARG=""

  while getopts ":he:s:d:m:q:" opt; do
    case "$opt" in
      h)
        display_help
        exit 0
        ;;
      e)
        ES_HOST=$OPTARG
        ;;
      s)
        # getopts would happily take the next option (e.g. "-d") as the value of -s.
        case "$OPTARG" in
          "" | -*)
            echo "Option -s requires an argument." >&2
            display_help
            exit 1
            ;;
        esac
        SOURCE_INDEXES+=("$OPTARG")
        while [ "$OPTIND" -le "$#" ]; do
          next=${!OPTIND} # the positional parameter getopts would look at next
          case "$next" in
            "" | -*) break ;;
          esac
          SOURCE_INDEXES+=("$next")
          OPTIND=$((OPTIND + 1))
        done
        ;;
      d)
        DEST_SUFFIX=$OPTARG
        ;;
      m)
        MAX_DOCS=$OPTARG
        ;;
      q)
        QUERY=$OPTARG
        ;;
      \?)
        echo "Invalid option: -$OPTARG" >&2
        display_help
        exit 1
        ;;
      :)
        echo "Option -$OPTARG requires an argument." >&2
        display_help
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

# At least one source index and a destination suffix are mandatory.
if [ "${#SOURCE_INDEXES[@]}" -eq 0 ] || [ -z "$DEST_SUFFIX" ]; then
  echo "Error: At least one SOURCE_INDEX and a DEST_SUFFIX are required." >&2
  display_help
  exit 5
fi

# MAX_DOCS is inserted verbatim into the JSON reindex body, so accept only a positive
# decimal integer without leading zeros (rejects "", "0", "010", "-1", "1e3", "abc").
if [ -n "$MAX_DOCS" ]; then
  case "$MAX_DOCS" in
    *[!0-9]* | 0*)
      echo "Error: MAX_DOCS must be a positive integer (got '$MAX_DOCS')." >&2
      display_help
      exit 5
      ;;
  esac
fi

# Verify that Elasticsearch answers at $ES_HOST.
# Globals: ES_HOST (read)
# Outputs: a confirmation on stdout, or an error message on stderr
# Exits:   1 unless the request to $ES_HOST yields HTTP status 200
check_es_alive() {
  local status
  status=$(curl -s -o /dev/null -w "%{http_code}" "$ES_HOST")
  # Compare as strings: with a numeric test an empty status (curl missing or crashed)
  # makes `[` itself fail, and the failure branch would be skipped.
  if [ "$status" != "200" ]; then
    echo "Error: Elasticsearch is not reachable. Status code: $status" >&2
    exit 1
  fi
  echo "Elasticsearch is up and running."
}

# Verify that every index in SOURCE_INDEXES exists.
# Globals: ES_HOST, SOURCE_INDEXES (read)
# Outputs: one confirmation per index on stdout, or an error message on stderr
# Exits:   2 at the first index whose GET request does not yield HTTP status 200
check_source_index_exists() {
  local index status
  for index in "${SOURCE_INDEXES[@]}"; do
    status=$(curl -s -o /dev/null -w "%{http_code}" "$ES_HOST/$index")
    # String comparison so that an empty status is handled as a failure instead of
    # tripping a numeric-test error and being reported as "exists".
    if [ "$status" != "200" ]; then
      echo "Error: Source index '$index' does not exist." >&2
      exit 2
    fi
    echo "Source index '$index' exists."
  done
}

# Verify that none of the destination indices ("<source>-<DEST_SUFFIX>") exists yet.
# Globals: ES_HOST, SOURCE_INDEXES, DEST_SUFFIX (read); DEST_INDEX (written)
# Outputs: one confirmation per destination on stdout, or an error message on stderr
# Exits:   3 when a destination already exists (HTTP 200) or when its absence cannot
#          be confirmed (any status other than 404)
check_dest_index_not_exists() {
  local index status
  for index in "${SOURCE_INDEXES[@]}"; do
    DEST_INDEX="${index}-${DEST_SUFFIX}"
    status=$(curl -s -o /dev/null -w "%{http_code}" "$ES_HOST/$DEST_INDEX")
    case "$status" in
      404)
        echo "Destination index '$DEST_INDEX' does not exist."
        ;;
      200)
        echo "Error: Destination index '$DEST_INDEX' already exists." >&2
        exit 3
        ;;
      *)
        # 401/403/5xx, 000 (no connection) or an empty status say nothing about
        # whether the index exists, so do not report it as absent.
        echo "Error: Could not verify that destination index '$DEST_INDEX' does not exist. Status code: $status" >&2
        exit 3
        ;;
    esac
  done
}

# Validate the optional QUERY through the Elasticsearch _validate/query endpoint.
# Does nothing when QUERY is empty.
# Globals: ES_HOST, QUERY (read)
# Outputs: the raw response and a confirmation on stdout, or an error message on stderr
# Exits:   6 when the response does not contain "valid": true
validate_query() {
  local response error_message

  [ -n "$QUERY" ] || return 0

  response=$(curl -s -X GET "$ES_HOST/_validate/query" \
    -H 'Content-Type: application/json' \
    -d "{\"query\": $QUERY}")

  echo "Response :$response"

  # POSIX ERE instead of the GNU-only \s and \| escapes; whitespace is tolerated on
  # both sides of the colon.
  if ! printf '%s\n' "$response" | grep -Eq '"valid"[[:space:]]*:[[:space:]]*true'; then
    # Best effort: show the first flat {...} object that follows "error", if any.
    error_message=$(printf '%s\n' "$response" | grep -o '"error":{[^}]*}' | cut -d':' -f2-)
    echo "Error: The provided query is not valid. Details: $error_message" >&2
    exit 6
  fi

  echo "Query validated successfully."
}

# From ES https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-reindex.html#docs-reindex-from-multiple-sources
# Indexing multiple sources
# Start one asynchronous _reindex task per source index and print its task ID.
# Globals: ES_HOST, SOURCE_INDEXES, DEST_SUFFIX, OP_TYPE, QUERY, MAX_DOCS (read);
#          DEST_INDEX (written)
# Exits:   4 when no task ID can be found in the response for an index
start_reindex() {
  local src payload endpoint reply tid

  for src in "${SOURCE_INDEXES[@]}"; do
    DEST_INDEX="${src}-${DEST_SUFFIX}"
    echo "Starting reindex from '$src' to '$DEST_INDEX'..."

    # Assemble the JSON request: source (plus the optional query), dest, optional max_docs.
    payload="{\"source\": {\"index\": \"$src\""
    if [ -n "$QUERY" ]; then
      payload="${payload}, \"query\": $QUERY"
    fi
    payload="${payload}}, \"dest\": {\"index\": \"$DEST_INDEX\", \"op_type\": \"$OP_TYPE\"}"
    if [ -n "$MAX_DOCS" ]; then
      payload="${payload}, \"max_docs\": $MAX_DOCS"
    fi
    payload="${payload}}"

    echo "Reindex body: $payload"

    # Slices are requested only when no document limit is set: slices work
    # independently of each other, so combined with max_docs the number of copied
    # documents could exceed the intended limit.
    if [ -n "$MAX_DOCS" ]; then
      endpoint="$ES_HOST/_reindex?wait_for_completion=false"
    else
      endpoint="$ES_HOST/_reindex?slices=auto&wait_for_completion=false"
    fi

    reply=$(curl -s -X POST "$endpoint" \
      -H 'Content-Type: application/json' \
      -d "$payload")

    # Pull the value of the "task" field out of the response.
    tid=$(echo "$reply" | grep -o '"task":"[^"]*"' | cut -d':' -f2- | tr -d '"')

    if [ -z "$tid" ]; then
      echo "Error: Failed to start reindexing task for index '$src'."
      exit 4
    fi

    echo "Reindexing task started for index '$src' with task ID: $tid"
  done
}

# Entry point: run the pre-flight checks in order, then start the reindex tasks.
# Each step terminates the script with its own exit code on failure, so reaching
# the final message means every step completed.
main() {
  check_es_alive
  check_source_index_exists
  check_dest_index_not_exists
  validate_query
  start_reindex

  echo "Reindexing script executed successfully."
}

main
exit 0
128 changes: 128 additions & 0 deletions doc/elasticsearch/elasticsearch_reindex.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
## Elasticsearch Reindexing Guide

The _reindex API in Elasticsearch allows you to copy documents from one index to another. This can be useful when you need to change the mappings of an index, upgrade Elasticsearch versions, or simply migrate data.
This guide covers reindexing using two methods:

1. Using the Kibana Dev Tools
2. Using `curl` and Elasticsearch `reindex` API as per the script [here](../../bin/run-elastic-reindex.sh)

### Reindexing with Kibana Dev Tools

The Kibana Dev Tools provides an interactive environment to execute Elasticsearch queries and API commands

#### Steps

1. Open Kibana and navigate to Dev Tools > Console

2. Use the following `POST` request to re-index documents from the source index to the destination index

```
POST _reindex
{
"source": {
"index": "source_index_name"
},
"dest": {
"index": "dest_index_name",
"op_type": "create"
}
}
```

*** Replace `source_index_name` and `dest_index_name` with the names of your source and destination indices.

*** Set `"op_type": "create"` to avoid overwriting existing documents in the destination index. To allow overwriting, use `"op_type": "index"`.

3. Use the following GET request from the Kibana Dev Tools console to get the Re-indexing status

```
GET _tasks/<task_id>
```

### Reindexing with Curl & Elasticsearch Reindex API

The Elasticsearch Reindex API provides for a REST endpoint to re-index documents.
The bash script available [here](../../bin/run-elastic-reindex.sh) performs the necessary checks and then initiates the re-indexing process asynchronously.

The script returns a `task ID` that can be used to monitor the Reindexing status via curl command

```
curl -X GET "http://localhost:9200/_tasks/<task_id>"
```

#### Reindexing Limited Number of Documents for Testing

The Elasticsearch Reindex API provides for a `max_docs` argument to specify the maximum number of documents to reindex.

```
{
"source": {
"index": "mc_search-000002"
},
"dest": {
"index": "mc_search-000002-test",
"op_type": "create"
},
"max_docs": 10
}
```

The [script](../../bin/run-elastic-reindex.sh) provides for an optional argument `-m` to specify the number of documents to re-index.

#### Reindexing from multiple sources

Elasticsearch recommends reindexing one source index at a time when there are many indices to reindex from, as referenced [here](https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-reindex.html#docs-reindex-from-multiple-sources). The script therefore starts a separate reindex task for each source index.

### Reindexing select documents with a query

To reindex documents based on specific criteria, you can utilize the query parameter in the reindex request. This allows you to specify a query that filters the documents being reindexed. Here’s how you can structure your request:

```
POST _reindex
{
"source": {
"index": "mc_search-000002",
"query": {
"match": {
"canonical-domain": "mediacloud.org"
}
}
},
"dest": {
"index": "mc_search-000002-test"
}
}
```

Example in the bash script [here](../../bin/run-elastic-reindex.sh)

```
bin/run-elastic-reindex.sh -s mc_search-000003 mc_search-000004 -d reindexed -m 1000 -q '{
"match": {
"canonical_domain": "okezone.com"
}
}'
```

#### Slicing

The Reindex API supports Sliced scroll to parallelize the [reindexing process](https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-reindex.html#docs-reindex-slice), thereby improving efficiency.
We can perform slicing manually (providing the number of slices for each request) or automatically (letting Elasticsearch choose the number of slices to use).

```
curl -s -X POST "$ES_HOST/_reindex?slices=auto&wait_for_completion=false"
```

#### Throttling

The Reindex API supports throttling during reindexing by setting the `requests_per_second` to throttle the rate at which `_reindex` issues batches of index operations.

##### Rethrottling During Reindex

Based on the cluster monitoring stats, you can adjust the throttling dynamically using the _rethrottle API. This allows us to manage the load to our cluster.

```
POST _reindex/<task_id>/_rethrottle?requests_per_second=10
```

0 comments on commit 2320597

Please sign in to comment.