Skip to content
Draft
15 changes: 15 additions & 0 deletions index-recovery-tests.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Harder-to-Reproduce Index Recovery Performance Results

I tested recovery with 1 shard and with 12 shards, each roughly 20 GB in size. That data volume makes it a bit challenging to package nicely as a reproducible benchmark, although I am sure it can be done.
I am confident you can reproduce this behavior with a comparable amount of data and a similar cluster layout. I can share the scripts I used to achieve these results if that is helpful.

## Results Summary

| Scenario | Shards | Configuration | Result | Time |
|----------|--------|---------------|--------|------|
| HTTP/2 | 1 | default | Fast | ~40s |
| HTTP/1 | 1 | default | Fast | ~50s |
| HTTP/1 | 12 | default | Fast | ~90s |
| HTTP/2 | 12 | default | Slowest | ~320s |
| HTTP/2 | 12 | `maxConcurrentStreams=1`| Slower | ~180s |

154 changes: 154 additions & 0 deletions scripts/add-replicas.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#!/bin/bash
#
# /*
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements. See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License. You may obtain a copy of the License at
# *
# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
#

# =============================================================================
# Script to add replicas to a target node
#
# Ensures that COLLECTION has shards shard1..shardCOUNT and that each of them
# has a replica on TARGET_NODE. If the collection does not exist yet, it is
# created on TARGET_NODE with COUNT shards and one replica per shard.
#
# Usage:
#   ./add-replicas.sh [SOLR_URL] [COLLECTION] [TARGET_NODE] [COUNT] [TYPE]
#
# Arguments (all optional, positional):
#   SOLR_URL     Base Solr URL            (default: http://localhost:8983/solr)
#   COLLECTION   Collection name          (default: test)
#   TARGET_NODE  Solr node name           (default: solr2:8983_solr)
#   COUNT        Number of shards         (default: 12)
#   TYPE         Replica type to add      (default: TLOG)
#
# Example:
#   ./add-replicas.sh http://localhost:8983/solr test solr2:8983_solr 12 TLOG
#   ./add-replicas.sh http://localhost:8983/solr test solr2:8983_solr 1 NRT
#
# Requires: curl, jq
# Exit status: 0 on success, non-zero on any error (details on stderr).
# =============================================================================

set -euo pipefail

SOLR_URL="${1:-http://localhost:8983/solr}"
COLLECTION="${2:-test}"
TARGET_NODE="${3:-solr2:8983_solr}"
NUM_SHARDS="${4:-12}"
TYPE="${5:-TLOG}"

# Prints a message to stderr and aborts the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Issues a GET against the Collections API.
# Globals:   SOLR_URL (read)
# Arguments: $1   - label used as the prefix of the error message
#            $2.. - key=value query parameters; curl URL-encodes each value
# Outputs:   response body on stdout when Solr answers HTTP 200;
#            otherwise "<label>: HTTP <code>" and the body on stderr
# Returns:   0 on HTTP 200, 1 otherwise
#######################################
collections_api() {
  local label=$1
  shift
  local -a curl_args=(-s -w '\n%{http_code}' --get)
  local param raw http_code body

  for param in "$@"; do
    curl_args+=(--data-urlencode "$param")
  done

  if ! raw=$(curl "${curl_args[@]}" "$SOLR_URL/admin/collections"); then
    printf '%s: could not reach Solr at %s\n' "$label" "$SOLR_URL" >&2
    return 1
  fi

  # curl appended "\n<status>" after the body; split on the last newline.
  # (Parameter expansion instead of `head -n -1`, which is GNU-only.)
  http_code=${raw##*$'\n'}
  body=${raw%$'\n'*}

  if [[ "$http_code" != "200" ]]; then
    printf '%s: HTTP %s\n' "$label" "$http_code" >&2
    printf '%s\n' "$body" >&2
    return 1
  fi

  printf '%s\n' "$body"
}

# NUM_SHARDS is used in an arithmetic loop below, where bash would evaluate an
# arbitrary expression; accept nothing but a plain positive integer.
[[ "$NUM_SHARDS" =~ ^[1-9][0-9]*$ ]] \
  || die "Error: COUNT must be a positive integer (got '$NUM_SHARDS')."

for tool in curl jq; do
  command -v "$tool" >/dev/null 2>&1 \
    || die "Error: '$tool' is required but was not found in PATH."
done

echo "Ensuring $NUM_SHARDS shards with 1 replica of type $TYPE on $TARGET_NODE for collection $COLLECTION"

# Fetch cluster status
echo "Fetching cluster status from $SOLR_URL..."
cluster_status=$(curl -s "$SOLR_URL/admin/collections?action=CLUSTERSTATUS") \
  || die "Error: could not reach Solr at $SOLR_URL."

# Validate JSON response
if ! jq -e . >/dev/null 2>&1 <<<"$cluster_status"; then
  echo "Error: Invalid JSON response from Solr." >&2
  echo "Response: $cluster_status" >&2
  exit 1
fi

# Check if collection exists. Names are passed to jq with --arg so that quotes
# or backslashes in them cannot alter the filter.
if jq -e --arg c "$COLLECTION" '.cluster.collections[$c] == null' \
  >/dev/null <<<"$cluster_status"; then
  echo "Collection '$COLLECTION' not found."
  echo "Creating collection '$COLLECTION' with $NUM_SHARDS shards..."

  # Determine replica types for CREATE
  # prioritizing TLOG if requested
  create_params=("action=CREATE" "name=$COLLECTION" "numShards=$NUM_SHARDS")
  case "$TYPE" in
    TLOG) create_params+=("nrtReplicas=0" "tlogReplicas=1") ;;
    PULL) create_params+=("nrtReplicas=0" "pullReplicas=1") ;;
    *) create_params+=("replicationFactor=1") ;;
  esac

  # Create collection targeted at the node to ensure initial replicas are there
  create_params+=("createNodeSet=$TARGET_NODE")
  collections_api "Error creating collection" "${create_params[@]}" \
    >/dev/null || exit 1

  echo "Collection created successfully."

  # We are done since CREATE with createNodeSet puts them there
  exit 0
fi

echo "Collection '$COLLECTION' exists. Checking shards..."

# The status fetched above is still current: nothing has been modified since.

# Iterate through expected shards 1..NUM_SHARDS
for ((i = 1; i <= NUM_SHARDS; i++)); do
  shard_name="shard${i}"

  # Check if shard exists
  shard_exists=$(jq -r --arg c "$COLLECTION" --arg s "$shard_name" \
    '.cluster.collections[$c].shards[$s] // empty' <<<"$cluster_status")

  if [[ -z "$shard_exists" ]]; then
    echo " $shard_name does not exist. Creating..."

    # CREATESHARD doesn't take type params easily for the new replica, it
    # usually uses collection defaults. With createNodeSet it creates a
    # replica on that node, but the replica TYPE is not verified here.
    # We might need to ensure the type is correct if default isn't TLOG;
    # for now this relies on collection settings or a manual add later.
    collections_api " Error creating shard" \
      "action=CREATESHARD" \
      "collection=$COLLECTION" \
      "shard=$shard_name" \
      "createNodeSet=$TARGET_NODE" >/dev/null || exit 1

    echo " $shard_name created."
  else
    # Shard exists, check for replica on TARGET_NODE.
    # `// {}` keeps jq from failing when the shard has no replicas object.
    replicas_on_node=$(jq -r \
      --arg c "$COLLECTION" --arg s "$shard_name" --arg n "$TARGET_NODE" \
      '(.cluster.collections[$c].shards[$s].replicas // {})
       | to_entries[]
       | select(.value.node_name == $n)
       | .key' <<<"$cluster_status")

    if [[ -z "$replicas_on_node" ]]; then
      echo " $shard_name exists but has no replica on $TARGET_NODE. Adding $TYPE replica..."

      collections_api " Error adding replica" \
        "action=ADDREPLICA" \
        "collection=$COLLECTION" \
        "shard=$shard_name" \
        "node=$TARGET_NODE" \
        "type=$TYPE" >/dev/null || exit 1

      echo " Replica added."
    else
      echo " $shard_name already has replica on $TARGET_NODE. Skipping."
    fi
  fi
done

echo ""
echo "========================================="
echo "Configuration complete!"
echo "Collection: $COLLECTION"
echo "Target node: $TARGET_NODE"
echo "Shards checked: $NUM_SHARDS"
echo "========================================="

150 changes: 150 additions & 0 deletions scripts/cycle-replicas.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#!/bin/bash
#
# /*
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements. See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License. You may obtain a copy of the License at
# *
# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
#

# =============================================================================
# Script to remove all replicas from a node and then add them back
#
# Phase 1 delegates removal to delete-replicas.sh (expected in the same
# directory as this script). Phase 2 submits one asynchronous ADDREPLICA per
# affected shard and then polls REQUESTSTATUS until every request finishes.
#
# Usage:
#   ./cycle-replicas.sh [SOLR_URL] [COLLECTION] [TARGET_NODE] [TYPE]
#
# Arguments (all optional, positional):
#   SOLR_URL     Base Solr URL            (default: http://localhost:8983/solr)
#   COLLECTION   Collection name          (default: test)
#   TARGET_NODE  Solr node name           (default: solr2:8983_solr)
#   TYPE         Replica type to re-add   (default: TLOG)
#
# Example:
#   ./cycle-replicas.sh http://localhost:8983/solr test solr2:8983_solr
#
# Requires: curl, jq
# Exit status: 0 on success (or when the node holds no replicas of the
#              collection), non-zero on any error (details on stderr).
# =============================================================================

set -euo pipefail

SOLR_URL="${1:-http://localhost:8983/solr}"
COLLECTION="${2:-test}"
TARGET_NODE="${3:-solr2:8983_solr}"
TYPE="${4:-TLOG}"

# Prints a message to stderr and aborts the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Issues a GET against the Collections API.
# Globals:   SOLR_URL (read)
# Arguments: $1   - label used as the prefix of the error message
#            $2.. - key=value query parameters; curl URL-encodes each value
# Outputs:   response body on stdout when Solr answers HTTP 200;
#            otherwise "<label>: HTTP <code>" and the body on stderr
# Returns:   0 on HTTP 200, 1 otherwise
#######################################
collections_api() {
  local label=$1
  shift
  local -a curl_args=(-s -w '\n%{http_code}' --get)
  local param raw http_code body

  for param in "$@"; do
    curl_args+=(--data-urlencode "$param")
  done

  if ! raw=$(curl "${curl_args[@]}" "$SOLR_URL/admin/collections"); then
    printf '%s: could not reach Solr at %s\n' "$label" "$SOLR_URL" >&2
    return 1
  fi

  # curl appended "\n<status>" after the body; split on the last newline.
  # (Parameter expansion instead of `head -n -1`, which is GNU-only.)
  http_code=${raw##*$'\n'}
  body=${raw%$'\n'*}

  if [[ "$http_code" != "200" ]]; then
    printf '%s: HTTP %s\n' "$label" "$http_code" >&2
    printf '%s\n' "$body" >&2
    return 1
  fi

  printf '%s\n' "$body"
}

for tool in curl jq; do
  command -v "$tool" >/dev/null 2>&1 \
    || die "Error: '$tool' is required but was not found in PATH."
done

echo "Cycling replicas on $TARGET_NODE for collection $COLLECTION"
echo ""

# Get cluster status
cluster_status=$(curl -s "$SOLR_URL/admin/collections?action=CLUSTERSTATUS") \
  || die "Error: could not reach Solr at $SOLR_URL."

# Fail with a readable message (instead of a raw jq error) when the response
# is not JSON or does not contain the collection.
if ! jq -e --arg c "$COLLECTION" '.cluster.collections[$c] != null' \
  >/dev/null 2>&1 <<<"$cluster_status"; then
  die "Error: could not find collection '$COLLECTION' in the cluster status returned by $SOLR_URL."
fi

# Find all replicas on the target node
# Format: shard_name:replica_name
# Names are passed with --arg so quotes in them cannot alter the jq filter.
replicas_on_node=$(jq -r --arg c "$COLLECTION" --arg n "$TARGET_NODE" '
  (.cluster.collections[$c].shards // {}) | to_entries[]
  | .key as $shard
  | (.value.replicas // {}) | to_entries[]
  | select(.value.node_name == $n)
  | "\($shard):\(.key)"
' <<<"$cluster_status")

if [[ -z "$replicas_on_node" ]]; then
  echo "No replicas found on $TARGET_NODE for collection $COLLECTION"
  exit 0
fi

# Get list of shards that have replicas on target node (deduplicated).
shards_on_node=()
while IFS= read -r shard; do
  shards_on_node+=("$shard")
done < <(cut -d: -f1 <<<"$replicas_on_node" | sort -u)

echo "Found replicas on $TARGET_NODE:"
echo "$replicas_on_node"
echo ""

# =========================================
# PHASE 1: Remove all replicas from node
# =========================================
echo "========================================="
echo "PHASE 1: Removing replicas from $TARGET_NODE"
echo "========================================="

# Get the directory of the current script
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Call the delete-replicas subscript. The trailing "2" is passed through
# verbatim; its meaning is defined by delete-replicas.sh.
"$SCRIPT_DIR/delete-replicas.sh" "$SOLR_URL" "$COLLECTION" "$TARGET_NODE" 2

echo ""
echo "All replicas removed from $TARGET_NODE"
echo ""

# =========================================
# PHASE 2: Add replicas back to node
# =========================================
echo "========================================="
echo "PHASE 2: Adding replicas back to $TARGET_NODE (async)"
echo "========================================="

async_ids=()
timestamp=$(date +%s)

for shard in "${shards_on_node[@]}"; do
  echo "Adding $TYPE replica for $shard on $TARGET_NODE..."

  async_id="${COLLECTION}_${shard}_add_${timestamp}"

  # Delete any existing async status with this ID (best effort: errors are
  # deliberately ignored, the ID normally does not exist yet).
  curl -s --get \
    --data-urlencode "action=DELETESTATUS" \
    --data-urlencode "requestid=$async_id" \
    "$SOLR_URL/admin/collections" >/dev/null 2>&1 || true

  collections_api "Error" \
    "action=ADDREPLICA" \
    "collection=$COLLECTION" \
    "shard=$shard" \
    "node=$TARGET_NODE" \
    "type=$TYPE" \
    "async=$async_id" >/dev/null || exit 1

  async_ids+=("$async_id")
  echo " Submitted (async id: $async_id)"
done

echo ""
echo "Waiting for async operations to complete..."

# Wait for all async operations to complete
for async_id in "${async_ids[@]}"; do
  echo "Checking status of $async_id..."

  while true; do
    status_response=$(collections_api " $async_id: status request failed" \
      "action=REQUESTSTATUS" "requestid=$async_id") || exit 1
    state=$(jq -r '.status.state // empty' <<<"$status_response") \
      || die " $async_id: could not parse REQUESTSTATUS response"

    case "$state" in
      completed)
        echo " $async_id: completed"
        # Clean up the async request (best effort; the work itself is done).
        curl -s --get \
          --data-urlencode "action=DELETESTATUS" \
          --data-urlencode "requestid=$async_id" \
          "$SOLR_URL/admin/collections" >/dev/null \
          || echo " Warning: could not delete status of $async_id" >&2
        break
        ;;
      failed)
        echo " $async_id: FAILED" >&2
        jq '.status' <<<"$status_response" >&2
        exit 1
        ;;
      notfound | "")
        # Solr no longer knows the request (or returned no state at all);
        # polling again could never succeed, so stop instead of looping forever.
        echo " $async_id: request status unavailable (state: '${state:-none}')" >&2
        printf '%s\n' "$status_response" >&2
        exit 1
        ;;
      *)
        echo " $async_id: $state (waiting...)"
        sleep 2
        ;;
    esac
  done
done

echo ""
echo "========================================="
echo "Replica cycling complete!"
echo "Collection: $COLLECTION"
echo "Node: $TARGET_NODE"
echo "Shards cycled: ${#shards_on_node[@]}"
echo "========================================="

Loading