Skip to content
This repository has been archived by the owner on Jan 9, 2025. It is now read-only.

Slots I'm trying to restore are also missing in the upstream bigtable #19

Open
dzervas opened this issue Dec 5, 2024 · 0 comments

Comments

@dzervas
Copy link

dzervas commented Dec 5, 2024

Hello,

I have created a script to constantly check for missing slots and re-upload them if they're missing (script at the bottom).

The problem is that some missing blocks can't be found on the upstream bigtable either (I use mainnet-beta-ledger-us-ny5) although they're accessible through the API.

A particular example is slot 299436809. It should be within ledger 298893770 but when I try to re-upload the said slot from that ledger, it reports that it can't find that slot.

I also saw some very weird behavior regarding the bounds.txt file inside each ledger. There were cases where ledgers included slots that they shouldn't (e.g. ledger 123 having slot 200 while ledger 150 exists, which breaks the assumption that the upper bound of a ledger is the next ledger).

Bigtable filler script

solana-bigtable-filler.sh

#!/bin/bash
# Global configuration for the bigtable filler. Everything is exported so the
# functions below (and any child process) can read it.
export LOG_PREFIX="solana-bigtable-filler"
# Quoted so the helpers are still found when the script path contains spaces
source "$(dirname "$0")/_helpers.sh"

# Could be mainnet-beta-ledger-us-ny5,mainnet-beta-ledger-europe-fr2 or mainnet-beta-ledger-asia-sg1
export BUCKET_NAME="mainnet-beta-ledger-us-ny5"
# Numeric top-level directories of the bucket (the ledger IDs), highest first
export LEDGERS=$(gsutil ls "gs://$BUCKET_NAME" | sed -E 's#gs://.*/([0-9]+)/#\1#' | grep -v '^gs://' | sort -rh)
export LEDGERS_PATH=${LEDGERS_PATH:-./ledgers}
# 500k is the upper limit of the confirmed blocks we can request
# NOTE(review): the value below is 50k, not 500k - confirm which one is intended
export REQ_LIMIT=50000
export FINISHED=0
# Template for the getBlocks request; <start> and <end> are substituted by get_confirmed
export REQUEST_DATA='{"method":"getBlocks","jsonrpc":"2.0","params":[<start>,<end>],"id":"1"}'
export TMP_CONFIRMED="/tmp/solana-confirmed.txt"
# Avoid logging info messages from solana-ledger-tool
export RUST_LOG=warn
# Get the current block height
# (the jq filter must stay on a single line: '.result')
export HEIGHT="$(_curl -X POST https://api.mainnet-beta.solana.com/ --data '{"method":"getSlot","jsonrpc":"2.0","id":"1"}' | jq -r '.result')"
# Start from the top of the chain and go down
export SLOT=$(($HEIGHT - 1))

# Any non-zero value turns downloads and uploads into log-only dry runs
export DEBUG=${DEBUG:-0}

# Download the rocksdb archive of a ledger from the bucket and extract it
# under $LEDGERS_PATH/<ledger>.
#
# Globals:   BUCKET_NAME, LEDGERS_PATH, DEBUG (read)
# Arguments: $1 - ledger ID (name of the ledger directory in the bucket)
# Returns:   0 when the ledger is already extracted, was extracted now, or
#            DEBUG is non-zero (dry run); non-zero when version.txt is missing
#            or the download/extract pipeline fails.
#            Exits the script with 1 on an unknown archive extension.
function download_ledger() {
    local ledger=$1

    # Already extracted: nothing to do, and no need to query the bucket at all
    if [ -d "$LEDGERS_PATH/$ledger/rocksdb" ]; then
        return 0
    fi

    # Get the version of the slot
    local solana_version
    solana_version=$(gsutil cat "gs://$BUCKET_NAME/$ledger/version.txt" 2>/dev/null || echo "")
    if test -z "$solana_version"; then
        log error "Ledger $ledger has no version.txt, skipping"
        # Report the failure so the caller's "|| ..." branch can actually skip this ledger
        return 1
    fi

    # Find the path of the rocksdb.tar file
    # ("|| true": an empty listing is handled by the extension check below)
    local remote_path
    remote_path=$(gsutil ls "gs://$BUCKET_NAME/$ledger/" | grep rocksdb.tar || true)
    # Everything after the last dot
    local extension=${remote_path##*.}
    # At some point the ledger was compressed with lbzip2, then with zstd
    local decompressor=""

    log "Downloading & extracting ledger $ledger - needs version $solana_version"
    mkdir -p "$LEDGERS_PATH/$ledger"

    case $extension in
        bz2)
            decompressor="lbzip2"
            ;;
        zst)
            decompressor="unzstd"
            ;;
        *)
            log error "Unknown extension $extension"
            exit 1
            ;;
    esac

    if test "$DEBUG" -ne 0; then
        log "Would download $remote_path to $LEDGERS_PATH/$ledger"
        return 0
    fi

    # gsutil sometimes fails with "file too large" but it makes no sense,
    # so the archive is streamed over HTTPS straight into tar instead
    # gsutil cp "$remote_path" $download_path
    curl -s -L -C - "https://storage.googleapis.com/$BUCKET_NAME/$ledger/${remote_path##*/}" | \
        tar -I "$decompressor" -xf - -C "$LEDGERS_PATH/$ledger"
}

# Re-upload a single slot from a locally extracted ledger to the bigtable.
#
# Globals:   DEBUG, LEDGERS_PATH (read)
# Arguments: $1 - ledger ID whose extracted copy lives in $LEDGERS_PATH/<ledger>
#            $2 - slot to upload
# Returns:   0 even when the upload fails; the failure is only logged so the
#            caller can carry on with the next slot.
function upload_slot() {
    local ledger=$1
    local slot="$2"
    # local skip="false"

    # Dry run: only report what would be uploaded
    if test "$DEBUG" -ne 0; then
        log "Would upload $slot from $LEDGERS_PATH/$ledger"
        return
    fi

    log "Uploading slot $slot"

    # Get the blocks from the slot of the ledger
    # local blocks=$(solana-ledger-tool slot $slot -l "$LEDGERS_PATH/$ledger" --force-update-to-open || skip="true")

    # if test "$skip" = "true"; then
    #   log error "Something went wrong with slot $slot, skipping"
    #   return
    # fi

    # Re-upload the ledger to the bigtable
    # TODO: Use the correct ledger tool version
    # The same slot is passed as both the start and the end of the upload range
    solana-ledger-tool bigtable upload "$slot" "$slot" -l "$LEDGERS_PATH/$ledger" --force-update-to-open --force \
        || log error "Failed to upload slot $slot, skipping"
}

# Print the slots between two bounds that the public RPC endpoint reports via
# getBlocks, one per line.
#
# Globals:   REQUEST_DATA (read) - request template with <start>/<end> placeholders
# Arguments: $1 - first slot of the range
#            $2 - last slot of the range
# Outputs:   the entries of the response's .result array on stdout
# Exits:     with 1 when the response carries an error message (or cannot be parsed)
function get_confirmed() {
    # Many blocks are just skipped by solana so we get the list from the upstream to filter the output
    local req=${REQUEST_DATA//<start>/$1}
    req=${req//<end>/$2}

    log "Downloading confirmed blocks $1-$2"
    # $req must be quoted: it contains "[start,end]", which the shell would
    # otherwise treat as a glob pattern (and split on any whitespace)
    local resp="$(_curl -X POST https://api.mainnet-beta.solana.com/ --data "$req")"
    # jq prints the literal string "null" when the response has no .error.message
    local error=$(jq -r '.error.message' <<< "$resp" || echo "")

    if test "$error" != "null"; then
        log error "Failed to get confirmed blocks: $error"
        exit 1
    fi

    jq '.result | .[]' <<< "$resp"
}

# Find the ledger that contains each slot, download it and re-upload the slots.
#
# A slot is attributed to a ledger when: ledger ID < slot <= upper bound, where
# the upper bound is the previously visited ledger ID (twice the ledger ID for
# the very first ledger). This relies on $LEDGERS being in descending order.
#
# Globals:   LEDGERS (read) - whitespace separated ledger IDs
#            LEDGERS_PATH (read) - where ledgers are extracted; cleaned per ledger
# Arguments: $1 - whitespace separated list of slots to restore
function find_ledgers() {
    local target_slots=$1
    local ledger slot
    # Sort the slots in descending order
    target_slots=$(echo $target_slots | tr ' ' '\n' | sort -rn | tr '\n' ' ')

    log "Looking for $(echo $target_slots | wc -w) slots"

    local upper_bound=""
    local slots_in_ledger remaining_slots

    for ledger in $LEDGERS; do
        # If the upper bound is not set, set it to the double of the current ledger
        if test -z "$upper_bound"; then
            upper_bound=$((ledger * 2))
        fi

        # Split the pending slots in one pass:
        #   slot <= ledger          -> may belong to an older ledger, keep for later
        #   ledger < slot <= upper  -> belongs to this ledger
        #   slot > upper            -> not covered by this or any older ledger
        slots_in_ledger=""
        remaining_slots=""
        for slot in $target_slots; do
            if test "$slot" -le "$ledger"; then
                remaining_slots="$remaining_slots $slot"
            elif test "$slot" -le "$upper_bound"; then
                slots_in_ledger="$slots_in_ledger $slot"
            fi
        done

        upper_bound=$ledger

        if test -z "$slots_in_ledger"; then
            continue
        fi

        # An "if" instead of "|| ( ... && return)": the parenthesised form ran
        # in a subshell, so the ledger was never actually skipped
        if ! download_ledger "$ledger"; then
            log error "Failed to download ledger $ledger, skipping"
            # Do not leave a partially extracted ledger behind
            rm -rf "${LEDGERS_PATH:?}/$ledger"
            continue
        fi

        for slot in $slots_in_ledger; do
            upload_slot "$ledger" "$slot"
        done

        # Remove handled slots from the list and slots bigger than the last slot in the ledger
        target_slots=$remaining_slots

        log "Finished processing slots, cleaning up ledger $ledger, remaining slots: $(echo $target_slots | wc -w)"
        # ":?" aborts instead of deleting "/<ledger>" should LEDGERS_PATH be empty
        rm -rf "${LEDGERS_PATH:?}/$ledger"

        if test -z "${target_slots// /}"; then
            log "All slots have been processed, exiting!"
            break
        fi
    done
}

# Collect slots to restore and hand them to find_ledgers.
#
# Automatic mode (limit <= 0): walks down from $up_to in windows of $REQ_LIMIT,
# lists the blocks of our bigtable, computes the gaps with missing-slots.jq and
# filters them against the list returned by get_confirmed.
# Manual mode (limit > 0): every slot in [up_to - limit, up_to] is re-uploaded.
#
# Globals:   SLOT, REQ_LIMIT, TMP_CONFIRMED, LAST_BLOCK (read)
# Arguments: $1 - highest slot to look at (default: $SLOT)
#            $2 - manual mode range size (default: 0 = automatic mode)
# Outputs:   "FINISHED" on stdout once the automatic walk reached the bottom
function fill_slots() {
    local up_to=${1:-$SLOT}
    local limit=${2:-0}
    local timer_start=$(date +%s)
    local missing_slots=""
    # Only set by automatic mode; stays empty in manual mode so the final
    # check can tell the two apart instead of failing on an unset variable
    local slots_len=""

    if test "$limit" -le 0; then
        local req_limit=$REQ_LIMIT
        log "Reading slots 0-$up_to"

        while true; do
            # Get the list of our slots in the range
            # ("local x=$(...)" is kept on purpose: it keeps a SIGPIPE caused
            # by head from aborting the script under "set -e -o pipefail")
            local slots=$(solana-ledger-tool --output json-compact bigtable blocks $(($up_to - $req_limit)) "$req_limit" | head -n1)
            slots_len=$(echo "$slots" | jq -r '. | length' || true)
            # Unparsable output counts as "no slots"
            slots_len=${slots_len:-0}

            if test "$slots_len" -eq 0; then
                log error "No slots found"
                break
            fi

            # "$slots" is quoted: a JSON array like [1,2] is also a glob pattern
            local slot_start=$(echo "$slots" | jq -r ".[0]")
            local slot_end=$(echo "$slots" | jq -r ".[$(($slots_len - 1))]")
            get_confirmed "$slot_start" "$slot_end" > "$TMP_CONFIRMED"

            # Find the difference between the upstream confirmed blocks vs our blocks
            # It uses a jq script to filter out the blocks that are already confirmed
            # NOTE(review): "-v" keeps the gaps that are NOT listed in
            # $TMP_CONFIRMED - confirm this is the intended direction
            local missing_slots_limit=$(echo "$slots" | jq -r -f ./missing-slots.jq | _grep -v -F -f "$TMP_CONFIRMED")
            local missing_slots_limit_len=$(echo $missing_slots_limit | wc -w)
            rm -f "$TMP_CONFIRMED"

            log "Found $missing_slots_limit_len missing slots"

            # Append them to the list of all missing slots
            if test "$missing_slots_limit_len" -gt 0; then
                missing_slots="$missing_slots $missing_slots_limit"
            fi

            up_to=$(($up_to - $req_limit))

            if test "$slots_len" -ne "$limit" && test "$up_to" -lt 100000; then
                log "Reached the limit of $limit blocks"
                break
            fi
        done
    else
        log "Reading slots $(($up_to - $limit))-$up_to"
        missing_slots=$(seq $(($up_to - $limit)) "$up_to")
    fi

    if test -z "$missing_slots"; then
        log "No missing slots found"
        return
    fi

    # Iterate all the ledgers while filtering for the missing slots
    find_ledgers "$missing_slots"

    local timer_end=$(date +%s)
    log debug "Iteration took $(($timer_end - $timer_start))s"

    # Manual mode never reaches "the end of the blocks" (slots_len is empty there)
    if test -n "$slots_len" && test "$slots_len" -ne "$limit" && test "$up_to" -lt 100000; then
        log "Reached the end of the blocks with last block: $LAST_BLOCK"
        echo "FINISHED"
    fi
}

# Entry point: make sure the extraction directory exists, then run either for
# the single slot given as the first argument or for the whole history.
# Quoted so a LEDGERS_PATH containing spaces yields one directory, not several
mkdir -p "$LEDGERS_PATH"

if test -n "$1"; then
    log "Manual mode - checking slot $1"
    # A limit of 1 makes fill_slots cover the slots $1-1 through $1
    fill_slots "$1" 1
else
    log "Automatic mode - checking all slots in history"
    fill_slots
fi

missing-slots.jq

# Emits the numbers missing between neighbouring elements of the input array,
# one per output: [1, 2, 4, 7] yields 3, 5, 6.
# keep the input array around as $in
. as $in
# the value of the final element; elements equal to it have no successor gap
| ($in | last) as $tail
| [
    # visit every index of the input
    range(0; length) as $i
    # skip elements that equal the last value
    | select($in[$i] != $tail)
    # numbers strictly between this element and the next one
    # (produces nothing when the two are consecutive)
    | range($in[$i] + 1; $in[$i + 1])
  ]
# do not output a json array, just the elements
| .[]

_helpers.sh

#!/bin/bash
# Strict mode for every script sourcing this file:
#   errexit  - stop at the first failing command
#   errtrace - functions and subshells inherit the ERR trap
#   pipefail - a pipeline fails when any of its stages fails
set -o errexit -o errtrace -o pipefail
# Report the failing command, its line and its exit code through log()
trap 'log error "Command at line \"$LINENO\" failed with error code \"$?\": \"$BASH_COMMAND\""' ERR

# Write a timestamped, prefixed message to stderr.
#
# Globals:   LOG_PREFIX (read)
# Arguments: either "<level> <message>" with level one of debug|info|error,
#            or just "<message>", which is logged like the info level
function log() {
    local marker
    local message="$2"

    if [[ "$1" == "debug" ]]; then
        marker="d"
    elif [[ "$1" == "error" ]]; then
        marker="x"
    elif [[ "$1" == "info" ]]; then
        marker="+"
    else
        # No recognised level: the first argument is the message itself
        marker="+"
        message="$1"
    fi

    echo "[$marker] [$LOG_PREFIX] [$(date)] $message" >&2
}

# grep wrapper that treats "no lines selected" as success.
# Arguments: passed to grep unchanged
# Returns:   0 when grep exits with 0 or 1, 1 for any other grep exit status
function _grep() {
    local status=0
    # Exit status 1 only means "no matches"; it must not kill the script (due to `set -e`)
    grep "$@" || status=$?
    [ "$status" -eq 0 ] || [ "$status" -eq 1 ]
}

# curl wrapper for JSON requests that backs off when the rate limit is nearly
# used up.
#
# Arguments: passed to curl unchanged, after the common options
# Outputs:   the response body on stdout; the response headers are dumped to
#            /tmp/solana-bigtable-resp-headers
function _curl() {
    local headers_file=/tmp/solana-bigtable-resp-headers

    # "$@" is quoted so arguments such as a JSON body keep their spaces and
    # are not subject to globbing
    curl -s -H "Content-Type: application/json" -D "$headers_file" "$@"

    # Last occurrence of the header; empty when the server did not send it
    local method_limit=$(grep -i "x-ratelimit-method-remaining" "$headers_file" | tail -n1 | awk '{print $2}' | tr -d '\r')
    # Only compare when a number was actually found; an empty or malformed
    # value would otherwise make the numeric test fail with an error
    if [[ "$method_limit" =~ ^[0-9]+$ ]] && test "$method_limit" -lt 5; then
        log "Rate limit reached, sleeping for 10s"
        sleep 10
    fi
}
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant