Skip to content

Commit

Permalink
Merge branch 'main' into type_conversion_ham_distance
Browse files Browse the repository at this point in the history
  • Loading branch information
therealdarkknight authored Oct 12, 2023
2 parents b000930 + 9e3e17b commit 099d476
Show file tree
Hide file tree
Showing 51 changed files with 1,047 additions and 157 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/benchmark-linux.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ on:
- dev

jobs:
ubuntu-build:
runs-on: ubuntu-22.04
benchmark:
runs-on: self-hosted

steps:
- uses: actions/checkout@v3
Expand Down
18 changes: 16 additions & 2 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ jobs:
- name: Create source code archive with submodules
if: ${{ github.event_name == 'workflow_dispatch' && inputs.create_release }}
run: |
find ./ -name '.git*' -exec rm -rv {} \; || true
find ./ -name '.git*' -exec rm -r {} \; || true
tar -czf /tmp/lantern-v${{ steps.package.outputs.package_version }}-source.tar.gz .
- uses: geekyeggo/delete-artifact@v2
with:
Expand All @@ -81,9 +81,23 @@ jobs:
id: create_release
if: ${{ github.event_name == 'workflow_dispatch' && inputs.create_release }}
with:
name: LanternDB v${{ steps.package.outputs.package_version }}
name: Lantern v${{ steps.package.outputs.package_version }}
tag_name: v${{ steps.package.outputs.package_version }}
files: |
${{ steps.package.outputs.package_path }}
/tmp/lantern-v${{ steps.package.outputs.package_version }}-source.tar.gz
generate_release_notes: true
- name: Homebrew release
uses: var77/bump-homebrew-formula-action@main
if: ${{ github.event_name == 'workflow_dispatch' && inputs.create_release }}
with:
formula-name: lantern
formula-path: Formula/lantern.rb
tag-name: v${{ steps.package.outputs.package_version }}
download-url: https://github.com/lanterndata/lantern/releases/download/v${{ steps.package.outputs.package_version }}/lantern-v${{ steps.package.outputs.package_version }}-source.tar.gz
homebrew-tap: lanterndata/homebrew-lantern
commit-message: |
{{formulaName}} {{version}}
base-branch: main
env:
COMMITTER_TOKEN: ${{ secrets.HOMEBREW_COMMITTER_TOKEN }}
8 changes: 7 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,13 @@ add_custom_target(
# TEST
add_custom_target(
test
COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh
COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh --regression
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/test
)

# TEST (parallel)
# Invokes scripts/run_all_tests.sh with the --parallel flag, using the test/
# directory of the source tree as the working directory.
add_custom_target(
test-parallel
COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh --parallel
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/test
)

Expand Down
6 changes: 5 additions & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@ make test

# only run regression tests that have $FILTER in regression sql file path
make test FILTER=hnsw

# run parallel tests
make test-parallel
```
Running `make test` runs the Lantern regression tests, which run independently of one another. The tests for `make test-parallel` are currently under development and can be found in `test/parallel`. The parallel tests all run in the same database; their goal is to generate a more realistic workload on the index in order to discover timing errors and other bugs that depend on more complex usage.

## Running benchmarks
This requires Python to be installed. Please check the `Dockerfile.dev` for pip requirements.
Expand All @@ -30,7 +34,7 @@ If you build Lantern in a different directory, make sure to update `.vscode` con

## Debugging the C codebase

If you make changes to the C codebase, in addition to `make test`, you can also use the `livedebug.py` utility in a `tmux` session to easily attach `gdb` to the psql backend and find out what breaks.
If you make changes to the C codebase, in addition to `make test` and `make test-parallel`, you can also use the `livedebug.py` utility in a `tmux` session to easily attach `gdb` to the psql backend and find out what breaks.
Below is a short recording demonstrating the use of `livedebug.py`:

[![asciicast](https://asciinema.org/a/jTsbWdOcTvUl4iAJlAw3Cszbt.svg)](https://asciinema.org/a/jTsbWdOcTvUl4iAJlAw3Cszbt)
Expand Down
5 changes: 3 additions & 2 deletions Dockerfile.dev
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,9 @@ RUN rm -rf build \
&& make install

# Install benchmarking tools in build folder
RUN git clone https://github.com/lanterndata/benchmark \
&& cd benchmark \
RUN mkdir build/lantern \
&& git clone https://github.com/lanterndata/benchmark build/benchmark \
&& cd build/benchmark \
&& pip install -r core/requirements.txt --break-system-packages \
&& pip install -r external/requirements.txt --break-system-packages
ENV DATABASE_URL=postgres://postgres:postgres@localhost:5432/postgres
Expand Down
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

[![build](https://github.com/lanterndata/lantern/actions/workflows/build.yaml/badge.svg?branch=main)](https://github.com/lanterndata/lantern/actions/workflows/build.yaml)
[![test](https://github.com/lanterndata/lantern/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/lanterndata/lantern/actions/workflows/test.yaml)
[![codecov](https://codecov.io/github/lanterndata/lanterndb/branch/main/graph/badge.svg)](https://codecov.io/github/lanterndata/lanterndb)
[![codecov](https://codecov.io/github/lanterndata/lantern/branch/main/graph/badge.svg)](https://codecov.io/github/lanterndata/lantern)
[![Run on Replit](https://img.shields.io/badge/Run%20on-Replit-blue?logo=replit)](https://replit.com/@lanterndata/lantern-playground#.replit)

Lantern is an open-source PostgreSQL database extension to store vector data, generate embeddings, and handle vector search operations.
Expand All @@ -27,6 +27,13 @@ cd build
cmake ..
make install
```

To install Lantern using `homebrew`:
```
brew tap lanterndata/lantern
brew install lantern && lantern_install
```

You can also install Lantern on top of PostgreSQL from our [precompiled binaries](https://github.com/lanterndata/lantern/releases) via a single `make install`.

Alternatively, you can use Lantern in one click using [Replit](https://replit.com/@lanterndata/lantern-playground#.replit).
Expand Down
2 changes: 1 addition & 1 deletion ci/scripts/build-docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ fi
# Set Locale
apt update && apt-mark hold locales && \
# Install required packages for build
apt install -y --no-install-recommends build-essential cmake postgresql-server-dev-$PG_VERSION postgresql-$PG_VERSION-pgvector && \
apt install -y --no-install-recommends build-essential cmake postgresql-server-dev-$PG_VERSION && \
# Build lantern
cd /tmp/lantern && mkdir build && cd build && \
# Run cmake
Expand Down
1 change: 1 addition & 0 deletions ci/scripts/run-tests-linux.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,6 @@ echo "port = 5432" >> ${PGDATA}/postgresql.conf
GCOV_PREFIX=$WORKDIR/build/CMakeFiles/lantern.dir/ GCOV_PREFIX_STRIP=5 POSTGRES_HOST_AUTH_METHOD=trust /usr/lib/postgresql/$PG_VERSION/bin/postgres 1>/tmp/pg-out.log 2>/tmp/pg-error.log &
# Wait for start and run tests
wait_for_pg && cd $WORKDIR/build && make test && \
make test-parallel && \
killall postgres && \
gcovr -r $WORKDIR/src/ --object-directory $WORKDIR/build/ --xml /tmp/coverage.xml
2 changes: 1 addition & 1 deletion ci/scripts/run-tests-mac.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ wait_for_pg(){
# Start database
brew services start postgresql@$PG_VERSION

wait_for_pg && cd $WORKDIR/build && make test
wait_for_pg && cd $WORKDIR/build && make test && make test-parallel
14 changes: 9 additions & 5 deletions cmake/FindPostgreSQL.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,14 @@
# Define additional search paths for root directories.
set(PostgreSQL_ROOT_DIRECTORIES ENV PGROOT ENV PGPATH ${PostgreSQL_ROOT})

find_program(
PG_CONFIG pg_config
PATHS ${PostgreSQL_ROOT_DIRECTORIES}
PATH_SUFFIXES bin)
# Locate the pg_config executable.
# A non-empty PG_CONFIG environment variable is used as-is. Otherwise (unset, or
# set to an empty string) pg_config is searched for in the "bin" suffix of the
# PostgreSQL root directories, so that an empty override does not defeat
# discovery and trip the "Could not find pg_config" check that follows.
if(DEFINED ENV{PG_CONFIG} AND NOT "$ENV{PG_CONFIG}" STREQUAL "")
  set(PG_CONFIG "$ENV{PG_CONFIG}")
else()
  find_program(
    PG_CONFIG pg_config
    PATHS ${PostgreSQL_ROOT_DIRECTORIES}
    PATH_SUFFIXES bin)
endif()

if(NOT PG_CONFIG)
message(FATAL_ERROR "Could not find pg_config")
Expand Down Expand Up @@ -156,4 +160,4 @@ if(PostgreSQL_FOUND)
message(
STATUS "PostgreSQL shared linker options: ${PostgreSQL_SHARED_LINK_OPTIONS}"
)
endif()
endif()
4 changes: 3 additions & 1 deletion scripts/check_symbols.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ SCRIPT=$(realpath "$0")
THIS_DIR=$(dirname "$SCRIPT")

# get all the symbols our shared library assumes are externally provided
MAYBE_EXTERN=$(nm -D $1 | grep ' U ' | awk '{print $2}' | sed -e 's/@.*//')
# libraries in extern_defined.sh use two @ symbols to separate symbol name from
# version. the sed fixes the discrepancy between here and extern_defined.sh
MAYBE_EXTERN=$(nm -D $1 | grep ' U ' | awk '{print $2}' | sed -e 's/@/@@/')

# get all the symbols that are externally provided
EXTERN_PROVIDED=$($THIS_DIR/extern_defined.sh)
Expand Down
43 changes: 34 additions & 9 deletions scripts/extern_defined.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,27 +34,52 @@ IFS=$'\n\t'
# 3. use platform-specific equivalents of nm, ldd, awk

PG_BIN=$(pg_config --bindir)/postgres

SED_PATTERN='s/@/@/p' # noop pattern
# " T " - text symbol
nm -D $PG_BIN | grep " T " | awk '{print $3}' | sed -e 's/@.*$//p'
nm -D --with-symbol-versions $PG_BIN | grep " T " | awk '{print $3}' | sed -e "$SED_PATTERN"
# global bss symbol in postgres
nm -D $PG_BIN | grep " B " | awk '{print $3}' | sed -e 's/@.*$//p'
nm -D --with-symbol-versions $PG_BIN | grep " B " | awk '{print $3}' | sed -e "$SED_PATTERN"
# postgres initialized data section, global symbols
nm -D --with-symbol-versions $PG_BIN | grep " D " | awk '{print $3}' | sed -e "$SED_PATTERN"
# postgres weak symbols
nm -D $PG_BIN | grep " w " | awk '{print $2}' | sed -e 's/@.*$//p'
nm -D --with-symbol-versions $PG_BIN | grep " w " | awk '{print $2}' | sed -e "$SED_PATTERN"

# Get a list of shared library dependencies using ldd
dependencies=$(ldd "$PG_BIN" | awk '{print $3}' | grep -v "not a dynamic executable")

# Loop through the dependencies and extract symbols
for dependency in $dependencies; do
SED_PATTERN='s/@/@/p' # noop pattern
if grep -q "libstdc++" <<< "$dependency"; then
# even if postgres is linked against libstdc++, we should not use those and should
# always have our statically linked libstdc++ as postgres may not always be linked
# against libstdc++
continue
fi

if grep -q "libm" <<< "$dependency"; then
#libm does not use symbol versioning
SED_PATTERN='s/@.*$//p'
fi
# " U " - undefined symbol
nm -D "$dependency" | awk '/ U / {print $3}' | sed -e 's/@.*$//p'
nm -D --with-symbol-versions "$dependency" | awk '/ U / {print $3}' | sed -e "$SED_PATTERN"
# " i " - the symbol is an indirect reference to another symbol. This is often used for compiler-generated code
nm -D "$dependency" | awk '/ i / {print $3}' | sed -e 's/@.*$//p'
nm -D --with-symbol-versions "$dependency" | awk '/ i / {print $3}' | sed -e "$SED_PATTERN"
# " T " - The symbol is a text (code) symbol, representing a function or code that can be executed
nm -D "$dependency" | awk '/ T / {print $3}' | sed -e 's/@.*$//p'
nm -D --with-symbol-versions "$dependency" | awk '/ T / {print $3}' | sed -e "$SED_PATTERN"
# " V " - the symbol is a weak object
nm -D "$dependency" | awk '/ V / {print $3}' | sed -e 's/@.*$//p'
nm -D --with-symbol-versions "$dependency" | awk '/ V / {print $3}' | sed -e "$SED_PATTERN"
# " W " - the symbol is a weak symbol that has not been specifically tagged as weak object symbol
nm -D "$dependency" | awk '/ W / {print $3}' | sed -e 's/@.*$//p'
nm -D --with-symbol-versions "$dependency" | awk '/ W / {print $3}' | sed -e "$SED_PATTERN"
# " B " global bss symbol. e.g. __libc_single_threaded@@GLIBC_2.32
nm -D --with-symbol-versions "$dependency" | awk '/ B / {print $3}' | sed -e "$SED_PATTERN"
# " D " initialized data symbol. e.g. stderr@@GLIBC_2.2.5
nm -D --with-symbol-versions "$dependency" | awk '/ D / {print $3}' | sed -e "$SED_PATTERN"
done

# We link libstdc++ statically and it uses the symbol below from ld-linux
# Now we need to add ld-linux symbols to extern_defined. Since this is the only symbol we use,
# we can just filter and add only that one
# " T " text symbol. e.g. __tls_get_addr
LD_LINUX=$(ldd $(pg_config --bindir)/postgres| grep ld-linux | awk '{print $1}')
nm -D --with-symbol-versions "$LD_LINUX" | awk '/ T / {print $3}' | sed -e "$SED_PATTERN" | grep __tls_get_addr
43 changes: 38 additions & 5 deletions scripts/run_all_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -67,17 +67,38 @@ fi
# Check if pgvector is available
pgvector_installed=$($PSQL -U $DB_USER -d postgres -c "SELECT 1 FROM pg_available_extensions WHERE name = 'vector'" -tA | tail -n 1 | tr -d '\n')

# Settings
REGRESSION=0
PARALLEL=0
while [[ "$#" -gt 0 ]]; do
case $1 in
--regression) REGRESSION=1 ;;
--parallel) PARALLEL=1 ;;
esac
shift
done

# Generate schedule.txt
rm -rf $TMP_OUTDIR/schedule.txt
if [ "$PARALLEL" -eq 1 ]; then
SCHEDULE='parallel_schedule.txt'
else
SCHEDULE='schedule.txt'
fi
if [ -n "$FILTER" ]; then
if [[ "$pgvector_installed" == "1" ]]; then
TEST_FILES=$(cat schedule.txt | grep -E '^(test:|test_pgvector:)' | sed -e 's/^\(test:\|test_pgvector:\)//' | tr " " "\n" | sed -e '/^$/d')
if [ "$PARALLEL" -eq 1 ]; then
TEST_FILES=$(cat $SCHEDULE | grep -E '^(test:|test_begin:|test_end:)' | sed -E -e 's/^test:|test_begin:|test_end://' | tr " " "\n" | sed -e '/^$/d')
else
TEST_FILES=$(cat schedule.txt | grep '^test:' | sed -e 's/^test://' | tr " " "\n" | sed -e '/^$/d')
if [[ "$pgvector_installed" == "1" ]]; then
TEST_FILES=$(cat $SCHEDULE | grep -E '^(test:|test_pgvector:)' | sed -E -e 's/^test:|test_pgvector://' | tr " " "\n" | sed -e '/^$/d')
else
TEST_FILES=$(cat $SCHEDULE | grep '^test:' | sed -e 's/^test://' | tr " " "\n" | sed -e '/^$/d')
fi
fi

# Keep only the test files whose path contains $FILTER and append each one as
# its own "test:" line to the generated schedule.
while IFS= read -r f; do
    if [[ $f == *"$FILTER"* ]]; then
        echo "test: $f" >> $TMP_OUTDIR/schedule.txt
    fi
done <<< "$TEST_FILES"
Expand All @@ -94,11 +115,18 @@ else
if [ "$pgvector_installed" == "1" ]; then
echo "test: $test_name" >> $TMP_OUTDIR/schedule.txt
fi
elif [[ "$line" =~ ^test_begin: ]]; then
test_name=$(echo "$line" | sed -e 's/test_begin:/test:/')
echo "$test_name" >> $TMP_OUTDIR/schedule.txt
elif [[ "$line" =~ ^test_end: ]]; then
test_name=$(echo "$line" | sed -e 's/test_end:/test:/')
echo "$test_name" >> $TMP_OUTDIR/schedule.txt
else
echo "$line" >> $TMP_OUTDIR/schedule.txt
fi
done < schedule.txt
done < $SCHEDULE
fi
# Discard the source schedule name and point SCHEDULE at the generated schedule.
# `unset` takes the variable NAME; passing "$SCHEDULE" would try to unset a
# variable called e.g. "schedule.txt", which is not a valid identifier.
unset SCHEDULE
SCHEDULE=$TMP_OUTDIR/schedule.txt

function print_diff {
Expand All @@ -115,4 +143,9 @@ function print_diff {

trap print_diff ERR

DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=./test_runner.sh
if [ "$PARALLEL" -eq 1 ]; then
cd parallel
PARALLEL=$PARALLEL DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=../test_runner.sh
else
PARALLEL=$PARALLEL DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=./test_runner.sh
fi
Loading

0 comments on commit 099d476

Please sign in to comment.