Merge branch 'main' into type_conversion_ham_distance

lanterndata · Oct 12, 2023 · 099d476 · 099d476
2 parents b000930 + 9e3e17b
commit 099d476
Show file tree

Hide file tree

Showing 51 changed files with 1,047 additions and 157 deletions.
diff --git a/.github/workflows/benchmark-linux.yaml b/.github/workflows/benchmark-linux.yaml
@@ -11,8 +11,8 @@ on:
       - dev
 
 jobs:
-  ubuntu-build:
-    runs-on: ubuntu-22.04
+  benchmark:
+    runs-on: self-hosted
 
     steps:
       - uses: actions/checkout@v3

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
@@ -71,7 +71,7 @@ jobs:
       - name: Create source code archive with submodules
         if: ${{ github.event_name == 'workflow_dispatch' && inputs.create_release }}
         run: |
-          find ./ -name '.git*' -exec rm -rv {} \; || true
+          find ./ -name '.git*' -exec rm -r {} \; || true
           tar -czf /tmp/lantern-v${{ steps.package.outputs.package_version }}-source.tar.gz .
       - uses: geekyeggo/delete-artifact@v2
         with:
@@ -81,9 +81,23 @@ jobs:
         id: create_release
         if: ${{ github.event_name == 'workflow_dispatch' && inputs.create_release }}
         with:
-          name: LanternDB v${{ steps.package.outputs.package_version }}
+          name: Lantern v${{ steps.package.outputs.package_version }}
           tag_name: v${{ steps.package.outputs.package_version }}
           files: |
             ${{ steps.package.outputs.package_path }}
             /tmp/lantern-v${{ steps.package.outputs.package_version }}-source.tar.gz
           generate_release_notes: true
+      - name: Homebrew release  # bumps the lantern formula; gated on the same manual-release condition as create_release
+        uses: var77/bump-homebrew-formula-action@main  # NOTE(review): tracks a mutable branch ref; consider pinning to a tag or commit SHA
+        if: ${{ github.event_name == 'workflow_dispatch' && inputs.create_release }}
+        with:
+          formula-name: lantern
+          formula-path: Formula/lantern.rb
+          tag-name: v${{ steps.package.outputs.package_version }}
+          download-url: https://github.com/lanterndata/lantern/releases/download/v${{ steps.package.outputs.package_version }}/lantern-v${{ steps.package.outputs.package_version }}-source.tar.gz
+          homebrew-tap: lanterndata/homebrew-lantern  # tap repository that receives the formula commit
+          commit-message: |
+            {{formulaName}} {{version}}
+          base-branch: main
+        env:
+          COMMITTER_TOKEN: ${{ secrets.HOMEBREW_COMMITTER_TOKEN }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -192,7 +192,13 @@ add_custom_target(
 # TEST
 add_custom_target(
   test
-  COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh
+  COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh --regression # `make test`: script invoked with --regression
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/test
+)
+
+add_custom_target(
+  test-parallel # `make test-parallel`: same script, invoked with --parallel
+  COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh --parallel
   WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/test
 )
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -8,7 +8,11 @@ make test
 
 # only run regression tests that have $FILTER in regression sql file path
 make test FILTER=hnsw
+
+# run parallel tests
+make test-parallel
 ```
+Running `make test` runs the Lantern regression tests, which run independently of one another. The tests for `make test-parallel` are still under development and can be found in `test/parallel`. The parallel tests all run in the same database; their goal is to generate a more realistic workload on the index in order to uncover timing errors and other bugs that only appear under more complex use.
 
 ## Running benchmarks
 This requires Python to be installed. Please check the `Dockerfile.dev` for pip requirements.
@@ -30,7 +34,7 @@ If you build Lantern in a different directory, make sure to update `.vscode` con
 
 ## Debugging the C codebase
 
-If you make changes to the C codebase, in addition to `make test`, you can also use the `livedebug.py` utility in a `tmux` session to easily attach `gdb` to the psql backend and find out what breaks.
+If you make changes to the C codebase, in addition to `make test` and `make test-parallel`, you can also use the `livedebug.py` utility in a `tmux` session to easily attach `gdb` to the psql backend and find out what breaks.
 Below is a short recording demonstrating the use of `livedebug.py`:
 
 [![asciicast](https://asciinema.org/a/jTsbWdOcTvUl4iAJlAw3Cszbt.svg)](https://asciinema.org/a/jTsbWdOcTvUl4iAJlAw3Cszbt)

diff --git a/Dockerfile.dev b/Dockerfile.dev
@@ -40,8 +40,9 @@ RUN rm -rf build \
     && make install
 
 # Install benchmarking tools in build folder
-RUN git clone https://github.com/lanterndata/benchmark \
-    && cd benchmark \
+RUN mkdir build/lantern \
+    && git clone https://github.com/lanterndata/benchmark build/benchmark \
+    && cd build/benchmark \
     && pip install -r core/requirements.txt --break-system-packages \
     && pip install -r external/requirements.txt --break-system-packages
 ENV DATABASE_URL=postgres://postgres:postgres@localhost:5432/postgres

diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 [![build](https://github.com/lanterndata/lantern/actions/workflows/build.yaml/badge.svg?branch=main)](https://github.com/lanterndata/lantern/actions/workflows/build.yaml)
 [![test](https://github.com/lanterndata/lantern/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/lanterndata/lantern/actions/workflows/test.yaml)
-[![codecov](https://codecov.io/github/lanterndata/lanterndb/branch/main/graph/badge.svg)](https://codecov.io/github/lanterndata/lanterndb)
+[![codecov](https://codecov.io/github/lanterndata/lantern/branch/main/graph/badge.svg)](https://codecov.io/github/lanterndata/lantern)
 [![Run on Replit](https://img.shields.io/badge/Run%20on-Replit-blue?logo=replit)](https://replit.com/@lanterndata/lantern-playground#.replit)
 
 Lantern is an open-source PostgreSQL database extension to store vector data, generate embeddings, and handle vector search operations.
@@ -27,6 +27,13 @@ cd build
 cmake ..
 make install
 ```
+
+To install Lantern using `homebrew`:
+```
+brew tap lanterndata/lantern
+brew install lantern && lantern_install
+```
+
 You can also install Lantern on top of PostgreSQL from our [precompiled binaries](https://github.com/lanterndata/lantern/releases) via a single `make install`.
 
 Alternatively, you can use Lantern in one click using [Replit](https://replit.com/@lanterndata/lantern-playground#.replit).

diff --git a/ci/scripts/build-docker.sh b/ci/scripts/build-docker.sh
@@ -14,7 +14,7 @@ fi
 # Set Locale
 apt update && apt-mark hold locales && \
 # Install required packages for build
-apt install -y --no-install-recommends build-essential cmake postgresql-server-dev-$PG_VERSION postgresql-$PG_VERSION-pgvector && \
+apt install -y --no-install-recommends build-essential cmake postgresql-server-dev-$PG_VERSION && \
 # Build lantern
 cd /tmp/lantern && mkdir build && cd build && \
 # Run cmake

diff --git a/ci/scripts/run-tests-linux.sh b/ci/scripts/run-tests-linux.sh
@@ -27,5 +27,6 @@ echo "port = 5432" >> ${PGDATA}/postgresql.conf
 GCOV_PREFIX=$WORKDIR/build/CMakeFiles/lantern.dir/ GCOV_PREFIX_STRIP=5 POSTGRES_HOST_AUTH_METHOD=trust /usr/lib/postgresql/$PG_VERSION/bin/postgres 1>/tmp/pg-out.log 2>/tmp/pg-error.log &
 # Wait for start and run tests
 wait_for_pg && cd $WORKDIR/build && make test && \
+make test-parallel && \
 killall postgres && \
 gcovr -r $WORKDIR/src/ --object-directory $WORKDIR/build/ --xml /tmp/coverage.xml
diff --git a/ci/scripts/run-tests-mac.sh b/ci/scripts/run-tests-mac.sh
@@ -22,4 +22,4 @@ wait_for_pg(){
 # Start database
 brew services start postgresql@$PG_VERSION
 
-wait_for_pg && cd $WORKDIR/build && make test
+wait_for_pg && cd $WORKDIR/build && make test && make test-parallel
diff --git a/cmake/FindPostgreSQL.cmake b/cmake/FindPostgreSQL.cmake
@@ -48,10 +48,14 @@
 # Define additional search paths for root directories.
 set(PostgreSQL_ROOT_DIRECTORIES ENV PGROOT ENV PGPATH ${PostgreSQL_ROOT})
 
-find_program(
-  PG_CONFIG pg_config
-  PATHS ${PostgreSQL_ROOT_DIRECTORIES}
-  PATH_SUFFIXES bin)
+if (DEFINED ENV{PG_CONFIG}) # an explicit PG_CONFIG environment variable (read at configure time) skips the search
+  set(PG_CONFIG "$ENV{PG_CONFIG}") # value is taken as-is; it is not checked for existence here
+else()
+  find_program( # otherwise look for pg_config under the PostgreSQL root directories' bin/
+    PG_CONFIG pg_config
+    PATHS ${PostgreSQL_ROOT_DIRECTORIES}
+    PATH_SUFFIXES bin)
+endif()
 
 if(NOT PG_CONFIG)
   message(FATAL_ERROR "Could not find pg_config")
@@ -156,4 +160,4 @@ if(PostgreSQL_FOUND)
   message(
     STATUS "PostgreSQL shared linker options: ${PostgreSQL_SHARED_LINK_OPTIONS}"
   )
-endif()
+endif()
diff --git a/scripts/check_symbols.sh b/scripts/check_symbols.sh
@@ -6,7 +6,9 @@ SCRIPT=$(realpath "$0")
 THIS_DIR=$(dirname "$SCRIPT")
 
 # get all the symbols our shared library assumes are externally provided
-MAYBE_EXTERN=$(nm -D $1 | grep ' U ' | awk '{print $2}' | sed -e 's/@.*//')
+# extern_defined.sh lists versioned symbols as name@@VERSION, while the undefined symbols
+# listed here carry a single @. the sed doubles the first @ so the two lists can be compared
+MAYBE_EXTERN=$(nm -D $1 | grep ' U ' | awk '{print $2}' | sed -e 's/@/@@/')
 
 # get all the symbols that are externally provided
 EXTERN_PROVIDED=$($THIS_DIR/extern_defined.sh)

diff --git a/scripts/extern_defined.sh b/scripts/extern_defined.sh
@@ -34,27 +34,52 @@ IFS=$'\n\t'
 # 3. use platorm specific equivalent of nm, ldd, awk
 
 PG_BIN=$(pg_config --bindir)/postgres
-
+SED_PATTERN='s/@/@/p' # noop pattern
 # " T " - text symbol
-nm -D  $PG_BIN | grep " T " | awk '{print $3}' | sed -e 's/@.*$//p'
+nm -D --with-symbol-versions $PG_BIN | grep " T " | awk '{print $3}' | sed -e "$SED_PATTERN"
 # global bss symbol in postgres
-nm -D  $PG_BIN | grep " B " | awk '{print $3}' | sed -e 's/@.*$//p'
+nm -D --with-symbol-versions $PG_BIN | grep " B " | awk '{print $3}' | sed -e "$SED_PATTERN"
+# postgres initialized data section, global symbols
+nm -D --with-symbol-versions $PG_BIN | grep " D " | awk '{print $3}' | sed -e "$SED_PATTERN"
 # postgres weak symbols
-nm -D  $PG_BIN | grep " w " | awk '{print $2}' | sed -e 's/@.*$//p'
+nm -D --with-symbol-versions $PG_BIN | grep " w " | awk '{print $2}' | sed -e "$SED_PATTERN"
 
 # Get a list of shared library dependencies using ldd
 dependencies=$(ldd "$PG_BIN" | awk '{print $3}' | grep -v "not a dynamic executable")
 
 # Loop through the dependencies and extract symbols
 for dependency in $dependencies; do
+   SED_PATTERN='s/@/@/p' # noop pattern
+   if grep -q "libstdc++" <<< "$dependency"; then
+      # even if postgres is linked against libstdc++, we should not use those and should
+      # always have our statically linked libstdc++ as postgres may not always be linked
+      # against libstdc++
+      continue
+   fi
+
+   if grep -q "libm" <<< "$dependency"; then
+      #libm does not use symbol versioning
+	   SED_PATTERN='s/@.*$//p'
+   fi
    # " U " - undefined symbol
-   nm -D "$dependency" | awk '/ U / {print $3}' | sed -e 's/@.*$//p'
+   nm -D --with-symbol-versions "$dependency" | awk '/ U / {print $3}' | sed -e "$SED_PATTERN"
    # " i " - the symbol is an indirect reference to another symbol. This is often used for compiler-generated code
-   nm -D "$dependency" | awk '/ i / {print $3}' | sed -e 's/@.*$//p'
+   nm -D --with-symbol-versions "$dependency" | awk '/ i / {print $3}' | sed -e "$SED_PATTERN"
    # " T " - The symbol is a text (code) symbol, representing a function or code that can be executed
-   nm -D "$dependency" | awk '/ T / {print $3}' | sed -e 's/@.*$//p'
+   nm -D --with-symbol-versions "$dependency" | awk '/ T / {print $3}' | sed -e "$SED_PATTERN"
    # " V " - the symbol is a weak object
-   nm -D "$dependency" | awk '/ V / {print $3}' | sed -e 's/@.*$//p'
+   nm -D --with-symbol-versions "$dependency" | awk '/ V / {print $3}' | sed -e "$SED_PATTERN"
    # " W " - the symbol is a weak symbol that has not been specifically tagged as weak object symbol
-   nm -D "$dependency" | awk '/ W / {print $3}' | sed -e 's/@.*$//p'
+   nm -D --with-symbol-versions "$dependency" | awk '/ W / {print $3}' | sed -e "$SED_PATTERN"
+   # " B " global bss symbol. e.g. __libc_single_threaded@@GLIBC_2.32
+   nm -D --with-symbol-versions "$dependency" | awk '/ B / {print $3}' | sed -e "$SED_PATTERN"
+   # " D " global initialized data symbol. e.g. stderr@@GLIBC_2.2.5
+   nm -D --with-symbol-versions "$dependency" | awk '/ D / {print $3}' | sed -e "$SED_PATTERN"
 done
+
+# We link libstdc++ statically and it uses the symbol below from ld-linux
+# Now we need to add ld-linux symbols to extern_defined. Since this is the only symbol we use,
+# we can just filter and add only that one
+# " T " text symbol. e.g. __tls_get_addr
+LD_LINUX=$(ldd $(pg_config --bindir)/postgres| grep ld-linux | awk '{print $1}')
+nm -D --with-symbol-versions "$LD_LINUX" | awk '/ T / {print $3}' | sed -e "$SED_PATTERN" | grep __tls_get_addr
diff --git a/scripts/run_all_tests.sh b/scripts/run_all_tests.sh
@@ -67,17 +67,38 @@ fi
 # Check if pgvector is available
 pgvector_installed=$($PSQL -U $DB_USER -d postgres -c "SELECT 1 FROM pg_available_extensions WHERE name = 'vector'" -tA | tail -n 1 | tr -d '\n')
 
+# Settings: suite-selection flags, both 0 unless the matching option is passed on the command line
+REGRESSION=0
+PARALLEL=0
+while [[ "$#" -gt 0 ]]; do
+    case $1 in
+        --regression) REGRESSION=1 ;;
+        --parallel) PARALLEL=1 ;;
+    esac  # any other argument matches no case and is silently skipped
+    shift
+done
+
 # Generate schedule.txt
 rm -rf $TMP_OUTDIR/schedule.txt
+if [ "$PARALLEL" -eq 1 ]; then
+    SCHEDULE='parallel_schedule.txt'
+else
+    SCHEDULE='schedule.txt'
+fi
 if [ -n "$FILTER" ]; then
-    if [[ "$pgvector_installed" == "1" ]]; then
-        TEST_FILES=$(cat schedule.txt | grep -E '^(test:|test_pgvector:)' | sed -e 's/^\(test:\|test_pgvector:\)//' | tr " " "\n" | sed -e '/^$/d')
+    if [ "$PARALLEL" -eq 1 ]; then  # parallel schedule: collect test:, test_begin: and test_end: entries
+        TEST_FILES=$(cat "$SCHEDULE" | grep -E '^(test:|test_begin:|test_end:)' | sed -E -e 's/^(test:|test_begin:|test_end:)//' | tr " " "\n" | sed -e '/^$/d')
     else
-        TEST_FILES=$(cat schedule.txt | grep '^test:' | sed -e 's/^test://' | tr " " "\n" | sed -e '/^$/d')
+        if [[ "$pgvector_installed" == "1" ]]; then  # include test_pgvector: entries only when pgvector is available
+            TEST_FILES=$(cat "$SCHEDULE" | grep -E '^(test:|test_pgvector:)' | sed -E -e 's/^(test:|test_pgvector:)//' | tr " " "\n" | sed -e '/^$/d')
+        else
+            TEST_FILES=$(cat "$SCHEDULE" | grep '^test:' | sed -e 's/^test://' | tr " " "\n" | sed -e '/^$/d')
+        fi
     fi
 
     while IFS= read -r f; do
         if [[ $f == *"$FILTER"* ]]; then
+            # $f contains $FILTER: keep it in the generated schedule
             echo "test: $f" >> $TMP_OUTDIR/schedule.txt
         fi
     done <<< "$TEST_FILES"
@@ -94,11 +115,18 @@ else
             if [ "$pgvector_installed" == "1" ]; then
                 echo "test: $test_name" >> $TMP_OUTDIR/schedule.txt
             fi
+        elif [[ "$line" == test_begin:* ]]; then
+            test_name="test:${line#test_begin:}"  # schedule test_begin: entries as plain test: lines
+            printf '%s\n' "$test_name" >> $TMP_OUTDIR/schedule.txt
+        elif [[ "$line" == test_end:* ]]; then
+            test_name="test:${line#test_end:}"  # schedule test_end: entries as plain test: lines
+            printf '%s\n' "$test_name" >> $TMP_OUTDIR/schedule.txt
         else
             echo "$line" >> $TMP_OUTDIR/schedule.txt
         fi
-    done < schedule.txt
+    done < $SCHEDULE
 fi
+unset SCHEDULE  # unset takes the variable name; "$SCHEDULE" expanded to the file name, which is not a valid identifier
 SCHEDULE=$TMP_OUTDIR/schedule.txt
 
 function print_diff {
@@ -115,4 +143,9 @@ function print_diff {
 
 trap print_diff ERR
 
-DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=./test_runner.sh
+launcher=./test_runner.sh  # launcher path is relative to the directory pg_regress is started from
+if [ "$PARALLEL" -eq 1 ]; then
+    cd parallel
+    launcher=../test_runner.sh  # after cd, the runner script is one level up
+fi
+PARALLEL=$PARALLEL DB_USER=$DB_USER $(pg_config --pkglibdir)/pgxs/src/test/regress/pg_regress --user=$DB_USER --schedule=$SCHEDULE --outputdir=$TMP_OUTDIR --launcher=$launcher