diff --git a/.github/workflows/build_main.yml b/.github/workflows/build_main.yml index fc7c5f6eb..f1088a5b6 100644 --- a/.github/workflows/build_main.yml +++ b/.github/workflows/build_main.yml @@ -28,8 +28,7 @@ jobs: uses: ./.github/actions/scala_build - name: build python uses: ./.github/actions/python_build - # CRAN FLAKY (502 'Bad Gateway' ERRORS) - # - name: build R - # uses: ./.github/actions/r_build + - name: build R + uses: ./.github/actions/r_build - name: upload artefacts uses: ./.github/actions/upload_artefacts diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c10e4f2b..56e7bff22 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## v0.4.2 [DBR 13.3 LTS] +- Geopandas now pinned to "<0.14.4,>=0.14" due to a conflict with the minimum numpy version in geopandas 0.14.4. +- H3 python changed from "==3.7.0" to "<4.0,>=3.7" to pick up patches. +- Fixed an issue with fallback logic when deserializing subdatasets from a zip. +- Adjusted data used to speed up a long-running test. +- Streamlined setup_gdal and setup_fuse_install: + - init script and resource copy logic fixed to repo "main" (.so) / "latest" (.jar). + - added apt-get lock handling for native installs. + - removed support for the UbuntuGIS PPA as its GDAL version is no longer compatible with the jammy default (3.4.x). + ## v0.4.1 [DBR 13.3 LTS] - Fixed python bindings for MosaicAnalyzer functions. - Added tiler functions, ST_AsGeoJSONTile and ST_AsMVTTile, for creating GeoJSON and MVT tiles as aggregations of geometries. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0e321a059..0684cca0b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -83,6 +83,9 @@ The repository is structured as follows: ## Test & build Mosaic +Given that DBR 13.3 is Ubuntu 22.04, we recommend using Docker; +see [mosaic-docker.sh](https://github.com/databrickslabs/mosaic/blob/main/scripts/mosaic-docker.sh). + ### Scala JAR We use the [Maven](https://maven.apache.org/install.html) build tool to manage and build the Mosaic scala project. @@ -115,6 +118,8 @@ To build the docs: - Install the pandoc library (follow the instructions for your platform [here](https://pandoc.org/installing.html)). - Install the python requirements from `docs/docs-requirements.txt`. - Build the HTML documentation by running `make html` from `docs/`. + - For nbconvert you may have to symlink your jupyter share folder, + e.g. `sudo ln -s /opt/homebrew/share/jupyter /usr/local/share`. - You can locally host the docs by running the `reload.py` script in the `docs/source/` directory.
## Style diff --git a/R/sparkR-mosaic/sparkrMosaic/DESCRIPTION b/R/sparkR-mosaic/sparkrMosaic/DESCRIPTION index c72991a01..e5e1de0f1 100644 --- a/R/sparkR-mosaic/sparkrMosaic/DESCRIPTION +++ b/R/sparkR-mosaic/sparkrMosaic/DESCRIPTION @@ -1,6 +1,6 @@ Package: sparkrMosaic Title: SparkR bindings for Databricks Mosaic -Version: 0.4.1 +Version: 0.4.2 Authors@R: person("Robert", "Whiffin", , "robert.whiffin@databricks.com", role = c("aut", "cre") ) diff --git a/R/sparklyr-mosaic/sparklyrMosaic/DESCRIPTION b/R/sparklyr-mosaic/sparklyrMosaic/DESCRIPTION index 4c5265fb5..69619e72a 100644 --- a/R/sparklyr-mosaic/sparklyrMosaic/DESCRIPTION +++ b/R/sparklyr-mosaic/sparklyrMosaic/DESCRIPTION @@ -1,6 +1,6 @@ Package: sparklyrMosaic Title: sparklyr bindings for Databricks Mosaic -Version: 0.4.1 +Version: 0.4.2 Authors@R: person("Robert", "Whiffin", , "robert.whiffin@databricks.com", role = c("aut", "cre") ) diff --git a/R/sparklyr-mosaic/tests.R b/R/sparklyr-mosaic/tests.R index 3663c63f6..9bad8f6fa 100644 --- a/R/sparklyr-mosaic/tests.R +++ b/R/sparklyr-mosaic/tests.R @@ -9,7 +9,7 @@ library(sparklyr.nested) spark_home <- Sys.getenv("SPARK_HOME") spark_home_set(spark_home) -install.packages("sparklyrMosaic_0.4.1.tar.gz", repos = NULL) +install.packages("sparklyrMosaic_0.4.2.tar.gz", repos = NULL) library(sparklyrMosaic) # find the mosaic jar in staging diff --git a/README.md b/README.md index 2a07a6294..70827dc9a 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,6 @@ An extension to the [Apache Spark](https://spark.apache.org/) framework that all [![codecov](https://codecov.io/gh/databrickslabs/mosaic/branch/main/graph/badge.svg?token=aEzZ8ITxdg)](https://codecov.io/gh/databrickslabs/mosaic) [![build](https://github.com/databrickslabs/mosaic/actions/workflows/build_main.yml/badge.svg)](https://github.com/databrickslabs/mosaic/actions?query=workflow%3A%22build+main%22) [![docs](https://github.com/databrickslabs/mosaic/actions/workflows/docs.yml/badge.svg)](https://github.com/databrickslabs/mosaic/actions/workflows/docs.yml) -[![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/databrickslabs/mosaic.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/databrickslabs/mosaic/context:python) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![lines of code](https://tokei.rs/b1/github/databrickslabs/mosaic)]([https://codecov.io/github/databrickslabs/mosaic](https://github.com/databrickslabs/mosaic)) @@ -33,7 +32,8 @@ The supported languages are Scala, Python, R, and SQL. ## How does it work? -The Mosaic library is written in Scala (JVM) to guarantee maximum performance with Spark and when possible, it uses code generation to give an extra performance boost. +The Mosaic library is written in Scala (JVM) to guarantee maximum performance with Spark and, when possible, +it uses code generation to give an extra performance boost. __The other supported languages (Python, R and SQL) are thin wrappers around the Scala (JVM) code.__ @@ -42,6 +42,13 @@ Image1: Mosaic logical design. ## Getting started +:warning: **geopandas 0.14.4 not supported** + +For Mosaic <= 0.4.1 `%pip install databricks-mosaic` will no longer install "as-is" in DBRs because Mosaic +left geopandas unpinned in those versions. Geopandas 0.14.4 raised its minimum numpy version, which conflicts with the +scikit-learn limits in DBRs. The workaround is `%pip install geopandas==0.14.3 databricks-mosaic`. +Mosaic 0.4.2+ limits the geopandas version.
+ ### Mosaic 0.4.x Series [Latest] We recommend using Databricks Runtime version 13.3 LTS with Photon enabled. @@ -56,18 +63,21 @@ We recommend using Databricks Runtime version 13.3 LTS with Photon enabled. __Language Bindings__ -As of Mosaic 0.4.0 (subject to change in follow-on releases)... +As of Mosaic 0.4.0 / DBR 13.3 LTS (subject to change in follow-on releases)... -* [Assigned Clusters](https://docs.databricks.com/en/compute/configure.html#access-modes): Mosaic Python, SQL, R, and Scala APIs. -* [Shared Access Clusters](https://docs.databricks.com/en/compute/configure.html#access-modes): Mosaic Scala API (JVM) with Admin [allowlisting](https://docs.databricks.com/en/data-governance/unity-catalog/manage-privileges/allowlist.html); _Python bindings to Mosaic Scala APIs are blocked by Py4J Security on Shared Access Clusters._ +* [Assigned Clusters](https://docs.databricks.com/en/compute/configure.html#access-modes) + * Mosaic Python, SQL, R, and Scala APIs. +* [Shared Access Clusters](https://docs.databricks.com/en/compute/configure.html#access-modes) + * Mosaic Scala API (JVM) with Admin [allowlisting](https://docs.databricks.com/en/data-governance/unity-catalog/manage-privileges/allowlist.html). + * Mosaic Python bindings (to Mosaic Scala APIs) are blocked by Py4J Security on Shared Access Clusters. * Mosaic SQL expressions cannot yet be registered with [Unity Catalog](https://www.databricks.com/product/unity-catalog) due to API changes affecting DBRs >= 13, more [here](https://docs.databricks.com/en/udf/index.html). __Additional Notes:__ -As of Mosaic 0.4.0 (subject to change in follow-on releases)... +Mosaic is a custom JVM library that extends Spark, which has the following implications in DBR 13.3 LTS: 1. [Unity Catalog](https://www.databricks.com/product/unity-catalog): Enforces process isolation which is difficult to accomplish with custom JVM libraries; as such only built-in (aka platform provided) JVM APIs can be invoked from other supported languages in Shared Access Clusters. -2. [Volumes](https://docs.databricks.com/en/connect/unity-catalog/volumes.html): Along the same principle of isolation, clusters (both assigned and shared access) can read Volumes via relevant built-in readers and writers or via custom python calls which do not involve any custom JVM code. +2. [Volumes](https://docs.databricks.com/en/connect/unity-catalog/volumes.html): Along the same principle of isolation, clusters can read Volumes via relevant built-in (aka platform provided) readers and writers or via custom python calls which do not involve any custom JVM code.
### Mosaic 0.3.x Series @@ -142,7 +152,7 @@ import com.databricks.labs.mosaic.JTS val mosaicContext = MosaicContext.build(H3, JTS) mosaicContext.register(spark) ``` -__Note: Mosaic 0.4.x SQL bindings for DBR 13 can register with Assigned clusters (as Hive UDFs), but not Shared Access due to API changes, more [here](https://docs.databricks.com/en/udf/index.html).__ +__Note: Mosaic 0.4.x SQL bindings for DBR 13 can register with Assigned clusters (as Spark Expressions), but not Shared Access due to API changes, more [here](https://docs.databricks.com/en/udf/index.html).__ ## Examples diff --git a/docs/source/conf.py b/docs/source/conf.py index 1fbaf1965..e81dd3385 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -18,11 +18,11 @@ # -- Project information ----------------------------------------------------- project = 'Mosaic' -copyright = '2022, Databricks Inc' -author = 'Stuart Lynn, Milos Colic, Erni Durdevic, Robert Whiffin, Timo Roest' +copyright = '2024, Databricks Inc' +author = 'Milos Colic, Stuart Lynn, Michael Johns, Robert Whiffin' # The full version, including alpha/beta/rc tags -release = "v0.4.1" +release = "v0.4.2" # -- General configuration --------------------------------------------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index a9b7348d5..d9b62490b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -29,74 +29,66 @@ :target: https://github.com/databrickslabs/mosaic/actions/workflows/docs.yml :alt: Mosaic sphinx docs - .. image:: https://img.shields.io/lgtm/grade/python/g/databrickslabs/mosaic.svg?logo=lgtm&logoWidth=18 - :target: https://lgtm.com/projects/g/databrickslabs/mosaic/context:python - :alt: Language grade: Python - .. image:: https://img.shields.io/badge/code%20style-black-000000.svg :target: https://github.com/psf/black :alt: Code style: black - - -Mosaic is an extension to the `Apache Spark `_ framework that allows easy and fast processing of very large geospatial datasets. - -We currently recommend using Databricks Runtime with Photon enabled; -this will leverage the Databricks H3 expressions when using H3 grid system. - -Mosaic provides: - -* easy conversion between common spatial data encodings (WKT, WKB and GeoJSON); - * constructors to easily generate new geometries from Spark native data types; -* many of the OGC SQL standard :code:`ST_` functions implemented as Spark Expressions for transforming, aggregating and joining spatial datasets; -* high performance through implementation of Spark code generation within the core Mosaic functions; -* optimisations for performing point-in-polygon joins using an approach we co-developed with Ordnance Survey (`blog post `_); and -* the choice of a Scala, SQL and Python API. +| Mosaic is an extension to the `Apache Spark `_ framework for fast + easy processing + of very large geospatial datasets. It provides: +| +| [1] The choice of Scala, SQL and Python language bindings (written in Scala). +| [2] Raster and Vector APIs. +| [3] Easy conversion between common spatial data encodings (WKT, WKB and GeoJSON). +| [4] Constructors to easily generate new geometries from Spark native data types. +| [5] Many of the OGC SQL standard :code:`ST_` functions implemented as Spark Expressions for transforming, +| aggregating and joining spatial datasets. +| [6] High performance through implementation of Spark code generation within the core Mosaic functions. +| [7] Optimised point-in-polygon joins using an approach we co-developed with Ordnance Survey + (`blog post `_). ..
note:: - For Mosaic versions < 0.4 please use the `0.3 docs `_. - -.. warning:: - At times, it is useful to "hard refresh" pages to ensure your cached local version matches the latest live, - more `here `_. + We recommend using Databricks Runtime with Photon enabled to leverage the Databricks H3 expressions. Version 0.4.x Series ==================== -We recommend using Databricks Runtime versions 13.3 LTS with Photon enabled. +.. warning:: + For Mosaic <= 0.4.1 :code:`%pip install databricks-mosaic` will no longer install "as-is" in DBRs because Mosaic + left geopandas unpinned in those versions. Geopandas 0.14.4 raised its minimum numpy version, which conflicts with the + scikit-learn limits in DBRs. The workaround is :code:`%pip install geopandas==0.14.3 databricks-mosaic`. + Mosaic 0.4.2+ limits the geopandas version. Mosaic 0.4.x series only supports DBR 13.x DBRs. If running on a different DBR it will throw an exception: - DEPRECATION ERROR: Mosaic v0.4.x series only supports Databricks Runtime 13. - You can specify `%pip install 'databricks-mosaic<0.4,>=0.3'` for DBR < 13. + DEPRECATION ERROR: Mosaic v0.4.x series only supports Databricks Runtime 13. + You can specify :code:`%pip install 'databricks-mosaic<0.4,>=0.3'` for DBR < 13. Mosaic 0.4.x series issues an ERROR on standard, non-Photon clusters `ADB `_ | `AWS `_ | `GCP `_: - DEPRECATION ERROR: Please use a Databricks Photon-enabled Runtime for performance benefits or Runtime ML for - spatial AI benefits; Mosaic 0.4.x series restricts executing this cluster. - -As of Mosaic 0.4.0 (subject to change in follow-on releases) + DEPRECATION ERROR: Please use a Databricks Photon-enabled Runtime for performance benefits or Runtime ML for + spatial AI benefits; Mosaic 0.4.x series restricts executing this cluster. -* `Assigned Clusters `_: Mosaic Python, SQL, R, and Scala APIs. -* `Shared Access Clusters `_: Mosaic Scala API (JVM) with - Admin `allowlisting `_; - Python bindings to Mosaic Scala APIs are blocked by Py4J Security on Shared Access Clusters. +As of Mosaic 0.4.0 / DBR 13.3 LTS (subject to change in follow-on releases): -.. warning:: - Mosaic 0.4.x SQL bindings for DBR 13 can register with Assigned clusters (as Hive UDFs), but not Shared Access due - to `Unity Catalog `_ API changes, more `here `_. +* `Assigned Clusters `_ + * Mosaic Python, SQL, R, and Scala APIs. +* `Shared Access Clusters `_ + * Mosaic Scala API (JVM) with Admin `allowlisting `_. + * Mosaic Python bindings (to Mosaic Scala APIs) are blocked by Py4J Security on Shared Access Clusters. + * Mosaic SQL expressions cannot yet be registered due to `Unity Catalog `_ + API changes, more `here `_. .. note:: - As of Mosaic 0.4.0 (subject to change in follow-on releases) + Mosaic is a custom JVM library that extends Spark, which has the following implications in DBR 13.3 LTS: * `Unity Catalog `_ enforces process isolation which is difficult to accomplish with custom JVM libraries; as such only built-in (aka platform provided) JVM APIs can be invoked from other supported languages in Shared Access Clusters. - * Along the same principle of isolation, clusters (both Assigned and Shared Access) can read - `Volumes `_ via relevant built-in readers and - writers or via custom python calls which do not involve any custom JVM code. + * Clusters can read `Volumes `_ via relevant + built-in (aka platform provided) readers and writers or via custom python calls which do not involve any custom JVM code.
+ Version 0.3.x Series ==================== @@ -104,9 +96,6 @@ Version 0.3.x Series We recommend using Databricks Runtime version 12.2 LTS with Photon enabled. For Mosaic versions < 0.4.0 please use the `0.3.x docs `_. -.. warning:: - Mosaic 0.3.x series does not support DBR 13.x DBRs. - As of the 0.3.11 release, Mosaic issues the following WARNING when initialized on a cluster that is neither Photon Runtime nor Databricks Runtime ML `ADB `_ | `AWS `_ | `GCP `_: The reason for making this change is that we are streamlining Mosaic internals to be more aligned with future product APIs which are powered by Photon. Along this direction of change, Mosaic has standardized to JTS as its default and supported Vector Geometry Provider. +.. note:: + For Mosaic versions < 0.4 please use the `0.3 docs `_. + Documentation ============= diff --git a/docs/source/usage/automatic-sql-registration.rst b/docs/source/usage/automatic-sql-registration.rst index 6175394d6..56cd1b219 100644 --- a/docs/source/usage/automatic-sql-registration.rst +++ b/docs/source/usage/automatic-sql-registration.rst @@ -11,7 +11,7 @@ to your Spark / Databricks cluster to perform spatial queries or integrating Spark with a geospatial middleware component such as [Geoserver](https://geoserver.org/). .. warning:: - Mosaic 0.4.x SQL bindings for DBR 13 can register with Assigned clusters (as Hive UDFs), but not Shared Access due + Mosaic 0.4.x SQL bindings for DBR 13 can register with Assigned clusters (as Spark Expressions), but not Shared Access due to `Unity Catalog `_ API changes, more `here `_. Pre-requisites diff --git a/docs/source/usage/install-gdal.rst b/docs/source/usage/install-gdal.rst index 310f41276..cc1801a27 100644 --- a/docs/source/usage/install-gdal.rst +++ b/docs/source/usage/install-gdal.rst @@ -29,9 +29,8 @@ Mosaic requires GDAL to be installed on the cluster. The easiest way to do this the :code:`setup_gdal` function. .. note:: - - This is close in behavior to Mosaic < 0.4 series (prior to DBR 13), with new options - to pip install Mosaic for either ubuntugis gdal (3.4.3) or jammy default (3.4.1). - - Param "to_fuse_dir" can be one of "/Volumes/..", "/Workspace/..", "/dbfs/.."; + - This is close in behavior to Mosaic < 0.4 series (prior to DBR 13); it installs the jammy default GDAL (3.4.1). + - Param **to_fuse_dir** can be one of **/Volumes/..**, **/Workspace/..**, **/dbfs/..**; however, you should consider :code:`setup_fuse_install()` for Volume based installs as that exposes more options, including copying JAR and JNI Shared Objects. @@ -43,19 +42,10 @@ the :code:`setup_gdal` function. :param to_fuse_dir: Path to write out the init script for GDAL installation; default is "/Workspace/Shared/geospatial/mosaic/gdal/jammy". :type to_fuse_dir: str - :param with_mosaic_pip: Whether to configure a script that pip installs databricks-mosaic, - fixed to the current version; default is False. - :type with_mosaic_pip: bool - :param with_ubuntugis: Whether to use ubuntugis ppa for GDAL instead of built-in; - default is False. - :type with_ubuntugis: bool - :param script_out_name: name of the script to be written; - default is "mosaic-gdal-init.sh". + :param script_out_name: Name of the script to be written; default is "mosaic-gdal-init.sh". :type script_out_name: str - :param override_mosaic_version: String value to use to override the mosaic version to install, - e.g. "==0.4.0" or "<0.5,>=0.4"; default is None.
- :type override_mosaic_version: str - :rtype: bool + :param jni_so_copy: Whether to copy the JNI shared objects to the fuse dir and configure the script to use them; default is False. + :type jni_so_copy: bool :example: @@ -69,9 +59,7 @@ the :code:`setup_gdal` function. +-----------------------------------------------------------------------------------------------------------+ | ::: Install setup complete ::: | +-----------------------------------------------------------------------------------------------------------+ - | - Settings: 'with_mosaic_pip'? False, 'with_gdal'? True, 'with_ubuntugis'? False | | 'jar_copy'? False, 'jni_so_copy'? False, 'override_mosaic_version'? None | - | - Derived: 'mosaic_version'? 0.4.0, 'github_version'? 0.4.0, 'release_version'? None, 'pip_str'? ==0.4.0 | + | - Settings: 'jar_copy'? False, 'jni_so_copy'? False | | - Fuse Dir: '/Workspace/Shared/geospatial/mosaic/gdal/jammy' | | - Init Script: configured and stored at 'mosaic-gdal-init.sh'; add to your cluster and restart, | | more at https://docs.databricks.com/en/init-scripts/cluster-scoped.html | @@ -106,16 +94,13 @@ code at the top of the notebook: GDAL enabled. GDAL 3.4.1, released 2021/12/27 -.. note:: - You can configure init script from default ubuntu GDAL (3.4.1) to ubuntugis ppa @ https://launchpad.net/~ubuntugis/+archive/ubuntu/ppa (3.4.3) - with `setup_gdal(with_ubuntugis=True)` GDAL Configuration #################### Here are spark session configs available for raster, e.g. :code:`spark.conf.set("", "")`. -.. list-table:: Title +.. list-table:: :widths: 25 25 50 :header-rows: 1 @@ -124,10 +109,40 @@ Here are spark session configs available for raster, e.g. :code:`spark.conf.set( - Comments * - spark.databricks.labs.mosaic.raster.checkpoint - "/dbfs/tmp/mosaic/raster/checkpoint" - - Checkpoint location, see :ref:`rst_maketiles` for more + - Checkpoint location, e.g. :ref:`rst_maketiles` + * - spark.databricks.labs.mosaic.raster.use.checkpoint + - "false" + - Checkpointing for the session, added in 0.4.2 * - spark.databricks.labs.mosaic.raster.tmp.prefix - "" (will use "/tmp") - Local directory for workers * - spark.databricks.labs.mosaic.raster.blocksize - "128" - Blocksize in pixels, see :ref:`rst_convolve` and :ref:`rst_filter` for more + +GDAL is configured as follows in the `MosaicGDAL `__ class: + +.. list-table:: + :widths: 50 50 + :header-rows: 1 + + * - Config + - Value + * - GDAL_VRT_ENABLE_PYTHON + - "YES" + * - GDAL_DISABLE_READDIR_ON_OPEN + - "TRUE" + * - CPL_TMPDIR + - "" + * - GDAL_PAM_PROXY_DIR + - "" + * - GDAL_PAM_ENABLED + - "YES" + * - CPL_VSIL_USE_TEMP_FILE_FOR_RANDOM_WRITE + - "NO" + * - CPL_LOG + - "/gdal.log" + * - GDAL_CACHEMAX + - "512" + * - GDAL_NUM_THREADS + - "ALL_CPUS" \ No newline at end of file diff --git a/docs/source/usage/installation.rst b/docs/source/usage/installation.rst index d1cf54d1e..cdeeba4d0 100644 --- a/docs/source/usage/installation.rst +++ b/docs/source/usage/installation.rst @@ -5,37 +5,42 @@ Installation guide Supported platforms ################### -.. note:: - For Mosaic 0.4 series, we recommend DBR 13.3 LTS on Photon or ML Runtime clusters. +.. warning:: + For Mosaic <= 0.4.1 :code:`%pip install databricks-mosaic` will no longer install "as-is" in DBRs because Mosaic + left geopandas unpinned in those versions. Geopandas 0.14.4 raised its minimum numpy version, which conflicts with the + scikit-learn limits in DBRs. The workaround is :code:`%pip install geopandas==0.14.3 databricks-mosaic`. + Mosaic 0.4.2+ limits the geopandas version.
Mosaic 0.4.x series only supports DBR 13.x DBRs. If running on a different DBR it will throw an exception: - DEPRECATION ERROR: Mosaic v0.4.x series only supports Databricks Runtime 13. - You can specify `%pip install 'databricks-mosaic<0.4,>=0.3'` for DBR < 13. + DEPRECATION ERROR: Mosaic v0.4.x series only supports Databricks Runtime 13. + You can specify :code:`%pip install 'databricks-mosaic<0.4,>=0.3'` for DBR < 13. Mosaic 0.4.x series issues an ERROR on standard, non-Photon clusters `ADB `_ | `AWS `_ | `GCP `_: - DEPRECATION ERROR: Please use a Databricks Photon-enabled Runtime for performance benefits or Runtime ML for - spatial AI benefits; Mosaic 0.4.x series restricts executing this cluster. + DEPRECATION ERROR: Please use a Databricks Photon-enabled Runtime for performance benefits or Runtime ML for + spatial AI benefits; Mosaic 0.4.x series restricts executing this cluster. -As of Mosaic 0.4.0 (subject to change in follow-on releases) +As of Mosaic 0.4.0 / DBR 13.3 LTS (subject to change in follow-on releases): -* `Assigned Clusters `_: Mosaic Python, SQL, R, and Scala APIs. -* `Shared Access Clusters `_: Mosaic Scala API (JVM) with - Admin `allowlisting `_; - Python bindings to Mosaic Scala APIs are blocked by Py4J Security on Shared Access Clusters. +* `Assigned Clusters `_ + * Mosaic Python, SQL, R, and Scala APIs. +* `Shared Access Clusters `_ + * Mosaic Scala API (JVM) with Admin `allowlisting `_. + * Mosaic Python bindings (to Mosaic Scala APIs) are blocked by Py4J Security on Shared Access Clusters. + * Mosaic SQL expressions cannot yet be registered due to `Unity Catalog `_ + API changes, more `here `_. .. note:: - As of Mosaic 0.4.0 (subject to change in follow-on releases) + Mosaic is a custom JVM library that extends Spark, which has the following implications in DBR 13.3 LTS: * `Unity Catalog `_ enforces process isolation which is difficult to accomplish with custom JVM libraries; as such only built-in (aka platform provided) JVM APIs can be invoked from other supported languages in Shared Access Clusters. - * Along the same principle of isolation, clusters (both assigned and shared access) can read - `Volumes `_ via relevant built-in readers and - writers or via custom python calls which do not involve any custom JVM code. + * Clusters can read `Volumes `_ via relevant + built-in (aka platform provided) readers and writers or via custom python calls which do not involve any custom JVM code. If you have cluster creation permissions in your Databricks workspace, you can create a cluster using the instructions @@ -113,9 +118,11 @@ The mechanism for enabling the Mosaic functions varies by language: enableMosaic() .. note:: - We recommend use of `import mosaic as mos` to namespace the python api and avoid any conflicts with other similar - functions. By default, the python import will handle installing the JAR and registering Hive UDFs which is suitable - for Assigned (vs Shared Access) clusters. + * We recommend use of :code:`import mosaic as mos` to namespace the python api and avoid any conflicts with other similar + functions. By default, the python import will handle installing the JAR and registering Spark Expressions which are + suitable for Assigned (vs Shared Access) clusters. + * It is possible to initialize python bindings without providing :code:`dbutils`; if you do this, :code:`%%mosaic_kepler` + won't be able to render maps in notebooks.
Unless you are specially adding the JAR to your cluster (outside :code:`%pip` or the WHL file), please always initialize with Python first, then you can initialize Scala (after the JAR has been auto-attached by python); otherwise, you don't @@ -139,13 +146,27 @@ confs as well as through extra params in Mosaic 0.4.x series :code:`enable_mosaic` :type jar_autoattach: bool :rtype: None -Users can control various aspects of Mosaic's operation with the following Spark confs: - - * :code:`"spark.databricks.labs.mosaic.jar.autoattach"` - Automatically attach the Mosaic JAR to the Databricks cluster (Optional, default is "true"). - * :code:`"spark.databricks.labs.mosaic.jar.path"` - Explicitly specify the path to the Mosaic JAR (Optional and not required at all in a standard Databricks environment). - * :code:`"spark.databricks.labs.mosaic.geometry.api"` - Explicitly specify the underlying geometry library to use for spatial operations (Optional, default is "JTS"). - * :code:`"spark.databricks.labs.mosaic.index.system"` - Explicitly specify the index system to use for optimized spatial joins (Optional, default is "H3"). - +Users can control various aspects of Mosaic's operation with the following optional Spark session configs: + +.. list-table:: + :widths: 25 25 50 + :header-rows: 1 + + * - Config + - Default + - Comments + * - spark.databricks.labs.mosaic.jar.autoattach + - "true" + - Automatically attach the Mosaic JAR to the Databricks cluster? + * - spark.databricks.labs.mosaic.jar.path + - "" + - Path to the Mosaic JAR, not required in standard installs + * - spark.databricks.labs.mosaic.geometry.api + - "JTS" + - Geometry library to use for spatial operations, only option in 0.4 series + * - spark.databricks.labs.mosaic.index.system + - "H3" + - Index system to use SQL usage ********* @@ -162,5 +183,5 @@ register the Mosaic SQL functions in your SparkSession from a Scala notebook cell mosaicContext.register(spark) .. warning:: - Mosaic 0.4.x SQL bindings for DBR 13 can register with Assigned clusters (as Hive UDFs), but not Shared Access due + Mosaic 0.4.x SQL bindings for DBR 13 can register with Assigned clusters (as Spark Expressions), but not Shared Access due to `Unity Catalog `_ API changes, more `here `_. diff --git a/pom.xml b/pom.xml index f0b663cce..8d4a8abf7 100644 --- a/pom.xml +++ b/pom.xml @@ -278,7 +278,7 @@ 2.12.10 2.12 3.4.0 - 0.4.1 + 0.4.2 diff --git a/python/mosaic/__init__.py b/python/mosaic/__init__.py index 84ff72ca6..22fcceb1f 100644 --- a/python/mosaic/__init__.py +++ b/python/mosaic/__init__.py @@ -4,4 +4,4 @@ from .models import SpatialKNN from .readers import read -__version__ = "0.4.1" +__version__ = "0.4.2" diff --git a/python/mosaic/api/fuse.py b/python/mosaic/api/fuse.py index ac1966cd1..607e311db 100644 --- a/python/mosaic/api/fuse.py +++ b/python/mosaic/api/fuse.py @@ -1,171 +1,69 @@ from dataclasses import dataclass +from pathlib import Path import os -import pkg_resources import requests __all__ = ["SetupMgr", "setup_fuse_install"] -def get_install_mosaic_version() -> str: - """ - Currently installed version of mosaic. - - Returns - ------- - Installed version of package 'databricks-mosaic' if available; - otherwise, None - """ - try: - return pkg_resources.get_distribution("databricks-mosaic").version - except Exception: - pass - return None - - @dataclass class SetupMgr: """ Defaults mirror setup_gdal.
""" - to_fuse_dir: str script_in_name: str = "mosaic-gdal-init.sh" script_out_name: str = "mosaic-gdal-init.sh" - with_mosaic_pip: bool = False - with_gdal: bool = True - with_ubuntugis: bool = False - override_mosaic_version: str = None jar_copy: bool = False jni_so_copy: bool = False - def configure(self) -> bool: + def configure(self, test_mode: bool = False) -> bool: """ Handle various config options. - - if `with_mosaic_pip` or `with_gdal` or `with_ubuntugis`, - script will be configured and written. Returns True unless resources fail to download. """ - # - set the mosaic and github versions - # will be used in downloading resources - # may be used in pip install - mosaic_version = get_install_mosaic_version() - github_version = mosaic_version # <- valid or None - if self.override_mosaic_version is not None and set( - self.override_mosaic_version - ) <= set("=0123456789."): - github_version = self.override_mosaic_version.replace("=", "") - github_version = mosaic_version # <- valid or None - pip_str = "" - release_version = None - - if ( - self.override_mosaic_version is not None - and self.override_mosaic_version == "main" - ): - github_version = "main" - elif self.override_mosaic_version is not None and set( - self.override_mosaic_version - ).issubset(set("=0123456789.")): - github_version = self.override_mosaic_version.replace("=", "") - elif mosaic_version is None: - github_version = "main" - - GITHUB_CONTENT_URL_BASE = ( - "https://raw.githubusercontent.com/databrickslabs/mosaic" - ) - GITHUB_CONTENT_TAG_URL = f"{GITHUB_CONTENT_URL_BASE}/v_{github_version}" - if github_version == "main": - GITHUB_CONTENT_TAG_URL = f"{GITHUB_CONTENT_URL_BASE}/main" - # - generate fuse dir path + # volumes must be pre-generated in unity catalog os.makedirs(self.to_fuse_dir, exist_ok=True) - with_script = self.with_mosaic_pip or self.with_gdal - script_out_path = f"{self.to_fuse_dir}/{self.script_out_name}" - if with_script: - # - start with the unconfigured script - script_url = f"{GITHUB_CONTENT_TAG_URL}/scripts/{self.script_in_name}" - script = None + # - start with the un-configured script (from repo) + # this is using a different (repo) folder in 0.4.2+ (to allow prior versions to work) + GITHUB_CONTENT_TAG_URL = "https://raw.githubusercontent.com/databrickslabs/mosaic/main" + script_url = f"{GITHUB_CONTENT_TAG_URL}/scripts/0.4.2/{self.script_in_name}" + script = None + root_path = None + if not test_mode: with requests.Session() as s: script = s.get(script_url, allow_redirects=True).text + else: + # test_mode (use local resource) + # - up 4 parents [0..3] + # - api [0] -> mosaic [1] -> python [2] -> mosaic [3] + root_path = Path(__file__).parents[3] + script_path = root_path / 'scripts' / '0.4.2' / self.script_in_name + script = script_path.read_text(encoding='utf-8') + + # - tokens used in script + SCRIPT_FUSE_DIR_TOKEN = "FUSE_DIR='__FUSE_DIR__'" # <- ' added + SCRIPT_WITH_FUSE_SO_TOKEN = "WITH_FUSE_SO=0" + + # - set the fuse dir + script = script.replace( + SCRIPT_FUSE_DIR_TOKEN, + SCRIPT_FUSE_DIR_TOKEN.replace("__FUSE_DIR__", self.to_fuse_dir), + ) - # - tokens used in script - SCRIPT_FUSE_DIR_TOKEN = "FUSE_DIR='__FUSE_DIR__'" # <- ' added - SCRIPT_GITHUB_VERSION_TOKEN = "GITHUB_VERSION=__GITHUB_VERSION__" - SCRIPT_MOSAIC_PIP_VERSION_TOKEN = ( - "MOSAIC_PIP_VERSION='__MOSAIC_PIP_VERSION__'" # <- ' added - ) - SCRIPT_WITH_MOSAIC_TOKEN = "WITH_MOSAIC=0" - SCRIPT_WITH_GDAL_TOKEN = "WITH_GDAL=0" - SCRIPT_WITH_UBUNTUGIS_TOKEN = "WITH_UBUNTUGIS=0" - SCRIPT_WITH_FUSE_SO_TOKEN = "WITH_FUSE_SO=0" - 
- # - set the github version in the script - # this will be used to download so files - script = script.replace( - SCRIPT_GITHUB_VERSION_TOKEN, - SCRIPT_GITHUB_VERSION_TOKEN.replace( - "__GITHUB_VERSION__", github_version - ), - ) - - # - set the fuse dir - script = script.replace( - SCRIPT_FUSE_DIR_TOKEN, - SCRIPT_FUSE_DIR_TOKEN.replace("__FUSE_DIR__", self.to_fuse_dir), - ) - - script = script.replace("apt-add-repository", "apt-add-repository -y") - - # - are we configuring for mosaic pip? - if self.with_mosaic_pip: - script = script.replace( - SCRIPT_WITH_MOSAIC_TOKEN, SCRIPT_WITH_MOSAIC_TOKEN.replace("0", "1") - ) - - # - are we configuring for gdal? - if self.with_gdal: - script = script.replace( - SCRIPT_WITH_GDAL_TOKEN, SCRIPT_WITH_GDAL_TOKEN.replace("0", "1") - ) - - # - are we configuring for ubuntugis? - if self.with_ubuntugis: - script = script.replace( - SCRIPT_WITH_UBUNTUGIS_TOKEN, - SCRIPT_WITH_UBUNTUGIS_TOKEN.replace("0", "1"), - ) - - # - are we configuring for jni so copy? - if self.jni_so_copy: - script = script.replace( - SCRIPT_WITH_FUSE_SO_TOKEN, - SCRIPT_WITH_FUSE_SO_TOKEN.replace("0", "1"), - ) - - # - set the mosaic version for pip - if ( - self.override_mosaic_version is not None - and not self.override_mosaic_version == "main" - ): - pip_str = f"=={self.override_mosaic_version}" - if any(c in self.override_mosaic_version for c in ["=", ">", "<"]): - pip_str = f"""{self.override_mosaic_version.replace("'","").replace('"','')}""" - else: - pip_str = f"=={self.override_mosaic_version}" - elif mosaic_version is not None: - pip_str = f"=={mosaic_version}" + # - are we configuring for jni so copy? + if self.jni_so_copy: script = script.replace( - SCRIPT_MOSAIC_PIP_VERSION_TOKEN, - SCRIPT_MOSAIC_PIP_VERSION_TOKEN.replace( - "__MOSAIC_PIP_VERSION__", pip_str - ), + SCRIPT_WITH_FUSE_SO_TOKEN, + SCRIPT_WITH_FUSE_SO_TOKEN.replace("0", "1"), ) - # - write the configured init script - with open(script_out_path, "w") as file: - file.write(script) + # - write the configured init script + script_out_path = Path(self.to_fuse_dir) / self.script_out_name + script_out_path.write_text(script, encoding='utf-8') # --- end of script config --- @@ -174,13 +72,13 @@ def configure(self) -> bool: if with_resources: CHUNK_SIZE = 1024 * 1024 * 64 # 64MB # - handle jar copy + # 0.4.2 always get the latest release if self.jar_copy: - # url and version details - GITHUB_RELEASE_URL_BASE = ( - "https://github.com/databrickslabs/mosaic/releases" - ) - resource_version = github_version - if github_version == "main": + if not test_mode: + # url and version details + GITHUB_RELEASE_URL_BASE = ( + "https://github.com/databrickslabs/mosaic/releases" + ) latest = None with requests.Session() as s: latest = str( @@ -190,61 +88,81 @@ def configure(self) -> bool: ).content ) resource_version = latest.split("/tag/v_")[1].split('"')[0] - # download jar - jar_filename = f"mosaic-{resource_version}-jar-with-dependencies.jar" - jar_path = f"{self.to_fuse_dir}/{jar_filename}" - with requests.Session() as s: - r = s.get( - f"{GITHUB_RELEASE_URL_BASE}/download/v_{resource_version}/{jar_filename}", - stream=True, - ) - with open(jar_path, "wb") as f: - for ch in r.iter_content(chunk_size=CHUNK_SIZE): - f.write(ch) - resource_statuses[jar_filename] = r.status_code - # - handle so copy - if self.jni_so_copy: - with requests.Session() as s: - for so_filename in [ - "libgdalalljni.so", - "libgdalalljni.so.30", - "libgdalalljni.so.30.0.3", - ]: - so_path = f"{self.to_fuse_dir}/{so_filename}" + # download jar + 
jar_filename = f"mosaic-{resource_version}-jar-with-dependencies.jar" + jar_path = f"{self.to_fuse_dir}/{jar_filename}" + with requests.Session() as s: r = s.get( - f"{GITHUB_CONTENT_TAG_URL}/resources/gdal/jammy/{so_filename}", + f"{GITHUB_RELEASE_URL_BASE}/download/v_{resource_version}/{jar_filename}", stream=True, ) - with open(so_path, "wb") as f: + with open(jar_path, "wb") as f: for ch in r.iter_content(chunk_size=CHUNK_SIZE): f.write(ch) - resource_statuses[so_filename] = r.status_code + resource_statuses[jar_filename] = r.status_code + else: + # test_mode (use local resources) + lib_path = root_path / 'python' / 'mosaic' / 'lib' + src_jar_path = None + for p in lib_path.iterdir(): + if p.name.startswith('mosaic-') and p.name.endswith('-jar-with-dependencies.jar'): + src_jar_path = p + break + if src_jar_path: + dst_jar_path = Path(f'{self.to_fuse_dir}/{src_jar_path.name}') + dst_jar_path.write_bytes(src_jar_path.read_bytes()) + + # - handle so copy + if self.jni_so_copy: + so_names = [ + "libgdalalljni.so", + "libgdalalljni.so.30", + "libgdalalljni.so.30.0.3" + ] + if not test_mode: + with requests.Session() as s: + for so_filename in so_names: + so_path = f"{self.to_fuse_dir}/{so_filename}" + r = s.get( + f"{GITHUB_CONTENT_TAG_URL}/resources/gdal/jammy/{so_filename}", + stream=True, + ) + with open(so_path, "wb") as f: + for ch in r.iter_content(chunk_size=CHUNK_SIZE): + f.write(ch) + resource_statuses[so_filename] = r.status_code + else: + # test_mode (use local resources) + resources_path = root_path / 'resources' / 'gdal' / 'jammy' + for so_filename in so_names: + src_so_path = resources_path / so_filename + dst_so_path = Path(f'{self.to_fuse_dir}/{so_filename}') + dst_so_path.write_bytes(src_so_path.read_bytes()) # - echo status print(f"::: Install setup complete :::") print( - f"- Settings: 'with_mosaic_pip'? {self.with_mosaic_pip}, 'with_gdal'? {self.with_gdal}, 'with_ubuntugis'? {self.with_ubuntugis}" + f"- Settings: 'jar_copy'? {self.jar_copy}, 'jni_so_copy'? {self.jni_so_copy}" ) + print(f"- Fuse Dir: '{self.to_fuse_dir}'") print( - f" 'override_mosaic_version'? {self.override_mosaic_version}, 'jar_copy'? {self.jar_copy}, 'jni_so_copy'? 
{self.jni_so_copy}" + f"- Init Script: configured and stored at '{self.script_out_name}'; ", + end="", + ) + print(f"add to your cluster and restart,") + print( + f" more at https://docs.databricks.com/en/init-scripts/cluster-scoped.html" ) - print(f"- Fuse Dir: '{self.to_fuse_dir}'") - if with_script: - print( - f"- Init Script: configured and stored at '{self.script_out_name}'; ", - end="", - ) - print(f"add to your cluster and restart,") - print( - f" more at https://docs.databricks.com/en/init-scripts/cluster-scoped.html" - ) if with_resources: print(f"- Resource(s): copied") print(resource_statuses) + if test_mode: + print(f"- Test Mode files generated") + print(os.listdir(self.to_fuse_dir)) print("\n") if not any(resource_statuses) or all( - value == 200 for value in resource_statuses.values() + value == 200 for value in resource_statuses.values() ): return True else: @@ -252,34 +170,17 @@ def configure(self) -> bool: def setup_fuse_install( - to_fuse_dir: str, - with_mosaic_pip: bool, - with_gdal: bool, - with_ubuntugis: bool = False, - script_out_name: str = "mosaic-fuse-init.sh", - override_mosaic_version: str = None, - jar_copy: bool = True, - jni_so_copy: bool = True, -) -> None: + to_fuse_dir: str, + script_out_name: str = "mosaic-fuse-init.sh", + jar_copy: bool = True, + jni_so_copy: bool = True, + test_mode: bool = False +) -> bool: """ - [1] Copies Mosaic "fat" JAR (with dependencies) into `to_fuse_dir` - - by default, version will match the current mosaic version executing the command, - assuming it is a released version; if `override_mosaic_version` is a single value, - versus a range, that value will be used instead - - this doesn't involve a script unless `with_mosaic_pip=True` or `with_gdal=True` - - if `jar_copy=False`, then the JAR is not copied - [2] if `with_mosaic_pip=True` - - By default, configures script to pip install databricks-mosaic using current mosaic - version executing the command or to `override_mosaic_version` - - this is useful (1) to "pin" to a specific mosaic version, especially if using the - JAR that is also being pre-staged for this version and (2) to consolidate all mosaic - setup into a script and avoid needing to `%pip install databricks-mosaic` in each session - [3] if `with_gdal=True` - - configures script that is a variation of what setup_gdal does with some differences + [1] if `jar_copy=True` + - Copies current mosaic "fat" JAR (with dependencies) into `to_fuse_dir` + [2] if `jni_so_copy=True` - configures to load shared objects from fuse dir (vs wget) - [4] if `with_ubuntugis=True` (assumes `with_gdal=True`) - - configures script to use the GDAL version provided by ubuntugis - - default is False Notes: (a) `to_fuse_dir` can be one of `/Volumes/..`, `/Workspace/..`, `/dbfs/..` (b) Volume paths are the recommended FUSE mount for Databricks in DBR 13.3+ @@ -291,38 +192,24 @@ def setup_fuse_install( ---------- to_fuse_dir : str Path to write out the resource(s) for GDAL installation. - with_mosaic_pip : bool - Whether to configure a script that pip installs databricks-mosaic, - fixed to the current version. - with_gdal : bool - Whether to also configure a script for GDAL and pre-stage GDAL JNI shared object files. - with_ubuntugis : bool - Whether to use ubuntugis ppa for GDAL instead of built-in; - default is False. script_out_name : str name of the script to be written; default is 'mosaic-fuse-init.sh'. - override_mosaic_version: str - String value to use to override the mosaic version to install, - e.g. 
'==0.4.0' or '<0.5,>=0.4'; - default is None. jar_copy: bool Whether to copy the Mosaic JAR; default is True. jni_so_copy: bool Whether to copy the GDAL JNI shared objects; default is True. + test_mode: bool + Only for unit tests. Returns True unless resources fail to download. ------- """ setup_mgr = SetupMgr( to_fuse_dir, - with_mosaic_pip=with_mosaic_pip, - with_gdal=with_gdal, - with_ubuntugis=with_ubuntugis, script_out_name=script_out_name, - override_mosaic_version=override_mosaic_version, jar_copy=jar_copy, jni_so_copy=jni_so_copy, ) - return setup_mgr.configure() + return setup_mgr.configure(test_mode=test_mode) diff --git a/python/mosaic/api/gdal.py b/python/mosaic/api/gdal.py index 386070400..57c0e10f6 100644 --- a/python/mosaic/api/gdal.py +++ b/python/mosaic/api/gdal.py @@ -7,12 +7,10 @@ def setup_gdal( - to_fuse_dir: str = "/Workspace/Shared/geospatial/mosaic/gdal/jammy", - with_mosaic_pip: bool = False, - with_ubuntugis: bool = False, - script_out_name: str = "mosaic-gdal-init.sh", - override_mosaic_version: str = None, - jni_so_copy: bool = False, + to_fuse_dir: str = "/Workspace/Shared/geospatial/mosaic/gdal/jammy/0.4.2", + script_out_name: str = "mosaic-gdal-init.sh", + jni_so_copy: bool = False, + test_mode: bool = False ) -> bool: """ Prepare GDAL init script and shared objects required for GDAL to run on spark. @@ -21,8 +19,8 @@ def setup_gdal( a cluster restart is required. Notes: - (a) This is close in behavior to Mosaic < 0.4 series (prior to DBR 13), with new options - to pip install Mosaic for either ubuntugis gdal (3.4.3) or jammy default (3.4.1) + (a) This is close in behavior to Mosaic < 0.4 series (prior to DBR 13), + now using jammy default (3.4.1) (b) `to_fuse_dir` can be one of `/Volumes/..`, `/Workspace/..`, `/dbfs/..`; however, you should use `setup_fuse_install()` for Volume based installs @@ -30,33 +28,26 @@ def setup_gdal( ---------- to_fuse_dir : str Path to write out the init script for GDAL installation; - default is '/Workspace/Shared/geospatial/mosaic/gdal/jammy'. - with_mosaic_pip : bool - Whether to configure a script that pip installs databricks-mosaic, - fixed to the current version; default is False. - with_ubuntugis : bool - Whether to use ubuntugis ppa for GDAL instead of built-in; - default is False. + default is '/Workspace/Shared/geospatial/mosaic/gdal/jammy/0.4.2'. script_out_name : str name of the script to be written; default is 'mosaic-gdal-init.sh'. - override_mosaic_version: str - String value to use to override the mosaic version to install, - e.g. '==0.4.0' or '<0.5,>=0.4'; - default is None. + jni_so_copy : bool + if True, copy shared object to fuse dir and config script to use; + default is False + test_mode : bool + Only for unit tests. - Returns True unless resources fail to download. + Returns ------- + True unless resources fail to download. 
""" setup_mgr = SetupMgr( to_fuse_dir, - with_mosaic_pip=with_mosaic_pip, - with_ubuntugis=with_ubuntugis, script_out_name=script_out_name, - override_mosaic_version=override_mosaic_version, - jni_so_copy=jni_so_copy, + jni_so_copy=jni_so_copy ) - return setup_mgr.configure() + return setup_mgr.configure(test_mode=test_mode) def enable_gdal(spark: SparkSession) -> None: diff --git a/python/setup.cfg b/python/setup.cfg index 01ca109a2..298663a38 100644 --- a/python/setup.cfg +++ b/python/setup.cfg @@ -18,10 +18,11 @@ classifiers = packages = find: python_requires = >=3.10.0 install_requires = + geopandas<0.14.4,>=0.14 + h3<4.0,>=3.7 + ipython>=7.22.0 keplergl==0.3.2 - h3==3.7.0 pyspark<3.5,>=3.4 - ipython>=7.22.0 [options.package_data] mosaic = diff --git a/python/test/test_fuse_install.py b/python/test/test_fuse_install.py index 90a3b9c1b..ec0d82745 100644 --- a/python/test/test_fuse_install.py +++ b/python/test/test_fuse_install.py @@ -5,50 +5,38 @@ class TestFuseInstall(SparkTestCase): def setUp(self) -> None: return super().setUp() - def test_setup_no_op(self): - installer = FuseInstaller(False, False, jar_copy=False, jni_so_copy=False) + def test_setup_script_only(self): + installer = FuseInstaller(jar_copy=False, jni_so_copy=False) try: self.assertTrue(installer.do_op()) except Exception: self.fail("Executing `setup_fuse_install()` raised an exception.") - self.assertEqual(len(installer.list_files()), 0) # <- nothing generated + self.assertEqual(len(installer.list_files()),1) # <- script generated - def test_setup_jar_only(self): - installer = FuseInstaller(False, False, jar_copy=True, jni_so_copy=False) + def test_setup_jar(self): + installer = FuseInstaller(jar_copy=True, jni_so_copy=False) try: self.assertTrue(installer.do_op()) except Exception: self.fail("Executing `setup_fuse_install()` raised an exception.") - def test_setup_sh_pip_only(self): - installer = FuseInstaller(True, False, jar_copy=False, jni_so_copy=False) - try: - self.assertTrue(installer.do_op()) - except Exception: - self.fail("Executing `setup_fuse_install()` raised an exception.") - - self.assertEqual(len(installer.list_files()), 1) # <- just init script + self.assertEqual(len(installer.list_files()), 2) # <- init script and jar - def test_setup_sh_gdal(self): - installer = FuseInstaller(False, True, jar_copy=False, jni_so_copy=False) + def test_setup_jni(self): + installer = FuseInstaller(jar_copy=False, jni_so_copy=True) try: self.assertTrue(installer.do_op()) except Exception: self.fail("Executing `setup_fuse_install()` raised an exception.") - self.assertEqual(len(installer.list_files()), 1) # <- just init script + self.assertEqual(len(installer.list_files()), 4) # <- init script and so files - def test_setup_sh_gdal_jni(self): - installer = FuseInstaller(False, True, jar_copy=False, jni_so_copy=True) + def test_setup_all(self): + installer = FuseInstaller(jar_copy=True, jni_so_copy=True) try: self.assertTrue(installer.do_op()) except Exception: self.fail("Executing `setup_fuse_install()` raised an exception.") - def test_setup_sh_all(self): - installer = FuseInstaller(True, True, jar_copy=True, jni_so_copy=True) - try: - self.assertTrue(installer.do_op()) - except Exception: - self.fail("Executing `setup_fuse_install()` raised an exception.") + self.assertEqual(len(installer.list_files()), 5) # <- init script jar, and so files diff --git a/python/test/test_gdal_install.py b/python/test/test_gdal_install.py index f98a3d8b2..5b8117566 100644 --- a/python/test/test_gdal_install.py +++ 
b/python/test/test_gdal_install.py @@ -12,14 +12,4 @@ def test_setup_gdal(self): except Exception: self.fail("Copying objects with `setup_gdal()` raised an exception.") - self.assertEqual( - len(installer.list_files()), 4 - ) # <- init script and shared objs - - try: - installer_result = installer.run_init_script() - self.assertEqual(installer_result, 0) - gdalinfo_result = installer.test_gdalinfo() - self.assertEqual(gdalinfo_result, "GDAL 3.4.1, released 2021/12/27\n") - except Exception: - self.fail("Execution of GDAL init script raised an exception.") + self.assertEqual(len(installer.list_files()),1) # <- init script diff --git a/python/test/utils/setup_fuse.py b/python/test/utils/setup_fuse.py index 5bdc2b472..ac2fae795 100644 --- a/python/test/utils/setup_fuse.py +++ b/python/test/utils/setup_fuse.py @@ -8,11 +8,9 @@ class FuseInstaller: - def __init__(self, with_mosaic_pip, with_gdal, jar_copy=False, jni_so_copy=False): + def __init__(self, jar_copy=False, jni_so_copy=False): self._site_packages = working_set.find(Requirement("keplergl")).location self._temp_dir = tempfile.mkdtemp() - self.with_mosaic_pip = with_mosaic_pip - self.with_gdal = with_gdal self.jar_copy = jar_copy self.jni_so_copy = jni_so_copy self.FUSE_INIT_SCRIPT_FILENAME = "mosaic-fuse-init.sh" @@ -23,12 +21,10 @@ def __del__(self): def do_op(self) -> bool: return api.setup_fuse_install( self._temp_dir, - self.with_mosaic_pip, - self.with_gdal, jar_copy=self.jar_copy, jni_so_copy=self.jni_so_copy, - override_mosaic_version="main", script_out_name=self.FUSE_INIT_SCRIPT_FILENAME, + test_mode=True ) def run_init_script(self) -> int: diff --git a/python/test/utils/setup_gdal.py b/python/test/utils/setup_gdal.py index a62c4207e..7dab60179 100644 --- a/python/test/utils/setup_gdal.py +++ b/python/test/utils/setup_gdal.py @@ -1,6 +1,5 @@ import os import shutil -import subprocess import tempfile from pkg_resources import working_set, Requirement @@ -19,26 +18,10 @@ def __del__(self): def do_op(self) -> bool: return api.setup_gdal( to_fuse_dir=self._temp_dir, - override_mosaic_version="main", script_out_name=self.GDAL_INIT_SCRIPT_FILENAME, - jni_so_copy=True, + jni_so_copy=False, + test_mode=True ) - def run_init_script(self) -> int: - gdal_install_script_target = os.path.join( - self._temp_dir, self.GDAL_INIT_SCRIPT_FILENAME - ) - os.chmod(gdal_install_script_target, mode=0x744) - result = subprocess.run( - [gdal_install_script_target], - stdout=subprocess.DEVNULL, - env=dict(os.environ, DATABRICKS_ROOT_VIRTUALENV_ENV=self._site_packages), - ) - return result.returncode - def list_files(self) -> list[str]: return os.listdir(self._temp_dir) - - def test_gdalinfo(self) -> str: - result = subprocess.run(["gdalinfo", "--version"], stdout=subprocess.PIPE) - return result.stdout.decode() diff --git a/scripts/0.4.2/mosaic-gdal-init.sh b/scripts/0.4.2/mosaic-gdal-init.sh new file mode 100644 index 000000000..9ee60f8c2 --- /dev/null +++ b/scripts/0.4.2/mosaic-gdal-init.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# -- +# This is for Ubuntu 22.04 (Jammy) +# [1] corresponds to DBR 13+ +# [2] jammy offers GDAL 3.4.1 +# [3] see Mosaic functions (python) to configure +# and pre-stage resources: +# - setup_fuse_install(...) and +# - setup_gdal(...) 
+# [4] this script has conditional logic based on variables +# [5] stripped back in Mosaic 0.4.2+ +# Author: Michael Johns | mjohns@databricks.com +# Last Modified: 29 APR, 2024 + +# TEMPLATE-BASED REPLACEMENT +# - can also be manually specified +FUSE_DIR='__FUSE_DIR__' + +# CONDITIONAL LOGIC +WITH_FUSE_SO=0 # <- use fuse dir shared objects (vs wget) + +# refresh package info +# 0.4.2 - added "-y" +sudo apt-add-repository -y "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc)-backports main universe multiverse restricted" +sudo apt-add-repository -y "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc)-updates main universe multiverse restricted" +sudo apt-add-repository -y "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc)-security main multiverse restricted universe" +sudo apt-add-repository -y "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc) main multiverse restricted universe" +sudo apt-get update -y + +# install natives +# 0.4.2 added package lock wait (can change value) +sudo apt-get -o DPkg::Lock::Timeout=-1 install -y unixodbc libcurl3-gnutls libsnappy-dev libopenjp2-7 +sudo apt-get -o DPkg::Lock::Timeout=-1 install -y gdal-bin libgdal-dev python3-numpy python3-gdal + +# pip install gdal +# matches jammy version +pip install --upgrade pip +pip install gdal==3.4.1 + +# add pre-built JNI shared objects to the path +if [ $WITH_FUSE_SO == 1 ] +then + # copy from fuse dir with no-clobber + sudo cp -n $FUSE_DIR/libgdalalljni.so /usr/lib + sudo cp -n $FUSE_DIR/libgdalalljni.so.30 /usr/lib + sudo cp -n $FUSE_DIR/libgdalalljni.so.30.0.3 /usr/lib +else + # copy from github + GITHUB_REPO_PATH=databrickslabs/mosaic/main/resources/gdal/jammy + sudo wget -nv -P /usr/lib -nc https://raw.githubusercontent.com/$GITHUB_REPO_PATH/libgdalalljni.so + sudo wget -nv -P /usr/lib -nc https://raw.githubusercontent.com/$GITHUB_REPO_PATH/libgdalalljni.so.30 + sudo wget -nv -P /usr/lib -nc https://raw.githubusercontent.com/$GITHUB_REPO_PATH/libgdalalljni.so.30.0.3 +fi diff --git a/scripts/mosaic-gdal-init.sh b/scripts/mosaic-gdal-init.sh index 950b0ffe9..a899c7cb2 100644 --- a/scripts/mosaic-gdal-init.sh +++ b/scripts/mosaic-gdal-init.sh @@ -1,17 +1,19 @@ #!/bin/bash -# -- +# -- +# SCRIPT FOR 0.4.0 and 0.4.1 +# NOT USED in 0.4.2+ # This is for Ubuntu 22.04 (Jammy) # [1] corresponds to DBR 13+ # [2] jammy offers GDAL 3.4.1 (default) -# - optional: Ubuntugis offers GDAL 3.4.3, - with additional ppa added +# - backported: ignoring ubuntugis +# - repo changed to incompatible version # [3] see Mosaic functions (python) to configure # and pre-stage resources: # - setup_fuse_install(...) and # - setup_gdal(...) # [4] this script has conditional logic based on variables # Author: Michael Johns | mjohns@databricks.com -# Last Modified: 05 JAN, 2024 +# Last Modified: 29 APR, 2024 # TEMPLATE-BASED REPLACEMENT # - can also be manually specified @@ -22,11 +24,11 @@ MOSAIC_PIP_VERSION='__MOSAIC_PIP_VERSION__' # CONDITIONAL LOGIC WITH_MOSAIC=0 # <- pip install mosaic? WITH_GDAL=0 # <- install gdal? -WITH_UBUNTUGIS=0 # <- use ubuntugis ppa? -WITH_FUSE_SO=0 # <- use fuse dir shared objects (vs wget) +WITH_UBUNTUGIS=0 # <- use ubuntugis ppa, now ignored!
+WITH_FUSE_SO=0 # <- use fuse dir shared objects (vs wget) -# SPECIFIED VERSIONS -GDAL_VERSION=3.4.1 # <- ubuntugis is 3.4.3 +# SPECIFIED VERSIONS +GDAL_VERSION=3.4.1 # - optional: install GDAL if [ $WITH_GDAL == 1 ] then @@ -36,11 +38,6 @@ then sudo apt-add-repository "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc)-updates main universe multiverse restricted" sudo apt-add-repository "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc)-security main multiverse restricted universe" sudo apt-add-repository "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc) main multiverse restricted universe" - if [ $WITH_UBUNTUGIS == 1 ] - then - sudo add-apt-repository ppa:ubuntugis/ppa - GDAL_VERSION=3.4.3 - fi sudo apt-get update -y # - install natives @@ -60,7 +57,6 @@ then sudo cp -n $FUSE_DIR/libgdalalljni.so.30.0.3 /usr/lib else # copy from github - # TODO: in v0.4.1, include $GITHUB_VERSION GITHUB_REPO_PATH=databrickslabs/mosaic/main/resources/gdal/jammy sudo wget -nv -P /usr/lib -nc https://raw.githubusercontent.com/$GITHUB_REPO_PATH/libgdalalljni.so sudo wget -nv -P /usr/lib -nc https://raw.githubusercontent.com/$GITHUB_REPO_PATH/libgdalalljni.so.30 diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/api/GDAL.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/api/GDAL.scala index 3517f9368..d840a3205 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/api/GDAL.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/api/GDAL.scala @@ -82,39 +82,63 @@ object GDAL { } /** - * Reads a raster from the given input data. If it is a byte array, it will - * read the raster from the byte array. If it is a string, it will read the - * raster from the path. If the path is a zip file, it will read the raster - * from the zip file. If the path is a subdataset, it will read the raster - * from the subdataset. - * - * @param inputRaster - * The path to the raster. This path has to be a path to a single raster. - * Rasters with subdatasets are supported. - * @return - * Returns a Raster object. - */ + * Reads a raster from the given input data. + * - If it is a byte array, it will read the raster from the byte array. + * - If it is a string, it will read the raster from the path. + * - Path may be a zip file. + * - Path may be a subdataset. + * + * @param inputRaster + * The raster, based on inputDT. Path-based rasters with subdatasets + * are supported. + * @param createInfo + * Mosaic creation info of the raster. Note: this is neither the + * raster's metadata nor GDAL creation options. + * @param inputDT + * [[DataType]] for the raster, either [[StringType]] or [[BinaryType]]. + * @return + * Returns a [[MosaicRasterGDAL]] object. + */ def readRaster( - inputRaster: Any, - createInfo: Map[String, String], - inputDT: DataType - ): MosaicRasterGDAL = { - inputDT match { - case StringType => - MosaicRasterGDAL.readRaster(createInfo) - case BinaryType => - val bytes = inputRaster.asInstanceOf[Array[Byte]] - val raster = MosaicRasterGDAL.readRaster(bytes, createInfo) - // If the raster is coming as a byte array, we can't check for zip condition. - // We first try to read the raster directly, if it fails, we read it as a zip.
-                if (raster == null) {
-                    val parentPath = createInfo("parentPath")
-                    val zippedPath = s"/vsizip/$parentPath"
-                    MosaicRasterGDAL.readRaster(bytes, createInfo + ("path" -> zippedPath))
-                } else {
-                    raster
-                }
-            case _ => throw new IllegalArgumentException(s"Unsupported data type: $inputDT")
+        inputRaster: Any,
+        createInfo: Map[String, String],
+        inputDT: DataType
+    ): MosaicRasterGDAL = {
+        if (inputRaster == null) {
+            MosaicRasterGDAL(null, createInfo, -1)
+        } else {
+            inputDT match {
+                case _: StringType =>
+                    MosaicRasterGDAL.readRaster(createInfo)
+                case _: BinaryType =>
+                    val bytes = inputRaster.asInstanceOf[Array[Byte]]
+                    try {
+                        val rasterObj = MosaicRasterGDAL.readRaster(bytes, createInfo)
+                        if (rasterObj.raster == null) {
+                            val rasterZipObj = readParentZipBinary(bytes, createInfo)
+                            if (rasterZipObj.raster == null) {
+                                rasterObj // <- return initial
+                            } else {
+                                rasterZipObj
+                            }
+                        } else {
+                            rasterObj
+                        }
+                    } catch {
+                        case _: Throwable => readParentZipBinary(bytes, createInfo)
+                    }
+                case _ => throw new IllegalArgumentException(s"Unsupported data type: $inputDT")
+            }
+        }
+    }
+
+    private def readParentZipBinary(bytes: Array[Byte], createInfo: Map[String, String]): MosaicRasterGDAL = {
+        try {
+            val parentPath = createInfo("parentPath")
+            val zippedPath = s"/vsizip/$parentPath"
+            MosaicRasterGDAL.readRaster(bytes, createInfo + ("path" -> zippedPath))
+        } catch {
+            case _: Throwable => MosaicRasterGDAL(null, createInfo, -1)
+        }
+    }
diff --git a/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala b/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala
index 00028dbb0..8a1075a76 100644
--- a/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala
+++ b/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala
@@ -1043,7 +1043,7 @@ class MosaicContext(indexSystem: IndexSystem, geometryAPI: GeometryAPI) extends
 object MosaicContext extends Logging {
 
     var _tmpDir: String = ""
-    val mosaicVersion: String = "0.4.1"
+    val mosaicVersion: String = "0.4.2"
 
     private var instance: Option[MosaicContext] = None
 
diff --git a/src/main/scala/com/databricks/labs/mosaic/package.scala b/src/main/scala/com/databricks/labs/mosaic/package.scala
index 86bdbcec7..7b5f1da73 100644
--- a/src/main/scala/com/databricks/labs/mosaic/package.scala
+++ b/src/main/scala/com/databricks/labs/mosaic/package.scala
@@ -32,6 +32,7 @@ package object mosaic {
     val MOSAIC_RASTER_RE_TILE_ON_READ = "retile_on_read"
 
     val MOSAIC_NO_DRIVER = "no_driver"
+    val MOSAIC_TEST_MODE = "spark.databricks.labs.mosaic.test.mode"
 
     def read: MosaicDataFrameReader = new MosaicDataFrameReader(SparkSession.builder().getOrCreate())
 
diff --git a/src/test/resources/binary/geotiff-small/chicago_sp27.tif b/src/test/resources/binary/geotiff-small/chicago_sp27.tif
new file mode 100644
index 000000000..bf2a26cfa
Binary files /dev/null and b/src/test/resources/binary/geotiff-small/chicago_sp27.tif differ
diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_FilterBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_FilterBehaviors.scala
index 2d64a633c..a371a1ed5 100644
--- a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_FilterBehaviors.scala
+++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_FilterBehaviors.scala
@@ -11,7 +11,7 @@ trait RST_FilterBehaviors extends QueryTest {
 
     // noinspection MapGetGet
     def behaviors(indexSystem: IndexSystem, geometryAPI: GeometryAPI): Unit = {
-        spark.sparkContext.setLogLevel("FATAL")
spark.sparkContext.setLogLevel("FATAL") + spark.sparkContext.setLogLevel("ERROR") val mc = MosaicContext.build(indexSystem, geometryAPI) mc.register() val sc = spark @@ -21,52 +21,52 @@ trait RST_FilterBehaviors extends QueryTest { val rastersInMemory = spark.read .format("gdal") .option("raster_storage", "in-memory") - .load("src/test/resources/modis") + .load("src/test/resources/binary/geotiff-small/chicago_sp27.tif") val gridTiles = rastersInMemory .withColumn("result", rst_filter($"tile", 3, "mode")) .select("result") .collect() - gridTiles.length should be(7) + gridTiles.length should be(1) val gridTiles2 = rastersInMemory .withColumn("result", rst_filter($"tile", lit(3), lit("mode"))) .select("result") .collect() - gridTiles2.length should be(7) + gridTiles2.length should be(1) val gridTiles3 = rastersInMemory .withColumn("result", rst_filter($"tile", lit(3), lit("avg"))) .select("result") .collect() - gridTiles3.length should be(7) + gridTiles3.length should be(1) val gridTiles4 = rastersInMemory .withColumn("result", rst_filter($"tile", lit(3), lit("min"))) .select("result") .collect() - gridTiles4.length should be(7) + gridTiles4.length should be(1) val gridTiles5 = rastersInMemory .withColumn("result", rst_filter($"tile", lit(3), lit("max"))) .select("result") .collect() - gridTiles5.length should be(7) + gridTiles5.length should be(1) val gridTiles6 = rastersInMemory .withColumn("result", rst_filter($"tile", lit(3), lit("median"))) .select("result") .collect() - gridTiles6.length should be(7) + gridTiles6.length should be(1) rastersInMemory.createOrReplaceTempView("source") - + noException should be thrownBy spark .sql(""" |select rst_filter(tile, 3, 'mode') as tile from source diff --git a/src/test/scala/org/apache/spark/sql/test/SharedSparkSessionGDAL.scala b/src/test/scala/org/apache/spark/sql/test/SharedSparkSessionGDAL.scala index 12dcac6f3..9aa152cee 100644 --- a/src/test/scala/org/apache/spark/sql/test/SharedSparkSessionGDAL.scala +++ b/src/test/scala/org/apache/spark/sql/test/SharedSparkSessionGDAL.scala @@ -2,7 +2,7 @@ package org.apache.spark.sql.test import com.databricks.labs.mosaic.gdal.MosaicGDAL import com.databricks.labs.mosaic.utils.FileUtils -import com.databricks.labs.mosaic.{MOSAIC_GDAL_NATIVE, MOSAIC_RASTER_CHECKPOINT} +import com.databricks.labs.mosaic.{MOSAIC_GDAL_NATIVE, MOSAIC_RASTER_CHECKPOINT, MOSAIC_TEST_MODE} import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession import org.gdal.gdal.gdal @@ -14,6 +14,8 @@ trait SharedSparkSessionGDAL extends SharedSparkSession { override def sparkConf: SparkConf = { super.sparkConf .set(MOSAIC_GDAL_NATIVE, "true") + super.sparkConf + .set(MOSAIC_TEST_MODE, "true") } override def createSparkSession: TestSparkSession = {