From 08e669a4e64fbfcf93eb6e9ae89c96fb04724a1c Mon Sep 17 00:00:00 2001 From: Anthony Tuininga Date: Mon, 17 Feb 2025 19:35:07 -0700 Subject: [PATCH] Added preliminary support for fetching data as Apache Arrow arrays with zero copy interchange with popular data frame libraries (#375). --- .gitignore | 4 + THIRD_PARTY_LICENSES.txt | 244 + doc/src/api_manual/connection.rst | 55 + doc/src/api_manual/dataframe.rst | 222 + doc/src/api_manual/defaults.rst | 4 + doc/src/index.rst | 1 + doc/src/release_notes.rst | 4 + doc/src/user_guide/sql_execution.rst | 363 +- doc/src/user_guide/tuning.rst | 11 + samples/dataframe_numpy.py | 71 + samples/dataframe_pandas.py | 90 + samples/dataframe_parquet_write.py | 87 + samples/dataframe_polars.py | 67 + samples/dataframe_pyarrow.py | 95 + samples/dataframe_torch.py | 67 + samples/sql/create_schema.sql | 38 + setup.cfg | 1 + setup.py | 26 +- src/oracledb/__init__.py | 6 +- src/oracledb/base_impl.pxd | 19 + src/oracledb/base_impl.pyx | 18 +- src/oracledb/connection.py | 37 + src/oracledb/errors.py | 9 + src/oracledb/impl/base/converters.pyx | 127 + src/oracledb/impl/base/cursor.pyx | 52 +- src/oracledb/impl/base/metadata.pyx | 36 +- src/oracledb/impl/base/utils.pyx | 2 + src/oracledb/impl/base/var.pyx | 24 +- src/oracledb/impl/thick/cursor.pyx | 15 +- src/oracledb/impl/thick/var.pyx | 80 +- src/oracledb/impl/thin/messages.pyx | 21 +- src/oracledb/interchange/__init__.py | 0 src/oracledb/interchange/buffer.py | 82 + src/oracledb/interchange/column.py | 205 + src/oracledb/interchange/dataframe.py | 151 + .../interchange/nanoarrow/nanoarrow.c | 3872 +++++++++++++++ .../interchange/nanoarrow/nanoarrow.h | 4279 +++++++++++++++++ src/oracledb/interchange/nanoarrow_bridge.pxd | 102 + src/oracledb/interchange/nanoarrow_bridge.pyx | 334 ++ src/oracledb/interchange/protocol.py | 282 ++ src/oracledb/thick_impl.pyx | 5 +- src/oracledb/thin_impl.pyx | 3 +- tests/sql/create_schema.sql | 13 + tests/test_8000_dataframe.py | 481 ++ utils/templates/connection.py | 37 + 45 files changed, 11710 insertions(+), 32 deletions(-) create mode 100644 doc/src/api_manual/dataframe.rst create mode 100644 samples/dataframe_numpy.py create mode 100644 samples/dataframe_pandas.py create mode 100644 samples/dataframe_parquet_write.py create mode 100644 samples/dataframe_polars.py create mode 100644 samples/dataframe_pyarrow.py create mode 100644 samples/dataframe_torch.py create mode 100644 src/oracledb/interchange/__init__.py create mode 100644 src/oracledb/interchange/buffer.py create mode 100644 src/oracledb/interchange/column.py create mode 100644 src/oracledb/interchange/dataframe.py create mode 100644 src/oracledb/interchange/nanoarrow/nanoarrow.c create mode 100644 src/oracledb/interchange/nanoarrow/nanoarrow.h create mode 100644 src/oracledb/interchange/nanoarrow_bridge.pxd create mode 100644 src/oracledb/interchange/nanoarrow_bridge.pyx create mode 100644 src/oracledb/interchange/protocol.py create mode 100644 tests/test_8000_dataframe.py diff --git a/.gitignore b/.gitignore index d672f61d..8c791a41 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,10 @@ build/ dist/ doc/build src/oracledb/*.c +src/oracledb/interchange/*.c tests/ext/config.ini .ipynb_checkpoints/ +.venv*/ +.idea samples/sample.csv +samples/sample.parquet diff --git a/THIRD_PARTY_LICENSES.txt b/THIRD_PARTY_LICENSES.txt index 5cd97ac0..1c1cf597 100644 --- a/THIRD_PARTY_LICENSES.txt +++ b/THIRD_PARTY_LICENSES.txt @@ -453,3 +453,247 @@ GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +___________________________________________________________________________________________ + +Apache Arrow nanoarrow +Copyright 2023 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +---------- + + + +Apache nanoarrow 0.6.0 + + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +---------- + +Copyright 2015-2023 Mikkel F. Jørgensen, dvide.com +Copyright (c) 2016 Mikkel Fahnøe Jørgensen, dvide.com +Copyright (c) 2005-2016 Paul Hsieh +Copyright (c) 2024 Mikkel Fahnøe Jørgensen, dvide.com + +---------- + +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. diff --git a/doc/src/api_manual/connection.rst b/doc/src/api_manual/connection.rst index a685dcea..c4f5f409 100644 --- a/doc/src/api_manual/connection.rst +++ b/doc/src/api_manual/connection.rst @@ -128,6 +128,61 @@ Connection Methods .. versionadded:: 2.1.0 +.. method:: Connection.fetch_df_all(statement, parameters=None, \ + arraysize=None) + + Fetches all rows of the SQL query ``statement``, returning them in an + :ref:`OracleDataFrame ` object. An empty + OracleDataFrame is returned if there are no rows available. + + The ``parameters`` parameter can be a list of tuples, where each tuple item + maps to one :ref:`bind variable placeholder ` in ``statement``. It + can also be a list of dictionaries, where the keys match the bind variable + placeholder names in ``statement``. + + The ``arraysize`` parameter can be specified to tune performance of fetching + data across the network. It defaults to :attr:`defaults.arraysize`. + Internally, the ``fetch_df_all()``'s :attr:`Cursor.prefetchrows` size is + always set to the value of the explicit or default ``arraysize`` parameter + value. + + See :ref:`dataframeformat` for the supported data types and examples. + + .. note:: + + The data frame support in python-oracledb 3.0.0 is a pre-release and + may change in the next version. + + .. versionadded:: 3.0.0 + +..
method:: Connection.fetch_df_batches(statement, parameters=None, \ + size=None) + + This returns an iterator yielding the next ``size`` rows of the SQL query + ``statement`` in each iteration as an :ref:`OracleDataFrame + ` object. An empty OracleDataFrame is returned if there + are no rows available. + + The ``parameters`` parameter can be a list of tuples, where each tuple item + maps to one :ref:`bind variable placeholder ` in ``statement``. It + can also be a list of dictionaries, where the keys match the bind variable + placeholder names in ``statement``. + + The ``size`` parameter controls the number of records fetched in each + batch. It defaults to :attr:`defaults.arraysize`. Internally, the + ``fetch_df_batches()``'s :attr:`Cursor.arraysize` and + :attr:`Cursor.prefetchrows` sizes are always set to the value of the + explicit or default ``size`` parameter value. + + See :ref:`dataframeformat` for the supported data types and examples. + + .. note:: + + The data frame support in python-oracledb 3.0.0 is a pre-release and + may change in the next version. + + .. versionadded:: 3.0.0 + .. method:: Connection.getSodaDatabase() Returns a :ref:`SodaDatabase ` object for Simple Oracle Document diff --git a/doc/src/api_manual/dataframe.rst b/doc/src/api_manual/dataframe.rst new file mode 100644 index 00000000..c1d7ec12 --- /dev/null +++ b/doc/src/api_manual/dataframe.rst @@ -0,0 +1,222 @@ +.. _oracledataframeobj: + +**************** +API: Data Frames +**************** + +Python-oracledb can fetch directly to the `Python DataFrame Interchange +Protocol `__ +format. + +See :ref:`dataframeformat` for more information, including the type mapping +from Oracle Database types to Arrow data types. + +.. note:: + + The data frame support in python-oracledb 3.0.0 is a pre-release and may + change in the next version. + +OracleDataFrame Objects +======================= + +OracleDataFrame objects are returned from the methods +:meth:`Connection.fetch_df_all()` and :meth:`Connection.fetch_df_batches()`. + +The OracleDataFrame object is an extension to the DB API. + +.. versionadded:: 3.0.0 + +.. _oracledataframemeth: + +OracleDataFrame Methods +----------------------- + +The object implements the Python DataFrame Interchange Protocol `DataFrame API +Interface `__. + +.. method:: OracleDataFrame.column_arrays() + + Returns a list of :ref:`OracleArrowArray ` objects, + each containing a select list column. + + This is an extension to the DataFrame Interchange Protocol. + +.. method:: OracleDataFrame.column_names() + + Returns a list of the column names in the data frame. + +.. method:: OracleDataFrame.get_chunks(n_chunks) + + Returns itself, since python-oracledb only uses one chunk. + +.. method:: OracleDataFrame.get_column(i) + + Returns an :ref:`OracleColumn ` object for the column + at the given index ``i``. + +.. method:: OracleDataFrame.get_column_by_name(name) + + Returns an :ref:`OracleColumn ` object for the column + with the given name ``name``. + +.. method:: OracleDataFrame.get_columns() + + Returns a list of :ref:`OracleColumn ` objects, one + object for each column in the data frame. + +.. method:: OracleDataFrame.num_chunks() + + Returns the number of chunks the data frame consists of. + + This always returns 1. + +.. method:: OracleDataFrame.num_columns() + + Returns the number of columns in the data frame. + +.. method:: OracleDataFrame.num_rows() + + Returns the number of rows in the data frame. + +.. _oracledataframeattr: + +OracleDataFrame Attributes +-------------------------- + +..
attribute:: OracleDataFrame.metadata + + This read-only attribute returns the metadata for the data frame as a + dictionary with keys ``num_columns``, ``num_rows``, and ``num_chunks``, + showing the number of columns, rows, and chunks, respectively. The number + of chunks is always 1 in python-oracledb. + +.. _oraclearrowarrayobj: + +OracleArrowArray Objects +======================== + +OracleArrowArray objects are returned by +:meth:`OracleDataFrame.column_arrays()`. + +These are used for conversion to `PyArrow Tables +`__, see +:ref:`dataframeformat`. + +.. versionadded:: 3.0.0 + +.. _oraclecolumnobj: + +OracleColumn Objects +==================== + +OracleColumn objects are returned by :meth:`OracleDataFrame.get_column()`, +:meth:`OracleDataFrame.get_column_by_name()`, and +:meth:`OracleDataFrame.get_columns()`. + +.. versionadded:: 3.0.0 + +.. _oraclecolumnmeth: + +OracleColumn Methods +-------------------- + +.. method:: OracleColumn.get_buffers() + + Returns a dictionary containing the underlying buffers. + + The returned dictionary contains the ``data``, ``validity``, and ``offset`` + keys. + + The ``data`` attribute is a two-element tuple whose first element is a + buffer containing the data and whose second element is the data buffer's + associated dtype. + + The ``validity`` attribute is a two-element tuple whose first element + is a buffer containing mask values indicating missing data and whose + second element is the mask value buffer's associated dtype. The value of + this attribute is *None* if the null representation is not a bit or byte + mask. + + The ``offset`` attribute is a two-element tuple whose first element is a + buffer containing the offset values for variable-size binary data (for + example, variable-length strings) and whose second element is the offsets + buffer's associated dtype. The value of this attribute is *None* if the + data buffer does not have an associated offsets buffer. + +.. method:: OracleColumn.get_chunks(n_chunks) + + Returns itself, since python-oracledb only uses one chunk. + +.. method:: OracleColumn.num_chunks() + + Returns the number of chunks the column consists of. + + This always returns 1. + +.. method:: OracleColumn.size() + + Returns the number of rows in the column. + +.. _oraclecolumnattr: + +OracleColumn Attributes +----------------------- + +.. attribute:: OracleColumn.describe_null + + This read-only property returns the description of the null representation + that the column uses. + +.. attribute:: OracleColumn.dtype + + This read-only attribute returns the Dtype description as a tuple + containing the values for the attributes ``kind``, ``bit-width``, + ``format string``, and ``endianness``. + + The ``kind`` attribute specifies the type of the data. + + The ``bit-width`` attribute specifies the number of bits as an integer. + + The ``format string`` attribute specifies the data type description format + string in Apache Arrow C Data Interface format. + + The ``endianness`` attribute specifies the byte order of the data type. + Currently, only native endianness is supported. + +.. attribute:: OracleColumn.metadata + + This read-only attribute returns the metadata for the column as a + dictionary with string keys. + +.. attribute:: OracleColumn.null_count + + This read-only attribute returns the number of null row values, if known. + +.. attribute:: OracleColumn.offset + + This read-only attribute specifies the offset of the first row. + +..
_oraclecolumnbufferobj: + +OracleColumnBuffer Objects +========================== + +A buffer object backed by an ArrowArray consisting of a single chunk. + +This is an internal class used for conversion to third party data frames. + +.. versionadded:: 3.0.0 + +.. _oraclecolumnbufferattr: + +OracleColumnBuffer Attributes +----------------------------- + +.. attribute:: OracleColumnBuffer.bufsize + + This read-only property returns the buffer size in bytes. + +.. attribute:: OracleColumnBuffer.ptr + + This read-only attribute specifies the pointer to the start of the buffer + as an integer. diff --git a/doc/src/api_manual/defaults.rst b/doc/src/api_manual/defaults.rst index c429d027..120b11fc 100644 --- a/doc/src/api_manual/defaults.rst +++ b/doc/src/api_manual/defaults.rst @@ -128,6 +128,10 @@ Defaults Attributes The default value for :attr:`Cursor.prefetchrows`. This is a query tuning attribute, see :ref:`Tuning Fetch Performance `. + This attribute is ignored when using :meth:`Connection.fetch_df_all()` or + :meth:`Connection.fetch_df_batches()` since these methods always set the + internal prefetch size to the relevant arraysize or size value. + This attribute has an initial value of *2*. .. attribute:: defaults.program diff --git a/doc/src/index.rst b/doc/src/index.rst index a6f418ca..cd2c9c28 100644 --- a/doc/src/index.rst +++ b/doc/src/index.rst @@ -61,6 +61,7 @@ API Manual api_manual/connection_pool.rst api_manual/pool_params.rst api_manual/cursor.rst + api_manual/dataframe.rst api_manual/fetch_info.rst api_manual/variable.rst api_manual/subscription.rst diff --git a/doc/src/release_notes.rst b/doc/src/release_notes.rst index d24e4834..41842d15 100644 --- a/doc/src/release_notes.rst +++ b/doc/src/release_notes.rst @@ -88,6 +88,10 @@ Thick Mode Changes Common Changes ++++++++++++++ +#) Added new methods :meth:`Connection.fetch_df_all()` and + :meth:`Connection.fetch_df_batches()` to fetch data as DataFrames + compliant with the Python DataFrame Interchange protocol. See + :ref:`dataframeformat`. #) Added support for Oracle Database 23ai SPARSE vectors. #) Added support for :ref:`naming and caching connection pools ` during creation, and retrieving them later from the diff --git a/doc/src/user_guide/sql_execution.rst b/doc/src/user_guide/sql_execution.rst index 44560fd5..d3b63fa7 100644 --- a/doc/src/user_guide/sql_execution.rst +++ b/doc/src/user_guide/sql_execution.rst @@ -5,12 +5,19 @@ Executing SQL ************* Executing SQL statements is the primary way in which a Python application -communicates with Oracle Database. Statements are executed using the methods -:meth:`Cursor.execute()` or :meth:`Cursor.executemany()`. Statements include -queries, Data Manipulation Language (DML), and Data Definition Language (DDL). -A few other `specialty statements -`__ can also be executed. +communicates with Oracle Database. Statements include queries, Data +Manipulation Language (DML), and Data Definition Language (DDL). A few other +`specialty statements `__ can also be +executed. Statements are executed using one of these methods: +:meth:`Cursor.execute()`, :meth:`Cursor.executemany()`, +:meth:`Connection.fetch_df_all()`, :meth:`Connection.fetch_df_batches()`, +:meth:`AsyncCursor.execute()`, :meth:`AsyncCursor.executemany()`, +:meth:`AsyncConnection.execute()`, :meth:`AsyncConnection.executemany()`, or +:meth:`AsyncConnection.run_pipeline()`. + +This chapter discusses python-oracledb's synchronous methods.
The asynchronous +methods and pipelining functionality are discussed in detail in :ref:`asyncio`. PL/SQL statements are discussed in :ref:`plsqlexecution`. Other chapters contain information on specific data types and features. See :ref:`batchstmnt`, @@ -18,8 +25,9 @@ contain information on specific data types and features. See :ref:`batchstmnt`, Python-oracledb can be used to execute individual statements, one at a time. Once a statement has finished execution, only then will the next statement -execute. If you try to execute statements concurrently, the statements are -queued and run consecutively in the order they are in the code. +execute. If you try to execute statements concurrently in a single connection, +the statements are queued and run consecutively in the order they are executed +in the application code. Python-oracledb does not read SQL*Plus ".sql" files. To read SQL files, use a technique like the one in ``run_sql_script()`` in `samples/sample_env.py @@ -30,7 +38,7 @@ SQL statements should not contain a trailing semicolon (";") or forward slash .. code-block:: python - cursor.execute("select * from MyTable;") + cursor.execute("select * from MyTable;") # fails due to semicolon This is correct: @@ -42,8 +50,8 @@ This is correct: SQL Queries =========== -Queries (statements beginning with SELECT or WITH) can only be executed using -the method :meth:`Cursor.execute()`. Rows can then be iterated over, or can be +Queries (statements beginning with SELECT or WITH) can be executed using the +method :meth:`Cursor.execute()`. Rows can then be iterated over, or can be fetched using one of the methods :meth:`Cursor.fetchone()`, :meth:`Cursor.fetchmany()` or :meth:`Cursor.fetchall()`. There is a :ref:`default type mapping ` to Python types that can be @@ -52,9 +60,10 @@ optionally :ref:`overridden `. .. IMPORTANT:: Interpolating or concatenating user data with SQL statements, for example - ``cursor.execute("SELECT * FROM mytab WHERE mycol = '" + myvar + "'")``, is a security risk - and impacts performance. Use :ref:`bind variables ` instead. For - example, ``cursor.execute("SELECT * FROM mytab WHERE mycol = :mybv", mybv=myvar)``. + ``cursor.execute("SELECT * FROM mytab WHERE mycol = '" + myvar + "'")`` is + a security risk and impacts performance. Use :ref:`bind variables ` + instead, for example ``cursor.execute("SELECT * FROM mytab WHERE mycol = + :mybv", mybv=myvar)``. .. _fetching: @@ -120,6 +129,8 @@ Rows can be fetched in various ways. The fetch methods return data as tuples. To return results as dictionaries, see :ref:`rowfactories`. +- Data can also be fetched in Arrow data format, see :ref:`dataframeformat`. + Closing Cursors --------------- @@ -547,7 +558,7 @@ Oracle Database uses decimal numbers and these cannot be converted seamlessly to binary number representations like Python floats. In addition, the range of Oracle numbers exceeds that of floating point numbers. Python has decimal objects which do not have these limitations. In python-oracledb you can set -``oracledb.defaults.fetch_decimals`` so that Decimals are returned to the +:attr:`defaults.fetch_decimals` so that Decimals are returned to the application, ensuring that numeric precision is not lost when fetching certain numbers. 
@@ -579,7 +590,7 @@ This displays ``7.1 * 3 = 21.3`` See `samples/return_numbers_as_decimals.py `__ -An equivalent, longer, older coding idiom to :attr:`Defaults.fetch_decimals` is +An equivalent, longer, older coding idiom to :attr:`defaults.fetch_decimals` is to use an :ref:`output type handler ` to do the conversion. .. code-block:: python @@ -723,6 +734,326 @@ Performance-sensitive applications should consider using scalar types instead of objects. If you do use objects, avoid calling :meth:`Connection.gettype()` unnecessarily, and avoid objects with large numbers of attributes. +.. _dataframeformat: + +Fetching using the DataFrame Interchange Protocol +------------------------------------------------- + +Python-oracledb can fetch directly to the `Python DataFrame Interchange +Protocol `__ +format. This then allows zero-copy data interchanges between Python data frame +libraries. It is an efficient way to work with data using Python libraries such +as `Apache Arrow `__, `Pandas +`__, `Polars `__, `NumPy +`__, `PyTorch `__, or to write files +in `Apache Parquet `__ format. + +.. note:: + + The data frame support in python-oracledb 3.0.0 is a pre-release and may + change in the next version. + +The method :meth:`Connection.fetch_df_all()` fetches all rows from a query. +The method :meth:`Connection.fetch_df_batches()` implements an iterator for +fetching batches of rows. The methods return :ref:`OracleDataFrame +` objects, whose :ref:`methods ` +implement the Python DataFrame Interchange Protocol `DataFrame API Interface +`__. + +For example, to fetch all rows from a query and print some information about +the results: + +.. code-block:: python + + sql = "select * from departments" + # Adjust arraysize to tune the query fetch performance + odf = connection.fetch_df_all(statement=sql, arraysize=100) + + print(odf.column_names()) + print(f"{odf.num_columns()} columns") + print(f"{odf.num_rows()} rows") + +With Oracle Database's standard DEPARTMENTS table, this would display:: + + ['DEPARTMENT_ID', 'DEPARTMENT_NAME', 'MANAGER_ID', 'LOCATION_ID'] + 4 columns + 27 rows + +To do more extensive operations on an :ref:`OracleDataFrame +`, it can be converted to an appropriate library class, and +then methods of that library can be used. For example, it could be converted to +a `Pandas DataFrame `__, or to a `PyArrow table +`__ as shown +later. + +**Data Frame Type Mapping** + +Internally, python-oracledb's :ref:`OracleDataFrame ` +support makes use of `Apache nanoarrow `__ +libraries to build data frames. + +The following data type mapping occurs from Oracle Database types to the Arrow +types used in OracleDataFrame objects. Querying any other types from Oracle +Database will result in an exception. + +.. list-table-with-summary:: + :header-rows: 1 + :class: wy-table-responsive + :widths: 1 1 + :align: left + :summary: The first column is the Oracle Database type. The second column is the Arrow data type used in the OracleDataFrame object. + + * - Oracle Database Type + - Arrow Data Type + * - DB_TYPE_NUMBER + - DECIMAL128, INT64, or DOUBLE.
See notes below + * - DB_TYPE_CHAR + - STRING + * - DB_TYPE_VARCHAR + - STRING + * - DB_TYPE_BINARY_FLOAT + - FLOAT + * - DB_TYPE_BINARY_DOUBLE + - DOUBLE + * - DB_TYPE_BOOLEAN + - BOOLEAN + * - DB_TYPE_DATE + - TIMESTAMP + * - DB_TYPE_TIMESTAMP + - TIMESTAMP + * - DB_TYPE_TIMESTAMP_LTZ + - TIMESTAMP + * - DB_TYPE_TIMESTAMP_TZ + - TIMESTAMP + + +When converting Oracle Database NUMBERs, if :attr:`defaults.fetch_decimals` is +*True*, the Arrow data type is DECIMAL128. Note Arrow's DECIMAL128 format only +supports precision of up to 38 decimal digits. Otherwise, if the Oracle number data +type has a scale of 0 and a precision less than or equal to 18, then the Arrow +data type is INT64. In all other cases, the Arrow data type is DOUBLE. + +The Arrow TIMESTAMP for Oracle Database DATEs will have a time unit of +"seconds". For Oracle Database TIMESTAMP types, the time unit depends on the +Oracle type's fractional precision: + +.. list-table-with-summary:: + :header-rows: 1 + :class: wy-table-responsive + :widths: 1 1 + :align: left + :summary: The first column is the Oracle Database TIMESTAMP-type fractional second precision. The second column is the resulting Arrow TIMESTAMP time unit. + + * - Oracle Database TIMESTAMP fractional second precision range + - Arrow TIMESTAMP time unit + * - 0 + - seconds + * - 1 - 3 + - milliseconds + * - 4 - 6 + - microseconds + * - 7 - 9 + - nanoseconds + +Arrow TIMESTAMPs will not have timezone data. + +**Inserting OracleDataFrames into Oracle Database** + +Inserting data currently in :ref:`OracleDataFrame ` format +into Oracle Database requires it to be converted first. For example, you could +convert it into a Pandas DataFrame for insertion with the Pandas method +``to_sql()``. Alternatively, convert it into a Python list via the PyArrow +``Table.to_pylist()`` method and then use standard python-oracledb +functionality to execute a SQL INSERT statement, as sketched in `Inserting OracleDataFrame Data`_ below. + +Creating PyArrow Tables ++++++++++++++++++++++++ + +An example that creates and uses a `PyArrow Table +`__ is: + +.. code-block:: python + + # Get an OracleDataFrame + # Adjust arraysize to tune the query fetch performance + sql = "select id, name from SampleQueryTab order by id" + odf = connection.fetch_df_all(statement=sql, arraysize=100) + + # Create a PyArrow table + pyarrow_table = pyarrow.Table.from_arrays( + arrays=odf.column_arrays(), names=odf.column_names() + ) + + print("\nNumber of rows and columns:") + (r, c) = pyarrow_table.shape + print(f"{r} rows, {c} columns") + +This makes use of :meth:`OracleDataFrame.column_arrays()` which returns a list +of :ref:`OracleArrowArray Objects `. + +See `samples/dataframe_pyarrow.py `__ for a runnable example. + +Creating Pandas DataFrames +++++++++++++++++++++++++++ + +An example that creates and uses a `Pandas DataFrame `__ is: + +.. code-block:: python + + import pandas + + # Get an OracleDataFrame + # Adjust arraysize to tune the query fetch performance + sql = "select * from mytable where id = :1" + myid = 12345 # the bind variable value + odf = connection.fetch_df_all(statement=sql, parameters=[myid], arraysize=1000) + + # Get a Pandas DataFrame from the data. + # This is a zero copy call + df = pandas.api.interchange.from_dataframe(odf) + + # Perform various Pandas operations on the DataFrame + print(df.T) # transpose + print(df.tail(3)) # last three rows + +Using python-oracledb to fetch the interchange format will be more efficient +than using the Pandas ``read_sql()`` method. + +See `samples/dataframe_pandas.py `__ for a runnable example.
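Inserting OracleDataFrame Data
++++++++++++++++++++++++++++++

The following is a minimal sketch of the ``Table.to_pylist()`` approach
described in the type mapping discussion above. It assumes a hypothetical
target table MY_TABLE whose ID and NAME columns match the queried data; adapt
the INSERT statement and bind names to your own schema.

.. code-block:: python

    import pyarrow

    # Query the source data as an OracleDataFrame
    sql = "select id, name from SampleQueryTab order by id"
    odf = connection.fetch_df_all(statement=sql, arraysize=100)

    # Convert to a PyArrow Table, then to a list of dictionaries keyed by
    # the column names returned by the query
    pyarrow_table = pyarrow.Table.from_arrays(
        arrays=odf.column_arrays(), names=odf.column_names()
    )
    rows = pyarrow_table.to_pylist()

    # MY_TABLE is a hypothetical table with matching ID and NAME columns
    with connection.cursor() as cursor:
        cursor.executemany(
            "insert into my_table (id, name) values (:ID, :NAME)", rows
        )
    connection.commit()

Because Oracle Database returns upper-case column names, the dictionary keys
produced by ``to_pylist()`` are upper case, so the bind variable names in the
INSERT statement are written in upper case to match.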
+ +Creating Polars Series +++++++++++++++++++++++ + +An example that creates and uses a `Polars Series +`__ is: + +.. code-block:: python + + import pyarrow + import polars + + # Get an OracleDataFrame + # Adjust arraysize to tune the query fetch performance + sql = "select id from SampleQueryTab order by id" + odf = connection.fetch_df_all(statement=sql, arraysize=100) + + # Convert to a Polars Series + pyarrow_array = pyarrow.array(odf.get_column_by_name("ID")) + p = polars.from_arrow(pyarrow_array) + + # Perform various Polars operations on the Series + print(p.sum()) + print(p.log10()) + +See `samples/dataframe_polars.py `__ for a runnable example. + +Writing Apache Parquet Files +++++++++++++++++++++++++++++ + +To write output in `Apache Parquet `__ file +format, you can use data frames as an efficient intermediary. Use the +:meth:`Connection.fetch_df_batches()` iterator and convert to a `PyArrow Table +`__ that can +be written by the PyArrow library. + +.. code-block:: python + + import pyarrow + import pyarrow.parquet as pq + + FILE_NAME = "sample.parquet" + + # Tune the fetch batch size for your query + BATCH_SIZE = 10000 + + sql = "select * from mytable" + pqwriter = None + for odf in connection.fetch_df_batches(statement=sql, size=BATCH_SIZE): + + # Get a PyArrow table from the query results + pyarrow_table = pyarrow.Table.from_arrays( + arrays=odf.column_arrays(), names=odf.column_names() + ) + + if not pqwriter: + pqwriter = pq.ParquetWriter(FILE_NAME, pyarrow_table.schema) + + pqwriter.write_table(pyarrow_table) + + pqwriter.close() + +See `samples/dataframe_parquet_write.py `__ +for a runnable example. + +The DLPack Protocol ++++++++++++++++++++ + +The DataFrame format facilitates working with query results as +tensors. Conversion can be done using the standard `DLPack Protocol +`__ implemented by PyArrow. + +**Using NumPy Arrays** + +For example, to convert to `NumPy `__ ``ndarray`` format: + +.. code-block:: python + + import pyarrow + import numpy + + SQL = "select id from SampleQueryTab order by id" + + # Get an OracleDataFrame + # Adjust arraysize to tune the query fetch performance + odf = connection.fetch_df_all(statement=SQL, arraysize=100) + + # Convert to an ndarray via the Python DLPack specification + pyarrow_array = pyarrow.array(odf.get_column_by_name("ID")) + np = numpy.from_dlpack(pyarrow_array) + + # Perform various numpy operations on the ndarray + + print(numpy.sum(np)) + print(numpy.log10(np)) + + +See `samples/dataframe_numpy.py `__ for a runnable example. + +**Using Torch** + +An example of working with data as a `Torch tensor +`__ is: + +.. code-block:: python + + import pyarrow + import torch + + SQL = "select id from SampleQueryTab order by id" + + # Get an OracleDataFrame + # Adjust arraysize to tune the query fetch performance + odf = connection.fetch_df_all(statement=SQL, arraysize=100) + + # Convert to a Torch tensor via the Python DLPack specification + pyarrow_array = pyarrow.array(odf.get_column_by_name("ID")) + tt = torch.from_dlpack(pyarrow_array) + + # Perform various Torch operations on the tensor + + print(torch.sum(tt)) + print(torch.log10(tt)) + +See `samples/dataframe_torch.py `__ for a runnable example. + .. 
_rowlimit: Limiting Rows diff --git a/doc/src/user_guide/tuning.rst b/doc/src/user_guide/tuning.rst index f0b64c89..9202c753 100644 --- a/doc/src/user_guide/tuning.rst +++ b/doc/src/user_guide/tuning.rst @@ -311,6 +311,17 @@ The ``arraysize`` value can also be set before calling the procedure: Also see `Avoiding Premature Prefetching`_. +Tuning Fetching for DataFrames +------------------------------ + +When fetching :ref:`data frames ` with +:meth:`Connection.fetch_df_all()` or :meth:`Connection.fetch_df_batches()`, +tuning of data transfer across the network is controlled by those methods' +``arraysize`` and ``size`` parameters, respectively. + +Any :attr:`defaults.prefetchrows` value is ignored since these methods always +set the internal prefetch size to the relevant ``arraysize`` or ``size`` value. + Database Round-trips ==================== diff --git a/samples/dataframe_numpy.py b/samples/dataframe_numpy.py new file mode 100644 index 00000000..8bc7a476 --- /dev/null +++ b/samples/dataframe_numpy.py @@ -0,0 +1,71 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) 2025, Oracle and/or its affiliates. +# +# This software is dual-licensed to you under the Universal Permissive License +# (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License +# 2.0 as shown at http://www.apache.org/licenses/LICENSE-2.0. You may choose +# either license. +# +# If you elect to accept the software under the Apache License, Version 2.0, +# the following applies: +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ----------------------------------------------------------------------------- + +# ----------------------------------------------------------------------------- +# dataframe_numpy.py +# +# Shows how to use connection.fetch_df_all() to efficiently put data into a +# NumPy ndarray via the DLPack standard memory layout.
+# ----------------------------------------------------------------------------- + +import pyarrow +import numpy + +import oracledb +import sample_env + +# determine whether to use python-oracledb thin mode or thick mode +if not sample_env.get_is_thin(): + oracledb.init_oracle_client(lib_dir=sample_env.get_oracle_client()) + +connection = oracledb.connect( + user=sample_env.get_main_user(), + password=sample_env.get_main_password(), + dsn=sample_env.get_connect_string(), + params=sample_env.get_connect_params(), +) + +SQL = "select id from SampleQueryTab order by id" + +# Get an OracleDataFrame +# Adjust arraysize to tune the query fetch performance +odf = connection.fetch_df_all(statement=SQL, arraysize=100) + +# Convert to an ndarray via the Python DLPack specification +pyarrow_array = pyarrow.array(odf.get_column_by_name("ID")) +np = numpy.from_dlpack(pyarrow_array) + +# If the array has nulls, an alternative is: +# np = pyarrow_array.to_numpy(zero_copy_only=False) + +print("Type:") +print(type(np)) # + +# Perform various numpy operations on the ndarray + +print("\nSum:") +print(numpy.sum(np)) + +print("\nLog10:") +print(numpy.log10(np)) diff --git a/samples/dataframe_pandas.py b/samples/dataframe_pandas.py new file mode 100644 index 00000000..f6165757 --- /dev/null +++ b/samples/dataframe_pandas.py @@ -0,0 +1,90 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) 2025, Oracle and/or its affiliates. +# +# This software is dual-licensed to you under the Universal Permissive License +# (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License +# 2.0 as shown at http://www.apache.org/licenses/LICENSE-2.0. You may choose +# either license. +# +# If you elect to accept the software under the Apache License, Version 2.0, +# the following applies: +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ----------------------------------------------------------------------------- + +# ----------------------------------------------------------------------------- +# dataframe_pandas.py +# +# Shows how to use connection.fetch_df_all() and connection.fetch_df_batches() +# to create Pandas dataframes. +# ----------------------------------------------------------------------------- + +import pandas +import oracledb +import sample_env + +# determine whether to use python-oracledb thin mode or thick mode +if not sample_env.get_is_thin(): + oracledb.init_oracle_client(lib_dir=sample_env.get_oracle_client()) + +connection = oracledb.connect( + user=sample_env.get_main_user(), + password=sample_env.get_main_password(), + dsn=sample_env.get_connect_string(), + params=sample_env.get_connect_params(), +) + +SQL = "select id, name from SampleQueryTab order by id" + +# Get an OracleDataFrame. +# Adjust arraysize to tune the query fetch performance +odf = connection.fetch_df_all(statement=SQL, arraysize=100) + +# Get a Pandas DataFrame from the data. 
+# This is a zero copy call +df = pandas.api.interchange.from_dataframe(odf) + +# Perform various Pandas operations on the DataFrame + +print("Columns:") +print(df.columns) + +print("\nDataframe description:") +print(df.describe()) + +print("\nLast three rows:") +print(df.tail(3)) + +print("\nTransform:") +print(df.T) + +# ----------------------------------------------------------------------------- + +# An example of batch fetching +# +# Note that since this particular example ends up with all query rows being +# held in memory, it would be more efficient to use fetch_df_all() as shown +# above. + +print("\nFetching in batches:") +df = pandas.DataFrame() + +# Tune 'size' for your data set. Here it is small to show the batch fetch +# behavior on the sample table. +for odf in connection.fetch_df_batches(statement=SQL, size=10): + df_b = pandas.api.interchange.from_dataframe(odf) + print(f"Appending {df_b.shape[0]} rows") + df = pandas.concat([df, df_b], ignore_index=True) + +print("\nLast three rows:") +print(df.tail(3)) diff --git a/samples/dataframe_parquet_write.py b/samples/dataframe_parquet_write.py new file mode 100644 index 00000000..02a7d93f --- /dev/null +++ b/samples/dataframe_parquet_write.py @@ -0,0 +1,87 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) 2025, Oracle and/or its affiliates. +# +# This software is dual-licensed to you under the Universal Permissive License +# (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License +# 2.0 as shown at http://www.apache.org/licenses/LICENSE-2.0. You may choose +# either license. +# +# If you elect to accept the software under the Apache License, Version 2.0, +# the following applies: +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ----------------------------------------------------------------------------- + +# ----------------------------------------------------------------------------- +# dataframe_parquet_write.py +# +# Shows how to use connection.fetch_df_batches() to write files in Parquet +# format. 
+# ----------------------------------------------------------------------------- + +import os + +import pyarrow +import pyarrow.parquet as pq + +import oracledb +import sample_env + +# determine whether to use python-oracledb thin mode or thick mode +if not sample_env.get_is_thin(): + oracledb.init_oracle_client(lib_dir=sample_env.get_oracle_client()) + +connection = oracledb.connect( + user=sample_env.get_main_user(), + password=sample_env.get_main_password(), + dsn=sample_env.get_connect_string(), + params=sample_env.get_connect_params(), +) + +PARQUET_FILE_NAME = "sample.parquet" + +if os.path.isfile(PARQUET_FILE_NAME): + os.remove(PARQUET_FILE_NAME) + +# Tune this for your query +FETCH_BATCH_SIZE = 10 + +SQL = "select id, name from SampleQueryTab order by id" +pqwriter = None + +for odf in connection.fetch_df_batches(statement=SQL, size=FETCH_BATCH_SIZE): + + pyarrow_table = pyarrow.Table.from_arrays( + arrays=odf.column_arrays(), names=odf.column_names() + ) + + if not pqwriter: + pqwriter = pq.ParquetWriter(PARQUET_FILE_NAME, pyarrow_table.schema) + + print(f"Writing a batch of {odf.num_rows()} rows") + pqwriter.write_table(pyarrow_table) + +pqwriter.close() + +# ----------------------------------------------------------------------------- +# Check the file was created + +print("\nParquet file metadata:") +print(pq.read_metadata(PARQUET_FILE_NAME)) + +# ----------------------------------------------------------------------------- +# Read the file + +print("\nParquet file data:") +t = pq.read_table(PARQUET_FILE_NAME, columns=["ID", "NAME"]) +print(t) diff --git a/samples/dataframe_polars.py b/samples/dataframe_polars.py new file mode 100644 index 00000000..aaa2859d --- /dev/null +++ b/samples/dataframe_polars.py @@ -0,0 +1,67 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) 2025, Oracle and/or its affiliates. +# +# This software is dual-licensed to you under the Universal Permissive License +# (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License +# 2.0 as shown at http://www.apache.org/licenses/LICENSE-2.0. You may choose +# either license. +# +# If you elect to accept the software under the Apache License, Version 2.0, +# the following applies: +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ----------------------------------------------------------------------------- + +# ----------------------------------------------------------------------------- +# dataframe_polars.py +# +# Shows how to use connection.fetch_df_all() to efficiently put data into a +# Polars Series +# ----------------------------------------------------------------------------- + +import pyarrow +import polars + +import oracledb +import sample_env + +# determine whether to use python-oracledb thin mode or thick mode +if not sample_env.get_is_thin(): + oracledb.init_oracle_client(lib_dir=sample_env.get_oracle_client()) + +connection = oracledb.connect( + user=sample_env.get_main_user(), + password=sample_env.get_main_password(), + dsn=sample_env.get_connect_string(), + params=sample_env.get_connect_params(), +) + +SQL = "select id from SampleQueryTab order by id" + +# Get an OracleDataFrame +# Adjust arraysize to tune the query fetch performance +odf = connection.fetch_df_all(statement=SQL, arraysize=100) + +# Convert to a Polars Series +pyarrow_array = pyarrow.array(odf.get_column_by_name("ID")) +p = polars.from_arrow(pyarrow_array) + +print(type(p)) # + +# Perform various Polars operations on the Series + +print("\nSum:") +print(p.sum()) + +print("\nLog10:") +print(p.log10()) diff --git a/samples/dataframe_pyarrow.py b/samples/dataframe_pyarrow.py new file mode 100644 index 00000000..ec69d8d6 --- /dev/null +++ b/samples/dataframe_pyarrow.py @@ -0,0 +1,95 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) 2025, Oracle and/or its affiliates. +# +# This software is dual-licensed to you under the Universal Permissive License +# (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License +# 2.0 as shown at http://www.apache.org/licenses/LICENSE-2.0. You may choose +# either license. +# +# If you elect to accept the software under the Apache License, Version 2.0, +# the following applies: +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ----------------------------------------------------------------------------- + +# ----------------------------------------------------------------------------- +# dataframe_pyarrow.py +# +# Shows how to use connection.fetch_df_all() to create PyArrow tables and +# arrays. 
+# ----------------------------------------------------------------------------- + +import pyarrow + +import oracledb +import sample_env + +# determine whether to use python-oracledb thin mode or thick mode +if not sample_env.get_is_thin(): + oracledb.init_oracle_client(lib_dir=sample_env.get_oracle_client()) + +connection = oracledb.connect( + user=sample_env.get_main_user(), + password=sample_env.get_main_password(), + dsn=sample_env.get_connect_string(), + params=sample_env.get_connect_params(), +) + +# ----------------------------------------------------------------------------- +# Creating a PyArrow table + +SQL1 = "select id, name from SampleQueryTab order by id" + +# Get an OracleDataFrame +# Adjust arraysize to tune the query fetch performance +odf = connection.fetch_df_all(statement=SQL1, arraysize=100) + +# Create a PyArrow table +pyarrow_table = pyarrow.Table.from_arrays( + arrays=odf.column_arrays(), names=odf.column_names() +) + +print("Type:") +print(type(pyarrow_table)) # + +# Perform various PyArrow operations + +print("\nColumn names:") +print(pyarrow_table.column_names) + +print("\nNumber of rows and columns:") +(r, c) = pyarrow_table.shape +print(f"{r} rows, {c} columns") + +# ----------------------------------------------------------------------------- +# Creating a PyArrow array + +SQL2 = "select id from SampleQueryTab order by id" + +# Get an OracleDataFrame +# Adjust arraysize to tune the query fetch performance +odf = connection.fetch_df_all(statement=SQL2, arraysize=100) + +# Create a PyArrow array +pyarrow_array = pyarrow.array(odf.get_column_by_name("ID")) + +print("Type:") +print(type(pyarrow_array)) # + +# Perform various PyArrow operations + +print("\nSum:") +print(pyarrow_array.sum()) + +print("\nFirst three elements:") +print(pyarrow_array.slice(0, 3)) diff --git a/samples/dataframe_torch.py b/samples/dataframe_torch.py new file mode 100644 index 00000000..e45d1940 --- /dev/null +++ b/samples/dataframe_torch.py @@ -0,0 +1,67 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) 2025, Oracle and/or its affiliates. +# +# This software is dual-licensed to you under the Universal Permissive License +# (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License +# 2.0 as shown at http://www.apache.org/licenses/LICENSE-2.0. You may choose +# either license. +# +# If you elect to accept the software under the Apache License, Version 2.0, +# the following applies: +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ----------------------------------------------------------------------------- + +# ----------------------------------------------------------------------------- +# dataframe_torch.py +# +# Shows how to use connection.fetch_df_all() to efficiently put data into a +# Torch tensor via the DLPack standard memory layout. 
+# ----------------------------------------------------------------------------- + +import pyarrow +import torch + +import oracledb +import sample_env + +# determine whether to use python-oracledb thin mode or thick mode +if not sample_env.get_is_thin(): + oracledb.init_oracle_client(lib_dir=sample_env.get_oracle_client()) + +connection = oracledb.connect( + user=sample_env.get_main_user(), + password=sample_env.get_main_password(), + dsn=sample_env.get_connect_string(), + params=sample_env.get_connect_params(), +) + +SQL = "select id from SampleQueryTab order by id" + +# Get an OracleDataFrame +# Adjust arraysize to tune the query fetch performance +odf = connection.fetch_df_all(statement=SQL, arraysize=100) + +# Convert to a Torch tensor via the Python DLPack specification +pyarrow_array = pyarrow.array(odf.get_column_by_name("ID")) +tt = torch.from_dlpack(pyarrow_array) + +print(type(tt)) # + +# Perform various Torch operations on the tensor + +print("\nSum:") +print(torch.sum(tt)) + +print("\nLog10:") +print(torch.log10(tt)) diff --git a/samples/sql/create_schema.sql b/samples/sql/create_schema.sql index 7a7c816f..d160e984 100644 --- a/samples/sql/create_schema.sql +++ b/samples/sql/create_schema.sql @@ -391,6 +391,44 @@ insert into &main_user..SampleQueryTab values (6, 'Frankie') / insert into &main_user..SampleQueryTab values (7, 'Gerri') / +insert into &main_user..SampleQueryTab values (8, 'Harriet') +/ +insert into &main_user..SampleQueryTab values (9, 'Isabelle') +/ +insert into &main_user..SampleQueryTab values (10, 'Jarek') +/ +insert into &main_user..SampleQueryTab values (11, 'Krishna') +/ +insert into &main_user..SampleQueryTab values (12, 'Leo') +/ +insert into &main_user..SampleQueryTab values (13, 'Mia') +/ +insert into &main_user..SampleQueryTab values (14, 'Nathalie') +/ +insert into &main_user..SampleQueryTab values (15, 'Oscar') +/ +insert into &main_user..SampleQueryTab values (16, 'Pia') +/ +insert into &main_user..SampleQueryTab values (17, 'Quentin') +/ +insert into &main_user..SampleQueryTab values (18, 'Roger') +/ +insert into &main_user..SampleQueryTab values (19, 'Sally') +/ +insert into &main_user..SampleQueryTab values (20, 'Tully') +/ +insert into &main_user..SampleQueryTab values (21, 'Una') +/ +insert into &main_user..SampleQueryTab values (22, 'Valerie') +/ +insert into &main_user..SampleQueryTab values (23, 'William') +/ +insert into &main_user..SampleQueryTab values (24, 'Xavier') +/ +insert into &main_user..SampleQueryTab values (25, 'Yasmin') +/ +insert into &main_user..SampleQueryTab values (26, 'Zach') +/ commit / diff --git a/setup.cfg b/setup.cfg index 34e0650f..b79870ac 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,6 +47,7 @@ test_suite = tests packages = oracledb oracledb.plugins + oracledb.interchange package_dir = =src diff --git a/setup.py b/setup.py index b29bb2cc..9729f381 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ # ----------------------------------------------------------------------------- -# Copyright (c) 2020, 2023, Oracle and/or its affiliates. +# Copyright (c) 2020, 2025, Oracle and/or its affiliates. 
# # This software is dual-licensed to you under the Universal Permissive License # (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License @@ -31,6 +31,14 @@ # base source directory source_dir = os.path.join("src", "oracledb") +# determine the nanoarrow bridge dependent source files (included) +base_dir = os.path.join(source_dir, "interchange") +nanoarrow_bridge_depends = [ + os.path.join(base_dir, "nanoarrow", "nanoarrow.c"), + os.path.join(base_dir, "nanoarrow", "nanoarrow.h"), +] +nanoarrow_bridge_pxd = os.path.join(base_dir, "nanoarrow_bridge.pxd") + # determine the base implementation dependent source files (included) impl_dir = os.path.join(source_dir, "impl", "base") base_depends = [ @@ -39,7 +47,7 @@ if n.endswith(".pyx") ] base_pxd = os.path.join(source_dir, "base_impl.pxd") -base_depends.append(base_pxd) +base_depends.extend([base_pxd, nanoarrow_bridge_pxd]) # determine the thick mode dependent source files (included) impl_dir = os.path.join(source_dir, "impl", "thick") @@ -91,21 +99,33 @@ Extension( "oracledb.base_impl", sources=["src/oracledb/base_impl.pyx"], + include_dirs=["src/oracledb/interchange/nanoarrow"], depends=base_depends, extra_compile_args=extra_compile_args, ), Extension( "oracledb.thin_impl", sources=["src/oracledb/thin_impl.pyx"], + include_dirs=["src/oracledb/interchange/nanoarrow"], depends=thin_depends, extra_compile_args=extra_compile_args, ), Extension( "oracledb.thick_impl", sources=["src/oracledb/thick_impl.pyx"], - include_dirs=["src/oracledb/impl/thick/odpi/include"], + include_dirs=[ + "src/oracledb/impl/thick/odpi/include", + "src/oracledb/interchange/nanoarrow", + ], depends=thick_depends, extra_compile_args=extra_compile_args, ), + Extension( + "oracledb.interchange.nanoarrow_bridge", + sources=["src/oracledb/interchange/nanoarrow_bridge.pyx"], + include_dirs=["src/oracledb/interchange/nanoarrow"], + depends=nanoarrow_bridge_depends, + extra_compile_args=extra_compile_args, + ), ] ) diff --git a/src/oracledb/__init__.py b/src/oracledb/__init__.py index 45cca983..1302e220 100644 --- a/src/oracledb/__init__.py +++ b/src/oracledb/__init__.py @@ -1,5 +1,5 @@ # ----------------------------------------------------------------------------- -# Copyright (c) 2020, 2024, Oracle and/or its affiliates. +# Copyright (c) 2020, 2025, Oracle and/or its affiliates. # # This software is dual-licensed to you under the Universal Permissive License # (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License @@ -314,6 +314,10 @@ SparseVector as SparseVector, ) +from .interchange.dataframe import ( + OracleDataFrame as OracleDataFrame, +) + from . 
import config_providers IntervalYM = collections.namedtuple("IntervalYM", ["years", "months"]) diff --git a/src/oracledb/base_impl.pxd b/src/oracledb/base_impl.pxd index 1f89c082..42b5de16 100644 --- a/src/oracledb/base_impl.pxd +++ b/src/oracledb/base_impl.pxd @@ -33,10 +33,17 @@ from libc.stdint cimport int8_t, int16_t, int32_t, int64_t from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t +from libc.stdlib cimport abs from cpython cimport array ctypedef unsigned char char_type +from .interchange.nanoarrow_bridge cimport ( + ArrowTimeUnit, + ArrowType, + OracleArrowArray, +) + cdef enum: PY_TYPE_NUM_ARRAY = 13 PY_TYPE_NUM_BOOL = 4 @@ -438,9 +445,11 @@ cdef class OracleMetadata: readonly uint32_t vector_dimensions readonly uint8_t vector_format readonly uint8_t vector_flags + ArrowType _arrow_type uint8_t _py_type_num cdef int _finalize_init(self) except -1 + cdef int _set_arrow_type(self) except -1 cdef OracleMetadata copy(self) @staticmethod cdef OracleMetadata from_type(object typ) @@ -654,6 +663,7 @@ cdef class BaseCursorImpl: public type bind_style public dict bind_vars_by_name public object warning + public bint fetching_arrow uint32_t _buffer_rowcount uint32_t _buffer_index uint32_t _fetch_array_size @@ -691,6 +701,9 @@ cdef class BaseCursorImpl: cdef int _verify_var(self, object var) except -1 cdef int bind_many(self, object cursor, list parameters) except -1 cdef int bind_one(self, object cursor, object parameters) except -1 + cdef object _finish_building_arrow_arrays(self) + cdef int _create_arrow_arrays(self) except -1 + cdef class BaseVarImpl: @@ -709,6 +722,7 @@ cdef class BaseVarImpl: BaseConnImpl _conn_impl OracleMetadata _fetch_metadata list _values + OracleArrowArray _arrow_array bint _is_value_set cdef int _bind(self, object conn, BaseCursorImpl cursor, @@ -718,6 +732,7 @@ cdef class BaseVarImpl: cdef int _check_and_set_value(self, uint32_t pos, object value, bint* was_set) except -1 cdef DbType _check_fetch_conversion(self) + cdef int _create_arrow_array(self) except -1 cdef int _finalize_init(self) except -1 cdef DbType _get_adjusted_type(self, uint8_t ora_type_num) cdef list _get_array_value(self) @@ -951,6 +966,10 @@ cdef struct OracleData: OracleDataBuffer buffer +cdef int convert_oracle_data_to_arrow(OracleMetadata from_metadata, + OracleMetadata to_metadatda, + OracleData* data, + OracleArrowArray arrow_array) except -1 cdef object convert_oracle_data_to_python(OracleMetadata from_metadata, OracleMetadata to_metadatda, OracleData* data, diff --git a/src/oracledb/base_impl.pyx b/src/oracledb/base_impl.pyx index 69a872ee..f071fef1 100644 --- a/src/oracledb/base_impl.pyx +++ b/src/oracledb/base_impl.pyx @@ -1,5 +1,5 @@ #------------------------------------------------------------------------------ -# Copyright (c) 2020, 2024, Oracle and/or its affiliates. +# Copyright (c) 2020, 2025, Oracle and/or its affiliates. 
# # This software is dual-licensed to you under the Universal Permissive License # (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License @@ -38,9 +38,24 @@ cimport cpython.datetime as cydatetime from libc.stdint cimport int8_t, int16_t, int32_t, int64_t from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t from libc.stdint cimport UINT8_MAX, UINT16_MAX, UINT32_MAX, UINT64_MAX +from libc.stdlib cimport atoi, atof from libc.string cimport memcpy from cpython cimport array +from .interchange.nanoarrow_bridge cimport ( + NANOARROW_TIME_UNIT_SECOND, + NANOARROW_TIME_UNIT_MILLI, + NANOARROW_TIME_UNIT_MICRO, + NANOARROW_TIME_UNIT_NANO, + NANOARROW_TYPE_BOOL, + NANOARROW_TYPE_DECIMAL128, + NANOARROW_TYPE_DOUBLE, + NANOARROW_TYPE_FLOAT, + NANOARROW_TYPE_INT64, + NANOARROW_TYPE_STRING, + NANOARROW_TYPE_TIMESTAMP, +) + import array import base64 @@ -65,6 +80,7 @@ cdef type PY_TYPE_ASYNC_CURSOR cdef type PY_TYPE_ASYNC_LOB cdef type PY_TYPE_BOOL = bool cdef type PY_TYPE_CURSOR +cdef object PY_TYPE_DATAFRAME cdef type PY_TYPE_DATE = datetime.date cdef type PY_TYPE_DATETIME = datetime.datetime cdef type PY_TYPE_DECIMAL = decimal.Decimal diff --git a/src/oracledb/connection.py b/src/oracledb/connection.py index bc179dc4..e3aac76e 100644 --- a/src/oracledb/connection.py +++ b/src/oracledb/connection.py @@ -713,6 +713,43 @@ def encode_oson(self, value): self._verify_connected() return self._impl.encode_oson(value) + def fetch_df_all( + self, + statement: str, + parameters: Optional[Union[list, tuple, dict]] = None, + arraysize: Optional[int] = None, + ): + """ + Fetch all data as OracleDataFrame. + """ + cursor = self.cursor() + cursor._impl.fetching_arrow = True + if arraysize is not None: + cursor.arraysize = arraysize + cursor.prefetchrows = cursor.arraysize + cursor.execute(statement, parameters) + return cursor._impl.fetch_df_all(cursor) + + def fetch_df_batches( + self, + statement: str, + parameters: Optional[Union[list, tuple, dict]] = None, + size: Optional[int] = None, + ): + """ + Fetch data in batches. 
Each batch is an OracleDataFrame + """ + cursor = self.cursor() + cursor._impl.fetching_arrow = True + if size is not None: + cursor.arraysize = size + cursor.prefetchrows = cursor.arraysize + cursor.execute(statement, parameters) + if size is None: + yield cursor._impl.fetch_df_all(cursor) + else: + yield from cursor._impl.fetch_df_batches(cursor, batch_size=size) + def getSodaDatabase(self) -> SodaDatabase: """ Return a SODA database object for performing all operations on Simple diff --git a/src/oracledb/errors.py b/src/oracledb/errors.py index 9de4c810..ef0b2000 100644 --- a/src/oracledb/errors.py +++ b/src/oracledb/errors.py @@ -279,6 +279,7 @@ def _raise_not_supported(feature: str) -> None: ERR_PASSWORD_TYPE_HANDLER_FAILED = 2057 ERR_PLAINTEXT_PASSWORD_IN_CONFIG = 2058 ERR_MISSING_CONNECT_DESCRIPTOR = 2059 +ERR_ARROW_C_API_ERROR = 2060 # error numbers that result in NotSupportedError ERR_TIME_NOT_SUPPORTED = 3000 @@ -310,6 +311,7 @@ def _raise_not_supported(feature: str) -> None: ERR_CURSOR_DIFF_CONNECTION = 3027 ERR_UNSUPPORTED_PIPELINE_OPERATION = 3028 ERR_INVALID_NETWORK_NAME = 3029 +ERR_ARROW_UNSUPPORTED_DATA_TYPE = 3030 # error numbers that result in DatabaseError ERR_TNS_ENTRY_NOT_FOUND = 4000 @@ -853,4 +855,11 @@ def _raise_not_supported(feature: str) -> None: ERR_INVALID_NETWORK_NAME: ( '"{name}" includes characters that are not allowed' ), + ERR_ARROW_UNSUPPORTED_DATA_TYPE: ( + "conversion from Oracle Database type {db_type_name} to Apache " + "Arrow format is not supported" + ), + ERR_ARROW_C_API_ERROR: ( + "Arrow C Data Interface operation failed with error code {code}" + ), } diff --git a/src/oracledb/impl/base/converters.pyx b/src/oracledb/impl/base/converters.pyx index 48c4b6be..9e9416f8 100644 --- a/src/oracledb/impl/base/converters.pyx +++ b/src/oracledb/impl/base/converters.pyx @@ -68,6 +68,94 @@ cdef object convert_interval_ym_to_python(OracleDataBuffer *buffer): return PY_TYPE_INTERVAL_YM(value.years, value.months) +cdef int convert_number_to_arrow_decimal(OracleArrowArray arrow_array, + OracleDataBuffer *buffer) except -1: + """ + Converts a NUMBER value stored in the buffer to Arrow DECIMAL128. + """ + cdef: + char_type c + bint has_sign = 0 + char_type digits[39] # 38 digits + sign + OracleNumber *value = &buffer.as_number + uint8_t num_chars = 0, decimal_point_index = 0, allowed_max_chars = 0 + int64_t actual_scale = 0 + + if value.chars[0] == 45: # minus sign + has_sign = True + + if value.is_integer: + if has_sign: + allowed_max_chars = 39 + else: + allowed_max_chars = 38 + else: # decimal point + if has_sign: + allowed_max_chars = 40 + else: + allowed_max_chars = 39 + + # Arrow Decimal128 can only represent values with 38 decimal digits + if value.is_max_negative_value or value.num_chars > allowed_max_chars: + raise ValueError("Value cannot be represented as " + "Arrow Decimal128") + if value.is_integer: + arrow_array.append_decimal(value.chars, value.num_chars) + else: + for i in range(value.num_chars): + c = value.chars[i] + # count all characters except the decimal point + if c != 46: + digits[num_chars] = c + num_chars += 1 + else: + decimal_point_index = i + + # Append any trailing zeros. 
+ actual_scale = num_chars - decimal_point_index + for i in range(abs(arrow_array.scale) - actual_scale): + digits[num_chars] = b'0' + num_chars += 1 + arrow_array.append_decimal(digits, num_chars) + + + +cdef int convert_number_to_arrow_double(OracleArrowArray arrow_array, + OracleDataBuffer *buffer) except -1: + """ + Converts a NUMBER value stored in the buffer to Arrow DOUBLE. + """ + cdef OracleNumber *value = &buffer.as_number + if value.is_max_negative_value: + arrow_array.append_double(-1.0e126) + else: + arrow_array.append_double(atof(value.chars[:value.num_chars])) + + +cdef int convert_number_to_arrow_int64(OracleArrowArray arrow_array, + OracleDataBuffer *buffer) except -1: + """ + Converts a NUMBER value stored in the buffer to Arrow INT64. + """ + cdef OracleNumber *value = &buffer.as_number + arrow_array.append_int64(atoi(value.chars[:value.num_chars])) + + +cdef int convert_number_to_arrow_string(OracleArrowArray arrow_array, + OracleDataBuffer *buffer) except -1: + """ + Converts a NUMBER value stored in the buffer to Arrow string. + """ + cdef: + OracleNumber *value = &buffer.as_number + char* ptr + if value.is_max_negative_value: + ptr = "-1e126" + arrow_array.append_bytes(ptr, 6) + else: + arrow_array.append_bytes(value.chars, value.num_chars) + + cdef object convert_number_to_python_decimal(OracleDataBuffer *buffer): """ Converts a NUMBER value stored in the buffer to Python decimal.Decimal(). @@ -131,6 +219,45 @@ cdef object convert_str_to_python(OracleDataBuffer *buffer, uint8_t csfrm, return rb.ptr[:rb.num_bytes].decode(ENCODING_UTF16, encoding_errors) +cdef int convert_oracle_data_to_arrow(OracleMetadata from_metadata, + OracleMetadata to_metadata, + OracleData* data, + OracleArrowArray arrow_array) except -1: + """ + Converts the value stored in OracleData to Arrow format. + """ + cdef: + ArrowType arrow_type + uint32_t db_type_num + OracleRawBytes* rb + int64_t ts + + # NULL values + if data.is_null: + return arrow_array.append_null() + + arrow_type = to_metadata._arrow_type + db_type_num = from_metadata.dbtype.num + if arrow_type == NANOARROW_TYPE_INT64: + convert_number_to_arrow_int64(arrow_array, &data.buffer) + elif arrow_type == NANOARROW_TYPE_DOUBLE: + if db_type_num == DB_TYPE_NUM_NUMBER: + convert_number_to_arrow_double(arrow_array, &data.buffer) + else: + arrow_array.append_double(data.buffer.as_double) + elif arrow_type == NANOARROW_TYPE_FLOAT: + arrow_array.append_float(data.buffer.as_float) + elif arrow_type == NANOARROW_TYPE_STRING: + rb = &data.buffer.as_raw_bytes + arrow_array.append_bytes( rb.ptr, rb.num_bytes) + elif arrow_type == NANOARROW_TYPE_TIMESTAMP: + ts = int(convert_date_to_python(&data.buffer).timestamp() * + arrow_array.factor) + arrow_array.append_int64(ts) + elif arrow_type == NANOARROW_TYPE_DECIMAL128: + convert_number_to_arrow_decimal(arrow_array, &data.buffer) + + cdef object convert_oracle_data_to_python(OracleMetadata from_metadata, OracleMetadata to_metadata, OracleData* data, diff --git a/src/oracledb/impl/base/cursor.pyx b/src/oracledb/impl/base/cursor.pyx index 11f5f8bc..a4ae7c78 100644 --- a/src/oracledb/impl/base/cursor.pyx +++ b/src/oracledb/impl/base/cursor.pyx @@ -1,5 +1,5 @@ #------------------------------------------------------------------------------ -# Copyright (c) 2020, 2024, Oracle and/or its affiliates. +# Copyright (c) 2020, 2025, Oracle and/or its affiliates. 
# # This software is dual-licensed to you under the Universal Permissive License # (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License @@ -234,6 +234,8 @@ cdef class BaseCursorImpl: # finalize variable and store in arrays var_impl._finalize_init() + if self.fetching_arrow: + var_impl._create_arrow_array() self.fetch_var_impls[pos] = var_impl return var_impl @@ -358,6 +360,12 @@ cdef class BaseCursorImpl: self.bind_vars_by_name = None self.bind_style = None + cdef int _create_arrow_arrays(self) except -1: + cdef BaseVarImpl var_impl + for var_impl in self.fetch_var_impls: + if var_impl._arrow_array is None: + var_impl._create_arrow_array() + def _prepare_for_execute(self, object cursor, str statement, object parameters, object keyword_parameters): """ @@ -502,6 +510,19 @@ cdef class BaseCursorImpl: self._bind_values(cursor, type_handler, parameters, num_rows, row_num, defer_type_assignment) + cdef object _finish_building_arrow_arrays(self): + """ + Flush all buffers and return an Oracle Data frame. + """ + cdef: + BaseVarImpl var_impl + list columns = [] + for var_impl in self.fetch_var_impls: + var_impl._arrow_array.finish_building() + columns.append(var_impl._arrow_array) + var_impl._arrow_array = None + return PY_TYPE_DATAFRAME(columns) + def close(self, bint in_del=False): """ Closes the cursor and makes it unusable for further operations. @@ -551,6 +572,35 @@ cdef class BaseCursorImpl: if self._buffer_rowcount > 0: return self._create_row() + def fetch_df_all(self, cursor): + """ + Internal method used for fetching all data as OracleDataFrame + """ + while self._more_rows_to_fetch: + self._fetch_rows(cursor) + return self._finish_building_arrow_arrays() + + def fetch_df_batches(self, cursor, int batch_size): + """ + Internal method used for fetching next batch as OracleDataFrame + cursor.arraysize = batchsize + """ + cdef: + BaseConnImpl conn_impl = self._get_conn_impl() + bint returned = False + + # Return the prefetched batch (thin mode) + if conn_impl.thin: + returned = True + yield self._finish_building_arrow_arrays() + + while self._more_rows_to_fetch: + self._create_arrow_arrays() + self._fetch_rows(cursor) + if not returned or self._buffer_rowcount > 0: + returned = True + yield self._finish_building_arrow_arrays() + def get_array_dml_row_counts(self): errors._raise_not_supported("getting a list of array DML row counts") diff --git a/src/oracledb/impl/base/metadata.pyx b/src/oracledb/impl/base/metadata.pyx index 206f5ea8..e4bda9ca 100644 --- a/src/oracledb/impl/base/metadata.pyx +++ b/src/oracledb/impl/base/metadata.pyx @@ -1,5 +1,5 @@ #------------------------------------------------------------------------------ -# Copyright (c) 2024, Oracle and/or its affiliates. +# Copyright (c) 2024, 2025, Oracle and/or its affiliates. # # This software is dual-licensed to you under the Universal Permissive License # (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License @@ -54,6 +54,40 @@ cdef class OracleMetadata: else: self._py_type_num = PY_TYPE_NUM_FLOAT + cdef int _set_arrow_type(self) except -1: + """ + Determine the arrow type to use for the data. 
+ """ + cdef: + uint8_t py_type_num = self._py_type_num + uint32_t db_type_num = self.dbtype.num + if db_type_num == DB_TYPE_NUM_NUMBER: + if py_type_num == PY_TYPE_NUM_DECIMAL: + self._arrow_type = NANOARROW_TYPE_DECIMAL128 + elif py_type_num == PY_TYPE_NUM_STR: + self._arrow_type = NANOARROW_TYPE_STRING + elif py_type_num == PY_TYPE_NUM_INT and self.scale == 0 \ + and self.precision <= 18: + self._arrow_type = NANOARROW_TYPE_INT64 + else: + self._arrow_type = NANOARROW_TYPE_DOUBLE + elif db_type_num in (DB_TYPE_NUM_CHAR, DB_TYPE_NUM_VARCHAR): + self._arrow_type = NANOARROW_TYPE_STRING + elif db_type_num == DB_TYPE_NUM_BINARY_FLOAT: + self._arrow_type = NANOARROW_TYPE_FLOAT + elif db_type_num == DB_TYPE_NUM_BINARY_DOUBLE: + self._arrow_type = NANOARROW_TYPE_DOUBLE + elif db_type_num == DB_TYPE_NUM_BOOLEAN: + self._arrow_type = NANOARROW_TYPE_BOOL + elif db_type_num in (DB_TYPE_NUM_DATE, + DB_TYPE_NUM_TIMESTAMP, + DB_TYPE_NUM_TIMESTAMP_LTZ, + DB_TYPE_NUM_TIMESTAMP_TZ): + self._arrow_type = NANOARROW_TYPE_TIMESTAMP + else: + errors._raise_err(errors.ERR_ARROW_UNSUPPORTED_DATA_TYPE, + db_type_name=self.dbtype.name) + cdef OracleMetadata copy(self): """ Create a copy of the metadata and return it. diff --git a/src/oracledb/impl/base/utils.pyx b/src/oracledb/impl/base/utils.pyx index a66d55a0..04d19022 100644 --- a/src/oracledb/impl/base/utils.pyx +++ b/src/oracledb/impl/base/utils.pyx @@ -223,6 +223,7 @@ def init_base_impl(package): PY_TYPE_ASYNC_LOB, \ PY_TYPE_CONNECT_PARAMS, \ PY_TYPE_CURSOR, \ + PY_TYPE_DATAFRAME, \ PY_TYPE_DB_OBJECT, \ PY_TYPE_DB_OBJECT_TYPE, \ PY_TYPE_FETCHINFO, \ @@ -248,6 +249,7 @@ def init_base_impl(package): PY_TYPE_ASYNC_LOB = package.AsyncLOB PY_TYPE_CONNECT_PARAMS = package.ConnectParams PY_TYPE_CURSOR = package.Cursor + PY_TYPE_DATAFRAME = package.OracleDataFrame PY_TYPE_DB_OBJECT = package.DbObject PY_TYPE_DB_OBJECT_TYPE = package.DbObjectType PY_TYPE_FETCHINFO = package.FetchInfo diff --git a/src/oracledb/impl/base/var.pyx b/src/oracledb/impl/base/var.pyx index 9b96bf9f..0cf47f93 100644 --- a/src/oracledb/impl/base/var.pyx +++ b/src/oracledb/impl/base/var.pyx @@ -1,5 +1,5 @@ #------------------------------------------------------------------------------ -# Copyright (c) 2020, 2024, Oracle and/or its affiliates. +# Copyright (c) 2020, 2025, Oracle and/or its affiliates. # # This software is dual-licensed to you under the Universal Permissive License # (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License @@ -247,6 +247,28 @@ cdef class BaseVarImpl: input_type=self._fetch_metadata.dbtype.name, output_type=self.metadata.dbtype.name) + cdef int _create_arrow_array(self) except -1: + """ + Creates an Arrow array based on the type information selected by the + user. + """ + cdef ArrowTimeUnit time_unit = NANOARROW_TIME_UNIT_SECOND + self.metadata._set_arrow_type() + if self.metadata._arrow_type == NANOARROW_TYPE_TIMESTAMP: + if self.metadata.scale > 0 and self.metadata.scale <= 3: + time_unit = NANOARROW_TIME_UNIT_MILLI + elif self.metadata.scale > 3 and self.metadata.scale <= 6: + time_unit = NANOARROW_TIME_UNIT_MICRO + elif self.metadata.scale > 6 and self.metadata.scale <= 9: + time_unit = NANOARROW_TIME_UNIT_NANO + self._arrow_array = OracleArrowArray( + arrow_type=self.metadata._arrow_type, + name=self.metadata.name, + precision=self.metadata.precision, + scale=self.metadata.scale, + time_unit=time_unit, + ) + cdef int _finalize_init(self) except -1: """ Internal method that finalizes initialization of the variable. 
diff --git a/src/oracledb/impl/thick/cursor.pyx b/src/oracledb/impl/thick/cursor.pyx index 4353ea7d..38a0e1a3 100644 --- a/src/oracledb/impl/thick/cursor.pyx +++ b/src/oracledb/impl/thick/cursor.pyx @@ -1,5 +1,5 @@ #------------------------------------------------------------------------------ -# Copyright (c) 2020, 2024, Oracle and/or its affiliates. +# Copyright (c) 2020, 2025, Oracle and/or its affiliates. # # This software is dual-licensed to you under the Universal Permissive License # (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License @@ -151,6 +151,8 @@ cdef class ThickCursorImpl(BaseCursorImpl): self._buffer_index = 0 self._buffer_rowcount = num_rows_in_buffer self._more_rows_to_fetch = more_rows_to_fetch + if self.fetching_arrow: + self._populate_arrow_arrays() cdef BaseConnImpl _get_conn_impl(self): """ @@ -250,6 +252,17 @@ cdef class ThickCursorImpl(BaseCursorImpl): if status < 0: _raise_from_odpi() + cdef int _populate_arrow_arrays(self) except -1: + """ + Populate Arrow arrays with fetched data. + """ + cdef: + ThickVarImpl var_impl + uint32_t i + for var_impl in self.fetch_var_impls: + for i in range(self._buffer_rowcount): + var_impl._transform_element_to_arrow(i) + def _set_oci_attr(self, uint32_t attr_num, uint32_t attr_type, object value): """ diff --git a/src/oracledb/impl/thick/var.pyx b/src/oracledb/impl/thick/var.pyx index 186952a0..25584df0 100644 --- a/src/oracledb/impl/thick/var.pyx +++ b/src/oracledb/impl/thick/var.pyx @@ -1,5 +1,5 @@ #------------------------------------------------------------------------------ -# Copyright (c) 2020, 2024, Oracle and/or its affiliates. +# Copyright (c) 2020, 2025, Oracle and/or its affiliates. # # This software is dual-licensed to you under the Universal Permissive License # (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License @@ -341,6 +341,84 @@ cdef class ThickVarImpl(BaseVarImpl): cpython.PyList_SET_ITEM(return_value, i, element_value) return return_value + cdef int _transform_element_to_arrow(self, uint32_t pos): + """ + Transforms a single element from the value supplied by ODPI-C to its + equivalent Arrow format. 
+ """ + cdef: + dpiData *data = &self._data[pos] + uint32_t ora_type_num + OracleData ora_data + dpiBytes *as_bytes + ora_data.is_null = data.isNull + if not data.isNull: + ora_type_num = self._fetch_metadata.dbtype.num + if ora_type_num == DPI_ORACLE_TYPE_NATIVE_DOUBLE: + ora_data.buffer.as_double = data.value.asDouble + elif ora_type_num == DPI_ORACLE_TYPE_NATIVE_FLOAT: + ora_data.buffer.as_float = data.value.asFloat + elif ora_type_num == DPI_ORACLE_TYPE_BOOLEAN: + ora_data.buffer.as_bool = data.value.asBoolean + elif ora_type_num in ( + DPI_ORACLE_TYPE_CHAR, + DPI_ORACLE_TYPE_LONG_VARCHAR, + DPI_ORACLE_TYPE_LONG_RAW, + DPI_ORACLE_TYPE_RAW, + DPI_ORACLE_TYPE_VARCHAR, + ): + as_bytes = &data.value.asBytes; + ora_data.buffer.as_raw_bytes.ptr = \ + as_bytes.ptr; + ora_data.buffer.as_raw_bytes.num_bytes = as_bytes.length; + elif ora_type_num in ( + DPI_ORACLE_TYPE_DATE, + DPI_ORACLE_TYPE_TIMESTAMP, + DPI_ORACLE_TYPE_TIMESTAMP_LTZ, + DPI_ORACLE_TYPE_TIMESTAMP_TZ, + ): + ora_data.buffer.as_date.year = data.value.asTimestamp.year; + ora_data.buffer.as_date.month = data.value.asTimestamp.month; + ora_data.buffer.as_date.day = data.value.asTimestamp.day; + ora_data.buffer.as_date.hour = data.value.asTimestamp.hour; + ora_data.buffer.as_date.minute = data.value.asTimestamp.minute; + ora_data.buffer.as_date.second = data.value.asTimestamp.second; + ora_data.buffer.as_date.fsecond = \ + data.value.asTimestamp.fsecond // 1000; + ora_data.buffer.as_date.tz_hour_offset = \ + data.value.asTimestamp.tzHourOffset; + ora_data.buffer.as_date.tz_minute_offset = \ + data.value.asTimestamp.tzMinuteOffset; + elif ora_type_num == DPI_ORACLE_TYPE_INTERVAL_DS: + ora_data.buffer.as_interval_ds.days = \ + data.value.asIntervalDS.days; + ora_data.buffer.as_interval_ds.hours = \ + data.value.asIntervalDS.hours; + ora_data.buffer.as_interval_ds.minutes = \ + data.value.asIntervalDS.minutes; + ora_data.buffer.as_interval_ds.seconds = \ + data.value.asIntervalDS.seconds; + ora_data.buffer.as_interval_ds.fseconds = \ + data.value.asIntervalDS.fseconds; + elif ora_type_num == DPI_ORACLE_TYPE_INTERVAL_YM: + ora_data.buffer.as_interval_ym.years = \ + data.value.asIntervalYM.years; + ora_data.buffer.as_interval_ym.months = \ + data.value.asIntervalYM.months; + elif ora_type_num == DPI_ORACLE_TYPE_NUMBER: + as_bytes = &data.value.asBytes; + ora_data.buffer.as_number.is_max_negative_value = 0; + ora_data.buffer.as_number.is_integer = \ + memchr(as_bytes.ptr, b'.', as_bytes.length) == NULL; + memcpy(ora_data.buffer.as_number.chars, as_bytes.ptr, + as_bytes.length); + ora_data.buffer.as_number.num_chars = as_bytes.length; + else: + errors._raise_err(errors.ERR_DB_TYPE_NOT_SUPPORTED, + name=self._fetch_metadata.dbtype.name) + convert_oracle_data_to_arrow(self._fetch_metadata, self.metadata, + &ora_data, self._arrow_array) + cdef object _transform_element_to_python(self, uint32_t pos, dpiData *data): """ diff --git a/src/oracledb/impl/thin/messages.pyx b/src/oracledb/impl/thin/messages.pyx index 547f8b49..2475166f 100644 --- a/src/oracledb/impl/thin/messages.pyx +++ b/src/oracledb/impl/thin/messages.pyx @@ -754,6 +754,10 @@ cdef class MessageWithData(Message): var_impl._fetch_metadata) statement._last_output_type_handler = type_handler + # Create OracleArrowArray if fetching arrow is enabled + if cursor_impl.fetching_arrow: + cursor_impl._create_arrow_arrays() + # the list of output variables is equivalent to the fetch variables self.out_var_impls = cursor_impl.fetch_var_impls @@ -838,10 +842,15 @@ cdef class 
MessageWithData(Message): buf.read_oracle_data(metadata, &data, from_dbobject=False) if metadata.dbtype._csfrm == CS_FORM_NCHAR: buf._caps._check_ncharset_id() - column_value = convert_oracle_data_to_python( - metadata, var_impl.metadata, &data, var_impl._encoding_errors, - from_dbobject=False - ) + if self.cursor_impl.fetching_arrow: + convert_oracle_data_to_arrow( + metadata, var_impl.metadata, &data, var_impl._arrow_array + ) + else: + column_value = convert_oracle_data_to_python( + metadata, var_impl.metadata, &data, + var_impl._encoding_errors, from_dbobject=False + ) if not self.in_fetch: buf.read_sb4(&actual_num_bytes) if actual_num_bytes < 0 and ora_type_num == ORA_TYPE_NUM_BOOLEAN: @@ -2116,6 +2125,8 @@ cdef class ExecuteMessage(MessageWithData): self.cursor_impl._set_fetch_array_size(num_iters) if num_iters > 0 and not stmt._no_prefetch: options |= TNS_EXEC_OPTION_FETCH + if self.cursor_impl.fetching_arrow: + options |= TNS_EXEC_OPTION_NO_COMPRESSED_FETCH if not stmt._is_plsql and not self.parse_only: options |= TNS_EXEC_OPTION_NOT_PLSQL elif stmt._is_plsql and num_params > 0: @@ -2239,6 +2250,8 @@ cdef class ExecuteMessage(MessageWithData): and not info._is_return_bind] if self.function_code == TNS_FUNC_REEXECUTE_AND_FETCH: exec_flags_1 |= TNS_EXEC_OPTION_EXECUTE + if self.cursor_impl.fetching_arrow: + exec_flags_1 |= TNS_EXEC_OPTION_NO_COMPRESSED_FETCH num_iters = self.cursor_impl.prefetchrows self.cursor_impl._set_fetch_array_size(num_iters) else: diff --git a/src/oracledb/interchange/__init__.py b/src/oracledb/interchange/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/oracledb/interchange/buffer.py b/src/oracledb/interchange/buffer.py new file mode 100644 index 00000000..04461be0 --- /dev/null +++ b/src/oracledb/interchange/buffer.py @@ -0,0 +1,82 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) 2025, Oracle and/or its affiliates. +# +# This software is dual-licensed to you under the Universal Permissive License +# (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License +# 2.0 as shown at http://www.apache.org/licenses/LICENSE-2.0. You may choose +# either license. +# +# If you elect to accept the software under the Apache License, Version 2.0, +# the following applies: +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ----------------------------------------------------------------------------- + +# ----------------------------------------------------------------------------- +# buffer.py +# +# Implements the Buffer class as documented in DataFrame API +# ----------------------------------------------------------------------------- + +from .protocol import ( + Buffer, + DlpackDeviceType, +) + + +class OracleColumnBuffer(Buffer): + """ + OracleColumnBuffer represents a contiguous memory buffer in the DataFrame + Interchange Protocol. 
It provides access to raw binary data that backs + various components of the data frame such as column values, validity masks + and offsets for variable-length data types. + """ + + def __init__(self, buffer_type, size_in_bytes, address) -> None: + self.buffer_type = buffer_type + self.size_in_bytes = size_in_bytes + self.address = address + + def __dlpack__(self): + """ + Represent this structure as a DLPack interface. + """ + raise NotImplementedError("__dlpack__") + + def __dlpack_device__(self) -> tuple[DlpackDeviceType, None]: + """ + Device type and device ID for where the data + in the buffer resides + """ + return (DlpackDeviceType.CPU, None) + + def __repr__(self) -> str: + device = self.__dlpack_device__()[0].name + return ( + f"OracleColumnBuffer(bufsize={self.bufsize}, " + f"ptr={self.ptr}, type={self.buffer_type}, device={device!r})" + ) + + @property + def bufsize(self) -> int: + """ + Returns the total size of buffer in bytes. + """ + return self.size_in_bytes + + @property + def ptr(self) -> int: + """ + Returns the memory address of the buffer. + """ + return self.address diff --git a/src/oracledb/interchange/column.py b/src/oracledb/interchange/column.py new file mode 100644 index 00000000..9bf24a59 --- /dev/null +++ b/src/oracledb/interchange/column.py @@ -0,0 +1,205 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) 2025, Oracle and/or its affiliates. +# +# This software is dual-licensed to you under the Universal Permissive License +# (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License +# 2.0 as shown at http://www.apache.org/licenses/LICENSE-2.0. You may choose +# either license. +# +# If you elect to accept the software under the Apache License, Version 2.0, +# the following applies: +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ----------------------------------------------------------------------------- + +# ----------------------------------------------------------------------------- +# column.py +# +# Implements the Column class as documented in DataFrame API +# ----------------------------------------------------------------------------- + +from typing import Any, Iterable, Optional + +from .buffer import OracleColumnBuffer +from .protocol import ( + Column, + Dtype, + ColumnBuffers, + ColumnNullType, + DtypeKind, + Endianness, +) + +from .nanoarrow_bridge import ( + NANOARROW_TIME_UNIT_SECOND, + NANOARROW_TIME_UNIT_MILLI, + NANOARROW_TIME_UNIT_MICRO, + NANOARROW_TIME_UNIT_NANO, + NANOARROW_TYPE_DOUBLE, + NANOARROW_TYPE_FLOAT, + NANOARROW_TYPE_INT64, + NANOARROW_TYPE_STRING, + NANOARROW_TYPE_TIMESTAMP, + NANOARROW_TYPE_DECIMAL128, +) + + +class OracleColumn(Column): + """ + OracleColumn represents a column in the DataFrame Interchange Protocol. It + provides a standardized way to expose a column's data, metadata and chunks, + allowing interoperability between data frame libraries. 
+ """ + + def __init__(self, ora_arrow_array: object): + self.ora_arrow_array = ora_arrow_array + self._buffer_info = ora_arrow_array.get_buffer_info() + + def __arrow_c_array__(self, requested_schema=None): + return self.ora_arrow_array.__arrow_c_array__( + requested_schema=requested_schema + ) + + def _data_buffer(self): + buffer = self._buffer_info.get("data") + if buffer is None: + return None + size_bytes, address = buffer + data_buffer = OracleColumnBuffer( + size_in_bytes=size_bytes, address=address, buffer_type="data" + ) + return data_buffer, self.dtype + + def _offsets_buffer(self): + buffer = self._buffer_info.get("offsets") + if buffer is None: + return None + size_bytes, address = buffer + offsets_buffer = OracleColumnBuffer( + size_in_bytes=size_bytes, address=address, buffer_type="offsets" + ) + dtype = (DtypeKind.INT, 32, "i", Endianness.NATIVE) + return offsets_buffer, dtype + + def _validity_buffer(self): + buffer = self._buffer_info.get("validity") + if buffer is None: + return None + size_bytes, address = buffer + validity_buffer = OracleColumnBuffer( + size_in_bytes=size_bytes, address=address, buffer_type="validity" + ) + dtype = (DtypeKind.BOOL, 1, "b", Endianness.NATIVE) + return validity_buffer, dtype + + @property + def describe_null(self) -> tuple[ColumnNullType, Optional[int]]: + """ + Returns a description of the null representation used by the column. + """ + if self.null_count == 0: + return ColumnNullType.NON_NULLABLE, None + else: + return ColumnNullType.USE_BITMASK, 0 + + @property + def dtype(self) -> Dtype: + """ + Returns the data type of the column. The returned dtype provides + information on the storage format and the type of data in the column. + """ + if self.ora_arrow_array.arrow_type == NANOARROW_TYPE_INT64: + return (DtypeKind.INT, 64, "l", Endianness.NATIVE) + elif self.ora_arrow_array.arrow_type == NANOARROW_TYPE_DOUBLE: + return (DtypeKind.FLOAT, 64, "g", Endianness.NATIVE) + elif self.ora_arrow_array.arrow_type == NANOARROW_TYPE_FLOAT: + return (DtypeKind.FLOAT, 64, "g", Endianness.NATIVE) + elif self.ora_arrow_array.arrow_type == NANOARROW_TYPE_STRING: + return (DtypeKind.STRING, 8, "u", Endianness.NATIVE) + elif self.ora_arrow_array.arrow_type == NANOARROW_TYPE_TIMESTAMP: + if self.ora_arrow_array.time_unit == NANOARROW_TIME_UNIT_MICRO: + return (DtypeKind.DATETIME, 64, "tsu:", Endianness.NATIVE) + elif self.ora_arrow_array.time_unit == NANOARROW_TIME_UNIT_SECOND: + return (DtypeKind.DATETIME, 64, "tss:", Endianness.NATIVE) + elif self.ora_arrow_array.time_unit == NANOARROW_TIME_UNIT_MILLI: + return (DtypeKind.DATETIME, 64, "tsm:", Endianness.NATIVE) + elif self.ora_arrow_array.time_unit == NANOARROW_TIME_UNIT_NANO: + return (DtypeKind.DATETIME, 64, "tsn:", Endianness.NATIVE) + elif self.ora_arrow_array.arrow_type == NANOARROW_TYPE_DECIMAL128: + array = self.ora_arrow_array + return ( + DtypeKind.DECIMAL, + 128, + f"d:{array.precision}.{array.scale}", + Endianness.NATIVE, + ) + + def get_buffers(self) -> ColumnBuffers: + """ + Returns a dictionary specifying the memory buffers backing the column. 
+ This currently consists of: + - "data": the main buffer storing column values + - "validity": a buffer containing null/missing values + - "offsets": a buffer for variable-length types like string + """ + return { + "data": self._data_buffer(), + "validity": self._validity_buffer(), + "offsets": self._offsets_buffer(), + } + + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[Column]: + """ + Return an iterator containing the column chunks. Currently this only + returns itself. + """ + yield self + + @property + def metadata(self) -> dict[str, Any]: + """ + Returns metadata about the column. + """ + return { + "name": self.ora_arrow_array.name, + "size": self.size(), + "num_chunks": self.num_chunks(), + } + + @property + def null_count(self) -> int: + """ + Returns the number of null elements. + """ + return self.ora_arrow_array.null_count + + def num_chunks(self) -> int: + """ + Returns the number of chunks used by the column. This method currently + always returns the value 1, implying that the column uses contiguous + memory. + """ + return 1 + + @property + def offset(self) -> int: + """ + Returns the offset of the first element. + """ + return self.ora_arrow_array.offset + + def size(self) -> int: + """ + Returns the number of elements in the column. + """ + return len(self.ora_arrow_array) diff --git a/src/oracledb/interchange/dataframe.py b/src/oracledb/interchange/dataframe.py new file mode 100644 index 00000000..f305ba8c --- /dev/null +++ b/src/oracledb/interchange/dataframe.py @@ -0,0 +1,151 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) 2025, Oracle and/or its affiliates. +# +# This software is dual-licensed to you under the Universal Permissive License +# (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License +# 2.0 as shown at http://www.apache.org/licenses/LICENSE-2.0. You may choose +# either license. +# +# If you elect to accept the software under the Apache License, Version 2.0, +# the following applies: +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ----------------------------------------------------------------------------- + +# ----------------------------------------------------------------------------- +# dataframe.py +# +# Implement DataFrame class as documented in the standard +# https://data-apis.org/dataframe-protocol/latest/API.html +# ----------------------------------------------------------------------------- + +from typing import Any, Dict, Iterable, List, Optional + +from .column import OracleColumn + +from .protocol import DataFrame + + +class OracleDataFrame(DataFrame): + """ + OracleDataFrame is an implementation of the DataFrame Interchange Protocol. + It provides an interface for exchanging tabular data between different data + frame libraries (e.g. pandas, pyarrow, polars). 
+ """ + + def __init__( + self, + oracle_arrow_arrays: List, + allow_copy: bool = True, + ): + self._cols = [] + self._cols_map = {} + self._rows = None + self._arrays = oracle_arrow_arrays + for ora_arrow_array in oracle_arrow_arrays: + column = OracleColumn(ora_arrow_array=ora_arrow_array) + self._rows = column.size() + self._cols.append(column) + self._cols_map[ora_arrow_array.name] = column + self.allow_copy = allow_copy + + def __dataframe__( + self, + nan_as_null: bool = False, # noqa: FBT001 + allow_copy: bool = True, # noqa: FBT001 + ) -> DataFrame: + """ + Returns a data frame adhering to the DataFrame Interchange protocol. + """ + return self + + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable[DataFrame]: + """ + Returns an iterator for each of the chunks in the data frame. Since + there is currently only one chunk, this simply returns itself. + """ + yield self + + def column_arrays(self) -> List: + """ + Returns a list of the Arrow arrays corresponding to each column in the + data frame. + """ + return self._arrays + + def column_names(self) -> List[str]: + """ + Returns a list of the names of the columns in the data frame. + """ + return list(self._cols_map.keys()) + + def get_column(self, i: int) -> OracleColumn: + """ + Returns a column from the data frame given its zero-based index. If the + index is out of range, an IndexError exception is raised. + """ + if i < 0 or i >= self.num_columns(): + raise IndexError( + f"Column index {i} is out of bounds for " + f"DataFrame with {self.num_columns()} columns" + ) + return self._cols[i] + + def get_column_by_name(self, name: str) -> OracleColumn: + """ + Returns a column from the data frame given the name of the column. If + the column name is not found, a KeyError exception is raised. + """ + if name not in self._cols_map: + raise KeyError(f"Column {name} not found in DataFrame") + return self._cols_map[name] + + def get_columns(self) -> List[OracleColumn]: + """ + Returns a list of all of the columns in the data frame. + """ + return self._cols + + @property + def metadata(self) -> Dict[str, Any]: + """ + Returns metadata for the data frame. Currently this returns + information about the number of columns (num_columns), number of rows + (num_rows) and number of chunks (num_chunks). + """ + return { + "num_columns": self.num_columns(), + "num_rows": self.num_rows(), + "num_chunks": self.num_chunks(), + } + + def num_chunks(self) -> int: + """ + Returns the number of chunks (contiguous memory blocks) in the data + frame. Currently this always returns 1. + """ + return 1 + + def num_columns(self) -> int: + """ + Returns the number of columns in the data frame. + """ + return len(self._cols) + + def num_rows(self) -> int: + """ + Returns the number of rows in the data frame. + """ + return self._rows diff --git a/src/oracledb/interchange/nanoarrow/nanoarrow.c b/src/oracledb/interchange/nanoarrow/nanoarrow.c new file mode 100644 index 00000000..8f265988 --- /dev/null +++ b/src/oracledb/interchange/nanoarrow/nanoarrow.c @@ -0,0 +1,3872 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include + +#include "nanoarrow.h" + +const char* ArrowNanoarrowVersion(void) { return NANOARROW_VERSION; } + +int ArrowNanoarrowVersionInt(void) { return NANOARROW_VERSION_INT; } + +ArrowErrorCode ArrowErrorSet(struct ArrowError* error, const char* fmt, ...) { + if (error == NULL) { + return NANOARROW_OK; + } + + memset(error->message, 0, sizeof(error->message)); + + va_list args; + va_start(args, fmt); + int chars_needed = vsnprintf(error->message, sizeof(error->message), fmt, args); + va_end(args); + + if (chars_needed < 0) { + return EINVAL; + } else if (((size_t)chars_needed) >= sizeof(error->message)) { + return ERANGE; + } else { + return NANOARROW_OK; + } +} + +void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type) { + layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_VALIDITY; + layout->buffer_data_type[0] = NANOARROW_TYPE_BOOL; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[1] = storage_type; + layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[2] = NANOARROW_TYPE_UNINITIALIZED; + + layout->element_size_bits[0] = 1; + layout->element_size_bits[1] = 0; + layout->element_size_bits[2] = 0; + + layout->child_size_elements = 0; + + switch (storage_type) { + case NANOARROW_TYPE_UNINITIALIZED: + case NANOARROW_TYPE_NA: + case NANOARROW_TYPE_RUN_END_ENCODED: + layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[0] = NANOARROW_TYPE_UNINITIALIZED; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; + layout->element_size_bits[0] = 0; + break; + + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_MAP: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; + layout->element_size_bits[1] = 32; + break; + + case NANOARROW_TYPE_LARGE_LIST: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; + layout->element_size_bits[1] = 64; + break; + + case NANOARROW_TYPE_STRUCT: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; + break; + + case NANOARROW_TYPE_BOOL: + layout->element_size_bits[1] = 1; + break; + + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT8: + layout->element_size_bits[1] = 8; + break; + + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_HALF_FLOAT: + layout->element_size_bits[1] = 16; + break; + + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_FLOAT: + layout->element_size_bits[1] = 32; + break; + case NANOARROW_TYPE_INTERVAL_MONTHS: + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; + layout->element_size_bits[1] = 32; + break; + + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_DOUBLE: + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + layout->element_size_bits[1] = 64; + break; + + case NANOARROW_TYPE_DECIMAL128: + case 
NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + layout->element_size_bits[1] = 128; + break; + + case NANOARROW_TYPE_DECIMAL256: + layout->element_size_bits[1] = 256; + break; + + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + layout->buffer_data_type[1] = NANOARROW_TYPE_BINARY; + break; + + case NANOARROW_TYPE_DENSE_UNION: + layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_TYPE_ID; + layout->buffer_data_type[0] = NANOARROW_TYPE_INT8; + layout->element_size_bits[0] = 8; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_UNION_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; + layout->element_size_bits[1] = 32; + break; + + case NANOARROW_TYPE_SPARSE_UNION: + layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_TYPE_ID; + layout->buffer_data_type[0] = NANOARROW_TYPE_INT8; + layout->element_size_bits[0] = 8; + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; + layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; + break; + + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; + layout->element_size_bits[1] = 32; + layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = storage_type; + break; + + case NANOARROW_TYPE_LARGE_STRING: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; + layout->element_size_bits[1] = 64; + layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = NANOARROW_TYPE_STRING; + break; + case NANOARROW_TYPE_LARGE_BINARY: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; + layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; + layout->element_size_bits[1] = 64; + layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[2] = NANOARROW_TYPE_BINARY; + break; + + case NANOARROW_TYPE_BINARY_VIEW: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[1] = NANOARROW_TYPE_BINARY_VIEW; + layout->element_size_bits[1] = 128; + break; + case NANOARROW_TYPE_STRING_VIEW: + layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; + layout->buffer_data_type[1] = NANOARROW_TYPE_STRING_VIEW; + layout->element_size_bits[1] = 128; + + default: + break; + } +} + +void* ArrowMalloc(int64_t size) { return malloc(size); } + +void* ArrowRealloc(void* ptr, int64_t size) { return realloc(ptr, size); } + +void ArrowFree(void* ptr) { free(ptr); } + +static uint8_t* ArrowBufferAllocatorMallocReallocate( + struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size, + int64_t new_size) { + NANOARROW_UNUSED(allocator); + NANOARROW_UNUSED(old_size); + return (uint8_t*)ArrowRealloc(ptr, new_size); +} + +static void ArrowBufferAllocatorMallocFree(struct ArrowBufferAllocator* allocator, + uint8_t* ptr, int64_t size) { + NANOARROW_UNUSED(allocator); + NANOARROW_UNUSED(size); + if (ptr != NULL) { + ArrowFree(ptr); + } +} + +static struct ArrowBufferAllocator ArrowBufferAllocatorMalloc = { + &ArrowBufferAllocatorMallocReallocate, &ArrowBufferAllocatorMallocFree, NULL}; + +struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void) { + return ArrowBufferAllocatorMalloc; +} + +static uint8_t* ArrowBufferDeallocatorReallocate(struct ArrowBufferAllocator* allocator, + uint8_t* ptr, int64_t old_size, + int64_t new_size) { + NANOARROW_UNUSED(new_size); + + // Attempting to reallocate a buffer with a custom deallocator is + // a programming error. In debug mode, crash here. 
+#if defined(NANOARROW_DEBUG) + NANOARROW_PRINT_AND_DIE(ENOMEM, + "It is an error to reallocate a buffer whose allocator is " + "ArrowBufferDeallocator()"); +#endif + + // In release mode, ensure the the deallocator is called exactly + // once using the pointer it was given and return NULL, which + // will trigger the caller to return ENOMEM. + allocator->free(allocator, ptr, old_size); + *allocator = ArrowBufferAllocatorDefault(); + return NULL; +} + +struct ArrowBufferAllocator ArrowBufferDeallocator( + void (*custom_free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, + int64_t size), + void* private_data) { + struct ArrowBufferAllocator allocator; + allocator.reallocate = &ArrowBufferDeallocatorReallocate; + allocator.free = custom_free; + allocator.private_data = private_data; + return allocator; +} + +static const int kInt32DecimalDigits = 9; + +static const uint64_t kUInt32PowersOfTen[] = { + 1ULL, 10ULL, 100ULL, 1000ULL, 10000ULL, + 100000ULL, 1000000ULL, 10000000ULL, 100000000ULL, 1000000000ULL}; + +// Adapted from Arrow C++ to use 32-bit words for better C portability +// https://github.com/apache/arrow/blob/cd3321b28b0c9703e5d7105d6146c1270bbadd7f/cpp/src/arrow/util/decimal.cc#L524-L544 +static void ShiftAndAdd(struct ArrowStringView value, uint32_t* out, int64_t out_size) { + // We use strtoll for parsing, which needs input that is null-terminated + char chunk_string[16]; + + for (int64_t posn = 0; posn < value.size_bytes;) { + int64_t remaining = value.size_bytes - posn; + + int64_t group_size; + if (remaining > kInt32DecimalDigits) { + group_size = kInt32DecimalDigits; + } else { + group_size = remaining; + } + + const uint64_t multiple = kUInt32PowersOfTen[group_size]; + + memcpy(chunk_string, value.data + posn, group_size); + chunk_string[group_size] = '\0'; + uint32_t chunk = (uint32_t)strtoll(chunk_string, NULL, 10); + + for (int64_t i = 0; i < out_size; i++) { + uint64_t tmp = out[i]; + tmp *= multiple; + tmp += chunk; + out[i] = (uint32_t)(tmp & 0xFFFFFFFFULL); + chunk = (uint32_t)(tmp >> 32); + } + posn += group_size; + } +} + +ArrowErrorCode ArrowDecimalSetDigits(struct ArrowDecimal* decimal, + struct ArrowStringView value) { + // Check for sign + int is_negative = value.data[0] == '-'; + int has_sign = is_negative || value.data[0] == '+'; + value.data += has_sign; + value.size_bytes -= has_sign; + + // Check all characters are digits that are not the negative sign + for (int64_t i = 0; i < value.size_bytes; i++) { + char c = value.data[i]; + if (c < '0' || c > '9') { + return EINVAL; + } + } + + // Skip over leading 0s + int64_t n_leading_zeroes = 0; + for (int64_t i = 0; i < value.size_bytes; i++) { + if (value.data[i] == '0') { + n_leading_zeroes++; + } else { + break; + } + } + + value.data += n_leading_zeroes; + value.size_bytes -= n_leading_zeroes; + + // Use 32-bit words for portability + uint32_t words32[8]; + int n_words32 = decimal->n_words * 2; + NANOARROW_DCHECK(n_words32 <= 8); + memset(words32, 0, sizeof(words32)); + + ShiftAndAdd(value, words32, n_words32); + + if (decimal->low_word_index == 0) { + memcpy(decimal->words, words32, sizeof(uint32_t) * n_words32); + } else { + uint64_t lo; + uint64_t hi; + + for (int i = 0; i < decimal->n_words; i++) { + lo = (uint64_t)words32[i * 2]; + hi = (uint64_t)words32[i * 2 + 1] << 32; + decimal->words[decimal->n_words - i - 1] = lo | hi; + } + } + + if (is_negative) { + ArrowDecimalNegate(decimal); + } + + return NANOARROW_OK; +} + +// Adapted from Arrow C++ for C +// 
https://github.com/apache/arrow/blob/cd3321b28b0c9703e5d7105d6146c1270bbadd7f/cpp/src/arrow/util/decimal.cc#L365 +ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decimal, + struct ArrowBuffer* buffer) { + NANOARROW_DCHECK(decimal->n_words == 2 || decimal->n_words == 4); + int is_negative = ArrowDecimalSign(decimal) < 0; + + uint64_t words_little_endian[4]; + if (decimal->low_word_index == 0) { + memcpy(words_little_endian, decimal->words, decimal->n_words * sizeof(uint64_t)); + } else { + for (int i = 0; i < decimal->n_words; i++) { + words_little_endian[i] = decimal->words[decimal->n_words - i - 1]; + } + } + + // We've already made a copy, so negate that if needed + if (is_negative) { + uint64_t carry = 1; + for (int i = 0; i < decimal->n_words; i++) { + uint64_t elem = words_little_endian[i]; + elem = ~elem + carry; + carry &= (elem == 0); + words_little_endian[i] = elem; + } + } + + // Find the most significant word that is non-zero + int most_significant_elem_idx = -1; + for (int i = decimal->n_words - 1; i >= 0; i--) { + if (words_little_endian[i] != 0) { + most_significant_elem_idx = i; + break; + } + } + + // If they are all zero, the output is just '0' + if (most_significant_elem_idx == -1) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt8(buffer, '0')); + return NANOARROW_OK; + } + + // Define segments such that each segment represents 9 digits with the + // least significant group of 9 digits first. For example, if the input represents + // 9876543210123456789, then segments will be [123456789, 876543210, 9]. + // We handle at most a signed 256 bit integer, whose maximum value occupies 77 + // characters. Thus, we need at most 9 segments. + const uint32_t k1e9 = 1000000000U; + int num_segments = 0; + uint32_t segments[9]; + memset(segments, 0, sizeof(segments)); + uint64_t* most_significant_elem = words_little_endian + most_significant_elem_idx; + + do { + // Compute remainder = words_little_endian % 1e9 and words_little_endian = + // words_little_endian / 1e9. + uint32_t remainder = 0; + uint64_t* elem = most_significant_elem; + + do { + // Compute dividend = (remainder << 32) | *elem (a virtual 96-bit integer); + // *elem = dividend / 1e9; + // remainder = dividend % 1e9. + uint32_t hi = (uint32_t)(*elem >> 32); + uint32_t lo = (uint32_t)(*elem & 0xFFFFFFFFULL); + uint64_t dividend_hi = ((uint64_t)(remainder) << 32) | hi; + uint64_t quotient_hi = dividend_hi / k1e9; + remainder = (uint32_t)(dividend_hi % k1e9); + uint64_t dividend_lo = ((uint64_t)(remainder) << 32) | lo; + uint64_t quotient_lo = dividend_lo / k1e9; + remainder = (uint32_t)(dividend_lo % k1e9); + + *elem = (quotient_hi << 32) | quotient_lo; + } while (elem-- != words_little_endian); + + segments[num_segments++] = remainder; + } while (*most_significant_elem != 0 || most_significant_elem-- != words_little_endian); + + // We know our output has no more than 9 digits per segment, plus a negative sign, + // plus any further digits between our output of 9 digits plus enough + // extra characters to ensure that snprintf() with n = 21 (maximum length of %lu + // including a the null terminator) is bounded properly. 
+ NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, num_segments * 9 + 1 + 21 - 9)); + if (is_negative) { + buffer->data[buffer->size_bytes++] = '-'; + } + + // The most significant segment should have no leading zeroes + int n_chars = snprintf((char*)buffer->data + buffer->size_bytes, 21, "%lu", + (unsigned long)segments[num_segments - 1]); + + // Ensure that an encoding error from snprintf() does not result + // in an out-of-bounds access. + if (n_chars < 0) { + return ERANGE; + } + + buffer->size_bytes += n_chars; + + // Subsequent output needs to be left-padded with zeroes such that each segment + // takes up exactly 9 digits. + for (int i = num_segments - 2; i >= 0; i--) { + int n_chars = snprintf((char*)buffer->data + buffer->size_bytes, 21, "%09lu", + (unsigned long)segments[i]); + buffer->size_bytes += n_chars; + NANOARROW_DCHECK(buffer->size_bytes <= buffer->capacity_bytes); + } + + return NANOARROW_OK; +} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include + +#include "nanoarrow.h" + +static void ArrowSchemaReleaseInternal(struct ArrowSchema* schema) { + if (schema->format != NULL) ArrowFree((void*)schema->format); + if (schema->name != NULL) ArrowFree((void*)schema->name); + if (schema->metadata != NULL) ArrowFree((void*)schema->metadata); + + // This object owns the memory for all the children, but those + // children may have been generated elsewhere and might have + // their own release() callback. + if (schema->children != NULL) { + for (int64_t i = 0; i < schema->n_children; i++) { + if (schema->children[i] != NULL) { + if (schema->children[i]->release != NULL) { + ArrowSchemaRelease(schema->children[i]); + } + + ArrowFree(schema->children[i]); + } + } + + ArrowFree(schema->children); + } + + // This object owns the memory for the dictionary but it + // may have been generated somewhere else and have its own + // release() callback. 
+ if (schema->dictionary != NULL) { + if (schema->dictionary->release != NULL) { + ArrowSchemaRelease(schema->dictionary); + } + + ArrowFree(schema->dictionary); + } + + // private data not currently used + if (schema->private_data != NULL) { + ArrowFree(schema->private_data); + } + + schema->release = NULL; +} + +static const char* ArrowSchemaFormatTemplate(enum ArrowType type) { + switch (type) { + case NANOARROW_TYPE_UNINITIALIZED: + return NULL; + case NANOARROW_TYPE_NA: + return "n"; + case NANOARROW_TYPE_BOOL: + return "b"; + + case NANOARROW_TYPE_UINT8: + return "C"; + case NANOARROW_TYPE_INT8: + return "c"; + case NANOARROW_TYPE_UINT16: + return "S"; + case NANOARROW_TYPE_INT16: + return "s"; + case NANOARROW_TYPE_UINT32: + return "I"; + case NANOARROW_TYPE_INT32: + return "i"; + case NANOARROW_TYPE_UINT64: + return "L"; + case NANOARROW_TYPE_INT64: + return "l"; + + case NANOARROW_TYPE_HALF_FLOAT: + return "e"; + case NANOARROW_TYPE_FLOAT: + return "f"; + case NANOARROW_TYPE_DOUBLE: + return "g"; + + case NANOARROW_TYPE_STRING: + return "u"; + case NANOARROW_TYPE_LARGE_STRING: + return "U"; + case NANOARROW_TYPE_STRING_VIEW: + return "vu"; + case NANOARROW_TYPE_BINARY: + return "z"; + case NANOARROW_TYPE_BINARY_VIEW: + return "vz"; + case NANOARROW_TYPE_LARGE_BINARY: + return "Z"; + + case NANOARROW_TYPE_DATE32: + return "tdD"; + case NANOARROW_TYPE_DATE64: + return "tdm"; + case NANOARROW_TYPE_INTERVAL_MONTHS: + return "tiM"; + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + return "tiD"; + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + return "tin"; + + case NANOARROW_TYPE_LIST: + return "+l"; + case NANOARROW_TYPE_LARGE_LIST: + return "+L"; + case NANOARROW_TYPE_STRUCT: + return "+s"; + case NANOARROW_TYPE_MAP: + return "+m"; + case NANOARROW_TYPE_RUN_END_ENCODED: + return "+r"; + + default: + return NULL; + } +} + +static int ArrowSchemaInitChildrenIfNeeded(struct ArrowSchema* schema, + enum ArrowType type) { + switch (type) { + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 1)); + ArrowSchemaInit(schema->children[0]); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "item")); + break; + case NANOARROW_TYPE_MAP: + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 1)); + NANOARROW_RETURN_NOT_OK( + ArrowSchemaInitFromType(schema->children[0], NANOARROW_TYPE_STRUCT)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "entries")); + schema->children[0]->flags &= ~ARROW_FLAG_NULLABLE; + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema->children[0], 2)); + ArrowSchemaInit(schema->children[0]->children[0]); + ArrowSchemaInit(schema->children[0]->children[1]); + NANOARROW_RETURN_NOT_OK( + ArrowSchemaSetName(schema->children[0]->children[0], "key")); + schema->children[0]->children[0]->flags &= ~ARROW_FLAG_NULLABLE; + NANOARROW_RETURN_NOT_OK( + ArrowSchemaSetName(schema->children[0]->children[1], "value")); + break; + case NANOARROW_TYPE_RUN_END_ENCODED: + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 2)); + ArrowSchemaInit(schema->children[0]); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "run_ends")); + schema->children[0]->flags &= ~ARROW_FLAG_NULLABLE; + ArrowSchemaInit(schema->children[1]); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[1], "values")); + default: + break; + } + + return NANOARROW_OK; +} + +void ArrowSchemaInit(struct ArrowSchema* schema) { + schema->format = 
NULL; + schema->name = NULL; + schema->metadata = NULL; + schema->flags = ARROW_FLAG_NULLABLE; + schema->n_children = 0; + schema->children = NULL; + schema->dictionary = NULL; + schema->private_data = NULL; + schema->release = &ArrowSchemaReleaseInternal; +} + +ArrowErrorCode ArrowSchemaSetType(struct ArrowSchema* schema, enum ArrowType type) { + // We don't allocate the dictionary because it has to be nullptr + // for non-dictionary-encoded arrays. + + // Set the format to a valid format string for type + const char* template_format = ArrowSchemaFormatTemplate(type); + + // If type isn't recognized and not explicitly unset + if (template_format == NULL && type != NANOARROW_TYPE_UNINITIALIZED) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, template_format)); + + // For types with an umabiguous child structure, allocate children + return ArrowSchemaInitChildrenIfNeeded(schema, type); +} + +ArrowErrorCode ArrowSchemaSetTypeStruct(struct ArrowSchema* schema, int64_t n_children) { + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_STRUCT)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, n_children)); + for (int64_t i = 0; i < n_children; i++) { + ArrowSchemaInit(schema->children[i]); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowType type) { + ArrowSchemaInit(schema); + + int result = ArrowSchemaSetType(schema, type); + if (result != NANOARROW_OK) { + ArrowSchemaRelease(schema); + return result; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct ArrowSchema* schema, + enum ArrowType type, int32_t fixed_size) { + if (fixed_size <= 0) { + return EINVAL; + } + + char buffer[64]; + int n_chars; + switch (type) { + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + n_chars = snprintf(buffer, sizeof(buffer), "w:%" PRId32, fixed_size); + break; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + n_chars = snprintf(buffer, sizeof(buffer), "+w:%" PRId32, fixed_size); + break; + default: + return EINVAL; + } + + if (((size_t)n_chars) >= sizeof(buffer) || n_chars < 0) { + return ERANGE; + } + + buffer[n_chars] = '\0'; + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, buffer)); + + if (type == NANOARROW_TYPE_FIXED_SIZE_LIST) { + NANOARROW_RETURN_NOT_OK(ArrowSchemaInitChildrenIfNeeded(schema, type)); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowType type, + int32_t decimal_precision, + int32_t decimal_scale) { + if (decimal_precision <= 0) { + return EINVAL; + } + + char buffer[64]; + int n_chars; + switch (type) { + case NANOARROW_TYPE_DECIMAL128: + n_chars = + snprintf(buffer, sizeof(buffer), "d:%d,%d", decimal_precision, decimal_scale); + break; + case NANOARROW_TYPE_DECIMAL256: + n_chars = snprintf(buffer, sizeof(buffer), "d:%d,%d,256", decimal_precision, + decimal_scale); + break; + default: + return EINVAL; + } + + if (((size_t)n_chars) >= sizeof(buffer) || n_chars < 0) { + return ERANGE; + } + + buffer[n_chars] = '\0'; + return ArrowSchemaSetFormat(schema, buffer); +} + +ArrowErrorCode ArrowSchemaSetTypeRunEndEncoded(struct ArrowSchema* schema, + enum ArrowType run_end_type) { + switch (run_end_type) { + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_INT64: + break; + default: + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat( + schema, ArrowSchemaFormatTemplate(NANOARROW_TYPE_RUN_END_ENCODED))); + NANOARROW_RETURN_NOT_OK( + 
ArrowSchemaInitChildrenIfNeeded(schema, NANOARROW_TYPE_RUN_END_ENCODED)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema->children[0], run_end_type)); + NANOARROW_RETURN_NOT_OK( + ArrowSchemaSetType(schema->children[1], NANOARROW_TYPE_UNINITIALIZED)); + + return NANOARROW_OK; +} + +static const char* ArrowTimeUnitFormatString(enum ArrowTimeUnit time_unit) { + switch (time_unit) { + case NANOARROW_TIME_UNIT_SECOND: + return "s"; + case NANOARROW_TIME_UNIT_MILLI: + return "m"; + case NANOARROW_TIME_UNIT_MICRO: + return "u"; + case NANOARROW_TIME_UNIT_NANO: + return "n"; + default: + return NULL; + } +} + +ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum ArrowType type, + enum ArrowTimeUnit time_unit, + const char* timezone) { + const char* time_unit_str = ArrowTimeUnitFormatString(time_unit); + if (time_unit_str == NULL) { + return EINVAL; + } + + char buffer[128]; + int n_chars; + switch (type) { + case NANOARROW_TYPE_TIME32: + if (timezone != NULL) { + return EINVAL; + } + + switch (time_unit) { + case NANOARROW_TIME_UNIT_MICRO: + case NANOARROW_TIME_UNIT_NANO: + return EINVAL; + default: + break; + } + + n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str); + break; + case NANOARROW_TYPE_TIME64: + if (timezone != NULL) { + return EINVAL; + } + + switch (time_unit) { + case NANOARROW_TIME_UNIT_SECOND: + case NANOARROW_TIME_UNIT_MILLI: + return EINVAL; + default: + break; + } + + n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str); + break; + case NANOARROW_TYPE_TIMESTAMP: + if (timezone == NULL) { + timezone = ""; + } + n_chars = snprintf(buffer, sizeof(buffer), "ts%s:%s", time_unit_str, timezone); + break; + case NANOARROW_TYPE_DURATION: + if (timezone != NULL) { + return EINVAL; + } + n_chars = snprintf(buffer, sizeof(buffer), "tD%s", time_unit_str); + break; + default: + return EINVAL; + } + + if (((size_t)n_chars) >= sizeof(buffer) || n_chars < 0) { + return ERANGE; + } + + buffer[n_chars] = '\0'; + + return ArrowSchemaSetFormat(schema, buffer); +} + +ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowType type, + int64_t n_children) { + if (n_children < 0 || n_children > 127) { + return EINVAL; + } + + // Max valid size would be +ud:0,1,...126 = 401 characters + null terminator + char format_out[512]; + int64_t format_out_size = 512; + memset(format_out, 0, format_out_size); + int n_chars; + char* format_cursor = format_out; + + switch (type) { + case NANOARROW_TYPE_SPARSE_UNION: + n_chars = snprintf(format_cursor, format_out_size, "+us:"); + format_cursor += n_chars; + format_out_size -= n_chars; + break; + case NANOARROW_TYPE_DENSE_UNION: + n_chars = snprintf(format_cursor, format_out_size, "+ud:"); + format_cursor += n_chars; + format_out_size -= n_chars; + break; + default: + return EINVAL; + } + + // Ensure that an encoding error from snprintf() does not result + // in an out-of-bounds access. + if (n_chars < 0) { + return ERANGE; + } + + if (n_children > 0) { + n_chars = snprintf(format_cursor, format_out_size, "0"); + format_cursor += n_chars; + format_out_size -= n_chars; + + for (int64_t i = 1; i < n_children; i++) { + n_chars = snprintf(format_cursor, format_out_size, ",%" PRId64, i); + format_cursor += n_chars; + format_out_size -= n_chars; + } + } + + // Ensure that an encoding error from snprintf() does not result + // in an out-of-bounds access. 
+ if (n_chars < 0) { + return ERANGE; + } + + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, format_out)); + + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, n_children)); + for (int64_t i = 0; i < n_children; i++) { + ArrowSchemaInit(schema->children[i]); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetFormat(struct ArrowSchema* schema, const char* format) { + if (schema->format != NULL) { + ArrowFree((void*)schema->format); + } + + if (format != NULL) { + size_t format_size = strlen(format) + 1; + schema->format = (const char*)ArrowMalloc(format_size); + if (schema->format == NULL) { + return ENOMEM; + } + + memcpy((void*)schema->format, format, format_size); + } else { + schema->format = NULL; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetName(struct ArrowSchema* schema, const char* name) { + if (schema->name != NULL) { + ArrowFree((void*)schema->name); + } + + if (name != NULL) { + size_t name_size = strlen(name) + 1; + schema->name = (const char*)ArrowMalloc(name_size); + if (schema->name == NULL) { + return ENOMEM; + } + + memcpy((void*)schema->name, name, name_size); + } else { + schema->name = NULL; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaSetMetadata(struct ArrowSchema* schema, const char* metadata) { + if (schema->metadata != NULL) { + ArrowFree((void*)schema->metadata); + } + + if (metadata != NULL) { + size_t metadata_size = ArrowMetadataSizeOf(metadata); + schema->metadata = (const char*)ArrowMalloc(metadata_size); + if (schema->metadata == NULL) { + return ENOMEM; + } + + memcpy((void*)schema->metadata, metadata, metadata_size); + } else { + schema->metadata = NULL; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaAllocateChildren(struct ArrowSchema* schema, + int64_t n_children) { + if (schema->children != NULL) { + return EEXIST; + } + + if (n_children > 0) { + schema->children = + (struct ArrowSchema**)ArrowMalloc(n_children * sizeof(struct ArrowSchema*)); + + if (schema->children == NULL) { + return ENOMEM; + } + + schema->n_children = n_children; + + memset(schema->children, 0, n_children * sizeof(struct ArrowSchema*)); + + for (int64_t i = 0; i < n_children; i++) { + schema->children[i] = (struct ArrowSchema*)ArrowMalloc(sizeof(struct ArrowSchema)); + + if (schema->children[i] == NULL) { + return ENOMEM; + } + + schema->children[i]->release = NULL; + } + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaAllocateDictionary(struct ArrowSchema* schema) { + if (schema->dictionary != NULL) { + return EEXIST; + } + + schema->dictionary = (struct ArrowSchema*)ArrowMalloc(sizeof(struct ArrowSchema)); + if (schema->dictionary == NULL) { + return ENOMEM; + } + + schema->dictionary->release = NULL; + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaDeepCopy(const struct ArrowSchema* schema, + struct ArrowSchema* schema_out) { + ArrowSchemaInit(schema_out); + + int result = ArrowSchemaSetFormat(schema_out, schema->format); + if (result != NANOARROW_OK) { + ArrowSchemaRelease(schema_out); + return result; + } + + schema_out->flags = schema->flags; + + result = ArrowSchemaSetName(schema_out, schema->name); + if (result != NANOARROW_OK) { + ArrowSchemaRelease(schema_out); + return result; + } + + result = ArrowSchemaSetMetadata(schema_out, schema->metadata); + if (result != NANOARROW_OK) { + ArrowSchemaRelease(schema_out); + return result; + } + + result = ArrowSchemaAllocateChildren(schema_out, schema->n_children); + if (result != NANOARROW_OK) { + ArrowSchemaRelease(schema_out); + 
return result; + } + + for (int64_t i = 0; i < schema->n_children; i++) { + result = ArrowSchemaDeepCopy(schema->children[i], schema_out->children[i]); + if (result != NANOARROW_OK) { + ArrowSchemaRelease(schema_out); + return result; + } + } + + if (schema->dictionary != NULL) { + result = ArrowSchemaAllocateDictionary(schema_out); + if (result != NANOARROW_OK) { + ArrowSchemaRelease(schema_out); + return result; + } + + result = ArrowSchemaDeepCopy(schema->dictionary, schema_out->dictionary); + if (result != NANOARROW_OK) { + ArrowSchemaRelease(schema_out); + return result; + } + } + + return NANOARROW_OK; +} + +static void ArrowSchemaViewSetPrimitive(struct ArrowSchemaView* schema_view, + enum ArrowType type) { + schema_view->type = type; + schema_view->storage_type = type; +} + +static ArrowErrorCode ArrowSchemaViewParse(struct ArrowSchemaView* schema_view, + const char* format, + const char** format_end_out, + struct ArrowError* error) { + *format_end_out = format; + + // needed for decimal parsing + const char* parse_start; + char* parse_end; + + switch (format[0]) { + case 'n': + schema_view->type = NANOARROW_TYPE_NA; + schema_view->storage_type = NANOARROW_TYPE_NA; + *format_end_out = format + 1; + return NANOARROW_OK; + case 'b': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_BOOL); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'c': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT8); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'C': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT8); + *format_end_out = format + 1; + return NANOARROW_OK; + case 's': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT16); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'S': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT16); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'i': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'I': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT32); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'l': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'L': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT64); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'e': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_HALF_FLOAT); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'f': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_FLOAT); + *format_end_out = format + 1; + return NANOARROW_OK; + case 'g': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DOUBLE); + *format_end_out = format + 1; + return NANOARROW_OK; + + // decimal + case 'd': + if (format[1] != ':' || format[2] == '\0') { + ArrowErrorSet(error, "Expected ':precision,scale[,bitwidth]' following 'd'"); + return EINVAL; + } + + parse_start = format + 2; + schema_view->decimal_precision = (int32_t)strtol(parse_start, &parse_end, 10); + if (parse_end == parse_start || parse_end[0] != ',') { + ArrowErrorSet(error, "Expected 'precision,scale[,bitwidth]' following 'd:'"); + return EINVAL; + } + + parse_start = parse_end + 1; + schema_view->decimal_scale = (int32_t)strtol(parse_start, &parse_end, 10); + if (parse_end == parse_start) { + ArrowErrorSet(error, "Expected 'scale[,bitwidth]' following 'd:precision,'"); + return EINVAL; + } else if 
(parse_end[0] != ',') { + schema_view->decimal_bitwidth = 128; + } else { + parse_start = parse_end + 1; + schema_view->decimal_bitwidth = (int32_t)strtol(parse_start, &parse_end, 10); + if (parse_start == parse_end) { + ArrowErrorSet(error, "Expected precision following 'd:precision,scale,'"); + return EINVAL; + } + } + + *format_end_out = parse_end; + + switch (schema_view->decimal_bitwidth) { + case 128: + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DECIMAL128); + return NANOARROW_OK; + case 256: + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DECIMAL256); + return NANOARROW_OK; + default: + ArrowErrorSet(error, + "Expected decimal bitwidth of 128 or 256 but found %" PRId32, + schema_view->decimal_bitwidth); + return EINVAL; + } + + // validity + data + case 'w': + schema_view->type = NANOARROW_TYPE_FIXED_SIZE_BINARY; + schema_view->storage_type = NANOARROW_TYPE_FIXED_SIZE_BINARY; + if (format[1] != ':' || format[2] == '\0') { + ArrowErrorSet(error, "Expected ':' following 'w'"); + return EINVAL; + } + + schema_view->fixed_size = (int32_t)strtol(format + 2, (char**)format_end_out, 10); + return NANOARROW_OK; + + // validity + offset + data + case 'z': + schema_view->type = NANOARROW_TYPE_BINARY; + schema_view->storage_type = NANOARROW_TYPE_BINARY; + *format_end_out = format + 1; + return NANOARROW_OK; + case 'u': + schema_view->type = NANOARROW_TYPE_STRING; + schema_view->storage_type = NANOARROW_TYPE_STRING; + *format_end_out = format + 1; + return NANOARROW_OK; + + // validity + large_offset + data + case 'Z': + schema_view->type = NANOARROW_TYPE_LARGE_BINARY; + schema_view->storage_type = NANOARROW_TYPE_LARGE_BINARY; + *format_end_out = format + 1; + return NANOARROW_OK; + case 'U': + schema_view->type = NANOARROW_TYPE_LARGE_STRING; + schema_view->storage_type = NANOARROW_TYPE_LARGE_STRING; + *format_end_out = format + 1; + return NANOARROW_OK; + + // nested types + case '+': + switch (format[1]) { + // list has validity + offset or offset + case 'l': + schema_view->storage_type = NANOARROW_TYPE_LIST; + schema_view->type = NANOARROW_TYPE_LIST; + *format_end_out = format + 2; + return NANOARROW_OK; + + // large list has validity + large_offset or large_offset + case 'L': + schema_view->storage_type = NANOARROW_TYPE_LARGE_LIST; + schema_view->type = NANOARROW_TYPE_LARGE_LIST; + *format_end_out = format + 2; + return NANOARROW_OK; + + // run end encoded has no buffer at all + case 'r': + schema_view->storage_type = NANOARROW_TYPE_RUN_END_ENCODED; + schema_view->type = NANOARROW_TYPE_RUN_END_ENCODED; + *format_end_out = format + 2; + return NANOARROW_OK; + + // just validity buffer + case 'w': + if (format[2] != ':' || format[3] == '\0') { + ArrowErrorSet(error, "Expected ':' following '+w'"); + return EINVAL; + } + + schema_view->storage_type = NANOARROW_TYPE_FIXED_SIZE_LIST; + schema_view->type = NANOARROW_TYPE_FIXED_SIZE_LIST; + schema_view->fixed_size = + (int32_t)strtol(format + 3, (char**)format_end_out, 10); + return NANOARROW_OK; + case 's': + schema_view->storage_type = NANOARROW_TYPE_STRUCT; + schema_view->type = NANOARROW_TYPE_STRUCT; + *format_end_out = format + 2; + return NANOARROW_OK; + case 'm': + schema_view->storage_type = NANOARROW_TYPE_MAP; + schema_view->type = NANOARROW_TYPE_MAP; + *format_end_out = format + 2; + return NANOARROW_OK; + + // unions + case 'u': + switch (format[2]) { + case 'd': + schema_view->storage_type = NANOARROW_TYPE_DENSE_UNION; + schema_view->type = NANOARROW_TYPE_DENSE_UNION; + break; + case 's': + 
schema_view->storage_type = NANOARROW_TYPE_SPARSE_UNION; + schema_view->type = NANOARROW_TYPE_SPARSE_UNION; + break; + default: + ArrowErrorSet(error, + "Expected union format string +us: or " + "+ud: but found '%s'", + format); + return EINVAL; + } + + if (format[3] == ':') { + schema_view->union_type_ids = format + 4; + int64_t n_type_ids = + _ArrowParseUnionTypeIds(schema_view->union_type_ids, NULL); + if (n_type_ids != schema_view->schema->n_children) { + ArrowErrorSet(error, + "Expected union type_ids parameter to be a comma-separated " + "list of %" PRId64 " values between 0 and 127 but found '%s'", + schema_view->schema->n_children, schema_view->union_type_ids); + return EINVAL; + } + *format_end_out = format + strlen(format); + return NANOARROW_OK; + } else { + ArrowErrorSet(error, + "Expected union format string +us: or +ud: " + "but found '%s'", + format); + return EINVAL; + } + + default: + ArrowErrorSet(error, "Expected nested type format string but found '%s'", + format); + return EINVAL; + } + + // date/time types + case 't': + switch (format[1]) { + // date + case 'd': + switch (format[2]) { + case 'D': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); + schema_view->type = NANOARROW_TYPE_DATE32; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'm': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_DATE64; + *format_end_out = format + 3; + return NANOARROW_OK; + default: + ArrowErrorSet(error, "Expected 'D' or 'm' following 'td' but found '%s'", + format + 2); + return EINVAL; + } + + // time of day + case 't': + switch (format[2]) { + case 's': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); + schema_view->type = NANOARROW_TYPE_TIME32; + schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'm': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); + schema_view->type = NANOARROW_TYPE_TIME32; + schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'u': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_TIME64; + schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'n': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_TIME64; + schema_view->time_unit = NANOARROW_TIME_UNIT_NANO; + *format_end_out = format + 3; + return NANOARROW_OK; + default: + ArrowErrorSet( + error, "Expected 's', 'm', 'u', or 'n' following 'tt' but found '%s'", + format + 2); + return EINVAL; + } + + // timestamp + case 's': + switch (format[2]) { + case 's': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_TIMESTAMP; + schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND; + break; + case 'm': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_TIMESTAMP; + schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI; + break; + case 'u': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_TIMESTAMP; + schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO; + break; + case 'n': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_TIMESTAMP; + schema_view->time_unit = NANOARROW_TIME_UNIT_NANO; + break; + default: + 
ArrowErrorSet( + error, "Expected 's', 'm', 'u', or 'n' following 'ts' but found '%s'", + format + 2); + return EINVAL; + } + + if (format[3] != ':') { + ArrowErrorSet(error, "Expected ':' following '%.3s' but found '%s'", format, + format + 3); + return EINVAL; + } + + schema_view->timezone = format + 4; + *format_end_out = format + strlen(format); + return NANOARROW_OK; + + // duration + case 'D': + switch (format[2]) { + case 's': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_DURATION; + schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'm': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_DURATION; + schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'u': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_DURATION; + schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO; + *format_end_out = format + 3; + return NANOARROW_OK; + case 'n': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); + schema_view->type = NANOARROW_TYPE_DURATION; + schema_view->time_unit = NANOARROW_TIME_UNIT_NANO; + *format_end_out = format + 3; + return NANOARROW_OK; + default: + ArrowErrorSet(error, + "Expected 's', 'm', u', or 'n' following 'tD' but found '%s'", + format + 2); + return EINVAL; + } + + // interval + case 'i': + switch (format[2]) { + case 'M': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INTERVAL_MONTHS); + *format_end_out = format + 3; + return NANOARROW_OK; + case 'D': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INTERVAL_DAY_TIME); + *format_end_out = format + 3; + return NANOARROW_OK; + case 'n': + ArrowSchemaViewSetPrimitive(schema_view, + NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO); + *format_end_out = format + 3; + return NANOARROW_OK; + default: + ArrowErrorSet(error, + "Expected 'M', 'D', or 'n' following 'ti' but found '%s'", + format + 2); + return EINVAL; + } + + default: + ArrowErrorSet( + error, "Expected 'd', 't', 's', 'D', or 'i' following 't' but found '%s'", + format + 1); + return EINVAL; + } + + // view types + case 'v': { + switch (format[1]) { + case 'u': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_STRING_VIEW); + *format_end_out = format + 2; + return NANOARROW_OK; + case 'z': + ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_BINARY_VIEW); + *format_end_out = format + 2; + return NANOARROW_OK; + default: + ArrowErrorSet(error, "Expected 'u', or 'z' following 'v' but found '%s'", + format + 1); + return EINVAL; + } + } + + default: + ArrowErrorSet(error, "Unknown format: '%s'", format); + return EINVAL; + } +} + +static ArrowErrorCode ArrowSchemaViewValidateNChildren( + struct ArrowSchemaView* schema_view, int64_t n_children, struct ArrowError* error) { + if (n_children != -1 && schema_view->schema->n_children != n_children) { + ArrowErrorSet( + error, "Expected schema with %" PRId64 " children but found %" PRId64 " children", + n_children, schema_view->schema->n_children); + return EINVAL; + } + + // Don't do a full validation of children but do check that they won't + // segfault if inspected + struct ArrowSchema* child; + for (int64_t i = 0; i < schema_view->schema->n_children; i++) { + child = schema_view->schema->children[i]; + if (child == NULL) { + ArrowErrorSet( + error, "Expected valid schema at schema->children[%" PRId64 "] 
but found NULL", + i); + return EINVAL; + } else if (child->release == NULL) { + ArrowErrorSet(error, + "Expected valid schema at schema->children[%" PRId64 + "] but found a released schema", + i); + return EINVAL; + } + } + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowSchemaViewValidateUnion(struct ArrowSchemaView* schema_view, + struct ArrowError* error) { + return ArrowSchemaViewValidateNChildren(schema_view, -1, error); +} + +static ArrowErrorCode ArrowSchemaViewValidateMap(struct ArrowSchemaView* schema_view, + struct ArrowError* error) { + NANOARROW_RETURN_NOT_OK(ArrowSchemaViewValidateNChildren(schema_view, 1, error)); + + if (schema_view->schema->children[0]->n_children != 2) { + ArrowErrorSet(error, + "Expected child of map type to have 2 children but found %" PRId64, + schema_view->schema->children[0]->n_children); + return EINVAL; + } + + if (strcmp(schema_view->schema->children[0]->format, "+s") != 0) { + ArrowErrorSet(error, "Expected format of child of map type to be '+s' but found '%s'", + schema_view->schema->children[0]->format); + return EINVAL; + } + + if (schema_view->schema->children[0]->flags & ARROW_FLAG_NULLABLE) { + ArrowErrorSet(error, + "Expected child of map type to be non-nullable but was nullable"); + return EINVAL; + } + + if (schema_view->schema->children[0]->children[0]->flags & ARROW_FLAG_NULLABLE) { + ArrowErrorSet(error, "Expected key of map type to be non-nullable but was nullable"); + return EINVAL; + } + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowSchemaViewValidateDictionary( + struct ArrowSchemaView* schema_view, struct ArrowError* error) { + // check for valid index type + switch (schema_view->storage_type) { + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_INT64: + break; + default: + ArrowErrorSet( + error, + "Expected dictionary schema index type to be an integral type but found '%s'", + schema_view->schema->format); + return EINVAL; + } + + struct ArrowSchemaView dictionary_schema_view; + return ArrowSchemaViewInit(&dictionary_schema_view, schema_view->schema->dictionary, + error); +} + +static ArrowErrorCode ArrowSchemaViewValidate(struct ArrowSchemaView* schema_view, + enum ArrowType type, + struct ArrowError* error) { + switch (type) { + case NANOARROW_TYPE_NA: + case NANOARROW_TYPE_BOOL: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_HALF_FLOAT: + case NANOARROW_TYPE_FLOAT: + case NANOARROW_TYPE_DOUBLE: + case NANOARROW_TYPE_DECIMAL128: + case NANOARROW_TYPE_DECIMAL256: + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + case NANOARROW_TYPE_DATE32: + case NANOARROW_TYPE_DATE64: + case NANOARROW_TYPE_INTERVAL_MONTHS: + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + case NANOARROW_TYPE_TIMESTAMP: + case NANOARROW_TYPE_TIME32: + case NANOARROW_TYPE_TIME64: + case NANOARROW_TYPE_DURATION: + case NANOARROW_TYPE_BINARY_VIEW: + case NANOARROW_TYPE_STRING_VIEW: + return ArrowSchemaViewValidateNChildren(schema_view, 0, error); + + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + if (schema_view->fixed_size <= 0) { + 
ArrowErrorSet(error, "Expected size > 0 for fixed size binary but found size %d", + schema_view->fixed_size); + return EINVAL; + } + return ArrowSchemaViewValidateNChildren(schema_view, 0, error); + + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + return ArrowSchemaViewValidateNChildren(schema_view, 1, error); + + case NANOARROW_TYPE_RUN_END_ENCODED: + return ArrowSchemaViewValidateNChildren(schema_view, 2, error); + + case NANOARROW_TYPE_STRUCT: + return ArrowSchemaViewValidateNChildren(schema_view, -1, error); + + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_DENSE_UNION: + return ArrowSchemaViewValidateUnion(schema_view, error); + + case NANOARROW_TYPE_MAP: + return ArrowSchemaViewValidateMap(schema_view, error); + + case NANOARROW_TYPE_DICTIONARY: + return ArrowSchemaViewValidateDictionary(schema_view, error); + + default: + ArrowErrorSet(error, "Expected a valid enum ArrowType value but found %d", + schema_view->type); + return EINVAL; + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, + const struct ArrowSchema* schema, + struct ArrowError* error) { + if (schema == NULL) { + ArrowErrorSet(error, "Expected non-NULL schema"); + return EINVAL; + } + + if (schema->release == NULL) { + ArrowErrorSet(error, "Expected non-released schema"); + return EINVAL; + } + + schema_view->schema = schema; + + const char* format = schema->format; + if (format == NULL) { + ArrowErrorSet( + error, + "Error parsing schema->format: Expected a null-terminated string but found NULL"); + return EINVAL; + } + + size_t format_len = strlen(format); + if (format_len == 0) { + ArrowErrorSet(error, "Error parsing schema->format: Expected a string with size > 0"); + return EINVAL; + } + + const char* format_end_out; + int result = ArrowSchemaViewParse(schema_view, format, &format_end_out, error); + + if (result != NANOARROW_OK) { + if (error != NULL) { + char child_error[1024]; + memcpy(child_error, ArrowErrorMessage(error), 1024); + ArrowErrorSet(error, "Error parsing schema->format: %s", child_error); + } + + return result; + } + + if ((format + format_len) != format_end_out) { + ArrowErrorSet(error, "Error parsing schema->format '%s': parsed %d/%zu characters", + format, (int)(format_end_out - format), format_len); + return EINVAL; + } + + if (schema->dictionary != NULL) { + schema_view->type = NANOARROW_TYPE_DICTIONARY; + } + + NANOARROW_RETURN_NOT_OK( + ArrowSchemaViewValidate(schema_view, schema_view->storage_type, error)); + + if (schema_view->storage_type != schema_view->type) { + NANOARROW_RETURN_NOT_OK( + ArrowSchemaViewValidate(schema_view, schema_view->type, error)); + } + + int64_t unknown_flags = schema->flags & ~NANOARROW_FLAG_ALL_SUPPORTED; + if (unknown_flags != 0) { + ArrowErrorSet(error, "Unknown ArrowSchema flag"); + return EINVAL; + } + + if (schema->flags & ARROW_FLAG_DICTIONARY_ORDERED && + schema_view->type != NANOARROW_TYPE_DICTIONARY) { + ArrowErrorSet(error, + "ARROW_FLAG_DICTIONARY_ORDERED is only relevant for dictionaries"); + return EINVAL; + } + + if (schema->flags & ARROW_FLAG_MAP_KEYS_SORTED && + schema_view->type != NANOARROW_TYPE_MAP) { + ArrowErrorSet(error, "ARROW_FLAG_MAP_KEYS_SORTED is only relevant for a map type"); + return EINVAL; + } + + ArrowLayoutInit(&schema_view->layout, schema_view->storage_type); + if (schema_view->storage_type == NANOARROW_TYPE_FIXED_SIZE_BINARY) { + schema_view->layout.element_size_bits[1] = schema_view->fixed_size * 8; + } else if 
(schema_view->storage_type == NANOARROW_TYPE_FIXED_SIZE_LIST) { + schema_view->layout.child_size_elements = schema_view->fixed_size; + } + + schema_view->extension_name = ArrowCharView(NULL); + schema_view->extension_metadata = ArrowCharView(NULL); + NANOARROW_RETURN_NOT_OK(ArrowMetadataGetValue(schema->metadata, + ArrowCharView("ARROW:extension:name"), + &schema_view->extension_name)); + NANOARROW_RETURN_NOT_OK(ArrowMetadataGetValue(schema->metadata, + ArrowCharView("ARROW:extension:metadata"), + &schema_view->extension_metadata)); + + return NANOARROW_OK; +} + +static int64_t ArrowSchemaTypeToStringInternal(struct ArrowSchemaView* schema_view, + char* out, int64_t n) { + const char* type_string = ArrowTypeString(schema_view->type); + switch (schema_view->type) { + case NANOARROW_TYPE_DECIMAL128: + case NANOARROW_TYPE_DECIMAL256: + return snprintf(out, n, "%s(%" PRId32 ", %" PRId32 ")", type_string, + schema_view->decimal_precision, schema_view->decimal_scale); + case NANOARROW_TYPE_TIMESTAMP: + return snprintf(out, n, "%s('%s', '%s')", type_string, + ArrowTimeUnitString(schema_view->time_unit), schema_view->timezone); + case NANOARROW_TYPE_TIME32: + case NANOARROW_TYPE_TIME64: + case NANOARROW_TYPE_DURATION: + return snprintf(out, n, "%s('%s')", type_string, + ArrowTimeUnitString(schema_view->time_unit)); + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + return snprintf(out, n, "%s(%" PRId32 ")", type_string, schema_view->fixed_size); + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_DENSE_UNION: + return snprintf(out, n, "%s([%s])", type_string, schema_view->union_type_ids); + default: + return snprintf(out, n, "%s", type_string); + } +} + +// Helper for bookkeeping to emulate sprintf()-like behaviour spread +// among multiple sprintf calls. +static inline void ArrowToStringLogChars(char** out, int64_t n_chars_last, + int64_t* n_remaining, int64_t* n_chars) { + // In the unlikely snprintf() returning a negative value (encoding error), + // ensure the result won't cause an out-of-bounds access. + if (n_chars_last < 0) { + n_chars_last = 0; + } + + *n_chars += n_chars_last; + *n_remaining -= n_chars_last; + + // n_remaining is never less than 0 + if (*n_remaining < 0) { + *n_remaining = 0; + } + + // Can't do math on a NULL pointer + if (*out != NULL) { + *out += n_chars_last; + } +} + +int64_t ArrowSchemaToString(const struct ArrowSchema* schema, char* out, int64_t n, + char recursive) { + if (schema == NULL) { + return snprintf(out, n, "[invalid: pointer is null]"); + } + + if (schema->release == NULL) { + return snprintf(out, n, "[invalid: schema is released]"); + } + + struct ArrowSchemaView schema_view; + struct ArrowError error; + + if (ArrowSchemaViewInit(&schema_view, schema, &error) != NANOARROW_OK) { + return snprintf(out, n, "[invalid: %s]", ArrowErrorMessage(&error)); + } + + // Extension type and dictionary should include both the top-level type + // and the storage type. 
+ int is_extension = schema_view.extension_name.size_bytes > 0; + int is_dictionary = schema->dictionary != NULL; + int64_t n_chars = 0; + int64_t n_chars_last = 0; + + // Uncommon but not technically impossible that both are true + if (is_extension && is_dictionary) { + n_chars_last = snprintf( + out, n, "%.*s{dictionary(%s)<", (int)schema_view.extension_name.size_bytes, + schema_view.extension_name.data, ArrowTypeString(schema_view.storage_type)); + } else if (is_extension) { + n_chars_last = snprintf(out, n, "%.*s{", (int)schema_view.extension_name.size_bytes, + schema_view.extension_name.data); + } else if (is_dictionary) { + n_chars_last = + snprintf(out, n, "dictionary(%s)<", ArrowTypeString(schema_view.storage_type)); + } + + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + + if (!is_dictionary) { + n_chars_last = ArrowSchemaTypeToStringInternal(&schema_view, out, n); + } else { + n_chars_last = ArrowSchemaToString(schema->dictionary, out, n, recursive); + } + + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + + if (recursive && schema->format[0] == '+') { + n_chars_last = snprintf(out, n, "<"); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + + for (int64_t i = 0; i < schema->n_children; i++) { + if (i > 0) { + n_chars_last = snprintf(out, n, ", "); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + } + + // ArrowSchemaToStringInternal() will validate the child and print the error, + // but we need the name first + if (schema->children[i] != NULL && schema->children[i]->release != NULL && + schema->children[i]->name != NULL) { + n_chars_last = snprintf(out, n, "%s: ", schema->children[i]->name); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + } + + n_chars_last = ArrowSchemaToString(schema->children[i], out, n, recursive); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + } + + n_chars_last = snprintf(out, n, ">"); + ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); + } + + if (is_extension && is_dictionary) { + n_chars += snprintf(out, n, ">}"); + } else if (is_extension) { + n_chars += snprintf(out, n, "}"); + } else if (is_dictionary) { + n_chars += snprintf(out, n, ">"); + } + + // Ensure that we always return a positive result + if (n_chars > 0) { + return n_chars; + } else { + return 0; + } +} + +ArrowErrorCode ArrowMetadataReaderInit(struct ArrowMetadataReader* reader, + const char* metadata) { + reader->metadata = metadata; + + if (reader->metadata == NULL) { + reader->offset = 0; + reader->remaining_keys = 0; + } else { + memcpy(&reader->remaining_keys, reader->metadata, sizeof(int32_t)); + reader->offset = sizeof(int32_t); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowMetadataReaderRead(struct ArrowMetadataReader* reader, + struct ArrowStringView* key_out, + struct ArrowStringView* value_out) { + if (reader->remaining_keys <= 0) { + return EINVAL; + } + + int64_t pos = 0; + + int32_t key_size; + memcpy(&key_size, reader->metadata + reader->offset + pos, sizeof(int32_t)); + pos += sizeof(int32_t); + + key_out->data = reader->metadata + reader->offset + pos; + key_out->size_bytes = key_size; + pos += key_size; + + int32_t value_size; + memcpy(&value_size, reader->metadata + reader->offset + pos, sizeof(int32_t)); + pos += sizeof(int32_t); + + value_out->data = reader->metadata + reader->offset + pos; + value_out->size_bytes = value_size; + pos += value_size; + + reader->offset += pos; + reader->remaining_keys--; + return NANOARROW_OK; +} + +int64_t ArrowMetadataSizeOf(const char* metadata) { + 
if (metadata == NULL) { + return 0; + } + + struct ArrowMetadataReader reader; + struct ArrowStringView key; + struct ArrowStringView value; + if (ArrowMetadataReaderInit(&reader, metadata) != NANOARROW_OK) { + return 0; + } + + int64_t size = sizeof(int32_t); + while (ArrowMetadataReaderRead(&reader, &key, &value) == NANOARROW_OK) { + size += sizeof(int32_t) + key.size_bytes + sizeof(int32_t) + value.size_bytes; + } + + return size; +} + +static ArrowErrorCode ArrowMetadataGetValueInternal(const char* metadata, + struct ArrowStringView* key, + struct ArrowStringView* value_out) { + struct ArrowMetadataReader reader; + struct ArrowStringView existing_key; + struct ArrowStringView existing_value; + NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, metadata)); + + while (ArrowMetadataReaderRead(&reader, &existing_key, &existing_value) == + NANOARROW_OK) { + int key_equal = key->size_bytes == existing_key.size_bytes && + strncmp(key->data, existing_key.data, existing_key.size_bytes) == 0; + if (key_equal) { + value_out->data = existing_value.data; + value_out->size_bytes = existing_value.size_bytes; + break; + } + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringView key, + struct ArrowStringView* value_out) { + if (value_out == NULL) { + return EINVAL; + } + + return ArrowMetadataGetValueInternal(metadata, &key, value_out); +} + +char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key) { + struct ArrowStringView value = ArrowCharView(NULL); + if (ArrowMetadataGetValue(metadata, key, &value) != NANOARROW_OK) { + return 0; + } + + return value.data != NULL; +} + +ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer, + const char* metadata) { + ArrowBufferInit(buffer); + return ArrowBufferAppend(buffer, metadata, ArrowMetadataSizeOf(metadata)); +} + +static ArrowErrorCode ArrowMetadataBuilderAppendInternal(struct ArrowBuffer* buffer, + struct ArrowStringView* key, + struct ArrowStringView* value) { + if (value == NULL) { + return NANOARROW_OK; + } + + if (buffer->capacity_bytes == 0) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(buffer, 0)); + } + + if (((size_t)buffer->capacity_bytes) < sizeof(int32_t)) { + return EINVAL; + } + + int32_t n_keys; + memcpy(&n_keys, buffer->data, sizeof(int32_t)); + + int32_t key_size = (int32_t)key->size_bytes; + int32_t value_size = (int32_t)value->size_bytes; + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve( + buffer, sizeof(int32_t) + key_size + sizeof(int32_t) + value_size)); + + ArrowBufferAppendUnsafe(buffer, &key_size, sizeof(int32_t)); + ArrowBufferAppendUnsafe(buffer, key->data, key_size); + ArrowBufferAppendUnsafe(buffer, &value_size, sizeof(int32_t)); + ArrowBufferAppendUnsafe(buffer, value->data, value_size); + + n_keys++; + memcpy(buffer->data, &n_keys, sizeof(int32_t)); + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowMetadataBuilderSetInternal(struct ArrowBuffer* buffer, + struct ArrowStringView* key, + struct ArrowStringView* value) { + // Inspect the current value to see if we can avoid copying the buffer + struct ArrowStringView current_value = ArrowCharView(NULL); + NANOARROW_RETURN_NOT_OK( + ArrowMetadataGetValueInternal((const char*)buffer->data, key, ¤t_value)); + + // The key should be removed but no key exists + if (value == NULL && current_value.data == NULL) { + return NANOARROW_OK; + } + + // The key/value can be appended because no key exists + if (value != NULL && current_value.data == NULL) { + return 
ArrowMetadataBuilderAppendInternal(buffer, key, value); + } + + struct ArrowMetadataReader reader; + struct ArrowStringView existing_key; + struct ArrowStringView existing_value; + NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, (const char*)buffer->data)); + + struct ArrowBuffer new_buffer; + NANOARROW_RETURN_NOT_OK(ArrowMetadataBuilderInit(&new_buffer, NULL)); + + while (reader.remaining_keys > 0) { + int result = ArrowMetadataReaderRead(&reader, &existing_key, &existing_value); + if (result != NANOARROW_OK) { + ArrowBufferReset(&new_buffer); + return result; + } + + if (key->size_bytes == existing_key.size_bytes && + strncmp((const char*)key->data, (const char*)existing_key.data, + existing_key.size_bytes) == 0) { + result = ArrowMetadataBuilderAppendInternal(&new_buffer, key, value); + value = NULL; + } else { + result = + ArrowMetadataBuilderAppendInternal(&new_buffer, &existing_key, &existing_value); + } + + if (result != NANOARROW_OK) { + ArrowBufferReset(&new_buffer); + return result; + } + } + + ArrowBufferReset(buffer); + ArrowBufferMove(&new_buffer, buffer); + return NANOARROW_OK; +} + +ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer, + struct ArrowStringView key, + struct ArrowStringView value) { + return ArrowMetadataBuilderAppendInternal(buffer, &key, &value); +} + +ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer, + struct ArrowStringView key, + struct ArrowStringView value) { + return ArrowMetadataBuilderSetInternal(buffer, &key, &value); +} + +ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, + struct ArrowStringView key) { + return ArrowMetadataBuilderSetInternal(buffer, &key, NULL); +} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include "nanoarrow.h" + +static void ArrowArrayReleaseInternal(struct ArrowArray* array) { + // Release buffers held by this array + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + if (private_data != NULL) { + ArrowBitmapReset(&private_data->bitmap); + ArrowBufferReset(&private_data->buffers[0]); + ArrowBufferReset(&private_data->buffers[1]); + ArrowFree(private_data->buffer_data); + for (int32_t i = 0; i < private_data->n_variadic_buffers; ++i) { + ArrowBufferReset(&private_data->variadic_buffers[i]); + } + ArrowFree(private_data->variadic_buffers); + ArrowFree(private_data->variadic_buffer_sizes); + ArrowFree(private_data); + } + + // This object owns the memory for all the children, but those + // children may have been generated elsewhere and might have + // their own release() callback. 
+ if (array->children != NULL) { + for (int64_t i = 0; i < array->n_children; i++) { + if (array->children[i] != NULL) { + if (array->children[i]->release != NULL) { + ArrowArrayRelease(array->children[i]); + } + + ArrowFree(array->children[i]); + } + } + + ArrowFree(array->children); + } + + // This object owns the memory for the dictionary but it + // may have been generated somewhere else and have its own + // release() callback. + if (array->dictionary != NULL) { + if (array->dictionary->release != NULL) { + ArrowArrayRelease(array->dictionary); + } + + ArrowFree(array->dictionary); + } + + // Mark released + array->release = NULL; +} + +static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array, + enum ArrowType storage_type) { + switch (storage_type) { + case NANOARROW_TYPE_UNINITIALIZED: + case NANOARROW_TYPE_NA: + case NANOARROW_TYPE_RUN_END_ENCODED: + array->n_buffers = 0; + break; + + case NANOARROW_TYPE_FIXED_SIZE_LIST: + case NANOARROW_TYPE_STRUCT: + case NANOARROW_TYPE_SPARSE_UNION: + array->n_buffers = 1; + break; + + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_MAP: + case NANOARROW_TYPE_BOOL: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_HALF_FLOAT: + case NANOARROW_TYPE_FLOAT: + case NANOARROW_TYPE_DOUBLE: + case NANOARROW_TYPE_DECIMAL128: + case NANOARROW_TYPE_DECIMAL256: + case NANOARROW_TYPE_INTERVAL_MONTHS: + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + case NANOARROW_TYPE_DENSE_UNION: + array->n_buffers = 2; + break; + case NANOARROW_TYPE_BINARY_VIEW: + case NANOARROW_TYPE_STRING_VIEW: + array->n_buffers = NANOARROW_BINARY_VIEW_FIXED_BUFFERS + 1; + break; + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + array->n_buffers = 3; + break; + + default: + return EINVAL; + + return NANOARROW_OK; + } + + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + private_data->storage_type = storage_type; + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, + enum ArrowType storage_type) { + array->length = 0; + array->null_count = 0; + array->offset = 0; + array->n_buffers = 0; + array->n_children = 0; + array->buffers = NULL; + array->children = NULL; + array->dictionary = NULL; + array->release = &ArrowArrayReleaseInternal; + array->private_data = NULL; + + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)ArrowMalloc(sizeof(struct ArrowArrayPrivateData)); + if (private_data == NULL) { + array->release = NULL; + return ENOMEM; + } + + ArrowBitmapInit(&private_data->bitmap); + ArrowBufferInit(&private_data->buffers[0]); + ArrowBufferInit(&private_data->buffers[1]); + private_data->buffer_data = + (const void**)ArrowMalloc(sizeof(void*) * NANOARROW_MAX_FIXED_BUFFERS); + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; ++i) { + private_data->buffer_data[i] = NULL; + } + private_data->n_variadic_buffers = 0; + private_data->variadic_buffers = NULL; + private_data->variadic_buffer_sizes = NULL; + + array->private_data = private_data; + array->buffers = (const void**)(private_data->buffer_data); + + // These are not technically "storage" in the sense 
that they do not appear + // in the ArrowSchemaView's storage_type member; however, allowing them here + // is helpful to maximize the number of types that can avoid going through + // ArrowArrayInitFromSchema(). + switch (storage_type) { + case NANOARROW_TYPE_DURATION: + case NANOARROW_TYPE_TIMESTAMP: + case NANOARROW_TYPE_TIME64: + case NANOARROW_TYPE_DATE64: + storage_type = NANOARROW_TYPE_INT64; + break; + case NANOARROW_TYPE_TIME32: + case NANOARROW_TYPE_DATE32: + storage_type = NANOARROW_TYPE_INT32; + break; + default: + break; + } + + int result = ArrowArraySetStorageType(array, storage_type); + if (result != NANOARROW_OK) { + ArrowArrayRelease(array); + return result; + } + + ArrowLayoutInit(&private_data->layout, storage_type); + // We can only know this not to be true when initializing based on a schema + // so assume this to be true. + private_data->union_type_id_is_child_index = 1; + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, + const struct ArrowArrayView* array_view, + struct ArrowError* error) { + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowArrayInitFromType(array, array_view->storage_type), error); + int result; + + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + private_data->layout = array_view->layout; + + if (array_view->n_children > 0) { + result = ArrowArrayAllocateChildren(array, array_view->n_children); + if (result != NANOARROW_OK) { + ArrowArrayRelease(array); + return result; + } + + for (int64_t i = 0; i < array_view->n_children; i++) { + result = + ArrowArrayInitFromArrayView(array->children[i], array_view->children[i], error); + if (result != NANOARROW_OK) { + ArrowArrayRelease(array); + return result; + } + } + } + + if (array_view->dictionary != NULL) { + result = ArrowArrayAllocateDictionary(array); + if (result != NANOARROW_OK) { + ArrowArrayRelease(array); + return result; + } + + result = + ArrowArrayInitFromArrayView(array->dictionary, array_view->dictionary, error); + if (result != NANOARROW_OK) { + ArrowArrayRelease(array); + return result; + } + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array, + const struct ArrowSchema* schema, + struct ArrowError* error) { + struct ArrowArrayView array_view; + NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromSchema(&array_view, schema, error)); + NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromArrayView(array, &array_view, error)); + if (array_view.storage_type == NANOARROW_TYPE_DENSE_UNION || + array_view.storage_type == NANOARROW_TYPE_SPARSE_UNION) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + // We can still build arrays if this isn't true; however, the append + // functions won't work. Instead, we store this value and error only + // when StartAppending is called. 
+ private_data->union_type_id_is_child_index = + _ArrowUnionTypeIdsWillEqualChildIndices(schema->format + 4, schema->n_children); + } + + ArrowArrayViewReset(&array_view); + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayAllocateChildren(struct ArrowArray* array, int64_t n_children) { + if (array->children != NULL) { + return EINVAL; + } + + if (n_children == 0) { + return NANOARROW_OK; + } + + array->children = + (struct ArrowArray**)ArrowMalloc(n_children * sizeof(struct ArrowArray*)); + if (array->children == NULL) { + return ENOMEM; + } + + memset(array->children, 0, n_children * sizeof(struct ArrowArray*)); + + for (int64_t i = 0; i < n_children; i++) { + array->children[i] = (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray)); + if (array->children[i] == NULL) { + return ENOMEM; + } + array->children[i]->release = NULL; + } + + array->n_children = n_children; + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayAllocateDictionary(struct ArrowArray* array) { + if (array->dictionary != NULL) { + return EINVAL; + } + + array->dictionary = (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray)); + if (array->dictionary == NULL) { + return ENOMEM; + } + + array->dictionary->release = NULL; + return NANOARROW_OK; +} + +void ArrowArraySetValidityBitmap(struct ArrowArray* array, struct ArrowBitmap* bitmap) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + ArrowBufferMove(&bitmap->buffer, &private_data->bitmap.buffer); + private_data->bitmap.size_bits = bitmap->size_bits; + bitmap->size_bits = 0; + private_data->buffer_data[0] = private_data->bitmap.buffer.data; + array->null_count = -1; +} + +ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* array, int64_t i, + struct ArrowBuffer* buffer) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + switch (i) { + case 0: + ArrowBufferMove(buffer, &private_data->bitmap.buffer); + private_data->buffer_data[i] = private_data->bitmap.buffer.data; + break; + case 1: + case 2: + ArrowBufferMove(buffer, &private_data->buffers[i - 1]); + private_data->buffer_data[i] = private_data->buffers[i - 1].data; + break; + default: + return EINVAL; + } + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowArrayViewInitFromArray(struct ArrowArrayView* array_view, + struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + ArrowArrayViewInitFromType(array_view, private_data->storage_type); + array_view->layout = private_data->layout; + array_view->array = array; + array_view->length = array->length; + array_view->offset = array->offset; + array_view->null_count = array->null_count; + + array_view->buffer_views[0].data.as_uint8 = private_data->bitmap.buffer.data; + array_view->buffer_views[0].size_bytes = private_data->bitmap.buffer.size_bytes; + array_view->buffer_views[1].data.as_uint8 = private_data->buffers[0].data; + array_view->buffer_views[1].size_bytes = private_data->buffers[0].size_bytes; + array_view->buffer_views[2].data.as_uint8 = private_data->buffers[1].data; + array_view->buffer_views[2].size_bytes = private_data->buffers[1].size_bytes; + + int result = ArrowArrayViewAllocateChildren(array_view, array->n_children); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + + for (int64_t i = 0; i < array->n_children; i++) { + result = ArrowArrayViewInitFromArray(array_view->children[i], array->children[i]); + if (result 
!= NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + + if (array->dictionary != NULL) { + result = ArrowArrayViewAllocateDictionary(array_view); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + + result = ArrowArrayViewInitFromArray(array_view->dictionary, array->dictionary); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowArrayReserveInternal(struct ArrowArray* array, + struct ArrowArrayView* array_view) { + // Loop through buffers and reserve the extra space that we know about + for (int64_t i = 0; i < array->n_buffers; i++) { + // Don't reserve on a validity buffer that hasn't been allocated yet + if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY && + ArrowArrayBuffer(array, i)->data == NULL) { + continue; + } + + int64_t additional_size_bytes = + array_view->buffer_views[i].size_bytes - ArrowArrayBuffer(array, i)->size_bytes; + + if (additional_size_bytes > 0) { + NANOARROW_RETURN_NOT_OK( + ArrowBufferReserve(ArrowArrayBuffer(array, i), additional_size_bytes)); + } + } + + // Recursively reserve children + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayReserveInternal(array->children[i], array_view->children[i])); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array, + int64_t additional_size_elements) { + struct ArrowArrayView array_view; + NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromArray(&array_view, array)); + + // Calculate theoretical buffer sizes (recursively) + ArrowArrayViewSetLength(&array_view, array->length + additional_size_elements); + + // Walk the structure (recursively) + int result = ArrowArrayReserveInternal(array, &array_view); + ArrowArrayViewReset(&array_view); + if (result != NANOARROW_OK) { + return result; + } + + return NANOARROW_OK; +} + +static ArrowErrorCode ArrowArrayFinalizeBuffers(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY || + private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) { + continue; + } + + struct ArrowBuffer* buffer = ArrowArrayBuffer(array, i); + if (buffer->data == NULL) { + NANOARROW_RETURN_NOT_OK((ArrowBufferReserve(buffer, 1))); + } + } + + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->children[i])); + } + + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->dictionary)); + } + + return NANOARROW_OK; +} + +static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + const bool is_binary_view = private_data->storage_type == NANOARROW_TYPE_STRING_VIEW || + private_data->storage_type == NANOARROW_TYPE_BINARY_VIEW; + const int32_t nfixed_buf = is_binary_view ? 
2 : NANOARROW_MAX_FIXED_BUFFERS; + + for (int32_t i = 0; i < nfixed_buf; i++) { + private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data; + } + + if (is_binary_view) { + const int32_t nvirt_buf = private_data->n_variadic_buffers; + private_data->buffer_data = (const void**)ArrowRealloc( + private_data->buffer_data, sizeof(void*) * (nfixed_buf + nvirt_buf + 1)); + for (int32_t i = 0; i < nvirt_buf; i++) { + private_data->buffer_data[nfixed_buf + i] = private_data->variadic_buffers[i].data; + } + private_data->buffer_data[nfixed_buf + nvirt_buf] = + private_data->variadic_buffer_sizes; + array->buffers = (const void**)(private_data->buffer_data); + } + + for (int64_t i = 0; i < array->n_children; i++) { + ArrowArrayFlushInternalPointers(array->children[i]); + } + + if (array->dictionary != NULL) { + ArrowArrayFlushInternalPointers(array->dictionary); + } +} + +ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, + enum ArrowValidationLevel validation_level, + struct ArrowError* error) { + // Even if the data buffer is size zero, the pointer value needed to be non-null + // in some implementations (at least one version of Arrow C++ at the time this + // was added and C# as later discovered). Only do this fix if we can assume + // CPU data access. + if (validation_level >= NANOARROW_VALIDATION_LEVEL_DEFAULT) { + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayFinalizeBuffers(array), error); + } + + // Make sure the value we get with array->buffers[i] is set to the actual + // pointer (which may have changed from the original due to reallocation) + ArrowArrayFlushInternalPointers(array); + + if (validation_level == NANOARROW_VALIDATION_LEVEL_NONE) { + return NANOARROW_OK; + } + + // For validation, initialize an ArrowArrayView with our known buffer sizes + struct ArrowArrayView array_view; + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayViewInitFromArray(&array_view, array), + error); + int result = ArrowArrayViewValidate(&array_view, validation_level, error); + ArrowArrayViewReset(&array_view); + return result; +} + +ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array, + struct ArrowError* error) { + return ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_DEFAULT, error); +} + +void ArrowArrayViewInitFromType(struct ArrowArrayView* array_view, + enum ArrowType storage_type) { + memset(array_view, 0, sizeof(struct ArrowArrayView)); + array_view->storage_type = storage_type; + ArrowLayoutInit(&array_view->layout, storage_type); +} + +ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view, + int64_t n_children) { + if (array_view->children != NULL) { + return EINVAL; + } + + if (n_children == 0) { + array_view->n_children = 0; + return NANOARROW_OK; + } + + array_view->children = + (struct ArrowArrayView**)ArrowMalloc(n_children * sizeof(struct ArrowArrayView*)); + if (array_view->children == NULL) { + return ENOMEM; + } + + for (int64_t i = 0; i < n_children; i++) { + array_view->children[i] = NULL; + } + + array_view->n_children = n_children; + + for (int64_t i = 0; i < n_children; i++) { + array_view->children[i] = + (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView)); + if (array_view->children[i] == NULL) { + return ENOMEM; + } + ArrowArrayViewInitFromType(array_view->children[i], NANOARROW_TYPE_UNINITIALIZED); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view) { + if (array_view->dictionary != NULL) { + return EINVAL; + } + + 
array_view->dictionary = + (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView)); + if (array_view->dictionary == NULL) { + return ENOMEM; + } + + ArrowArrayViewInitFromType(array_view->dictionary, NANOARROW_TYPE_UNINITIALIZED); + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, + const struct ArrowSchema* schema, + struct ArrowError* error) { + struct ArrowSchemaView schema_view; + int result = ArrowSchemaViewInit(&schema_view, schema, error); + if (result != NANOARROW_OK) { + return result; + } + + ArrowArrayViewInitFromType(array_view, schema_view.storage_type); + array_view->layout = schema_view.layout; + + result = ArrowArrayViewAllocateChildren(array_view, schema->n_children); + if (result != NANOARROW_OK) { + ArrowErrorSet(error, "ArrowArrayViewAllocateChildren() failed"); + ArrowArrayViewReset(array_view); + return result; + } + + for (int64_t i = 0; i < schema->n_children; i++) { + result = + ArrowArrayViewInitFromSchema(array_view->children[i], schema->children[i], error); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + + if (schema->dictionary != NULL) { + result = ArrowArrayViewAllocateDictionary(array_view); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + + result = + ArrowArrayViewInitFromSchema(array_view->dictionary, schema->dictionary, error); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(array_view); + return result; + } + } + + if (array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION || + array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) { + array_view->union_type_id_map = (int8_t*)ArrowMalloc(256 * sizeof(int8_t)); + if (array_view->union_type_id_map == NULL) { + return ENOMEM; + } + + memset(array_view->union_type_id_map, -1, 256); + int32_t n_type_ids = _ArrowParseUnionTypeIds(schema_view.union_type_ids, + array_view->union_type_id_map + 128); + for (int8_t child_index = 0; child_index < n_type_ids; child_index++) { + int8_t type_id = array_view->union_type_id_map[128 + child_index]; + array_view->union_type_id_map[type_id] = child_index; + } + } + + return NANOARROW_OK; +} + +void ArrowArrayViewReset(struct ArrowArrayView* array_view) { + if (array_view->children != NULL) { + for (int64_t i = 0; i < array_view->n_children; i++) { + if (array_view->children[i] != NULL) { + ArrowArrayViewReset(array_view->children[i]); + ArrowFree(array_view->children[i]); + } + } + + ArrowFree(array_view->children); + } + + if (array_view->dictionary != NULL) { + ArrowArrayViewReset(array_view->dictionary); + ArrowFree(array_view->dictionary); + } + + if (array_view->union_type_id_map != NULL) { + ArrowFree(array_view->union_type_id_map); + } + + ArrowArrayViewInitFromType(array_view, NANOARROW_TYPE_UNINITIALIZED); +} + +void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; + + switch (array_view->layout.buffer_type[i]) { + case NANOARROW_BUFFER_TYPE_VALIDITY: + array_view->buffer_views[i].size_bytes = _ArrowBytesForBits(length); + continue; + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: + // Probably don't want/need to rely on the producer to have allocated an + // offsets buffer of length 1 for a zero-size array + array_view->buffer_views[i].size_bytes = + (length != 0) * element_size_bytes * (length + 1); + continue; + case NANOARROW_BUFFER_TYPE_DATA: + 
array_view->buffer_views[i].size_bytes = + _ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] * length) / + 8; + continue; + case NANOARROW_BUFFER_TYPE_TYPE_ID: + case NANOARROW_BUFFER_TYPE_UNION_OFFSET: + array_view->buffer_views[i].size_bytes = element_size_bytes * length; + continue; + case NANOARROW_BUFFER_TYPE_VARIADIC_DATA: + case NANOARROW_BUFFER_TYPE_VARIADIC_SIZE: + case NANOARROW_BUFFER_TYPE_NONE: + array_view->buffer_views[i].size_bytes = 0; + continue; + } + } + + switch (array_view->storage_type) { + case NANOARROW_TYPE_STRUCT: + case NANOARROW_TYPE_SPARSE_UNION: + for (int64_t i = 0; i < array_view->n_children; i++) { + ArrowArrayViewSetLength(array_view->children[i], length); + } + break; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + if (array_view->n_children >= 1) { + ArrowArrayViewSetLength(array_view->children[0], + length * array_view->layout.child_size_elements); + } + default: + break; + } +} + +// This version recursively extracts information from the array and stores it +// in the array view, performing any checks that require the original array. +static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error) { + array_view->array = array; + array_view->offset = array->offset; + array_view->length = array->length; + array_view->null_count = array->null_count; + array_view->variadic_buffer_sizes = NULL; + array_view->variadic_buffers = NULL; + array_view->n_variadic_buffers = 0; + + int64_t buffers_required = 0; + const int nfixed_buf = array_view->storage_type == NANOARROW_TYPE_STRING_VIEW || + array_view->storage_type == NANOARROW_TYPE_BINARY_VIEW + ? NANOARROW_BINARY_VIEW_FIXED_BUFFERS + : NANOARROW_MAX_FIXED_BUFFERS; + for (int i = 0; i < nfixed_buf; i++) { + if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) { + break; + } + + buffers_required++; + + // Set buffer pointer + array_view->buffer_views[i].data.data = array->buffers[i]; + + // If non-null, set buffer size to unknown. 
+ if (array->buffers[i] == NULL) { + array_view->buffer_views[i].size_bytes = 0; + } else { + array_view->buffer_views[i].size_bytes = -1; + } + } + + if (array_view->storage_type == NANOARROW_TYPE_STRING_VIEW || + array_view->storage_type == NANOARROW_TYPE_BINARY_VIEW) { + const int64_t n_buffers = array->n_buffers; + const int32_t nfixed_buf = NANOARROW_BINARY_VIEW_FIXED_BUFFERS; + + const int32_t nvariadic_buf = (int32_t)(n_buffers - nfixed_buf - 1); + array_view->n_variadic_buffers = nvariadic_buf; + buffers_required += nvariadic_buf + 1; + array_view->variadic_buffers = array->buffers + NANOARROW_BINARY_VIEW_FIXED_BUFFERS; + array_view->variadic_buffer_sizes = (int64_t*)array->buffers[n_buffers - 1]; + } + + if (buffers_required != array->n_buffers) { + ArrowErrorSet(error, + "Expected array with %" PRId64 " buffer(s) but found %" PRId64 + " buffer(s)", + buffers_required, array->n_buffers); + return EINVAL; + } + + // Check number of children + if (array_view->n_children != array->n_children) { + ArrowErrorSet(error, "Expected %" PRId64 " children but found %" PRId64 " children", + array_view->n_children, array->n_children); + return EINVAL; + } + + // Recurse for children + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view->children[i], + array->children[i], error)); + } + + // Check dictionary + if (array->dictionary == NULL && array_view->dictionary != NULL) { + ArrowErrorSet(error, "Expected dictionary but found NULL"); + return EINVAL; + } + + if (array->dictionary != NULL && array_view->dictionary == NULL) { + ArrowErrorSet(error, "Expected NULL dictionary but found dictionary member"); + return EINVAL; + } + + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewSetArrayInternal(array_view->dictionary, array->dictionary, error)); + } + + return NANOARROW_OK; +} + +static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view, + struct ArrowError* error) { + if (array_view->length < 0) { + ArrowErrorSet(error, "Expected length >= 0 but found length %" PRId64, + array_view->length); + return EINVAL; + } + + if (array_view->offset < 0) { + ArrowErrorSet(error, "Expected offset >= 0 but found offset %" PRId64, + array_view->offset); + return EINVAL; + } + + // Ensure that offset + length fits within an int64 before a possible overflow + if ((uint64_t)array_view->offset + (uint64_t)array_view->length > (uint64_t)INT64_MAX) { + ArrowErrorSet(error, "Offset + length is > INT64_MAX"); + return EINVAL; + } + + // Calculate buffer sizes that do not require buffer access. If marked as + // unknown, assign the buffer size; otherwise, validate it. + int64_t offset_plus_length = array_view->offset + array_view->length; + + // Only loop over the first two buffers because the size of the third buffer + // is always data dependent for all current Arrow types. 
+ for (int i = 0; i < 2; i++) { + int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; + // Initialize with a value that will cause an error if accidentally used uninitialized + // Need to suppress the clang-tidy warning because gcc warns for possible use + int64_t min_buffer_size_bytes = // NOLINT(clang-analyzer-deadcode.DeadStores) + array_view->buffer_views[i].size_bytes + 1; + + switch (array_view->layout.buffer_type[i]) { + case NANOARROW_BUFFER_TYPE_VALIDITY: + if (array_view->null_count == 0 && array_view->buffer_views[i].size_bytes == 0) { + continue; + } + + min_buffer_size_bytes = _ArrowBytesForBits(offset_plus_length); + break; + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: + // Probably don't want/need to rely on the producer to have allocated an + // offsets buffer of length 1 for a zero-size array + min_buffer_size_bytes = + (offset_plus_length != 0) * element_size_bytes * (offset_plus_length + 1); + break; + case NANOARROW_BUFFER_TYPE_DATA: + min_buffer_size_bytes = + _ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] * + offset_plus_length) / + 8; + break; + case NANOARROW_BUFFER_TYPE_TYPE_ID: + case NANOARROW_BUFFER_TYPE_UNION_OFFSET: + min_buffer_size_bytes = element_size_bytes * offset_plus_length; + break; + case NANOARROW_BUFFER_TYPE_VARIADIC_DATA: + case NANOARROW_BUFFER_TYPE_VARIADIC_SIZE: + case NANOARROW_BUFFER_TYPE_NONE: + continue; + } + + // Assign or validate buffer size + if (array_view->buffer_views[i].size_bytes == -1) { + array_view->buffer_views[i].size_bytes = min_buffer_size_bytes; + } else if (array_view->buffer_views[i].size_bytes < min_buffer_size_bytes) { + ArrowErrorSet(error, + "Expected %s array buffer %d to have size >= %" PRId64 + " bytes but found " + "buffer with %" PRId64 " bytes", + ArrowTypeString(array_view->storage_type), i, min_buffer_size_bytes, + array_view->buffer_views[i].size_bytes); + return EINVAL; + } + } + + // For list, fixed-size list and map views, we can validate the number of children + switch (array_view->storage_type) { + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + case NANOARROW_TYPE_MAP: + if (array_view->n_children != 1) { + ArrowErrorSet(error, + "Expected 1 child of %s array but found %" PRId64 " child arrays", + ArrowTypeString(array_view->storage_type), array_view->n_children); + return EINVAL; + } + break; + case NANOARROW_TYPE_RUN_END_ENCODED: + if (array_view->n_children != 2) { + ArrowErrorSet( + error, "Expected 2 children for %s array but found %" PRId64 " child arrays", + ArrowTypeString(array_view->storage_type), array_view->n_children); + return EINVAL; + } + break; + default: + break; + } + + // For struct, the sparse union, and the fixed-size list views, we can validate child + // lengths. 
+ int64_t child_min_length; + switch (array_view->storage_type) { + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_STRUCT: + child_min_length = (array_view->offset + array_view->length); + for (int64_t i = 0; i < array_view->n_children; i++) { + if (array_view->children[i]->length < child_min_length) { + ArrowErrorSet(error, + "Expected struct child %" PRId64 " to have length >= %" PRId64 + " but found child with " + "length %" PRId64, + i + 1, child_min_length, array_view->children[i]->length); + return EINVAL; + } + } + break; + + case NANOARROW_TYPE_FIXED_SIZE_LIST: + child_min_length = (array_view->offset + array_view->length) * + array_view->layout.child_size_elements; + if (array_view->children[0]->length < child_min_length) { + ArrowErrorSet(error, + "Expected child of fixed_size_list array to have length >= %" PRId64 + " but " + "found array with length %" PRId64, + child_min_length, array_view->children[0]->length); + return EINVAL; + } + break; + + case NANOARROW_TYPE_RUN_END_ENCODED: { + if (array_view->n_children != 2) { + ArrowErrorSet(error, + "Expected 2 children for run-end encoded array but found %" PRId64, + array_view->n_children); + return EINVAL; + } + struct ArrowArrayView* run_ends_view = array_view->children[0]; + struct ArrowArrayView* values_view = array_view->children[1]; + int64_t max_length; + switch (run_ends_view->storage_type) { + case NANOARROW_TYPE_INT16: + max_length = INT16_MAX; + break; + case NANOARROW_TYPE_INT32: + max_length = INT32_MAX; + break; + case NANOARROW_TYPE_INT64: + max_length = INT64_MAX; + break; + default: + ArrowErrorSet( + error, + "Run-end encoded array only supports INT16, INT32 or INT64 run-ends " + "but found run-ends type %s", + ArrowTypeString(run_ends_view->storage_type)); + return EINVAL; + } + + // There is already a check above that offset_plus_length < INT64_MAX + if (offset_plus_length > max_length) { + ArrowErrorSet(error, + "Offset + length of a run-end encoded array must fit in a value" + " of the run end type %s but is %" PRId64 " + %" PRId64, + ArrowTypeString(run_ends_view->storage_type), array_view->offset, + array_view->length); + return EINVAL; + } + + if (run_ends_view->length > values_view->length) { + ArrowErrorSet(error, + "Length of run_ends is greater than the length of values: %" PRId64 + " > %" PRId64, + run_ends_view->length, values_view->length); + return EINVAL; + } + + if (run_ends_view->length == 0 && values_view->length != 0) { + ArrowErrorSet(error, + "Run-end encoded array has zero length %" PRId64 + ", but values array has " + "non-zero length", + values_view->length); + return EINVAL; + } + + if (run_ends_view->null_count != 0) { + ArrowErrorSet(error, "Null count must be 0 for run ends array, but is %" PRId64, + run_ends_view->null_count); + return EINVAL; + } + break; + } + + default: + break; + } + + // Recurse for children + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewValidateMinimal(array_view->children[i], error)); + } + + // Recurse for dictionary + if (array_view->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view->dictionary, error)); + } + + return NANOARROW_OK; +} + +static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, + struct ArrowError* error) { + // Perform minimal validation. This will validate or assign + // buffer sizes as long as buffer access is not required. 
+ NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error)); + + // Calculate buffer sizes or child lengths that require accessing the offsets + // buffer. Where appropriate, validate that the first offset is >= 0. + // If a buffer size is marked as unknown, assign it; otherwise, validate it. + int64_t offset_plus_length = array_view->offset + array_view->length; + + int64_t first_offset; + int64_t last_offset; + switch (array_view->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + if (array_view->buffer_views[1].size_bytes != 0) { + first_offset = array_view->buffer_views[1].data.as_int32[array_view->offset]; + if (first_offset < 0) { + ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64, + first_offset); + return EINVAL; + } + + last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; + if (last_offset < 0) { + ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64, + last_offset); + return EINVAL; + } + + // If the data buffer size is unknown, assign it; otherwise, check it + if (array_view->buffer_views[2].size_bytes == -1) { + array_view->buffer_views[2].size_bytes = last_offset; + } else if (array_view->buffer_views[2].size_bytes < last_offset) { + ArrowErrorSet(error, + "Expected %s array buffer 2 to have size >= %" PRId64 + " bytes but found " + "buffer with %" PRId64 " bytes", + ArrowTypeString(array_view->storage_type), last_offset, + array_view->buffer_views[2].size_bytes); + return EINVAL; + } + } else if (array_view->buffer_views[2].size_bytes == -1) { + // If the data buffer size is unknown and there are no bytes in the offset buffer, + // set the data buffer size to 0. + array_view->buffer_views[2].size_bytes = 0; + } + break; + + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + if (array_view->buffer_views[1].size_bytes != 0) { + first_offset = array_view->buffer_views[1].data.as_int64[array_view->offset]; + if (first_offset < 0) { + ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64, + first_offset); + return EINVAL; + } + + last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; + if (last_offset < 0) { + ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64, + last_offset); + return EINVAL; + } + + // If the data buffer size is unknown, assign it; otherwise, check it + if (array_view->buffer_views[2].size_bytes == -1) { + array_view->buffer_views[2].size_bytes = last_offset; + } else if (array_view->buffer_views[2].size_bytes < last_offset) { + ArrowErrorSet(error, + "Expected %s array buffer 2 to have size >= %" PRId64 + " bytes but found " + "buffer with %" PRId64 " bytes", + ArrowTypeString(array_view->storage_type), last_offset, + array_view->buffer_views[2].size_bytes); + return EINVAL; + } + } else if (array_view->buffer_views[2].size_bytes == -1) { + // If the data buffer size is unknown and there are no bytes in the offset + // buffer, set the data buffer size to 0. 
+ array_view->buffer_views[2].size_bytes = 0; + } + break; + + case NANOARROW_TYPE_STRUCT: + for (int64_t i = 0; i < array_view->n_children; i++) { + if (array_view->children[i]->length < offset_plus_length) { + ArrowErrorSet(error, + "Expected struct child %" PRId64 " to have length >= %" PRId64 + " but found child with " + "length %" PRId64, + i + 1, offset_plus_length, array_view->children[i]->length); + return EINVAL; + } + } + break; + + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_MAP: + if (array_view->buffer_views[1].size_bytes != 0) { + first_offset = array_view->buffer_views[1].data.as_int32[array_view->offset]; + if (first_offset < 0) { + ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64, + first_offset); + return EINVAL; + } + + last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; + if (last_offset < 0) { + ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64, + last_offset); + return EINVAL; + } + + if (array_view->children[0]->length < last_offset) { + ArrowErrorSet(error, + "Expected child of %s array to have length >= %" PRId64 + " but found array with " + "length %" PRId64, + ArrowTypeString(array_view->storage_type), last_offset, + array_view->children[0]->length); + return EINVAL; + } + } + break; + + case NANOARROW_TYPE_LARGE_LIST: + if (array_view->buffer_views[1].size_bytes != 0) { + first_offset = array_view->buffer_views[1].data.as_int64[array_view->offset]; + if (first_offset < 0) { + ArrowErrorSet(error, "Expected first offset >= 0 but found %" PRId64, + first_offset); + return EINVAL; + } + + last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; + if (last_offset < 0) { + ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64, + last_offset); + return EINVAL; + } + + if (array_view->children[0]->length < last_offset) { + ArrowErrorSet(error, + "Expected child of large list array to have length >= %" PRId64 + " but found array " + "with length %" PRId64, + last_offset, array_view->children[0]->length); + return EINVAL; + } + } + break; + + case NANOARROW_TYPE_RUN_END_ENCODED: { + struct ArrowArrayView* run_ends_view = array_view->children[0]; + if (run_ends_view->length == 0) { + break; + } + + int64_t first_run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, 0); + if (first_run_end < 1) { + ArrowErrorSet( + error, + "All run ends must be greater than 0 but the first run end is %" PRId64, + first_run_end); + return EINVAL; + } + + // offset + length < INT64_MAX is checked in ArrowArrayViewValidateMinimal() + int64_t last_run_end = + ArrowArrayViewGetIntUnsafe(run_ends_view, run_ends_view->length - 1); + if (last_run_end < offset_plus_length) { + ArrowErrorSet(error, + "Last run end is %" PRId64 " but it should be >= (%" PRId64 + " + %" PRId64 ")", + last_run_end, array_view->offset, array_view->length); + return EINVAL; + } + break; + } + default: + break; + } + + // Recurse for children + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewValidateDefault(array_view->children[i], error)); + } + + // Recurse for dictionary + if (array_view->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view->dictionary, error)); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error) { + // Extract information from the array into the array view + 
NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error)); + + // Run default validation. Because we've marked all non-NULL buffers as having unknown + // size, validation will also update the buffer sizes as it goes. + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error)); + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error) { + // Extract information from the array into the array view + NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error)); + + // Run default validation. Because we've marked all non-NULL buffers as having unknown + // size, validation will also update the buffer sizes as it goes. + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error)); + + return NANOARROW_OK; +} + +static int ArrowAssertIncreasingInt32(struct ArrowBufferView view, + struct ArrowError* error) { + if (view.size_bytes <= (int64_t)sizeof(int32_t)) { + return NANOARROW_OK; + } + + for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int32_t); i++) { + if (view.data.as_int32[i] < view.data.as_int32[i - 1]) { + ArrowErrorSet(error, "[%" PRId64 "] Expected element size >= 0", i); + return EINVAL; + } + } + + return NANOARROW_OK; +} + +static int ArrowAssertIncreasingInt64(struct ArrowBufferView view, + struct ArrowError* error) { + if (view.size_bytes <= (int64_t)sizeof(int64_t)) { + return NANOARROW_OK; + } + + for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int64_t); i++) { + if (view.data.as_int64[i] < view.data.as_int64[i - 1]) { + ArrowErrorSet(error, "[%" PRId64 "] Expected element size >= 0", i); + return EINVAL; + } + } + + return NANOARROW_OK; +} + +static int ArrowAssertRangeInt8(struct ArrowBufferView view, int8_t min_value, + int8_t max_value, struct ArrowError* error) { + for (int64_t i = 0; i < view.size_bytes; i++) { + if (view.data.as_int8[i] < min_value || view.data.as_int8[i] > max_value) { + ArrowErrorSet(error, + "[%" PRId64 "] Expected buffer value between %" PRId8 " and %" PRId8 + " but found value %" PRId8, + i, min_value, max_value, view.data.as_int8[i]); + return EINVAL; + } + } + + return NANOARROW_OK; +} + +static int ArrowAssertInt8In(struct ArrowBufferView view, const int8_t* values, + int64_t n_values, struct ArrowError* error) { + for (int64_t i = 0; i < view.size_bytes; i++) { + int item_found = 0; + for (int64_t j = 0; j < n_values; j++) { + if (view.data.as_int8[i] == values[j]) { + item_found = 1; + break; + } + } + + if (!item_found) { + ArrowErrorSet(error, "[%" PRId64 "] Unexpected buffer value %" PRId8, i, + view.data.as_int8[i]); + return EINVAL; + } + } + + return NANOARROW_OK; +} + +static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, + struct ArrowError* error) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + switch (array_view->layout.buffer_type[i]) { + // Only validate the portion of the buffer that is strictly required, + // which includes not validating the offset buffer of a zero-length array. 
+ case NANOARROW_BUFFER_TYPE_DATA_OFFSET: + if (array_view->length == 0) { + continue; + } + if (array_view->layout.element_size_bits[i] == 32) { + struct ArrowBufferView sliced_offsets; + sliced_offsets.data.as_int32 = + array_view->buffer_views[i].data.as_int32 + array_view->offset; + sliced_offsets.size_bytes = (array_view->length + 1) * sizeof(int32_t); + NANOARROW_RETURN_NOT_OK(ArrowAssertIncreasingInt32(sliced_offsets, error)); + } else { + struct ArrowBufferView sliced_offsets; + sliced_offsets.data.as_int64 = + array_view->buffer_views[i].data.as_int64 + array_view->offset; + sliced_offsets.size_bytes = (array_view->length + 1) * sizeof(int64_t); + NANOARROW_RETURN_NOT_OK(ArrowAssertIncreasingInt64(sliced_offsets, error)); + } + break; + default: + break; + } + } + + if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION || + array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION) { + struct ArrowBufferView sliced_type_ids; + sliced_type_ids.size_bytes = array_view->length * sizeof(int8_t); + if (array_view->length > 0) { + sliced_type_ids.data.as_int8 = + array_view->buffer_views[0].data.as_int8 + array_view->offset; + } else { + sliced_type_ids.data.as_int8 = NULL; + } + + if (array_view->union_type_id_map == NULL) { + // If the union_type_id map is NULL (e.g., when using ArrowArrayInitFromType() + + // ArrowArrayAllocateChildren() + ArrowArrayFinishBuilding()), we don't have enough + // information to validate this buffer. + ArrowErrorSet(error, + "Insufficient information provided for validation of union array"); + return EINVAL; + } else if (_ArrowParsedUnionTypeIdsWillEqualChildIndices( + array_view->union_type_id_map, array_view->n_children, + array_view->n_children)) { + NANOARROW_RETURN_NOT_OK(ArrowAssertRangeInt8( + sliced_type_ids, 0, (int8_t)(array_view->n_children - 1), error)); + } else { + NANOARROW_RETURN_NOT_OK(ArrowAssertInt8In(sliced_type_ids, + array_view->union_type_id_map + 128, + array_view->n_children, error)); + } + } + + if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION && + array_view->union_type_id_map != NULL) { + // Check that offsets refer to child elements that actually exist + for (int64_t i = 0; i < array_view->length; i++) { + int8_t child_id = ArrowArrayViewUnionChildIndex(array_view, i); + int64_t offset = ArrowArrayViewUnionChildOffset(array_view, i); + int64_t child_length = array_view->children[child_id]->length; + if (offset < 0 || offset > child_length) { + ArrowErrorSet(error, + "[%" PRId64 "] Expected union offset for child id %" PRId8 + " to be between 0 and %" PRId64 + " but " + "found offset value %" PRId64, + i, child_id, child_length, offset); + return EINVAL; + } + } + } + + if (array_view->storage_type == NANOARROW_TYPE_RUN_END_ENCODED) { + struct ArrowArrayView* run_ends_view = array_view->children[0]; + if (run_ends_view->length > 0) { + int64_t last_run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, 0); + for (int64_t i = 1; i < run_ends_view->length; i++) { + const int64_t run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, i); + if (run_end <= last_run_end) { + ArrowErrorSet( + error, + "Every run end must be strictly greater than the previous run end, " + "but run_ends[%" PRId64 " is %" PRId64 " and run_ends[%" PRId64 + "] is %" PRId64, + i, run_end, i - 1, last_run_end); + return EINVAL; + } + last_run_end = run_end; + } + } + } + + // Recurse for children + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->children[i], error)); + } + + // 
Dictionary valiation not implemented + if (array_view->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->dictionary, error)); + // TODO: validate the indices + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view, + enum ArrowValidationLevel validation_level, + struct ArrowError* error) { + switch (validation_level) { + case NANOARROW_VALIDATION_LEVEL_NONE: + return NANOARROW_OK; + case NANOARROW_VALIDATION_LEVEL_MINIMAL: + return ArrowArrayViewValidateMinimal(array_view, error); + case NANOARROW_VALIDATION_LEVEL_DEFAULT: + return ArrowArrayViewValidateDefault(array_view, error); + case NANOARROW_VALIDATION_LEVEL_FULL: + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error)); + return ArrowArrayViewValidateFull(array_view, error); + } + + ArrowErrorSet(error, "validation_level not recognized"); + return EINVAL; +} + +struct ArrowComparisonInternalState { + enum ArrowCompareLevel level; + int is_equal; + struct ArrowError* reason; +}; + +NANOARROW_CHECK_PRINTF_ATTRIBUTE static void ArrowComparePrependPath( + struct ArrowError* out, const char* fmt, ...) { + if (out == NULL) { + return; + } + + char prefix[128]; + prefix[0] = '\0'; + va_list args; + va_start(args, fmt); + int prefix_len = vsnprintf(prefix, sizeof(prefix), fmt, args); + va_end(args); + + if (prefix_len <= 0) { + return; + } + + size_t out_len = strlen(out->message); + size_t out_len_to_move = sizeof(struct ArrowError) - prefix_len - 1; + if (out_len_to_move > out_len) { + out_len_to_move = out_len; + } + + memmove(out->message + prefix_len, out->message, out_len_to_move); + memcpy(out->message, prefix, prefix_len); + out->message[out_len + prefix_len] = '\0'; +} + +#define SET_NOT_EQUAL_AND_RETURN_IF_IMPL(cond_, state_, reason_) \ + do { \ + if (cond_) { \ + ArrowErrorSet(state_->reason, ": %s", reason_); \ + state_->is_equal = 0; \ + return; \ + } \ + } while (0) + +#define SET_NOT_EQUAL_AND_RETURN_IF(condition_, state_) \ + SET_NOT_EQUAL_AND_RETURN_IF_IMPL(condition_, state_, #condition_) + +static void ArrowArrayViewCompareBuffer(const struct ArrowArrayView* actual, + const struct ArrowArrayView* expected, int i, + struct ArrowComparisonInternalState* state) { + SET_NOT_EQUAL_AND_RETURN_IF( + actual->buffer_views[i].size_bytes != expected->buffer_views[i].size_bytes, state); + + int64_t buffer_size = actual->buffer_views[i].size_bytes; + if (buffer_size > 0) { + SET_NOT_EQUAL_AND_RETURN_IF( + memcmp(actual->buffer_views[i].data.data, expected->buffer_views[i].data.data, + buffer_size) != 0, + state); + } +} + +static void ArrowArrayViewCompareIdentical(const struct ArrowArrayView* actual, + const struct ArrowArrayView* expected, + struct ArrowComparisonInternalState* state) { + SET_NOT_EQUAL_AND_RETURN_IF(actual->storage_type != expected->storage_type, state); + SET_NOT_EQUAL_AND_RETURN_IF(actual->n_children != expected->n_children, state); + SET_NOT_EQUAL_AND_RETURN_IF(actual->dictionary == NULL && expected->dictionary != NULL, + state); + SET_NOT_EQUAL_AND_RETURN_IF(actual->dictionary != NULL && expected->dictionary == NULL, + state); + + SET_NOT_EQUAL_AND_RETURN_IF(actual->length != expected->length, state); + SET_NOT_EQUAL_AND_RETURN_IF(actual->offset != expected->offset, state); + SET_NOT_EQUAL_AND_RETURN_IF(actual->null_count != expected->null_count, state); + + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + ArrowArrayViewCompareBuffer(actual, expected, i, state); + if (!state->is_equal) { + 
ArrowComparePrependPath(state->reason, ".buffers[%d]", i); + return; + } + } + + for (int64_t i = 0; i < actual->n_children; i++) { + ArrowArrayViewCompareIdentical(actual->children[i], expected->children[i], state); + if (!state->is_equal) { + ArrowComparePrependPath(state->reason, ".children[%" PRId64 "]", i); + return; + } + } + + if (actual->dictionary != NULL) { + ArrowArrayViewCompareIdentical(actual->dictionary, expected->dictionary, state); + if (!state->is_equal) { + ArrowComparePrependPath(state->reason, ".dictionary"); + return; + } + } +} + +// Top-level entry point to take care of creating, cleaning up, and +// propagating the ArrowComparisonInternalState to the caller +ArrowErrorCode ArrowArrayViewCompare(const struct ArrowArrayView* actual, + const struct ArrowArrayView* expected, + enum ArrowCompareLevel level, int* out, + struct ArrowError* reason) { + struct ArrowComparisonInternalState state; + state.level = level; + state.is_equal = 1; + state.reason = reason; + + switch (level) { + case NANOARROW_COMPARE_IDENTICAL: + ArrowArrayViewCompareIdentical(actual, expected, &state); + break; + default: + return EINVAL; + } + + *out = state.is_equal; + if (!state.is_equal) { + ArrowComparePrependPath(state.reason, "root"); + } + + return NANOARROW_OK; +} + +#undef SET_NOT_EQUAL_AND_RETURN_IF +#undef SET_NOT_EQUAL_AND_RETURN_IF_IMPL +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "nanoarrow.h" + +struct BasicArrayStreamPrivate { + struct ArrowSchema schema; + int64_t n_arrays; + struct ArrowArray* arrays; + int64_t arrays_i; +}; + +static int ArrowBasicArrayStreamGetSchema(struct ArrowArrayStream* array_stream, + struct ArrowSchema* schema) { + if (array_stream == NULL || array_stream->release == NULL) { + return EINVAL; + } + + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)array_stream->private_data; + return ArrowSchemaDeepCopy(&private_data->schema, schema); +} + +static int ArrowBasicArrayStreamGetNext(struct ArrowArrayStream* array_stream, + struct ArrowArray* array) { + if (array_stream == NULL || array_stream->release == NULL) { + return EINVAL; + } + + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)array_stream->private_data; + + if (private_data->arrays_i == private_data->n_arrays) { + array->release = NULL; + return NANOARROW_OK; + } + + ArrowArrayMove(&private_data->arrays[private_data->arrays_i++], array); + return NANOARROW_OK; +} + +static const char* ArrowBasicArrayStreamGetLastError( + struct ArrowArrayStream* array_stream) { + NANOARROW_UNUSED(array_stream); + return NULL; +} + +static void ArrowBasicArrayStreamRelease(struct ArrowArrayStream* array_stream) { + if (array_stream == NULL || array_stream->release == NULL) { + return; + } + + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)array_stream->private_data; + + if (private_data->schema.release != NULL) { + ArrowSchemaRelease(&private_data->schema); + } + + for (int64_t i = 0; i < private_data->n_arrays; i++) { + if (private_data->arrays[i].release != NULL) { + ArrowArrayRelease(&private_data->arrays[i]); + } + } + + if (private_data->arrays != NULL) { + ArrowFree(private_data->arrays); + } + + ArrowFree(private_data); + array_stream->release = NULL; +} + +ArrowErrorCode ArrowBasicArrayStreamInit(struct ArrowArrayStream* array_stream, + struct ArrowSchema* schema, int64_t n_arrays) { + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)ArrowMalloc( + sizeof(struct BasicArrayStreamPrivate)); + if (private_data == NULL) { + return ENOMEM; + } + + ArrowSchemaMove(schema, &private_data->schema); + + private_data->n_arrays = n_arrays; + private_data->arrays = NULL; + private_data->arrays_i = 0; + + if (n_arrays > 0) { + private_data->arrays = + (struct ArrowArray*)ArrowMalloc(n_arrays * sizeof(struct ArrowArray)); + if (private_data->arrays == NULL) { + ArrowBasicArrayStreamRelease(array_stream); + return ENOMEM; + } + } + + for (int64_t i = 0; i < private_data->n_arrays; i++) { + private_data->arrays[i].release = NULL; + } + + array_stream->get_schema = &ArrowBasicArrayStreamGetSchema; + array_stream->get_next = &ArrowBasicArrayStreamGetNext; + array_stream->get_last_error = ArrowBasicArrayStreamGetLastError; + array_stream->release = ArrowBasicArrayStreamRelease; + array_stream->private_data = private_data; + return NANOARROW_OK; +} + +void ArrowBasicArrayStreamSetArray(struct ArrowArrayStream* array_stream, int64_t i, + struct ArrowArray* array) { + struct BasicArrayStreamPrivate* private_data = + (struct BasicArrayStreamPrivate*)array_stream->private_data; + ArrowArrayMove(array, &private_data->arrays[i]); +} + +ArrowErrorCode ArrowBasicArrayStreamValidate(const struct ArrowArrayStream* array_stream, + struct ArrowError* error) { + struct BasicArrayStreamPrivate* private_data = + (struct 
BasicArrayStreamPrivate*)array_stream->private_data; + + struct ArrowArrayView array_view; + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewInitFromSchema(&array_view, &private_data->schema, error)); + + for (int64_t i = 0; i < private_data->n_arrays; i++) { + if (private_data->arrays[i].release != NULL) { + int result = ArrowArrayViewSetArray(&array_view, &private_data->arrays[i], error); + if (result != NANOARROW_OK) { + ArrowArrayViewReset(&array_view); + return result; + } + } + } + + ArrowArrayViewReset(&array_view); + return NANOARROW_OK; +} diff --git a/src/oracledb/interchange/nanoarrow/nanoarrow.h b/src/oracledb/interchange/nanoarrow/nanoarrow.h new file mode 100644 index 00000000..0738957c --- /dev/null +++ b/src/oracledb/interchange/nanoarrow/nanoarrow.h @@ -0,0 +1,4279 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_BUILD_ID_H_INCLUDED +#define NANOARROW_BUILD_ID_H_INCLUDED + +#define NANOARROW_VERSION_MAJOR 0 +#define NANOARROW_VERSION_MINOR 6 +#define NANOARROW_VERSION_PATCH 0 +#define NANOARROW_VERSION "0.6.0" + +#define NANOARROW_VERSION_INT \ + (NANOARROW_VERSION_MAJOR * 10000 + NANOARROW_VERSION_MINOR * 100 + \ + NANOARROW_VERSION_PATCH) + +#define NANOARROW_NAMESPACE PythonPkg + +#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_NANOARROW_TYPES_H_INCLUDED +#define NANOARROW_NANOARROW_TYPES_H_INCLUDED + +#include +#include + + + +#if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) +#include +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +// Extra guard for versions of Arrow without the canonical guard +#ifndef ARROW_FLAG_DICTIONARY_ORDERED + +/// \defgroup nanoarrow-arrow-cdata Arrow C Data interface +/// +/// The Arrow C Data (https://arrow.apache.org/docs/format/CDataInterface.html) +/// and Arrow C Stream (https://arrow.apache.org/docs/format/CStreamInterface.html) +/// interfaces are part of the +/// Arrow Columnar Format specification +/// (https://arrow.apache.org/docs/format/Columnar.html). 
See the Arrow documentation for +/// documentation of these structures. +/// +/// @{ + +#ifndef ARROW_C_DATA_INTERFACE +#define ARROW_C_DATA_INTERFACE + +#define ARROW_FLAG_DICTIONARY_ORDERED 1 +#define ARROW_FLAG_NULLABLE 2 +#define ARROW_FLAG_MAP_KEYS_SORTED 4 + +struct ArrowSchema { + // Array type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + int64_t n_children; + struct ArrowSchema** children; + struct ArrowSchema* dictionary; + + // Release callback + void (*release)(struct ArrowSchema*); + // Opaque producer-specific data + void* private_data; +}; + +struct ArrowArray { + // Array data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_DATA_INTERFACE + +#ifndef ARROW_C_STREAM_INTERFACE +#define ARROW_C_STREAM_INTERFACE + +struct ArrowArrayStream { + // Callback to get the stream type + // (will be the same for all arrays in the stream). + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowSchema must be released independently from the stream. + int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); + + // Callback to get the next array + // (if no error and the array is released, the stream has ended) + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowArray must be released independently from the stream. + int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); + + // Callback to get optional detailed error information. + // This must only be called if the last stream operation failed + // with a non-0 return code. + // + // Return value: pointer to a null-terminated character array describing + // the last error, or NULL if no description is available. + // + // The returned pointer is only valid until the next operation on this stream + // (including release). + const char* (*get_last_error)(struct ArrowArrayStream*); + + // Release callback: release the stream's own resources. + // Note that arrays returned by `get_next` must be individually released. + void (*release)(struct ArrowArrayStream*); + + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_STREAM_INTERFACE +#endif // ARROW_FLAG_DICTIONARY_ORDERED + +/// @} + +// Utility macros +#define _NANOARROW_CONCAT(x, y) x##y +#define _NANOARROW_MAKE_NAME(x, y) _NANOARROW_CONCAT(x, y) + +#define _NANOARROW_RETURN_NOT_OK_IMPL(NAME, EXPR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) return NAME; \ + } while (0) + +#define _NANOARROW_CHECK_RANGE(x_, min_, max_) \ + NANOARROW_RETURN_NOT_OK((x_ >= min_ && x_ <= max_) ? NANOARROW_OK : EINVAL) + +#define _NANOARROW_CHECK_UPPER_LIMIT(x_, max_) \ + NANOARROW_RETURN_NOT_OK((x_ <= max_) ? 
NANOARROW_OK : EINVAL) + +#if defined(NANOARROW_DEBUG) +#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d(%s)\n* %s:%d", EXPR_STR, \ + NAME, strerror(NAME), __FILE__, __LINE__); \ + return NAME; \ + } \ + } while (0) +#else +#define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) { \ + ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d", EXPR_STR, NAME); \ + return NAME; \ + } \ + } while (0) +#endif + +#if defined(NANOARROW_DEBUG) +// For checking ArrowErrorSet() calls for valid printf format strings/arguments +// If using mingw's c99-compliant printf, we need a different format-checking attribute +#if defined(__USE_MINGW_ANSI_STDIO) && defined(__MINGW_PRINTF_FORMAT) +#define NANOARROW_CHECK_PRINTF_ATTRIBUTE \ + __attribute__((format(__MINGW_PRINTF_FORMAT, 2, 3))) +#elif defined(__GNUC__) +#define NANOARROW_CHECK_PRINTF_ATTRIBUTE __attribute__((format(printf, 2, 3))) +#else +#define NANOARROW_CHECK_PRINTF_ATTRIBUTE +#endif + +// For checking calls to functions that return ArrowErrorCode +#if defined(__GNUC__) && (__GNUC__ >= 4) +#define NANOARROW_CHECK_RETURN_ATTRIBUTE __attribute__((warn_unused_result)) +#elif defined(_MSC_VER) && (_MSC_VER >= 1700) +#define NANOARROW_CHECK_RETURN_ATTRIBUTE _Check_return_ +#else +#define NANOARROW_CHECK_RETURN_ATTRIBUTE +#endif + +#else +#define NANOARROW_CHECK_RETURN_ATTRIBUTE +#define NANOARROW_CHECK_PRINTF_ATTRIBUTE +#endif + +#define NANOARROW_UNUSED(x) (void)(x) + +/// \brief Return code for success. +/// \ingroup nanoarrow-errors +#define NANOARROW_OK 0 + +/// \brief Represents an errno-compatible error code +/// \ingroup nanoarrow-errors +typedef int ArrowErrorCode; + +#if defined(NANOARROW_DEBUG) +#define ArrowErrorCode NANOARROW_CHECK_RETURN_ATTRIBUTE ArrowErrorCode +#endif + +/// \brief Flags supported by ArrowSchemaViewInit() +/// \ingroup nanoarrow-schema-view +#define NANOARROW_FLAG_ALL_SUPPORTED \ + (ARROW_FLAG_DICTIONARY_ORDERED | ARROW_FLAG_NULLABLE | ARROW_FLAG_MAP_KEYS_SORTED) + +/// \brief Error type containing a UTF-8 encoded message. +/// \ingroup nanoarrow-errors +struct ArrowError { + /// \brief A character buffer with space for an error message. + char message[1024]; +}; + +/// \brief Ensure an ArrowError is null-terminated by zeroing the first character. +/// \ingroup nanoarrow-errors +/// +/// If error is NULL, this function does nothing. +static inline void ArrowErrorInit(struct ArrowError* error) { + if (error != NULL) { + error->message[0] = '\0'; + } +} + +/// \brief Get the contents of an error +/// \ingroup nanoarrow-errors +/// +/// If error is NULL, returns "", or returns the contents of the error message +/// otherwise. +static inline const char* ArrowErrorMessage(struct ArrowError* error) { + if (error == NULL) { + return ""; + } else { + return error->message; + } +} + +/// \brief Set the contents of an error from an existing null-terminated string +/// \ingroup nanoarrow-errors +/// +/// If error is NULL, this function does nothing. 
+static inline void ArrowErrorSetString(struct ArrowError* error, const char* src) { + if (error == NULL) { + return; + } + + int64_t src_len = strlen(src); + if (src_len >= ((int64_t)sizeof(error->message))) { + memcpy(error->message, src, sizeof(error->message) - 1); + error->message[sizeof(error->message) - 1] = '\0'; + } else { + memcpy(error->message, src, src_len); + error->message[src_len] = '\0'; + } +} + +/// \brief Check the result of an expression and return it if not NANOARROW_OK +/// \ingroup nanoarrow-errors +#define NANOARROW_RETURN_NOT_OK(EXPR) \ + _NANOARROW_RETURN_NOT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR) + +/// \brief Check the result of an expression and return it if not NANOARROW_OK, +/// adding an auto-generated message to an ArrowError. +/// \ingroup nanoarrow-errors +/// +/// This macro is used to ensure that functions that accept an ArrowError +/// as input always set its message when returning an error code (e.g., when calling +/// a nanoarrow function that does *not* accept ArrowError). +#define NANOARROW_RETURN_NOT_OK_WITH_ERROR(EXPR, ERROR_EXPR) \ + _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL( \ + _NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, ERROR_EXPR, #EXPR) + +#if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) +#define NANOARROW_PRINT_AND_DIE(VALUE, EXPR_STR) \ + do { \ + fprintf(stderr, "%s failed with code %d\n* %s:%d\n", EXPR_STR, (int)(VALUE), \ + __FILE__, (int)__LINE__); \ + abort(); \ + } while (0) +#endif + +#if defined(NANOARROW_DEBUG) +#define _NANOARROW_ASSERT_OK_IMPL(NAME, EXPR, EXPR_STR) \ + do { \ + const int NAME = (EXPR); \ + if (NAME) NANOARROW_PRINT_AND_DIE(NAME, EXPR_STR); \ + } while (0) + +/// \brief Assert that an expression's value is NANOARROW_OK +/// \ingroup nanoarrow-errors +/// +/// If nanoarrow was built in debug mode (i.e., defined(NANOARROW_DEBUG) is true), +/// print a message to stderr and abort. If nanoarrow was built in release mode, +/// this statement has no effect. You can customize fatal error behaviour +/// be defining the NANOARROW_PRINT_AND_DIE macro before including nanoarrow.h +/// This macro is provided as a convenience for users and is not used internally. 
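// A usage sketch, not part of the vendored header: it assumes nanoarrow.h is
// included and that EINVAL comes from <errno.h>; check_positive() and
// sum_positive() are hypothetical helpers used only to illustrate ArrowErrorSet()
// and NANOARROW_RETURN_NOT_OK()-style propagation.
static ArrowErrorCode check_positive(int64_t value, struct ArrowError* error) {
  if (value <= 0) {
    ArrowErrorSet(error, "expected a positive value but got %lld", (long long)value);
    return EINVAL;
  }
  return NANOARROW_OK;
}

static ArrowErrorCode sum_positive(const int64_t* values, int64_t n, int64_t* out,
                                   struct ArrowError* error) {
  *out = 0;
  for (int64_t i = 0; i < n; i++) {
    // Returns the first failing errno-compatible code to the caller unchanged.
    NANOARROW_RETURN_NOT_OK(check_positive(values[i], error));
    *out += values[i];
  }
  return NANOARROW_OK;
}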
+#define NANOARROW_ASSERT_OK(EXPR) \ + _NANOARROW_ASSERT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, #EXPR) + +#define _NANOARROW_DCHECK_IMPL(EXPR, EXPR_STR) \ + do { \ + if (!(EXPR)) NANOARROW_PRINT_AND_DIE(-1, EXPR_STR); \ + } while (0) + +#define NANOARROW_DCHECK(EXPR) _NANOARROW_DCHECK_IMPL(EXPR, #EXPR) +#else +#define NANOARROW_ASSERT_OK(EXPR) (void)(EXPR) +#define NANOARROW_DCHECK(EXPR) +#endif + +static inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dst) { + NANOARROW_DCHECK(src != NULL); + NANOARROW_DCHECK(dst != NULL); + + memcpy(dst, src, sizeof(struct ArrowSchema)); + src->release = NULL; +} + +static inline void ArrowSchemaRelease(struct ArrowSchema* schema) { + NANOARROW_DCHECK(schema != NULL); + schema->release(schema); + NANOARROW_DCHECK(schema->release == NULL); +} + +static inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dst) { + NANOARROW_DCHECK(src != NULL); + NANOARROW_DCHECK(dst != NULL); + + memcpy(dst, src, sizeof(struct ArrowArray)); + src->release = NULL; +} + +static inline void ArrowArrayRelease(struct ArrowArray* array) { + NANOARROW_DCHECK(array != NULL); + array->release(array); + NANOARROW_DCHECK(array->release == NULL); +} + +static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, + struct ArrowArrayStream* dst) { + NANOARROW_DCHECK(src != NULL); + NANOARROW_DCHECK(dst != NULL); + + memcpy(dst, src, sizeof(struct ArrowArrayStream)); + src->release = NULL; +} + +static inline const char* ArrowArrayStreamGetLastError( + struct ArrowArrayStream* array_stream) { + NANOARROW_DCHECK(array_stream != NULL); + + const char* value = array_stream->get_last_error(array_stream); + if (value == NULL) { + return ""; + } else { + return value; + } +} + +static inline ArrowErrorCode ArrowArrayStreamGetSchema( + struct ArrowArrayStream* array_stream, struct ArrowSchema* out, + struct ArrowError* error) { + NANOARROW_DCHECK(array_stream != NULL); + + int result = array_stream->get_schema(array_stream, out); + if (result != NANOARROW_OK && error != NULL) { + ArrowErrorSetString(error, ArrowArrayStreamGetLastError(array_stream)); + } + + return result; +} + +static inline ArrowErrorCode ArrowArrayStreamGetNext( + struct ArrowArrayStream* array_stream, struct ArrowArray* out, + struct ArrowError* error) { + NANOARROW_DCHECK(array_stream != NULL); + + int result = array_stream->get_next(array_stream, out); + if (result != NANOARROW_OK && error != NULL) { + ArrowErrorSetString(error, ArrowArrayStreamGetLastError(array_stream)); + } + + return result; +} + +static inline void ArrowArrayStreamRelease(struct ArrowArrayStream* array_stream) { + NANOARROW_DCHECK(array_stream != NULL); + array_stream->release(array_stream); + NANOARROW_DCHECK(array_stream->release == NULL); +} + +static char _ArrowIsLittleEndian(void) { + uint32_t check = 1; + char first_byte; + memcpy(&first_byte, &check, sizeof(char)); + return first_byte; +} + +/// \brief Arrow type enumerator +/// \ingroup nanoarrow-utils +/// +/// These names are intended to map to the corresponding arrow::Type::type +/// enumerator; however, the numeric values are specifically not equal +/// (i.e., do not rely on numeric comparison). 
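// A usage sketch, not part of the vendored header: draining an ArrowArrayStream
// with the inline wrappers defined above. drain_stream() and consume_batch() are
// hypothetical names; the schema is released immediately for brevity.
static ArrowErrorCode drain_stream(struct ArrowArrayStream* stream,
                                   void (*consume_batch)(const struct ArrowArray*),
                                   struct ArrowError* error) {
  struct ArrowSchema schema;
  NANOARROW_RETURN_NOT_OK(ArrowArrayStreamGetSchema(stream, &schema, error));
  ArrowSchemaRelease(&schema);

  struct ArrowArray array;
  while (1) {
    NANOARROW_RETURN_NOT_OK(ArrowArrayStreamGetNext(stream, &array, error));
    if (array.release == NULL) {
      break;  // a released array signals the end of the stream
    }
    consume_batch(&array);
    ArrowArrayRelease(&array);
  }
  return NANOARROW_OK;
}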
+enum ArrowType { + NANOARROW_TYPE_UNINITIALIZED = 0, + NANOARROW_TYPE_NA = 1, + NANOARROW_TYPE_BOOL, + NANOARROW_TYPE_UINT8, + NANOARROW_TYPE_INT8, + NANOARROW_TYPE_UINT16, + NANOARROW_TYPE_INT16, + NANOARROW_TYPE_UINT32, + NANOARROW_TYPE_INT32, + NANOARROW_TYPE_UINT64, + NANOARROW_TYPE_INT64, + NANOARROW_TYPE_HALF_FLOAT, + NANOARROW_TYPE_FLOAT, + NANOARROW_TYPE_DOUBLE, + NANOARROW_TYPE_STRING, + NANOARROW_TYPE_BINARY, + NANOARROW_TYPE_FIXED_SIZE_BINARY, + NANOARROW_TYPE_DATE32, + NANOARROW_TYPE_DATE64, + NANOARROW_TYPE_TIMESTAMP, + NANOARROW_TYPE_TIME32, + NANOARROW_TYPE_TIME64, + NANOARROW_TYPE_INTERVAL_MONTHS, + NANOARROW_TYPE_INTERVAL_DAY_TIME, + NANOARROW_TYPE_DECIMAL128, + NANOARROW_TYPE_DECIMAL256, + NANOARROW_TYPE_LIST, + NANOARROW_TYPE_STRUCT, + NANOARROW_TYPE_SPARSE_UNION, + NANOARROW_TYPE_DENSE_UNION, + NANOARROW_TYPE_DICTIONARY, + NANOARROW_TYPE_MAP, + NANOARROW_TYPE_EXTENSION, + NANOARROW_TYPE_FIXED_SIZE_LIST, + NANOARROW_TYPE_DURATION, + NANOARROW_TYPE_LARGE_STRING, + NANOARROW_TYPE_LARGE_BINARY, + NANOARROW_TYPE_LARGE_LIST, + NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO, + NANOARROW_TYPE_RUN_END_ENCODED, + NANOARROW_TYPE_BINARY_VIEW, + NANOARROW_TYPE_STRING_VIEW +}; + +/// \brief Get a string value of an enum ArrowType value +/// \ingroup nanoarrow-utils +/// +/// Returns NULL for invalid values for type +static inline const char* ArrowTypeString(enum ArrowType type); + +static inline const char* ArrowTypeString(enum ArrowType type) { + switch (type) { + case NANOARROW_TYPE_NA: + return "na"; + case NANOARROW_TYPE_BOOL: + return "bool"; + case NANOARROW_TYPE_UINT8: + return "uint8"; + case NANOARROW_TYPE_INT8: + return "int8"; + case NANOARROW_TYPE_UINT16: + return "uint16"; + case NANOARROW_TYPE_INT16: + return "int16"; + case NANOARROW_TYPE_UINT32: + return "uint32"; + case NANOARROW_TYPE_INT32: + return "int32"; + case NANOARROW_TYPE_UINT64: + return "uint64"; + case NANOARROW_TYPE_INT64: + return "int64"; + case NANOARROW_TYPE_HALF_FLOAT: + return "half_float"; + case NANOARROW_TYPE_FLOAT: + return "float"; + case NANOARROW_TYPE_DOUBLE: + return "double"; + case NANOARROW_TYPE_STRING: + return "string"; + case NANOARROW_TYPE_BINARY: + return "binary"; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + return "fixed_size_binary"; + case NANOARROW_TYPE_DATE32: + return "date32"; + case NANOARROW_TYPE_DATE64: + return "date64"; + case NANOARROW_TYPE_TIMESTAMP: + return "timestamp"; + case NANOARROW_TYPE_TIME32: + return "time32"; + case NANOARROW_TYPE_TIME64: + return "time64"; + case NANOARROW_TYPE_INTERVAL_MONTHS: + return "interval_months"; + case NANOARROW_TYPE_INTERVAL_DAY_TIME: + return "interval_day_time"; + case NANOARROW_TYPE_DECIMAL128: + return "decimal128"; + case NANOARROW_TYPE_DECIMAL256: + return "decimal256"; + case NANOARROW_TYPE_LIST: + return "list"; + case NANOARROW_TYPE_STRUCT: + return "struct"; + case NANOARROW_TYPE_SPARSE_UNION: + return "sparse_union"; + case NANOARROW_TYPE_DENSE_UNION: + return "dense_union"; + case NANOARROW_TYPE_DICTIONARY: + return "dictionary"; + case NANOARROW_TYPE_MAP: + return "map"; + case NANOARROW_TYPE_EXTENSION: + return "extension"; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + return "fixed_size_list"; + case NANOARROW_TYPE_DURATION: + return "duration"; + case NANOARROW_TYPE_LARGE_STRING: + return "large_string"; + case NANOARROW_TYPE_LARGE_BINARY: + return "large_binary"; + case NANOARROW_TYPE_LARGE_LIST: + return "large_list"; + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + return "interval_month_day_nano"; + case 
NANOARROW_TYPE_RUN_END_ENCODED: + return "run_end_encoded"; + case NANOARROW_TYPE_BINARY_VIEW: + return "binary_view"; + case NANOARROW_TYPE_STRING_VIEW: + return "string_view"; + default: + return NULL; + } +} + +/// \brief Arrow time unit enumerator +/// \ingroup nanoarrow-utils +/// +/// These names and values map to the corresponding arrow::TimeUnit::type +/// enumerator. +enum ArrowTimeUnit { + NANOARROW_TIME_UNIT_SECOND = 0, + NANOARROW_TIME_UNIT_MILLI = 1, + NANOARROW_TIME_UNIT_MICRO = 2, + NANOARROW_TIME_UNIT_NANO = 3 +}; + +/// \brief Validation level enumerator +/// \ingroup nanoarrow-array +enum ArrowValidationLevel { + /// \brief Do not validate buffer sizes or content. + NANOARROW_VALIDATION_LEVEL_NONE = 0, + + /// \brief Validate buffer sizes that depend on array length but do not validate buffer + /// sizes that depend on buffer data access. + NANOARROW_VALIDATION_LEVEL_MINIMAL = 1, + + /// \brief Validate all buffer sizes, including those that require buffer data access, + /// but do not perform any checks that are O(1) along the length of the buffers. + NANOARROW_VALIDATION_LEVEL_DEFAULT = 2, + + /// \brief Validate all buffer sizes and all buffer content. This is useful in the + /// context of untrusted input or input that may have been corrupted in transit. + NANOARROW_VALIDATION_LEVEL_FULL = 3 +}; + +/// \brief Comparison level enumerator +/// \ingroup nanoarrow-utils +enum ArrowCompareLevel { + /// \brief Consider arrays equal if buffers contain identical content + /// and have identical offset, null count, and length. Note that this is + /// a much stricter check than logical equality, which would take into + /// account potentially different content of null slots, arrays with a + /// non-zero offset, and other considerations. + NANOARROW_COMPARE_IDENTICAL, +}; + +/// \brief Get a string value of an enum ArrowTimeUnit value +/// \ingroup nanoarrow-utils +/// +/// Returns NULL for invalid values for time_unit +static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit); + +static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit) { + switch (time_unit) { + case NANOARROW_TIME_UNIT_SECOND: + return "s"; + case NANOARROW_TIME_UNIT_MILLI: + return "ms"; + case NANOARROW_TIME_UNIT_MICRO: + return "us"; + case NANOARROW_TIME_UNIT_NANO: + return "ns"; + default: + return NULL; + } +} + +/// \brief Functional types of buffers as described in the Arrow Columnar Specification +/// \ingroup nanoarrow-array-view +enum ArrowBufferType { + NANOARROW_BUFFER_TYPE_NONE, + NANOARROW_BUFFER_TYPE_VALIDITY, + NANOARROW_BUFFER_TYPE_TYPE_ID, + NANOARROW_BUFFER_TYPE_UNION_OFFSET, + NANOARROW_BUFFER_TYPE_DATA_OFFSET, + NANOARROW_BUFFER_TYPE_DATA, + NANOARROW_BUFFER_TYPE_VARIADIC_DATA, + NANOARROW_BUFFER_TYPE_VARIADIC_SIZE +}; + +/// \brief The maximum number of fixed buffers in an ArrowArrayView or ArrowLayout +/// \ingroup nanoarrow-array-view +#define NANOARROW_MAX_FIXED_BUFFERS 3 + +/// \brief An non-owning view of a string +/// \ingroup nanoarrow-utils +struct ArrowStringView { + /// \brief A pointer to the start of the string + /// + /// If size_bytes is 0, this value may be NULL. + const char* data; + + /// \brief The size of the string in bytes, + /// + /// (Not including the null terminator.) 
+ int64_t size_bytes; +}; + +/// \brief Return a view of a const C string +/// \ingroup nanoarrow-utils +static inline struct ArrowStringView ArrowCharView(const char* value); + +static inline struct ArrowStringView ArrowCharView(const char* value) { + struct ArrowStringView out; + + out.data = value; + if (value) { + out.size_bytes = (int64_t)strlen(value); + } else { + out.size_bytes = 0; + } + + return out; +} + +union ArrowBufferViewData { + const void* data; + const int8_t* as_int8; + const uint8_t* as_uint8; + const int16_t* as_int16; + const uint16_t* as_uint16; + const int32_t* as_int32; + const uint32_t* as_uint32; + const int64_t* as_int64; + const uint64_t* as_uint64; + const double* as_double; + const float* as_float; + const char* as_char; + const union ArrowBinaryView* as_binary_view; +}; + +/// \brief An non-owning view of a buffer +/// \ingroup nanoarrow-utils +struct ArrowBufferView { + /// \brief A pointer to the start of the buffer + /// + /// If size_bytes is 0, this value may be NULL. + union ArrowBufferViewData data; + + /// \brief The size of the buffer in bytes + int64_t size_bytes; +}; + +/// \brief Array buffer allocation and deallocation +/// \ingroup nanoarrow-buffer +/// +/// Container for allocate, reallocate, and free methods that can be used +/// to customize allocation and deallocation of buffers when constructing +/// an ArrowArray. +struct ArrowBufferAllocator { + /// \brief Reallocate a buffer or return NULL if it cannot be reallocated + uint8_t* (*reallocate)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, + int64_t old_size, int64_t new_size); + + /// \brief Deallocate a buffer allocated by this allocator + void (*free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t size); + + /// \brief Opaque data specific to the allocator + void* private_data; +}; + +typedef void (*ArrowBufferDeallocatorCallback)(struct ArrowBufferAllocator* allocator, + uint8_t* ptr, int64_t size); + +/// \brief An owning mutable view of a buffer +/// \ingroup nanoarrow-buffer +struct ArrowBuffer { + /// \brief A pointer to the start of the buffer + /// + /// If capacity_bytes is 0, this value may be NULL. + uint8_t* data; + + /// \brief The size of the buffer in bytes + int64_t size_bytes; + + /// \brief The capacity of the buffer in bytes + int64_t capacity_bytes; + + /// \brief The allocator that will be used to reallocate and/or free the buffer + struct ArrowBufferAllocator allocator; +}; + +/// \brief An owning mutable view of a bitmap +/// \ingroup nanoarrow-bitmap +struct ArrowBitmap { + /// \brief An ArrowBuffer to hold the allocated memory + struct ArrowBuffer buffer; + + /// \brief The number of bits that have been appended to the bitmap + int64_t size_bits; +}; + +/// \brief A description of an arrangement of buffers +/// \ingroup nanoarrow-utils +/// +/// Contains the minimum amount of information required to +/// calculate the size of each buffer in an ArrowArray knowing only +/// the length and offset of the array. 
+struct ArrowLayout { + /// \brief The function of each buffer + enum ArrowBufferType buffer_type[NANOARROW_MAX_FIXED_BUFFERS]; + + /// \brief The data type of each buffer + enum ArrowType buffer_data_type[NANOARROW_MAX_FIXED_BUFFERS]; + + /// \brief The size of an element each buffer or 0 if this size is variable or unknown + int64_t element_size_bits[NANOARROW_MAX_FIXED_BUFFERS]; + + /// \brief The number of elements in the child array per element in this array for a + /// fixed-size list + int64_t child_size_elements; +}; + +/// \brief A non-owning view of an ArrowArray +/// \ingroup nanoarrow-array-view +/// +/// This data structure provides access to the values contained within +/// an ArrowArray with fields provided in a more readily-extractible +/// form. You can re-use an ArrowArrayView for multiple ArrowArrays +/// with the same storage type, use it to represent a hypothetical +/// ArrowArray that does not exist yet, or use it to validate the buffers +/// of a future ArrowArray. +struct ArrowArrayView { + /// \brief The underlying ArrowArray or NULL if it has not been set or + /// if the buffers in this ArrowArrayView are not backed by an ArrowArray. + const struct ArrowArray* array; + + /// \brief The number of elements from the physical start of the buffers. + int64_t offset; + + /// \brief The number of elements in this view. + int64_t length; + + /// \brief A cached null count or -1 to indicate that this value is unknown. + int64_t null_count; + + /// \brief The type used to store values in this array + /// + /// This type represents only the minimum required information to + /// extract values from the array buffers (e.g., for a Date32 array, + /// this value will be NANOARROW_TYPE_INT32). For dictionary-encoded + /// arrays, this will be the index type. + enum ArrowType storage_type; + + /// \brief The buffer types, strides, and sizes of this Array's buffers + struct ArrowLayout layout; + + /// \brief This Array's buffers as ArrowBufferView objects + struct ArrowBufferView buffer_views[NANOARROW_MAX_FIXED_BUFFERS]; + + /// \brief The number of children of this view + int64_t n_children; + + /// \brief Pointers to views of this array's children + struct ArrowArrayView** children; + + /// \brief Pointer to a view of this array's dictionary + struct ArrowArrayView* dictionary; + + /// \brief Union type id to child index mapping + /// + /// If storage_type is a union type, a 256-byte ArrowMalloc()ed buffer + /// such that child_index == union_type_id_map[type_id] and + /// type_id == union_type_id_map[128 + child_index]. This value may be + /// NULL in the case where child_id == type_id. + int8_t* union_type_id_map; + + /// \brief Number of variadic buffers + int32_t n_variadic_buffers; + + /// \brief Pointers to variadic buffers of binary/string_view arrays + const void** variadic_buffers; + + /// \brief Size of each variadic buffer + int64_t* variadic_buffer_sizes; +}; + +// Used as the private data member for ArrowArrays allocated here and accessed +// internally within inline ArrowArray* helpers. +struct ArrowArrayPrivateData { + // Holder for the validity buffer (or first buffer for union types, which are + // the only type whose first buffer is not a valdiity buffer) + struct ArrowBitmap bitmap; + + // Holder for additional buffers as required + struct ArrowBuffer buffers[NANOARROW_MAX_FIXED_BUFFERS - 1]; + + // The array of pointers to buffers. 
This must be updated after a sequence + // of appends to synchronize its values with the actual buffer addresses + // (which may have been reallocated during that time) + const void** buffer_data; + + // The storage data type, or NANOARROW_TYPE_UNINITIALIZED if unknown + enum ArrowType storage_type; + + // The buffer arrangement for the storage type + struct ArrowLayout layout; + + // Flag to indicate if there are non-sequence union type ids. + // In the future this could be replaced with a type id<->child mapping + // to support constructing unions in append mode where type_id != child_index + int8_t union_type_id_is_child_index; + + // Number of variadic buffers for binary view types + int32_t n_variadic_buffers; + + // Variadic buffers for binary view types + struct ArrowBuffer* variadic_buffers; + + // Size of each variadic buffer in bytes + int64_t* variadic_buffer_sizes; +}; + +/// \brief A representation of an interval. +/// \ingroup nanoarrow-utils +struct ArrowInterval { + /// \brief The type of interval being used + enum ArrowType type; + /// \brief The number of months represented by the interval + int32_t months; + /// \brief The number of days represented by the interval + int32_t days; + /// \brief The number of ms represented by the interval + int32_t ms; + /// \brief The number of ns represented by the interval + int64_t ns; +}; + +/// \brief Zero initialize an Interval with a given unit +/// \ingroup nanoarrow-utils +static inline void ArrowIntervalInit(struct ArrowInterval* interval, + enum ArrowType type) { + memset(interval, 0, sizeof(struct ArrowInterval)); + interval->type = type; +} + +/// \brief A representation of a fixed-precision decimal number +/// \ingroup nanoarrow-utils +/// +/// This structure should be initialized with ArrowDecimalInit() once and +/// values set using ArrowDecimalSetInt(), ArrowDecimalSetBytes128(), +/// or ArrowDecimalSetBytes256(). +struct ArrowDecimal { + /// \brief An array of 64-bit integers of n_words length defined in native-endian order + uint64_t words[4]; + + /// \brief The number of significant digits this decimal number can represent + int32_t precision; + + /// \brief The number of digits after the decimal point. This can be negative. + int32_t scale; + + /// \brief The number of words in the words array + int n_words; + + /// \brief Cached value used by the implementation + int high_word_index; + + /// \brief Cached value used by the implementation + int low_word_index; +}; + +/// \brief Initialize a decimal with a given set of type parameters +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalInit(struct ArrowDecimal* decimal, int32_t bitwidth, + int32_t precision, int32_t scale) { + memset(decimal->words, 0, sizeof(decimal->words)); + decimal->precision = precision; + decimal->scale = scale; + decimal->n_words = (int)(bitwidth / 8 / sizeof(uint64_t)); + + if (_ArrowIsLittleEndian()) { + decimal->low_word_index = 0; + decimal->high_word_index = decimal->n_words - 1; + } else { + decimal->low_word_index = decimal->n_words - 1; + decimal->high_word_index = 0; + } +} + +/// \brief Get a signed integer value of a sufficiently small ArrowDecimal +/// +/// This does not check if the decimal's precision sufficiently small to fit +/// within the signed 64-bit integer range (A precision less than or equal +/// to 18 is sufficiently small). 
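// A usage sketch, not part of the vendored header, of the decimal helpers declared
// here; example_decimal() is a hypothetical name and the precision/scale values are
// illustrative.
static int64_t example_decimal(void) {
  struct ArrowDecimal decimal;
  ArrowDecimalInit(&decimal, 128, /*precision=*/18, /*scale=*/2);
  ArrowDecimalSetInt(&decimal, -12345);  // represents -123.45 at scale 2
  // Precision 18 fits within a signed 64-bit integer, so the unsafe getter is
  // safe for this value; ArrowDecimalSign() would report -1 here.
  return ArrowDecimalGetIntUnsafe(&decimal);  // -12345
}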
+static inline int64_t ArrowDecimalGetIntUnsafe(const struct ArrowDecimal* decimal) { + return (int64_t)decimal->words[decimal->low_word_index]; +} + +/// \brief Copy the bytes of this decimal into a sufficiently large buffer +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalGetBytes(const struct ArrowDecimal* decimal, + uint8_t* out) { + memcpy(out, decimal->words, decimal->n_words * sizeof(uint64_t)); +} + +/// \brief Returns 1 if the value represented by decimal is >= 0 or -1 otherwise +/// \ingroup nanoarrow-utils +static inline int64_t ArrowDecimalSign(const struct ArrowDecimal* decimal) { + return 1 | ((int64_t)(decimal->words[decimal->high_word_index]) >> 63); +} + +/// \brief Sets the integer value of this decimal +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalSetInt(struct ArrowDecimal* decimal, int64_t value) { + if (value < 0) { + memset(decimal->words, 0xff, decimal->n_words * sizeof(uint64_t)); + } else { + memset(decimal->words, 0, decimal->n_words * sizeof(uint64_t)); + } + + decimal->words[decimal->low_word_index] = value; +} + +/// \brief Negate the value of this decimal in place +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalNegate(struct ArrowDecimal* decimal) { + uint64_t carry = 1; + + if (decimal->low_word_index == 0) { + for (int i = 0; i < decimal->n_words; i++) { + uint64_t elem = decimal->words[i]; + elem = ~elem + carry; + carry &= (elem == 0); + decimal->words[i] = elem; + } + } else { + for (int i = decimal->low_word_index; i >= 0; i--) { + uint64_t elem = decimal->words[i]; + elem = ~elem + carry; + carry &= (elem == 0); + decimal->words[i] = elem; + } + } +} + +/// \brief Copy bytes from a buffer into this decimal +/// \ingroup nanoarrow-utils +static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal, + const uint8_t* value) { + memcpy(decimal->words, value, decimal->n_words * sizeof(uint64_t)); +} + +#ifdef __cplusplus +} +#endif + +#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_H_INCLUDED +#define NANOARROW_H_INCLUDED + +#include +#include +#include + + + +// If using CMake, optionally pass -DNANOARROW_NAMESPACE=MyNamespace which will set this +// define in nanoarrow_config.h. If not, you can optionally #define NANOARROW_NAMESPACE +// MyNamespace here. + +// This section remaps the non-prefixed symbols to the prefixed symbols so that +// code written against this build can be used independent of the value of +// NANOARROW_NAMESPACE. 
+#ifdef NANOARROW_NAMESPACE +#define NANOARROW_CAT(A, B) A##B +#define NANOARROW_SYMBOL(A, B) NANOARROW_CAT(A, B) + +#define ArrowNanoarrowVersion NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersion) +#define ArrowNanoarrowVersionInt \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersionInt) +#define ArrowMalloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMalloc) +#define ArrowRealloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowRealloc) +#define ArrowFree NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowFree) +#define ArrowBufferAllocatorDefault \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferAllocatorDefault) +#define ArrowBufferDeallocator \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferDeallocator) +#define ArrowErrorSet NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowErrorSet) +#define ArrowLayoutInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowLayoutInit) +#define ArrowDecimalSetDigits NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDecimalSetDigits) +#define ArrowDecimalAppendDigitsToBuffer \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDecimalAppendDigitsToBuffer) +#define ArrowSchemaInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInit) +#define ArrowSchemaInitFromType \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInitFromType) +#define ArrowSchemaSetType NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetType) +#define ArrowSchemaSetTypeStruct \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeStruct) +#define ArrowSchemaSetTypeFixedSize \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeFixedSize) +#define ArrowSchemaSetTypeDecimal \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDecimal) +#define ArrowSchemaSetTypeRunEndEncoded \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeRunEndEncoded) +#define ArrowSchemaSetTypeDateTime \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDateTime) +#define ArrowSchemaSetTypeUnion \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeUnion) +#define ArrowSchemaDeepCopy NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaDeepCopy) +#define ArrowSchemaSetFormat NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetFormat) +#define ArrowSchemaSetName NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetName) +#define ArrowSchemaSetMetadata \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetMetadata) +#define ArrowSchemaAllocateChildren \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateChildren) +#define ArrowSchemaAllocateDictionary \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateDictionary) +#define ArrowMetadataReaderInit \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderInit) +#define ArrowMetadataReaderRead \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderRead) +#define ArrowMetadataSizeOf NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataSizeOf) +#define ArrowMetadataHasKey NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataHasKey) +#define ArrowMetadataGetValue NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataGetValue) +#define ArrowMetadataBuilderInit \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderInit) +#define ArrowMetadataBuilderAppend \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderAppend) +#define ArrowMetadataBuilderSet \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderSet) +#define ArrowMetadataBuilderRemove \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderRemove) +#define ArrowSchemaViewInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaViewInit) 
+#define ArrowSchemaToString NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaToString) +#define ArrowArrayInitFromType \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromType) +#define ArrowArrayInitFromSchema \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromSchema) +#define ArrowArrayInitFromArrayView \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView) +#define ArrowArrayInitFromArrayView \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView) +#define ArrowArrayAllocateChildren \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateChildren) +#define ArrowArrayAllocateDictionary \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateDictionary) +#define ArrowArraySetValidityBitmap \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetValidityBitmap) +#define ArrowArraySetBuffer NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetBuffer) +#define ArrowArrayReserve NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayReserve) +#define ArrowArrayFinishBuilding \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuilding) +#define ArrowArrayFinishBuildingDefault \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuildingDefault) +#define ArrowArrayViewInitFromType \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromType) +#define ArrowArrayViewInitFromSchema \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromSchema) +#define ArrowArrayViewAllocateChildren \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateChildren) +#define ArrowArrayViewAllocateDictionary \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateDictionary) +#define ArrowArrayViewSetLength \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetLength) +#define ArrowArrayViewSetArray \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArray) +#define ArrowArrayViewSetArrayMinimal \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArrayMinimal) +#define ArrowArrayViewValidate \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewValidate) +#define ArrowArrayViewCompare NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewCompare) +#define ArrowArrayViewReset NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewReset) +#define ArrowBasicArrayStreamInit \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamInit) +#define ArrowBasicArrayStreamSetArray \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamSetArray) +#define ArrowBasicArrayStreamValidate \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamValidate) + +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/// \defgroup nanoarrow Nanoarrow C library +/// +/// Except where noted, objects are not thread-safe and clients should +/// take care to serialize accesses to methods. +/// +/// Because this library is intended to be vendored, it provides full type +/// definitions and encourages clients to stack or statically allocate +/// where convenient. + +/// \defgroup nanoarrow-malloc Memory management +/// +/// Non-buffer members of a struct ArrowSchema and struct ArrowArray +/// must be allocated using ArrowMalloc() or ArrowRealloc() and freed +/// using ArrowFree() for schemas and arrays allocated here. Buffer members +/// are allocated using an ArrowBufferAllocator. 
+/// +/// @{ + +/// \brief Allocate like malloc() +void* ArrowMalloc(int64_t size); + +/// \brief Reallocate like realloc() +void* ArrowRealloc(void* ptr, int64_t size); + +/// \brief Free a pointer allocated using ArrowMalloc() or ArrowRealloc(). +void ArrowFree(void* ptr); + +/// \brief Return the default allocator +/// +/// The default allocator uses ArrowMalloc(), ArrowRealloc(), and +/// ArrowFree(). +struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void); + +/// \brief Create a custom deallocator +/// +/// Creates a buffer allocator with only a free method that can be used to +/// attach a custom deallocator to an ArrowBuffer. This may be used to +/// avoid copying an existing buffer that was not allocated using the +/// infrastructure provided here (e.g., by an R or Python object). +struct ArrowBufferAllocator ArrowBufferDeallocator(ArrowBufferDeallocatorCallback, + void* private_data); + +/// @} + +/// \brief Move the contents of an src ArrowSchema into dst and set src->release to NULL +/// \ingroup nanoarrow-arrow-cdata +static inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dst); + +/// \brief Call the release callback of an ArrowSchema +/// \ingroup nanoarrow-arrow-cdata +static inline void ArrowSchemaRelease(struct ArrowSchema* schema); + +/// \brief Move the contents of an src ArrowArray into dst and set src->release to NULL +/// \ingroup nanoarrow-arrow-cdata +static inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dst); + +/// \brief Call the release callback of an ArrowArray +static inline void ArrowArrayRelease(struct ArrowArray* array); + +/// \brief Move the contents of an src ArrowArrayStream into dst and set src->release to +/// NULL \ingroup nanoarrow-arrow-cdata +static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, + struct ArrowArrayStream* dst); + +/// \brief Call the get_schema callback of an ArrowArrayStream +/// \ingroup nanoarrow-arrow-cdata +/// +/// Unlike the get_schema callback, this wrapper checks the return code +/// and propagates the error reported by get_last_error into error. This +/// makes it significantly less verbose to iterate over array streams +/// using NANOARROW_RETURN_NOT_OK()-style error handling. +static inline ArrowErrorCode ArrowArrayStreamGetSchema( + struct ArrowArrayStream* array_stream, struct ArrowSchema* out, + struct ArrowError* error); + +/// \brief Call the get_schema callback of an ArrowArrayStream +/// \ingroup nanoarrow-arrow-cdata +/// +/// Unlike the get_next callback, this wrapper checks the return code +/// and propagates the error reported by get_last_error into error. This +/// makes it significantly less verbose to iterate over array streams +/// using NANOARROW_RETURN_NOT_OK()-style error handling. +static inline ArrowErrorCode ArrowArrayStreamGetNext( + struct ArrowArrayStream* array_stream, struct ArrowArray* out, + struct ArrowError* error); + +/// \brief Call the get_next callback of an ArrowArrayStream +/// \ingroup nanoarrow-arrow-cdata +/// +/// Unlike the get_next callback, this function never returns NULL (i.e., its +/// result is safe to use in printf-style error formatters). Null values from the +/// original callback are reported as "". 
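// A usage sketch, not part of the vendored header: attaching a custom deallocator
// so externally allocated bytes are freed with ArrowFree() rather than copied.
// example_free_callback() and wrap_external_bytes() are hypothetical names, and the
// buffer fields are assigned directly after ArrowBufferInit()/ArrowBufferSetAllocator().
static void example_free_callback(struct ArrowBufferAllocator* allocator, uint8_t* ptr,
                                  int64_t size) {
  NANOARROW_UNUSED(allocator);
  NANOARROW_UNUSED(size);
  ArrowFree(ptr);
}

static ArrowErrorCode wrap_external_bytes(struct ArrowBuffer* buffer, uint8_t* data,
                                          int64_t size) {
  ArrowBufferInit(buffer);
  NANOARROW_RETURN_NOT_OK(ArrowBufferSetAllocator(
      buffer, ArrowBufferDeallocator(example_free_callback, NULL)));
  buffer->data = data;
  buffer->size_bytes = size;
  buffer->capacity_bytes = size;
  return NANOARROW_OK;
}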
+static inline const char* ArrowArrayStreamGetLastError( + struct ArrowArrayStream* array_stream); + +/// \brief Call the release callback of an ArrowArrayStream +static inline void ArrowArrayStreamRelease(struct ArrowArrayStream* array_stream); + +/// \defgroup nanoarrow-errors Error handling +/// +/// Functions generally return an errno-compatible error code; functions that +/// need to communicate more verbose error information accept a pointer +/// to an ArrowError. This can be stack or statically allocated. The +/// content of the message is undefined unless an error code has been +/// returned. If a nanoarrow function is passed a non-null ArrowError pointer, the +/// ArrowError pointed to by the argument will be propagated with a +/// null-terminated error message. It is safe to pass a NULL ArrowError anywhere +/// in the nanoarrow API. +/// +/// Except where documented, it is generally not safe to continue after a +/// function has returned a non-zero ArrowErrorCode. The NANOARROW_RETURN_NOT_OK and +/// NANOARROW_ASSERT_OK macros are provided to help propagate errors. C++ clients can use +/// the helpers provided in the nanoarrow.hpp header to facilitate using C++ idioms +/// for memory management and error propgagtion. +/// +/// @{ + +/// \brief Set the contents of an error using printf syntax. +/// +/// If error is NULL, this function does nothing and returns NANOARROW_OK. +NANOARROW_CHECK_PRINTF_ATTRIBUTE int ArrowErrorSet(struct ArrowError* error, + const char* fmt, ...); + +/// @} + +/// \defgroup nanoarrow-utils Utility data structures +/// +/// @{ + +/// \brief Return a version string in the form "major.minor.patch" +const char* ArrowNanoarrowVersion(void); + +/// \brief Return an integer that can be used to compare versions sequentially +int ArrowNanoarrowVersionInt(void); + +/// \brief Initialize a description of buffer arrangements from a storage type +void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type); + +/// \brief Create a string view from a null-terminated string +static inline struct ArrowStringView ArrowCharView(const char* value); + +/// \brief Sets the integer value of an ArrowDecimal from a string +ArrowErrorCode ArrowDecimalSetDigits(struct ArrowDecimal* decimal, + struct ArrowStringView value); + +/// \brief Get the integer value of an ArrowDecimal as string +ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decimal, + struct ArrowBuffer* buffer); + +/// \brief Get the half float value of a float +static inline uint16_t ArrowFloatToHalfFloat(float value); + +/// \brief Get the float value of a half float +static inline float ArrowHalfFloatToFloat(uint16_t value); + +/// \brief Resolve a chunk index from increasing int64_t offsets +/// +/// Given a buffer of increasing int64_t offsets that begin with 0 (e.g., offset buffer +/// of a large type, run ends of a chunked array implementation), resolve a value v +/// where lo <= v < hi such that offsets[v] <= index < offsets[v + 1]. +static inline int64_t ArrowResolveChunk64(int64_t index, const int64_t* offsets, + int64_t lo, int64_t hi); + +/// @} + +/// \defgroup nanoarrow-schema Creating schemas +/// +/// These functions allocate, copy, and destroy ArrowSchema structures +/// +/// @{ + +/// \brief Initialize an ArrowSchema +/// +/// Initializes the fields and release callback of schema_out. Caller +/// is responsible for calling the schema->release callback if +/// NANOARROW_OK is returned. 
+void ArrowSchemaInit(struct ArrowSchema* schema); + +/// \brief Initialize an ArrowSchema from an ArrowType +/// +/// A convenience constructor for that calls ArrowSchemaInit() and +/// ArrowSchemaSetType() for the common case of constructing an +/// unparameterized type. The caller is responsible for calling the schema->release +/// callback if NANOARROW_OK is returned. +ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowType type); + +/// \brief Get a human-readable summary of a Schema +/// +/// Writes a summary of an ArrowSchema to out (up to n - 1 characters) +/// and returns the number of characters required for the output if +/// n were sufficiently large. If recursive is non-zero, the result will +/// also include children. +int64_t ArrowSchemaToString(const struct ArrowSchema* schema, char* out, int64_t n, + char recursive); + +/// \brief Set the format field of a schema from an ArrowType +/// +/// Initializes the fields and release callback of schema_out. For +/// NANOARROW_TYPE_LIST, NANOARROW_TYPE_LARGE_LIST, and +/// NANOARROW_TYPE_MAP, the appropriate number of children are +/// allocated, initialized, and named; however, the caller must +/// ArrowSchemaSetType() on the preinitialized children. Schema must have been initialized +/// using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetType(struct ArrowSchema* schema, enum ArrowType type); + +/// \brief Set the format field and initialize children of a struct schema +/// +/// The specified number of children are initialized; however, the caller is responsible +/// for calling ArrowSchemaSetType() and ArrowSchemaSetName() on each child. +/// Schema must have been initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetTypeStruct(struct ArrowSchema* schema, int64_t n_children); + +/// \brief Set the format field of a fixed-size schema +/// +/// Returns EINVAL for fixed_size <= 0 or for type that is not +/// NANOARROW_TYPE_FIXED_SIZE_BINARY or NANOARROW_TYPE_FIXED_SIZE_LIST. +/// For NANOARROW_TYPE_FIXED_SIZE_LIST, the appropriate number of children are +/// allocated, initialized, and named; however, the caller must +/// ArrowSchemaSetType() the first child. Schema must have been initialized using +/// ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct ArrowSchema* schema, + enum ArrowType type, int32_t fixed_size); + +/// \brief Set the format field of a decimal schema +/// +/// Returns EINVAL for scale <= 0 or for type that is not +/// NANOARROW_TYPE_DECIMAL128 or NANOARROW_TYPE_DECIMAL256. Schema must have been +/// initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowType type, + int32_t decimal_precision, + int32_t decimal_scale); + +/// \brief Set the format field of a run-end encoded schema +/// +/// Returns EINVAL for run_end_type that is not +/// NANOARROW_TYPE_INT16, NANOARROW_TYPE_INT32 or NANOARROW_TYPE_INT64. +/// Schema must have been initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +/// The caller must call `ArrowSchemaSetTypeXXX(schema->children[1])` to +/// set the value type. Note that when building arrays using the `ArrowArrayAppendXXX()` +/// functions, the run-end encoded array's logical length must be updated manually. 
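// A usage sketch, not part of the vendored header: building a two-column struct
// schema with the constructors above. make_example_schema() and the column names
// are illustrative; the caller releases the result with ArrowSchemaRelease().
static ArrowErrorCode make_example_schema(struct ArrowSchema* schema) {
  ArrowSchemaInit(schema);
  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeStruct(schema, 2));
  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema->children[0], NANOARROW_TYPE_INT64));
  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "id"));
  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema->children[1], NANOARROW_TYPE_STRING));
  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[1], "name"));
  return NANOARROW_OK;
}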
+ArrowErrorCode ArrowSchemaSetTypeRunEndEncoded(struct ArrowSchema* schema, + enum ArrowType run_end_type); + +/// \brief Set the format field of a time, timestamp, or duration schema +/// +/// Returns EINVAL for type that is not +/// NANOARROW_TYPE_TIME32, NANOARROW_TYPE_TIME64, +/// NANOARROW_TYPE_TIMESTAMP, or NANOARROW_TYPE_DURATION. The +/// timezone parameter must be NULL for a non-timestamp type. Schema must have been +/// initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum ArrowType type, + enum ArrowTimeUnit time_unit, + const char* timezone); + +/// \brief Set the format field of a union schema +/// +/// Returns EINVAL for a type that is not NANOARROW_TYPE_DENSE_UNION +/// or NANOARROW_TYPE_SPARSE_UNION. The specified number of children are +/// allocated, and initialized. +ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowType type, + int64_t n_children); + +/// \brief Make a (recursive) copy of a schema +/// +/// Allocates and copies fields of schema into schema_out. +ArrowErrorCode ArrowSchemaDeepCopy(const struct ArrowSchema* schema, + struct ArrowSchema* schema_out); + +/// \brief Copy format into schema->format +/// +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetFormat(struct ArrowSchema* schema, const char* format); + +/// \brief Copy name into schema->name +/// +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaSetName(struct ArrowSchema* schema, const char* name); + +/// \brief Copy metadata into schema->metadata +/// +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy. +ArrowErrorCode ArrowSchemaSetMetadata(struct ArrowSchema* schema, const char* metadata); + +/// \brief Allocate the schema->children array +/// +/// Includes the memory for each child struct ArrowSchema. +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaAllocateChildren(struct ArrowSchema* schema, + int64_t n_children); + +/// \brief Allocate the schema->dictionary member +/// +/// schema must have been allocated using ArrowSchemaInitFromType() or +/// ArrowSchemaDeepCopy(). +ArrowErrorCode ArrowSchemaAllocateDictionary(struct ArrowSchema* schema); + +/// @} + +/// \defgroup nanoarrow-metadata Create, read, and modify schema metadata +/// +/// @{ + +/// \brief Reader for key/value pairs in schema metadata +/// +/// The ArrowMetadataReader does not own any data and is only valid +/// for the lifetime of the underlying metadata pointer. +struct ArrowMetadataReader { + /// \brief A metadata string from a schema->metadata field. 
+ const char* metadata; + + /// \brief The current offset into the metadata string + int64_t offset; + + /// \brief The number of remaining keys + int32_t remaining_keys; +}; + +/// \brief Initialize an ArrowMetadataReader +ArrowErrorCode ArrowMetadataReaderInit(struct ArrowMetadataReader* reader, + const char* metadata); + +/// \brief Read the next key/value pair from an ArrowMetadataReader +ArrowErrorCode ArrowMetadataReaderRead(struct ArrowMetadataReader* reader, + struct ArrowStringView* key_out, + struct ArrowStringView* value_out); + +/// \brief The number of bytes in in a key/value metadata string +int64_t ArrowMetadataSizeOf(const char* metadata); + +/// \brief Check for a key in schema metadata +char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key); + +/// \brief Extract a value from schema metadata +/// +/// If key does not exist in metadata, value_out is unmodified +ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringView key, + struct ArrowStringView* value_out); + +/// \brief Initialize a builder for schema metadata from key/value pairs +/// +/// metadata can be an existing metadata string or NULL to initialize +/// an empty metadata string. +ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer, const char* metadata); + +/// \brief Append a key/value pair to a buffer containing serialized metadata +ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer, + struct ArrowStringView key, + struct ArrowStringView value); + +/// \brief Set a key/value pair to a buffer containing serialized metadata +/// +/// Ensures that the only entry for key in the metadata is set to value. +/// This function maintains the existing position of (the first instance of) +/// key if present in the data. +ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer, + struct ArrowStringView key, + struct ArrowStringView value); + +/// \brief Remove a key from a buffer containing serialized metadata +ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, + struct ArrowStringView key); + +/// @} + +/// \defgroup nanoarrow-schema-view Reading schemas +/// +/// @{ + +/// \brief A non-owning view of a parsed ArrowSchema +/// +/// Contains more readily extractable values than a raw ArrowSchema. +/// Clients can stack or statically allocate this structure but are +/// encouraged to use the provided getters to ensure forward +/// compatibility. +struct ArrowSchemaView { + /// \brief A pointer to the schema represented by this view + const struct ArrowSchema* schema; + + /// \brief The data type represented by the schema + /// + /// This value may be NANOARROW_TYPE_DICTIONARY if the schema has a + /// non-null dictionary member; datetime types are valid values. + /// This value will never be NANOARROW_TYPE_EXTENSION (see + /// extension_name and/or extension_metadata to check for + /// an extension type). + enum ArrowType type; + + /// \brief The storage data type represented by the schema + /// + /// This value will never be NANOARROW_TYPE_DICTIONARY, NANOARROW_TYPE_EXTENSION + /// or any datetime type. This value represents only the type required to + /// interpret the buffers in the array. + enum ArrowType storage_type; + + /// \brief The storage layout represented by the schema + struct ArrowLayout layout; + + /// \brief The extension type name if it exists + /// + /// If the ARROW:extension:name key is present in schema.metadata, + /// extension_name.data will be non-NULL. 
+ struct ArrowStringView extension_name; + + /// \brief The extension type metadata if it exists + /// + /// If the ARROW:extension:metadata key is present in schema.metadata, + /// extension_metadata.data will be non-NULL. + struct ArrowStringView extension_metadata; + + /// \brief Format fixed size parameter + /// + /// This value is set when parsing a fixed-size binary or fixed-size + /// list schema; this value is undefined for other types. For a + /// fixed-size binary schema this value is in bytes; for a fixed-size + /// list schema this value refers to the number of child elements for + /// each element of the parent. + int32_t fixed_size; + + /// \brief Decimal bitwidth + /// + /// This value is set when parsing a decimal type schema; + /// this value is undefined for other types. + int32_t decimal_bitwidth; + + /// \brief Decimal precision + /// + /// This value is set when parsing a decimal type schema; + /// this value is undefined for other types. + int32_t decimal_precision; + + /// \brief Decimal scale + /// + /// This value is set when parsing a decimal type schema; + /// this value is undefined for other types. + int32_t decimal_scale; + + /// \brief Format time unit parameter + /// + /// This value is set when parsing a date/time type. The value is + /// undefined for other types. + enum ArrowTimeUnit time_unit; + + /// \brief Format timezone parameter + /// + /// This value is set when parsing a timestamp type and represents + /// the timezone format parameter. This value points to + /// data within the schema and is undefined for other types. + const char* timezone; + + /// \brief Union type ids parameter + /// + /// This value is set when parsing a union type and represents + /// type ids parameter. This value points to + /// data within the schema and is undefined for other types. + const char* union_type_ids; +}; + +/// \brief Initialize an ArrowSchemaView +ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, + const struct ArrowSchema* schema, + struct ArrowError* error); + +/// @} + +/// \defgroup nanoarrow-buffer Owning, growable buffers +/// +/// @{ + +/// \brief Initialize an ArrowBuffer +/// +/// Initialize a buffer with a NULL, zero-size buffer using the default +/// buffer allocator. +static inline void ArrowBufferInit(struct ArrowBuffer* buffer); + +/// \brief Set a newly-initialized buffer's allocator +/// +/// Returns EINVAL if the buffer has already been allocated. +static inline ArrowErrorCode ArrowBufferSetAllocator( + struct ArrowBuffer* buffer, struct ArrowBufferAllocator allocator); + +/// \brief Reset an ArrowBuffer +/// +/// Releases the buffer using the allocator's free method if +/// the buffer's data member is non-null, sets the data member +/// to NULL, and sets the buffer's size and capacity to 0. +static inline void ArrowBufferReset(struct ArrowBuffer* buffer); + +/// \brief Move an ArrowBuffer +/// +/// Transfers the buffer data and lifecycle management to another +/// address and resets buffer. +static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* dst); + +/// \brief Grow or shrink a buffer to a given size +/// +/// When shrinking the size of the buffer, the buffer is only reallocated +/// if shrink_to_fit is non-zero. 
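// A usage sketch, not part of the vendored header, of the growable buffer helpers
// in this group; fill_example_buffer() is a hypothetical name.
static ArrowErrorCode fill_example_buffer(void) {
  struct ArrowBuffer buffer;
  ArrowBufferInit(&buffer);
  for (int32_t i = 0; i < 3; i++) {
    // Reallocates (with overallocation) as needed; size_bytes grows by 4 each time.
    NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(&buffer, i));
  }
  // buffer.size_bytes is now 12; release the memory via the buffer's allocator.
  ArrowBufferReset(&buffer);
  return NANOARROW_OK;
}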
+static inline ArrowErrorCode ArrowBufferResize(struct ArrowBuffer* buffer, + int64_t new_size_bytes, + char shrink_to_fit); + +/// \brief Ensure a buffer has at least a given additional capacity +/// +/// Ensures that the buffer has space to append at least +/// additional_size_bytes, overallocating when required. +static inline ArrowErrorCode ArrowBufferReserve(struct ArrowBuffer* buffer, + int64_t additional_size_bytes); + +/// \brief Write data to buffer and increment the buffer size +/// +/// This function does not check that buffer has the required capacity +static inline void ArrowBufferAppendUnsafe(struct ArrowBuffer* buffer, const void* data, + int64_t size_bytes); + +/// \brief Write data to buffer and increment the buffer size +/// +/// This function writes and ensures that the buffer has the required capacity, +/// possibly by reallocating the buffer. Like ArrowBufferReserve, this will +/// overallocate when reallocation is required. +static inline ArrowErrorCode ArrowBufferAppend(struct ArrowBuffer* buffer, + const void* data, int64_t size_bytes); + +/// \brief Write fill to buffer and increment the buffer size +/// +/// This function writes the specified number of fill bytes and +/// ensures that the buffer has the required capacity, +static inline ArrowErrorCode ArrowBufferAppendFill(struct ArrowBuffer* buffer, + uint8_t value, int64_t size_bytes); + +/// \brief Write an 8-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendInt8(struct ArrowBuffer* buffer, + int8_t value); + +/// \brief Write an unsigned 8-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendUInt8(struct ArrowBuffer* buffer, + uint8_t value); + +/// \brief Write a 16-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendInt16(struct ArrowBuffer* buffer, + int16_t value); + +/// \brief Write an unsigned 16-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendUInt16(struct ArrowBuffer* buffer, + uint16_t value); + +/// \brief Write a 32-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendInt32(struct ArrowBuffer* buffer, + int32_t value); + +/// \brief Write an unsigned 32-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendUInt32(struct ArrowBuffer* buffer, + uint32_t value); + +/// \brief Write a 64-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendInt64(struct ArrowBuffer* buffer, + int64_t value); + +/// \brief Write an unsigned 64-bit integer to a buffer +static inline ArrowErrorCode ArrowBufferAppendUInt64(struct ArrowBuffer* buffer, + uint64_t value); + +/// \brief Write a double to a buffer +static inline ArrowErrorCode ArrowBufferAppendDouble(struct ArrowBuffer* buffer, + double value); + +/// \brief Write a float to a buffer +static inline ArrowErrorCode ArrowBufferAppendFloat(struct ArrowBuffer* buffer, + float value); + +/// \brief Write an ArrowStringView to a buffer +static inline ArrowErrorCode ArrowBufferAppendStringView(struct ArrowBuffer* buffer, + struct ArrowStringView value); + +/// \brief Write an ArrowBufferView to a buffer +static inline ArrowErrorCode ArrowBufferAppendBufferView(struct ArrowBuffer* buffer, + struct ArrowBufferView value); + +/// @} + +/// \defgroup nanoarrow-bitmap Bitmap utilities +/// +/// @{ + +/// \brief Extract a boolean value from a bitmap +static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i); + +/// \brief Set a boolean value to a bitmap to true +static inline void ArrowBitSet(uint8_t* bits, int64_t i); + +/// \brief Set 
a boolean value to a bitmap to false +static inline void ArrowBitClear(uint8_t* bits, int64_t i); + +/// \brief Set a boolean value to a bitmap +static inline void ArrowBitSetTo(uint8_t* bits, int64_t i, uint8_t value); + +/// \brief Set a boolean value to a range in a bitmap +static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t length, + uint8_t bits_are_set); + +/// \brief Count true values in a bitmap +static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, int64_t i_to); + +/// \brief Extract int8 boolean values from a range in a bitmap +static inline void ArrowBitsUnpackInt8(const uint8_t* bits, int64_t start_offset, + int64_t length, int8_t* out); + +/// \brief Extract int32 boolean values from a range in a bitmap +static inline void ArrowBitsUnpackInt32(const uint8_t* bits, int64_t start_offset, + int64_t length, int32_t* out); + +/// \brief Initialize an ArrowBitmap +/// +/// Initialize the builder's buffer, empty its cache, and reset the size to zero +static inline void ArrowBitmapInit(struct ArrowBitmap* bitmap); + +/// \brief Move an ArrowBitmap +/// +/// Transfers the underlying buffer data and lifecycle management to another +/// address and resets the bitmap. +static inline void ArrowBitmapMove(struct ArrowBitmap* src, struct ArrowBitmap* dst); + +/// \brief Ensure a bitmap builder has at least a given additional capacity +/// +/// Ensures that the buffer has space to append at least +/// additional_size_bits, overallocating when required. +static inline ArrowErrorCode ArrowBitmapReserve(struct ArrowBitmap* bitmap, + int64_t additional_size_bits); + +/// \brief Grow or shrink a bitmap to a given size +/// +/// When shrinking the size of the bitmap, the bitmap is only reallocated +/// if shrink_to_fit is non-zero. +static inline ArrowErrorCode ArrowBitmapResize(struct ArrowBitmap* bitmap, + int64_t new_size_bits, char shrink_to_fit); + +/// \brief Reserve space for and append zero or more of the same boolean value to a bitmap +static inline ArrowErrorCode ArrowBitmapAppend(struct ArrowBitmap* bitmap, + uint8_t bits_are_set, int64_t length); + +/// \brief Append zero or more of the same boolean value to a bitmap +static inline void ArrowBitmapAppendUnsafe(struct ArrowBitmap* bitmap, + uint8_t bits_are_set, int64_t length); + +/// \brief Append boolean values encoded as int8_t to a bitmap +/// +/// The values must all be 0 or 1. +static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, + const int8_t* values, int64_t n_values); + +/// \brief Append boolean values encoded as int32_t to a bitmap +/// +/// The values must all be 0 or 1. +static inline void ArrowBitmapAppendInt32Unsafe(struct ArrowBitmap* bitmap, + const int32_t* values, int64_t n_values); + +/// \brief Reset a bitmap builder +/// +/// Releases any memory held by buffer, empties the cache, and resets the size to zero +static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap); + +/// @} + +/// \defgroup nanoarrow-array Creating arrays +/// +/// These functions allocate, copy, and destroy ArrowArray structures. +/// Once an ArrowArray has been initialized via ArrowArrayInitFromType() +/// or ArrowArrayInitFromSchema(), the caller is responsible for releasing +/// it using the embedded release callback. +/// +/// @{ + +/// \brief Initialize the fields of an array +/// +/// Initializes the fields and release callback of array. Caller +/// is responsible for calling the array->release callback if +/// NANOARROW_OK is returned. 
+ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, + enum ArrowType storage_type); + +/// \brief Initialize the contents of an ArrowArray from an ArrowSchema +/// +/// Caller is responsible for calling the array->release callback if +/// NANOARROW_OK is returned. +ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array, + const struct ArrowSchema* schema, + struct ArrowError* error); + +/// \brief Initialize the contents of an ArrowArray from an ArrowArrayView +/// +/// Caller is responsible for calling the array->release callback if +/// NANOARROW_OK is returned. +ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, + const struct ArrowArrayView* array_view, + struct ArrowError* error); + +/// \brief Allocate the array->children array +/// +/// Includes the memory for each child struct ArrowArray, +/// whose members are marked as released and may be subsequently initialized +/// with ArrowArrayInitFromType() or moved from an existing ArrowArray. +/// schema must have been allocated using ArrowArrayInitFromType(). +ArrowErrorCode ArrowArrayAllocateChildren(struct ArrowArray* array, int64_t n_children); + +/// \brief Allocate the array->dictionary member +/// +/// Includes the memory for the struct ArrowArray, whose contents +/// is marked as released and may be subsequently initialized +/// with ArrowArrayInitFromType() or moved from an existing ArrowArray. +/// array must have been allocated using ArrowArrayInitFromType() +ArrowErrorCode ArrowArrayAllocateDictionary(struct ArrowArray* array); + +/// \brief Set the validity bitmap of an ArrowArray +/// +/// array must have been allocated using ArrowArrayInitFromType() +void ArrowArraySetValidityBitmap(struct ArrowArray* array, struct ArrowBitmap* bitmap); + +/// \brief Set a buffer of an ArrowArray +/// +/// array must have been allocated using ArrowArrayInitFromType() +ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* array, int64_t i, + struct ArrowBuffer* buffer); + +/// \brief Get the validity bitmap of an ArrowArray +/// +/// array must have been allocated using ArrowArrayInitFromType() +static inline struct ArrowBitmap* ArrowArrayValidityBitmap(struct ArrowArray* array); + +/// \brief Get a buffer of an ArrowArray +/// +/// array must have been allocated using ArrowArrayInitFromType() +static inline struct ArrowBuffer* ArrowArrayBuffer(struct ArrowArray* array, int64_t i); + +/// \brief Start element-wise appending to an ArrowArray +/// +/// Initializes any values needed to use ArrowArrayAppend*() functions. +/// All element-wise appenders append by value and return EINVAL if the exact value +/// cannot be represented by the underlying storage type. +/// array must have been allocated using ArrowArrayInitFromType() +static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array); + +/// \brief Reserve space for future appends +/// +/// For buffer sizes that can be calculated (i.e., not string data buffers or +/// child array sizes for non-fixed-size arrays), recursively reserve space for +/// additional elements. This is useful for reducing the number of reallocations +/// that occur using the item-wise appenders. 
+ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array, + int64_t additional_size_elements); + +/// \brief Append a null value to an array +static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int64_t n); + +/// \brief Append an empty, non-null value to an array +static inline ArrowErrorCode ArrowArrayAppendEmpty(struct ArrowArray* array, int64_t n); + +/// \brief Append a signed integer value to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise (e.g., value +/// is outside the valid array range). +static inline ArrowErrorCode ArrowArrayAppendInt(struct ArrowArray* array, int64_t value); + +/// \brief Append an unsigned integer value to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise (e.g., value +/// is outside the valid array range). +static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array, + uint64_t value); + +/// \brief Append a double value to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise (e.g., value +/// is outside the valid array range or there is an attempt to append +/// a non-integer to an array with an integer storage type). +static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array, + double value); + +/// \brief Append a string of bytes to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type, EOVERFLOW if appending value would overflow +/// the offset type (e.g., if the data buffer would be larger than 2 GB for a +/// non-large string type), or EINVAL otherwise (e.g., the underlying array is not a +/// binary, string, large binary, large string, or fixed-size binary array, or value is +/// the wrong size for a fixed-size binary array). +static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array, + struct ArrowBufferView value); + +/// \brief Append a string value to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type, EOVERFLOW if appending value would overflow +/// the offset type (e.g., if the data buffer would be larger than 2 GB for a +/// non-large string type), or EINVAL otherwise (e.g., the underlying array is not a +/// string or large string array). +static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array, + struct ArrowStringView value); + +/// \brief Append a Interval to an array +/// +/// Returns NANOARROW_OK if value can be exactly represented by +/// the underlying storage type or EINVAL otherwise. +static inline ArrowErrorCode ArrowArrayAppendInterval(struct ArrowArray* array, + const struct ArrowInterval* value); + +/// \brief Append a decimal value to an array +/// +/// Returns NANOARROW_OK if array is a decimal array with the appropriate +/// bitwidth or EINVAL otherwise. +static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array, + const struct ArrowDecimal* value); + +/// \brief Finish a nested array element +/// +/// Appends a non-null element to the array based on the first child's current +/// length. 
Returns NANOARROW_OK if the item was successfully added, EOVERFLOW +/// if the child of a list or map array would exceed INT_MAX elements, or EINVAL +/// if the underlying storage type is not a struct, list, large list, or fixed-size +/// list, or if there was an attempt to add a struct or fixed-size list element where the +/// length of the child array(s) did not match the expected length. +static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array); + +/// \brief Finish a union array element +/// +/// Appends an element to the union type ids buffer and increments array->length. +/// For sparse unions, up to one element is added to non type-id children. Returns +/// EINVAL if the underlying storage type is not a union, if type_id is not valid, +/// or if child sizes after appending are inconsistent. +static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* array, + int8_t type_id); + +/// \brief Shrink buffer capacity to the size required +/// +/// Also applies shrinking to any child arrays. array must have been allocated using +/// ArrowArrayInitFromType +static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array); + +/// \brief Finish building an ArrowArray +/// +/// Flushes any pointers from internal buffers that may have been reallocated +/// into array->buffers and checks the actual size of the buffers +/// against the expected size based on the final length. +/// array must have been allocated using ArrowArrayInitFromType() +ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array, + struct ArrowError* error); + +/// \brief Finish building an ArrowArray with explicit validation +/// +/// Finish building with an explicit validation level. This could perform less validation +/// (i.e. NANOARROW_VALIDATION_LEVEL_NONE or NANOARROW_VALIDATION_LEVEL_MINIMAL) if CPU +/// buffer data access is not possible or more validation (i.e., +/// NANOARROW_VALIDATION_LEVEL_FULL) if buffer content was obtained from an untrusted or +/// corruptible source. +ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, + enum ArrowValidationLevel validation_level, + struct ArrowError* error); + +/// @} + +/// \defgroup nanoarrow-array-view Reading arrays +/// +/// These functions read and validate the contents ArrowArray structures. +/// +/// @{ + +/// \brief Initialize the contents of an ArrowArrayView +void ArrowArrayViewInitFromType(struct ArrowArrayView* array_view, + enum ArrowType storage_type); + +/// \brief Move an ArrowArrayView +/// +/// Transfers the ArrowArrayView data and lifecycle management to another +/// address and resets the contents of src. 
+static inline void ArrowArrayViewMove(struct ArrowArrayView* src, + struct ArrowArrayView* dst); + +/// \brief Initialize the contents of an ArrowArrayView from an ArrowSchema +ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, + const struct ArrowSchema* schema, + struct ArrowError* error); + +/// \brief Allocate the array_view->children array +/// +/// Includes the memory for each child struct ArrowArrayView +ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view, + int64_t n_children); + +/// \brief Allocate array_view->dictionary +ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view); + +/// \brief Set data-independent buffer sizes from length +void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length); + +/// \brief Set buffer sizes and data pointers from an ArrowArray +ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error); + +/// \brief Set buffer sizes and data pointers from an ArrowArray except for those +/// that require dereferencing buffer content. +ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view, + const struct ArrowArray* array, + struct ArrowError* error); + +/// \brief Get the number of buffers +/// +/// The number of buffers referred to by this ArrowArrayView. In may cases this can also +/// be calculated from the ArrowLayout member of the ArrowArrayView or ArrowSchemaView; +/// however, for binary view and string view types, the number of total buffers depends on +/// the number of variadic buffers. +static inline int64_t ArrowArrayViewGetNumBuffers(struct ArrowArrayView* array_view); + +/// \brief Get a view of a specific buffer from an ArrowArrayView +/// +/// This is the ArrowArrayView equivalent of ArrowArray::buffers[i] that includes +/// size information (if known). +static inline struct ArrowBufferView ArrowArrayViewGetBufferView( + struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get the function of a specific buffer in an ArrowArrayView +/// +/// In may cases this can also be obtained from the ArrowLayout member of the +/// ArrowArrayView or ArrowSchemaView; however, for binary view and string view types, +/// the function of each buffer may be different between two arrays of the same type +/// depending on the number of variadic buffers. +static inline enum ArrowBufferType ArrowArrayViewGetBufferType( + struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get the data type of a specific buffer in an ArrowArrayView +/// +/// In may cases this can also be obtained from the ArrowLayout member of the +/// ArrowArrayView or ArrowSchemaView; however, for binary view and string view types, +/// the data type of each buffer may be different between two arrays of the same type +/// depending on the number of variadic buffers. +static inline enum ArrowType ArrowArrayViewGetBufferDataType( + struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get the element size (in bits) of a specific buffer in an ArrowArrayView +/// +/// In may cases this can also be obtained from the ArrowLayout member of the +/// ArrowArrayView or ArrowSchemaView; however, for binary view and string view types, +/// the element width of each buffer may be different between two arrays of the same type +/// depending on the number of variadic buffers. 
+static inline int64_t ArrowArrayViewGetBufferElementSizeBits( + struct ArrowArrayView* array_view, int64_t i); + +/// \brief Performs checks on the content of an ArrowArrayView +/// +/// If using ArrowArrayViewSetArray() to back array_view with an ArrowArray, +/// the buffer sizes and some content (fist and last offset) have already +/// been validated at the "default" level. If setting the buffer pointers +/// and sizes otherwise, you may wish to perform checks at a different level. See +/// documentation for ArrowValidationLevel for the details of checks performed +/// at each level. +ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view, + enum ArrowValidationLevel validation_level, + struct ArrowError* error); + +/// \brief Compare two ArrowArrayView objects for equality +/// +/// Given two ArrowArrayView instances, place either 0 (not equal) and +/// 1 (equal) at the address pointed to by out. If the comparison determines +/// that actual and expected are not equal, a reason will be communicated via +/// error if error is non-NULL. +/// +/// Returns NANOARROW_OK if the comparison completed successfully. +ArrowErrorCode ArrowArrayViewCompare(const struct ArrowArrayView* actual, + const struct ArrowArrayView* expected, + enum ArrowCompareLevel level, int* out, + struct ArrowError* reason); + +/// \brief Reset the contents of an ArrowArrayView and frees resources +void ArrowArrayViewReset(struct ArrowArrayView* array_view); + +/// \brief Check for a null element in an ArrowArrayView +static inline int8_t ArrowArrayViewIsNull(const struct ArrowArrayView* array_view, + int64_t i); + +/// \brief Compute null count for an ArrowArrayView +static inline int64_t ArrowArrayViewComputeNullCount( + const struct ArrowArrayView* array_view); + +/// \brief Get the type id of a union array element +static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView* array_view, + int64_t i); + +/// \brief Get the child index of a union array element +static inline int8_t ArrowArrayViewUnionChildIndex( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get the index to use into the relevant union child array +static inline int64_t ArrowArrayViewUnionChildOffset( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get an element in an ArrowArrayView as an integer +/// +/// This function does not check for null values, that values are actually integers, or +/// that values are within a valid range for an int64. +static inline int64_t ArrowArrayViewGetIntUnsafe(const struct ArrowArrayView* array_view, + int64_t i); + +/// \brief Get an element in an ArrowArrayView as an unsigned integer +/// +/// This function does not check for null values, that values are actually integers, or +/// that values are within a valid range for a uint64. +static inline uint64_t ArrowArrayViewGetUIntUnsafe( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get an element in an ArrowArrayView as a double +/// +/// This function does not check for null values, or +/// that values are within a valid range for a double. +static inline double ArrowArrayViewGetDoubleUnsafe( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get an element in an ArrowArrayView as an ArrowStringView +/// +/// This function does not check for null values. 
+static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get an element in an ArrowArrayView as an ArrowBufferView +/// +/// This function does not check for null values. +static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( + const struct ArrowArrayView* array_view, int64_t i); + +/// \brief Get an element in an ArrowArrayView as an ArrowDecimal +/// +/// This function does not check for null values. The out parameter must +/// be initialized with ArrowDecimalInit() with the proper parameters for this +/// type before calling this for the first time. +static inline void ArrowArrayViewGetDecimalUnsafe(const struct ArrowArrayView* array_view, + int64_t i, struct ArrowDecimal* out); + +/// @} + +/// \defgroup nanoarrow-basic-array-stream Basic ArrowArrayStream implementation +/// +/// An implementation of an ArrowArrayStream based on a collection of +/// zero or more previously-existing ArrowArray objects. Users should +/// initialize and/or validate the contents before transferring the +/// responsibility of the ArrowArrayStream elsewhere. +/// +/// @{ + +/// \brief Initialize an ArrowArrayStream backed by this implementation +/// +/// This function moves the ownership of schema to the array_stream. If +/// this function returns NANOARROW_OK, the caller is responsible for +/// releasing the ArrowArrayStream. +ArrowErrorCode ArrowBasicArrayStreamInit(struct ArrowArrayStream* array_stream, + struct ArrowSchema* schema, int64_t n_arrays); + +/// \brief Set the ith ArrowArray in this ArrowArrayStream. +/// +/// array_stream must have been initialized with ArrowBasicArrayStreamInit(). +/// This function move the ownership of array to the array_stream. i must +/// be greater than zero and less than the value of n_arrays passed in +/// ArrowBasicArrayStreamInit(). Callers are not required to fill all +/// n_arrays members (i.e., n_arrays is a maximum bound). +void ArrowBasicArrayStreamSetArray(struct ArrowArrayStream* array_stream, int64_t i, + struct ArrowArray* array); + +/// \brief Validate the contents of this ArrowArrayStream +/// +/// array_stream must have been initialized with ArrowBasicArrayStreamInit(). +/// This function uses ArrowArrayStreamInitFromSchema() and ArrowArrayStreamSetArray() +/// to validate the contents of the arrays. +ArrowErrorCode ArrowBasicArrayStreamValidate(const struct ArrowArrayStream* array_stream, + struct ArrowError* error); + +/// @} + +// Undefine ArrowErrorCode, which may have been defined to annotate functions that return +// it to warn for an unused result. +#if defined(ArrowErrorCode) +#undef ArrowErrorCode +#endif + +// Inline function definitions + + + +#ifdef __cplusplus +} +#endif + +#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_BUFFER_INLINE_H_INCLUDED +#define NANOARROW_BUFFER_INLINE_H_INCLUDED + +#include +#include +#include + + + +#ifdef __cplusplus +extern "C" { +#endif + +// Modified from Arrow C++ (1eb46f76) cpp/src/arrow/chunk_resolver.h#L133-L162 +static inline int64_t ArrowResolveChunk64(int64_t index, const int64_t* offsets, + int64_t lo, int64_t hi) { + // Similar to std::upper_bound(), but slightly different as our offsets + // array always starts with 0. + int64_t n = hi - lo; + // First iteration does not need to check for n > 1 + // (lo < hi is guaranteed by the precondition). + NANOARROW_DCHECK(n > 1); + do { + const int64_t m = n >> 1; + const int64_t mid = lo + m; + if (index >= offsets[mid]) { + lo = mid; + n -= m; + } else { + n = m; + } + } while (n > 1); + return lo; +} + +static inline int64_t ArrowResolveChunk32(int32_t index, const int32_t* offsets, + int32_t lo, int32_t hi) { + // Similar to std::upper_bound(), but slightly different as our offsets + // array always starts with 0. + int32_t n = hi - lo; + // First iteration does not need to check for n > 1 + // (lo < hi is guaranteed by the precondition). + NANOARROW_DCHECK(n > 1); + do { + const int32_t m = n >> 1; + const int32_t mid = lo + m; + if (index >= offsets[mid]) { + lo = mid; + n -= m; + } else { + n = m; + } + } while (n > 1); + return lo; +} + +static inline int64_t _ArrowGrowByFactor(int64_t current_capacity, int64_t new_capacity) { + int64_t doubled_capacity = current_capacity * 2; + if (doubled_capacity > new_capacity) { + return doubled_capacity; + } else { + return new_capacity; + } +} + +// float to half float conversion, adapted from Arrow Go +// https://github.com/apache/arrow/blob/main/go/arrow/float16/float16.go +static inline uint16_t ArrowFloatToHalfFloat(float value) { + union { + float f; + uint32_t b; + } u; + u.f = value; + + uint16_t sn = (uint16_t)((u.b >> 31) & 0x1); + uint16_t exp = (u.b >> 23) & 0xff; + int16_t res = (int16_t)(exp - 127 + 15); + uint16_t fc = (uint16_t)(u.b >> 13) & 0x3ff; + + if (exp == 0) { + res = 0; + } else if (exp == 0xff) { + res = 0x1f; + } else if (res > 0x1e) { + res = 0x1f; + fc = 0; + } else if (res < 0x01) { + res = 0; + fc = 0; + } + + return (uint16_t)((sn << 15) | (uint16_t)(res << 10) | fc); +} + +// half float to float conversion, adapted from Arrow Go +// https://github.com/apache/arrow/blob/main/go/arrow/float16/float16.go +static inline float ArrowHalfFloatToFloat(uint16_t value) { + uint32_t sn = (uint32_t)((value >> 15) & 0x1); + uint32_t exp = (value >> 10) & 0x1f; + uint32_t res = exp + 127 - 15; + uint32_t fc = value & 0x3ff; + + if (exp == 0) { + res = 0; + } else if (exp == 0x1f) { + res = 0xff; + } + + union { + float f; + uint32_t b; + } u; + u.b = (uint32_t)(sn << 31) | (uint32_t)(res << 23) | (uint32_t)(fc << 13); + return u.f; +} + +static inline void ArrowBufferInit(struct ArrowBuffer* buffer) { + buffer->data = NULL; + buffer->size_bytes = 0; + buffer->capacity_bytes = 0; + buffer->allocator = ArrowBufferAllocatorDefault(); +} + +static inline ArrowErrorCode ArrowBufferSetAllocator( + struct ArrowBuffer* buffer, struct ArrowBufferAllocator allocator) { + // This is not a perfect test for "has a buffer already been allocated" + // but is likely to catch most cases. 
+ if (buffer->data == NULL) { + buffer->allocator = allocator; + return NANOARROW_OK; + } else { + return EINVAL; + } +} + +static inline void ArrowBufferReset(struct ArrowBuffer* buffer) { + buffer->allocator.free(&buffer->allocator, (uint8_t*)buffer->data, + buffer->capacity_bytes); + ArrowBufferInit(buffer); +} + +static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* dst) { + memcpy(dst, src, sizeof(struct ArrowBuffer)); + src->data = NULL; + ArrowBufferInit(src); +} + +static inline ArrowErrorCode ArrowBufferResize(struct ArrowBuffer* buffer, + int64_t new_size_bytes, + char shrink_to_fit) { + if (new_size_bytes < 0) { + return EINVAL; + } + + int needs_reallocation = new_size_bytes > buffer->capacity_bytes || + (shrink_to_fit && new_size_bytes < buffer->capacity_bytes); + + if (needs_reallocation) { + buffer->data = buffer->allocator.reallocate(&buffer->allocator, buffer->data, + buffer->capacity_bytes, new_size_bytes); + + if (buffer->data == NULL && new_size_bytes > 0) { + buffer->capacity_bytes = 0; + buffer->size_bytes = 0; + return ENOMEM; + } + + buffer->capacity_bytes = new_size_bytes; + } + + buffer->size_bytes = new_size_bytes; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowBufferReserve(struct ArrowBuffer* buffer, + int64_t additional_size_bytes) { + int64_t min_capacity_bytes = buffer->size_bytes + additional_size_bytes; + if (min_capacity_bytes <= buffer->capacity_bytes) { + return NANOARROW_OK; + } + + int64_t new_capacity_bytes = + _ArrowGrowByFactor(buffer->capacity_bytes, min_capacity_bytes); + buffer->data = buffer->allocator.reallocate(&buffer->allocator, buffer->data, + buffer->capacity_bytes, new_capacity_bytes); + + if (buffer->data == NULL && new_capacity_bytes > 0) { + buffer->capacity_bytes = 0; + buffer->size_bytes = 0; + return ENOMEM; + } + + buffer->capacity_bytes = new_capacity_bytes; + return NANOARROW_OK; +} + +static inline void ArrowBufferAppendUnsafe(struct ArrowBuffer* buffer, const void* data, + int64_t size_bytes) { + if (size_bytes > 0) { + NANOARROW_DCHECK(buffer->data != NULL); + memcpy(buffer->data + buffer->size_bytes, data, size_bytes); + buffer->size_bytes += size_bytes; + } +} + +static inline ArrowErrorCode ArrowBufferAppend(struct ArrowBuffer* buffer, + const void* data, int64_t size_bytes) { + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); + + ArrowBufferAppendUnsafe(buffer, data, size_bytes); + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowBufferAppendInt8(struct ArrowBuffer* buffer, + int8_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int8_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendUInt8(struct ArrowBuffer* buffer, + uint8_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint8_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendInt16(struct ArrowBuffer* buffer, + int16_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int16_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendUInt16(struct ArrowBuffer* buffer, + uint16_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint16_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendInt32(struct ArrowBuffer* buffer, + int32_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int32_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendUInt32(struct ArrowBuffer* buffer, + uint32_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint32_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendInt64(struct ArrowBuffer* 
buffer, + int64_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(int64_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendUInt64(struct ArrowBuffer* buffer, + uint64_t value) { + return ArrowBufferAppend(buffer, &value, sizeof(uint64_t)); +} + +static inline ArrowErrorCode ArrowBufferAppendDouble(struct ArrowBuffer* buffer, + double value) { + return ArrowBufferAppend(buffer, &value, sizeof(double)); +} + +static inline ArrowErrorCode ArrowBufferAppendFloat(struct ArrowBuffer* buffer, + float value) { + return ArrowBufferAppend(buffer, &value, sizeof(float)); +} + +static inline ArrowErrorCode ArrowBufferAppendStringView(struct ArrowBuffer* buffer, + struct ArrowStringView value) { + return ArrowBufferAppend(buffer, value.data, value.size_bytes); +} + +static inline ArrowErrorCode ArrowBufferAppendBufferView(struct ArrowBuffer* buffer, + struct ArrowBufferView value) { + return ArrowBufferAppend(buffer, value.data.data, value.size_bytes); +} + +static inline ArrowErrorCode ArrowBufferAppendFill(struct ArrowBuffer* buffer, + uint8_t value, int64_t size_bytes) { + if (size_bytes == 0) { + return NANOARROW_OK; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); + + NANOARROW_DCHECK(buffer->data != NULL); // To help clang-tidy + memset(buffer->data + buffer->size_bytes, value, size_bytes); + buffer->size_bytes += size_bytes; + + return NANOARROW_OK; +} + +static const uint8_t _ArrowkBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128}; +static const uint8_t _ArrowkFlippedBitmask[] = {254, 253, 251, 247, 239, 223, 191, 127}; +static const uint8_t _ArrowkPrecedingBitmask[] = {0, 1, 3, 7, 15, 31, 63, 127}; +static const uint8_t _ArrowkTrailingBitmask[] = {255, 254, 252, 248, 240, 224, 192, 128}; + +static const uint8_t _ArrowkBytePopcount[] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, + 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, + 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, + 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, + 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, + 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, + 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, + 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; + +static inline int64_t _ArrowRoundUpToMultipleOf8(int64_t value) { + return (value + 7) & ~((int64_t)7); +} + +static inline int64_t _ArrowRoundDownToMultipleOf8(int64_t value) { + return (value / 8) * 8; +} + +static inline int64_t _ArrowBytesForBits(int64_t bits) { + return (bits >> 3) + ((bits & 7) != 0); +} + +static inline void _ArrowBitsUnpackInt8(const uint8_t word, int8_t* out) { + out[0] = (word & 0x1) != 0; + out[1] = (word & 0x2) != 0; + out[2] = (word & 0x4) != 0; + out[3] = (word & 0x8) != 0; + out[4] = (word & 0x10) != 0; + out[5] = (word & 0x20) != 0; + out[6] = (word & 0x40) != 0; + out[7] = (word & 0x80) != 0; +} + +static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) { + out[0] = (word & 0x1) != 0; + out[1] = (word & 0x2) != 0; + out[2] = (word & 0x4) != 0; + out[3] = (word & 0x8) != 0; + out[4] = (word & 0x10) != 0; + out[5] = (word & 0x20) != 0; + out[6] = (word & 0x40) != 0; + out[7] = (word & 0x80) != 0; +} + 
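As an editorial aside (not part of the vendored nanoarrow sources), the growable-buffer append helpers defined above compose as in the following rough sketch; the function name and the assumption that "nanoarrow.h" is on the include path are illustrative only:

    /* Editorial sketch only: exercising the ArrowBuffer append API. */
    #include <stdint.h>
    #include "nanoarrow.h"

    static ArrowErrorCode buffer_append_sketch(void) {
      struct ArrowBuffer buffer;
      ArrowBufferInit(&buffer);   /* zero-length buffer, default allocator */

      /* Each typed append reserves capacity (overallocating) and copies the value. */
      for (int32_t i = 0; i < 100; i++) {
        NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(&buffer, i));
      }

      /* buffer.size_bytes is now 400 and buffer.capacity_bytes is at least 400. */
      ArrowBufferReset(&buffer);  /* frees the allocation and re-initializes */
      return NANOARROW_OK;
    }

Explicit calls to ArrowBufferReserve() are only needed when batching writes through ArrowBufferAppendUnsafe(); the typed helpers shown here reserve internally via ArrowBufferAppend().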
+static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { + *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | + ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | + ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | + ((values[7] + 0x7f) & 0x80)); +} + +static inline void _ArrowBitmapPackInt32(const int32_t* values, uint8_t* out) { + *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | + ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | + ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | + ((values[7] + 0x7f) & 0x80)); +} + +static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i) { + return (bits[i >> 3] >> (i & 0x07)) & 1; +} + +static inline void ArrowBitsUnpackInt8(const uint8_t* bits, int64_t start_offset, + int64_t length, int8_t* out) { + if (length == 0) { + return; + } + + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; + + if (bytes_begin == bytes_last_valid) { + for (int i = 0; i < length; i++) { + out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + return; + } + + // first byte + for (int i = 0; i < 8 - (i_begin % 8); i++) { + *out++ = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + _ArrowBitsUnpackInt8(bits[i], out); + out += 8; + } + + // last byte + const int bits_remaining = (int)(i_end % 8 == 0 ? 8 : i_end % 8); + for (int i = 0; i < bits_remaining; i++) { + *out++ = ArrowBitGet(&bits[bytes_last_valid], i); + } +} + +static inline void ArrowBitsUnpackInt32(const uint8_t* bits, int64_t start_offset, + int64_t length, int32_t* out) { + if (length == 0) { + return; + } + + NANOARROW_DCHECK(bits != NULL && out != NULL); + + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; + + if (bytes_begin == bytes_last_valid) { + for (int i = 0; i < length; i++) { + out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + return; + } + + // first byte + for (int i = 0; i < 8 - (i_begin % 8); i++) { + *out++ = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); + } + + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + _ArrowBitsUnpackInt32(bits[i], out); + out += 8; + } + + // last byte + const int bits_remaining = (int)(i_end % 8 == 0 ? 
8 : i_end % 8); + for (int i = 0; i < bits_remaining; i++) { + *out++ = ArrowBitGet(&bits[bytes_last_valid], i); + } +} + +static inline void ArrowBitSet(uint8_t* bits, int64_t i) { + bits[i / 8] |= _ArrowkBitmask[i % 8]; +} + +static inline void ArrowBitClear(uint8_t* bits, int64_t i) { + bits[i / 8] &= _ArrowkFlippedBitmask[i % 8]; +} + +static inline void ArrowBitSetTo(uint8_t* bits, int64_t i, uint8_t bit_is_set) { + bits[i / 8] ^= (uint8_t)(((uint8_t)(-((uint8_t)(bit_is_set != 0)) ^ bits[i / 8])) & + _ArrowkBitmask[i % 8]); +} + +static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t length, + uint8_t bits_are_set) { + if (length == 0) { + return; + } + + NANOARROW_DCHECK(bits != NULL); + + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const uint8_t fill_byte = (uint8_t)(-bits_are_set); + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_end = i_end / 8 + 1; + + const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; + const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_end % 8]; + + if (bytes_end == bytes_begin + 1) { + // set bits within a single byte + const uint8_t only_byte_mask = + i_end % 8 == 0 ? first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask); + bits[bytes_begin] &= only_byte_mask; + bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); + return; + } + + // set/clear trailing bits of first byte + bits[bytes_begin] &= first_byte_mask; + bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); + + if (bytes_end - bytes_begin > 2) { + // set/clear whole bytes + memset(bits + bytes_begin + 1, fill_byte, (size_t)(bytes_end - bytes_begin - 2)); + } + + if (i_end % 8 == 0) { + return; + } + + // set/clear leading bits of last byte + bits[bytes_end - 1] &= last_byte_mask; + bits[bytes_end - 1] |= (uint8_t)(fill_byte & ~last_byte_mask); +} + +static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t start_offset, + int64_t length) { + if (length == 0) { + return 0; + } + + NANOARROW_DCHECK(bits != NULL); + + const int64_t i_begin = start_offset; + const int64_t i_end = start_offset + length; + const int64_t i_last_valid = i_end - 1; + + const int64_t bytes_begin = i_begin / 8; + const int64_t bytes_last_valid = i_last_valid / 8; + + if (bytes_begin == bytes_last_valid) { + // count bits within a single byte + const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_end % 8]; + const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_begin % 8]; + + const uint8_t only_byte_mask = + i_end % 8 == 0 ? last_byte_mask : (uint8_t)(first_byte_mask & last_byte_mask); + + const uint8_t byte_masked = bits[bytes_begin] & only_byte_mask; + return _ArrowkBytePopcount[byte_masked]; + } + + const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; + const uint8_t last_byte_mask = i_end % 8 == 0 ? 
0 : _ArrowkTrailingBitmask[i_end % 8]; + int64_t count = 0; + + // first byte + count += _ArrowkBytePopcount[bits[bytes_begin] & ~first_byte_mask]; + + // middle bytes + for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { + count += _ArrowkBytePopcount[bits[i]]; + } + + // last byte + count += _ArrowkBytePopcount[bits[bytes_last_valid] & ~last_byte_mask]; + + return count; +} + +static inline void ArrowBitmapInit(struct ArrowBitmap* bitmap) { + ArrowBufferInit(&bitmap->buffer); + bitmap->size_bits = 0; +} + +static inline void ArrowBitmapMove(struct ArrowBitmap* src, struct ArrowBitmap* dst) { + ArrowBufferMove(&src->buffer, &dst->buffer); + dst->size_bits = src->size_bits; + src->size_bits = 0; +} + +static inline ArrowErrorCode ArrowBitmapReserve(struct ArrowBitmap* bitmap, + int64_t additional_size_bits) { + int64_t min_capacity_bits = bitmap->size_bits + additional_size_bits; + int64_t min_capacity_bytes = _ArrowBytesForBits(min_capacity_bits); + int64_t current_size_bytes = bitmap->buffer.size_bytes; + int64_t current_capacity_bytes = bitmap->buffer.capacity_bytes; + + if (min_capacity_bytes <= current_capacity_bytes) { + return NANOARROW_OK; + } + + int64_t additional_capacity_bytes = min_capacity_bytes - current_size_bytes; + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&bitmap->buffer, additional_capacity_bytes)); + + // Zero out the last byte for deterministic output in the common case + // of reserving a known remaining size. We should have returned above + // if there was not at least one additional byte to allocate; however, + // DCHECK() just to be sure. + NANOARROW_DCHECK(bitmap->buffer.capacity_bytes > current_capacity_bytes); + bitmap->buffer.data[bitmap->buffer.capacity_bytes - 1] = 0; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowBitmapResize(struct ArrowBitmap* bitmap, + int64_t new_size_bits, + char shrink_to_fit) { + if (new_size_bits < 0) { + return EINVAL; + } + + int64_t new_size_bytes = _ArrowBytesForBits(new_size_bits); + NANOARROW_RETURN_NOT_OK( + ArrowBufferResize(&bitmap->buffer, new_size_bytes, shrink_to_fit)); + + bitmap->size_bits = new_size_bits; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowBitmapAppend(struct ArrowBitmap* bitmap, + uint8_t bits_are_set, int64_t length) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(bitmap, length)); + + ArrowBitmapAppendUnsafe(bitmap, bits_are_set, length); + return NANOARROW_OK; +} + +static inline void ArrowBitmapAppendUnsafe(struct ArrowBitmap* bitmap, + uint8_t bits_are_set, int64_t length) { + ArrowBitsSetTo(bitmap->buffer.data, bitmap->size_bits, length, bits_are_set); + bitmap->size_bits += length; + bitmap->buffer.size_bytes = _ArrowBytesForBits(bitmap->size_bits); +} + +static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, + const int8_t* values, int64_t n_values) { + if (n_values == 0) { + return; + } + + const int8_t* values_cursor = values; + int64_t n_remaining = n_values; + int64_t out_i_cursor = bitmap->size_bits; + uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; + + // First byte + if ((out_i_cursor % 8) != 0) { + int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - out_i_cursor; + for (int i = 0; i < n_partial_bits; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values[i]); + } + + out_cursor++; + values_cursor += n_partial_bits; + n_remaining -= n_partial_bits; + } + + // Middle bytes + int64_t n_full_bytes = n_remaining / 8; + for (int64_t i = 0; i < n_full_bytes; i++) { + 
_ArrowBitmapPackInt8(values_cursor, out_cursor); + values_cursor += 8; + out_cursor++; + } + + // Last byte + out_i_cursor += n_full_bytes * 8; + n_remaining -= n_full_bytes * 8; + if (n_remaining > 0) { + // Zero out the last byte + *out_cursor = 0x00; + for (int i = 0; i < n_remaining; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); + } + out_cursor++; + } + + bitmap->size_bits += n_values; + bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; +} + +static inline void ArrowBitmapAppendInt32Unsafe(struct ArrowBitmap* bitmap, + const int32_t* values, int64_t n_values) { + if (n_values == 0) { + return; + } + + const int32_t* values_cursor = values; + int64_t n_remaining = n_values; + int64_t out_i_cursor = bitmap->size_bits; + uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; + + // First byte + if ((out_i_cursor % 8) != 0) { + int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - out_i_cursor; + for (int i = 0; i < n_partial_bits; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, (uint8_t)values[i]); + } + + out_cursor++; + values_cursor += n_partial_bits; + n_remaining -= n_partial_bits; + } + + // Middle bytes + int64_t n_full_bytes = n_remaining / 8; + for (int64_t i = 0; i < n_full_bytes; i++) { + _ArrowBitmapPackInt32(values_cursor, out_cursor); + values_cursor += 8; + out_cursor++; + } + + // Last byte + out_i_cursor += n_full_bytes * 8; + n_remaining -= n_full_bytes * 8; + if (n_remaining > 0) { + // Zero out the last byte + *out_cursor = 0x00; + for (int i = 0; i < n_remaining; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, (uint8_t)values_cursor[i]); + } + out_cursor++; + } + + bitmap->size_bits += n_values; + bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; +} + +static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap) { + ArrowBufferReset(&bitmap->buffer); + bitmap->size_bits = 0; +} + +#ifdef __cplusplus +} +#endif + +#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
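Before the array-level inline helpers that follow, one more editorial aside (again, not part of the vendored nanoarrow sources): a rough sketch of the ArrowBitmap builder implemented above, with an illustrative function name and an assumed include path:

    /* Editorial sketch only: building and reading a small validity bitmap. */
    #include <stdint.h>
    #include <stdio.h>
    #include "nanoarrow.h"

    static ArrowErrorCode bitmap_build_sketch(void) {
      struct ArrowBitmap bitmap;
      ArrowBitmapInit(&bitmap);

      /* Append three valid bits, two null bits, then three more valid bits. */
      NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(&bitmap, 1, 3));
      NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(&bitmap, 0, 2));
      NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(&bitmap, 1, 3));

      /* bitmap.size_bits is now 8; read bits back with ArrowBitGet(). */
      for (int64_t i = 0; i < bitmap.size_bits; i++) {
        printf("%d", ArrowBitGet(bitmap.buffer.data, i));  /* prints 11100111 */
      }
      printf("\n");

      ArrowBitmapReset(&bitmap);  /* releases the underlying buffer */
      return NANOARROW_OK;
    }

ArrowBitmapAppend() reserves bit capacity and then delegates to ArrowBitmapAppendUnsafe(), which sets the bits on the underlying ArrowBuffer with ArrowBitsSetTo().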
+ +#ifndef NANOARROW_ARRAY_INLINE_H_INCLUDED +#define NANOARROW_ARRAY_INLINE_H_INCLUDED + +#include +#include +#include +#include +#include + + + + +#ifdef __cplusplus +extern "C" { +#endif + +static inline struct ArrowBitmap* ArrowArrayValidityBitmap(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + return &private_data->bitmap; +} + +static inline struct ArrowBuffer* ArrowArrayBuffer(struct ArrowArray* array, int64_t i) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + switch (i) { + case 0: + return &private_data->bitmap.buffer; + default: + return private_data->buffers + i - 1; + } +} + +// We don't currently support the case of unions where type_id != child_index; +// however, these functions are used to keep track of where that assumption +// is made. +static inline int8_t _ArrowArrayUnionChildIndex(struct ArrowArray* array, + int8_t type_id) { + NANOARROW_UNUSED(array); + return type_id; +} + +static inline int8_t _ArrowArrayUnionTypeId(struct ArrowArray* array, + int8_t child_index) { + NANOARROW_UNUSED(array); + return child_index; +} + +static inline int32_t _ArrowParseUnionTypeIds(const char* type_ids, int8_t* out) { + if (*type_ids == '\0') { + return 0; + } + + int32_t i = 0; + long type_id; + char* end_ptr; + do { + type_id = strtol(type_ids, &end_ptr, 10); + if (end_ptr == type_ids || type_id < 0 || type_id > 127) { + return -1; + } + + if (out != NULL) { + out[i] = (int8_t)type_id; + } + + i++; + + type_ids = end_ptr; + if (*type_ids == '\0') { + return i; + } else if (*type_ids != ',') { + return -1; + } else { + type_ids++; + } + } while (1); + + return -1; +} + +static inline int8_t _ArrowParsedUnionTypeIdsWillEqualChildIndices(const int8_t* type_ids, + int64_t n_type_ids, + int64_t n_children) { + if (n_type_ids != n_children) { + return 0; + } + + for (int8_t i = 0; i < n_type_ids; i++) { + if (type_ids[i] != i) { + return 0; + } + } + + return 1; +} + +static inline int8_t _ArrowUnionTypeIdsWillEqualChildIndices(const char* type_id_str, + int64_t n_children) { + int8_t type_ids[128]; + int32_t n_type_ids = _ArrowParseUnionTypeIds(type_id_str, type_ids); + return _ArrowParsedUnionTypeIdsWillEqualChildIndices(type_ids, n_type_ids, n_children); +} + +static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_UNINITIALIZED: + return EINVAL; + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_DENSE_UNION: + // Note that this value could be -1 if the type_ids string was invalid + if (private_data->union_type_id_is_child_index != 1) { + return EINVAL; + } else { + break; + } + default: + break; + } + if (private_data->storage_type == NANOARROW_TYPE_UNINITIALIZED) { + return EINVAL; + } + + // Initialize any data offset buffer with a single zero + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_DATA_OFFSET && + private_data->layout.element_size_bits[i] == 64) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(ArrowArrayBuffer(array, i), 0)); + } else if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_DATA_OFFSET && + private_data->layout.element_size_bits[i] == 32) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(ArrowArrayBuffer(array, i), 0)); + } + } + + // 
Start building any child arrays or dictionaries + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->children[i])); + } + + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->dictionary)); + } + + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array) { + for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + struct ArrowBuffer* buffer = ArrowArrayBuffer(array, i); + NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, buffer->size_bytes, 1)); + } + + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->children[i])); + } + + if (array->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->dictionary)); + } + + return NANOARROW_OK; +} + +static inline ArrowErrorCode _ArrowArrayAppendBits(struct ArrowArray* array, + int64_t buffer_i, uint8_t value, + int64_t n) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + struct ArrowBuffer* buffer = ArrowArrayBuffer(array, buffer_i); + int64_t bytes_required = + _ArrowRoundUpToMultipleOf8(private_data->layout.element_size_bits[buffer_i] * + (array->length + 1)) / + 8; + if (bytes_required > buffer->size_bytes) { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFill(buffer, 0, bytes_required - buffer->size_bytes)); + } + + ArrowBitsSetTo(buffer->data, array->length, n, value); + return NANOARROW_OK; +} + +static inline ArrowErrorCode _ArrowArrayAppendEmptyInternal(struct ArrowArray* array, + int64_t n, uint8_t is_valid) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + if (n == 0) { + return NANOARROW_OK; + } + + // Some type-specific handling + switch (private_data->storage_type) { + case NANOARROW_TYPE_NA: + // (An empty value for a null array *is* a null) + array->null_count += n; + array->length += n; + return NANOARROW_OK; + + case NANOARROW_TYPE_DENSE_UNION: { + // Add one null to the first child and append n references to that child + int8_t type_id = _ArrowArrayUnionTypeId(array, 0); + NANOARROW_RETURN_NOT_OK( + _ArrowArrayAppendEmptyInternal(array->children[0], 1, is_valid)); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); + for (int64_t i = 0; i < n; i++) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( + ArrowArrayBuffer(array, 1), (int32_t)array->children[0]->length - 1)); + } + // For the purposes of array->null_count, union elements are never considered "null" + // even if some children contain nulls. + array->length += n; + return NANOARROW_OK; + } + + case NANOARROW_TYPE_SPARSE_UNION: { + // Add n nulls to the first child and append n references to that child + int8_t type_id = _ArrowArrayUnionTypeId(array, 0); + NANOARROW_RETURN_NOT_OK( + _ArrowArrayAppendEmptyInternal(array->children[0], n, is_valid)); + for (int64_t i = 1; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], n)); + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); + // For the purposes of array->null_count, union elements are never considered "null" + // even if some children contain nulls. 
+ array->length += n; + return NANOARROW_OK; + } + + case NANOARROW_TYPE_FIXED_SIZE_LIST: + NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty( + array->children[0], n * private_data->layout.child_size_elements)); + break; + case NANOARROW_TYPE_STRUCT: + for (int64_t i = 0; i < array->n_children; i++) { + NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], n)); + } + break; + + default: + break; + } + + // Append n is_valid bits to the validity bitmap. If we haven't allocated a bitmap yet + // and we need to append nulls, do it now. + if (!is_valid && private_data->bitmap.buffer.data == NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, array->length + n)); + ArrowBitmapAppendUnsafe(&private_data->bitmap, 1, array->length); + ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); + } else if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, n)); + ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); + } + + // Add appropriate buffer fill + struct ArrowBuffer* buffer; + int64_t size_bytes; + + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + buffer = ArrowArrayBuffer(array, i); + size_bytes = private_data->layout.element_size_bits[i] / 8; + + switch (private_data->layout.buffer_type[i]) { + case NANOARROW_BUFFER_TYPE_NONE: + case NANOARROW_BUFFER_TYPE_VARIADIC_DATA: + case NANOARROW_BUFFER_TYPE_VARIADIC_SIZE: + case NANOARROW_BUFFER_TYPE_VALIDITY: + continue; + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: + // Append the current value at the end of the offset buffer for each element + NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes * n)); + + for (int64_t j = 0; j < n; j++) { + ArrowBufferAppendUnsafe(buffer, buffer->data + size_bytes * (array->length + j), + size_bytes); + } + + // Skip the data buffer + i++; + continue; + case NANOARROW_BUFFER_TYPE_DATA: + // Zero out the next bit of memory + if (private_data->layout.element_size_bits[i] % 8 == 0) { + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFill(buffer, 0, size_bytes * n)); + } else { + NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, i, 0, n)); + } + continue; + + case NANOARROW_BUFFER_TYPE_TYPE_ID: + case NANOARROW_BUFFER_TYPE_UNION_OFFSET: + // These cases return above + return EINVAL; + } + } + + array->length += n; + array->null_count += n * !is_valid; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int64_t n) { + return _ArrowArrayAppendEmptyInternal(array, n, 0); +} + +static inline ArrowErrorCode ArrowArrayAppendEmpty(struct ArrowArray* array, int64_t n) { + return _ArrowArrayAppendEmptyInternal(array, n, 1); +} + +static inline ArrowErrorCode ArrowArrayAppendInt(struct ArrowArray* array, + int64_t value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_INT64: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(int64_t))); + break; + case NANOARROW_TYPE_INT32: + _NANOARROW_CHECK_RANGE(value, INT32_MIN, INT32_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, (int32_t)value)); + break; + case NANOARROW_TYPE_INT16: + _NANOARROW_CHECK_RANGE(value, INT16_MIN, INT16_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt16(data_buffer, (int16_t)value)); + break; + case NANOARROW_TYPE_INT8: + _NANOARROW_CHECK_RANGE(value, INT8_MIN, 
INT8_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt8(data_buffer, (int8_t)value)); + break; + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_UINT8: + _NANOARROW_CHECK_RANGE(value, 0, INT64_MAX); + return ArrowArrayAppendUInt(array, value); + case NANOARROW_TYPE_DOUBLE: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, (double)value)); + break; + case NANOARROW_TYPE_FLOAT: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); + break; + case NANOARROW_TYPE_HALF_FLOAT: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendUInt16(data_buffer, ArrowFloatToHalfFloat((float)value))); + break; + case NANOARROW_TYPE_BOOL: + NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array, + uint64_t value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_UINT64: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(uint64_t))); + break; + case NANOARROW_TYPE_UINT32: + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT32_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt32(data_buffer, (uint32_t)value)); + break; + case NANOARROW_TYPE_UINT16: + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT16_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt16(data_buffer, (uint16_t)value)); + break; + case NANOARROW_TYPE_UINT8: + _NANOARROW_CHECK_UPPER_LIMIT(value, UINT8_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt8(data_buffer, (uint8_t)value)); + break; + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_INT8: + _NANOARROW_CHECK_UPPER_LIMIT(value, INT64_MAX); + return ArrowArrayAppendInt(array, value); + case NANOARROW_TYPE_DOUBLE: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, (double)value)); + break; + case NANOARROW_TYPE_FLOAT: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); + break; + case NANOARROW_TYPE_HALF_FLOAT: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendUInt16(data_buffer, ArrowFloatToHalfFloat((float)value))); + break; + case NANOARROW_TYPE_BOOL: + NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array, + double value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_DOUBLE: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(double))); + break; + case NANOARROW_TYPE_FLOAT: + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); + break; + case NANOARROW_TYPE_HALF_FLOAT: + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendUInt16(data_buffer, 
ArrowFloatToHalfFloat((float)value))); + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +// Binary views only have two fixed buffers, but be aware that they must also +// always have more 1 buffer to store variadic buffer sizes (even if there are none) +#define NANOARROW_BINARY_VIEW_FIXED_BUFFERS 2 +#define NANOARROW_BINARY_VIEW_INLINE_SIZE 12 +#define NANOARROW_BINARY_VIEW_PREFIX_SIZE 4 +#define NANOARROW_BINARY_VIEW_BLOCK_SIZE (32 << 10) // 32KB + +// The Arrow C++ implementation uses anonymous structs as members +// of the ArrowBinaryView. For Cython support in this library, we define +// those structs outside of the ArrowBinaryView +struct ArrowBinaryViewInlined { + int32_t size; + uint8_t data[NANOARROW_BINARY_VIEW_INLINE_SIZE]; +}; + +struct ArrowBinaryViewRef { + int32_t size; + uint8_t prefix[NANOARROW_BINARY_VIEW_PREFIX_SIZE]; + int32_t buffer_index; + int32_t offset; +}; + +union ArrowBinaryView { + struct ArrowBinaryViewInlined inlined; + struct ArrowBinaryViewRef ref; + int64_t alignment_dummy; +}; + +static inline int32_t ArrowArrayVariadicBufferCount(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + return private_data->n_variadic_buffers; +} + +static inline ArrowErrorCode ArrowArrayAddVariadicBuffers(struct ArrowArray* array, + int32_t nbuffers) { + const int32_t n_current_bufs = ArrowArrayVariadicBufferCount(array); + const int32_t nvariadic_bufs_needed = n_current_bufs + nbuffers; + + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + private_data->variadic_buffers = (struct ArrowBuffer*)ArrowRealloc( + private_data->variadic_buffers, sizeof(struct ArrowBuffer) * nvariadic_bufs_needed); + if (private_data->variadic_buffers == NULL) { + return ENOMEM; + } + private_data->variadic_buffer_sizes = (int64_t*)ArrowRealloc( + private_data->variadic_buffer_sizes, sizeof(int64_t) * nvariadic_bufs_needed); + if (private_data->variadic_buffer_sizes == NULL) { + return ENOMEM; + } + + for (int32_t i = n_current_bufs; i < nvariadic_bufs_needed; i++) { + ArrowBufferInit(&private_data->variadic_buffers[i]); + private_data->variadic_buffer_sizes[i] = 0; + } + private_data->n_variadic_buffers = nvariadic_bufs_needed; + array->n_buffers = NANOARROW_BINARY_VIEW_FIXED_BUFFERS + 1 + nvariadic_bufs_needed; + + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array, + struct ArrowBufferView value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + if (private_data->storage_type == NANOARROW_TYPE_STRING_VIEW || + private_data->storage_type == NANOARROW_TYPE_BINARY_VIEW) { + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + union ArrowBinaryView bvt; + bvt.inlined.size = (int32_t)value.size_bytes; + + if (value.size_bytes <= NANOARROW_BINARY_VIEW_INLINE_SIZE) { + memcpy(bvt.inlined.data, value.data.as_char, value.size_bytes); + memset(bvt.inlined.data + bvt.inlined.size, 0, + NANOARROW_BINARY_VIEW_INLINE_SIZE - bvt.inlined.size); + } else { + int32_t current_n_vbufs = ArrowArrayVariadicBufferCount(array); + if (current_n_vbufs == 0 || + private_data->variadic_buffers[current_n_vbufs - 1].size_bytes + + value.size_bytes > + NANOARROW_BINARY_VIEW_BLOCK_SIZE) { + const 
int32_t additional_bufs_needed = 1; + NANOARROW_RETURN_NOT_OK( + ArrowArrayAddVariadicBuffers(array, additional_bufs_needed)); + current_n_vbufs += additional_bufs_needed; + } + + const int32_t buf_index = current_n_vbufs - 1; + struct ArrowBuffer* variadic_buf = &private_data->variadic_buffers[buf_index]; + memcpy(bvt.ref.prefix, value.data.as_char, NANOARROW_BINARY_VIEW_PREFIX_SIZE); + bvt.ref.buffer_index = (int32_t)buf_index; + bvt.ref.offset = (int32_t)variadic_buf->size_bytes; + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(variadic_buf, value.data.as_char, value.size_bytes)); + private_data->variadic_buffer_sizes[buf_index] = variadic_buf->size_bytes; + } + NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &bvt, sizeof(bvt))); + } else { + struct ArrowBuffer* offset_buffer = ArrowArrayBuffer(array, 1); + struct ArrowBuffer* data_buffer = ArrowArrayBuffer( + array, 1 + (private_data->storage_type != NANOARROW_TYPE_FIXED_SIZE_BINARY)); + int32_t offset; + int64_t large_offset; + int64_t fixed_size_bytes = private_data->layout.element_size_bits[1] / 8; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + offset = ((int32_t*)offset_buffer->data)[array->length]; + if ((((int64_t)offset) + value.size_bytes) > INT32_MAX) { + return EOVERFLOW; + } + + offset += (int32_t)value.size_bytes; + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(offset_buffer, &offset, sizeof(int32_t))); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); + break; + + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + large_offset = ((int64_t*)offset_buffer->data)[array->length]; + large_offset += value.size_bytes; + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(offset_buffer, &large_offset, sizeof(int64_t))); + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); + break; + + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + if (value.size_bytes != fixed_size_bytes) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); + break; + default: + return EINVAL; + } + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array, + struct ArrowStringView value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBufferView buffer_view; + buffer_view.data.data = value.data; + buffer_view.size_bytes = value.size_bytes; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_STRING_VIEW: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + case NANOARROW_TYPE_BINARY_VIEW: + return ArrowArrayAppendBytes(array, buffer_view); + default: + return EINVAL; + } +} + +static inline ArrowErrorCode ArrowArrayAppendInterval(struct ArrowArray* array, + const struct ArrowInterval* value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_INTERVAL_MONTHS: { + if (value->type != NANOARROW_TYPE_INTERVAL_MONTHS) { + return EINVAL; + } + + 
NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->months)); + break; + } + case NANOARROW_TYPE_INTERVAL_DAY_TIME: { + if (value->type != NANOARROW_TYPE_INTERVAL_DAY_TIME) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->days)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->ms)); + break; + } + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { + if (value->type != NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->months)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->days)); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(data_buffer, value->ns)); + break; + } + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array, + const struct ArrowDecimal* value) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); + + switch (private_data->storage_type) { + case NANOARROW_TYPE_DECIMAL128: + if (value->n_words != 2) { + return EINVAL; + } else { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value->words, 2 * sizeof(uint64_t))); + break; + } + case NANOARROW_TYPE_DECIMAL256: + if (value->n_words != 4) { + return EINVAL; + } else { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(data_buffer, value->words, 4 * sizeof(uint64_t))); + break; + } + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + int64_t child_length; + + switch (private_data->storage_type) { + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_MAP: + child_length = array->children[0]->length; + if (child_length > INT32_MAX) { + return EOVERFLOW; + } + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(ArrowArrayBuffer(array, 1), (int32_t)child_length)); + break; + case NANOARROW_TYPE_LARGE_LIST: + child_length = array->children[0]->length; + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt64(ArrowArrayBuffer(array, 1), child_length)); + break; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + child_length = array->children[0]->length; + if (child_length != + ((array->length + 1) * private_data->layout.child_size_elements)) { + return EINVAL; + } + break; + case NANOARROW_TYPE_STRUCT: + for (int64_t i = 0; i < array->n_children; i++) { + child_length = array->children[i]->length; + if (child_length != (array->length + 1)) { + return EINVAL; + } + } + break; + default: + return EINVAL; + } + + if (private_data->bitmap.buffer.data != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); + } + + array->length++; + return NANOARROW_OK; +} + +static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* array, + int8_t type_id) { + struct ArrowArrayPrivateData* private_data = + (struct ArrowArrayPrivateData*)array->private_data; + + int64_t child_index = _ArrowArrayUnionChildIndex(array, type_id); + if 
(child_index < 0 || child_index >= array->n_children) { + return EINVAL; + } + + switch (private_data->storage_type) { + case NANOARROW_TYPE_DENSE_UNION: + // Append the target child length to the union offsets buffer + _NANOARROW_CHECK_RANGE(array->children[child_index]->length, 0, INT32_MAX); + NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( + ArrowArrayBuffer(array, 1), (int32_t)array->children[child_index]->length - 1)); + break; + case NANOARROW_TYPE_SPARSE_UNION: + // Append one empty to any non-target column that isn't already the right length + // or abort if appending a null will result in a column with invalid length + for (int64_t i = 0; i < array->n_children; i++) { + if (i == child_index || array->children[i]->length == (array->length + 1)) { + continue; + } + + if (array->children[i]->length != array->length) { + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], 1)); + } + + break; + default: + return EINVAL; + } + + // Write to the type_ids buffer + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt8(ArrowArrayBuffer(array, 0), (int8_t)type_id)); + array->length++; + return NANOARROW_OK; +} + +static inline void ArrowArrayViewMove(struct ArrowArrayView* src, + struct ArrowArrayView* dst) { + memcpy(dst, src, sizeof(struct ArrowArrayView)); + ArrowArrayViewInitFromType(src, NANOARROW_TYPE_UNINITIALIZED); +} + +static inline int64_t ArrowArrayViewGetNumBuffers(struct ArrowArrayView* array_view) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_BINARY_VIEW: + case NANOARROW_TYPE_STRING_VIEW: + return NANOARROW_BINARY_VIEW_FIXED_BUFFERS + array_view->n_variadic_buffers + 1; + default: + break; + } + + int64_t n_buffers = 0; + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) { + break; + } + + n_buffers++; + } + + return n_buffers; +} + +static inline struct ArrowBufferView ArrowArrayViewGetBufferView( + struct ArrowArrayView* array_view, int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_BINARY_VIEW: + case NANOARROW_TYPE_STRING_VIEW: + if (i < NANOARROW_BINARY_VIEW_FIXED_BUFFERS) { + return array_view->buffer_views[i]; + } else if (i >= + (array_view->n_variadic_buffers + NANOARROW_BINARY_VIEW_FIXED_BUFFERS)) { + struct ArrowBufferView view; + view.data.as_int64 = array_view->variadic_buffer_sizes; + view.size_bytes = array_view->n_variadic_buffers * sizeof(double); + return view; + } else { + struct ArrowBufferView view; + view.data.data = + array_view->variadic_buffers[i - NANOARROW_BINARY_VIEW_FIXED_BUFFERS]; + view.size_bytes = + array_view->variadic_buffer_sizes[i - NANOARROW_BINARY_VIEW_FIXED_BUFFERS]; + return view; + } + default: + // We need this check to avoid -Warray-bounds from complaining + if (i >= NANOARROW_MAX_FIXED_BUFFERS) { + struct ArrowBufferView view; + view.data.data = NULL; + view.size_bytes = 0; + return view; + } else { + return array_view->buffer_views[i]; + } + } +} + +enum ArrowBufferType ArrowArrayViewGetBufferType(struct ArrowArrayView* array_view, + int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_BINARY_VIEW: + case NANOARROW_TYPE_STRING_VIEW: + if (i < NANOARROW_BINARY_VIEW_FIXED_BUFFERS) { + return array_view->layout.buffer_type[i]; + } else if (i == + (array_view->n_variadic_buffers + NANOARROW_BINARY_VIEW_FIXED_BUFFERS)) { + return NANOARROW_BUFFER_TYPE_VARIADIC_SIZE; + } else { + return NANOARROW_BUFFER_TYPE_VARIADIC_DATA; + } + default: + // We need this check to avoid 
-Warray-bounds from complaining + if (i >= NANOARROW_MAX_FIXED_BUFFERS) { + return NANOARROW_BUFFER_TYPE_NONE; + } else { + return array_view->layout.buffer_type[i]; + } + } +} + +static inline enum ArrowType ArrowArrayViewGetBufferDataType( + struct ArrowArrayView* array_view, int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_BINARY_VIEW: + case NANOARROW_TYPE_STRING_VIEW: + if (i < NANOARROW_BINARY_VIEW_FIXED_BUFFERS) { + return array_view->layout.buffer_data_type[i]; + } else if (i >= + (array_view->n_variadic_buffers + NANOARROW_BINARY_VIEW_FIXED_BUFFERS)) { + return NANOARROW_TYPE_INT64; + } else if (array_view->storage_type == NANOARROW_TYPE_BINARY_VIEW) { + return NANOARROW_TYPE_BINARY; + } else { + return NANOARROW_TYPE_STRING; + } + default: + // We need this check to avoid -Warray-bounds from complaining + if (i >= NANOARROW_MAX_FIXED_BUFFERS) { + return NANOARROW_TYPE_UNINITIALIZED; + } else { + return array_view->layout.buffer_data_type[i]; + } + } +} + +static inline int64_t ArrowArrayViewGetBufferElementSizeBits( + struct ArrowArrayView* array_view, int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_BINARY_VIEW: + case NANOARROW_TYPE_STRING_VIEW: + if (i < NANOARROW_BINARY_VIEW_FIXED_BUFFERS) { + return array_view->layout.element_size_bits[i]; + } else if (i >= + (array_view->n_variadic_buffers + NANOARROW_BINARY_VIEW_FIXED_BUFFERS)) { + return sizeof(int64_t) * 8; + } else { + return 0; + } + default: + // We need this check to avoid -Warray-bounds from complaining + if (i >= NANOARROW_MAX_FIXED_BUFFERS) { + return 0; + } else { + return array_view->layout.element_size_bits[i]; + } + } +} + +static inline int8_t ArrowArrayViewIsNull(const struct ArrowArrayView* array_view, + int64_t i) { + const uint8_t* validity_buffer = array_view->buffer_views[0].data.as_uint8; + i += array_view->offset; + switch (array_view->storage_type) { + case NANOARROW_TYPE_NA: + return 0x01; + case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_SPARSE_UNION: + // Unions are "never null" in Arrow land + return 0x00; + default: + return validity_buffer != NULL && !ArrowBitGet(validity_buffer, i); + } +} + +static inline int64_t ArrowArrayViewComputeNullCount( + const struct ArrowArrayView* array_view) { + if (array_view->length == 0) { + return 0; + } + + switch (array_view->storage_type) { + case NANOARROW_TYPE_NA: + return array_view->length; + case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_SPARSE_UNION: + // Unions are "never null" in Arrow land + return 0; + default: + break; + } + + const uint8_t* validity_buffer = array_view->buffer_views[0].data.as_uint8; + if (validity_buffer == NULL) { + return 0; + } + return array_view->length - + ArrowBitCountSet(validity_buffer, array_view->offset, array_view->length); +} + +static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView* array_view, + int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_SPARSE_UNION: + return array_view->buffer_views[0].data.as_int8[array_view->offset + i]; + default: + return -1; + } +} + +static inline int8_t ArrowArrayViewUnionChildIndex( + const struct ArrowArrayView* array_view, int64_t i) { + int8_t type_id = ArrowArrayViewUnionTypeId(array_view, i); + if (array_view->union_type_id_map == NULL) { + return type_id; + } else { + return array_view->union_type_id_map[type_id]; + } +} + +static inline int64_t ArrowArrayViewUnionChildOffset( + const struct ArrowArrayView* array_view, int64_t i) { + 
switch (array_view->storage_type) { + case NANOARROW_TYPE_DENSE_UNION: + return array_view->buffer_views[1].data.as_int32[array_view->offset + i]; + case NANOARROW_TYPE_SPARSE_UNION: + return array_view->offset + i; + default: + return -1; + } +} + +static inline int64_t ArrowArrayViewListChildOffset( + const struct ArrowArrayView* array_view, int64_t i) { + switch (array_view->storage_type) { + case NANOARROW_TYPE_LIST: + return array_view->buffer_views[1].data.as_int32[i]; + case NANOARROW_TYPE_LARGE_LIST: + return array_view->buffer_views[1].data.as_int64[i]; + default: + return -1; + } +} + +static struct ArrowBufferView ArrowArrayViewGetBytesFromViewArrayUnsafe( + const struct ArrowArrayView* array_view, int64_t i) { + const union ArrowBinaryView* bv = &array_view->buffer_views[1].data.as_binary_view[i]; + struct ArrowBufferView out = {{NULL}, bv->inlined.size}; + if (bv->inlined.size <= NANOARROW_BINARY_VIEW_INLINE_SIZE) { + out.data.as_uint8 = bv->inlined.data; + return out; + } + + out.data.data = array_view->variadic_buffers[bv->ref.buffer_index]; + out.data.as_uint8 += bv->ref.offset; + return out; +} + +static inline int64_t ArrowArrayViewGetIntUnsafe(const struct ArrowArrayView* array_view, + int64_t i) { + const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; + i += array_view->offset; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INT64: + return data_view->data.as_int64[i]; + case NANOARROW_TYPE_UINT64: + return data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INTERVAL_MONTHS: + case NANOARROW_TYPE_INT32: + return data_view->data.as_int32[i]; + case NANOARROW_TYPE_UINT32: + return data_view->data.as_uint32[i]; + case NANOARROW_TYPE_INT16: + return data_view->data.as_int16[i]; + case NANOARROW_TYPE_UINT16: + return data_view->data.as_uint16[i]; + case NANOARROW_TYPE_INT8: + return data_view->data.as_int8[i]; + case NANOARROW_TYPE_UINT8: + return data_view->data.as_uint8[i]; + case NANOARROW_TYPE_DOUBLE: + return (int64_t)data_view->data.as_double[i]; + case NANOARROW_TYPE_FLOAT: + return (int64_t)data_view->data.as_float[i]; + case NANOARROW_TYPE_HALF_FLOAT: + return (int64_t)ArrowHalfFloatToFloat(data_view->data.as_uint16[i]); + case NANOARROW_TYPE_BOOL: + return ArrowBitGet(data_view->data.as_uint8, i); + default: + return INT64_MAX; + } +} + +static inline uint64_t ArrowArrayViewGetUIntUnsafe( + const struct ArrowArrayView* array_view, int64_t i) { + i += array_view->offset; + const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INT64: + return data_view->data.as_int64[i]; + case NANOARROW_TYPE_UINT64: + return data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INTERVAL_MONTHS: + case NANOARROW_TYPE_INT32: + return data_view->data.as_int32[i]; + case NANOARROW_TYPE_UINT32: + return data_view->data.as_uint32[i]; + case NANOARROW_TYPE_INT16: + return data_view->data.as_int16[i]; + case NANOARROW_TYPE_UINT16: + return data_view->data.as_uint16[i]; + case NANOARROW_TYPE_INT8: + return data_view->data.as_int8[i]; + case NANOARROW_TYPE_UINT8: + return data_view->data.as_uint8[i]; + case NANOARROW_TYPE_DOUBLE: + return (uint64_t)data_view->data.as_double[i]; + case NANOARROW_TYPE_FLOAT: + return (uint64_t)data_view->data.as_float[i]; + case NANOARROW_TYPE_HALF_FLOAT: + return (uint64_t)ArrowHalfFloatToFloat(data_view->data.as_uint16[i]); + case NANOARROW_TYPE_BOOL: + return ArrowBitGet(data_view->data.as_uint8, i); + default: + return UINT64_MAX; + } +} + +static 
inline double ArrowArrayViewGetDoubleUnsafe( + const struct ArrowArrayView* array_view, int64_t i) { + i += array_view->offset; + const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INT64: + return (double)data_view->data.as_int64[i]; + case NANOARROW_TYPE_UINT64: + return (double)data_view->data.as_uint64[i]; + case NANOARROW_TYPE_INT32: + return data_view->data.as_int32[i]; + case NANOARROW_TYPE_UINT32: + return data_view->data.as_uint32[i]; + case NANOARROW_TYPE_INT16: + return data_view->data.as_int16[i]; + case NANOARROW_TYPE_UINT16: + return data_view->data.as_uint16[i]; + case NANOARROW_TYPE_INT8: + return data_view->data.as_int8[i]; + case NANOARROW_TYPE_UINT8: + return data_view->data.as_uint8[i]; + case NANOARROW_TYPE_DOUBLE: + return data_view->data.as_double[i]; + case NANOARROW_TYPE_FLOAT: + return data_view->data.as_float[i]; + case NANOARROW_TYPE_HALF_FLOAT: + return ArrowHalfFloatToFloat(data_view->data.as_uint16[i]); + case NANOARROW_TYPE_BOOL: + return ArrowBitGet(data_view->data.as_uint8, i); + default: + return DBL_MAX; + } +} + +static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( + const struct ArrowArrayView* array_view, int64_t i) { + i += array_view->offset; + const struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; + const char* data_view = array_view->buffer_views[2].data.as_char; + + struct ArrowStringView view; + switch (array_view->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + view.data = data_view + offsets_view->data.as_int32[i]; + view.size_bytes = + offsets_view->data.as_int32[i + 1] - offsets_view->data.as_int32[i]; + break; + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + view.data = data_view + offsets_view->data.as_int64[i]; + view.size_bytes = + offsets_view->data.as_int64[i + 1] - offsets_view->data.as_int64[i]; + break; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + view.size_bytes = array_view->layout.element_size_bits[1] / 8; + view.data = array_view->buffer_views[1].data.as_char + (i * view.size_bytes); + break; + case NANOARROW_TYPE_STRING_VIEW: + case NANOARROW_TYPE_BINARY_VIEW: { + struct ArrowBufferView buf_view = + ArrowArrayViewGetBytesFromViewArrayUnsafe(array_view, i); + view.data = buf_view.data.as_char; + view.size_bytes = buf_view.size_bytes; + break; + } + default: + view.data = NULL; + view.size_bytes = 0; + break; + } + + return view; +} + +static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( + const struct ArrowArrayView* array_view, int64_t i) { + i += array_view->offset; + const struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; + const uint8_t* data_view = array_view->buffer_views[2].data.as_uint8; + + struct ArrowBufferView view; + switch (array_view->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_BINARY: + view.size_bytes = + offsets_view->data.as_int32[i + 1] - offsets_view->data.as_int32[i]; + view.data.as_uint8 = data_view + offsets_view->data.as_int32[i]; + break; + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_LARGE_BINARY: + view.size_bytes = + offsets_view->data.as_int64[i + 1] - offsets_view->data.as_int64[i]; + view.data.as_uint8 = data_view + offsets_view->data.as_int64[i]; + break; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + view.size_bytes = array_view->layout.element_size_bits[1] / 8; + view.data.as_uint8 = + array_view->buffer_views[1].data.as_uint8 + (i * view.size_bytes); + break; + case 
NANOARROW_TYPE_STRING_VIEW: + case NANOARROW_TYPE_BINARY_VIEW: + view = ArrowArrayViewGetBytesFromViewArrayUnsafe(array_view, i); + break; + default: + view.data.data = NULL; + view.size_bytes = 0; + break; + } + + return view; +} + +static inline void ArrowArrayViewGetIntervalUnsafe( + const struct ArrowArrayView* array_view, int64_t i, struct ArrowInterval* out) { + const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; + switch (array_view->storage_type) { + case NANOARROW_TYPE_INTERVAL_MONTHS: { + const size_t size = sizeof(int32_t); + memcpy(&out->months, data_view + i * size, sizeof(int32_t)); + break; + } + case NANOARROW_TYPE_INTERVAL_DAY_TIME: { + const size_t size = sizeof(int32_t) + sizeof(int32_t); + memcpy(&out->days, data_view + i * size, sizeof(int32_t)); + memcpy(&out->ms, data_view + i * size + 4, sizeof(int32_t)); + break; + } + case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { + const size_t size = sizeof(int32_t) + sizeof(int32_t) + sizeof(int64_t); + memcpy(&out->months, data_view + i * size, sizeof(int32_t)); + memcpy(&out->days, data_view + i * size + 4, sizeof(int32_t)); + memcpy(&out->ns, data_view + i * size + 8, sizeof(int64_t)); + break; + } + default: + break; + } +} + +static inline void ArrowArrayViewGetDecimalUnsafe(const struct ArrowArrayView* array_view, + int64_t i, struct ArrowDecimal* out) { + i += array_view->offset; + const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; + switch (array_view->storage_type) { + case NANOARROW_TYPE_DECIMAL128: + ArrowDecimalSetBytes(out, data_view + (i * 16)); + break; + case NANOARROW_TYPE_DECIMAL256: + ArrowDecimalSetBytes(out, data_view + (i * 32)); + break; + default: + memset(out->words, 0, sizeof(out->words)); + break; + } +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/oracledb/interchange/nanoarrow_bridge.pxd b/src/oracledb/interchange/nanoarrow_bridge.pxd new file mode 100644 index 00000000..806f660e --- /dev/null +++ b/src/oracledb/interchange/nanoarrow_bridge.pxd @@ -0,0 +1,102 @@ +#------------------------------------------------------------------------------ +# Copyright (c) 2025, Oracle and/or its affiliates. +# +# This software is dual-licensed to you under the Universal Permissive License +# (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License +# 2.0 as shown at http://www.apache.org/licenses/LICENSE-2.0. You may choose +# either license. +# +# If you elect to accept the software under the Apache License, Version 2.0, +# the following applies: +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#------------------------------------------------------------------------------ + +#------------------------------------------------------------------------------ +# nanoarrow_bridge.pxd +# +# Cython definition file declaring the classes used for bridging between the +# nanoarrow C interface and Python. 
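+#
+# A minimal usage sketch (illustrative only, assuming the pyarrow package is
+# installed): each OracleArrowArray declared below backs one column of an
+# OracleDataFrame, and Arrow-aware libraries consume it through the Arrow
+# PyCapsule interface, e.g.
+#
+#     odf = connection.fetch_df_all("select * from TestDataframe")
+#     tab = pyarrow.Table.from_arrays(
+#         odf.column_arrays(), names=odf.column_names()
+#     )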
+#------------------------------------------------------------------------------ + +# cython: language_level = 3 + +from libc.stdint cimport int8_t, uint8_t, int16_t, uint16_t +from libc.stdint cimport int32_t, uint32_t, int64_t, uint64_t + +cdef extern from "nanoarrow.h": + + cdef struct ArrowArray: + int64_t length + int64_t null_count + int64_t offset + int64_t n_buffers + void (*release)(ArrowSchema *) + + cdef struct ArrowSchema: + void (*release)(ArrowSchema*) + + cpdef enum ArrowType: + NANOARROW_TYPE_BOOL + NANOARROW_TYPE_DECIMAL128 + NANOARROW_TYPE_DOUBLE + NANOARROW_TYPE_FLOAT + NANOARROW_TYPE_INT64 + NANOARROW_TYPE_STRING + NANOARROW_TYPE_TIMESTAMP + + cpdef enum ArrowTimeUnit: + NANOARROW_TIME_UNIT_SECOND + NANOARROW_TIME_UNIT_MILLI + NANOARROW_TIME_UNIT_MICRO + NANOARROW_TIME_UNIT_NANO + + cdef struct ArrowStringView: + const char* data + int64_t size_bytes + + cdef struct ArrowDecimal: + pass + + +cdef class OracleArrowArray: + """ + OracleArrowArray corresponds to a Column in the Relational model + + It uses functions defined in the Arrow C Data Interface + to work with Arrow buffers and incrementally append values + + The only user-facing API in this object will be __arrow_c_array__() + which is documented in the Arrow PyCapsule Interface. Arrow-backed + DataFrame libraries will use __arrow_c_array__() to directly access + the underlying arrow data + + """ + cdef: + public int32_t precision + public int32_t scale + public str name + public ArrowType arrow_type + public ArrowTimeUnit time_unit + double factor + ArrowArray *arrow_array + ArrowSchema *arrow_schema + + cdef str _schema_to_string(self) + cdef int append_bytes(self, void* ptr, int64_t num_bytes) except -1 + cdef int append_double(self, double value) except -1 + cdef int append_float(self, float value) except -1 + cdef int append_int64(self, int64_t value) except -1 + cdef int append_null(self) except -1 + cdef int append_decimal(self, void* ptr, int64_t num_bytes) except -1 + cdef int finish_building(self) except -1 diff --git a/src/oracledb/interchange/nanoarrow_bridge.pyx b/src/oracledb/interchange/nanoarrow_bridge.pyx new file mode 100644 index 00000000..05705c5e --- /dev/null +++ b/src/oracledb/interchange/nanoarrow_bridge.pyx @@ -0,0 +1,334 @@ +#------------------------------------------------------------------------------ +# Copyright (c) 2025, Oracle and/or its affiliates. +# +# This software is dual-licensed to you under the Universal Permissive License +# (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License +# 2.0 as shown at http://www.apache.org/licenses/LICENSE-2.0. You may choose +# either license. +# +# If you elect to accept the software under the Apache License, Version 2.0, +# the following applies: +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#------------------------------------------------------------------------------ +#------------------------------------------------------------------------------ +# nanoarrow_bridge.pyx +# +# Cython wrapper around the Arrow C Data interface +#------------------------------------------------------------------------------ + +cimport cpython + +from libc.stdint cimport uintptr_t +from libc.string cimport strlen, strchr +from cpython.pycapsule cimport PyCapsule_New + +from .. import errors + +cdef extern from "nanoarrow/nanoarrow.c": + + ctypedef int ArrowErrorCode + + cdef union ArrowBufferViewData: + const void* data + + cdef struct ArrowBufferView: + ArrowBufferViewData data + int64_t size_bytes + + cdef struct ArrowArrayView: + ArrowBufferView *buffer_views + + cdef struct ArrowError: + pass + + cdef ArrowErrorCode NANOARROW_OK + + void ArrowArrayRelease(ArrowArray *array) + void ArrowSchemaRelease(ArrowSchema *schema) + + ArrowErrorCode ArrowArrayInitFromType(ArrowArray* array, + ArrowType storage_type) + ArrowErrorCode ArrowArrayAppendBytes(ArrowArray* array, + ArrowBufferView value) + ArrowErrorCode ArrowArrayAppendDouble(ArrowArray* array, double value) + ArrowErrorCode ArrowArrayAppendNull(ArrowArray* array, int64_t n) + ArrowErrorCode ArrowArrayAppendInt(ArrowArray* array, int64_t value) + ArrowErrorCode ArrowArrayAppendDecimal(ArrowArray * array, + const ArrowDecimal * value) + ArrowErrorCode ArrowArrayFinishBuildingDefault(ArrowArray* array, + ArrowError* error) + ArrowErrorCode ArrowArrayReserve(ArrowArray* array, + int64_t additional_size_elements) + inline ArrowErrorCode ArrowArrayStartAppending(ArrowArray* array) + ArrowErrorCode ArrowArrayViewInitFromSchema(ArrowArrayView* array_view, + const ArrowSchema* schema, + ArrowError* error) + ArrowErrorCode ArrowArrayViewSetArray(ArrowArrayView* array_view, + const ArrowArray* array, + ArrowError* error) + void ArrowSchemaInit(ArrowSchema* schema) + ArrowErrorCode ArrowSchemaInitFromType(ArrowSchema* schema, ArrowType type) + ArrowErrorCode ArrowSchemaSetTypeDateTime(ArrowSchema* schema, + ArrowType arrow_type, + ArrowTimeUnit time_unit, + const char* timezone) + ArrowErrorCode ArrowSchemaSetTypeDecimal(ArrowSchema* schema, + ArrowType type, + int32_t decimal_precision, + int32_t decimal_scale) + ArrowErrorCode ArrowSchemaSetName(ArrowSchema* schema, const char* name) + int64_t ArrowSchemaToString(const ArrowSchema* schema, char* out, + int64_t n, char recursive) + void ArrowDecimalInit(ArrowDecimal * decimal, int32_t bitwidth, + int32_t precision, int32_t scale) + ArrowErrorCode ArrowDecimalSetDigits(ArrowDecimal * decimal, + ArrowStringView value) + + +cdef int _check_nanoarrow(int code) except -1: + """ + Checks the return code of the nanoarrow function and raises an exception if + it is not NANOARROW_OK. 
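+
+    nanoarrow follows errno-style conventions: its functions return
+    NANOARROW_OK (0) on success and a code such as EINVAL, ENOMEM or
+    EOVERFLOW on failure; any failure is surfaced here as
+    ERR_ARROW_C_API_ERROR.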
+ """ + if code != NANOARROW_OK: + errors._raise_err(errors.ERR_ARROW_C_API_ERROR, code=code) + + +cdef void pycapsule_schema_deleter(object schema_capsule) noexcept: + cdef ArrowSchema * schema = cpython.PyCapsule_GetPointer( + schema_capsule, 'arrow_schema' + ) + if schema.release != NULL: + ArrowSchemaRelease(schema) + + +cdef void pycapsule_array_deleter(object array_capsule) noexcept: + cdef ArrowArray * array = cpython.PyCapsule_GetPointer( + array_capsule, 'arrow_array' + ) + # Do not invoke the deleter on a used/moved capsule + if array.release != NULL: + ArrowArrayRelease(array) + + +cdef class OracleArrowArray: + + def __cinit__(self, ArrowType arrow_type, str name, int8_t precision, + int8_t scale, ArrowTimeUnit time_unit): + cdef ArrowType storage_type = arrow_type + self.arrow_type = arrow_type + self.time_unit = time_unit + self.name = name + self.arrow_array = \ + cpython.PyMem_Malloc(sizeof(ArrowArray)) + if arrow_type == NANOARROW_TYPE_TIMESTAMP: + storage_type = NANOARROW_TYPE_INT64 + if time_unit == NANOARROW_TIME_UNIT_MILLI: + self.factor = 1e3 + elif time_unit == NANOARROW_TIME_UNIT_MICRO: + self.factor = 1e6 + elif time_unit == NANOARROW_TIME_UNIT_NANO: + self.factor = 1e9 + else: + self.factor = 1 + + _check_nanoarrow(ArrowArrayInitFromType(self.arrow_array, + storage_type)) + self.arrow_schema = \ + cpython.PyMem_Malloc(sizeof(ArrowSchema)) + _check_nanoarrow(ArrowArrayStartAppending(self.arrow_array)) + if arrow_type == NANOARROW_TYPE_DECIMAL128: + self.precision = precision + self.scale = scale + ArrowSchemaInit(self.arrow_schema) + _check_nanoarrow(ArrowSchemaSetTypeDecimal(self.arrow_schema, + arrow_type, + precision, scale)) + else: + _check_nanoarrow(ArrowSchemaInitFromType(self.arrow_schema, + storage_type)) + if arrow_type == NANOARROW_TYPE_TIMESTAMP: + _check_nanoarrow(ArrowSchemaSetTypeDateTime(self.arrow_schema, + arrow_type, + time_unit, NULL)) + _check_nanoarrow(ArrowSchemaSetName(self.arrow_schema, name.encode())) + + def __dealloc__(self): + if self.arrow_array != NULL: + cpython.PyMem_Free(self.arrow_array) + if self.arrow_schema != NULL: + cpython.PyMem_Free(self.arrow_schema) + + def __len__(self): + return self.arrow_array.length + + def __repr__(self): + return ( + f"OracleArrowArray(name={self.name}, " + f"len={self.arrow_array.length}, " + f"type={self._schema_to_string()})" + ) + + def __str__(self): + return self.__repr__() + + cdef str _schema_to_string(self): + """ + Converts the schema to a string representation. + """ + cdef char buffer[81] + ArrowSchemaToString(self.arrow_schema, buffer, sizeof(buffer), 0) + return buffer.decode() + + cdef int append_bytes(self, void* ptr, int64_t num_bytes) except -1: + """ + Append a value of type bytes to the array. + """ + cdef ArrowBufferView data + data.data.data = ptr + data.size_bytes = num_bytes + _check_nanoarrow(ArrowArrayAppendBytes(self.arrow_array, data)) + + cdef int append_double(self, double value) except -1: + """ + Append a value of type double to the array. + """ + _check_nanoarrow(ArrowArrayAppendDouble(self.arrow_array, value)) + + cdef int append_float(self, float value) except -1: + """ + Append a value of type float to the array. + """ + self.append_double(value) + + cdef int append_int64(self, int64_t value) except -1: + """ + Append a value of type int64_t to the array. + """ + _check_nanoarrow(ArrowArrayAppendInt(self.arrow_array, value)) + + cdef int append_null(self) except -1: + """ + Append a null value to the array. 
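+
+        This delegates to ArrowArrayAppendNull(), which (as implemented in
+        the vendored nanoarrow sources above) appends a 0 bit to the
+        validity bitmap, zero-fills the corresponding slot in the data
+        buffer and increments the array's null_count.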
+ """ + _check_nanoarrow(ArrowArrayAppendNull(self.arrow_array, 1)) + + cdef int append_decimal(self, void* ptr, int64_t num_bytes) except -1: + """ + Append a value of type ArrowDecimal to the array + + Arrow decimals are fixed-point decimal numbers encoded as a + scaled integer. decimal128(7, 3) can exactly represent the numbers + 1234.567 and -1234.567 encoded internally as the 128-bit integers + 1234567 and -1234567, respectively + + """ + cdef: + int64_t i = 0, j = 0 + char* digits = ptr + ArrowStringView decimal_view + ArrowDecimal * decimal = \ + cpython.PyMem_Malloc(sizeof(ArrowDecimal)) + + try: + decimal_view.data = digits + decimal_view.size_bytes = num_bytes + ArrowDecimalInit(decimal, 128, self.precision, self.scale) + _check_nanoarrow(ArrowDecimalSetDigits(decimal, decimal_view)) + _check_nanoarrow(ArrowArrayAppendDecimal(self.arrow_array, decimal)) + finally: + cpython.PyMem_Free(decimal) + + cdef int finish_building(self) except -1: + """ + Finish building the array. No more data will be added to it. + """ + _check_nanoarrow(ArrowArrayFinishBuildingDefault(self.arrow_array, + NULL)) + + def get_buffer_info(self): + """ + Get buffer information required by the dataframe interchange logic. + """ + cdef: + int64_t n_buffers = self.arrow_array.n_buffers + ArrowBufferView *buffer + ArrowArrayView *view + view = cpython.PyMem_Malloc(sizeof(ArrowArrayView)) + _check_nanoarrow(ArrowArrayViewInitFromSchema(view, self.arrow_schema, + NULL)) + _check_nanoarrow(ArrowArrayViewSetArray(view, self.arrow_array, NULL)) + + # initialize all buffers to None to begin with + buffers = { + "validity": None, + "offsets": None, + "data": None + } + + # validity buffer + if n_buffers > 0 and self.arrow_array.null_count > 0: + buffer = &view.buffer_views[0] + buffers["validity"] = ( + buffer.size_bytes, + buffer.data.data + ) + + # data / offset buffer + if n_buffers == 2: + buffer = &view.buffer_views[1] + buffers["data"] = ( + buffer.size_bytes, + buffer.data.data + ) + elif n_buffers == 3: + buffer = &view.buffer_views[1] + buffers["offsets"] = ( + buffer.size_bytes, + buffer.data.data + ) + buffer = &view.buffer_views[2] + buffers["data"] = ( + buffer.size_bytes, + buffer.data.data + ) + + return buffers + + @property + def null_count(self) -> int: + return self.arrow_array.null_count + + @property + def offset(self) -> int: + return self.arrow_array.offset + + def __arrow_c_array__(self, requested_schema=None): + """ + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowArray, + respectively. 
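+
+        Notes
+        -----
+        Arrow-aware consumers invoke this implicitly; for example, the
+        tests in this patch build a pyarrow table directly from the
+        column arrays:
+
+            pyarrow.Table.from_arrays(
+                ora_df.column_arrays(), names=ora_df.column_names()
+            )
+
+        The capsules transfer ownership of the underlying buffers, so a
+        given array can only be consumed once (see test_8010).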
+ """ + if requested_schema is not None: + raise NotImplementedError("requested_schema") + + array_capsule = PyCapsule_New( + self.arrow_array, 'arrow_array', &pycapsule_array_deleter + ) + schema_capsule = PyCapsule_New( + self.arrow_schema, "arrow_schema", &pycapsule_schema_deleter + ) + return schema_capsule, array_capsule diff --git a/src/oracledb/interchange/protocol.py b/src/oracledb/interchange/protocol.py new file mode 100644 index 00000000..e464bb55 --- /dev/null +++ b/src/oracledb/interchange/protocol.py @@ -0,0 +1,282 @@ +# ----------------------------------------------------------------------------- +# MIT License + +# Copyright (c) 2025 Consortium for Python Data API Standards contributors + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ----------------------------------------------------------------------------- + +# ----------------------------------------------------------------------------- +# protocol.py +# +# Implement DataFrame class as documented in the standard +# https://data-apis.org/dataframe-protocol/latest/API.html +# ----------------------------------------------------------------------------- + +from enum import IntEnum +from typing import ( + Any, + ClassVar, + Literal, + Protocol, + Tuple, + TypedDict, +) + +from collections.abc import Iterable, Sequence + + +class DlpackDeviceType(IntEnum): + """Integer enum for device type codes matching DLPack.""" + + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + + +class DtypeKind(IntEnum): + """ + Integer enum for data types. + + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + DECIMAL = 24 + + +Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype + + +class ColumnNullType(IntEnum): + """ + Integer enum for null type representation. + + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. 
+ USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + """Buffers backing a column.""" + + # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + data: Tuple["Buffer", "Dtype"] + + # first element is a buffer containing mask values indicating missing data; + # second element is the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + validity: Tuple["Buffer", "Dtype"] + + # first element is a buffer containing the offset values for + # variable-size binary data (e.g., variable-length strings); + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have an associated offsets buffer + offsets: Tuple["Buffer", "Dtype"] + + +class CategoricalDescription(TypedDict): + """Description of a categorical column.""" + + # whether the ordering of dictionary indices is semantically meaningful + is_ordered: bool + # whether a dictionary-style mapping of categorical values to other objects + # exists + is_dictionary: Literal[True] + # Python-level only (e.g. `{int: str}`). + # None if not a dictionary-style categorical. + categories: "Column" + + +class Buffer(Protocol): + """Interchange buffer object.""" + + @property + def bufsize(self) -> int: + """Buffer size in bytes.""" + + @property + def ptr(self) -> int: + """Pointer to start of the buffer as an integer.""" + + def __dlpack__(self) -> Any: + """Represent this structure as DLPack interface.""" + + def __dlpack_device__(self) -> Tuple["DlpackDeviceType", int | None]: + """Device type and device ID for where the data in the buffer + resides.""" + + +class Column(Protocol): + """Interchange column object.""" + + def size(self) -> int: + """Size of the column in elements.""" + + @property + def offset(self) -> int: + """Offset of the first element with respect to the start + of the underlying buffer.""" # noqa: W505 + + @property + def dtype(self) -> "Dtype": + """Data type of the column.""" + + @property + def describe_categorical(self) -> "CategoricalDescription": + """Description of the categorical data type of the column.""" + + @property + def describe_null(self) -> Tuple["ColumnNullType", Any]: + """Description of the null representation the column uses.""" + + @property + def null_count(self) -> int | None: + """Number of null elements, if known.""" + + @property + def metadata(self) -> dict[str, Any]: + """The metadata for the column.""" + + def num_chunks(self) -> int: + """Return the number of chunks the column consists of.""" + + def get_chunks(self, n_chunks: int | None = None) -> Iterable["Column"]: + """Return an iterator yielding the column chunks.""" + + def get_buffers(self) -> "ColumnBuffers": + """Return a dictionary containing the underlying buffers.""" + + +class DataFrame(Protocol): + """Interchange dataframe object.""" + + version: ClassVar[int] # Version of the protocol + + def __dataframe__( + self, + nan_as_null: bool = False, # noqa: FBT001 + allow_copy: bool = True, # noqa: FBT001 + ) -> "DataFrame": + """Convert to a dataframe object implementing the dataframe + interchange protocol.""" # noqa: W505 + + @property + def metadata(self) -> dict[str, Any]: + """The metadata for the dataframe.""" + + def num_columns(self) -> int: + """Return the number of columns in the dataframe.""" + + def num_rows(self) -> 
int | None: + """Return the number of rows in the dataframe, if available.""" + + def num_chunks(self) -> int: + """Return the number of chunks the dataframe consists of..""" + + def column_names(self) -> Iterable[str]: + """Return the column names.""" + + def get_column(self, i: int) -> "Column": + """Return the column at the indicated position.""" + + def get_column_by_name(self, name: str) -> "Column": + """Return the column with the given name.""" + + def get_columns(self) -> Iterable["Column"]: + """Return an iterator yielding the columns.""" + + def select_columns(self, indices: Sequence[int]) -> "DataFrame": + """Create a new dataframe by selecting a subset of columns by index.""" + + def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": + """Create a new dataframe by selecting a subset of columns by name.""" + + def get_chunks(self, n_chunks: int | None = None) -> Iterable["DataFrame"]: + """Return an iterator yielding the chunks of the dataframe.""" + + +class SupportsInterchange(Protocol): + """Dataframe that supports conversion into an interchange + dataframe object.""" + + def __dataframe__( + self, + nan_as_null: bool = False, # noqa: FBT001 + allow_copy: bool = True, # noqa: FBT001 + ) -> "SupportsInterchange": + """Convert to a dataframe object implementing the dataframe + interchange protocol.""" # noqa: W505 + + +class Endianness: + """Enum indicating the byte-order of a data type.""" + + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + + +class CopyNotAllowedError(RuntimeError): + """Exception raised when a copy is required, + but `allow_copy` is set to `False`.""" diff --git a/src/oracledb/thick_impl.pyx b/src/oracledb/thick_impl.pyx index dc71cd37..8dfe11f1 100644 --- a/src/oracledb/thick_impl.pyx +++ b/src/oracledb/thick_impl.pyx @@ -1,5 +1,5 @@ #------------------------------------------------------------------------------ -# Copyright (c) 2020, 2024, Oracle and/or its affiliates. +# Copyright (c) 2020, 2025, Oracle and/or its affiliates. # # This software is dual-licensed to you under the Universal Permissive License # (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License @@ -64,13 +64,16 @@ from .base_impl cimport ( BaseVarImpl, BindVar, C_DEFAULTS, + char_type, ConnectParamsImpl, + convert_oracle_data_to_arrow, DbType, DB_TYPE_NUM_CURSOR, DRIVER_NAME, DRIVER_VERSION, DRIVER_INSTALLATION_URL, ENCODING_UTF8, + OracleData, OracleMetadata, PURITY_DEFAULT, PY_TYPE_DATE, diff --git a/src/oracledb/thin_impl.pyx b/src/oracledb/thin_impl.pyx index 13a29688..c08a24f6 100644 --- a/src/oracledb/thin_impl.pyx +++ b/src/oracledb/thin_impl.pyx @@ -1,5 +1,5 @@ #------------------------------------------------------------------------------ -# Copyright (c) 2020, 2024, Oracle and/or its affiliates. +# Copyright (c) 2020, 2025, Oracle and/or its affiliates. 
# # This software is dual-licensed to you under the Universal Permissive License # (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License @@ -110,6 +110,7 @@ from .base_impl cimport ( Buffer, ConnectParamsImpl, convert_oracle_data_to_python, + convert_oracle_data_to_arrow, convert_date_to_python, CS_FORM_IMPLICIT, CS_FORM_NCHAR, diff --git a/tests/sql/create_schema.sql b/tests/sql/create_schema.sql index e9486286..b342d5af 100644 --- a/tests/sql/create_schema.sql +++ b/tests/sql/create_schema.sql @@ -379,6 +379,19 @@ create table &main_user..PlsqlSessionCallbacks ( ) / +create table &main_user..TestDataframe ( + Id number(9), + FirstName varchar2(100), + LastName varchar2(100), + City varchar2(100), + Country varchar2(100), + DateOfBirth date, + Salary number(9, 2), + CreditScore number(3, 0), + LastUpdated timestamp +) +/ + -- create queue table and queues for testing advanced queuing begin diff --git a/tests/test_8000_dataframe.py b/tests/test_8000_dataframe.py new file mode 100644 index 00000000..9e1215a8 --- /dev/null +++ b/tests/test_8000_dataframe.py @@ -0,0 +1,481 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) 2025, Oracle and/or its affiliates. +# +# This software is dual-licensed to you under the Universal Permissive License +# (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License +# 2.0 as shown at http://www.apache.org/licenses/LICENSE-2.0. You may choose +# either license. +# +# If you elect to accept the software under the Apache License, Version 2.0, +# the following applies: +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ----------------------------------------------------------------------------- + +""" +Module for testing dataframes +""" +import datetime +import decimal + +import oracledb + +try: + import pyarrow + import pandas + + HAS_INTEROP = True +except ImportError: + HAS_INTEROP = False + +import test_env + +# basic +DATASET_1 = [ + ( + 1, + "John", + "Doe", + "San Francisco", + "USA", + datetime.date(1989, 8, 22), + 12132.40, + 400, + datetime.datetime.now(), + ), + ( + 2, + "Big", + "Hero", + "San Fransokyo", + "Japansa", + datetime.date(1988, 8, 22), + 234234.32, + 400, + datetime.datetime.now(), + ), +] + +# None, -ve +DATASET_2 = [ + ( + 1, + "John", + "Doe", + "San Francisco", + "USA", + datetime.date(1989, 8, 22), + None, + 400, + datetime.datetime.now(), + ), + ( + 2, + "Big", + "Hero", + "San Fransokyo", + None, + datetime.date(1988, 8, 22), + -12312.1, + 0, + datetime.datetime.now(), + ), +] + +# None, +/- 0.XXX +DATASET_3 = [ + ( + 1, + "John", + "Doe", + "San Francisco", + "USA", + datetime.date(1989, 8, 22), + None, + 400, + datetime.datetime.now(), + ), + ( + 2, + "Big", + "Hero", + "San Fransokyo", + None, + datetime.date(1988, 8, 22), + 0.12, + 0, + datetime.datetime.now(), + ), + ( + 3, + "John", + "Doe", + "San Francisco", + "USA", + datetime.date(1989, 8, 22), + None, + 400, + datetime.datetime.now(), + ), + ( + 4, + "Big", + "Hero", + "San Fransokyo", + None, + datetime.date(1988, 8, 22), + -0.01, + 0, + datetime.datetime.now(), + ), +] + +# Duplicates +DATASET_4 = [ + ( + 1, + "John", + "Doe", + "San Francisco", + "USA", + datetime.date(1989, 8, 22), + -0.01, + 0, + datetime.datetime.now(), + ), + ( + 2, + "John", + "Doe", + "San Francisco", + "USA", + datetime.date(1988, 8, 22), + -0.01, + 0, + datetime.datetime.now(), + ), + ( + 3, + "John", + "Doe", + "San Francisco", + "USA", + datetime.date(1988, 8, 22), + -0.01, + 0, + datetime.datetime.now(), + ), + ( + 4, + "John", + "Doe", + "San Francisco", + "USA", + datetime.date(1988, 8, 22), + -0.01, + 0, + datetime.datetime.now(), + ), + ( + 5, + "John", + "Doe", + "San Francisco", + "USA", + datetime.date(1988, 8, 22), + -0.01, + 0, + datetime.datetime.now(), + ), + ( + 6, + "John", + "Doe", + "San Francisco", + "USA", + datetime.date(1988, 8, 22), + -0.01, + 0, + datetime.datetime.now(), + ), +] + + +class TestCase(test_env.BaseTestCase): + + def __check_interop(self): + """ + Checks to see if the pyarrow and pandas modules are available. + """ + if not HAS_INTEROP: + self.skipTest("missing pandas or pyarrow modules") + + def __convert_to_array(self, data, typ): + """ + Convert raw data to an Arrow array using pyarrow. + """ + if isinstance(typ, pyarrow.Decimal128Type): + data = [ + decimal.Decimal(str(value)) if value is not None else value + for value in data + ] + elif isinstance(typ, pyarrow.TimestampType): + if typ.unit == "s": + data = [ + datetime.datetime(v.year, v.month, v.day).timestamp() + for v in data + ] + else: + data = [value.timestamp() * 1000000 for value in data] + mask = [value is None for value in data] + return pyarrow.array(data, typ, mask=mask) + + def __convert_to_df(self, data): + """ + Converts the data set to a Pandas data frame for comparison to what is + returned from the database. 
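+
+        The pyarrow column types mirror the TestDataframe table created in
+        tests/sql/create_schema.sql; when oracledb.defaults.fetch_decimals
+        is enabled, the NUMBER columns are represented as decimal128
+        instead of int64/float64.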
+ """ + data_by_col = [[row[i] for row in data] for i in range(len(data[0]))] + fetch_decimals = oracledb.defaults.fetch_decimals + types = [ + pyarrow.decimal128(9) if fetch_decimals else pyarrow.int64(), + pyarrow.string(), + pyarrow.string(), + pyarrow.string(), + pyarrow.string(), + pyarrow.timestamp("s"), + pyarrow.decimal128(9, 2) if fetch_decimals else pyarrow.float64(), + pyarrow.decimal128(3) if fetch_decimals else pyarrow.int64(), + pyarrow.timestamp("us"), + ] + arrays = [ + self.__convert_to_array(d, t) for d, t in zip(data_by_col, types) + ] + names = [ + "ID", + "FIRSTNAME", + "LASTNAME", + "CITY", + "COUNTRY", + "DATEOFBIRTH", + "SALARY", + "CREDITSCORE", + "LASTUPDATED", + ] + pa_tab = pyarrow.Table.from_arrays(arrays, names=names) + return pa_tab.to_pandas() + + def __get_data_from_df(self, df): + """ + Returns data from the data frame in a normalized fashion suitable for + comparison. In particular, NaN values cannot be compared to one another + so they are converted to the value None for comparison purposes. + """ + return [ + tuple(None if pandas.isna(v) else v for v in row) + for row in df.itertuples(index=False, name=None) + ] + + def __populate_table(self, data): + """ + Populate the test table with the given data. + """ + self.cursor.execute("truncate table TestDataframe") + types = [None] * len(data[0]) + types[8] = oracledb.DB_TYPE_TIMESTAMP + self.cursor.setinputsizes(*types) + self.cursor.executemany( + """ + insert into TestDataframe ( + Id, FirstName, LastName, City, Country, + DateOfBirth, Salary, CreditScore, LastUpdated + ) values ( + :id, :first_name, :last_name, :city, :country, + :dob, :salary, :credit_score, :last_updated + ) + """, + data, + ) + self.conn.commit() + + def __test_df_interop(self, data): + """ + Tests interoperability with external data frames using the data set + provided. + """ + self.__check_interop() + self.__populate_table(data) + statement = "select * from TestDataFrame order by Id" + ora_df = self.conn.fetch_df_all(statement) + self.__validate_df(ora_df, data) + + def __test_df_batches_interop(self, data, batch_size, num_batches): + """ + Tests interoperability with external data frames using the data set + provided. + """ + self.__check_interop() + self.__populate_table(data) + statement = "select * from TestDataFrame order by Id" + batches = list(self.conn.fetch_df_batches(statement, size=batch_size)) + self.assertEqual(len(batches), num_batches) + if num_batches == 1: + self.__validate_df(batches[0], data) + else: + offset = 0 + for batch in batches: + self.__validate_df(batch, data[offset : offset + batch_size]) + offset += batch_size + + def __validate_df(self, ora_df, data): + """ + Validates the data frame by converting it to Pandas and comparing it + with the original data set that was used. 
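+
+        The fetched OracleDataFrame is materialized with
+        pyarrow.Table.from_arrays(), which consumes the __arrow_c_array__()
+        capsule of each column, and converted to Pandas with to_pandas()
+        before the two data sets are compared.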
+ """ + raw_df = self.__convert_to_df(data) + raw_data = self.__get_data_from_df(raw_df) + fetched_tab = pyarrow.Table.from_arrays( + ora_df.column_arrays(), names=ora_df.column_names() + ) + fetched_df = fetched_tab.to_pandas() + fetched_data = self.__get_data_from_df(fetched_df) + self.assertEqual(fetched_data, raw_data) + + def test_8000(self): + "8000 - test basic fetch of data frame" + self.__populate_table(DATASET_1) + statement = "select * from TestDataFrame order by Id" + ora_df = self.conn.fetch_df_all(statement) + self.assertEqual(ora_df.num_rows(), len(DATASET_1)) + self.assertEqual(ora_df.num_columns(), len(DATASET_1[0])) + metadata = dict( + num_columns=ora_df.num_columns(), + num_rows=ora_df.num_rows(), + num_chunks=1, + ) + self.assertEqual(ora_df.metadata, metadata) + + def test_8001(self): + "8001 - test conversion to external dataframe" + self.__test_df_interop(DATASET_1) + + def test_8002(self): + "8001 - test null and negative values" + self.__test_df_interop(DATASET_2) + + def test_8003(self): + "8002 - test with fetch_decimals" + with test_env.DefaultsContextManager("fetch_decimals", True): + self.__test_df_interop(DATASET_1) + + def test_8004(self): + "8003 - test null and negative values with fetch_decimals" + with test_env.DefaultsContextManager("fetch_decimals", True): + self.__test_df_interop(DATASET_2) + + def test_8005(self): + "8005 - test null and values with leading zeros" + self.__test_df_interop(DATASET_3) + + def test_8006(self): + "8005 - test null and values with leading zeros with fetch_decimals" + with test_env.DefaultsContextManager("fetch_decimals", True): + self.__test_df_interop(DATASET_3) + + def test_8007(self): + "8007 - duplicate values in the rows" + self.__test_df_interop(DATASET_4) + + def test_8008(self): + "8008 - batches without specification of size" + self.__test_df_batches_interop( + DATASET_4, batch_size=None, num_batches=1 + ) + + def test_8009(self): + "8009 - batches with specification of size" + self.__test_df_batches_interop(DATASET_4, batch_size=5, num_batches=2) + + def test_8010(self): + "8010 - verify passing Arrow arrays twice fails" + self.__check_interop() + self.__populate_table(DATASET_1) + statement = "select * from TestDataFrame order by Id" + ora_df = self.conn.fetch_df_all(statement) + pyarrow.Table.from_arrays( + ora_df.column_arrays(), names=ora_df.column_names() + ) + with self.assertRaises(pyarrow.lib.ArrowInvalid): + pyarrow.Table.from_arrays( + ora_df.column_arrays(), names=ora_df.column_names() + ) + + def test_8011(self): + "8011 - verify empty data set" + self.__populate_table(DATASET_1) + statement = "select * from TestDataFrame where Id = 4" + ora_df = self.conn.fetch_df_all(statement) + self.assertEqual(ora_df.num_rows(), 0) + + def test_8012(self): + "8012 - verify empty data set with batches" + self.__populate_table(DATASET_1) + statement = "select * from TestDataFrame where Id = 4" + for ora_df in self.conn.fetch_df_batches(statement): + self.assertEqual(ora_df.num_rows(), 0) + + def test_8013(self): + "8013 - negative checks on attributes" + self.__populate_table(DATASET_1) + statement = "select * from TestDataFrame order by Id" + ora_df = self.conn.fetch_df_all(statement) + with self.assertRaises(IndexError): + ora_df.get_column(121) + with self.assertRaises(IndexError): + ora_df.get_column(-1) + with self.assertRaises(KeyError): + ora_df.get_column_by_name("missing_column") + + def test_8014(self): + "8014 - check size and null count with no nulls" + self.__populate_table(DATASET_1) + statement = 
"select * from TestDataFrame order by Id" + ora_df = self.conn.fetch_df_all(statement) + col = ora_df.get_column(0) + self.assertEqual(col.size(), len(DATASET_1)) + self.assertEqual(col.null_count, 0) + + def test_8015(self): + "8015 - check size and null count with nulls present" + self.__populate_table(DATASET_2) + statement = "select * from TestDataFrame order by Id" + ora_df = self.conn.fetch_df_all(statement) + col = ora_df.get_column_by_name("SALARY") + self.assertEqual(col.size(), len(DATASET_2)) + self.assertEqual(col.null_count, 1) + + def test_8016(self): + "8016 - check unsupported error for LOBs" + statement = "select to_clob('test_8016') from dual" + with self.assertRaisesFullCode("DPY-3030"): + self.conn.fetch_df_all(statement) + + def test_8017(self): + "8017 - batches with specification of size matching number of rows" + self.__test_df_batches_interop( + DATASET_2, batch_size=len(DATASET_2), num_batches=1 + ) + + +if __name__ == "__main__": + test_env.run_test_cases() diff --git a/utils/templates/connection.py b/utils/templates/connection.py index eaabc2f3..bb607666 100644 --- a/utils/templates/connection.py +++ b/utils/templates/connection.py @@ -711,6 +711,43 @@ def encode_oson(self, value): self._verify_connected() return self._impl.encode_oson(value) + def fetch_df_all( + self, + statement: str, + parameters: Optional[Union[list, tuple, dict]] = None, + arraysize: Optional[int] = None, + ): + """ + Fetch all data as OracleDataFrame. + """ + cursor = self.cursor() + cursor._impl.fetching_arrow = True + if arraysize is not None: + cursor.arraysize = arraysize + cursor.prefetchrows = cursor.arraysize + cursor.execute(statement, parameters) + return cursor._impl.fetch_df_all(cursor) + + def fetch_df_batches( + self, + statement: str, + parameters: Optional[Union[list, tuple, dict]] = None, + size: Optional[int] = None, + ): + """ + Fetch data in batches. Each batch is an OracleDataFrame + """ + cursor = self.cursor() + cursor._impl.fetching_arrow = True + if size is not None: + cursor.arraysize = size + cursor.prefetchrows = cursor.arraysize + cursor.execute(statement, parameters) + if size is None: + yield cursor._impl.fetch_df_all(cursor) + else: + yield from cursor._impl.fetch_df_batches(cursor, batch_size=size) + def getSodaDatabase(self) -> SodaDatabase: """ Return a SODA database object for performing all operations on Simple