From ee5a2ff3106052d5874107bcb11c95a8c5dfdbc4 Mon Sep 17 00:00:00 2001 From: mara004 Date: Mon, 30 Oct 2023 16:23:32 +0100 Subject: [PATCH] Conda packaging code (#268) No CI integration yet, to be done in a following PR. --- .github/workflows/build_packages.yaml | 4 +- .gitignore | 1 + README.md | 213 +++++++++------- conda/bundle/recipe/meta.yaml | 75 ++++++ conda/helpers/recipe/meta.yaml | 73 ++++++ conda/prepare_script.py | 15 ++ conda/raw/minitest.py | 22 ++ conda/raw/recipe/meta.yaml | 67 +++++ data/.gitkeep | 1 - docs/devel/2023_10_setup_tasks.md | 22 -- docs/devel/changelog_staging.md | 9 +- docs/devel/tasks.md | 34 --- run | 16 +- setup.cfg | 4 - setup.py | 111 ++++---- setupsrc/pypdfium2_setup/_compat.py | 33 --- setupsrc/pypdfium2_setup/autorelease.py | 15 +- setupsrc/pypdfium2_setup/build_pdfium.py | 128 +++------- setupsrc/pypdfium2_setup/craft_packages.py | 246 +++++++++++++++--- setupsrc/pypdfium2_setup/emplace.py | 107 ++++---- setupsrc/pypdfium2_setup/packaging_base.py | 282 ++++++++++++++------- setupsrc/pypdfium2_setup/update_pdfium.py | 58 ++--- src/pypdfium2/_helpers/document.py | 2 +- src/pypdfium2/_library_scope.py | 15 +- src/pypdfium2/version.py | 94 ++++--- tests_old/conftest.py | 3 - tests_old/test_setup.py | 118 --------- 27 files changed, 1019 insertions(+), 749 deletions(-) create mode 100644 conda/bundle/recipe/meta.yaml create mode 100644 conda/helpers/recipe/meta.yaml create mode 100644 conda/prepare_script.py create mode 100644 conda/raw/minitest.py create mode 100644 conda/raw/recipe/meta.yaml delete mode 100644 data/.gitkeep delete mode 100644 docs/devel/2023_10_setup_tasks.md delete mode 100644 docs/devel/tasks.md delete mode 100644 setupsrc/pypdfium2_setup/_compat.py delete mode 100644 tests_old/test_setup.py diff --git a/.github/workflows/build_packages.yaml b/.github/workflows/build_packages.yaml index 322c7484e..c4f6e13df 100644 --- a/.github/workflows/build_packages.yaml +++ b/.github/workflows/build_packages.yaml @@ -82,8 
+82,8 @@ jobs: - name: Run test suite run: ./run test - - name: Run packaging script - run: ./run packaging + - name: Run PyPI packaging script + run: ./run packaging_pypi - name: Upload release notes uses: actions/upload-artifact@v3 diff --git a/.gitignore b/.gitignore index cb37fc929..2d11edac2 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ build/ dist/ +conda/*/out/ tests/output/ tests_old/output/ diff --git a/README.md b/README.md index e319137ae..7ab1f3456 100644 --- a/README.md +++ b/README.md @@ -7,90 +7,115 @@ [pypdfium2](https://github.com/pypdfium2-team/pypdfium2) is an [ABI-level](#drawbacks-of-abi-level-bindings) Python 3 binding to [PDFium](https://pdfium.googlesource.com/pdfium/+/refs/heads/main), a powerful and liberal-licensed library for PDF rendering, inspection, manipulation and creation. -This bindings project is built using [ctypesgen](https://github.com/ctypesgen/ctypesgen) and external [PDFium binaries](https://github.com/bblanchon/pdfium-binaries/). -Its custom setup infrastructure provides a seamless packaging and installation process. A wide range of platforms is supported with wheel packages. +It is built with [ctypesgen](https://github.com/pypdfium2-team/ctypesgen) and external [PDFium binaries](https://github.com/bblanchon/pdfium-binaries/). +The custom setup infrastructure provides a seamless packaging and installation process. A wide range of platforms is supported with pre-built packages. pypdfium2 includes helpers to simplify common use cases, while the raw PDFium/ctypes API remains accessible as well. ## Installation -* Installing the latest PyPI release (recommended) +* From PyPI (recommended) ```bash python3 -m pip install -U pypdfium2 ``` This will use a pre-built wheel package, the easiest way of installing pypdfium2. 
-* Installing from source +* From source - * With an external PDFium binary + * Dependencies: + - System: git, C pre-processor (gcc/clang, has to be in `$PATH`) + - Python: ctypesgen (pypdfium2-team fork), wheel, setuptools. Usually installed automatically. + + * With pre-built binary ```bash # In the directory containing the source code of pypdfium2 - python3 -m pip install . + python3 -m pip install -v . ``` + A binary is downloaded implicitly from `pdfium-binaries` and bundled into pypdfium2. - * With a locally built PDFium binary + * With self-built binary ```bash python3 setupsrc/pypdfium2_setup/build_pdfium.py # call with --help to list options - PDFIUM_PLATFORM="sourcebuild" python3 -m pip install . + PDFIUM_PLATFORM="sourcebuild" python3 -m pip install -v . + ``` + Building PDFium may take a long time, as it comes with its bundled toolchain and deps, rather than consuming them from the system.[^pdfium_buildsystem] + However, there is at least an option `--use-syslibs` to build against system-provided runtime libraries. + + * With system-provided binary + ```bash + # Substitute `$PDFIUM_VER` with the system pdfium's build version. + PDFIUM_PLATFORM="system:$PDFIUM_VER" python3 -m pip install -v . ``` - Building PDFium may take a long time because it comes with its own toolchain and bundled dependencies, rather than using system-provided components.[^pdfium_buildsystem] + Link against external pdfium instead of bundling it. + For ABI safety reasons, you'll want to make sure `$PDFIUM_VER` is correct and the bindings are rebuilt whenever system pdfium is updated. - The host system needs to provide `git` and a C pre-processor (`gcc` or `clang`). - Setup code also depends on the Python packages `ctypesgen`, `wheel`, and `setuptools`, which will usually get installed automatically. + See [Setup Magic](#setup-magic) for further options. 
- When installing from source, some additional options of the `pip` package manager may be relevant: - * `-v`: Request more detailed logging output. Useful for debugging. - * `-e`: Install in editable mode, so that the installation will point to the source tree. This way, changes directly take effect without needing to re-install. Recommended for development. - * `--no-build-isolation`: Do not isolate the installation in a virtual environment and use system packages instead. In this case, dependencies specified in `pyproject.toml` (PEP 518) will not take effect and should be pre-installed by the caller. This is an indispensable option if wanting to run the installation with custom versions of setup dependencies.[^no_build_isolation] + Support for source installs (esp. with self-built/system pdfium) is limited, as their integrity depends somewhat on a correctly acting caller. - [^pdfium_buildsystem]: Replacing PDFium's toolchain with a lean build system that is designed to run on an arbitrary host platform is a long-standing task. This would be required to enable local source build capabilities on installation of an `sdist`. If you have the time and expertise to set up such a build system, please start a repository and inform us about it. + Installing an `sdist` does not implicitly trigger a sourcebuild if no pre-built binary is available. It is preferred to let callers decide consciously what to do, and run the build script without pip encapsulation. - [^no_build_isolation]: Possible scenarios include using a locally modified version of a dependency, or supplying a dependency built from a certain commit. + Relevant pip options: + * `-v`: Verbose logging output. Useful for debugging. + * `-e`: Install in editable mode, so the installation points to the source tree. This way, changes directly take effect without needing to re-install. Recommended for development. + * `--no-build-isolation`: Do not isolate setup in a virtual env; use the main env instead. 
This renders `pyproject.toml [build-system]` inactive, setup deps must be prepared by caller. Useful to install custom versions of setup deps, or as speedup when installing repeatedly. + + [^pdfium_buildsystem]: This means pdfium may not compile on arbitrary hosts. The script is limited to build hosts supported by Google's toolchain. Ideally, we'd need an alternative build system that runs with system packages instead. + +* Conda + + pypdfium2 will soon provide official conda packages in a custom channel. + The main packaging code is merged, only CI integration / docs are not done yet. + + **Beware:** There have been some third-party attempts to conda package pypdfium2/pdfium-binaries. **Any recipes/packages that might be provided by other distributors, including `anaconda/main` or `conda-forge`, are unofficial!** + +* Unofficial packages -* Unofficial distributions + The authors of this project have no control over and are not responsible for possible third-party builds of pypdfium2, and we do not support them. Please use the official packages instead. - pypdfium2 currently releases official builds on PyPI and GitHub. - The authors of this project have no control over and are not responsible for third-party distributions of pypdfium2 (such as unofficial conda packages/recipes). - However, we are interested in cooperation with external package maintainers for wider adoption of pypdfium2 (e.g. linux distros). + Nonetheless, we may assist external package maintainers on behalf of wider adoption of pypdfium2 (e.g. linux distros). However, this does not imply our approval. + + +### Runtime Dependencies +As of this writing, pypdfium2 does not need any mandatory runtime dependencies apart from Python itself. + +However, some optional support model features require additional packages: +* [`Pillow`](https://pillow.readthedocs.io/en/stable/) (module name `PIL`) is a popular imaging library for Python.
pypdfium2 provides convenience methods to translate between raw bitmap buffers and PIL images. +* [`NumPy`](https://numpy.org/doc/stable/index.html) is a library for scientific computing. Similar to `Pillow`, pypdfium2 provides helpers to get a numpy array view of a raw bitmap. -### Setup magic -As pypdfium2 uses external binaries, there are some special setup aspects to consider. -Note, the APIs below may change any time and are mostly of internal interest. +### Setup Magic + +As pypdfium2 requires a C extension and has custom setup code, there are some special features to consider. Note, the APIs below may change any time and are mostly of internal interest. * Binaries are stored in platform-specific sub-directories of `data/`, along with bindings and version information. -* The env var `$PDFIUM_PLATFORM` controls which binary to include on setup. + +* `$PDFIUM_PLATFORM` defines which binary to include on setup. - Format spec: `[$PLATFORM][-v8][:$VERSION]` (`[]` = segments, `$CAPS` = variables). - Examples: `auto`, `auto:5975` `auto-v8:5975` (`auto` may be substituted by an explicit platform name, e.g. `linux_x64`). - Platform: + If unset or `auto`, the host platform is detected and a corresponding binary will be selected. + If an explicit platform identifier (e.g. `linux_x64`, `darwin_arm64`, ...), binaries for the requested platform will be used.[^platform_ids] + + If `system`, bind against system-provided pdfium instead of embedding a binary. Version must be given explicitly so matching bindings can be generated. + If `sourcebuild`, binaries will be taken from `data/sourcebuild/`, assuming a prior run of `build_pdfium.py`. - + If `system`, caller-supplied bindings loading system pdfium, and a version file will be expected. This may be changed to auto-generate these files from a given version shorthand in the future. + If `none`, no platform-dependent files will be included, so as to create a source distribution. 
- `sourcebuild`, `system` and `none` are standalone, they cannot be followed by additional specifiers. + `sourcebuild` and `none` are standalone, they cannot be followed by additional specifiers. - V8: If given, use the V8 (JavaScript) and XFA enabled pdfium binaries. Otherwise, use the regular (non-V8) binaries. - Version: If given, use the specified pdfium-binaries release. Otherwise, use the latest one. -* `$PYPDFIUM_MODULES=[raw,helpers]` defines which modules to include. Metadata adapts dynamically. + - It is possible to prepend `prepared!` to install with existing platform files instead of generating on the fly; the value will be used for metadata / file inclusion. This can be helpful when installing in an isolated env where ctypesgen is not available, but it is not desirable to use the reference bindings (e.g. conda). + +* `$PYPDFIUM_MODULES=[raw,helpers]` defines the modules to include. Metadata adapts dynamically. - May be used by packagers to decouple raw bindings and helpers, which can be important if packaging against system pdfium. - Would also allow to install only the raw module without helpers, or only helpers with a custom raw module. + * `$PDFIUM_BINDINGS=reference` allows to override ctypesgen and use the reference bindings file `autorelease/bindings.py` instead. - This is a convenience option to get pypdfium2 installed from source even if a working ctypesgen is not available in the install env. - - Warning: This might not be ABI-safe. Please make sure binary/bindings build headers match to avoid ABI issues. + - Warning: This may not be ABI-safe. Please make sure binary/bindings build headers match to avoid ABI issues. [^platform_ids]: Intended for packaging, so that wheels can be crafted for any platform without access to a native host. -### Runtime Dependencies - -pypdfium2 does not have any mandatory runtime dependencies apart from Python and its standard library. 
- -However, some optional support model features require additional packages: -* [`Pillow`](https://pillow.readthedocs.io/en/stable/) (module name `PIL`) is a highly pouplar imaging library for Python. - pypdfium2 provides convenience methods to directly take or return PIL image objects when dealing with raster graphics. -* [`NumPy`](https://numpy.org/doc/stable/index.html) is a library for scientific computing. Similar to `Pillow`, pypdfium2 provides helpers to get raster graphics in the form of multidimensional numpy arrays. - ## Usage @@ -210,11 +235,11 @@ permission_flags = pdfium_c.FPDF_GetDocPermission(pdf.raw) # explicit permission_flags = pdfium_c.FPDF_GetDocPermission(pdf) # implicit ``` -For PDFium documentation, please look at the comments in its [public header files](https://pdfium.googlesource.com/pdfium/+/refs/heads/main/public/).[^pdfium_docs] +For PDFium docs, please look at the comments in its [public header files](https://pdfium.googlesource.com/pdfium/+/refs/heads/main/public/).[^pdfium_docs] A large variety of examples on how to interface with the raw API using [`ctypes`](https://docs.python.org/3/library/ctypes.html) is already provided with [support model source code](src/pypdfium2/_helpers). Nonetheless, the following guide may be helpful to get started with the raw API, especially for developers who are not familiar with `ctypes` yet. -[^pdfium_docs]: Unfortunately, no recent HTML-rendered documentation is available for PDFium at the moment. +[^pdfium_docs]: Unfortunately, no recent HTML-rendered docs are available for PDFium at the moment. 
@@ -251,7 +276,7 @@ Nonetheless, the following guide may be helpful to get started with the raw API, version = c_version.value if ok else None ``` -* If an array is required as output parameter, you can initialise one like this (conceived in general terms): +* If an array is required as output parameter, you can initialise one like this (in general terms): ```python # long form array_type = (c_type * array_length) @@ -284,7 +309,7 @@ Nonetheless, the following guide may be helpful to get started with the raw API, pdfium_c.FPDFBookmark_GetTitle(bookmark, buffer, n_bytes) # Decode to string, cutting off the null terminator # Encoding: UTF-16LE (2 bytes per character) - title = buffer.raw[:n_bytes-2].decode('utf-16-le') + title = buffer.raw[:n_bytes-2].decode("utf-16-le") ``` Example B: Extracting text in given boundaries. @@ -327,26 +352,26 @@ Nonetheless, the following guide may be helpful to get started with the raw API, To access the data, you'll want to re-interpret the pointer using `ctypes.cast()` to encompass the whole array: ```python # (Assuming `bitmap` is an FPDF_BITMAP and `size` is the expected number of bytes in the buffer) - first_item = pdfium_c.FPDFBitmap_GetBuffer(bitmap) - buffer = ctypes.cast(first_item, ctypes.POINTER(ctypes.c_ubyte * size)) + buffer_ptr = pdfium_c.FPDFBitmap_GetBuffer(bitmap) + buffer_ptr = ctypes.cast(buffer_ptr, ctypes.POINTER(ctypes.c_ubyte * size)) # Buffer as ctypes array (referencing the original buffer, will be unavailable as soon as the bitmap is destroyed) - c_array = buffer.contents + c_array = buffer_ptr.contents # Buffer as Python bytes (independent copy) data = bytes(c_array) ``` * Writing data from Python into a C buffer works in a similar fashion: ```python - # (Assuming `first_item` is a pointer to the first item of a C buffer to write into, + # (Assuming `buffer_ptr` is a pointer to the first item of a C buffer to write into, # `size` the number of bytes it can store, and `py_buffer` a Python byte buffer) - 
c_buffer = ctypes.cast(first_item, ctypes.POINTER(ctypes.c_char * size)) + buffer_ptr = ctypes.cast(buffer_ptr, ctypes.POINTER(ctypes.c_char * size)) # Read from the Python buffer, starting at its current position, directly into the C buffer # (until the target is full or the end of the source is reached) - n_bytes = py_buffer.readinto(c_buffer.contents) # returns the number of bytes read + n_bytes = py_buffer.readinto(buffer_ptr.contents) # returns the number of bytes read ``` -* If you wish to check whether two objects returned by PDFium are the same, the `is` operator won't help you because `ctypes` does not have original object return (OOR), - i. e. new, equivalent Python objects are created each time, although they might represent one and the same C object.[^ctypes_no_oor] That's why you'll want to use `ctypes.addressof()` to get the memory addresses of the underlying C object. +* If you wish to check whether two objects returned by PDFium are the same, the `is` operator won't help because `ctypes` does not have original object return (OOR), i. e. new, equivalent Python objects are created each time, although they might represent one and the same C object.[^ctypes_no_oor] + That's why you'll want to use `ctypes.addressof()` to get the memory addresses of the underlying C object. For instance, this is used to avoid infinite loops on circular bookmark references when iterating through the document outline: ```python # (Assuming `pdf` is an FPDF_DOCUMENT) @@ -367,10 +392,12 @@ Nonetheless, the following guide may be helpful to get started with the raw API, * In many situations, callback functions come in handy.[^callback_usecases] Thanks to `ctypes`, it is seamlessly possible to use callbacks across Python/C language boundaries. - [^callback_usecases]: e. g. incremental reading/writing, progress bars, pausing of progressive tasks, ... + [^callback_usecases]: e. g. incremental read/write, management of progressive tasks, ... 
Example: Loading a document from a Python buffer. This way, file access can be controlled in Python while the whole data does not need to be in memory at once. ```python + import os + # Factory class to create callable objects holding a reference to a Python buffer class _reader_class: @@ -379,14 +406,14 @@ Nonetheless, the following guide may be helpful to get started with the raw API, def __call__(self, _, position, p_buf, size): # Write data from Python buffer into C buffer, as explained before - c_buffer = ctypes.cast(p_buf, ctypes.POINTER(ctypes.c_char * size)) + buffer_ptr = ctypes.cast(p_buf, ctypes.POINTER(ctypes.c_char * size)) self.py_buffer.seek(position) - self.py_buffer.readinto(c_buffer.contents) + self.py_buffer.readinto(buffer_ptr.contents) return 1 # non-zero return code for success # (Assuming py_buffer is a Python file buffer, e. g. io.BufferedReader) # Get the length of the buffer - py_buffer.seek(0, 2) + py_buffer.seek(0, os.SEEK_END) file_len = py_buffer.tell() py_buffer.seek(0) @@ -394,13 +421,8 @@ Nonetheless, the following guide may be helpful to get started with the raw API, fileaccess = pdfium_c.FPDF_FILEACCESS() fileaccess.m_FileLen = file_len - # Option A) Assign callback via lower-level helper (recommended) - # This automates extracting the CFUNCTYPE from the bindings and wrapping the callable - pdfium_i.set_callback(fileaccess, "m_GetBlock", _reader_class(py_buffer)) - - # Option B) Alternatively, you could copy-paste the CFUNCTYPE (discouraged) - functype = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.POINTER(None), ctypes.c_ulong, ctypes.POINTER(ctypes.c_ubyte), ctypes.c_ulong) - fileaccess.m_GetBlock = functype( _reader_class(py_buffer) ) + # Assign the callback, wrapped in its CFUNCTYPE + fileaccess.m_GetBlock = type(fileaccess.m_GetBlock)( _reader_class(py_buffer) ) # Finally, load the document pdf = pdfium_c.FPDF_LoadCustomDocument(fileaccess, None) @@ -410,9 +432,9 @@ Nonetheless, the following guide may be helpful to get started with 
the raw API, * When using the raw API, special care needs to be taken regarding object lifetime, considering that Python may garbage collect objects as soon as their reference count reaches zero. However, the interpreter has no way of magically knowing how long the underlying resources of a Python object might still be needed on the C side, so measures need to be taken to keep such objects referenced until PDFium does not depend on them anymore. - If resources need to remain valid after the time of a function call, PDFium documentation usually indicates this clearly. Ignoring requirements on object lifetime will lead to memory corruption (commonly resulting in a segmentation fault). + If resources need to remain valid after the time of a function call, PDFium docs usually indicate this clearly. Ignoring requirements on object lifetime will lead to memory corruption (commonly resulting in a segfault). - For instance, the documentation on `FPDF_LoadCustomDocument()` states that + For instance, the docs on `FPDF_LoadCustomDocument()` state that > The application must keep the file resources |pFileAccess| points to valid until the returned FPDF_DOCUMENT is closed. |pFileAccess| itself does not need to outlive the FPDF_DOCUMENT. This means that the callback function and the Python buffer need to be kept alive as long as the `FPDF_DOCUMENT` is used. @@ -446,7 +468,7 @@ Nonetheless, the following guide may be helpful to get started with the raw API, data_holder.close() ``` -* Finally, let's finish this guide with an example on how to render the first page of a document to a `PIL` image in `RGBA` color format. +* Finally, let's finish with an example how to render the first page of a document to a `PIL` image in `RGBA` color format. 
```python import math import ctypes @@ -491,12 +513,12 @@ Nonetheless, the following guide may be helpful to get started with the raw API, pdfium_c.FPDF_RenderPageBitmap(*render_args) # Get a pointer to the first item of the buffer - first_item = pdfium_c.FPDFBitmap_GetBuffer(bitmap) + buffer_ptr = pdfium_c.FPDFBitmap_GetBuffer(bitmap) # Re-interpret the pointer to encompass the whole buffer - buffer = ctypes.cast(first_item, ctypes.POINTER(ctypes.c_ubyte * (width * height * 4))) + buffer_ptr = ctypes.cast(buffer_ptr, ctypes.POINTER(ctypes.c_ubyte * (width * height * 4))) # Create a PIL image from the buffer contents - img = PIL.Image.frombuffer("RGBA", (width, height), buffer.contents, "raw", "BGRA", 0, 1) + img = PIL.Image.frombuffer("RGBA", (width, height), buffer_ptr.contents, "raw", "BGRA", 0, 1) # Save it as file img.save("out.png") @@ -526,46 +548,38 @@ To the author's knowledge, pypdfium2 is one of the rare Python libraries that ar As of early 2023, a single developer is author and rightsholder of the code base (apart from a few minor [code contributions](https://github.com/pypdfium2-team/pypdfium2/graphs/contributors)). -[^liberal_pdf_renderlibs]: The only other liberal-licensed PDF rendering libraries known to the authors are [`pdf.js`](https://github.com/mozilla/pdf.js/) (JavaScript) and [`Apache PDFBox`](https://github.com/apache/pdfbox) (Java). `pdf.js` is limited to a web environment. Creating Python bindings to `PDFBox` might be possible but there is no serious solution yet (apart from amateurish wrappers around its command-line API). +[^liberal_pdf_renderlibs]: The only other liberal-licensed PDF rendering libraries known to the author are [`pdf.js`](https://github.com/mozilla/pdf.js/) (JavaScript) and [`Apache PDFBox`](https://github.com/apache/pdfbox) (Java), but python bindings packages don't exist yet or are unsatisfactory. 
However, we wrote some gists that show it'd be possible in principle: [pdfbox](https://gist.github.com/mara004/51c3216a9eabd3dcbc78a86d877a61dc) (+ [setup](https://gist.github.com/mara004/881d0c5a99b8444fd5d1d21a333b70f8)), [pdfjs](https://gist.github.com/mara004/87276da4f8be31c80c38036c6ab667d7). ## Issues While using pypdfium2, you might encounter bugs or missing features. -In this case, please file an issue report. Remember to include applicable details such as tracebacks, operating system and CPU architecture, as well as the versions of pypdfium2 and used dependencies. - -In case your issue could be tracked down to a third-party dependency, we will accompany or conduct subsequent measures. +In this case, feel free to file an issue. If applicable, include details such as tracebacks, OS and CPU type, as well as the versions of pypdfium2 and used dependencies. -Here is a roadmap of relevant places: +Roadmap: * pypdfium2 - - [Issues panel](https://github.com/pypdfium2-team/pypdfium2/issues): Initial reports of specific issues. - May need to be transferred to other projects if not caused by or fixable in pypdfium2 code alone. + - [Issues panel](https://github.com/pypdfium2-team/pypdfium2/issues): Initial bug reports and feature requests. May need to be transferred to dependencies. - [Discussions page](https://github.com/pypdfium2-team/pypdfium2/discussions): General questions and suggestions. - - In case you do not want to publicly disclose the issue or your code, you may also contact the maintainers privately via e-mail. * PDFium - - [Bug tracker](https://bugs.chromium.org/p/pdfium/issues/list): Defects in PDFium. - Beware: The bridge between Python and C increases the probability of integration issues or API misuse. - The symptoms can often make it look like a PDFium bug while it is not. In some cases, this may be quite difficult to distinguish. + - [Bug tracker](https://bugs.chromium.org/p/pdfium/issues/list): Issues in PDFium. 
+ Beware: The bridge between Python and C increases the probability of integration issues or API misuse. The symptoms can often make it look like a PDFium bug while it is not. - [Mailing list](https://groups.google.com/g/pdfium/): Questions regarding PDFium usage. * [pdfium-binaries](https://github.com/bblanchon/pdfium-binaries/issues): Binary builder. * [ctypesgen](https://github.com/ctypesgen/ctypesgen/issues): Bindings generator. ### Known limitations -pypdfium2 also has some drawbacks, of which you will be informed below. - #### Incompatibility with CPython 3.7.6 and 3.8.1 pypdfium2 built with mainstream ctypesgen cannot be used with releases 3.7.6 and 3.8.1 of the CPython interpreter due to a [regression](https://github.com/python/cpython/pull/16799#issuecomment-612353119) that [broke](https://github.com/ctypesgen/ctypesgen/issues/77) ctypesgen-created string handling code. -However, we are currently [making efforts](https://github.com/ctypesgen/ctypesgen/pull/162) to remove ctypesgen's wonky string code. -Since version 4, pypdfium2 releases will be built with a patched variant of ctypesgen. +Since version 4, pypdfium2 is built with a patched fork of ctypesgen that removes ctypesgen's problematic string code. #### Risk of unknown object lifetime violations As outlined in the raw API section, it is essential that Python-managed resources remain available as long as they are needed by PDFium. -The problem is that the Python interpreter may garbage collect objects with reference count zero at any time. Thus, it can happen that an unreferenced but still required object by chance stays around long enough before it is garbage collected. Such dangling objects are likely to cause non-deterministic segmentation faults. 
+The problem is that the Python interpreter may garbage collect objects with reference count zero at any time, so an unreferenced but still required object may either by chance stay around long enough or disappear too soon, resulting in non-deterministic memory issues that are hard to debug. If the timeframe between reaching reference count zero and removal is sufficiently large and roughly consistent across different runs, it is even possible that mistakes regarding object lifetime remain unnoticed for a long time. Although we intend to develop helpers carefully, it cannot be fully excluded that unknown object lifetime violations are still lurking around somewhere, especially if unexpected requirements were not documented by the time the code was written. @@ -583,26 +597,36 @@ With special platforms and/or code, sometimes unforeseen problems can occur [(ca ## Development + This section contains some key information relevant for project maintainers. - +### Long lines + +The pypdfium2 codebase does not hard wrap long lines. +It is recommended to set up automatic word wrap in your text editor, e.g. VS Code: +``` +editor.wordWrap = bounded +editor.wordWrapColumn = 100 +``` -### Documentation +### Docs -pypdfium2 provides API documentation using [Sphinx](https://github.com/sphinx-doc/sphinx/). It can be rendered to various formats, including HTML: +pypdfium2 provides API documentation using [Sphinx](https://github.com/sphinx-doc/sphinx/), which can be rendered to various formats, including HTML: ```bash sphinx-build -b html ./docs/source ./docs/build/html/ +# short alias +./run build ``` -Built documentation is primarily hosted on [`readthedocs.org`](https://readthedocs.org/projects/pypdfium2/). +Built docs are primarily hosted on [`readthedocs.org`](https://readthedocs.org/projects/pypdfium2/). 
It may be configured using a [`.readthedocs.yaml`](.readthedocs.yaml) file (see [instructions](https://docs.readthedocs.io/en/stable/config-file/v2.html)), and the administration page on the web interface. RTD supports hosting multiple versions, so we currently have one linked to the `main` branch and another to `stable`. New builds are automatically triggered by a webhook whenever you push to a linked branch. -Additionally, one documentation build can also be hosted on [GitHub Pages](https://pypdfium2-team.github.io/pypdfium2/index.html). +Additionally, one doc build can also be hosted on [GitHub Pages](https://pypdfium2-team.github.io/pypdfium2/index.html). It is implemented with a CI workflow, which is supposed to be triggered automatically on release. -This provides us with full control over the build environment and the used commands, whereas RTD is kind of limited in this regard. +This provides us with full control over the build env and the used commands, whereas RTD may be less liberal in this regard. ### Testing @@ -616,7 +640,7 @@ Note that ... * you can pass `-sv` to get more detailed output. * `$DEBUG_AUTOCLOSE=1` may be set to get debugging information on automatic object finalization. -To get code coverage statistics, you can run +To get code coverage statistics, you may call ```bash ./run coverage ``` @@ -625,7 +649,7 @@ Sometimes, it can also be helpful to test code on many PDFs.[^testing_corpora] In this case, the command-line interface and `find` come in handy: ```bash # Example A: Analyse PDF images (in the current working directory) -find . -name '*.pdf' -exec bash -c "echo \"{}\" && pypdfium2 pageobjects \"{}\" --types image" \; +find . -name '*.pdf' -exec bash -c "echo \"{}\" && pypdfium2 pageobjects \"{}\" --filter image" \; # Example B: Parse PDF table of contents find . -name '*.pdf' -exec bash -c "echo \"{}\" && pypdfium2 toc \"{}\"" \; ``` @@ -634,13 +658,12 @@ find . 
-name '*.pdf' -exec bash -c "echo \"{}\" && pypdfium2 toc \"{}\"" \; ### Release workflow -The release process is fully automated using Python scripts and a CI setup for GitHub Actions. -A new release is triggered every Tuesday, one day after `pdfium-binaries`. +The release process is fully automated using Python scripts and scheduled release workflows. You may also trigger the workflow manually using the GitHub Actions panel or the [`gh`](https://cli.github.com/) command-line tool. Python release scripts are located in the folder `setupsrc/pypdfium2_setup`, along with custom setup code: * `update_pdfium.py` downloads binaries and generates the bindings. -* `craft_packages.py` builds platform-specific wheel packages and a source distribution suitable for PyPI upload. +* `craft_packages.py pypi` builds platform-specific wheel packages and a source distribution suitable for PyPI upload. * `autorelease.py` takes care of versioning, changelog, release note generation and VCS checkin. The autorelease script has some peculiarities maintainers should know about: @@ -673,7 +696,7 @@ In case of necessity, you may also forego autorelease/CI and do the release manu * Build the packages ```bash python3 setupsrc/pypdfium2_setup/update_pdfium.py - python3 setupsrc/pypdfium2_setup/craft_packages.py + python3 setupsrc/pypdfium2_setup/craft_packages.py pypi ``` * Upload to PyPI ```bash diff --git a/conda/bundle/recipe/meta.yaml b/conda/bundle/recipe/meta.yaml new file mode 100644 index 000000000..9afeca9e9 --- /dev/null +++ b/conda/bundle/recipe/meta.yaml @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: 2023 geisserml +# SPDX-License-Identifier: CC-BY-4.0 + +# NOTE +# This is an attempt to bundle the pdfium binaries with conda packages, like we do for pypi wheels. +# However, conda does not support CPU specific but Python version independent packages, meaning we'd have to build for each Python separately, so the better solution is probably to unbundle pdfium. 
+# For now, let's retain this passively as a second option just in case a situation arises where bundling would be desired anyway. +# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +{% set helpers_ver = environ["M_HELPERS_VER"] %} +{% set git_depth = environ["M_GIT_DEPTH"] %} +{% set pl_spec = environ["IN_PDFIUM_PLATFORM"] %} +# {% set setup_cfg = load_file_data("setup.cfg") %} + +package: + name: pypdfium2_bundle + version: {{ helpers_ver }} + +source: + git_url: ../../.. + git_depth: {{ git_depth }} + +build: + number: 0 + entry_points: + - pypdfium2 = pypdfium2.__main__:cli_main + script_env: + - PDFIUM_PLATFORM=prepared!{{ pl_spec }} + script: + - {{ PYTHON }} conda/prepare_script.py + - {{ PYTHON }} -m pip install . -v --no-deps --no-build-isolation + +requirements: + # NOTE conda theoretically offers an additional host section, but for our purposes this is effectively the same as the build section + build: + - git + - python + - pip + - setuptools + - wheel !=0.38.0,!=0.38.1 + run: + - python + +# pass --no-test if cross-building for non-host target +# NOTE not embedding the whole helpers test suite to avoid blowing upload size +test: + requires: + - pip + - python + imports: + - pypdfium2 + - pypdfium2_raw # bundled + source_files: + - conda/raw/minitest.py + commands: + - pip check + - pypdfium2 --help + - python conda/raw/minitest.py + +about: + summary: Python bindings to PDFium (bundled helpers/binary/bindings) + license: (Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty + license_file: + - LICENSES/Apache-2.0.txt + - LICENSES/BSD-3-Clause.txt + - LICENSES/CC-BY-4.0.txt + - LICENSES/LicenseRef-PdfiumThirdParty.txt + - .reuse/dep5-wheel + dev_url: https://github.com/pypdfium2-team/pypdfium2 + doc_url: https://pypdfium2.readthedocs.io + +extra: + recipe-maintainers: + - pypdfium2-team + - mara004 diff --git a/conda/helpers/recipe/meta.yaml b/conda/helpers/recipe/meta.yaml new file mode 100644 index 000000000..00d4028e7 --- 
/dev/null +++ b/conda/helpers/recipe/meta.yaml @@ -0,0 +1,73 @@ +# SPDX-FileCopyrightText: 2023 geisserml +# SPDX-License-Identifier: CC-BY-4.0 + +{% set pdfium_max = environ["PDFIUM_MAX"] %} +{% set helpers_ver = environ["M_HELPERS_VER"] %} +{% set git_depth = environ["M_GIT_DEPTH"] %} + +package: + name: pypdfium2 + version: {{ helpers_ver }} + +source: + git_url: ../../.. + git_depth: {{ git_depth }} + +build: + number: 0 + noarch: python + entry_points: + - pypdfium2 = pypdfium2.__main__:cli_main + script_env: + - PYPDFIUM_MODULES=helpers + script: + - {{ PYTHON }} conda/prepare_script.py + - {{ PYTHON }} -m pip install . -v --no-deps --no-build-isolation + +requirements: + build: + - git + - python + - pip + - setuptools + - wheel !=0.38.0,!=0.38.1 + run: + # Set an upper boundary for pypdfium2_raw as defined in craft_packages.py + # NOTE There currently is no significant minimum pdfium requirement, but we could add one should the necessity arise. + - python + - pypdfium2_team::pypdfium2_raw <={{ pdfium_max }} + +# FIXME this will embed the whole test suite in the package - not sure if that's a good idea +test: + requires: + - pip + - pytest + - pillow + - numpy + imports: + - pypdfium2 + source_files: + - tests/ + - tests_old/ + commands: + - pip check + - pypdfium2 --help + - pytest tests/ tests_old/ + +about: + summary: Python bindings to PDFium (helpers, external bindings) + description: | + This package provides python helpers around pdfium. + Dependants are suggested to pin to a major version, but any tighter pinning is discouraged since it increases the risk for conflicts, and would lock you out from future fixes. 
+ license: Apache-2.0 OR BSD-3-Clause + license_file: + - LICENSES/Apache-2.0.txt + - LICENSES/BSD-3-Clause.txt + - LICENSES/CC-BY-4.0.txt + dev_url: https://github.com/pypdfium2-team/pypdfium2 + doc_url: https://pypdfium2.readthedocs.io + +extra: + recipe-maintainers: + - pypdfium2-team + - mara004 diff --git a/conda/prepare_script.py b/conda/prepare_script.py new file mode 100644 index 000000000..215e49220 --- /dev/null +++ b/conda/prepare_script.py @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: 2023 geisserml +# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parents[1] / "setupsrc")) +from pypdfium2_setup.craft_packages import TmpCommitCtx + +def main(): + if TmpCommitCtx.FILE.exists(): + TmpCommitCtx.undo() + +if __name__ == "__main__": + main() diff --git a/conda/raw/minitest.py b/conda/raw/minitest.py new file mode 100644 index 000000000..142fd4658 --- /dev/null +++ b/conda/raw/minitest.py @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: 2023 geisserml +# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0 OR BSD-3-Clause + +# minimal test confirming we can call the library + +import pypdfium2_raw as pdfium + +# see pypdfium2::_library_scope.py +init_config = pdfium.FPDF_LIBRARY_CONFIG( + version = 2, + m_pUserFontPaths = None, + m_pIsolate = None, + m_v8EmbedderSlot = 0, +) +pdfium.FPDF_InitLibraryWithConfig(init_config) + +doc = pdfium.FPDF_CreateNewDocument() +page = pdfium.FPDFPage_New(doc, 0, 595, 842) +pdfium.FPDF_ClosePage(page) +pdfium.FPDF_CloseDocument(doc) + +pdfium.FPDF_DestroyLibrary() diff --git a/conda/raw/recipe/meta.yaml b/conda/raw/recipe/meta.yaml new file mode 100644 index 000000000..4b0ecd833 --- /dev/null +++ b/conda/raw/recipe/meta.yaml @@ -0,0 +1,67 @@ +# SPDX-FileCopyrightText: 2023 geisserml +# SPDX-License-Identifier: CC-BY-4.0 + +{% set pdfium_short = environ["PDFIUM_SHORT"] %} +{% set pdfium_full = environ["PDFIUM_FULL"] %} +{% set git_depth = 
environ["M_GIT_DEPTH"] %} + +package: + name: pypdfium2_raw + version: {{ pdfium_short }} + +source: + git_url: ../../.. + git_depth: {{ git_depth }} + +build: + number: 0 + noarch: python + script_env: + - PYPDFIUM_MODULES=raw + - PDFIUM_PLATFORM=prepared!system:{{ pdfium_short }} + script: + - {{ PYTHON }} conda/prepare_script.py + - {{ PYTHON }} -m pip install . -v --no-deps --no-build-isolation + +requirements: + build: + - git + - python + - pip + - setuptools + - wheel !=0.38.0,!=0.38.1 + run: + # Pin pdfium-binaries to an exact version to ensure bindings/binary ABI match. + # As long as we rebuild pypdfium2_raw continuously in sync with pdfium-binaries, this should not become a flexibility problem. + - python + - bblanchon::pdfium-binaries =={{ pdfium_full }} + +test: + requires: + - pip + - python + imports: + - pypdfium2_raw + source_files: + - conda/raw/minitest.py + commands: + - pip check + - python conda/raw/minitest.py + +about: + summary: Python bindings to PDFium (raw, external binary) + description: | + This package provides raw ctypes bindings to pdfium. + Important: DO NOT PIN to an exact version, as pypdfium2_raw itself pins pdfium-binaries to achieve ABI safety. 
+ license: Apache-2.0 OR BSD-3-Clause + license_file: + - LICENSES/Apache-2.0.txt + - LICENSES/BSD-3-Clause.txt + - LICENSES/CC-BY-4.0.txt + dev_url: https://github.com/pypdfium2-team/pypdfium2 + doc_url: https://pypdfium2.readthedocs.io + +extra: + recipe-maintainers: + - pypdfium2-team + - mara004 diff --git a/data/.gitkeep b/data/.gitkeep deleted file mode 100644 index 8d1c8b69c..000000000 --- a/data/.gitkeep +++ /dev/null @@ -1 +0,0 @@ - diff --git a/docs/devel/2023_10_setup_tasks.md b/docs/devel/2023_10_setup_tasks.md deleted file mode 100644 index 3833d9f08..000000000 --- a/docs/devel/2023_10_setup_tasks.md +++ /dev/null @@ -1,22 +0,0 @@ - - - -# PR 263 (versioning improvements) - -## pre-merge -*The points below are kind of addressed now, but the considerations are retained for informational purposes.* -- Properly integrate version data source (git/supply/fallback) and editable info. - * What is the cleanest way of embedding this the version file? - * Maybe we'll want to add add an abstracted `uncertain` property in the version file? - * How should we add this to the version str? We'd need some separator other than dot to keep it parsable, but it seems like PEP 440 doesn't allow. Do we have to forego PEP 440 in the library, and exclude the info from PyPA tools? - -## post-merge -- Fix polluted integration of sourcebuild version (see comments in `version.py`). -- Change autorelease to swap minor/patch versioning logic. In terms of API, the helpers version is probably more significant than the pdfium version. -- Consider including auto-generated third-party license file from pdfium binaries. -- Progress `system` target to build/include bindings for a given version. Integrate version file. Allow managing through `run emplace`. -- Extract only the binaries in question, not whole archives. Download headers from pdfium directly. Build bindings only once and share in a `data/` cache. -- Include binary/bindings hashes in the pdfium version file. 
For performance reasons, we should not validate this on init, but rather provide the caller with a validate function. -- Think about how we will handle versioning/tagging with the future Conda packages. We'll probably need new tag formats for the two conda packages, e.g. `conda_{raw,helpers}/$PYPI_TAG-$BUILD`. -- Consider a git pull hook to auto-update helpers version file of editable install. -- Make CLI a separate module? diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index 527081c97..5f2e0507d 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -5,4 +5,11 @@ # Changelog for next release - Fixed faulty version repr (avoid trailing `+` if desc is empty). -- Added `PDFIUM_BINDINGS=reference` to use pre-built bindings when installing from source. +- Merged conda packaging code. The packages build, but there is no CI integration yet. +- Updated setup code, mainly to support conda. + * Independent bindings cache. Download headers from pdfium. Extract archive members explicitly. + * Cleaned up version integration of sourcebuild. + * Changed `system` platform to generate files according to given version, instead of expecting given files. + * Added `provided!` prefix to platform spec, allowing to install with given files. + * Added `PDFIUM_BINDINGS=reference` to use pre-built bindings when installing from source. +- Updated Readme. diff --git a/docs/devel/tasks.md b/docs/devel/tasks.md deleted file mode 100644 index 54093839f..000000000 --- a/docs/devel/tasks.md +++ /dev/null @@ -1,34 +0,0 @@ - - - -# Tasks - -These are various tasks for the maintainer to keep in mind, in no specific order. -Also see the issues panel and inline `TODO`/`FIXME` marks in source code. - -### Main Code -* Add a matrix-based rendering method, and perhaps a support method around it for common transformations (crop, margins, rotate, mirror, ...). -* Add helpers for interruptible rendering. 
-* Check if we should use `FPDFPage_HasTransparency()` on rendering. - -### Setup Infrastructure -* update_pdfium: build bindings on native OS hosts so we can use OS preprocessor defines, to expose OS-specific members -* Add means to plug in PDFium headers/binaries from an arbitrary location. -* craft_packages: add means to skip platforms for which artefacts are missing. -* update_pdfium/setup: re-think `data/` cache. Consider including version and V8 status in data dirname? Consider caching multiple versions? -* Use the logging module rather than `print()`. - -### Tests -* Rewrite tests completely -* Test auto-casting -* Add an improved image test file that should ideally contain all kinds of PDF images - -### Documentation -* Add/rewrite remaining Readme sections. - -### GitHub Workflows -* build_packages: Try to avoid setting a temporary tag during the build stage, to prevent confusion -* Add a testing workflow to be run on PRs (Test suite, code coverage, ...) -* Consider testing PyPy interpreter as well - -### Miscellaneous diff --git a/run b/run index 263bc7134..5f70f5e75 100755 --- a/run +++ b/run @@ -14,16 +14,16 @@ function check() { } function clean() { - rm -rf pypdfium2*.egg-info/ src/pypdfium2*.egg-info/ build/ dist/ data/* tests/output/* tests_old/output/* + rm -rf pypdfium2*.egg-info/ src/pypdfium2*.egg-info/ build/ dist/ data/* tests/output/* tests_old/output/* conda/bundle/out/ conda/helpers/out/ conda/raw/out/ } -function packaging() { +function packaging_pypi() { clean check # calling update_pdfium is not strictly necessary, but may improve performance because downloads are done in parallel, rather than linear with each package python3 setupsrc/pypdfium2_setup/update_pdfium.py - python3 setupsrc/pypdfium2_setup/craft_packages.py + python3 setupsrc/pypdfium2_setup/craft_packages.py pypi twine check dist/* # ignore W002: erroneous detection of __init__.py files as duplicates @@ -53,8 +53,14 @@ check) clean) clean;; -packaging) - packaging;; 
+packaging_pypi) + packaging_pypi;; + +update) + python3 setupsrc/pypdfium2_setup/update_pdfium.py $args;; + +craft) + python3 setupsrc/pypdfium2_setup/craft_packages.py $args;; build) python3 setupsrc/pypdfium2_setup/build_pdfium.py $args;; diff --git a/setup.cfg b/setup.cfg index 310850139..4a1c69390 100644 --- a/setup.cfg +++ b/setup.cfg @@ -27,10 +27,6 @@ classifiers = Topic :: Multimedia :: Graphics Topic :: Software Development :: Libraries -[options.entry_points] -console_scripts = - pypdfium2 = pypdfium2.__main__:cli_main - # NOTE We use requirements files instead of [options.extras_require]. Rationale: # - BUG(177): PyPI refuses upload if custom deps ($DEP @ git+$URL) are specified. # - Installation is independent of pypdfium2 (allows to install setup deps beforehand and then use --no-build-isolation). diff --git a/setup.py b/setup.py index ba3484deb..98b2a3643 100644 --- a/setup.py +++ b/setup.py @@ -6,15 +6,13 @@ import os import sys -import traceback -import subprocess from pathlib import Path import setuptools from wheel.bdist_wheel import bdist_wheel from setuptools.command.build_py import build_py as build_py_orig sys.path.insert(0, str(Path(__file__).parent / "setupsrc")) -from pypdfium2_setup.emplace import get_pdfium +from pypdfium2_setup.emplace import prepare_setup from pypdfium2_setup.packaging_base import * @@ -43,46 +41,16 @@ class pypdfium_build_py (build_py_orig): def run(self, *args, **kwargs): - if self.editable_mode: + if hasattr(self, "editable_mode"): helpers_info = read_json(ModuleDir_Helpers/VersionFN) - helpers_info["is_editable"] = True + helpers_info["is_editable"] = bool(self.editable_mode) write_json(ModuleDir_Helpers/VersionFN, helpers_info) + else: + print("!!! Warning: cmdclass does not provide `editable_mode` attribute. 
Please file a bug report.") build_py_orig.run(self, *args, **kwargs) -def get_helpers_info(): - - # TODO consider adding some checks against record - - have_git_describe = False - if HAVE_GIT_REPO: - try: - helpers_info = parse_git_tag() - except subprocess.CalledProcessError: - print("Version uncertain: git describe failure - possibly a shallow checkout", file=sys.stderr) - traceback.print_exc() - else: - have_git_describe = True - helpers_info["data_source"] = "git" - else: - print("Version uncertain: git repo not available.") - - if not have_git_describe: - ver_file = ModuleDir_Helpers / VersionFN - if ver_file.exists(): - print("Falling back to given version info (e.g. sdist).", file=sys.stderr) - helpers_info = read_json(ver_file) - helpers_info["data_source"] = "given" - else: - print("Falling back to autorelease record.", file=sys.stderr) - record = read_json(AR_RecordFile) - helpers_info = parse_given_tag(record["tag"]) - helpers_info["data_source"] = "record" - - return helpers_info - - # semi-static metadata PROJECT_DESC = "Python bindings to PDFium" LICENSES_SHARED = ( @@ -99,17 +67,16 @@ def get_helpers_info(): ".reuse/dep5", ) +PLATFILES_GLOB = [BindingsFN, VersionFN, *LibnameForSystem.values()] -def main(): - - pl_name = os.environ.get(PlatSpec_EnvVar, "") - modnames = os.environ.get(ModulesSpec_EnvVar, "") - if modnames: - modnames = modnames.split(",") - assert set(modnames).issubset(ModulesAll) - else: - modnames = ModulesAll - assert len(modnames) in (1, 2) + +def assert_exists(dir, data_files): + missing = [f for f in data_files if not (dir/f).exists()] + if missing: + assert False, f"Missing data files: {missing}" + + +def run_setup(modnames, pl_name, pdfium_ver): kwargs = dict( name = "pypdfium2", @@ -123,39 +90,34 @@ def main(): install_requires = [], ) - # TODO consider using pdfium version for raw-only? 
- helpers_info = get_helpers_info() - kwargs["version"] = merge_tag(helpers_info, mode="py") - if modnames == [ModuleHelpers]: kwargs["description"] += " (helpers module)" kwargs["install_requires"] += ["pypdfium2_raw"] elif modnames == [ModuleRaw]: kwargs["name"] += "_raw" kwargs["description"] += " (raw module)" + kwargs["version"] = str(pdfium_ver) - with_helpers = ModuleHelpers in modnames - with_raw = ModuleRaw in modnames and pl_name != PlatTarget_None - if with_helpers: - helpers_info["is_editable"] = False # default + if ModuleHelpers in modnames: + # is_editable = None: unknown/fallback in case the cmdclass is not reached + helpers_info = get_helpers_info() + kwargs["version"] = merge_tag(helpers_info, mode="py") + helpers_info["is_editable"] = None write_json(ModuleDir_Helpers/VersionFN, helpers_info) kwargs["cmdclass"]["build_py"] = pypdfium_build_py kwargs["package_dir"]["pypdfium2"] = "src/pypdfium2" kwargs["package_data"]["pypdfium2"] = [VersionFN] - if with_raw: + kwargs["entry_points"] = dict(console_scripts=["pypdfium2 = pypdfium2.__main__:cli_main"]) + if ModuleRaw in modnames: kwargs["package_dir"]["pypdfium2_raw"] = "src/pypdfium2_raw" - if "pypdfium2_raw" not in kwargs["package_dir"]: - kwargs["exclude_package_data"] = {"pypdfium2_raw": [VersionFN, BindingsFN, *LibnameForSystem.values()]} - if pl_name == PlatTarget_None: + if ModuleRaw not in modnames: + kwargs["exclude_package_data"] = {"pypdfium2_raw": PLATFILES_GLOB} + if pl_name == ExtPlats.none: kwargs["license_files"] += LICENSES_SDIST - elif pl_name == PlatTarget_System: - # TODO generate bindings/version here according to some caller input? - assert (ModuleDir_Raw/BindingsFN).exists() and (ModuleDir_Raw/VersionFN).exists(), f"Bindings and version currently must be prepared by caller for {PlatTarget_System} target." 
- kwargs["package_data"]["pypdfium2_raw"] = [BindingsFN, VersionFN] + elif pl_name == ExtPlats.system: + kwargs["package_data"]["pypdfium2_raw"] = [VersionFN, BindingsFN] else: - pl_name = get_pdfium(pl_name) - emplace_platfiles(pl_name) sys_name = plat_to_system(pl_name) libname = LibnameForSystem[sys_name] kwargs["package_data"]["pypdfium2_raw"] = [VersionFN, BindingsFN, libname] @@ -164,8 +126,27 @@ def main(): kwargs["license"] = f"({kwargs['license']}) AND LicenseRef-PdfiumThirdParty" kwargs["license_files"] += LICENSES_WHEEL + if "pypdfium2" in kwargs["package_data"]: + assert_exists(ModuleDir_Helpers, kwargs["package_data"]["pypdfium2"]) + if "pypdfium2_raw" in kwargs["package_data"]: + assert_exists(ModuleDir_Raw, kwargs["package_data"]["pypdfium2_raw"]) + setuptools.setup(**kwargs) +def main(): + + pl_spec = os.environ.get(PlatSpec_EnvVar, "") + modspec = os.environ.get(ModulesSpec_EnvVar, "") + + # TODO embed is_prepared in version file? - in principle, it could be arbitrary caller-given files + with_prepare, pl_name, pdfium_ver, use_v8 = parse_pl_spec(pl_spec) + modnames = parse_modspec(modspec, pl_name) + + if ModuleRaw in modnames and with_prepare: + prepare_setup(pl_name, pdfium_ver, use_v8) + run_setup(modnames, pl_name, pdfium_ver) + + if __name__ == "__main__": main() diff --git a/setupsrc/pypdfium2_setup/_compat.py b/setupsrc/pypdfium2_setup/_compat.py deleted file mode 100644 index e2ed3a10f..000000000 --- a/setupsrc/pypdfium2_setup/_compat.py +++ /dev/null @@ -1,33 +0,0 @@ -# SPDX-FileCopyrightText: 2023 geisserml -# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause - -# Safer tar extraction (hopefully) preventing CVE-2007-4559 etc. 
-# Tries to use the most elegant strategy available in the caller's python version (>= 3.6) - -__all__ = ["safer_tar_unpack"] - -import sys - - -if sys.version_info >= (3, 11, 4): # PEP 706 - import shutil - def safer_tar_unpack(archive_path, dest_dir): - shutil.unpack_archive(archive_path, dest_dir, format="tar", filter="data") - -else: # workaround - - if sys.version_info >= (3, 9): - _is_within_dir = lambda path, dir: path.is_relative_to(dir) - else: - import os.path - _is_within_dir = lambda path, dir: os.path.commonpath([dir, path]) == str(dir) - - import tarfile - - def safer_tar_unpack(archive_path, dest_dir): - dest_dir = dest_dir.resolve() - with tarfile.open(archive_path) as tar: - for m in tar.getmembers(): - if not (m.isfile() or m.isdir()) or not _is_within_dir((dest_dir/m.name).resolve(), dest_dir): - raise RuntimeError("Path traversal, symlink or special member in tar archive (probably malicious).") - tar.extractall(dest_dir) diff --git a/setupsrc/pypdfium2_setup/autorelease.py b/setupsrc/pypdfium2_setup/autorelease.py index 13142177a..0277c5eb6 100644 --- a/setupsrc/pypdfium2_setup/autorelease.py +++ b/setupsrc/pypdfium2_setup/autorelease.py @@ -11,7 +11,6 @@ from copy import deepcopy sys.path.insert(0, str(Path(__file__).parents[1])) -from pypdfium2_setup import update_pdfium # TODO consider dotted access? 
from pypdfium2_setup.packaging_base import * @@ -23,16 +22,10 @@ def run_local(*args, **kws): def update_refbindings(version): - - # re-generate host bindings - # TODO download headers from pdfium repo and call ctypesgen directly - host_bindings = DataDir / Host.platform / BindingsFN - host_bindings.unlink(missing_ok=True) - update_pdfium.main([Host.platform], version=version, ctypesgen_kws=dict(guard_symbols=True)) - assert host_bindings.exists() - - # update reference bindings - shutil.copyfile(host_bindings, RefBindingsFile) # yes this overwrites + RefBindingsFile.unlink() + build_pdfium_bindings(version, guard_symbols=True) + shutil.copyfile(DataDir_Bindings/BindingsFN, RefBindingsFile) + assert RefBindingsFile.exists() def do_versioning(config, record, prev_helpers, new_pdfium): diff --git a/setupsrc/pypdfium2_setup/build_pdfium.py b/setupsrc/pypdfium2_setup/build_pdfium.py index 0c46ee4c3..b85597a18 100755 --- a/setupsrc/pypdfium2_setup/build_pdfium.py +++ b/setupsrc/pypdfium2_setup/build_pdfium.py @@ -8,7 +8,7 @@ import sys import shutil import argparse -import urllib.request +import urllib.request as url_request from pathlib import Path, WindowsPath sys.path.insert(0, str(Path(__file__).parents[1])) @@ -20,7 +20,6 @@ DepotToolsDir = SBDir / "depot_tools" PDFiumDir = SBDir / "pdfium" PDFiumBuildDir = PDFiumDir / "out" / "Default" -OutputDir = DataDir / PlatNames.sourcebuild PatchesMain = [ (PatchDir/"shared_library.patch", PDFiumDir), @@ -99,58 +98,53 @@ def dl_pdfium(GClient, do_update, revision): run_cmd([GClient, "config", "--custom-var", "checkout_configuration=minimal", "--unmanaged", PdfiumURL], cwd=SBDir) if is_sync: - run_cmd([GClient, "sync", "--revision", f"origin/{revision}", "--no-history", "--with_branch_heads"], cwd=SBDir) + # TODO consider passing -D ? 
+ run_cmd([GClient, "sync", "--revision", f"origin/{revision}", "--no-history", "--shallow"], cwd=SBDir) + # quick & dirty fix to make a version annotated commit available - pdfium gets versioned very frequently, so this should be more than enough + # TODO tighten to check out only up to the latest tag + # FIXME the repository is still left in a very confusing state - maybe we should just drop --no-history --shallow for simplicity? + run_cmd(["git", "fetch", "--depth=100"], cwd=PDFiumDir) return is_sync def _dl_unbundler(): - # Workaround: download missing tools for unbundle/replace_gn_files.py (syslibs build) + # Workaround: download missing tools for unbundle/replace_gn_files.py (to use ICU syslib) + # TODO get this fixed upstream tool_dir = PDFiumDir / "tools" / "generate_shim_headers" tool_file = tool_dir / "generate_shim_headers.py" tool_url = "https://raw.githubusercontent.com/chromium/chromium/main/tools/generate_shim_headers/generate_shim_headers.py" - tool_dir.mkdir(parents=True, exist_ok=True) if not tool_file.exists(): - urllib.request.urlretrieve(tool_url, tool_file) + tool_dir.mkdir(parents=True, exist_ok=True) + url_request.urlretrieve(tool_url, tool_file) -def get_pdfium_version(): +def identify_pdfium(): + # if not updated, we'll always be dirty because of the patches, so not much point checking it + desc = run_cmd(["git", "describe", "--all"], cwd=PDFiumDir, capture=True) + desc = desc.rsplit("/", maxsplit=1)[-1] + build, *id_parts = desc.split("-") + assert len(id_parts) < 2 - # FIXME awkward mix of local/remote git - this will fail to identify the tag if local and remote state do not match + # FIXME some duplication with base::parse_given_tag() + info = dict(build=build, n_commits=0, hash=None) + if len(id_parts) > 0: + info["n_commits"] = int(id_parts[0]) + if len(id_parts) > 1: + info["hash"] = id_parts[1] - head_commit = run_cmd(["git", "rev-parse", "--short", "HEAD"], cwd=PDFiumDir, capture=True) - refs_string = run_cmd(["git", "ls-remote", 
"--heads", PdfiumURL, "chromium/*"], cwd=None, capture=True) - - latest = refs_string.split("\n")[-1] - tag_commit, ref = latest.split("\t") - tag_commit = tag_commit[:7] - tag = ref.split("/")[-1] - - print(f"Current head {head_commit}, latest tagged commit {tag_commit} ({tag})", file=sys.stderr) - v_libpdfium = int(tag) if head_commit == tag_commit else head_commit - - return v_libpdfium + return info -def update_version(v_libpdfium): - write_pdfium_info(OutputDir, version=v_libpdfium, origin="sourcebuild", flags=[]) - - -def _create_resources_rc(v_libpdfium): - +def _create_resources_rc(pdfium_build): input_path = PatchDir / "win" / "resources.rc" output_path = PDFiumDir / "resources.rc" content = input_path.read_text() - - # NOTE RC does not seem to tolerate commit hash, so set a dummy version instead - if not v_libpdfium.isnumeric(): - v_libpdfium = "1.0" - - content = content.replace("$VERSION_CSV", v_libpdfium.replace(".", ",")) - content = content.replace("$VERSION", v_libpdfium) + content = content.replace("$VERSION_CSV", str(pdfium_build)) + content = content.replace("$VERSION", str(pdfium_build)) output_path.write_text(content) @@ -159,11 +153,11 @@ def _apply_patchset(patchset, check=True): run_cmd(["git", "apply", "--ignore-space-change", "--ignore-whitespace", "-v", patch], cwd=cwd, check=check) -def patch_pdfium(v_libpdfium): +def patch_pdfium(pdfium_build): _apply_patchset(PatchesMain) if sys.platform.startswith("win32"): _apply_patchset(PatchesWindows) - _create_resources_rc(v_libpdfium) + _create_resources_rc(pdfium_build) def configure(GN, config): @@ -176,46 +170,18 @@ def build(Ninja, target): run_cmd([Ninja, "-C", PDFiumBuildDir, target], cwd=PDFiumDir) -def find_lib(src_libname=None, directory=PDFiumBuildDir): - - if src_libname is not None: - path = directory / src_libname - if path.exists(): - return path - else: - print("Warning: Binary not found under given name.", file=sys.stderr) +def pack(pdfium_info): - if 
sys.platform.startswith("linux"): - libname = "libpdfium.so" - elif sys.platform.startswith("darwin"): - libname = "libpdfium.dylib" - elif sys.platform.startswith("win32"): - libname = "pdfium.dll" - else: - # TODO implement fallback artifact detection - raise RuntimeError(f"Not sure how pdfium artifact is called on platform '{sys.platform}'") + dest_dir = DataDir / ExtPlats.sourcebuild + dest_dir.mkdir(parents=True, exist_ok=True) - libpath = directory / libname - assert libpath.exists() + libname = LibnameForSystem[Host.system] + shutil.copy(PDFiumBuildDir/libname, dest_dir/libname) + write_pdfium_info(dest_dir, origin="sourcebuild", **pdfium_info) - return libpath - - -def pack(src_libpath, v_libpdfium, destname=None): - - # TODO remove existing binary/bindings, just to be safe - - if destname is None: - destname = LibnameForSystem[Host.system] - - OutputDir.mkdir(parents=True, exist_ok=True) - destpath = OutputDir / destname - shutil.copy(src_libpath, destpath) - - update_version(v_libpdfium) - - include_dir = PDFiumDir / "public" - call_ctypesgen(OutputDir, include_dir, pl_name=Host.platform) + # We want to use local headers instead of downloading with build_pdfium_bindings(), therefore call run_ctypesgen() directly + # FIXME PDFIUM_BINDINGS=reference not honored + run_ctypesgen(dest_dir, headers_dir=PDFiumDir/"public", compile_lds=[dest_dir]) def get_tool(name): @@ -243,8 +209,6 @@ def serialise_config(config_dict): def main( - b_src_libname = None, - b_dest_libname = None, b_update = False, b_revision = None, b_target = None, @@ -273,10 +237,10 @@ def main( Ninja = get_tool("ninja") pdfium_dl_done = dl_pdfium(GClient, b_update, b_revision) - v_libpdfium = get_pdfium_version() + pdfium_info = identify_pdfium() if pdfium_dl_done: - patch_pdfium(v_libpdfium) + patch_pdfium(pdfium_info["build"]) if b_use_syslibs: _dl_unbundler() @@ -291,8 +255,7 @@ def main( configure(GN, config_str) build(Ninja, b_target) - libpath = find_lib(b_src_libname) - pack(libpath, 
v_libpdfium, b_dest_libname) + pack(pdfium_info) def parse_args(argv): @@ -300,15 +263,6 @@ def parse_args(argv): parser = argparse.ArgumentParser( description = "A script to automate building PDFium from source and generating bindings with ctypesgen.", ) - - parser.add_argument( - "--src-libname", - help = "Name of the generated PDFium binary file. This script tries to automatically find the binary, which should usually work. If it does not, however, this option may be used to explicitly provide the file name to look for.", - ) - parser.add_argument( - "--dest-libname", - help = "Rename the binary. Must be a name recognised by packaging code.", - ) parser.add_argument( "--update", "-u", action = "store_true", diff --git a/setupsrc/pypdfium2_setup/craft_packages.py b/setupsrc/pypdfium2_setup/craft_packages.py index 0fbe050de..6c9f8088c 100644 --- a/setupsrc/pypdfium2_setup/craft_packages.py +++ b/setupsrc/pypdfium2_setup/craft_packages.py @@ -7,34 +7,99 @@ import argparse import tempfile from pathlib import Path +from functools import partial sys.path.insert(0, str(Path(__file__).parents[1])) # TODO consider dotted access? 
from pypdfium2_setup.packaging_base import * +from pypdfium2_setup.emplace import prepare_setup + + +P_PYPI = "pypi" +P_CONDA_BUNDLE = "conda_bundle" +P_CONDA_RAW = "conda_raw" +P_CONDA_HELPERS = "conda_helpers" + +CondaDir = ProjectDir / "conda" + + +def parse_args(): + + root_parser = argparse.ArgumentParser( + description = "Craft PyPI or conda packages for pypdfium2.", + ) + root_parser.add_argument( + "--pdfium-ver", + type = int, + default = None, + ) + subparsers = root_parser.add_subparsers(dest="parser") + pypi = subparsers.add_parser(P_PYPI) + conda_raw = subparsers.add_parser(P_CONDA_RAW) + conda_helpers = subparsers.add_parser(P_CONDA_HELPERS) + conda_bundle = subparsers.add_parser(P_CONDA_BUNDLE) + conda_bundle.add_argument( + "--platforms", + nargs = "+", + default = [Host.platform], + ) + conda_bundle.add_argument( + "--py-variants", + nargs = "*", + ) + + for p in (pypi, conda_bundle): + p.add_argument( + "--use-v8", + action = "store_true", + ) + + args = root_parser.parse_args() + if not args.pdfium_ver: + args.pdfium_ver = PdfiumVer.get_latest() + if args.parser == P_CONDA_BUNDLE: + if args.platforms and args.platforms[0] == "all": + args.platforms = list(CondaNames.keys()) + if args.py_variants and args.py_variants[0] == "all": + args.py_variants = ["3.8", "3.9", "3.10", "3.11", "3.12"] + + return args + + +def run_pypi_build(args): + run_cmd([sys.executable, "-m", "build", "--skip-dependency-check", "--no-isolation"] + args, cwd=ProjectDir, env=os.environ) + +def main_pypi(args): + + os.environ[PlatSpec_EnvVar] = ExtPlats.none + run_pypi_build(["--sdist"]) + + suffix = build_pl_suffix(args.pdfium_ver, args.use_v8) + for plat in ReleaseNames.keys(): + os.environ[PlatSpec_EnvVar] = plat + suffix + run_pypi_build(["--wheel"]) + clean_platfiles() class ArtifactStash: # Preserve in-tree aftefacts from editable install - def __init__(self): + def __enter__(self): self.tmpdir = None - # FIXME some degree of duplication with base::get_platfiles() 
file_names = [VersionFN, BindingsFN, LibnameForSystem[Host.system]] self.files = [fp for fp in [ModuleDir_Raw / fn for fn in file_names] if fp.exists()] if len(self.files) == 0: return - elif len(self.files) != len(file_names): - print(f"Warning: Expected exactly 2 platform files, but found {len(self.files)}.", file=sys.stderr) self.tmpdir = tempfile.TemporaryDirectory(prefix="pypdfium2_artifact_stash_") self.tmpdir_path = Path(self.tmpdir.name) for fp in self.files: shutil.move(fp, self.tmpdir_path) - def pop(self): + def __exit__(self, *_): if self.tmpdir is None: return for fp in self.files: @@ -42,41 +107,156 @@ def pop(self): self.tmpdir.cleanup() -def run_build(args): - run_cmd([sys.executable, "-m", "build", "--skip-dependency-check", "--no-isolation"] + args, cwd=ProjectDir, env=os.environ) +def run_conda_build(recipe_dir, out_dir, args=[]): + with TmpCommitCtx(): + run_cmd(["conda", "build", recipe_dir, "--output-folder", out_dir, *args], cwd=ProjectDir, env=os.environ) -def main(): +CondaNames = { + # NOTE looks like conda doesn't support musllinux yet ... 
+ PlatNames.darwin_x64 : "osx-64", + PlatNames.darwin_arm64 : "osx-arm64", + PlatNames.linux_x64 : "linux-64", + PlatNames.linux_x86 : "linux-32", + PlatNames.linux_arm64 : "linux-aarch64", + PlatNames.linux_arm32 : "linux-armv7l", + PlatNames.windows_x64 : "win-64", + PlatNames.windows_x86 : "win-32", + PlatNames.windows_arm64 : "win-arm64", +} + +def _run_conda_bundle(args, pl_name, suffix, conda_args): - parser = argparse.ArgumentParser( - description = "Craft sdist and wheels for pypdfium2, using `python3 -m build`.", - ) - parser.add_argument( - "--use-v8", - action = "store_true", - ) - parser.add_argument( - "--version", - type = int, - default = None, - ) - args = parser.parse_args() - if not args.version: - args.version = PdfiumVer.get_latest() + os.environ["IN_"+PlatSpec_EnvVar] = pl_name + suffix + emplace_func = partial(prepare_setup, pl_name, args.pdfium_ver, args.use_v8) + + with CondaExtPlatfiles(emplace_func): + run_conda_build(CondaDir/"bundle", CondaDir/"bundle"/"out", conda_args) + + +def main_conda_bundle(args): + + helpers_ver = merge_tag(parse_git_tag(), "py") + os.environ["VERSION"] = helpers_ver - stash = ArtifactStash() + platforms = args.platforms.copy() + with_host = Host.platform in platforms + if with_host: + platforms.remove(Host.platform) + conda_host = CondaNames[Host.platform] + host_files = None - os.environ[PlatSpec_EnvVar] = PlatTarget_None - run_build(["--sdist"]) - clean_platfiles() + conda_args = [] + if args.py_variants: + conda_args += ["--variants", "{python: %s}" % (args.py_variants, )] + suffix = build_pl_suffix(args.pdfium_ver, args.use_v8) - suffix = (PlatSpec_V8Sym if args.use_v8 else "") + PlatSpec_VerSep + str(args.version) - for plat in BinaryPlatforms: - os.environ[PlatSpec_EnvVar] = plat + suffix - run_build(["--wheel"]) - clean_platfiles() + for pl_name in platforms: + + _run_conda_bundle(args, pl_name, suffix, [*conda_args, "--no-test"]) + if not host_files: + host_files = 
list((CondaDir/"bundle"/"out"/conda_host).glob(f"pypdfium2_bundle-{helpers_ver}-*.tar.bz2")) + + run_cmd(["conda", "convert", "-f", *host_files, "-p", CondaNames[pl_name], "-o", CondaDir/"bundle"/"out"], cwd=ProjectDir, env=os.environ) + for hf in host_files: + hf.unlink() + + if with_host: + _run_conda_bundle(args, Host.platform, suffix, conda_args) + + +def main_conda_raw(args): + os.environ["PDFIUM_SHORT"] = str(args.pdfium_ver) + os.environ["PDFIUM_FULL"] = PdfiumVer.to_full(args.pdfium_ver, type=str) + emplace_func = partial(prepare_setup, ExtPlats.system, args.pdfium_ver, use_v8=None) + with CondaExtPlatfiles(emplace_func): + run_conda_build(CondaDir/"raw", CondaDir/"raw"/"out") + + +def main_conda_helpers(args): + # Set the current pdfium version as upper boundary, for inherent API safety. + # Unfortunately, pdfium does not do semantic versioning, so it is hard to achieve safe upward flexibility. + # See also https://groups.google.com/g/pdfium/c/kCmgW_gTFYE/m/BPoJgbwOCQAJ + # In case risk of conflicts becomes a problem, we could estimate an increase based on pdfium's deprecation period. 
+ # Relevant variables for such a calculation would be + # - version increment speed (guess: average 2 per day) + # - pdfium's lowest regular deprecation period (say: 6 months, as indicated by pdfium/CONTRIBUTING.md) + # - a buffer zone for us to recognize a deprecation (say: 2 months) + # Assuming a month has 30 days, this would result in + # 2 * 30 * (6-2) = 240 + os.environ["PDFIUM_MAX"] = str(args.pdfium_ver) + run_conda_build(CondaDir/"helpers", CondaDir/"helpers"/"out") + + +class TmpCommitCtx: + + # https://github.com/conda/conda-build/issues/5045 + # Work around local conda `git_url` not including uncommitted changes + # We can't reasonably use `path` since it does not honor gitignore rules, but would copy all files, including big generated directories like sourcebuild/ + # On the other hand, transferring generated files with `git_url` tends to be problematic, as a tmp commit renders the initial repo state invalid. + # Alternatively, we could perhaps make a clean copy of required files (e.g. using the sdist) and using `path` instead of git hacks? + + # use a tmp control file so we can also undo the commit in conda's isolated clone + FILE = CondaDir / "with_tmp_commit.txt" + + def __enter__(self): + # determine if there are any modified or new files + out = run_cmd(["git", "status", "--porcelain"], capture=True, cwd=ProjectDir) + self.have_mods = bool(out) + if self.have_mods: # make tmp commit + self.FILE.touch() + run_cmd(["git", "add", "."], cwd=ProjectDir) + run_cmd(["git", "commit", "-m", "!!! 
tmp commit for conda-build", "-m", "make conda-build include uncommitted changes"], cwd=ProjectDir) + + @classmethod + def undo(cls): + # assuming FILE exists (promised by callers) + run_cmd(["git", "reset", "--soft", "HEAD^"], cwd=ProjectDir) + run_cmd(["git", "reset", cls.FILE], cwd=ProjectDir) + cls.FILE.unlink() + + def __exit__(self, *_): + if self.have_mods: # pop tmp commit, if any + self.undo() + + +class CondaExtPlatfiles: + + def __init__(self, emplace_func): + self.emplace_func = emplace_func + + def __enter__(self): + self.platfiles = self.emplace_func() + self.platfiles = [ModuleDir_Raw/f for f in self.platfiles] + run_cmd(["git", "add", "-f"] + [str(f) for f in self.platfiles], cwd=ProjectDir) + + def __exit__(self, *_): + run_cmd(["git", "reset"] + [str(f) for f in self.platfiles], cwd=ProjectDir) + for fp in self.platfiles: + fp.unlink() + + +def main(): + + args = parse_args() - stash.pop() + with ArtifactStash(): + if args.parser == P_PYPI: + main_pypi(args) + elif args.parser.startswith("conda"): + helpers_info = parse_git_tag() + os.environ["M_GIT_DEPTH"] = str(helpers_info["n_commits"] + 2) + os.environ["M_HELPERS_VER"] = merge_tag(helpers_info, "py") + if args.parser == P_CONDA_BUNDLE: + main_conda_bundle(args) + elif args.parser == P_CONDA_RAW: + main_conda_raw(args) + elif args.parser == P_CONDA_HELPERS: + main_conda_helpers(args) + else: + assert False + else: + assert False if __name__ == '__main__': diff --git a/setupsrc/pypdfium2_setup/emplace.py b/setupsrc/pypdfium2_setup/emplace.py index 2ba93591a..95333e29e 100644 --- a/setupsrc/pypdfium2_setup/emplace.py +++ b/setupsrc/pypdfium2_setup/emplace.py @@ -15,71 +15,68 @@ # CONSIDER Linux/macOS: check that minimum OS version requirements are fulfilled # TODO add direct support for emplacing local pdfium from file -# FIXME V8 integration is a bit polluted (flags vs. 
bool) -def _repr_info(version, flags): - return str(version) + (":{%s}" % ','.join(flags) if flags else "") - - -def get_pdfium(plat_spec, force_rebuild=False): +def _get_pdfium_with_cache(pl_name, req_ver, req_flags, use_v8): - if plat_spec == PlatNames.sourcebuild: - # for now, require that callers ran build_pdfium.py beforehand so they are in charge of the build config - don't trigger sourcebuild in here if platform files don't exist - return PlatNames.sourcebuild + # TODO inline binary cache logic into update_pdfium ? - req_ver = None - req_flags = [] - use_v8 = False - if PlatSpec_VerSep in plat_spec: - plat_spec, req_ver = plat_spec.rsplit(PlatSpec_VerSep) - if plat_spec.endswith(PlatSpec_V8Sym): - plat_spec = plat_spec[:-len(PlatSpec_V8Sym)] - req_flags += ["V8", "XFA"] - use_v8 = True + system = plat_to_system(pl_name) + pl_dir = DataDir / pl_name + binary = pl_dir / LibnameForSystem[system] + binary_ver = pl_dir / VersionFN + bindings = DataDir_Bindings / BindingsFN - if not plat_spec or plat_spec.lower() == PlatTarget_Auto: - pl_name = Host.platform - if pl_name is None: - raise RuntimeError(f"No pre-built binaries available for {Host}. 
You may place custom binaries & bindings in data/sourcebuild and install with `{PlatSpec_EnvVar}=sourcebuild`.") - elif hasattr(PlatNames, plat_spec): - pl_name = getattr(PlatNames, plat_spec) + if all(f.exists() for f in (binary, binary_ver, bindings)): + prev_info = read_json(binary_ver) + update = prev_info["build"] != req_ver or set(prev_info["flags"]) != set(req_flags) else: - raise ValueError(f"Invalid binary spec '{plat_spec}'") + update = True - if not req_ver or req_ver.lower() == VerTarget_Latest: - req_ver = PdfiumVer.get_latest() + if update: # TODO better repr + print(f"Updating to {req_ver} {req_flags}", file=sys.stderr) + update_pdfium.main([pl_name], version=req_ver, use_v8=use_v8) else: - assert req_ver.isnumeric() - req_ver = int(req_ver) + print("Using cache") - prev_repr = "(Unknown)" - if force_rebuild: - need_rebuild = True - prev_repr = "(Ignored)" - else: - pl_dir = DataDir / pl_name - ver_file = pl_dir / VersionFN - if ver_file.exists() and all(fp.exists() for fp in get_platfiles(pl_name)): - prev_info = read_json(ver_file) - need_rebuild = prev_info["build"] != req_ver or set(prev_info["flags"]) != set(req_flags) - prev_repr = _repr_info(prev_info["build"], prev_info["flags"]) - else: - need_rebuild = True + # build_pdfium_bindings() has its own cache logic, so always call to ensure bindings match + compile_lds = [DataDir/Host.platform] if pl_name == Host.platform else [] + build_pdfium_bindings(req_ver, flags=req_flags, compile_lds=compile_lds) + + +def prepare_setup(pl_name, pdfium_ver, use_v8): - req_repr = _repr_info(req_ver, req_flags) + clean_platfiles() + flags = ["V8", "XFA"] if use_v8 else [] - if need_rebuild: - print(f"Switch from pdfium {prev_repr} to {req_repr}", file=sys.stderr) - update_pdfium.main([pl_name], version=req_ver, use_v8=use_v8) + if pl_name == ExtPlats.system: + # TODO add option for caller to pass in custom run_lds and headers_dir + build_pdfium_bindings(pdfium_ver, flags=flags, guard_symbols=True, run_lds=[]) 
+ shutil.copyfile(DataDir_Bindings/BindingsFN, ModuleDir_Raw/BindingsFN) + write_pdfium_info(ModuleDir_Raw, pdfium_ver, origin="system", flags=flags) + return [BindingsFN, VersionFN] else: - print(f"Use existing cache for pdfium {req_repr}", file=sys.stderr) - - return pl_name + platfiles = [] + pl_dir = DataDir/pl_name + system = plat_to_system(pl_name) + + if pl_name == ExtPlats.sourcebuild: + # - sourcebuild bindings are captured once and can't really be re-generated, hence keep them in the platform directory so they are not overwritten + platfiles += [pl_dir/BindingsFN] + else: + platfiles += [DataDir_Bindings/BindingsFN] + _get_pdfium_with_cache(pl_name, pdfium_ver, flags, use_v8) + + platfiles += [pl_dir/LibnameForSystem[system], pl_dir/VersionFN] + for fp in platfiles: + shutil.copyfile(fp, ModuleDir_Raw/fp.name) + + return [fp.name for fp in platfiles] def main(): + # TODO add option to force rebuild parser = argparse.ArgumentParser( description = "Manage in-tree artifacts from an editable install.", ) @@ -89,20 +86,16 @@ def main(): nargs = "?", help = f"The platform specifier. Same format as of ${PlatSpec_EnvVar} on setup. 'none' removes existing artifacts.", ) - parser.add_argument( - "--force-rebuild", "-f", - action = "store_true", - help = "If given, always rebuild platform files even if a matching cache exists already.", - ) args = parser.parse_args() - if args.plat_spec == PlatTarget_None: + if args.plat_spec == ExtPlats.none: print("Remove existing in-tree platform files, if any.", file=sys.stderr) clean_platfiles() return - pl_name = get_pdfium(args.plat_spec, args.force_rebuild) - emplace_platfiles(pl_name) + need_prepare, *pl_info = parse_pl_spec(args.plat_spec) + assert need_prepare, "Can't use prepared target with emplace, would be no-op." 
+ prepare_setup(*pl_info) if __name__ == "__main__": diff --git a/setupsrc/pypdfium2_setup/packaging_base.py b/setupsrc/pypdfium2_setup/packaging_base.py index 5a3271fe0..1ad7101f3 100644 --- a/setupsrc/pypdfium2_setup/packaging_base.py +++ b/setupsrc/pypdfium2_setup/packaging_base.py @@ -9,23 +9,19 @@ import sys import json import shutil +import tarfile import platform import functools import sysconfig +import traceback import subprocess from pathlib import Path import urllib.request as url_request -sys.path.insert(0, str(Path(__file__).parents[1])) - # TODO(apibreak) consider renaming PDFIUM_PLATFORM to PDFIUM_BINARY ? -PlatSpec_EnvVar = "PDFIUM_PLATFORM" -PlatSpec_VerSep = ":" -PlatSpec_V8Sym = "-v8" -PlatTarget_None = "none" # sdist, no binary -PlatTarget_System = "system" # pdfium provided by system (if available) -PlatTarget_Auto = "auto" # pdfium-binaries for host -VerTarget_Latest = "latest" +PlatSpec_EnvVar = "PDFIUM_PLATFORM" +PlatSpec_VerSep = ":" +PlatSpec_V8Sym = "-v8" BindSpec_EnvVar = "PDFIUM_BINDINGS" BindTarget_Ref = "reference" @@ -42,6 +38,7 @@ ProjectDir = Path(__file__).parents[2] DataDir = ProjectDir / "data" +DataDir_Bindings = DataDir / "bindings" SourcebuildDir = ProjectDir / "sourcebuild" ModuleDir_Raw = ProjectDir / "src" / "pypdfium2_raw" ModuleDir_Helpers = ProjectDir / "src" / "pypdfium2" @@ -49,10 +46,10 @@ ChangelogStaging = ProjectDir / "docs" / "devel" / "changelog_staging.md" HAVE_GIT_REPO = (ProjectDir / ".git").exists() -AutoreleaseDir = ProjectDir / "autorelease" -AR_RecordFile = AutoreleaseDir / "record.json" # TODO verify contents on before merge -AR_ConfigFile = AutoreleaseDir / "config.json" -RefBindingsFile = AutoreleaseDir / BindingsFN +AutoreleaseDir = ProjectDir / "autorelease" +AR_RecordFile = AutoreleaseDir / "record.json" # TODO verify contents on before merge +AR_ConfigFile = AutoreleaseDir / "config.json" +RefBindingsFile = AutoreleaseDir / BindingsFN RepositoryURL = "https://github.com/pypdfium2-team/pypdfium2" 
PdfiumURL = "https://pdfium.googlesource.com/pdfium" @@ -84,7 +81,13 @@ class PlatNames: windows_x64 = SysNames.windows + "_x64" windows_x86 = SysNames.windows + "_x86" windows_arm64 = SysNames.windows + "_arm64" - sourcebuild = "sourcebuild" + + +class ExtPlats: + sourcebuild = "sourcebuild" + system = "system" + none = "none" + auto = "auto" ReleaseNames = { @@ -110,15 +113,13 @@ class PlatNames: BinaryPlatforms = list(ReleaseNames.keys()) BinarySystems = list(LibnameForSystem.keys()) -MainLibnames = list(LibnameForSystem.values()) class PdfiumVer: - # TODO consider namedtuple? V_KEYS = ("major", "minor", "build", "patch") + _refs_cache = {"lines": None, "dict": {}, "cursor": None} - # TODO consider cached property? @staticmethod @functools.lru_cache(maxsize=1) def get_latest(): @@ -126,31 +127,36 @@ def get_latest(): tag = git_ls.split("\t")[-1] return int( tag.split("/")[-1] ) - @staticmethod - def to_full(v_short, origin): + @classmethod + def to_full(cls, v_short, type=dict): - if origin == "pdfium-binaries": - # NOTE(future:conda) we may need this info to pin pdfium-binaries - info = url_request.urlopen(f"{ReleaseInfoURL}{v_short}").read().decode("utf-8") - info = json.loads(info) - title = info["name"] - match = re.match(rf"PDFium (\d+.\d+.{v_short}.\d+)", title) - v_string = match.group(1) - v_parts = [int(v) for v in v_string.split(".")] - v_short = int(v_short) - elif origin == "sourcebuild": - # For sourcebuild, we don't actually set the full version. Retrieving it from chromium is a bit complicated. Also note v_short may be a commit hash if building from an untagged commit. 
- v_parts = (None, None, v_short, None) - else: - assert False + v_short = int(v_short) + rc = cls._refs_cache - v_info = dict(zip(PdfiumVer.V_KEYS, v_parts)) - assert v_info["build"] == v_short + if rc["lines"] is None: + ChromiumURL = "https://chromium.googlesource.com/chromium/src" + rc["lines"] = run_cmd(["git", "ls-remote", "--sort", "-version:refname", "--tags", ChromiumURL, '*.*.*.0'], cwd=None, capture=True).split("\n") - return v_info - + if rc["cursor"] is None or rc["cursor"] > v_short: + for i, line in enumerate(rc["lines"]): + ref = line.split("\t")[-1].rsplit("/", maxsplit=1)[-1] + major, minor, build, patch = [int(v) for v in ref.split(".")] + rc["dict"][build] = (major, minor, build, patch) + if build == v_short: + rc["cursor"] = build + rc["lines"] = rc["lines"][i+1:] + break + + v_parts = rc["dict"][v_short] + if type in (tuple, list): + return v_parts + elif type is str: + return ".".join([str(v) for v in v_parts]) + elif type is dict: + return dict(zip(PdfiumVer.V_KEYS, v_parts)) + else: + assert False -# TODO Could consider adding a checksum to our JSON files as an barrier against corruption def read_json(fp): with open(fp, "r") as buf: @@ -161,18 +167,17 @@ def write_json(fp, data, indent=2): return json.dump(data, buf, indent=indent) -def write_pdfium_info(dir, version, origin, flags=[]): - # TODO(future) embed library search path for use with a custom ctypesgen loader - # TODO consider embedding ctypesgen version info, probably using a separate file and class? 
- info = dict(**PdfiumVer.to_full(version, origin), origin=origin, flags=flags) - info["bindings"] = BindTarget_Ref if BindTarget == BindTarget_Ref else "generated" +def write_pdfium_info(dir, build, origin, flags=[], n_commits=0, hash=None): + info = dict(**PdfiumVer.to_full(build, type=dict), n_commits=n_commits, hash=hash, origin=origin, flags=flags) write_json(dir/VersionFN, info) + return info def parse_given_tag(full_tag): info = dict() + # TODO looks like `git describe --dirty` does not account for new committable files, but this could be relevant - consider evaluating dirty ourselves by checking if `git status --porcelain` is non-empty tag = full_tag dirty = tag.endswith("-dirty") if dirty: @@ -226,7 +231,7 @@ def merge_tag(info, mode): def plat_to_system(pl_name): - if pl_name == PlatNames.sourcebuild: + if pl_name == ExtPlats.sourcebuild: # FIXME If doing a sourcebuild on an unknown host system, this returns None, which will cause binary detection code to fail (we need to know the platform-specific binary name) - handle this downsteam with fallback value? return Host.system result = [s for s in BinarySystems if pl_name.startswith(s)] @@ -336,7 +341,7 @@ def get_wheel_tag(pl_name): return "win_arm64" elif pl_name == PlatNames.windows_x86: return "win32" - elif pl_name == PlatNames.sourcebuild: + elif pl_name == ExtPlats.sourcebuild: # sysconfig.get_platform() may return universal2 on macOS. However, the binaries built here should be considered architecture-specific. # The reason why we don't simply do `if Host.platform: return get_wheel_tag(Host.platform) else ...` is that version info for pdfium-binaries does not have to match the sourcebuild host. # NOTE On Linux, this just returns f"linux_{arch}" (which is a valid wheel tag). Leave it as-is since we don't know the build's lowest compatible libc. The caller may re-tag using the wheel module's CLI. 
@@ -364,16 +369,13 @@ def run_cmd(command, cwd, capture=False, check=True, str_cast=True, **kwargs): return comp_process -def call_ctypesgen(target_dir, include_dir, pl_name, use_v8xfa=False, guard_symbols=False): - - # quick and dirty patch to allow using the pre-built bindings instead of calling ctypesgen - if BindTarget == BindTarget_Ref: - print("Using ref bindings as requested by env var.",file=sys.stderr) - if use_v8xfa: - print("Warning: default ref bindings are not V8/XFA compatible, expecting prior overwrite.") - shutil.copyfile(RefBindingsFile, target_dir/BindingsFN) - return - +def tar_extract_file(tar, src, dst_path): + src_buf = tar.extractfile(src) # src: path or tar member + with open(dst_path, "wb") as dst_buf: + shutil.copyfileobj(src_buf, dst_buf) + + +def run_ctypesgen(target_dir, headers_dir, flags=[], guard_symbols=False, compile_lds=[], run_lds=["."]): # The commands below are tailored to our fork of ctypesgen, so make sure we have that # Import ctypesgen only in this function so it does not have to be available for other setup tasks import ctypesgen @@ -381,28 +383,74 @@ def call_ctypesgen(target_dir, include_dir, pl_name, use_v8xfa=False, guard_symb bindings = target_dir / BindingsFN - args = ["ctypesgen", f"--strip-build-path={include_dir}", "--no-srcinfo", "--library", "pdfium", "--runtime-libdirs", "."] - if pl_name == Host.platform: - # assuming the binary already lies in target_dir - args += ["--compile-libdirs", target_dir] + args = ["ctypesgen", f"--strip-build-path={headers_dir}", "--no-srcinfo", "--library", "pdfium"] + + if run_lds: + args += ["--runtime-libdirs", *run_lds] + if compile_lds: + args += ["--compile-libdirs", *compile_lds] else: args += ["--no-load-library"] + if not guard_symbols: args += ["--no-symbol-guards"] - if use_v8xfa: - args += ["-D", "PDF_ENABLE_V8", "PDF_ENABLE_XFA"] - if pl_name.startswith(SysNames.windows) and Host.system == SysNames.windows: + if flags: + args += ["-D"] + [f"PDF_ENABLE_{f}" for f in 
flags] + if Host.system == SysNames.windows: args += ["-D", "_WIN32"] - args += ["--headers"] + [h.name for h in sorted(include_dir.glob("*.h"))] + ["-o", bindings] - run_cmd(args, cwd=include_dir) + args += ["--headers"] + [h.name for h in sorted(headers_dir.glob("*.h"))] + ["-o", bindings] + + run_cmd(args, cwd=headers_dir) text = bindings.read_text() - text = text.replace(str(include_dir), ".") + text = text.replace(str(headers_dir), ".") text = text.replace(str(Path.home()), "~") bindings.write_text(text) +def build_pdfium_bindings(version, headers_dir=None, flags=[], run_lds=["."], **kwargs): + + ver_path = DataDir_Bindings/VersionFN + bind_path = DataDir_Bindings/BindingsFN + + # TODO move refbindings handling into run_ctypesgen on behalf of sourcebuild? + # quick and dirty patch to allow using the pre-built bindings instead of calling ctypesgen + if BindTarget == BindTarget_Ref: + print("Using refbindings as requested by env var.",file=sys.stderr) + if flags: + print("Warning: default refbindings are not flags-compatible.") + shutil.copyfile(RefBindingsFile, DataDir_Bindings/BindingsFN) + ar_record = read_json(AR_RecordFile) + write_json(ver_path, dict(version=ar_record["pdfium"], flags=[], run_lds=["."], source="reference")) + return + + curr_info = dict(version=version, flags=list(flags), run_lds=list(run_lds), source="generated") + if bind_path.exists() and ver_path.exists(): + prev_info = read_json(ver_path) + if prev_info == curr_info: + return + else: + print(f"{prev_info} != {curr_info}") + + if not headers_dir: + headers_dir = DataDir_Bindings / "headers" + if headers_dir.exists(): + shutil.rmtree(headers_dir) + headers_dir.mkdir(parents=True, exist_ok=True) + archive_url = f"{PdfiumURL}/+archive/refs/heads/chromium/{version}/public.tar.gz" + archive_path = DataDir_Bindings / "pdfium_public.tar.gz" + url_request.urlretrieve(archive_url, archive_path) + with tarfile.open(archive_path) as tar: + for m in tar.getmembers(): + if m.isfile() and 
re.fullmatch(r"fpdf(\w+)\.h", m.name, flags=re.ASCII): + tar_extract_file(tar, m, headers_dir/m.name) + archive_path.unlink() + + run_ctypesgen(DataDir_Bindings, headers_dir, flags=flags, run_lds=run_lds, **kwargs) + write_json(ver_path, curr_info) + + def clean_platfiles(): deletables = [ @@ -410,7 +458,7 @@ def clean_platfiles(): ModuleDir_Raw / BindingsFN, ModuleDir_Raw / VersionFN, ] - deletables += [ModuleDir_Raw / fn for fn in MainLibnames] + deletables += [ModuleDir_Raw / fn for fn in LibnameForSystem.values()] for fp in deletables: if fp.is_file(): @@ -419,29 +467,89 @@ def clean_platfiles(): shutil.rmtree(fp) -def get_platfiles(pl_name): - system = plat_to_system(pl_name) - platfiles = ( - DataDir / pl_name / BindingsFN, - DataDir / pl_name / LibnameForSystem[system], - ) - return platfiles +def get_helpers_info(): + + # TODO consider adding some checks against record + + have_git_describe = False + if HAVE_GIT_REPO: + try: + helpers_info = parse_git_tag() + except subprocess.CalledProcessError: + print("Version uncertain: git describe failure - possibly a shallow checkout", file=sys.stderr) + traceback.print_exc() + else: + have_git_describe = True + helpers_info["data_source"] = "git" + else: + print("Version uncertain: git repo not available.") + + if not have_git_describe: + ver_file = ModuleDir_Helpers / VersionFN + if ver_file.exists(): + print("Falling back to given version info (e.g. 
sdist).", file=sys.stderr) + helpers_info = read_json(ver_file) + helpers_info["data_source"] = "given" + else: + print("Falling back to autorelease record.", file=sys.stderr) + record = read_json(AR_RecordFile) + helpers_info = parse_given_tag(record["tag"]) + helpers_info["data_source"] = "record" + + return helpers_info + + +def build_pl_suffix(version, use_v8): + return (PlatSpec_V8Sym if use_v8 else "") + PlatSpec_VerSep + str(version) -def emplace_platfiles(pl_name): +def parse_pl_spec(pl_spec, need_prepare=True): + + if pl_spec.startswith("prepared!"): + _, pl_spec = pl_spec.split("!", maxsplit=1) + return parse_pl_spec(pl_spec, need_prepare=False) + + req_ver = None + use_v8 = False + if PlatSpec_VerSep in pl_spec: + pl_spec, req_ver = pl_spec.rsplit(PlatSpec_VerSep) + if pl_spec.endswith(PlatSpec_V8Sym): + pl_spec = pl_spec[:-len(PlatSpec_V8Sym)] + use_v8 = True + + if not pl_spec or pl_spec == ExtPlats.auto: + pl_name = Host.platform + if pl_name is None: + raise RuntimeError(f"No pre-built binaries available for {Host}. You may place custom binaries & bindings in data/sourcebuild and install with `{PlatSpec_EnvVar}=sourcebuild`.") + elif hasattr(ExtPlats, pl_spec): + pl_name = getattr(ExtPlats, pl_spec) + elif hasattr(PlatNames, pl_spec): + pl_name = getattr(PlatNames, pl_spec) + else: + raise ValueError(f"Invalid binary spec '{pl_spec}'") - pl_dir = DataDir / pl_name - ver_file = pl_dir / VersionFN - if not pl_dir.exists(): - raise RuntimeError(f"Missing platform directory {pl_name}") - if not ver_file.exists(): - raise RuntimeError(f"Missing PDFium version file for {pl_name}") + if pl_name == ExtPlats.system: + assert req_ver, "Version must be given explicitly for system target." 
+ + if req_ver: + assert req_ver.isnumeric() + req_ver = int(req_ver) + else: + req_ver = PdfiumVer.get_latest() + + return need_prepare, pl_name, req_ver, use_v8 + + +def parse_modspec(modspec, pl_name): + if modspec: + modnames = modspec.split(",") + assert set(modnames).issubset(ModulesAll) + assert len(modnames) in (1, 2) + else: + modnames = ModulesAll - clean_platfiles() - platfiles = get_platfiles(pl_name) - shutil.copyfile(ver_file, ModuleDir_Raw/VersionFN) + modnames = list(modnames) + if pl_name == ExtPlats.none and ModuleRaw in modnames: + modnames.remove(ModuleRaw) - for fp in platfiles: - if not fp.exists(): - raise RuntimeError(f"Platform file missing: {fp}") - shutil.copy(fp, ModuleDir_Raw) + return modnames diff --git a/setupsrc/pypdfium2_setup/update_pdfium.py b/setupsrc/pypdfium2_setup/update_pdfium.py index 20273ac40..fe71a1725 100755 --- a/setupsrc/pypdfium2_setup/update_pdfium.py +++ b/setupsrc/pypdfium2_setup/update_pdfium.py @@ -4,15 +4,15 @@ import sys import shutil +import tarfile import argparse import traceback import functools from pathlib import Path -from urllib import request +import urllib.request as url_request from concurrent.futures import ThreadPoolExecutor sys.path.insert(0, str(Path(__file__).parents[1])) -from pypdfium2_setup._compat import safer_tar_unpack # TODO consider dotted access? 
from pypdfium2_setup.packaging_base import * @@ -39,7 +39,7 @@ def _get_package(pl_name, version, robust, use_v8): print(f"'{fu}' -> '{fp}'") try: - request.urlretrieve(fu, fp) + url_request.urlretrieve(fu, fp) except Exception: if robust: traceback.print_exc() @@ -50,7 +50,7 @@ def _get_package(pl_name, version, robust, use_v8): return pl_name, fp -def download_releases(platforms, version, use_v8, max_workers, robust): +def download(platforms, version, use_v8, max_workers, robust): if not max_workers: max_workers = len(platforms) @@ -65,43 +65,22 @@ def download_releases(platforms, version, use_v8, max_workers, robust): return archives -# TODO Do not unpack whole archives, instead extract only the binaries we need and retrieve headers from pdfium directly. This would allow us to get rid of the tar compat code. - -def unpack_archives(archives): - for pl_name, archive_path in archives.items(): - dest_dir = DataDir / pl_name / "build_tar" - safer_tar_unpack(archive_path, dest_dir) - archive_path.unlink() - - -def generate_bindings(archives, version, use_v8, ctypesgen_kws): - - flags = [] - if use_v8: - flags += ["V8", "XFA"] +def extract(archives, version, flags): - for pl_name in archives.keys(): + for pl_name, arc_path in archives.items(): - pl_dir = DataDir / pl_name - build_dir = pl_dir / "build_tar" - bin_dir = build_dir / "lib" - - system = plat_to_system(pl_name) - if system == SysNames.windows: - bin_dir = build_dir / "bin" - - libname = LibnameForSystem[system] - src_libpath = bin_dir / libname - assert src_libpath.is_file() - shutil.copyfile(src_libpath, pl_dir/libname) + with tarfile.open(arc_path) as tar: + pl_dir = DataDir/pl_name + system = plat_to_system(pl_name) + libname = LibnameForSystem[system] + tar_libdir = "lib" if system != SysNames.windows else "bin" + tar_extract_file(tar, f"{tar_libdir}/{libname}", pl_dir/libname) + write_pdfium_info(pl_dir, version, origin="pdfium-binaries", flags=flags) - call_ctypesgen(pl_dir, build_dir/"include", 
pl_name=pl_name, use_v8xfa=use_v8, **ctypesgen_kws) - write_pdfium_info(DataDir/pl_name, version=version, origin="pdfium-binaries", flags=flags) - - shutil.rmtree(build_dir) + arc_path.unlink() -def main(platforms, version=None, robust=False, max_workers=None, use_v8=False, ctypesgen_kws={}): +def main(platforms, version=None, robust=False, max_workers=None, use_v8=False): if not version: version = PdfiumVer.get_latest() @@ -110,10 +89,11 @@ def main(platforms, version=None, robust=False, max_workers=None, use_v8=False, if len(platforms) != len(set(platforms)): raise ValueError("Duplicate platforms not allowed.") + flags = ["V8", "XFA"] if use_v8 else [] + clear_data(platforms) - archives = download_releases(platforms, version, use_v8, max_workers, robust) - unpack_archives(archives) - generate_bindings(archives, version, use_v8, ctypesgen_kws) + archives = download(platforms, version, use_v8, max_workers, robust) + extract(archives, version, flags) # low-level CLI interface for testing - users should go with higher-level emplace.py or setup.py diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py index 498d32ed4..499477c17 100644 --- a/src/pypdfium2/_helpers/document.py +++ b/src/pypdfium2/_helpers/document.py @@ -153,7 +153,7 @@ def init_forms(self, config=None): # safety check for older binaries to prevent a segfault (could be removed at some point) # https://github.com/bblanchon/pdfium-binaries/issues/105 - if "V8" in PDFIUM_INFO.flags and PDFIUM_INFO.origin == "pdfium-binaries" and PDFIUM_INFO.build <= 5677: + if "V8" in PDFIUM_INFO.flags and PDFIUM_INFO.origin != "sourcebuild" and PDFIUM_INFO.build <= 5677: raise RuntimeError("V8 enabled pdfium-binaries builds <= 5677 crash on init_forms().") if not config: diff --git a/src/pypdfium2/_library_scope.py b/src/pypdfium2/_library_scope.py index 86f596b15..3407b9e80 100644 --- a/src/pypdfium2/_library_scope.py +++ b/src/pypdfium2/_library_scope.py @@ -8,11 +8,22 @@ def init_lib(): - 
# NOTE PDFium developers plan changes to the initialisation API (see https://crbug.com/pdfium/1446) assert not pdfium_i.LIBRARY_AVAILABLE if pdfium_i.DEBUG_AUTOCLOSE: print("Initialize PDFium (auto)", file=sys.stderr) - pdfium_c.FPDF_InitLibrary() + + # PDFium init API may change in the future: https://crbug.com/pdfium/1446 + # NOTE Technically, FPDF_InitLibrary() would be sufficient for our purposes, but since it's formally marked for deprecation, don't use it to be on the safe side. Also, avoid experimental config versions that might not be promoted to stable. + config = pdfium_c.FPDF_LIBRARY_CONFIG( + version = 2, + m_pUserFontPaths = None, + m_pIsolate = None, + m_v8EmbedderSlot = 0, + # m_pPlatform = None, # v3 + # m_RendererType = pdfium_c.FPDF_RENDERERTYPE_AGG, # v4 + ) + pdfium_c.FPDF_InitLibraryWithConfig(config) + pdfium_i.LIBRARY_AVAILABLE.value = True diff --git a/src/pypdfium2/version.py b/src/pypdfium2/version.py index d43a7a34f..11403a552 100644 --- a/src/pypdfium2/version.py +++ b/src/pypdfium2/version.py @@ -40,6 +40,25 @@ def __setattr__(self, name, value): def __repr__(self): return self.version + @cached_property + def api_tag(self): + return tuple(self._data[k] for k in self._TAG_FIELDS) + + def _craft_tag(self): + return ".".join(str(v) for v in self.api_tag) + + def _craft_desc(self, extra=[]): + + local_ver = [] + if self.n_commits > 0: + local_ver += [str(self.n_commits), str(self.hash)] + local_ver += extra + + desc = "" + if local_ver: + desc += "+" + ".".join(local_ver) + return desc + @cached_property def version(self): return self.tag + self.desc @@ -50,13 +69,9 @@ class _version_pypdfium2 (_abc_version): _FILE = Path(__file__).parent / "version.json" _TAG_FIELDS = ("major", "minor", "patch") - @cached_property - def api_tag(self): - return tuple(self._data[k] for k in self._TAG_FIELDS) - @cached_property def tag(self): - tag = ".".join(str(v) for v in self.api_tag) + tag = self._craft_tag() if self.beta is not None: tag += 
f"b{self.beta}" return tag @@ -64,15 +79,11 @@ def tag(self): @cached_property def desc(self): - desc = "" - local_ver = [] - if self.n_commits > 0: - local_ver += [str(self.n_commits), str(self.hash)] + extra = [] if self.dirty: - local_ver += ["dirty"] + extra += ["dirty"] - if local_ver: - desc = "+" + ".".join(local_ver) + desc = self._craft_desc(extra) if self.data_source != "git": desc += f":{self.data_source}" if self.is_editable: @@ -89,31 +100,21 @@ class _version_pdfium (_abc_version): def _process_data(self, data): data["flags"] = tuple(data["flags"]) - @cached_property - def api_tag(self): - if self.origin == "pdfium-binaries": - return tuple(self._data[k] for k in self._TAG_FIELDS) - else: - return self.build - @cached_property def tag(self): - if self.origin == "pdfium-binaries": - return ".".join(str(v) for v in self.api_tag) - else: - return str(self.build) + return self._craft_tag() @cached_property def desc(self): - desc = "" - if self.origin != "pdfium-binaries": - desc += f"+{self.origin}" + desc = self._craft_desc() if self.flags: desc += ":{%s}" % ",".join(self.flags) - if self.bindings != "generated": - desc += f"@bindings:{self.bindings}" + if self.origin != "pdfium-binaries": + desc += f"@{self.origin}" return desc +# TODO(future) add bindings info (e.g. ctypesgen version, reference/generated, runtime libdirs) + # Current API @@ -141,7 +142,6 @@ def desc(self): # Docs - PYPDFIUM_INFO = PYPDFIUM_INFO """ pypdfium2 helpers version. @@ -177,17 +177,12 @@ def desc(self): - ``given``: Pre-supplied version file (e.g. packaged with sdist, or else created by caller). - ``record``: Parsed from autorelease record. Implies that possible changes after tag are unknown.\n Note that *given* and *record* are not "trustworthy", they can be easily abused to pass arbitrary values. *git* should be correct provided the installed version file is not corrupted. 
- is_editable (bool): - True for editable install, False otherwise.\n + is_editable (bool | None): + True for editable install, False otherwise. None if unknown.\n If True, the version info is the one captured at install time. An arbitrary number of forward or reverse changes may have happened since. The actual current state is unknown. """ -# FIXME Integration of sourcebuild is quite polluted. Possible improvements: -# - Always use the latest available tag, and add hash/n_commits similar to PYPDFIUM_INFO. This would require a sufficiently deep checkout, though. -# - the build script might fail to check out tags - investigate and fix this -# - Determine major/minor/patch on sourcebuild (pdfium-binaries show how to do this) - PDFIUM_INFO = PDFIUM_INFO """ PDFium version. @@ -198,30 +193,31 @@ def desc(self): version (str): Joined tag and desc, forming the full version. tag (str): - Version ciphers joined as str. Just *str(build)* if other ciphers are unknown. + Version ciphers joined as str. desc (str): Descriptors (origin, flags) represented as str. - api_tag (tuple[int] | int | str): - Version ciphers joined as tuple, or just the build value (without tuple) if other ciphers are unknown. - major (int | None): + api_tag (tuple[int]): + Version ciphers joined as tuple. + major (int): Chromium major cipher. - minor (int | None): + minor (int): Chromium minor cipher. - build (int | str): - PDFium tag rsp. Chromium build cipher (int), or commit hash (str). + build (int): + Chromium/pdfium build cipher. This value allows to uniquely identify the pdfium sources the binary was built from. - For origin pdfium-binaries: always tag. For origin sourcebuild: tag if available, head commit otherwise. - patch (int | None): + patch (int): Chromium patch cipher. + n_commits (int): + Number of commits after tag at install time. 0 for tagged build commit. + hash (str | None): + Hash of head commit if n_commits > 0, None otherwise. origin (str): The pdfium binary's origin. 
Possible values:\n - - ``pdfium-binaries``: Compiled by bblanchon/pdfium-binaries, and bundled into pypdfium2. Chromium ciphers known. - - ``sourcebuild``: Provided by the caller (commonly compiled using pypdfium2's integrated build script), and bundled into pypdfium2. Chromium ciphers unknown. + - ``pdfium-binaries``: Compiled by bblanchon/pdfium-binaries, and bundled into pypdfium2. + - ``sourcebuild``: Provided by the caller (commonly compiled using pypdfium2's integrated build script), and bundled into pypdfium2. - ``system``: Dynamically loaded from a standard system location using :func:`ctypes.util.find_library`. flags (tuple[str]): Tuple of pdfium feature flags. Empty for default build. (V8, XFA) for pdfium-binaries V8 build. - bindings (str): - Info on the used bindings (generated, reference). Note that the reference bindings can be ABI-unsafe. (This field is experimental. In the future, we may want to integrate bindings info separately with ctypesgen version.) """ # ----- diff --git a/tests_old/conftest.py b/tests_old/conftest.py index e2fafa298..33b86c705 100644 --- a/tests_old/conftest.py +++ b/tests_old/conftest.py @@ -15,9 +15,6 @@ ResourceDir = TestDir / "resources" OutputDir = TestDir / "output" -sys.path.insert(0, str(ProjectDir / "setupsrc")) - - class TestFiles: render = ResourceDir / "render.pdf" encrypted = ResourceDir / "encrypted.pdf" diff --git a/tests_old/test_setup.py b/tests_old/test_setup.py deleted file mode 100644 index d9cd94e3a..000000000 --- a/tests_old/test_setup.py +++ /dev/null @@ -1,118 +0,0 @@ -# SPDX-FileCopyrightText: 2023 geisserml -# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause - -import re -import pytest -# import sysconfig -# from pathlib import Path -# from wheel.bdist_wheel import bdist_wheel -from pypdfium2_setup import ( - # setup_base, - packaging_base as pkg_base, -) -from pypdfium2_setup.packaging_base import ( - PlatNames, - BinaryPlatforms, - ReleaseNames, -) -from .conftest import ProjectDir, get_members 
- - -@pytest.fixture -def all_platnames(): - return list( get_members(PlatNames) ) - - -# module - -# NOTE migrated to pyproject.toml -# def test_entrypoint(): - -# setup_cfg = configparser.ConfigParser() -# setup_cfg.read( join(ProjectDir, "setup.cfg") ) -# console_scripts = setup_cfg["options.entry_points"]["console_scripts"] - -# entry_point = console_scripts.split("=")[-1].strip().split(":") -# module_path = entry_point[0] -# method_name = entry_point[1] - -# namespace = {} -# exec("from %s import %s" % (module_path, method_name), namespace) -# assert method_name in namespace - -# function = namespace[method_name] -# assert callable(function) - - -# setup_base - -ExpectedTags = ( - (PlatNames.linux_x64, "manylinux_2_17_x86_64"), - (PlatNames.linux_x86, "manylinux_2_17_i686"), - (PlatNames.linux_arm64, "manylinux_2_17_aarch64"), - (PlatNames.linux_arm32, "manylinux_2_17_armv7l"), - (PlatNames.linux_musl_x64, "musllinux_1_1_x86_64"), - (PlatNames.linux_musl_x86, "musllinux_1_1_i686"), - (PlatNames.linux_musl_arm64, "musllinux_1_1_aarch64"), - (PlatNames.darwin_x64, "macosx_10_13_x86_64"), - (PlatNames.darwin_arm64, "macosx_11_0_arm64"), - (PlatNames.windows_x64, "win_amd64"), - (PlatNames.windows_arm64, "win_arm64"), - (PlatNames.windows_x86, "win32"), - # FIXME not sure how to test this - # (PlatNames.sourcebuild, ...), -) - - -def test_expected_tags(all_platnames): - assert len(all_platnames) == len(ExpectedTags)+1 - for platform, tag in ExpectedTags: - assert hasattr(PlatNames, platform) - assert isinstance(tag, str) - - -def test_get_tag(): - for platform, tag in ExpectedTags: - assert pkg_base.get_wheel_tag(platform) == tag - -def test_unknown_tag(): - plat_dir = "win_amd74" - with pytest.raises(ValueError, match=re.escape("Unknown platform name %s" % plat_dir)): - pkg_base.get_wheel_tag(plat_dir) - -# def test_get_bdist(): -# for platform, _ in ExpectedTags: -# bdist_cls = setup_base.bdist_factory(platform) -# assert issubclass(bdist_cls, bdist_wheel) - - 
-# packaging_base -# TODO update/extend - -def test_libnames(): - for name in pkg_base.MainLibnames: - assert "pdfium" in name - -def test_PlatNames(all_platnames): - # make sure variable names and values are identical - for name in all_platnames: - assert name == getattr(PlatNames, name) - -def test_paths(): - # FIXME not much point doing this? - assert pkg_base.ProjectDir == ProjectDir - assert pkg_base.DataDir == ProjectDir / "data" - assert pkg_base.SourcebuildDir == ProjectDir / "sourcebuild" - assert pkg_base.ModuleDir_Helpers == ProjectDir / "src" / "pypdfium2" - - -# update_pdfium - -def test_releasenames(all_platnames): - assert len(ReleaseNames) == len(BinaryPlatforms) == len(all_platnames) - 1 - for key, value in ReleaseNames.items(): - assert key in BinaryPlatforms - assert hasattr(PlatNames, key) - system, cpu = value.replace("linux-musl", "musllinux").split("-", maxsplit=3) - assert system in ("linux", "musllinux", "mac", "win") - assert cpu in ("x64", "x86", "arm64", "arm")