diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..cf9f09c --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,32 @@ +name: CI +on: + # Trigger the workflow on push or pull request events but only for the master branch + push: + branches: [ master ] + pull_request: + branches: [ master ] + # Allow running this workflow manually from the Actions tab + workflow_dispatch: +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Install libarchive + run: sudo apt-get install -y libarchive13 + - name: Install Python 3.9 + uses: actions/setup-python@v2 + with: + python-version: '3.9' + - name: Install Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: '3.8' + - name: Install Python 3.7 + uses: actions/setup-python@v2 + with: + python-version: '3.7' + - name: Install tox + run: pip install tox + - name: Run the tests + run: tox diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 55770e6..0000000 --- a/.travis.yml +++ /dev/null @@ -1,42 +0,0 @@ -language: python -matrix: - include: - - python: 3.6 - env: TOXENV=py36 - - python: 3.7 - env: TOXENV=py37 - - python: 3.8 - env: TOXENV=py38 - - python: 3.9 - env: TOXENV=py39 - -branches: - only: - - master - -cache: - directories: - - /opt/python-libarchive-c - -env: - global: - - LIBARCHIVE=/opt/python-libarchive-c/lib/libarchive.so - -before_install: - - sudo apt-get install -y zlib1g-dev liblzma-dev libbz2-dev libxml2-dev nettle-dev libattr1-dev libacl1-dev - - "if [ ! -e $LIBARCHIVE ]; then - wget http://libarchive.org/downloads/libarchive-3.3.2.tar.gz && - tar -xf libarchive-3.3.2.tar.gz && cd libarchive-3.3.2 && - ./configure --prefix=/opt/python-libarchive-c --disable-bsdcpio --disable-bsdtar && - make && sudo make install && cd .. 
; - fi" - -install: pip install tox - -script: tox - -notifications: - email: false - -sudo: required -dist: xenial diff --git a/README.rst b/README.rst index 15e62ca..bdffa35 100644 --- a/README.rst +++ b/README.rst @@ -1,6 +1,3 @@ -.. image:: https://travis-ci.org/Changaco/python-libarchive-c.svg - :target: https://travis-ci.org/Changaco/python-libarchive-c - A Python interface to libarchive. It uses the standard ctypes_ module to dynamically load and access the C library. @@ -17,7 +14,7 @@ Compatibility python ------ -python-libarchive-c is currently tested with python 3.6, 3.7, 3.8, and 3.9. +python-libarchive-c is currently tested with python 3.7, 3.8, and 3.9. If you find an incompatibility with older versions you can send us a small patch, but we won't accept big changes. @@ -36,32 +33,86 @@ Import:: import libarchive -To extract an archive to the current directory:: +Extracting archives +------------------- + +To extract an archive, use the ``extract_file`` function:: + os.chdir('/path/to/target/directory') libarchive.extract_file('test.zip') -``extract_memory`` extracts from a buffer instead, and ``extract_fd`` extracts -from a file descriptor. +Alternatively, the ``extract_memory`` function can be used to extract from a buffer, +and ``extract_fd`` from a file descriptor. + +The ``extract_*`` functions all have an integer ``flags`` argument which is passed +directly to the C function ``archive_write_disk_set_options()``. You can import +the ``EXTRACT_*`` constants from the ``libarchive.extract`` module and see the +official description of each flag in the ``archive_write_disk(3)`` man page. + +By default, when the ``flags`` argument is ``None``, the ``SECURE_NODOTDOT``, +``SECURE_NOABSOLUTEPATHS`` and ``SECURE_SYMLINKS`` flags are passed to +libarchive, unless the current directory is the root (``/``). 
-To read an archive:: +Reading archives +---------------- + +To read an archive, use the ``file_reader`` function:: with libarchive.file_reader('test.7z') as archive: for entry in archive: for block in entry.get_blocks(): ... -``memory_reader`` reads from a memory buffer instead, and ``fd_reader`` reads -from a file descriptor. +Alternatively, the ``memory_reader`` function can be used to read from a buffer, +``fd_reader`` from a file descriptor, ``stream_reader`` from a stream object +(which must support the standard ``readinto`` method), and ``custom_reader`` +from anywhere using callbacks. -To create an archive:: +To learn about the attributes of the ``entry`` object, see the ``libarchive/entry.py`` +source code or run ``help(libarchive.entry.ArchiveEntry)`` in a Python shell. - with libarchive.file_writer('test.tar.gz', 'ustar', 'gzip') as archive: - archive.add_files('libarchive/', 'README.rst') +Displaying progress +~~~~~~~~~~~~~~~~~~~ -``memory_writer`` writes to a memory buffer instead, ``fd_writer`` writes to a -file descriptor, and ``custom_writer`` sends the data to a callback function. +If your program processes large archives, you can keep track of its progress +with the ``bytes_read`` attribute. Here's an example of a progress bar using +`tqdm <https://pypi.org/project/tqdm/>`_:: -You can also find more thorough examples in the ``tests/`` directory. + with tqdm(total=os.stat(archive_path).st_size, unit='bytes') as pbar, \ + libarchive.file_reader(archive_path) as archive: + for entry in archive: + ... + pbar.update(archive.bytes_read - pbar.n) + +Creating archives +----------------- + +To create an archive, use the ``file_writer`` function:: + + from libarchive.entry import FileType + + with libarchive.file_writer('test.tar.gz', 'ustar', 'gzip') as archive: + # Add the `libarchive/` directory and everything in it (recursively), + # then the `README.rst` file. + archive.add_files('libarchive/', 'README.rst') + # Add a regular file defined from scratch. 
+ data = b'foobar' + archive.add_file_from_memory('../escape-test', len(data), data) + # Add a directory defined from scratch. + early_epoch = (42, 42) # 1970-01-01 00:00:42.000000042 + archive.add_file_from_memory( + 'metadata-test', 0, b'', + filetype=FileType.DIRECTORY, permission=0o755, uid=4242, gid=4242, + atime=early_epoch, mtime=early_epoch, ctime=early_epoch, birthtime=early_epoch, + ) + +Alternatively, the ``memory_writer`` function can be used to write to a memory buffer, +``fd_writer`` to a file descriptor, and ``custom_writer`` to a callback function. + +For each of those functions, the mandatory second argument is the archive format, +and the optional third argument is the compression format (called “filter” in +libarchive). The acceptable values are listed in ``libarchive.ffi.WRITE_FORMATS`` +and ``libarchive.ffi.WRITE_FILTERS``. License ======= diff --git a/libarchive/entry.py b/libarchive/entry.py index bd5e223..1b60668 100644 --- a/libarchive/entry.py +++ b/libarchive/entry.py @@ -1,9 +1,21 @@ from contextlib import contextmanager from ctypes import c_char_p, create_string_buffer +from enum import IntEnum +import math from . import ffi +class FileType(IntEnum): + NAMED_PIPE = AE_IFIFO = 0o010000 # noqa: E221 + CHAR_DEVICE = AE_IFCHR = 0o020000 # noqa: E221 + DIRECTORY = AE_IFDIR = 0o040000 # noqa: E221 + BLOCK_DEVICE = AE_IFBLK = 0o060000 # noqa: E221 + REGULAR_FILE = AE_IFREG = 0o100000 # noqa: E221 + SYMBOLINK_LINK = AE_IFLNK = 0o120000 # noqa: E221 + SOCKET = AE_IFSOCK = 0o140000 # noqa: E221 + + @contextmanager def new_archive_entry(): entry_p = ffi.entry_new() @@ -24,27 +36,111 @@ class ArchiveEntry: __slots__ = ('_archive_p', '_entry_p') - def __init__(self, archive_p, entry_p): + def __init__(self, archive_p=None, **attributes): + """Allocate memory for an `archive_entry` struct. + + The attributes are passed to the `modify` method. 
+ """ self._archive_p = archive_p - self._entry_p = entry_p + self._entry_p = ffi.entry_new() + if attributes: + self.modify(**attributes) + + def __del__(self): + """Free the C struct""" + ffi.entry_free(self._entry_p) def __str__(self): + """Returns the file's path""" return self.pathname + def modify(self, **attributes): + """Convenience method to modify the entry's attributes. + + Args: + filetype (int): the file's type, see the `FileType` class for values + pathname (str): the file's path + linkpath (str): the other path of the file, if the file is a link + size (int | None): the file's size, in bytes + perm (int): the file's permissions in standard Unix format, e.g. 0o640 + uid (int): the file owner's numerical identifier + gid (int): the file group's numerical identifier + uname (str | bytes): the file owner's name + gname (str | bytes): the file group's name + atime (int | Tuple[int, int] | float | None): + the file's most recent access time, + either in seconds or as a tuple (seconds, nanoseconds) + mtime (int | Tuple[int, int] | float | None): + the file's most recent modification time, + either in seconds or as a tuple (seconds, nanoseconds) + ctime (int | Tuple[int, int] | float | None): + the file's most recent metadata change time, + either in seconds or as a tuple (seconds, nanoseconds) + birthtime (int | Tuple[int, int] | float | None): + the file's creation time (for archive formats that support it), + either in seconds or as a tuple (seconds, nanoseconds) + rdev (int | Tuple[int, int]): device number, if the file is a device + rdevmajor (int): major part of the device number + rdevminor (int): minor part of the device number + """ + for name, value in attributes.items(): + setattr(self, name, value) + @property def filetype(self): return ffi.entry_filetype(self._entry_p) + @filetype.setter + def filetype(self, value): + ffi.entry_set_filetype(self._entry_p, value) + @property def uid(self): return ffi.entry_uid(self._entry_p) + @uid.setter + def 
uid(self, uid): + ffi.entry_set_uid(self._entry_p, uid) + @property def gid(self): return ffi.entry_gid(self._entry_p) + @gid.setter + def gid(self, gid): + ffi.entry_set_gid(self._entry_p, gid) + + @property + def uname(self): + return ffi.entry_uname_w(self._entry_p) + + @uname.setter + def uname(self, value): + if not isinstance(value, bytes): + value = value.encode('utf8') + ffi.entry_update_uname_utf8(self._entry_p, value) + + @property + def gname(self): + return ffi.entry_gname_w(self._entry_p) + + @gname.setter + def gname(self, value): + if not isinstance(value, bytes): + value = value.encode('utf8') + ffi.entry_update_gname_utf8(self._entry_p, value) + def get_blocks(self, block_size=ffi.page_size): + """Read the file's content, keeping only one chunk in memory at a time. + + Don't do anything like `list(entry.get_blocks())`, it would silently fail. + + Args: + block_size (int): the buffer's size, in bytes + """ archive_p = self._archive_p + if not archive_p: + raise TypeError("this entry isn't linked to any content") buf = create_string_buffer(block_size) read = ffi.read_data while 1: @@ -52,6 +148,7 @@ def get_blocks(self, block_size=ffi.page_size): if r == 0: break yield buf.raw[0:r] + self.__class__ = ConsumedArchiveEntry @property def isblk(self): @@ -78,16 +175,6 @@ def islnk(self): def issym(self): return self.filetype & 0o170000 == 0o120000 - def _linkpath(self): - return (ffi.entry_symlink_w(self._entry_p) or - ffi.entry_hardlink_w(self._entry_p) or - ffi.entry_symlink(self._entry_p) or - ffi.entry_hardlink(self._entry_p)) - - # aliases to get the same api as tarfile - linkpath = property(_linkpath) - linkname = property(_linkpath) - @property def isreg(self): return self.filetype & 0o170000 == 0o100000 @@ -106,77 +193,214 @@ def isdev(self): @property def atime(self): + if not ffi.entry_atime_is_set(self._entry_p): + return None sec_val = ffi.entry_atime(self._entry_p) nsec_val = ffi.entry_atime_nsec(self._entry_p) return format_time(sec_val, 
nsec_val) + @atime.setter + def atime(self, value): + if value is None: + ffi.entry_unset_atime(self._entry_p) + elif isinstance(value, int): + self.set_atime(value) + elif isinstance(value, tuple): + self.set_atime(*value) + else: + seconds, fraction = math.modf(value) + self.set_atime(int(seconds), int(fraction * 1_000_000_000)) + def set_atime(self, timestamp_sec, timestamp_nsec): - return ffi.entry_set_atime(self._entry_p, - timestamp_sec, timestamp_nsec) + "Kept for backward compatibility. `entry.atime = ...` is supported now." + return ffi.entry_set_atime(self._entry_p, timestamp_sec, timestamp_nsec) @property def mtime(self): + if not ffi.entry_mtime_is_set(self._entry_p): + return None sec_val = ffi.entry_mtime(self._entry_p) nsec_val = ffi.entry_mtime_nsec(self._entry_p) return format_time(sec_val, nsec_val) + @mtime.setter + def mtime(self, value): + if value is None: + ffi.entry_unset_mtime(self._entry_p) + elif isinstance(value, int): + self.set_mtime(value) + elif isinstance(value, tuple): + self.set_mtime(*value) + else: + seconds, fraction = math.modf(value) + self.set_mtime(int(seconds), int(fraction * 1_000_000_000)) + def set_mtime(self, timestamp_sec, timestamp_nsec): - return ffi.entry_set_mtime(self._entry_p, - timestamp_sec, timestamp_nsec) + "Kept for backward compatibility. `entry.mtime = ...` is supported now." 
+ return ffi.entry_set_mtime(self._entry_p, timestamp_sec, timestamp_nsec) @property def ctime(self): + if not ffi.entry_ctime_is_set(self._entry_p): + return None sec_val = ffi.entry_ctime(self._entry_p) nsec_val = ffi.entry_ctime_nsec(self._entry_p) return format_time(sec_val, nsec_val) + @ctime.setter + def ctime(self, value): + if value is None: + ffi.entry_unset_ctime(self._entry_p) + elif isinstance(value, int): + self.set_ctime(value) + elif isinstance(value, tuple): + self.set_ctime(*value) + else: + seconds, fraction = math.modf(value) + self.set_ctime(int(seconds), int(fraction * 1_000_000_000)) + def set_ctime(self, timestamp_sec, timestamp_nsec): - return ffi.entry_set_ctime(self._entry_p, - timestamp_sec, timestamp_nsec) + "Kept for backward compatibility. `entry.ctime = ...` is supported now." + return ffi.entry_set_ctime(self._entry_p, timestamp_sec, timestamp_nsec) @property def birthtime(self): + if not ffi.entry_birthtime_is_set(self._entry_p): + return None sec_val = ffi.entry_birthtime(self._entry_p) nsec_val = ffi.entry_birthtime_nsec(self._entry_p) return format_time(sec_val, nsec_val) - def set_birthtime(self, timestamp_sec, timestamp_nsec): - return ffi.entry_set_birthtime(self._entry_p, - timestamp_sec, timestamp_nsec) + @birthtime.setter + def birthtime(self, value): + if value is None: + ffi.entry_unset_birthtime(self._entry_p) + elif isinstance(value, int): + self.set_birthtime(value) + elif isinstance(value, tuple): + self.set_birthtime(*value) + else: + seconds, fraction = math.modf(value) + self.set_birthtime(int(seconds), int(fraction * 1_000_000_000)) + + def set_birthtime(self, timestamp_sec, timestamp_nsec=0): + "Kept for backward compatibility. `entry.birthtime = ...` is supported now." 
+ return ffi.entry_set_birthtime( + self._entry_p, timestamp_sec, timestamp_nsec + ) - def _getpathname(self): - return (ffi.entry_pathname_w(self._entry_p) or - ffi.entry_pathname(self._entry_p)) - - def _setpathname(self, value): + @property + def pathname(self): + path = ffi.entry_pathname_w(self._entry_p) + if not path: + path = ffi.entry_pathname(self._entry_p) + try: + path = path.decode() + except UnicodeError: + pass + return path + + @pathname.setter + def pathname(self, value): if not isinstance(value, bytes): value = value.encode('utf8') ffi.entry_update_pathname_utf8(self._entry_p, c_char_p(value)) - pathname = property(_getpathname, _setpathname) - # aliases to get the same api as tarfile - path = property(_getpathname, _setpathname) - name = property(_getpathname, _setpathname) + @property + def linkpath(self): + return (ffi.entry_symlink_w(self._entry_p) or + ffi.entry_hardlink_w(self._entry_p) or + ffi.entry_symlink(self._entry_p) or + ffi.entry_hardlink(self._entry_p)) + + @linkpath.setter + def linkpath(self, value): + ffi.entry_update_link_utf8(self._entry_p, value) + + # aliases for compatibility with the standard `tarfile` module + path = property(pathname.fget, pathname.fset, doc="alias of pathname") + name = path + linkname = property(linkpath.fget, linkpath.fset, doc="alias of linkpath") @property def size(self): if ffi.entry_size_is_set(self._entry_p): return ffi.entry_size(self._entry_p) + @size.setter + def size(self, value): + if value is None: + ffi.entry_unset_size(self._entry_p) + else: + ffi.entry_set_size(self._entry_p, value) + @property def mode(self): return ffi.entry_mode(self._entry_p) + @mode.setter + def mode(self, value): + ffi.entry_set_mode(self._entry_p, value) + @property def strmode(self): + """The file's mode as a string, e.g. 
'?rwxrwx---'""" # note we strip the mode because archive_entry_strmode # returns a trailing space: strcpy(bp, "?rwxrwxrwx "); return ffi.entry_strmode(self._entry_p).strip() + @property + def perm(self): + return ffi.entry_perm(self._entry_p) + + @perm.setter + def perm(self, value): + ffi.entry_set_perm(self._entry_p, value) + + @property + def rdev(self): + return ffi.entry_rdev(self._entry_p) + + @rdev.setter + def rdev(self, value): + if isinstance(value, tuple): + ffi.entry_set_rdevmajor(self._entry_p, value[0]) + ffi.entry_set_rdevminor(self._entry_p, value[1]) + else: + ffi.entry_set_rdev(self._entry_p, value) + @property def rdevmajor(self): return ffi.entry_rdevmajor(self._entry_p) + @rdevmajor.setter + def rdevmajor(self, value): + ffi.entry_set_rdevmajor(self._entry_p, value) + @property def rdevminor(self): return ffi.entry_rdevminor(self._entry_p) + + @rdevminor.setter + def rdevminor(self, value): + ffi.entry_set_rdevminor(self._entry_p, value) + + @property + def format_name(self): + return ffi.format_name(self._pointer) + + +class ConsumedArchiveEntry(ArchiveEntry): + + __slots__ = () + + def get_blocks(self, **kw): + raise TypeError("the content of this entry has already been read") + + +class PassedArchiveEntry(ArchiveEntry): + + __slots__ = () + + def get_blocks(self, **kw): + raise TypeError("this entry is passed, it's too late to read its content") diff --git a/libarchive/extract.py b/libarchive/extract.py index bbf64ff..bf0c703 100644 --- a/libarchive/extract.py +++ b/libarchive/extract.py @@ -1,5 +1,6 @@ from contextlib import contextmanager from ctypes import byref, c_longlong, c_size_t, c_void_p +import os from .ffi import ( write_disk_new, write_disk_set_options, write_free, write_header, @@ -27,6 +28,12 @@ EXTRACT_SECURE_NOABSOLUTEPATHS = 0x10000 EXTRACT_CLEAR_NOCHANGE_FFLAGS = 0x20000 +PREVENT_ESCAPE = ( + EXTRACT_SECURE_NOABSOLUTEPATHS | + EXTRACT_SECURE_NODOTDOT | + EXTRACT_SECURE_SYMLINKS +) + @contextmanager def 
new_archive_write_disk(flags): @@ -38,9 +45,16 @@ def new_archive_write_disk(flags): write_free(archive_p) -def extract_entries(entries, flags=0): +def extract_entries(entries, flags=None): """Extracts the given archive entries into the current directory. """ + if flags is None: + if os.getcwd() == '/': + # If the current directory is the root, then trying to prevent + # escaping is probably undesirable. + flags = 0 + else: + flags = PREVENT_ESCAPE buff, size, offset = c_void_p(), c_size_t(), c_longlong() buff_p, size_p, offset_p = byref(buff), byref(size), byref(offset) with new_archive_write_disk(flags) as write_p: @@ -55,20 +69,20 @@ def extract_entries(entries, flags=0): write_finish_entry(write_p) -def extract_fd(fd, flags=0): +def extract_fd(fd, flags=None): """Extracts an archive from a file descriptor into the current directory. """ with fd_reader(fd) as archive: extract_entries(archive, flags) -def extract_file(filepath, flags=0): +def extract_file(filepath, flags=None): """Extracts an archive from a file into the current directory.""" with file_reader(filepath) as archive: extract_entries(archive, flags) -def extract_memory(buffer_, flags=0): +def extract_memory(buffer_, flags=None): """Extracts an archive from memory into the current directory.""" with memory_reader(buffer_) as archive: extract_entries(archive, flags) diff --git a/libarchive/ffi.py b/libarchive/ffi.py index 7dc16b1..2b85794 100644 --- a/libarchive/ffi.py +++ b/libarchive/ffi.py @@ -34,8 +34,6 @@ ARCHIVE_WARN = -20 # Partial success. ARCHIVE_FAILED = -25 # Current operation cannot complete. ARCHIVE_FATAL = -30 # No more operations are possible. 
-REGULAR_FILE = 0o100000 -DEFAULT_UNIX_PERMISSION = 0o664 # Callback types @@ -163,6 +161,10 @@ def get_write_filter_function(filter_name): errno = ffi('errno', [c_archive_p], c_int) error_string = ffi('error_string', [c_archive_p], c_char_p) +ffi('filter_bytes', [c_archive_p, c_int], c_longlong) +ffi('filter_count', [c_archive_p], c_int) +ffi('filter_name', [c_archive_p, c_int], c_char_p) +ffi('format_name', [c_archive_p], c_char_p) # archive_entry @@ -177,6 +179,10 @@ def get_write_filter_function(filter_name): ffi('entry_birthtime_nsec', [c_archive_entry_p], c_long) ffi('entry_mtime_nsec', [c_archive_entry_p], c_long) ffi('entry_ctime_nsec', [c_archive_entry_p], c_long) +ffi('entry_atime_is_set', [c_archive_entry_p], c_int) +ffi('entry_birthtime_is_set', [c_archive_entry_p], c_int) +ffi('entry_mtime_is_set', [c_archive_entry_p], c_int) +ffi('entry_ctime_is_set', [c_archive_entry_p], c_int) ffi('entry_pathname', [c_archive_entry_p], c_char_p) ffi('entry_pathname_w', [c_archive_entry_p], c_wchar_p) ffi('entry_sourcepath', [c_archive_entry_p], c_char_p) @@ -184,24 +190,42 @@ def get_write_filter_function(filter_name): ffi('entry_size_is_set', [c_archive_entry_p], c_int) ffi('entry_mode', [c_archive_entry_p], c_int) ffi('entry_strmode', [c_archive_entry_p], c_char_p) +ffi('entry_perm', [c_archive_entry_p], c_int) ffi('entry_hardlink', [c_archive_entry_p], c_char_p) ffi('entry_hardlink_w', [c_archive_entry_p], c_wchar_p) ffi('entry_symlink', [c_archive_entry_p], c_char_p) ffi('entry_symlink_w', [c_archive_entry_p], c_wchar_p) +ffi('entry_rdev', [c_archive_entry_p], c_uint) ffi('entry_rdevmajor', [c_archive_entry_p], c_uint) ffi('entry_rdevminor', [c_archive_entry_p], c_uint) ffi('entry_uid', [c_archive_entry_p], c_longlong) ffi('entry_gid', [c_archive_entry_p], c_longlong) +ffi('entry_uname_w', [c_archive_entry_p], c_wchar_p) +ffi('entry_gname_w', [c_archive_entry_p], c_wchar_p) ffi('entry_set_size', [c_archive_entry_p, c_longlong], None) ffi('entry_set_filetype', 
[c_archive_entry_p, c_uint], None) +ffi('entry_set_uid', [c_archive_entry_p, c_longlong], None) +ffi('entry_set_gid', [c_archive_entry_p, c_longlong], None) +ffi('entry_set_mode', [c_archive_entry_p, c_int], None) ffi('entry_set_perm', [c_archive_entry_p, c_int], None) ffi('entry_set_atime', [c_archive_entry_p, c_time_t, c_long], None) ffi('entry_set_mtime', [c_archive_entry_p, c_time_t, c_long], None) ffi('entry_set_ctime', [c_archive_entry_p, c_time_t, c_long], None) ffi('entry_set_birthtime', [c_archive_entry_p, c_time_t, c_long], None) - -ffi('entry_update_pathname_utf8', [c_archive_entry_p, c_char_p], None) +ffi('entry_set_rdev', [c_archive_entry_p, c_uint], None) +ffi('entry_set_rdevmajor', [c_archive_entry_p, c_uint], None) +ffi('entry_set_rdevminor', [c_archive_entry_p, c_uint], None) +ffi('entry_unset_size', [c_archive_entry_p], None) +ffi('entry_unset_atime', [c_archive_entry_p], None) +ffi('entry_unset_mtime', [c_archive_entry_p], None) +ffi('entry_unset_ctime', [c_archive_entry_p], None) +ffi('entry_unset_birthtime', [c_archive_entry_p], None) + +ffi('entry_update_pathname_utf8', [c_archive_entry_p, c_char_p], c_int, check_int) +ffi('entry_update_link_utf8', [c_archive_entry_p, c_char_p], c_int, check_int) +ffi('entry_update_uname_utf8', [c_archive_entry_p, c_char_p], c_int, check_int) +ffi('entry_update_gname_utf8', [c_archive_entry_p, c_char_p], c_int, check_int) ffi('entry_clear', [c_archive_entry_p], c_archive_entry_p) ffi('entry_free', [c_archive_entry_p], None) diff --git a/libarchive/read.py b/libarchive/read.py index ff7a408..3e2dbcc 100644 --- a/libarchive/read.py +++ b/libarchive/read.py @@ -7,7 +7,7 @@ ARCHIVE_EOF, OPEN_CALLBACK, READ_CALLBACK, CLOSE_CALLBACK, SEEK_CALLBACK, NO_OPEN_CB, NO_CLOSE_CB, page_size, ) -from .entry import ArchiveEntry, new_archive_entry +from .entry import ArchiveEntry, PassedArchiveEntry class ArchiveRead: @@ -20,13 +20,22 @@ def __iter__(self): """ archive_p = self._pointer read_next_header2 = 
ffi.read_next_header2 - with new_archive_entry() as entry_p: - entry = ArchiveEntry(archive_p, entry_p) - while 1: - r = read_next_header2(archive_p, entry_p) - if r == ARCHIVE_EOF: - return - yield entry + while 1: + entry = ArchiveEntry(archive_p) + r = read_next_header2(archive_p, entry._entry_p) + if r == ARCHIVE_EOF: + return + yield entry + entry.__class__ = PassedArchiveEntry + + @property + def bytes_read(self): + return ffi.filter_bytes(self._pointer, -1) + + @property + def filter_names(self): + count = ffi.filter_count(self._pointer) + return [ffi.filter_name(self._pointer, i) for i in range(count - 1)] @contextmanager diff --git a/libarchive/write.py b/libarchive/write.py index 024f03c..42b1c7b 100644 --- a/libarchive/write.py +++ b/libarchive/write.py @@ -1,15 +1,16 @@ from contextlib import contextmanager from ctypes import byref, cast, c_char, c_size_t, c_void_p, POINTER +from posixpath import join import warnings from . import ffi -from .entry import ArchiveEntry, new_archive_entry +from .entry import ArchiveEntry, FileType from .ffi import ( OPEN_CALLBACK, WRITE_CALLBACK, CLOSE_CALLBACK, NO_OPEN_CB, NO_CLOSE_CB, - REGULAR_FILE, DEFAULT_UNIX_PERMISSION, ARCHIVE_EOF, + ARCHIVE_EOF, page_size, entry_sourcepath, entry_clear, read_disk_new, read_disk_open_w, read_next_header2, read_disk_descend, read_free, write_header, write_data, - write_finish_entry, entry_set_size, entry_set_filetype, entry_set_perm, + write_finish_entry, read_disk_set_behavior ) @@ -42,10 +43,26 @@ def add_entries(self, entries): write_data(write_p, block, len(block)) write_finish_entry(write_p) - def add_files(self, *paths, **kw): - """Read the given paths from disk and add them to the archive. - - The keyword arguments (`**kw`) are passed to `new_archive_read_disk`. + def add_files( + self, *paths, flags=0, lookup=False, pathname=None, **attributes + ): + """Read files through the OS and add them to the archive. 
+ + Args: + paths (str): the paths of the files to add to the archive + flags (int): + passed to the C function `archive_read_disk_set_behavior`; + use the `libarchive.flags.READDISK_*` constants + lookup (bool): + when True, the C function `archive_read_disk_set_standard_lookup` + is called to enable the lookup of user and group names + pathname (str | None): + the path of the file in the archive, defaults to the source path + attributes (dict): passed to `ArchiveEntry.modify()` + + Raises: + ArchiveError: if a file doesn't exist or can't be accessed, or if + adding it to the archive fails """ write_p = self._pointer @@ -53,53 +70,58 @@ def add_files(self, *paths, **kw): if block_size <= 0: block_size = 10240 # pragma: no cover - with new_archive_entry() as entry_p: - entry = ArchiveEntry(None, entry_p) - for path in paths: - with new_archive_read_disk(path, **kw) as read_p: - while 1: - r = read_next_header2(read_p, entry_p) - if r == ARCHIVE_EOF: - break - entry.pathname = entry.pathname.lstrip('/') - read_disk_descend(read_p) - write_header(write_p, entry_p) - if entry.isreg: - with open(entry_sourcepath(entry_p), 'rb') as f: - while 1: - data = f.read(block_size) - if not data: - break - write_data(write_p, data, len(data)) - write_finish_entry(write_p) - entry_clear(entry_p) + entry = ArchiveEntry() + entry_p = entry._entry_p + destination_path = attributes.pop('pathname', None) + for path in paths: + with new_archive_read_disk(path, flags, lookup) as read_p: + while 1: + r = read_next_header2(read_p, entry_p) + if r == ARCHIVE_EOF: + break + entry_path = entry.pathname + if destination_path: + if entry_path == path: + entry_path = destination_path + else: + assert entry_path.startswith(path) + entry_path = join( + destination_path, + entry_path[len(path):].lstrip('/') + ) + entry.pathname = entry_path.lstrip('/') + if attributes: + entry.modify(**attributes) + read_disk_descend(read_p) + write_header(write_p, entry_p) + if entry.isreg: + with 
open(entry_sourcepath(entry_p), 'rb') as f: + while 1: + data = f.read(block_size) + if not data: + break + write_data(write_p, data, len(data)) + write_finish_entry(write_p) + entry_clear(entry_p) + + def add_file(self, path, **kw): + "Single-path alias of `add_files()`" + return self.add_files(path, **kw) def add_file_from_memory( self, entry_path, entry_size, entry_data, - filetype=REGULAR_FILE, permission=DEFAULT_UNIX_PERMISSION, - atime=None, mtime=None, ctime=None, birthtime=None, + filetype=FileType.REGULAR_FILE, permission=0o664, + **other_attributes ): """"Add file from memory to archive. - :param entry_path: where entry should be places in archive - :type entry_path: str - :param entry_size: entire size of entry in bytes - :type entry_size: int - :param entry_data: content of entry - :type entry_data: bytes or Iterable[bytes] - :param filetype: which type of file: normal, symlink etc. - should entry be created as - :type filetype: octal number - :param permission: with which permission should entry be created - :type permission: octal number - :param atime: Last access time - :type atime: int seconds or tuple (int seconds, int nanoseconds) - :param mtime: Last modified time - :type mtime: int seconds or tuple (int seconds, int nanoseconds) - :param ctime: Creation time - :type ctime: int seconds or tuple (int seconds, int nanoseconds) - :param birthtime: Birth time (for archive formats that support it) - :type birthtime: int seconds or tuple (int seconds, int nanoseconds) + Args: + entry_path (str): the file's path + entry_size (int): the file's size, in bytes + entry_data (bytes | Iterable[bytes]): the file's content + filetype (int): see `libarchive.entry.ArchiveEntry.modify()` + permission (int): see `libarchive.entry.ArchiveEntry.modify()` + other_attributes: see `libarchive.entry.ArchiveEntry.modify()` """ archive_pointer = self._pointer @@ -110,39 +132,18 @@ def add_file_from_memory( "entry_data: expected bytes, got %r" % type(entry_data) ) - with 
new_archive_entry() as archive_entry_pointer: - archive_entry = ArchiveEntry(None, archive_entry_pointer) - - archive_entry.pathname = entry_path - entry_set_size(archive_entry_pointer, entry_size) - entry_set_filetype(archive_entry_pointer, filetype) - entry_set_perm(archive_entry_pointer, permission) - - if atime is not None: - if not isinstance(atime, tuple): - atime = (atime, 0) - archive_entry.set_atime(*atime) - if mtime is not None: - if not isinstance(mtime, tuple): - mtime = (mtime, 0) - archive_entry.set_mtime(*mtime) - if ctime is not None: - if not isinstance(ctime, tuple): - ctime = (ctime, 0) - archive_entry.set_ctime(*ctime) - if birthtime is not None: - if not isinstance(birthtime, tuple): - birthtime = (birthtime, 0) - archive_entry.set_birthtime(*birthtime) - write_header(archive_pointer, archive_entry_pointer) - - for chunk in entry_data: - if not chunk: - break - write_data(archive_pointer, chunk, len(chunk)) - - write_finish_entry(archive_pointer) - entry_clear(archive_entry_pointer) + entry = ArchiveEntry( + pathname=entry_path, size=entry_size, filetype=filetype, + perm=permission, **other_attributes + ) + write_header(archive_pointer, entry._entry_p) + + for chunk in entry_data: + if not chunk: + break + write_data(archive_pointer, chunk, len(chunk)) + + write_finish_entry(archive_pointer) @contextmanager @@ -183,6 +184,10 @@ def new_archive_write(format_name, filter_name=None, options='', passphrase=None ffi.write_free(archive_p) raise + @property + def bytes_written(self): + return ffi.filter_bytes(self._pointer, -1) + @contextmanager def custom_writer( @@ -190,6 +195,11 @@ def custom_writer( open_func=None, close_func=None, block_size=page_size, archive_write_class=ArchiveWrite, options='', passphrase=None, ): + """Create an archive and send it in chunks to the `write_func` function. + + For formats and filters, see `WRITE_FORMATS` and `WRITE_FILTERS` in the + `libarchive.ffi` module. 
+ """ def write_cb_internal(archive_p, context, buffer_, length): data = cast(buffer_, POINTER(c_char * length))[0] @@ -212,6 +222,11 @@ def fd_writer( fd, format_name, filter_name=None, archive_write_class=ArchiveWrite, options='', passphrase=None, ): + """Create an archive and write it into a file descriptor. + + For formats and filters, see `WRITE_FORMATS` and `WRITE_FILTERS` in the + `libarchive.ffi` module. + """ with new_archive_write(format_name, filter_name, options, passphrase) as archive_p: ffi.write_open_fd(archive_p, fd) @@ -223,6 +238,11 @@ def file_writer( filepath, format_name, filter_name=None, archive_write_class=ArchiveWrite, options='', passphrase=None, ): + """Create an archive and write it into a file. + + For formats and filters, see `WRITE_FORMATS` and `WRITE_FILTERS` in the + `libarchive.ffi` module. + """ with new_archive_write(format_name, filter_name, options, passphrase) as archive_p: ffi.write_open_filename_w(archive_p, filepath) @@ -234,6 +254,11 @@ def memory_writer( buf, format_name, filter_name=None, archive_write_class=ArchiveWrite, options='', passphrase=None, ): + """Create an archive and write it into a buffer. + + For formats and filters, see `WRITE_FORMATS` and `WRITE_FILTERS` in the + `libarchive.ffi` module. + """ with new_archive_write(format_name, filter_name, options, passphrase) as archive_p: used = byref(c_size_t()) diff --git a/tests/test_entry.py b/tests/test_entry.py index 3768e4e..efbbe0c 100644 --- a/tests/test_entry.py +++ b/tests/test_entry.py @@ -7,7 +7,10 @@ from os.path import join import unicodedata +import pytest + from libarchive import memory_reader, memory_writer +from libarchive.entry import ArchiveEntry, ConsumedArchiveEntry, PassedArchiveEntry from . 
import data_dir, get_entries, get_tarinfos @@ -100,3 +103,35 @@ def check_entries(test_file, regen=False, ignore=''): if isinstance(d[key], text_type): d[key] = unicodedata.normalize('NFC', d[key]) assert e1 == e2 + + +def test_the_life_cycle_of_archive_entries(): + """Check that the `get_blocks` method only works on the current entry, and only once. + """ + # Create a test archive in memory + buf = bytes(bytearray(10_000_000)) + with memory_writer(buf, 'gnutar') as archive: + archive.add_files( + 'README.rst', + 'libarchive/__init__.py', + 'libarchive/entry.py', + ) + # Read multiple entries of the test archive and check how they evolve + with memory_reader(buf) as archive: + archive_iter = iter(archive) + entry1 = next(archive_iter) + assert type(entry1) is ArchiveEntry + for block in entry1.get_blocks(): + pass + assert type(entry1) is ConsumedArchiveEntry + with pytest.raises(TypeError): + entry1.get_blocks() + entry2 = next(archive_iter) + assert type(entry2) is ArchiveEntry + assert type(entry1) is PassedArchiveEntry + with pytest.raises(TypeError): + entry1.get_blocks() + entry3 = next(archive_iter) + assert type(entry3) is ArchiveEntry + assert type(entry2) is PassedArchiveEntry + assert type(entry1) is PassedArchiveEntry diff --git a/tests/test_rwx.py b/tests/test_rwx.py index 5e30650..6b819ae 100644 --- a/tests/test_rwx.py +++ b/tests/test_rwx.py @@ -97,22 +97,6 @@ def test_custom_writer_and_stream_reader(): check_archive(archive, tree) -def test_custom_writer_and_seekable_stream_reader(): - # Collect information on what should be in the archive - tree = treestat('libarchive') - - # Create an archive of our libarchive/ directory - stream = io.BytesIO() - with libarchive.custom_writer(stream.write, '7zip') as archive: - archive.add_files('libarchive/') - stream.seek(0) - - # Read the archive and check that the data is correct - with libarchive.seekable_stream_reader(stream, '7zip') as archive: - paths = [entry.name.rstrip('/') for entry in archive] - assert
sorted(paths) == sorted(tree) - - @patch('libarchive.ffi.write_fail') def test_write_fail(write_fail_mock): buf = bytes(bytearray(1000000)) @@ -133,6 +117,14 @@ def test_write_not_fail(write_fail_mock): assert not write_fail_mock.called +def test_adding_nonexistent_file_to_archive(): + stream = io.BytesIO() + with libarchive.custom_writer(stream.write, 'zip') as archive: + with pytest.raises(libarchive.ArchiveError): + archive.add_files('nonexistent') + archive.add_files('libarchive/') + + @pytest.mark.parametrize( 'archfmt,data_bytes', [('zip', b'content'), @@ -161,7 +153,8 @@ def write_callback(data): with libarchive.custom_writer(write_callback, archfmt) as archive: archive.add_file_from_memory( entry_path, entry_size, entry_data, - atime=atime, mtime=mtime, ctime=ctime, birthtime=btime + atime=atime, mtime=mtime, ctime=ctime, birthtime=btime, + uid=1000, gid=1000, ) buf = b''.join(blocks) @@ -178,3 +171,5 @@ def write_callback(data): assert archive_entry.birthtime in ( btime[0], format_time(*btime) ) + assert archive_entry.uid == 1000 + assert archive_entry.gid == 1000 diff --git a/tests/test_security_flags.py b/tests/test_security_flags.py index d0302da..f279eaf 100644 --- a/tests/test_security_flags.py +++ b/tests/test_security_flags.py @@ -3,8 +3,7 @@ import pytest import os -from libarchive import extract_file -from libarchive.ffi import version_number +from libarchive import extract_file, file_reader from libarchive.extract import ( EXTRACT_SECURE_NOABSOLUTEPATHS, EXTRACT_SECURE_NODOTDOT, ) @@ -12,26 +11,26 @@ from . 
import data_dir -def run_test(flag, filename): +def run_test(flags): archive_path = os.path.join(data_dir, 'flags.tar') try: - extract_file(archive_path) + extract_file(archive_path, 0) with pytest.raises(ArchiveError): - extract_file(archive_path, flag) + extract_file(archive_path, flags) finally: - if os.path.exists(filename): - os.remove(filename) + with file_reader(archive_path) as archive: + for entry in archive: + if os.path.exists(entry.pathname): + os.remove(entry.pathname) -def test_no_dot_dot(): - run_test(EXTRACT_SECURE_NODOTDOT, '../python-libarchive-c-test-dot-dot-file') +def test_extraction_is_secure_by_default(): + run_test(None) -def test_absolute(): - # EXTRACT_SECURE_NOABSOLUTEPATHS was only added in 3.1.900 - # 3.1.900 -> 3001009 - if version_number() >= 3001009: - run_test( - EXTRACT_SECURE_NOABSOLUTEPATHS, - '/tmp/python-libarchive-c-test-absolute-file' - ) +def test_explicit_no_dot_dot(): + run_test(EXTRACT_SECURE_NODOTDOT) + + +def test_explicit_no_absolute_paths(): + run_test(EXTRACT_SECURE_NOABSOLUTEPATHS) diff --git a/tox.ini b/tox.ini index ac6b4b7..9c0a989 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist=py36,py37,py38,py39 +envlist=py37,py38,py39 skipsdist=True [testenv]