diff --git a/.github/workflows/black.yaml b/.github/workflows/black.yaml index f382af79ad..6d473e0af3 100644 --- a/.github/workflows/black.yaml +++ b/.github/workflows/black.yaml @@ -12,4 +12,4 @@ jobs: - uses: actions/checkout@v4 - uses: psf/black@stable with: - version: "~= 23.0" + version: "~= 24.0" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4f91c3ed31..68fb3a5065 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -104,8 +104,7 @@ jobs: pip install -r requirements.d/development.txt - name: Install borgbackup run: | - # pip install -e . - python setup.py -v develop + pip install -e . - name: run tox env env: XDISTN: "4" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5b6d0d390a..16e3342170 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/psf/black - rev: 23.1.0 + rev: 24.8.0 hooks: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit diff --git a/README.rst b/README.rst index 00157d847c..f65cf9da80 100644 --- a/README.rst +++ b/README.rst @@ -69,7 +69,7 @@ Main features **Speed** * performance-critical code (chunking, compression, encryption) is implemented in C/Cython - * local caching of files/chunks index data + * local caching * quick detection of unmodified files **Data encryption** diff --git a/docs/changes.rst b/docs/changes.rst index 90be2b0bb2..3a330eedca 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -12,8 +12,8 @@ This section provides information about security and corruption issues. Upgrade Notes ============= -borg 1.2.x to borg 2.0 ----------------------- +borg 1.2.x/1.4.x to borg 2.0 +---------------------------- Compatibility notes: @@ -21,11 +21,11 @@ Compatibility notes: We tried to put all the necessary "breaking" changes into this release, so we hopefully do not need another breaking release in the near future. The changes - were necessary for improved security, improved speed, unblocking future - improvements, getting rid of legacy crap / design limitations, having less and - simpler code to maintain. + were necessary for improved security, improved speed and parallelism, + unblocking future improvements, getting rid of legacy crap and design + limitations, having less and simpler code to maintain. - You can use "borg transfer" to transfer archives from borg 1.1/1.2 repos to + You can use "borg transfer" to transfer archives from borg 1.2/1.4 repos to a new borg 2.0 repo, but it will need some time and space. Before using "borg transfer", you must have upgraded to borg >= 1.2.6 (or @@ -84,6 +84,7 @@ Compatibility notes: - removed --nobsdflags (use --noflags) - removed --noatime (default now, see also --atime) - removed --save-space option (does not change behaviour) +- removed --bypass-lock option - using --list together with --progress is now disallowed (except with --log-json), #7219 - the --glob-archives option was renamed to --match-archives (the short option name -a is unchanged) and extended to support different pattern styles: @@ -114,12 +115,61 @@ Compatibility notes: fail now that somehow "worked" before (but maybe didn't work as intended due to the contradicting options). - .. _changelog: Change Log 2.x ============== +Version 2.0.0b10 (2024-09-09) +----------------------------- + +TL;DR: this is a huge change and the first very fundamental change in how borg +works since ever: + +- you will need to create new repos. +- likely more exciting than previous betas, definitely not for production. 
+ +New features: + +- borgstore based repository, file:, ssh: and sftp: for now, more possible. +- repository stores objects separately now, not using segment files. + this has more fs overhead, but needs much less I/O because no segment + files compaction is required anymore. also, no repository index is + needed anymore because we can directly find the objects by their ID. +- locking: new borgstore based repository locking with automatic stale + lock removal (if lock does not get refreshed, if lock owner process is dead). +- simultaneous repository access for many borg commands except check/compact. + the cache lock for adhocwithfiles is still exclusive though, so use + BORG_CACHE_IMPL=adhoc if you want to try that out using only 1 machine + and 1 user (that implementation doesn't use a cache lock). When using + multiple client machines or users, it also works with the default cache. +- delete/prune: much quicker now and can be undone. +- check --repair --undelete-archives: bring archives back from the dead. +- rspace: manage reserved space in repository (avoid dead-end situation if + repository fs runs full). + +Bugs/issues fixed: + +- a lot! all linked from PR #8332. + +Other changes: + +- repository: remove transactions, solved differently and much simpler now + (convergence and write order primarily). +- repository: replaced precise reference counting with "object exists in repo?" + and "garbage collection of unused objects". +- cache: remove transactions, remove chunks cache. + removed LocalCache, BORG_CACHE_IMPL=local, solving all related issues. + as in beta 9, adhocwithfiles is the default implementation. +- compact: needs the borg key now (run it clientside), -v gives nice stats. +- transfer: archive transfers from borg 1.x need the --from-borg1 option. +- check: reimplemented / bigger changes. +- code: got rid of a metric ton of not needed complexity. + when borg does not need to read borg 1.x repos/archives anymore, after + users have transferred their archives, even much more can be removed. +- docs: updated / removed outdated stuff. + + Version 2.0.0b9 (2024-07-20) ---------------------------- diff --git a/docs/changes_1.x.rst b/docs/changes_1.x.rst index 3366a90b8c..714726c1c4 100644 --- a/docs/changes_1.x.rst +++ b/docs/changes_1.x.rst @@ -3469,7 +3469,7 @@ Other changes: - archiver tests: add check_cache tool - lints refcounts - fixed cache sync performance regression from 1.1.0b1 onwards, #1940 -- syncing the cache without chunks.archive.d (see :ref:`disable_archive_chunks`) +- syncing the cache without chunks.archive.d now avoids any merges and is thus faster, #1940 - borg check --verify-data: faster due to linear on-disk-order scan - borg debug-xxx commands removed, we use "debug xxx" subcommands now, #1627 diff --git a/docs/deployment/automated-local.rst b/docs/deployment/automated-local.rst index d34a70a7f6..dbc871511a 100644 --- a/docs/deployment/automated-local.rst +++ b/docs/deployment/automated-local.rst @@ -105,7 +105,7 @@ modify it to suit your needs (e.g. more backup sets, dumping databases etc.). # # Options for borg create - BORG_OPTS="--stats --one-file-system --compression lz4 --checkpoint-interval 86400" + BORG_OPTS="--stats --one-file-system --compression lz4" # Set BORG_PASSPHRASE or BORG_PASSCOMMAND somewhere around here, using export, # if encryption is used.
diff --git a/docs/deployment/hosting-repositories.rst b/docs/deployment/hosting-repositories.rst index 55fd3e15e4..b0efbf696c 100644 --- a/docs/deployment/hosting-repositories.rst +++ b/docs/deployment/hosting-repositories.rst @@ -68,8 +68,6 @@ can be filled to the specified quota. If storage quotas are used, ensure that all deployed Borg releases support storage quotas. -Refer to :ref:`internals_storage_quota` for more details on storage quotas. - **Specificities: Append-only repositories** Running ``borg init`` via a ``borg serve --append-only`` server will **not** diff --git a/docs/faq.rst b/docs/faq.rst index 0daa226ca6..e36fa3120e 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -14,7 +14,7 @@ What is the difference between a repo on an external hard drive vs. repo on a se If Borg is running in client/server mode, the client uses SSH as a transport to talk to the remote agent, which is another Borg process (Borg is installed on the server, too) started automatically by the client. The Borg server is doing -storage-related low-level repo operations (get, put, commit, check, compact), +storage-related low-level repo operations (list, load and store objects), while the Borg client does the high-level stuff: deduplication, encryption, compression, dealing with archives, backups, restores, etc., which reduces the amount of data that goes over the network. @@ -27,17 +27,7 @@ which is slower. Can I back up from multiple servers into a single repository? ------------------------------------------------------------- -In order for the deduplication used by Borg to work, it -needs to keep a local cache containing checksums of all file -chunks already stored in the repository. This cache is stored in -``~/.cache/borg/``. If Borg detects that a repository has been -modified since the local cache was updated it will need to rebuild -the cache. This rebuild can be quite time consuming. - -So, yes it's possible. But it will be most efficient if a single -repository is only modified from one place. Also keep in mind that -Borg will keep an exclusive lock on the repository while creating -or deleting archives, which may make *simultaneous* backups fail. +Yes, you can! Even simultaneously. Can I back up to multiple, swapped backup targets? -------------------------------------------------- @@ -124,50 +114,31 @@ Are there other known limitations? remove files which are in the destination, but not in the archive. See :issue:`4598` for a workaround and more details. -.. _checkpoints_parts: +.. _interrupted_backup: If a backup stops mid-way, does the already-backed-up data stay there? ---------------------------------------------------------------------- -Yes, Borg supports resuming backups. - -During a backup, a special checkpoint archive named ``.checkpoint`` -is saved at every checkpoint interval (the default value for this is 30 -minutes) containing all the data backed-up until that point. - -This checkpoint archive is a valid archive, but it is only a partial backup -(not all files that you wanted to back up are contained in it and the last file -in it might be a partial file). Having it in the repo until a successful, full -backup is completed is useful because it references all the transmitted chunks up -to the checkpoint. This means that in case of an interruption, you only need to -retransfer the data since the last checkpoint. 
+Yes, the data transferred into the repo stays there - just avoid running +``borg compact`` before you completed the backup, because that would remove +chunks that were already transferred to the repo, but not (yet) referenced +by an archive. If a backup was interrupted, you normally do not need to do anything special, -just invoke ``borg create`` as you always do. If the repository is still locked, -you may need to run ``borg break-lock`` before the next backup. You may use the -same archive name as in previous attempt or a different one (e.g. if you always -include the current datetime), it does not matter. +just invoke ``borg create`` as you always do. You may use the same archive name +as in previous attempt or a different one (e.g. if you always include the +current datetime), it does not matter. Borg always does full single-pass backups, so it will start again from the beginning - but it will be much faster, because some of the data was -already stored into the repo (and is still referenced by the checkpoint -archive), so it does not need to get transmitted and stored again. - -Once your backup has finished successfully, you can delete all -``.checkpoint`` archives. If you run ``borg prune``, it will -also care for deleting unneeded checkpoints. - -Note: the checkpointing mechanism may create a partial (truncated) last file -in a checkpoint archive named ``.borg_part``. Such partial files -won't be contained in the final archive. -This is done so that checkpoints work cleanly and promptly while a big -file is being processed. +already stored into the repo, so it does not need to get transmitted and stored +again. How can I back up huge file(s) over a unstable connection? ---------------------------------------------------------- -Yes. For more details, see :ref:`checkpoints_parts`. +Yes. For more details, see :ref:`interrupted_backup`. How can I restore huge file(s) over an unstable connection? ----------------------------------------------------------- @@ -220,23 +191,6 @@ Yes, if you want to detect accidental data damage (like bit rot), use the If you want to be able to detect malicious tampering also, use an encrypted repo. It will then be able to check using CRCs and HMACs. -Can I use Borg on SMR hard drives? ----------------------------------- - -SMR (shingled magnetic recording) hard drives are very different from -regular hard drives. Applications have to behave in certain ways or -performance will be heavily degraded. - -Borg ships with default settings suitable for SMR drives, -and has been successfully tested on *Seagate Archive v2* drives -using the ext4 file system. - -Some Linux kernel versions between 3.19 and 4.5 had various bugs -handling device-managed SMR drives, leading to IO errors, unresponsive -drives and unreliable operation in general. - -For more details, refer to :issue:`2252`. - .. _faq-integrityerror: I get an IntegrityError or similar - what now? @@ -355,7 +309,7 @@ Why is the time elapsed in the archive stats different from wall clock time? ---------------------------------------------------------------------------- Borg needs to write the time elapsed into the archive metadata before finalizing -the archive and committing the repo & cache. +the archive and saving the files cache. This means when Borg is run with e.g. the ``time`` command, the duration shown in the archive stats may be shorter than the full time the command runs for. @@ -391,8 +345,7 @@ will of course delete everything in the archive, not only some files. 
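The reason a re-run is cheap is deduplication: chunks whose IDs are already present in the repository are not transmitted or stored again. A minimal, self-contained sketch of that idea (illustrative only - fixed-size chunking, plain SHA-256 instead of borg's keyed id-hash, and an in-memory set standing in for the repository are all simplifications):

.. code-block:: python

    import hashlib

    stored = set()  # stands in for the object IDs already present in the repo

    def backup(path, chunk_size=4 * 1024 * 1024):
        """Re-runnable single-pass backup: chunks already in the repo are skipped."""
        uploaded = skipped = 0
        with open(path, "rb") as f:
            while chunk := f.read(chunk_size):
                chunk_id = hashlib.sha256(chunk).digest()  # borg uses a keyed MAC here
                if chunk_id in stored:
                    skipped += 1          # was transferred before the interruption
                else:
                    stored.add(chunk_id)  # "transmit and store" the chunk
                    uploaded += 1
        return uploaded, skipped

Calling ``backup()`` twice on the same file reports everything as skipped the second time - the same effect that makes a restarted ``borg create`` fast.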
:ref:`borg_recreate` command to rewrite all archives with a different ``--exclude`` pattern. See the examples in the manpage for more information. -Finally, run :ref:`borg_compact` with the ``--threshold 0`` option to delete the -data chunks from the repository. +Finally, run :ref:`borg_compact` to delete the data chunks from the repository. Can I safely change the compression level or algorithm? -------------------------------------------------------- @@ -402,6 +355,7 @@ are calculated *before* compression. New compression settings will only be applied to new chunks, not existing chunks. So it's safe to change them. +Use ``borg rcompress`` to efficiently recompress a complete repository. Security ######## @@ -704,38 +658,6 @@ serialized way in a single script, you need to give them ``--lock-wait N`` (with being a bit more than the time the server needs to terminate broken down connections and release the lock). -.. _disable_archive_chunks: - -The borg cache eats way too much disk space, what can I do? ------------------------------------------------------------ - -This may especially happen if borg needs to rebuild the local "chunks" index - -either because it was removed, or because it was not coherent with the -repository state any more (e.g. because another borg instance changed the -repository). - -To optimize this rebuild process, borg caches per-archive information in the -``chunks.archive.d/`` directory. It won't help the first time it happens, but it -will make the subsequent rebuilds faster (because it needs to transfer less data -from the repository). While being faster, the cache needs quite some disk space, -which might be unwanted. - -You can disable the cached archive chunk indexes by setting the environment -variable ``BORG_USE_CHUNKS_ARCHIVE`` to ``no``. - -This has some pros and cons, though: - -- much less disk space needs for ~/.cache/borg. -- chunk cache resyncs will be slower as it will have to transfer chunk usage - metadata for all archives from the repository (which might be slow if your - repo connection is slow) and it will also have to build the hashtables from - that data. - chunk cache resyncs happen e.g. if your repo was written to by another - machine (if you share same backup repo between multiple machines) or if - your local chunks cache was lost somehow. - -The long term plan to improve this is called "borgception", see :issue:`474`. - Can I back up my root partition (/) with Borg? ---------------------------------------------- @@ -779,7 +701,7 @@ This can make creation of the first archive slower, but saves time and disk space on subsequent runs. Here what Borg does when you run ``borg create``: - Borg chunks the file (using the relatively expensive buzhash algorithm) -- It then computes the "id" of the chunk (hmac-sha256 (often slow, except +- It then computes the "id" of the chunk (hmac-sha256 (slow, except if your CPU has sha256 acceleration) or blake2b (fast, in software)) - Then it checks whether this chunk is already in the repo (local hashtable lookup, fast). If so, the processing of the chunk is completed here. Otherwise it needs to @@ -790,9 +712,8 @@ and disk space on subsequent runs. Here what Borg does when you run ``borg creat - Transmits to repo. If the repo is remote, this usually involves an SSH connection (does its own encryption / authentication). - Stores the chunk into a key/value store (the key is the chunk id, the value - is the data). 
While doing that, it computes CRC32 / XXH64 of the data (repo low-level - checksum, used by borg check --repository) and also updates the repo index - (another hashtable). + is the data). While doing that, it computes XXH64 of the data (repo low-level + checksum, used by borg check --repository). Subsequent backups are usually very fast if most files are unchanged and only a few are new or modified. The high performance on unchanged files primarily depends @@ -826,10 +747,9 @@ If you feel your Borg backup is too slow somehow, here is what you can do: - Don't use any expensive compression. The default is lz4 and super fast. Uncompressed is often slower than lz4. - Just wait. You can also interrupt it and start it again as often as you like, - it will converge against a valid "completed" state (see ``--checkpoint-interval``, - maybe use the default, but in any case don't make it too short). It is starting + it will converge against a valid "completed" state. It is starting from the beginning each time, but it is still faster then as it does not store - data into the repo which it already has there from last checkpoint. + data into the repo which it already has there. - If you don’t need additional file attributes, you can disable them with ``--noflags``, ``--noacls``, ``--noxattrs``. This can lead to noticeable performance improvements when your backup consists of many small files. @@ -1021,6 +941,12 @@ To achieve this, run ``borg create`` within the mountpoint/snapshot directory: cd /mnt/rootfs borg create rootfs_backup . +Another way (without changing the directory) is to use the slashdot hack: + +:: + + borg create rootfs_backup /mnt/rootfs/./ + I am having troubles with some network/FUSE/special filesystem, why? -------------------------------------------------------------------- @@ -1100,16 +1026,6 @@ to make it behave correctly:: .. _workaround: https://unix.stackexchange.com/a/123236 -Can I disable checking for free disk space? -------------------------------------------- - -In some cases, the free disk space of the target volume is reported incorrectly. -This can happen for CIFS- or FUSE shares. If you are sure that your target volume -will always have enough disk space, you can use the following workaround to disable -checking for free disk space:: - - borg config -- additional_free_space -2T - How do I rename a repository? ----------------------------- @@ -1126,26 +1042,6 @@ It may be useful to set ``BORG_RELOCATED_REPO_ACCESS_IS_OK=yes`` to avoid the prompts when renaming multiple repositories or in a non-interactive context such as a script. See :doc:`deployment` for an example. -The repository quota size is reached, what can I do? ----------------------------------------------------- - -The simplest solution is to increase or disable the quota and resume the backup: - -:: - - borg config /path/to/repo storage_quota 0 - -If you are bound to the quota, you have to free repository space. The first to -try is running :ref:`borg_compact` to free unused backup space (see also -:ref:`separate_compaction`): - -:: - - borg compact /path/to/repo - -If your repository is already compacted, run :ref:`borg_prune` or -:ref:`borg_delete` to delete archives that you do not need anymore, and then run -``borg compact`` again. My backup disk is full, what can I do? -------------------------------------- @@ -1159,11 +1055,6 @@ conditions, but generally this should be avoided. 
If your backup disk is already full when Borg starts a write command like `borg create`, it will abort immediately and the repository will stay as-is. -If you run a backup that stops due to a disk running full, Borg will roll back, -delete the new segment file and thus freeing disk space automatically. There -may be a checkpoint archive left that has been saved before the disk got full. -You can keep it to speed up the next backup or delete it to get back more disk -space. Miscellaneous ############# diff --git a/docs/internals/compaction.odg b/docs/internals/compaction.odg deleted file mode 100644 index 8d193e009e..0000000000 Binary files a/docs/internals/compaction.odg and /dev/null differ diff --git a/docs/internals/compaction.png b/docs/internals/compaction.png deleted file mode 100644 index d5c53a680e..0000000000 Binary files a/docs/internals/compaction.png and /dev/null differ diff --git a/docs/internals/data-structures.rst b/docs/internals/data-structures.rst index 8d1562ff23..45de9ee348 100644 --- a/docs/internals/data-structures.rst +++ b/docs/internals/data-structures.rst @@ -19,63 +19,51 @@ discussion about internals`_ and also on static code analysis. Repository ---------- -.. Some parts of this description were taken from the Repository docstring - -Borg stores its data in a `Repository`, which is a file system based -transactional key-value store. Thus the repository does not know about -the concept of archives or items. - -Each repository has the following file structure: - -README - simple text file telling that this is a Borg repository - -config - repository configuration +Borg stores its data in a `Repository`, which is a key-value store and has +the following structure: + +config/ + readme + simple text object telling that this is a Borg repository + id + the unique repository ID encoded as hexadecimal number text + version + the repository version encoded as decimal number text + manifest + some data about the repository, binary + last-key-checked + repository check progress (partial checks, full checks' checkpointing), + path of last object checked as text + space-reserve.N + purely random binary data to reserve space, e.g. for disk-full emergencies + +There is a list of pointers to archive objects in this directory: + +archives/ + 0000... .. ffff... + +The actual data is stored into a nested directory structure, using the full +object ID as name. Each (encrypted and compressed) object is stored separately. data/ - directory where the actual data is stored - -hints.%d - hints for repository compaction - -index.%d - repository index - -lock.roster and lock.exclusive/* - used by the locking system to manage shared and exclusive locks - -Transactionality is achieved by using a log (aka journal) to record changes. The log is a series of numbered files -called segments_. Each segment is a series of log entries. The segment number together with the offset of each -entry relative to its segment start establishes an ordering of the log entries. This is the "definition" of -time for the purposes of the log. - -.. _config-file: - -Config file -~~~~~~~~~~~ + 00/ .. ff/ + 00/ .. ff/ + 0000... .. ffff... -Each repository has a ``config`` file which is a ``INI``-style file -and looks like this:: +keys/ + repokey + When using encryption in repokey mode, the encrypted, passphrase protected + key is stored here as a base64 encoded text. 
- [repository] - version = 2 - segments_per_dir = 1000 - max_segment_size = 524288000 - id = 57d6c1d52ce76a836b532b0e42e677dec6af9fca3673db511279358828a21ed6 +locks/ + used by the locking system to manage shared and exclusive locks. -This is where the ``repository.id`` is stored. It is a unique -identifier for repositories. It will not change if you move the -repository around so you can make a local transfer then decide to move -the repository to another (even remote) location at a later time. Keys ~~~~ -Repository keys are byte-strings of fixed length (32 bytes), they -don't have a particular meaning (except for the Manifest_). - -Normally the keys are computed like this:: +Repository object IDs (which are used as key into the key-value store) are +byte-strings of fixed length (256bit, 32 bytes), computed like this:: key = id = id_hash(plaintext_data) # plain = not encrypted, not compressed, not obfuscated @@ -84,247 +72,68 @@ The id_hash function depends on the :ref:`encryption mode `. As the id / key is used for deduplication, id_hash must be a cryptographically strong hash or MAC. -Segments -~~~~~~~~ - -Objects referenced by a key are stored inline in files (`segments`) of approx. -500 MB size in numbered subdirectories of ``repo/data``. The number of segments -per directory is controlled by the value of ``segments_per_dir``. If you change -this value in a non-empty repository, you may also need to relocate the segment -files manually. - -A segment starts with a magic number (``BORG_SEG`` as an eight byte ASCII string), -followed by a number of log entries. Each log entry consists of (in this order): - -* crc32 checksum (uint32): - - for PUT2: CRC32(size + tag + key + digest) - - for PUT: CRC32(size + tag + key + payload) - - for DELETE: CRC32(size + tag + key) - - for COMMIT: CRC32(size + tag) -* size (uint32) of the entry (including the whole header) -* tag (uint8): PUT(0), DELETE(1), COMMIT(2) or PUT2(3) -* key (256 bit) - only for PUT/PUT2/DELETE -* payload (size - 41 bytes) - only for PUT -* xxh64 digest (64 bit) = XXH64(size + tag + key + payload) - only for PUT2 -* payload (size - 41 - 8 bytes) - only for PUT2 - -PUT2 is new since repository version 2. For new log entries PUT2 is used. -PUT is still supported to read version 1 repositories, but not generated any more. -If we talk about ``PUT`` in general, it shall usually mean PUT2 for repository -version 2+. +Repository objects +~~~~~~~~~~~~~~~~~~ -Those files are strictly append-only and modified only once. +Each repository object is stored separately, under its ID into data/xx/yy/xxyy... -When an object is written to the repository a ``PUT`` entry is written -to the file containing the object id and payload. If an object is deleted -a ``DELETE`` entry is appended with the object id. +A repo object has a structure like this: -A ``COMMIT`` tag is written when a repository transaction is -committed. The segment number of the segment containing -a commit is the **transaction ID**. +* 32bit meta size +* 32bit data size +* 64bit xxh64(meta) +* 64bit xxh64(data) +* meta +* data -When a repository is opened any ``PUT`` or ``DELETE`` operations not -followed by a ``COMMIT`` tag are discarded since they are part of a -partial/uncommitted transaction. +The size and xxh64 hashes can be used for server-side corruption checks without +needing to decrypt anything (which would require the borg key). 
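As a concrete illustration of the layout above, here is a small sketch of how such an object could be read and its checksums verified server-side, without the borg key. This is not borg's own code; the little-endian packing and the integer encoding of the xxh64 values are assumptions, and ``xxhash`` is a third-party module:

.. code-block:: python

    import struct
    import xxhash  # third-party: pip install xxhash

    HEADER = struct.Struct("<IIQQ")  # meta size, data size, xxh64(meta), xxh64(data)

    def verify_object(path):
        """Check sizes and xxh64 hashes of one stored object (no decryption needed)."""
        with open(path, "rb") as f:
            blob = f.read()
        meta_size, data_size, meta_hash, data_hash = HEADER.unpack_from(blob)
        meta = blob[HEADER.size:HEADER.size + meta_size]
        data = blob[HEADER.size + meta_size:HEADER.size + meta_size + data_size]
        if len(meta) != meta_size or len(data) != data_size:
            return False  # truncated object
        return (xxhash.xxh64(meta).intdigest() == meta_hash
                and xxhash.xxh64(data).intdigest() == data_hash)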
-The size of individual segments is limited to 4 GiB, since the offset of entries -within segments is stored in a 32-bit unsigned integer in the repository index. +The overall size of repository objects varies from very small (a small source +file will be stored as a single repo object) to medium (big source files will +be cut into medium sized chunks of some MB). -Objects / Payload structure -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Metadata and data are separately encrypted and authenticated (depending on +the user's choices). -All data (the manifest, archives, archive item stream chunks and file data -chunks) is compressed, optionally obfuscated and encrypted. This produces some -additional metadata (size and compression information), which is separately -serialized and also encrypted. +See :ref:`data-encryption` for a graphic outlining the anatomy of the +encryption. -See :ref:`data-encryption` for a graphic outlining the anatomy of the encryption in Borg. -What you see at the bottom there is done twice: once for the data and once for the metadata. +Repo object metadata +~~~~~~~~~~~~~~~~~~~~ -An object (the payload part of a segment file log entry) must be like: +Metadata is a msgpacked (and encrypted/authenticated) dict with: -- length of encrypted metadata (16bit unsigned int) -- encrypted metadata (incl. encryption header), when decrypted: +- ctype (compression type 0..255) +- clevel (compression level 0..255) +- csize (overall compressed (and maybe obfuscated) data size) +- psize (only when obfuscated: payload size without the obfuscation trailer) +- size (uncompressed size of the data) - - msgpacked dict with: - - - ctype (compression type 0..255) - - clevel (compression level 0..255) - - csize (overall compressed (and maybe obfuscated) data size) - - psize (only when obfuscated: payload size without the obfuscation trailer) - - size (uncompressed size of the data) -- encrypted data (incl. encryption header), when decrypted: - - - compressed data (with an optional all-zero-bytes obfuscation trailer) - -This new, more complex repo v2 object format was implemented to be able to query the -metadata efficiently without having to read, transfer and decrypt the (usually much bigger) -data part. - -The metadata is encrypted not to disclose potentially sensitive information that could be -used for e.g. fingerprinting attacks. +Having this separately encrypted metadata makes it more efficient to query +the metadata without having to read, transfer and decrypt the (usually much +bigger) data part. The compression `ctype` and `clevel` is explained in :ref:`data-compression`. -Index, hints and integrity -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The **repository index** is stored in ``index.`` and is used to -determine an object's location in the repository. It is a HashIndex_, -a hash table using open addressing. - -It maps object keys_ to: - -* segment number (unit32) -* offset of the object's entry within the segment (uint32) -* size of the payload, not including the entry header (uint32) -* flags (uint32) - -The **hints file** is a msgpacked file named ``hints.``. -It contains: - -* version -* list of segments -* compact -* shadow_index -* storage_quota_use - -The **integrity file** is a msgpacked file named ``integrity.``. -It contains checksums of the index and hints files and is described in the -:ref:`Checksumming data structures ` section below. - -If the index or hints are corrupted, they are re-generated automatically. 
-If they are outdated, segments are replayed from the index state to the currently -committed transaction. - Compaction ~~~~~~~~~~ -For a given key only the last entry regarding the key, which is called current (all other entries are called -superseded), is relevant: If there is no entry or the last entry is a DELETE then the key does not exist. -Otherwise the last PUT defines the value of the key. - -By superseding a PUT (with either another PUT or a DELETE) the log entry becomes obsolete. A segment containing -such obsolete entries is called sparse, while a segment containing no such entries is called compact. - -Since writing a ``DELETE`` tag does not actually delete any data and -thus does not free disk space any log-based data store will need a -compaction strategy (somewhat analogous to a garbage collector). - -Borg uses a simple forward compacting algorithm, which avoids modifying existing segments. -Compaction runs when a commit is issued with ``compact=True`` parameter, e.g. -by the ``borg compact`` command (unless the :ref:`append_only_mode` is active). - -The compaction algorithm requires two inputs in addition to the segments themselves: - -(i) Which segments are sparse, to avoid scanning all segments (impractical). - Further, Borg uses a conditional compaction strategy: Only those - segments that exceed a threshold sparsity are compacted. - - To implement the threshold condition efficiently, the sparsity has - to be stored as well. Therefore, Borg stores a mapping ``(segment - id,) -> (number of sparse bytes,)``. - -(ii) Each segment's reference count, which indicates how many live objects are in a segment. - This is not strictly required to perform the algorithm. Rather, it is used to validate - that a segment is unused before deleting it. If the algorithm is incorrect, or the reference - count was not accounted correctly, then an assertion failure occurs. - -These two pieces of information are stored in the hints file (`hints.N`) -next to the index (`index.N`). - -Compaction may take some time if a repository has been kept in append-only mode -or ``borg compact`` has not been used for a longer time, which both has caused -the number of sparse segments to grow. - -Compaction processes sparse segments from oldest to newest; sparse segments -which don't contain enough deleted data to justify compaction are skipped. This -avoids doing e.g. 500 MB of writing current data to a new segment when only -a couple kB were deleted in a segment. - -Segments that are compacted are read in entirety. Current entries are written to -a new segment, while superseded entries are omitted. After each segment an intermediary -commit is written to the new segment. Then, the old segment is deleted -(asserting that the reference count diminished to zero), freeing disk space. - -A simplified example (excluding conditional compaction and with simpler -commit logic) showing the principal operation of compaction: - -.. figure:: compaction.png - :figwidth: 100% - :width: 100% - -(The actual algorithm is more complex to avoid various consistency issues, refer to -the ``borg.repository`` module for more comments and documentation on these issues.) - -.. _internals_storage_quota: - -Storage quotas -~~~~~~~~~~~~~~ - -Quotas are implemented at the Repository level. The active quota of a repository -is determined by the ``storage_quota`` `config` entry or a run-time override (via :ref:`borg_serve`). -The currently used quota is stored in the hints file. 
Operations (PUT and DELETE) during -a transaction modify the currently used quota: - -- A PUT adds the size of the *log entry* to the quota, - i.e. the length of the data plus the 41 byte header. -- A DELETE subtracts the size of the deleted log entry from the quota, - which includes the header. - -Thus, PUT and DELETE are symmetric and cancel each other out precisely. - -The quota does not track on-disk size overheads (due to conditional compaction -or append-only mode). In normal operation the inclusion of the log entry headers -in the quota act as a faithful proxy for index and hints overheads. - -By tracking effective content size, the client can *always* recover from a full quota -by deleting archives. This would not be possible if the quota tracked on-disk size, -since journaling DELETEs requires extra disk space before space is freed. -Tracking effective size on the other hand accounts DELETEs immediately as freeing quota. - -.. rubric:: Enforcing the quota - -The storage quota is meant as a robust mechanism for service providers, therefore -:ref:`borg_serve` has to enforce it without loopholes (e.g. modified clients). -The following sections refer to using quotas on remotely accessed repositories. -For local access, consider *client* and *serve* the same. -Accordingly, quotas cannot be enforced with local access, -since the quota can be changed in the repository config. - -The quota is enforcible only if *all* :ref:`borg_serve` versions -accessible to clients support quotas (see next section). Further, quota is -per repository. Therefore, ensure clients can only access a defined set of repositories -with their quotas set, using ``--restrict-to-repository``. +``borg compact`` is used to free repository space. It will: -If the client exceeds the storage quota the ``StorageQuotaExceeded`` exception is -raised. Normally a client could ignore such an exception and just send a ``commit()`` -command anyway, circumventing the quota. However, when ``StorageQuotaExceeded`` is raised, -it is stored in the ``transaction_doomed`` attribute of the repository. -If the transaction is doomed, then commit will re-raise this exception, aborting the commit. +- list all object IDs present in the repository +- read all archives and determine which object IDs are in use +- remove all unused objects from the repository +- inform / warn about anything remarkable it found: -The transaction_doomed indicator is reset on a rollback (which erases the quota-exceeding -state). + - warn about IDs used, but not present (data loss!) + - inform about IDs that reappeared that were previously lost +- compute statistics about: -.. rubric:: Compatibility with older servers and enabling quota after-the-fact + - compression and deduplication factors + - repository space usage and space freed -If no quota data is stored in the hints file, Borg assumes zero quota is used. -Thus, if a repository with an enabled quota is written to with an older ``borg serve`` -version that does not understand quotas, then the quota usage will be erased. - -The client version is irrelevant to the storage quota and has no part in it. -The form of error messages due to exceeding quota varies with client versions. - -A similar situation arises when upgrading from a Borg release that did not have quotas. -Borg will start tracking quota use from the time of the upgrade, starting at zero. 
- -If the quota shall be enforced accurately in these cases, either - -- delete the ``index.N`` and ``hints.N`` files, forcing Borg to rebuild both, - re-acquiring quota data in the process, or -- edit the msgpacked ``hints.N`` file (not recommended and thus not - documented further). The object graph ---------------- @@ -344,10 +153,10 @@ More on how this helps security in :ref:`security_structural_auth`. The manifest ~~~~~~~~~~~~ -The manifest is the root of the object hierarchy. It references -all archives in a repository, and thus all data in it. -Since no object references it, it cannot be stored under its ID key. -Instead, the manifest has a fixed all-zero key. +Compared to borg 1.x: + +- the manifest moved from object ID 0 to config/manifest +- the archives list has been moved from the manifest to archives/* The manifest is rewritten each time an archive is created, deleted, or modified. It looks like this: @@ -523,17 +332,18 @@ these may/may not be implemented and purely serve as examples. Archives ~~~~~~~~ -Each archive is an object referenced by the manifest. The archive object -itself does not store any of the data contained in the archive it describes. +Each archive is an object referenced by an entry below archives/. +The archive object itself does not store any of the data contained in the +archive it describes. Instead, it contains a list of chunks which form a msgpacked stream of items_. The archive object itself further contains some metadata: * *version* -* *name*, which might differ from the name set in the manifest. +* *name*, which might differ from the name set in the archives/* object. When :ref:`borg_check` rebuilds the manifest (e.g. if it was corrupted) and finds more than one archive object with the same name, it adds a counter to the name - in the manifest, but leaves the *name* field of the archives as it was. + in archives/*, but leaves the *name* field of the archives as they were. * *item_ptrs*, a list of "pointer chunk" IDs. Each "pointer chunk" contains a list of chunk IDs of item metadata. * *command_line*, the command line which was used to create the archive @@ -676,7 +486,7 @@ In memory, the files cache is a key -> value mapping (a Python *dict*) and conta - file size - file ctime_ns (or mtime_ns) - age (0 [newest], 1, 2, 3, ..., BORG_FILES_CACHE_TTL - 1) - - list of chunk ids representing the file's contents + - list of chunk (id, size) tuples representing the file's contents To determine whether a file has not changed, cached values are looked up via the key in the mapping and compared to the current file attribute values. @@ -717,9 +527,9 @@ The on-disk format of the files cache is a stream of msgpacked tuples (key, valu Loading the files cache involves reading the file, one msgpack object at a time, unpacking it, and msgpacking the value (in an effort to save memory). -The **chunks cache** is stored in ``cache/chunks`` and is used to determine -whether we already have a specific chunk, to count references to it and also -for statistics. +The **chunks cache** is not persisted to disk, but dynamically built in memory +by querying the existing object IDs from the repository. +It is used to determine whether we already have a specific chunk. The chunks cache is a key -> value mapping and contains: @@ -728,14 +538,10 @@ The chunks cache is a key -> value mapping and contains: - chunk id_hash * value: - - reference count - - size + - reference count (always MAX_VALUE as we do not refcount anymore) + - size (0 for prev. 
existing objects, we can't query their plaintext size) -The chunks cache is a HashIndex_. Due to some restrictions of HashIndex, -the reference count of each given chunk is limited to a constant, MAX_VALUE -(introduced below in HashIndex_), approximately 2**32. -If a reference count hits MAX_VALUE, decrementing it yields MAX_VALUE again, -i.e. the reference count is pinned to MAX_VALUE. +The chunks cache is a HashIndex_. .. _cache-memory-usage: @@ -747,14 +553,12 @@ Here is the estimated memory usage of Borg - it's complicated:: chunk_size ~= 2 ^ HASH_MASK_BITS (for buzhash chunker, BLOCK_SIZE for fixed chunker) chunk_count ~= total_file_size / chunk_size - repo_index_usage = chunk_count * 48 - chunks_cache_usage = chunk_count * 40 - files_cache_usage = total_file_count * 240 + chunk_count * 80 + files_cache_usage = total_file_count * 240 + chunk_count * 165 - mem_usage ~= repo_index_usage + chunks_cache_usage + files_cache_usage - = chunk_count * 164 + total_file_count * 240 + mem_usage ~= chunks_cache_usage + files_cache_usage + = chunk_count * 205 + total_file_count * 240 Due to the hashtables, the best/usual/worst cases for memory allocation can be estimated like that:: @@ -772,11 +576,9 @@ It is also assuming that typical chunk size is 2^HASH_MASK_BITS (if you have a lot of files smaller than this statistical medium chunk size, you will have more chunks than estimated above, because 1 file is at least 1 chunk). -If a remote repository is used the repo index will be allocated on the remote side. - -The chunks cache, files cache and the repo index are all implemented as hash -tables. A hash table must have a significant amount of unused entries to be -fast - the so-called load factor gives the used/unused elements ratio. +The chunks cache and files cache are all implemented as hash tables. +A hash table must have a significant amount of unused entries to be fast - +the so-called load factor gives the used/unused elements ratio. When a hash table gets full (load factor getting too high), it needs to be grown (allocate new, bigger hash table, copy all elements over to it, free old @@ -802,7 +604,7 @@ b) with ``create --chunker-params buzhash,19,23,21,4095`` (default): HashIndex --------- -The chunks cache and the repository index are stored as hash tables, with +The chunks cache is implemented as a hash table, with only one slot per bucket, spreading hash collisions to the following buckets. As a consequence the hash is just a start position for a linear search. If a key is looked up that is not in the table, then the hash table @@ -905,7 +707,7 @@ Both modes ~~~~~~~~~~ Encryption keys (and other secrets) are kept either in a key file on the client -('keyfile' mode) or in the repository config on the server ('repokey' mode). +('keyfile' mode) or in the repository under keys/repokey ('repokey' mode). In both cases, the secrets are generated from random and then encrypted by a key derived from your passphrase (this happens on the client before the key is stored into the keyfile or as repokey). @@ -923,8 +725,7 @@ Key files When initializing a repository with one of the "keyfile" encryption modes, Borg creates an associated key file in ``$HOME/.config/borg/keys``. -The same key is also used in the "repokey" modes, which store it in the repository -in the configuration file. +The same key is also used in the "repokey" modes, which store it in the repository. The internal data structure is as follows: @@ -1016,11 +817,10 @@ methods in one repo does not influence deduplication. 
See ``borg create --help`` about how to specify the compression level and its default. -Lock files ----------- +Lock files (fslocking) +---------------------- -Borg uses locks to get (exclusive or shared) access to the cache and -the repository. +Borg uses filesystem locks to get (exclusive or shared) access to the cache. The locking system is based on renaming a temporary directory to `lock.exclusive` (for @@ -1037,24 +837,46 @@ to `lock.exclusive`, it has the lock for it. If renaming fails denotes a thread on the host which is still alive), lock acquisition fails. The cache lock is usually in `~/.cache/borg/REPOID/lock.*`. -The repository lock is in `repository/lock.*`. + +Locks (storelocking) +-------------------- + +To implement locking based on ``borgstore``, borg stores objects below locks/. + +The objects contain: + +- a timestamp when lock was created (or refreshed) +- host / process / thread information about lock owner +- lock type: exclusive or shared + +Using that information, borg implements: + +- lock auto-expiry: if a lock is old and has not been refreshed in time, + it will be automatically ignored and deleted. the primary purpose of this + is to get rid of stale locks by borg processes on other machines. +- lock auto-removal if the owner process is dead. the primary purpose of this + is to quickly get rid of stale locks by borg processes on the same machine. + +Breaking the locks +------------------ In case you run into troubles with the locks, you can use the ``borg break-lock`` command after you first have made sure that no Borg process is running on any machine that accesses this resource. Be very careful, the cache or repository might get damaged if multiple processes use it at the same time. +If there is an issue just with the repository lock, it will usually resolve +automatically (see above), just retry later. + + Checksumming data structures ---------------------------- As detailed in the previous sections, Borg generates and stores various files -containing important meta data, such as the repository index, repository hints, -chunks caches and files cache. +containing important meta data, such as the files cache. -Data corruption in these files can damage the archive data in a repository, -e.g. due to wrong reference counts in the chunks cache. Only some parts of Borg -were designed to handle corrupted data structures, so a corrupted files cache -may cause crashes or write incorrect archives. +Data corruption in the files cache could create incorrect archives, e.g. due +to wrong object IDs or sizes in the files cache. Therefore, Borg calculates checksums when writing these files and tests checksums when reading them. Checksums are generally 64-bit XXH64 hashes. @@ -1086,11 +908,11 @@ xxHash was expressly designed for data blocks of these sizes. Lower layer — file_integrity ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To accommodate the different transaction models used for the cache and repository, -there is a lower layer (borg.crypto.file_integrity.IntegrityCheckedFile) -wrapping a file-like object, performing streaming calculation and comparison of checksums. -Checksum errors are signalled by raising an exception (borg.crypto.file_integrity.FileIntegrityError) -at the earliest possible moment. +There is a lower layer (borg.crypto.file_integrity.IntegrityCheckedFile) +wrapping a file-like object, performing streaming calculation and comparison +of checksums. 
+Checksum errors are signalled by raising an exception at the earliest possible +moment (borg.crypto.file_integrity.FileIntegrityError). .. rubric:: Calculating checksums @@ -1134,19 +956,13 @@ The *digests* key contains a mapping of part names to their digests. Integrity data is generally stored by the upper layers, introduced below. An exception is the DetachedIntegrityCheckedFile, which automatically writes and reads it from a ".integrity" file next to the data file. -It is used for archive chunks indexes in chunks.archive.d. Upper layer ~~~~~~~~~~~ -Storage of integrity data depends on the component using it, since they have -different transaction mechanisms, and integrity data needs to be -transacted with the data it is supposed to protect. - .. rubric:: Main cache files: chunks and files cache -The integrity data of the ``chunks`` and ``files`` caches is stored in the -cache ``config``, since all three are transacted together. +The integrity data of the ``files`` cache is stored in the cache ``config``. The ``[integrity]`` section is used: @@ -1162,7 +978,7 @@ The ``[integrity]`` section is used: [integrity] manifest = 10e...21c - chunks = {"algorithm": "XXH64", "digests": {"HashHeader": "eab...39e3", "final": "e2a...b24"}} + files = {"algorithm": "XXH64", "digests": {"HashHeader": "eab...39e3", "final": "e2a...b24"}} The manifest ID is duplicated in the integrity section due to the way all Borg versions handle the config file. Instead of creating a "new" config file from @@ -1182,52 +998,6 @@ easy to tell whether the checksums concern the current state of the cache. Integrity errors are fatal in these files, terminating the program, and are not automatically corrected at this time. -.. rubric:: chunks.archive.d - -Indices in chunks.archive.d are not transacted and use DetachedIntegrityCheckedFile, -which writes the integrity data to a separate ".integrity" file. - -Integrity errors result in deleting the affected index and rebuilding it. -This logs a warning and increases the exit code to WARNING (1). - -.. _integrity_repo: - -.. rubric:: Repository index and hints - -The repository associates index and hints files with a transaction by including the -transaction ID in the file names. Integrity data is stored in a third file -("integrity."). Like the hints file, it is msgpacked: - -.. code-block:: python - - { - 'version': 2, - 'hints': '{"algorithm": "XXH64", "digests": {"final": "411208db2aa13f1a"}}', - 'index': '{"algorithm": "XXH64", "digests": {"HashHeader": "846b7315f91b8e48", "final": "cb3e26cadc173e40"}}' - } - -The *version* key started at 2, the same version used for the hints. Since Borg has -many versioned file formats, this keeps the number of different versions in use -a bit lower. - -The other keys map an auxiliary file, like *index* or *hints* to their integrity data. -Note that the JSON is stored as-is, and not as part of the msgpack structure. - -Integrity errors result in deleting the affected file(s) (index/hints) and rebuilding the index, -which is the same action taken when corruption is noticed in other ways (e.g. HashIndex can -detect most corrupted headers, but not data corruption). A warning is logged as well. -The exit code is not influenced, since remote repositories cannot perform that action. -Raising the exit code would be possible for local repositories, but is not implemented. 
- -Unlike the cache design this mechanism can have false positives whenever an older version -*rewrites* the auxiliary files for a transaction created by a newer version, -since that might result in a different index (due to hash-table resizing) or hints file -(hash ordering, or the older version 1 format), while not invalidating the integrity file. - -For example, using 1.1 on a repository, noticing corruption or similar issues and then running -``borg-1.0 check --repair``, which rewrites the index and hints, results in this situation. -Borg 1.1 would erroneously report checksum errors in the hints and/or index files and trigger -an automatic rebuild of these files. HardLinkManager and the hlid concept ------------------------------------ diff --git a/docs/internals/object-graph.odg b/docs/internals/object-graph.odg index c4060e6ee4..7b7cadfa0b 100644 Binary files a/docs/internals/object-graph.odg and b/docs/internals/object-graph.odg differ diff --git a/docs/internals/object-graph.png b/docs/internals/object-graph.png index f0d1f0001c..503f164998 100644 Binary files a/docs/internals/object-graph.png and b/docs/internals/object-graph.png differ diff --git a/docs/internals/security.rst b/docs/internals/security.rst index 68bd647b29..aada49e3f2 100644 --- a/docs/internals/security.rst +++ b/docs/internals/security.rst @@ -31,14 +31,14 @@ deleted between attacks). Under these circumstances Borg guarantees that the attacker cannot 1. modify the data of any archive without the client detecting the change -2. rename, remove or add an archive without the client detecting the change +2. rename or add an archive without the client detecting the change 3. recover plain-text data 4. recover definite (heuristics based on access patterns are possible) structural information such as the object graph (which archives refer to what chunks) The attacker can always impose a denial of service per definition (he could -forbid connections to the repository, or delete it entirely). +forbid connections to the repository, or delete it partly or entirely). .. _security_structural_auth: @@ -47,12 +47,12 @@ Structural Authentication ------------------------- Borg is fundamentally based on an object graph structure (see :ref:`internals`), -where the root object is called the manifest. +where the root objects are the archives. Borg follows the `Horton principle`_, which states that not only the message must be authenticated, but also its meaning (often expressed through context), because every object used is referenced by a -parent object through its object ID up to the manifest. The object ID in +parent object through its object ID up to the archive list entry. The object ID in Borg is a MAC of the object's plaintext, therefore this ensures that an attacker cannot change the context of an object without forging the MAC. @@ -64,8 +64,8 @@ represent packed file metadata. On their own, it's not clear that these objects would represent what they do, but by the archive item referring to them in a particular part of its own data structure assigns this meaning. -This results in a directed acyclic graph of authentication from the manifest -to the data chunks of individual files. +This results in a directed acyclic graph of authentication from the archive +list entry to the data chunks of individual files. Above used to be all for borg 1.x and was the reason why it needed the tertiary authentication mechanism (TAM) for manifest and archives. 
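To make the "object ID is a MAC of the plaintext" property concrete, here is a minimal sketch (assuming the hmac-sha256 id-hash mode; the key value below is hypothetical and not how borg derives its keys):

.. code-block:: python

    import hashlib
    import hmac

    def id_hash(id_key: bytes, plaintext: bytes) -> bytes:
        """Object ID = MAC of the object's plaintext (hmac-sha256 mode assumed)."""
        return hmac.new(id_key, plaintext, hashlib.sha256).digest()

    id_key = bytes(32)                  # hypothetical client-side secret
    chunk = b"packed file metadata"
    child_id = id_hash(id_key, chunk)   # the ID a parent object would reference

    # Without id_key, an attacker cannot craft different content that still
    # matches the referenced ID, so tampering anywhere in the DAG is detected.
    assert id_hash(id_key, chunk) == child_id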
@@ -80,11 +80,23 @@ the object ID (via giving the ID as AAD), there is no way an attacker (without access to the borg key) could change the type of the object or move content to a different object ID. -This effectively 'anchors' the manifest (and also other metadata, like archives) -to the key, which is controlled by the client, thereby anchoring the entire DAG, -making it impossible for an attacker to add, remove or modify any part of the +This effectively 'anchors' each archive to the key, which is controlled by the +client, thereby anchoring the DAG starting from the archives list entry, +making it impossible for an attacker to add or modify any part of the DAG without Borg being able to detect the tampering. +Please note that removing an archive by removing an entry from archives/* +is possible and is done by ``borg delete`` and ``borg prune`` within their +normal operation. An attacker could also remove some entries there, but, due to +encryption, would not know what exactly they are removing. An attacker with +repository access could also remove other parts of the repository or the whole +repository, so there is not much point in protecting against archive removal. + +The borg 1.x way of having the archives list within the manifest chunk was +problematic as it required a read-modify-write operation on the manifest, +requiring a lock on the repository. We want to try less locking and more +parallelism in future. + Passphrase notes ---------------- diff --git a/docs/man/borg-benchmark-cpu.1 b/docs/man/borg-benchmark-cpu.1 index 50cca65ace..7d55244ed6 100644 --- a/docs/man/borg-benchmark-cpu.1 +++ b/docs/man/borg-benchmark-cpu.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-BENCHMARK-CPU" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-BENCHMARK-CPU" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-benchmark-cpu \- Benchmark CPU bound operations. .SH SYNOPSIS diff --git a/docs/man/borg-benchmark-crud.1 b/docs/man/borg-benchmark-crud.1 index e4efc752fa..2e066e7971 100644 --- a/docs/man/borg-benchmark-crud.1 +++ b/docs/man/borg-benchmark-crud.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-BENCHMARK-CRUD" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-BENCHMARK-CRUD" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-benchmark-crud \- Benchmark Create, Read, Update, Delete for archives. .SH SYNOPSIS diff --git a/docs/man/borg-benchmark.1 b/docs/man/borg-benchmark.1 index 4dd0fcb5f4..ca1227810f 100644 --- a/docs/man/borg-benchmark.1 +++ b/docs/man/borg-benchmark.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-BENCHMARK" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-BENCHMARK" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-benchmark \- benchmark command .SH SYNOPSIS diff --git a/docs/man/borg-break-lock.1 b/docs/man/borg-break-lock.1 index 3bb5d24419..4df141d382 100644 --- a/docs/man/borg-break-lock.1 +++ b/docs/man/borg-break-lock.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. 
-.TH "BORG-BREAK-LOCK" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-BREAK-LOCK" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-break-lock \- Break the repository lock (e.g. in case it was left by a dead borg. .SH SYNOPSIS diff --git a/docs/man/borg-check.1 b/docs/man/borg-check.1 index b4040573c4..8c674a294b 100644 --- a/docs/man/borg-check.1 +++ b/docs/man/borg-check.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-CHECK" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-CHECK" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-check \- Check repository consistency .SH SYNOPSIS @@ -40,8 +40,8 @@ It consists of two major steps: .INDENT 0.0 .IP 1. 3 Checking the consistency of the repository itself. This includes checking -the segment magic headers, and both the metadata and data of all objects in -the segments. The read data is checked by size and CRC. Bit rot and other +the file magic headers, and both the metadata and data of all objects in +the repository. The read data is checked by size and hash. Bit rot and other types of accidental damage can be detected this way. Running the repository check can be split into multiple partial checks using \fB\-\-max\-duration\fP\&. When checking a remote repository, please note that the checks run on the @@ -77,13 +77,12 @@ archive checks, nor enable repair mode. Consequently, if you want to use .sp \fBWarning:\fP Please note that partial repository checks (i.e. running it with \fB\-\-max\-duration\fP) can only perform non\-cryptographic checksum checks on the -segment files. A full repository check (i.e. without \fB\-\-max\-duration\fP) can -also do a repository index check. Enabling partial repository checks excepts -archive checks for the same reason. Therefore partial checks may be useful with -very large repositories only where a full check would take too long. +repository files. Enabling partial repository checks excepts archive checks +for the same reason. Therefore partial checks may be useful with very large +repositories only where a full check would take too long. .sp The \fB\-\-verify\-data\fP option will perform a full integrity verification (as -opposed to checking the CRC32 of the segment) of data, which means reading the +opposed to checking just the xxh64) of data, which means reading the data from the repository, decrypting and decompressing it. It is a complete cryptographic verification and hence very time consuming, but will detect any accidental and malicious corruption. Tamper\-resistance is only guaranteed for @@ -122,17 +121,15 @@ by definition, a potentially lossy task. In practice, repair mode hooks into both the repository and archive checks: .INDENT 0.0 .IP 1. 3 -When checking the repository\(aqs consistency, repair mode will try to recover -as many objects from segments with integrity errors as possible, and ensure -that the index is consistent with the data stored in the segments. +When checking the repository\(aqs consistency, repair mode removes corrupted +objects from the repository after it did a 2nd try to read them correctly. .IP 2. 3 When checking the consistency and correctness of archives, repair mode might remove whole archives from the manifest if their archive metadata chunk is corrupt or lost. On a chunk level (i.e. the contents of files), repair mode will replace corrupt or lost chunks with a same\-size replacement chunk of zeroes. 
If a previously zeroed chunk reappears, repair mode will restore -this lost chunk using the new chunk. Lastly, repair mode will also delete -orphaned chunks (e.g. caused by read errors while creating the archive). +this lost chunk using the new chunk. .UNINDENT .sp Most steps taken by repair mode have a one\-time effect on the repository, like @@ -152,6 +149,12 @@ replace the all\-zero replacement chunk by the reappeared chunk. If all lost chunks of a \(dqzero\-patched\(dq file reappear, this effectively \(dqheals\(dq the file. Consequently, if lost chunks were repaired earlier, it is advised to run \fB\-\-repair\fP a second time after creating some new backups. +.sp +If \fB\-\-repair \-\-undelete\-archives\fP is given, Borg will scan the repository +for archive metadata and if it finds some where no corresponding archives +directory entry exists, it will create the entries. This is basically undoing +\fBborg delete archive\fP or \fBborg prune ...\fP commands and only possible before +\fBborg compact\fP would remove the archives\(aq data completely. .SH OPTIONS .sp See \fIborg\-common(1)\fP for common options of Borg commands. @@ -170,6 +173,9 @@ perform cryptographic archive data integrity verification (conflicts with \fB\-\ .B \-\-repair attempt to repair any inconsistencies found .TP +.B \-\-undelete\-archives +attempt to undelete archives (use with \-\-repair) +.TP .BI \-\-max\-duration \ SECONDS do only a partial repo check for max. SECONDS seconds (Default: unlimited) .UNINDENT diff --git a/docs/man/borg-common.1 b/docs/man/borg-common.1 index ad10ac4a9c..7b71033db7 100644 --- a/docs/man/borg-common.1 +++ b/docs/man/borg-common.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-COMMON" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-COMMON" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-common \- Common options of Borg commands .SH SYNOPSIS @@ -64,10 +64,7 @@ format using IEC units (1KiB = 1024B) Output one JSON object per log line instead of formatted text. .TP .BI \-\-lock\-wait \ SECONDS -wait at most SECONDS for acquiring a repository/cache lock (default: 1). -.TP -.B \-\-bypass\-lock -Bypass locking mechanism +wait at most SECONDS for acquiring a repository/cache lock (default: 10). .TP .B \-\-show\-version show/log the borg version diff --git a/docs/man/borg-compact.1 b/docs/man/borg-compact.1 index 31c5d1cdb2..8b4fc4caf0 100644 --- a/docs/man/borg-compact.1 +++ b/docs/man/borg-compact.1 @@ -27,40 +27,25 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-COMPACT" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-COMPACT" 1 "2024-09-08" "" "borg backup tool" .SH NAME -borg-compact \- compact segment files in the repository +borg-compact \- Collect garbage in repository .SH SYNOPSIS .sp borg [common options] compact [options] .SH DESCRIPTION .sp -This command frees repository space by compacting segments. +Free repository space by deleting unused chunks. .sp -Use this regularly to avoid running out of space \- you do not need to use this -after each borg command though. It is especially useful after deleting archives, -because only compaction will really free repository space. +borg compact analyzes all existing archives to find out which chunks are +actually used. 
There might be unused chunks resulting from borg delete or prune, +which can be removed to free space in the repository. .sp -borg compact does not need a key, so it is possible to invoke it from the -client or also from the server. -.sp -Depending on the amount of segments that need compaction, it may take a while, -so consider using the \fB\-\-progress\fP option. -.sp -A segment is compacted if the amount of saved space is above the percentage value -given by the \fB\-\-threshold\fP option. If omitted, a threshold of 10% is used. -When using \fB\-\-verbose\fP, borg will output an estimate of the freed space. -.sp -See \fIseparate_compaction\fP in Additional Notes for more details. +Differently than borg 1.x, borg2\(aqs compact needs the borg key if the repo is +encrypted. .SH OPTIONS .sp See \fIborg\-common(1)\fP for common options of Borg commands. -.SS optional arguments -.INDENT 0.0 -.TP -.BI \-\-threshold \ PERCENT -set minimum threshold for saved space in PERCENT (Default: 10) -.UNINDENT .SH EXAMPLES .INDENT 0.0 .INDENT 3.5 diff --git a/docs/man/borg-compression.1 b/docs/man/borg-compression.1 index 22c69ce710..0d94a8adf2 100644 --- a/docs/man/borg-compression.1 +++ b/docs/man/borg-compression.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-COMPRESSION" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-COMPRESSION" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-compression \- Details regarding compression .SH DESCRIPTION diff --git a/docs/man/borg-create.1 b/docs/man/borg-create.1 index d0616c639d..c507178ebf 100644 --- a/docs/man/borg-create.1 +++ b/docs/man/borg-create.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-CREATE" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-CREATE" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-create \- Create new archive .SH SYNOPSIS @@ -53,9 +53,7 @@ stdin\fP below for details. The archive will consume almost no disk space for files or parts of files that have already been stored in other archives. .sp -The archive name needs to be unique. It must not end in \(aq.checkpoint\(aq or -\(aq.checkpoint.N\(aq (with N being a number), because these names are used for -checkpoints and treated in special ways. +The archive name needs to be unique. .sp In the archive name, you may use the following placeholders: {now}, {utcnow}, {fqdn}, {hostname}, {user} and some others. @@ -155,12 +153,6 @@ only display items with the given status characters (see description) .B \-\-json output stats as JSON. Implies \fB\-\-stats\fP\&. .TP -.B \-\-no\-cache\-sync -experimental: do not synchronize the chunks cache. -.TP -.B \-\-no\-cache\-sync\-forced -experimental: do not synchronize the chunks cache (forced). -.TP .B \-\-prefer\-adhoc\-cache experimental: prefer AdHocCache (w/o files cache) over AdHocWithFilesCache (with files cache). .TP @@ -260,12 +252,6 @@ add a comment text to the archive .BI \-\-timestamp \ TIMESTAMP manually specify the archive creation date/time (yyyy\-mm\-ddThh:mm:ss[(+|\-)HH:MM] format, (+|\-)HH:MM is the UTC offset, default: local time zone). Alternatively, give a reference file/directory. 
.TP -.BI \-c \ SECONDS\fR,\fB \ \-\-checkpoint\-interval \ SECONDS -write checkpoint every SECONDS seconds (Default: 1800) -.TP -.BI \-\-checkpoint\-volume \ BYTES -write checkpoint every BYTES bytes (Default: 0, meaning no volume based checkpointing) -.TP .BI \-\-chunker\-params \ PARAMS specify the chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE). default: buzhash,19,23,21,4095 .TP diff --git a/docs/man/borg-delete.1 b/docs/man/borg-delete.1 index 9e8baf0f44..7ecc0d3141 100644 --- a/docs/man/borg-delete.1 +++ b/docs/man/borg-delete.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-DELETE" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-DELETE" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-delete \- Delete archives .SH SYNOPSIS @@ -42,16 +42,9 @@ you run \fBborg compact\fP\&. .sp When in doubt, use \fB\-\-dry\-run \-\-list\fP to see what would be deleted. .sp -When using \fB\-\-stats\fP, you will get some statistics about how much data was -deleted \- the \(dqDeleted data\(dq deduplicated size there is most interesting as -that is how much your repository will shrink. -Please note that the \(dqAll archives\(dq stats refer to the state after deletion. -.sp You can delete multiple archives by specifying a matching pattern, using the \fB\-\-match\-archives PATTERN\fP option (for more info on these patterns, see \fIborg_patterns\fP). -.sp -Always first use \fB\-\-dry\-run \-\-list\fP to see what would be deleted. .SH OPTIONS .sp See \fIborg\-common(1)\fP for common options of Borg commands. @@ -63,18 +56,6 @@ do not change repository .TP .B \-\-list output verbose list of archives -.TP -.B \-\-consider\-checkpoints -consider checkpoint archives for deletion (default: not considered). -.TP -.B \-s\fP,\fB \-\-stats -print statistics for the deleted archive -.TP -.B \-\-force -force deletion of corrupted archives, use \fB\-\-force \-\-force\fP in case \fB\-\-force\fP does not work. -.TP -.BI \-c \ SECONDS\fR,\fB \ \-\-checkpoint\-interval \ SECONDS -write checkpoint every SECONDS seconds (Default: 1800) .UNINDENT .SS Archive filters .INDENT 0.0 diff --git a/docs/man/borg-diff.1 b/docs/man/borg-diff.1 index ab9e24f779..64343833e8 100644 --- a/docs/man/borg-diff.1 +++ b/docs/man/borg-diff.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-DIFF" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-DIFF" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-diff \- Diff contents of two archives .SH SYNOPSIS diff --git a/docs/man/borg-export-tar.1 b/docs/man/borg-export-tar.1 index 53632e1712..4c559aedb4 100644 --- a/docs/man/borg-export-tar.1 +++ b/docs/man/borg-export-tar.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. 
-.TH "BORG-EXPORT-TAR" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-EXPORT-TAR" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-export-tar \- Export archive contents as a tarball .SH SYNOPSIS diff --git a/docs/man/borg-extract.1 b/docs/man/borg-extract.1 index 33585fd97e..dd47e489d4 100644 --- a/docs/man/borg-extract.1 +++ b/docs/man/borg-extract.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-EXTRACT" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-EXTRACT" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-extract \- Extract archive contents .SH SYNOPSIS diff --git a/docs/man/borg-import-tar.1 b/docs/man/borg-import-tar.1 index 1b0be536d0..ac5b6b96a4 100644 --- a/docs/man/borg-import-tar.1 +++ b/docs/man/borg-import-tar.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-IMPORT-TAR" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-IMPORT-TAR" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-import-tar \- Create a backup archive from a tarball .SH SYNOPSIS @@ -126,12 +126,6 @@ add a comment text to the archive .BI \-\-timestamp \ TIMESTAMP manually specify the archive creation date/time (yyyy\-mm\-ddThh:mm:ss[(+|\-)HH:MM] format, (+|\-)HH:MM is the UTC offset, default: local time zone). Alternatively, give a reference file/directory. .TP -.BI \-c \ SECONDS\fR,\fB \ \-\-checkpoint\-interval \ SECONDS -write checkpoint every SECONDS seconds (Default: 1800) -.TP -.BI \-\-checkpoint\-volume \ BYTES -write checkpoint every BYTES bytes (Default: 0, meaning no volume based checkpointing) -.TP .BI \-\-chunker\-params \ PARAMS specify the chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE). default: buzhash,19,23,21,4095 .TP diff --git a/docs/man/borg-info.1 b/docs/man/borg-info.1 index a9812b154b..58dbd5d276 100644 --- a/docs/man/borg-info.1 +++ b/docs/man/borg-info.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-INFO" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-INFO" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-info \- Show archive details such as disk space used .SH SYNOPSIS diff --git a/docs/man/borg-key-change-location.1 b/docs/man/borg-key-change-location.1 index 7ffb2f185f..a845b14d08 100644 --- a/docs/man/borg-key-change-location.1 +++ b/docs/man/borg-key-change-location.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-KEY-CHANGE-LOCATION" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-KEY-CHANGE-LOCATION" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-key-change-location \- Change repository key location .SH SYNOPSIS diff --git a/docs/man/borg-key-change-passphrase.1 b/docs/man/borg-key-change-passphrase.1 index a1c6c6b2c0..c2b8d5f63e 100644 --- a/docs/man/borg-key-change-passphrase.1 +++ b/docs/man/borg-key-change-passphrase.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. 
-.TH "BORG-KEY-CHANGE-PASSPHRASE" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-KEY-CHANGE-PASSPHRASE" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-key-change-passphrase \- Change repository key file passphrase .SH SYNOPSIS diff --git a/docs/man/borg-key-export.1 b/docs/man/borg-key-export.1 index c202b3a87d..55bd41f41b 100644 --- a/docs/man/borg-key-export.1 +++ b/docs/man/borg-key-export.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-KEY-EXPORT" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-KEY-EXPORT" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-key-export \- Export the repository key for backup .SH SYNOPSIS diff --git a/docs/man/borg-key-import.1 b/docs/man/borg-key-import.1 index 4240086786..d98963768e 100644 --- a/docs/man/borg-key-import.1 +++ b/docs/man/borg-key-import.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-KEY-IMPORT" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-KEY-IMPORT" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-key-import \- Import the repository key from backup .SH SYNOPSIS diff --git a/docs/man/borg-key.1 b/docs/man/borg-key.1 index 7ce6eb6e93..734563b7b9 100644 --- a/docs/man/borg-key.1 +++ b/docs/man/borg-key.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-KEY" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-KEY" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-key \- Manage a keyfile or repokey of a repository .SH SYNOPSIS diff --git a/docs/man/borg-list.1 b/docs/man/borg-list.1 index 9d997585b7..113681b062 100644 --- a/docs/man/borg-list.1 +++ b/docs/man/borg-list.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-LIST" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-LIST" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-list \- List archive contents .SH SYNOPSIS @@ -186,12 +186,8 @@ flags: file flags .IP \(bu 2 size: file size .IP \(bu 2 -dsize: deduplicated size -.IP \(bu 2 num_chunks: number of chunks in this file .IP \(bu 2 -unique_chunks: number of unique chunks in this file -.IP \(bu 2 mtime: file modification time .IP \(bu 2 ctime: file change time diff --git a/docs/man/borg-match-archives.1 b/docs/man/borg-match-archives.1 index 6d5b39ef71..362ac404fe 100644 --- a/docs/man/borg-match-archives.1 +++ b/docs/man/borg-match-archives.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. 
-.TH "BORG-MATCH-ARCHIVES" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-MATCH-ARCHIVES" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-match-archives \- Details regarding match-archives .SH DESCRIPTION diff --git a/docs/man/borg-mount.1 b/docs/man/borg-mount.1 index 81b7fc218d..a5da12b287 100644 --- a/docs/man/borg-mount.1 +++ b/docs/man/borg-mount.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-MOUNT" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-MOUNT" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-mount \- Mount archive or an entire repository as a FUSE filesystem .SH SYNOPSIS @@ -110,9 +110,6 @@ paths to extract; patterns are supported .SS optional arguments .INDENT 0.0 .TP -.B \-\-consider\-checkpoints -Show checkpoint archives in the repository contents list (default: hidden). -.TP .B \-f\fP,\fB \-\-foreground stay in foreground, do not daemonize .TP diff --git a/docs/man/borg-patterns.1 b/docs/man/borg-patterns.1 index 6850b83ed8..c6e33e0037 100644 --- a/docs/man/borg-patterns.1 +++ b/docs/man/borg-patterns.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-PATTERNS" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-PATTERNS" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-patterns \- Details regarding patterns .SH DESCRIPTION diff --git a/docs/man/borg-placeholders.1 b/docs/man/borg-placeholders.1 index e570eb9cb1..14d66338c7 100644 --- a/docs/man/borg-placeholders.1 +++ b/docs/man/borg-placeholders.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-PLACEHOLDERS" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-PLACEHOLDERS" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-placeholders \- Details regarding placeholders .SH DESCRIPTION diff --git a/docs/man/borg-prune.1 b/docs/man/borg-prune.1 index 066d3ba7f2..a1411e721c 100644 --- a/docs/man/borg-prune.1 +++ b/docs/man/borg-prune.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-PRUNE" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-PRUNE" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-prune \- Prune repository archives according to specified rules .SH SYNOPSIS @@ -45,11 +45,6 @@ certain number of historic backups. This retention policy is commonly referred t \fI\%GFS\fP (Grandfather\-father\-son) backup rotation scheme. .sp -Also, prune automatically removes checkpoint archives (incomplete archives left -behind by interrupted backup runs) except if the checkpoint is the latest -archive (and thus still needed). Checkpoint archives are not considered when -comparing archive counts against the retention limits (\fB\-\-keep\-X\fP). -.sp If you use \-\-match\-archives (\-a), then only archives that match the pattern are considered for deletion and only those archives count towards the totals specified by the rules. @@ -85,11 +80,6 @@ The \fB\-\-keep\-last N\fP option is doing the same as \fB\-\-keep\-secondly N\f keep the last N archives under the assumption that you do not create more than one backup archive in the same second). 
.sp -When using \fB\-\-stats\fP, you will get some statistics about how much data was -deleted \- the \(dqDeleted data\(dq deduplicated size there is most interesting as -that is how much your repository will shrink. -Please note that the \(dqAll archives\(dq stats refer to the state after pruning. -.sp You can influence how the \fB\-\-list\fP output is formatted by using the \fB\-\-short\fP option (less wide output) or by giving a custom format using \fB\-\-format\fP (see the \fBborg rlist\fP description for more details about the format string). @@ -102,12 +92,6 @@ See \fIborg\-common(1)\fP for common options of Borg commands. .B \-n\fP,\fB \-\-dry\-run do not change repository .TP -.B \-\-force -force pruning of corrupted archives, use \fB\-\-force \-\-force\fP in case \fB\-\-force\fP does not work. -.TP -.B \-s\fP,\fB \-\-stats -print statistics for the deleted archive -.TP .B \-\-list output verbose list of archives it keeps/prunes .TP @@ -146,9 +130,6 @@ number of monthly archives to keep .TP .B \-y\fP,\fB \-\-keep\-yearly number of yearly archives to keep -.TP -.BI \-c \ SECONDS\fR,\fB \ \-\-checkpoint\-interval \ SECONDS -write checkpoint every SECONDS seconds (Default: 1800) .UNINDENT .SS Archive filters .INDENT 0.0 diff --git a/docs/man/borg-rcompress.1 b/docs/man/borg-rcompress.1 index 64b9556e85..5f97ddd08a 100644 --- a/docs/man/borg-rcompress.1 +++ b/docs/man/borg-rcompress.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-RCOMPRESS" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-RCOMPRESS" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-rcompress \- Repository (re-)compression .SH SYNOPSIS @@ -37,20 +37,14 @@ borg [common options] rcompress [options] .sp Repository (re\-)compression (and/or re\-obfuscation). .sp -Reads all chunks in the repository (in on\-disk order, this is important for -compaction) and recompresses them if they are not already using the compression -type/level and obfuscation level given via \fB\-\-compression\fP\&. +Reads all chunks in the repository and recompresses them if they are not already +using the compression type/level and obfuscation level given via \fB\-\-compression\fP\&. .sp If the outcome of the chunk processing indicates a change in compression type/level or obfuscation level, the processed chunk is written to the repository. Please note that the outcome might not always be the desired compression type/level \- if no compression gives a shorter output, that might be chosen. .sp -Every \fB\-\-checkpoint\-interval\fP, progress is committed to the repository and -the repository is compacted (this is to keep temporary repo space usage in bounds). -A lower checkpoint interval means lower temporary repo space usage, but also -slower progress due to higher overhead (and vice versa). -.sp Please note that this command can not work in low (or zero) free disk space conditions. 
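The point that the outcome might not be the requested compression (if no compression yields a shorter output) can be illustrated with a small, self-contained Python sketch (not borg's actual compression code)::

    import os
    import zlib

    def compress_auto(data: bytes, level: int = 6):
        """Return ("zlib", packed) only if compression actually saves space,
        otherwise store the data as-is, similar in spirit to the behaviour
        described above (illustration only)."""
        packed = zlib.compress(data, level)
        if len(packed) < len(data):
            return "zlib", packed
        return "none", data

    print(compress_auto(b"abcd" * 4096)[0])     # highly repetitive -> "zlib"
    print(compress_auto(os.urandom(4096))[0])   # incompressible -> "none"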
.sp @@ -72,9 +66,6 @@ select compression algorithm, see the output of the \(dqborg help compression\(d .TP .B \-s\fP,\fB \-\-stats print statistics -.TP -.BI \-c \ SECONDS\fR,\fB \ \-\-checkpoint\-interval \ SECONDS -write checkpoint every SECONDS seconds (Default: 1800) .UNINDENT .SH EXAMPLES .INDENT 0.0 diff --git a/docs/man/borg-rcreate.1 b/docs/man/borg-rcreate.1 index d26f08e33f..bb23f0d7ab 100644 --- a/docs/man/borg-rcreate.1 +++ b/docs/man/borg-rcreate.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-RCREATE" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-RCREATE" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-rcreate \- Create a new, empty repository .SH SYNOPSIS @@ -35,8 +35,8 @@ borg-rcreate \- Create a new, empty repository borg [common options] rcreate [options] .SH DESCRIPTION .sp -This command creates a new, empty repository. A repository is a filesystem -directory containing the deduplicated data from zero or more archives. +This command creates a new, empty repository. A repository is a \fBborgstore\fP store +containing the deduplicated data from zero or more archives. .SS Encryption mode TLDR .sp The encryption mode can only be configured when creating a new repository \- you can @@ -226,6 +226,12 @@ Optionally, if you use \fB\-\-copy\-crypt\-key\fP you can also keep the same cry keys to manage. .sp Creating related repositories is useful e.g. if you want to use \fBborg transfer\fP later. +.SS Creating a related repository for data migration from borg 1.2 or 1.4 +.sp +You can use \fBborg rcreate \-\-other\-repo ORIG_REPO \-\-from\-borg1 ...\fP to create a related +repository that uses the same secret key material as the given other/original repository. +.sp +Then use \fBborg transfer \-\-other\-repo ORIG_REPO \-\-from\-borg1 ...\fP to transfer the archives. .SH OPTIONS .sp See \fIborg\-common(1)\fP for common options of Borg commands. @@ -235,6 +241,9 @@ See \fIborg\-common(1)\fP for common options of Borg commands. .BI \-\-other\-repo \ SRC_REPOSITORY reuse the key material from the other repository .TP +.B \-\-from\-borg1 +other repository is borg 1.x +.TP .BI \-e \ MODE\fR,\fB \ \-\-encryption \ MODE select encryption key mode \fB(required)\fP .TP diff --git a/docs/man/borg-rdelete.1 b/docs/man/borg-rdelete.1 index f7447dcb1c..f8056c562a 100644 --- a/docs/man/borg-rdelete.1 +++ b/docs/man/borg-rdelete.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-RDELETE" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-RDELETE" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-rdelete \- Delete a repository .SH SYNOPSIS diff --git a/docs/man/borg-recreate.1 b/docs/man/borg-recreate.1 index 218032595d..143ba3d9f1 100644 --- a/docs/man/borg-recreate.1 +++ b/docs/man/borg-recreate.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-RECREATE" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-RECREATE" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-recreate \- Re-create archives .SH SYNOPSIS @@ -157,12 +157,6 @@ consider archives newer than (now \- TIMESPAN), e.g. 7d or 12m. 
.BI \-\-target \ TARGET create a new archive with the name ARCHIVE, do not replace existing archive (only applies for a single archive) .TP -.BI \-c \ SECONDS\fR,\fB \ \-\-checkpoint\-interval \ SECONDS -write checkpoint every SECONDS seconds (Default: 1800) -.TP -.BI \-\-checkpoint\-volume \ BYTES -write checkpoint every BYTES bytes (Default: 0, meaning no volume based checkpointing) -.TP .BI \-\-comment \ COMMENT add a comment text to the archive .TP diff --git a/docs/man/borg-rename.1 b/docs/man/borg-rename.1 index 48c2b1a99a..ce67e39474 100644 --- a/docs/man/borg-rename.1 +++ b/docs/man/borg-rename.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-RENAME" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-RENAME" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-rename \- Rename an existing archive .SH SYNOPSIS diff --git a/docs/man/borg-rinfo.1 b/docs/man/borg-rinfo.1 index 3ba96e3b13..ba8a9a4759 100644 --- a/docs/man/borg-rinfo.1 +++ b/docs/man/borg-rinfo.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-RINFO" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-RINFO" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-rinfo \- Show repository infos .SH SYNOPSIS @@ -36,15 +36,6 @@ borg [common options] rinfo [options] .SH DESCRIPTION .sp This command displays detailed information about the repository. -.sp -Please note that the deduplicated sizes of the individual archives do not add -up to the deduplicated size of the repository (\(dqall archives\(dq), because the two -are meaning different things: -.sp -This archive / deduplicated size = amount of data stored ONLY for this archive -= unique chunks of this archive. -All archives / deduplicated size = amount of data stored in the repo -= all chunks in the repository. .SH OPTIONS .sp See \fIborg\-common(1)\fP for common options of Borg commands. diff --git a/docs/man/borg-rlist.1 b/docs/man/borg-rlist.1 index da38b71ee5..c3480b4695 100644 --- a/docs/man/borg-rlist.1 +++ b/docs/man/borg-rlist.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-RLIST" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-RLIST" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-rlist \- List the archives contained in a repository .SH SYNOPSIS @@ -42,9 +42,6 @@ See \fIborg\-common(1)\fP for common options of Borg commands. .SS optional arguments .INDENT 0.0 .TP -.B \-\-consider\-checkpoints -Show checkpoint archives in the repository contents list (default: hidden). -.TP .B \-\-short only print the archive names, nothing else .TP diff --git a/docs/man/borg-rspace.1 b/docs/man/borg-rspace.1 new file mode 100644 index 0000000000..8e5034c4da --- /dev/null +++ b/docs/man/borg-rspace.1 @@ -0,0 +1,94 @@ +.\" Man page generated from reStructuredText. +. +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. 
nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.TH "BORG-RSPACE" 1 "2024-09-08" "" "borg backup tool" +.SH NAME +borg-rspace \- Manage reserved space in repository +.SH SYNOPSIS +.sp +borg [common options] rspace [options] +.SH DESCRIPTION +.sp +This command manages reserved space in a repository. +.sp +Borg can not work in disk\-full conditions (can not lock a repo and thus can +not run prune/delete or compact operations to free disk space). +.sp +To avoid running into dead\-end situations like that, you can put some objects +into a repository that take up some disk space. If you ever run into a +disk\-full situation, you can free that space and then borg will be able to +run normally, so you can free more disk space by using prune/delete/compact. +After that, don\(aqt forget to reserve space again, in case you run into that +situation again at a later time. +.sp +Examples: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +# Create a new repository: +$ borg rcreate ... +# Reserve approx. 1GB of space for emergencies: +$ borg rspace \-\-reserve 1G + +# Check amount of reserved space in the repository: +$ borg rspace + +# EMERGENCY! Free all reserved space to get things back to normal: +$ borg rspace \-\-free +$ borg prune ... +$ borg delete ... +$ borg compact \-v # only this actually frees space of deleted archives +$ borg rspace \-\-reserve 1G # reserve space again for next time +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +Reserved space is always rounded up to use full reservation blocks of 64MiB. +.SH OPTIONS +.sp +See \fIborg\-common(1)\fP for common options of Borg commands. +.SS optional arguments +.INDENT 0.0 +.TP +.BI \-\-reserve \ SPACE +Amount of space to reserve (e.g. 100M, 1G). Default: 0. +.TP +.B \-\-free +Free all reserved space. Don\(aqt forget to reserve space later again. +.UNINDENT +.SH SEE ALSO +.sp +\fIborg\-common(1)\fP +.SH AUTHOR +The Borg Collective +.\" Generated by docutils manpage writer. +. diff --git a/docs/man/borg-serve.1 b/docs/man/borg-serve.1 index b5ab1dbebe..7a198b9402 100644 --- a/docs/man/borg-serve.1 +++ b/docs/man/borg-serve.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-SERVE" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-SERVE" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-serve \- Start in server mode. This command is usually not used manually. .SH SYNOPSIS diff --git a/docs/man/borg-transfer.1 b/docs/man/borg-transfer.1 index f6cd45450f..f7dc50f9a5 100644 --- a/docs/man/borg-transfer.1 +++ b/docs/man/borg-transfer.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. 
-.TH "BORG-TRANSFER" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-TRANSFER" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-transfer \- archives transfer from other repository, optionally upgrade data format .SH SYNOPSIS @@ -46,7 +46,14 @@ any case) and keep data compressed \(dqas is\(dq (saves time as no data compress If you want to globally change compression while transferring archives to the DST_REPO, give \fB\-\-compress=WANTED_COMPRESSION \-\-recompress=always\fP\&. .sp -Suggested use for general purpose archive transfer (not repo upgrades): +The default is to transfer all archives. +.sp +You could use the misc. archive filter options to limit which archives it will +transfer, e.g. using the \fB\-a\fP option. This is recommended for big +repositories with multiple data sets to keep the runtime per invocation lower. +.SS General purpose archive transfer +.sp +Transfer borg2 archives into a related other borg2 repository: .INDENT 0.0 .INDENT 3.5 .sp @@ -54,7 +61,7 @@ Suggested use for general purpose archive transfer (not repo upgrades): .ft C # create a related DST_REPO (reusing key material from SRC_REPO), so that # chunking and chunk id generation will work in the same way as before. -borg \-\-repo=DST_REPO rcreate \-\-other\-repo=SRC_REPO \-\-encryption=DST_ENC +borg \-\-repo=DST_REPO rcreate \-\-encryption=DST_ENC \-\-other\-repo=SRC_REPO # transfer archives from SRC_REPO to DST_REPO borg \-\-repo=DST_REPO transfer \-\-other\-repo=SRC_REPO \-\-dry\-run # check what it would do @@ -64,26 +71,23 @@ borg \-\-repo=DST_REPO transfer \-\-other\-repo=SRC_REPO \-\-dry\-run # check! .fi .UNINDENT .UNINDENT +.SS Data migration / upgrade from borg 1.x .sp -The default is to transfer all archives, including checkpoint archives. -.sp -You could use the misc. archive filter options to limit which archives it will -transfer, e.g. using the \fB\-a\fP option. This is recommended for big -repositories with multiple data sets to keep the runtime per invocation lower. -.sp -For repository upgrades (e.g. 
from a borg 1.2 repo to a related borg 2.0 repo), usage is -quite similar to the above: +To migrate your borg 1.x archives into a related, new borg2 repository, usage is quite similar +to the above, but you need the \fB\-\-from\-borg1\fP option: .INDENT 0.0 .INDENT 3.5 .sp .nf .ft C -# fast: compress metadata with zstd,3, but keep data chunks compressed as they are: -borg \-\-repo=DST_REPO transfer \-\-other\-repo=SRC_REPO \-\-upgrader=From12To20 \e - \-\-compress=zstd,3 \-\-recompress=never +borg \-\-repo=DST_REPO rcreate \-\-encryption=DST_ENC \-\-other\-repo=SRC_REPO \-\-from\-borg1 + +# to continue using lz4 compression as you did in SRC_REPO: +borg \-\-repo=DST_REPO transfer \-\-other\-repo=SRC_REPO \-\-from\-borg1 \e + \-\-compress=lz4 \-\-recompress=never -# compress metadata and recompress data with zstd,3 -borg \-\-repo=DST_REPO transfer \-\-other\-repo=SRC_REPO \-\-upgrader=From12To20 \e +# alternatively, to recompress everything to zstd,3: +borg \-\-repo=DST_REPO transfer \-\-other\-repo=SRC_REPO \-\-from\-borg1 \e \-\-compress=zstd,3 \-\-recompress=always .ft P .fi @@ -101,6 +105,9 @@ do not change repository, just check .BI \-\-other\-repo \ SRC_REPOSITORY transfer archives from the other repository .TP +.B \-\-from\-borg1 +other repository is borg 1.x +.TP .BI \-\-upgrader \ UPGRADER use the upgrader to convert transferred data (default: no conversion) .TP diff --git a/docs/man/borg-umount.1 b/docs/man/borg-umount.1 index 46e4058a33..88cac080b1 100644 --- a/docs/man/borg-umount.1 +++ b/docs/man/borg-umount.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-UMOUNT" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-UMOUNT" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-umount \- un-mount the FUSE filesystem .SH SYNOPSIS diff --git a/docs/man/borg-version.1 b/docs/man/borg-version.1 index 1635c41102..94fff3f586 100644 --- a/docs/man/borg-version.1 +++ b/docs/man/borg-version.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-VERSION" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-VERSION" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-version \- Display the borg client / borg server version .SH SYNOPSIS diff --git a/docs/man/borg-with-lock.1 b/docs/man/borg-with-lock.1 index db07e8ec90..9bb9abf686 100644 --- a/docs/man/borg-with-lock.1 +++ b/docs/man/borg-with-lock.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORG-WITH-LOCK" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG-WITH-LOCK" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg-with-lock \- run a user specified command with the repository lock held .SH SYNOPSIS diff --git a/docs/man/borg.1 b/docs/man/borg.1 index d2944b145e..08e75c8dde 100644 --- a/docs/man/borg.1 +++ b/docs/man/borg.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. 
-.TH "BORG" 1 "2024-07-19" "" "borg backup tool" +.TH "BORG" 1 "2024-09-08" "" "borg backup tool" .SH NAME borg \- deduplicating and encrypting backup tool .SH SYNOPSIS @@ -238,6 +238,10 @@ Note: you may also prepend a \fBfile://\fP to a filesystem path to get URL style .sp \fBssh://user@host:port/~/path/to/repo\fP \- path relative to user\(aqs home directory .sp +\fBRemote repositories\fP accessed via sftp: +.sp +\fBsftp://user@host:port/path/to/repo\fP \- absolute path\(ga +.sp If you frequently need the same repo URL, it is a good idea to set the \fBBORG_REPO\fP environment variable to set a default for the repo URL: .INDENT 0.0 @@ -491,10 +495,6 @@ given order, e.g.: Choose the implementation for the clientside cache, choose one of: .INDENT 7.0 .IP \(bu 2 -\fBlocal\fP: uses a persistent chunks cache and keeps it in a perfect state (precise refcounts and -sizes), requiring a potentially resource expensive cache sync in multi\-client scenarios. -Also has a persistent files cache. -.IP \(bu 2 \fBadhoc\fP: builds a non\-persistent chunks cache by querying the repo. Chunks cache contents are somewhat sloppy for already existing chunks, concerning their refcount (\(dqinfinite\(dq) and size (0). No files cache (slow, will chunk all input files). DEPRECATED. @@ -698,38 +698,48 @@ mode 600, root:root). .UNINDENT .SS File systems .sp -We strongly recommend against using Borg (or any other database\-like -software) on non\-journaling file systems like FAT, since it is not -possible to assume any consistency in case of power failures (or a -sudden disconnect of an external drive or similar failures). +We recommend using a reliable, scalable journaling filesystem for the +repository, e.g. zfs, btrfs, ext4, apfs. +.sp +Borg now uses the \fBborgstore\fP package to implement the key/value store it +uses for the repository. +.sp +It currently uses the \fBfile:\fP Store (posixfs backend) either with a local +directory or via ssh and a remote \fBborg serve\fP agent using borgstore on the +remote side. +.sp +This means that it will store each chunk into a separate filesystem file +(for more details, see the \fBborgstore\fP project). .sp -While Borg uses a data store that is resilient against these failures -when used on journaling file systems, it is not possible to guarantee -this with some hardware \-\- independent of the software used. We don\(aqt -know a list of affected hardware. +This has some pros and cons (compared to legacy borg 1.x\(aqs segment files): .sp -If you are suspicious whether your Borg repository is still consistent -and readable after one of the failures mentioned above occurred, run -\fBborg check \-\-verify\-data\fP to make sure it is consistent. -Requirements for Borg repository file systems +Pros: .INDENT 0.0 .IP \(bu 2 -Long file names +Simplicity and better maintainability of the borg code. .IP \(bu 2 -At least three directory levels with short names +Sometimes faster, less I/O, better scalability: e.g. borg compact can just +remove unused chunks by deleting a single file and does not need to read +and re\-write segment files to free space. .IP \(bu 2 -Typically, file sizes up to a few hundred MB. -Large repositories may require large files (>2 GB). +In future, easier to adapt to other kinds of storage: +borgstore\(aqs backends are quite simple to implement. +A \fBsftp:\fP backend already exists, cloud storage might be easy to add. .IP \(bu 2 -Up to 1000 files per directory. +Parallel repository access with less locking is easier to implement. 
+.UNINDENT +.sp +Cons: +.INDENT 0.0 .IP \(bu 2 -rename(2) / MoveFile(Ex) should work as specified, i.e. on the same file system -it should be a move (not a copy) operation, and in case of a directory -it should fail if the destination exists and is not an empty directory, -since this is used for locking. +The repository filesystem will have to deal with a big amount of files (there +are provisions in borgstore against having too many files in a single directory +by using a nested directory structure). .IP \(bu 2 -Also hardlinks are used for more safe and secure file updating (e.g. of the repo -config file), but the code tries to work also if hardlinks are not supported. +Bigger fs space usage overhead (will depend on allocation block size \- modern +filesystems like zfs are rather clever here using a variable block size). +.IP \(bu 2 +Sometimes slower, due to less sequential / more random access operations. .UNINDENT .SS Units .sp @@ -747,6 +757,10 @@ For more information about that, see: \fI\%https://xkcd.com/1179/\fP .sp Unless otherwise noted, we display local date and time. Internally, we store and process date and time as UTC. +TIMESPAN +.sp +Some options accept a TIMESPAN parameter, which can be given as a +number of days (e.g. \fB7d\fP) or months (e.g. \fB12m\fP). .SS Resource Usage .sp Borg might use a lot of resources depending on the size of the data set it is dealing with. diff --git a/docs/man/borgfs.1 b/docs/man/borgfs.1 index 7c37e09f1e..9e32579b43 100644 --- a/docs/man/borgfs.1 +++ b/docs/man/borgfs.1 @@ -27,7 +27,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] .in \\n[rst2man-indent\\n[rst2man-indent-level]]u .. -.TH "BORGFS" 1 "2024-07-19" "" "borg backup tool" +.TH "BORGFS" 1 "2024-09-08" "" "borg backup tool" .SH NAME borgfs \- Mount archive or an entire repository as a FUSE filesystem .SH SYNOPSIS @@ -54,9 +54,6 @@ paths to extract; patterns are supported .B \-V\fP,\fB \-\-version show version number and exit .TP -.B \-\-consider\-checkpoints -Show checkpoint archives in the repository contents list (default: hidden). -.TP .B \-f\fP,\fB \-\-foreground stay in foreground, do not daemonize .TP diff --git a/docs/quickstart.rst b/docs/quickstart.rst index de3c8bb050..025b5be554 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -35,18 +35,6 @@ of free space on the destination filesystem that has your backup repository (and also on ~/.cache). A few GB should suffice for most hard-drive sized repositories. See also :ref:`cache-memory-usage`. -Borg doesn't use space reserved for root on repository disks (even when run as root). -On file systems which do not support this mechanism (e.g. XFS) we recommend to reserve -some space in Borg itself just to be safe by adjusting the ``additional_free_space`` -setting (a good starting point is ``2G``):: - - borg config additional_free_space 2G - -If Borg runs out of disk space, it tries to free as much space as it -can while aborting the current operation safely, which allows the user to free more space -by deleting/pruning archives. This mechanism is not bullet-proof in some -circumstances [1]_. - If you do run out of disk space, it can be hard or impossible to free space, because Borg needs free space to operate - even to delete backup archives. @@ -55,18 +43,13 @@ in your backup log files (you check them regularly anyway, right?). 
Also helpful: -- create a big file as a "space reserve", that you can delete to free space +- use `borg rspace` to reserve some disk space that can be freed when the fs + does not have free space any more. - if you use LVM: use a LV + a filesystem that you can resize later and have some unallocated PEs you can add to the LV. - consider using quotas - use `prune` and `compact` regularly -.. [1] This failsafe can fail in these circumstances: - - - The underlying file system doesn't support statvfs(2), or returns incorrect - data, or the repository doesn't reside on a single file system - - Other tasks fill the disk simultaneously - - Hard quotas (which may not be reflected in statvfs(2)) Important note about permissions -------------------------------- @@ -270,7 +253,7 @@ A passphrase should be a single line of text. Any trailing linefeed will be stripped. Do not use empty passphrases, as these can be trivially guessed, which does not -leave any encrypted data secure. +leave any encrypted data secure. Avoid passphrases containing non-ASCII characters. Borg can process any unicode text, but problems may arise at input due to text @@ -420,6 +403,15 @@ You can also use other remote filesystems in a similar way. Just be careful, not all filesystems out there are really stable and working good enough to be acceptable for backup usage. +Other kinds of repositories +--------------------------- + +Due to using the `borgstore` project, borg now also supports other kinds of +(remote) repositories besides `file:` and `ssh:`: + +- sftp: the borg client will directly talk to an sftp server. + This does not require borg being installed on the sftp server. +- Others may come in the future, adding backends to `borgstore` is rather simple. Restoring a backup ------------------ diff --git a/docs/usage.rst b/docs/usage.rst index c27e418066..d88f968ea7 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -37,6 +37,7 @@ Usage usage/general usage/rcreate + usage/rspace usage/rlist usage/rinfo usage/rcompress diff --git a/docs/usage/check.rst.inc b/docs/usage/check.rst.inc index e0764302bb..f046aaa0e4 100644 --- a/docs/usage/check.rst.inc +++ b/docs/usage/check.rst.inc @@ -23,6 +23,8 @@ borg check +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ | | ``--repair`` | attempt to repair any inconsistencies found | +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ + | | ``--undelete-archives`` | attempt to undelete archives (use with --repair) | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ | | ``--max-duration SECONDS`` | do only a partial repo check for max. SECONDS seconds (Default: unlimited) | +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ | .. 
class:: borg-common-opt-ref | @@ -65,6 +67,7 @@ borg check --archives-only only perform archives checks --verify-data perform cryptographic archive data integrity verification (conflicts with ``--repository-only``) --repair attempt to repair any inconsistencies found + --undelete-archives attempt to undelete archives (use with --repair) --max-duration SECONDS do only a partial repo check for max. SECONDS seconds (Default: unlimited) @@ -89,8 +92,8 @@ The check command verifies the consistency of a repository and its archives. It consists of two major steps: 1. Checking the consistency of the repository itself. This includes checking - the segment magic headers, and both the metadata and data of all objects in - the segments. The read data is checked by size and CRC. Bit rot and other + the file magic headers, and both the metadata and data of all objects in + the repository. The read data is checked by size and hash. Bit rot and other types of accidental damage can be detected this way. Running the repository check can be split into multiple partial checks using ``--max-duration``. When checking a remote repository, please note that the checks run on the @@ -125,13 +128,12 @@ archive checks, nor enable repair mode. Consequently, if you want to use **Warning:** Please note that partial repository checks (i.e. running it with ``--max-duration``) can only perform non-cryptographic checksum checks on the -segment files. A full repository check (i.e. without ``--max-duration``) can -also do a repository index check. Enabling partial repository checks excepts -archive checks for the same reason. Therefore partial checks may be useful with -very large repositories only where a full check would take too long. +repository files. Enabling partial repository checks excepts archive checks +for the same reason. Therefore partial checks may be useful with very large +repositories only where a full check would take too long. The ``--verify-data`` option will perform a full integrity verification (as -opposed to checking the CRC32 of the segment) of data, which means reading the +opposed to checking just the xxh64) of data, which means reading the data from the repository, decrypting and decompressing it. It is a complete cryptographic verification and hence very time consuming, but will detect any accidental and malicious corruption. Tamper-resistance is only guaranteed for @@ -168,17 +170,15 @@ by definition, a potentially lossy task. In practice, repair mode hooks into both the repository and archive checks: -1. When checking the repository's consistency, repair mode will try to recover - as many objects from segments with integrity errors as possible, and ensure - that the index is consistent with the data stored in the segments. +1. When checking the repository's consistency, repair mode removes corrupted + objects from the repository after it did a 2nd try to read them correctly. 2. When checking the consistency and correctness of archives, repair mode might remove whole archives from the manifest if their archive metadata chunk is corrupt or lost. On a chunk level (i.e. the contents of files), repair mode will replace corrupt or lost chunks with a same-size replacement chunk of zeroes. If a previously zeroed chunk reappears, repair mode will restore - this lost chunk using the new chunk. Lastly, repair mode will also delete - orphaned chunks (e.g. caused by read errors while creating the archive). + this lost chunk using the new chunk. 
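A toy Python example of the zero-patching and later "healing" described in step 2 (illustration only, not borg's actual repair code)::

    store = {"c1": b"hello ", "c3": b"!"}            # chunk "c2" is lost
    file_chunks = [("c1", 6), ("c2", 5), ("c3", 1)]  # (chunk id, original size)

    def rebuild(chunks, store):
        # lost chunks are replaced by same-size all-zero chunks
        return b"".join(store.get(cid, b"\0" * size) for cid, size in chunks)

    print(rebuild(file_chunks, store))   # b'hello \x00\x00\x00\x00\x00!'

    store["c2"] = b"world"               # the lost chunk reappears in a new backup
    print(rebuild(file_chunks, store))   # b'hello world!'  (file is "healed")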
Most steps taken by repair mode have a one-time effect on the repository, like removing a lost archive from the repository. However, replacing a corrupt or @@ -196,4 +196,10 @@ repair mode Borg will check whether a previously lost chunk reappeared and will replace the all-zero replacement chunk by the reappeared chunk. If all lost chunks of a "zero-patched" file reappear, this effectively "heals" the file. Consequently, if lost chunks were repaired earlier, it is advised to run -``--repair`` a second time after creating some new backups. \ No newline at end of file +``--repair`` a second time after creating some new backups. + +If ``--repair --undelete-archives`` is given, Borg will scan the repository +for archive metadata and if it finds some where no corresponding archives +directory entry exists, it will create the entries. This is basically undoing +``borg delete archive`` or ``borg prune ...`` commands and only possible before +``borg compact`` would remove the archives' data completely. \ No newline at end of file diff --git a/docs/usage/common-options.rst.inc b/docs/usage/common-options.rst.inc index 793aedd84f..31a9df8657 100644 --- a/docs/usage/common-options.rst.inc +++ b/docs/usage/common-options.rst.inc @@ -8,8 +8,7 @@ -p, --progress show progress information --iec format using IEC units (1KiB = 1024B) --log-json Output one JSON object per log line instead of formatted text. ---lock-wait SECONDS wait at most SECONDS for acquiring a repository/cache lock (default: 1). ---bypass-lock Bypass locking mechanism +--lock-wait SECONDS wait at most SECONDS for acquiring a repository/cache lock (default: 10). --show-version show/log the borg version --show-rc show/log the return code (rc) --umask M set umask to M (local only, default: 0077) diff --git a/docs/usage/compact.rst.inc b/docs/usage/compact.rst.inc index 946b376fc4..8fad820d0f 100644 --- a/docs/usage/compact.rst.inc +++ b/docs/usage/compact.rst.inc @@ -12,15 +12,11 @@ borg compact .. class:: borg-options-table - +-------------------------------------------------------+-------------------------+----------------------------------------------------------------+ - | **optional arguments** | - +-------------------------------------------------------+-------------------------+----------------------------------------------------------------+ - | | ``--threshold PERCENT`` | set minimum threshold for saved space in PERCENT (Default: 10) | - +-------------------------------------------------------+-------------------------+----------------------------------------------------------------+ - | .. class:: borg-common-opt-ref | - | | - | :ref:`common_options` | - +-------------------------------------------------------+-------------------------+----------------------------------------------------------------+ + +-------------------------------------------------------+ + | .. class:: borg-common-opt-ref | + | | + | :ref:`common_options` | + +-------------------------------------------------------+ .. raw:: html @@ -34,30 +30,17 @@ borg compact - optional arguments - --threshold PERCENT set minimum threshold for saved space in PERCENT (Default: 10) - - :ref:`common_options` | Description ~~~~~~~~~~~ -This command frees repository space by compacting segments. - -Use this regularly to avoid running out of space - you do not need to use this -after each borg command though. It is especially useful after deleting archives, -because only compaction will really free repository space. 
- -borg compact does not need a key, so it is possible to invoke it from the -client or also from the server. - -Depending on the amount of segments that need compaction, it may take a while, -so consider using the ``--progress`` option. +Free repository space by deleting unused chunks. -A segment is compacted if the amount of saved space is above the percentage value -given by the ``--threshold`` option. If omitted, a threshold of 10% is used. -When using ``--verbose``, borg will output an estimate of the freed space. +borg compact analyzes all existing archives to find out which chunks are +actually used. There might be unused chunks resulting from borg delete or prune, +which can be removed to free space in the repository. -See :ref:`separate_compaction` in Additional Notes for more details. \ No newline at end of file +Differently than borg 1.x, borg2's compact needs the borg key if the repo is +encrypted. \ No newline at end of file diff --git a/docs/usage/create.rst.inc b/docs/usage/create.rst.inc index bf6129692a..7924e4adfc 100644 --- a/docs/usage/create.rst.inc +++ b/docs/usage/create.rst.inc @@ -31,10 +31,6 @@ borg create +-------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--json`` | output stats as JSON. Implies ``--stats``. | +-------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | | ``--no-cache-sync`` | experimental: do not synchronize the chunks cache. | - +-------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | | ``--no-cache-sync-forced`` | experimental: do not synchronize the chunks cache (forced). | - +-------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--prefer-adhoc-cache`` | experimental: prefer AdHocCache (w/o files cache) over AdHocWithFilesCache (with files cache). 
| +-------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--stdin-name NAME`` | use NAME in archive for stdin data (default: 'stdin') | @@ -105,10 +101,6 @@ borg create +-------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--timestamp TIMESTAMP`` | manually specify the archive creation date/time (yyyy-mm-ddThh:mm:ss[(+|-)HH:MM] format, (+|-)HH:MM is the UTC offset, default: local time zone). Alternatively, give a reference file/directory. | +-------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | | ``-c SECONDS``, ``--checkpoint-interval SECONDS`` | write checkpoint every SECONDS seconds (Default: 1800) | - +-------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | | ``--checkpoint-volume BYTES`` | write checkpoint every BYTES bytes (Default: 0, meaning no volume based checkpointing) | - +-------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--chunker-params PARAMS`` | specify the chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE). default: buzhash,19,23,21,4095 | +-------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``-C COMPRESSION``, ``--compression COMPRESSION`` | select compression algorithm, see the output of the "borg help compression" command for details. | @@ -136,8 +128,6 @@ borg create --list output verbose list of items (files, dirs, ...) --filter STATUSCHARS only display items with the given status characters (see description) --json output stats as JSON. Implies ``--stats``. - --no-cache-sync experimental: do not synchronize the chunks cache. - --no-cache-sync-forced experimental: do not synchronize the chunks cache (forced). --prefer-adhoc-cache experimental: prefer AdHocCache (w/o files cache) over AdHocWithFilesCache (with files cache). 
--stdin-name NAME use NAME in archive for stdin data (default: 'stdin') --stdin-user USER set user USER in archive for stdin data (default: do not store user/uid) @@ -180,8 +170,6 @@ borg create Archive options --comment COMMENT add a comment text to the archive --timestamp TIMESTAMP manually specify the archive creation date/time (yyyy-mm-ddThh:mm:ss[(+|-)HH:MM] format, (+|-)HH:MM is the UTC offset, default: local time zone). Alternatively, give a reference file/directory. - -c SECONDS, --checkpoint-interval SECONDS write checkpoint every SECONDS seconds (Default: 1800) - --checkpoint-volume BYTES write checkpoint every BYTES bytes (Default: 0, meaning no volume based checkpointing) --chunker-params PARAMS specify the chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE). default: buzhash,19,23,21,4095 -C COMPRESSION, --compression COMPRESSION select compression algorithm, see the output of the "borg help compression" command for details. @@ -207,9 +195,7 @@ stdin* below for details. The archive will consume almost no disk space for files or parts of files that have already been stored in other archives. -The archive name needs to be unique. It must not end in '.checkpoint' or -'.checkpoint.N' (with N being a number), because these names are used for -checkpoints and treated in special ways. +The archive name needs to be unique. In the archive name, you may use the following placeholders: {now}, {utcnow}, {fqdn}, {hostname}, {user} and some others. diff --git a/docs/usage/delete.rst.inc b/docs/usage/delete.rst.inc index c5740ca678..688bce06e4 100644 --- a/docs/usage/delete.rst.inc +++ b/docs/usage/delete.rst.inc @@ -12,43 +12,35 @@ borg delete .. class:: borg-options-table - +-----------------------------------------------------------------------------+---------------------------------------------------+-----------------------------------------------------------------------------------------------------------+ - | **optional arguments** | - +-----------------------------------------------------------------------------+---------------------------------------------------+-----------------------------------------------------------------------------------------------------------+ - | | ``-n``, ``--dry-run`` | do not change repository | - +-----------------------------------------------------------------------------+---------------------------------------------------+-----------------------------------------------------------------------------------------------------------+ - | | ``--list`` | output verbose list of archives | - +-----------------------------------------------------------------------------+---------------------------------------------------+-----------------------------------------------------------------------------------------------------------+ - | | ``--consider-checkpoints`` | consider checkpoint archives for deletion (default: not considered). 
| - +-----------------------------------------------------------------------------+---------------------------------------------------+-----------------------------------------------------------------------------------------------------------+ - | | ``-s``, ``--stats`` | print statistics for the deleted archive | - +-----------------------------------------------------------------------------+---------------------------------------------------+-----------------------------------------------------------------------------------------------------------+ - | | ``--force`` | force deletion of corrupted archives, use ``--force --force`` in case ``--force`` does not work. | - +-----------------------------------------------------------------------------+---------------------------------------------------+-----------------------------------------------------------------------------------------------------------+ - | | ``-c SECONDS``, ``--checkpoint-interval SECONDS`` | write checkpoint every SECONDS seconds (Default: 1800) | - +-----------------------------------------------------------------------------+---------------------------------------------------+-----------------------------------------------------------------------------------------------------------+ - | .. class:: borg-common-opt-ref | - | | - | :ref:`common_options` | - +-----------------------------------------------------------------------------+---------------------------------------------------+-----------------------------------------------------------------------------------------------------------+ - | **Archive filters** — Archive filters can be applied to repository targets. | - +-----------------------------------------------------------------------------+---------------------------------------------------+-----------------------------------------------------------------------------------------------------------+ - | | ``-a PATTERN``, ``--match-archives PATTERN`` | only consider archive names matching the pattern. see "borg help match-archives". | - +-----------------------------------------------------------------------------+---------------------------------------------------+-----------------------------------------------------------------------------------------------------------+ - | | ``--sort-by KEYS`` | Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id; default is: timestamp | - +-----------------------------------------------------------------------------+---------------------------------------------------+-----------------------------------------------------------------------------------------------------------+ - | | ``--first N`` | consider first N archives after other filters were applied | - +-----------------------------------------------------------------------------+---------------------------------------------------+-----------------------------------------------------------------------------------------------------------+ - | | ``--last N`` | consider last N archives after other filters were applied | - +-----------------------------------------------------------------------------+---------------------------------------------------+-----------------------------------------------------------------------------------------------------------+ - | | ``--oldest TIMESPAN`` | consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. 
| - +-----------------------------------------------------------------------------+---------------------------------------------------+-----------------------------------------------------------------------------------------------------------+ - | | ``--newest TIMESPAN`` | consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m. | - +-----------------------------------------------------------------------------+---------------------------------------------------+-----------------------------------------------------------------------------------------------------------+ - | | ``--older TIMESPAN`` | consider archives older than (now - TIMESPAN), e.g. 7d or 12m. | - +-----------------------------------------------------------------------------+---------------------------------------------------+-----------------------------------------------------------------------------------------------------------+ - | | ``--newer TIMESPAN`` | consider archives newer than (now - TIMESPAN), e.g. 7d or 12m. | - +-----------------------------------------------------------------------------+---------------------------------------------------+-----------------------------------------------------------------------------------------------------------+ + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ + | **optional arguments** | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ + | | ``-n``, ``--dry-run`` | do not change repository | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ + | | ``--list`` | output verbose list of archives | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ + | .. class:: borg-common-opt-ref | + | | + | :ref:`common_options` | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ + | **Archive filters** — Archive filters can be applied to repository targets. | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ + | | ``-a PATTERN``, ``--match-archives PATTERN`` | only consider archive names matching the pattern. see "borg help match-archives". 
| + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ + | | ``--sort-by KEYS`` | Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id; default is: timestamp | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ + | | ``--first N`` | consider first N archives after other filters were applied | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ + | | ``--last N`` | consider last N archives after other filters were applied | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ + | | ``--oldest TIMESPAN`` | consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ + | | ``--newest TIMESPAN`` | consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m. | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ + | | ``--older TIMESPAN`` | consider archives older than (now - TIMESPAN), e.g. 7d or 12m. | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ + | | ``--newer TIMESPAN`` | consider archives newer than (now - TIMESPAN), e.g. 7d or 12m. | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ .. raw:: html @@ -63,12 +55,8 @@ borg delete optional arguments - -n, --dry-run do not change repository - --list output verbose list of archives - --consider-checkpoints consider checkpoint archives for deletion (default: not considered). - -s, --stats print statistics for the deleted archive - --force force deletion of corrupted archives, use ``--force --force`` in case ``--force`` does not work. - -c SECONDS, --checkpoint-interval SECONDS write checkpoint every SECONDS seconds (Default: 1800) + -n, --dry-run do not change repository + --list output verbose list of archives :ref:`common_options` @@ -95,13 +83,6 @@ you run ``borg compact``. When in doubt, use ``--dry-run --list`` to see what would be deleted. 
-When using ``--stats``, you will get some statistics about how much data was -deleted - the "Deleted data" deduplicated size there is most interesting as -that is how much your repository will shrink. -Please note that the "All archives" stats refer to the state after deletion. - You can delete multiple archives by specifying a matching pattern, using the ``--match-archives PATTERN`` option (for more info on these patterns, -see :ref:`borg_patterns`). - -Always first use ``--dry-run --list`` to see what would be deleted. \ No newline at end of file +see :ref:`borg_patterns`). \ No newline at end of file diff --git a/docs/usage/general/environment.rst.inc b/docs/usage/general/environment.rst.inc index cd89f8d508..907cd28a32 100644 --- a/docs/usage/general/environment.rst.inc +++ b/docs/usage/general/environment.rst.inc @@ -88,9 +88,6 @@ General: BORG_CACHE_IMPL Choose the implementation for the clientside cache, choose one of: - - ``local``: uses a persistent chunks cache and keeps it in a perfect state (precise refcounts and - sizes), requiring a potentially resource expensive cache sync in multi-client scenarios. - Also has a persistent files cache. - ``adhoc``: builds a non-persistent chunks cache by querying the repo. Chunks cache contents are somewhat sloppy for already existing chunks, concerning their refcount ("infinite") and size (0). No files cache (slow, will chunk all input files). DEPRECATED. diff --git a/docs/usage/general/file-systems.rst.inc b/docs/usage/general/file-systems.rst.inc index 1fbe472465..d53eb96bf9 100644 --- a/docs/usage/general/file-systems.rst.inc +++ b/docs/usage/general/file-systems.rst.inc @@ -1,30 +1,37 @@ File systems ~~~~~~~~~~~~ -We strongly recommend against using Borg (or any other database-like -software) on non-journaling file systems like FAT, since it is not -possible to assume any consistency in case of power failures (or a -sudden disconnect of an external drive or similar failures). - -While Borg uses a data store that is resilient against these failures -when used on journaling file systems, it is not possible to guarantee -this with some hardware -- independent of the software used. We don't -know a list of affected hardware. - -If you are suspicious whether your Borg repository is still consistent -and readable after one of the failures mentioned above occurred, run -``borg check --verify-data`` to make sure it is consistent. - -.. rubric:: Requirements for Borg repository file systems - -- Long file names -- At least three directory levels with short names -- Typically, file sizes up to a few hundred MB. - Large repositories may require large files (>2 GB). -- Up to 1000 files per directory. -- rename(2) / MoveFile(Ex) should work as specified, i.e. on the same file system - it should be a move (not a copy) operation, and in case of a directory - it should fail if the destination exists and is not an empty directory, - since this is used for locking. -- Also hardlinks are used for more safe and secure file updating (e.g. of the repo - config file), but the code tries to work also if hardlinks are not supported. +We recommend using a reliable, scalable journaling filesystem for the +repository, e.g. zfs, btrfs, ext4, apfs. + +Borg now uses the ``borgstore`` package to implement the key/value store it +uses for the repository. + +It currently uses the ``file:`` Store (posixfs backend) either with a local +directory or via ssh and a remote ``borg serve`` agent using borgstore on the +remote side. 
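As a sketch only, creating a repository on each of the currently supported store types could look like the commands below (paths, host names and the encryption mode are example values; see the repository URL and ``borg rcreate`` sections for the authoritative syntax)::

    # "file:" store, i.e. a local directory:
    $ borg -r /mnt/backup/repo rcreate --encryption=repokey-aes-ocb

    # via ssh, talking to a remote "borg serve" that uses borgstore on the remote side:
    $ borg -r ssh://user@host/path/to/repo rcreate --encryption=repokey-aes-ocb

    # via sftp:
    $ borg -r sftp://user@host/path/to/repo rcreate --encryption=repokey-aes-ocb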
+ +This means that it will store each chunk into a separate filesystem file +(for more details, see the ``borgstore`` project). + +This has some pros and cons (compared to legacy borg 1.x's segment files): + +Pros: + +- Simplicity and better maintainability of the borg code. +- Sometimes faster, less I/O, better scalability: e.g. borg compact can just + remove unused chunks by deleting a single file and does not need to read + and re-write segment files to free space. +- In future, easier to adapt to other kinds of storage: + borgstore's backends are quite simple to implement. + A ``sftp:`` backend already exists, cloud storage might be easy to add. +- Parallel repository access with less locking is easier to implement. + +Cons: + +- The repository filesystem will have to deal with a big amount of files (there + are provisions in borgstore against having too many files in a single directory + by using a nested directory structure). +- Bigger fs space usage overhead (will depend on allocation block size - modern + filesystems like zfs are rather clever here using a variable block size). +- Sometimes slower, due to less sequential / more random access operations. diff --git a/docs/usage/general/repository-urls.rst.inc b/docs/usage/general/repository-urls.rst.inc index 2d1fc27d57..e6167d08aa 100644 --- a/docs/usage/general/repository-urls.rst.inc +++ b/docs/usage/general/repository-urls.rst.inc @@ -20,6 +20,9 @@ Note: you may also prepend a ``file://`` to a filesystem path to get URL style. ``ssh://user@host:port/~/path/to/repo`` - path relative to user's home directory +**Remote repositories** accessed via sftp: + +``sftp://user@host:port/path/to/repo`` - absolute path` If you frequently need the same repo URL, it is a good idea to set the ``BORG_REPO`` environment variable to set a default for the repo URL: diff --git a/docs/usage/import-tar.rst.inc b/docs/usage/import-tar.rst.inc index fcf0eaa5f1..a207008827 100644 --- a/docs/usage/import-tar.rst.inc +++ b/docs/usage/import-tar.rst.inc @@ -43,10 +43,6 @@ borg import-tar +-------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--timestamp TIMESTAMP`` | manually specify the archive creation date/time (yyyy-mm-ddThh:mm:ss[(+|-)HH:MM] format, (+|-)HH:MM is the UTC offset, default: local time zone). Alternatively, give a reference file/directory. 
| +-------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | | ``-c SECONDS``, ``--checkpoint-interval SECONDS`` | write checkpoint every SECONDS seconds (Default: 1800) | - +-------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | | ``--checkpoint-volume BYTES`` | write checkpoint every BYTES bytes (Default: 0, meaning no volume based checkpointing) | - +-------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--chunker-params PARAMS`` | specify the chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE). default: buzhash,19,23,21,4095 | +-------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``-C COMPRESSION``, ``--compression COMPRESSION`` | select compression algorithm, see the output of the "borg help compression" command for details. | @@ -83,8 +79,6 @@ borg import-tar Archive options --comment COMMENT add a comment text to the archive --timestamp TIMESTAMP manually specify the archive creation date/time (yyyy-mm-ddThh:mm:ss[(+|-)HH:MM] format, (+|-)HH:MM is the UTC offset, default: local time zone). Alternatively, give a reference file/directory. - -c SECONDS, --checkpoint-interval SECONDS write checkpoint every SECONDS seconds (Default: 1800) - --checkpoint-volume BYTES write checkpoint every BYTES bytes (Default: 0, meaning no volume based checkpointing) --chunker-params PARAMS specify the chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE). default: buzhash,19,23,21,4095 -C COMPRESSION, --compression COMPRESSION select compression algorithm, see the output of the "borg help compression" command for details. 
diff --git a/docs/usage/list.rst.inc b/docs/usage/list.rst.inc index bd99efcc00..88634bc86f 100644 --- a/docs/usage/list.rst.inc +++ b/docs/usage/list.rst.inc @@ -127,9 +127,7 @@ Keys available only when listing files in an archive: - flags: file flags - size: file size -- dsize: deduplicated size - num_chunks: number of chunks in this file -- unique_chunks: number of unique chunks in this file - mtime: file modification time - ctime: file change time diff --git a/docs/usage/mount.rst.inc b/docs/usage/mount.rst.inc index 777a86b1b8..ec8c634f09 100644 --- a/docs/usage/mount.rst.inc +++ b/docs/usage/mount.rst.inc @@ -21,8 +21,6 @@ borg mount +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ | **optional arguments** | +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ - | | ``--consider-checkpoints`` | Show checkpoint archives in the repository contents list (default: hidden). | - +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ | | ``-f``, ``--foreground`` | stay in foreground, do not daemonize | +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------+ | | ``-o`` | Extra mount options | @@ -81,7 +79,6 @@ borg mount optional arguments - --consider-checkpoints Show checkpoint archives in the repository contents list (default: hidden). -f, --foreground stay in foreground, do not daemonize -o Extra mount options --numeric-ids use numeric user and group identifiers from archive(s) diff --git a/docs/usage/prune.rst.inc b/docs/usage/prune.rst.inc index 0504f15d31..dfa46e1cc0 100644 --- a/docs/usage/prune.rst.inc +++ b/docs/usage/prune.rst.inc @@ -12,59 +12,53 @@ borg prune .. class:: borg-options-table - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | **optional arguments** | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``-n``, ``--dry-run`` | do not change repository | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``--force`` | force pruning of corrupted archives, use ``--force --force`` in case ``--force`` does not work. 
| - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``-s``, ``--stats`` | print statistics for the deleted archive | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``--list`` | output verbose list of archives it keeps/prunes | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``--short`` | use a less wide archive part format | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``--list-pruned`` | output verbose list of archives it prunes | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``--list-kept`` | output verbose list of archives it keeps | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``--format FORMAT`` | specify format for the archive part (default: "{archive:<36} {time} [{id}]") | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``--keep-within INTERVAL`` | keep all archives within this time interval | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``--keep-last``, ``--keep-secondly`` | number of secondly archives to keep | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``--keep-minutely`` | number of minutely archives to keep | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``-H``, ``--keep-hourly`` | number of hourly archives to keep | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``-d``, ``--keep-daily`` | number of daily archives to keep | - 
+-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``-w``, ``--keep-weekly`` | number of weekly archives to keep | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``-m``, ``--keep-monthly`` | number of monthly archives to keep | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``-y``, ``--keep-yearly`` | number of yearly archives to keep | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``-c SECONDS``, ``--checkpoint-interval SECONDS`` | write checkpoint every SECONDS seconds (Default: 1800) | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | .. class:: borg-common-opt-ref | - | | - | :ref:`common_options` | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | **Archive filters** — Archive filters can be applied to repository targets. | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``-a PATTERN``, ``--match-archives PATTERN`` | only consider archive names matching the pattern. see "borg help match-archives". | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``--oldest TIMESPAN`` | consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``--newest TIMESPAN`` | consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m. | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``--older TIMESPAN`` | consider archives older than (now - TIMESPAN), e.g. 7d or 12m. 
| - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ - | | ``--newer TIMESPAN`` | consider archives newer than (now - TIMESPAN), e.g. 7d or 12m. | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------+ + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | **optional arguments** | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | | ``-n``, ``--dry-run`` | do not change repository | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | | ``--list`` | output verbose list of archives it keeps/prunes | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | | ``--short`` | use a less wide archive part format | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | | ``--list-pruned`` | output verbose list of archives it prunes | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | | ``--list-kept`` | output verbose list of archives it keeps | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | | ``--format FORMAT`` | specify format for the archive part (default: "{archive:<36} {time} [{id}]") | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | | ``--keep-within INTERVAL`` | keep all archives within this time interval | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | | ``--keep-last``, ``--keep-secondly`` | number of secondly archives to keep | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | | ``--keep-minutely`` | number of minutely archives to keep | + 
+-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | | ``-H``, ``--keep-hourly`` | number of hourly archives to keep | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | | ``-d``, ``--keep-daily`` | number of daily archives to keep | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | | ``-w``, ``--keep-weekly`` | number of weekly archives to keep | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | | ``-m``, ``--keep-monthly`` | number of monthly archives to keep | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | | ``-y``, ``--keep-yearly`` | number of yearly archives to keep | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | .. class:: borg-common-opt-ref | + | | + | :ref:`common_options` | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | **Archive filters** — Archive filters can be applied to repository targets. | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | | ``-a PATTERN``, ``--match-archives PATTERN`` | only consider archive names matching the pattern. see "borg help match-archives". | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | | ``--oldest TIMESPAN`` | consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | | ``--newest TIMESPAN`` | consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m. | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | | ``--older TIMESPAN`` | consider archives older than (now - TIMESPAN), e.g. 7d or 12m. 
| + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ + | | ``--newer TIMESPAN`` | consider archives newer than (now - TIMESPAN), e.g. 7d or 12m. | + +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+ .. raw:: html @@ -80,8 +74,6 @@ borg prune optional arguments -n, --dry-run do not change repository - --force force pruning of corrupted archives, use ``--force --force`` in case ``--force`` does not work. - -s, --stats print statistics for the deleted archive --list output verbose list of archives it keeps/prunes --short use a less wide archive part format --list-pruned output verbose list of archives it prunes @@ -95,7 +87,6 @@ borg prune -w, --keep-weekly number of weekly archives to keep -m, --keep-monthly number of monthly archives to keep -y, --keep-yearly number of yearly archives to keep - -c SECONDS, --checkpoint-interval SECONDS write checkpoint every SECONDS seconds (Default: 1800) :ref:`common_options` @@ -122,11 +113,6 @@ certain number of historic backups. This retention policy is commonly referred t `GFS `_ (Grandfather-father-son) backup rotation scheme. -Also, prune automatically removes checkpoint archives (incomplete archives left -behind by interrupted backup runs) except if the checkpoint is the latest -archive (and thus still needed). Checkpoint archives are not considered when -comparing archive counts against the retention limits (``--keep-X``). - If you use --match-archives (-a), then only archives that match the pattern are considered for deletion and only those archives count towards the totals specified by the rules. @@ -162,11 +148,6 @@ The ``--keep-last N`` option is doing the same as ``--keep-secondly N`` (and it keep the last N archives under the assumption that you do not create more than one backup archive in the same second). -When using ``--stats``, you will get some statistics about how much data was -deleted - the "Deleted data" deduplicated size there is most interesting as -that is how much your repository will shrink. -Please note that the "All archives" stats refer to the state after pruning. - You can influence how the ``--list`` output is formatted by using the ``--short`` option (less wide output) or by giving a custom format using ``--format`` (see the ``borg rlist`` description for more details about the format string). 
\ No newline at end of file diff --git a/docs/usage/rcompress.rst.inc b/docs/usage/rcompress.rst.inc index 97d19c247b..9d3861895a 100644 --- a/docs/usage/rcompress.rst.inc +++ b/docs/usage/rcompress.rst.inc @@ -19,8 +19,6 @@ borg rcompress +-------------------------------------------------------+---------------------------------------------------+--------------------------------------------------------------------------------------------------+ | | ``-s``, ``--stats`` | print statistics | +-------------------------------------------------------+---------------------------------------------------+--------------------------------------------------------------------------------------------------+ - | | ``-c SECONDS``, ``--checkpoint-interval SECONDS`` | write checkpoint every SECONDS seconds (Default: 1800) | - +-------------------------------------------------------+---------------------------------------------------+--------------------------------------------------------------------------------------------------+ | .. class:: borg-common-opt-ref | | | | :ref:`common_options` | @@ -41,7 +39,6 @@ borg rcompress optional arguments -C COMPRESSION, --compression COMPRESSION select compression algorithm, see the output of the "borg help compression" command for details. -s, --stats print statistics - -c SECONDS, --checkpoint-interval SECONDS write checkpoint every SECONDS seconds (Default: 1800) :ref:`common_options` @@ -52,20 +49,14 @@ Description Repository (re-)compression (and/or re-obfuscation). -Reads all chunks in the repository (in on-disk order, this is important for -compaction) and recompresses them if they are not already using the compression -type/level and obfuscation level given via ``--compression``. +Reads all chunks in the repository and recompresses them if they are not already +using the compression type/level and obfuscation level given via ``--compression``. If the outcome of the chunk processing indicates a change in compression type/level or obfuscation level, the processed chunk is written to the repository. Please note that the outcome might not always be the desired compression type/level - if no compression gives a shorter output, that might be chosen. -Every ``--checkpoint-interval``, progress is committed to the repository and -the repository is compacted (this is to keep temporary repo space usage in bounds). -A lower checkpoint interval means lower temporary repo space usage, but also -slower progress due to higher overhead (and vice versa). - Please note that this command can not work in low (or zero) free disk space conditions. 
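A short sketch of switching a repository to a different compression (the compression spec is just an example, see ``borg help compression`` for the available choices)::

    # recompress all chunks that do not already use zstd level 9, print statistics:
    $ borg rcompress -s -C zstd,9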
diff --git a/docs/usage/rcreate.rst.inc b/docs/usage/rcreate.rst.inc index 9082fb5621..b61acd497c 100644 --- a/docs/usage/rcreate.rst.inc +++ b/docs/usage/rcreate.rst.inc @@ -17,6 +17,8 @@ borg rcreate +-------------------------------------------------------+------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--other-repo SRC_REPOSITORY`` | reuse the key material from the other repository | +-------------------------------------------------------+------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | | ``--from-borg1`` | other repository is borg 1.x | + +-------------------------------------------------------+------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``-e MODE``, ``--encryption MODE`` | select encryption key mode **(required)** | +-------------------------------------------------------+------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--append-only`` | create an append-only mode repository. Note that this only affects the low level structure of the repository, and running `delete` or `prune` will still be allowed. See :ref:`append_only_mode` in Additional Notes for more details. | @@ -46,6 +48,7 @@ borg rcreate optional arguments --other-repo SRC_REPOSITORY reuse the key material from the other repository + --from-borg1 other repository is borg 1.x -e MODE, --encryption MODE select encryption key mode **(required)** --append-only create an append-only mode repository. Note that this only affects the low level structure of the repository, and running `delete` or `prune` will still be allowed. See :ref:`append_only_mode` in Additional Notes for more details. --storage-quota QUOTA Set storage quota of the new repository (e.g. 5G, 1.5T). Default: no quota. @@ -59,8 +62,8 @@ borg rcreate Description ~~~~~~~~~~~ -This command creates a new, empty repository. A repository is a filesystem -directory containing the deduplicated data from zero or more archives. +This command creates a new, empty repository. A repository is a ``borgstore`` store +containing the deduplicated data from zero or more archives. Encryption mode TLDR ++++++++++++++++++++ @@ -173,4 +176,12 @@ Optionally, if you use ``--copy-crypt-key`` you can also keep the same crypt_key (used for authenticated encryption). Might be desired e.g. if you want to have less keys to manage. -Creating related repositories is useful e.g. if you want to use ``borg transfer`` later. \ No newline at end of file +Creating related repositories is useful e.g. if you want to use ``borg transfer`` later. 
+ +Creating a related repository for data migration from borg 1.2 or 1.4 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +You can use ``borg rcreate --other-repo ORIG_REPO --from-borg1 ...`` to create a related +repository that uses the same secret key material as the given other/original repository. + +Then use ``borg transfer --other-repo ORIG_REPO --from-borg1 ...`` to transfer the archives. \ No newline at end of file diff --git a/docs/usage/recreate.rst.inc b/docs/usage/recreate.rst.inc index 16ee53b211..46f0638c4d 100644 --- a/docs/usage/recreate.rst.inc +++ b/docs/usage/recreate.rst.inc @@ -67,10 +67,6 @@ borg recreate +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--target TARGET`` | create a new archive with the name ARCHIVE, do not replace existing archive (only applies for a single archive) | +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | | ``-c SECONDS``, ``--checkpoint-interval SECONDS`` | write checkpoint every SECONDS seconds (Default: 1800) | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | | ``--checkpoint-volume BYTES`` | write checkpoint every BYTES bytes (Default: 0, meaning no volume based checkpointing) | - +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--comment COMMENT`` | add a comment text to the archive | +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--timestamp TIMESTAMP`` | manually specify the archive creation date/time (yyyy-mm-ddThh:mm:ss[(+|-)HH:MM] format, (+|-)HH:MM is the UTC offset, default: local time zone). Alternatively, give a reference file/directory. | @@ -115,21 +111,19 @@ borg recreate Archive filters - -a PATTERN, --match-archives PATTERN only consider archive names matching the pattern. see "borg help match-archives". - --sort-by KEYS Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id; default is: timestamp - --first N consider first N archives after other filters were applied - --last N consider last N archives after other filters were applied - --oldest TIMESPAN consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. 
- --newest TIMESPAN consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m. - --older TIMESPAN consider archives older than (now - TIMESPAN), e.g. 7d or 12m. - --newer TIMESPAN consider archives newer than (now - TIMESPAN), e.g. 7d or 12m. - --target TARGET create a new archive with the name ARCHIVE, do not replace existing archive (only applies for a single archive) - -c SECONDS, --checkpoint-interval SECONDS write checkpoint every SECONDS seconds (Default: 1800) - --checkpoint-volume BYTES write checkpoint every BYTES bytes (Default: 0, meaning no volume based checkpointing) - --comment COMMENT add a comment text to the archive - --timestamp TIMESTAMP manually specify the archive creation date/time (yyyy-mm-ddThh:mm:ss[(+|-)HH:MM] format, (+|-)HH:MM is the UTC offset, default: local time zone). Alternatively, give a reference file/directory. - -C COMPRESSION, --compression COMPRESSION select compression algorithm, see the output of the "borg help compression" command for details. - --chunker-params PARAMS rechunk using given chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE) or `default` to use the chunker defaults. default: do not rechunk + -a PATTERN, --match-archives PATTERN only consider archive names matching the pattern. see "borg help match-archives". + --sort-by KEYS Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id; default is: timestamp + --first N consider first N archives after other filters were applied + --last N consider last N archives after other filters were applied + --oldest TIMESPAN consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. + --newest TIMESPAN consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m. + --older TIMESPAN consider archives older than (now - TIMESPAN), e.g. 7d or 12m. + --newer TIMESPAN consider archives newer than (now - TIMESPAN), e.g. 7d or 12m. + --target TARGET create a new archive with the name ARCHIVE, do not replace existing archive (only applies for a single archive) + --comment COMMENT add a comment text to the archive + --timestamp TIMESTAMP manually specify the archive creation date/time (yyyy-mm-ddThh:mm:ss[(+|-)HH:MM] format, (+|-)HH:MM is the UTC offset, default: local time zone). Alternatively, give a reference file/directory. + -C COMPRESSION, --compression COMPRESSION select compression algorithm, see the output of the "borg help compression" command for details. + --chunker-params PARAMS rechunk using given chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE) or `default` to use the chunker defaults. default: do not rechunk Description diff --git a/docs/usage/rinfo.rst.inc b/docs/usage/rinfo.rst.inc index fa57cdebd7..098a7c7eaa 100644 --- a/docs/usage/rinfo.rst.inc +++ b/docs/usage/rinfo.rst.inc @@ -44,13 +44,4 @@ borg rinfo Description ~~~~~~~~~~~ -This command displays detailed information about the repository. - -Please note that the deduplicated sizes of the individual archives do not add -up to the deduplicated size of the repository ("all archives"), because the two -are meaning different things: - -This archive / deduplicated size = amount of data stored ONLY for this archive -= unique chunks of this archive. -All archives / deduplicated size = amount of data stored in the repo -= all chunks in the repository. 
\ No newline at end of file +This command displays detailed information about the repository. \ No newline at end of file diff --git a/docs/usage/rlist.rst.inc b/docs/usage/rlist.rst.inc index 2c3b5fcb88..d96d1df71b 100644 --- a/docs/usage/rlist.rst.inc +++ b/docs/usage/rlist.rst.inc @@ -15,8 +15,6 @@ borg rlist +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | **optional arguments** | +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | | ``--consider-checkpoints`` | Show checkpoint archives in the repository contents list (default: hidden). | - +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--short`` | only print the archive names, nothing else | +-----------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--format FORMAT`` | specify format for archive listing (default: "{archive:<36} {time} [{id}]{NL}") | @@ -59,7 +57,6 @@ borg rlist optional arguments - --consider-checkpoints Show checkpoint archives in the repository contents list (default: hidden). --short only print the archive names, nothing else --format FORMAT specify format for archive listing (default: "{archive:<36} {time} [{id}]{NL}") --json Format output as JSON. The form of ``--format`` is ignored, but keys used in it are added to the JSON output. Some keys are always present. Note: JSON can only represent text. diff --git a/docs/usage/rspace.rst b/docs/usage/rspace.rst new file mode 100644 index 0000000000..0913340fd6 --- /dev/null +++ b/docs/usage/rspace.rst @@ -0,0 +1 @@ +.. include:: rspace.rst.inc diff --git a/docs/usage/rspace.rst.inc b/docs/usage/rspace.rst.inc new file mode 100644 index 0000000000..28e8ce62a3 --- /dev/null +++ b/docs/usage/rspace.rst.inc @@ -0,0 +1,80 @@ +.. IMPORTANT: this file is auto-generated from borg's built-in help, do not edit! + +.. _borg_rspace: + +borg rspace +----------- +.. code-block:: none + + borg [common options] rspace [options] + +.. only:: html + + .. class:: borg-options-table + + +-------------------------------------------------------+---------------------+---------------------------------------------------------------------+ + | **optional arguments** | + +-------------------------------------------------------+---------------------+---------------------------------------------------------------------+ + | | ``--reserve SPACE`` | Amount of space to reserve (e.g. 100M, 1G). Default: 0. 
| + +-------------------------------------------------------+---------------------+---------------------------------------------------------------------+ + | | ``--free`` | Free all reserved space. Don't forget to reserve space later again. | + +-------------------------------------------------------+---------------------+---------------------------------------------------------------------+ + | .. class:: borg-common-opt-ref | + | | + | :ref:`common_options` | + +-------------------------------------------------------+---------------------+---------------------------------------------------------------------+ + + .. raw:: html + + + +.. only:: latex + + + + optional arguments + --reserve SPACE Amount of space to reserve (e.g. 100M, 1G). Default: 0. + --free Free all reserved space. Don't forget to reserve space later again. + + + :ref:`common_options` + | + +Description +~~~~~~~~~~~ + +This command manages reserved space in a repository. + +Borg can not work in disk-full conditions (can not lock a repo and thus can +not run prune/delete or compact operations to free disk space). + +To avoid running into dead-end situations like that, you can put some objects +into a repository that take up some disk space. If you ever run into a +disk-full situation, you can free that space and then borg will be able to +run normally, so you can free more disk space by using prune/delete/compact. +After that, don't forget to reserve space again, in case you run into that +situation again at a later time. + +Examples:: + + # Create a new repository: + $ borg rcreate ... + # Reserve approx. 1GB of space for emergencies: + $ borg rspace --reserve 1G + + # Check amount of reserved space in the repository: + $ borg rspace + + # EMERGENCY! Free all reserved space to get things back to normal: + $ borg rspace --free + $ borg prune ... + $ borg delete ... + $ borg compact -v # only this actually frees space of deleted archives + $ borg rspace --reserve 1G # reserve space again for next time + + +Reserved space is always rounded up to use full reservation blocks of 64MiB. 
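As a rough worked example (assuming the requested amount is simply rounded up to the next full 64 MiB block; the exact parsing of the size suffix is not shown here): ``--reserve 100M`` would end up reserving 128 MiB, i.e. two reservation blocks, whether ``100M`` is read as 100 MB or 100 MiB.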
\ No newline at end of file diff --git a/docs/usage/transfer.rst.inc b/docs/usage/transfer.rst.inc index f1c3d570ad..5b6d453351 100644 --- a/docs/usage/transfer.rst.inc +++ b/docs/usage/transfer.rst.inc @@ -19,6 +19,8 @@ borg transfer +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--other-repo SRC_REPOSITORY`` | transfer archives from the other repository | +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | | ``--from-borg1`` | other repository is borg 1.x | + +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``--upgrader UPGRADER`` | use the upgrader to convert transferred data (default: no conversion) | +-----------------------------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | | ``-C COMPRESSION``, ``--compression COMPRESSION`` | select compression algorithm, see the output of the "borg help compression" command for details. | @@ -63,6 +65,7 @@ borg transfer optional arguments -n, --dry-run do not change repository, just check --other-repo SRC_REPOSITORY transfer archives from the other repository + --from-borg1 other repository is borg 1.x --upgrader UPGRADER use the upgrader to convert transferred data (default: no conversion) -C COMPRESSION, --compression COMPRESSION select compression algorithm, see the output of the "borg help compression" command for details. --recompress MODE recompress data chunks according to `MODE` and ``--compression``. Possible modes are `always`: recompress unconditionally; and `never`: do not recompress (faster: re-uses compressed data chunks w/o change).If no MODE is given, `always` will be used. Not passing --recompress is equivalent to "--recompress never". @@ -96,31 +99,40 @@ any case) and keep data compressed "as is" (saves time as no data compression is If you want to globally change compression while transferring archives to the DST_REPO, give ``--compress=WANTED_COMPRESSION --recompress=always``. 
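As a concrete instance of the above (a hypothetical borg2-to-borg2 invocation that merely combines the options already documented here; ``SRC_REPO``/``DST_REPO`` are placeholders)::

    # transfer archives and recompress all data chunks to zstd,3 on the way:
    borg --repo=DST_REPO transfer --other-repo=SRC_REPO --compress=zstd,3 --recompress=always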
-Suggested use for general purpose archive transfer (not repo upgrades):: +The default is to transfer all archives. + +You could use the misc. archive filter options to limit which archives it will +transfer, e.g. using the ``-a`` option. This is recommended for big +repositories with multiple data sets to keep the runtime per invocation lower. + +General purpose archive transfer +++++++++++++++++++++++++++++++++ + +Transfer borg2 archives into a related other borg2 repository:: # create a related DST_REPO (reusing key material from SRC_REPO), so that # chunking and chunk id generation will work in the same way as before. - borg --repo=DST_REPO rcreate --other-repo=SRC_REPO --encryption=DST_ENC + borg --repo=DST_REPO rcreate --encryption=DST_ENC --other-repo=SRC_REPO # transfer archives from SRC_REPO to DST_REPO borg --repo=DST_REPO transfer --other-repo=SRC_REPO --dry-run # check what it would do borg --repo=DST_REPO transfer --other-repo=SRC_REPO # do it! borg --repo=DST_REPO transfer --other-repo=SRC_REPO --dry-run # check! anything left? -The default is to transfer all archives, including checkpoint archives. -You could use the misc. archive filter options to limit which archives it will -transfer, e.g. using the ``-a`` option. This is recommended for big -repositories with multiple data sets to keep the runtime per invocation lower. +Data migration / upgrade from borg 1.x +++++++++++++++++++++++++++++++++++++++ + +To migrate your borg 1.x archives into a related, new borg2 repository, usage is quite similar +to the above, but you need the ``--from-borg1`` option:: -For repository upgrades (e.g. from a borg 1.2 repo to a related borg 2.0 repo), usage is -quite similar to the above:: + borg --repo=DST_REPO rcreate --encryption=DST_ENC --other-repo=SRC_REPO --from-borg1 - # fast: compress metadata with zstd,3, but keep data chunks compressed as they are: - borg --repo=DST_REPO transfer --other-repo=SRC_REPO --upgrader=From12To20 \ - --compress=zstd,3 --recompress=never + # to continue using lz4 compression as you did in SRC_REPO: + borg --repo=DST_REPO transfer --other-repo=SRC_REPO --from-borg1 \ + --compress=lz4 --recompress=never - # compress metadata and recompress data with zstd,3 - borg --repo=DST_REPO transfer --other-repo=SRC_REPO --upgrader=From12To20 \ + # alternatively, to recompress everything to zstd,3: + borg --repo=DST_REPO transfer --other-repo=SRC_REPO --from-borg1 \ --compress=zstd,3 --recompress=always diff --git a/pyproject.toml b/pyproject.toml index 3003d9bbbe..40c374cba7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,8 @@ dependencies = [ "platformdirs >=3.0.0, <5.0.0; sys_platform == 'darwin'", # for macOS: breaking changes in 3.0.0, "platformdirs >=2.6.0, <5.0.0; sys_platform != 'darwin'", # for others: 2.6+ works consistently. "argon2-cffi", + "borgstore", + ] [project.optional-dependencies] diff --git a/requirements.d/codestyle.txt b/requirements.d/codestyle.txt index b9e137e680..4a92e2c83c 100644 --- a/requirements.d/codestyle.txt +++ b/requirements.d/codestyle.txt @@ -1 +1 @@ -black >=23.0, <24 +black >=24.0, <25 diff --git a/scripts/fuzz-cache-sync/HOWTO b/scripts/fuzz-cache-sync/HOWTO deleted file mode 100644 index ae144b287f..0000000000 --- a/scripts/fuzz-cache-sync/HOWTO +++ /dev/null @@ -1,10 +0,0 @@ -- Install AFL and the requirements for LLVM mode (see docs) -- Compile the fuzzing target, e.g. 
- - AFL_HARDEN=1 afl-clang-fast main.c -o fuzz-target -O3 - - (other options, like using ASan or MSan are possible as well) -- Add additional test cases to testcase_dir -- Run afl, easiest (but inefficient) way; - - afl-fuzz -i testcase_dir -o findings_dir ./fuzz-target diff --git a/scripts/fuzz-cache-sync/main.c b/scripts/fuzz-cache-sync/main.c deleted file mode 100644 index c65dd272d1..0000000000 --- a/scripts/fuzz-cache-sync/main.c +++ /dev/null @@ -1,33 +0,0 @@ - -#define BORG_NO_PYTHON - -#include "../../src/borg/_hashindex.c" -#include "../../src/borg/cache_sync/cache_sync.c" - -#define BUFSZ 32768 - -int main() { - char buf[BUFSZ]; - int len, ret; - CacheSyncCtx *ctx; - HashIndex *idx; - - /* capacity, key size, value size */ - idx = hashindex_init(0, 32, 12); - ctx = cache_sync_init(idx); - - while (1) { - len = read(0, buf, BUFSZ); - if (!len) { - break; - } - ret = cache_sync_feed(ctx, buf, len); - if(!ret && cache_sync_error(ctx)) { - fprintf(stderr, "error: %s\n", cache_sync_error(ctx)); - return 1; - } - } - hashindex_free(idx); - cache_sync_free(ctx); - return 0; -} diff --git a/scripts/fuzz-cache-sync/testcase_dir/test_simple b/scripts/fuzz-cache-sync/testcase_dir/test_simple deleted file mode 100644 index d5f6670c15..0000000000 Binary files a/scripts/fuzz-cache-sync/testcase_dir/test_simple and /dev/null differ diff --git a/src/borg/archive.py b/src/borg/archive.py index fe19b4b6b5..c8a7399112 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -22,14 +22,14 @@ from . import xattr from .chunker import get_chunker, Chunk -from .cache import ChunkListEntry +from .cache import ChunkListEntry, build_chunkindex_from_repo from .crypto.key import key_factory, UnsupportedPayloadError from .compress import CompressionSpec from .constants import * # NOQA from .crypto.low_level import IntegrityError as IntegrityErrorBase from .helpers import BackupError, BackupRaceConditionError from .helpers import BackupOSError, BackupPermissionError, BackupFileNotFoundError, BackupIOError -from .hashindex import ChunkIndex, ChunkIndexEntry, CacheSynchronizer +from .hashindex import ChunkIndex, ChunkIndexEntry from .helpers import HardLinkManager from .helpers import ChunkIteratorFileWrapper, open_item from .helpers import Error, IntegrityError, set_ec @@ -44,14 +44,13 @@ from .helpers import os_open, flags_normal, flags_dir from .helpers import os_stat from .helpers import msgpack -from .helpers import sig_int from .helpers.lrucache import LRUCache from .manifest import Manifest from .patterns import PathPrefixPattern, FnmatchPattern, IECommand from .item import Item, ArchiveItem, ItemDiff from .platform import acl_get, acl_set, set_flags, get_flags, swidth, hostname -from .remote import cache_if_remote -from .repository import Repository, LIST_SCAN_LIMIT +from .remote import RemoteRepository, cache_if_remote +from .repository import Repository, NoManifestError from .repoobj import RepoObj has_link = hasattr(os, "link") @@ -123,7 +122,6 @@ def __repr__(self): def as_dict(self): return { "original_size": FileSize(self.osize, iec=self.iec), - "deduplicated_size": FileSize(self.usize, iec=self.iec), "nfiles": self.nfiles, "hashing_time": self.hashing_time, "chunking_time": self.chunking_time, @@ -356,18 +354,6 @@ def flush(self, flush=False): def is_full(self): return self.buffer.tell() > self.BUFFER_SIZE - def save_chunks_state(self): - # as we only append to self.chunks, remembering the current length is good enough - self.saved_chunks_len = len(self.chunks) - - def 
restore_chunks_state(self): - scl = self.saved_chunks_len - assert scl is not None, "forgot to call save_chunks_state?" - tail_chunks = self.chunks[scl:] - del self.chunks[scl:] - self.saved_chunks_len = None - return tail_chunks - class CacheChunkBuffer(ChunkBuffer): def __init__(self, cache, key, stats, chunker_params=ITEMS_CHUNKER_PARAMS): @@ -506,14 +492,8 @@ def __init__( self.create = create if self.create: self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats) - if name in manifest.archives: + if manifest.archives.exists(name): raise self.AlreadyExists(name) - i = 0 - while True: - self.checkpoint_name = "{}.checkpoint{}".format(name, i and (".%d" % i) or "") - if self.checkpoint_name not in manifest.archives: - break - i += 1 else: info = self.manifest.archives.get(name) if info is None: @@ -628,35 +608,9 @@ def add_item(self, item, show_progress=True, stats=None): stats.show_progress(item=item, dt=0.2) self.items_buffer.add(item) - def prepare_checkpoint(self): - # we need to flush the archive metadata stream to repo chunks, so that - # we have the metadata stream chunks WITHOUT the part file item we add later. - # The part file item will then get into its own metadata stream chunk, which we - # can easily NOT include into the next checkpoint or the final archive. - self.items_buffer.flush(flush=True) - # remember the current state of self.chunks, which corresponds to the flushed chunks - self.items_buffer.save_chunks_state() - - def write_checkpoint(self): - metadata = self.save(self.checkpoint_name) - # that .save() has committed the repo. - # at next commit, we won't need this checkpoint archive any more because we will then - # have either a newer checkpoint archive or the final archive. - # so we can already remove it here, the next .save() will then commit this cleanup. - # remove its manifest entry, remove its ArchiveItem chunk, remove its item_ptrs chunks: - del self.manifest.archives[self.checkpoint_name] - self.cache.chunk_decref(self.id, 1, self.stats) - for id in metadata.item_ptrs: - self.cache.chunk_decref(id, 1, self.stats) - # also get rid of that part item, we do not want to have it in next checkpoint or final archive - tail_chunks = self.items_buffer.restore_chunks_state() - # tail_chunks contain the tail of the archive items metadata stream, not needed for next commit. - for id in tail_chunks: - self.cache.chunk_decref(id, 1, self.stats) # TODO can we have real size here? 
- def save(self, name=None, comment=None, timestamp=None, stats=None, additional_metadata=None): name = name or self.name - if name in self.manifest.archives: + if self.manifest.archives.exists(name): raise self.AlreadyExists(name) self.items_buffer.flush(flush=True) item_ptrs = archive_put_items( @@ -703,41 +657,13 @@ def save(self, name=None, comment=None, timestamp=None, stats=None, additional_m raise while self.repository.async_response(wait=True) is not None: pass - self.manifest.archives[name] = (self.id, metadata.time) + self.manifest.archives.create(name, self.id, metadata.time) self.manifest.write() - self.repository.commit(compact=False) - self.cache.commit() return metadata def calc_stats(self, cache, want_unique=True): - if not want_unique: - unique_size = 0 - else: - - def add(id): - entry = cache.chunks[id] - archive_index.add(id, 1, entry.size) - - archive_index = ChunkIndex() - sync = CacheSynchronizer(archive_index) - add(self.id) - # we must escape any % char in the archive name, because we use it in a format string, see #6500 - arch_name_escd = self.name.replace("%", "%%") - pi = ProgressIndicatorPercent( - total=len(self.metadata.items), - msg="Calculating statistics for archive %s ... %%3.0f%%%%" % arch_name_escd, - msgid="archive.calc_stats", - ) - for id, chunk in zip(self.metadata.items, self.repository.get_many(self.metadata.items)): - pi.show(increase=1) - add(id) - _, data = self.repo_objs.parse(id, chunk, ro_type=ROBJ_ARCHIVE_STREAM) - sync.feed(data) - unique_size = archive_index.stats_against(cache.chunks)[1] - pi.finish() - stats = Statistics(iec=self.iec) - stats.usize = unique_size + stats.usize = 0 # this is expensive to compute stats.nfiles = self.metadata.nfiles stats.osize = self.metadata.size return stats @@ -1025,96 +951,22 @@ def set_meta(self, key, value): data = self.key.pack_metadata(metadata.as_dict()) new_id = self.key.id_hash(data) self.cache.add_chunk(new_id, {}, data, stats=self.stats, ro_type=ROBJ_ARCHIVE_META) - self.manifest.archives[self.name] = (new_id, metadata.time) - self.cache.chunk_decref(self.id, 1, self.stats) + self.manifest.archives.create(self.name, new_id, metadata.time, overwrite=True) self.id = new_id def rename(self, name): - if name in self.manifest.archives: + if self.manifest.archives.exists(name): raise self.AlreadyExists(name) oldname = self.name self.name = name self.set_meta("name", name) - del self.manifest.archives[oldname] - - def delete(self, stats, progress=False, forced=False): - class ChunksIndexError(Error): - """Chunk ID {} missing from chunks index, corrupted chunks index - aborting transaction.""" - - exception_ignored = object() - - def fetch_async_response(wait=True): - try: - return self.repository.async_response(wait=wait) - except Repository.ObjectNotFound: - nonlocal error - # object not in repo - strange, but we wanted to delete it anyway. 
- if forced == 0: - raise - error = True - return exception_ignored # must not return None here - - def chunk_decref(id, size, stats): - try: - self.cache.chunk_decref(id, size, stats, wait=False) - except KeyError: - nonlocal error - if forced == 0: - cid = bin_to_hex(id) - raise ChunksIndexError(cid) - error = True - else: - fetch_async_response(wait=False) + self.manifest.archives.delete(oldname) - error = False - try: - unpacker = msgpack.Unpacker(use_list=False) - items_ids = self.metadata.items - pi = ProgressIndicatorPercent( - total=len(items_ids), msg="Decrementing references %3.0f%%", msgid="archive.delete" - ) - for i, (items_id, data) in enumerate(zip(items_ids, self.repository.get_many(items_ids))): - if progress: - pi.show(i) - _, data = self.repo_objs.parse(items_id, data, ro_type=ROBJ_ARCHIVE_STREAM) - unpacker.feed(data) - chunk_decref(items_id, 1, stats) - try: - for item in unpacker: - item = Item(internal_dict=item) - if "chunks" in item: - for chunk_id, size in item.chunks: - chunk_decref(chunk_id, size, stats) - except (TypeError, ValueError): - # if items metadata spans multiple chunks and one chunk got dropped somehow, - # it could be that unpacker yields bad types - if forced == 0: - raise - error = True - if progress: - pi.finish() - except (msgpack.UnpackException, Repository.ObjectNotFound): - # items metadata corrupted - if forced == 0: - raise - error = True - - # delete the blocks that store all the references that end up being loaded into metadata.items: - for id in self.metadata.item_ptrs: - chunk_decref(id, 1, stats) - - # in forced delete mode, we try hard to delete at least the manifest entry, - # if possible also the archive superblock, even if processing the items raises - # some harmless exception. - chunk_decref(self.id, 1, stats) - del self.manifest.archives[self.name] - while fetch_async_response(wait=True) is not None: - # we did async deletes, process outstanding results (== exceptions), - # so there is nothing pending when we return and our caller wants to commit. - pass - if error: - logger.warning("forced deletion succeeded, but the deleted archive was corrupted.") - logger.warning("borg check --repair is required to free all space.") + def delete(self): + # quick and dirty: we just nuke the archive from the archives list - that will + # potentially orphan all chunks previously referenced by the archive, except the ones also + # referenced by other archives. In the end, "borg compact" will clean up and free space. 
+ self.manifest.archives.delete(self.name) @staticmethod def compare_archives_iter( @@ -1261,60 +1113,11 @@ def cached_hash(chunk, id_hash): class ChunksProcessor: # Processes an iterator of chunks for an Item - def __init__( - self, - *, - key, - cache, - add_item, - prepare_checkpoint, - write_checkpoint, - checkpoint_interval, - checkpoint_volume, - rechunkify, - ): + def __init__(self, *, key, cache, add_item, rechunkify): self.key = key self.cache = cache self.add_item = add_item - self.prepare_checkpoint = prepare_checkpoint - self.write_checkpoint = write_checkpoint self.rechunkify = rechunkify - # time interval based checkpointing - self.checkpoint_interval = checkpoint_interval - self.last_checkpoint = time.monotonic() - # file content volume based checkpointing - self.checkpoint_volume = checkpoint_volume - self.current_volume = 0 - self.last_volume_checkpoint = 0 - - def write_part_file(self, item): - self.prepare_checkpoint() - item = Item(internal_dict=item.as_dict()) - # for borg recreate, we already have a size member in the source item (giving the total file size), - # but we consider only a part of the file here, thus we must recompute the size from the chunks: - item.get_size(memorize=True, from_chunks=True) - item.path += ".borg_part" - self.add_item(item, show_progress=False) - self.write_checkpoint() - - def maybe_checkpoint(self, item): - checkpoint_done = False - sig_int_triggered = sig_int and sig_int.action_triggered() - if ( - sig_int_triggered - or (self.checkpoint_interval and time.monotonic() - self.last_checkpoint > self.checkpoint_interval) - or (self.checkpoint_volume and self.current_volume - self.last_volume_checkpoint >= self.checkpoint_volume) - ): - if sig_int_triggered: - logger.info("checkpoint requested: starting checkpoint creation...") - self.write_part_file(item) - checkpoint_done = True - self.last_checkpoint = time.monotonic() - self.last_volume_checkpoint = self.current_volume - if sig_int_triggered: - sig_int.action_completed() - logger.info("checkpoint requested: finished checkpoint creation!") - return checkpoint_done # whether a checkpoint archive was created def process_file_chunks(self, item, cache, stats, show_progress, chunk_iter, chunk_processor=None): if not chunk_processor: @@ -1335,16 +1138,13 @@ def chunk_processor(chunk): for chunk in chunk_iter: chunk_entry = chunk_processor(chunk) item.chunks.append(chunk_entry) - self.current_volume += chunk_entry[1] if show_progress: stats.show_progress(item=item, dt=0.2) - self.maybe_checkpoint(item) class FilesystemObjectProcessors: # When ported to threading, then this doesn't need chunker, cache, key any more. - # write_checkpoint should then be in the item buffer, - # and process_file becomes a callback passed to __init__. + # process_file becomes a callback passed to __init__. def __init__( self, @@ -1508,20 +1308,11 @@ def process_pipe(self, *, path, cache, fd, mode, user=None, group=None): item.uid = uid if gid is not None: item.gid = gid - try: - self.process_file_chunks( - item, cache, self.stats, self.show_progress, backup_io_iter(self.chunker.chunkify(fd)) - ) - except BackupOSError: - # see comments in process_file's exception handler, same issue here. 
- for chunk in item.get("chunks", []): - cache.chunk_decref(chunk.id, chunk.size, self.stats, wait=False) - raise - else: - item.get_size(memorize=True) - self.stats.nfiles += 1 - self.add_item(item, stats=self.stats) - return status + self.process_file_chunks(item, cache, self.stats, self.show_progress, backup_io_iter(self.chunker.chunkify(fd))) + item.get_size(memorize=True) + self.stats.nfiles += 1 + self.add_item(item, stats=self.stats) + return status def process_file(self, *, path, parent_fd, name, st, cache, flags=flags_normal, last_try=False, strip_prefix): with self.create_helper(path, st, None, strip_prefix=strip_prefix) as ( @@ -1542,93 +1333,81 @@ def process_file(self, *, path, parent_fd, name, st, cache, flags=flags_normal, # so it can be extracted / accessed in FUSE mount like a regular file. # this needs to be done early, so that part files also get the patched mode. item.mode = stat.S_IFREG | stat.S_IMODE(item.mode) - # we begin processing chunks now (writing or incref'ing them to the repository), - # which might require cleanup (see except-branch): - try: - if hl_chunks is not None: # create_helper gave us chunks from a previous hardlink - item.chunks = [] - for chunk_id, chunk_size in hl_chunks: - # process one-by-one, so we will know in item.chunks how far we got - chunk_entry = cache.chunk_incref(chunk_id, chunk_size, self.stats) - item.chunks.append(chunk_entry) - else: # normal case, no "2nd+" hardlink - if not is_special_file: - hashed_path = safe_encode(os.path.join(self.cwd, path)) - started_hashing = time.monotonic() - path_hash = self.key.id_hash(hashed_path) - self.stats.hashing_time += time.monotonic() - started_hashing - known, chunks = cache.file_known_and_unchanged(hashed_path, path_hash, st) + # we begin processing chunks now. + if hl_chunks is not None: # create_helper gave us chunks from a previous hardlink + item.chunks = [] + for chunk_id, chunk_size in hl_chunks: + # process one-by-one, so we will know in item.chunks how far we got + chunk_entry = cache.reuse_chunk(chunk_id, chunk_size, self.stats) + item.chunks.append(chunk_entry) + else: # normal case, no "2nd+" hardlink + if not is_special_file: + hashed_path = safe_encode(os.path.join(self.cwd, path)) + started_hashing = time.monotonic() + path_hash = self.key.id_hash(hashed_path) + self.stats.hashing_time += time.monotonic() - started_hashing + known, chunks = cache.file_known_and_unchanged(hashed_path, path_hash, st) + else: + # in --read-special mode, we may be called for special files. + # there should be no information in the cache about special files processed in + # read-special mode, but we better play safe as this was wrong in the past: + hashed_path = path_hash = None + known, chunks = False, None + if chunks is not None: + # Make sure all ids are available + for chunk in chunks: + if not cache.seen_chunk(chunk.id): + # cache said it is unmodified, but we lost a chunk: process file like modified + status = "M" + break else: - # in --read-special mode, we may be called for special files. 
- # there should be no information in the cache about special files processed in - # read-special mode, but we better play safe as this was wrong in the past: - hashed_path = path_hash = None - known, chunks = False, None - if chunks is not None: - # Make sure all ids are available + item.chunks = [] for chunk in chunks: - if not cache.seen_chunk(chunk.id): - # cache said it is unmodified, but we lost a chunk: process file like modified - status = "M" - break + # process one-by-one, so we will know in item.chunks how far we got + cache.reuse_chunk(chunk.id, chunk.size, self.stats) + item.chunks.append(chunk) + status = "U" # regular file, unchanged + else: + status = "M" if known else "A" # regular file, modified or added + self.print_file_status(status, path) + # Only chunkify the file if needed + changed_while_backup = False + if "chunks" not in item: + with backup_io("read"): + self.process_file_chunks( + item, + cache, + self.stats, + self.show_progress, + backup_io_iter(self.chunker.chunkify(None, fd)), + ) + self.stats.chunking_time = self.chunker.chunking_time + if not is_win32: # TODO for win32 + with backup_io("fstat2"): + st2 = os.fstat(fd) + # special files: + # - fifos change naturally, because they are fed from the other side. no problem. + # - blk/chr devices don't change ctime anyway. + changed_while_backup = not is_special_file and st.st_ctime_ns != st2.st_ctime_ns + if changed_while_backup: + # regular file changed while we backed it up, might be inconsistent/corrupt! + if last_try: + status = "C" # crap! retries did not help. else: - item.chunks = [] - for chunk in chunks: - # process one-by-one, so we will know in item.chunks how far we got - cache.chunk_incref(chunk.id, chunk.size, self.stats) - item.chunks.append(chunk) - status = "U" # regular file, unchanged - else: - status = "M" if known else "A" # regular file, modified or added - self.print_file_status(status, path) - # Only chunkify the file if needed - changed_while_backup = False - if "chunks" not in item: - with backup_io("read"): - self.process_file_chunks( - item, - cache, - self.stats, - self.show_progress, - backup_io_iter(self.chunker.chunkify(None, fd)), - ) - self.stats.chunking_time = self.chunker.chunking_time - if not is_win32: # TODO for win32 - with backup_io("fstat2"): - st2 = os.fstat(fd) - # special files: - # - fifos change naturally, because they are fed from the other side. no problem. - # - blk/chr devices don't change ctime anyway. - changed_while_backup = not is_special_file and st.st_ctime_ns != st2.st_ctime_ns - if changed_while_backup: - # regular file changed while we backed it up, might be inconsistent/corrupt! - if last_try: - status = "C" # crap! retries did not help. - else: - raise BackupError("file changed while we read it!") - if not is_special_file and not changed_while_backup: - # we must not memorize special files, because the contents of e.g. a - # block or char device will change without its mtime/size/inode changing. - # also, we must not memorize a potentially inconsistent/corrupt file that - # changed while we backed it up. - cache.memorize_file(hashed_path, path_hash, st, item.chunks) - self.stats.files_stats[status] += 1 # must be done late - if not changed_while_backup: - status = None # we already called print_file_status - self.stats.nfiles += 1 - item.update(self.metadata_collector.stat_ext_attrs(st, path, fd=fd)) - item.get_size(memorize=True) - return status - except BackupOSError: - # Something went wrong and we might need to clean up a bit. 
- # Maybe we have already incref'ed some file content chunks in the repo - - # but we will not add an item (see add_item in create_helper) and thus - # they would be orphaned chunks in case that we commit the transaction. - for chunk in item.get("chunks", []): - cache.chunk_decref(chunk.id, chunk.size, self.stats, wait=False) - # Now that we have cleaned up the chunk references, we can re-raise the exception. - # This will skip processing of this file, but might retry or continue with the next one. - raise + raise BackupError("file changed while we read it!") + if not is_special_file and not changed_while_backup: + # we must not memorize special files, because the contents of e.g. a + # block or char device will change without its mtime/size/inode changing. + # also, we must not memorize a potentially inconsistent/corrupt file that + # changed while we backed it up. + cache.memorize_file(hashed_path, path_hash, st, item.chunks) + self.stats.files_stats[status] += 1 # must be done late + if not changed_while_backup: + status = None # we already called print_file_status + self.stats.nfiles += 1 + item.update(self.metadata_collector.stat_ext_attrs(st, path, fd=fd)) + item.get_size(memorize=True) + return status class TarfileObjectProcessors: @@ -1723,21 +1502,15 @@ def process_file(self, *, tarinfo, status, type, tar): with self.create_helper(tarinfo, status, type) as (item, status): self.print_file_status(status, tarinfo.name) status = None # we already printed the status - try: - fd = tar.extractfile(tarinfo) - self.process_file_chunks( - item, self.cache, self.stats, self.show_progress, backup_io_iter(self.chunker.chunkify(fd)) - ) - item.get_size(memorize=True, from_chunks=True) - self.stats.nfiles += 1 - # we need to remember ALL files, see HardLinkManager.__doc__ - self.hlm.remember(id=tarinfo.name, info=item.chunks) - return status - except BackupOSError: - # see comment in FilesystemObjectProcessors.process_file, same issue here. 
- for chunk in item.get("chunks", []): - self.cache.chunk_decref(chunk.id, chunk.size, self.stats, wait=False) - raise + fd = tar.extractfile(tarinfo) + self.process_file_chunks( + item, self.cache, self.stats, self.show_progress, backup_io_iter(self.chunker.chunkify(fd)) + ) + item.get_size(memorize=True, from_chunks=True) + self.stats.nfiles += 1 + # we need to remember ALL files, see HardLinkManager.__doc__ + self.hlm.remember(id=tarinfo.name, info=item.chunks) + return status def valid_msgpacked_dict(d, keys_serialized): @@ -1821,7 +1594,6 @@ def __next__(self): class ArchiveChecker: def __init__(self): self.error_found = False - self.possibly_superseded = set() def check( self, @@ -1829,6 +1601,7 @@ def check( *, verify_data=False, repair=False, + undelete_archives=False, match=None, sort_by="", first=0, @@ -1841,40 +1614,46 @@ def check( """Perform a set of checks on 'repository' :param repair: enable repair mode, write updated or corrected data into repository + :param undelete_archives: create archive directory entries that are missing :param first/last/sort_by: only check this number of first/last archives ordered by sort_by :param match: only check archives matching this pattern :param older/newer: only check archives older/newer than timedelta from now :param oldest/newest: only check archives older/newer than timedelta from oldest/newest archive timestamp :param verify_data: integrity verification of data referenced by archives """ + if not isinstance(repository, (Repository, RemoteRepository)): + logger.error("Checking legacy repositories is not supported.") + return False logger.info("Starting archive consistency check...") self.check_all = not any((first, last, match, older, newer, oldest, newest)) self.repair = repair self.repository = repository - self.init_chunks() - if not self.chunks: - logger.error("Repository contains no apparent data at all, cannot continue check/repair.") - return False + self.chunks = build_chunkindex_from_repo(self.repository) self.key = self.make_key(repository) self.repo_objs = RepoObj(self.key) if verify_data: self.verify_data() - if Manifest.MANIFEST_ID not in self.chunks: - logger.error("Repository manifest not found!") + rebuild_manifest = False + try: + repository.get_manifest() + except NoManifestError: + logger.error("Repository manifest is missing.") self.error_found = True - self.manifest = self.rebuild_manifest() + rebuild_manifest = True else: try: self.manifest = Manifest.load(repository, (Manifest.Operation.CHECK,), key=self.key) except IntegrityErrorBase as exc: logger.error("Repository manifest is corrupted: %s", exc) self.error_found = True - del self.chunks[Manifest.MANIFEST_ID] - self.manifest = self.rebuild_manifest() - self.rebuild_refcounts( + rebuild_manifest = True + if rebuild_manifest: + self.manifest = self.rebuild_manifest() + if undelete_archives: + self.rebuild_archives_directory() + self.rebuild_archives( match=match, first=first, last=last, sort_by=sort_by, older=older, oldest=oldest, newer=newer, newest=newest ) - self.orphan_chunks_check() self.finish() if self.error_found: logger.error("Archive consistency check complete, problems found.") @@ -1882,24 +1661,22 @@ def check( logger.info("Archive consistency check complete, no problems found.") return self.repair or not self.error_found - def init_chunks(self): - """Fetch a list of all object keys from repository""" - # Explicitly set the initial usable hash table capacity to avoid performance issues - # due to hash table "resonance". 
- # Since reconstruction of archive items can add some new chunks, add 10 % headroom. - self.chunks = ChunkIndex(usable=len(self.repository) * 1.1) - marker = None - while True: - result = self.repository.list(limit=LIST_SCAN_LIMIT, marker=marker) - if not result: - break - marker = result[-1] - init_entry = ChunkIndexEntry(refcount=0, size=0) - for id_ in result: - self.chunks[id_] = init_entry - def make_key(self, repository): attempt = 0 + + # try the manifest first! + attempt += 1 + try: + cdata = repository.get_manifest() + except NoManifestError: + pass + else: + try: + return key_factory(repository, cdata) + except UnsupportedPayloadError: + # we get here, if the cdata we got has a corrupted key type byte + pass # ignore it, just continue trying + for chunkid, _ in self.chunks.iteritems(): attempt += 1 if attempt > 999: @@ -1919,51 +1696,32 @@ def make_key(self, repository): def verify_data(self): logger.info("Starting cryptographic data integrity verification...") - chunks_count_index = len(self.chunks) - chunks_count_segments = 0 + chunks_count = len(self.chunks) errors = 0 defect_chunks = [] pi = ProgressIndicatorPercent( - total=chunks_count_index, msg="Verifying data %6.2f%%", step=0.01, msgid="check.verify_data" + total=chunks_count, msg="Verifying data %6.2f%%", step=0.01, msgid="check.verify_data" ) - state = None - while True: - chunk_ids, state = self.repository.scan(limit=100, state=state) - if not chunk_ids: - break - chunks_count_segments += len(chunk_ids) - chunk_data_iter = self.repository.get_many(chunk_ids) - chunk_ids_revd = list(reversed(chunk_ids)) - while chunk_ids_revd: - pi.show() - chunk_id = chunk_ids_revd.pop(-1) # better efficiency + for chunk_id, _ in self.chunks.iteritems(): + pi.show() + try: + encrypted_data = self.repository.get(chunk_id) + except (Repository.ObjectNotFound, IntegrityErrorBase) as err: + self.error_found = True + errors += 1 + logger.error("chunk %s: %s", bin_to_hex(chunk_id), err) + if isinstance(err, IntegrityErrorBase): + defect_chunks.append(chunk_id) + else: try: - encrypted_data = next(chunk_data_iter) - except (Repository.ObjectNotFound, IntegrityErrorBase) as err: + # we must decompress, so it'll call assert_id() in there: + self.repo_objs.parse(chunk_id, encrypted_data, decompress=True, ro_type=ROBJ_DONTCARE) + except IntegrityErrorBase as integrity_error: self.error_found = True errors += 1 - logger.error("chunk %s: %s", bin_to_hex(chunk_id), err) - if isinstance(err, IntegrityErrorBase): - defect_chunks.append(chunk_id) - # as the exception killed our generator, make a new one for remaining chunks: - if chunk_ids_revd: - chunk_ids = list(reversed(chunk_ids_revd)) - chunk_data_iter = self.repository.get_many(chunk_ids) - else: - try: - # we must decompress, so it'll call assert_id() in there: - self.repo_objs.parse(chunk_id, encrypted_data, decompress=True, ro_type=ROBJ_DONTCARE) - except IntegrityErrorBase as integrity_error: - self.error_found = True - errors += 1 - logger.error("chunk %s, integrity error: %s", bin_to_hex(chunk_id), integrity_error) - defect_chunks.append(chunk_id) + logger.error("chunk %s, integrity error: %s", bin_to_hex(chunk_id), integrity_error) + defect_chunks.append(chunk_id) pi.finish() - if chunks_count_index != chunks_count_segments: - logger.error("Repo/Chunks index object count vs. 
segment files object count mismatch.") - logger.error( - "Repo/Chunks index: %d objects != segment files: %d objects", chunks_count_index, chunks_count_segments - ) if defect_chunks: if self.repair: # if we kill the defect chunk here, subsequent actions within this "borg check" @@ -2002,14 +1760,27 @@ def verify_data(self): log = logger.error if errors else logger.info log( "Finished cryptographic data integrity verification, verified %d chunks with %d integrity errors.", - chunks_count_segments, + chunks_count, errors, ) def rebuild_manifest(self): - """Rebuild the manifest object if it is missing + """Rebuild the manifest object.""" + + logger.info("Rebuilding missing/corrupted manifest.") + # as we have lost the manifest, we do not know any more what valid item keys we had. + # collecting any key we encounter in a damaged repo seems unwise, thus we just use + # the hardcoded list from the source code. thus, it is not recommended to rebuild a + # lost manifest on a older borg version than the most recent one that was ever used + # within this repository (assuming that newer borg versions support more item keys). + return Manifest(self.key, self.repository) + + def rebuild_archives_directory(self): + """Rebuild the archives directory, undeleting archives. Iterates through all objects in the repository looking for archive metadata blocks. + When finding some that do not have a corresponding archives directory entry, it will + create that entry (undeleting all archives). """ def valid_archive(obj): @@ -2017,15 +1788,12 @@ def valid_archive(obj): return False return REQUIRED_ARCHIVE_KEYS.issubset(obj) - logger.info("Rebuilding missing manifest, this might take some time...") - # as we have lost the manifest, we do not know any more what valid item keys we had. - # collecting any key we encounter in a damaged repo seems unwise, thus we just use - # the hardcoded list from the source code. thus, it is not recommended to rebuild a - # lost manifest on a older borg version than the most recent one that was ever used - # within this repository (assuming that newer borg versions support more item keys). 
- manifest = Manifest(self.key, self.repository) + logger.info("Rebuilding missing archives directory entries, this might take some time...") pi = ProgressIndicatorPercent( - total=len(self.chunks), msg="Rebuilding manifest %6.2f%%", step=0.01, msgid="check.rebuild_manifest" + total=len(self.chunks), + msg="Rebuilding missing archives directory entries %6.2f%%", + step=0.01, + msgid="check.rebuild_archives_directory", ) for chunk_id, _ in self.chunks.iteritems(): pi.show() @@ -2047,34 +1815,30 @@ def valid_archive(obj): archive = self.key.unpack_archive(data) archive = ArchiveItem(internal_dict=archive) name = archive.name - logger.info("Found archive %s", name) - if name in manifest.archives: + logger.info(f"Found archive {name}, id {bin_to_hex(chunk_id)}.") + if self.manifest.archives.exists_name_and_id(name, chunk_id): + logger.info("We already have an archives directory entry for this.") + elif not self.manifest.archives.exists(name): + # no archives list entry yet and name is not taken yet, create an entry + logger.warning(f"Creating archives directory entry for {name}.") + self.manifest.archives.create(name, chunk_id, archive.time) + else: + # we don't have an entry yet, but the name is taken by something else i = 1 while True: new_name = "%s.%d" % (name, i) - if new_name not in manifest.archives: + if not self.manifest.archives.exists(new_name): break i += 1 - logger.warning("Duplicate archive name %s, storing as %s", name, new_name) - name = new_name - manifest.archives[name] = (chunk_id, archive.time) + logger.warning(f"Creating archives directory entry using {new_name}.") + self.manifest.archives.create(new_name, chunk_id, archive.time) pi.finish() - logger.info("Manifest rebuild complete.") - return manifest + logger.info("Rebuilding missing archives directory entries completed.") - def rebuild_refcounts( + def rebuild_archives( self, first=0, last=0, sort_by="", match=None, older=None, newer=None, oldest=None, newest=None ): - """Rebuild object reference counts by walking the metadata - - Missing and/or incorrect data is repaired when detected - """ - # Exclude the manifest from chunks (manifest entry might be already deleted from self.chunks) - self.chunks.pop(Manifest.MANIFEST_ID, None) - - def mark_as_possibly_superseded(id_): - if self.chunks.get(id_, ChunkIndexEntry(0, 0)).refcount == 0: - self.possibly_superseded.add(id_) + """Analyze and rebuild archives, expecting some damage and trying to make stuff consistent again.""" def add_callback(chunk): id_ = self.key.id_hash(chunk) @@ -2082,12 +1846,11 @@ def add_callback(chunk): add_reference(id_, len(chunk), cdata) return id_ - def add_reference(id_, size, cdata=None): - try: - self.chunks.incref(id_) - except KeyError: + def add_reference(id_, size, cdata): + # either we already have this chunk in repo and chunks index or we add it now + if id_ not in self.chunks: assert cdata is not None - self.chunks[id_] = ChunkIndexEntry(refcount=1, size=size) + self.chunks[id_] = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=size) if self.repair: self.repository.put(id_, cdata) @@ -2138,9 +1901,7 @@ def replacement_chunk(size): ) ) chunk_id, size = chunk_current - if chunk_id in self.chunks: - add_reference(chunk_id, size) - else: + if chunk_id not in self.chunks: logger.warning( "{}: {}: Missing all-zero replacement chunk detected (Byte {}-{}, Chunk {}). 
" "Generating new replacement chunk.".format( @@ -2152,16 +1913,13 @@ def replacement_chunk(size): add_reference(chunk_id, size, cdata) else: if chunk_current == chunk_healthy: - # normal case, all fine. - add_reference(chunk_id, size) + pass # normal case, all fine. else: logger.info( "{}: {}: Healed previously missing file chunk! (Byte {}-{}, Chunk {}).".format( archive_name, item.path, offset, offset + size, bin_to_hex(chunk_id) ) ) - add_reference(chunk_id, size) - mark_as_possibly_superseded(chunk_current[0]) # maybe orphaned the all-zero replacement chunk chunk_list.append([chunk_id, size]) # list-typed element as chunks_healthy is list-of-lists offset += size if chunks_replaced and not has_chunks_healthy: @@ -2276,11 +2034,11 @@ def valid_item(obj): if last and len(archive_infos) < last: logger.warning("--last %d archives: only found %d archives", last, len(archive_infos)) else: - archive_infos = self.manifest.archives.list(sort_by=sort_by, consider_checkpoints=True) + archive_infos = self.manifest.archives.list(sort_by=sort_by) num_archives = len(archive_infos) pi = ProgressIndicatorPercent( - total=num_archives, msg="Checking archives %3.1f%%", step=0.1, msgid="check.rebuild_refcounts" + total=num_archives, msg="Checking archives %3.1f%%", step=0.1, msgid="check.rebuild_archives" ) with cache_if_remote(self.repository) as repository: for i, info in enumerate(archive_infos): @@ -2290,16 +2048,23 @@ def valid_item(obj): if archive_id not in self.chunks: logger.error("Archive metadata block %s is missing!", bin_to_hex(archive_id)) self.error_found = True - del self.manifest.archives[info.name] + if self.repair: + logger.error(f"Deleting broken archive {info.name}.") + self.manifest.archives.delete(info.name) + else: + logger.error(f"Would delete broken archive {info.name}.") continue - mark_as_possibly_superseded(archive_id) cdata = self.repository.get(archive_id) try: _, data = self.repo_objs.parse(archive_id, cdata, ro_type=ROBJ_ARCHIVE_META) except IntegrityError as integrity_error: logger.error("Archive metadata block %s is corrupted: %s", bin_to_hex(archive_id), integrity_error) self.error_found = True - del self.manifest.archives[info.name] + if self.repair: + logger.error(f"Deleting broken archive {info.name}.") + self.manifest.archives.delete(info.name) + else: + logger.error(f"Would delete broken archive {info.name}.") continue archive = self.key.unpack_archive(data) archive = ArchiveItem(internal_dict=archive) @@ -2312,47 +2077,23 @@ def valid_item(obj): verify_file_chunks(info.name, item) items_buffer.add(item) items_buffer.flush(flush=True) - for previous_item_id in archive_get_items( - archive, repo_objs=self.repo_objs, repository=self.repository - ): - mark_as_possibly_superseded(previous_item_id) - for previous_item_ptr in archive.item_ptrs: - mark_as_possibly_superseded(previous_item_ptr) - archive.item_ptrs = archive_put_items( - items_buffer.chunks, repo_objs=self.repo_objs, add_reference=add_reference - ) - data = self.key.pack_metadata(archive.as_dict()) - new_archive_id = self.key.id_hash(data) - cdata = self.repo_objs.format(new_archive_id, {}, data, ro_type=ROBJ_ARCHIVE_META) - add_reference(new_archive_id, len(data), cdata) - self.manifest.archives[info.name] = (new_archive_id, info.ts) + if self.repair: + archive.item_ptrs = archive_put_items( + items_buffer.chunks, repo_objs=self.repo_objs, add_reference=add_reference + ) + data = self.key.pack_metadata(archive.as_dict()) + new_archive_id = self.key.id_hash(data) + logger.debug(f"archive id old: 
{bin_to_hex(archive_id)}") + logger.debug(f"archive id new: {bin_to_hex(new_archive_id)}") + cdata = self.repo_objs.format(new_archive_id, {}, data, ro_type=ROBJ_ARCHIVE_META) + add_reference(new_archive_id, len(data), cdata) + self.manifest.archives.create(info.name, new_archive_id, info.ts, overwrite=True) pi.finish() - def orphan_chunks_check(self): - if self.check_all: - unused = {id_ for id_, entry in self.chunks.iteritems() if entry.refcount == 0} - orphaned = unused - self.possibly_superseded - if orphaned: - logger.info(f"{len(orphaned)} orphaned (unused) objects found.") - for chunk_id in orphaned: - logger.debug(f"chunk {bin_to_hex(chunk_id)} is orphaned.") - # To support working with AdHocCache or AdHocWithFilesCache, we do not set self.error_found = True. - if self.repair and unused: - logger.info( - "Deleting %d orphaned and %d superseded objects..." % (len(orphaned), len(self.possibly_superseded)) - ) - for id_ in unused: - self.repository.delete(id_) - logger.info("Finished deleting orphaned/superseded objects.") - else: - logger.info("Orphaned objects check skipped (needs all archives checked).") - def finish(self): if self.repair: logger.info("Writing Manifest.") self.manifest.write() - logger.info("Committing repo.") - self.repository.commit(compact=False) class ArchiveRecreater: @@ -2379,8 +2120,6 @@ def __init__( progress=False, file_status_printer=None, timestamp=None, - checkpoint_interval=1800, - checkpoint_volume=0, ): self.manifest = manifest self.repository = manifest.repository @@ -2405,8 +2144,6 @@ def __init__( self.stats = stats self.progress = progress self.print_file_status = file_status_printer or (lambda *args: None) - self.checkpoint_interval = None if dry_run else checkpoint_interval - self.checkpoint_volume = None if dry_run else checkpoint_volume def recreate(self, archive_name, comment=None, target_name=None): assert not self.is_temporary_archive(archive_name) @@ -2449,7 +2186,7 @@ def process_item(self, archive, target, item): def process_chunks(self, archive, target, item): if not target.recreate_rechunkify: for chunk_id, size in item.chunks: - self.cache.chunk_incref(chunk_id, size, target.stats) + self.cache.reuse_chunk(chunk_id, size, target.stats) return item.chunks chunk_iterator = self.iter_chunks(archive, target, list(item.chunks)) chunk_processor = partial(self.chunk_processor, target) @@ -2459,7 +2196,7 @@ def chunk_processor(self, target, chunk): chunk_id, data = cached_hash(chunk, self.key.id_hash) size = len(data) if chunk_id in self.seen_chunks: - return self.cache.chunk_incref(chunk_id, size, target.stats) + return self.cache.reuse_chunk(chunk_id, size, target.stats) chunk_entry = self.cache.add_chunk(chunk_id, {}, data, stats=target.stats, wait=False, ro_type=ROBJ_FILE_STREAM) self.cache.repository.async_response(wait=False) self.seen_chunks.add(chunk_entry.id) @@ -2503,7 +2240,7 @@ def save(self, archive, target, comment=None, replace_original=True): target.save(comment=comment, timestamp=self.timestamp, additional_metadata=additional_metadata) if replace_original: - archive.delete(Statistics(), progress=self.progress) + archive.delete() target.rename(archive.name) if self.stats: target.start = _start @@ -2552,14 +2289,7 @@ def create_target(self, archive, target_name=None): "Rechunking archive from %s to %s", source_chunker_params or "(unknown)", target.chunker_params ) target.process_file_chunks = ChunksProcessor( - cache=self.cache, - key=self.key, - add_item=target.add_item, - prepare_checkpoint=target.prepare_checkpoint, - 
write_checkpoint=target.write_checkpoint, - checkpoint_interval=self.checkpoint_interval, - checkpoint_volume=self.checkpoint_volume, - rechunkify=target.recreate_rechunkify, + cache=self.cache, key=self.key, add_item=target.add_item, rechunkify=target.recreate_rechunkify ).process_file_chunks target.chunker = get_chunker(*target.chunker_params, seed=self.key.chunk_seed, sparse=False) return target diff --git a/src/borg/archiver/__init__.py b/src/borg/archiver/__init__.py index f0de105296..e18fef3d06 100644 --- a/src/borg/archiver/__init__.py +++ b/src/borg/archiver/__init__.py @@ -14,7 +14,6 @@ import os import shlex import signal - import time from datetime import datetime, timezone from ..logger import create_logger, setup_logging @@ -68,7 +67,6 @@ def get_func(args): from .benchmark_cmd import BenchmarkMixIn from .check_cmd import CheckMixIn from .compact_cmd import CompactMixIn -from .config_cmd import ConfigMixIn from .create_cmd import CreateMixIn from .debug_cmd import DebugMixIn from .delete_cmd import DeleteMixIn @@ -88,6 +86,7 @@ def get_func(args): from .rinfo_cmd import RInfoMixIn from .rdelete_cmd import RDeleteMixIn from .rlist_cmd import RListMixIn +from .rspace_cmd import RSpaceMixIn from .serve_cmd import ServeMixIn from .tar_cmds import TarMixIn from .transfer_cmd import TransferMixIn @@ -98,7 +97,6 @@ class Archiver( BenchmarkMixIn, CheckMixIn, CompactMixIn, - ConfigMixIn, CreateMixIn, DebugMixIn, DeleteMixIn, @@ -118,6 +116,7 @@ class Archiver( RDeleteMixIn, RInfoMixIn, RListMixIn, + RSpaceMixIn, ServeMixIn, TarMixIn, TransferMixIn, @@ -126,7 +125,6 @@ class Archiver( def __init__(self, lock_wait=None, prog=None): self.lock_wait = lock_wait self.prog = prog - self.last_checkpoint = time.monotonic() def print_warning(self, msg, *args, **kw): warning_code = kw.get("wc", EXIT_WARNING) # note: wc=None can be used to not influence exit code @@ -336,7 +334,6 @@ def build_parser(self): self.build_parser_benchmarks(subparsers, common_parser, mid_common_parser) self.build_parser_check(subparsers, common_parser, mid_common_parser) self.build_parser_compact(subparsers, common_parser, mid_common_parser) - self.build_parser_config(subparsers, common_parser, mid_common_parser) self.build_parser_create(subparsers, common_parser, mid_common_parser) self.build_parser_debug(subparsers, common_parser, mid_common_parser) self.build_parser_delete(subparsers, common_parser, mid_common_parser) @@ -356,6 +353,7 @@ def build_parser(self): self.build_parser_rlist(subparsers, common_parser, mid_common_parser) self.build_parser_recreate(subparsers, common_parser, mid_common_parser) self.build_parser_rename(subparsers, common_parser, mid_common_parser) + self.build_parser_rspace(subparsers, common_parser, mid_common_parser) self.build_parser_serve(subparsers, common_parser, mid_common_parser) self.build_parser_tar(subparsers, common_parser, mid_common_parser) self.build_parser_transfer(subparsers, common_parser, mid_common_parser) @@ -412,22 +410,6 @@ def parse_args(self, args=None): elif not args.paths_from_stdin: # need at least 1 path but args.paths may also be populated from patterns parser.error("Need at least one PATH argument.") - if not getattr(args, "lock", True): # Option --bypass-lock sets args.lock = False - bypass_allowed = { - self.do_check, - self.do_config, - self.do_diff, - self.do_export_tar, - self.do_extract, - self.do_info, - self.do_rinfo, - self.do_list, - self.do_rlist, - self.do_mount, - self.do_umount, - } - if func not in bypass_allowed: - raise Error("Not allowed to 
bypass locking mechanism for chosen command") # we can only have a complete knowledge of placeholder replacements we should do **after** arg parsing, # e.g. due to options like --timestamp that override the current time. # thus we have to initialize replace_placeholders here and process all args that need placeholder replacement. @@ -474,20 +456,6 @@ def _setup_topic_debugging(self, args): logger.debug("Enabling debug topic %s", topic) logging.getLogger(topic).setLevel("DEBUG") - def maybe_checkpoint(self, *, checkpoint_func, checkpoint_interval): - checkpointed = False - sig_int_triggered = sig_int and sig_int.action_triggered() - if sig_int_triggered or checkpoint_interval and time.monotonic() - self.last_checkpoint > checkpoint_interval: - if sig_int_triggered: - logger.info("checkpoint requested: starting checkpoint creation...") - checkpoint_func() - checkpointed = True - self.last_checkpoint = time.monotonic() - if sig_int_triggered: - sig_int.action_completed() - logger.info("checkpoint requested: finished checkpoint creation!") - return checkpointed - def run(self, args): os.umask(args.umask) # early, before opening files self.lock_wait = args.lock_wait @@ -617,14 +585,13 @@ def main(): # pragma: no cover # Register fault handler for SIGSEGV, SIGFPE, SIGABRT, SIGBUS and SIGILL. faulthandler.enable() - with signal_handler("SIGINT", raising_signal_handler(KeyboardInterrupt)), signal_handler( - "SIGHUP", raising_signal_handler(SigHup) - ), signal_handler("SIGTERM", raising_signal_handler(SigTerm)), signal_handler( - "SIGUSR1", sig_info_handler - ), signal_handler( - "SIGUSR2", sig_trace_handler - ), signal_handler( - "SIGINFO", sig_info_handler + with ( + signal_handler("SIGINT", raising_signal_handler(KeyboardInterrupt)), + signal_handler("SIGHUP", raising_signal_handler(SigHup)), + signal_handler("SIGTERM", raising_signal_handler(SigTerm)), + signal_handler("SIGUSR1", sig_info_handler), + signal_handler("SIGUSR2", sig_trace_handler), + signal_handler("SIGINFO", sig_info_handler), ): archiver = Archiver() msg = msgid = tb = None diff --git a/src/borg/archiver/_common.py b/src/borg/archiver/_common.py index 4a49de9b89..4449c4f634 100644 --- a/src/borg/archiver/_common.py +++ b/src/borg/archiver/_common.py @@ -1,4 +1,3 @@ -import argparse import functools import os import textwrap @@ -13,7 +12,9 @@ from ..helpers.nanorst import rst_to_terminal from ..manifest import Manifest, AI_HUMAN_SORT_KEYS from ..patterns import PatternMatcher +from ..legacyremote import LegacyRemoteRepository from ..remote import RemoteRepository +from ..legacyrepository import LegacyRepository from ..repository import Repository from ..repoobj import RepoObj, RepoObj1 from ..patterns import ( @@ -29,9 +30,12 @@ logger = create_logger(__name__) -def get_repository(location, *, create, exclusive, lock_wait, lock, append_only, make_parent_dirs, storage_quota, args): +def get_repository( + location, *, create, exclusive, lock_wait, lock, append_only, make_parent_dirs, storage_quota, args, v1_or_v2 +): if location.proto in ("ssh", "socket"): - repository = RemoteRepository( + RemoteRepoCls = LegacyRemoteRepository if v1_or_v2 else RemoteRepository + repository = RemoteRepoCls( location, create=create, exclusive=exclusive, @@ -42,8 +46,21 @@ def get_repository(location, *, create, exclusive, lock_wait, lock, append_only, args=args, ) - else: + elif location.proto in ("sftp", "file") and not v1_or_v2: # stuff directly supported by borgstore repository = Repository( + location, + create=create, + exclusive=exclusive, + 
lock_wait=lock_wait, + lock=lock, + append_only=append_only, + make_parent_dirs=make_parent_dirs, + storage_quota=storage_quota, + ) + + else: + RepoCls = LegacyRepository if v1_or_v2 else Repository + repository = RepoCls( location.path, create=create, exclusive=exclusive, @@ -98,8 +115,7 @@ def with_repository( decorator_name="with_repository", ) - # To process the `--bypass-lock` option if specified, we need to - # modify `lock` inside `wrapper`. Therefore we cannot use the + # We may need to modify `lock` inside `wrapper`. Therefore we cannot use the # `nonlocal` statement to access `lock` as modifications would also # affect the scope outside of `wrapper`. Subsequent calls would # only see the overwritten value of `lock`, not the original one. @@ -129,13 +145,15 @@ def wrapper(self, args, **kwargs): make_parent_dirs=make_parent_dirs, storage_quota=storage_quota, args=args, + v1_or_v2=False, ) with repository: - if repository.version not in (2,): + if repository.version not in (3,): raise Error( - "This borg version only accepts version 2 repos for -r/--repo. " - "You can use 'borg transfer' to copy archives from old to new repos." + f"This borg version only accepts version 3 repos for -r/--repo, " + f"but not version {repository.version}. " + f"You can use 'borg transfer' to copy archives from old to new repos." ) if manifest or cache: manifest_ = Manifest.load(repository, compatibility) @@ -185,6 +203,8 @@ def wrapper(self, args, **kwargs): if not location.valid: # nothing to do return method(self, args, **kwargs) + v1_or_v2 = getattr(args, "v1_or_v2", False) + repository = get_repository( location, create=False, @@ -195,11 +215,16 @@ def wrapper(self, args, **kwargs): make_parent_dirs=False, storage_quota=None, args=args, + v1_or_v2=v1_or_v2, ) with repository: - if repository.version not in (1, 2): - raise Error("This borg version only accepts version 1 or 2 repos for --other-repo.") + acceptable_versions = (1, 2) if v1_or_v2 else (3,) + if repository.version not in acceptable_versions: + raise Error( + f"This borg version only accepts version {' or '.join(str(v) for v in acceptable_versions)} " + f"repos for --other-repo."
+ ) kwargs["other_repository"] = repository if manifest or cache: manifest_ = Manifest.load( @@ -500,17 +525,10 @@ def define_common_options(add_common_option): metavar="SECONDS", dest="lock_wait", type=int, - default=int(os.environ.get("BORG_LOCK_WAIT", 1)), + default=int(os.environ.get("BORG_LOCK_WAIT", 10)), action=Highlander, help="wait at most SECONDS for acquiring a repository/cache lock (default: %(default)d).", ) - add_common_option( - "--bypass-lock", - dest="lock", - action="store_false", - default=argparse.SUPPRESS, # only create args attribute if option is specified - help="Bypass locking mechanism", - ) add_common_option("--show-version", dest="show_version", action="store_true", help="show/log the borg version") add_common_option("--show-rc", dest="show_rc", action="store_true", help="show/log the return code (rc)") add_common_option( diff --git a/src/borg/archiver/check_cmd.py b/src/borg/archiver/check_cmd.py index b3ca070a9a..6f075e8f4b 100644 --- a/src/borg/archiver/check_cmd.py +++ b/src/borg/archiver/check_cmd.py @@ -37,10 +37,10 @@ def do_check(self, args, repository): ) if args.repair and args.max_duration: raise CommandError("--repair does not allow --max-duration argument.") + if args.undelete_archives and not args.repair: + raise CommandError("--undelete-archives requires --repair argument.") if args.max_duration and not args.repo_only: - # when doing a partial repo check, we can only check crc32 checksums in segment files, - # we can't build a fresh repo index in memory to verify the on-disk index against it. - # thus, we should not do an archives check based on a unknown-quality on-disk repo index. + # when doing a partial repo check, we can only check xxh64 hashes in repository files. # also, there is no max_duration support in the archives check code anyway. raise CommandError("--repository-only is required for --max-duration support.") if not args.archives_only: @@ -50,6 +50,7 @@ def do_check(self, args, repository): repository, verify_data=args.verify_data, repair=args.repair, + undelete_archives=args.undelete_archives, match=args.match_archives, sort_by=args.sort_by or "ts", first=args.first, @@ -72,8 +73,8 @@ def build_parser_check(self, subparsers, common_parser, mid_common_parser): It consists of two major steps: 1. Checking the consistency of the repository itself. This includes checking - the segment magic headers, and both the metadata and data of all objects in - the segments. The read data is checked by size and CRC. Bit rot and other + the file magic headers, and both the metadata and data of all objects in + the repository. The read data is checked by size and hash. Bit rot and other types of accidental damage can be detected this way. Running the repository check can be split into multiple partial checks using ``--max-duration``. When checking a remote repository, please note that the checks run on the @@ -108,13 +109,12 @@ def build_parser_check(self, subparsers, common_parser, mid_common_parser): **Warning:** Please note that partial repository checks (i.e. running it with ``--max-duration``) can only perform non-cryptographic checksum checks on the - segment files. A full repository check (i.e. without ``--max-duration``) can - also do a repository index check. Enabling partial repository checks excepts - archive checks for the same reason. Therefore partial checks may be useful with - very large repositories only where a full check would take too long. + repository files. 
Enabling partial repository checks excepts archive checks + for the same reason. Therefore partial checks may be useful with very large + repositories only where a full check would take too long. The ``--verify-data`` option will perform a full integrity verification (as - opposed to checking the CRC32 of the segment) of data, which means reading the + opposed to checking just the xxh64) of data, which means reading the data from the repository, decrypting and decompressing it. It is a complete cryptographic verification and hence very time consuming, but will detect any accidental and malicious corruption. Tamper-resistance is only guaranteed for @@ -151,17 +151,15 @@ def build_parser_check(self, subparsers, common_parser, mid_common_parser): In practice, repair mode hooks into both the repository and archive checks: - 1. When checking the repository's consistency, repair mode will try to recover - as many objects from segments with integrity errors as possible, and ensure - that the index is consistent with the data stored in the segments. + 1. When checking the repository's consistency, repair mode removes corrupted + objects from the repository after it did a 2nd try to read them correctly. 2. When checking the consistency and correctness of archives, repair mode might remove whole archives from the manifest if their archive metadata chunk is corrupt or lost. On a chunk level (i.e. the contents of files), repair mode will replace corrupt or lost chunks with a same-size replacement chunk of zeroes. If a previously zeroed chunk reappears, repair mode will restore - this lost chunk using the new chunk. Lastly, repair mode will also delete - orphaned chunks (e.g. caused by read errors while creating the archive). + this lost chunk using the new chunk. Most steps taken by repair mode have a one-time effect on the repository, like removing a lost archive from the repository. However, replacing a corrupt or @@ -180,6 +178,12 @@ def build_parser_check(self, subparsers, common_parser, mid_common_parser): chunks of a "zero-patched" file reappear, this effectively "heals" the file. Consequently, if lost chunks were repaired earlier, it is advised to run ``--repair`` a second time after creating some new backups. + + If ``--repair --undelete-archives`` is given, Borg will scan the repository + for archive metadata and if it finds some where no corresponding archives + directory entry exists, it will create the entries. This is basically undoing + ``borg delete archive`` or ``borg prune ...`` commands and only possible before + ``borg compact`` would remove the archives' data completely. 
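To make the undelete mechanics concrete: it amounts to rediscovering archive metadata objects that still exist in the repository and re-creating the missing archives directory entries for them. The following is only a rough sketch of that idea; the metadata field names ``name``/``time``, the import paths and the exact ``create()`` call are assumptions, and the real logic lives in borg's check code and differs in detail::

    from borg.constants import *  # noqa -- for ROBJ_ARCHIVE_META, as the real modules do
    from borg.repository import LIST_SCAN_LIMIT
    from borg.helpers import msgpack

    def undelete_archives_sketch(repository, repo_objs, manifest):
        known = {info.name for info in manifest.archives.list()}  # existing directory entries
        marker = None
        while True:
            result = repository.list(limit=LIST_SCAN_LIMIT, marker=marker)  # [(id, stored_size), ...]
            if not result:
                break
            marker = result[-1][0]
            for id, _ in result:
                try:
                    # most objects are not archive metadata; parsing them as such fails and they get skipped
                    _, data = repo_objs.parse(id, repository.get(id), ro_type=ROBJ_ARCHIVE_META)
                    meta = msgpack.unpackb(data)
                except Exception:
                    continue
                name, ts = meta.get("name"), meta.get("time")  # assumed field names
                if name and name not in known:
                    manifest.archives.create(name, id, ts)  # re-create the directory entry
        manifest.write()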
""" ) subparser = subparsers.add_parser( @@ -207,6 +211,12 @@ def build_parser_check(self, subparsers, common_parser, mid_common_parser): subparser.add_argument( "--repair", dest="repair", action="store_true", help="attempt to repair any inconsistencies found" ) + subparser.add_argument( + "--undelete-archives", + dest="undelete_archives", + action="store_true", + help="attempt to undelete archives (use with --repair)", + ) subparser.add_argument( "--max-duration", metavar="SECONDS", diff --git a/src/borg/archiver/compact_cmd.py b/src/borg/archiver/compact_cmd.py index d5847741ad..cf905fd1ce 100644 --- a/src/borg/archiver/compact_cmd.py +++ b/src/borg/archiver/compact_cmd.py @@ -1,47 +1,174 @@ import argparse +from typing import Tuple, Dict -from ._common import with_repository, Highlander +from ._common import with_repository +from ..archive import Archive from ..constants import * # NOQA +from ..helpers import set_ec, EXIT_WARNING, EXIT_ERROR, format_file_size, bin_to_hex +from ..helpers import ProgressIndicatorPercent from ..manifest import Manifest +from ..remote import RemoteRepository +from ..repository import Repository from ..logger import create_logger logger = create_logger() +class ArchiveGarbageCollector: + def __init__(self, repository, manifest): + self.repository = repository + assert isinstance(repository, (Repository, RemoteRepository)) + self.manifest = manifest + self.repository_chunks = None # what we have in the repository, id -> stored_size + self.used_chunks = None # what archives currently reference + self.wanted_chunks = None # chunks that would be nice to have for next borg check --repair + self.total_files = None # overall number of source files written to all archives in this repo + self.total_size = None # overall size of source file content data written to all archives + self.archives_count = None # number of archives + + @property + def repository_size(self): + if self.repository_chunks is None: + return None + return sum(self.repository_chunks.values()) # sum of stored sizes + + def garbage_collect(self): + """Removes unused chunks from a repository.""" + logger.info("Starting compaction / garbage collection...") + logger.info("Getting object IDs present in the repository...") + self.repository_chunks = self.get_repository_chunks() + logger.info("Computing object IDs used by archives...") + (self.used_chunks, self.wanted_chunks, self.total_files, self.total_size, self.archives_count) = ( + self.analyze_archives() + ) + self.report_and_delete() + logger.info("Finished compaction / garbage collection...") + + def get_repository_chunks(self) -> Dict[bytes, int]: + """Build a dict id -> size of all chunks present in the repository""" + repository_chunks = {} + marker = None + while True: + result = self.repository.list(limit=LIST_SCAN_LIMIT, marker=marker) + if not result: + break + marker = result[-1][0] + for id, stored_size in result: + repository_chunks[id] = stored_size + return repository_chunks + + def analyze_archives(self) -> Tuple[Dict[bytes, int], Dict[bytes, int], int, int, int]: + """Iterate over all items in all archives, create the dicts id -> size of all used/wanted chunks.""" + used_chunks = {} # chunks referenced by item.chunks + wanted_chunks = {} # additional "wanted" chunks seen in item.chunks_healthy + archive_infos = self.manifest.archives.list() + num_archives = len(archive_infos) + pi = ProgressIndicatorPercent( + total=num_archives, msg="Computing used/wanted chunks %3.1f%%", step=0.1, msgid="compact.analyze_archives" + ) + total_size, 
total_files = 0, 0 + for i, info in enumerate(archive_infos): + pi.show(i) + logger.info(f"Analyzing archive {info.name} ({i + 1}/{num_archives})") + archive = Archive(self.manifest, info.name) + # archive metadata size unknown, but usually small/irrelevant: + used_chunks[archive.id] = 0 + for id in archive.metadata.item_ptrs: + used_chunks[id] = 0 + for id in archive.metadata.items: + used_chunks[id] = 0 + # archive items content data: + for item in archive.iter_items(): + total_files += 1 # every fs object counts, not just regular files + if "chunks" in item: + for id, size in item.chunks: + total_size += size # original, uncompressed file content size + used_chunks[id] = size + if "chunks_healthy" in item: + # we also consider the chunks_healthy chunks as referenced - do not throw away + # anything that borg check --repair might still need. + for id, size in item.chunks_healthy: + if id not in used_chunks: + wanted_chunks[id] = size + pi.finish() + return used_chunks, wanted_chunks, total_files, total_size, num_archives + + def report_and_delete(self): + run_repair = " Run borg check --repair!" + + missing_new = set(self.used_chunks) - set(self.repository_chunks) + if missing_new: + logger.error(f"Repository has {len(missing_new)} new missing objects." + run_repair) + set_ec(EXIT_ERROR) + + missing_known = set(self.wanted_chunks) - set(self.repository_chunks) + if missing_known: + logger.warning(f"Repository has {len(missing_known)} known missing objects.") + set_ec(EXIT_WARNING) + + missing_found = set(self.wanted_chunks) & set(self.repository_chunks) + if missing_found: + logger.warning(f"{len(missing_found)} previously missing objects re-appeared!" + run_repair) + set_ec(EXIT_WARNING) + + repo_size_before = self.repository_size + referenced_chunks = set(self.used_chunks) | set(self.wanted_chunks) + unused = set(self.repository_chunks) - referenced_chunks + logger.info(f"Repository has {len(unused)} objects to delete.") + if unused: + logger.info(f"Deleting {len(unused)} unused objects...") + pi = ProgressIndicatorPercent( + total=len(unused), msg="Deleting unused objects %3.1f%%", step=0.1, msgid="compact.report_and_delete" + ) + for i, id in enumerate(unused): + pi.show(i) + self.repository.delete(id) + del self.repository_chunks[id] + pi.finish() + repo_size_after = self.repository_size + + count = len(self.repository_chunks) + logger.info(f"Overall statistics, considering all {self.archives_count} archives in this repository:") + logger.info( + f"Source data size was {format_file_size(self.total_size, precision=0)} in {self.total_files} files." 
+ ) + dsize = 0 + for id in self.repository_chunks: + if id in self.used_chunks: + dsize += self.used_chunks[id] + elif id in self.wanted_chunks: + dsize += self.wanted_chunks[id] + else: + raise KeyError(bin_to_hex(id)) + logger.info(f"Repository size is {format_file_size(self.repository_size, precision=0)} in {count} objects.") + if self.total_size != 0: + logger.info(f"Space reduction factor due to deduplication: {dsize / self.total_size:.3f}") + if dsize != 0: + logger.info(f"Space reduction factor due to compression: {self.repository_size / dsize:.3f}") + logger.info(f"Compaction saved {format_file_size(repo_size_before - repo_size_after, precision=0)}.") + + class CompactMixIn: - @with_repository(manifest=False, exclusive=True) - def do_compact(self, args, repository): - """compact segment files in the repository""" - # see the comment in do_with_lock about why we do it like this: - data = repository.get(Manifest.MANIFEST_ID) - repository.put(Manifest.MANIFEST_ID, data) - threshold = args.threshold / 100 - repository.commit(compact=True, threshold=threshold) + @with_repository(exclusive=True, compatibility=(Manifest.Operation.DELETE,)) + def do_compact(self, args, repository, manifest): + """Collect garbage in repository""" + ArchiveGarbageCollector(repository, manifest).garbage_collect() def build_parser_compact(self, subparsers, common_parser, mid_common_parser): from ._common import process_epilog compact_epilog = process_epilog( """ - This command frees repository space by compacting segments. + Free repository space by deleting unused chunks. - Use this regularly to avoid running out of space - you do not need to use this - after each borg command though. It is especially useful after deleting archives, - because only compaction will really free repository space. + borg compact analyzes all existing archives to find out which chunks are + actually used. There might be unused chunks resulting from borg delete or prune, + which can be removed to free space in the repository. - borg compact does not need a key, so it is possible to invoke it from the - client or also from the server. - - Depending on the amount of segments that need compaction, it may take a while, - so consider using the ``--progress`` option. - - A segment is compacted if the amount of saved space is above the percentage value - given by the ``--threshold`` option. If omitted, a threshold of 10% is used. - When using ``--verbose``, borg will output an estimate of the freed space. - - See :ref:`separate_compaction` in Additional Notes for more details. - """ + Differently than borg 1.x, borg2's compact needs the borg key if the repo is + encrypted. 
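Condensed to its essence, the new compact is a mark-and-sweep garbage collection: mark every object ID still referenced by some archive (including ``chunks_healthy`` references), then sweep everything else. The sketch below only illustrates that idea, using calls visible in the code above; the real ArchiveGarbageCollector additionally tracks sizes, reports missing objects and prints statistics::

    from borg.archive import Archive
    from borg.repository import LIST_SCAN_LIMIT  # assumed import path

    def mark_and_sweep(repository, manifest):
        # mark: collect every object ID that some archive still needs
        used = set()
        for info in manifest.archives.list():
            archive = Archive(manifest, info.name)
            used.add(archive.id)
            used.update(archive.metadata.item_ptrs)
            used.update(archive.metadata.items)
            for item in archive.iter_items():
                if "chunks" in item:
                    used.update(id for id, size in item.chunks)
                if "chunks_healthy" in item:
                    used.update(id for id, size in item.chunks_healthy)
        # sweep: delete every repository object that is no longer referenced
        marker = None
        while True:
            result = repository.list(limit=LIST_SCAN_LIMIT, marker=marker)  # [(id, stored_size), ...]
            if not result:
                break
            marker = result[-1][0]
            for id, _ in result:
                if id not in used:
                    repository.delete(id)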
+ """ ) subparser = subparsers.add_parser( "compact", @@ -50,15 +177,6 @@ def build_parser_compact(self, subparsers, common_parser, mid_common_parser): description=self.do_compact.__doc__, epilog=compact_epilog, formatter_class=argparse.RawDescriptionHelpFormatter, - help="compact segment files / free space in repo", + help="compact repository", ) subparser.set_defaults(func=self.do_compact) - subparser.add_argument( - "--threshold", - metavar="PERCENT", - dest="threshold", - type=int, - default=10, - action=Highlander, - help="set minimum threshold for saved space in PERCENT (Default: 10)", - ) diff --git a/src/borg/archiver/config_cmd.py b/src/borg/archiver/config_cmd.py deleted file mode 100644 index f92baf4f35..0000000000 --- a/src/borg/archiver/config_cmd.py +++ /dev/null @@ -1,177 +0,0 @@ -import argparse -import configparser - -from ._common import with_repository -from ..cache import Cache, assert_secure -from ..constants import * # NOQA -from ..helpers import Error, CommandError -from ..helpers import parse_file_size, hex_to_bin -from ..manifest import Manifest - -from ..logger import create_logger - -logger = create_logger() - - -class ConfigMixIn: - @with_repository(exclusive=True, manifest=False) - def do_config(self, args, repository): - """get, set, and delete values in a repository or cache config file""" - - def repo_validate(section, name, value=None, check_value=True): - if section not in ["repository"]: - raise ValueError("Invalid section") - if name in ["segments_per_dir", "last_segment_checked"]: - if check_value: - try: - int(value) - except ValueError: - raise ValueError("Invalid value") from None - elif name in ["max_segment_size", "additional_free_space", "storage_quota"]: - if check_value: - try: - parse_file_size(value) - except ValueError: - raise ValueError("Invalid value") from None - if name == "storage_quota": - if parse_file_size(value) < parse_file_size("10M"): - raise ValueError("Invalid value: storage_quota < 10M") - elif name == "max_segment_size": - if parse_file_size(value) >= MAX_SEGMENT_SIZE_LIMIT: - raise ValueError("Invalid value: max_segment_size >= %d" % MAX_SEGMENT_SIZE_LIMIT) - elif name in ["append_only"]: - if check_value and value not in ["0", "1"]: - raise ValueError("Invalid value") - elif name in ["id"]: - if check_value: - hex_to_bin(value, length=32) - else: - raise ValueError("Invalid name") - - def cache_validate(section, name, value=None, check_value=True): - if section not in ["cache"]: - raise ValueError("Invalid section") - # currently, we do not support setting anything in the cache via borg config. 
- raise ValueError("Invalid name") - - def list_config(config): - default_values = { - "version": "1", - "segments_per_dir": str(DEFAULT_SEGMENTS_PER_DIR), - "max_segment_size": str(MAX_SEGMENT_SIZE_LIMIT), - "additional_free_space": "0", - "storage_quota": repository.storage_quota, - "append_only": repository.append_only, - } - print("[repository]") - for key in [ - "version", - "segments_per_dir", - "max_segment_size", - "storage_quota", - "additional_free_space", - "append_only", - "id", - ]: - value = config.get("repository", key, fallback=False) - if value is None: - value = default_values.get(key) - if value is None: - raise Error("The repository config is missing the %s key which has no default value" % key) - print(f"{key} = {value}") - for key in ["last_segment_checked"]: - value = config.get("repository", key, fallback=None) - if value is None: - continue - print(f"{key} = {value}") - - if not args.list: - if args.name is None: - raise CommandError("No config key name was provided.") - try: - section, name = args.name.split(".") - except ValueError: - section = args.cache and "cache" or "repository" - name = args.name - - if args.cache: - manifest = Manifest.load(repository, (Manifest.Operation.WRITE,)) - assert_secure(repository, manifest, self.lock_wait) - cache = Cache(repository, manifest, lock_wait=self.lock_wait) - - try: - if args.cache: - cache.cache_config.load() - config = cache.cache_config._config - save = cache.cache_config.save - validate = cache_validate - else: - config = repository.config - save = lambda: repository.save_config(repository.path, repository.config) # noqa - validate = repo_validate - - if args.delete: - validate(section, name, check_value=False) - config.remove_option(section, name) - if len(config.options(section)) == 0: - config.remove_section(section) - save() - elif args.list: - list_config(config) - elif args.value: - validate(section, name, args.value) - if section not in config.sections(): - config.add_section(section) - config.set(section, name, args.value) - save() - else: - try: - print(config.get(section, name)) - except (configparser.NoOptionError, configparser.NoSectionError) as e: - raise Error(e) - finally: - if args.cache: - cache.close() - - def build_parser_config(self, subparsers, common_parser, mid_common_parser): - from ._common import process_epilog - - config_epilog = process_epilog( - """ - This command gets and sets options in a local repository or cache config file. - For security reasons, this command only works on local repositories. - - To delete a config value entirely, use ``--delete``. To list the values - of the configuration file or the default values, use ``--list``. To get an existing - key, pass only the key name. To set a key, pass both the key name and - the new value. Keys can be specified in the format "section.name" or - simply "name"; the section will default to "repository" and "cache" for - the repo and cache configs, respectively. - - - By default, borg config manipulates the repository config file. Using ``--cache`` - edits the repository cache's config file instead. 
- """ - ) - subparser = subparsers.add_parser( - "config", - parents=[common_parser], - add_help=False, - description=self.do_config.__doc__, - epilog=config_epilog, - formatter_class=argparse.RawDescriptionHelpFormatter, - help="get and set configuration values", - ) - subparser.set_defaults(func=self.do_config) - subparser.add_argument( - "-c", "--cache", dest="cache", action="store_true", help="get and set values from the repo cache" - ) - - group = subparser.add_mutually_exclusive_group() - group.add_argument( - "-d", "--delete", dest="delete", action="store_true", help="delete the key from the config file" - ) - group.add_argument("-l", "--list", dest="list", action="store_true", help="list the configuration of the repo") - - subparser.add_argument("name", metavar="NAME", nargs="?", help="name of config key") - subparser.add_argument("value", metavar="VALUE", nargs="?", help="new value for key") diff --git a/src/borg/archiver/create_cmd.py b/src/borg/archiver/create_cmd.py index 40160f641f..166e302be0 100644 --- a/src/borg/archiver/create_cmd.py +++ b/src/borg/archiver/create_cmd.py @@ -41,7 +41,7 @@ class CreateMixIn: - @with_repository(exclusive=True, compatibility=(Manifest.Operation.WRITE,)) + @with_repository(compatibility=(Manifest.Operation.WRITE,)) def do_create(self, args, repository, manifest): """Create new archive""" key = manifest.key @@ -196,8 +196,7 @@ def create_inner(archive, cache, fso): archive.stats.rx_bytes = getattr(repository, "rx_bytes", 0) archive.stats.tx_bytes = getattr(repository, "tx_bytes", 0) if sig_int: - # do not save the archive if the user ctrl-c-ed - it is valid, but incomplete. - # we already have a checkpoint archive in this case. + # do not save the archive if the user ctrl-c-ed. raise Error("Got Ctrl-C / SIGINT.") else: archive.save(comment=args.comment, timestamp=args.timestamp) @@ -224,8 +223,6 @@ def create_inner(archive, cache, fso): manifest, progress=args.progress, lock_wait=self.lock_wait, - no_cache_sync_permitted=args.no_cache_sync, - no_cache_sync_forced=args.no_cache_sync_forced, prefer_adhoc_cache=args.prefer_adhoc_cache, cache_mode=args.files_cache_mode, iec=args.iec, @@ -254,16 +251,7 @@ def create_inner(archive, cache, fso): numeric_ids=args.numeric_ids, nobirthtime=args.nobirthtime, ) - cp = ChunksProcessor( - cache=cache, - key=key, - add_item=archive.add_item, - prepare_checkpoint=archive.prepare_checkpoint, - write_checkpoint=archive.write_checkpoint, - checkpoint_interval=args.checkpoint_interval, - checkpoint_volume=args.checkpoint_volume, - rechunkify=False, - ) + cp = ChunksProcessor(cache=cache, key=key, add_item=archive.add_item, rechunkify=False) fso = FilesystemObjectProcessors( metadata_collector=metadata_collector, cache=cache, @@ -587,9 +575,7 @@ def build_parser_create(self, subparsers, common_parser, mid_common_parser): The archive will consume almost no disk space for files or parts of files that have already been stored in other archives. - The archive name needs to be unique. It must not end in '.checkpoint' or - '.checkpoint.N' (with N being a number), because these names are used for - checkpoints and treated in special ways. + The archive name needs to be unique. In the archive name, you may use the following placeholders: {now}, {utcnow}, {fqdn}, {hostname}, {user} and some others. 
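For illustration only, the placeholders listed above expand roughly as in the sketch below; borg's real ``replace_placeholders`` helper supports more keys and format specs, so treat the exact values and formatting as assumptions::

    import getpass
    import socket
    from datetime import datetime, timezone

    def expand_placeholders(name_template: str) -> str:
        """Illustrative sketch -- not borg's actual implementation."""
        now = datetime.now()
        return name_template.format(
            now=now.strftime("%Y-%m-%dT%H:%M:%S"),
            utcnow=datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S"),
            hostname=socket.gethostname(),
            fqdn=socket.getfqdn(),
            user=getpass.getuser(),
        )

    print(expand_placeholders("{hostname}-{now}"))  # e.g. "myhost-2024-09-09T20:15:30"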
@@ -799,18 +785,6 @@ def build_parser_create(self, subparsers, common_parser, mid_common_parser): help="only display items with the given status characters (see description)", ) subparser.add_argument("--json", action="store_true", help="output stats as JSON. Implies ``--stats``.") - subparser.add_argument( - "--no-cache-sync", - dest="no_cache_sync", - action="store_true", - help="experimental: do not synchronize the chunks cache.", - ) - subparser.add_argument( - "--no-cache-sync-forced", - dest="no_cache_sync_forced", - action="store_true", - help="experimental: do not synchronize the chunks cache (forced).", - ) subparser.add_argument( "--prefer-adhoc-cache", dest="prefer_adhoc_cache", @@ -956,25 +930,6 @@ def build_parser_create(self, subparsers, common_parser, mid_common_parser): help="manually specify the archive creation date/time (yyyy-mm-ddThh:mm:ss[(+|-)HH:MM] format, " "(+|-)HH:MM is the UTC offset, default: local time zone). Alternatively, give a reference file/directory.", ) - archive_group.add_argument( - "-c", - "--checkpoint-interval", - metavar="SECONDS", - dest="checkpoint_interval", - type=int, - default=1800, - action=Highlander, - help="write checkpoint every SECONDS seconds (Default: 1800)", - ) - archive_group.add_argument( - "--checkpoint-volume", - metavar="BYTES", - dest="checkpoint_volume", - type=int, - default=0, - action=Highlander, - help="write checkpoint every BYTES bytes (Default: 0, meaning no volume based checkpointing)", - ) archive_group.add_argument( "--chunker-params", metavar="PARAMS", diff --git a/src/borg/archiver/debug_cmd.py b/src/borg/archiver/debug_cmd.py index fe9df81f4a..6f747ec88d 100644 --- a/src/borg/archiver/debug_cmd.py +++ b/src/borg/archiver/debug_cmd.py @@ -11,11 +11,11 @@ from ..helpers import bin_to_hex, hex_to_bin, prepare_dump_dict from ..helpers import dash_open from ..helpers import StableDict -from ..helpers import positive_int_validator, archivename_validator +from ..helpers import archivename_validator from ..helpers import CommandError, RTError from ..manifest import Manifest from ..platform import get_process_id -from ..repository import Repository, LIST_SCAN_LIMIT, TAG_PUT, TAG_DELETE, TAG_COMMIT +from ..repository import Repository, LIST_SCAN_LIMIT from ..repoobj import RepoObj from ._common import with_repository, Highlander @@ -46,7 +46,7 @@ def do_debug_dump_archive(self, args, repository, manifest): """dump decoded archive metadata (not: data)""" repo_objs = manifest.repo_objs try: - archive_meta_orig = manifest.archives.get_raw_dict()[args.name] + archive_meta_orig = manifest.archives.get(args.name, raw=True) except KeyError: raise Archive.DoesNotExist(args.name) @@ -99,7 +99,8 @@ def output(fd): def do_debug_dump_manifest(self, args, repository, manifest): """dump decoded repository manifest""" repo_objs = manifest.repo_objs - _, data = repo_objs.parse(manifest.MANIFEST_ID, repository.get(manifest.MANIFEST_ID), ro_type=ROBJ_MANIFEST) + cdata = repository.get_manifest() + _, data = repo_objs.parse(manifest.MANIFEST_ID, cdata, ro_type=ROBJ_MANIFEST) meta = prepare_dump_dict(msgpack.unpackb(data, object_hook=StableDict)) @@ -108,57 +109,34 @@ def do_debug_dump_manifest(self, args, repository, manifest): @with_repository(manifest=False) def do_debug_dump_repo_objs(self, args, repository): - """dump (decrypted, decompressed) repo objects, repo index MUST be current/correct""" + """dump (decrypted, decompressed) repo objects""" from ..crypto.key import key_factory - def decrypt_dump(i, id, cdata, tag=None, segment=None, 
offset=None): + def decrypt_dump(id, cdata): if cdata is not None: _, data = repo_objs.parse(id, cdata, ro_type=ROBJ_DONTCARE) else: _, data = {}, b"" - tag_str = "" if tag is None else "_" + tag - segment_str = "_" + str(segment) if segment is not None else "" - offset_str = "_" + str(offset) if offset is not None else "" - id_str = "_" + bin_to_hex(id) if id is not None else "" - filename = "%08d%s%s%s%s.obj" % (i, segment_str, offset_str, tag_str, id_str) + filename = f"{bin_to_hex(id)}.obj" print("Dumping", filename) with open(filename, "wb") as fd: fd.write(data) - if args.ghost: - # dump ghosty stuff from segment files: not yet committed objects, deleted / superseded objects, commit tags - - # set up the key without depending on a manifest obj - for id, cdata, tag, segment, offset in repository.scan_low_level(): - if tag == TAG_PUT: - key = key_factory(repository, cdata) - repo_objs = RepoObj(key) - break - i = 0 - for id, cdata, tag, segment, offset in repository.scan_low_level(segment=args.segment, offset=args.offset): - if tag == TAG_PUT: - decrypt_dump(i, id, cdata, tag="put", segment=segment, offset=offset) - elif tag == TAG_DELETE: - decrypt_dump(i, id, None, tag="del", segment=segment, offset=offset) - elif tag == TAG_COMMIT: - decrypt_dump(i, None, None, tag="commit", segment=segment, offset=offset) - i += 1 - else: - # set up the key without depending on a manifest obj - ids = repository.list(limit=1, marker=None) - cdata = repository.get(ids[0]) - key = key_factory(repository, cdata) - repo_objs = RepoObj(key) - state = None - i = 0 - while True: - ids, state = repository.scan(limit=LIST_SCAN_LIMIT, state=state) # must use on-disk order scanning here - if not ids: - break - for id in ids: - cdata = repository.get(id) - decrypt_dump(i, id, cdata) - i += 1 + # set up the key without depending on a manifest obj + result = repository.list(limit=1, marker=None) + id, _ = result[0] + cdata = repository.get(id) + key = key_factory(repository, cdata) + repo_objs = RepoObj(key) + marker = None + while True: + result = repository.list(limit=LIST_SCAN_LIMIT, marker=marker) + if not result: + break + marker = result[-1][0] + for id, stored_size in result: + cdata = repository.get(id) + decrypt_dump(id, cdata) print("Done.") @with_repository(manifest=False) @@ -191,20 +169,22 @@ def print_finding(info, wanted, data, offset): from ..crypto.key import key_factory # set up the key without depending on a manifest obj - ids = repository.list(limit=1, marker=None) - cdata = repository.get(ids[0]) + result = repository.list(limit=1, marker=None) + id, _ = result[0] + cdata = repository.get(id) key = key_factory(repository, cdata) repo_objs = RepoObj(key) - state = None + marker = None last_data = b"" last_id = None i = 0 while True: - ids, state = repository.scan(limit=LIST_SCAN_LIMIT, state=state) # must use on-disk order scanning here - if not ids: + result = repository.list(limit=LIST_SCAN_LIMIT, marker=marker) + if not result: break - for id in ids: + marker = result[-1][0] + for id, stored_size in result: cdata = repository.get(id) _, data = repo_objs.parse(id, cdata, ro_type=ROBJ_DONTCARE) @@ -301,7 +281,7 @@ def do_debug_format_obj(self, args, repository, manifest): with open(args.object_path, "wb") as f: f.write(data_encrypted) - @with_repository(manifest=False, exclusive=True) + @with_repository(manifest=False) def do_debug_put_obj(self, args, repository): """put file contents into the repository""" with open(args.path, "rb") as f: @@ -314,12 +294,10 @@ def do_debug_put_obj(self, 
args, repository): repository.put(id, data) print("object %s put." % hex_id) - repository.commit(compact=False) @with_repository(manifest=False, exclusive=True) def do_debug_delete_obj(self, args, repository): """delete the objects with the given IDs from the repo""" - modified = False for hex_id in args.ids: try: id = hex_to_bin(hex_id, length=32) @@ -328,46 +306,11 @@ def do_debug_delete_obj(self, args, repository): else: try: repository.delete(id) - modified = True print("object %s deleted." % hex_id) except Repository.ObjectNotFound: print("object %s not found." % hex_id) - if modified: - repository.commit(compact=False) print("Done.") - @with_repository(manifest=False, exclusive=True, cache=True, compatibility=Manifest.NO_OPERATION_CHECK) - def do_debug_refcount_obj(self, args, repository, manifest, cache): - """display refcounts for the objects with the given IDs""" - for hex_id in args.ids: - try: - id = hex_to_bin(hex_id, length=32) - except ValueError: - print("object id %s is invalid." % hex_id) - else: - try: - refcount = cache.chunks[id][0] - print("object %s has %d referrers [info from chunks cache]." % (hex_id, refcount)) - except KeyError: - print("object %s not found [info from chunks cache]." % hex_id) - - @with_repository(manifest=False, exclusive=True) - def do_debug_dump_hints(self, args, repository): - """dump repository hints""" - if not repository._active_txn: - repository.prepare_txn(repository.get_transaction_id()) - try: - hints = dict( - segments=repository.segments, - compact=repository.compact, - storage_quota_use=repository.storage_quota_use, - shadow_index={bin_to_hex(k): v for k, v in repository.shadow_index.items()}, - ) - with dash_open(args.path, "w") as fd: - json.dump(hints, fd, indent=4) - finally: - repository.rollback() - def do_debug_convert_profile(self, args): """convert Borg profile to Python profile""" import marshal @@ -484,30 +427,6 @@ def build_parser_debug(self, subparsers, common_parser, mid_common_parser): help="dump repo objects (debug)", ) subparser.set_defaults(func=self.do_debug_dump_repo_objs) - subparser.add_argument( - "--ghost", - dest="ghost", - action="store_true", - help="dump all segment file contents, including deleted/uncommitted objects and commits.", - ) - subparser.add_argument( - "--segment", - metavar="SEG", - dest="segment", - type=positive_int_validator, - default=None, - action=Highlander, - help="used together with --ghost: limit processing to given segment.", - ) - subparser.add_argument( - "--offset", - metavar="OFFS", - dest="offset", - type=positive_int_validator, - default=None, - action=Highlander, - help="used together with --ghost: limit processing to given offset.", - ) debug_search_repo_objs_epilog = process_epilog( """ @@ -672,40 +591,6 @@ def build_parser_debug(self, subparsers, common_parser, mid_common_parser): "ids", metavar="IDs", nargs="+", type=str, help="hex object ID(s) to delete from the repo" ) - debug_refcount_obj_epilog = process_epilog( - """ - This command displays the reference count for objects from the repository. 
- """ - ) - subparser = debug_parsers.add_parser( - "refcount-obj", - parents=[common_parser], - add_help=False, - description=self.do_debug_refcount_obj.__doc__, - epilog=debug_refcount_obj_epilog, - formatter_class=argparse.RawDescriptionHelpFormatter, - help="show refcount for object from repository (debug)", - ) - subparser.set_defaults(func=self.do_debug_refcount_obj) - subparser.add_argument("ids", metavar="IDs", nargs="+", type=str, help="hex object ID(s) to show refcounts for") - - debug_dump_hints_epilog = process_epilog( - """ - This command dumps the repository hints data. - """ - ) - subparser = debug_parsers.add_parser( - "dump-hints", - parents=[common_parser], - add_help=False, - description=self.do_debug_dump_hints.__doc__, - epilog=debug_dump_hints_epilog, - formatter_class=argparse.RawDescriptionHelpFormatter, - help="dump repo hints (debug)", - ) - subparser.set_defaults(func=self.do_debug_dump_hints) - subparser.add_argument("path", metavar="PATH", type=str, help="file to dump data into") - debug_convert_profile_epilog = process_epilog( """ Convert a Borg profile to a Python cProfile compatible profile. diff --git a/src/borg/archiver/delete_cmd.py b/src/borg/archiver/delete_cmd.py index 7095cda903..1e4b1f17ba 100644 --- a/src/borg/archiver/delete_cmd.py +++ b/src/borg/archiver/delete_cmd.py @@ -1,11 +1,9 @@ import argparse import logging -from ._common import with_repository, Highlander -from ..archive import Archive, Statistics -from ..cache import Cache +from ._common import with_repository from ..constants import * # NOQA -from ..helpers import log_multi, format_archive, sig_int, CommandError, Error +from ..helpers import format_archive, CommandError from ..manifest import Manifest from ..logger import create_logger @@ -14,7 +12,7 @@ class DeleteMixIn: - @with_repository(exclusive=True, manifest=False) + @with_repository(manifest=False) def do_delete(self, args, repository): """Delete archives""" self.output_list = args.output_list @@ -29,67 +27,28 @@ def do_delete(self, args, repository): "or just delete the whole repository (might be much faster)." ) - if args.forced == 2: - deleted = False - logger_list = logging.getLogger("borg.output.list") - for i, archive_name in enumerate(archive_names, 1): - try: - current_archive = manifest.archives.pop(archive_name) - except KeyError: - self.print_warning(f"Archive {archive_name} not found ({i}/{len(archive_names)}).") - else: - deleted = True - if self.output_list: - msg = "Would delete: {} ({}/{})" if dry_run else "Deleted archive: {} ({}/{})" - logger_list.info(msg.format(format_archive(current_archive), i, len(archive_names))) - if dry_run: - logger.info("Finished dry-run.") - elif deleted: - manifest.write() - # note: might crash in compact() after committing the repo - repository.commit(compact=False) - self.print_warning('Done. Run "borg check --repair" to clean up the mess.', wc=None) + deleted = False + logger_list = logging.getLogger("borg.output.list") + for i, archive_name in enumerate(archive_names, 1): + try: + # this does NOT use Archive.delete, so this code hopefully even works in cases a corrupt archive + # would make the code in class Archive crash, so the user can at least get rid of such archives. 
+ current_archive = manifest.archives.delete(archive_name) + except KeyError: + self.print_warning(f"Archive {archive_name} not found ({i}/{len(archive_names)}).") else: - self.print_warning("Aborted.", wc=None) - return - - stats = Statistics(iec=args.iec) - with Cache(repository, manifest, progress=args.progress, lock_wait=self.lock_wait, iec=args.iec) as cache: - - def checkpoint_func(): - manifest.write() - repository.commit(compact=False) - cache.commit() - - msg_delete = "Would delete archive: {} ({}/{})" if dry_run else "Deleting archive: {} ({}/{})" - msg_not_found = "Archive {} not found ({}/{})." - logger_list = logging.getLogger("borg.output.list") - uncommitted_deletes = 0 - for i, archive_name in enumerate(archive_names, 1): - if sig_int and sig_int.action_done(): - break - try: - archive_info = manifest.archives[archive_name] - except KeyError: - self.print_warning(msg_not_found.format(archive_name, i, len(archive_names))) - else: - if self.output_list: - logger_list.info(msg_delete.format(format_archive(archive_info), i, len(archive_names))) - - if not dry_run: - archive = Archive(manifest, archive_name, cache=cache) - archive.delete(stats, progress=args.progress, forced=args.forced) - checkpointed = self.maybe_checkpoint( - checkpoint_func=checkpoint_func, checkpoint_interval=args.checkpoint_interval - ) - uncommitted_deletes = 0 if checkpointed else (uncommitted_deletes + 1) - if sig_int: - # Ctrl-C / SIGINT: do not checkpoint (commit) again, we already have a checkpoint in this case. - raise Error("Got Ctrl-C / SIGINT.") - elif uncommitted_deletes > 0: - checkpoint_func() - if args.stats: - log_multi(str(stats), logger=logging.getLogger("borg.output.stats")) + deleted = True + if self.output_list: + msg = "Would delete: {} ({}/{})" if dry_run else "Deleted archive: {} ({}/{})" + logger_list.info(msg.format(format_archive(current_archive), i, len(archive_names))) + if dry_run: + logger.info("Finished dry-run.") + elif deleted: + manifest.write() + self.print_warning('Done. Run "borg compact" to free space.', wc=None) + else: + self.print_warning("Aborted.", wc=None) + return def build_parser_delete(self, subparsers, common_parser, mid_common_parser): from ._common import process_epilog, define_archive_filters_group @@ -103,16 +62,9 @@ def build_parser_delete(self, subparsers, common_parser, mid_common_parser): When in doubt, use ``--dry-run --list`` to see what would be deleted. - When using ``--stats``, you will get some statistics about how much data was - deleted - the "Deleted data" deduplicated size there is most interesting as - that is how much your repository will shrink. - Please note that the "All archives" stats refer to the state after deletion. - You can delete multiple archives by specifying a matching pattern, using the ``--match-archives PATTERN`` option (for more info on these patterns, see :ref:`borg_patterns`). - - Always first use ``--dry-run --list`` to see what would be deleted. 
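The simplified delete flow shown above is essentially a soft delete: only the archives directory entry goes away, the data stays in place until garbage collection. A condensed sketch of that flow (error handling and the ``--dry-run``/``--list`` output omitted)::

    def soft_delete(manifest, archive_names):
        for name in archive_names:
            # removes only the archives directory entry; the archive's chunks stay in the
            # repository until "borg compact" garbage-collects them, so the deletion can
            # still be undone with "borg check --repair --undelete-archives".
            manifest.archives.delete(name)
        manifest.write()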
""" ) subparser = subparsers.add_parser( @@ -129,30 +81,4 @@ def build_parser_delete(self, subparsers, common_parser, mid_common_parser): subparser.add_argument( "--list", dest="output_list", action="store_true", help="output verbose list of archives" ) - subparser.add_argument( - "--consider-checkpoints", - action="store_true", - dest="consider_checkpoints", - help="consider checkpoint archives for deletion (default: not considered).", - ) - subparser.add_argument( - "-s", "--stats", dest="stats", action="store_true", help="print statistics for the deleted archive" - ) - subparser.add_argument( - "--force", - dest="forced", - action="count", - default=0, - help="force deletion of corrupted archives, " "use ``--force --force`` in case ``--force`` does not work.", - ) - subparser.add_argument( - "-c", - "--checkpoint-interval", - metavar="SECONDS", - dest="checkpoint_interval", - type=int, - default=1800, - action=Highlander, - help="write checkpoint every SECONDS seconds (Default: 1800)", - ) define_archive_filters_group(subparser) diff --git a/src/borg/archiver/info_cmd.py b/src/borg/archiver/info_cmd.py index 78ccf105dc..3de4d5a368 100644 --- a/src/borg/archiver/info_cmd.py +++ b/src/borg/archiver/info_cmd.py @@ -18,7 +18,6 @@ class InfoMixIn: def do_info(self, args, repository, manifest, cache): """Show archive details such as disk space used""" - args.consider_checkpoints = True archive_names = tuple(x.name for x in manifest.archives.list_considering(args)) output_data = [] @@ -44,7 +43,6 @@ def do_info(self, args, repository, manifest, cache): Command line: {command_line} Number of files: {stats[nfiles]} Original size: {stats[original_size]} - Deduplicated size: {stats[deduplicated_size]} """ ) .strip() diff --git a/src/borg/archiver/key_cmds.py b/src/borg/archiver/key_cmds.py index 1a7b4769a0..50e0315b4d 100644 --- a/src/borg/archiver/key_cmds.py +++ b/src/borg/archiver/key_cmds.py @@ -73,13 +73,8 @@ def do_change_location(self, args, repository, manifest, cache): manifest.key = key_new manifest.repo_objs.key = key_new manifest.write() - repository.commit(compact=False) - # we need to rewrite cache config and security key-type info, - # so that the cached key-type will match the repo key-type. - cache.begin_txn() # need to start a cache transaction, otherwise commit() does nothing. 
cache.key = key_new - cache.commit() loc = key_new.find_key() if hasattr(key_new, "find_key") else None if args.keep: @@ -88,7 +83,7 @@ def do_change_location(self, args, repository, manifest, cache): key.remove(key.target) # remove key from current location logger.info(f"Key moved to {loc}") - @with_repository(lock=False, exclusive=False, manifest=False, cache=False) + @with_repository(lock=False, manifest=False, cache=False) def do_key_export(self, args, repository): """Export the repository key for backup""" manager = KeyManager(repository) @@ -107,7 +102,7 @@ def do_key_export(self, args, repository): except IsADirectoryError: raise CommandError(f"'{args.path}' must be a file, not a directory") - @with_repository(lock=False, exclusive=False, manifest=False, cache=False) + @with_repository(lock=False, manifest=False, cache=False) def do_key_import(self, args, repository): """Import the repository key from backup""" manager = KeyManager(repository) diff --git a/src/borg/archiver/lock_cmds.py b/src/borg/archiver/lock_cmds.py index d0cf026eeb..1a9c0051e3 100644 --- a/src/borg/archiver/lock_cmds.py +++ b/src/borg/archiver/lock_cmds.py @@ -4,8 +4,7 @@ from ._common import with_repository from ..cache import Cache from ..constants import * # NOQA -from ..helpers import prepare_subprocess_env, set_ec, CommandError -from ..manifest import Manifest +from ..helpers import prepare_subprocess_env, set_ec, CommandError, ThreadRunner from ..logger import create_logger @@ -16,20 +15,10 @@ class LocksMixIn: @with_repository(manifest=False, exclusive=True) def do_with_lock(self, args, repository): """run a user specified command with the repository lock held""" - # for a new server, this will immediately take an exclusive lock. - # to support old servers, that do not have "exclusive" arg in open() - # RPC API, we also do it the old way: - # re-write manifest to start a repository transaction - this causes a - # lock upgrade to exclusive for remote (and also for local) repositories. - # by using manifest=False in the decorator, we avoid having to require - # the encryption key (and can operate just with encrypted data). - data = repository.get(Manifest.MANIFEST_ID) - repository.put(Manifest.MANIFEST_ID, data) - # usually, a 0 byte (open for writing) segment file would be visible in the filesystem here. - # we write and close this file, to rather have a valid segment file on disk, before invoking the subprocess. - # we can only do this for local repositories (with .io), though: - if hasattr(repository, "io"): - repository.io.close_segment() + # the repository lock needs to get refreshed regularly, or it will be killed as stale. + # refreshing the lock is not part of the repository API, so we do it indirectly via repository.info. + lock_refreshing_thread = ThreadRunner(sleep_interval=60, target=repository.info) + lock_refreshing_thread.start() env = prepare_subprocess_env(system=True) try: # we exit with the return code we get from the subprocess @@ -38,13 +27,7 @@ def do_with_lock(self, args, repository): except (FileNotFoundError, OSError, ValueError) as e: raise CommandError(f"Error while trying to run '{args.command}': {e}") finally: - # we need to commit the "no change" operation we did to the manifest - # because it created a new segment file in the repository. if we would - # roll back, the same file would be later used otherwise (for other content). - # that would be bad if somebody uses rsync with ignore-existing (or - # any other mechanism relying on existing segment data not changing). 
- # see issue #1867. - repository.commit(compact=False) + lock_refreshing_thread.terminate() @with_repository(lock=False, manifest=False) def do_break_lock(self, args, repository): diff --git a/src/borg/archiver/mount_cmds.py b/src/borg/archiver/mount_cmds.py index 15eade7a28..3d80090bc1 100644 --- a/src/borg/archiver/mount_cmds.py +++ b/src/borg/archiver/mount_cmds.py @@ -158,12 +158,6 @@ def _define_borg_mount(self, parser): from ._common import define_exclusion_group, define_archive_filters_group parser.set_defaults(func=self.do_mount) - parser.add_argument( - "--consider-checkpoints", - action="store_true", - dest="consider_checkpoints", - help="Show checkpoint archives in the repository contents list (default: hidden).", - ) parser.add_argument("mountpoint", metavar="MOUNTPOINT", type=str, help="where to mount filesystem") parser.add_argument( "-f", "--foreground", dest="foreground", action="store_true", help="stay in foreground, do not daemonize" diff --git a/src/borg/archiver/prune_cmd.py b/src/borg/archiver/prune_cmd.py index e8121993aa..c9cabbf311 100644 --- a/src/borg/archiver/prune_cmd.py +++ b/src/borg/archiver/prune_cmd.py @@ -4,13 +4,12 @@ import logging from operator import attrgetter import os -import re from ._common import with_repository, Highlander -from ..archive import Archive, Statistics +from ..archive import Archive from ..cache import Cache from ..constants import * # NOQA -from ..helpers import ArchiveFormatter, interval, sig_int, log_multi, ProgressIndicatorPercent, CommandError, Error +from ..helpers import ArchiveFormatter, interval, sig_int, ProgressIndicatorPercent, CommandError, Error from ..manifest import Manifest from ..logger import create_logger @@ -71,7 +70,7 @@ def prune_split(archives, rule, n, kept_because=None): class PruneMixIn: - @with_repository(exclusive=True, compatibility=(Manifest.Operation.DELETE,)) + @with_repository(compatibility=(Manifest.Operation.DELETE,)) def do_prune(self, args, repository, manifest): """Prune repository archives according to specified rules""" if not any( @@ -91,25 +90,7 @@ def do_prune(self, args, repository, manifest): format = os.environ.get("BORG_PRUNE_FORMAT", "{archive:<36} {time} [{id}]") formatter = ArchiveFormatter(format, repository, manifest, manifest.key, iec=args.iec) - checkpoint_re = r"\.checkpoint(\.\d+)?" - archives_checkpoints = manifest.archives.list( - match=args.match_archives, - consider_checkpoints=True, - match_end=r"(%s)?\Z" % checkpoint_re, - sort_by=["ts"], - reverse=True, - ) - is_checkpoint = re.compile(r"(%s)\Z" % checkpoint_re).search - checkpoints = [arch for arch in archives_checkpoints if is_checkpoint(arch.name)] - # keep the latest checkpoint, if there is no later non-checkpoint archive - if archives_checkpoints and checkpoints and archives_checkpoints[0] is checkpoints[0]: - keep_checkpoints = checkpoints[:1] - else: - keep_checkpoints = [] - checkpoints = set(checkpoints) - # ignore all checkpoint archives to avoid keeping one (which is an incomplete backup) - # that is newer than a successfully completed backup - and killing the successful backup. 
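The lock-refresh pattern introduced in ``do_with_lock`` (lock_cmds.py) above deserves spelling out: while the subprocess runs, something must keep touching the repository so the borgstore lock is not considered stale. Borg uses its ``ThreadRunner`` helper with ``repository.info`` as the target; the sketch below only illustrates the general idea with plain ``threading`` and is not that helper's actual implementation::

    import threading

    class PeriodicRefresher:
        """Call target() every sleep_interval seconds until terminated (illustrative sketch)."""

        def __init__(self, sleep_interval, target):
            self._stop = threading.Event()
            self._interval = sleep_interval
            self._target = target
            self._thread = threading.Thread(target=self._run, daemon=True)

        def _run(self):
            while not self._stop.wait(self._interval):
                self._target()  # e.g. repository.info, which refreshes the repo lock as a side effect

        def start(self):
            self._thread.start()

        def terminate(self):
            self._stop.set()
            self._thread.join()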
- archives = [arch for arch in archives_checkpoints if arch not in checkpoints] + archives = manifest.archives.list(match=args.match_archives, sort_by=["ts"], reverse=True) keep = [] # collect the rule responsible for the keeping of each archive in this dict # keys are archive ids, values are a tuple @@ -126,22 +107,15 @@ def do_prune(self, args, repository, manifest): if num is not None: keep += prune_split(archives, rule, num, kept_because) - to_delete = (set(archives) | checkpoints) - (set(keep) | set(keep_checkpoints)) - stats = Statistics(iec=args.iec) + to_delete = set(archives) - set(keep) with Cache(repository, manifest, lock_wait=self.lock_wait, iec=args.iec) as cache: - - def checkpoint_func(): - manifest.write() - repository.commit(compact=False) - cache.commit() - list_logger = logging.getLogger("borg.output.list") # set up counters for the progress display to_delete_len = len(to_delete) archives_deleted = 0 uncommitted_deletes = 0 pi = ProgressIndicatorPercent(total=len(to_delete), msg="Pruning archives %3.0f%%", msgid="prune") - for archive in archives_checkpoints: + for archive in archives: if sig_int and sig_int.action_done(): break if archive in to_delete: @@ -152,18 +126,12 @@ def checkpoint_func(): archives_deleted += 1 log_message = "Pruning archive (%d/%d):" % (archives_deleted, to_delete_len) archive = Archive(manifest, archive.name, cache) - archive.delete(stats, forced=args.forced) - checkpointed = self.maybe_checkpoint( - checkpoint_func=checkpoint_func, checkpoint_interval=args.checkpoint_interval - ) - uncommitted_deletes = 0 if checkpointed else (uncommitted_deletes + 1) + archive.delete() + uncommitted_deletes += 1 else: - if is_checkpoint(archive.name): - log_message = "Keeping checkpoint archive:" - else: - log_message = "Keeping archive (rule: {rule} #{num}):".format( - rule=kept_because[archive.id][0], num=kept_because[archive.id][1] - ) + log_message = "Keeping archive (rule: {rule} #{num}):".format( + rule=kept_because[archive.id][0], num=kept_because[archive.id][1] + ) if ( args.output_list or (args.list_pruned and archive in to_delete) @@ -172,12 +140,9 @@ def checkpoint_func(): list_logger.info(f"{log_message:<44} {formatter.format_item(archive, jsonline=False)}") pi.finish() if sig_int: - # Ctrl-C / SIGINT: do not checkpoint (commit) again, we already have a checkpoint in this case. raise Error("Got Ctrl-C / SIGINT.") elif uncommitted_deletes > 0: - checkpoint_func() - if args.stats: - log_multi(str(stats), logger=logging.getLogger("borg.output.stats")) + manifest.write() def build_parser_prune(self, subparsers, common_parser, mid_common_parser): from ._common import process_epilog @@ -195,11 +160,6 @@ def build_parser_prune(self, subparsers, common_parser, mid_common_parser): `GFS `_ (Grandfather-father-son) backup rotation scheme. - Also, prune automatically removes checkpoint archives (incomplete archives left - behind by interrupted backup runs) except if the checkpoint is the latest - archive (and thus still needed). Checkpoint archives are not considered when - comparing archive counts against the retention limits (``--keep-X``). - If you use --match-archives (-a), then only archives that match the pattern are considered for deletion and only those archives count towards the totals specified by the rules. @@ -235,11 +195,6 @@ def build_parser_prune(self, subparsers, common_parser, mid_common_parser): keep the last N archives under the assumption that you do not create more than one backup archive in the same second). 
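The retention rules discussed in this epilog (and implemented by ``prune_split`` above) boil down to: walk the archives newest first, bucket them by the period a rule cares about, and keep the newest archive of each of the first N distinct periods. A hedged, self-contained illustration of that idea; borg's real code also records *why* each archive was kept::

    from datetime import datetime

    def keep_per_period(archives, period_fmt, n):
        """archives: (name, datetime) tuples sorted newest first.
        period_fmt: e.g. "%Y-%m-%d" for daily, "%G-%V" for weekly, "%Y-%m" for monthly."""
        kept, seen = [], set()
        for name, ts in archives:
            period = ts.strftime(period_fmt)
            if period not in seen:
                seen.add(period)
                kept.append(name)
                if len(kept) == n:
                    break
        return kept

    archives = [
        ("pc-2024-09-09-evening", datetime(2024, 9, 9, 20, 0)),
        ("pc-2024-09-09-noon", datetime(2024, 9, 9, 12, 0)),
        ("pc-2024-09-08-evening", datetime(2024, 9, 8, 20, 0)),
    ]
    print(keep_per_period(archives, "%Y-%m-%d", 7))  # ['pc-2024-09-09-evening', 'pc-2024-09-08-evening']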
- When using ``--stats``, you will get some statistics about how much data was - deleted - the "Deleted data" deduplicated size there is most interesting as - that is how much your repository will shrink. - Please note that the "All archives" stats refer to the state after pruning. - You can influence how the ``--list`` output is formatted by using the ``--short`` option (less wide output) or by giving a custom format using ``--format`` (see the ``borg rlist`` description for more details about the format string). @@ -256,15 +211,6 @@ def build_parser_prune(self, subparsers, common_parser, mid_common_parser): ) subparser.set_defaults(func=self.do_prune) subparser.add_argument("-n", "--dry-run", dest="dry_run", action="store_true", help="do not change repository") - subparser.add_argument( - "--force", - dest="forced", - action="store_true", - help="force pruning of corrupted archives, " "use ``--force --force`` in case ``--force`` does not work.", - ) - subparser.add_argument( - "-s", "--stats", dest="stats", action="store_true", help="print statistics for the deleted archive" - ) subparser.add_argument( "--list", dest="output_list", action="store_true", help="output verbose list of archives it keeps/prunes" ) @@ -353,13 +299,3 @@ def build_parser_prune(self, subparsers, common_parser, mid_common_parser): help="number of yearly archives to keep", ) define_archive_filters_group(subparser, sort_by=False, first_last=False) - subparser.add_argument( - "-c", - "--checkpoint-interval", - metavar="SECONDS", - dest="checkpoint_interval", - type=int, - default=1800, - action=Highlander, - help="write checkpoint every SECONDS seconds (Default: 1800)", - ) diff --git a/src/borg/archiver/rcompress_cmd.py b/src/borg/archiver/rcompress_cmd.py index 30706dcd6d..c9bfecfe08 100644 --- a/src/borg/archiver/rcompress_cmd.py +++ b/src/borg/archiver/rcompress_cmd.py @@ -5,7 +5,8 @@ from ..constants import * # NOQA from ..compress import CompressionSpec, ObfuscateSize, Auto, COMPRESSOR_TABLE from ..helpers import sig_int, ProgressIndicatorPercent, Error - +from ..repository import Repository +from ..remote import RemoteRepository from ..manifest import Manifest from ..logger import create_logger @@ -15,27 +16,16 @@ def find_chunks(repository, repo_objs, stats, ctype, clevel, olevel): """find chunks that need processing (usually: recompression).""" - # to do it this way is maybe not obvious, thus keeping the essential design criteria here: - # - determine the chunk ids at one point in time (== do a **full** scan in one go) **before** - # writing to the repo (and especially before doing a compaction, which moves segment files around) - # - get the chunk ids in **on-disk order** (so we can efficiently compact while processing the chunks) - # - only put the ids into the list that actually need recompression (keeps it a little shorter in some cases) recompress_ids = [] compr_keys = stats["compr_keys"] = set() compr_wanted = ctype, clevel, olevel - state = None - chunks_count = len(repository) - chunks_limit = min(1000, max(100, chunks_count // 1000)) - pi = ProgressIndicatorPercent( - total=chunks_count, - msg="Searching for recompression candidates %3.1f%%", - step=0.1, - msgid="rcompress.find_chunks", - ) + marker = None while True: - chunk_ids, state = repository.scan(limit=chunks_limit, state=state) - if not chunk_ids: + result = repository.list(limit=LIST_SCAN_LIMIT, marker=marker) + if not result: break + marker = result[-1][0] + chunk_ids = [id for id, _ in result] for id, chunk_no_data in zip(chunk_ids, 
repository.get_many(chunk_ids, read_data=False)): meta = repo_objs.parse_meta(id, chunk_no_data, ro_type=ROBJ_DONTCARE) compr_found = meta["ctype"], meta["clevel"], meta.get("olevel", -1) @@ -44,8 +34,6 @@ def find_chunks(repository, repo_objs, stats, ctype, clevel, olevel): compr_keys.add(compr_found) stats[compr_found] += 1 stats["checked_count"] += 1 - pi.show(increase=1) - pi.finish() return recompress_ids @@ -100,7 +88,7 @@ def format_compression_spec(ctype, clevel, olevel): class RCompressMixIn: - @with_repository(cache=False, manifest=True, exclusive=True, compatibility=(Manifest.Operation.CHECK,)) + @with_repository(cache=False, manifest=True, compatibility=(Manifest.Operation.CHECK,)) def do_rcompress(self, args, repository, manifest): """Repository (re-)compression""" @@ -114,25 +102,17 @@ def get_csettings(c): ctype, clevel, olevel = c.ID, c.level, -1 return ctype, clevel, olevel + if not isinstance(repository, (Repository, RemoteRepository)): + raise Error("rcompress not supported for legacy repositories.") + repo_objs = manifest.repo_objs ctype, clevel, olevel = get_csettings(repo_objs.compressor) # desired compression set by --compression - def checkpoint_func(): - while repository.async_response(wait=True) is not None: - pass - repository.commit(compact=True) - stats_find = defaultdict(int) stats_process = defaultdict(int) recompress_ids = find_chunks(repository, repo_objs, stats_find, ctype, clevel, olevel) recompress_candidate_count = len(recompress_ids) chunks_limit = min(1000, max(100, recompress_candidate_count // 1000)) - uncommitted_chunks = 0 - - # start a new transaction - data = repository.get(Manifest.MANIFEST_ID) - repository.put(Manifest.MANIFEST_ID, data) - uncommitted_chunks += 1 pi = ProgressIndicatorPercent( total=len(recompress_ids), msg="Recompressing %3.1f%%", step=0.1, msgid="rcompress.process_chunks" @@ -143,16 +123,13 @@ def checkpoint_func(): ids, recompress_ids = recompress_ids[:chunks_limit], recompress_ids[chunks_limit:] process_chunks(repository, repo_objs, stats_process, ids, olevel) pi.show(increase=len(ids)) - checkpointed = self.maybe_checkpoint( - checkpoint_func=checkpoint_func, checkpoint_interval=args.checkpoint_interval - ) - uncommitted_chunks = 0 if checkpointed else (uncommitted_chunks + len(ids)) pi.finish() if sig_int: - # Ctrl-C / SIGINT: do not checkpoint (commit) again, we already have a checkpoint in this case. + # Ctrl-C / SIGINT: do not commit raise Error("Got Ctrl-C / SIGINT.") - elif uncommitted_chunks > 0: - checkpoint_func() + else: + while repository.async_response(wait=True) is not None: + pass if args.stats: print() print("Recompression stats:") @@ -185,20 +162,14 @@ def build_parser_rcompress(self, subparsers, common_parser, mid_common_parser): """ Repository (re-)compression (and/or re-obfuscation). - Reads all chunks in the repository (in on-disk order, this is important for - compaction) and recompresses them if they are not already using the compression - type/level and obfuscation level given via ``--compression``. + Reads all chunks in the repository and recompresses them if they are not already + using the compression type/level and obfuscation level given via ``--compression``. If the outcome of the chunk processing indicates a change in compression type/level or obfuscation level, the processed chunk is written to the repository. Please note that the outcome might not always be the desired compression type/level - if no compression gives a shorter output, that might be chosen. 
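With ``repository.scan()`` and its ``state`` gone, ``find_chunks()`` above (and the cache code later in this diff) pages through the repository with ``repository.list(limit, marker)``, which returns ``(id, stored_size)`` tuples and an empty result when exhausted. A minimal sketch of that idiom, assuming exactly the return shape used in the hunk above (the helper name is illustrative)::

    def iter_all_chunk_ids(repository, limit):
        """Yield every chunk id by paging through repository.list()."""
        marker = None
        while True:
            result = repository.list(limit=limit, marker=marker)
            if not result:
                break
            marker = result[-1][0]          # resume after the last id of this batch
            for chunk_id, stored_size in result:
                yield chunk_id

    # usage sketch: ids = list(iter_all_chunk_ids(repository, limit=LIST_SCAN_LIMIT))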
- Every ``--checkpoint-interval``, progress is committed to the repository and - the repository is compacted (this is to keep temporary repo space usage in bounds). - A lower checkpoint interval means lower temporary repo space usage, but also - slower progress due to higher overhead (and vice versa). - Please note that this command can not work in low (or zero) free disk space conditions. @@ -234,14 +205,3 @@ def build_parser_rcompress(self, subparsers, common_parser, mid_common_parser): ) subparser.add_argument("-s", "--stats", dest="stats", action="store_true", help="print statistics") - - subparser.add_argument( - "-c", - "--checkpoint-interval", - metavar="SECONDS", - dest="checkpoint_interval", - type=int, - default=1800, - action=Highlander, - help="write checkpoint every SECONDS seconds (Default: 1800)", - ) diff --git a/src/borg/archiver/rcreate_cmd.py b/src/borg/archiver/rcreate_cmd.py index b4a66645d4..d96e6d6ad2 100644 --- a/src/borg/archiver/rcreate_cmd.py +++ b/src/borg/archiver/rcreate_cmd.py @@ -4,7 +4,7 @@ from ..cache import Cache from ..constants import * # NOQA from ..crypto.key import key_creator, key_argument_names -from ..helpers import CancelledByUser +from ..helpers import CancelledByUser, CommandError from ..helpers import location_validator, Location from ..helpers import parse_storage_quota from ..manifest import Manifest @@ -19,6 +19,10 @@ class RCreateMixIn: @with_other_repository(manifest=True, compatibility=(Manifest.Operation.READ,)) def do_rcreate(self, args, repository, *, other_repository=None, other_manifest=None): """Create a new, empty repository""" + if args.storage_quota is not None: + raise CommandError("storage-quota is not supported (yet?)") + if args.append_only: + raise CommandError("append-only is not supported (yet?)") other_key = other_manifest.key if other_manifest is not None else None path = args.location.canonical_path() logger.info('Initializing repository at "%s"' % path) @@ -32,7 +36,6 @@ def do_rcreate(self, args, repository, *, other_repository=None, other_manifest= manifest = Manifest(key, repository) manifest.key = key manifest.write() - repository.commit(compact=False) with Cache(repository, manifest, warn_if_unencrypted=False): pass if key.NAME != "plaintext": @@ -49,16 +52,22 @@ def do_rcreate(self, args, repository, *, other_repository=None, other_manifest= " borg key export -r REPOSITORY encrypted-key-backup\n" " borg key export -r REPOSITORY --paper encrypted-key-backup.txt\n" " borg key export -r REPOSITORY --qr-html encrypted-key-backup.html\n" - "2. Write down the borg key passphrase and store it at safe place.\n" + "2. Write down the borg key passphrase and store it at safe place." ) + logger.warning( + "\n" + "Reserve some repository storage space now for emergencies like 'disk full'\n" + "by running:\n" + " borg rspace --reserve 1G" + ) def build_parser_rcreate(self, subparsers, common_parser, mid_common_parser): from ._common import process_epilog rcreate_epilog = process_epilog( """ - This command creates a new, empty repository. A repository is a filesystem - directory containing the deduplicated data from zero or more archives. + This command creates a new, empty repository. A repository is a ``borgstore`` store + containing the deduplicated data from zero or more archives. Encryption mode TLDR ++++++++++++++++++++ @@ -172,6 +181,14 @@ def build_parser_rcreate(self, subparsers, common_parser, mid_common_parser): keys to manage. Creating related repositories is useful e.g. if you want to use ``borg transfer`` later. 
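Why a *related* repository matters here: conceptually, chunk ids are a keyed MAC over the plaintext chunk, so only a destination repository that reuses the source's id-key (and chunker parameters) assigns the same ids to the same data, which is what later lets ``borg transfer`` recognize chunks that are already present. A rough conceptual sketch, not borg's actual key code (the key value is a stand-in)::

    import hashlib
    import hmac

    def chunk_id(id_key: bytes, chunk_data: bytes) -> bytes:
        # keyed MAC of the *plaintext* chunk: same key + same data => same id
        return hmac.new(id_key, chunk_data, hashlib.sha256).digest()

    id_key = b"\x01" * 32   # stand-in for the secret reused via --other-repo
    assert chunk_id(id_key, b"some file data") == chunk_id(id_key, b"some file data")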
+ + Creating a related repository for data migration from borg 1.2 or 1.4 + +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + + You can use ``borg rcreate --other-repo ORIG_REPO --from-borg1 ...`` to create a related + repository that uses the same secret key material as the given other/original repository. + + Then use ``borg transfer --other-repo ORIG_REPO --from-borg1 ...`` to transfer the archives. """ ) subparser = subparsers.add_parser( @@ -193,6 +210,9 @@ def build_parser_rcreate(self, subparsers, common_parser, mid_common_parser): action=Highlander, help="reuse the key material from the other repository", ) + subparser.add_argument( + "--from-borg1", dest="v1_or_v2", action="store_true", help="other repository is borg 1.x" + ) subparser.add_argument( "-e", "--encryption", diff --git a/src/borg/archiver/rdelete_cmd.py b/src/borg/archiver/rdelete_cmd.py index 30bb66d962..e1cfc43e93 100644 --- a/src/borg/archiver/rdelete_cmd.py +++ b/src/borg/archiver/rdelete_cmd.py @@ -29,7 +29,7 @@ def do_rdelete(self, args, repository): msg = [] try: manifest = Manifest.load(repository, Manifest.NO_OPERATION_CHECK) - n_archives = len(manifest.archives) + n_archives = manifest.archives.count() msg.append( f"You requested to DELETE the following repository completely " f"*including* {n_archives} archives it contains:" diff --git a/src/borg/archiver/recreate_cmd.py b/src/borg/archiver/recreate_cmd.py index 9ba12579aa..68116edab2 100644 --- a/src/borg/archiver/recreate_cmd.py +++ b/src/borg/archiver/recreate_cmd.py @@ -15,7 +15,7 @@ class RecreateMixIn: - @with_repository(cache=True, exclusive=True, compatibility=(Manifest.Operation.CHECK,)) + @with_repository(cache=True, compatibility=(Manifest.Operation.CHECK,)) def do_recreate(self, args, repository, manifest, cache): """Re-create archives""" matcher = build_matcher(args.patterns, args.paths) @@ -34,8 +34,6 @@ def do_recreate(self, args, repository, manifest, cache): progress=args.progress, stats=args.stats, file_status_printer=self.print_file_status, - checkpoint_interval=args.checkpoint_interval, - checkpoint_volume=args.checkpoint_volume, dry_run=args.dry_run, timestamp=args.timestamp, ) @@ -51,8 +49,6 @@ def do_recreate(self, args, repository, manifest, cache): logger.info("Skipped archive %s: Nothing to do. 
Archive was not processed.", name) if not args.dry_run: manifest.write() - repository.commit(compact=False) - cache.commit() def build_parser_recreate(self, subparsers, common_parser, mid_common_parser): from ._common import process_epilog @@ -142,25 +138,6 @@ def build_parser_recreate(self, subparsers, common_parser, mid_common_parser): help="create a new archive with the name ARCHIVE, do not replace existing archive " "(only applies for a single archive)", ) - archive_group.add_argument( - "-c", - "--checkpoint-interval", - dest="checkpoint_interval", - type=int, - default=1800, - action=Highlander, - metavar="SECONDS", - help="write checkpoint every SECONDS seconds (Default: 1800)", - ) - archive_group.add_argument( - "--checkpoint-volume", - metavar="BYTES", - dest="checkpoint_volume", - type=int, - default=0, - action=Highlander, - help="write checkpoint every BYTES bytes (Default: 0, meaning no volume based checkpointing)", - ) archive_group.add_argument( "--comment", metavar="COMMENT", diff --git a/src/borg/archiver/rename_cmd.py b/src/borg/archiver/rename_cmd.py index 67dff512b6..ba94a0691e 100644 --- a/src/borg/archiver/rename_cmd.py +++ b/src/borg/archiver/rename_cmd.py @@ -11,14 +11,12 @@ class RenameMixIn: - @with_repository(exclusive=True, cache=True, compatibility=(Manifest.Operation.CHECK,)) + @with_repository(cache=True, compatibility=(Manifest.Operation.CHECK,)) @with_archive def do_rename(self, args, repository, manifest, cache, archive): """Rename an existing archive""" archive.rename(args.newname) manifest.write() - repository.commit(compact=False) - cache.commit() def build_parser_rename(self, subparsers, common_parser, mid_common_parser): from ._common import process_epilog diff --git a/src/borg/archiver/rinfo_cmd.py b/src/borg/archiver/rinfo_cmd.py index bba038117c..91a7627f3a 100644 --- a/src/borg/archiver/rinfo_cmd.py +++ b/src/borg/archiver/rinfo_cmd.py @@ -64,7 +64,6 @@ def do_rinfo(self, args, repository, manifest, cache): output += "Security dir: {security_dir}\n".format(**info) print(output) - print(str(cache)) def build_parser_rinfo(self, subparsers, common_parser, mid_common_parser): from ._common import process_epilog @@ -72,15 +71,6 @@ def build_parser_rinfo(self, subparsers, common_parser, mid_common_parser): rinfo_epilog = process_epilog( """ This command displays detailed information about the repository. - - Please note that the deduplicated sizes of the individual archives do not add - up to the deduplicated size of the repository ("all archives"), because the two - are meaning different things: - - This archive / deduplicated size = amount of data stored ONLY for this archive - = unique chunks of this archive. - All archives / deduplicated size = amount of data stored in the repo - = all chunks in the repository. 
""" ) subparser = subparsers.add_parser( diff --git a/src/borg/archiver/rlist_cmd.py b/src/borg/archiver/rlist_cmd.py index 0ef621f141..8d1b24c356 100644 --- a/src/borg/archiver/rlist_cmd.py +++ b/src/borg/archiver/rlist_cmd.py @@ -92,12 +92,6 @@ def build_parser_rlist(self, subparsers, common_parser, mid_common_parser): help="list repository contents", ) subparser.set_defaults(func=self.do_rlist) - subparser.add_argument( - "--consider-checkpoints", - action="store_true", - dest="consider_checkpoints", - help="Show checkpoint archives in the repository contents list (default: hidden).", - ) subparser.add_argument( "--short", dest="short", action="store_true", help="only print the archive names, nothing else" ) diff --git a/src/borg/archiver/rspace_cmd.py b/src/borg/archiver/rspace_cmd.py new file mode 100644 index 0000000000..8352590d8b --- /dev/null +++ b/src/borg/archiver/rspace_cmd.py @@ -0,0 +1,110 @@ +import argparse +import math +import os + +from ._common import with_repository, Highlander +from ..constants import * # NOQA +from ..helpers import parse_file_size, format_file_size + +from ..logger import create_logger + +logger = create_logger() + + +class RSpaceMixIn: + @with_repository(lock=False, manifest=False) + def do_rspace(self, args, repository): + """Manage reserved space in repository""" + # we work without locking here because locks don't work with full disk. + if args.reserve_space > 0: + storage_space_reserve_object_size = 64 * 2**20 # 64 MiB per object + count = math.ceil(float(args.reserve_space) / storage_space_reserve_object_size) # round up + size = 0 + for i in range(count): + data = os.urandom(storage_space_reserve_object_size) # counter-act fs compression/dedup + repository.store_store(f"config/space-reserve.{i}", data) + size += len(data) + print(f"There is {format_file_size(size, iec=False)} reserved space in this repository now.") + elif args.free_space: + infos = repository.store_list("config") + size = 0 + for info in infos: + if info.name.startswith("space-reserve."): + size += info.size + repository.store_delete(f"config/{info.name}") + print(f"Freed {format_file_size(size, iec=False)} in repository.") + print("Now run borg prune or borg delete plus borg compact to free more space.") + print("After that, do not forget to reserve space again for next time!") + else: # print amount currently reserved + infos = repository.store_list("config") + size = 0 + for info in infos: + if info.name.startswith("space-reserve."): + size += info.size + print(f"There is {format_file_size(size, iec=False)} reserved space in this repository.") + print("In case you want to change the amount, use --free first to free all reserved space,") + print("then use --reserve with the desired amount.") + + def build_parser_rspace(self, subparsers, common_parser, mid_common_parser): + from ._common import process_epilog + + rspace_epilog = process_epilog( + """ + This command manages reserved space in a repository. + + Borg can not work in disk-full conditions (can not lock a repo and thus can + not run prune/delete or compact operations to free disk space). + + To avoid running into dead-end situations like that, you can put some objects + into a repository that take up some disk space. If you ever run into a + disk-full situation, you can free that space and then borg will be able to + run normally, so you can free more disk space by using prune/delete/compact. + After that, don't forget to reserve space again, in case you run into that + situation again at a later time. 
+ + Examples:: + + # Create a new repository: + $ borg rcreate ... + # Reserve approx. 1GB of space for emergencies: + $ borg rspace --reserve 1G + + # Check amount of reserved space in the repository: + $ borg rspace + + # EMERGENCY! Free all reserved space to get things back to normal: + $ borg rspace --free + $ borg prune ... + $ borg delete ... + $ borg compact -v # only this actually frees space of deleted archives + $ borg rspace --reserve 1G # reserve space again for next time + + + Reserved space is always rounded up to use full reservation blocks of 64MiB. + """ + ) + subparser = subparsers.add_parser( + "rspace", + parents=[common_parser], + add_help=False, + description=self.do_rspace.__doc__, + epilog=rspace_epilog, + formatter_class=argparse.RawDescriptionHelpFormatter, + help="manage reserved space in a repository", + ) + subparser.set_defaults(func=self.do_rspace) + subparser.add_argument( + "--reserve", + metavar="SPACE", + dest="reserve_space", + default=0, + type=parse_file_size, + action=Highlander, + help="Amount of space to reserve (e.g. 100M, 1G). Default: 0.", + ) + subparser.add_argument( + "--free", + dest="free_space", + action="store_true", + help="Free all reserved space. Don't forget to reserve space later again.", + ) diff --git a/src/borg/archiver/serve_cmd.py b/src/borg/archiver/serve_cmd.py index 8cc613c583..d169498348 100644 --- a/src/borg/archiver/serve_cmd.py +++ b/src/borg/archiver/serve_cmd.py @@ -2,7 +2,7 @@ from ._common import Highlander from ..constants import * # NOQA -from ..helpers import parse_storage_quota +from ..helpers import parse_storage_quota, CommandError from ..remote import RepositoryServer from ..logger import create_logger @@ -13,6 +13,10 @@ class ServeMixIn: def do_serve(self, args): """Start in server mode. 
This command is usually not used manually.""" + if args.append_only: + raise CommandError("append-only is not supported (yet?)") + if args.storage_quota is not None: + raise CommandError("storage-quota is not supported (yet?)") RepositoryServer( restrict_to_paths=args.restrict_to_paths, restrict_to_repositories=args.restrict_to_repositories, diff --git a/src/borg/archiver/tar_cmds.py b/src/borg/archiver/tar_cmds.py index c68d37c2a9..6ede4b348d 100644 --- a/src/borg/archiver/tar_cmds.py +++ b/src/borg/archiver/tar_cmds.py @@ -240,7 +240,7 @@ def item_to_paxheaders(format, item): for pattern in matcher.get_unmatched_include_patterns(): self.print_warning_instance(IncludePatternNeverMatchedWarning(pattern)) - @with_repository(cache=True, exclusive=True, compatibility=(Manifest.Operation.WRITE,)) + @with_repository(cache=True, compatibility=(Manifest.Operation.WRITE,)) def do_import_tar(self, args, repository, manifest, cache): """Create a backup archive from a tarball""" self.output_filter = args.output_filter @@ -269,16 +269,7 @@ def _import_tar(self, args, repository, manifest, key, cache, tarstream): start_monotonic=t0_monotonic, log_json=args.log_json, ) - cp = ChunksProcessor( - cache=cache, - key=key, - add_item=archive.add_item, - prepare_checkpoint=archive.prepare_checkpoint, - write_checkpoint=archive.write_checkpoint, - checkpoint_interval=args.checkpoint_interval, - checkpoint_volume=args.checkpoint_volume, - rechunkify=False, - ) + cp = ChunksProcessor(cache=cache, key=key, add_item=archive.add_item, rechunkify=False) tfo = TarfileObjectProcessors( cache=cache, key=key, @@ -524,25 +515,6 @@ def build_parser_tar(self, subparsers, common_parser, mid_common_parser): help="manually specify the archive creation date/time (yyyy-mm-ddThh:mm:ss[(+|-)HH:MM] format, " "(+|-)HH:MM is the UTC offset, default: local time zone). 
Alternatively, give a reference file/directory.", ) - archive_group.add_argument( - "-c", - "--checkpoint-interval", - dest="checkpoint_interval", - type=int, - default=1800, - action=Highlander, - metavar="SECONDS", - help="write checkpoint every SECONDS seconds (Default: 1800)", - ) - archive_group.add_argument( - "--checkpoint-volume", - metavar="BYTES", - dest="checkpoint_volume", - type=int, - default=0, - action=Highlander, - help="write checkpoint every BYTES bytes (Default: 0, meaning no volume based checkpointing)", - ) archive_group.add_argument( "--chunker-params", dest="chunker_params", diff --git a/src/borg/archiver/transfer_cmd.py b/src/borg/archiver/transfer_cmd.py index 1ba8ed3c88..ac407f6534 100644 --- a/src/borg/archiver/transfer_cmd.py +++ b/src/borg/archiver/transfer_cmd.py @@ -17,7 +17,7 @@ class TransferMixIn: @with_other_repository(manifest=True, compatibility=(Manifest.Operation.READ,)) - @with_repository(exclusive=True, manifest=True, cache=True, compatibility=(Manifest.Operation.WRITE,)) + @with_repository(manifest=True, cache=True, compatibility=(Manifest.Operation.WRITE,)) def do_transfer(self, args, *, repository, manifest, cache, other_repository=None, other_manifest=None): """archives transfer from other repository, optionally upgrade data format""" key = manifest.key @@ -33,7 +33,6 @@ def do_transfer(self, args, *, repository, manifest, cache, other_repository=Non ) dry_run = args.dry_run - args.consider_checkpoints = True archive_names = tuple(x.name for x in other_manifest.archives.list_considering(args)) if not archive_names: return @@ -61,10 +60,15 @@ def do_transfer(self, args, *, repository, manifest, cache, other_repository=Non from .. import upgrade as upgrade_mod + v1_or_v2 = getattr(args, "v1_or_v2", False) + upgrader = args.upgrader + if upgrader == "NoOp" and v1_or_v2: + upgrader = "From12To20" + try: - UpgraderCls = getattr(upgrade_mod, f"Upgrader{args.upgrader}") + UpgraderCls = getattr(upgrade_mod, f"Upgrader{upgrader}") except AttributeError: - raise Error(f"No such upgrader: {args.upgrader}") + raise Error(f"No such upgrader: {upgrader}") if UpgraderCls is not upgrade_mod.UpgraderFrom12To20 and other_manifest.repository.version == 1: raise Error("To transfer from a borg 1.x repo, you need to use: --upgrader=From12To20") @@ -74,7 +78,7 @@ def do_transfer(self, args, *, repository, manifest, cache, other_repository=Non for name in archive_names: transfer_size = 0 present_size = 0 - if name in manifest.archives and not dry_run: + if manifest.archives.exists(name) and not dry_run: print(f"{name}: archive is already present in destination repo, skipping.") else: if not dry_run: @@ -96,8 +100,8 @@ def do_transfer(self, args, *, repository, manifest, cache, other_repository=Non if "chunks" in item: chunks = [] for chunk_id, size in item.chunks: - refcount = cache.seen_chunk(chunk_id, size) - if refcount == 0: # target repo does not yet have this chunk + chunk_present = cache.seen_chunk(chunk_id, size) + if not chunk_present: # target repo does not yet have this chunk if not dry_run: cdata = other_repository.get(chunk_id) if args.recompress == "never": @@ -143,7 +147,7 @@ def do_transfer(self, args, *, repository, manifest, cache, other_repository=Non transfer_size += size else: if not dry_run: - chunk_entry = cache.chunk_incref(chunk_id, size, archive.stats) + chunk_entry = cache.reuse_chunk(chunk_id, size, archive.stats) chunks.append(chunk_entry) present_size += size if not dry_run: @@ -188,32 +192,41 @@ def build_parser_transfer(self, 
subparsers, common_parser, mid_common_parser): If you want to globally change compression while transferring archives to the DST_REPO, give ``--compress=WANTED_COMPRESSION --recompress=always``. - Suggested use for general purpose archive transfer (not repo upgrades):: + The default is to transfer all archives. + + You could use the misc. archive filter options to limit which archives it will + transfer, e.g. using the ``-a`` option. This is recommended for big + repositories with multiple data sets to keep the runtime per invocation lower. + + General purpose archive transfer + ++++++++++++++++++++++++++++++++ + + Transfer borg2 archives into a related other borg2 repository:: # create a related DST_REPO (reusing key material from SRC_REPO), so that # chunking and chunk id generation will work in the same way as before. - borg --repo=DST_REPO rcreate --other-repo=SRC_REPO --encryption=DST_ENC + borg --repo=DST_REPO rcreate --encryption=DST_ENC --other-repo=SRC_REPO # transfer archives from SRC_REPO to DST_REPO borg --repo=DST_REPO transfer --other-repo=SRC_REPO --dry-run # check what it would do borg --repo=DST_REPO transfer --other-repo=SRC_REPO # do it! borg --repo=DST_REPO transfer --other-repo=SRC_REPO --dry-run # check! anything left? - The default is to transfer all archives, including checkpoint archives. - You could use the misc. archive filter options to limit which archives it will - transfer, e.g. using the ``-a`` option. This is recommended for big - repositories with multiple data sets to keep the runtime per invocation lower. + Data migration / upgrade from borg 1.x + ++++++++++++++++++++++++++++++++++++++ - For repository upgrades (e.g. from a borg 1.2 repo to a related borg 2.0 repo), usage is - quite similar to the above:: + To migrate your borg 1.x archives into a related, new borg2 repository, usage is quite similar + to the above, but you need the ``--from-borg1`` option:: - # fast: compress metadata with zstd,3, but keep data chunks compressed as they are: - borg --repo=DST_REPO transfer --other-repo=SRC_REPO --upgrader=From12To20 \\ - --compress=zstd,3 --recompress=never + borg --repo=DST_REPO rcreate --encryption=DST_ENC --other-repo=SRC_REPO --from-borg1 - # compress metadata and recompress data with zstd,3 - borg --repo=DST_REPO transfer --other-repo=SRC_REPO --upgrader=From12To20 \\ + # to continue using lz4 compression as you did in SRC_REPO: + borg --repo=DST_REPO transfer --other-repo=SRC_REPO --from-borg1 \\ + --compress=lz4 --recompress=never + + # alternatively, to recompress everything to zstd,3: + borg --repo=DST_REPO transfer --other-repo=SRC_REPO --from-borg1 \\ --compress=zstd,3 --recompress=always @@ -241,6 +254,9 @@ def build_parser_transfer(self, subparsers, common_parser, mid_common_parser): action=Highlander, help="transfer archives from the other repository", ) + subparser.add_argument( + "--from-borg1", dest="v1_or_v2", action="store_true", help="other repository is borg 1.x" + ) subparser.add_argument( "--upgrader", metavar="UPGRADER", diff --git a/src/borg/cache.py b/src/borg/cache.py index 88fe32902e..5107d847ba 100644 --- a/src/borg/cache.py +++ b/src/borg/cache.py @@ -12,27 +12,24 @@ files_cache_logger = create_logger("borg.debug.files_cache") from .constants import CACHE_README, FILES_CACHE_MODE_DISABLED, ROBJ_FILE_STREAM -from .hashindex import ChunkIndex, ChunkIndexEntry, CacheSynchronizer +from .hashindex import ChunkIndex, ChunkIndexEntry from .helpers import Error from .helpers import get_cache_dir, get_security_dir -from .helpers 
import bin_to_hex, hex_to_bin, parse_stringified_list +from .helpers import hex_to_bin, parse_stringified_list from .helpers import format_file_size from .helpers import safe_ns from .helpers import yes -from .helpers import remove_surrogates -from .helpers import ProgressIndicatorPercent, ProgressIndicatorMessage -from .helpers import set_ec, EXIT_WARNING -from .helpers import safe_unlink +from .helpers import ProgressIndicatorMessage from .helpers import msgpack from .helpers.msgpack import int_to_timestamp, timestamp_to_int -from .item import ArchiveItem, ChunkListEntry +from .item import ChunkListEntry from .crypto.key import PlaintextKey -from .crypto.file_integrity import IntegrityCheckedFile, DetachedIntegrityCheckedFile, FileIntegrityError -from .locking import Lock +from .crypto.file_integrity import IntegrityCheckedFile, FileIntegrityError +from .fslocking import Lock from .manifest import Manifest from .platform import SaveFile -from .remote import cache_if_remote -from .repository import LIST_SCAN_LIMIT +from .remote import RemoteRepository +from .repository import LIST_SCAN_LIMIT, Repository # note: cmtime might be either a ctime or a mtime timestamp, chunks is a list of ChunkListEntry FileCacheEntry = namedtuple("FileCacheEntry", "age inode size cmtime chunks") @@ -354,24 +351,10 @@ def __new__( warn_if_unencrypted=True, progress=False, lock_wait=None, - no_cache_sync_permitted=False, - no_cache_sync_forced=False, prefer_adhoc_cache=False, cache_mode=FILES_CACHE_MODE_DISABLED, iec=False, ): - def local(): - return LocalCache( - manifest=manifest, - path=path, - sync=sync, - warn_if_unencrypted=warn_if_unencrypted, - progress=progress, - iec=iec, - lock_wait=lock_wait, - cache_mode=cache_mode, - ) - def adhocwithfiles(): return AdHocWithFilesCache( manifest=manifest, @@ -388,79 +371,14 @@ def adhoc(): impl = get_cache_impl() if impl != "cli": - methods = dict(local=local, adhocwithfiles=adhocwithfiles, adhoc=adhoc) + methods = dict(adhocwithfiles=adhocwithfiles, adhoc=adhoc) try: method = methods[impl] except KeyError: raise RuntimeError("Unknown BORG_CACHE_IMPL value: %s" % impl) return method() - if no_cache_sync_forced: - return adhoc() if prefer_adhoc_cache else adhocwithfiles() - - if not no_cache_sync_permitted: - return local() - - # no cache sync may be permitted, but if the local cache is in sync it'd be stupid to invalidate - # it by needlessly using the AdHocCache or the AdHocWithFilesCache. - # Check if the local cache exists and is in sync. 
- - cache_config = CacheConfig(repository, path, lock_wait) - if cache_config.exists(): - with cache_config: - cache_in_sync = cache_config.manifest_id == manifest.id - # Don't nest cache locks - if cache_in_sync: - # Local cache is in sync, use it - logger.debug("Cache: choosing local cache (in sync)") - return local() - if prefer_adhoc_cache: # adhoc cache, without files cache - logger.debug("Cache: choosing AdHocCache (local cache does not exist or is not in sync)") - return adhoc() - else: - logger.debug("Cache: choosing AdHocWithFilesCache (local cache does not exist or is not in sync)") - return adhocwithfiles() - - -class CacheStatsMixin: - str_format = """\ -Original size: {0.total_size} -Deduplicated size: {0.unique_size} -Unique chunks: {0.total_unique_chunks} -Total chunks: {0.total_chunks} -""" - - def __init__(self, iec=False): - self.iec = iec - - def __str__(self): - return self.str_format.format(self.format_tuple()) - - Summary = namedtuple("Summary", ["total_size", "unique_size", "total_unique_chunks", "total_chunks"]) - - def stats(self): - from .archive import Archive - - if isinstance(self, AdHocCache) and getattr(self, "chunks", None) is None: - self.chunks = self._load_chunks_from_repo() # AdHocCache usually only has .chunks after begin_txn. - - # XXX: this should really be moved down to `hashindex.pyx` - total_size, unique_size, total_unique_chunks, total_chunks = self.chunks.summarize() - # since borg 1.2 we have new archive metadata telling the total size per archive, - # so we can just sum up all archives to get the "all archives" stats: - total_size = 0 - for archive_name in self.manifest.archives: - archive = Archive(self.manifest, archive_name) - stats = archive.calc_stats(self, want_unique=False) - total_size += stats.osize - stats = self.Summary(total_size, unique_size, total_unique_chunks, total_chunks)._asdict() - return stats - - def format_tuple(self): - stats = self.stats() - for field in ["total_size", "unique_size"]: - stats[field] = format_file_size(stats[field], iec=self.iec) - return self.Summary(**stats) + return adhoc() if prefer_adhoc_cache else adhocwithfiles() class FilesCacheMixin: @@ -473,9 +391,15 @@ class FilesCacheMixin: def __init__(self, cache_mode): self.cache_mode = cache_mode - self.files = None + self._files = None self._newest_cmtime = None + @property + def files(self): + if self._files is None: + self._files = self._read_files_cache() + return self._files + def files_cache_name(self): suffix = os.environ.get("BORG_FILES_CACHE_SUFFIX", "") return self.FILES_CACHE_NAME + "." + suffix if suffix else self.FILES_CACHE_NAME @@ -494,7 +418,7 @@ def _read_files_cache(self): if "d" in self.cache_mode: # d(isabled) return - self.files = {} + files = {} logger.debug("Reading files cache ...") files_cache_logger.debug("FILES-CACHE-LOAD: starting...") msg = None @@ -514,7 +438,7 @@ def _read_files_cache(self): for path_hash, item in u: entry = FileCacheEntry(*item) # in the end, this takes about 240 Bytes per file - self.files[path_hash] = msgpack.packb(entry._replace(age=entry.age + 1)) + files[path_hash] = msgpack.packb(entry._replace(age=entry.age + 1)) except (TypeError, ValueError) as exc: msg = "The files cache seems invalid. 
[%s]" % str(exc) break @@ -525,18 +449,20 @@ def _read_files_cache(self): if msg is not None: logger.warning(msg) logger.warning("Continuing without files cache - expect lower performance.") - self.files = {} - files_cache_logger.debug("FILES-CACHE-LOAD: finished, %d entries loaded.", len(self.files)) + files = {} + files_cache_logger.debug("FILES-CACHE-LOAD: finished, %d entries loaded.", len(files)) + return files - def _write_files_cache(self): + def _write_files_cache(self, files): if self._newest_cmtime is None: # was never set because no files were modified/added self._newest_cmtime = 2**63 - 1 # nanoseconds, good until y2262 ttl = int(os.environ.get("BORG_FILES_CACHE_TTL", 20)) files_cache_logger.debug("FILES-CACHE-SAVE: starting...") + # TODO: use something like SaveFile here, but that didn't work due to SyncFile missing .seek(). with IntegrityCheckedFile(path=os.path.join(self.path, self.files_cache_name()), write=True) as fd: entry_count = 0 - for path_hash, item in self.files.items(): + for path_hash, item in files.items(): # Only keep files seen in this backup that are older than newest cmtime seen in this backup - # this is to avoid issues with filesystem snapshots and cmtime granularity. # Also keep files from older backups that have not reached BORG_FILES_CACHE_TTL yet. @@ -639,50 +565,65 @@ def memorize_file(self, hashed_path, path_hash, st, chunks): ) +def build_chunkindex_from_repo(repository): + logger.debug("querying the chunk IDs list from the repo...") + chunks = ChunkIndex() + t0 = perf_counter() + num_requests = 0 + num_chunks = 0 + marker = None + while True: + result = repository.list(limit=LIST_SCAN_LIMIT, marker=marker) + num_requests += 1 + if not result: + break + marker = result[-1][0] + # The repo says it has these chunks, so we assume they are referenced chunks. + # We do not care for refcounting anymore, so we just set refcount = MAX_VALUE. + # We do not know the plaintext size (!= stored_size), thus we set size = 0. + init_entry = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0) + for id, stored_size in result: + num_chunks += 1 + chunks[id] = init_entry + # Cache does not contain the manifest. + if not isinstance(repository, (Repository, RemoteRepository)): + del chunks[Manifest.MANIFEST_ID] + duration = perf_counter() - t0 or 0.001 + # Chunk IDs in a list are encoded in 34 bytes: 1 byte msgpack header, 1 byte length, 32 ID bytes. + # Protocol overhead is neglected in this calculation. + speed = format_file_size(num_chunks * 34 / duration) + logger.debug(f"queried {num_chunks} chunk IDs in {duration} s ({num_requests} requests), ~{speed}/s") + return chunks + + class ChunksMixin: """ Chunks index related code for misc. Cache implementations. 
""" - def chunk_incref(self, id, size, stats): - assert isinstance(size, int) and size > 0 - if not self._txn_active: - self.begin_txn() - count, _size = self.chunks.incref(id) - stats.update(size, False) - return ChunkListEntry(id, size) + def __init__(self): + self._chunks = None - def chunk_decref(self, id, size, stats, wait=True): - assert isinstance(size, int) and size > 0 - if not self._txn_active: - self.begin_txn() - count, _size = self.chunks.decref(id) - if count == 0: - del self.chunks[id] - self.repository.delete(id, wait=wait) - stats.update(-size, True) - else: - stats.update(-size, False) + @property + def chunks(self): + if self._chunks is None: + self._chunks = build_chunkindex_from_repo(self.repository) + return self._chunks def seen_chunk(self, id, size=None): - if not self._txn_active: - self.begin_txn() entry = self.chunks.get(id, ChunkIndexEntry(0, None)) if entry.refcount and size is not None: assert isinstance(entry.size, int) - if entry.size: - # LocalCache: has existing size information and uses *size* to make an effort at detecting collisions. - if size != entry.size: - # we already have a chunk with that id, but different size. - # this is either a hash collision (unlikely) or corruption or a bug. - raise Exception( - "chunk has same id [%r], but different size (stored: %d new: %d)!" % (id, entry.size, size) - ) - else: + if not entry.size: # AdHocWithFilesCache / AdHocCache: # Here *size* is used to update the chunk's size information, which will be zero for existing chunks. self.chunks[id] = entry._replace(size=size) - return entry.refcount + return entry.refcount != 0 + + def reuse_chunk(self, id, size, stats): + assert isinstance(size, int) and size > 0 + stats.update(size, False) + return ChunkListEntry(id, size) def add_chunk( self, @@ -699,474 +640,24 @@ def add_chunk( ro_type=ROBJ_FILE_STREAM, ): assert ro_type is not None - if not self._txn_active: - self.begin_txn() if size is None: if compress: size = len(data) # data is still uncompressed else: raise ValueError("when giving compressed data for a chunk, the uncompressed size must be given also") - refcount = self.seen_chunk(id, size) - if refcount: - return self.chunk_incref(id, size, stats) + exists = self.seen_chunk(id, size) + if exists: + return self.reuse_chunk(id, size, stats) cdata = self.repo_objs.format( id, meta, data, compress=compress, size=size, ctype=ctype, clevel=clevel, ro_type=ro_type ) self.repository.put(id, cdata, wait=wait) - self.chunks.add(id, 1, size) - stats.update(size, not refcount) + self.chunks.add(id, ChunkIndex.MAX_VALUE, size) + stats.update(size, not exists) return ChunkListEntry(id, size) - def _load_chunks_from_repo(self): - # Explicitly set the initial usable hash table capacity to avoid performance issues - # due to hash table "resonance". - # Since we're creating an archive, add 10 % from the start. - num_chunks = len(self.repository) - chunks = ChunkIndex(usable=num_chunks * 1.1) - pi = ProgressIndicatorPercent( - total=num_chunks, msg="Downloading chunk list... %3.0f%%", msgid="cache.download_chunks" - ) - t0 = perf_counter() - num_requests = 0 - marker = None - while True: - result = self.repository.list(limit=LIST_SCAN_LIMIT, marker=marker) - num_requests += 1 - if not result: - break - pi.show(increase=len(result)) - marker = result[-1] - # All chunks from the repository have a refcount of MAX_VALUE, which is sticky, - # therefore we can't/won't delete them. Chunks we added ourselves in this transaction - # (e.g. checkpoint archives) are tracked correctly. 
- init_entry = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0) - for id_ in result: - chunks[id_] = init_entry - assert len(chunks) == num_chunks - # LocalCache does not contain the manifest, either. - del chunks[self.manifest.MANIFEST_ID] - duration = perf_counter() - t0 or 0.01 - pi.finish() - logger.debug( - "Cache: downloaded %d chunk IDs in %.2f s (%d requests), ~%s/s", - num_chunks, - duration, - num_requests, - format_file_size(num_chunks * 34 / duration), - ) - # Chunk IDs in a list are encoded in 34 bytes: 1 byte msgpack header, 1 byte length, 32 ID bytes. - # Protocol overhead is neglected in this calculation. - return chunks - - -class LocalCache(CacheStatsMixin, FilesCacheMixin, ChunksMixin): - """ - Persistent, local (client-side) cache. - """ - - def __init__( - self, - manifest, - path=None, - sync=True, - warn_if_unencrypted=True, - progress=False, - lock_wait=None, - cache_mode=FILES_CACHE_MODE_DISABLED, - iec=False, - ): - """ - :param warn_if_unencrypted: print warning if accessing unknown unencrypted repository - :param lock_wait: timeout for lock acquisition (int [s] or None [wait forever]) - :param sync: do :meth:`.sync` - :param cache_mode: what shall be compared in the file stat infos vs. cached stat infos comparison - """ - CacheStatsMixin.__init__(self, iec=iec) - FilesCacheMixin.__init__(self, cache_mode) - assert isinstance(manifest, Manifest) - self.manifest = manifest - self.repository = manifest.repository - self.key = manifest.key - self.repo_objs = manifest.repo_objs - self.progress = progress - self._txn_active = False - self.do_cache = os.environ.get("BORG_USE_CHUNKS_ARCHIVE", "yes").lower() in ["yes", "1", "true"] - - self.path = cache_dir(self.repository, path) - self.security_manager = SecurityManager(self.repository) - self.cache_config = CacheConfig(self.repository, self.path, lock_wait) - - # Warn user before sending data to a never seen before unencrypted repository - if not os.path.exists(self.path): - self.security_manager.assert_access_unknown(warn_if_unencrypted, manifest, self.key) - self.create() - - try: - self.open() - except (FileNotFoundError, FileIntegrityError): - self.wipe_cache() - self.open() - - try: - self.security_manager.assert_secure(manifest, self.key) - - if not self.check_cache_compatibility(): - self.wipe_cache() - - self.update_compatibility() - - if sync and self.manifest.id != self.cache_config.manifest_id: - self.sync() - self.commit() - except: # noqa - self.close() - raise - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.close() - - def create(self): - """Create a new empty cache at `self.path`""" - os.makedirs(self.path) - with open(os.path.join(self.path, "README"), "w") as fd: - fd.write(CACHE_README) - self.cache_config.create() - ChunkIndex().write(os.path.join(self.path, "chunks")) - os.makedirs(os.path.join(self.path, "chunks.archive.d")) - self._create_empty_files_cache(self.path) - - def _do_open(self): - self.cache_config.load() - with IntegrityCheckedFile( - path=os.path.join(self.path, "chunks"), - write=False, - integrity_data=self.cache_config.integrity.get("chunks"), - ) as fd: - self.chunks = ChunkIndex.read(fd) - self._read_files_cache() - - def open(self): - if not os.path.isdir(self.path): - raise Exception("%s Does not look like a Borg cache" % self.path) - self.cache_config.open() - self.rollback() - - def close(self): - if self.cache_config is not None: - self.cache_config.close() - self.cache_config = None - - def begin_txn(self): - # 
Initialize transaction snapshot - pi = ProgressIndicatorMessage(msgid="cache.begin_transaction") - txn_dir = os.path.join(self.path, "txn.tmp") - os.mkdir(txn_dir) - pi.output("Initializing cache transaction: Reading config") - shutil.copy(os.path.join(self.path, "config"), txn_dir) - pi.output("Initializing cache transaction: Reading chunks") - shutil.copy(os.path.join(self.path, "chunks"), txn_dir) - pi.output("Initializing cache transaction: Reading files") - try: - shutil.copy(os.path.join(self.path, self.files_cache_name()), txn_dir) - except FileNotFoundError: - self._create_empty_files_cache(txn_dir) - os.replace(txn_dir, os.path.join(self.path, "txn.active")) - self._txn_active = True - pi.finish() - - def commit(self): - """Commit transaction""" - if not self._txn_active: - return - self.security_manager.save(self.manifest, self.key) - pi = ProgressIndicatorMessage(msgid="cache.commit") - if self.files is not None: - pi.output("Saving files cache") - integrity_data = self._write_files_cache() - self.cache_config.integrity[self.files_cache_name()] = integrity_data - pi.output("Saving chunks cache") - with IntegrityCheckedFile(path=os.path.join(self.path, "chunks"), write=True) as fd: - self.chunks.write(fd) - self.cache_config.integrity["chunks"] = fd.integrity_data - pi.output("Saving cache config") - self.cache_config.save(self.manifest) - os.replace(os.path.join(self.path, "txn.active"), os.path.join(self.path, "txn.tmp")) - shutil.rmtree(os.path.join(self.path, "txn.tmp")) - self._txn_active = False - pi.finish() - - def rollback(self): - """Roll back partial and aborted transactions""" - # Remove partial transaction - if os.path.exists(os.path.join(self.path, "txn.tmp")): - shutil.rmtree(os.path.join(self.path, "txn.tmp")) - # Roll back active transaction - txn_dir = os.path.join(self.path, "txn.active") - if os.path.exists(txn_dir): - shutil.copy(os.path.join(txn_dir, "config"), self.path) - shutil.copy(os.path.join(txn_dir, "chunks"), self.path) - shutil.copy(os.path.join(txn_dir, self.discover_files_cache_name(txn_dir)), self.path) - txn_tmp = os.path.join(self.path, "txn.tmp") - os.replace(txn_dir, txn_tmp) - if os.path.exists(txn_tmp): - shutil.rmtree(txn_tmp) - self._txn_active = False - self._do_open() - - def sync(self): - """Re-synchronize chunks cache with repository. - - Maintains a directory with known backup archive indexes, so it only - needs to fetch infos from repo and build a chunk index once per backup - archive. - If out of sync, missing archive indexes get added, outdated indexes - get removed and a new master chunks index is built by merging all - archive indexes. 
- """ - archive_path = os.path.join(self.path, "chunks.archive.d") - # Instrumentation - processed_item_metadata_bytes = 0 - processed_item_metadata_chunks = 0 - compact_chunks_archive_saved_space = 0 - - def mkpath(id, suffix=""): - id_hex = bin_to_hex(id) - path = os.path.join(archive_path, id_hex + suffix) - return path - - def cached_archives(): - if self.do_cache: - fns = os.listdir(archive_path) - # filenames with 64 hex digits == 256bit, - # or compact indices which are 64 hex digits + ".compact" - return {hex_to_bin(fn) for fn in fns if len(fn) == 64} | { - hex_to_bin(fn[:64]) for fn in fns if len(fn) == 72 and fn.endswith(".compact") - } - else: - return set() - - def repo_archives(): - return {info.id for info in self.manifest.archives.list()} - - def cleanup_outdated(ids): - for id in ids: - cleanup_cached_archive(id) - - def cleanup_cached_archive(id, cleanup_compact=True): - try: - os.unlink(mkpath(id)) - os.unlink(mkpath(id) + ".integrity") - except FileNotFoundError: - pass - if not cleanup_compact: - return - try: - os.unlink(mkpath(id, suffix=".compact")) - os.unlink(mkpath(id, suffix=".compact") + ".integrity") - except FileNotFoundError: - pass - - def fetch_and_build_idx(archive_id, decrypted_repository, chunk_idx): - nonlocal processed_item_metadata_bytes - nonlocal processed_item_metadata_chunks - csize, data = decrypted_repository.get(archive_id) - chunk_idx.add(archive_id, 1, len(data)) - archive = self.key.unpack_archive(data) - archive = ArchiveItem(internal_dict=archive) - if archive.version not in (1, 2): # legacy - raise Exception("Unknown archive metadata version") - if archive.version == 1: - items = archive.items - elif archive.version == 2: - items = [] - for chunk_id, (csize, data) in zip(archive.item_ptrs, decrypted_repository.get_many(archive.item_ptrs)): - chunk_idx.add(chunk_id, 1, len(data)) - ids = msgpack.unpackb(data) - items.extend(ids) - sync = CacheSynchronizer(chunk_idx) - for item_id, (csize, data) in zip(items, decrypted_repository.get_many(items)): - chunk_idx.add(item_id, 1, len(data)) - processed_item_metadata_bytes += len(data) - processed_item_metadata_chunks += 1 - sync.feed(data) - if self.do_cache: - write_archive_index(archive_id, chunk_idx) - - def write_archive_index(archive_id, chunk_idx): - nonlocal compact_chunks_archive_saved_space - compact_chunks_archive_saved_space += chunk_idx.compact() - fn = mkpath(archive_id, suffix=".compact") - fn_tmp = mkpath(archive_id, suffix=".tmp") - try: - with DetachedIntegrityCheckedFile( - path=fn_tmp, write=True, filename=bin_to_hex(archive_id) + ".compact" - ) as fd: - chunk_idx.write(fd) - except Exception: - safe_unlink(fn_tmp) - else: - os.replace(fn_tmp, fn) - - def read_archive_index(archive_id, archive_name): - archive_chunk_idx_path = mkpath(archive_id) - logger.info("Reading cached archive chunk index for %s", archive_name) - try: - try: - # Attempt to load compact index first - with DetachedIntegrityCheckedFile(path=archive_chunk_idx_path + ".compact", write=False) as fd: - archive_chunk_idx = ChunkIndex.read(fd, permit_compact=True) - # In case a non-compact index exists, delete it. - cleanup_cached_archive(archive_id, cleanup_compact=False) - # Compact index read - return index, no conversion necessary (below). - return archive_chunk_idx - except FileNotFoundError: - # No compact index found, load non-compact index, and convert below. 
- with DetachedIntegrityCheckedFile(path=archive_chunk_idx_path, write=False) as fd: - archive_chunk_idx = ChunkIndex.read(fd) - except FileIntegrityError as fie: - logger.error("Cached archive chunk index of %s is corrupted: %s", archive_name, fie) - # Delete corrupted index, set warning. A new index must be build. - cleanup_cached_archive(archive_id) - set_ec(EXIT_WARNING) - return None - - # Convert to compact index. Delete the existing index first. - logger.debug("Found non-compact index for %s, converting to compact.", archive_name) - cleanup_cached_archive(archive_id) - write_archive_index(archive_id, archive_chunk_idx) - return archive_chunk_idx - - def get_archive_ids_to_names(archive_ids): - # Pass once over all archives and build a mapping from ids to names. - # The easier approach, doing a similar loop for each archive, has - # square complexity and does about a dozen million functions calls - # with 1100 archives (which takes 30s CPU seconds _alone_). - archive_names = {} - for info in self.manifest.archives.list(): - if info.id in archive_ids: - archive_names[info.id] = info.name - assert len(archive_names) == len(archive_ids) - return archive_names - - def create_master_idx(chunk_idx): - logger.debug("Synchronizing chunks index...") - cached_ids = cached_archives() - archive_ids = repo_archives() - logger.info( - "Cached archive chunk indexes: %d fresh, %d stale, %d need fetching.", - len(archive_ids & cached_ids), - len(cached_ids - archive_ids), - len(archive_ids - cached_ids), - ) - # deallocates old hashindex, creates empty hashindex: - chunk_idx.clear() - cleanup_outdated(cached_ids - archive_ids) - # Explicitly set the usable initial hash table capacity to avoid performance issues - # due to hash table "resonance". - master_index_capacity = len(self.repository) - if archive_ids: - chunk_idx = None if not self.do_cache else ChunkIndex(usable=master_index_capacity) - pi = ProgressIndicatorPercent( - total=len(archive_ids), - step=0.1, - msg="%3.0f%% Syncing chunks index. Processing archive %s.", - msgid="cache.sync", - ) - archive_ids_to_names = get_archive_ids_to_names(archive_ids) - for archive_id, archive_name in archive_ids_to_names.items(): - pi.show(info=[remove_surrogates(archive_name)]) # legacy. borg2 always has pure unicode arch names. - if self.do_cache: - if archive_id in cached_ids: - archive_chunk_idx = read_archive_index(archive_id, archive_name) - if archive_chunk_idx is None: - cached_ids.remove(archive_id) - if archive_id not in cached_ids: - # Do not make this an else branch; the FileIntegrityError exception handler - # above can remove *archive_id* from *cached_ids*. 
- logger.info("Fetching and building archive index for %s.", archive_name) - archive_chunk_idx = ChunkIndex() - fetch_and_build_idx(archive_id, decrypted_repository, archive_chunk_idx) - logger.debug("Merging into master chunks index.") - chunk_idx.merge(archive_chunk_idx) - else: - chunk_idx = chunk_idx or ChunkIndex(usable=master_index_capacity) - logger.info("Fetching archive index for %s.", archive_name) - fetch_and_build_idx(archive_id, decrypted_repository, chunk_idx) - pi.finish() - logger.debug( - "Chunks index sync: processed %s (%d chunks) of metadata.", - format_file_size(processed_item_metadata_bytes), - processed_item_metadata_chunks, - ) - logger.debug( - "Chunks index sync: compact chunks.archive.d storage saved %s bytes.", - format_file_size(compact_chunks_archive_saved_space), - ) - logger.debug("Chunks index sync done.") - return chunk_idx - - # The cache can be used by a command that e.g. only checks against Manifest.Operation.WRITE, - # which does not have to include all flags from Manifest.Operation.READ. - # Since the sync will attempt to read archives, check compatibility with Manifest.Operation.READ. - self.manifest.check_repository_compatibility((Manifest.Operation.READ,)) - self.begin_txn() - with cache_if_remote(self.repository, decrypted_cache=self.repo_objs) as decrypted_repository: - self.chunks = create_master_idx(self.chunks) - - def check_cache_compatibility(self): - my_features = Manifest.SUPPORTED_REPO_FEATURES - if self.cache_config.ignored_features & my_features: - # The cache might not contain references of chunks that need a feature that is mandatory for some operation - # and which this version supports. To avoid corruption while executing that operation force rebuild. - return False - if not self.cache_config.mandatory_features <= my_features: - # The cache was build with consideration to at least one feature that this version does not understand. - # This client might misinterpret the cache. Thus force a rebuild. 
- return False - return True - - def wipe_cache(self): - logger.warning("Discarding incompatible or corrupted cache and forcing a cache rebuild") - archive_path = os.path.join(self.path, "chunks.archive.d") - if os.path.isdir(archive_path): - shutil.rmtree(os.path.join(self.path, "chunks.archive.d")) - os.makedirs(os.path.join(self.path, "chunks.archive.d")) - self.chunks = ChunkIndex() - with IntegrityCheckedFile(path=os.path.join(self.path, "chunks"), write=True) as fd: - self.chunks.write(fd) - self.cache_config.integrity["chunks"] = fd.integrity_data - integrity_data = self._create_empty_files_cache(self.path) - self.cache_config.integrity[self.files_cache_name()] = integrity_data - self.cache_config.manifest_id = "" - self.cache_config._config.set("cache", "manifest", "") - if not self.cache_config._config.has_section("integrity"): - self.cache_config._config.add_section("integrity") - for file, integrity_data in self.cache_config.integrity.items(): - self.cache_config._config.set("integrity", file, integrity_data) - # This is needed to pass the integrity check later on inside CacheConfig.load() - self.cache_config._config.set("integrity", "manifest", "") - - self.cache_config.ignored_features = set() - self.cache_config.mandatory_features = set() - with SaveFile(self.cache_config.config_path) as fd: - self.cache_config._config.write(fd) - - def update_compatibility(self): - operation_to_features_map = self.manifest.get_all_mandatory_features() - my_features = Manifest.SUPPORTED_REPO_FEATURES - repo_features = set() - for operation, features in operation_to_features_map.items(): - repo_features.update(features) - - self.cache_config.ignored_features.update(repo_features - my_features) - self.cache_config.mandatory_features.update(repo_features & my_features) - - -class AdHocWithFilesCache(CacheStatsMixin, FilesCacheMixin, ChunksMixin): +class AdHocWithFilesCache(FilesCacheMixin, ChunksMixin): """ Like AdHocCache, but with a files cache. """ @@ -1186,15 +677,14 @@ def __init__( :param lock_wait: timeout for lock acquisition (int [s] or None [wait forever]) :param cache_mode: what shall be compared in the file stat infos vs. 
cached stat infos comparison """ - CacheStatsMixin.__init__(self, iec=iec) FilesCacheMixin.__init__(self, cache_mode) + ChunksMixin.__init__(self) assert isinstance(manifest, Manifest) self.manifest = manifest self.repository = manifest.repository self.key = manifest.key self.repo_objs = manifest.repo_objs self.progress = progress - self._txn_active = False self.path = cache_dir(self.repository, path) self.security_manager = SecurityManager(self.repository) @@ -1218,10 +708,12 @@ def __init__( raise def __enter__(self): + self._chunks = None return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() + self._chunks = None def create(self): """Create a new empty cache at `self.path`""" @@ -1231,69 +723,24 @@ def create(self): self.cache_config.create() self._create_empty_files_cache(self.path) - def _do_open(self): - self.cache_config.load() - self.chunks = self._load_chunks_from_repo() - self._read_files_cache() - def open(self): if not os.path.isdir(self.path): raise Exception("%s Does not look like a Borg cache" % self.path) self.cache_config.open() - self.rollback() + self.cache_config.load() def close(self): - if self.cache_config is not None: - self.cache_config.close() - self.cache_config = None - - def begin_txn(self): - # Initialize transaction snapshot - pi = ProgressIndicatorMessage(msgid="cache.begin_transaction") - txn_dir = os.path.join(self.path, "txn.tmp") - os.mkdir(txn_dir) - pi.output("Initializing cache transaction: Reading config") - shutil.copy(os.path.join(self.path, "config"), txn_dir) - pi.output("Initializing cache transaction: Reading files") - try: - shutil.copy(os.path.join(self.path, self.files_cache_name()), txn_dir) - except FileNotFoundError: - self._create_empty_files_cache(txn_dir) - os.replace(txn_dir, os.path.join(self.path, "txn.active")) - pi.finish() - self._txn_active = True - - def commit(self): - if not self._txn_active: - return self.security_manager.save(self.manifest, self.key) - pi = ProgressIndicatorMessage(msgid="cache.commit") - if self.files is not None: + pi = ProgressIndicatorMessage(msgid="cache.close") + if self._files is not None: pi.output("Saving files cache") - integrity_data = self._write_files_cache() + integrity_data = self._write_files_cache(self._files) self.cache_config.integrity[self.files_cache_name()] = integrity_data pi.output("Saving cache config") self.cache_config.save(self.manifest) - os.replace(os.path.join(self.path, "txn.active"), os.path.join(self.path, "txn.tmp")) - shutil.rmtree(os.path.join(self.path, "txn.tmp")) - self._txn_active = False + self.cache_config.close() pi.finish() - - def rollback(self): - # Remove partial transaction - if os.path.exists(os.path.join(self.path, "txn.tmp")): - shutil.rmtree(os.path.join(self.path, "txn.tmp")) - # Roll back active transaction - txn_dir = os.path.join(self.path, "txn.active") - if os.path.exists(txn_dir): - shutil.copy(os.path.join(txn_dir, "config"), self.path) - shutil.copy(os.path.join(txn_dir, self.discover_files_cache_name(txn_dir)), self.path) - txn_tmp = os.path.join(self.path, "txn.tmp") - os.replace(txn_dir, txn_tmp) - if os.path.exists(txn_tmp): - shutil.rmtree(txn_tmp) - self._txn_active = False - self._do_open() + self.cache_config = None def check_cache_compatibility(self): my_features = Manifest.SUPPORTED_REPO_FEATURES @@ -1309,7 +756,7 @@ def check_cache_compatibility(self): def wipe_cache(self): logger.warning("Discarding incompatible cache and forcing a cache rebuild") - self.chunks = ChunkIndex() + self._chunks = ChunkIndex() 
self._create_empty_files_cache(self.path) self.cache_config.manifest_id = "" self.cache_config._config.set("cache", "manifest", "") @@ -1328,30 +775,23 @@ def update_compatibility(self): self.cache_config.mandatory_features.update(repo_features & my_features) -class AdHocCache(CacheStatsMixin, ChunksMixin): +class AdHocCache(ChunksMixin): """ Ad-hoc, non-persistent cache. - Compared to the standard LocalCache the AdHocCache does not maintain accurate reference count, - nor does it provide a files cache (which would require persistence). Chunks that were not added - during the current AdHocCache lifetime won't have correct size set (0 bytes) and will - have an infinite reference count (MAX_VALUE). + The AdHocCache does not maintain accurate reference count, nor does it provide a files cache + (which would require persistence). + Chunks that were not added during the current AdHocCache lifetime won't have correct size set + (0 bytes) and will have an infinite reference count (MAX_VALUE). """ - str_format = """\ -All archives: unknown unknown unknown - - Unique chunks Total chunks -Chunk index: {0.total_unique_chunks:20d} unknown""" - def __init__(self, manifest, warn_if_unencrypted=True, lock_wait=None, iec=False): - CacheStatsMixin.__init__(self, iec=iec) + ChunksMixin.__init__(self) assert isinstance(manifest, Manifest) self.manifest = manifest self.repository = manifest.repository self.key = manifest.key self.repo_objs = manifest.repo_objs - self._txn_active = False self.security_manager = SecurityManager(self.repository) self.security_manager.assert_secure(manifest, self.key, lock_wait=lock_wait) @@ -1359,10 +799,13 @@ def __init__(self, manifest, warn_if_unencrypted=True, lock_wait=None, iec=False # Public API def __enter__(self): + self._chunks = None return self def __exit__(self, exc_type, exc_val, exc_tb): - pass + if exc_type is None: + self.security_manager.save(self.manifest, self.key) + self._chunks = None files = None # type: ignore cache_mode = "d" @@ -1373,17 +816,3 @@ def file_known_and_unchanged(self, hashed_path, path_hash, st): def memorize_file(self, hashed_path, path_hash, st, chunks): pass - - def commit(self): - if not self._txn_active: - return - self.security_manager.save(self.manifest, self.key) - self._txn_active = False - - def rollback(self): - self._txn_active = False - del self.chunks - - def begin_txn(self): - self._txn_active = True - self.chunks = self._load_chunks_from_repo() diff --git a/src/borg/cache_sync/cache_sync.c b/src/borg/cache_sync/cache_sync.c deleted file mode 100644 index eff765d493..0000000000 --- a/src/borg/cache_sync/cache_sync.c +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Borg cache synchronizer, - * high level interface. - * - * These routines parse msgpacked item metadata and update a HashIndex - * with all chunks that are referenced from the items. - * - * This file only contains some initialization and buffer management. - * - * The parser is split in two parts, somewhat similar to lexer/parser combinations: - * - * unpack_template.h munches msgpack and calls a specific callback for each object - * encountered (e.g. beginning of a map, an integer, a string, a map item etc.). - * - * unpack.h implements these callbacks and uses another state machine to - * extract chunk references from it. 
- */ - -#include "unpack.h" - -typedef struct { - unpack_context ctx; - - char *buf; - size_t head; - size_t tail; - size_t size; -} CacheSyncCtx; - -static CacheSyncCtx * -cache_sync_init(HashIndex *chunks) -{ - CacheSyncCtx *ctx; - if (!(ctx = (CacheSyncCtx*)malloc(sizeof(CacheSyncCtx)))) { - return NULL; - } - - unpack_init(&ctx->ctx); - /* needs to be set only once */ - ctx->ctx.user.chunks = chunks; - ctx->ctx.user.totals.size = 0; - ctx->ctx.user.totals.num_files = 0; - ctx->buf = NULL; - ctx->head = 0; - ctx->tail = 0; - ctx->size = 0; - - return ctx; -} - -static void -cache_sync_free(CacheSyncCtx *ctx) -{ - if(ctx->buf) { - free(ctx->buf); - } - free(ctx); -} - -static const char * -cache_sync_error(const CacheSyncCtx *ctx) -{ - return ctx->ctx.user.last_error; -} - -static uint64_t -cache_sync_num_files_totals(const CacheSyncCtx *ctx) -{ - return ctx->ctx.user.totals.num_files; -} - -static uint64_t -cache_sync_size_totals(const CacheSyncCtx *ctx) -{ - return ctx->ctx.user.totals.size; -} - -/** - * feed data to the cache synchronizer - * 0 = abort, 1 = continue - * abort is a regular condition, check cache_sync_error - */ -static int -cache_sync_feed(CacheSyncCtx *ctx, void *data, uint32_t length) -{ - size_t new_size; - int ret; - char *new_buf; - - if(ctx->tail + length > ctx->size) { - if((ctx->tail - ctx->head) + length <= ctx->size) { - /* | XXXXX| -> move data in buffer backwards -> |XXXXX | */ - memmove(ctx->buf, ctx->buf + ctx->head, ctx->tail - ctx->head); - ctx->tail -= ctx->head; - ctx->head = 0; - } else { - /* must expand buffer to fit all data */ - new_size = (ctx->tail - ctx->head) + length; - new_buf = (char*) malloc(new_size); - if(!new_buf) { - ctx->ctx.user.last_error = "cache_sync_feed: unable to allocate buffer"; - return 0; - } - if(ctx->buf) { - memcpy(new_buf, ctx->buf + ctx->head, ctx->tail - ctx->head); - free(ctx->buf); - } - ctx->buf = new_buf; - ctx->tail -= ctx->head; - ctx->head = 0; - ctx->size = new_size; - } - } - - memcpy(ctx->buf + ctx->tail, data, length); - ctx->tail += length; - - while(1) { - if(ctx->head >= ctx->tail) { - return 1; /* request more bytes */ - } - - ret = unpack_execute(&ctx->ctx, ctx->buf, ctx->tail, &ctx->head); - if(ret == 1) { - unpack_init(&ctx->ctx); - continue; - } else if(ret == 0) { - return 1; - } else { - if(!ctx->ctx.user.last_error) { - ctx->ctx.user.last_error = "Unknown error"; - } - return 0; - } - } - /* unreachable */ - return 1; -} diff --git a/src/borg/cache_sync/sysdep.h b/src/borg/cache_sync/sysdep.h deleted file mode 100644 index e4ce7850f0..0000000000 --- a/src/borg/cache_sync/sysdep.h +++ /dev/null @@ -1,194 +0,0 @@ -/* - * MessagePack system dependencies - * - * Copyright (C) 2008-2010 FURUHASHI Sadayuki - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#ifndef MSGPACK_SYSDEP_H__ -#define MSGPACK_SYSDEP_H__ - -#include -#include -#if defined(_MSC_VER) && _MSC_VER < 1600 -typedef __int8 int8_t; -typedef unsigned __int8 uint8_t; -typedef __int16 int16_t; -typedef unsigned __int16 uint16_t; -typedef __int32 int32_t; -typedef unsigned __int32 uint32_t; -typedef __int64 int64_t; -typedef unsigned __int64 uint64_t; -#elif defined(_MSC_VER) // && _MSC_VER >= 1600 -#include -#else -#include -#include -#endif - -#ifdef _WIN32 -#define _msgpack_atomic_counter_header -typedef long _msgpack_atomic_counter_t; -#define _msgpack_sync_decr_and_fetch(ptr) InterlockedDecrement(ptr) -#define _msgpack_sync_incr_and_fetch(ptr) InterlockedIncrement(ptr) -#elif defined(__GNUC__) && ((__GNUC__*10 + __GNUC_MINOR__) < 41) -#define _msgpack_atomic_counter_header "gcc_atomic.h" -#else -typedef unsigned int _msgpack_atomic_counter_t; -#define _msgpack_sync_decr_and_fetch(ptr) __sync_sub_and_fetch(ptr, 1) -#define _msgpack_sync_incr_and_fetch(ptr) __sync_add_and_fetch(ptr, 1) -#endif - -#ifdef _WIN32 - -#ifdef __cplusplus -/* numeric_limits::min,max */ -#ifdef max -#undef max -#endif -#ifdef min -#undef min -#endif -#endif - -#else -#include /* __BYTE_ORDER */ -#endif - -#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) -#if __BYTE_ORDER == __LITTLE_ENDIAN -#define __LITTLE_ENDIAN__ -#elif __BYTE_ORDER == __BIG_ENDIAN -#define __BIG_ENDIAN__ -#elif _WIN32 -#define __LITTLE_ENDIAN__ -#endif -#endif - - -#ifdef __LITTLE_ENDIAN__ - -#ifdef _WIN32 -# if defined(ntohs) -# define _msgpack_be16(x) ntohs(x) -# elif defined(_byteswap_ushort) || (defined(_MSC_VER) && _MSC_VER >= 1400) -# define _msgpack_be16(x) ((uint16_t)_byteswap_ushort((unsigned short)x)) -# else -# define _msgpack_be16(x) ( \ - ((((uint16_t)x) << 8) ) | \ - ((((uint16_t)x) >> 8) ) ) -# endif -#else -# define _msgpack_be16(x) ntohs(x) -#endif - -#ifdef _WIN32 -# if defined(ntohl) -# define _msgpack_be32(x) ntohl(x) -# elif defined(_byteswap_ulong) || (defined(_MSC_VER) && _MSC_VER >= 1400) -# define _msgpack_be32(x) ((uint32_t)_byteswap_ulong((unsigned long)x)) -# else -# define _msgpack_be32(x) \ - ( ((((uint32_t)x) << 24) ) | \ - ((((uint32_t)x) << 8) & 0x00ff0000U ) | \ - ((((uint32_t)x) >> 8) & 0x0000ff00U ) | \ - ((((uint32_t)x) >> 24) ) ) -# endif -#else -# define _msgpack_be32(x) ntohl(x) -#endif - -#if defined(_byteswap_uint64) || (defined(_MSC_VER) && _MSC_VER >= 1400) -# define _msgpack_be64(x) (_byteswap_uint64(x)) -#elif defined(bswap_64) -# define _msgpack_be64(x) bswap_64(x) -#elif defined(__DARWIN_OSSwapInt64) -# define _msgpack_be64(x) __DARWIN_OSSwapInt64(x) -#else -#define _msgpack_be64(x) \ - ( ((((uint64_t)x) << 56) ) | \ - ((((uint64_t)x) << 40) & 0x00ff000000000000ULL ) | \ - ((((uint64_t)x) << 24) & 0x0000ff0000000000ULL ) | \ - ((((uint64_t)x) << 8) & 0x000000ff00000000ULL ) | \ - ((((uint64_t)x) >> 8) & 0x00000000ff000000ULL ) | \ - ((((uint64_t)x) >> 24) & 0x0000000000ff0000ULL ) | \ - ((((uint64_t)x) >> 40) & 0x000000000000ff00ULL ) | \ - ((((uint64_t)x) >> 56) ) ) -#endif - -#define _msgpack_load16(cast, from) ((cast)( \ - (((uint16_t)((uint8_t*)(from))[0]) << 8) | \ - (((uint16_t)((uint8_t*)(from))[1]) ) )) - -#define _msgpack_load32(cast, from) ((cast)( \ - (((uint32_t)((uint8_t*)(from))[0]) << 24) | \ - (((uint32_t)((uint8_t*)(from))[1]) << 16) | \ - (((uint32_t)((uint8_t*)(from))[2]) << 8) | \ - (((uint32_t)((uint8_t*)(from))[3]) ) )) - -#define _msgpack_load64(cast, from) ((cast)( \ - (((uint64_t)((uint8_t*)(from))[0]) << 56) | \ - 
(((uint64_t)((uint8_t*)(from))[1]) << 48) | \ - (((uint64_t)((uint8_t*)(from))[2]) << 40) | \ - (((uint64_t)((uint8_t*)(from))[3]) << 32) | \ - (((uint64_t)((uint8_t*)(from))[4]) << 24) | \ - (((uint64_t)((uint8_t*)(from))[5]) << 16) | \ - (((uint64_t)((uint8_t*)(from))[6]) << 8) | \ - (((uint64_t)((uint8_t*)(from))[7]) ) )) - -#else - -#define _msgpack_be16(x) (x) -#define _msgpack_be32(x) (x) -#define _msgpack_be64(x) (x) - -#define _msgpack_load16(cast, from) ((cast)( \ - (((uint16_t)((uint8_t*)from)[0]) << 8) | \ - (((uint16_t)((uint8_t*)from)[1]) ) )) - -#define _msgpack_load32(cast, from) ((cast)( \ - (((uint32_t)((uint8_t*)from)[0]) << 24) | \ - (((uint32_t)((uint8_t*)from)[1]) << 16) | \ - (((uint32_t)((uint8_t*)from)[2]) << 8) | \ - (((uint32_t)((uint8_t*)from)[3]) ) )) - -#define _msgpack_load64(cast, from) ((cast)( \ - (((uint64_t)((uint8_t*)from)[0]) << 56) | \ - (((uint64_t)((uint8_t*)from)[1]) << 48) | \ - (((uint64_t)((uint8_t*)from)[2]) << 40) | \ - (((uint64_t)((uint8_t*)from)[3]) << 32) | \ - (((uint64_t)((uint8_t*)from)[4]) << 24) | \ - (((uint64_t)((uint8_t*)from)[5]) << 16) | \ - (((uint64_t)((uint8_t*)from)[6]) << 8) | \ - (((uint64_t)((uint8_t*)from)[7]) ) )) -#endif - - -#define _msgpack_store16(to, num) \ - do { uint16_t val = _msgpack_be16(num); memcpy(to, &val, 2); } while(0) -#define _msgpack_store32(to, num) \ - do { uint32_t val = _msgpack_be32(num); memcpy(to, &val, 4); } while(0) -#define _msgpack_store64(to, num) \ - do { uint64_t val = _msgpack_be64(num); memcpy(to, &val, 8); } while(0) - -/* -#define _msgpack_load16(cast, from) \ - ({ cast val; memcpy(&val, (char*)from, 2); _msgpack_be16(val); }) -#define _msgpack_load32(cast, from) \ - ({ cast val; memcpy(&val, (char*)from, 4); _msgpack_be32(val); }) -#define _msgpack_load64(cast, from) \ - ({ cast val; memcpy(&val, (char*)from, 8); _msgpack_be64(val); }) -*/ - - -#endif /* msgpack/sysdep.h */ diff --git a/src/borg/cache_sync/unpack.h b/src/borg/cache_sync/unpack.h deleted file mode 100644 index ad898655fd..0000000000 --- a/src/borg/cache_sync/unpack.h +++ /dev/null @@ -1,416 +0,0 @@ -/* - * Borg cache synchronizer, - * based on a MessagePack for Python unpacking routine - * - * Copyright (C) 2009 Naoki INADA - * Copyright (c) 2017 Marian Beermann - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * This limits the depth of the structures we can unpack, i.e. how many containers - * are nestable. - */ -#define MSGPACK_EMBED_STACK_SIZE (16) -#include "unpack_define.h" - -// 2**32 - 1025 -#define _MAX_VALUE ( (uint32_t) 4294966271UL ) - -#define MIN(x, y) ((x) < (y) ? (x): (y)) - -#ifdef DEBUG -#define SET_LAST_ERROR(msg) \ - fprintf(stderr, "cache_sync parse error: %s\n", (msg)); \ - u->last_error = (msg); -#else -#define SET_LAST_ERROR(msg) \ - u->last_error = (msg); -#endif - -typedef struct unpack_user { - /* Item.chunks is at the top level; we don't care about anything else, - * only need to track the current level to navigate arbitrary and unknown structure. 
- * To discern keys from everything else on the top level we use expect_map_item_end. - */ - int level; - - const char *last_error; - - HashIndex *chunks; - - /* - * We don't care about most stuff. This flag tells us whether we're at the chunks structure, - * meaning: - * {'foo': 'bar', 'chunks': [...], 'stuff': ... } - * ^-HERE-^ - */ - int inside_chunks; - - /* does this item have a chunks list in it? */ - int has_chunks; - - enum { - /* the next thing is a map key at the Item root level, - * and it might be e.g. the "chunks" key we're looking for */ - expect_map_key, - - /* blocking state to expect_map_key - * { 'stuff': , 'chunks': [ - * emk -> emie -> -> -> -> emk ecb eeboce - * (nested containers are tracked via level) - * emk=expect_map_key, emie=expect_map_item_end, ecb=expect_chunks_begin, - * eeboce=expect_entry_begin_or_chunks_end - */ - expect_map_item_end, - - /* next thing must be the chunks array (array) */ - expect_chunks_begin, - - /* next thing must either be another CLE (array) or end of Item.chunks (array_end) */ - expect_entry_begin_or_chunks_end, - - /* - * processing ChunkListEntry tuple: - * expect_key, expect_size, expect_entry_end - */ - /* next thing must be the key (raw, l=32) */ - expect_key, - /* next thing must be the size (int) */ - expect_size, - /* next thing must be the end of the CLE (array_end) */ - expect_entry_end, - - expect_item_begin - } expect; - - /* collect values here for current chunklist entry */ - struct { - unsigned char key[32]; - uint32_t size; - } current; - - /* summing up chunks sizes here within a single item */ - struct { - uint64_t size; - } item; - - /* total sizes and files count coming from all files */ - struct { - uint64_t size, num_files; - } totals; - -} unpack_user; - -struct unpack_context; -typedef struct unpack_context unpack_context; -typedef int (*execute_fn)(unpack_context *ctx, const char* data, size_t len, size_t* off); - -#define UNEXPECTED(what) \ - if(u->inside_chunks || u->expect == expect_map_key) { \ - SET_LAST_ERROR("Unexpected object: " what); \ - return -1; \ - } - -static inline void unpack_init_user_state(unpack_user *u) -{ - u->last_error = NULL; - u->level = 0; - u->inside_chunks = 0; - u->expect = expect_item_begin; -} - -static inline int unpack_callback_uint64(unpack_user* u, int64_t d) -{ - switch(u->expect) { - case expect_size: - u->current.size = d; - u->expect = expect_entry_end; - break; - default: - UNEXPECTED("integer"); - } - return 0; -} - -static inline int unpack_callback_uint32(unpack_user* u, uint32_t d) -{ - return unpack_callback_uint64(u, d); -} - -static inline int unpack_callback_uint16(unpack_user* u, uint16_t d) -{ - return unpack_callback_uint64(u, d); -} - -static inline int unpack_callback_uint8(unpack_user* u, uint8_t d) -{ - return unpack_callback_uint64(u, d); -} - -static inline int unpack_callback_int64(unpack_user* u, uint64_t d) -{ - return unpack_callback_uint64(u, d); -} - -static inline int unpack_callback_int32(unpack_user* u, int32_t d) -{ - return unpack_callback_uint64(u, d); -} - -static inline int unpack_callback_int16(unpack_user* u, int16_t d) -{ - return unpack_callback_uint64(u, d); -} - -static inline int unpack_callback_int8(unpack_user* u, int8_t d) -{ - return unpack_callback_uint64(u, d); -} - -/* Ain't got anything to do with those floats */ -static inline int unpack_callback_double(unpack_user* u, double d) -{ - (void)d; - UNEXPECTED("double"); - return 0; -} - -static inline int unpack_callback_float(unpack_user* u, float d) -{ - (void)d; - 
UNEXPECTED("float"); - return 0; -} - -/* nil/true/false — I/don't/care */ -static inline int unpack_callback_nil(unpack_user* u) -{ - UNEXPECTED("nil"); - return 0; -} - -static inline int unpack_callback_true(unpack_user* u) -{ - UNEXPECTED("true"); - return 0; -} - -static inline int unpack_callback_false(unpack_user* u) -{ - UNEXPECTED("false"); - return 0; -} - -static inline int unpack_callback_array(unpack_user* u, unsigned int n) -{ - switch(u->expect) { - case expect_chunks_begin: - /* b'chunks': [ - * ^ */ - u->expect = expect_entry_begin_or_chunks_end; - break; - case expect_entry_begin_or_chunks_end: - /* b'chunks': [ ( - * ^ */ - if(n != 2) { - SET_LAST_ERROR("Invalid chunk list entry length"); - return -1; - } - u->expect = expect_key; - break; - default: - if(u->inside_chunks) { - SET_LAST_ERROR("Unexpected array start"); - return -1; - } else { - u->level++; - return 0; - } - } - return 0; -} - -static inline int unpack_callback_array_item(unpack_user* u, unsigned int current) -{ - (void)u; (void)current; - return 0; -} - -static inline int unpack_callback_array_end(unpack_user* u) -{ - uint32_t *cache_entry; - uint32_t cache_values[3]; - uint64_t refcount; - - switch(u->expect) { - case expect_entry_end: - /* b'chunks': [ ( b'1234...', 123, 345 ) - * ^ */ - cache_entry = (uint32_t*) hashindex_get(u->chunks, u->current.key); - if(cache_entry) { - refcount = _le32toh(cache_entry[0]); - if(refcount > _MAX_VALUE) { - SET_LAST_ERROR("invalid reference count"); - return -1; - } - refcount += 1; - cache_entry[0] = _htole32(MIN(refcount, _MAX_VALUE)); - } else { - /* refcount, size */ - cache_values[0] = _htole32(1); - cache_values[1] = _htole32(u->current.size); - if(!hashindex_set(u->chunks, u->current.key, cache_values)) { - SET_LAST_ERROR("hashindex_set failed"); - return -1; - } - } - u->item.size += u->current.size; - u->expect = expect_entry_begin_or_chunks_end; - break; - case expect_entry_begin_or_chunks_end: - /* b'chunks': [ ] - * ^ */ - /* end of Item.chunks */ - u->inside_chunks = 0; - u->expect = expect_map_item_end; - break; - default: - if(u->inside_chunks) { - SET_LAST_ERROR("Invalid state transition (unexpected array end)"); - return -1; - } else { - u->level--; - return 0; - } - } - return 0; -} - -static inline int unpack_callback_map(unpack_user* u, unsigned int n) -{ - (void)n; - - if(u->level == 0) { - if(u->expect != expect_item_begin) { - SET_LAST_ERROR("Invalid state transition"); /* unreachable */ - return -1; - } - /* This begins a new Item */ - u->expect = expect_map_key; - u->has_chunks = 0; - u->item.size = 0; - } - - if(u->inside_chunks) { - UNEXPECTED("map"); - } - - u->level++; - - return 0; -} - -static inline int unpack_callback_map_item(unpack_user* u, unsigned int current) -{ - (void)u; (void)current; - - if(u->level == 1) { - switch(u->expect) { - case expect_map_item_end: - u->expect = expect_map_key; - break; - default: - SET_LAST_ERROR("Unexpected map item"); - return -1; - } - } - return 0; -} - -static inline int unpack_callback_map_end(unpack_user* u) -{ - u->level--; - if(u->inside_chunks) { - SET_LAST_ERROR("Unexpected map end"); - return -1; - } - if(u->level == 0) { - /* This ends processing of an Item */ - if(u->has_chunks) { - u->totals.num_files += 1; - u->totals.size += u->item.size; - } - } - return 0; -} - -static inline int unpack_callback_raw(unpack_user* u, const char* b, const char* p, unsigned int length) -{ - /* raw = what Borg uses for text stuff */ - /* Note: p points to an internal buffer which contains l bytes. 
*/ - (void)b; - - switch(u->expect) { - case expect_map_key: - if(length == 6 && !memcmp("chunks", p, 6)) { - u->expect = expect_chunks_begin; - u->inside_chunks = 1; - u->has_chunks = 1; - } else { - u->expect = expect_map_item_end; - } - break; - default: - if(u->inside_chunks) { - SET_LAST_ERROR("Unexpected raw in chunks structure"); - return -1; - } - } - return 0; -} - -static inline int unpack_callback_bin(unpack_user* u, const char* b, const char* p, unsigned int length) -{ - /* bin = what Borg uses for binary stuff */ - /* Note: p points to an internal buffer which contains l bytes. */ - (void)b; - - switch(u->expect) { - case expect_key: - if(length != 32) { - SET_LAST_ERROR("Incorrect key length"); - return -1; - } - memcpy(u->current.key, p, 32); - u->expect = expect_size; - break; - default: - if(u->inside_chunks) { - SET_LAST_ERROR("Unexpected bytes in chunks structure"); - return -1; - } - } - return 0; -} - -static inline int unpack_callback_ext(unpack_user* u, const char* base, const char* pos, - unsigned int length) -{ - (void)u; (void)base; (void)pos; (void)length; - UNEXPECTED("ext"); - return 0; -} - -#include "unpack_template.h" diff --git a/src/borg/cache_sync/unpack_define.h b/src/borg/cache_sync/unpack_define.h deleted file mode 100644 index 10c910861d..0000000000 --- a/src/borg/cache_sync/unpack_define.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * MessagePack unpacking routine template - * - * Copyright (C) 2008-2010 FURUHASHI Sadayuki - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#ifndef MSGPACK_UNPACK_DEFINE_H__ -#define MSGPACK_UNPACK_DEFINE_H__ - -#include "sysdep.h" -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - - -#ifndef MSGPACK_EMBED_STACK_SIZE -#define MSGPACK_EMBED_STACK_SIZE 32 -#endif - - -// CS is first byte & 0x1f -typedef enum { - CS_HEADER = 0x00, // nil - - //CS_ = 0x01, - //CS_ = 0x02, // false - //CS_ = 0x03, // true - - CS_BIN_8 = 0x04, - CS_BIN_16 = 0x05, - CS_BIN_32 = 0x06, - - CS_EXT_8 = 0x07, - CS_EXT_16 = 0x08, - CS_EXT_32 = 0x09, - - CS_FLOAT = 0x0a, - CS_DOUBLE = 0x0b, - CS_UINT_8 = 0x0c, - CS_UINT_16 = 0x0d, - CS_UINT_32 = 0x0e, - CS_UINT_64 = 0x0f, - CS_INT_8 = 0x10, - CS_INT_16 = 0x11, - CS_INT_32 = 0x12, - CS_INT_64 = 0x13, - - //CS_FIXEXT1 = 0x14, - //CS_FIXEXT2 = 0x15, - //CS_FIXEXT4 = 0x16, - //CS_FIXEXT8 = 0x17, - //CS_FIXEXT16 = 0x18, - - CS_RAW_8 = 0x19, - CS_RAW_16 = 0x1a, - CS_RAW_32 = 0x1b, - CS_ARRAY_16 = 0x1c, - CS_ARRAY_32 = 0x1d, - CS_MAP_16 = 0x1e, - CS_MAP_32 = 0x1f, - - ACS_RAW_VALUE, - ACS_BIN_VALUE, - ACS_EXT_VALUE, -} msgpack_unpack_state; - - -typedef enum { - CT_ARRAY_ITEM, - CT_MAP_KEY, - CT_MAP_VALUE, -} msgpack_container_type; - - -#ifdef __cplusplus -} -#endif - -#endif /* msgpack/unpack_define.h */ diff --git a/src/borg/cache_sync/unpack_template.h b/src/borg/cache_sync/unpack_template.h deleted file mode 100644 index 39f9f33141..0000000000 --- a/src/borg/cache_sync/unpack_template.h +++ /dev/null @@ -1,365 +0,0 @@ -/* - * MessagePack unpacking routine template - * - * Copyright (C) 2008-2010 FURUHASHI Sadayuki - * Copyright (c) 2017 Marian Beermann - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * - * This has been slightly adapted from the vanilla msgpack-{c, python} version. - * Since cache_sync does not intend to build an output data structure, - * msgpack_unpack_object and all of its uses was removed. 
- */ - -#ifndef USE_CASE_RANGE -#if !defined(_MSC_VER) -#define USE_CASE_RANGE -#endif -#endif - -typedef struct unpack_stack { - size_t size; - size_t count; - unsigned int ct; -} unpack_stack; - -struct unpack_context { - unpack_user user; - unsigned int cs; - unsigned int trail; - unsigned int top; - unpack_stack stack[MSGPACK_EMBED_STACK_SIZE]; -}; - -static inline void unpack_init(unpack_context* ctx) -{ - ctx->cs = CS_HEADER; - ctx->trail = 0; - ctx->top = 0; - unpack_init_user_state(&ctx->user); -} - -#define construct 1 - -static inline int unpack_execute(unpack_context* ctx, const char* data, size_t len, size_t* off) -{ - const unsigned char* p = (unsigned char*)data + *off; - const unsigned char* const pe = (unsigned char*)data + len; - const void* n = NULL; - - unsigned int trail = ctx->trail; - unsigned int cs = ctx->cs; - unsigned int top = ctx->top; - unpack_stack* stack = ctx->stack; - unpack_user* user = &ctx->user; - - unpack_stack* c = NULL; - - int ret; - - assert(len >= *off); - -#define construct_cb(name) \ - construct && unpack_callback ## name - -#define push_simple_value(func) \ - if(construct_cb(func)(user) < 0) { goto _failed; } \ - goto _push -#define push_fixed_value(func, arg) \ - if(construct_cb(func)(user, arg) < 0) { goto _failed; } \ - goto _push -#define push_variable_value(func, base, pos, len) \ - if(construct_cb(func)(user, \ - (const char*)base, (const char*)pos, len) < 0) { goto _failed; } \ - goto _push - -#define again_fixed_trail(_cs, trail_len) \ - trail = trail_len; \ - cs = _cs; \ - goto _fixed_trail_again -#define again_fixed_trail_if_zero(_cs, trail_len, ifzero) \ - trail = trail_len; \ - if(trail == 0) { goto ifzero; } \ - cs = _cs; \ - goto _fixed_trail_again - -#define start_container(func, count_, ct_) \ - if(top >= MSGPACK_EMBED_STACK_SIZE) { goto _failed; } /* FIXME */ \ - if(construct_cb(func)(user, count_) < 0) { goto _failed; } \ - if((count_) == 0) { \ - if (construct_cb(func##_end)(user) < 0) { goto _failed; } \ - goto _push; } \ - stack[top].ct = ct_; \ - stack[top].size = count_; \ - stack[top].count = 0; \ - ++top; \ - goto _header_again - -#define NEXT_CS(p) ((unsigned int)*p & 0x1f) - -#ifdef USE_CASE_RANGE -#define SWITCH_RANGE_BEGIN switch(*p) { -#define SWITCH_RANGE(FROM, TO) case FROM ... 
TO: -#define SWITCH_RANGE_DEFAULT default: -#define SWITCH_RANGE_END } -#else -#define SWITCH_RANGE_BEGIN { if(0) { -#define SWITCH_RANGE(FROM, TO) } else if(FROM <= *p && *p <= TO) { -#define SWITCH_RANGE_DEFAULT } else { -#define SWITCH_RANGE_END } } -#endif - - if(p == pe) { goto _out; } - do { - switch(cs) { - case CS_HEADER: - SWITCH_RANGE_BEGIN - SWITCH_RANGE(0x00, 0x7f) // Positive Fixnum - push_fixed_value(_uint8, *(uint8_t*)p); - SWITCH_RANGE(0xe0, 0xff) // Negative Fixnum - push_fixed_value(_int8, *(int8_t*)p); - SWITCH_RANGE(0xc0, 0xdf) // Variable - switch(*p) { - case 0xc0: // nil - push_simple_value(_nil); - //case 0xc1: // never used - case 0xc2: // false - push_simple_value(_false); - case 0xc3: // true - push_simple_value(_true); - case 0xc4: // bin 8 - again_fixed_trail(NEXT_CS(p), 1); - case 0xc5: // bin 16 - again_fixed_trail(NEXT_CS(p), 2); - case 0xc6: // bin 32 - again_fixed_trail(NEXT_CS(p), 4); - case 0xc7: // ext 8 - again_fixed_trail(NEXT_CS(p), 1); - case 0xc8: // ext 16 - again_fixed_trail(NEXT_CS(p), 2); - case 0xc9: // ext 32 - again_fixed_trail(NEXT_CS(p), 4); - case 0xca: // float - case 0xcb: // double - case 0xcc: // unsigned int 8 - case 0xcd: // unsigned int 16 - case 0xce: // unsigned int 32 - case 0xcf: // unsigned int 64 - case 0xd0: // signed int 8 - case 0xd1: // signed int 16 - case 0xd2: // signed int 32 - case 0xd3: // signed int 64 - again_fixed_trail(NEXT_CS(p), 1 << (((unsigned int)*p) & 0x03)); - case 0xd4: // fixext 1 - case 0xd5: // fixext 2 - case 0xd6: // fixext 4 - case 0xd7: // fixext 8 - again_fixed_trail_if_zero(ACS_EXT_VALUE, - (1 << (((unsigned int)*p) & 0x03))+1, - _ext_zero); - case 0xd8: // fixext 16 - again_fixed_trail_if_zero(ACS_EXT_VALUE, 16+1, _ext_zero); - case 0xd9: // str 8 - again_fixed_trail(NEXT_CS(p), 1); - case 0xda: // raw 16 - case 0xdb: // raw 32 - case 0xdc: // array 16 - case 0xdd: // array 32 - case 0xde: // map 16 - case 0xdf: // map 32 - again_fixed_trail(NEXT_CS(p), 2 << (((unsigned int)*p) & 0x01)); - default: - goto _failed; - } - SWITCH_RANGE(0xa0, 0xbf) // FixRaw - again_fixed_trail_if_zero(ACS_RAW_VALUE, ((unsigned int)*p & 0x1f), _raw_zero); - SWITCH_RANGE(0x90, 0x9f) // FixArray - start_container(_array, ((unsigned int)*p) & 0x0f, CT_ARRAY_ITEM); - SWITCH_RANGE(0x80, 0x8f) // FixMap - start_container(_map, ((unsigned int)*p) & 0x0f, CT_MAP_KEY); - - SWITCH_RANGE_DEFAULT - goto _failed; - SWITCH_RANGE_END - // end CS_HEADER - - - _fixed_trail_again: - ++p; // fallthrough - - default: - if((size_t)(pe - p) < trail) { goto _out; } - n = p; p += trail - 1; - switch(cs) { - case CS_EXT_8: - again_fixed_trail_if_zero(ACS_EXT_VALUE, *(uint8_t*)n+1, _ext_zero); - case CS_EXT_16: - again_fixed_trail_if_zero(ACS_EXT_VALUE, - _msgpack_load16(uint16_t,n)+1, - _ext_zero); - case CS_EXT_32: - again_fixed_trail_if_zero(ACS_EXT_VALUE, - _msgpack_load32(uint32_t,n)+1, - _ext_zero); - case CS_FLOAT: { - union { uint32_t i; float f; } mem; - mem.i = _msgpack_load32(uint32_t,n); - push_fixed_value(_float, mem.f); } - case CS_DOUBLE: { - union { uint64_t i; double f; } mem; - mem.i = _msgpack_load64(uint64_t,n); -#if defined(__arm__) && !(__ARM_EABI__) // arm-oabi - // https://github.com/msgpack/msgpack-perl/pull/1 - mem.i = (mem.i & 0xFFFFFFFFUL) << 32UL | (mem.i >> 32UL); -#endif - push_fixed_value(_double, mem.f); } - case CS_UINT_8: - push_fixed_value(_uint8, *(uint8_t*)n); - case CS_UINT_16: - push_fixed_value(_uint16, _msgpack_load16(uint16_t,n)); - case CS_UINT_32: - push_fixed_value(_uint32, 
_msgpack_load32(uint32_t,n)); - case CS_UINT_64: - push_fixed_value(_uint64, _msgpack_load64(uint64_t,n)); - - case CS_INT_8: - push_fixed_value(_int8, *(int8_t*)n); - case CS_INT_16: - push_fixed_value(_int16, _msgpack_load16(int16_t,n)); - case CS_INT_32: - push_fixed_value(_int32, _msgpack_load32(int32_t,n)); - case CS_INT_64: - push_fixed_value(_int64, _msgpack_load64(int64_t,n)); - - case CS_BIN_8: - again_fixed_trail_if_zero(ACS_BIN_VALUE, *(uint8_t*)n, _bin_zero); - case CS_BIN_16: - again_fixed_trail_if_zero(ACS_BIN_VALUE, _msgpack_load16(uint16_t,n), _bin_zero); - case CS_BIN_32: - again_fixed_trail_if_zero(ACS_BIN_VALUE, _msgpack_load32(uint32_t,n), _bin_zero); - case ACS_BIN_VALUE: - _bin_zero: - push_variable_value(_bin, data, n, trail); - - case CS_RAW_8: - again_fixed_trail_if_zero(ACS_RAW_VALUE, *(uint8_t*)n, _raw_zero); - case CS_RAW_16: - again_fixed_trail_if_zero(ACS_RAW_VALUE, _msgpack_load16(uint16_t,n), _raw_zero); - case CS_RAW_32: - again_fixed_trail_if_zero(ACS_RAW_VALUE, _msgpack_load32(uint32_t,n), _raw_zero); - case ACS_RAW_VALUE: - _raw_zero: - push_variable_value(_raw, data, n, trail); - - case ACS_EXT_VALUE: - _ext_zero: - push_variable_value(_ext, data, n, trail); - - case CS_ARRAY_16: - start_container(_array, _msgpack_load16(uint16_t,n), CT_ARRAY_ITEM); - case CS_ARRAY_32: - /* FIXME security guard */ - start_container(_array, _msgpack_load32(uint32_t,n), CT_ARRAY_ITEM); - - case CS_MAP_16: - start_container(_map, _msgpack_load16(uint16_t,n), CT_MAP_KEY); - case CS_MAP_32: - /* FIXME security guard */ - start_container(_map, _msgpack_load32(uint32_t,n), CT_MAP_KEY); - - default: - goto _failed; - } - } - -_push: - if(top == 0) { goto _finish; } - c = &stack[top-1]; - switch(c->ct) { - case CT_ARRAY_ITEM: - if(construct_cb(_array_item)(user, c->count) < 0) { goto _failed; } - if(++c->count == c->size) { - if (construct_cb(_array_end)(user) < 0) { goto _failed; } - --top; - /*printf("stack pop %d\n", top);*/ - goto _push; - } - goto _header_again; - case CT_MAP_KEY: - c->ct = CT_MAP_VALUE; - goto _header_again; - case CT_MAP_VALUE: - if(construct_cb(_map_item)(user, c->count) < 0) { goto _failed; } - if(++c->count == c->size) { - if (construct_cb(_map_end)(user) < 0) { goto _failed; } - --top; - /*printf("stack pop %d\n", top);*/ - goto _push; - } - c->ct = CT_MAP_KEY; - goto _header_again; - - default: - goto _failed; - } - -_header_again: - cs = CS_HEADER; - ++p; - } while(p != pe); - goto _out; - - -_finish: - if (!construct) - unpack_callback_nil(user); - ++p; - ret = 1; - /* printf("-- finish --\n"); */ - goto _end; - -_failed: - /* printf("** FAILED **\n"); */ - ret = -1; - goto _end; - -_out: - ret = 0; - goto _end; - -_end: - ctx->cs = cs; - ctx->trail = trail; - ctx->top = top; - *off = p - (const unsigned char*)data; - - return ret; -#undef construct_cb -} - -#undef SWITCH_RANGE_BEGIN -#undef SWITCH_RANGE -#undef SWITCH_RANGE_DEFAULT -#undef SWITCH_RANGE_END -#undef push_simple_value -#undef push_fixed_value -#undef push_variable_value -#undef again_fixed_trail -#undef again_fixed_trail_if_zero -#undef start_container -#undef construct - -#undef NEXT_CS - -/* vim: set ts=4 sw=4 sts=4 expandtab */ diff --git a/src/borg/constants.py b/src/borg/constants.py index 76e63f7924..0511f62de0 100644 --- a/src/borg/constants.py +++ b/src/borg/constants.py @@ -78,7 +78,7 @@ # MAX_DATA_SIZE or it will trigger the check for that. 
MAX_ARCHIVES = 400000 -# repo.list() / .scan() result count limit the borg client uses +# repo.list() result count limit the borg client uses LIST_SCAN_LIMIT = 100000 FD_MAX_AGE = 4 * 60 # 4 minutes diff --git a/src/borg/crypto/keymanager.py b/src/borg/crypto/keymanager.py index d8d25893d4..63335c445a 100644 --- a/src/borg/crypto/keymanager.py +++ b/src/borg/crypto/keymanager.py @@ -4,8 +4,6 @@ from hashlib import sha256 from ..helpers import Error, yes, bin_to_hex, hex_to_bin, dash_open -from ..manifest import Manifest, NoManifestError -from ..repository import Repository from ..repoobj import RepoObj @@ -48,11 +46,7 @@ def __init__(self, repository): self.keyblob = None self.keyblob_storage = None - try: - manifest_chunk = self.repository.get(Manifest.MANIFEST_ID) - except Repository.ObjectNotFound: - raise NoManifestError - + manifest_chunk = repository.get_manifest() manifest_data = RepoObj.extract_crypted_data(manifest_chunk) key = identify_key(manifest_data) self.keyblob_storage = key.STORAGE diff --git a/src/borg/locking.py b/src/borg/fslocking.py similarity index 100% rename from src/borg/locking.py rename to src/borg/fslocking.py diff --git a/src/borg/hashindex.pyi b/src/borg/hashindex.pyi index a50a76e4c6..722baaf9f8 100644 --- a/src/borg/hashindex.pyi +++ b/src/borg/hashindex.pyi @@ -38,13 +38,7 @@ class ChunkKeyIterator: class ChunkIndex(IndexBase): def add(self, key: bytes, refs: int, size: int) -> None: ... - def decref(self, key: bytes) -> CIE: ... - def incref(self, key: bytes) -> CIE: ... def iteritems(self, marker: bytes = ...) -> Iterator: ... - def merge(self, other_index) -> None: ... - def stats_against(self, master_index) -> Tuple: ... - def summarize(self) -> Tuple: ... - def zero_csize_ids(self) -> int: ... def __contains__(self, key: bytes) -> bool: ... def __getitem__(self, key: bytes) -> Type[ChunkIndexEntry]: ... def __setitem__(self, key: bytes, value: CIE) -> None: ... @@ -64,22 +58,14 @@ class NSIndex(IndexBase): def __contains__(self, key: bytes) -> bool: ... def __getitem__(self, key: bytes) -> Any: ... def __setitem__(self, key: bytes, value: Any) -> None: ... - def flags(self, key: bytes, mask: int, value: int = None) -> int: ... class NSIndex1(IndexBase): # legacy def iteritems(self, *args, **kwargs) -> Iterator: ... def __contains__(self, key: bytes) -> bool: ... def __getitem__(self, key: bytes) -> Any: ... def __setitem__(self, key: bytes, value: Any) -> None: ... - def flags(self, key: bytes, mask: int, value: int = None) -> int: ... class FuseVersionsIndex(IndexBase): def __contains__(self, key: bytes) -> bool: ... def __getitem__(self, key: bytes) -> Any: ... def __setitem__(self, key: bytes, value: Any) -> None: ... - -class CacheSynchronizer: - size_totals: int - num_files_totals: int - def __init__(self, chunks_index: Any) -> None: ... - def feed(self, chunk: bytes) -> None: ... 
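The hashindex.pyi changes above shrink ChunkIndex to a plain "chunk id -> (refcount, size)" mapping: incref/decref, merge, summarize, stats_against and the CacheSynchronizer stub are all gone. As a quick orientation, here is a minimal sketch of the surviving API (editor's illustration only, not part of the patch; the all-zero 32-byte ID and the sizes are made-up placeholder values):

    from borg.hashindex import ChunkIndex

    idx = ChunkIndex()
    chunk_id = bytes(32)           # placeholder; real chunk IDs are 32-byte repo object IDs
    idx.add(chunk_id, 1, 4096)     # refcount and size for this chunk
    assert chunk_id in idx
    entry = idx[chunk_id]          # a ChunkIndexEntry namedtuple: (refcount, size)
    for key, value in idx.iteritems():
        pass                       # iterate over all known chunks, e.g. to rebuild a cache
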
diff --git a/src/borg/hashindex.pyx b/src/borg/hashindex.pyx index 207227035f..6e9404f10a 100644 --- a/src/borg/hashindex.pyx +++ b/src/borg/hashindex.pyx @@ -34,19 +34,7 @@ cdef extern from "_hashindex.c": double HASH_MAX_LOAD -cdef extern from "cache_sync/cache_sync.c": - ctypedef struct CacheSyncCtx: - pass - - CacheSyncCtx *cache_sync_init(HashIndex *chunks) - const char *cache_sync_error(const CacheSyncCtx *ctx) - uint64_t cache_sync_num_files_totals(const CacheSyncCtx *ctx) - uint64_t cache_sync_size_totals(const CacheSyncCtx *ctx) - int cache_sync_feed(CacheSyncCtx *ctx, void *data, uint32_t length) - void cache_sync_free(CacheSyncCtx *ctx) - - uint32_t _MAX_VALUE - +_MAX_VALUE = 4294966271UL # 2**32 - 1025 cdef _NoDefault = object() @@ -208,7 +196,7 @@ NSIndexEntry = namedtuple('NSIndexEntry', 'segment offset size') cdef class NSIndex(IndexBase): - value_size = 16 + value_size = 12 def __getitem__(self, key): assert len(key) == self.key_size @@ -221,13 +209,13 @@ cdef class NSIndex(IndexBase): def __setitem__(self, key, value): assert len(key) == self.key_size - cdef uint32_t[4] data + cdef uint32_t[3] data + assert len(value) == len(data) cdef uint32_t segment = value[0] assert segment <= _MAX_VALUE, "maximum number of segments reached" data[0] = _htole32(segment) data[1] = _htole32(value[1]) data[2] = _htole32(value[2]) - data[3] = 0 # init flags to all cleared if not hashindex_set(self.index, key, data): raise Exception('hashindex_set failed') @@ -240,12 +228,10 @@ cdef class NSIndex(IndexBase): assert segment <= _MAX_VALUE, "maximum number of segments reached" return data != NULL - def iteritems(self, marker=None, mask=0, value=0): + def iteritems(self, marker=None): """iterate over all items or optionally only over items having specific flag values""" cdef const unsigned char *key - assert isinstance(mask, int) - assert isinstance(value, int) - iter = NSKeyIterator(self.key_size, mask, value) + iter = NSKeyIterator(self.key_size) iter.idx = self iter.index = self.index if marker: @@ -255,20 +241,6 @@ cdef class NSIndex(IndexBase): iter.key = key - self.key_size return iter - def flags(self, key, mask=0xFFFFFFFF, value=None): - """query and optionally set flags""" - assert len(key) == self.key_size - assert isinstance(mask, int) - data = hashindex_get(self.index, key) - if not data: - raise KeyError(key) - flags = _le32toh(data[3]) - if isinstance(value, int): - new_flags = flags & ~mask # clear masked bits - new_flags |= value & mask # set value bits - data[3] = _htole32(new_flags) - return flags & mask # always return previous flags value - cdef class NSKeyIterator: cdef NSIndex idx @@ -276,15 +248,10 @@ cdef class NSKeyIterator: cdef const unsigned char *key cdef int key_size cdef int exhausted - cdef unsigned int flag_mask - cdef unsigned int flag_value - def __cinit__(self, key_size, mask, value): + def __cinit__(self, key_size): self.key = NULL self.key_size = key_size - # note: mask and value both default to 0, so they will match all entries - self.flag_mask = _htole32(mask) - self.flag_value = _htole32(value) self.exhausted = 0 def __iter__(self): @@ -294,16 +261,11 @@ cdef class NSKeyIterator: cdef uint32_t *value if self.exhausted: raise StopIteration - while True: - self.key = hashindex_next_key(self.index, self.key) - if not self.key: - self.exhausted = 1 - raise StopIteration - value = (self.key + self.key_size) - if value[3] & self.flag_mask == self.flag_value: - # we found a matching entry! 
- break - + self.key = hashindex_next_key(self.index, self.key) + if not self.key: + self.exhausted = 1 + raise StopIteration + value = (self.key + self.key_size) cdef uint32_t segment = _le32toh(value[0]) assert segment <= _MAX_VALUE, "maximum number of segments reached" return ((self.key)[:self.key_size], @@ -343,9 +305,8 @@ cdef class NSIndex1(IndexBase): # legacy borg 1.x assert segment <= _MAX_VALUE, "maximum number of segments reached" return data != NULL - def iteritems(self, marker=None, mask=0, value=0): + def iteritems(self, marker=None): cdef const unsigned char *key - assert mask == 0 and value == 0, "using mask/value is not supported for old index" iter = NSKeyIterator1(self.key_size) iter.idx = self iter.index = self.index @@ -356,9 +317,6 @@ cdef class NSIndex1(IndexBase): # legacy borg 1.x iter.key = key - self.key_size return iter - def flags(self, key, mask=0xFFFFFFFF, value=None): - raise NotImplemented("calling .flags() is not supported for old index") - cdef class NSKeyIterator1: # legacy borg 1.x cdef NSIndex1 idx @@ -436,33 +394,6 @@ cdef class ChunkIndex(IndexBase): assert _le32toh(data[0]) <= _MAX_VALUE, "invalid reference count" return data != NULL - def incref(self, key): - """Increase refcount for 'key', return (refcount, size)""" - assert len(key) == self.key_size - data = hashindex_get(self.index, key) - if not data: - raise KeyError(key) - cdef uint32_t refcount = _le32toh(data[0]) - assert refcount <= _MAX_VALUE, "invalid reference count" - if refcount != _MAX_VALUE: - refcount += 1 - data[0] = _htole32(refcount) - return refcount, _le32toh(data[1]) - - def decref(self, key): - """Decrease refcount for 'key', return (refcount, size)""" - assert len(key) == self.key_size - data = hashindex_get(self.index, key) - if not data: - raise KeyError(key) - cdef uint32_t refcount = _le32toh(data[0]) - # Never decrease a reference count of zero - assert 0 < refcount <= _MAX_VALUE, "invalid reference count" - if refcount != _MAX_VALUE: - refcount -= 1 - data[0] = _htole32(refcount) - return refcount, _le32toh(data[1]) - def iteritems(self, marker=None): cdef const unsigned char *key iter = ChunkKeyIterator(self.key_size) @@ -475,65 +406,6 @@ cdef class ChunkIndex(IndexBase): iter.key = key - self.key_size return iter - def summarize(self): - cdef uint64_t size = 0, unique_size = 0, chunks = 0, unique_chunks = 0 - cdef uint32_t *values - cdef uint32_t refcount - cdef unsigned char *key = NULL - - while True: - key = hashindex_next_key(self.index, key) - if not key: - break - unique_chunks += 1 - values = (key + self.key_size) - refcount = _le32toh(values[0]) - assert refcount <= _MAX_VALUE, "invalid reference count" - chunks += refcount - unique_size += _le32toh(values[1]) - size += _le32toh(values[1]) * _le32toh(values[0]) - - return size, unique_size, unique_chunks, chunks - - def stats_against(self, ChunkIndex master_index): - """ - Calculate chunk statistics of this index against *master_index*. - - A chunk is counted as unique if the number of references - in this index matches the number of references in *master_index*. - - This index must be a subset of *master_index*. - - Return the same statistics tuple as summarize: - size, unique_size, unique_chunks, chunks. 
- """ - cdef uint64_t size = 0, unique_size = 0, chunks = 0, unique_chunks = 0 - cdef uint32_t our_refcount, chunk_size - cdef const uint32_t *our_values - cdef const uint32_t *master_values - cdef const unsigned char *key = NULL - cdef HashIndex *master = master_index.index - - while True: - key = hashindex_next_key(self.index, key) - if not key: - break - our_values = (key + self.key_size) - master_values = hashindex_get(master, key) - if not master_values: - raise ValueError('stats_against: key contained in self but not in master_index.') - our_refcount = _le32toh(our_values[0]) - chunk_size = _le32toh(master_values[1]) - - chunks += our_refcount - size += chunk_size * our_refcount - if our_values[0] == master_values[0]: - # our refcount equals the master's refcount, so this chunk is unique to us - unique_chunks += 1 - unique_size += chunk_size - - return size, unique_size, unique_chunks, chunks - def add(self, key, refs, size): assert len(key) == self.key_size cdef uint32_t[2] data @@ -556,15 +428,6 @@ cdef class ChunkIndex(IndexBase): if not hashindex_set(self.index, key, data): raise Exception('hashindex_set failed') - def merge(self, ChunkIndex other): - cdef unsigned char *key = NULL - - while True: - key = hashindex_next_key(other.index, key) - if not key: - break - self._add(key, (key + self.key_size)) - cdef class ChunkKeyIterator: cdef ChunkIndex idx @@ -592,42 +455,3 @@ cdef class ChunkKeyIterator: cdef uint32_t refcount = _le32toh(value[0]) assert refcount <= _MAX_VALUE, "invalid reference count" return (self.key)[:self.key_size], ChunkIndexEntry(refcount, _le32toh(value[1])) - - -cdef Py_buffer ro_buffer(object data) except *: - cdef Py_buffer view - PyObject_GetBuffer(data, &view, PyBUF_SIMPLE) - return view - - -cdef class CacheSynchronizer: - cdef ChunkIndex chunks - cdef CacheSyncCtx *sync - - def __cinit__(self, chunks): - self.chunks = chunks - self.sync = cache_sync_init(self.chunks.index) - if not self.sync: - raise Exception('cache_sync_init failed') - - def __dealloc__(self): - if self.sync: - cache_sync_free(self.sync) - - def feed(self, chunk): - cdef Py_buffer chunk_buf = ro_buffer(chunk) - cdef int rc - rc = cache_sync_feed(self.sync, chunk_buf.buf, chunk_buf.len) - PyBuffer_Release(&chunk_buf) - if not rc: - error = cache_sync_error(self.sync) - if error != NULL: - raise ValueError('cache_sync_feed failed: ' + error.decode('ascii')) - - @property - def num_files_totals(self): - return cache_sync_num_files_totals(self.sync) - - @property - def size_totals(self): - return cache_sync_size_totals(self.sync) diff --git a/src/borg/helpers/__init__.py b/src/borg/helpers/__init__.py index 53555e7e6a..23833dd52d 100644 --- a/src/borg/helpers/__init__.py +++ b/src/borg/helpers/__init__.py @@ -5,6 +5,7 @@ Code used to be in borg/helpers.py but was split into the modules in this package, which are imported into here for compatibility. 
""" + import os from typing import List from collections import namedtuple @@ -38,7 +39,7 @@ from .parseformat import swidth_slice, ellipsis_truncate from .parseformat import BorgJsonEncoder, basic_json_data, json_print, json_dump, prepare_dump_dict from .parseformat import Highlander, MakePathSafeAction -from .process import daemonize, daemonizing +from .process import daemonize, daemonizing, ThreadRunner from .process import signal_handler, raising_signal_handler, sig_int, ignore_sigint, SigHup, SigTerm from .process import popen_with_error_handling, is_terminal, prepare_subprocess_env, create_filter_process from .progress import ProgressIndicatorPercent, ProgressIndicatorMessage diff --git a/src/borg/helpers/fs.py b/src/borg/helpers/fs.py index f422fc25a8..b3c602f90b 100644 --- a/src/borg/helpers/fs.py +++ b/src/borg/helpers/fs.py @@ -416,7 +416,7 @@ def safe_unlink(path): Use this when deleting potentially large files when recovering from a VFS error such as ENOSPC. It can help a full file system recover. Refer to the "File system interaction" section - in repository.py for further explanations. + in legacyrepository.py for further explanations. """ try: os.unlink(path) diff --git a/src/borg/helpers/misc.py b/src/borg/helpers/misc.py index 1f687a0bb5..6028b93a30 100644 --- a/src/borg/helpers/misc.py +++ b/src/borg/helpers/misc.py @@ -2,7 +2,7 @@ import io import os import os.path -import platform +import platform # python stdlib import - if this fails, check that cwd != src/borg/ import sys from collections import deque from itertools import islice diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index c69889b180..f1ff427752 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -11,7 +11,7 @@ import stat import uuid from typing import Dict, Set, Tuple, ClassVar, Any, TYPE_CHECKING, Literal -from collections import Counter, OrderedDict +from collections import OrderedDict from datetime import datetime, timezone from functools import partial from string import Formatter @@ -454,6 +454,19 @@ class Location: re.VERBOSE, ) # path + sftp_re = re.compile( + r""" + (?Psftp):// # sftp:// + """ + + optional_user_re + + host_re + + r""" # user@ (optional), host name or address + (?::(?P\d+))? # :port (optional) + """ + + abs_path_re, + re.VERBOSE, + ) # path + socket_re = re.compile( r""" (?Psocket):// # socket:// @@ -499,7 +512,7 @@ def parse(self, text, overrides={}): if not text: # we did not get a text to parse, so we try to fetch from the environment text = os.environ.get(self.repo_env_var) - if text is None: + if not text: # None or "" return self.raw = text # as given by user, might contain placeholders @@ -518,6 +531,14 @@ def normpath_special(p): return ("/." 
+ p) if relative else p m = self.ssh_re.match(text) + if m: + self.proto = m.group("proto") + self.user = m.group("user") + self._host = m.group("host") + self.port = m.group("port") and int(m.group("port")) or None + self.path = normpath_special(m.group("path")) + return True + m = self.sftp_re.match(text) if m: self.proto = m.group("proto") self.user = m.group("user") @@ -837,9 +858,7 @@ class ItemFormatter(BaseFormatter): "flags": "file flags", "extra": 'prepends {target} with " -> " for soft links and " link to " for hard links', "size": "file size", - "dsize": "deduplicated size", "num_chunks": "number of chunks in this file", - "unique_chunks": "number of unique chunks in this file", "mtime": "file modification time", "ctime": "file change time", "atime": "file access time", @@ -853,14 +872,14 @@ class ItemFormatter(BaseFormatter): } KEY_GROUPS = ( ("type", "mode", "uid", "gid", "user", "group", "path", "target", "hlid", "flags"), - ("size", "dsize", "num_chunks", "unique_chunks"), + ("size", "num_chunks"), ("mtime", "ctime", "atime", "isomtime", "isoctime", "isoatime"), tuple(sorted(hash_algorithms)), ("archiveid", "archivename", "extra"), ("health",), ) - KEYS_REQUIRING_CACHE = ("dsize", "unique_chunks") + KEYS_REQUIRING_CACHE = () @classmethod def format_needs_cache(cls, format): @@ -878,9 +897,7 @@ def __init__(self, archive, format): self.format_keys = {f[1] for f in Formatter().parse(format)} self.call_keys = { "size": self.calculate_size, - "dsize": partial(self.sum_unique_chunks_metadata, lambda chunk: chunk.size), "num_chunks": self.calculate_num_chunks, - "unique_chunks": partial(self.sum_unique_chunks_metadata, lambda chunk: 1), "isomtime": partial(self.format_iso_time, "mtime"), "isoctime": partial(self.format_iso_time, "ctime"), "isoatime": partial(self.format_iso_time, "atime"), @@ -925,20 +942,6 @@ def get_item_data(self, item, jsonline=False): item_data[key] = self.call_keys[key](item) return item_data - def sum_unique_chunks_metadata(self, metadata_func, item): - """ - sum unique chunks metadata, a unique chunk is a chunk which is referenced globally as often as it is in the - item - - item: The item to sum its unique chunks' metadata - metadata_func: A function that takes a parameter of type ChunkIndexEntry and returns a number, used to return - the metadata needed from the chunk - """ - chunk_index = self.archive.cache.chunks - chunks = item.get("chunks", []) - chunks_counter = Counter(c.id for c in chunks) - return sum(metadata_func(c) for c in chunks if chunk_index[c.id].refcount == chunks_counter[c.id]) - def calculate_num_chunks(self, item): return len(item.get("chunks", [])) @@ -1181,19 +1184,21 @@ def ellipsis_truncate(msg, space): class BorgJsonEncoder(json.JSONEncoder): def default(self, o): + from ..legacyrepository import LegacyRepository from ..repository import Repository + from ..legacyremote import LegacyRemoteRepository from ..remote import RemoteRepository from ..archive import Archive - from ..cache import LocalCache, AdHocCache, AdHocWithFilesCache + from ..cache import AdHocCache, AdHocWithFilesCache - if isinstance(o, Repository) or isinstance(o, RemoteRepository): + if isinstance(o, (LegacyRepository, LegacyRemoteRepository)) or isinstance(o, (Repository, RemoteRepository)): return {"id": bin_to_hex(o.id), "location": o._location.canonical_path()} if isinstance(o, Archive): return o.info() - if isinstance(o, (LocalCache, AdHocWithFilesCache)): - return {"path": o.path, "stats": o.stats()} + if isinstance(o, (AdHocWithFilesCache,)): + return 
{"path": o.path} if isinstance(o, AdHocCache): - return {"stats": o.stats()} + return {} if callable(getattr(o, "to_json", None)): return o.to_json() return super().default(o) diff --git a/src/borg/helpers/process.py b/src/borg/helpers/process.py index 7f20089f41..1112f98071 100644 --- a/src/borg/helpers/process.py +++ b/src/borg/helpers/process.py @@ -6,6 +6,7 @@ import subprocess import sys import time +import threading import traceback from .. import __version__ @@ -82,9 +83,11 @@ def daemonizing(*, timeout=5): logger.debug("Daemonizing: Foreground process (%s, %s, %s) is waiting for background process..." % old_id) exit_code = EXIT_SUCCESS # Indeed, SIGHUP and SIGTERM handlers should have been set on archiver.run(). Just in case... - with signal_handler("SIGINT", raising_signal_handler(KeyboardInterrupt)), signal_handler( - "SIGHUP", raising_signal_handler(SigHup) - ), signal_handler("SIGTERM", raising_signal_handler(SigTerm)): + with ( + signal_handler("SIGINT", raising_signal_handler(KeyboardInterrupt)), + signal_handler("SIGHUP", raising_signal_handler(SigHup)), + signal_handler("SIGTERM", raising_signal_handler(SigTerm)), + ): try: if timeout > 0: time.sleep(timeout) @@ -244,8 +247,7 @@ def __exit__(self, exception_type, exception_value, traceback): self.ctx = None -# global flag which might trigger some special behaviour on first ctrl-c / SIGINT, -# e.g. if this is interrupting "borg create", it shall try to create a checkpoint. +# global flag which might trigger some special behaviour on first ctrl-c / SIGINT. sig_int = SigIntManager() @@ -397,3 +399,50 @@ def create_filter_process(cmd, stream, stream_close, inbound=True): if borg_succeeded and rc: # if borg did not succeed, we know that we killed the filter process raise Error("filter %s failed, rc=%d" % (cmd, rc)) + + +class ThreadRunner: + def __init__(self, sleep_interval, target, *args, **kwargs): + """ + Initialize the ThreadRunner with a target function and its arguments. + + :param sleep_interval: The interval (in seconds) to sleep between executions of the target function. + :param target: The target function to be run in the thread. + :param args: The positional arguments to be passed to the target function. + :param kwargs: The keyword arguments to be passed to the target function. + """ + self._target = target + self._args = args + self._kwargs = kwargs + self._sleep_interval = sleep_interval + self._thread = None + self._keep_running = threading.Event() + self._keep_running.set() + + def _run_with_termination(self): + """ + Wrapper function to check if the thread should keep running. + """ + while self._keep_running.is_set(): + self._target(*self._args, **self._kwargs) + # sleep up to self._sleep_interval, but end the sleep early if we shall not keep running: + count = 1000 + micro_sleep = float(self._sleep_interval) / count + while self._keep_running.is_set() and count > 0: + time.sleep(micro_sleep) + count -= 1 + + def start(self): + """ + Start the thread. + """ + self._thread = threading.Thread(target=self._run_with_termination) + self._thread.start() + + def terminate(self): + """ + Signal the thread to stop and wait for it to finish. 
+ """ + if self._thread is not None: + self._keep_running.clear() + self._thread.join() diff --git a/src/borg/legacyremote.py b/src/borg/legacyremote.py new file mode 100644 index 0000000000..496bd01e4e --- /dev/null +++ b/src/borg/legacyremote.py @@ -0,0 +1,953 @@ +import errno +import functools +import inspect +import logging +import os +import select +import shlex +import shutil +import socket +import struct +import sys +import tempfile +import textwrap +import time +from subprocess import Popen, PIPE + +from . import __version__ +from .compress import Compressor +from .constants import * # NOQA +from .helpers import Error, ErrorWithTraceback, IntegrityError +from .helpers import bin_to_hex +from .helpers import get_limited_unpacker +from .helpers import replace_placeholders +from .helpers import format_file_size +from .helpers import safe_unlink +from .helpers import prepare_subprocess_env, ignore_sigint +from .helpers import get_socket_filename +from .fslocking import LockTimeout, NotLocked, NotMyLock, LockFailed +from .logger import create_logger +from .helpers import msgpack +from .legacyrepository import LegacyRepository +from .version import parse_version, format_version +from .checksums import xxh64 +from .helpers.datastruct import EfficientCollectionQueue + +logger = create_logger(__name__) + +BORG_VERSION = parse_version(__version__) +MSGID, MSG, ARGS, RESULT, LOG = "i", "m", "a", "r", "l" + +MAX_INFLIGHT = 100 + +RATELIMIT_PERIOD = 0.1 + + +class ConnectionClosed(Error): + """Connection closed by remote host""" + + exit_mcode = 80 + + +class ConnectionClosedWithHint(ConnectionClosed): + """Connection closed by remote host. {}""" + + exit_mcode = 81 + + +class PathNotAllowed(Error): + """Repository path not allowed: {}""" + + exit_mcode = 83 + + +class InvalidRPCMethod(Error): + """RPC method {} is not valid""" + + exit_mcode = 82 + + +class UnexpectedRPCDataFormatFromClient(Error): + """Borg {}: Got unexpected RPC data format from client.""" + + exit_mcode = 85 + + +class UnexpectedRPCDataFormatFromServer(Error): + """Got unexpected RPC data format from server:\n{}""" + + exit_mcode = 86 + + def __init__(self, data): + try: + data = data.decode()[:128] + except UnicodeDecodeError: + data = data[:128] + data = ["%02X" % byte for byte in data] + data = textwrap.fill(" ".join(data), 16 * 3) + super().__init__(data) + + +class ConnectionBrokenWithHint(Error): + """Connection to remote host is broken. {}""" + + exit_mcode = 87 + + +# Protocol compatibility: +# In general the server is responsible for rejecting too old clients and the client it responsible for rejecting +# too old servers. This ensures that the knowledge what is compatible is always held by the newer component. +# +# For the client the return of the negotiate method is a dict which includes the server version. +# +# All method calls on the remote repository object must be allowlisted in RepositoryServer.rpc_methods and have api +# stubs in LegacyRemoteRepository. The @api decorator on these stubs is used to set server version requirements. +# +# Method parameters are identified only by name and never by position. Unknown parameters are ignored by the server. +# If a new parameter is important and may not be ignored, on the client a parameter specific version requirement needs +# to be added. +# When parameters are removed, they need to be preserved as defaulted parameters on the client stubs so that older +# servers still get compatible input. 
+ + +class SleepingBandwidthLimiter: + def __init__(self, limit): + if limit: + self.ratelimit = int(limit * RATELIMIT_PERIOD) + self.ratelimit_last = time.monotonic() + self.ratelimit_quota = self.ratelimit + else: + self.ratelimit = None + + def write(self, fd, to_send): + if self.ratelimit: + now = time.monotonic() + if self.ratelimit_last + RATELIMIT_PERIOD <= now: + self.ratelimit_quota += self.ratelimit + if self.ratelimit_quota > 2 * self.ratelimit: + self.ratelimit_quota = 2 * self.ratelimit + self.ratelimit_last = now + if self.ratelimit_quota == 0: + tosleep = self.ratelimit_last + RATELIMIT_PERIOD - now + time.sleep(tosleep) + self.ratelimit_quota += self.ratelimit + self.ratelimit_last = time.monotonic() + if len(to_send) > self.ratelimit_quota: + to_send = to_send[: self.ratelimit_quota] + try: + written = os.write(fd, to_send) + except BrokenPipeError: + raise ConnectionBrokenWithHint("Broken Pipe") from None + if self.ratelimit: + self.ratelimit_quota -= written + return written + + +def api(*, since, **kwargs_decorator): + """Check version requirements and use self.call to do the remote method call. + + specifies the version in which borg introduced this method. + Calling this method when connected to an older version will fail without transmitting anything to the server. + + Further kwargs can be used to encode version specific restrictions: + + is the value resulting in the behaviour before introducing the new parameter. + If a previous hardcoded behaviour is parameterized in a version, this allows calls that use the previously + hardcoded behaviour to pass through and generates an error if another behaviour is requested by the client. + E.g. when 'append_only' was introduced in 1.0.7 the previous behaviour was what now is append_only=False. + Thus @api(..., append_only={'since': parse_version('1.0.7'), 'previously': False}) allows calls + with append_only=False for all version but rejects calls using append_only=True on versions older than 1.0.7. + + is a flag to set the behaviour if an old version is called the new way. + If set to True, the method is called without the (not yet supported) parameter (this should be done if that is the + more desirable behaviour). If False, an exception is generated. + E.g. before 'threshold' was introduced in 1.2.0a8, a hardcoded threshold of 0.1 was used in commit(). 
+ """ + + def decorator(f): + @functools.wraps(f) + def do_rpc(self, *args, **kwargs): + sig = inspect.signature(f) + bound_args = sig.bind(self, *args, **kwargs) + named = {} # Arguments for the remote process + extra = {} # Arguments for the local process + for name, param in sig.parameters.items(): + if name == "self": + continue + if name in bound_args.arguments: + if name == "wait": + extra[name] = bound_args.arguments[name] + else: + named[name] = bound_args.arguments[name] + else: + if param.default is not param.empty: + named[name] = param.default + + if self.server_version < since: + raise self.RPCServerOutdated(f.__name__, format_version(since)) + + for name, restriction in kwargs_decorator.items(): + if restriction["since"] <= self.server_version: + continue + if "previously" in restriction and named[name] == restriction["previously"]: + continue + if restriction.get("dontcare", False): + continue + + raise self.RPCServerOutdated( + f"{f.__name__} {name}={named[name]!s}", format_version(restriction["since"]) + ) + + return self.call(f.__name__, named, **extra) + + return do_rpc + + return decorator + + +class LegacyRemoteRepository: + extra_test_args = [] # type: ignore + + class RPCError(Exception): + def __init__(self, unpacked): + # unpacked has keys: 'exception_args', 'exception_full', 'exception_short', 'sysinfo' + self.unpacked = unpacked + + def get_message(self): + return "\n".join(self.unpacked["exception_short"]) + + @property + def traceback(self): + return self.unpacked.get("exception_trace", True) + + @property + def exception_class(self): + return self.unpacked["exception_class"] + + @property + def exception_full(self): + return "\n".join(self.unpacked["exception_full"]) + + @property + def sysinfo(self): + return self.unpacked["sysinfo"] + + class RPCServerOutdated(Error): + """Borg server is too old for {}. Required version {}""" + + exit_mcode = 84 + + @property + def method(self): + return self.args[0] + + @property + def required_version(self): + return self.args[1] + + def __init__( + self, + location, + create=False, + exclusive=False, + lock_wait=None, + lock=True, + append_only=False, + make_parent_dirs=False, + args=None, + ): + self.location = self._location = location + self.preload_ids = [] + self.msgid = 0 + self.rx_bytes = 0 + self.tx_bytes = 0 + self.to_send = EfficientCollectionQueue(1024 * 1024, bytes) + self.stdin_fd = self.stdout_fd = self.stderr_fd = None + self.stderr_received = b"" # incomplete stderr line bytes received (no \n yet) + self.chunkid_to_msgids = {} + self.ignore_responses = set() + self.responses = {} + self.async_responses = {} + self.shutdown_time = None + self.ratelimit = SleepingBandwidthLimiter(args.upload_ratelimit * 1024 if args and args.upload_ratelimit else 0) + self.upload_buffer_size_limit = args.upload_buffer * 1024 * 1024 if args and args.upload_buffer else 0 + self.unpacker = get_limited_unpacker("client") + self.server_version = None # we update this after server sends its version + self.p = self.sock = None + self._args = args + if self.location.proto == "ssh": + testing = location.host == "__testsuite__" + # when testing, we invoke and talk to a borg process directly (no ssh). + # when not testing, we invoke the system-installed ssh binary to talk to a remote borg. 
+ env = prepare_subprocess_env(system=not testing) + borg_cmd = self.borg_cmd(args, testing) + if not testing: + borg_cmd = self.ssh_cmd(location) + borg_cmd + logger.debug("SSH command line: %s", borg_cmd) + # we do not want the ssh getting killed by Ctrl-C/SIGINT because it is needed for clean shutdown of borg. + self.p = Popen(borg_cmd, bufsize=0, stdin=PIPE, stdout=PIPE, stderr=PIPE, env=env, preexec_fn=ignore_sigint) + self.stdin_fd = self.p.stdin.fileno() + self.stdout_fd = self.p.stdout.fileno() + self.stderr_fd = self.p.stderr.fileno() + self.r_fds = [self.stdout_fd, self.stderr_fd] + self.x_fds = [self.stdin_fd, self.stdout_fd, self.stderr_fd] + elif self.location.proto == "socket": + if args.use_socket is False or args.use_socket is True: # nothing or --socket + socket_path = get_socket_filename() + else: # --socket=/some/path + socket_path = args.use_socket + self.sock = socket.socket(family=socket.AF_UNIX, type=socket.SOCK_STREAM) + try: + self.sock.connect(socket_path) # note: socket_path length is rather limited. + except FileNotFoundError: + self.sock = None + raise Error(f"The socket file {socket_path} does not exist.") + except ConnectionRefusedError: + self.sock = None + raise Error(f"There is no borg serve running for the socket file {socket_path}.") + self.stdin_fd = self.sock.makefile("wb").fileno() + self.stdout_fd = self.sock.makefile("rb").fileno() + self.stderr_fd = None + self.r_fds = [self.stdout_fd] + self.x_fds = [self.stdin_fd, self.stdout_fd] + else: + raise Error(f"Unsupported protocol {location.proto}") + + os.set_blocking(self.stdin_fd, False) + assert not os.get_blocking(self.stdin_fd) + os.set_blocking(self.stdout_fd, False) + assert not os.get_blocking(self.stdout_fd) + if self.stderr_fd is not None: + os.set_blocking(self.stderr_fd, False) + assert not os.get_blocking(self.stderr_fd) + + try: + try: + version = self.call("negotiate", {"client_data": {"client_version": BORG_VERSION}}) + except ConnectionClosed: + raise ConnectionClosedWithHint("Is borg working on the server?") from None + if isinstance(version, dict): + self.server_version = version["server_version"] + else: + raise Exception("Server insisted on using unsupported protocol version %s" % version) + + self.id = self.open( + path=self.location.path, + create=create, + lock_wait=lock_wait, + lock=lock, + exclusive=exclusive, + append_only=append_only, + make_parent_dirs=make_parent_dirs, + v1_or_v2=True, # make remote use LegacyRepository + ) + info = self.info() + self.version = info["version"] + self.append_only = info["append_only"] + + except Exception: + self.close() + raise + + def __del__(self): + if len(self.responses): + logging.debug("still %d cached responses left in LegacyRemoteRepository" % (len(self.responses),)) + if self.p or self.sock: + self.close() + assert False, "cleanup happened in LegacyRemoteRepository.__del__" + + def __repr__(self): + return f"<{self.__class__.__name__} {self.location.canonical_path()}>" + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + try: + if exc_type is not None: + self.shutdown_time = time.monotonic() + 30 + self.rollback() + finally: + # in any case, we want to close the repo cleanly, even if the + # rollback can not succeed (e.g. 
because the connection was + # already closed) and raised another exception: + logger.debug( + "LegacyRemoteRepository: %s bytes sent, %s bytes received, %d messages sent", + format_file_size(self.tx_bytes), + format_file_size(self.rx_bytes), + self.msgid, + ) + self.close() + + @property + def id_str(self): + return bin_to_hex(self.id) + + def borg_cmd(self, args, testing): + """return a borg serve command line""" + # give some args/options to 'borg serve' process as they were given to us + opts = [] + if args is not None: + root_logger = logging.getLogger() + if root_logger.isEnabledFor(logging.DEBUG): + opts.append("--debug") + elif root_logger.isEnabledFor(logging.INFO): + opts.append("--info") + elif root_logger.isEnabledFor(logging.WARNING): + pass # warning is default + elif root_logger.isEnabledFor(logging.ERROR): + opts.append("--error") + elif root_logger.isEnabledFor(logging.CRITICAL): + opts.append("--critical") + else: + raise ValueError("log level missing, fix this code") + + # Tell the remote server about debug topics it may need to consider. + # Note that debug topics are usable for "spew" or "trace" logs which would + # be too plentiful to transfer for normal use, so the server doesn't send + # them unless explicitly enabled. + # + # Needless to say, if you do --debug-topic=repository.compaction, for example, + # with a 1.0.x server it won't work, because the server does not recognize the + # option. + # + # This is not considered a problem, since this is a debugging feature that + # should not be used for regular use. + for topic in args.debug_topics: + if "." not in topic: + topic = "borg.debug." + topic + if "repository" in topic: + opts.append("--debug-topic=%s" % topic) + + if "storage_quota" in args and args.storage_quota: + opts.append("--storage-quota=%s" % args.storage_quota) + env_vars = [] + if testing: + return env_vars + [sys.executable, "-m", "borg", "serve"] + opts + self.extra_test_args + else: # pragma: no cover + remote_path = args.remote_path or os.environ.get("BORG_REMOTE_PATH", "borg") + remote_path = replace_placeholders(remote_path) + return env_vars + [remote_path, "serve"] + opts + + def ssh_cmd(self, location): + """return a ssh command line that can be prefixed to a borg command line""" + rsh = self._args.rsh or os.environ.get("BORG_RSH", "ssh") + args = shlex.split(rsh) + if location.port: + args += ["-p", str(location.port)] + if location.user: + args.append(f"{location.user}@{location.host}") + else: + args.append("%s" % location.host) + return args + + def call(self, cmd, args, **kw): + for resp in self.call_many(cmd, [args], **kw): + return resp + + def call_many(self, cmd, calls, wait=True, is_preloaded=False, async_wait=True): + if not calls and cmd != "async_responses": + return + + def send_buffer(): + if self.to_send: + try: + written = self.ratelimit.write(self.stdin_fd, self.to_send.peek_front()) + self.tx_bytes += written + self.to_send.pop_front(written) + except OSError as e: + # io.write might raise EAGAIN even though select indicates + # that the fd should be writable. + # EWOULDBLOCK is added for defensive programming sake. 
+ if e.errno not in [errno.EAGAIN, errno.EWOULDBLOCK]: + raise + + def pop_preload_msgid(chunkid): + msgid = self.chunkid_to_msgids[chunkid].pop(0) + if not self.chunkid_to_msgids[chunkid]: + del self.chunkid_to_msgids[chunkid] + return msgid + + def handle_error(unpacked): + if "exception_class" not in unpacked: + return + + error = unpacked["exception_class"] + args = unpacked["exception_args"] + + if error == "Error": + raise Error(args[0]) + elif error == "ErrorWithTraceback": + raise ErrorWithTraceback(args[0]) + elif error == "DoesNotExist": + raise LegacyRepository.DoesNotExist(self.location.processed) + elif error == "AlreadyExists": + raise LegacyRepository.AlreadyExists(self.location.processed) + elif error == "CheckNeeded": + raise LegacyRepository.CheckNeeded(self.location.processed) + elif error == "IntegrityError": + raise IntegrityError(args[0]) + elif error == "PathNotAllowed": + raise PathNotAllowed(args[0]) + elif error == "PathPermissionDenied": + raise LegacyRepository.PathPermissionDenied(args[0]) + elif error == "ParentPathDoesNotExist": + raise LegacyRepository.ParentPathDoesNotExist(args[0]) + elif error == "ObjectNotFound": + raise LegacyRepository.ObjectNotFound(args[0], self.location.processed) + elif error == "InvalidRPCMethod": + raise InvalidRPCMethod(args[0]) + elif error == "LockTimeout": + raise LockTimeout(args[0]) + elif error == "LockFailed": + raise LockFailed(args[0], args[1]) + elif error == "NotLocked": + raise NotLocked(args[0]) + elif error == "NotMyLock": + raise NotMyLock(args[0]) + else: + raise self.RPCError(unpacked) + + calls = list(calls) + waiting_for = [] + maximum_to_send = 0 if wait else self.upload_buffer_size_limit + send_buffer() # Try to send data, as some cases (async_response) will never try to send data otherwise. + while wait or calls: + if self.shutdown_time and time.monotonic() > self.shutdown_time: + # we are shutting this LegacyRemoteRepository down already, make sure we do not waste + # a lot of time in case a lot of async stuff is coming in or remote is gone or slow. + logger.debug( + "shutdown_time reached, shutting down with %d waiting_for and %d async_responses.", + len(waiting_for), + len(self.async_responses), + ) + return + while waiting_for: + try: + unpacked = self.responses.pop(waiting_for[0]) + waiting_for.pop(0) + handle_error(unpacked) + yield unpacked[RESULT] + if not waiting_for and not calls: + return + except KeyError: + break + if cmd == "async_responses": + while True: + try: + msgid, unpacked = self.async_responses.popitem() + except KeyError: + # there is nothing left what we already have received + if async_wait and self.ignore_responses: + # but do not return if we shall wait and there is something left to wait for: + break + else: + return + else: + handle_error(unpacked) + yield unpacked[RESULT] + if self.to_send or ((calls or self.preload_ids) and len(waiting_for) < MAX_INFLIGHT): + w_fds = [self.stdin_fd] + else: + w_fds = [] + r, w, x = select.select(self.r_fds, w_fds, self.x_fds, 1) + if x: + raise Exception("FD exception occurred") + for fd in r: + if fd is self.stdout_fd: + data = os.read(fd, BUFSIZE) + if not data: + raise ConnectionClosed() + self.rx_bytes += len(data) + self.unpacker.feed(data) + for unpacked in self.unpacker: + if not isinstance(unpacked, dict): + raise UnexpectedRPCDataFormatFromServer(data) + + lr_dict = unpacked.get(LOG) + if lr_dict is not None: + # Re-emit remote log messages locally. 
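# lr_dict carries the constructor arguments of logging.LogRecord (name, level, pathname,
# lineno, msg, args, exc_info, ...), so the record can be rebuilt below and handled by the
# local logger of the same name.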
+ _logger = logging.getLogger(lr_dict["name"]) + if _logger.isEnabledFor(lr_dict["level"]): + _logger.handle(logging.LogRecord(**lr_dict)) + continue + + msgid = unpacked[MSGID] + if msgid in self.ignore_responses: + self.ignore_responses.remove(msgid) + # async methods never return values, but may raise exceptions. + if "exception_class" in unpacked: + self.async_responses[msgid] = unpacked + else: + # we currently do not have async result values except "None", + # so we do not add them into async_responses. + if unpacked[RESULT] is not None: + self.async_responses[msgid] = unpacked + else: + self.responses[msgid] = unpacked + elif fd is self.stderr_fd: + data = os.read(fd, 32768) + if not data: + raise ConnectionClosed() + self.rx_bytes += len(data) + # deal with incomplete lines (may appear due to block buffering) + if self.stderr_received: + data = self.stderr_received + data + self.stderr_received = b"" + lines = data.splitlines(keepends=True) + if lines and not lines[-1].endswith((b"\r", b"\n")): + self.stderr_received = lines.pop() + # now we have complete lines in and any partial line in self.stderr_received. + _logger = logging.getLogger() + for line in lines: + # borg serve (remote/server side) should not emit stuff on stderr, + # but e.g. the ssh process (local/client side) might output errors there. + assert line.endswith((b"\r", b"\n")) + # something came in on stderr, log it to not lose it. + # decode late, avoid partial utf-8 sequences. + _logger.warning("stderr: " + line.decode().strip()) + if w: + while ( + (len(self.to_send) <= maximum_to_send) + and (calls or self.preload_ids) + and len(waiting_for) < MAX_INFLIGHT + ): + if calls: + if is_preloaded: + assert cmd == "get", "is_preload is only supported for 'get'" + if calls[0]["id"] in self.chunkid_to_msgids: + waiting_for.append(pop_preload_msgid(calls.pop(0)["id"])) + else: + args = calls.pop(0) + if cmd == "get" and args["id"] in self.chunkid_to_msgids: + waiting_for.append(pop_preload_msgid(args["id"])) + else: + self.msgid += 1 + waiting_for.append(self.msgid) + self.to_send.push_back(msgpack.packb({MSGID: self.msgid, MSG: cmd, ARGS: args})) + if not self.to_send and self.preload_ids: + chunk_id = self.preload_ids.pop(0) + args = {"id": chunk_id} + self.msgid += 1 + self.chunkid_to_msgids.setdefault(chunk_id, []).append(self.msgid) + self.to_send.push_back(msgpack.packb({MSGID: self.msgid, MSG: "get", ARGS: args})) + + send_buffer() + self.ignore_responses |= set(waiting_for) # we lose order here + + @api( + since=parse_version("1.0.0"), + append_only={"since": parse_version("1.0.7"), "previously": False}, + make_parent_dirs={"since": parse_version("1.1.9"), "previously": False}, + v1_or_v2={"since": parse_version("2.0.0b8"), "previously": True}, # TODO fix version + ) + def open( + self, + path, + create=False, + lock_wait=None, + lock=True, + exclusive=False, + append_only=False, + make_parent_dirs=False, + v1_or_v2=False, + ): + """actual remoting is done via self.call in the @api decorator""" + + @api(since=parse_version("2.0.0a3")) + def info(self): + """actual remoting is done via self.call in the @api decorator""" + + @api(since=parse_version("1.0.0"), max_duration={"since": parse_version("1.2.0a4"), "previously": 0}) + def check(self, repair=False, max_duration=0): + """actual remoting is done via self.call in the @api decorator""" + + @api( + since=parse_version("1.0.0"), + compact={"since": parse_version("1.2.0a0"), "previously": True, "dontcare": True}, + threshold={"since": parse_version("1.2.0a8"), 
"previously": 0.1, "dontcare": True}, + ) + def commit(self, compact=True, threshold=0.1): + """actual remoting is done via self.call in the @api decorator""" + + @api(since=parse_version("1.0.0")) + def rollback(self): + """actual remoting is done via self.call in the @api decorator""" + + @api(since=parse_version("1.0.0")) + def destroy(self): + """actual remoting is done via self.call in the @api decorator""" + + @api(since=parse_version("1.0.0")) + def __len__(self): + """actual remoting is done via self.call in the @api decorator""" + + @api(since=parse_version("1.0.0")) + def list(self, limit=None, marker=None): + """actual remoting is done via self.call in the @api decorator""" + + def get(self, id, read_data=True): + for resp in self.get_many([id], read_data=read_data): + return resp + + def get_many(self, ids, read_data=True, is_preloaded=False): + yield from self.call_many("get", [{"id": id, "read_data": read_data} for id in ids], is_preloaded=is_preloaded) + + @api(since=parse_version("1.0.0")) + def put(self, id, data, wait=True): + """actual remoting is done via self.call in the @api decorator""" + + @api(since=parse_version("1.0.0")) + def delete(self, id, wait=True): + """actual remoting is done via self.call in the @api decorator""" + + @api(since=parse_version("1.0.0")) + def save_key(self, keydata): + """actual remoting is done via self.call in the @api decorator""" + + @api(since=parse_version("1.0.0")) + def load_key(self): + """actual remoting is done via self.call in the @api decorator""" + + @api(since=parse_version("1.0.0")) + def break_lock(self): + """actual remoting is done via self.call in the @api decorator""" + + def close(self): + if self.p or self.sock: + self.call("close", {}, wait=True) + if self.p: + self.p.stdin.close() + self.p.stdout.close() + self.p.wait() + self.p = None + if self.sock: + try: + self.sock.shutdown(socket.SHUT_RDWR) + except OSError as e: + if e.errno != errno.ENOTCONN: + raise + self.sock.close() + self.sock = None + + def async_response(self, wait=True): + for resp in self.call_many("async_responses", calls=[], wait=True, async_wait=wait): + return resp + + def preload(self, ids): + self.preload_ids += ids + + @api(since=parse_version("2.0.0b8")) + def get_manifest(self): + """actual remoting is done via self.call in the @api decorator""" + + @api(since=parse_version("2.0.0b8")) + def put_manifest(self, data): + """actual remoting is done via self.call in the @api decorator""" + + +class RepositoryNoCache: + """A not caching Repository wrapper, passes through to repository. + + Just to have same API (including the context manager) as RepositoryCache. + + *transform* is a callable taking two arguments, key and raw repository data. + The return value is returned from get()/get_many(). By default, the raw + repository data is returned. + """ + + def __init__(self, repository, transform=None): + self.repository = repository + self.transform = transform or (lambda key, data: data) + + def close(self): + pass + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + def get(self, key, read_data=True): + return next(self.get_many([key], read_data=read_data, cache=False)) + + def get_many(self, keys, read_data=True, cache=True): + for key, data in zip(keys, self.repository.get_many(keys, read_data=read_data)): + yield self.transform(key, data) + + def log_instrumentation(self): + pass + + +class RepositoryCache(RepositoryNoCache): + """ + A caching Repository wrapper. 
+ + Caches Repository GET operations locally. + + *pack* and *unpack* complement *transform* of the base class. + *pack* receives the output of *transform* and should return bytes, + which are stored in the cache. *unpack* receives these bytes and + should return the initial data (as returned by *transform*). + """ + + def __init__(self, repository, pack=None, unpack=None, transform=None): + super().__init__(repository, transform) + self.pack = pack or (lambda data: data) + self.unpack = unpack or (lambda data: data) + self.cache = set() + self.basedir = tempfile.mkdtemp(prefix="borg-cache-") + self.query_size_limit() + self.size = 0 + # Instrumentation + self.hits = 0 + self.misses = 0 + self.slow_misses = 0 + self.slow_lat = 0.0 + self.evictions = 0 + self.enospc = 0 + + def query_size_limit(self): + available_space = shutil.disk_usage(self.basedir).free + self.size_limit = int(min(available_space * 0.25, 2**31)) + + def prefixed_key(self, key, complete): + # just prefix another byte telling whether this key refers to a complete chunk + # or a without-data-metadata-only chunk (see also read_data param). + prefix = b"\x01" if complete else b"\x00" + return prefix + key + + def key_filename(self, key): + return os.path.join(self.basedir, bin_to_hex(key)) + + def backoff(self): + self.query_size_limit() + target_size = int(0.9 * self.size_limit) + while self.size > target_size and self.cache: + key = self.cache.pop() + file = self.key_filename(key) + self.size -= os.stat(file).st_size + os.unlink(file) + self.evictions += 1 + + def add_entry(self, key, data, cache, complete): + transformed = self.transform(key, data) + if not cache: + return transformed + packed = self.pack(transformed) + pkey = self.prefixed_key(key, complete=complete) + file = self.key_filename(pkey) + try: + with open(file, "wb") as fd: + fd.write(packed) + except OSError as os_error: + try: + safe_unlink(file) + except FileNotFoundError: + pass # open() could have failed as well + if os_error.errno == errno.ENOSPC: + self.enospc += 1 + self.backoff() + else: + raise + else: + self.size += len(packed) + self.cache.add(pkey) + if self.size > self.size_limit: + self.backoff() + return transformed + + def log_instrumentation(self): + logger.debug( + "RepositoryCache: current items %d, size %s / %s, %d hits, %d misses, %d slow misses (+%.1fs), " + "%d evictions, %d ENOSPC hit", + len(self.cache), + format_file_size(self.size), + format_file_size(self.size_limit), + self.hits, + self.misses, + self.slow_misses, + self.slow_lat, + self.evictions, + self.enospc, + ) + + def close(self): + self.log_instrumentation() + self.cache.clear() + shutil.rmtree(self.basedir) + + def get_many(self, keys, read_data=True, cache=True): + # It could use different cache keys depending on read_data and cache full vs. meta-only chunks. 
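# prefixed_key() prepends b"\x01" (complete chunk) or b"\x00" (metadata-only chunk) to the
# id, so both variants of the same chunk can coexist in the cache.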
+ unknown_keys = [key for key in keys if self.prefixed_key(key, complete=read_data) not in self.cache] + repository_iterator = zip(unknown_keys, self.repository.get_many(unknown_keys, read_data=read_data)) + for key in keys: + pkey = self.prefixed_key(key, complete=read_data) + if pkey in self.cache: + file = self.key_filename(pkey) + with open(file, "rb") as fd: + self.hits += 1 + yield self.unpack(fd.read()) + else: + for key_, data in repository_iterator: + if key_ == key: + transformed = self.add_entry(key, data, cache, complete=read_data) + self.misses += 1 + yield transformed + break + else: + # slow path: eviction during this get_many removed this key from the cache + t0 = time.perf_counter() + data = self.repository.get(key, read_data=read_data) + self.slow_lat += time.perf_counter() - t0 + transformed = self.add_entry(key, data, cache, complete=read_data) + self.slow_misses += 1 + yield transformed + # Consume any pending requests + for _ in repository_iterator: + pass + + +def cache_if_remote(repository, *, decrypted_cache=False, pack=None, unpack=None, transform=None, force_cache=False): + """ + Return a Repository(No)Cache for *repository*. + + If *decrypted_cache* is a repo_objs object, then get and get_many will return a tuple + (csize, plaintext) instead of the actual data in the repository. The cache will + store decrypted data, which increases CPU efficiency (by avoiding repeatedly decrypting + and more importantly MAC and ID checking cached objects). + Internally, objects are compressed with LZ4. + """ + if decrypted_cache and (pack or unpack or transform): + raise ValueError("decrypted_cache and pack/unpack/transform are incompatible") + elif decrypted_cache: + repo_objs = decrypted_cache + # 32 bit csize, 64 bit (8 byte) xxh64, 1 byte ctype, 1 byte clevel + cache_struct = struct.Struct("=I8sBB") + compressor = Compressor("lz4") + + def pack(data): + csize, decrypted = data + meta, compressed = compressor.compress({}, decrypted) + return cache_struct.pack(csize, xxh64(compressed), meta["ctype"], meta["clevel"]) + compressed + + def unpack(data): + data = memoryview(data) + csize, checksum, ctype, clevel = cache_struct.unpack(data[: cache_struct.size]) + compressed = data[cache_struct.size :] + if checksum != xxh64(compressed): + raise IntegrityError("detected corrupted data in metadata cache") + meta = dict(ctype=ctype, clevel=clevel, csize=len(compressed)) + _, decrypted = compressor.decompress(meta, compressed) + return csize, decrypted + + def transform(id_, data): + meta, decrypted = repo_objs.parse(id_, data, ro_type=ROBJ_DONTCARE) + csize = meta.get("csize", len(data)) + return csize, decrypted + + if isinstance(repository, LegacyRemoteRepository) or force_cache: + return RepositoryCache(repository, pack, unpack, transform) + else: + return RepositoryNoCache(repository, transform) diff --git a/src/borg/legacyrepository.py b/src/borg/legacyrepository.py new file mode 100644 index 0000000000..eebf83bcbc --- /dev/null +++ b/src/borg/legacyrepository.py @@ -0,0 +1,1824 @@ +import errno +import mmap +import os +import shutil +import stat +import struct +import time +from collections import defaultdict +from configparser import ConfigParser +from datetime import datetime, timezone +from functools import partial +from itertools import islice +from typing import Callable, DefaultDict + +from .constants import * # NOQA +from .hashindex import NSIndexEntry, NSIndex, NSIndex1, hashindex_variant +from .helpers import Error, ErrorWithTraceback, IntegrityError, format_file_size, 
parse_file_size +from .helpers import Location +from .helpers import ProgressIndicatorPercent +from .helpers import bin_to_hex, hex_to_bin +from .helpers import secure_erase, safe_unlink +from .helpers import msgpack +from .helpers.lrucache import LRUCache +from .fslocking import Lock, LockError, LockErrorT +from .logger import create_logger +from .manifest import Manifest, NoManifestError +from .platform import SaveFile, SyncFile, sync_dir, safe_fadvise +from .repoobj import RepoObj +from .checksums import crc32, StreamingXXH64 +from .crypto.file_integrity import IntegrityCheckedFile, FileIntegrityError + +logger = create_logger(__name__) + +MAGIC = b"BORG_SEG" +MAGIC_LEN = len(MAGIC) + +TAG_PUT = 0 +TAG_DELETE = 1 +TAG_COMMIT = 2 +TAG_PUT2 = 3 + +# Highest ID usable as TAG_* value +# +# Code may expect not to find any tags exceeding this value. In particular, +# in order to speed up `borg check --repair`, any tag greater than MAX_TAG_ID +# is assumed to be corrupted. When increasing this value, in order to add more +# tags, keep in mind that old versions of Borg accessing a new repository +# may not be able to handle the new tags. +MAX_TAG_ID = 15 + +FreeSpace: Callable[[], DefaultDict] = partial(defaultdict, int) + + +def header_size(tag): + if tag == TAG_PUT2: + size = LoggedIO.HEADER_ID_SIZE + LoggedIO.ENTRY_HASH_SIZE + elif tag == TAG_PUT or tag == TAG_DELETE: + size = LoggedIO.HEADER_ID_SIZE + elif tag == TAG_COMMIT: + size = LoggedIO.header_fmt.size + else: + raise ValueError(f"unsupported tag: {tag!r}") + return size + + +class LegacyRepository: + """ + Filesystem based transactional key value store + + Transactionality is achieved by using a log (aka journal) to record changes. The log is a series of numbered files + called segments. Each segment is a series of log entries. The segment number together with the offset of each + entry relative to its segment start establishes an ordering of the log entries. This is the "definition" of + time for the purposes of the log. + + Log entries are either PUT, DELETE or COMMIT. + + A COMMIT is always the final log entry in a segment and marks all data from the beginning of the log until the + segment ending with the COMMIT as committed and consistent. The segment number of a segment ending with a COMMIT + is called the transaction ID of that commit, and a segment ending with a COMMIT is called committed. + + When reading from a repository it is first checked whether the last segment is committed. If it is not, then + all segments after the last committed segment are deleted; they contain log entries whose consistency is not + established by a COMMIT. + + Note that the COMMIT can't establish consistency by itself, but only manages to do so with proper support from + the platform (including the hardware). See platform.base.SyncFile for details. + + A PUT inserts a key-value pair. The value is stored in the log entry, hence the repository implements + full data logging, meaning that all data is consistent, not just metadata (which is common in file systems). + + A DELETE marks a key as deleted. + + For a given key only the last entry regarding the key, which is called current (all other entries are called + superseded), is relevant: If there is no entry or the last entry is a DELETE then the key does not exist. + Otherwise the last PUT defines the value of the key. + + By superseding a PUT (with either another PUT or a DELETE) the log entry becomes obsolete. 
A segment containing + such obsolete entries is called sparse, while a segment containing no such entries is called compact. + + Sparse segments can be compacted and thereby disk space freed. This destroys the transaction for which the + superseded entries where current. + + On disk layout: + + dir/README + dir/config + dir/data// + dir/index.X + dir/hints.X + + File system interaction + ----------------------- + + LoggedIO generally tries to rely on common behaviours across transactional file systems. + + Segments that are deleted are truncated first, which avoids problems if the FS needs to + allocate space to delete the dirent of the segment. This mostly affects CoW file systems, + traditional journaling file systems have a fairly good grip on this problem. + + Note that deletion, i.e. unlink(2), is atomic on every file system that uses inode reference + counts, which includes pretty much all of them. To remove a dirent the inodes refcount has + to be decreased, but you can't decrease the refcount before removing the dirent nor can you + decrease the refcount after removing the dirent. File systems solve this with a lock, + and by ensuring it all stays within the same FS transaction. + + Truncation is generally not atomic in itself, and combining truncate(2) and unlink(2) is of + course never guaranteed to be atomic. Truncation in a classic extent-based FS is done in + roughly two phases, first the extents are removed then the inode is updated. (In practice + this is of course way more complex). + + LoggedIO gracefully handles truncate/unlink splits as long as the truncate resulted in + a zero length file. Zero length segments are considered not to exist, while LoggedIO.cleanup() + will still get rid of them. + """ + + class AlreadyExists(Error): + """A repository already exists at {}.""" + + exit_mcode = 10 + + class CheckNeeded(ErrorWithTraceback): + """Inconsistency detected. Please run "borg check {}".""" + + exit_mcode = 12 + + class DoesNotExist(Error): + """Repository {} does not exist.""" + + exit_mcode = 13 + + class InsufficientFreeSpaceError(Error): + """Insufficient free space to complete transaction (required: {}, available: {}).""" + + exit_mcode = 14 + + class InvalidRepository(Error): + """{} is not a valid repository. Check repo config.""" + + exit_mcode = 15 + + class InvalidRepositoryConfig(Error): + """{} does not have a valid configuration. Check repo config [{}].""" + + exit_mcode = 16 + + class ObjectNotFound(ErrorWithTraceback): + """Object with key {} not found in repository {}.""" + + exit_mcode = 17 + + def __init__(self, id, repo): + if isinstance(id, bytes): + id = bin_to_hex(id) + super().__init__(id, repo) + + class ParentPathDoesNotExist(Error): + """The parent path of the repo directory [{}] does not exist.""" + + exit_mcode = 18 + + class PathAlreadyExists(Error): + """There is already something at {}.""" + + exit_mcode = 19 + + class StorageQuotaExceeded(Error): + """The storage quota ({}) has been exceeded ({}). 
Try deleting some archives.""" + + exit_mcode = 20 + + class PathPermissionDenied(Error): + """Permission denied to {}.""" + + exit_mcode = 21 + + def __init__( + self, + path, + create=False, + exclusive=False, + lock_wait=None, + lock=True, + append_only=False, + storage_quota=None, + make_parent_dirs=False, + send_log_cb=None, + ): + self.path = os.path.abspath(path) + self._location = Location("file://%s" % self.path) + self.version = None + # long-running repository methods which emit log or progress output are responsible for calling + # the ._send_log method periodically to get log and progress output transferred to the borg client + # in a timely manner, in case we have a LegacyRemoteRepository. + # for local repositories ._send_log can be called also (it will just do nothing in that case). + self._send_log = send_log_cb or (lambda: None) + self.io = None # type: LoggedIO + self.lock = None + self.index = None + # This is an index of shadowed log entries during this transaction. Consider the following sequence: + # segment_n PUT A, segment_x DELETE A + # After the "DELETE A" in segment_x the shadow index will contain "A -> [n]". + # .delete() is updating this index, it is persisted into "hints" file and is later used by .compact_segments(). + # We need the entries in the shadow_index to not accidentally drop the "DELETE A" when we compact segment_x + # only (and we do not compact segment_n), because DELETE A is still needed then because PUT A will be still + # there. Otherwise chunk A would reappear although it was previously deleted. + self.shadow_index = {} + self._active_txn = False + self.lock_wait = lock_wait + self.do_lock = lock + self.do_create = create + self.created = False + self.exclusive = exclusive + self.append_only = append_only + self.storage_quota = storage_quota + self.storage_quota_use = 0 + self.transaction_doomed = None + self.make_parent_dirs = make_parent_dirs + # v2 is the default repo version for borg 2.0 + # v1 repos must only be used in a read-only way, e.g. for + # --other-repo=V1_REPO with borg init and borg transfer! + self.acceptable_repo_versions = (1, 2) + + def __del__(self): + if self.lock: + self.close() + assert False, "cleanup happened in Repository.__del__" + + def __repr__(self): + return f"<{self.__class__.__name__} {self.path}>" + + def __enter__(self): + if self.do_create: + self.do_create = False + self.create(self.path) + self.created = True + self.open(self.path, bool(self.exclusive), lock_wait=self.lock_wait, lock=self.do_lock) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_type is not None: + no_space_left_on_device = exc_type is OSError and exc_val.errno == errno.ENOSPC + # The ENOSPC could have originated somewhere else besides the Repository. The cleanup is always safe, unless + # EIO or FS corruption ensues, which is why we specifically check for ENOSPC. + if self._active_txn and no_space_left_on_device: + logger.warning("No space left on device, cleaning up partial transaction to free space.") + cleanup = True + else: + cleanup = False + self._rollback(cleanup=cleanup) + self.close() + + @property + def id_str(self): + return bin_to_hex(self.id) + + @staticmethod + def is_repository(path): + """Check whether there is already a Borg repository at *path*.""" + try: + # Use binary mode to avoid troubles if a README contains some stuff not in our locale + with open(os.path.join(path, "README"), "rb") as fd: + # Read only the first ~100 bytes (if any), in case some README file we stumble upon is large. 
+ readme_head = fd.read(100) + # The first comparison captures our current variant (REPOSITORY_README), the second comparison + # is an older variant of the README file (used by 1.0.x). + return b"Borg Backup repository" in readme_head or b"Borg repository" in readme_head + except OSError: + # Ignore FileNotFound, PermissionError, ... + return False + + def check_can_create_repository(self, path): + """ + Raise an exception if a repository already exists at *path* or any parent directory. + + Checking parent directories is done for two reasons: + (1) It's just a weird thing to do, and usually not intended. A Borg using the "parent" repository + may be confused, or we may accidentally put stuff into the "data/" or "data//" directories. + (2) When implementing repository quotas (which we currently don't), it's important to prohibit + folks from creating quota-free repositories. Since no one can create a repository within another + repository, user's can only use the quota'd repository, when their --restrict-to-path points + at the user's repository. + """ + try: + st = os.stat(path) + except FileNotFoundError: + pass # nothing there! + except PermissionError: + raise self.PathPermissionDenied(path) from None + else: + # there is something already there! + if self.is_repository(path): + raise self.AlreadyExists(path) + if not stat.S_ISDIR(st.st_mode): + raise self.PathAlreadyExists(path) + try: + files = os.listdir(path) + except PermissionError: + raise self.PathPermissionDenied(path) from None + else: + if files: # a dir, but not empty + raise self.PathAlreadyExists(path) + else: # an empty directory is acceptable for us. + pass + + while True: + # Check all parent directories for Borg's repository README + previous_path = path + # Thus, path = previous_path/.. + path = os.path.abspath(os.path.join(previous_path, os.pardir)) + if path == previous_path: + # We reached the root of the directory hierarchy (/.. = / and C:\.. = C:\). 
+ break + if self.is_repository(path): + raise self.AlreadyExists(path) + + def create(self, path): + """Create a new empty repository at `path`""" + self.check_can_create_repository(path) + if self.make_parent_dirs: + parent_path = os.path.join(path, os.pardir) + os.makedirs(parent_path, exist_ok=True) + if not os.path.exists(path): + try: + os.mkdir(path) + except FileNotFoundError as err: + raise self.ParentPathDoesNotExist(path) from err + with open(os.path.join(path, "README"), "w") as fd: + fd.write(REPOSITORY_README) + os.mkdir(os.path.join(path, "data")) + config = ConfigParser(interpolation=None) + config.add_section("repository") + self.version = 2 + config.set("repository", "version", str(self.version)) + config.set("repository", "segments_per_dir", str(DEFAULT_SEGMENTS_PER_DIR)) + config.set("repository", "max_segment_size", str(DEFAULT_MAX_SEGMENT_SIZE)) + config.set("repository", "append_only", str(int(self.append_only))) + if self.storage_quota: + config.set("repository", "storage_quota", str(self.storage_quota)) + else: + config.set("repository", "storage_quota", "0") + config.set("repository", "additional_free_space", "0") + config.set("repository", "id", bin_to_hex(os.urandom(32))) + self.save_config(path, config) + + def save_config(self, path, config): + config_path = os.path.join(path, "config") + old_config_path = os.path.join(path, "config.old") + + if os.path.isfile(old_config_path): + logger.warning("Old config file not securely erased on previous config update") + secure_erase(old_config_path, avoid_collateral_damage=True) + + if os.path.isfile(config_path): + link_error_msg = ( + "Failed to erase old repository config file securely (hardlinks not supported). " + "Old repokey data, if any, might persist on physical storage." + ) + try: + os.link(config_path, old_config_path) + except OSError as e: + if e.errno in (errno.EMLINK, errno.ENOSYS, errno.EPERM, errno.EACCES, errno.ENOTSUP, errno.EIO): + logger.warning(link_error_msg) + else: + raise + except AttributeError: + # some python ports have no os.link, see #4901 + logger.warning(link_error_msg) + + try: + with SaveFile(config_path) as fd: + config.write(fd) + except PermissionError as e: + # error is only a problem if we even had a lock + if self.do_lock: + raise + logger.warning( + "%s: Failed writing to '%s'. This is expected when working on " + "read-only repositories." 
% (e.strerror, e.filename) + ) + + if os.path.isfile(old_config_path): + secure_erase(old_config_path, avoid_collateral_damage=True) + + def save_key(self, keydata): + assert self.config + keydata = keydata.decode("utf-8") # remote repo: msgpack issue #99, getting bytes + # note: saving an empty key means that there is no repokey any more + self.config.set("repository", "key", keydata) + self.save_config(self.path, self.config) + + def load_key(self): + keydata = self.config.get("repository", "key", fallback="").strip() + # note: if we return an empty string, it means there is no repo key + return keydata.encode("utf-8") # remote repo: msgpack issue #99, returning bytes + + def destroy(self): + """Destroy the repository at `self.path`""" + if self.append_only: + raise ValueError(self.path + " is in append-only mode") + self.close() + os.remove(os.path.join(self.path, "config")) # kill config first + shutil.rmtree(self.path) + + def get_index_transaction_id(self): + indices = sorted( + int(fn[6:]) + for fn in os.listdir(self.path) + if fn.startswith("index.") and fn[6:].isdigit() and os.stat(os.path.join(self.path, fn)).st_size != 0 + ) + if indices: + return indices[-1] + else: + return None + + def check_transaction(self): + index_transaction_id = self.get_index_transaction_id() + segments_transaction_id = self.io.get_segments_transaction_id() + if index_transaction_id is not None and segments_transaction_id is None: + # we have a transaction id from the index, but we did not find *any* + # commit in the segment files (thus no segments transaction id). + # this can happen if a lot of segment files are lost, e.g. due to a + # filesystem or hardware malfunction. it means we have no identifiable + # valid (committed) state of the repo which we could use. + msg = '%s" - although likely this is "beyond repair' % self.path # dirty hack + raise self.CheckNeeded(msg) + # Attempt to rebuild index automatically if we crashed between commit + # tag write and index save. 
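# If the index lags behind the last committed segment, replay just the missing segments on
# top of it; if the index is somehow newer than the segments, replay everything
# (replay_from=None).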
+ if index_transaction_id != segments_transaction_id: + if index_transaction_id is not None and index_transaction_id > segments_transaction_id: + replay_from = None + else: + replay_from = index_transaction_id + self.replay_segments(replay_from, segments_transaction_id) + + def get_transaction_id(self): + self.check_transaction() + return self.get_index_transaction_id() + + def break_lock(self): + Lock(os.path.join(self.path, "lock")).break_lock() + + def migrate_lock(self, old_id, new_id): + # note: only needed for local repos + if self.lock is not None: + self.lock.migrate_lock(old_id, new_id) + + def open(self, path, exclusive, lock_wait=None, lock=True): + self.path = path + try: + st = os.stat(path) + except FileNotFoundError: + raise self.DoesNotExist(path) + if not stat.S_ISDIR(st.st_mode): + raise self.InvalidRepository(path) + if lock: + self.lock = Lock(os.path.join(path, "lock"), exclusive, timeout=lock_wait).acquire() + else: + self.lock = None + self.config = ConfigParser(interpolation=None) + try: + with open(os.path.join(self.path, "config")) as fd: + self.config.read_file(fd) + except FileNotFoundError: + self.close() + raise self.InvalidRepository(self.path) + if "repository" not in self.config.sections(): + self.close() + raise self.InvalidRepositoryConfig(path, "no repository section found") + self.version = self.config.getint("repository", "version") + if self.version not in self.acceptable_repo_versions: + self.close() + raise self.InvalidRepositoryConfig( + path, "repository version %d is not supported by this borg version" % self.version + ) + self.max_segment_size = parse_file_size(self.config.get("repository", "max_segment_size")) + if self.max_segment_size >= MAX_SEGMENT_SIZE_LIMIT: + self.close() + raise self.InvalidRepositoryConfig(path, "max_segment_size >= %d" % MAX_SEGMENT_SIZE_LIMIT) # issue 3592 + self.segments_per_dir = self.config.getint("repository", "segments_per_dir") + self.additional_free_space = parse_file_size(self.config.get("repository", "additional_free_space", fallback=0)) + # append_only can be set in the constructor + # it shouldn't be overridden (True -> False) here + self.append_only = self.append_only or self.config.getboolean("repository", "append_only", fallback=False) + if self.storage_quota is None: + # self.storage_quota is None => no explicit storage_quota was specified, use repository setting. 
+ self.storage_quota = parse_file_size(self.config.get("repository", "storage_quota", fallback=0)) + self.id = hex_to_bin(self.config.get("repository", "id").strip(), length=32) + self.io = LoggedIO(self.path, self.max_segment_size, self.segments_per_dir) + + def _load_hints(self): + if (transaction_id := self.get_transaction_id()) is None: + # self is a fresh repo, so transaction_id is None and there is no hints file + return + hints = self._unpack_hints(transaction_id) + self.version = hints["version"] + self.storage_quota_use = hints["storage_quota_use"] + self.shadow_index = hints["shadow_index"] + + def info(self): + """return some infos about the repo (must be opened first)""" + info = dict(id=self.id, version=self.version, append_only=self.append_only) + self._load_hints() + info["storage_quota"] = self.storage_quota + info["storage_quota_use"] = self.storage_quota_use + return info + + def close(self): + if self.lock: + if self.io: + self.io.close() + self.io = None + self.lock.release() + self.lock = None + + def commit(self, compact=True, threshold=0.1): + """Commit transaction""" + if self.transaction_doomed: + exception = self.transaction_doomed + self.rollback() + raise exception + self.check_free_space() + segment = self.io.write_commit() + self.segments.setdefault(segment, 0) + self.compact[segment] += LoggedIO.header_fmt.size + if compact and not self.append_only: + self.compact_segments(threshold) + self.write_index() + self.rollback() + + def _read_integrity(self, transaction_id, key): + integrity_file = "integrity.%d" % transaction_id + integrity_path = os.path.join(self.path, integrity_file) + try: + with open(integrity_path, "rb") as fd: + integrity = msgpack.unpack(fd) + except FileNotFoundError: + return + if integrity.get("version") != 2: + logger.warning("Unknown integrity data version %r in %s", integrity.get("version"), integrity_file) + return + return integrity[key] + + def open_index(self, transaction_id, auto_recover=True): + if transaction_id is None: + return NSIndex() + index_path = os.path.join(self.path, "index.%d" % transaction_id) + variant = hashindex_variant(index_path) + integrity_data = self._read_integrity(transaction_id, "index") + try: + with IntegrityCheckedFile(index_path, write=False, integrity_data=integrity_data) as fd: + if variant == 2: + return NSIndex.read(fd) + if variant == 1: # legacy + return NSIndex1.read(fd) + except (ValueError, OSError, FileIntegrityError) as exc: + logger.warning("Repository index missing or corrupted, trying to recover from: %s", exc) + os.unlink(index_path) + if not auto_recover: + raise + self.prepare_txn(self.get_transaction_id()) + # don't leave an open transaction around + self.commit(compact=False) + return self.open_index(self.get_transaction_id()) + + def _unpack_hints(self, transaction_id): + hints_path = os.path.join(self.path, "hints.%d" % transaction_id) + integrity_data = self._read_integrity(transaction_id, "hints") + with IntegrityCheckedFile(hints_path, write=False, integrity_data=integrity_data) as fd: + return msgpack.unpack(fd) + + def prepare_txn(self, transaction_id, do_cleanup=True): + self._active_txn = True + if self.do_lock and not self.lock.got_exclusive_lock(): + if self.exclusive is not None: + # self.exclusive is either True or False, thus a new client is active here. + # if it is False and we get here, the caller did not use exclusive=True although + # it is needed for a write operation. 
if it is True and we get here, something else + # went very wrong, because we should have an exclusive lock, but we don't. + raise AssertionError("bug in code, exclusive lock should exist here") + # if we are here, this is an old client talking to a new server (expecting lock upgrade). + # or we are replaying segments and might need a lock upgrade for that. + try: + self.lock.upgrade() + except (LockError, LockErrorT): + # if upgrading the lock to exclusive fails, we do not have an + # active transaction. this is important for "serve" mode, where + # the repository instance lives on - even if exceptions happened. + self._active_txn = False + raise + if not self.index or transaction_id is None: + try: + self.index = self.open_index(transaction_id, auto_recover=False) + except (ValueError, OSError, FileIntegrityError) as exc: + logger.warning("Checking repository transaction due to previous error: %s", exc) + self.check_transaction() + self.index = self.open_index(transaction_id, auto_recover=False) + if transaction_id is None: + self.segments = {} # XXX bad name: usage_count_of_segment_x = self.segments[x] + self.compact = FreeSpace() # XXX bad name: freeable_space_of_segment_x = self.compact[x] + self.storage_quota_use = 0 + self.shadow_index.clear() + else: + if do_cleanup: + self.io.cleanup(transaction_id) + hints_path = os.path.join(self.path, "hints.%d" % transaction_id) + index_path = os.path.join(self.path, "index.%d" % transaction_id) + try: + hints = self._unpack_hints(transaction_id) + except (msgpack.UnpackException, FileNotFoundError, FileIntegrityError) as e: + logger.warning("Repository hints file missing or corrupted, trying to recover: %s", e) + if not isinstance(e, FileNotFoundError): + os.unlink(hints_path) + # index must exist at this point + os.unlink(index_path) + self.check_transaction() + self.prepare_txn(transaction_id) + return + if hints["version"] == 1: + logger.debug("Upgrading from v1 hints.%d", transaction_id) + self.segments = hints["segments"] + self.compact = FreeSpace() + self.storage_quota_use = 0 + self.shadow_index = {} + for segment in sorted(hints["compact"]): + logger.debug("Rebuilding sparse info for segment %d", segment) + self._rebuild_sparse(segment) + logger.debug("Upgrade to v2 hints complete") + elif hints["version"] != 2: + raise ValueError("Unknown hints file version: %d" % hints["version"]) + else: + self.segments = hints["segments"] + self.compact = FreeSpace(hints["compact"]) + self.storage_quota_use = hints.get("storage_quota_use", 0) + self.shadow_index = hints.get("shadow_index", {}) + # Drop uncommitted segments in the shadow index + for key, shadowed_segments in self.shadow_index.items(): + for segment in list(shadowed_segments): + if segment > transaction_id: + shadowed_segments.remove(segment) + + def write_index(self): + def flush_and_sync(fd): + fd.flush() + os.fsync(fd.fileno()) + + def rename_tmp(file): + os.replace(file + ".tmp", file) + + hints = { + "version": 2, + "segments": self.segments, + "compact": self.compact, + "storage_quota_use": self.storage_quota_use, + "shadow_index": self.shadow_index, + } + integrity = { + # Integrity version started at 2, the current hints version. + # Thus, integrity version == hints version, for now. 
+ "version": 2 + } + transaction_id = self.io.get_segments_transaction_id() + assert transaction_id is not None + + # Log transaction in append-only mode + if self.append_only: + with open(os.path.join(self.path, "transactions"), "a") as log: + print( + "transaction %d, UTC time %s" + % (transaction_id, datetime.now(tz=timezone.utc).isoformat(timespec="microseconds")), + file=log, + ) + + # Write hints file + hints_name = "hints.%d" % transaction_id + hints_file = os.path.join(self.path, hints_name) + with IntegrityCheckedFile(hints_file + ".tmp", filename=hints_name, write=True) as fd: + msgpack.pack(hints, fd) + flush_and_sync(fd) + integrity["hints"] = fd.integrity_data + + # Write repository index + index_name = "index.%d" % transaction_id + index_file = os.path.join(self.path, index_name) + with IntegrityCheckedFile(index_file + ".tmp", filename=index_name, write=True) as fd: + # XXX: Consider using SyncFile for index write-outs. + self.index.write(fd) + flush_and_sync(fd) + integrity["index"] = fd.integrity_data + + # Write integrity file, containing checksums of the hints and index files + integrity_name = "integrity.%d" % transaction_id + integrity_file = os.path.join(self.path, integrity_name) + with open(integrity_file + ".tmp", "wb") as fd: + msgpack.pack(integrity, fd) + flush_and_sync(fd) + + # Rename the integrity file first + rename_tmp(integrity_file) + sync_dir(self.path) + # Rename the others after the integrity file is hypothetically on disk + rename_tmp(hints_file) + rename_tmp(index_file) + sync_dir(self.path) + + # Remove old auxiliary files + current = ".%d" % transaction_id + for name in os.listdir(self.path): + if not name.startswith(("index.", "hints.", "integrity.")): + continue + if name.endswith(current): + continue + os.unlink(os.path.join(self.path, name)) + self.index = None + + def check_free_space(self): + """Pre-commit check for sufficient free space necessary to perform the commit.""" + # As a baseline we take four times the current (on-disk) index size. + # At this point the index may only be updated by compaction, which won't resize it. + # We still apply a factor of four so that a later, separate invocation can free space + # (journaling all deletes for all chunks is one index size) or still make minor additions + # (which may grow the index up to twice its current size). + # Note that in a subsequent operation the committed index is still on-disk, therefore we + # arrive at index_size * (1 + 2 + 1). + # In that order: journaled deletes (1), hashtable growth (2), persisted index (1). + required_free_space = self.index.size() * 4 + + # Conservatively estimate hints file size: + # 10 bytes for each segment-refcount pair, 10 bytes for each segment-space pair + # Assume maximum of 5 bytes per integer. Segment numbers will usually be packed more densely (1-3 bytes), + # as will refcounts and free space integers. For 5 MiB segments this estimate is good to ~20 PB repo size. + # Add a generous 4K to account for constant format overhead. + hints_size = len(self.segments) * 10 + len(self.compact) * 10 + 4096 + required_free_space += hints_size + + required_free_space += self.additional_free_space + if not self.append_only: + full_segment_size = self.max_segment_size + MAX_OBJECT_SIZE + if len(self.compact) < 10: + # This is mostly for the test suite to avoid overestimated free space needs. This can be annoying + # if TMP is a small-ish tmpfs. 
+ compact_working_space = 0 + for segment, free in self.compact.items(): + try: + compact_working_space += self.io.segment_size(segment) - free + except FileNotFoundError: + # looks like self.compact is referring to a nonexistent segment file, ignore it. + pass + logger.debug("check_free_space: Few segments, not requiring a full free segment") + compact_working_space = min(compact_working_space, full_segment_size) + logger.debug( + "check_free_space: Calculated working space for compact as %d bytes", compact_working_space + ) + required_free_space += compact_working_space + else: + # Keep one full worst-case segment free in non-append-only mode + required_free_space += full_segment_size + + try: + free_space = shutil.disk_usage(self.path).free + except OSError as os_error: + logger.warning("Failed to check free space before committing: " + str(os_error)) + return + logger.debug(f"check_free_space: Required bytes {required_free_space}, free bytes {free_space}") + if free_space < required_free_space: + if self.created: + logger.error("Not enough free space to initialize repository at this location.") + self.destroy() + else: + self._rollback(cleanup=True) + formatted_required = format_file_size(required_free_space) + formatted_free = format_file_size(free_space) + raise self.InsufficientFreeSpaceError(formatted_required, formatted_free) + + def compact_segments(self, threshold): + """Compact sparse segments by copying data into new segments""" + if not self.compact: + logger.debug("Nothing to do: compact empty") + return + quota_use_before = self.storage_quota_use + index_transaction_id = self.get_index_transaction_id() + segments = self.segments + unused = [] # list of segments, that are not used anymore + + def complete_xfer(intermediate=True): + # complete the current transfer (when some target segment is full) + nonlocal unused + # commit the new, compact, used segments + segment = self.io.write_commit(intermediate=intermediate) + self.segments.setdefault(segment, 0) + self.compact[segment] += LoggedIO.header_fmt.size + logger.debug( + "complete_xfer: Wrote %scommit at segment %d", "intermediate " if intermediate else "", segment + ) + # get rid of the old, sparse, unused segments. free space. 
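+ # note: the superseded segments are removed only after the commit above, so a crash
+ # in between cannot lose the copied data - it only leaves the old segments behind.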
+ for segment in unused: + logger.debug("complete_xfer: Deleting unused segment %d", segment) + count = self.segments.pop(segment) + assert count == 0, "Corrupted segment reference count - corrupted index or hints" + self.io.delete_segment(segment) + del self.compact[segment] + unused = [] + + logger.debug("Compaction started (threshold is %i%%).", threshold * 100) + pi = ProgressIndicatorPercent( + total=len(self.compact), msg="Compacting segments %3.0f%%", step=1, msgid="repository.compact_segments" + ) + for segment, freeable_space in sorted(self.compact.items()): + if not self.io.segment_exists(segment): + logger.warning("Segment %d not found, but listed in compaction data", segment) + del self.compact[segment] + pi.show() + self._send_log() + continue + segment_size = self.io.segment_size(segment) + freeable_ratio = 1.0 * freeable_space / segment_size + # we want to compact if: + # - we can free a considerable relative amount of space (freeable_ratio over some threshold) + if not (freeable_ratio > threshold): + logger.debug( + "Not compacting segment %d (maybe freeable: %2.2f%% [%d bytes])", + segment, + freeable_ratio * 100.0, + freeable_space, + ) + pi.show() + self._send_log() + continue + segments.setdefault(segment, 0) + logger.debug( + "Compacting segment %d with usage count %d (maybe freeable: %2.2f%% [%d bytes])", + segment, + segments[segment], + freeable_ratio * 100.0, + freeable_space, + ) + for tag, key, offset, _, data in self.io.iter_objects(segment): + if tag == TAG_COMMIT: + continue + in_index = self.index.get(key) + is_index_object = in_index and (in_index.segment, in_index.offset) == (segment, offset) + if tag in (TAG_PUT2, TAG_PUT) and is_index_object: + try: + new_segment, offset = self.io.write_put(key, data, raise_full=True) + except LoggedIO.SegmentFull: + complete_xfer() + new_segment, offset = self.io.write_put(key, data) + self.index[key] = NSIndexEntry(new_segment, offset, len(data)) + segments.setdefault(new_segment, 0) + segments[new_segment] += 1 + segments[segment] -= 1 + if tag == TAG_PUT: + # old tag is PUT, but new will be PUT2 and use a bit more storage + self.storage_quota_use += self.io.ENTRY_HASH_SIZE + elif tag in (TAG_PUT2, TAG_PUT) and not is_index_object: + # If this is a PUT shadowed by a later tag, then it will be gone when this segment is deleted after + # this loop. Therefore it is removed from the shadow index. + try: + self.shadow_index[key].remove(segment) + except (KeyError, ValueError): + # do not remove entry with empty shadowed_segments list here, + # it is needed for shadowed_put_exists code (see below)! + pass + self.storage_quota_use -= header_size(tag) + len(data) + elif tag == TAG_DELETE and not in_index: + # If the shadow index doesn't contain this key, then we can't say if there's a shadowed older tag, + # therefore we do not drop the delete, but write it to a current segment. + key_not_in_shadow_index = key not in self.shadow_index + # If the key is in the shadow index and there is any segment with an older PUT of this + # key, we have a shadowed put. + shadowed_put_exists = key_not_in_shadow_index or any( + shadowed < segment for shadowed in self.shadow_index[key] + ) + delete_is_not_stable = index_transaction_id is None or segment > index_transaction_id + + if shadowed_put_exists or delete_is_not_stable: + # (introduced in 6425d16aa84be1eaaf88) + # This is needed to avoid object un-deletion if we crash between the commit and the deletion + # of old segments in complete_xfer(). 
+ # + # However, this only happens if the crash also affects the FS to the effect that file deletions + # did not materialize consistently after journal recovery. If they always materialize in-order + # then this is not a problem, because the old segment containing a deleted object would be + # deleted before the segment containing the delete. + # + # Consider the following series of operations if we would not do this, i.e. this entire if: + # would be removed. + # Columns are segments, lines are different keys (line 1 = some key, line 2 = some other key) + # Legend: P=TAG_PUT/TAG_PUT2, D=TAG_DELETE, c=commit, i=index is written for latest commit + # + # Segment | 1 | 2 | 3 + # --------+-------+-----+------ + # Key 1 | P | D | + # Key 2 | P | | P + # commits | c i | c | c i + # --------+-------+-----+------ + # ^- compact_segments starts + # ^- complete_xfer commits, after that complete_xfer deletes + # segments 1 and 2 (and then the index would be written). + # + # Now we crash. But only segment 2 gets deleted, while segment 1 is still around. Now key 1 + # is suddenly undeleted (because the delete in segment 2 is now missing). + # Again, note the requirement here. We delete these in the correct order that this doesn't + # happen, and only if the FS materialization of these deletes is reordered or parts dropped + # this can happen. + # In this case it doesn't cause outright corruption, 'just' an index count mismatch, which + # will be fixed by borg-check --repair. + # + # Note that in this check the index state is the proxy for a "most definitely settled" + # repository state, i.e. the assumption is that *all* operations on segments <= index state + # are completed and stable. + try: + new_segment, size = self.io.write_delete(key, raise_full=True) + except LoggedIO.SegmentFull: + complete_xfer() + new_segment, size = self.io.write_delete(key) + self.compact[new_segment] += size + segments.setdefault(new_segment, 0) + else: + logger.debug( + "Dropping DEL for id %s - seg %d, iti %r, knisi %r, spe %r, dins %r, si %r", + bin_to_hex(key), + segment, + index_transaction_id, + key_not_in_shadow_index, + shadowed_put_exists, + delete_is_not_stable, + self.shadow_index.get(key), + ) + # we did not keep the delete tag for key (see if-branch) + if not self.shadow_index[key]: + # shadowed segments list is empty -> remove it + del self.shadow_index[key] + assert segments[segment] == 0, "Corrupted segment reference count - corrupted index or hints" + unused.append(segment) + pi.show() + self._send_log() + pi.finish() + self._send_log() + complete_xfer(intermediate=False) + self.io.clear_empty_dirs() + quota_use_after = self.storage_quota_use + logger.info("Compaction freed about %s repository space.", format_file_size(quota_use_before - quota_use_after)) + logger.debug("Compaction completed.") + + def replay_segments(self, index_transaction_id, segments_transaction_id): + # fake an old client, so that in case we do not have an exclusive lock yet, prepare_txn will upgrade the lock: + remember_exclusive = self.exclusive + self.exclusive = None + self.prepare_txn(index_transaction_id, do_cleanup=False) + try: + segment_count = sum(1 for _ in self.io.segment_iterator()) + pi = ProgressIndicatorPercent( + total=segment_count, msg="Replaying segments %3.0f%%", msgid="repository.replay_segments" + ) + for i, (segment, filename) in enumerate(self.io.segment_iterator()): + pi.show(i) + self._send_log() + if index_transaction_id is not None and segment <= index_transaction_id: + continue + if segment > 
segments_transaction_id: + break + objects = self.io.iter_objects(segment) + self._update_index(segment, objects) + pi.finish() + self._send_log() + self.write_index() + finally: + self.exclusive = remember_exclusive + self.rollback() + + def _update_index(self, segment, objects, report=None): + """some code shared between replay_segments and check""" + self.segments[segment] = 0 + for tag, key, offset, size, _ in objects: + if tag in (TAG_PUT2, TAG_PUT): + try: + # If this PUT supersedes an older PUT, mark the old segment for compaction and count the free space + in_index = self.index[key] + self.compact[in_index.segment] += header_size(tag) + size + self.segments[in_index.segment] -= 1 + self.shadow_index.setdefault(key, []).append(in_index.segment) + except KeyError: + pass + self.index[key] = NSIndexEntry(segment, offset, size) + self.segments[segment] += 1 + self.storage_quota_use += header_size(tag) + size + elif tag == TAG_DELETE: + try: + # if the deleted PUT is not in the index, there is nothing to clean up + in_index = self.index.pop(key) + except KeyError: + pass + else: + if self.io.segment_exists(in_index.segment): + # the old index is not necessarily valid for this transaction (e.g. compaction); if the segment + # is already gone, then it was already compacted. + self.segments[in_index.segment] -= 1 + self.compact[in_index.segment] += header_size(tag) + in_index.size + self.shadow_index.setdefault(key, []).append(in_index.segment) + elif tag == TAG_COMMIT: + continue + else: + msg = f"Unexpected tag {tag} in segment {segment}" + if report is None: + raise self.CheckNeeded(msg) + else: + report(msg) + if self.segments[segment] == 0: + self.compact[segment] = self.io.segment_size(segment) + + def _rebuild_sparse(self, segment): + """Rebuild sparse bytes count for a single segment relative to the current index.""" + try: + segment_size = self.io.segment_size(segment) + except FileNotFoundError: + # segment does not exist any more, remove it from the mappings. + # note: no need to self.compact.pop(segment), as we start from empty mapping. + self.segments.pop(segment) + return + + if self.segments[segment] == 0: + self.compact[segment] = segment_size + return + + self.compact[segment] = 0 + for tag, key, offset, size, _ in self.io.iter_objects(segment, read_data=False): + if tag in (TAG_PUT2, TAG_PUT): + in_index = self.index.get(key) + if not in_index or (in_index.segment, in_index.offset) != (segment, offset): + # This PUT is superseded later. + self.compact[segment] += header_size(tag) + size + elif tag == TAG_DELETE: + # The outcome of the DELETE has been recorded in the PUT branch already. + self.compact[segment] += header_size(tag) + size + + def check(self, repair=False, max_duration=0): + """Check repository consistency + + This method verifies all segment checksums and makes sure + the index is consistent with the data stored in the segments. 
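+
+ If *max_duration* is given, only a part of the segments is checked and the position
+ reached is stored in the repository config, so the next partial check continues there.
+ Partial checks cannot be combined with *repair* and do not verify the index.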
+ """ + if self.append_only and repair: + raise ValueError(self.path + " is in append-only mode") + error_found = False + + def report_error(msg, *args): + nonlocal error_found + error_found = True + logger.error(msg, *args) + + logger.info("Starting repository check") + assert not self._active_txn + try: + transaction_id = self.get_transaction_id() + current_index = self.open_index(transaction_id) + logger.debug("Read committed index of transaction %d", transaction_id) + except Exception as exc: + transaction_id = self.io.get_segments_transaction_id() + current_index = None + logger.debug("Failed to read committed index (%s)", exc) + if transaction_id is None: + logger.debug("No segments transaction found") + transaction_id = self.get_index_transaction_id() + if transaction_id is None: + logger.debug("No index transaction found, trying latest segment") + transaction_id = self.io.get_latest_segment() + if transaction_id is None: + report_error("This repository contains no valid data.") + return False + if repair: + self.io.cleanup(transaction_id) + segments_transaction_id = self.io.get_segments_transaction_id() + logger.debug("Segment transaction is %s", segments_transaction_id) + logger.debug("Determined transaction is %s", transaction_id) + self.prepare_txn(None) # self.index, self.compact, self.segments, self.shadow_index all empty now! + segment_count = sum(1 for _ in self.io.segment_iterator()) + logger.debug("Found %d segments", segment_count) + + partial = bool(max_duration) + assert not (repair and partial) + mode = "partial" if partial else "full" + if partial: + # continue a past partial check (if any) or start one from beginning + last_segment_checked = self.config.getint("repository", "last_segment_checked", fallback=-1) + logger.info("Skipping to segments >= %d", last_segment_checked + 1) + else: + # start from the beginning and also forget about any potential past partial checks + last_segment_checked = -1 + self.config.remove_option("repository", "last_segment_checked") + self.save_config(self.path, self.config) + t_start = time.monotonic() + pi = ProgressIndicatorPercent( + total=segment_count, msg="Checking segments %3.1f%%", step=0.1, msgid="repository.check" + ) + segment = -1 # avoid uninitialized variable if there are no segment files at all + for i, (segment, filename) in enumerate(self.io.segment_iterator()): + pi.show(i) + self._send_log() + if segment <= last_segment_checked: + continue + if segment > transaction_id: + continue + logger.debug("Checking segment file %s...", filename) + try: + objects = list(self.io.iter_objects(segment)) + except IntegrityError as err: + report_error(str(err)) + objects = [] + if repair: + self.io.recover_segment(segment, filename) + objects = list(self.io.iter_objects(segment)) + if not partial: + self._update_index(segment, objects, report_error) + if partial and time.monotonic() > t_start + max_duration: + logger.info("Finished partial segment check, last segment checked is %d", segment) + self.config.set("repository", "last_segment_checked", str(segment)) + self.save_config(self.path, self.config) + break + else: + logger.info("Finished segment check at segment %d", segment) + self.config.remove_option("repository", "last_segment_checked") + self.save_config(self.path, self.config) + + pi.finish() + self._send_log() + # self.index, self.segments, self.compact now reflect the state of the segment files up to . + # We might need to add a commit tag if no committed segment is found. 
+ if repair and segments_transaction_id is None: + report_error(f"Adding commit tag to segment {transaction_id}") + self.io.segment = transaction_id + 1 + self.io.write_commit() + if not partial: + logger.info("Starting repository index check") + if current_index and not repair: + # current_index = "as found on disk" + # self.index = "as rebuilt in-memory from segments" + if len(current_index) != len(self.index): + report_error("Index object count mismatch.") + report_error("committed index: %d objects", len(current_index)) + report_error("rebuilt index: %d objects", len(self.index)) + else: + logger.info("Index object count match.") + line_format = "ID: %-64s rebuilt index: %-16s committed index: %-16s" + not_found = "" + for key, value in self.index.iteritems(): + current_value = current_index.get(key, not_found) + if current_value != value: + report_error(line_format, bin_to_hex(key), value, current_value) + self._send_log() + for key, current_value in current_index.iteritems(): + if key in self.index: + continue + value = self.index.get(key, not_found) + if current_value != value: + report_error(line_format, bin_to_hex(key), value, current_value) + self._send_log() + if repair: + self.write_index() + self.rollback() + if error_found: + if repair: + logger.info("Finished %s repository check, errors found and repaired.", mode) + else: + logger.error("Finished %s repository check, errors found.", mode) + else: + logger.info("Finished %s repository check, no problems found.", mode) + return not error_found or repair + + def _rollback(self, *, cleanup): + if cleanup: + self.io.cleanup(self.io.get_segments_transaction_id()) + self.index = None + self._active_txn = False + self.transaction_doomed = None + + def rollback(self): + # note: when used in remote mode, this is time limited, see LegacyRemoteRepository.shutdown_time. + self._rollback(cleanup=False) + + def __len__(self): + if not self.index: + self.index = self.open_index(self.get_transaction_id()) + return len(self.index) + + def __contains__(self, id): + if not self.index: + self.index = self.open_index(self.get_transaction_id()) + return id in self.index + + def list(self, limit=None, marker=None): + """ + list IDs starting from after id - in index (pseudo-random) order. + """ + if not self.index: + self.index = self.open_index(self.get_transaction_id()) + return [id_ for id_, _ in islice(self.index.iteritems(marker=marker), limit)] + + def get(self, id, read_data=True): + if not self.index: + self.index = self.open_index(self.get_transaction_id()) + try: + in_index = NSIndexEntry(*((self.index[id] + (None,))[:3])) # legacy: index entries have no size element + return self.io.read(in_index.segment, in_index.offset, id, expected_size=in_index.size, read_data=read_data) + except KeyError: + raise self.ObjectNotFound(id, self.path) from None + + def get_many(self, ids, read_data=True, is_preloaded=False): + for id_ in ids: + yield self.get(id_, read_data=read_data) + + def put(self, id, data, wait=True): + """put a repo object + + Note: when doing calls with wait=False this gets async and caller must + deal with async results / exceptions later. + """ + if not self._active_txn: + self.prepare_txn(self.get_transaction_id()) + try: + in_index = self.index[id] + except KeyError: + pass + else: + # this put call supersedes a previous put to same id. 
+ # it is essential to do a delete first to get correct quota bookkeeping + # and also a correctly updated shadow_index, so that the compaction code + # does not wrongly resurrect an old PUT by dropping a DEL that is still needed. + self._delete(id, in_index.segment, in_index.offset, in_index.size) + segment, offset = self.io.write_put(id, data) + self.storage_quota_use += header_size(TAG_PUT2) + len(data) + self.segments.setdefault(segment, 0) + self.segments[segment] += 1 + self.index[id] = NSIndexEntry(segment, offset, len(data)) + if self.storage_quota and self.storage_quota_use > self.storage_quota: + self.transaction_doomed = self.StorageQuotaExceeded( + format_file_size(self.storage_quota), format_file_size(self.storage_quota_use) + ) + raise self.transaction_doomed + + def delete(self, id, wait=True): + """delete a repo object + + Note: when doing calls with wait=False this gets async and caller must + deal with async results / exceptions later. + """ + if not self._active_txn: + self.prepare_txn(self.get_transaction_id()) + try: + in_index = self.index.pop(id) + except KeyError: + raise self.ObjectNotFound(id, self.path) from None + self._delete(id, in_index.segment, in_index.offset, in_index.size) + + def _delete(self, id, segment, offset, size): + # common code used by put and delete + # because we'll write a DEL tag to the repository, we must update the shadow index. + # this is always true, no matter whether we are called from put() or delete(). + # the compaction code needs this to not drop DEL tags if they are still required + # to keep a PUT in an earlier segment in the "effectively deleted" state. + self.shadow_index.setdefault(id, []).append(segment) + self.segments[segment] -= 1 + self.compact[segment] += header_size(TAG_PUT2) + size + segment, size = self.io.write_delete(id) + self.compact[segment] += size + self.segments.setdefault(segment, 0) + + def async_response(self, wait=True): + """Get one async result (only applies to remote repositories). + + async commands (== calls with wait=False, e.g. delete and put) have no results, + but may raise exceptions. These async exceptions must get collected later via + async_response() calls. Repeat the call until it returns None. + The previous calls might either return one (non-None) result or raise an exception. + If wait=True is given and there are outstanding responses, it will wait for them + to arrive. With wait=False, it will only return already received responses. 
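+
+ In this local repository implementation all calls are synchronous, so this is
+ effectively a no-op that always returns None.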
+ """ + + def preload(self, ids): + """Preload objects (only applies to remote repositories)""" + + def get_manifest(self): + try: + return self.get(Manifest.MANIFEST_ID) + except self.ObjectNotFound: + raise NoManifestError + + def put_manifest(self, data): + return self.put(Manifest.MANIFEST_ID, data) + + +class LoggedIO: + class SegmentFull(Exception): + """raised when a segment is full, before opening next""" + + header_fmt = struct.Struct(" transaction_id: + self.delete_segment(segment) + count += 1 + else: + break + logger.debug("Cleaned up %d uncommitted segment files (== everything after segment %d).", count, transaction_id) + + def is_committed_segment(self, segment): + """Check if segment ends with a COMMIT_TAG tag""" + try: + iterator = self.iter_objects(segment) + except IntegrityError: + return False + with open(self.segment_filename(segment), "rb") as fd: + try: + fd.seek(-self.header_fmt.size, os.SEEK_END) + except OSError as e: + # return False if segment file is empty or too small + if e.errno == errno.EINVAL: + return False + raise e + if fd.read(self.header_fmt.size) != self.COMMIT: + return False + seen_commit = False + while True: + try: + tag, key, offset, _, _ = next(iterator) + except IntegrityError: + return False + except StopIteration: + break + if tag == TAG_COMMIT: + seen_commit = True + continue + if seen_commit: + return False + return seen_commit + + def segment_filename(self, segment): + return os.path.join(self.path, "data", str(segment // self.segments_per_dir), str(segment)) + + def get_write_fd(self, no_new=False, want_new=False, raise_full=False): + if not no_new and (want_new or self.offset and self.offset > self.limit): + if raise_full: + raise self.SegmentFull + self.close_segment() + if not self._write_fd: + if self.segment % self.segments_per_dir == 0: + dirname = os.path.join(self.path, "data", str(self.segment // self.segments_per_dir)) + if not os.path.exists(dirname): + os.mkdir(dirname) + sync_dir(os.path.join(self.path, "data")) + self._write_fd = SyncFile(self.segment_filename(self.segment), binary=True) + self._write_fd.write(MAGIC) + self.offset = MAGIC_LEN + if self.segment in self.fds: + # we may have a cached fd for a segment file we already deleted and + # we are writing now a new segment file to same file name. get rid of + # the cached fd that still refers to the old file, so it will later + # get repopulated (on demand) with a fd that refers to the new file. + del self.fds[self.segment] + return self._write_fd + + def get_fd(self, segment): + # note: get_fd() returns a fd with undefined file pointer position, + # so callers must always seek() to desired position afterwards. + now = time.monotonic() + + def open_fd(): + fd = open(self.segment_filename(segment), "rb") + self.fds[segment] = (now, fd) + return fd + + def clean_old(): + # we regularly get rid of all old FDs here: + if now - self._fds_cleaned > FD_MAX_AGE // 8: + self._fds_cleaned = now + for k, ts_fd in list(self.fds.items()): + ts, fd = ts_fd + if now - ts > FD_MAX_AGE: + # we do not want to touch long-unused file handles to + # avoid ESTALE issues (e.g. on network filesystems). + del self.fds[k] + + clean_old() + if self._write_fd is not None: + # without this, we have a test failure now + self._write_fd.sync() + try: + ts, fd = self.fds[segment] + except KeyError: + fd = open_fd() + else: + # we only have fresh enough stuff here. + # update the timestamp of the lru cache entry. 
+ self.fds.replace(segment, (now, fd)) + return fd + + def close_segment(self): + # set self._write_fd to None early to guard against reentry from error handling code paths: + fd, self._write_fd = self._write_fd, None + if fd is not None: + self.segment += 1 + self.offset = 0 + fd.close() + + def delete_segment(self, segment): + if segment in self.fds: + del self.fds[segment] + try: + safe_unlink(self.segment_filename(segment)) + except FileNotFoundError: + pass + + def clear_empty_dirs(self): + """Delete empty segment dirs, i.e those with no segment files.""" + data_dir = os.path.join(self.path, "data") + segment_dirs = self.get_segment_dirs(data_dir) + for segment_dir in segment_dirs: + try: + # os.rmdir will only delete the directory if it is empty + # so we don't need to explicitly check for emptiness first. + os.rmdir(segment_dir) + except OSError: + # OSError is raised by os.rmdir if directory is not empty. This is expected. + # Its subclass FileNotFoundError may be raised if the directory already does not exist. Ignorable. + pass + sync_dir(data_dir) + + def segment_exists(self, segment): + filename = self.segment_filename(segment) + # When deleting segments, they are first truncated. If truncate(2) and unlink(2) are split + # across FS transactions, then logically deleted segments will show up as truncated. + return os.path.exists(filename) and os.path.getsize(filename) + + def segment_size(self, segment): + return os.path.getsize(self.segment_filename(segment)) + + def get_segment_magic(self, segment): + fd = self.get_fd(segment) + fd.seek(0) + return fd.read(MAGIC_LEN) + + def iter_objects(self, segment, read_data=True): + """ + Return object iterator for *segment*. + + See the _read() docstring about confidence in the returned data. + + The iterator returns five-tuples of (tag, key, offset, size, data). + """ + fd = self.get_fd(segment) + offset = 0 + fd.seek(offset) + if fd.read(MAGIC_LEN) != MAGIC: + raise IntegrityError(f"Invalid segment magic [segment {segment}, offset {offset}]") + offset = MAGIC_LEN + header = fd.read(self.header_fmt.size) + while header: + size, tag, key, data = self._read( + fd, header, segment, offset, (TAG_PUT2, TAG_DELETE, TAG_COMMIT, TAG_PUT), read_data=read_data + ) + # tuple[3]: corresponds to len(data) == length of the full chunk payload (meta_len+enc_meta+enc_data) + # tuple[4]: data will be None if read_data is False. + yield tag, key, offset, size - header_size(tag), data + assert size >= 0 + offset += size + # we must get the fd via get_fd() here again as we yielded to our caller and it might + # have triggered closing of the fd we had before (e.g. by calling io.read() for + # different segment(s)). + # by calling get_fd() here again we also make our fd "recently used" so it likely + # does not get kicked out of self.fds LRUcache. + fd = self.get_fd(segment) + fd.seek(offset) + header = fd.read(self.header_fmt.size) + + def recover_segment(self, segment, filename): + logger.info("Attempting to recover " + filename) + if segment in self.fds: + del self.fds[segment] + if os.path.getsize(filename) < MAGIC_LEN + self.header_fmt.size: + # this is either a zero-byte file (which would crash mmap() below) or otherwise + # just too small to be a valid non-empty segment file, so do a shortcut here: + with SaveFile(filename, binary=True) as fd: + fd.write(MAGIC) + return + with SaveFile(filename, binary=True) as dst_fd: + with open(filename, "rb") as src_fd: + # note: file must not be 0 size or mmap() will crash. 
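+ # recovery strategy: scan the damaged segment byte-wise via mmap and copy only those
+ # entries whose checks pass (crc32 of the header, plus the entry hash for PUT2);
+ # anything that does not verify is skipped.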
+ with mmap.mmap(src_fd.fileno(), 0, access=mmap.ACCESS_READ) as mm: + # memoryview context manager is problematic, see https://bugs.python.org/issue35686 + data = memoryview(mm) + d = data + try: + dst_fd.write(MAGIC) + while len(d) >= self.header_fmt.size: + crc, size, tag = self.header_fmt.unpack(d[: self.header_fmt.size]) + size_invalid = size > MAX_OBJECT_SIZE or size < self.header_fmt.size or size > len(d) + if size_invalid or tag > MAX_TAG_ID: + d = d[1:] + continue + if tag == TAG_PUT2: + c_offset = self.HEADER_ID_SIZE + self.ENTRY_HASH_SIZE + # skip if header is invalid + if crc32(d[4:c_offset]) & 0xFFFFFFFF != crc: + d = d[1:] + continue + # skip if content is invalid + if ( + self.entry_hash(d[4 : self.HEADER_ID_SIZE], d[c_offset:size]) + != d[self.HEADER_ID_SIZE : c_offset] + ): + d = d[1:] + continue + elif tag in (TAG_DELETE, TAG_COMMIT, TAG_PUT): + if crc32(d[4:size]) & 0xFFFFFFFF != crc: + d = d[1:] + continue + else: # tag unknown + d = d[1:] + continue + dst_fd.write(d[:size]) + d = d[size:] + finally: + del d + data.release() + + def entry_hash(self, *data): + h = StreamingXXH64() + for d in data: + h.update(d) + return h.digest() + + def read(self, segment, offset, id, *, read_data=True, expected_size=None): + """ + Read entry from *segment* at *offset* with *id*. + + See the _read() docstring about confidence in the returned data. + """ + if segment == self.segment and self._write_fd: + self._write_fd.sync() + fd = self.get_fd(segment) + fd.seek(offset) + header = fd.read(self.header_fmt.size) + size, tag, key, data = self._read(fd, header, segment, offset, (TAG_PUT2, TAG_PUT), read_data=read_data) + if id != key: + raise IntegrityError( + f"Invalid segment entry header, is not for wanted id [segment {segment}, offset {offset}]" + ) + data_size_from_header = size - header_size(tag) + if expected_size is not None and expected_size != data_size_from_header: + raise IntegrityError( + f"size from repository index: {expected_size} != " f"size from entry header: {data_size_from_header}" + ) + return data + + def _read(self, fd, header, segment, offset, acceptable_tags, read_data=True): + """ + Code shared by read() and iter_objects(). + + Confidence in returned data: + PUT2 tags, read_data == True: crc32 check (header) plus digest check (header+data) + PUT2 tags, read_data == False: crc32 check (header) + PUT tags, read_data == True: crc32 check (header+data) + PUT tags, read_data == False: crc32 check can not be done, all data obtained must be considered informational + + read_data == False behaviour: + PUT2 tags: return enough of the chunk so that the client is able to decrypt the metadata, + do not read, but just seek over the data. + PUT tags: return None and just seek over the data. + """ + + def check_crc32(wanted, header, *data): + result = crc32(memoryview(header)[4:]) # skip first 32 bits of the header, they contain the crc. 
+ for d in data: + result = crc32(d, result) + if result & 0xFFFFFFFF != wanted: + raise IntegrityError(f"Segment entry header checksum mismatch [segment {segment}, offset {offset}]") + + # See comment on MAX_TAG_ID for details + assert max(acceptable_tags) <= MAX_TAG_ID, "Exceeding MAX_TAG_ID will break backwards compatibility" + key = data = None + fmt = self.header_fmt + try: + hdr_tuple = fmt.unpack(header) + except struct.error as err: + raise IntegrityError(f"Invalid segment entry header [segment {segment}, offset {offset}]: {err}") from None + crc, size, tag = hdr_tuple + length = size - fmt.size # we already read the header + if size > MAX_OBJECT_SIZE: + # if you get this on an archive made with borg < 1.0.7 and millions of files and + # you need to restore it, you can disable this check by using "if False:" above. + raise IntegrityError(f"Invalid segment entry size {size} - too big [segment {segment}, offset {offset}]") + if size < fmt.size: + raise IntegrityError(f"Invalid segment entry size {size} - too small [segment {segment}, offset {offset}]") + if tag not in (TAG_PUT2, TAG_DELETE, TAG_COMMIT, TAG_PUT): + raise IntegrityError( + f"Invalid segment entry header, did not get a known tag " f"[segment {segment}, offset {offset}]" + ) + if tag not in acceptable_tags: + raise IntegrityError( + f"Invalid segment entry header, did not get acceptable tag " f"[segment {segment}, offset {offset}]" + ) + if tag == TAG_COMMIT: + check_crc32(crc, header) + # that's all for COMMITs. + else: + # all other tags (TAG_PUT2, TAG_DELETE, TAG_PUT) have a key + key = fd.read(32) + length -= 32 + if len(key) != 32: + raise IntegrityError( + f"Segment entry key short read [segment {segment}, offset {offset}]: " + f"expected {32}, got {len(key)} bytes" + ) + if tag == TAG_DELETE: + check_crc32(crc, header, key) + # that's all for DELETEs. + else: + # TAG_PUT: we can not do a crc32 header check here, because the crc32 is computed over header+data! + # for the check, see code below when read_data is True. + if tag == TAG_PUT2: + entry_hash = fd.read(self.ENTRY_HASH_SIZE) + length -= self.ENTRY_HASH_SIZE + if len(entry_hash) != self.ENTRY_HASH_SIZE: + raise IntegrityError( + f"Segment entry hash short read [segment {segment}, offset {offset}]: " + f"expected {self.ENTRY_HASH_SIZE}, got {len(entry_hash)} bytes" + ) + check_crc32(crc, header, key, entry_hash) + if not read_data: + if tag == TAG_PUT2: + # PUT2 is only used in new repos and they also have different RepoObj layout, + # supporting separately encrypted metadata and data. + # In this case, we return enough bytes so the client can decrypt the metadata + # and seek over the rest (over the encrypted data). 
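+ # on-disk layout of a PUT2 entry: crc32 (4) | size (4) | tag (1) | id (32) |
+ # entry_hash (8) | RepoObj header | meta_encrypted | data_encrypted.
+ # crc32, size, tag, id and entry_hash were already consumed above, so the
+ # RepoObj header is what comes next in the file.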
+ hdr_size = RepoObj.obj_header.size + hdr = fd.read(hdr_size) + length -= hdr_size + if len(hdr) != hdr_size: + raise IntegrityError( + f"Segment entry meta length short read [segment {segment}, offset {offset}]: " + f"expected {hdr_size}, got {len(hdr)} bytes" + ) + meta_size = RepoObj.obj_header.unpack(hdr)[0] + meta = fd.read(meta_size) + length -= meta_size + if len(meta) != meta_size: + raise IntegrityError( + f"Segment entry meta short read [segment {segment}, offset {offset}]: " + f"expected {meta_size}, got {len(meta)} bytes" + ) + data = hdr + meta # shortened chunk - enough so the client can decrypt the metadata + # in any case, we seek over the remainder of the chunk + oldpos = fd.tell() + seeked = fd.seek(length, os.SEEK_CUR) - oldpos + if seeked != length: + raise IntegrityError( + f"Segment entry data short seek [segment {segment}, offset {offset}]: " + f"expected {length}, got {seeked} bytes" + ) + else: # read data! + data = fd.read(length) + if len(data) != length: + raise IntegrityError( + f"Segment entry data short read [segment {segment}, offset {offset}]: " + f"expected {length}, got {len(data)} bytes" + ) + if tag == TAG_PUT2: + if self.entry_hash(memoryview(header)[4:], key, data) != entry_hash: + raise IntegrityError(f"Segment entry hash mismatch [segment {segment}, offset {offset}]") + elif tag == TAG_PUT: + check_crc32(crc, header, key, data) + return size, tag, key, data + + def write_put(self, id, data, raise_full=False): + data_size = len(data) + if data_size > MAX_DATA_SIZE: + # this would push the segment entry size beyond MAX_OBJECT_SIZE. + raise IntegrityError(f"More than allowed put data [{data_size} > {MAX_DATA_SIZE}]") + fd = self.get_write_fd(want_new=(id == Manifest.MANIFEST_ID), raise_full=raise_full) + size = data_size + self.HEADER_ID_SIZE + self.ENTRY_HASH_SIZE + offset = self.offset + header = self.header_no_crc_fmt.pack(size, TAG_PUT2) + entry_hash = self.entry_hash(header, id, data) + crc = self.crc_fmt.pack(crc32(entry_hash, crc32(id, crc32(header))) & 0xFFFFFFFF) + fd.write(b"".join((crc, header, id, entry_hash))) + fd.write(data) + self.offset += size + return self.segment, offset + + def write_delete(self, id, raise_full=False): + fd = self.get_write_fd(want_new=(id == Manifest.MANIFEST_ID), raise_full=raise_full) + header = self.header_no_crc_fmt.pack(self.HEADER_ID_SIZE, TAG_DELETE) + crc = self.crc_fmt.pack(crc32(id, crc32(header)) & 0xFFFFFFFF) + fd.write(b"".join((crc, header, id))) + self.offset += self.HEADER_ID_SIZE + return self.segment, self.HEADER_ID_SIZE + + def write_commit(self, intermediate=False): + # Intermediate commits go directly into the current segment - this makes checking their validity more + # expensive, but is faster and reduces clobber. Final commits go into a new segment. 
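+ # intermediate=True -> no_new=True: keep writing to the current segment (after an fsync);
+ # intermediate=False -> want_new=True: the commit tag becomes the only entry of a new segment.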
+ fd = self.get_write_fd(want_new=not intermediate, no_new=intermediate) + if intermediate: + fd.sync() + header = self.header_no_crc_fmt.pack(self.header_fmt.size, TAG_COMMIT) + crc = self.crc_fmt.pack(crc32(header) & 0xFFFFFFFF) + fd.write(b"".join((crc, header))) + self.close_segment() + return self.segment - 1 # close_segment() increments it + + +assert LoggedIO.HEADER_ID_SIZE + LoggedIO.ENTRY_HASH_SIZE == 41 + 8 # see constants.MAX_OBJECT_SIZE diff --git a/src/borg/manifest.py b/src/borg/manifest.py index 9b23cb63c7..a5eb7b89a8 100644 --- a/src/borg/manifest.py +++ b/src/borg/manifest.py @@ -1,17 +1,19 @@ import enum import re -from collections import abc, namedtuple +from collections import namedtuple from datetime import datetime, timedelta, timezone from operator import attrgetter from collections.abc import Sequence +from borgstore.store import ObjectNotFound, ItemInfo + from .logger import create_logger logger = create_logger() from .constants import * # NOQA from .helpers.datastruct import StableDict -from .helpers.parseformat import bin_to_hex +from .helpers.parseformat import bin_to_hex, hex_to_bin from .helpers.time import parse_timestamp, calculate_relative_offset, archive_ts_now from .helpers.errors import Error from .patterns import get_regex_from_pattern @@ -66,49 +68,169 @@ def get_first_and_last_archive_ts(archives_list): return archives -class Archives(abc.MutableMapping): +class Archives: """ - Nice wrapper around the archives dict, making sure only valid types/values get in - and we can deal with str keys (and it internally encodes to byte keys) and either - str timestamps or datetime timestamps. + Manage the list of archives. + + We still need to support the borg 1.x manifest-with-list-of-archives, + so borg transfer can work. + borg2 has separate items archives/* in the borgstore. 
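+
+ Which of the two code paths is used is decided in __init__: for borgstore based
+ repositories (Repository / RemoteRepository) each archive entry is stored as a
+ separate item under archives/, for legacy repositories the archives dict inside
+ the manifest is used (self.legacy is True then).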
""" - def __init__(self): + def __init__(self, repository, manifest): + from .repository import Repository + from .remote import RemoteRepository + + self.repository = repository + self.legacy = not isinstance(repository, (Repository, RemoteRepository)) # key: str archive name, value: dict('id': bytes_id, 'time': str_iso_ts) self._archives = {} + self.manifest = manifest + + def prepare(self, manifest, m): + if not self.legacy: + pass + else: + self._set_raw_dict(m.archives) + + def finish(self, manifest): + if not self.legacy: + manifest_archives = {} + else: + manifest_archives = StableDict(self._get_raw_dict()) + return manifest_archives + + def count(self): + # return the count of archives in the repo + if not self.legacy: + try: + infos = list(self.repository.store_list("archives")) + except ObjectNotFound: + infos = [] + return len(infos) # we do not check here if entries are valid + else: + return len(self._archives) - def __len__(self): - return len(self._archives) + def exists(self, name): + # check if an archive with this name exists + assert isinstance(name, str) + if not self.legacy: + return name in self.names() + else: + return name in self._archives - def __iter__(self): - return iter(self._archives) + def exists_name_and_id(self, name, id): + # check if an archive with this name AND id exists + assert isinstance(name, str) + assert isinstance(id, bytes) + if not self.legacy: + for _, archive_info in self._infos(): + if archive_info["name"] == name and archive_info["id"] == id: + return True + else: + return False + else: + raise NotImplementedError + + def _infos(self): + # yield the infos of all archives: (store_key, archive_info) + from .helpers import msgpack + + if not self.legacy: + try: + infos = list(self.repository.store_list("archives")) + except ObjectNotFound: + infos = [] + for info in infos: + info = ItemInfo(*info) # RPC does not give us a NamedTuple + value = self.repository.store_load(f"archives/{info.name}") + _, value = self.manifest.repo_objs.parse(hex_to_bin(info.name), value, ro_type=ROBJ_MANIFEST) + archive_info = msgpack.unpackb(value) + yield info.name, archive_info + else: + for name in self._archives: + archive_info = dict(name=name, id=self._archives[name]["id"], time=self._archives[name]["time"]) + yield None, archive_info - def __getitem__(self, name): + def _lookup_name(self, name, raw=False): assert isinstance(name, str) - values = self._archives.get(name) - if values is None: - raise KeyError - ts = parse_timestamp(values["time"]) - return ArchiveInfo(name=name, id=values["id"], ts=ts) + assert not self.legacy + for store_key, archive_info in self._infos(): + if archive_info["name"] == name: + if not raw: + ts = parse_timestamp(archive_info["time"]) + return store_key, ArchiveInfo(name=name, id=archive_info["id"], ts=ts) + else: + return store_key, archive_info + else: + raise KeyError(name) + + def names(self): + # yield the names of all archives + if not self.legacy: + for _, archive_info in self._infos(): + yield archive_info["name"] + else: + yield from self._archives - def __setitem__(self, name, info): + def get(self, name, raw=False): + assert isinstance(name, str) + if not self.legacy: + try: + store_key, archive_info = self._lookup_name(name, raw=raw) + return archive_info + except KeyError: + return None + else: + values = self._archives.get(name) + if values is None: + return None + if not raw: + ts = parse_timestamp(values["time"]) + return ArchiveInfo(name=name, id=values["id"], ts=ts) + else: + return dict(name=name, 
id=values["id"], time=values["time"]) + + def create(self, name, id, ts, *, overwrite=False): assert isinstance(name, str) - assert isinstance(info, tuple) - id, ts = info assert isinstance(id, bytes) if isinstance(ts, datetime): ts = ts.isoformat(timespec="microseconds") assert isinstance(ts, str) - self._archives[name] = {"id": id, "time": ts} + if not self.legacy: + try: + store_key, _ = self._lookup_name(name) + except KeyError: + pass + else: + # looks like we already have an archive list entry with that name + if not overwrite: + raise KeyError("archive already exists") + else: + self.repository.store_delete(f"archives/{store_key}") + archive = dict(name=name, id=id, time=ts) + value = self.manifest.key.pack_metadata(archive) + id = self.manifest.repo_objs.id_hash(value) + key = bin_to_hex(id) + value = self.manifest.repo_objs.format(id, {}, value, ro_type=ROBJ_MANIFEST) + self.repository.store_store(f"archives/{key}", value) + else: + if self.exists(name) and not overwrite: + raise KeyError("archive already exists") + self._archives[name] = {"id": id, "time": ts} - def __delitem__(self, name): + def delete(self, name): + # delete an archive assert isinstance(name, str) - del self._archives[name] + if not self.legacy: + store_key, archive_info = self._lookup_name(name) + self.repository.store_delete(f"archives/{store_key}") + else: + self._archives.pop(name) def list( self, *, - consider_checkpoints=True, match=None, match_end=r"\Z", sort_by=(), @@ -140,15 +262,13 @@ def list( if isinstance(sort_by, (str, bytes)): raise TypeError("sort_by must be a sequence of str") - archives = self.values() + archives = [self.get(name) for name in self.names()] regex = get_regex_from_pattern(match or "re:.*") regex = re.compile(regex + match_end) archives = [x for x in archives if regex.match(x.name) is not None] if any([oldest, newest, older, newer]): archives = filter_archives_by_date(archives, oldest=oldest, newest=newest, newer=newer, older=older) - if not consider_checkpoints: - archives = [x for x in archives if ".checkpoint" not in x.name] for sortkey in reversed(sort_by): archives.sort(key=attrgetter(sortkey)) if first: @@ -161,18 +281,15 @@ def list( def list_considering(self, args): """ - get a list of archives, considering --first/last/prefix/match-archives/sort/consider-checkpoints cmdline args + get a list of archives, considering --first/last/prefix/match-archives/sort cmdline args """ name = getattr(args, "name", None) - consider_checkpoints = getattr(args, "consider_checkpoints", None) if name is not None: raise Error( - "Giving a specific name is incompatible with options --first, --last, " - "-a / --match-archives, and --consider-checkpoints." + "Giving a specific name is incompatible with options --first, --last " "and -a / --match-archives." 
) return self.list( sort_by=args.sort_by.split(","), - consider_checkpoints=consider_checkpoints, match=args.match_archives, first=getattr(args, "first", None), last=getattr(args, "last", None), @@ -182,14 +299,14 @@ def list_considering(self, args): newest=getattr(args, "newest", None), ) - def set_raw_dict(self, d): + def _set_raw_dict(self, d): """set the dict we get from the msgpack unpacker""" for k, v in d.items(): assert isinstance(k, str) assert isinstance(v, dict) and "id" in v and "time" in v self._archives[k] = v - def get_raw_dict(self): + def _get_raw_dict(self): """get the dict we can give to the msgpack packer""" return self._archives @@ -226,7 +343,7 @@ class Operation(enum.Enum): MANIFEST_ID = b"\0" * 32 def __init__(self, key, repository, item_keys=None, ro_cls=RepoObj): - self.archives = Archives() + self.archives = Archives(repository, self) self.config = {} self.key = key self.repo_objs = ro_cls(key) @@ -246,12 +363,8 @@ def last_timestamp(self): def load(cls, repository, operations, key=None, *, ro_cls=RepoObj): from .item import ManifestItem from .crypto.key import key_factory - from .repository import Repository - try: - cdata = repository.get(cls.MANIFEST_ID) - except Repository.ObjectNotFound: - raise NoManifestError + cdata = repository.get_manifest() if not key: key = key_factory(repository, cdata, ro_cls=ro_cls) manifest = cls(key, repository, ro_cls=ro_cls) @@ -261,7 +374,7 @@ def load(cls, repository, operations, key=None, *, ro_cls=RepoObj): manifest.id = manifest.repo_objs.id_hash(data) if m.get("version") not in (1, 2): raise ValueError("Invalid manifest version") - manifest.archives.set_raw_dict(m.archives) + manifest.archives.prepare(manifest, m) manifest.timestamp = m.get("timestamp") manifest.config = m.config # valid item keys are whatever is known in the repo or every key we know @@ -308,16 +421,15 @@ def write(self): max_ts = max(incremented_ts, now_ts) self.timestamp = max_ts.isoformat(timespec="microseconds") # include checks for limits as enforced by limited unpacker (used by load()) - assert len(self.archives) <= MAX_ARCHIVES - assert all(len(name) <= 255 for name in self.archives) + assert self.archives.count() <= MAX_ARCHIVES + assert all(len(name) <= 255 for name in self.archives.names()) assert len(self.item_keys) <= 100 self.config["item_keys"] = tuple(sorted(self.item_keys)) + manifest_archives = self.archives.finish(self) manifest = ManifestItem( - version=2, - archives=StableDict(self.archives.get_raw_dict()), - timestamp=self.timestamp, - config=StableDict(self.config), + version=2, archives=manifest_archives, timestamp=self.timestamp, config=StableDict(self.config) ) data = self.key.pack_metadata(manifest.as_dict()) self.id = self.repo_objs.id_hash(data) - self.repository.put(self.MANIFEST_ID, self.repo_objs.format(self.MANIFEST_ID, {}, data, ro_type=ROBJ_MANIFEST)) + robj = self.repo_objs.format(self.MANIFEST_ID, {}, data, ro_type=ROBJ_MANIFEST) + self.repository.put_manifest(robj) diff --git a/src/borg/remote.py b/src/borg/remote.py index 924b36ad7a..3bd85fd90b 100644 --- a/src/borg/remote.py +++ b/src/borg/remote.py @@ -30,9 +30,11 @@ from .helpers import safe_unlink from .helpers import prepare_subprocess_env, ignore_sigint from .helpers import get_socket_filename -from .locking import LockTimeout, NotLocked, NotMyLock, LockFailed +from .fslocking import LockTimeout, NotLocked, NotMyLock, LockFailed from .logger import create_logger, borg_serve_log_queue +from .manifest import NoManifestError from .helpers import msgpack +from 
.legacyrepository import LegacyRepository from .repository import Repository from .version import parse_version, format_version from .checksums import xxh64 @@ -125,7 +127,7 @@ class ConnectionBrokenWithHint(Error): # For the client the return of the negotiate method is a dict which includes the server version. # # All method calls on the remote repository object must be allowlisted in RepositoryServer.rpc_methods and have api -# stubs in RemoteRepository. The @api decorator on these stubs is used to set server version requirements. +# stubs in RemoteRepository*. The @api decorator on these stubs is used to set server version requirements. # # Method parameters are identified only by name and never by position. Unknown parameters are ignored by the server. # If a new parameter is important and may not be ignored, on the client a parameter specific version requirement needs @@ -135,17 +137,14 @@ class ConnectionBrokenWithHint(Error): class RepositoryServer: # pragma: no cover - rpc_methods = ( + _legacy_rpc_methods = ( # LegacyRepository "__len__", "check", "commit", "delete", "destroy", - "flags", - "flags_many", "get", "list", - "scan", "negotiate", "open", "close", @@ -158,8 +157,34 @@ class RepositoryServer: # pragma: no cover "inject_exception", ) + _rpc_methods = ( # Repository + "__len__", + "check", + "delete", + "destroy", + "get", + "list", + "negotiate", + "open", + "close", + "info", + "put", + "save_key", + "load_key", + "break_lock", + "inject_exception", + "get_manifest", + "put_manifest", + "store_list", + "store_load", + "store_store", + "store_delete", + ) + def __init__(self, restrict_to_paths, restrict_to_repositories, append_only, storage_quota, use_socket): self.repository = None + self.RepoCls = None + self.rpc_methods = ("open", "close", "negotiate") self.restrict_to_paths = restrict_to_paths self.restrict_to_repositories = restrict_to_repositories # This flag is parsed from the serve command line via Archiver.do_serve, @@ -228,6 +253,7 @@ def inner_serve(): self.repository.close() raise UnexpectedRPCDataFormatFromClient(__version__) try: + # logger.debug(f"{type(self)} method: {type(self.repository)}.{method}") if method not in self.rpc_methods: raise InvalidRPCMethod(method) try: @@ -237,14 +263,15 @@ def inner_serve(): args = self.filter_args(f, args) res = f(**args) except BaseException as e: + # logger.exception(e) ex_short = traceback.format_exception_only(e.__class__, e) ex_full = traceback.format_exception(*sys.exc_info()) ex_trace = True if isinstance(e, Error): ex_short = [e.get_message()] ex_trace = e.traceback - if isinstance(e, (Repository.DoesNotExist, Repository.AlreadyExists, PathNotAllowed)): - # These exceptions are reconstructed on the client end in RemoteRepository.call_many(), + if isinstance(e, (self.RepoCls.DoesNotExist, self.RepoCls.AlreadyExists, PathNotAllowed)): + # These exceptions are reconstructed on the client end in RemoteRepository*.call_many(), # and will be handled just like locally raised exceptions. Suppress the remote traceback # for these, except ErrorWithTraceback, which should always display a traceback. 
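+ # (self.RepoCls is set in open(): Repository for new repos,
+ # LegacyRepository when the client requested v1_or_v2)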
pass @@ -341,8 +368,18 @@ def _resolve_path(self, path): return os.path.realpath(path) def open( - self, path, create=False, lock_wait=None, lock=True, exclusive=None, append_only=False, make_parent_dirs=False + self, + path, + create=False, + lock_wait=None, + lock=True, + exclusive=None, + append_only=False, + make_parent_dirs=False, + v1_or_v2=False, ): + self.RepoCls = LegacyRepository if v1_or_v2 else Repository + self.rpc_methods = self._legacy_rpc_methods if v1_or_v2 else self._rpc_methods logging.debug("Resolving repository path %r", path) path = self._resolve_path(path) logging.debug("Resolved repository path to %r", path) @@ -368,7 +405,7 @@ def open( # while "borg init --append-only" (=append_only) does, regardless of the --append-only (self.append_only) # flag for serve. append_only = (not create and self.append_only) or append_only - self.repository = Repository( + self.repository = self.RepoCls( path, create, lock_wait=lock_wait, @@ -393,17 +430,17 @@ def inject_exception(self, kind): s1 = "test string" s2 = "test string2" if kind == "DoesNotExist": - raise Repository.DoesNotExist(s1) + raise self.RepoCls.DoesNotExist(s1) elif kind == "AlreadyExists": - raise Repository.AlreadyExists(s1) + raise self.RepoCls.AlreadyExists(s1) elif kind == "CheckNeeded": - raise Repository.CheckNeeded(s1) + raise self.RepoCls.CheckNeeded(s1) elif kind == "IntegrityError": raise IntegrityError(s1) elif kind == "PathNotAllowed": raise PathNotAllowed("foo") elif kind == "ObjectNotFound": - raise Repository.ObjectNotFound(s1, s2) + raise self.RepoCls.ObjectNotFound(s1, s2) elif kind == "InvalidRPCMethod": raise InvalidRPCMethod(s1) elif kind == "divide": @@ -550,7 +587,7 @@ def __init__( location, create=False, exclusive=False, - lock_wait=None, + lock_wait=1.0, lock=True, append_only=False, make_parent_dirs=False, @@ -585,7 +622,6 @@ def __init__( borg_cmd = self.ssh_cmd(location) + borg_cmd logger.debug("SSH command line: %s", borg_cmd) # we do not want the ssh getting killed by Ctrl-C/SIGINT because it is needed for clean shutdown of borg. - # borg's SIGINT handler tries to write a checkpoint and requires the remote repo connection. self.p = Popen(borg_cmd, bufsize=0, stdin=PIPE, stdout=PIPE, stderr=PIPE, env=env, preexec_fn=ignore_sigint) self.stdin_fd = self.p.stdin.fileno() self.stdout_fd = self.p.stdout.fileno() @@ -654,7 +690,7 @@ def __del__(self): logging.debug("still %d cached responses left in RemoteRepository" % (len(self.responses),)) if self.p or self.sock: self.close() - assert False, "cleanup happened in Repository.__del__" + assert False, "cleanup happened in RemoteRepository.__del__" def __repr__(self): return f"<{self.__class__.__name__} {self.location.canonical_path()}>" @@ -666,11 +702,8 @@ def __exit__(self, exc_type, exc_val, exc_tb): try: if exc_type is not None: self.shutdown_time = time.monotonic() + 30 - self.rollback() finally: - # in any case, we want to close the repo cleanly, even if the - # rollback can not succeed (e.g. because the connection was - # already closed) and raised another exception: + # in any case, we want to close the repo cleanly. 
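+ # (the borgstore based repository does not use transactions, so there is
+ # no rollback here anymore - just close cleanly)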
logger.debug( "RemoteRepository: %s bytes sent, %s bytes received, %d messages sent", format_file_size(self.tx_bytes), @@ -805,6 +838,8 @@ def handle_error(unpacked): raise NotLocked(args[0]) elif error == "NotMyLock": raise NotMyLock(args[0]) + elif error == "NoManifestError": + raise NoManifestError else: raise self.RPCError(unpacked) @@ -939,9 +974,18 @@ def handle_error(unpacked): since=parse_version("1.0.0"), append_only={"since": parse_version("1.0.7"), "previously": False}, make_parent_dirs={"since": parse_version("1.1.9"), "previously": False}, + v1_or_v2={"since": parse_version("2.0.0b8"), "previously": True}, # TODO fix version ) def open( - self, path, create=False, lock_wait=None, lock=True, exclusive=False, append_only=False, make_parent_dirs=False + self, + path, + create=False, + lock_wait=None, + lock=True, + exclusive=False, + append_only=False, + make_parent_dirs=False, + v1_or_v2=False, ): """actual remoting is done via self.call in the @api decorator""" @@ -973,24 +1017,8 @@ def destroy(self): def __len__(self): """actual remoting is done via self.call in the @api decorator""" - @api( - since=parse_version("1.0.0"), - mask={"since": parse_version("2.0.0b2"), "previously": 0}, - value={"since": parse_version("2.0.0b2"), "previously": 0}, - ) - def list(self, limit=None, marker=None, mask=0, value=0): - """actual remoting is done via self.call in the @api decorator""" - - @api(since=parse_version("2.0.0b3")) - def scan(self, limit=None, state=None): - """actual remoting is done via self.call in the @api decorator""" - - @api(since=parse_version("2.0.0b2")) - def flags(self, id, mask=0xFFFFFFFF, value=None): - """actual remoting is done via self.call in the @api decorator""" - - @api(since=parse_version("2.0.0b2")) - def flags_many(self, ids, mask=0xFFFFFFFF, value=None): + @api(since=parse_version("1.0.0")) + def list(self, limit=None, marker=None): """actual remoting is done via self.call in the @api decorator""" def get(self, id, read_data=True): @@ -1044,6 +1072,30 @@ def async_response(self, wait=True): def preload(self, ids): self.preload_ids += ids + @api(since=parse_version("2.0.0b8")) + def get_manifest(self): + """actual remoting is done via self.call in the @api decorator""" + + @api(since=parse_version("2.0.0b8")) + def put_manifest(self, data): + """actual remoting is done via self.call in the @api decorator""" + + @api(since=parse_version("2.0.0b8")) + def store_list(self, name): + """actual remoting is done via self.call in the @api decorator""" + + @api(since=parse_version("2.0.0b8")) + def store_load(self, name): + """actual remoting is done via self.call in the @api decorator""" + + @api(since=parse_version("2.0.0b8")) + def store_store(self, name, value): + """actual remoting is done via self.call in the @api decorator""" + + @api(since=parse_version("2.0.0b8")) + def store_delete(self, name): + """actual remoting is done via self.call in the @api decorator""" + class RepositoryNoCache: """A not caching Repository wrapper, passes through to repository. 
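The @api(since=...) stubs added above gate each call on the server version negotiated earlier. A rough, simplified sketch of how such a version-gated stub could look; the version handling and call path here are assumptions, not borg's implementation:

from functools import wraps

def api(*, since):
    """Reject stub calls that the connected server is too old to understand."""
    def decorator(func):
        @wraps(func)
        def wrapper(self, **kwargs):
            if self.server_version < since:
                raise RuntimeError(f"server too old for {func.__name__}(), needs >= {since}")
            return self.call(func.__name__, kwargs)
        return wrapper
    return decorator

class ToyRemote:
    server_version = (2, 0, 0)          # pretend result of negotiate()

    def call(self, name, kwargs):       # stand-in for the real RPC machinery
        return f"rpc: {name}({kwargs})"

    @api(since=(2, 0, 0))
    def store_list(self, name):
        """actual remoting would be done via self.call in the @api decorator"""

print(ToyRemote().store_list(name="config"))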
diff --git a/src/borg/repoobj.py b/src/borg/repoobj.py index 3fb2534ad4..64c054325f 100644 --- a/src/borg/repoobj.py +++ b/src/borg/repoobj.py @@ -1,6 +1,8 @@ +from collections import namedtuple from struct import Struct from .constants import * # NOQA +from .checksums import xxh64 from .helpers import msgpack, workarounds from .helpers.errors import IntegrityError from .compress import Compressor, LZ4_COMPRESSOR, get_compressor @@ -10,14 +12,17 @@ class RepoObj: - meta_len_hdr = Struct(" bytes: # used for crypto type detection - offs = cls.meta_len_hdr.size - meta_len = cls.meta_len_hdr.unpack(data[:offs])[0] - return data[offs + meta_len :] + hdr_size = cls.obj_header.size + hdr = cls.ObjHeader(*cls.obj_header.unpack(data[:hdr_size])) + return data[hdr_size + hdr.meta_size :] def __init__(self, key): self.key = key @@ -61,8 +66,9 @@ def format( data_encrypted = self.key.encrypt(id, data_compressed) meta_packed = msgpack.packb(meta) meta_encrypted = self.key.encrypt(id, meta_packed) - hdr = self.meta_len_hdr.pack(len(meta_encrypted)) - return hdr + meta_encrypted + data_encrypted + hdr = self.ObjHeader(len(meta_encrypted), len(data_encrypted), xxh64(meta_encrypted), xxh64(data_encrypted)) + hdr_packed = self.obj_header.pack(*hdr) + return hdr_packed + meta_encrypted + data_encrypted def parse_meta(self, id: bytes, cdata: bytes, ro_type: str) -> dict: # when calling parse_meta, enough cdata needs to be supplied to contain completely the @@ -71,11 +77,10 @@ def parse_meta(self, id: bytes, cdata: bytes, ro_type: str) -> dict: assert isinstance(cdata, bytes) assert isinstance(ro_type, str) obj = memoryview(cdata) - offs = self.meta_len_hdr.size - hdr = obj[:offs] - len_meta_encrypted = self.meta_len_hdr.unpack(hdr)[0] - assert offs + len_meta_encrypted <= len(obj) - meta_encrypted = obj[offs : offs + len_meta_encrypted] + hdr_size = self.obj_header.size + hdr = self.ObjHeader(*self.obj_header.unpack(obj[:hdr_size])) + assert hdr_size + hdr.meta_size <= len(obj) + meta_encrypted = obj[hdr_size : hdr_size + hdr.meta_size] meta_packed = self.key.decrypt(id, meta_encrypted) meta = msgpack.unpackb(meta_packed) if ro_type != ROBJ_DONTCARE and meta["type"] != ro_type: @@ -100,17 +105,16 @@ def parse( assert isinstance(id, bytes) assert isinstance(cdata, bytes) obj = memoryview(cdata) - offs = self.meta_len_hdr.size - hdr = obj[:offs] - len_meta_encrypted = self.meta_len_hdr.unpack(hdr)[0] - assert offs + len_meta_encrypted <= len(obj) - meta_encrypted = obj[offs : offs + len_meta_encrypted] - offs += len_meta_encrypted + hdr_size = self.obj_header.size + hdr = self.ObjHeader(*self.obj_header.unpack(obj[:hdr_size])) + assert hdr_size + hdr.meta_size <= len(obj) + meta_encrypted = obj[hdr_size : hdr_size + hdr.meta_size] meta_packed = self.key.decrypt(id, meta_encrypted) meta_compressed = msgpack.unpackb(meta_packed) # means: before adding more metadata in decompress block if ro_type != ROBJ_DONTCARE and meta_compressed["type"] != ro_type: raise IntegrityError(f"ro_type expected: {ro_type} got: {meta_compressed['type']}") - data_encrypted = obj[offs:] + assert hdr_size + hdr.meta_size + hdr.data_size <= len(obj) + data_encrypted = obj[hdr_size + hdr.meta_size : hdr_size + hdr.meta_size + hdr.data_size] data_compressed = self.key.decrypt(id, data_encrypted) # does not include the type/level bytes if decompress: ctype = meta_compressed["ctype"] diff --git a/src/borg/repository.py b/src/borg/repository.py index 079fbc21ef..23c3b15096 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ 
-1,137 +1,25 @@ -import errno -import mmap import os -import shutil -import stat -import struct import time -from collections import defaultdict -from configparser import ConfigParser -from datetime import datetime, timezone -from functools import partial -from itertools import islice -from typing import Callable, DefaultDict +from borgstore.store import Store +from borgstore.store import ObjectNotFound as StoreObjectNotFound +from borgstore.backends.errors import BackendDoesNotExist as StoreBackendDoesNotExist + +from .checksums import xxh64 from .constants import * # NOQA -from .hashindex import NSIndexEntry, NSIndex, NSIndex1, hashindex_variant -from .helpers import Error, ErrorWithTraceback, IntegrityError, format_file_size, parse_file_size +from .helpers import Error, ErrorWithTraceback, IntegrityError from .helpers import Location -from .helpers import ProgressIndicatorPercent from .helpers import bin_to_hex, hex_to_bin -from .helpers import secure_erase, safe_unlink -from .helpers import msgpack -from .helpers.lrucache import LRUCache -from .locking import Lock, LockError, LockErrorT +from .storelocking import Lock from .logger import create_logger -from .manifest import Manifest -from .platform import SaveFile, SyncFile, sync_dir, safe_fadvise +from .manifest import NoManifestError from .repoobj import RepoObj -from .checksums import crc32, StreamingXXH64 -from .crypto.file_integrity import IntegrityCheckedFile, FileIntegrityError logger = create_logger(__name__) -MAGIC = b"BORG_SEG" -MAGIC_LEN = len(MAGIC) - -TAG_PUT = 0 -TAG_DELETE = 1 -TAG_COMMIT = 2 -TAG_PUT2 = 3 - -# Highest ID usable as TAG_* value -# -# Code may expect not to find any tags exceeding this value. In particular, -# in order to speed up `borg check --repair`, any tag greater than MAX_TAG_ID -# is assumed to be corrupted. When increasing this value, in order to add more -# tags, keep in mind that old versions of Borg accessing a new repository -# may not be able to handle the new tags. -MAX_TAG_ID = 15 - -FreeSpace: Callable[[], DefaultDict] = partial(defaultdict, int) - - -def header_size(tag): - if tag == TAG_PUT2: - size = LoggedIO.HEADER_ID_SIZE + LoggedIO.ENTRY_HASH_SIZE - elif tag == TAG_PUT or tag == TAG_DELETE: - size = LoggedIO.HEADER_ID_SIZE - elif tag == TAG_COMMIT: - size = LoggedIO.header_fmt.size - else: - raise ValueError(f"unsupported tag: {tag!r}") - return size - class Repository: - """ - Filesystem based transactional key value store - - Transactionality is achieved by using a log (aka journal) to record changes. The log is a series of numbered files - called segments. Each segment is a series of log entries. The segment number together with the offset of each - entry relative to its segment start establishes an ordering of the log entries. This is the "definition" of - time for the purposes of the log. - - Log entries are either PUT, DELETE or COMMIT. - - A COMMIT is always the final log entry in a segment and marks all data from the beginning of the log until the - segment ending with the COMMIT as committed and consistent. The segment number of a segment ending with a COMMIT - is called the transaction ID of that commit, and a segment ending with a COMMIT is called committed. - - When reading from a repository it is first checked whether the last segment is committed. If it is not, then - all segments after the last committed segment are deleted; they contain log entries whose consistency is not - established by a COMMIT. 
- - Note that the COMMIT can't establish consistency by itself, but only manages to do so with proper support from - the platform (including the hardware). See platform.base.SyncFile for details. - - A PUT inserts a key-value pair. The value is stored in the log entry, hence the repository implements - full data logging, meaning that all data is consistent, not just metadata (which is common in file systems). - - A DELETE marks a key as deleted. - - For a given key only the last entry regarding the key, which is called current (all other entries are called - superseded), is relevant: If there is no entry or the last entry is a DELETE then the key does not exist. - Otherwise the last PUT defines the value of the key. - - By superseding a PUT (with either another PUT or a DELETE) the log entry becomes obsolete. A segment containing - such obsolete entries is called sparse, while a segment containing no such entries is called compact. - - Sparse segments can be compacted and thereby disk space freed. This destroys the transaction for which the - superseded entries where current. - - On disk layout: - - dir/README - dir/config - dir/data// - dir/index.X - dir/hints.X - - File system interaction - ----------------------- - - LoggedIO generally tries to rely on common behaviours across transactional file systems. - - Segments that are deleted are truncated first, which avoids problems if the FS needs to - allocate space to delete the dirent of the segment. This mostly affects CoW file systems, - traditional journaling file systems have a fairly good grip on this problem. - - Note that deletion, i.e. unlink(2), is atomic on every file system that uses inode reference - counts, which includes pretty much all of them. To remove a dirent the inodes refcount has - to be decreased, but you can't decrease the refcount before removing the dirent nor can you - decrease the refcount after removing the dirent. File systems solve this with a lock, - and by ensuring it all stays within the same FS transaction. - - Truncation is generally not atomic in itself, and combining truncate(2) and unlink(2) is of - course never guaranteed to be atomic. Truncation in a classic extent-based FS is done in - roughly two phases, first the extents are removed then the inode is updated. (In practice - this is of course way more complex). - - LoggedIO gracefully handles truncate/unlink splits as long as the truncate resulted in - a zero length file. Zero length segments are considered not to exist, while LoggedIO.cleanup() - will still get rid of them. 
- """ + """borgstore based key value store""" class AlreadyExists(Error): """A repository already exists at {}.""" @@ -195,1132 +83,309 @@ class PathPermissionDenied(Error): def __init__( self, - path, + path_or_location, create=False, exclusive=False, - lock_wait=None, + lock_wait=1.0, lock=True, append_only=False, storage_quota=None, make_parent_dirs=False, send_log_cb=None, ): - self.path = os.path.abspath(path) - self._location = Location("file://%s" % self.path) + if isinstance(path_or_location, Location): + location = path_or_location + if location.proto == "file": + url = f"file://{location.path}" # frequently users give without file:// prefix + else: + url = location.processed # location as given by user, processed placeholders + else: + url = "file://%s" % os.path.abspath(path_or_location) + location = Location(url) + self._location = location + # use a Store with flat config storage and 2-levels-nested data storage + self.store = Store(url, levels={"config/": [0], "data/": [2]}) self.version = None # long-running repository methods which emit log or progress output are responsible for calling # the ._send_log method periodically to get log and progress output transferred to the borg client # in a timely manner, in case we have a RemoteRepository. # for local repositories ._send_log can be called also (it will just do nothing in that case). self._send_log = send_log_cb or (lambda: None) - self.io = None # type: LoggedIO - self.lock = None - self.index = None - # This is an index of shadowed log entries during this transaction. Consider the following sequence: - # segment_n PUT A, segment_x DELETE A - # After the "DELETE A" in segment_x the shadow index will contain "A -> [n]". - # .delete() is updating this index, it is persisted into "hints" file and is later used by .compact_segments(). - # We need the entries in the shadow_index to not accidentally drop the "DELETE A" when we compact segment_x - # only (and we do not compact segment_n), because DELETE A is still needed then because PUT A will be still - # there. Otherwise chunk A would reappear although it was previously deleted. - self.shadow_index = {} - self._active_txn = False - self.lock_wait = lock_wait - self.do_lock = lock self.do_create = create self.created = False + self.acceptable_repo_versions = (3,) + self.opened = False + self.append_only = append_only # XXX not implemented / not implementable + self.storage_quota = storage_quota # XXX not implemented + self.storage_quota_use = 0 # XXX not implemented + self.lock = None + self.do_lock = lock + self.lock_wait = lock_wait self.exclusive = exclusive - self.append_only = append_only - self.storage_quota = storage_quota - self.storage_quota_use = 0 - self.transaction_doomed = None - self.make_parent_dirs = make_parent_dirs - # v2 is the default repo version for borg 2.0 - # v1 repos must only be used in a read-only way, e.g. for - # --other-repo=V1_REPO with borg init and borg transfer! 
- self.acceptable_repo_versions = (1, 2) - - def __del__(self): - if self.lock: - self.close() - assert False, "cleanup happened in Repository.__del__" def __repr__(self): - return f"<{self.__class__.__name__} {self.path}>" + return f"<{self.__class__.__name__} {self._location}>" def __enter__(self): if self.do_create: self.do_create = False - self.create(self.path) + self.create() self.created = True - self.open(self.path, bool(self.exclusive), lock_wait=self.lock_wait, lock=self.do_lock) + self.open(exclusive=bool(self.exclusive), lock_wait=self.lock_wait, lock=self.do_lock) return self def __exit__(self, exc_type, exc_val, exc_tb): - if exc_type is not None: - no_space_left_on_device = exc_type is OSError and exc_val.errno == errno.ENOSPC - # The ENOSPC could have originated somewhere else besides the Repository. The cleanup is always safe, unless - # EIO or FS corruption ensues, which is why we specifically check for ENOSPC. - if self._active_txn and no_space_left_on_device: - logger.warning("No space left on device, cleaning up partial transaction to free space.") - cleanup = True - else: - cleanup = False - self._rollback(cleanup=cleanup) self.close() @property def id_str(self): return bin_to_hex(self.id) - @staticmethod - def is_repository(path): - """Check whether there is already a Borg repository at *path*.""" - try: - # Use binary mode to avoid troubles if a README contains some stuff not in our locale - with open(os.path.join(path, "README"), "rb") as fd: - # Read only the first ~100 bytes (if any), in case some README file we stumble upon is large. - readme_head = fd.read(100) - # The first comparison captures our current variant (REPOSITORY_README), the second comparison - # is an older variant of the README file (used by 1.0.x). - return b"Borg Backup repository" in readme_head or b"Borg repository" in readme_head - except OSError: - # Ignore FileNotFound, PermissionError, ... - return False - - def check_can_create_repository(self, path): - """ - Raise an exception if a repository already exists at *path* or any parent directory. - - Checking parent directories is done for two reasons: - (1) It's just a weird thing to do, and usually not intended. A Borg using the "parent" repository - may be confused, or we may accidentally put stuff into the "data/" or "data//" directories. - (2) When implementing repository quotas (which we currently don't), it's important to prohibit - folks from creating quota-free repositories. Since no one can create a repository within another - repository, user's can only use the quota'd repository, when their --restrict-to-path points - at the user's repository. - """ - try: - st = os.stat(path) - except FileNotFoundError: - pass # nothing there! - except PermissionError: - raise self.PathPermissionDenied(path) from None - else: - # there is something already there! - if self.is_repository(path): - raise self.AlreadyExists(path) - if not stat.S_ISDIR(st.st_mode): - raise self.PathAlreadyExists(path) - try: - files = os.listdir(path) - except PermissionError: - raise self.PathPermissionDenied(path) from None - else: - if files: # a dir, but not empty - raise self.PathAlreadyExists(path) - else: # an empty directory is acceptable for us. - pass - - while True: - # Check all parent directories for Borg's repository README - previous_path = path - # Thus, path = previous_path/.. - path = os.path.abspath(os.path.join(previous_path, os.pardir)) - if path == previous_path: - # We reached the root of the directory hierarchy (/.. = / and C:\.. = C:\). 
- break - if self.is_repository(path): - raise self.AlreadyExists(path) - - def create(self, path): - """Create a new empty repository at `path`""" - self.check_can_create_repository(path) - if self.make_parent_dirs: - parent_path = os.path.join(path, os.pardir) - os.makedirs(parent_path, exist_ok=True) - if not os.path.exists(path): - try: - os.mkdir(path) - except FileNotFoundError as err: - raise self.ParentPathDoesNotExist(path) from err - with open(os.path.join(path, "README"), "w") as fd: - fd.write(REPOSITORY_README) - os.mkdir(os.path.join(path, "data")) - config = ConfigParser(interpolation=None) - config.add_section("repository") - self.version = 2 - config.set("repository", "version", str(self.version)) - config.set("repository", "segments_per_dir", str(DEFAULT_SEGMENTS_PER_DIR)) - config.set("repository", "max_segment_size", str(DEFAULT_MAX_SEGMENT_SIZE)) - config.set("repository", "append_only", str(int(self.append_only))) - if self.storage_quota: - config.set("repository", "storage_quota", str(self.storage_quota)) - else: - config.set("repository", "storage_quota", "0") - config.set("repository", "additional_free_space", "0") - config.set("repository", "id", bin_to_hex(os.urandom(32))) - self.save_config(path, config) - - def save_config(self, path, config): - config_path = os.path.join(path, "config") - old_config_path = os.path.join(path, "config.old") - - if os.path.isfile(old_config_path): - logger.warning("Old config file not securely erased on previous config update") - secure_erase(old_config_path, avoid_collateral_damage=True) - - if os.path.isfile(config_path): - link_error_msg = ( - "Failed to erase old repository config file securely (hardlinks not supported). " - "Old repokey data, if any, might persist on physical storage." - ) - try: - os.link(config_path, old_config_path) - except OSError as e: - if e.errno in (errno.EMLINK, errno.ENOSYS, errno.EPERM, errno.EACCES, errno.ENOTSUP, errno.EIO): - logger.warning(link_error_msg) - else: - raise - except AttributeError: - # some python ports have no os.link, see #4901 - logger.warning(link_error_msg) - - try: - with SaveFile(config_path) as fd: - config.write(fd) - except PermissionError as e: - # error is only a problem if we even had a lock - if self.do_lock: - raise - logger.warning( - "%s: Failed writing to '%s'. This is expected when working on " - "read-only repositories." 
% (e.strerror, e.filename) - ) - - if os.path.isfile(old_config_path): - secure_erase(old_config_path, avoid_collateral_damage=True) + def create(self): + """Create a new empty repository""" + self.store.create() + self.store.open() + self.store.store("config/readme", REPOSITORY_README.encode()) + self.version = 3 + self.store.store("config/version", str(self.version).encode()) + self.store.store("config/id", bin_to_hex(os.urandom(32)).encode()) + self.store.close() + + def _set_id(self, id): + # for testing: change the id of an existing repository + assert self.opened + assert isinstance(id, bytes) and len(id) == 32 + self.id = id + self.store.store("config/id", bin_to_hex(id).encode()) + + def _lock_refresh(self): + if self.lock is not None: + self.lock.refresh() def save_key(self, keydata): - assert self.config - keydata = keydata.decode("utf-8") # remote repo: msgpack issue #99, getting bytes - # note: saving an empty key means that there is no repokey any more - self.config.set("repository", "key", keydata) - self.save_config(self.path, self.config) + # note: saving an empty key means that there is no repokey anymore + self.store.store("keys/repokey", keydata) def load_key(self): - keydata = self.config.get("repository", "key", fallback="").strip() + keydata = self.store.load("keys/repokey") # note: if we return an empty string, it means there is no repo key - return keydata.encode("utf-8") # remote repo: msgpack issue #99, returning bytes + return keydata def destroy(self): - """Destroy the repository at `self.path`""" - if self.append_only: - raise ValueError(self.path + " is in append-only mode") + """Destroy the repository""" self.close() - os.remove(os.path.join(self.path, "config")) # kill config first - shutil.rmtree(self.path) - - def get_index_transaction_id(self): - indices = sorted( - int(fn[6:]) - for fn in os.listdir(self.path) - if fn.startswith("index.") and fn[6:].isdigit() and os.stat(os.path.join(self.path, fn)).st_size != 0 - ) - if indices: - return indices[-1] - else: - return None - - def check_transaction(self): - index_transaction_id = self.get_index_transaction_id() - segments_transaction_id = self.io.get_segments_transaction_id() - if index_transaction_id is not None and segments_transaction_id is None: - # we have a transaction id from the index, but we did not find *any* - # commit in the segment files (thus no segments transaction id). - # this can happen if a lot of segment files are lost, e.g. due to a - # filesystem or hardware malfunction. it means we have no identifiable - # valid (committed) state of the repo which we could use. - msg = '%s" - although likely this is "beyond repair' % self.path # dirty hack - raise self.CheckNeeded(msg) - # Attempt to rebuild index automatically if we crashed between commit - # tag write and index save. 
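The new create() above reduces repository initialization to writing three flat keys below config/. A toy, dict-backed stand-in (not borgstore; the README text is a placeholder) showing the resulting layout:

import os
from binascii import hexlify

REPOSITORY_README = "This is a Borg Backup repository.\n"  # placeholder text

class DictStore:
    """Tiny stand-in for a borgstore Store, just enough for this sketch."""
    def __init__(self):
        self.objects = {}
    def store(self, name, value: bytes):
        self.objects[name] = value
    def load(self, name) -> bytes:
        return self.objects[name]

def create(store):
    store.store("config/readme", REPOSITORY_README.encode())
    store.store("config/version", b"3")
    store.store("config/id", hexlify(os.urandom(32)))

store = DictStore()
create(store)
print(sorted(store.objects))   # ['config/id', 'config/readme', 'config/version']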
- if index_transaction_id != segments_transaction_id: - if index_transaction_id is not None and index_transaction_id > segments_transaction_id: - replay_from = None - else: - replay_from = index_transaction_id - self.replay_segments(replay_from, segments_transaction_id) - - def get_transaction_id(self): - self.check_transaction() - return self.get_index_transaction_id() - - def break_lock(self): - Lock(os.path.join(self.path, "lock")).break_lock() + self.store.destroy() - def migrate_lock(self, old_id, new_id): - # note: only needed for local repos - if self.lock is not None: - self.lock.migrate_lock(old_id, new_id) - - def open(self, path, exclusive, lock_wait=None, lock=True): - self.path = path + def open(self, *, exclusive, lock_wait=None, lock=True): + assert lock_wait is not None try: - st = os.stat(path) - except FileNotFoundError: - raise self.DoesNotExist(path) - if not stat.S_ISDIR(st.st_mode): - raise self.InvalidRepository(path) + self.store.open() + except StoreBackendDoesNotExist: + raise self.DoesNotExist(str(self._location)) from None if lock: - self.lock = Lock(os.path.join(path, "lock"), exclusive, timeout=lock_wait).acquire() + self.lock = Lock(self.store, exclusive, timeout=lock_wait).acquire() else: self.lock = None - self.config = ConfigParser(interpolation=None) - try: - with open(os.path.join(self.path, "config")) as fd: - self.config.read_file(fd) - except FileNotFoundError: - self.close() - raise self.InvalidRepository(self.path) - if "repository" not in self.config.sections(): - self.close() - raise self.InvalidRepositoryConfig(path, "no repository section found") - self.version = self.config.getint("repository", "version") + readme = self.store.load("config/readme").decode() + if readme != REPOSITORY_README: + raise self.InvalidRepository(str(self._location)) + self.version = int(self.store.load("config/version").decode()) if self.version not in self.acceptable_repo_versions: self.close() raise self.InvalidRepositoryConfig( - path, "repository version %d is not supported by this borg version" % self.version + str(self._location), "repository version %d is not supported by this borg version" % self.version ) - self.max_segment_size = parse_file_size(self.config.get("repository", "max_segment_size")) - if self.max_segment_size >= MAX_SEGMENT_SIZE_LIMIT: - self.close() - raise self.InvalidRepositoryConfig(path, "max_segment_size >= %d" % MAX_SEGMENT_SIZE_LIMIT) # issue 3592 - self.segments_per_dir = self.config.getint("repository", "segments_per_dir") - self.additional_free_space = parse_file_size(self.config.get("repository", "additional_free_space", fallback=0)) - # append_only can be set in the constructor - # it shouldn't be overridden (True -> False) here - self.append_only = self.append_only or self.config.getboolean("repository", "append_only", fallback=False) - if self.storage_quota is None: - # self.storage_quota is None => no explicit storage_quota was specified, use repository setting. 
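The storelocking.Lock acquired above has to be refreshed by long-running operations (see _lock_refresh()), otherwise other clients may treat it as stale. A toy illustration of that stale-unless-refreshed idea; the timings and API are made up and this is not borgstore's locking:

import time

class ToyLock:
    """A lock that other clients may break once it has not been refreshed for a while."""
    STALE_AFTER = 30.0  # seconds; purely illustrative

    def __init__(self):
        self.last_refresh = None

    def acquire(self):
        self.last_refresh = time.monotonic()
        return self

    def refresh(self):
        self.last_refresh = time.monotonic()

    def is_stale(self):
        return self.last_refresh is None or time.monotonic() - self.last_refresh > self.STALE_AFTER

lock = ToyLock().acquire()
# a long-running operation calls lock.refresh() periodically,
# otherwise other clients may consider the lock stale and remove it.
lock.refresh()
print(lock.is_stale())  # False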
- self.storage_quota = parse_file_size(self.config.get("repository", "storage_quota", fallback=0)) - self.id = hex_to_bin(self.config.get("repository", "id").strip(), length=32) - self.io = LoggedIO(self.path, self.max_segment_size, self.segments_per_dir) - - def _load_hints(self): - if (transaction_id := self.get_transaction_id()) is None: - # self is a fresh repo, so transaction_id is None and there is no hints file - return - hints = self._unpack_hints(transaction_id) - self.version = hints["version"] - self.storage_quota_use = hints["storage_quota_use"] - self.shadow_index = hints["shadow_index"] + self.id = hex_to_bin(self.store.load("config/id").decode(), length=32) + self.opened = True + + def close(self): + if self.opened: + if self.lock: + self.lock.release() + self.lock = None + self.store.close() + self.opened = False def info(self): """return some infos about the repo (must be opened first)""" - info = dict(id=self.id, version=self.version, append_only=self.append_only) - self._load_hints() - info["storage_quota"] = self.storage_quota - info["storage_quota_use"] = self.storage_quota_use + # note: don't do anything expensive here or separate the lock refresh into a separate method. + self._lock_refresh() # do not remove, see do_with_lock() + info = dict( + id=self.id, + version=self.version, + storage_quota_use=self.storage_quota_use, + storage_quota=self.storage_quota, + append_only=self.append_only, + ) return info - def close(self): - if self.lock: - if self.io: - self.io.close() - self.io = None - self.lock.release() - self.lock = None + def check(self, repair=False, max_duration=0): + """Check repository consistency""" + + def log_error(msg): + nonlocal obj_corrupted + obj_corrupted = True + logger.error(f"Repo object {info.name} is corrupted: {msg}") + + def check_object(obj): + """Check if obj looks valid.""" + hdr_size = RepoObj.obj_header.size + obj_size = len(obj) + if obj_size >= hdr_size: + hdr = RepoObj.ObjHeader(*RepoObj.obj_header.unpack(obj[:hdr_size])) + meta = obj[hdr_size : hdr_size + hdr.meta_size] + if hdr.meta_size != len(meta): + log_error("metadata size incorrect.") + elif hdr.meta_hash != xxh64(meta): + log_error("metadata does not match checksum.") + data = obj[hdr_size + hdr.meta_size : hdr_size + hdr.meta_size + hdr.data_size] + if hdr.data_size != len(data): + log_error("data size incorrect.") + elif hdr.data_hash != xxh64(data): + log_error("data does not match checksum.") + else: + log_error("too small.") - def commit(self, compact=True, threshold=0.1): - """Commit transaction""" - if self.transaction_doomed: - exception = self.transaction_doomed - self.rollback() - raise exception - self.check_free_space() - segment = self.io.write_commit() - self.segments.setdefault(segment, 0) - self.compact[segment] += LoggedIO.header_fmt.size - if compact and not self.append_only: - self.compact_segments(threshold) - self.write_index() - self.rollback() - - def _read_integrity(self, transaction_id, key): - integrity_file = "integrity.%d" % transaction_id - integrity_path = os.path.join(self.path, integrity_file) - try: - with open(integrity_path, "rb") as fd: - integrity = msgpack.unpack(fd) - except FileNotFoundError: - return - if integrity.get("version") != 2: - logger.warning("Unknown integrity data version %r in %s", integrity.get("version"), integrity_file) - return - return integrity[key] - - def open_index(self, transaction_id, auto_recover=True): - if transaction_id is None: - return NSIndex() - index_path = os.path.join(self.path, "index.%d" % 
transaction_id) - variant = hashindex_variant(index_path) - integrity_data = self._read_integrity(transaction_id, "index") - try: - with IntegrityCheckedFile(index_path, write=False, integrity_data=integrity_data) as fd: - if variant == 2: - return NSIndex.read(fd) - if variant == 1: # legacy - return NSIndex1.read(fd) - except (ValueError, OSError, FileIntegrityError) as exc: - logger.warning("Repository index missing or corrupted, trying to recover from: %s", exc) - os.unlink(index_path) - if not auto_recover: - raise - self.prepare_txn(self.get_transaction_id()) - # don't leave an open transaction around - self.commit(compact=False) - return self.open_index(self.get_transaction_id()) - - def _unpack_hints(self, transaction_id): - hints_path = os.path.join(self.path, "hints.%d" % transaction_id) - integrity_data = self._read_integrity(transaction_id, "hints") - with IntegrityCheckedFile(hints_path, write=False, integrity_data=integrity_data) as fd: - return msgpack.unpack(fd) - - def prepare_txn(self, transaction_id, do_cleanup=True): - self._active_txn = True - if self.do_lock and not self.lock.got_exclusive_lock(): - if self.exclusive is not None: - # self.exclusive is either True or False, thus a new client is active here. - # if it is False and we get here, the caller did not use exclusive=True although - # it is needed for a write operation. if it is True and we get here, something else - # went very wrong, because we should have an exclusive lock, but we don't. - raise AssertionError("bug in code, exclusive lock should exist here") - # if we are here, this is an old client talking to a new server (expecting lock upgrade). - # or we are replaying segments and might need a lock upgrade for that. - try: - self.lock.upgrade() - except (LockError, LockErrorT): - # if upgrading the lock to exclusive fails, we do not have an - # active transaction. this is important for "serve" mode, where - # the repository instance lives on - even if exceptions happened. - self._active_txn = False - raise - if not self.index or transaction_id is None: + # TODO: progress indicator, ... 
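check_object() above validates the per-object header that repoobj.py now writes: meta/data sizes plus xxh64 digests of the encrypted metadata and data. A self-contained sketch of that round trip, assuming a hypothetical <II8s8s header layout and using the third-party xxhash package in place of borg.checksums.xxh64:

from collections import namedtuple
from struct import Struct
import xxhash  # stand-in for borg.checksums.xxh64

# Assumed layout: two 32-bit sizes followed by two 8-byte xxh64 digests.
obj_header = Struct("<II8s8s")
ObjHeader = namedtuple("ObjHeader", "meta_size data_size meta_hash data_hash")

def xxh64(data: bytes) -> bytes:
    return xxhash.xxh64(data).digest()

def pack_obj(meta: bytes, data: bytes) -> bytes:
    hdr = ObjHeader(len(meta), len(data), xxh64(meta), xxh64(data))
    return obj_header.pack(*hdr) + meta + data

def check_obj(obj: bytes) -> bool:
    hdr_size = obj_header.size
    if len(obj) < hdr_size:
        return False  # too small
    hdr = ObjHeader(*obj_header.unpack(obj[:hdr_size]))
    meta = obj[hdr_size : hdr_size + hdr.meta_size]
    data = obj[hdr_size + hdr.meta_size : hdr_size + hdr.meta_size + hdr.data_size]
    return (len(meta) == hdr.meta_size and xxh64(meta) == hdr.meta_hash
            and len(data) == hdr.data_size and xxh64(data) == hdr.data_hash)

obj = pack_obj(b"encrypted-meta", b"encrypted-data")
print(check_obj(obj), check_obj(obj[:-1] + b"X"))  # True False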
+ partial = bool(max_duration) + assert not (repair and partial) + mode = "partial" if partial else "full" + logger.info(f"Starting {mode} repository check") + if partial: + # continue a past partial check (if any) or from a checkpoint or start one from beginning try: - self.index = self.open_index(transaction_id, auto_recover=False) - except (ValueError, OSError, FileIntegrityError) as exc: - logger.warning("Checking repository transaction due to previous error: %s", exc) - self.check_transaction() - self.index = self.open_index(transaction_id, auto_recover=False) - if transaction_id is None: - self.segments = {} # XXX bad name: usage_count_of_segment_x = self.segments[x] - self.compact = FreeSpace() # XXX bad name: freeable_space_of_segment_x = self.compact[x] - self.storage_quota_use = 0 - self.shadow_index.clear() + last_key_checked = self.store.load("config/last-key-checked").decode() + except StoreObjectNotFound: + last_key_checked = "" else: - if do_cleanup: - self.io.cleanup(transaction_id) - hints_path = os.path.join(self.path, "hints.%d" % transaction_id) - index_path = os.path.join(self.path, "index.%d" % transaction_id) + # start from the beginning and also forget about any potential past partial checks + last_key_checked = "" try: - hints = self._unpack_hints(transaction_id) - except (msgpack.UnpackException, FileNotFoundError, FileIntegrityError) as e: - logger.warning("Repository hints file missing or corrupted, trying to recover: %s", e) - if not isinstance(e, FileNotFoundError): - os.unlink(hints_path) - # index must exist at this point - os.unlink(index_path) - self.check_transaction() - self.prepare_txn(transaction_id) - return - if hints["version"] == 1: - logger.debug("Upgrading from v1 hints.%d", transaction_id) - self.segments = hints["segments"] - self.compact = FreeSpace() - self.storage_quota_use = 0 - self.shadow_index = {} - for segment in sorted(hints["compact"]): - logger.debug("Rebuilding sparse info for segment %d", segment) - self._rebuild_sparse(segment) - logger.debug("Upgrade to v2 hints complete") - elif hints["version"] != 2: - raise ValueError("Unknown hints file version: %d" % hints["version"]) - else: - self.segments = hints["segments"] - self.compact = FreeSpace(hints["compact"]) - self.storage_quota_use = hints.get("storage_quota_use", 0) - self.shadow_index = hints.get("shadow_index", {}) - # Drop uncommitted segments in the shadow index - for key, shadowed_segments in self.shadow_index.items(): - for segment in list(shadowed_segments): - if segment > transaction_id: - shadowed_segments.remove(segment) - - def write_index(self): - def flush_and_sync(fd): - fd.flush() - os.fsync(fd.fileno()) - - def rename_tmp(file): - os.replace(file + ".tmp", file) - - hints = { - "version": 2, - "segments": self.segments, - "compact": self.compact, - "storage_quota_use": self.storage_quota_use, - "shadow_index": self.shadow_index, - } - integrity = { - # Integrity version started at 2, the current hints version. - # Thus, integrity version == hints version, for now. 
- "version": 2 - } - transaction_id = self.io.get_segments_transaction_id() - assert transaction_id is not None - - # Log transaction in append-only mode - if self.append_only: - with open(os.path.join(self.path, "transactions"), "a") as log: - print( - "transaction %d, UTC time %s" - % (transaction_id, datetime.now(tz=timezone.utc).isoformat(timespec="microseconds")), - file=log, - ) - - # Write hints file - hints_name = "hints.%d" % transaction_id - hints_file = os.path.join(self.path, hints_name) - with IntegrityCheckedFile(hints_file + ".tmp", filename=hints_name, write=True) as fd: - msgpack.pack(hints, fd) - flush_and_sync(fd) - integrity["hints"] = fd.integrity_data - - # Write repository index - index_name = "index.%d" % transaction_id - index_file = os.path.join(self.path, index_name) - with IntegrityCheckedFile(index_file + ".tmp", filename=index_name, write=True) as fd: - # XXX: Consider using SyncFile for index write-outs. - self.index.write(fd) - flush_and_sync(fd) - integrity["index"] = fd.integrity_data - - # Write integrity file, containing checksums of the hints and index files - integrity_name = "integrity.%d" % transaction_id - integrity_file = os.path.join(self.path, integrity_name) - with open(integrity_file + ".tmp", "wb") as fd: - msgpack.pack(integrity, fd) - flush_and_sync(fd) - - # Rename the integrity file first - rename_tmp(integrity_file) - sync_dir(self.path) - # Rename the others after the integrity file is hypothetically on disk - rename_tmp(hints_file) - rename_tmp(index_file) - sync_dir(self.path) - - # Remove old auxiliary files - current = ".%d" % transaction_id - for name in os.listdir(self.path): - if not name.startswith(("index.", "hints.", "integrity.")): - continue - if name.endswith(current): - continue - os.unlink(os.path.join(self.path, name)) - self.index = None - - def check_free_space(self): - """Pre-commit check for sufficient free space necessary to perform the commit.""" - # As a baseline we take four times the current (on-disk) index size. - # At this point the index may only be updated by compaction, which won't resize it. - # We still apply a factor of four so that a later, separate invocation can free space - # (journaling all deletes for all chunks is one index size) or still make minor additions - # (which may grow the index up to twice its current size). - # Note that in a subsequent operation the committed index is still on-disk, therefore we - # arrive at index_size * (1 + 2 + 1). - # In that order: journaled deletes (1), hashtable growth (2), persisted index (1). - required_free_space = self.index.size() * 4 - - # Conservatively estimate hints file size: - # 10 bytes for each segment-refcount pair, 10 bytes for each segment-space pair - # Assume maximum of 5 bytes per integer. Segment numbers will usually be packed more densely (1-3 bytes), - # as will refcounts and free space integers. For 5 MiB segments this estimate is good to ~20 PB repo size. - # Add a generous 4K to account for constant format overhead. - hints_size = len(self.segments) * 10 + len(self.compact) * 10 + 4096 - required_free_space += hints_size - - required_free_space += self.additional_free_space - if not self.append_only: - full_segment_size = self.max_segment_size + MAX_OBJECT_SIZE - if len(self.compact) < 10: - # This is mostly for the test suite to avoid overestimated free space needs. This can be annoying - # if TMP is a small-ish tmpfs. 
- compact_working_space = 0 - for segment, free in self.compact.items(): - try: - compact_working_space += self.io.segment_size(segment) - free - except FileNotFoundError: - # looks like self.compact is referring to a nonexistent segment file, ignore it. - pass - logger.debug("check_free_space: Few segments, not requiring a full free segment") - compact_working_space = min(compact_working_space, full_segment_size) - logger.debug( - "check_free_space: Calculated working space for compact as %d bytes", compact_working_space - ) - required_free_space += compact_working_space - else: - # Keep one full worst-case segment free in non-append-only mode - required_free_space += full_segment_size - + self.store.delete("config/last-key-checked") + except StoreObjectNotFound: + pass + if last_key_checked: + logger.info(f"Skipping to keys after {last_key_checked}.") + else: + logger.info("Starting from beginning.") + t_start = time.monotonic() + t_last_checkpoint = t_start + objs_checked = objs_errors = 0 + infos = self.store.list("data") try: - free_space = shutil.disk_usage(self.path).free - except OSError as os_error: - logger.warning("Failed to check free space before committing: " + str(os_error)) - return - logger.debug(f"check_free_space: Required bytes {required_free_space}, free bytes {free_space}") - if free_space < required_free_space: - if self.created: - logger.error("Not enough free space to initialize repository at this location.") - self.destroy() - else: - self._rollback(cleanup=True) - formatted_required = format_file_size(required_free_space) - formatted_free = format_file_size(free_space) - raise self.InsufficientFreeSpaceError(formatted_required, formatted_free) - - def compact_segments(self, threshold): - """Compact sparse segments by copying data into new segments""" - if not self.compact: - logger.debug("Nothing to do: compact empty") - return - quota_use_before = self.storage_quota_use - index_transaction_id = self.get_index_transaction_id() - segments = self.segments - unused = [] # list of segments, that are not used anymore - - def complete_xfer(intermediate=True): - # complete the current transfer (when some target segment is full) - nonlocal unused - # commit the new, compact, used segments - segment = self.io.write_commit(intermediate=intermediate) - self.segments.setdefault(segment, 0) - self.compact[segment] += LoggedIO.header_fmt.size - logger.debug( - "complete_xfer: Wrote %scommit at segment %d", "intermediate " if intermediate else "", segment - ) - # get rid of the old, sparse, unused segments. free space. 
- for segment in unused: - logger.debug("complete_xfer: Deleting unused segment %d", segment) - count = self.segments.pop(segment) - assert count == 0, "Corrupted segment reference count - corrupted index or hints" - self.io.delete_segment(segment) - del self.compact[segment] - unused = [] - - logger.debug("Compaction started (threshold is %i%%).", threshold * 100) - pi = ProgressIndicatorPercent( - total=len(self.compact), msg="Compacting segments %3.0f%%", step=1, msgid="repository.compact_segments" - ) - for segment, freeable_space in sorted(self.compact.items()): - if not self.io.segment_exists(segment): - logger.warning("Segment %d not found, but listed in compaction data", segment) - del self.compact[segment] - pi.show() - self._send_log() - continue - segment_size = self.io.segment_size(segment) - freeable_ratio = 1.0 * freeable_space / segment_size - # we want to compact if: - # - we can free a considerable relative amount of space (freeable_ratio over some threshold) - if not (freeable_ratio > threshold): - logger.debug( - "Not compacting segment %d (maybe freeable: %2.2f%% [%d bytes])", - segment, - freeable_ratio * 100.0, - freeable_space, - ) - pi.show() - self._send_log() - continue - segments.setdefault(segment, 0) - logger.debug( - "Compacting segment %d with usage count %d (maybe freeable: %2.2f%% [%d bytes])", - segment, - segments[segment], - freeable_ratio * 100.0, - freeable_space, - ) - for tag, key, offset, _, data in self.io.iter_objects(segment): - if tag == TAG_COMMIT: + for info in infos: + self._lock_refresh() + key = "data/%s" % info.name + if key <= last_key_checked: # needs sorted keys continue - in_index = self.index.get(key) - is_index_object = in_index and (in_index.segment, in_index.offset) == (segment, offset) - if tag in (TAG_PUT2, TAG_PUT) and is_index_object: - try: - new_segment, offset = self.io.write_put(key, data, raise_full=True) - except LoggedIO.SegmentFull: - complete_xfer() - new_segment, offset = self.io.write_put(key, data) - self.index[key] = NSIndexEntry(new_segment, offset, len(data)) - segments.setdefault(new_segment, 0) - segments[new_segment] += 1 - segments[segment] -= 1 - if tag == TAG_PUT: - # old tag is PUT, but new will be PUT2 and use a bit more storage - self.storage_quota_use += self.io.ENTRY_HASH_SIZE - elif tag in (TAG_PUT2, TAG_PUT) and not is_index_object: - # If this is a PUT shadowed by a later tag, then it will be gone when this segment is deleted after - # this loop. Therefore it is removed from the shadow index. - try: - self.shadow_index[key].remove(segment) - except (KeyError, ValueError): - # do not remove entry with empty shadowed_segments list here, - # it is needed for shadowed_put_exists code (see below)! - pass - self.storage_quota_use -= header_size(tag) + len(data) - elif tag == TAG_DELETE and not in_index: - # If the shadow index doesn't contain this key, then we can't say if there's a shadowed older tag, - # therefore we do not drop the delete, but write it to a current segment. - key_not_in_shadow_index = key not in self.shadow_index - # If the key is in the shadow index and there is any segment with an older PUT of this - # key, we have a shadowed put. 
- shadowed_put_exists = key_not_in_shadow_index or any( - shadowed < segment for shadowed in self.shadow_index[key] - ) - delete_is_not_stable = index_transaction_id is None or segment > index_transaction_id - - if shadowed_put_exists or delete_is_not_stable: - # (introduced in 6425d16aa84be1eaaf88) - # This is needed to avoid object un-deletion if we crash between the commit and the deletion - # of old segments in complete_xfer(). - # - # However, this only happens if the crash also affects the FS to the effect that file deletions - # did not materialize consistently after journal recovery. If they always materialize in-order - # then this is not a problem, because the old segment containing a deleted object would be - # deleted before the segment containing the delete. - # - # Consider the following series of operations if we would not do this, i.e. this entire if: - # would be removed. - # Columns are segments, lines are different keys (line 1 = some key, line 2 = some other key) - # Legend: P=TAG_PUT/TAG_PUT2, D=TAG_DELETE, c=commit, i=index is written for latest commit - # - # Segment | 1 | 2 | 3 - # --------+-------+-----+------ - # Key 1 | P | D | - # Key 2 | P | | P - # commits | c i | c | c i - # --------+-------+-----+------ - # ^- compact_segments starts - # ^- complete_xfer commits, after that complete_xfer deletes - # segments 1 and 2 (and then the index would be written). - # - # Now we crash. But only segment 2 gets deleted, while segment 1 is still around. Now key 1 - # is suddenly undeleted (because the delete in segment 2 is now missing). - # Again, note the requirement here. We delete these in the correct order that this doesn't - # happen, and only if the FS materialization of these deletes is reordered or parts dropped - # this can happen. - # In this case it doesn't cause outright corruption, 'just' an index count mismatch, which - # will be fixed by borg-check --repair. - # - # Note that in this check the index state is the proxy for a "most definitely settled" - # repository state, i.e. the assumption is that *all* operations on segments <= index state - # are completed and stable. 
- try: - new_segment, size = self.io.write_delete(key, raise_full=True) - except LoggedIO.SegmentFull: - complete_xfer() - new_segment, size = self.io.write_delete(key) - self.compact[new_segment] += size - segments.setdefault(new_segment, 0) - else: - logger.debug( - "Dropping DEL for id %s - seg %d, iti %r, knisi %r, spe %r, dins %r, si %r", - bin_to_hex(key), - segment, - index_transaction_id, - key_not_in_shadow_index, - shadowed_put_exists, - delete_is_not_stable, - self.shadow_index.get(key), - ) - # we did not keep the delete tag for key (see if-branch) - if not self.shadow_index[key]: - # shadowed segments list is empty -> remove it - del self.shadow_index[key] - assert segments[segment] == 0, "Corrupted segment reference count - corrupted index or hints" - unused.append(segment) - pi.show() - self._send_log() - pi.finish() - self._send_log() - complete_xfer(intermediate=False) - self.io.clear_empty_dirs() - quota_use_after = self.storage_quota_use - logger.info("Compaction freed about %s repository space.", format_file_size(quota_use_before - quota_use_after)) - logger.debug("Compaction completed.") - - def replay_segments(self, index_transaction_id, segments_transaction_id): - # fake an old client, so that in case we do not have an exclusive lock yet, prepare_txn will upgrade the lock: - remember_exclusive = self.exclusive - self.exclusive = None - self.prepare_txn(index_transaction_id, do_cleanup=False) - try: - segment_count = sum(1 for _ in self.io.segment_iterator()) - pi = ProgressIndicatorPercent( - total=segment_count, msg="Replaying segments %3.0f%%", msgid="repository.replay_segments" - ) - for i, (segment, filename) in enumerate(self.io.segment_iterator()): - pi.show(i) - self._send_log() - if index_transaction_id is not None and segment <= index_transaction_id: + try: + obj = self.store.load(key) + except StoreObjectNotFound: + # looks like object vanished since store.list(), ignore that. continue - if segment > segments_transaction_id: + obj_corrupted = False + check_object(obj) + objs_checked += 1 + if obj_corrupted: + objs_errors += 1 + if repair: + # if it is corrupted, we can't do much except getting rid of it. + # but let's just retry loading it, in case the error goes away. 
+ try: + obj = self.store.load(key) + except StoreObjectNotFound: + log_error("existing object vanished.") + else: + obj_corrupted = False + check_object(obj) + if obj_corrupted: + log_error("reloading did not help, deleting it!") + self.store.delete(key) + else: + log_error("reloading did help, inconsistent behaviour detected!") + now = time.monotonic() + if now > t_last_checkpoint + 300: # checkpoint every 5 mins + t_last_checkpoint = now + logger.info(f"Checkpointing at key {key}.") + self.store.store("config/last-key-checked", key.encode()) + if partial and now > t_start + max_duration: + logger.info(f"Finished partial repository check, last key checked is {key}.") + self.store.store("config/last-key-checked", key.encode()) break - objects = self.io.iter_objects(segment) - self._update_index(segment, objects) - pi.finish() - self._send_log() - self.write_index() - finally: - self.exclusive = remember_exclusive - self.rollback() - - def _update_index(self, segment, objects, report=None): - """some code shared between replay_segments and check""" - self.segments[segment] = 0 - for tag, key, offset, size, _ in objects: - if tag in (TAG_PUT2, TAG_PUT): - try: - # If this PUT supersedes an older PUT, mark the old segment for compaction and count the free space - in_index = self.index[key] - self.compact[in_index.segment] += header_size(tag) + size - self.segments[in_index.segment] -= 1 - self.shadow_index.setdefault(key, []).append(in_index.segment) - except KeyError: - pass - self.index[key] = NSIndexEntry(segment, offset, size) - self.segments[segment] += 1 - self.storage_quota_use += header_size(tag) + size - elif tag == TAG_DELETE: + else: + logger.info("Finished repository check.") try: - # if the deleted PUT is not in the index, there is nothing to clean up - in_index = self.index.pop(key) - except KeyError: + self.store.delete("config/last-key-checked") + except StoreObjectNotFound: pass - else: - if self.io.segment_exists(in_index.segment): - # the old index is not necessarily valid for this transaction (e.g. compaction); if the segment - # is already gone, then it was already compacted. - self.segments[in_index.segment] -= 1 - self.compact[in_index.segment] += header_size(tag) + in_index.size - self.shadow_index.setdefault(key, []).append(in_index.segment) - elif tag == TAG_COMMIT: - continue - else: - msg = f"Unexpected tag {tag} in segment {segment}" - if report is None: - raise self.CheckNeeded(msg) - else: - report(msg) - if self.segments[segment] == 0: - self.compact[segment] = self.io.segment_size(segment) - - def _rebuild_sparse(self, segment): - """Rebuild sparse bytes count for a single segment relative to the current index.""" - try: - segment_size = self.io.segment_size(segment) - except FileNotFoundError: - # segment does not exist any more, remove it from the mappings. - # note: no need to self.compact.pop(segment), as we start from empty mapping. - self.segments.pop(segment) - return - - if self.segments[segment] == 0: - self.compact[segment] = segment_size - return - - self.compact[segment] = 0 - for tag, key, offset, size, _ in self.io.iter_objects(segment, read_data=False): - if tag in (TAG_PUT2, TAG_PUT): - in_index = self.index.get(key) - if not in_index or (in_index.segment, in_index.offset) != (segment, offset): - # This PUT is superseded later. - self.compact[segment] += header_size(tag) + size - elif tag == TAG_DELETE: - # The outcome of the DELETE has been recorded in the PUT branch already. 
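The check loop above persists config/last-key-checked so that a later partial check can resume after the last key it saw; this relies on the listing being sorted. A stripped-down sketch of the resume-and-checkpoint pattern over a plain dict:

import time

def resumable_scan(store: dict, checkpoint: dict, check, max_duration=0, checkpoint_every=300):
    """Scan sorted keys, skipping keys <= the saved checkpoint; save progress periodically."""
    last_checked = checkpoint.get("last-key-checked", "")
    t_start = t_last = time.monotonic()
    for key in sorted(store):                 # sorted order makes "skip <= marker" valid
        if key <= last_checked:
            continue
        check(key, store[key])
        now = time.monotonic()
        if now > t_last + checkpoint_every:   # periodic checkpoint
            checkpoint["last-key-checked"] = key
            t_last = now
        if max_duration and now > t_start + max_duration:
            checkpoint["last-key-checked"] = key   # partial run: remember where we stopped
            return False                           # not finished yet
    checkpoint.pop("last-key-checked", None)       # full pass done, forget the checkpoint
    return True

store = {"data/aa": b"x", "data/bb": b"y", "data/cc": b"z"}
checkpoint = {}
print(resumable_scan(store, checkpoint, check=lambda k, v: None))  # True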
- self.compact[segment] += header_size(tag) + size - - def check(self, repair=False, max_duration=0): - """Check repository consistency - - This method verifies all segment checksums and makes sure - the index is consistent with the data stored in the segments. - """ - if self.append_only and repair: - raise ValueError(self.path + " is in append-only mode") - error_found = False - - def report_error(msg, *args): - nonlocal error_found - error_found = True - logger.error(msg, *args) - - logger.info("Starting repository check") - assert not self._active_txn - try: - transaction_id = self.get_transaction_id() - current_index = self.open_index(transaction_id) - logger.debug("Read committed index of transaction %d", transaction_id) - except Exception as exc: - transaction_id = self.io.get_segments_transaction_id() - current_index = None - logger.debug("Failed to read committed index (%s)", exc) - if transaction_id is None: - logger.debug("No segments transaction found") - transaction_id = self.get_index_transaction_id() - if transaction_id is None: - logger.debug("No index transaction found, trying latest segment") - transaction_id = self.io.get_latest_segment() - if transaction_id is None: - report_error("This repository contains no valid data.") - return False - if repair: - self.io.cleanup(transaction_id) - segments_transaction_id = self.io.get_segments_transaction_id() - logger.debug("Segment transaction is %s", segments_transaction_id) - logger.debug("Determined transaction is %s", transaction_id) - self.prepare_txn(None) # self.index, self.compact, self.segments, self.shadow_index all empty now! - segment_count = sum(1 for _ in self.io.segment_iterator()) - logger.debug("Found %d segments", segment_count) - - partial = bool(max_duration) - assert not (repair and partial) - mode = "partial" if partial else "full" - if partial: - # continue a past partial check (if any) or start one from beginning - last_segment_checked = self.config.getint("repository", "last_segment_checked", fallback=-1) - logger.info("Skipping to segments >= %d", last_segment_checked + 1) - else: - # start from the beginning and also forget about any potential past partial checks - last_segment_checked = -1 - self.config.remove_option("repository", "last_segment_checked") - self.save_config(self.path, self.config) - t_start = time.monotonic() - pi = ProgressIndicatorPercent( - total=segment_count, msg="Checking segments %3.1f%%", step=0.1, msgid="repository.check" - ) - segment = -1 # avoid uninitialized variable if there are no segment files at all - for i, (segment, filename) in enumerate(self.io.segment_iterator()): - pi.show(i) - self._send_log() - if segment <= last_segment_checked: - continue - if segment > transaction_id: - continue - logger.debug("Checking segment file %s...", filename) - try: - objects = list(self.io.iter_objects(segment)) - except IntegrityError as err: - report_error(str(err)) - objects = [] - if repair: - self.io.recover_segment(segment, filename) - objects = list(self.io.iter_objects(segment)) - if not partial: - self._update_index(segment, objects, report_error) - if partial and time.monotonic() > t_start + max_duration: - logger.info("Finished partial segment check, last segment checked is %d", segment) - self.config.set("repository", "last_segment_checked", str(segment)) - self.save_config(self.path, self.config) - break + except StoreObjectNotFound: + # it can be that there is no "data/" at all, then it crashes when iterating infos. 
+ pass + logger.info(f"Checked {objs_checked} repository objects, {objs_errors} errors.") + if objs_errors == 0: + logger.info(f"Finished {mode} repository check, no problems found.") else: - logger.info("Finished segment check at segment %d", segment) - self.config.remove_option("repository", "last_segment_checked") - self.save_config(self.path, self.config) - - pi.finish() - self._send_log() - # self.index, self.segments, self.compact now reflect the state of the segment files up to . - # We might need to add a commit tag if no committed segment is found. - if repair and segments_transaction_id is None: - report_error(f"Adding commit tag to segment {transaction_id}") - self.io.segment = transaction_id + 1 - self.io.write_commit() - if not partial: - logger.info("Starting repository index check") - if current_index and not repair: - # current_index = "as found on disk" - # self.index = "as rebuilt in-memory from segments" - if len(current_index) != len(self.index): - report_error("Index object count mismatch.") - report_error("committed index: %d objects", len(current_index)) - report_error("rebuilt index: %d objects", len(self.index)) - else: - logger.info("Index object count match.") - line_format = "ID: %-64s rebuilt index: %-16s committed index: %-16s" - not_found = "" - for key, value in self.index.iteritems(): - current_value = current_index.get(key, not_found) - if current_value != value: - report_error(line_format, bin_to_hex(key), value, current_value) - self._send_log() - for key, current_value in current_index.iteritems(): - if key in self.index: - continue - value = self.index.get(key, not_found) - if current_value != value: - report_error(line_format, bin_to_hex(key), value, current_value) - self._send_log() - if repair: - self.write_index() - self.rollback() - if error_found: if repair: - logger.info("Finished %s repository check, errors found and repaired.", mode) + logger.info(f"Finished {mode} repository check, errors found and repaired.") else: - logger.error("Finished %s repository check, errors found.", mode) - else: - logger.info("Finished %s repository check, no problems found.", mode) - return not error_found or repair - - def scan_low_level(self, segment=None, offset=None): - """Very low level scan over all segment file entries. - - It does NOT care about what's committed and what not. - It does NOT care whether an object might be deleted or superseded later. - It just yields anything it finds in the segment files. + logger.error(f"Finished {mode} repository check, errors found.") + return objs_errors == 0 or repair - This is intended as a last-resort way to get access to all repo contents of damaged repos, - when there is uncommitted, but valuable data in there... - - When segment or segment+offset is given, limit processing to this location only. + def list(self, limit=None, marker=None): + """ + list infos starting from after id . + each info is a tuple (id, storage_size). """ - for current_segment, filename in self.io.segment_iterator(start_segment=segment, end_segment=segment): + self._lock_refresh() + collect = True if marker is None else False + result = [] + infos = self.store.list("data") # generator yielding ItemInfos + while True: try: - for tag, key, current_offset, _, data in self.io.iter_objects( - segment=current_segment, offset=offset or 0 - ): - if offset is not None and current_offset > offset: + info = next(infos) + except StoreObjectNotFound: + break # can happen e.g. 
if "data" does not exist, pointless to continue in that case + except StopIteration: + break + else: + id = hex_to_bin(info.name) + if collect: + result.append((id, info.size)) + if len(result) == limit: break - yield key, data, tag, current_segment, current_offset - except IntegrityError as err: - logger.error( - "Segment %d (%s) has IntegrityError(s) [%s] - skipping." % (current_segment, filename, str(err)) - ) - - def _rollback(self, *, cleanup): - if cleanup: - self.io.cleanup(self.io.get_segments_transaction_id()) - self.index = None - self._active_txn = False - self.transaction_doomed = None - - def rollback(self): - # note: when used in remote mode, this is time limited, see RemoteRepository.shutdown_time. - self._rollback(cleanup=False) - - def __len__(self): - if not self.index: - self.index = self.open_index(self.get_transaction_id()) - return len(self.index) - - def __contains__(self, id): - if not self.index: - self.index = self.open_index(self.get_transaction_id()) - return id in self.index - - def list(self, limit=None, marker=None, mask=0, value=0): - """ - list IDs starting from after id - in index (pseudo-random) order. - - if mask and value are given, only return IDs where flags & mask == value (default: all IDs). - """ - if not self.index: - self.index = self.open_index(self.get_transaction_id()) - return [id_ for id_, _ in islice(self.index.iteritems(marker=marker, mask=mask, value=value), limit)] - - def scan(self, limit=None, state=None): - """ - list (the next) chunk IDs from the repository - in on-disk order, so that a client - fetching data in this order does linear reads and reuses stuff from disk cache. - - state can either be None (initially, when starting to scan) or the object - returned from a previous scan call (meaning "continue scanning"). - - returns: list of chunk ids, state - - We rely on repository.check() has run already (either now or some time before) and that: - - - if we are called from a borg check command, self.index is a valid, fresh, in-sync repo index. - - if we are called from elsewhere, either self.index or the on-disk index is valid and in-sync. - - the repository segments are valid (no CRC errors). - if we encounter CRC errors in segment entry headers, rest of segment is skipped. - """ - if limit is not None and limit < 1: - raise ValueError("please use limit > 0 or limit = None") - transaction_id = self.get_transaction_id() - if not self.index: - self.index = self.open_index(transaction_id) - # smallest valid seg is 0, smallest valid offs is 8 - start_segment, start_offset, end_segment = state if state is not None else (0, 0, transaction_id) - ids, segment, offset = [], 0, 0 - # we only scan up to end_segment == transaction_id to scan only **committed** chunks, - # avoiding scanning into newly written chunks. - for segment, filename in self.io.segment_iterator(start_segment, end_segment): - # the start_offset we potentially got from state is only valid for the start_segment we also got - # from there. in case the segment file vanished meanwhile, the segment_iterator might never - # return a segment/filename corresponding to the start_segment and we must start from offset 0 then. 
- start_offset = start_offset if segment == start_segment else 0 - obj_iterator = self.io.iter_objects(segment, start_offset, read_data=False) - while True: - try: - tag, id, offset, size, _ = next(obj_iterator) - except (StopIteration, IntegrityError): - # either end-of-segment or an error - we can not seek to objects at - # higher offsets than one that has an error in the header fields. - break - if start_offset > 0: - # we are using a state != None and it points to the last object we have already - # returned in the previous scan() call - thus, we need to skip this one object. - # also, for the next segment, we need to start at offset 0. - start_offset = 0 - continue - if tag in (TAG_PUT2, TAG_PUT): - in_index = self.index.get(id) - if in_index and (in_index.segment, in_index.offset) == (segment, offset): - # we have found an existing and current object - ids.append(id) - if len(ids) == limit: - return ids, (segment, offset, end_segment) - return ids, (segment, offset, end_segment) - - def flags(self, id, mask=0xFFFFFFFF, value=None): - """ - query and optionally set flags - - :param id: id (key) of object - :param mask: bitmask for flags (default: operate on all 32 bits) - :param value: value to set masked bits to (default: do not change any flags) - :return: (previous) flags value (only masked bits) - """ - if not self.index: - self.index = self.open_index(self.get_transaction_id()) - return self.index.flags(id, mask, value) - - def flags_many(self, ids, mask=0xFFFFFFFF, value=None): - return [self.flags(id_, mask, value) for id_ in ids] + elif id == marker: + collect = True + # note: do not collect the marker id + return result def get(self, id, read_data=True): - if not self.index: - self.index = self.open_index(self.get_transaction_id()) + self._lock_refresh() + id_hex = bin_to_hex(id) + key = "data/" + id_hex try: - in_index = NSIndexEntry(*((self.index[id] + (None,))[:3])) # legacy: index entries have no size element - return self.io.read(in_index.segment, in_index.offset, id, expected_size=in_index.size, read_data=read_data) - except KeyError: - raise self.ObjectNotFound(id, self.path) from None + if read_data: + # read everything + return self.store.load(key) + else: + # RepoObj layout supports separately encrypted metadata and data. + # We return enough bytes so the client can decrypt the metadata. + hdr_size = RepoObj.obj_header.size + extra_size = 1024 - hdr_size # load a bit more, 1024b, reduces round trips + obj = self.store.load(key, size=hdr_size + extra_size) + hdr = obj[0:hdr_size] + if len(hdr) != hdr_size: + raise IntegrityError(f"Object too small [id {id_hex}]: expected {hdr_size}, got {len(hdr)} bytes") + meta_size = RepoObj.obj_header.unpack(hdr)[0] + if meta_size > extra_size: + # we did not get enough, need to load more, but not all. + # this should be rare, as chunk metadata is rather small usually. + obj = self.store.load(key, size=hdr_size + meta_size) + meta = obj[hdr_size : hdr_size + meta_size] + if len(meta) != meta_size: + raise IntegrityError(f"Object too small [id {id_hex}]: expected {meta_size}, got {len(meta)} bytes") + return hdr + meta + except StoreObjectNotFound: + raise self.ObjectNotFound(id, str(self._location)) from None def get_many(self, ids, read_data=True, is_preloaded=False): for id_ in ids: @@ -1332,28 +397,13 @@ def put(self, id, data, wait=True): Note: when doing calls with wait=False this gets async and caller must deal with async results / exceptions later. 
""" - if not self._active_txn: - self.prepare_txn(self.get_transaction_id()) - try: - in_index = self.index[id] - except KeyError: - pass - else: - # this put call supersedes a previous put to same id. - # it is essential to do a delete first to get correct quota bookkeeping - # and also a correctly updated shadow_index, so that the compaction code - # does not wrongly resurrect an old PUT by dropping a DEL that is still needed. - self._delete(id, in_index.segment, in_index.offset, in_index.size) - segment, offset = self.io.write_put(id, data) - self.storage_quota_use += header_size(TAG_PUT2) + len(data) - self.segments.setdefault(segment, 0) - self.segments[segment] += 1 - self.index[id] = NSIndexEntry(segment, offset, len(data)) - if self.storage_quota and self.storage_quota_use > self.storage_quota: - self.transaction_doomed = self.StorageQuotaExceeded( - format_file_size(self.storage_quota), format_file_size(self.storage_quota_use) - ) - raise self.transaction_doomed + self._lock_refresh() + data_size = len(data) + if data_size > MAX_DATA_SIZE: + raise IntegrityError(f"More than allowed put data [{data_size} > {MAX_DATA_SIZE}]") + + key = "data/" + bin_to_hex(id) + self.store.store(key, data) def delete(self, id, wait=True): """delete a repo object @@ -1361,26 +411,12 @@ def delete(self, id, wait=True): Note: when doing calls with wait=False this gets async and caller must deal with async results / exceptions later. """ - if not self._active_txn: - self.prepare_txn(self.get_transaction_id()) + self._lock_refresh() + key = "data/" + bin_to_hex(id) try: - in_index = self.index.pop(id) - except KeyError: - raise self.ObjectNotFound(id, self.path) from None - self._delete(id, in_index.segment, in_index.offset, in_index.size) - - def _delete(self, id, segment, offset, size): - # common code used by put and delete - # because we'll write a DEL tag to the repository, we must update the shadow index. - # this is always true, no matter whether we are called from put() or delete(). - # the compaction code needs this to not drop DEL tags if they are still required - # to keep a PUT in an earlier segment in the "effectively deleted" state. - self.shadow_index.setdefault(id, []).append(segment) - self.segments[segment] -= 1 - self.compact[segment] += header_size(TAG_PUT2) + size - segment, size = self.io.write_delete(id) - self.compact[segment] += size - self.segments.setdefault(segment, 0) + self.store.delete(key) + except StoreObjectNotFound: + raise self.ObjectNotFound(id, str(self._location)) from None def async_response(self, wait=True): """Get one async result (only applies to remote repositories). 
@@ -1396,522 +432,34 @@ def async_response(self, wait=True): def preload(self, ids): """Preload objects (only applies to remote repositories)""" + def break_lock(self): + Lock(self.store).break_lock() -class LoggedIO: - class SegmentFull(Exception): - """raised when a segment is full, before opening next""" - - header_fmt = struct.Struct(" transaction_id: - self.delete_segment(segment) - count += 1 - else: - break - logger.debug("Cleaned up %d uncommitted segment files (== everything after segment %d).", count, transaction_id) + def migrate_lock(self, old_id, new_id): + # note: only needed for local repos + if self.lock is not None: + self.lock.migrate_lock(old_id, new_id) - def is_committed_segment(self, segment): - """Check if segment ends with a COMMIT_TAG tag""" - try: - iterator = self.iter_objects(segment) - except IntegrityError: - return False - with open(self.segment_filename(segment), "rb") as fd: - try: - fd.seek(-self.header_fmt.size, os.SEEK_END) - except OSError as e: - # return False if segment file is empty or too small - if e.errno == errno.EINVAL: - return False - raise e - if fd.read(self.header_fmt.size) != self.COMMIT: - return False - seen_commit = False - while True: - try: - tag, key, offset, _, _ = next(iterator) - except IntegrityError: - return False - except StopIteration: - break - if tag == TAG_COMMIT: - seen_commit = True - continue - if seen_commit: - return False - return seen_commit - - def segment_filename(self, segment): - return os.path.join(self.path, "data", str(segment // self.segments_per_dir), str(segment)) - - def get_write_fd(self, no_new=False, want_new=False, raise_full=False): - if not no_new and (want_new or self.offset and self.offset > self.limit): - if raise_full: - raise self.SegmentFull - self.close_segment() - if not self._write_fd: - if self.segment % self.segments_per_dir == 0: - dirname = os.path.join(self.path, "data", str(self.segment // self.segments_per_dir)) - if not os.path.exists(dirname): - os.mkdir(dirname) - sync_dir(os.path.join(self.path, "data")) - self._write_fd = SyncFile(self.segment_filename(self.segment), binary=True) - self._write_fd.write(MAGIC) - self.offset = MAGIC_LEN - if self.segment in self.fds: - # we may have a cached fd for a segment file we already deleted and - # we are writing now a new segment file to same file name. get rid of - # the cached fd that still refers to the old file, so it will later - # get repopulated (on demand) with a fd that refers to the new file. - del self.fds[self.segment] - return self._write_fd - - def get_fd(self, segment): - # note: get_fd() returns a fd with undefined file pointer position, - # so callers must always seek() to desired position afterwards. - now = time.monotonic() - - def open_fd(): - fd = open(self.segment_filename(segment), "rb") - self.fds[segment] = (now, fd) - return fd - - def clean_old(): - # we regularly get rid of all old FDs here: - if now - self._fds_cleaned > FD_MAX_AGE // 8: - self._fds_cleaned = now - for k, ts_fd in list(self.fds.items()): - ts, fd = ts_fd - if now - ts > FD_MAX_AGE: - # we do not want to touch long-unused file handles to - # avoid ESTALE issues (e.g. on network filesystems). - del self.fds[k] - - clean_old() - if self._write_fd is not None: - # without this, we have a test failure now - self._write_fd.sync() - try: - ts, fd = self.fds[segment] - except KeyError: - fd = open_fd() - else: - # we only have fresh enough stuff here. - # update the timestamp of the lru cache entry. 
- self.fds.replace(segment, (now, fd)) - return fd - - def close_segment(self): - # set self._write_fd to None early to guard against reentry from error handling code paths: - fd, self._write_fd = self._write_fd, None - if fd is not None: - self.segment += 1 - self.offset = 0 - fd.close() - - def delete_segment(self, segment): - if segment in self.fds: - del self.fds[segment] + def get_manifest(self): try: - safe_unlink(self.segment_filename(segment)) - except FileNotFoundError: - pass - - def clear_empty_dirs(self): - """Delete empty segment dirs, i.e those with no segment files.""" - data_dir = os.path.join(self.path, "data") - segment_dirs = self.get_segment_dirs(data_dir) - for segment_dir in segment_dirs: - try: - # os.rmdir will only delete the directory if it is empty - # so we don't need to explicitly check for emptiness first. - os.rmdir(segment_dir) - except OSError: - # OSError is raised by os.rmdir if directory is not empty. This is expected. - # Its subclass FileNotFoundError may be raised if the directory already does not exist. Ignorable. - pass - sync_dir(data_dir) + return self.store.load("config/manifest") + except StoreObjectNotFound: + raise NoManifestError - def segment_exists(self, segment): - filename = self.segment_filename(segment) - # When deleting segments, they are first truncated. If truncate(2) and unlink(2) are split - # across FS transactions, then logically deleted segments will show up as truncated. - return os.path.exists(filename) and os.path.getsize(filename) + def put_manifest(self, data): + return self.store.store("config/manifest", data) - def segment_size(self, segment): - return os.path.getsize(self.segment_filename(segment)) - - def get_segment_magic(self, segment): - fd = self.get_fd(segment) - fd.seek(0) - return fd.read(MAGIC_LEN) - - def iter_objects(self, segment, offset=0, read_data=True): - """ - Return object iterator for *segment*. - - See the _read() docstring about confidence in the returned data. - - The iterator returns five-tuples of (tag, key, offset, size, data). - """ - fd = self.get_fd(segment) - fd.seek(offset) - if offset == 0: - # we are touching this segment for the first time, check the MAGIC. - # Repository.scan() calls us with segment > 0 when it continues an ongoing iteration - # from a marker position - but then we have checked the magic before already. - if fd.read(MAGIC_LEN) != MAGIC: - raise IntegrityError(f"Invalid segment magic [segment {segment}, offset {0}]") - offset = MAGIC_LEN - header = fd.read(self.header_fmt.size) - while header: - size, tag, key, data = self._read( - fd, header, segment, offset, (TAG_PUT2, TAG_DELETE, TAG_COMMIT, TAG_PUT), read_data=read_data - ) - # tuple[3]: corresponds to len(data) == length of the full chunk payload (meta_len+enc_meta+enc_data) - # tuple[4]: data will be None if read_data is False. - yield tag, key, offset, size - header_size(tag), data - assert size >= 0 - offset += size - # we must get the fd via get_fd() here again as we yielded to our caller and it might - # have triggered closing of the fd we had before (e.g. by calling io.read() for - # different segment(s)). - # by calling get_fd() here again we also make our fd "recently used" so it likely - # does not get kicked out of self.fds LRUcache. 
- fd = self.get_fd(segment) - fd.seek(offset) - header = fd.read(self.header_fmt.size) - - def recover_segment(self, segment, filename): - logger.info("Attempting to recover " + filename) - if segment in self.fds: - del self.fds[segment] - if os.path.getsize(filename) < MAGIC_LEN + self.header_fmt.size: - # this is either a zero-byte file (which would crash mmap() below) or otherwise - # just too small to be a valid non-empty segment file, so do a shortcut here: - with SaveFile(filename, binary=True) as fd: - fd.write(MAGIC) - return - with SaveFile(filename, binary=True) as dst_fd: - with open(filename, "rb") as src_fd: - # note: file must not be 0 size or mmap() will crash. - with mmap.mmap(src_fd.fileno(), 0, access=mmap.ACCESS_READ) as mm: - # memoryview context manager is problematic, see https://bugs.python.org/issue35686 - data = memoryview(mm) - d = data - try: - dst_fd.write(MAGIC) - while len(d) >= self.header_fmt.size: - crc, size, tag = self.header_fmt.unpack(d[: self.header_fmt.size]) - size_invalid = size > MAX_OBJECT_SIZE or size < self.header_fmt.size or size > len(d) - if size_invalid or tag > MAX_TAG_ID: - d = d[1:] - continue - if tag == TAG_PUT2: - c_offset = self.HEADER_ID_SIZE + self.ENTRY_HASH_SIZE - # skip if header is invalid - if crc32(d[4:c_offset]) & 0xFFFFFFFF != crc: - d = d[1:] - continue - # skip if content is invalid - if ( - self.entry_hash(d[4 : self.HEADER_ID_SIZE], d[c_offset:size]) - != d[self.HEADER_ID_SIZE : c_offset] - ): - d = d[1:] - continue - elif tag in (TAG_DELETE, TAG_COMMIT, TAG_PUT): - if crc32(d[4:size]) & 0xFFFFFFFF != crc: - d = d[1:] - continue - else: # tag unknown - d = d[1:] - continue - dst_fd.write(d[:size]) - d = d[size:] - finally: - del d - data.release() - - def entry_hash(self, *data): - h = StreamingXXH64() - for d in data: - h.update(d) - return h.digest() - - def read(self, segment, offset, id, *, read_data=True, expected_size=None): - """ - Read entry from *segment* at *offset* with *id*. + def store_list(self, name): + try: + return list(self.store.list(name)) + except StoreObjectNotFound: + return [] - See the _read() docstring about confidence in the returned data. - """ - if segment == self.segment and self._write_fd: - self._write_fd.sync() - fd = self.get_fd(segment) - fd.seek(offset) - header = fd.read(self.header_fmt.size) - size, tag, key, data = self._read(fd, header, segment, offset, (TAG_PUT2, TAG_PUT), read_data=read_data) - if id != key: - raise IntegrityError( - f"Invalid segment entry header, is not for wanted id [segment {segment}, offset {offset}]" - ) - data_size_from_header = size - header_size(tag) - if expected_size is not None and expected_size != data_size_from_header: - raise IntegrityError( - f"size from repository index: {expected_size} != " f"size from entry header: {data_size_from_header}" - ) - return data + def store_load(self, name): + return self.store.load(name) - def _read(self, fd, header, segment, offset, acceptable_tags, read_data=True): - """ - Code shared by read() and iter_objects(). 
- - Confidence in returned data: - PUT2 tags, read_data == True: crc32 check (header) plus digest check (header+data) - PUT2 tags, read_data == False: crc32 check (header) - PUT tags, read_data == True: crc32 check (header+data) - PUT tags, read_data == False: crc32 check can not be done, all data obtained must be considered informational - - read_data == False behaviour: - PUT2 tags: return enough of the chunk so that the client is able to decrypt the metadata, - do not read, but just seek over the data. - PUT tags: return None and just seek over the data. - """ + def store_store(self, name, value): + return self.store.store(name, value) - def check_crc32(wanted, header, *data): - result = crc32(memoryview(header)[4:]) # skip first 32 bits of the header, they contain the crc. - for d in data: - result = crc32(d, result) - if result & 0xFFFFFFFF != wanted: - raise IntegrityError(f"Segment entry header checksum mismatch [segment {segment}, offset {offset}]") - - # See comment on MAX_TAG_ID for details - assert max(acceptable_tags) <= MAX_TAG_ID, "Exceeding MAX_TAG_ID will break backwards compatibility" - key = data = None - fmt = self.header_fmt - try: - hdr_tuple = fmt.unpack(header) - except struct.error as err: - raise IntegrityError(f"Invalid segment entry header [segment {segment}, offset {offset}]: {err}") from None - crc, size, tag = hdr_tuple - length = size - fmt.size # we already read the header - if size > MAX_OBJECT_SIZE: - # if you get this on an archive made with borg < 1.0.7 and millions of files and - # you need to restore it, you can disable this check by using "if False:" above. - raise IntegrityError(f"Invalid segment entry size {size} - too big [segment {segment}, offset {offset}]") - if size < fmt.size: - raise IntegrityError(f"Invalid segment entry size {size} - too small [segment {segment}, offset {offset}]") - if tag not in (TAG_PUT2, TAG_DELETE, TAG_COMMIT, TAG_PUT): - raise IntegrityError( - f"Invalid segment entry header, did not get a known tag " f"[segment {segment}, offset {offset}]" - ) - if tag not in acceptable_tags: - raise IntegrityError( - f"Invalid segment entry header, did not get acceptable tag " f"[segment {segment}, offset {offset}]" - ) - if tag == TAG_COMMIT: - check_crc32(crc, header) - # that's all for COMMITs. - else: - # all other tags (TAG_PUT2, TAG_DELETE, TAG_PUT) have a key - key = fd.read(32) - length -= 32 - if len(key) != 32: - raise IntegrityError( - f"Segment entry key short read [segment {segment}, offset {offset}]: " - f"expected {32}, got {len(key)} bytes" - ) - if tag == TAG_DELETE: - check_crc32(crc, header, key) - # that's all for DELETEs. - else: - # TAG_PUT: we can not do a crc32 header check here, because the crc32 is computed over header+data! - # for the check, see code below when read_data is True. - if tag == TAG_PUT2: - entry_hash = fd.read(self.ENTRY_HASH_SIZE) - length -= self.ENTRY_HASH_SIZE - if len(entry_hash) != self.ENTRY_HASH_SIZE: - raise IntegrityError( - f"Segment entry hash short read [segment {segment}, offset {offset}]: " - f"expected {self.ENTRY_HASH_SIZE}, got {len(entry_hash)} bytes" - ) - check_crc32(crc, header, key, entry_hash) - if not read_data: - if tag == TAG_PUT2: - # PUT2 is only used in new repos and they also have different RepoObj layout, - # supporting separately encrypted metadata and data. - # In this case, we return enough bytes so the client can decrypt the metadata - # and seek over the rest (over the encrypted data). 
- meta_len_size = RepoObj.meta_len_hdr.size - meta_len = fd.read(meta_len_size) - length -= meta_len_size - if len(meta_len) != meta_len_size: - raise IntegrityError( - f"Segment entry meta length short read [segment {segment}, offset {offset}]: " - f"expected {meta_len_size}, got {len(meta_len)} bytes" - ) - ml = RepoObj.meta_len_hdr.unpack(meta_len)[0] - meta = fd.read(ml) - length -= ml - if len(meta) != ml: - raise IntegrityError( - f"Segment entry meta short read [segment {segment}, offset {offset}]: " - f"expected {ml}, got {len(meta)} bytes" - ) - data = meta_len + meta # shortened chunk - enough so the client can decrypt the metadata - # we do not have a checksum for this data, but the client's AEAD crypto will check it. - # in any case, we see over the remainder of the chunk - oldpos = fd.tell() - seeked = fd.seek(length, os.SEEK_CUR) - oldpos - if seeked != length: - raise IntegrityError( - f"Segment entry data short seek [segment {segment}, offset {offset}]: " - f"expected {length}, got {seeked} bytes" - ) - else: # read data! - data = fd.read(length) - if len(data) != length: - raise IntegrityError( - f"Segment entry data short read [segment {segment}, offset {offset}]: " - f"expected {length}, got {len(data)} bytes" - ) - if tag == TAG_PUT2: - if self.entry_hash(memoryview(header)[4:], key, data) != entry_hash: - raise IntegrityError(f"Segment entry hash mismatch [segment {segment}, offset {offset}]") - elif tag == TAG_PUT: - check_crc32(crc, header, key, data) - return size, tag, key, data - - def write_put(self, id, data, raise_full=False): - data_size = len(data) - if data_size > MAX_DATA_SIZE: - # this would push the segment entry size beyond MAX_OBJECT_SIZE. - raise IntegrityError(f"More than allowed put data [{data_size} > {MAX_DATA_SIZE}]") - fd = self.get_write_fd(want_new=(id == Manifest.MANIFEST_ID), raise_full=raise_full) - size = data_size + self.HEADER_ID_SIZE + self.ENTRY_HASH_SIZE - offset = self.offset - header = self.header_no_crc_fmt.pack(size, TAG_PUT2) - entry_hash = self.entry_hash(header, id, data) - crc = self.crc_fmt.pack(crc32(entry_hash, crc32(id, crc32(header))) & 0xFFFFFFFF) - fd.write(b"".join((crc, header, id, entry_hash))) - fd.write(data) - self.offset += size - return self.segment, offset - - def write_delete(self, id, raise_full=False): - fd = self.get_write_fd(want_new=(id == Manifest.MANIFEST_ID), raise_full=raise_full) - header = self.header_no_crc_fmt.pack(self.HEADER_ID_SIZE, TAG_DELETE) - crc = self.crc_fmt.pack(crc32(id, crc32(header)) & 0xFFFFFFFF) - fd.write(b"".join((crc, header, id))) - self.offset += self.HEADER_ID_SIZE - return self.segment, self.HEADER_ID_SIZE - - def write_commit(self, intermediate=False): - # Intermediate commits go directly into the current segment - this makes checking their validity more - # expensive, but is faster and reduces clobber. Final commits go into a new segment. 
- fd = self.get_write_fd(want_new=not intermediate, no_new=intermediate) - if intermediate: - fd.sync() - header = self.header_no_crc_fmt.pack(self.header_fmt.size, TAG_COMMIT) - crc = self.crc_fmt.pack(crc32(header) & 0xFFFFFFFF) - fd.write(b"".join((crc, header))) - self.close_segment() - return self.segment - 1 # close_segment() increments it - - -assert LoggedIO.HEADER_ID_SIZE + LoggedIO.ENTRY_HASH_SIZE == 41 + 8 # see constants.MAX_OBJECT_SIZE + def store_delete(self, name): + return self.store.delete(name) diff --git a/src/borg/selftest.py b/src/borg/selftest.py index 8f6b693bb8..e53ae06837 100644 --- a/src/borg/selftest.py +++ b/src/borg/selftest.py @@ -33,7 +33,7 @@ ChunkerTestCase, ] -SELFTEST_COUNT = 33 +SELFTEST_COUNT = 19 class SelfTestResult(TestResult): diff --git a/src/borg/storelocking.py b/src/borg/storelocking.py new file mode 100644 index 0000000000..dc111f9c13 --- /dev/null +++ b/src/borg/storelocking.py @@ -0,0 +1,243 @@ +import datetime +import json +import random +import time + +from borgstore.store import ObjectNotFound + +from . import platform +from .checksums import xxh64 +from .helpers import Error, ErrorWithTraceback, bin_to_hex +from .logger import create_logger + +logger = create_logger(__name__) + + +class LockError(Error): + """Failed to acquire the lock {}.""" + + exit_mcode = 70 + + +class LockErrorT(ErrorWithTraceback): + """Failed to acquire the lock {}.""" + + exit_mcode = 71 + + +class LockFailed(LockErrorT): + """Failed to create/acquire the lock {} ({}).""" + + exit_mcode = 72 + + +class LockTimeout(LockError): + """Failed to create/acquire the lock {} (timeout).""" + + exit_mcode = 73 + + +class NotLocked(LockErrorT): + """Failed to release the lock {} (was not locked).""" + + exit_mcode = 74 + + +class NotMyLock(LockErrorT): + """Failed to release the lock {} (was/is locked, but not by me).""" + + exit_mcode = 75 + + +class Lock: + """ + A Lock for a resource that can be accessed in a shared or exclusive way. + Typically, write access to a resource needs an exclusive lock (1 writer, + no one is allowed reading) and read access to a resource needs a shared + lock (multiple readers are allowed). + + If possible, try to use the contextmanager here like:: + + with Lock(...) as lock: + ... + + This makes sure the lock is released again if the block is left, no + matter how (e.g. if an exception occurred). 
+ """ + + def __init__(self, store, exclusive=False, sleep=None, timeout=1.0, stale=30 * 60, id=None): + self.store = store + self.is_exclusive = exclusive + self.sleep = sleep + self.timeout = timeout + self.race_recheck_delay = 0.01 # local: 0.01, network/slow remote: >= 1.0 + self.other_locks_go_away_delay = 0.1 # local: 0.1, network/slow remote: >= 1.0 + self.retry_delay_min = 1.0 + self.retry_delay_max = 5.0 + self.stale_td = datetime.timedelta(seconds=stale) # ignore/delete it if older + self.refresh_td = datetime.timedelta(seconds=stale // 2) # don't refresh it if younger + self.last_refresh_dt = None + self.id = id or platform.get_process_id() + assert len(self.id) == 3 + + def __enter__(self): + return self.acquire() + + def __exit__(self, *exc): + self.release() + + def __repr__(self): + return f"<{self.__class__.__name__}: {self.id!r}>" + + def _create_lock(self, *, exclusive=None): + assert exclusive is not None + now = datetime.datetime.now(datetime.timezone.utc) + timestamp = now.isoformat(timespec="milliseconds") + lock = dict(exclusive=exclusive, hostid=self.id[0], processid=self.id[1], threadid=self.id[2], time=timestamp) + value = json.dumps(lock).encode("utf-8") + key = bin_to_hex(xxh64(value)) + self.store.store(f"locks/{key}", value) + self.last_refresh_dt = now + return key + + def _delete_lock(self, key, *, ignore_not_found=False): + try: + self.store.delete(f"locks/{key}") + except ObjectNotFound: + if not ignore_not_found: + raise + + def _is_stale_lock(self, lock): + now = datetime.datetime.now(datetime.timezone.utc) + if lock["dt"] < now - self.stale_td: + # lock is too old, it was not refreshed. + return True + if not platform.process_alive(lock["hostid"], lock["processid"], lock["threadid"]): + # we KNOW that the lock owning process is dead. + return True + return False + + def _get_locks(self): + locks = {} + try: + infos = list(self.store.list("locks")) + except ObjectNotFound: + return {} + for info in infos: + key = info.name + content = self.store.load(f"locks/{key}") + lock = json.loads(content.decode("utf-8")) + lock["key"] = key + lock["dt"] = datetime.datetime.fromisoformat(lock["time"]) + if self._is_stale_lock(lock): + # ignore it and delete it (even if it is not from us) + self._delete_lock(key, ignore_not_found=True) + else: + locks[key] = lock + return locks + + def _find_locks(self, *, only_exclusive=False, only_mine=False): + locks = self._get_locks() + found_locks = [] + for key in locks: + lock = locks[key] + if (not only_exclusive or lock["exclusive"]) and ( + not only_mine or (lock["hostid"], lock["processid"], lock["threadid"]) == self.id + ): + found_locks.append(lock) + return found_locks + + def acquire(self): + # goal + # for exclusive lock: there must be only 1 exclusive lock and no other (exclusive or non-exclusive) locks. + # for non-exclusive lock: there can be multiple n-e locks, but there must not exist an exclusive lock. + started = time.monotonic() + while time.monotonic() - started < self.timeout: + exclusive_locks = self._find_locks(only_exclusive=True) + if len(exclusive_locks) == 0: + # looks like there are no exclusive locks, create our lock. + key = self._create_lock(exclusive=self.is_exclusive) + # obviously we have a race condition here: other client(s) might have created exclusive + # lock(s) at the same time in parallel. thus we have to check again. 
+ time.sleep( + self.race_recheck_delay + ) # give other clients time to notice our exclusive lock, stop creating theirs + exclusive_locks = self._find_locks(only_exclusive=True) + if self.is_exclusive: + if len(exclusive_locks) == 1 and exclusive_locks[0]["key"] == key: + # success, we are the only exclusive lock! wait until the non-exclusive locks go away: + while time.monotonic() - started < self.timeout: + locks = self._find_locks(only_exclusive=False) + if len(locks) == 1 and locks[0]["key"] == key: + # success, we are alone! + return self + time.sleep(self.other_locks_go_away_delay) + break # timeout + else: + # take back our lock as some other client(s) also created exclusive lock(s). + self._delete_lock(key, ignore_not_found=True) + else: # not is_exclusive + if len(exclusive_locks) == 0: + # success, noone else created an exclusive lock meanwhile! + # We don't care for other non-exclusive locks. + return self + else: + # take back our lock as some other client(s) also created exclusive lock(s). + self._delete_lock(key, ignore_not_found=True) + # wait a random bit before retrying + time.sleep(self.retry_delay_min + (self.retry_delay_max - self.retry_delay_min) * random.random()) + raise LockTimeout(str(self.store)) + + def release(self): + locks = self._find_locks(only_mine=True) + if not locks: + raise NotLocked(str(self.store)) + assert len(locks) == 1 + self._delete_lock(locks[0]["key"], ignore_not_found=True) + self.last_refresh_dt = None + + def got_exclusive_lock(self): + locks = self._find_locks(only_mine=True, only_exclusive=True) + return len(locks) == 1 + + def break_lock(self): + """break ALL locks (not just ours)""" + locks = self._get_locks() + for key in locks: + self._delete_lock(key, ignore_not_found=True) + self.last_refresh_dt = None + + def migrate_lock(self, old_id, new_id): + """migrate the lock ownership from old_id to new_id""" + assert self.id == old_id + assert len(new_id) == 3 + old_locks = self._find_locks(only_mine=True) + assert len(old_locks) == 1 + self.id = new_id + self._create_lock(exclusive=old_locks[0]["exclusive"]) + self._delete_lock(old_locks[0]["key"]) + now = datetime.datetime.now(datetime.timezone.utc) + self.last_refresh_dt = now + + def refresh(self): + """refresh the lock - call this frequently, but not later than every seconds""" + now = datetime.datetime.now(datetime.timezone.utc) + if self.last_refresh_dt is not None and now > self.last_refresh_dt + self.refresh_td: + old_locks = self._find_locks(only_mine=True) + if len(old_locks) == 0: + # crap, my lock has been removed. :-( + # this can happen e.g. if my machine has been suspended while doing a backup, so that the + # lock will auto-expire. a borg client on another machine might then kill that lock. + # if my machine then wakes up again, the lock will have vanished and we get here. + # in this case, we need to abort the operation, because the other borg might have removed + # repo objects we have written, but the referential tree was not yet full present, e.g. + # no archive has been added yet to the manifest, thus all objects looked unused/orphaned. + # another scenario when this can happen is a careless user running break-lock on another + # machine without making sure there is no borg activity in that repo. + raise LockTimeout(str(self.store)) # our lock was killed, there is no safe way to continue. 
+ assert len(old_locks) == 1 # there shouldn't be more than 1 + old_lock = old_locks[0] + if old_lock["dt"] < now - self.refresh_td: + self._create_lock(exclusive=old_lock["exclusive"]) + self._delete_lock(old_lock["key"]) + self.last_refresh_dt = now diff --git a/src/borg/testsuite/archiver/__init__.py b/src/borg/testsuite/archiver/__init__.py index 9d7a5db42d..2ebcb45738 100644 --- a/src/borg/testsuite/archiver/__init__.py +++ b/src/borg/testsuite/archiver/__init__.py @@ -8,7 +8,6 @@ import sys import tempfile import time -from configparser import ConfigParser from contextlib import contextmanager from datetime import datetime from io import BytesIO, StringIO @@ -18,11 +17,9 @@ from ... import xattr, platform from ...archive import Archive from ...archiver import Archiver, PURE_PYTHON_MSGPACK_WARNING -from ...cache import Cache, LocalCache from ...constants import * # NOQA from ...helpers import Location, umount from ...helpers import EXIT_SUCCESS -from ...helpers import bin_to_hex from ...helpers import init_ec_warnings from ...logger import flush_logging from ...manifest import Manifest @@ -261,12 +258,8 @@ def _extract_repository_id(repo_path): def _set_repository_id(repo_path, id): - config = ConfigParser(interpolation=None) - config.read(os.path.join(repo_path, "config")) - config.set("repository", "id", bin_to_hex(id)) - with open(os.path.join(repo_path, "config"), "w") as fd: - config.write(fd) with Repository(repo_path) as repository: + repository._set_id(id) return repository.id @@ -348,34 +341,6 @@ def _assert_test_keep_tagged(archiver): assert sorted(os.listdir("output/input/taggedall")), [".NOBACKUP1", ".NOBACKUP2", CACHE_TAG_NAME] -def check_cache(archiver): - # First run a regular borg check - cmd(archiver, "check") - # Then check that the cache on disk matches exactly what's in the repo. - with open_repository(archiver) as repository: - manifest = Manifest.load(repository, Manifest.NO_OPERATION_CHECK) - with Cache(repository, manifest, sync=False) as cache: - original_chunks = cache.chunks - # the LocalCache implementation has an on-disk chunks cache, - # but AdHocWithFilesCache and AdHocCache don't have persistent chunks cache. - persistent = isinstance(cache, LocalCache) - Cache.destroy(repository) - with Cache(repository, manifest) as cache: - correct_chunks = cache.chunks - if not persistent: - # there is no point in doing the checks - return - assert original_chunks is not correct_chunks - seen = set() - for id, (refcount, size) in correct_chunks.iteritems(): - o_refcount, o_size = original_chunks[id] - assert refcount == o_refcount - assert size == o_size - seen.add(id) - for id, (refcount, size) in original_chunks.iteritems(): - assert id in seen - - @contextmanager def assert_creates_file(path): assert not os.path.exists(path), f"{path} should not exist" diff --git a/src/borg/testsuite/archiver/bypass_lock_option.py b/src/borg/testsuite/archiver/bypass_lock_option.py deleted file mode 100644 index 8ddeb6762c..0000000000 --- a/src/borg/testsuite/archiver/bypass_lock_option.py +++ /dev/null @@ -1,130 +0,0 @@ -import pytest - -from ...constants import * # NOQA -from ...helpers import EXIT_ERROR -from ...locking import LockFailed -from ...remote import RemoteRepository -from .. import llfuse -from . 
import cmd, create_src_archive, RK_ENCRYPTION, read_only, fuse_mount - - -def test_readonly_check(archiver): - cmd(archiver, "rcreate", RK_ENCRYPTION) - create_src_archive(archiver, "test") - - with read_only(archiver.repository_path): - # verify that command normally doesn't work with read-only repo - if archiver.FORK_DEFAULT: - cmd(archiver, "check", "--verify-data", exit_code=EXIT_ERROR) - else: - with pytest.raises((LockFailed, RemoteRepository.RPCError)) as excinfo: - cmd(archiver, "check", "--verify-data") - if isinstance(excinfo.value, RemoteRepository.RPCError): - assert excinfo.value.exception_class == "LockFailed" - # verify that command works with read-only repo when using --bypass-lock - cmd(archiver, "check", "--verify-data", "--bypass-lock") - - -def test_readonly_diff(archiver): - cmd(archiver, "rcreate", RK_ENCRYPTION) - create_src_archive(archiver, "a") - create_src_archive(archiver, "b") - - with read_only(archiver.repository_path): - # verify that command normally doesn't work with read-only repo - if archiver.FORK_DEFAULT: - cmd(archiver, "diff", "a", "b", exit_code=EXIT_ERROR) - else: - with pytest.raises((LockFailed, RemoteRepository.RPCError)) as excinfo: - cmd(archiver, "diff", "a", "b") - if isinstance(excinfo.value, RemoteRepository.RPCError): - assert excinfo.value.exception_class == "LockFailed" - # verify that command works with read-only repo when using --bypass-lock - cmd(archiver, "diff", "a", "b", "--bypass-lock") - - -def test_readonly_export_tar(archiver): - cmd(archiver, "rcreate", RK_ENCRYPTION) - create_src_archive(archiver, "test") - - with read_only(archiver.repository_path): - # verify that command normally doesn't work with read-only repo - if archiver.FORK_DEFAULT: - cmd(archiver, "export-tar", "test", "test.tar", exit_code=EXIT_ERROR) - else: - with pytest.raises((LockFailed, RemoteRepository.RPCError)) as excinfo: - cmd(archiver, "export-tar", "test", "test.tar") - if isinstance(excinfo.value, RemoteRepository.RPCError): - assert excinfo.value.exception_class == "LockFailed" - # verify that command works with read-only repo when using --bypass-lock - cmd(archiver, "export-tar", "test", "test.tar", "--bypass-lock") - - -def test_readonly_extract(archiver): - cmd(archiver, "rcreate", RK_ENCRYPTION) - create_src_archive(archiver, "test") - - with read_only(archiver.repository_path): - # verify that command normally doesn't work with read-only repo - if archiver.FORK_DEFAULT: - cmd(archiver, "extract", "test", exit_code=EXIT_ERROR) - else: - with pytest.raises((LockFailed, RemoteRepository.RPCError)) as excinfo: - cmd(archiver, "extract", "test") - if isinstance(excinfo.value, RemoteRepository.RPCError): - assert excinfo.value.exception_class == "LockFailed" - # verify that command works with read-only repo when using --bypass-lock - cmd(archiver, "extract", "test", "--bypass-lock") - - -def test_readonly_info(archiver): - cmd(archiver, "rcreate", RK_ENCRYPTION) - create_src_archive(archiver, "test") - with read_only(archiver.repository_path): - # verify that command normally doesn't work with read-only repo - if archiver.FORK_DEFAULT: - cmd(archiver, "rinfo", exit_code=EXIT_ERROR) - else: - with pytest.raises((LockFailed, RemoteRepository.RPCError)) as excinfo: - cmd(archiver, "rinfo") - if isinstance(excinfo.value, RemoteRepository.RPCError): - assert excinfo.value.exception_class == "LockFailed" - # verify that command works with read-only repo when using --bypass-lock - cmd(archiver, "rinfo", "--bypass-lock") - - -def 
test_readonly_list(archiver): - cmd(archiver, "rcreate", RK_ENCRYPTION) - create_src_archive(archiver, "test") - with read_only(archiver.repository_path): - # verify that command normally doesn't work with read-only repo - if archiver.FORK_DEFAULT: - cmd(archiver, "rlist", exit_code=EXIT_ERROR) - else: - with pytest.raises((LockFailed, RemoteRepository.RPCError)) as excinfo: - cmd(archiver, "rlist") - if isinstance(excinfo.value, RemoteRepository.RPCError): - assert excinfo.value.exception_class == "LockFailed" - # verify that command works with read-only repo when using --bypass-lock - cmd(archiver, "rlist", "--bypass-lock") - - -@pytest.mark.skipif(not llfuse, reason="llfuse not installed") -def test_readonly_mount(archiver): - cmd(archiver, "rcreate", RK_ENCRYPTION) - create_src_archive(archiver, "test") - with read_only(archiver.repository_path): - # verify that command normally doesn't work with read-only repo - if archiver.FORK_DEFAULT: - with fuse_mount(archiver, exit_code=EXIT_ERROR): - pass - else: - with pytest.raises((LockFailed, RemoteRepository.RPCError)) as excinfo: - # self.fuse_mount always assumes fork=True, so for this test we have to set fork=False manually - with fuse_mount(archiver, fork=False): - pass - if isinstance(excinfo.value, RemoteRepository.RPCError): - assert excinfo.value.exception_class == "LockFailed" - # verify that command works with read-only repo when using --bypass-lock - with fuse_mount(archiver, None, "--bypass-lock"): - pass diff --git a/src/borg/testsuite/archiver/check_cmd.py b/src/borg/testsuite/archiver/check_cmd.py index 87fd10ab37..2f65110ce1 100644 --- a/src/borg/testsuite/archiver/check_cmd.py +++ b/src/borg/testsuite/archiver/check_cmd.py @@ -8,7 +8,9 @@ from ...constants import * # NOQA from ...helpers import bin_to_hex, msgpack from ...manifest import Manifest +from ...remote import RemoteRepository from ...repository import Repository +from ..repository import fchunk from . 
import cmd, src_file, create_src_archive, open_archive, generate_archiver_tests, RK_ENCRYPTION pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary") # NOQA @@ -26,17 +28,15 @@ def test_check_usage(archivers, request): check_cmd_setup(archiver) output = cmd(archiver, "check", "-v", "--progress", exit_code=0) - assert "Starting repository check" in output + assert "Starting full repository check" in output assert "Starting archive consistency check" in output - assert "Checking segments" in output output = cmd(archiver, "check", "-v", "--repository-only", exit_code=0) - assert "Starting repository check" in output + assert "Starting full repository check" in output assert "Starting archive consistency check" not in output - assert "Checking segments" not in output output = cmd(archiver, "check", "-v", "--archives-only", exit_code=0) - assert "Starting repository check" not in output + assert "Starting full repository check" not in output assert "Starting archive consistency check" in output output = cmd(archiver, "check", "-v", "--archives-only", "--match-archives=archive2", exit_code=0) @@ -105,7 +105,6 @@ def test_missing_file_chunk(archivers, request): break else: pytest.fail("should not happen") # convert 'fail' - repository.commit(compact=False) cmd(archiver, "check", exit_code=1) output = cmd(archiver, "check", "--repair", exit_code=0) @@ -171,7 +170,6 @@ def test_missing_archive_item_chunk(archivers, request): archive, repository = open_archive(archiver.repository_path, "archive1") with repository: repository.delete(archive.metadata.items[0]) - repository.commit(compact=False) cmd(archiver, "check", exit_code=1) cmd(archiver, "check", "--repair", exit_code=0) cmd(archiver, "check", exit_code=0) @@ -183,7 +181,6 @@ def test_missing_archive_metadata(archivers, request): archive, repository = open_archive(archiver.repository_path, "archive1") with repository: repository.delete(archive.id) - repository.commit(compact=False) cmd(archiver, "check", exit_code=1) cmd(archiver, "check", "--repair", exit_code=0) cmd(archiver, "check", exit_code=0) @@ -194,8 +191,10 @@ def test_missing_manifest(archivers, request): check_cmd_setup(archiver) archive, repository = open_archive(archiver.repository_path, "archive1") with repository: - repository.delete(Manifest.MANIFEST_ID) - repository.commit(compact=False) + if isinstance(repository, (Repository, RemoteRepository)): + repository.store_delete("config/manifest") + else: + repository.delete(Manifest.MANIFEST_ID) cmd(archiver, "check", exit_code=1) output = cmd(archiver, "check", "-v", "--repair", exit_code=0) assert "archive1" in output @@ -208,10 +207,9 @@ def test_corrupted_manifest(archivers, request): check_cmd_setup(archiver) archive, repository = open_archive(archiver.repository_path, "archive1") with repository: - manifest = repository.get(Manifest.MANIFEST_ID) - corrupted_manifest = manifest + b"corrupted!" - repository.put(Manifest.MANIFEST_ID, corrupted_manifest) - repository.commit(compact=False) + manifest = repository.get_manifest() + corrupted_manifest = manifest[:123] + b"corrupted!" + manifest[123:] + repository.put_manifest(corrupted_manifest) cmd(archiver, "check", exit_code=1) output = cmd(archiver, "check", "-v", "--repair", exit_code=0) assert "archive1" in output @@ -242,8 +240,7 @@ def test_spoofed_manifest(archivers, request): ) # maybe a repo-side attacker could manage to move the fake manifest file chunk over to the manifest ID. 
# we simulate this here by directly writing the fake manifest data to the manifest ID. - repository.put(Manifest.MANIFEST_ID, cdata) - repository.commit(compact=False) + repository.put_manifest(cdata) # borg should notice that the manifest has the wrong ro_type. cmd(archiver, "check", exit_code=1) # borg check --repair should remove the corrupted manifest and rebuild a new one. @@ -258,13 +255,12 @@ def test_manifest_rebuild_corrupted_chunk(archivers, request): check_cmd_setup(archiver) archive, repository = open_archive(archiver.repository_path, "archive1") with repository: - manifest = repository.get(Manifest.MANIFEST_ID) - corrupted_manifest = manifest + b"corrupted!" - repository.put(Manifest.MANIFEST_ID, corrupted_manifest) + manifest = repository.get_manifest() + corrupted_manifest = manifest[:123] + b"corrupted!" + manifest[123:] + repository.put_manifest(corrupted_manifest) chunk = repository.get(archive.id) corrupted_chunk = chunk + b"corrupted!" repository.put(archive.id, corrupted_chunk) - repository.commit(compact=False) cmd(archiver, "check", exit_code=1) output = cmd(archiver, "check", "-v", "--repair", exit_code=0) assert "archive2" in output @@ -277,9 +273,9 @@ def test_manifest_rebuild_duplicate_archive(archivers, request): archive, repository = open_archive(archiver.repository_path, "archive1") repo_objs = archive.repo_objs with repository: - manifest = repository.get(Manifest.MANIFEST_ID) - corrupted_manifest = manifest + b"corrupted!" - repository.put(Manifest.MANIFEST_ID, corrupted_manifest) + manifest = repository.get_manifest() + corrupted_manifest = manifest[:123] + b"corrupted!" + manifest[123:] + repository.put_manifest(corrupted_manifest) archive_dict = { "command_line": "", "item_ptrs": [], @@ -292,9 +288,12 @@ def test_manifest_rebuild_duplicate_archive(archivers, request): archive = repo_objs.key.pack_metadata(archive_dict) archive_id = repo_objs.id_hash(archive) repository.put(archive_id, repo_objs.format(archive_id, {}, archive, ro_type=ROBJ_ARCHIVE_META)) - repository.commit(compact=False) cmd(archiver, "check", exit_code=1) - cmd(archiver, "check", "--repair", exit_code=0) + # when undeleting archives, borg check will discover both the original archive1 as well as + # the fake archive1 we created above. for the fake one, a new archives directory entry + # named archive1.1 will be created because we request undeleting archives and there + # is no archives directory entry for the fake archive yet. + cmd(archiver, "check", "--repair", "--undelete-archives", exit_code=0) output = cmd(archiver, "rlist") assert "archive1" in output assert "archive1.1" in output @@ -308,9 +307,9 @@ def test_spoofed_archive(archivers, request): repo_objs = archive.repo_objs with repository: # attacker would corrupt or delete the manifest to trigger a rebuild of it: - manifest = repository.get(Manifest.MANIFEST_ID) - corrupted_manifest = manifest + b"corrupted!" - repository.put(Manifest.MANIFEST_ID, corrupted_manifest) + manifest = repository.get_manifest() + corrupted_manifest = manifest[:123] + b"corrupted!" 
+ manifest[123:] + repository.put_manifest(corrupted_manifest) archive_dict = { "command_line": "", "item_ptrs": [], @@ -333,7 +332,6 @@ def test_spoofed_archive(archivers, request): ro_type=ROBJ_FILE_STREAM, # a real archive is stored with ROBJ_ARCHIVE_META ), ) - repository.commit(compact=False) cmd(archiver, "check", exit_code=1) cmd(archiver, "check", "--repair", "--debug", exit_code=0) output = cmd(archiver, "rlist") @@ -349,18 +347,61 @@ def test_extra_chunks(archivers, request): check_cmd_setup(archiver) cmd(archiver, "check", exit_code=0) with Repository(archiver.repository_location, exclusive=True) as repository: - repository.put(b"01234567890123456789012345678901", b"xxxx") - repository.commit(compact=False) - output = cmd(archiver, "check", "-v", exit_code=0) # orphans are not considered warnings anymore - assert "1 orphaned (unused) objects found." in output - cmd(archiver, "check", "--repair", exit_code=0) - output = cmd(archiver, "check", "-v", exit_code=0) - assert "orphaned (unused) objects found." not in output - cmd(archiver, "extract", "archive1", "--dry-run", exit_code=0) + chunk = fchunk(b"xxxx") + repository.put(b"01234567890123456789012345678901", chunk) + cmd(archiver, "check", "-v", exit_code=0) # check does not deal with orphans anymore @pytest.mark.parametrize("init_args", [["--encryption=repokey-aes-ocb"], ["--encryption", "none"]]) def test_verify_data(archivers, request, init_args): + archiver = request.getfixturevalue(archivers) + if archiver.get_kind() != "local": + pytest.skip("only works locally, patches objects") + + # it's tricky to test the cryptographic data verification, because usually already the + # repository-level xxh64 hash fails to verify. So we use a fake one that doesn't. + # note: it only works like tested here for a highly engineered data corruption attack, + # because with accidental corruption, usually already the xxh64 low-level check fails. + def fake_xxh64(data, seed=0): + return b"fakefake" + + import borg.repoobj + import borg.repository + + with patch.object(borg.repoobj, "xxh64", fake_xxh64), patch.object(borg.repository, "xxh64", fake_xxh64): + check_cmd_setup(archiver) + shutil.rmtree(archiver.repository_path) + cmd(archiver, "rcreate", *init_args) + create_src_archive(archiver, "archive1") + archive, repository = open_archive(archiver.repository_path, "archive1") + with repository: + for item in archive.iter_items(): + if item.path.endswith(src_file): + chunk = item.chunks[-1] + data = repository.get(chunk.id) + data = data[0:123] + b"x" + data[123:] + repository.put(chunk.id, data) + break + + # the normal archives check does not read file content data. + cmd(archiver, "check", "--archives-only", exit_code=0) + # but with --verify-data, it does and notices the issue. + output = cmd(archiver, "check", "--archives-only", "--verify-data", exit_code=1) + assert f"{bin_to_hex(chunk.id)}, integrity error" in output + + # repair (heal is tested in another test) + output = cmd(archiver, "check", "--repair", "--verify-data", exit_code=0) + assert f"{bin_to_hex(chunk.id)}, integrity error" in output + assert f"{src_file}: New missing file chunk detected" in output + + # run with --verify-data again, all fine now (file was patched with a replacement chunk). 
+ cmd(archiver, "check", "--archives-only", "--verify-data", exit_code=0) + + +@pytest.mark.parametrize("init_args", [["--encryption=repokey-aes-ocb"], ["--encryption", "none"]]) +def test_corrupted_file_chunk(archivers, request, init_args): + ## similar to test_verify_data, but here we let the low level repository-only checks discover the issue. + archiver = request.getfixturevalue(archivers) check_cmd_setup(archiver) shutil.rmtree(archiver.repository_path) @@ -372,19 +413,22 @@ def test_verify_data(archivers, request, init_args): if item.path.endswith(src_file): chunk = item.chunks[-1] data = repository.get(chunk.id) - data = data[0:100] + b"x" + data[101:] + data = data[0:123] + b"x" + data[123:] repository.put(chunk.id, data) break - repository.commit(compact=False) - cmd(archiver, "check", exit_code=0) - output = cmd(archiver, "check", "--verify-data", exit_code=1) - assert bin_to_hex(chunk.id) + ", integrity error" in output + + # the normal check checks all repository objects and the xxh64 checksum fails. + output = cmd(archiver, "check", "--repository-only", exit_code=1) + assert f"{bin_to_hex(chunk.id)} is corrupted: data does not match checksum." in output # repair (heal is tested in another test) - output = cmd(archiver, "check", "--repair", "--verify-data", exit_code=0) - assert bin_to_hex(chunk.id) + ", integrity error" in output + output = cmd(archiver, "check", "--repair", exit_code=0) + assert f"{bin_to_hex(chunk.id)} is corrupted: data does not match checksum." in output assert f"{src_file}: New missing file chunk detected" in output + # run normal check again, all fine now (file was patched with a replacement chunk). + cmd(archiver, "check", "--repository-only", exit_code=0) + def test_empty_repository(archivers, request): archiver = request.getfixturevalue(archivers) @@ -392,7 +436,6 @@ def test_empty_repository(archivers, request): pytest.skip("only works locally") check_cmd_setup(archiver) with Repository(archiver.repository_location, exclusive=True) as repository: - for id_ in repository.list(): - repository.delete(id_) - repository.commit(compact=False) + for id, _ in repository.list(): + repository.delete(id) cmd(archiver, "check", exit_code=1) diff --git a/src/borg/testsuite/archiver/checks.py b/src/borg/testsuite/archiver/checks.py index a9324fbdf5..e6c407e8d0 100644 --- a/src/borg/testsuite/archiver/checks.py +++ b/src/borg/testsuite/archiver/checks.py @@ -4,7 +4,7 @@ import pytest -from ...cache import Cache, LocalCache, get_cache_impl +from ...cache import Cache from ...constants import * # NOQA from ...helpers import Location, get_security_dir, bin_to_hex from ...helpers import EXIT_ERROR @@ -13,7 +13,7 @@ from ...repository import Repository from .. import llfuse from .. import changedir -from . import cmd, _extract_repository_id, open_repository, check_cache, create_test_files +from . import cmd, _extract_repository_id, create_test_files from . 
import _set_repository_id, create_regular_file, assert_creates_file, generate_archiver_tests, RK_ENCRYPTION pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote") # NOQA @@ -29,7 +29,6 @@ def add_unknown_feature(repo_path, operation): manifest = Manifest.load(repository, Manifest.NO_OPERATION_CHECK) manifest.config["feature_flags"] = {operation.value: {"mandatory": ["unknown-feature"]}} manifest.write() - repository.commit(compact=False) def cmd_raises_unknown_feature(archiver, args): @@ -204,17 +203,6 @@ def test_unknown_feature_on_create(archivers, request): cmd_raises_unknown_feature(archiver, ["create", "test", "input"]) -@pytest.mark.skipif(get_cache_impl() in ("adhocwithfiles", "adhoc"), reason="only works with LocalCache") -def test_unknown_feature_on_cache_sync(archivers, request): - # LocalCache.sync checks repo compat - archiver = request.getfixturevalue(archivers) - cmd(archiver, "rcreate", RK_ENCRYPTION) - # delete the cache to trigger a cache sync later in borg create - cmd(archiver, "rdelete", "--cache-only") - add_unknown_feature(archiver.repository_path, Manifest.Operation.READ) - cmd_raises_unknown_feature(archiver, ["create", "test", "input"]) - - def test_unknown_feature_on_change_passphrase(archivers, request): archiver = request.getfixturevalue(archivers) print(cmd(archiver, "rcreate", RK_ENCRYPTION)) @@ -266,7 +254,6 @@ def test_unknown_feature_on_mount(archivers, request): cmd_raises_unknown_feature(archiver, ["mount", mountpoint]) -@pytest.mark.allow_cache_wipe def test_unknown_mandatory_feature_in_cache(archivers, request): archiver = request.getfixturevalue(archivers) remote_repo = archiver.get_kind() == "remote" @@ -277,27 +264,10 @@ def test_unknown_mandatory_feature_in_cache(archivers, request): repository._location = Location(archiver.repository_location) manifest = Manifest.load(repository, Manifest.NO_OPERATION_CHECK) with Cache(repository, manifest) as cache: - is_localcache = isinstance(cache, LocalCache) - cache.begin_txn() cache.cache_config.mandatory_features = {"unknown-feature"} - cache.commit() if archiver.FORK_DEFAULT: cmd(archiver, "create", "test", "input") - else: - called = False - wipe_cache_safe = LocalCache.wipe_cache - - def wipe_wrapper(*args): - nonlocal called - called = True - wipe_cache_safe(*args) - - with patch.object(LocalCache, "wipe_cache", wipe_wrapper): - cmd(archiver, "create", "test", "input") - - if is_localcache: - assert called with Repository(archiver.repository_path, exclusive=True) as repository: if remote_repo: @@ -307,41 +277,6 @@ def wipe_wrapper(*args): assert cache.cache_config.mandatory_features == set() -def test_check_cache(archivers, request): - archiver = request.getfixturevalue(archivers) - cmd(archiver, "rcreate", RK_ENCRYPTION) - cmd(archiver, "create", "test", "input") - with open_repository(archiver) as repository: - manifest = Manifest.load(repository, Manifest.NO_OPERATION_CHECK) - with Cache(repository, manifest, sync=False) as cache: - cache.begin_txn() - cache.chunks.incref(list(cache.chunks.iteritems())[0][0]) - cache.commit() - persistent = isinstance(cache, LocalCache) - if not persistent: - pytest.skip("check_cache is pointless if we do not have a persistent chunks cache") - with pytest.raises(AssertionError): - check_cache(archiver) - - -@pytest.mark.skipif(get_cache_impl() in ("adhocwithfiles", "adhoc"), reason="only works with LocalCache") -def test_env_use_chunks_archive(archivers, request, monkeypatch): - archiver = request.getfixturevalue(archivers) - 
create_test_files(archiver.input_path) - monkeypatch.setenv("BORG_USE_CHUNKS_ARCHIVE", "no") - cmd(archiver, "rcreate", RK_ENCRYPTION) - repository_id = bin_to_hex(_extract_repository_id(archiver.repository_path)) - cache_path = os.path.join(archiver.cache_path, repository_id) - cmd(archiver, "create", "test", "input") - assert os.path.exists(cache_path) - assert os.path.exists(os.path.join(cache_path, "chunks.archive.d")) - assert len(os.listdir(os.path.join(cache_path, "chunks.archive.d"))) == 0 - cmd(archiver, "rdelete", "--cache-only") - monkeypatch.setenv("BORG_USE_CHUNKS_ARCHIVE", "yes") - cmd(archiver, "create", "test2", "input") - assert len(os.listdir(os.path.join(cache_path, "chunks.archive.d"))) > 0 - - # Begin Remote Tests def test_remote_repo_restrict_to_path(remote_archiver): original_location, repo_path = remote_archiver.repository_location, remote_archiver.repository_path diff --git a/src/borg/testsuite/archiver/compact_cmd.py b/src/borg/testsuite/archiver/compact_cmd.py new file mode 100644 index 0000000000..1b90ecf5b6 --- /dev/null +++ b/src/borg/testsuite/archiver/compact_cmd.py @@ -0,0 +1,44 @@ +from ...constants import * # NOQA +from . import cmd, create_src_archive, generate_archiver_tests, RK_ENCRYPTION + +pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary") # NOQA + + +def test_compact_empty_repository(archivers, request): + archiver = request.getfixturevalue(archivers) + + cmd(archiver, "rcreate", RK_ENCRYPTION) + + output = cmd(archiver, "compact", "-v", exit_code=0) + assert "Starting compaction" in output + assert "Repository size is 0 B in 0 objects." in output + assert "Finished compaction" in output + + +def test_compact_after_deleting_all_archives(archivers, request): + archiver = request.getfixturevalue(archivers) + + cmd(archiver, "rcreate", RK_ENCRYPTION) + create_src_archive(archiver, "archive") + cmd(archiver, "delete", "-a", "archive", exit_code=0) + + output = cmd(archiver, "compact", "-v", exit_code=0) + assert "Starting compaction" in output + assert "Deleting " in output + assert "Repository size is 0 B in 0 objects." in output + assert "Finished compaction" in output + + +def test_compact_after_deleting_some_archives(archivers, request): + archiver = request.getfixturevalue(archivers) + + cmd(archiver, "rcreate", RK_ENCRYPTION) + create_src_archive(archiver, "archive1") + create_src_archive(archiver, "archive2") + cmd(archiver, "delete", "-a", "archive1", exit_code=0) + + output = cmd(archiver, "compact", "-v", exit_code=0) + assert "Starting compaction" in output + assert "Deleting " in output + assert "Repository size is 0 B in 0 objects." not in output + assert "Finished compaction" in output diff --git a/src/borg/testsuite/archiver/config_cmd.py b/src/borg/testsuite/archiver/config_cmd.py deleted file mode 100644 index fa89df241d..0000000000 --- a/src/borg/testsuite/archiver/config_cmd.py +++ /dev/null @@ -1,64 +0,0 @@ -import os -import pytest - -from ...constants import * # NOQA -from . 
import RK_ENCRYPTION, create_test_files, cmd, generate_archiver_tests -from ...helpers import CommandError, Error - -pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,binary") # NOQA - - -def test_config(archivers, request): - archiver = request.getfixturevalue(archivers) - create_test_files(archiver.input_path) - os.unlink("input/flagfile") - cmd(archiver, "rcreate", RK_ENCRYPTION) - output = cmd(archiver, "config", "--list") - assert "[repository]" in output - assert "version" in output - assert "segments_per_dir" in output - assert "storage_quota" in output - assert "append_only" in output - assert "additional_free_space" in output - assert "id" in output - assert "last_segment_checked" not in output - - if archiver.FORK_DEFAULT: - output = cmd(archiver, "config", "last_segment_checked", exit_code=2) - assert "No option " in output - else: - with pytest.raises(Error): - cmd(archiver, "config", "last_segment_checked") - - cmd(archiver, "config", "last_segment_checked", "123") - output = cmd(archiver, "config", "last_segment_checked") - assert output == "123" + os.linesep - output = cmd(archiver, "config", "--list") - assert "last_segment_checked" in output - cmd(archiver, "config", "--delete", "last_segment_checked") - - for cfg_key, cfg_value in [("additional_free_space", "2G"), ("repository.append_only", "1")]: - output = cmd(archiver, "config", cfg_key) - assert output == "0" + os.linesep - cmd(archiver, "config", cfg_key, cfg_value) - output = cmd(archiver, "config", cfg_key) - assert output == cfg_value + os.linesep - cmd(archiver, "config", "--delete", cfg_key) - if archiver.FORK_DEFAULT: - cmd(archiver, "config", cfg_key, exit_code=2) - else: - with pytest.raises(Error): - cmd(archiver, "config", cfg_key) - - cmd(archiver, "config", "--list", "--delete", exit_code=2) - if archiver.FORK_DEFAULT: - expected_ec = CommandError().exit_code - cmd(archiver, "config", exit_code=expected_ec) - else: - with pytest.raises(CommandError): - cmd(archiver, "config") - if archiver.FORK_DEFAULT: - cmd(archiver, "config", "invalid-option", exit_code=2) - else: - with pytest.raises(Error): - cmd(archiver, "config", "invalid-option") diff --git a/src/borg/testsuite/archiver/corruption.py b/src/borg/testsuite/archiver/corruption.py index 65804eaca6..7bd4c55f76 100644 --- a/src/borg/testsuite/archiver/corruption.py +++ b/src/borg/testsuite/archiver/corruption.py @@ -2,33 +2,12 @@ import json import os from configparser import ConfigParser -from unittest.mock import patch import pytest from ...constants import * # NOQA -from ...helpers import bin_to_hex, Error -from . import cmd, create_src_archive, create_test_files, RK_ENCRYPTION -from ...hashindex import ChunkIndex -from ...cache import LocalCache - - -def test_check_corrupted_repository(archiver): - cmd(archiver, "rcreate", RK_ENCRYPTION) - create_src_archive(archiver, "test") - cmd(archiver, "extract", "test", "--dry-run") - cmd(archiver, "check") - - name = sorted(os.listdir(os.path.join(archiver.tmpdir, "repository", "data", "0")), reverse=True)[1] - with open(os.path.join(archiver.tmpdir, "repository", "data", "0", name), "r+b") as fd: - fd.seek(100) - fd.write(b"XXXX") - - if archiver.FORK_DEFAULT: - cmd(archiver, "check", exit_code=1) - else: - with pytest.raises(Error): - cmd(archiver, "check") +from ...helpers import bin_to_hex +from . 
import cmd, create_test_files, RK_ENCRYPTION def corrupt_archiver(archiver): @@ -45,40 +24,6 @@ def corrupt(file, amount=1): fd.write(corrupted) -@pytest.mark.allow_cache_wipe -def test_cache_chunks(archiver): - corrupt_archiver(archiver) - if archiver.cache_path is None: - pytest.skip("no cache path for this kind of Cache implementation") - - create_src_archive(archiver, "test") - chunks_path = os.path.join(archiver.cache_path, "chunks") - if not os.path.exists(chunks_path): - pytest.skip("no persistent chunks index for this kind of Cache implementation") - - chunks_before_corruption = set(ChunkIndex(path=chunks_path).iteritems()) - - corrupt(chunks_path) - - assert not archiver.FORK_DEFAULT # test does not support forking - - chunks_in_memory = None - sync_chunks = LocalCache.sync - - def sync_wrapper(cache): - nonlocal chunks_in_memory - sync_chunks(cache) - chunks_in_memory = set(cache.chunks.iteritems()) - - with patch.object(LocalCache, "sync", sync_wrapper): - out = cmd(archiver, "rinfo") - - assert chunks_in_memory == chunks_before_corruption - assert "forcing a cache rebuild" in out - chunks_after_repair = set(ChunkIndex(path=chunks_path).iteritems()) - assert chunks_after_repair == chunks_before_corruption - - def test_cache_files(archiver): corrupt_archiver(archiver) if archiver.cache_path is None: @@ -91,42 +36,6 @@ def test_cache_files(archiver): assert "files cache is corrupted" in out -def test_chunks_archive(archiver): - corrupt_archiver(archiver) - if archiver.cache_path is None: - pytest.skip("no cache path for this kind of Cache implementation") - - cmd(archiver, "create", "test1", "input") - # Find ID of test1, so we can corrupt it later :) - target_id = cmd(archiver, "rlist", "--format={id}{NL}").strip() - cmd(archiver, "create", "test2", "input") - - # Force cache sync, creating archive chunks of test1 and test2 in chunks.archive.d - cmd(archiver, "rdelete", "--cache-only") - cmd(archiver, "rinfo", "--json") - - chunks_archive = os.path.join(archiver.cache_path, "chunks.archive.d") - if not os.path.exists(chunks_archive): - pytest.skip("Only LocalCache has a per-archive chunks index cache.") - assert len(os.listdir(chunks_archive)) == 4 # two archives, one chunks cache and one .integrity file each - - corrupt(os.path.join(chunks_archive, target_id + ".compact")) - - # Trigger cache sync by changing the manifest ID in the cache config - config_path = os.path.join(archiver.cache_path, "config") - config = ConfigParser(interpolation=None) - config.read(config_path) - config.set("cache", "manifest", bin_to_hex(bytes(32))) - with open(config_path, "w") as fd: - config.write(fd) - - # Cache sync notices corrupted archive chunks, but automatically recovers. 
- out = cmd(archiver, "create", "-v", "test3", "input", exit_code=1) - assert "Reading cached archive chunk index for test1" in out - assert "Cached archive chunk index of test1 is corrupted" in out - assert "Fetching and building archive index for test1" in out - - def test_old_version_interfered(archiver): corrupt_archiver(archiver) if archiver.cache_path is None: diff --git a/src/borg/testsuite/archiver/create_cmd.py b/src/borg/testsuite/archiver/create_cmd.py index 72e8bc97d6..342d97c60c 100644 --- a/src/borg/testsuite/archiver/create_cmd.py +++ b/src/borg/testsuite/archiver/create_cmd.py @@ -236,33 +236,9 @@ def test_create_stdin(archivers, request): assert extracted_data == input_data -def test_create_stdin_checkpointing(archivers, request): - archiver = request.getfixturevalue(archivers) - chunk_size = 1000 # fixed chunker with this size, also volume based checkpointing after that volume - cmd(archiver, "rcreate", RK_ENCRYPTION) - input_data = b"X" * (chunk_size * 2 - 1) # one full and one partial chunk - cmd( - archiver, - "create", - f"--chunker-params=fixed,{chunk_size}", - f"--checkpoint-volume={chunk_size}", - "test", - "-", - input=input_data, - ) - # repo looking good overall? checks for rc == 0. - cmd(archiver, "check", "--debug") - # verify that there are no part files in final archive - out = cmd(archiver, "list", "test") - assert "stdin.borg_part" not in out - # verify full file - out = cmd(archiver, "extract", "test", "stdin", "--stdout", binary_output=True) - assert out == input_data - - def test_create_erroneous_file(archivers, request): archiver = request.getfixturevalue(archivers) - chunk_size = 1000 # fixed chunker with this size, also volume based checkpointing after that volume + chunk_size = 1000 # fixed chunker with this size create_regular_file(archiver.input_path, os.path.join(archiver.input_path, "file1"), size=chunk_size * 2) create_regular_file(archiver.input_path, os.path.join(archiver.input_path, "file2"), size=chunk_size * 2) create_regular_file(archiver.input_path, os.path.join(archiver.input_path, "file3"), size=chunk_size * 2) @@ -550,7 +526,7 @@ def test_create_pattern_intermediate_folders_first(archivers, request): assert out_list.index("d x/b") < out_list.index("- x/b/foo_b") -@pytest.mark.skipif(get_cache_impl() in ("adhocwithfiles", "local"), reason="only works with AdHocCache") +@pytest.mark.skipif(get_cache_impl() != "adhoc", reason="only works with AdHocCache") def test_create_no_cache_sync_adhoc(archivers, request): # TODO: add test for AdHocWithFilesCache archiver = request.getfixturevalue(archivers) create_test_files(archiver.input_path) @@ -670,7 +646,7 @@ def test_create_dry_run(archivers, request): # Make sure no archive has been created with Repository(archiver.repository_path) as repository: manifest = Manifest.load(repository, Manifest.NO_OPERATION_CHECK) - assert len(manifest.archives) == 0 + assert manifest.archives.count() == 0 def test_progress_on(archivers, request): diff --git a/src/borg/testsuite/archiver/debug_cmds.py b/src/borg/testsuite/archiver/debug_cmds.py index 3923871a5f..d8b571f669 100644 --- a/src/borg/testsuite/archiver/debug_cmds.py +++ b/src/borg/testsuite/archiver/debug_cmds.py @@ -45,7 +45,7 @@ def test_debug_dump_repo_objs(archivers, request): with changedir("output"): output = cmd(archiver, "debug", "dump-repo-objs") output_dir = sorted(os.listdir("output")) - assert len(output_dir) > 0 and output_dir[0].startswith("00000000_") + assert len(output_dir) > 0 assert "Done." 
in output @@ -158,28 +158,6 @@ def test_debug_dump_archive(archivers, request): assert "_items" in result -def test_debug_refcount_obj(archivers, request): - archiver = request.getfixturevalue(archivers) - cmd(archiver, "rcreate", RK_ENCRYPTION) - output = cmd(archiver, "debug", "refcount-obj", "0" * 64).strip() - info = "object 0000000000000000000000000000000000000000000000000000000000000000 not found [info from chunks cache]." - assert output == info - - create_json = json.loads(cmd(archiver, "create", "--json", "test", "input")) - archive_id = create_json["archive"]["id"] - output = cmd(archiver, "debug", "refcount-obj", archive_id).strip() - # LocalCache does precise refcounting, so we'll get 1 reference for the archive. - # AdHocCache or AdHocWithFilesCache doesn't, we'll get ChunkIndex.MAX_VALUE as refcount. - assert ( - output == f"object {archive_id} has 1 referrers [info from chunks cache]." - or output == f"object {archive_id} has 4294966271 referrers [info from chunks cache]." - ) - - # Invalid IDs do not abort or return an error - output = cmd(archiver, "debug", "refcount-obj", "124", "xyza").strip() - assert output == f"object id 124 is invalid.{os.linesep}object id xyza is invalid." - - def test_debug_info(archivers, request): archiver = request.getfixturevalue(archivers) output = cmd(archiver, "debug", "info") diff --git a/src/borg/testsuite/archiver/delete_cmd.py b/src/borg/testsuite/archiver/delete_cmd.py index 25c35e9313..f29324b4b5 100644 --- a/src/borg/testsuite/archiver/delete_cmd.py +++ b/src/borg/testsuite/archiver/delete_cmd.py @@ -1,13 +1,10 @@ -from ...archive import Archive from ...constants import * # NOQA -from ...manifest import Manifest -from ...repository import Repository -from . import cmd, create_regular_file, src_file, create_src_archive, generate_archiver_tests, RK_ENCRYPTION +from . import cmd, create_regular_file, generate_archiver_tests, RK_ENCRYPTION pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary") # NOQA -def test_delete(archivers, request): +def test_delete_options(archivers, request): archiver = request.getfixturevalue(archivers) create_regular_file(archiver.input_path, "file1", size=1024 * 80) create_regular_file(archiver.input_path, "dir2/file2", size=1024 * 80) @@ -17,14 +14,11 @@ def test_delete(archivers, request): cmd(archiver, "create", "test.3", "input") cmd(archiver, "create", "another_test.1", "input") cmd(archiver, "create", "another_test.2", "input") - cmd(archiver, "extract", "test", "--dry-run") - cmd(archiver, "extract", "test.2", "--dry-run") cmd(archiver, "delete", "--match-archives", "sh:another_*") - cmd(archiver, "delete", "--last", "1") + cmd(archiver, "delete", "--last", "1") # test.3 cmd(archiver, "delete", "-a", "test") - cmd(archiver, "extract", "test.2", "--dry-run") - output = cmd(archiver, "delete", "-a", "test.2", "--stats") - assert "Original size: -" in output # negative size == deleted data + cmd(archiver, "extract", "test.2", "--dry-run") # still there? + cmd(archiver, "delete", "-a", "test.2") output = cmd(archiver, "rlist") assert output == "" # no archives left! 
@@ -35,47 +29,6 @@ def test_delete_multiple(archivers, request): cmd(archiver, "rcreate", RK_ENCRYPTION) cmd(archiver, "create", "test1", "input") cmd(archiver, "create", "test2", "input") - cmd(archiver, "create", "test3", "input") cmd(archiver, "delete", "-a", "test1") cmd(archiver, "delete", "-a", "test2") - cmd(archiver, "extract", "test3", "--dry-run") - cmd(archiver, "delete", "-a", "test3") assert not cmd(archiver, "rlist") - - -def test_delete_force(archivers, request): - archiver = request.getfixturevalue(archivers) - cmd(archiver, "rcreate", "--encryption=none") - create_src_archive(archiver, "test") - with Repository(archiver.repository_path, exclusive=True) as repository: - manifest = Manifest.load(repository, Manifest.NO_OPERATION_CHECK) - archive = Archive(manifest, "test") - for item in archive.iter_items(): - if item.path.endswith(src_file): - repository.delete(item.chunks[-1].id) - break - else: - assert False # missed the file - repository.commit(compact=False) - output = cmd(archiver, "delete", "-a", "test", "--force") - assert "deleted archive was corrupted" in output - - cmd(archiver, "check", "--repair") - output = cmd(archiver, "rlist") - assert "test" not in output - - -def test_delete_double_force(archivers, request): - archiver = request.getfixturevalue(archivers) - cmd(archiver, "rcreate", "--encryption=none") - create_src_archive(archiver, "test") - with Repository(archiver.repository_path, exclusive=True) as repository: - manifest = Manifest.load(repository, Manifest.NO_OPERATION_CHECK) - archive = Archive(manifest, "test") - id = archive.metadata.items[0] - repository.put(id, b"corrupted items metadata stream chunk") - repository.commit(compact=False) - cmd(archiver, "delete", "-a", "test", "--force", "--force") - cmd(archiver, "check", "--repair") - output = cmd(archiver, "rlist") - assert "test" not in output diff --git a/src/borg/testsuite/archiver/disk_full.py b/src/borg/testsuite/archiver/disk_full.py index 5f85a29318..0d36173810 100644 --- a/src/borg/testsuite/archiver/disk_full.py +++ b/src/borg/testsuite/archiver/disk_full.py @@ -14,6 +14,7 @@ if the directory does not exist, the test will be skipped. """ + import errno import os import random diff --git a/src/borg/testsuite/archiver/lock_cmds.py b/src/borg/testsuite/archiver/lock_cmds.py index 20cfbd7da2..9b7857f6b6 100644 --- a/src/borg/testsuite/archiver/lock_cmds.py +++ b/src/borg/testsuite/archiver/lock_cmds.py @@ -1,4 +1,6 @@ import os +import subprocess +import time from ...constants import * # NOQA from . import cmd, generate_archiver_tests, RK_ENCRYPTION @@ -13,12 +15,33 @@ def test_break_lock(archivers, request): cmd(archiver, "break-lock") -def test_with_lock(archivers, request): - archiver = request.getfixturevalue(archivers) - cmd(archiver, "rcreate", RK_ENCRYPTION) - lock_path = os.path.join(archiver.repository_path, "lock.exclusive") - command = "python3", "-c", 'import os, sys; sys.exit(42 if os.path.exists("%s") else 23)' % lock_path - cmd(archiver, "with-lock", *command, fork=True, exit_code=42) +def test_with_lock(tmp_path): + repo_path = tmp_path / "repo" + env = os.environ.copy() + env["BORG_REPO"] = "file://" + str(repo_path) + command0 = "python3", "-m", "borg", "rcreate", "--encryption=none" + # timings must be adjusted so that command1 keeps running while command2 tries to get the lock, + # so that lock acquisition for command2 fails as the test expects it. 
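+    # concretely, with the values chosen below (lock_wait=2, execution_time=4, startup_wait=1):
+    # command2 starts ~1s after command1 and gives up waiting for the lock ~2s later (around t=3s),
+    # while command1 keeps holding the lock until about t=4s, so command2 reliably hits the lock timeout.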
+ lock_wait, execution_time, startup_wait = 2, 4, 1 + assert lock_wait < execution_time - startup_wait + command1 = "python3", "-c", f'import time; print("first command - acquires the lock"); time.sleep({execution_time})' + command2 = "python3", "-c", 'print("second command - should never get executed")' + borgwl = "python3", "-m", "borg", "with-lock", f"--lock-wait={lock_wait}" + popen_options = dict(stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, env=env) + subprocess.run(command0, env=env, check=True, text=True, capture_output=True) + assert repo_path.exists() + with subprocess.Popen([*borgwl, *command1], **popen_options) as p1: + time.sleep(startup_wait) # wait until p1 is running + # now try to get another lock on the same repository: + with subprocess.Popen([*borgwl, *command2], **popen_options) as p2: + out, err_out = p2.communicate() + assert "second command" not in out # command2 is "locked out" + assert "Failed to create/acquire the lock" in err_out + assert p2.returncode == 73 # LockTimeout: could not acquire the lock, p1 already has it + out, err_out = p1.communicate() + assert "first command" in out # command1 was executed and had the lock + assert not err_out + assert p1.returncode == 0 def test_with_lock_non_existent_command(archivers, request): diff --git a/src/borg/testsuite/archiver/mount_cmds.py b/src/borg/testsuite/archiver/mount_cmds.py index 136eb145fd..1a8bb12ff0 100644 --- a/src/borg/testsuite/archiver/mount_cmds.py +++ b/src/borg/testsuite/archiver/mount_cmds.py @@ -7,7 +7,7 @@ from ... import xattr, platform from ...constants import * # NOQA -from ...locking import Lock +from ...storelocking import Lock from ...helpers import flags_noatime, flags_normal from .. import has_lchflags, llfuse from .. import changedir, no_selinux, same_ts_ns @@ -32,16 +32,18 @@ def test_fuse_mount_hardlinks(archivers, request): ignore_perms = ["-o", "ignore_permissions,defer_permissions"] else: ignore_perms = ["-o", "ignore_permissions"] - with fuse_mount(archiver, mountpoint, "-a", "test", "--strip-components=2", *ignore_perms), changedir( - os.path.join(mountpoint, "test") + with ( + fuse_mount(archiver, mountpoint, "-a", "test", "--strip-components=2", *ignore_perms), + changedir(os.path.join(mountpoint, "test")), ): assert os.stat("hardlink").st_nlink == 2 assert os.stat("subdir/hardlink").st_nlink == 2 assert open("subdir/hardlink", "rb").read() == b"123456" assert os.stat("aaaa").st_nlink == 2 assert os.stat("source2").st_nlink == 2 - with fuse_mount(archiver, mountpoint, "input/dir1", "-a", "test", *ignore_perms), changedir( - os.path.join(mountpoint, "test") + with ( + fuse_mount(archiver, mountpoint, "input/dir1", "-a", "test", *ignore_perms), + changedir(os.path.join(mountpoint, "test")), ): assert os.stat("input/dir1/hardlink").st_nlink == 2 assert os.stat("input/dir1/subdir/hardlink").st_nlink == 2 @@ -213,7 +215,6 @@ def test_fuse_allow_damaged_files(archivers, request): break else: assert False # missed the file - repository.commit(compact=False) cmd(archiver, "check", "--repair", exit_code=0) mountpoint = os.path.join(archiver.tmpdir, "mountpoint") diff --git a/src/borg/testsuite/archiver/prune_cmd.py b/src/borg/testsuite/archiver/prune_cmd.py index e22a03d57a..a61eec8e3d 100644 --- a/src/borg/testsuite/archiver/prune_cmd.py +++ b/src/borg/testsuite/archiver/prune_cmd.py @@ -23,39 +23,18 @@ def test_prune_repository(archivers, request): cmd(archiver, "rcreate", RK_ENCRYPTION) cmd(archiver, "create", "test1", src_dir) cmd(archiver, "create", "test2", src_dir) - 
# these are not really a checkpoints, but they look like some: - cmd(archiver, "create", "test3.checkpoint", src_dir) - cmd(archiver, "create", "test3.checkpoint.1", src_dir) - cmd(archiver, "create", "test4.checkpoint", src_dir) output = cmd(archiver, "prune", "--list", "--dry-run", "--keep-daily=1") assert re.search(r"Would prune:\s+test1", output) - # must keep the latest non-checkpoint archive: + # must keep the latest archive: assert re.search(r"Keeping archive \(rule: daily #1\):\s+test2", output) - # must keep the latest checkpoint archive: - assert re.search(r"Keeping checkpoint archive:\s+test4.checkpoint", output) - output = cmd(archiver, "rlist", "--consider-checkpoints") + output = cmd(archiver, "rlist") assert "test1" in output assert "test2" in output - assert "test3.checkpoint" in output - assert "test3.checkpoint.1" in output - assert "test4.checkpoint" in output cmd(archiver, "prune", "--keep-daily=1") - output = cmd(archiver, "rlist", "--consider-checkpoints") + output = cmd(archiver, "rlist") assert "test1" not in output - # the latest non-checkpoint archive must be still there: + # the latest archive must be still there: assert "test2" in output - # only the latest checkpoint archive must still be there: - assert "test3.checkpoint" not in output - assert "test3.checkpoint.1" not in output - assert "test4.checkpoint" in output - # now we supersede the latest checkpoint by a successful backup: - cmd(archiver, "create", "test5", src_dir) - cmd(archiver, "prune", "--keep-daily=2") - output = cmd(archiver, "rlist", "--consider-checkpoints") - # all checkpoints should be gone now: - assert "checkpoint" not in output - # the latest archive must be still there - assert "test5" in output # This test must match docs/misc/prune-example.txt diff --git a/src/borg/testsuite/archiver/rcompress_cmd.py b/src/borg/testsuite/archiver/rcompress_cmd.py index 4635f05acb..22c634a3f5 100644 --- a/src/borg/testsuite/archiver/rcompress_cmd.py +++ b/src/borg/testsuite/archiver/rcompress_cmd.py @@ -15,12 +15,13 @@ def check_compression(ctype, clevel, olevel): repository = Repository(archiver.repository_path, exclusive=True) with repository: manifest = Manifest.load(repository, Manifest.NO_OPERATION_CHECK) - state = None + marker = None while True: - ids, state = repository.scan(limit=LIST_SCAN_LIMIT, state=state) - if not ids: + result = repository.list(limit=LIST_SCAN_LIMIT, marker=marker) + if not result: break - for id in ids: + marker = result[-1][0] + for id, _ in result: chunk = repository.get(id, read_data=True) meta, data = manifest.repo_objs.parse( id, chunk, ro_type=ROBJ_DONTCARE diff --git a/src/borg/testsuite/archiver/rcreate_cmd.py b/src/borg/testsuite/archiver/rcreate_cmd.py index b027ca1a85..0569d747d7 100644 --- a/src/borg/testsuite/archiver/rcreate_cmd.py +++ b/src/borg/testsuite/archiver/rcreate_cmd.py @@ -6,28 +6,11 @@ from ...helpers.errors import Error, CancelledByUser from ...constants import * # NOQA from ...crypto.key import FlexiKey -from ...repository import Repository from . 
import cmd, generate_archiver_tests, RK_ENCRYPTION, KF_ENCRYPTION pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary") # NOQA -def test_rcreate_parent_dirs(archivers, request): - archiver = request.getfixturevalue(archivers) - if archiver.EXE: - pytest.skip("does not raise Exception, but sets rc==2") - remote_repo = archiver.get_kind() == "remote" - parent_path = os.path.join(archiver.tmpdir, "parent1", "parent2") - repository_path = os.path.join(parent_path, "repository") - archiver.repository_location = ("ssh://__testsuite__" + repository_path) if remote_repo else repository_path - with pytest.raises(Repository.ParentPathDoesNotExist): - # normal borg rcreate does NOT create missing parent dirs - cmd(archiver, "rcreate", "--encryption=none") - # but if told so, it does: - cmd(archiver, "rcreate", "--encryption=none", "--make-parent-dirs") - assert os.path.exists(parent_path) - - def test_rcreate_interrupt(archivers, request): archiver = request.getfixturevalue(archivers) if archiver.EXE: @@ -51,18 +34,6 @@ def test_rcreate_requires_encryption_option(archivers, request): cmd(archiver, "rcreate", exit_code=2) -def test_rcreate_nested_repositories(archivers, request): - archiver = request.getfixturevalue(archivers) - cmd(archiver, "rcreate", RK_ENCRYPTION) - archiver.repository_location += "/nested" - if archiver.FORK_DEFAULT: - expected_ec = Repository.AlreadyExists().exit_code - cmd(archiver, "rcreate", RK_ENCRYPTION, exit_code=expected_ec) - else: - with pytest.raises(Repository.AlreadyExists): - cmd(archiver, "rcreate", RK_ENCRYPTION) - - def test_rcreate_refuse_to_overwrite_keyfile(archivers, request, monkeypatch): # BORG_KEY_FILE=something borg rcreate should quit if "something" already exists. 
# See: https://github.com/borgbackup/borg/pull/6046 diff --git a/src/borg/testsuite/archiver/recreate_cmd.py b/src/borg/testsuite/archiver/recreate_cmd.py index b21a73fc8e..d9c119961e 100644 --- a/src/borg/testsuite/archiver/recreate_cmd.py +++ b/src/borg/testsuite/archiver/recreate_cmd.py @@ -16,7 +16,6 @@ _assert_test_keep_tagged, _extract_hardlinks_setup, generate_archiver_tests, - check_cache, cmd, create_regular_file, create_test_files, @@ -96,12 +95,12 @@ def test_recreate_target(archivers, request): archiver = request.getfixturevalue(archivers) create_test_files(archiver.input_path) cmd(archiver, "rcreate", RK_ENCRYPTION) - check_cache(archiver) + cmd(archiver, "check") cmd(archiver, "create", "test0", "input") - check_cache(archiver) + cmd(archiver, "check") original_archive = cmd(archiver, "rlist") cmd(archiver, "recreate", "test0", "input/dir2", "-e", "input/dir2/file3", "--target=new-archive") - check_cache(archiver) + cmd(archiver, "check") archives = cmd(archiver, "rlist") assert original_archive in archives @@ -120,7 +119,7 @@ def test_recreate_basic(archivers, request): cmd(archiver, "rcreate", RK_ENCRYPTION) cmd(archiver, "create", "test0", "input") cmd(archiver, "recreate", "test0", "input/dir2", "-e", "input/dir2/file3") - check_cache(archiver) + cmd(archiver, "check") listing = cmd(archiver, "list", "test0", "--short") assert "file1" not in listing assert "dir2/file2" in listing @@ -134,7 +133,7 @@ def test_recreate_subtree_hardlinks(archivers, request): _extract_hardlinks_setup(archiver) cmd(archiver, "create", "test2", "input") cmd(archiver, "recreate", "-a", "test", "input/dir1") - check_cache(archiver) + cmd(archiver, "check") with changedir("output"): cmd(archiver, "extract", "test") assert os.stat("input/dir1/hardlink").st_nlink == 2 @@ -159,7 +158,7 @@ def test_recreate_rechunkify(archivers, request): # right now, the file is chunked differently assert num_chunks1 != num_chunks2 cmd(archiver, "recreate", "--chunker-params", "default") - check_cache(archiver) + cmd(archiver, "check") num_chunks1 = int(cmd(archiver, "list", "test1", "input/large_file", "--format", "{num_chunks}")) num_chunks2 = int(cmd(archiver, "list", "test2", "input/large_file", "--format", "{num_chunks}")) # now the files are chunked in the same way @@ -220,7 +219,7 @@ def test_recreate_dry_run(archivers, request): cmd(archiver, "create", "test", "input") archives_before = cmd(archiver, "list", "test") cmd(archiver, "recreate", "-n", "-e", "input/compressible") - check_cache(archiver) + cmd(archiver, "check") archives_after = cmd(archiver, "list", "test") assert archives_after == archives_before @@ -232,7 +231,7 @@ def test_recreate_skips_nothing_to_do(archivers, request): cmd(archiver, "create", "test", "input") info_before = cmd(archiver, "info", "-a", "test") cmd(archiver, "recreate", "--chunker-params", "default") - check_cache(archiver) + cmd(archiver, "check") info_after = cmd(archiver, "info", "-a", "test") assert info_before == info_after # includes archive ID @@ -248,22 +247,22 @@ def test_recreate_list_output(archivers, request): cmd(archiver, "create", "test", "input") output = cmd(archiver, "recreate", "-a", "test", "--list", "--info", "-e", "input/file2") - check_cache(archiver) + cmd(archiver, "check") assert "input/file1" in output assert "- input/file2" in output output = cmd(archiver, "recreate", "-a", "test", "--list", "-e", "input/file3") - check_cache(archiver) + cmd(archiver, "check") assert "input/file1" in output assert "- input/file3" in output output = cmd(archiver, 
"recreate", "-a", "test", "-e", "input/file4") - check_cache(archiver) + cmd(archiver, "check") assert "input/file1" not in output assert "- input/file4" not in output output = cmd(archiver, "recreate", "-a", "test", "--info", "-e", "input/file5") - check_cache(archiver) + cmd(archiver, "check") assert "input/file1" not in output assert "- input/file5" not in output diff --git a/src/borg/testsuite/archiver/rename_cmd.py b/src/borg/testsuite/archiver/rename_cmd.py index 5a1b65c0aa..40ede1a60f 100644 --- a/src/borg/testsuite/archiver/rename_cmd.py +++ b/src/borg/testsuite/archiver/rename_cmd.py @@ -23,6 +23,6 @@ def test_rename(archivers, request): # Make sure both archives have been renamed with Repository(archiver.repository_path) as repository: manifest = Manifest.load(repository, Manifest.NO_OPERATION_CHECK) - assert len(manifest.archives) == 2 - assert "test.3" in manifest.archives - assert "test.4" in manifest.archives + assert manifest.archives.count() == 2 + assert manifest.archives.exists("test.3") + assert manifest.archives.exists("test.4") diff --git a/src/borg/testsuite/archiver/return_codes.py b/src/borg/testsuite/archiver/return_codes.py index 9c23f7995a..3825904a44 100644 --- a/src/borg/testsuite/archiver/return_codes.py +++ b/src/borg/testsuite/archiver/return_codes.py @@ -5,7 +5,7 @@ def test_return_codes(cmd_fixture, tmpdir): - repo = tmpdir.mkdir("repo") + repo = tmpdir / "repo" # borg creates the directory input = tmpdir.mkdir("input") output = tmpdir.mkdir("output") input.join("test_file").write("content") diff --git a/src/borg/testsuite/archiver/rinfo_cmd.py b/src/borg/testsuite/archiver/rinfo_cmd.py index bf2b14c527..d08606a22b 100644 --- a/src/borg/testsuite/archiver/rinfo_cmd.py +++ b/src/borg/testsuite/archiver/rinfo_cmd.py @@ -1,5 +1,4 @@ import json -from random import randbytes from ...constants import * # NOQA from . 
import checkts, cmd, create_regular_file, generate_archiver_tests, RK_ENCRYPTION @@ -13,7 +12,7 @@ def test_info(archivers, request): cmd(archiver, "rcreate", RK_ENCRYPTION) cmd(archiver, "create", "test", "input") info_repo = cmd(archiver, "rinfo") - assert "Original size:" in info_repo + assert "Repository ID:" in info_repo def test_info_json(archivers, request): @@ -30,26 +29,3 @@ def test_info_json(archivers, request): checkts(repository["last_modified"]) assert info_repo["encryption"]["mode"] == RK_ENCRYPTION[13:] assert "keyfile" not in info_repo["encryption"] - - cache = info_repo["cache"] - stats = cache["stats"] - assert all(isinstance(o, int) for o in stats.values()) - assert all(key in stats for key in ("total_chunks", "total_size", "total_unique_chunks", "unique_size")) - - -def test_info_on_repository_with_storage_quota(archivers, request): - archiver = request.getfixturevalue(archivers) - create_regular_file(archiver.input_path, "file1", contents=randbytes(1000 * 1000)) - cmd(archiver, "rcreate", RK_ENCRYPTION, "--storage-quota=1G") - cmd(archiver, "create", "test", "input") - info_repo = cmd(archiver, "rinfo") - assert "Storage quota: 1.00 MB used out of 1.00 GB" in info_repo - - -def test_info_on_repository_without_storage_quota(archivers, request): - archiver = request.getfixturevalue(archivers) - create_regular_file(archiver.input_path, "file1", contents=randbytes(1000 * 1000)) - cmd(archiver, "rcreate", RK_ENCRYPTION) - cmd(archiver, "create", "test", "input") - info_repo = cmd(archiver, "rinfo") - assert "Storage quota: 1.00 MB used" in info_repo diff --git a/src/borg/testsuite/archiver/rlist_cmd.py b/src/borg/testsuite/archiver/rlist_cmd.py index 5c5c48ec52..a86b7d4d27 100644 --- a/src/borg/testsuite/archiver/rlist_cmd.py +++ b/src/borg/testsuite/archiver/rlist_cmd.py @@ -80,26 +80,6 @@ def test_date_matching(archivers, request): assert "archive3" not in output -def test_rlist_consider_checkpoints(archivers, request): - archiver = request.getfixturevalue(archivers) - - cmd(archiver, "rcreate", RK_ENCRYPTION) - cmd(archiver, "create", "test1", src_dir) - # these are not really a checkpoints, but they look like some: - cmd(archiver, "create", "test2.checkpoint", src_dir) - cmd(archiver, "create", "test3.checkpoint.1", src_dir) - - output = cmd(archiver, "rlist") - assert "test1" in output - assert "test2.checkpoint" not in output - assert "test3.checkpoint.1" not in output - - output = cmd(archiver, "rlist", "--consider-checkpoints") - assert "test1" in output - assert "test2.checkpoint" in output - assert "test3.checkpoint.1" in output - - def test_rlist_json(archivers, request): archiver = request.getfixturevalue(archivers) create_regular_file(archiver.input_path, "file1", size=1024 * 80) diff --git a/src/borg/testsuite/archiver/transfer_cmd.py b/src/borg/testsuite/archiver/transfer_cmd.py index 52d9025c02..75fb551e84 100644 --- a/src/borg/testsuite/archiver/transfer_cmd.py +++ b/src/borg/testsuite/archiver/transfer_cmd.py @@ -75,8 +75,8 @@ def convert_tz(local_naive, tzoffset, tzinfo): assert os.environ.get("BORG_PASSPHRASE") == "waytooeasyonlyfortests" os.environ["BORG_TESTONLY_WEAKEN_KDF"] = "0" # must use the strong kdf here or it can't decrypt the key - cmd(archiver, "rcreate", RK_ENCRYPTION, other_repo1) - cmd(archiver, "transfer", other_repo1, "--upgrader=From12To20") + cmd(archiver, "rcreate", RK_ENCRYPTION, other_repo1, "--from-borg1") + cmd(archiver, "transfer", other_repo1, "--from-borg1") cmd(archiver, "check") # check list of archives / manifest diff --git 
a/src/borg/testsuite/cache.py b/src/borg/testsuite/cache.py index 60cb870e3d..f1e6e558af 100644 --- a/src/borg/testsuite/cache.py +++ b/src/borg/testsuite/cache.py @@ -1,8 +1,5 @@ -import io import os.path -from ..helpers.msgpack import packb - import pytest from .hashindex import H @@ -10,163 +7,16 @@ from ..archive import Statistics from ..cache import AdHocCache from ..crypto.key import AESOCBRepoKey -from ..hashindex import ChunkIndex, CacheSynchronizer from ..manifest import Manifest from ..repository import Repository -class TestCacheSynchronizer: - @pytest.fixture - def index(self): - return ChunkIndex() - - @pytest.fixture - def sync(self, index): - return CacheSynchronizer(index) - - def test_no_chunks(self, index, sync): - data = packb({"foo": "bar", "baz": 1234, "bar": 5678, "user": "chunks", "chunks": []}) - sync.feed(data) - assert not len(index) - - def test_simple(self, index, sync): - data = packb({"foo": "bar", "baz": 1234, "bar": 5678, "user": "chunks", "chunks": [(H(1), 1), (H(2), 2)]}) - sync.feed(data) - assert len(index) == 2 - assert index[H(1)] == (1, 1) - assert index[H(2)] == (1, 2) - - def test_multiple(self, index, sync): - data = packb({"foo": "bar", "baz": 1234, "bar": 5678, "user": "chunks", "chunks": [(H(1), 1), (H(2), 2)]}) - data += packb({"xattrs": {"security.foo": "bar", "chunks": "123456"}, "stuff": [(1, 2, 3)]}) - data += packb( - { - "xattrs": {"security.foo": "bar", "chunks": "123456"}, - "chunks": [(H(1), 1), (H(2), 2)], - "stuff": [(1, 2, 3)], - } - ) - data += packb({"chunks": [(H(3), 1)]}) - data += packb({"chunks": [(H(1), 1)]}) - - part1 = data[:70] - part2 = data[70:120] - part3 = data[120:] - sync.feed(part1) - sync.feed(part2) - sync.feed(part3) - assert len(index) == 3 - assert index[H(1)] == (3, 1) - assert index[H(2)] == (2, 2) - assert index[H(3)] == (1, 1) - - @pytest.mark.parametrize( - "elem,error", - ( - ({1: 2}, "Unexpected object: map"), - ( - bytes(213), - ["Unexpected bytes in chunks structure", "Incorrect key length"], # structure 2/3 - ), # structure 3/3 - (1, "Unexpected object: integer"), - (1.0, "Unexpected object: double"), - (True, "Unexpected object: true"), - (False, "Unexpected object: false"), - (None, "Unexpected object: nil"), - ), - ids=["map", "bytes", "int", "double", "true", "false", "none"], - ) - @pytest.mark.parametrize( - "structure", - (lambda elem: {"chunks": elem}, lambda elem: {"chunks": [elem]}, lambda elem: {"chunks": [(elem, 1)]}), - ) - def test_corrupted(self, sync, structure, elem, error): - packed = packb(structure(elem)) - with pytest.raises(ValueError) as excinfo: - sync.feed(packed) - if isinstance(error, str): - error = [error] - possible_errors = ["cache_sync_feed failed: " + error for error in error] - assert str(excinfo.value) in possible_errors - - @pytest.mark.parametrize( - "data,error", - ( - # Incorrect tuple length - ({"chunks": [(bytes(32), 2, 3, 4)]}, "Invalid chunk list entry length"), - ({"chunks": [(bytes(32),)]}, "Invalid chunk list entry length"), - # Incorrect types - ({"chunks": [(1, 2)]}, "Unexpected object: integer"), - ({"chunks": [(1, bytes(32))]}, "Unexpected object: integer"), - ({"chunks": [(bytes(32), 1.0)]}, "Unexpected object: double"), - ), - ) - def test_corrupted_ancillary(self, index, sync, data, error): - packed = packb(data) - with pytest.raises(ValueError) as excinfo: - sync.feed(packed) - assert str(excinfo.value) == "cache_sync_feed failed: " + error - - def make_index_with_refcount(self, refcount): - index_data = io.BytesIO() - index_data.write(b"BORG2IDX") 
- # version - index_data.write((2).to_bytes(4, "little")) - # num_entries - index_data.write((1).to_bytes(4, "little")) - # num_buckets - index_data.write((1).to_bytes(4, "little")) - # num_empty - index_data.write((0).to_bytes(4, "little")) - # key_size - index_data.write((32).to_bytes(4, "little")) - # value_size - index_data.write((3 * 4).to_bytes(4, "little")) - # reserved - index_data.write(bytes(1024 - 32)) - - index_data.write(H(0)) - index_data.write(refcount.to_bytes(4, "little")) - index_data.write((1234).to_bytes(4, "little")) - index_data.write((5678).to_bytes(4, "little")) - - index_data.seek(0) - index = ChunkIndex.read(index_data) - return index - - def test_corrupted_refcount(self): - index = self.make_index_with_refcount(ChunkIndex.MAX_VALUE + 1) - sync = CacheSynchronizer(index) - data = packb({"chunks": [(H(0), 1)]}) - with pytest.raises(ValueError) as excinfo: - sync.feed(data) - assert str(excinfo.value) == "cache_sync_feed failed: invalid reference count" - - def test_refcount_max_value(self): - index = self.make_index_with_refcount(ChunkIndex.MAX_VALUE) - sync = CacheSynchronizer(index) - data = packb({"chunks": [(H(0), 1)]}) - sync.feed(data) - assert index[H(0)] == (ChunkIndex.MAX_VALUE, 1234) - - def test_refcount_one_below_max_value(self): - index = self.make_index_with_refcount(ChunkIndex.MAX_VALUE - 1) - sync = CacheSynchronizer(index) - data = packb({"chunks": [(H(0), 1)]}) - sync.feed(data) - # Incremented to maximum - assert index[H(0)] == (ChunkIndex.MAX_VALUE, 1234) - sync.feed(data) - assert index[H(0)] == (ChunkIndex.MAX_VALUE, 1234) - - class TestAdHocCache: @pytest.fixture def repository(self, tmpdir): self.repository_location = os.path.join(str(tmpdir), "repository") with Repository(self.repository_location, exclusive=True, create=True) as repository: repository.put(H(1), b"1234") - repository.put(Manifest.MANIFEST_ID, b"5678") yield repository @pytest.fixture @@ -187,42 +37,18 @@ def cache(self, repository, key, manifest): def test_does_not_contain_manifest(self, cache): assert not cache.seen_chunk(Manifest.MANIFEST_ID) - def test_does_not_delete_existing_chunks(self, repository, cache): - assert cache.seen_chunk(H(1)) == ChunkIndex.MAX_VALUE - cache.chunk_decref(H(1), 1, Statistics()) - assert repository.get(H(1)) == b"1234" - def test_seen_chunk_add_chunk_size(self, cache): assert cache.add_chunk(H(1), {}, b"5678", stats=Statistics()) == (H(1), 4) - def test_deletes_chunks_during_lifetime(self, cache, repository): - """E.g. 
checkpoint archives""" - cache.add_chunk(H(5), {}, b"1010", stats=Statistics()) - assert cache.seen_chunk(H(5)) == 1 - cache.chunk_decref(H(5), 1, Statistics()) - assert not cache.seen_chunk(H(5)) - with pytest.raises(Repository.ObjectNotFound): - repository.get(H(5)) - def test_files_cache(self, cache): assert cache.file_known_and_unchanged(b"foo", bytes(32), None) == (False, None) assert cache.cache_mode == "d" assert cache.files is None - def test_txn(self, cache): - assert not cache._txn_active - cache.seen_chunk(H(5)) - assert cache._txn_active - assert cache.chunks - cache.rollback() - assert not cache._txn_active - assert not hasattr(cache, "chunks") - - def test_incref_after_add_chunk(self, cache): + def test_reuse_after_add_chunk(self, cache): assert cache.add_chunk(H(3), {}, b"5678", stats=Statistics()) == (H(3), 4) - assert cache.chunk_incref(H(3), 4, Statistics()) == (H(3), 4) + assert cache.reuse_chunk(H(3), 4, Statistics()) == (H(3), 4) - def test_existing_incref_after_add_chunk(self, cache): - """This case occurs with part files, see Archive.chunk_file.""" + def test_existing_reuse_after_add_chunk(self, cache): assert cache.add_chunk(H(1), {}, b"5678", stats=Statistics()) == (H(1), 4) - assert cache.chunk_incref(H(1), 4, Statistics()) == (H(1), 4) + assert cache.reuse_chunk(H(1), 4, Statistics()) == (H(1), 4) diff --git a/src/borg/testsuite/conftest.py b/src/borg/testsuite/conftest.py index 4708d9170c..42bc4f0edf 100644 --- a/src/borg/testsuite/conftest.py +++ b/src/borg/testsuite/conftest.py @@ -9,8 +9,7 @@ pytest.register_assert_rewrite("borg.testsuite") -import borg.cache # noqa: E402 -from borg.archiver import Archiver +from borg.archiver import Archiver # noqa: E402 from borg.logger import setup_logging # noqa: E402 # Ensure that the loggers exist for all tests @@ -56,28 +55,6 @@ def pytest_report_header(config, start_path): return output -class DefaultPatches: - def __init__(self, request): - self.org_cache_wipe_cache = borg.cache.LocalCache.wipe_cache - - def wipe_should_not_be_called(*a, **kw): - raise AssertionError( - "Cache wipe was triggered, if this is part of the test add " "@pytest.mark.allow_cache_wipe" - ) - - if "allow_cache_wipe" not in request.keywords: - borg.cache.LocalCache.wipe_cache = wipe_should_not_be_called - request.addfinalizer(self.undo) - - def undo(self): - borg.cache.LocalCache.wipe_cache = self.org_cache_wipe_cache - - -@pytest.fixture(autouse=True) -def default_patches(request): - return DefaultPatches(request) - - @pytest.fixture() def set_env_variables(): os.environ["BORG_CHECK_I_KNOW_WHAT_I_AM_DOING"] = "YES" diff --git a/src/borg/testsuite/locking.py b/src/borg/testsuite/fslocking.py similarity index 99% rename from src/borg/testsuite/locking.py rename to src/borg/testsuite/fslocking.py index 131fdef3ab..d091626c80 100644 --- a/src/borg/testsuite/locking.py +++ b/src/borg/testsuite/fslocking.py @@ -6,7 +6,7 @@ import pytest from ..platform import get_process_id, process_alive -from ..locking import ( +from ..fslocking import ( TimeoutTimer, ExclusiveLock, Lock, diff --git a/src/borg/testsuite/hashindex.py b/src/borg/testsuite/hashindex.py index 54e56e5f3a..6bf3a2ab15 100644 --- a/src/borg/testsuite/hashindex.py +++ b/src/borg/testsuite/hashindex.py @@ -86,7 +86,7 @@ def _generic_test(self, cls, make_value, sha): def test_nsindex(self): self._generic_test( - NSIndex, lambda x: (x, x, x), "0d7880dbe02b64f03c471e60e193a1333879b4f23105768b10c9222accfeac5e" + NSIndex, lambda x: (x, x, x), 
"640b909cf07884cc11fdf5431ffc27dee399770ceadecce31dffecd130a311a3" ) def test_chunkindex(self): @@ -102,7 +102,7 @@ def test_resize(self): initial_size = os.path.getsize(filepath) self.assert_equal(len(idx), 0) for x in range(n): - idx[H(x)] = x, x, x, x + idx[H(x)] = x, x, x idx.write(filepath) assert initial_size < os.path.getsize(filepath) for x in range(n): @@ -114,7 +114,7 @@ def test_resize(self): def test_iteritems(self): idx = NSIndex() for x in range(100): - idx[H(x)] = x, x, x, x + idx[H(x)] = x, x, x iterator = idx.iteritems() all = list(iterator) self.assert_equal(len(all), 100) @@ -124,99 +124,6 @@ def test_iteritems(self): self.assert_equal(len(second_half), 50) self.assert_equal(second_half, all[50:]) - def test_chunkindex_merge(self): - idx1 = ChunkIndex() - idx1[H(1)] = 1, 100 - idx1[H(2)] = 2, 200 - idx1[H(3)] = 3, 300 - # no H(4) entry - idx2 = ChunkIndex() - idx2[H(1)] = 4, 100 - idx2[H(2)] = 5, 200 - # no H(3) entry - idx2[H(4)] = 6, 400 - idx1.merge(idx2) - assert idx1[H(1)] == (5, 100) - assert idx1[H(2)] == (7, 200) - assert idx1[H(3)] == (3, 300) - assert idx1[H(4)] == (6, 400) - - def test_chunkindex_summarize(self): - idx = ChunkIndex() - idx[H(1)] = 1, 1000 - idx[H(2)] = 2, 2000 - idx[H(3)] = 3, 3000 - - size, unique_size, unique_chunks, chunks = idx.summarize() - assert size == 1000 + 2 * 2000 + 3 * 3000 - assert unique_size == 1000 + 2000 + 3000 - assert chunks == 1 + 2 + 3 - assert unique_chunks == 3 - - def test_flags(self): - idx = NSIndex() - key = H(0) - self.assert_raises(KeyError, idx.flags, key, 0) - idx[key] = 0, 0, 0 # create entry - # check bit 0 and 1, should be both 0 after entry creation - self.assert_equal(idx.flags(key, mask=3), 0) - # set bit 0 - idx.flags(key, mask=1, value=1) - self.assert_equal(idx.flags(key, mask=1), 1) - # set bit 1 - idx.flags(key, mask=2, value=2) - self.assert_equal(idx.flags(key, mask=2), 2) - # check both bit 0 and 1, both should be set - self.assert_equal(idx.flags(key, mask=3), 3) - # clear bit 1 - idx.flags(key, mask=2, value=0) - self.assert_equal(idx.flags(key, mask=2), 0) - # clear bit 0 - idx.flags(key, mask=1, value=0) - self.assert_equal(idx.flags(key, mask=1), 0) - # check both bit 0 and 1, both should be cleared - self.assert_equal(idx.flags(key, mask=3), 0) - - def test_flags_iteritems(self): - idx = NSIndex() - keys_flagged0 = {H(i) for i in (1, 2, 3, 42)} - keys_flagged1 = {H(i) for i in (11, 12, 13, 142)} - keys_flagged2 = {H(i) for i in (21, 22, 23, 242)} - keys_flagged3 = {H(i) for i in (31, 32, 33, 342)} - for key in keys_flagged0: - idx[key] = 0, 0, 0 # create entry - idx.flags(key, mask=3, value=0) # not really necessary, unflagged is default - for key in keys_flagged1: - idx[key] = 0, 0, 0 # create entry - idx.flags(key, mask=3, value=1) - for key in keys_flagged2: - idx[key] = 0, 0, 0 # create entry - idx.flags(key, mask=3, value=2) - for key in keys_flagged3: - idx[key] = 0, 0, 0 # create entry - idx.flags(key, mask=3, value=3) - # check if we can iterate over all items - k_all = {k for k, v in idx.iteritems()} - self.assert_equal(k_all, keys_flagged0 | keys_flagged1 | keys_flagged2 | keys_flagged3) - # check if we can iterate over the flagged0 items - k0 = {k for k, v in idx.iteritems(mask=3, value=0)} - self.assert_equal(k0, keys_flagged0) - # check if we can iterate over the flagged1 items - k1 = {k for k, v in idx.iteritems(mask=3, value=1)} - self.assert_equal(k1, keys_flagged1) - # check if we can iterate over the flagged2 items - k1 = {k for k, v in idx.iteritems(mask=3, value=2)} - 
self.assert_equal(k1, keys_flagged2) - # check if we can iterate over the flagged3 items - k1 = {k for k, v in idx.iteritems(mask=3, value=3)} - self.assert_equal(k1, keys_flagged3) - # check if we can iterate over the flagged1 + flagged3 items - k1 = {k for k, v in idx.iteritems(mask=1, value=1)} - self.assert_equal(k1, keys_flagged1 | keys_flagged3) - # check if we can iterate over the flagged0 + flagged2 items - k1 = {k for k, v in idx.iteritems(mask=1, value=0)} - self.assert_equal(k1, keys_flagged0 | keys_flagged2) - class HashIndexExtraTestCase(BaseTestCase): """These tests are separate because they should not become part of the selftest.""" @@ -265,55 +172,6 @@ def test_size_on_disk_accurate(self): class HashIndexRefcountingTestCase(BaseTestCase): - def test_chunkindex_limit(self): - idx = ChunkIndex() - idx[H(1)] = ChunkIndex.MAX_VALUE - 1, 1 - - # 5 is arbitrary, any number of incref/decrefs shouldn't move it once it's limited - for i in range(5): - # first incref to move it to the limit - refcount, *_ = idx.incref(H(1)) - assert refcount == ChunkIndex.MAX_VALUE - for i in range(5): - refcount, *_ = idx.decref(H(1)) - assert refcount == ChunkIndex.MAX_VALUE - - def _merge(self, refcounta, refcountb): - def merge(refcount1, refcount2): - idx1 = ChunkIndex() - idx1[H(1)] = refcount1, 1 - idx2 = ChunkIndex() - idx2[H(1)] = refcount2, 1 - idx1.merge(idx2) - refcount, *_ = idx1[H(1)] - return refcount - - result = merge(refcounta, refcountb) - # check for commutativity - assert result == merge(refcountb, refcounta) - return result - - def test_chunkindex_merge_limit1(self): - # Check that it does *not* limit at MAX_VALUE - 1 - # (MAX_VALUE is odd) - half = ChunkIndex.MAX_VALUE // 2 - assert self._merge(half, half) == ChunkIndex.MAX_VALUE - 1 - - def test_chunkindex_merge_limit2(self): - # 3000000000 + 2000000000 > MAX_VALUE - assert self._merge(3000000000, 2000000000) == ChunkIndex.MAX_VALUE - - def test_chunkindex_merge_limit3(self): - # Crossover point: both addition and limit semantics will yield the same result - half = ChunkIndex.MAX_VALUE // 2 - assert self._merge(half + 1, half) == ChunkIndex.MAX_VALUE - - def test_chunkindex_merge_limit4(self): - # Beyond crossover, result of addition would be 2**31 - half = ChunkIndex.MAX_VALUE // 2 - assert self._merge(half + 2, half) == ChunkIndex.MAX_VALUE - assert self._merge(half + 1, half + 1) == ChunkIndex.MAX_VALUE - def test_chunkindex_add(self): idx1 = ChunkIndex() idx1.add(H(1), 5, 6) @@ -321,35 +179,6 @@ def test_chunkindex_add(self): idx1.add(H(1), 1, 2) assert idx1[H(1)] == (6, 2) - def test_incref_limit(self): - idx1 = ChunkIndex() - idx1[H(1)] = ChunkIndex.MAX_VALUE, 6 - idx1.incref(H(1)) - refcount, *_ = idx1[H(1)] - assert refcount == ChunkIndex.MAX_VALUE - - def test_decref_limit(self): - idx1 = ChunkIndex() - idx1[H(1)] = ChunkIndex.MAX_VALUE, 6 - idx1.decref(H(1)) - refcount, *_ = idx1[H(1)] - assert refcount == ChunkIndex.MAX_VALUE - - def test_decref_zero(self): - idx1 = ChunkIndex() - idx1[H(1)] = 0, 0 - with self.assert_raises(AssertionError): - idx1.decref(H(1)) - - def test_incref_decref(self): - idx1 = ChunkIndex() - idx1.add(H(1), 5, 6) - assert idx1[H(1)] == (5, 6) - idx1.incref(H(1)) - assert idx1[H(1)] == (6, 6) - idx1.decref(H(1)) - assert idx1[H(1)] == (5, 6) - def test_setitem_raises(self): idx1 = ChunkIndex() with self.assert_raises(AssertionError): @@ -357,10 +186,6 @@ def test_setitem_raises(self): def test_keyerror(self): idx = ChunkIndex() - with self.assert_raises(KeyError): - idx.incref(H(1)) - with 
self.assert_raises(KeyError): - idx.decref(H(1)) with self.assert_raises(KeyError): idx[H(1)] with self.assert_raises(OverflowError): @@ -405,17 +230,6 @@ def test_identical_creation(self): serialized = self._serialize_hashindex(idx1) assert self._unpack(serialized) == self._unpack(self.HASHINDEX) - def test_read_known_good(self): - idx1 = self._deserialize_hashindex(self.HASHINDEX) - assert idx1[H(1)] == (1, 2) - assert idx1[H(2)] == (2**31 - 1, 0) - assert idx1[H(3)] == (4294962296, 0) - - idx2 = ChunkIndex() - idx2[H(3)] = 2**32 - 123456, 6 - idx1.merge(idx2) - assert idx1[H(3)] == (ChunkIndex.MAX_VALUE, 6) - class HashIndexIntegrityTestCase(HashIndexDataTestCase): def write_integrity_checked_index(self, tempdir): @@ -549,25 +363,14 @@ def test_all_at_back(self): self.compare_compact("ED****") self.compare_compact("D*****") - def test_merge(self): - master = ChunkIndex() - idx1 = ChunkIndex() - idx1[H(1)] = 1, 100 - idx1[H(2)] = 2, 200 - idx1[H(3)] = 3, 300 - idx1.compact() - assert idx1.size() == 1024 + 3 * (32 + 2 * 4) - master.merge(idx1) - self.compare_indexes(idx1, master) - class NSIndexTestCase(BaseTestCase): def test_nsindex_segment_limit(self): idx = NSIndex() with self.assert_raises(AssertionError): - idx[H(1)] = NSIndex.MAX_VALUE + 1, 0, 0, 0 + idx[H(1)] = NSIndex.MAX_VALUE + 1, 0, 0 assert H(1) not in idx - idx[H(2)] = NSIndex.MAX_VALUE, 0, 0, 0 + idx[H(2)] = NSIndex.MAX_VALUE, 0, 0 assert H(2) in idx @@ -595,7 +398,7 @@ def HH(x, y, z): for y in range(700): # stay below max load not to trigger resize idx[HH(0, y, 0)] = (0, y, 0) - assert idx.size() == 1024 + 1031 * 48 # header + 1031 buckets + assert idx.size() == 1024 + 1031 * 44 # header + 1031 buckets # delete lots of the collisions, creating lots of tombstones for y in range(400): # stay above min load not to trigger resize diff --git a/src/borg/testsuite/helpers.py b/src/borg/testsuite/helpers.py index c9a83f6ab4..4f29ce7481 100644 --- a/src/borg/testsuite/helpers.py +++ b/src/borg/testsuite/helpers.py @@ -187,6 +187,14 @@ def test_ssh(self, monkeypatch, keys_dir): "host='2a02:0001:0002:0003:0004:0005:0006:0007', port=1234, path='/some/path')" ) + def test_sftp(self, monkeypatch, keys_dir): + monkeypatch.delenv("BORG_REPO", raising=False) + assert ( + repr(Location("sftp://user@host:1234/some/path")) + == "Location(proto='sftp', user='user', host='host', port=1234, path='/some/path')" + ) + assert Location("sftp://user@host:1234/some/path").to_key_filename() == keys_dir + "host__some_path" + def test_socket(self, monkeypatch, keys_dir): monkeypatch.delenv("BORG_REPO", raising=False) assert ( diff --git a/src/borg/testsuite/legacyrepository.py b/src/borg/testsuite/legacyrepository.py new file mode 100644 index 0000000000..d4e33097bf --- /dev/null +++ b/src/borg/testsuite/legacyrepository.py @@ -0,0 +1,1115 @@ +import logging +import os +import sys +from typing import Optional +from unittest.mock import patch + +import pytest + +from ..checksums import xxh64 +from ..hashindex import NSIndex +from ..helpers import Location +from ..helpers import IntegrityError +from ..helpers import msgpack +from ..fslocking import Lock, LockFailed +from ..platformflags import is_win32 +from ..legacyremote import LegacyRemoteRepository, InvalidRPCMethod, PathNotAllowed +from ..legacyrepository import LegacyRepository, LoggedIO +from ..legacyrepository import MAGIC, MAX_DATA_SIZE, TAG_DELETE, TAG_PUT2, TAG_PUT, TAG_COMMIT +from ..repoobj import RepoObj +from .hashindex import H + + +@pytest.fixture() +def repository(tmp_path): + 
repository_location = os.fspath(tmp_path / "repository") + yield LegacyRepository(repository_location, exclusive=True, create=True) + + +@pytest.fixture() +def remote_repository(tmp_path): + if is_win32: + pytest.skip("Remote repository does not yet work on Windows.") + repository_location = Location("ssh://__testsuite__" + os.fspath(tmp_path / "repository")) + yield LegacyRemoteRepository(repository_location, exclusive=True, create=True) + + +def pytest_generate_tests(metafunc): + # Generates tests that run on both local and remote repos + if "repo_fixtures" in metafunc.fixturenames: + metafunc.parametrize("repo_fixtures", ["repository", "remote_repository"]) + + +def get_repository_from_fixture(repo_fixtures, request): + # returns the repo object from the fixture for tests that run on both local and remote repos + return request.getfixturevalue(repo_fixtures) + + +def reopen(repository, exclusive: Optional[bool] = True, create=False): + if isinstance(repository, LegacyRepository): + if repository.io is not None or repository.lock is not None: + raise RuntimeError("Repo must be closed before a reopen. Cannot support nested repository contexts.") + return LegacyRepository(repository.path, exclusive=exclusive, create=create) + + if isinstance(repository, LegacyRemoteRepository): + if repository.p is not None or repository.sock is not None: + raise RuntimeError("Remote repo must be closed before a reopen. Cannot support nested repository contexts.") + return LegacyRemoteRepository(repository.location, exclusive=exclusive, create=create) + + raise TypeError( + f"Invalid argument type. Expected 'Repository' or 'RemoteRepository', received '{type(repository).__name__}'." + ) + + +def get_path(repository): + if isinstance(repository, LegacyRepository): + return repository.path + + if isinstance(repository, LegacyRemoteRepository): + return repository.location.path + + raise TypeError( + f"Invalid argument type. Expected 'Repository' or 'RemoteRepository', received '{type(repository).__name__}'." + ) + + +def fchunk(data, meta=b""): + # create a raw chunk that has valid RepoObj layout, but does not use encryption or compression. 
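+    # layout, as packed below: obj_header(meta_size, data_size, xxh64(meta), xxh64(data)) | meta | data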
+ hdr = RepoObj.obj_header.pack(len(meta), len(data), xxh64(meta), xxh64(data)) + assert isinstance(data, bytes) + chunk = hdr + meta + data + return chunk + + +def pchunk(chunk): + # parse data and meta from a raw chunk made by fchunk + hdr_size = RepoObj.obj_header.size + hdr = chunk[:hdr_size] + meta_size, data_size = RepoObj.obj_header.unpack(hdr)[0:2] + meta = chunk[hdr_size : hdr_size + meta_size] + data = chunk[hdr_size + meta_size : hdr_size + meta_size + data_size] + return data, meta + + +def pdchunk(chunk): + # parse only data from a raw chunk made by fchunk + return pchunk(chunk)[0] + + +def add_keys(repository): + repository.put(H(0), fchunk(b"foo")) + repository.put(H(1), fchunk(b"bar")) + repository.put(H(3), fchunk(b"bar")) + repository.commit(compact=False) + repository.put(H(1), fchunk(b"bar2")) + repository.put(H(2), fchunk(b"boo")) + repository.delete(H(3)) + + +def repo_dump(repository, label=None): + label = label + ": " if label is not None else "" + H_trans = {H(i): i for i in range(10)} + H_trans[None] = -1 # key == None appears in commits + tag_trans = {TAG_PUT2: "put2", TAG_PUT: "put", TAG_DELETE: "del", TAG_COMMIT: "comm"} + for segment, fn in repository.io.segment_iterator(): + for tag, key, offset, size, _ in repository.io.iter_objects(segment): + print("%s%s H(%d) -> %s[%d..+%d]" % (label, tag_trans[tag], H_trans[key], fn, offset, size)) + print() + + +def test_basic_operations(repo_fixtures, request): + with get_repository_from_fixture(repo_fixtures, request) as repository: + for x in range(100): + repository.put(H(x), fchunk(b"SOMEDATA")) + key50 = H(50) + assert pdchunk(repository.get(key50)) == b"SOMEDATA" + repository.delete(key50) + with pytest.raises(LegacyRepository.ObjectNotFound): + repository.get(key50) + repository.commit(compact=False) + with reopen(repository) as repository: + with pytest.raises(LegacyRepository.ObjectNotFound): + repository.get(key50) + for x in range(100): + if x == 50: + continue + assert pdchunk(repository.get(H(x))) == b"SOMEDATA" + + +def test_multiple_transactions(repo_fixtures, request): + with get_repository_from_fixture(repo_fixtures, request) as repository: + repository.put(H(0), fchunk(b"foo")) + repository.put(H(1), fchunk(b"foo")) + repository.commit(compact=False) + repository.delete(H(0)) + repository.put(H(1), fchunk(b"bar")) + repository.commit(compact=False) + assert pdchunk(repository.get(H(1))) == b"bar" + + +def test_read_data(repo_fixtures, request): + with get_repository_from_fixture(repo_fixtures, request) as repository: + meta, data = b"meta", b"data" + hdr = RepoObj.obj_header.pack(len(meta), len(data), xxh64(meta), xxh64(data)) + chunk_complete = hdr + meta + data + chunk_short = hdr + meta + repository.put(H(0), chunk_complete) + repository.commit(compact=False) + assert repository.get(H(0)) == chunk_complete + assert repository.get(H(0), read_data=True) == chunk_complete + assert repository.get(H(0), read_data=False) == chunk_short + + +def test_consistency(repo_fixtures, request): + with get_repository_from_fixture(repo_fixtures, request) as repository: + repository.put(H(0), fchunk(b"foo")) + assert pdchunk(repository.get(H(0))) == b"foo" + repository.put(H(0), fchunk(b"foo2")) + assert pdchunk(repository.get(H(0))) == b"foo2" + repository.put(H(0), fchunk(b"bar")) + assert pdchunk(repository.get(H(0))) == b"bar" + repository.delete(H(0)) + with pytest.raises(LegacyRepository.ObjectNotFound): + repository.get(H(0)) + + +def test_consistency2(repo_fixtures, request): + with 
get_repository_from_fixture(repo_fixtures, request) as repository: + repository.put(H(0), fchunk(b"foo")) + assert pdchunk(repository.get(H(0))) == b"foo" + repository.commit(compact=False) + repository.put(H(0), fchunk(b"foo2")) + assert pdchunk(repository.get(H(0))) == b"foo2" + repository.rollback() + assert pdchunk(repository.get(H(0))) == b"foo" + + +def test_overwrite_in_same_transaction(repo_fixtures, request): + with get_repository_from_fixture(repo_fixtures, request) as repository: + repository.put(H(0), fchunk(b"foo")) + repository.put(H(0), fchunk(b"foo2")) + repository.commit(compact=False) + assert pdchunk(repository.get(H(0))) == b"foo2" + + +def test_single_kind_transactions(repo_fixtures, request): + # put + with get_repository_from_fixture(repo_fixtures, request) as repository: + repository.put(H(0), fchunk(b"foo")) + repository.commit(compact=False) + # replace + with reopen(repository) as repository: + repository.put(H(0), fchunk(b"bar")) + repository.commit(compact=False) + # delete + with reopen(repository) as repository: + repository.delete(H(0)) + repository.commit(compact=False) + + +def test_list(repo_fixtures, request): + with get_repository_from_fixture(repo_fixtures, request) as repository: + for x in range(100): + repository.put(H(x), fchunk(b"SOMEDATA")) + repository.commit(compact=False) + repo_list = repository.list() + assert len(repo_list) == 100 + first_half = repository.list(limit=50) + assert len(first_half) == 50 + assert first_half == repo_list[:50] + second_half = repository.list(marker=first_half[-1]) + assert len(second_half) == 50 + assert second_half == repo_list[50:] + assert len(repository.list(limit=50)) == 50 + + +def test_max_data_size(repo_fixtures, request): + with get_repository_from_fixture(repo_fixtures, request) as repository: + max_data = b"x" * (MAX_DATA_SIZE - RepoObj.obj_header.size) + repository.put(H(0), fchunk(max_data)) + assert pdchunk(repository.get(H(0))) == max_data + with pytest.raises(IntegrityError): + repository.put(H(1), fchunk(max_data + b"x")) + + +def _assert_sparse(repository): + # the superseded 123456... PUT + assert repository.compact[0] == 41 + 8 + len(fchunk(b"123456789")) + # a COMMIT + assert repository.compact[1] == 9 + # the DELETE issued by the superseding PUT (or issued directly) + assert repository.compact[2] == 41 + repository._rebuild_sparse(0) + assert repository.compact[0] == 41 + 8 + len(fchunk(b"123456789")) # 9 is chunk or commit? + + +def test_sparse1(repository): + with repository: + repository.put(H(0), fchunk(b"foo")) + repository.put(H(1), fchunk(b"123456789")) + repository.commit(compact=False) + repository.put(H(1), fchunk(b"bar")) + _assert_sparse(repository) + + +def test_sparse2(repository): + with repository: + repository.put(H(0), fchunk(b"foo")) + repository.put(H(1), fchunk(b"123456789")) + repository.commit(compact=False) + repository.delete(H(1)) + _assert_sparse(repository) + + +def test_sparse_delete(repository): + with repository: + chunk0 = fchunk(b"1245") + repository.put(H(0), chunk0) + repository.delete(H(0)) + repository.io._write_fd.sync() + # the on-line tracking works on a per-object basis... 
+ assert repository.compact[0] == 41 + 8 + 41 + len(chunk0) + repository._rebuild_sparse(0) + # ...while _rebuild_sparse can mark whole segments as completely sparse (which then includes the segment magic) + assert repository.compact[0] == 41 + 8 + 41 + len(chunk0) + len(MAGIC) + repository.commit(compact=True) + assert 0 not in [segment for segment, _ in repository.io.segment_iterator()] + + +def test_uncommitted_garbage(repository): + with repository: + # uncommitted garbage should be no problem, it is cleaned up automatically. + # we just have to be careful with invalidation of cached FDs in LoggedIO. + repository.put(H(0), fchunk(b"foo")) + repository.commit(compact=False) + # write some crap to an uncommitted segment file + last_segment = repository.io.get_latest_segment() + with open(repository.io.segment_filename(last_segment + 1), "wb") as f: + f.write(MAGIC + b"crapcrapcrap") + with reopen(repository) as repository: + # usually, opening the repo and starting a transaction should trigger a cleanup. + repository.put(H(0), fchunk(b"bar")) # this may trigger compact_segments() + repository.commit(compact=True) + # the point here is that nothing blows up with an exception. + + +def test_replay_of_missing_index(repository): + with repository: + add_keys(repository) + for name in os.listdir(repository.path): + if name.startswith("index."): + os.unlink(os.path.join(repository.path, name)) + with reopen(repository) as repository: + assert len(repository) == 3 + assert repository.check() is True + + +def test_crash_before_compact_segments(repository): + with repository: + add_keys(repository) + repository.compact_segments = None + try: + repository.commit(compact=True) + except TypeError: + pass + with reopen(repository) as repository: + assert len(repository) == 3 + assert repository.check() is True + + +def test_crash_before_write_index(repository): + with repository: + add_keys(repository) + repository.write_index = None + try: + repository.commit(compact=False) + except TypeError: + pass + with reopen(repository) as repository: + assert len(repository) == 3 + assert repository.check() is True + + +def test_replay_lock_upgrade_old(repository): + with repository: + add_keys(repository) + for name in os.listdir(repository.path): + if name.startswith("index."): + os.unlink(os.path.join(repository.path, name)) + with patch.object(Lock, "upgrade", side_effect=LockFailed) as upgrade: + with reopen(repository, exclusive=None) as repository: + # simulate old client that always does lock upgrades + # the repo is only locked by a shared read lock, but to replay segments, + # we need an exclusive write lock - check if the lock gets upgraded. + with pytest.raises(LockFailed): + len(repository) + upgrade.assert_called_once_with() + + +def test_replay_lock_upgrade(repository): + with repository: + add_keys(repository) + for name in os.listdir(repository.path): + if name.startswith("index."): + os.unlink(os.path.join(repository.path, name)) + with patch.object(Lock, "upgrade", side_effect=LockFailed) as upgrade: + with reopen(repository, exclusive=False) as repository: + # current client usually does not do lock upgrade, except for replay + # the repo is only locked by a shared read lock, but to replay segments, + # we need an exclusive write lock - check if the lock gets upgraded. 
+ with pytest.raises(LockFailed): + len(repository) + upgrade.assert_called_once_with() + + +def test_crash_before_deleting_compacted_segments(repository): + with repository: + add_keys(repository) + repository.io.delete_segment = None + try: + repository.commit(compact=False) + except TypeError: + pass + with reopen(repository) as repository: + assert len(repository) == 3 + assert repository.check() is True + assert len(repository) == 3 + + +def test_ignores_commit_tag_in_data(repository): + with repository: + repository.put(H(0), LoggedIO.COMMIT) + with reopen(repository) as repository: + io = repository.io + assert not io.is_committed_segment(io.get_latest_segment()) + + +def test_moved_deletes_are_tracked(repository): + with repository: + repository.put(H(1), fchunk(b"1")) + repository.put(H(2), fchunk(b"2")) + repository.commit(compact=False) + repo_dump(repository, "p1 p2 c") + repository.delete(H(1)) + repository.commit(compact=True) + repo_dump(repository, "d1 cc") + last_segment = repository.io.get_latest_segment() - 1 + num_deletes = 0 + for tag, key, offset, size, _ in repository.io.iter_objects(last_segment): + if tag == TAG_DELETE: + assert key == H(1) + num_deletes += 1 + assert num_deletes == 1 + assert last_segment in repository.compact + repository.put(H(3), fchunk(b"3")) + repository.commit(compact=True) + repo_dump(repository, "p3 cc") + assert last_segment not in repository.compact + assert not repository.io.segment_exists(last_segment) + for segment, _ in repository.io.segment_iterator(): + for tag, key, offset, size, _ in repository.io.iter_objects(segment): + assert tag != TAG_DELETE + assert key != H(1) + # after compaction, there should be no empty shadowed_segments lists left over. + # we have no put or del anymore for H(1), so we lost knowledge about H(1). + assert H(1) not in repository.shadow_index + + +def test_shadowed_entries_are_preserved1(repository): + # this tests the shadowing-by-del behaviour + with repository: + get_latest_segment = repository.io.get_latest_segment + repository.put(H(1), fchunk(b"1")) + # This is the segment with our original PUT of interest + put_segment = get_latest_segment() + repository.commit(compact=False) + # we now delete H(1), and force this segment not to be compacted, which can happen + # if it's not sparse enough (symbolized by H(2) here). + repository.delete(H(1)) + repository.put(H(2), fchunk(b"1")) + del_segment = get_latest_segment() + # we pretend these are mostly dense (not sparse) and won't be compacted + del repository.compact[put_segment] + del repository.compact[del_segment] + repository.commit(compact=True) + # we now perform an unrelated operation on the segment containing the DELETE, + # causing it to be compacted. + repository.delete(H(2)) + repository.commit(compact=True) + assert repository.io.segment_exists(put_segment) + assert not repository.io.segment_exists(del_segment) + # basic case, since the index survived this must be ok + assert H(1) not in repository + # nuke index, force replay + os.unlink(os.path.join(repository.path, "index.%d" % get_latest_segment())) + # must not reappear + assert H(1) not in repository + + +def test_shadowed_entries_are_preserved2(repository): + # this tests the shadowing-by-double-put behaviour, see issue #5661 + # assume this repo state: + # seg1: PUT H1 + # seg2: COMMIT + # seg3: DEL H1, PUT H1, DEL H1, PUT H2 + # seg4: COMMIT + # Note how due to the final DEL H1 in seg3, H1 is effectively deleted. 
+ # + # compaction of only seg3: + # PUT H1 gets dropped because it is not needed any more. + # DEL H1 must be kept, because there is still a PUT H1 in seg1 which must not + # "reappear" in the index if the index gets rebuilt. + with repository: + get_latest_segment = repository.io.get_latest_segment + repository.put(H(1), fchunk(b"1")) + # This is the segment with our original PUT of interest + put_segment = get_latest_segment() + repository.commit(compact=False) + # We now put H(1) again (which implicitly does DEL(H(1)) followed by PUT(H(1), ...)), + # delete H(1) afterwards, and force this segment to not be compacted, which can happen + # if it's not sparse enough (symbolized by H(2) here). + repository.put(H(1), fchunk(b"1")) + repository.delete(H(1)) + repository.put(H(2), fchunk(b"1")) + delete_segment = get_latest_segment() + # We pretend these are mostly dense (not sparse) and won't be compacted + del repository.compact[put_segment] + del repository.compact[delete_segment] + repository.commit(compact=True) + # Now we perform an unrelated operation on the segment containing the DELETE, + # causing it to be compacted. + repository.delete(H(2)) + repository.commit(compact=True) + assert repository.io.segment_exists(put_segment) + assert not repository.io.segment_exists(delete_segment) + # Basic case, since the index survived this must be ok + assert H(1) not in repository + # Nuke index, force replay + os.unlink(os.path.join(repository.path, "index.%d" % get_latest_segment())) + # Must not reappear + assert H(1) not in repository # F + + +def test_shadow_index_rollback(repository): + with repository: + repository.put(H(1), fchunk(b"1")) + repository.delete(H(1)) + assert repository.shadow_index[H(1)] == [0] + repository.commit(compact=True) + repo_dump(repository, "p1 d1 cc") + # note how an empty list means that nothing is shadowed for sure + assert repository.shadow_index[H(1)] == [] # because the deletion is considered unstable + repository.put(H(1), b"1") + repository.delete(H(1)) + repo_dump(repository, "p1 d1") + # 0 put/delete; 1 commit; 2 compacted; 3 commit; 4 put/delete + assert repository.shadow_index[H(1)] == [4] + repository.rollback() + repo_dump(repository, "r") + repository.put(H(2), fchunk(b"1")) + # after the rollback, segment 4 shouldn't be considered anymore + assert repository.shadow_index[H(1)] == [] # because the deletion is considered unstable + + +def test_destroy_append_only(repository): + with repository: + # can't destroy append only repo (via the API) + repository.append_only = True + with pytest.raises(ValueError): + repository.destroy() + assert repository.append_only + + +def test_append_only(repository): + def segments_in_repository(repo): + return len(list(repo.io.segment_iterator())) + + with repository: + repository.append_only = True + repository.put(H(0), fchunk(b"foo")) + repository.commit(compact=False) + + repository.append_only = False + assert segments_in_repository(repository) == 2 + repository.put(H(0), fchunk(b"foo")) + repository.commit(compact=True) + # normal: compact squashes the data together, only one segment + assert segments_in_repository(repository) == 2 + + repository.append_only = True + assert segments_in_repository(repository) == 2 + repository.put(H(0), fchunk(b"foo")) + repository.commit(compact=False) + # append only: does not compact, only new segments written + assert segments_in_repository(repository) == 4 + + +def test_additional_free_space(repository): + with repository: + add_keys(repository) + 
repository.config.set("repository", "additional_free_space", "1000T") + repository.save_key(b"shortcut to save_config") + with reopen(repository) as repository: + repository.put(H(0), fchunk(b"foobar")) + with pytest.raises(LegacyRepository.InsufficientFreeSpaceError): + repository.commit(compact=False) + assert os.path.exists(repository.path) + + +def test_create_free_space(repository): + with repository: + repository.additional_free_space = 1e20 + with pytest.raises(LegacyRepository.InsufficientFreeSpaceError): + add_keys(repository) + assert not os.path.exists(repository.path) + + +def test_tracking(repository): + with repository: + assert repository.storage_quota_use == 0 + ch1 = fchunk(bytes(1234)) + repository.put(H(1), ch1) + assert repository.storage_quota_use == len(ch1) + 41 + 8 + ch2 = fchunk(bytes(5678)) + repository.put(H(2), ch2) + assert repository.storage_quota_use == len(ch1) + len(ch2) + 2 * (41 + 8) + repository.delete(H(1)) + assert repository.storage_quota_use == len(ch1) + len(ch2) + 2 * (41 + 8) # we have not compacted yet + repository.commit(compact=False) + assert repository.storage_quota_use == len(ch1) + len(ch2) + 2 * (41 + 8) # we have not compacted yet + with reopen(repository) as repository: + # open new transaction; hints and thus quota data is not loaded unless needed. + ch3 = fchunk(b"") + repository.put(H(3), ch3) + repository.delete(H(3)) + assert repository.storage_quota_use == len(ch1) + len(ch2) + len(ch3) + 3 * ( + 41 + 8 + ) # we have not compacted yet + repository.commit(compact=True) + assert repository.storage_quota_use == len(ch2) + 41 + 8 + + +def test_exceed_quota(repository): + with repository: + assert repository.storage_quota_use == 0 + repository.storage_quota = 80 + ch1 = fchunk(b"x" * 7) + repository.put(H(1), ch1) + assert repository.storage_quota_use == len(ch1) + 41 + 8 + repository.commit(compact=False) + with pytest.raises(LegacyRepository.StorageQuotaExceeded): + ch2 = fchunk(b"y" * 13) + repository.put(H(2), ch2) + assert repository.storage_quota_use == len(ch1) + len(ch2) + (41 + 8) * 2 # check ch2!? + with pytest.raises(LegacyRepository.StorageQuotaExceeded): + repository.commit(compact=False) + assert repository.storage_quota_use == len(ch1) + len(ch2) + (41 + 8) * 2 # check ch2!? + with reopen(repository) as repository: + repository.storage_quota = 161 + # open new transaction; hints and thus quota data is not loaded unless needed. + repository.put(H(1), ch1) + # we have 2 puts for H(1) here and not yet compacted. + assert repository.storage_quota_use == len(ch1) * 2 + (41 + 8) * 2 + repository.commit(compact=True) + assert repository.storage_quota_use == len(ch1) + 41 + 8 # now we have compacted. 
+ + +def make_auxiliary(repository): + with repository: + repository.put(H(0), fchunk(b"foo")) + repository.commit(compact=False) + + +def do_commit(repository): + with repository: + repository.put(H(0), fchunk(b"fox")) + repository.commit(compact=False) + + +def test_corrupted_hints(repository): + make_auxiliary(repository) + with open(os.path.join(repository.path, "hints.1"), "ab") as fd: + fd.write(b"123456789") + do_commit(repository) + + +def test_deleted_hints(repository): + make_auxiliary(repository) + os.unlink(os.path.join(repository.path, "hints.1")) + do_commit(repository) + + +def test_deleted_index(repository): + make_auxiliary(repository) + os.unlink(os.path.join(repository.path, "index.1")) + do_commit(repository) + + +def test_unreadable_hints(repository): + make_auxiliary(repository) + hints = os.path.join(repository.path, "hints.1") + os.unlink(hints) + os.mkdir(hints) + with pytest.raises(OSError): + do_commit(repository) + + +def test_index(repository): + make_auxiliary(repository) + with open(os.path.join(repository.path, "index.1"), "wb") as fd: + fd.write(b"123456789") + do_commit(repository) + + +def test_index_outside_transaction(repository): + make_auxiliary(repository) + with open(os.path.join(repository.path, "index.1"), "wb") as fd: + fd.write(b"123456789") + with repository: + assert len(repository) == 1 + + +def _corrupt_index(repository): + # HashIndex is able to detect incorrect headers and file lengths, + # but on its own it can't tell if the data is correct. + index_path = os.path.join(repository.path, "index.1") + with open(index_path, "r+b") as fd: + index_data = fd.read() + # Flip one bit in a key stored in the index + corrupted_key = (int.from_bytes(H(0), "little") ^ 1).to_bytes(32, "little") + corrupted_index_data = index_data.replace(H(0), corrupted_key) + assert corrupted_index_data != index_data + assert len(corrupted_index_data) == len(index_data) + fd.seek(0) + fd.write(corrupted_index_data) + + +def test_index_corrupted(repository): + make_auxiliary(repository) + _corrupt_index(repository) + with repository: + # data corruption is detected due to mismatching checksums, and fixed by rebuilding the index. + assert len(repository) == 1 + assert pdchunk(repository.get(H(0))) == b"foo" + + +def test_index_corrupted_without_integrity(repository): + make_auxiliary(repository) + _corrupt_index(repository) + integrity_path = os.path.join(repository.path, "integrity.1") + os.unlink(integrity_path) + with repository: + # since the corrupted key is not noticed, the repository still thinks it contains one key... + assert len(repository) == 1 + with pytest.raises(LegacyRepository.ObjectNotFound): + # ... but the real, uncorrupted key is not found in the corrupted index. + repository.get(H(0)) + + +def test_unreadable_index(repository): + make_auxiliary(repository) + index = os.path.join(repository.path, "index.1") + os.unlink(index) + os.mkdir(index) + with pytest.raises(OSError): + do_commit(repository) + + +def test_unknown_integrity_version(repository): + make_auxiliary(repository) + # for now an unknown integrity data version is ignored and not an error. 
+ integrity_path = os.path.join(repository.path, "integrity.1") + with open(integrity_path, "r+b") as fd: + msgpack.pack({b"version": 4.7}, fd) # borg only understands version 2 + fd.truncate() + with repository: + # no issues accessing the repository + assert len(repository) == 1 + assert pdchunk(repository.get(H(0))) == b"foo" + + +def _subtly_corrupted_hints_setup(repository): + with repository: + repository.append_only = True + assert len(repository) == 1 + assert pdchunk(repository.get(H(0))) == b"foo" + repository.put(H(1), fchunk(b"bar")) + repository.put(H(2), fchunk(b"baz")) + repository.commit(compact=False) + repository.put(H(2), fchunk(b"bazz")) + repository.commit(compact=False) + hints_path = os.path.join(repository.path, "hints.5") + with open(hints_path, "r+b") as fd: + hints = msgpack.unpack(fd) + fd.seek(0) + # corrupt segment refcount + assert hints["segments"][2] == 1 + hints["segments"][2] = 0 + msgpack.pack(hints, fd) + fd.truncate() + + +def test_subtly_corrupted_hints(repository): + make_auxiliary(repository) + _subtly_corrupted_hints_setup(repository) + with repository: + repository.append_only = False + repository.put(H(3), fchunk(b"1234")) + # do a compaction run, which succeeds since the failed checksum prompted a rebuild of the index+hints. + repository.commit(compact=True) + assert len(repository) == 4 + assert pdchunk(repository.get(H(0))) == b"foo" + assert pdchunk(repository.get(H(1))) == b"bar" + assert pdchunk(repository.get(H(2))) == b"bazz" + + +def test_subtly_corrupted_hints_without_integrity(repository): + make_auxiliary(repository) + _subtly_corrupted_hints_setup(repository) + integrity_path = os.path.join(repository.path, "integrity.5") + os.unlink(integrity_path) + with repository: + repository.append_only = False + repository.put(H(3), fchunk(b"1234")) + # do a compaction run, which fails since the corrupted refcount wasn't detected and causes an assertion failure. 
+ with pytest.raises(AssertionError) as exc_info: + repository.commit(compact=True) + assert "Corrupted segment reference count" in str(exc_info.value) + + +def list_indices(repo_path): + return [name for name in os.listdir(repo_path) if name.startswith("index.")] + + +def check(repository, repo_path, repair=False, status=True): + assert repository.check(repair=repair) == status + # Make sure no tmp files are left behind + tmp_files = [name for name in os.listdir(repo_path) if "tmp" in name] + assert tmp_files == [], "Found tmp files" + + +def get_objects(repository, *ids): + for id_ in ids: + pdchunk(repository.get(H(id_))) + + +def add_objects(repository, segments): + for ids in segments: + for id_ in ids: + repository.put(H(id_), fchunk(b"data")) + repository.commit(compact=False) + + +def get_head(repo_path): + return sorted(int(n) for n in os.listdir(os.path.join(repo_path, "data", "0")) if n.isdigit())[-1] + + +def open_index(repo_path): + return NSIndex.read(os.path.join(repo_path, f"index.{get_head(repo_path)}")) + + +def corrupt_object(repo_path, id_): + idx = open_index(repo_path) + segment, offset, _ = idx[H(id_)] + with open(os.path.join(repo_path, "data", "0", str(segment)), "r+b") as fd: + fd.seek(offset) + fd.write(b"BOOM") + + +def delete_segment(repository, segment): + repository.io.delete_segment(segment) + + +def delete_index(repo_path): + os.unlink(os.path.join(repo_path, f"index.{get_head(repo_path)}")) + + +def rename_index(repo_path, new_name): + os.replace(os.path.join(repo_path, f"index.{get_head(repo_path)}"), os.path.join(repo_path, new_name)) + + +def list_objects(repository): + return {int(key) for key in repository.list()} + + +def test_repair_corrupted_segment(repo_fixtures, request): + with get_repository_from_fixture(repo_fixtures, request) as repository: + repo_path = get_path(repository) + add_objects(repository, [[1, 2, 3], [4, 5], [6]]) + assert {1, 2, 3, 4, 5, 6} == list_objects(repository) + check(repository, repo_path, status=True) + corrupt_object(repo_path, 5) + with pytest.raises(IntegrityError): + get_objects(repository, 5) + repository.rollback() + # make sure a regular check does not repair anything + check(repository, repo_path, status=False) + check(repository, repo_path, status=False) + # make sure a repair actually repairs the repo + check(repository, repo_path, repair=True, status=True) + get_objects(repository, 4) + check(repository, repo_path, status=True) + assert {1, 2, 3, 4, 6} == list_objects(repository) + + +def test_repair_missing_segment(repository): + # only test on local repo - files in RemoteRepository cannot be deleted + with repository: + add_objects(repository, [[1, 2, 3], [4, 5, 6]]) + assert {1, 2, 3, 4, 5, 6} == list_objects(repository) + check(repository, repository.path, status=True) + delete_segment(repository, 2) + repository.rollback() + check(repository, repository.path, repair=True, status=True) + assert {1, 2, 3} == list_objects(repository) + + +def test_repair_missing_commit_segment(repository): + # only test on local repo - files in RemoteRepository cannot be deleted + with repository: + add_objects(repository, [[1, 2, 3], [4, 5, 6]]) + delete_segment(repository, 3) + with pytest.raises(LegacyRepository.ObjectNotFound): + get_objects(repository, 4) + assert {1, 2, 3} == list_objects(repository) + + +def test_repair_corrupted_commit_segment(repo_fixtures, request): + with get_repository_from_fixture(repo_fixtures, request) as repository: + repo_path = get_path(repository) + add_objects(repository, [[1, 2, 3], [4, 
5, 6]]) + with open(os.path.join(repo_path, "data", "0", "3"), "r+b") as fd: + fd.seek(-1, os.SEEK_END) + fd.write(b"X") + with pytest.raises(LegacyRepository.ObjectNotFound): + get_objects(repository, 4) + check(repository, repo_path, status=True) + get_objects(repository, 3) + assert {1, 2, 3} == list_objects(repository) + + +def test_repair_no_commits(repo_fixtures, request): + with get_repository_from_fixture(repo_fixtures, request) as repository: + repo_path = get_path(repository) + add_objects(repository, [[1, 2, 3]]) + with open(os.path.join(repo_path, "data", "0", "1"), "r+b") as fd: + fd.seek(-1, os.SEEK_END) + fd.write(b"X") + with pytest.raises(LegacyRepository.CheckNeeded): + get_objects(repository, 4) + check(repository, repo_path, status=False) + check(repository, repo_path, status=False) + assert list_indices(repo_path) == ["index.1"] + check(repository, repo_path, repair=True, status=True) + assert list_indices(repo_path) == ["index.2"] + check(repository, repo_path, status=True) + get_objects(repository, 3) + assert {1, 2, 3} == list_objects(repository) + + +def test_repair_missing_index(repo_fixtures, request): + with get_repository_from_fixture(repo_fixtures, request) as repository: + repo_path = get_path(repository) + add_objects(repository, [[1, 2, 3], [4, 5, 6]]) + delete_index(repo_path) + check(repository, repo_path, status=True) + get_objects(repository, 4) + assert {1, 2, 3, 4, 5, 6} == list_objects(repository) + + +def test_repair_index_too_new(repo_fixtures, request): + with get_repository_from_fixture(repo_fixtures, request) as repository: + repo_path = get_path(repository) + add_objects(repository, [[1, 2, 3], [4, 5, 6]]) + assert list_indices(repo_path) == ["index.3"] + rename_index(repo_path, "index.100") + check(repository, repo_path, status=True) + assert list_indices(repo_path) == ["index.3"] + get_objects(repository, 4) + assert {1, 2, 3, 4, 5, 6} == list_objects(repository) + + +def test_crash_before_compact(repository): + # only test on local repo - we can't mock-patch a RemoteRepository class in another process! + with repository: + repository.put(H(0), fchunk(b"data")) + repository.put(H(0), fchunk(b"data2")) + # simulate a crash before compact + with patch.object(LegacyRepository, "compact_segments") as compact: + repository.commit(compact=True) + compact.assert_called_once_with(0.1) + with reopen(repository) as repository: + check(repository, repository.path, repair=True) + assert pdchunk(repository.get(H(0))) == b"data2" + + +def test_hints_persistence(repository): + with repository: + repository.put(H(0), fchunk(b"data")) + repository.delete(H(0)) + repository.commit(compact=False) + shadow_index_expected = repository.shadow_index + compact_expected = repository.compact + segments_expected = repository.segments + # close and re-open the repository (create fresh Repository instance) to + # check whether hints were persisted to / reloaded from disk + with reopen(repository) as repository: + repository.put(H(42), fchunk(b"foobar")) # this will call prepare_txn() and load the hints data + # check if hints persistence worked: + assert shadow_index_expected == repository.shadow_index + assert compact_expected == repository.compact + del repository.segments[2] # ignore the segment created by put(H(42), ...) 
+ assert segments_expected == repository.segments + with reopen(repository) as repository: + check(repository, repository.path, repair=True) + with reopen(repository) as repository: + repository.put(H(42), fchunk(b"foobar")) # this will call prepare_txn() and load the hints data + assert shadow_index_expected == repository.shadow_index + # sizes do not match, with vs. without header? + # assert compact_expected == repository.compact + del repository.segments[2] # ignore the segment created by put(H(42), ...) + assert segments_expected == repository.segments + + +def test_hints_behaviour(repository): + with repository: + repository.put(H(0), fchunk(b"data")) + assert repository.shadow_index == {} + assert len(repository.compact) == 0 + repository.delete(H(0)) + repository.commit(compact=False) + # now there should be an entry for H(0) in shadow_index + assert H(0) in repository.shadow_index + assert len(repository.shadow_index[H(0)]) == 1 + assert 0 in repository.compact # segment 0 can be compacted + repository.put(H(42), fchunk(b"foobar")) # see also do_compact() + repository.commit(compact=True, threshold=0.0) # compact completely! + # nothing to compact anymore! no info left about stuff that does not exist anymore: + assert H(0) not in repository.shadow_index + # segment 0 was compacted away, no info about it left: + assert 0 not in repository.compact + assert 0 not in repository.segments + + +def _get_mock_args(): + class MockArgs: + remote_path = "borg" + umask = 0o077 + debug_topics = [] + rsh = None + + def __contains__(self, item): + # to behave like argparse.Namespace + return hasattr(self, item) + + return MockArgs() + + +def test_remote_invalid_rpc(remote_repository): + with remote_repository: + with pytest.raises(InvalidRPCMethod): + remote_repository.call("__init__", {}) + + +def test_remote_rpc_exception_transport(remote_repository): + with remote_repository: + s1 = "test string" + + try: + remote_repository.call("inject_exception", {"kind": "DoesNotExist"}) + except LegacyRepository.DoesNotExist as e: + assert len(e.args) == 1 + assert e.args[0] == remote_repository.location.processed + + try: + remote_repository.call("inject_exception", {"kind": "AlreadyExists"}) + except LegacyRepository.AlreadyExists as e: + assert len(e.args) == 1 + assert e.args[0] == remote_repository.location.processed + + try: + remote_repository.call("inject_exception", {"kind": "CheckNeeded"}) + except LegacyRepository.CheckNeeded as e: + assert len(e.args) == 1 + assert e.args[0] == remote_repository.location.processed + + try: + remote_repository.call("inject_exception", {"kind": "IntegrityError"}) + except IntegrityError as e: + assert len(e.args) == 1 + assert e.args[0] == s1 + + try: + remote_repository.call("inject_exception", {"kind": "PathNotAllowed"}) + except PathNotAllowed as e: + assert len(e.args) == 1 + assert e.args[0] == "foo" + + try: + remote_repository.call("inject_exception", {"kind": "ObjectNotFound"}) + except LegacyRepository.ObjectNotFound as e: + assert len(e.args) == 2 + assert e.args[0] == s1 + assert e.args[1] == remote_repository.location.processed + + try: + remote_repository.call("inject_exception", {"kind": "InvalidRPCMethod"}) + except InvalidRPCMethod as e: + assert len(e.args) == 1 + assert e.args[0] == s1 + + try: + remote_repository.call("inject_exception", {"kind": "divide"}) + except LegacyRemoteRepository.RPCError as e: + assert e.unpacked + assert e.get_message() == "ZeroDivisionError: integer division or modulo by zero\n" + assert e.exception_class == 
"ZeroDivisionError" + assert len(e.exception_full) > 0 + + +def test_remote_ssh_cmd(remote_repository): + with remote_repository: + args = _get_mock_args() + remote_repository._args = args + assert remote_repository.ssh_cmd(Location("ssh://example.com/foo")) == ["ssh", "example.com"] + assert remote_repository.ssh_cmd(Location("ssh://user@example.com/foo")) == ["ssh", "user@example.com"] + assert remote_repository.ssh_cmd(Location("ssh://user@example.com:1234/foo")) == [ + "ssh", + "-p", + "1234", + "user@example.com", + ] + os.environ["BORG_RSH"] = "ssh --foo" + assert remote_repository.ssh_cmd(Location("ssh://example.com/foo")) == ["ssh", "--foo", "example.com"] + + +def test_remote_borg_cmd(remote_repository): + with remote_repository: + assert remote_repository.borg_cmd(None, testing=True) == [sys.executable, "-m", "borg", "serve"] + args = _get_mock_args() + # XXX without next line we get spurious test fails when using pytest-xdist, root cause unknown: + logging.getLogger().setLevel(logging.INFO) + # note: test logger is on info log level, so --info gets added automagically + assert remote_repository.borg_cmd(args, testing=False) == ["borg", "serve", "--info"] + args.remote_path = "borg-0.28.2" + assert remote_repository.borg_cmd(args, testing=False) == ["borg-0.28.2", "serve", "--info"] + args.debug_topics = ["something_client_side", "repository_compaction"] + assert remote_repository.borg_cmd(args, testing=False) == [ + "borg-0.28.2", + "serve", + "--info", + "--debug-topic=borg.debug.repository_compaction", + ] + args = _get_mock_args() + args.storage_quota = 0 + assert remote_repository.borg_cmd(args, testing=False) == ["borg", "serve", "--info"] + args.storage_quota = 314159265 + assert remote_repository.borg_cmd(args, testing=False) == [ + "borg", + "serve", + "--info", + "--storage-quota=314159265", + ] + args.rsh = "ssh -i foo" + remote_repository._args = args + assert remote_repository.ssh_cmd(Location("ssh://example.com/foo")) == ["ssh", "-i", "foo", "example.com"] diff --git a/src/borg/testsuite/platform.py b/src/borg/testsuite/platform.py index 40ea3d78c0..527bb11787 100644 --- a/src/borg/testsuite/platform.py +++ b/src/borg/testsuite/platform.py @@ -8,7 +8,7 @@ from ..platform import acl_get, acl_set from ..platform import get_process_id, process_alive from . 
import unopened_tempfile
-from .locking import free_pid  # NOQA
+from .fslocking import free_pid  # NOQA


 def fakeroot_detected():
diff --git a/src/borg/testsuite/repository.py b/src/borg/testsuite/repository.py
index ca60b53b81..05cb74ea40 100644
--- a/src/borg/testsuite/repository.py
+++ b/src/borg/testsuite/repository.py
@@ -2,18 +2,15 @@
 import os
 import sys
 from typing import Optional
-from unittest.mock import patch

 import pytest

-from ..hashindex import NSIndex
+from ..checksums import xxh64
 from ..helpers import Location
 from ..helpers import IntegrityError
-from ..helpers import msgpack
-from ..locking import Lock, LockFailed
 from ..platformflags import is_win32
 from ..remote import RemoteRepository, InvalidRPCMethod, PathNotAllowed
-from ..repository import Repository, LoggedIO, MAGIC, MAX_DATA_SIZE, TAG_DELETE, TAG_PUT2, TAG_PUT, TAG_COMMIT
+from ..repository import Repository, MAX_DATA_SIZE
 from ..repoobj import RepoObj
 from .hashindex import H

@@ -45,9 +42,9 @@ def get_repository_from_fixture(repo_fixtures, request):

 def reopen(repository, exclusive: Optional[bool] = True, create=False):
     if isinstance(repository, Repository):
-        if repository.io is not None or repository.lock is not None:
+        if repository.opened:
             raise RuntimeError("Repo must be closed before a reopen. Cannot support nested repository contexts.")
-        return Repository(repository.path, exclusive=exclusive, create=create)
+        return Repository(repository._location, exclusive=exclusive, create=create)

     if isinstance(repository, RemoteRepository):
         if repository.p is not None or repository.sock is not None:
@@ -59,33 +56,21 @@ def reopen(repository, exclusive: Optional[bool] = True, create=False):
     )


-def get_path(repository):
-    if isinstance(repository, Repository):
-        return repository.path
-
-    if isinstance(repository, RemoteRepository):
-        return repository.location.path
-
-    raise TypeError(
-        f"Invalid argument type. Expected 'Repository' or 'RemoteRepository', received '{type(repository).__name__}'."
-    )
-
-
 def fchunk(data, meta=b""):
-    # create a raw chunk that has valid RepoObj layout, but does not use encryption or compression.
-    meta_len = RepoObj.meta_len_hdr.pack(len(meta))
+    # format chunk: create a raw chunk that has valid RepoObj layout, but does not use encryption or compression.
+ hdr = RepoObj.obj_header.pack(len(meta), len(data), xxh64(meta), xxh64(data)) assert isinstance(data, bytes) - chunk = meta_len + meta + data + chunk = hdr + meta + data return chunk def pchunk(chunk): - # parse data and meta from a raw chunk made by fchunk - meta_len_size = RepoObj.meta_len_hdr.size - meta_len = chunk[:meta_len_size] - meta_len = RepoObj.meta_len_hdr.unpack(meta_len)[0] - meta = chunk[meta_len_size : meta_len_size + meta_len] - data = chunk[meta_len_size + meta_len :] + # parse chunk: parse data and meta from a raw chunk made by fchunk + hdr_size = RepoObj.obj_header.size + hdr = chunk[:hdr_size] + meta_size, data_size = RepoObj.obj_header.unpack(hdr)[0:2] + meta = chunk[hdr_size : hdr_size + meta_size] + data = chunk[hdr_size + meta_size : hdr_size + meta_size + data_size] return data, meta @@ -94,27 +79,6 @@ def pdchunk(chunk): return pchunk(chunk)[0] -def add_keys(repository): - repository.put(H(0), fchunk(b"foo")) - repository.put(H(1), fchunk(b"bar")) - repository.put(H(3), fchunk(b"bar")) - repository.commit(compact=False) - repository.put(H(1), fchunk(b"bar2")) - repository.put(H(2), fchunk(b"boo")) - repository.delete(H(3)) - - -def repo_dump(repository, label=None): - label = label + ": " if label is not None else "" - H_trans = {H(i): i for i in range(10)} - H_trans[None] = -1 # key == None appears in commits - tag_trans = {TAG_PUT2: "put2", TAG_PUT: "put", TAG_DELETE: "del", TAG_COMMIT: "comm"} - for segment, fn in repository.io.segment_iterator(): - for tag, key, offset, size, _ in repository.io.iter_objects(segment): - print("%s%s H(%d) -> %s[%d..+%d]" % (label, tag_trans[tag], H_trans[key], fn, offset, size)) - print() - - def test_basic_operations(repo_fixtures, request): with get_repository_from_fixture(repo_fixtures, request) as repository: for x in range(100): @@ -124,7 +88,6 @@ def test_basic_operations(repo_fixtures, request): repository.delete(key50) with pytest.raises(Repository.ObjectNotFound): repository.get(key50) - repository.commit(compact=False) with reopen(repository) as repository: with pytest.raises(Repository.ObjectNotFound): repository.get(key50) @@ -134,25 +97,13 @@ def test_basic_operations(repo_fixtures, request): assert pdchunk(repository.get(H(x))) == b"SOMEDATA" -def test_multiple_transactions(repo_fixtures, request): - with get_repository_from_fixture(repo_fixtures, request) as repository: - repository.put(H(0), fchunk(b"foo")) - repository.put(H(1), fchunk(b"foo")) - repository.commit(compact=False) - repository.delete(H(0)) - repository.put(H(1), fchunk(b"bar")) - repository.commit(compact=False) - assert pdchunk(repository.get(H(1))) == b"bar" - - def test_read_data(repo_fixtures, request): with get_repository_from_fixture(repo_fixtures, request) as repository: meta, data = b"meta", b"data" - meta_len = RepoObj.meta_len_hdr.pack(len(meta)) - chunk_complete = meta_len + meta + data - chunk_short = meta_len + meta + hdr = RepoObj.obj_header.pack(len(meta), len(data), xxh64(meta), xxh64(data)) + chunk_complete = hdr + meta + data + chunk_short = hdr + meta repository.put(H(0), chunk_complete) - repository.commit(compact=False) assert repository.get(H(0)) == chunk_complete assert repository.get(H(0), read_data=True) == chunk_complete assert repository.get(H(0), read_data=False) == chunk_short @@ -171,717 +122,30 @@ def test_consistency(repo_fixtures, request): repository.get(H(0)) -def test_consistency2(repo_fixtures, request): - with get_repository_from_fixture(repo_fixtures, request) as repository: - repository.put(H(0), 
fchunk(b"foo")) - assert pdchunk(repository.get(H(0))) == b"foo" - repository.commit(compact=False) - repository.put(H(0), fchunk(b"foo2")) - assert pdchunk(repository.get(H(0))) == b"foo2" - repository.rollback() - assert pdchunk(repository.get(H(0))) == b"foo" - - -def test_overwrite_in_same_transaction(repo_fixtures, request): - with get_repository_from_fixture(repo_fixtures, request) as repository: - repository.put(H(0), fchunk(b"foo")) - repository.put(H(0), fchunk(b"foo2")) - repository.commit(compact=False) - assert pdchunk(repository.get(H(0))) == b"foo2" - - -def test_single_kind_transactions(repo_fixtures, request): - # put - with get_repository_from_fixture(repo_fixtures, request) as repository: - repository.put(H(0), fchunk(b"foo")) - repository.commit(compact=False) - # replace - with reopen(repository) as repository: - repository.put(H(0), fchunk(b"bar")) - repository.commit(compact=False) - # delete - with reopen(repository) as repository: - repository.delete(H(0)) - repository.commit(compact=False) - - def test_list(repo_fixtures, request): with get_repository_from_fixture(repo_fixtures, request) as repository: for x in range(100): repository.put(H(x), fchunk(b"SOMEDATA")) - repository.commit(compact=False) repo_list = repository.list() assert len(repo_list) == 100 first_half = repository.list(limit=50) assert len(first_half) == 50 assert first_half == repo_list[:50] - second_half = repository.list(marker=first_half[-1]) + second_half = repository.list(marker=first_half[-1][0]) assert len(second_half) == 50 assert second_half == repo_list[50:] assert len(repository.list(limit=50)) == 50 -def test_scan(repo_fixtures, request): - with get_repository_from_fixture(repo_fixtures, request) as repository: - for x in range(100): - repository.put(H(x), fchunk(b"SOMEDATA")) - repository.commit(compact=False) - ids, _ = repository.scan() - assert len(ids) == 100 - first_half, state = repository.scan(limit=50) - assert len(first_half) == 50 - assert first_half == ids[:50] - second_half, _ = repository.scan(state=state) - assert len(second_half) == 50 - assert second_half == ids[50:] - # check result order == on-disk order (which is hash order) - for x in range(100): - assert ids[x] == H(x) - - -def test_scan_modify(repo_fixtures, request): - with get_repository_from_fixture(repo_fixtures, request) as repository: - for x in range(100): - repository.put(H(x), fchunk(b"ORIGINAL")) - repository.commit(compact=False) - # now we scan, read and modify chunks at the same time - count = 0 - ids, _ = repository.scan() - for id in ids: - # scan results are in same order as we put the chunks into the repo (into the segment file) - assert id == H(count) - chunk = repository.get(id) - # check that we **only** get data that was committed when we started scanning - # and that we do not run into the new data we put into the repo. - assert pdchunk(chunk) == b"ORIGINAL" - count += 1 - repository.put(id, fchunk(b"MODIFIED")) - assert count == 100 - repository.commit() - - # now we have committed all the modified chunks, and **only** must get the modified ones. 
- count = 0 - ids, _ = repository.scan() - for id in ids: - # scan results are in same order as we put the chunks into the repo (into the segment file) - assert id == H(count) - chunk = repository.get(id) - assert pdchunk(chunk) == b"MODIFIED" - count += 1 - assert count == 100 - - def test_max_data_size(repo_fixtures, request): with get_repository_from_fixture(repo_fixtures, request) as repository: - max_data = b"x" * (MAX_DATA_SIZE - RepoObj.meta_len_hdr.size) + max_data = b"x" * (MAX_DATA_SIZE - RepoObj.obj_header.size) repository.put(H(0), fchunk(max_data)) assert pdchunk(repository.get(H(0))) == max_data with pytest.raises(IntegrityError): repository.put(H(1), fchunk(max_data + b"x")) -def test_set_flags(repo_fixtures, request): - with get_repository_from_fixture(repo_fixtures, request) as repository: - id = H(0) - repository.put(id, fchunk(b"")) - assert repository.flags(id) == 0x00000000 # init == all zero - repository.flags(id, mask=0x00000001, value=0x00000001) - assert repository.flags(id) == 0x00000001 - repository.flags(id, mask=0x00000002, value=0x00000002) - assert repository.flags(id) == 0x00000003 - repository.flags(id, mask=0x00000001, value=0x00000000) - assert repository.flags(id) == 0x00000002 - repository.flags(id, mask=0x00000002, value=0x00000000) - assert repository.flags(id) == 0x00000000 - - -def test_get_flags(repo_fixtures, request): - with get_repository_from_fixture(repo_fixtures, request) as repository: - id = H(0) - repository.put(id, fchunk(b"")) - assert repository.flags(id) == 0x00000000 # init == all zero - repository.flags(id, mask=0xC0000003, value=0x80000001) - assert repository.flags(id, mask=0x00000001) == 0x00000001 - assert repository.flags(id, mask=0x00000002) == 0x00000000 - assert repository.flags(id, mask=0x40000008) == 0x00000000 - assert repository.flags(id, mask=0x80000000) == 0x80000000 - - -def test_flags_many(repo_fixtures, request): - with get_repository_from_fixture(repo_fixtures, request) as repository: - ids_flagged = [H(0), H(1)] - ids_default_flags = [H(2), H(3)] - [repository.put(id, fchunk(b"")) for id in ids_flagged + ids_default_flags] - repository.flags_many(ids_flagged, mask=0xFFFFFFFF, value=0xDEADBEEF) - assert list(repository.flags_many(ids_default_flags)) == [0x00000000, 0x00000000] - assert list(repository.flags_many(ids_flagged)) == [0xDEADBEEF, 0xDEADBEEF] - assert list(repository.flags_many(ids_flagged, mask=0xFFFF0000)) == [0xDEAD0000, 0xDEAD0000] - assert list(repository.flags_many(ids_flagged, mask=0x0000FFFF)) == [0x0000BEEF, 0x0000BEEF] - - -def test_flags_persistence(repo_fixtures, request): - with get_repository_from_fixture(repo_fixtures, request) as repository: - repository.put(H(0), fchunk(b"default")) - repository.put(H(1), fchunk(b"one one zero")) - # we do not set flags for H(0), so we can later check their default state. - repository.flags(H(1), mask=0x00000007, value=0x00000006) - repository.commit(compact=False) - with reopen(repository) as repository: - # we query all flags to check if the initial flags were all zero and - # only the ones we explicitly set to one are as expected. - assert repository.flags(H(0), mask=0xFFFFFFFF) == 0x00000000 - assert repository.flags(H(1), mask=0xFFFFFFFF) == 0x00000006 - - -def _assert_sparse(repository): - # the superseded 123456... 
PUT - assert repository.compact[0] == 41 + 8 + len(fchunk(b"123456789")) - # a COMMIT - assert repository.compact[1] == 9 - # the DELETE issued by the superseding PUT (or issued directly) - assert repository.compact[2] == 41 - repository._rebuild_sparse(0) - assert repository.compact[0] == 41 + 8 + len(fchunk(b"123456789")) # 9 is chunk or commit? - - -def test_sparse1(repository): - with repository: - repository.put(H(0), fchunk(b"foo")) - repository.put(H(1), fchunk(b"123456789")) - repository.commit(compact=False) - repository.put(H(1), fchunk(b"bar")) - _assert_sparse(repository) - - -def test_sparse2(repository): - with repository: - repository.put(H(0), fchunk(b"foo")) - repository.put(H(1), fchunk(b"123456789")) - repository.commit(compact=False) - repository.delete(H(1)) - _assert_sparse(repository) - - -def test_sparse_delete(repository): - with repository: - chunk0 = fchunk(b"1245") - repository.put(H(0), chunk0) - repository.delete(H(0)) - repository.io._write_fd.sync() - # the on-line tracking works on a per-object basis... - assert repository.compact[0] == 41 + 8 + 41 + len(chunk0) - repository._rebuild_sparse(0) - # ...while _rebuild_sparse can mark whole segments as completely sparse (which then includes the segment magic) - assert repository.compact[0] == 41 + 8 + 41 + len(chunk0) + len(MAGIC) - repository.commit(compact=True) - assert 0 not in [segment for segment, _ in repository.io.segment_iterator()] - - -def test_uncommitted_garbage(repository): - with repository: - # uncommitted garbage should be no problem, it is cleaned up automatically. - # we just have to be careful with invalidation of cached FDs in LoggedIO. - repository.put(H(0), fchunk(b"foo")) - repository.commit(compact=False) - # write some crap to an uncommitted segment file - last_segment = repository.io.get_latest_segment() - with open(repository.io.segment_filename(last_segment + 1), "wb") as f: - f.write(MAGIC + b"crapcrapcrap") - with reopen(repository) as repository: - # usually, opening the repo and starting a transaction should trigger a cleanup. - repository.put(H(0), fchunk(b"bar")) # this may trigger compact_segments() - repository.commit(compact=True) - # the point here is that nothing blows up with an exception. 
-
-
-def test_replay_of_missing_index(repository):
-    with repository:
-        add_keys(repository)
-        for name in os.listdir(repository.path):
-            if name.startswith("index."):
-                os.unlink(os.path.join(repository.path, name))
-    with reopen(repository) as repository:
-        assert len(repository) == 3
-        assert repository.check() is True
-
-
-def test_crash_before_compact_segments(repository):
-    with repository:
-        add_keys(repository)
-        repository.compact_segments = None
-        try:
-            repository.commit(compact=True)
-        except TypeError:
-            pass
-    with reopen(repository) as repository:
-        assert len(repository) == 3
-        assert repository.check() is True
-
-
-def test_crash_before_write_index(repository):
-    with repository:
-        add_keys(repository)
-        repository.write_index = None
-        try:
-            repository.commit(compact=False)
-        except TypeError:
-            pass
-    with reopen(repository) as repository:
-        assert len(repository) == 3
-        assert repository.check() is True
-
-
-def test_replay_lock_upgrade_old(repository):
-    with repository:
-        add_keys(repository)
-        for name in os.listdir(repository.path):
-            if name.startswith("index."):
-                os.unlink(os.path.join(repository.path, name))
-    with patch.object(Lock, "upgrade", side_effect=LockFailed) as upgrade:
-        with reopen(repository, exclusive=None) as repository:
-            # simulate old client that always does lock upgrades
-            # the repo is only locked by a shared read lock, but to replay segments,
-            # we need an exclusive write lock - check if the lock gets upgraded.
-            with pytest.raises(LockFailed):
-                len(repository)
-            upgrade.assert_called_once_with()
-
-
-def test_replay_lock_upgrade(repository):
-    with repository:
-        add_keys(repository)
-        for name in os.listdir(repository.path):
-            if name.startswith("index."):
-                os.unlink(os.path.join(repository.path, name))
-    with patch.object(Lock, "upgrade", side_effect=LockFailed) as upgrade:
-        with reopen(repository, exclusive=False) as repository:
-            # current client usually does not do lock upgrade, except for replay
-            # the repo is only locked by a shared read lock, but to replay segments,
-            # we need an exclusive write lock - check if the lock gets upgraded.
-            with pytest.raises(LockFailed):
-                len(repository)
-            upgrade.assert_called_once_with()
-
-
-def test_crash_before_deleting_compacted_segments(repository):
-    with repository:
-        add_keys(repository)
-        repository.io.delete_segment = None
-        try:
-            repository.commit(compact=False)
-        except TypeError:
-            pass
-    with reopen(repository) as repository:
-        assert len(repository) == 3
-        assert repository.check() is True
-        assert len(repository) == 3
-
-
-def test_ignores_commit_tag_in_data(repository):
-    with repository:
-        repository.put(H(0), LoggedIO.COMMIT)
-    with reopen(repository) as repository:
-        io = repository.io
-        assert not io.is_committed_segment(io.get_latest_segment())
-
-
-def test_moved_deletes_are_tracked(repository):
-    with repository:
-        repository.put(H(1), fchunk(b"1"))
-        repository.put(H(2), fchunk(b"2"))
-        repository.commit(compact=False)
-        repo_dump(repository, "p1 p2 c")
-        repository.delete(H(1))
-        repository.commit(compact=True)
-        repo_dump(repository, "d1 cc")
-        last_segment = repository.io.get_latest_segment() - 1
-        num_deletes = 0
-        for tag, key, offset, size, _ in repository.io.iter_objects(last_segment):
-            if tag == TAG_DELETE:
-                assert key == H(1)
-                num_deletes += 1
-        assert num_deletes == 1
-        assert last_segment in repository.compact
-        repository.put(H(3), fchunk(b"3"))
-        repository.commit(compact=True)
-        repo_dump(repository, "p3 cc")
-        assert last_segment not in repository.compact
-        assert not repository.io.segment_exists(last_segment)
-        for segment, _ in repository.io.segment_iterator():
-            for tag, key, offset, size, _ in repository.io.iter_objects(segment):
-                assert tag != TAG_DELETE
-                assert key != H(1)
-        # after compaction, there should be no empty shadowed_segments lists left over.
-        # we have no put or del anymore for H(1), so we lost knowledge about H(1).
-        assert H(1) not in repository.shadow_index
-
-
-def test_shadowed_entries_are_preserved1(repository):
-    # this tests the shadowing-by-del behaviour
-    with repository:
-        get_latest_segment = repository.io.get_latest_segment
-        repository.put(H(1), fchunk(b"1"))
-        # This is the segment with our original PUT of interest
-        put_segment = get_latest_segment()
-        repository.commit(compact=False)
-        # we now delete H(1), and force this segment not to be compacted, which can happen
-        # if it's not sparse enough (symbolized by H(2) here).
-        repository.delete(H(1))
-        repository.put(H(2), fchunk(b"1"))
-        del_segment = get_latest_segment()
-        # we pretend these are mostly dense (not sparse) and won't be compacted
-        del repository.compact[put_segment]
-        del repository.compact[del_segment]
-        repository.commit(compact=True)
-        # we now perform an unrelated operation on the segment containing the DELETE,
-        # causing it to be compacted.
-        repository.delete(H(2))
-        repository.commit(compact=True)
-        assert repository.io.segment_exists(put_segment)
-        assert not repository.io.segment_exists(del_segment)
-        # basic case, since the index survived this must be ok
-        assert H(1) not in repository
-        # nuke index, force replay
-        os.unlink(os.path.join(repository.path, "index.%d" % get_latest_segment()))
-        # must not reappear
-        assert H(1) not in repository
-
-
-def test_shadowed_entries_are_preserved2(repository):
-    # this tests the shadowing-by-double-put behaviour, see issue #5661
-    # assume this repo state:
-    # seg1: PUT H1
-    # seg2: COMMIT
-    # seg3: DEL H1, PUT H1, DEL H1, PUT H2
-    # seg4: COMMIT
-    # Note how due to the final DEL H1 in seg3, H1 is effectively deleted.
-    #
-    # compaction of only seg3:
-    # PUT H1 gets dropped because it is not needed any more.
-    # DEL H1 must be kept, because there is still a PUT H1 in seg1 which must not
-    # "reappear" in the index if the index gets rebuilt.
-    with repository:
-        get_latest_segment = repository.io.get_latest_segment
-        repository.put(H(1), fchunk(b"1"))
-        # This is the segment with our original PUT of interest
-        put_segment = get_latest_segment()
-        repository.commit(compact=False)
-        # We now put H(1) again (which implicitly does DEL(H(1)) followed by PUT(H(1), ...)),
-        # delete H(1) afterwards, and force this segment to not be compacted, which can happen
-        # if it's not sparse enough (symbolized by H(2) here).
-        repository.put(H(1), fchunk(b"1"))
-        repository.delete(H(1))
-        repository.put(H(2), fchunk(b"1"))
-        delete_segment = get_latest_segment()
-        # We pretend these are mostly dense (not sparse) and won't be compacted
-        del repository.compact[put_segment]
-        del repository.compact[delete_segment]
-        repository.commit(compact=True)
-        # Now we perform an unrelated operation on the segment containing the DELETE,
-        # causing it to be compacted.
-        repository.delete(H(2))
-        repository.commit(compact=True)
-        assert repository.io.segment_exists(put_segment)
-        assert not repository.io.segment_exists(delete_segment)
-        # Basic case, since the index survived this must be ok
-        assert H(1) not in repository
-        # Nuke index, force replay
-        os.unlink(os.path.join(repository.path, "index.%d" % get_latest_segment()))
-        # Must not reappear
-        assert H(1) not in repository  # F
-
-
-def test_shadow_index_rollback(repository):
-    with repository:
-        repository.put(H(1), fchunk(b"1"))
-        repository.delete(H(1))
-        assert repository.shadow_index[H(1)] == [0]
-        repository.commit(compact=True)
-        repo_dump(repository, "p1 d1 cc")
-        # note how an empty list means that nothing is shadowed for sure
-        assert repository.shadow_index[H(1)] == []  # because the deletion is considered unstable
-        repository.put(H(1), b"1")
-        repository.delete(H(1))
-        repo_dump(repository, "p1 d1")
-        # 0 put/delete; 1 commit; 2 compacted; 3 commit; 4 put/delete
-        assert repository.shadow_index[H(1)] == [4]
-        repository.rollback()
-        repo_dump(repository, "r")
-        repository.put(H(2), fchunk(b"1"))
-        # after the rollback, segment 4 shouldn't be considered anymore
-        assert repository.shadow_index[H(1)] == []  # because the deletion is considered unstable
-
-
-def test_destroy_append_only(repository):
-    with repository:
-        # can't destroy append only repo (via the API)
-        repository.append_only = True
-        with pytest.raises(ValueError):
-            repository.destroy()
-        assert repository.append_only
-
-
-def test_append_only(repository):
-    def segments_in_repository(repo):
-        return len(list(repo.io.segment_iterator()))
-
-    with repository:
-        repository.append_only = True
-        repository.put(H(0), fchunk(b"foo"))
-        repository.commit(compact=False)
-
-        repository.append_only = False
-        assert segments_in_repository(repository) == 2
-        repository.put(H(0), fchunk(b"foo"))
-        repository.commit(compact=True)
-        # normal: compact squashes the data together, only one segment
-        assert segments_in_repository(repository) == 2
-
-        repository.append_only = True
-        assert segments_in_repository(repository) == 2
-        repository.put(H(0), fchunk(b"foo"))
-        repository.commit(compact=False)
-        # append only: does not compact, only new segments written
-        assert segments_in_repository(repository) == 4
-
-
-def test_additional_free_space(repository):
-    with repository:
-        add_keys(repository)
-        repository.config.set("repository", "additional_free_space", "1000T")
-        repository.save_key(b"shortcut to save_config")
-    with reopen(repository) as repository:
-        repository.put(H(0), fchunk(b"foobar"))
-        with pytest.raises(Repository.InsufficientFreeSpaceError):
-            repository.commit(compact=False)
-    assert os.path.exists(repository.path)
-
-
-def test_create_free_space(repository):
-    with repository:
-        repository.additional_free_space = 1e20
-        with pytest.raises(Repository.InsufficientFreeSpaceError):
-            add_keys(repository)
-    assert not os.path.exists(repository.path)
-
-
-def test_tracking(repository):
-    with repository:
-        assert repository.storage_quota_use == 0
-        ch1 = fchunk(bytes(1234))
-        repository.put(H(1), ch1)
-        assert repository.storage_quota_use == len(ch1) + 41 + 8
-        ch2 = fchunk(bytes(5678))
-        repository.put(H(2), ch2)
-        assert repository.storage_quota_use == len(ch1) + len(ch2) + 2 * (41 + 8)
-        repository.delete(H(1))
-        assert repository.storage_quota_use == len(ch1) + len(ch2) + 2 * (41 + 8)  # we have not compacted yet
-        repository.commit(compact=False)
-        assert repository.storage_quota_use == len(ch1) + len(ch2) + 2 * (41 + 8)  # we have not compacted yet
-    with reopen(repository) as repository:
-        # open new transaction; hints and thus quota data is not loaded unless needed.
-        ch3 = fchunk(b"")
-        repository.put(H(3), ch3)
-        repository.delete(H(3))
-        assert repository.storage_quota_use == len(ch1) + len(ch2) + len(ch3) + 3 * (
-            41 + 8
-        )  # we have not compacted yet
-        repository.commit(compact=True)
-        assert repository.storage_quota_use == len(ch2) + 41 + 8
-
-
-def test_exceed_quota(repository):
-    with repository:
-        assert repository.storage_quota_use == 0
-        repository.storage_quota = 80
-        ch1 = fchunk(b"x" * 7)
-        repository.put(H(1), ch1)
-        assert repository.storage_quota_use == len(ch1) + 41 + 8
-        repository.commit(compact=False)
-        with pytest.raises(Repository.StorageQuotaExceeded):
-            ch2 = fchunk(b"y" * 13)
-            repository.put(H(2), ch2)
-        assert repository.storage_quota_use == len(ch1) + len(ch2) + (41 + 8) * 2  # check ch2!?
-        with pytest.raises(Repository.StorageQuotaExceeded):
-            repository.commit(compact=False)
-        assert repository.storage_quota_use == len(ch1) + len(ch2) + (41 + 8) * 2  # check ch2!?
-    with reopen(repository) as repository:
-        repository.storage_quota = 150
-        # open new transaction; hints and thus quota data is not loaded unless needed.
-        repository.put(H(1), ch1)
-        # we have 2 puts for H(1) here and not yet compacted.
-        assert repository.storage_quota_use == len(ch1) * 2 + (41 + 8) * 2
-        repository.commit(compact=True)
-        assert repository.storage_quota_use == len(ch1) + 41 + 8  # now we have compacted.
-
-
-def make_auxiliary(repository):
-    with repository:
-        repository.put(H(0), fchunk(b"foo"))
-        repository.commit(compact=False)
-
-
-def do_commit(repository):
-    with repository:
-        repository.put(H(0), fchunk(b"fox"))
-        repository.commit(compact=False)
-
-
-def test_corrupted_hints(repository):
-    make_auxiliary(repository)
-    with open(os.path.join(repository.path, "hints.1"), "ab") as fd:
-        fd.write(b"123456789")
-    do_commit(repository)
-
-
-def test_deleted_hints(repository):
-    make_auxiliary(repository)
-    os.unlink(os.path.join(repository.path, "hints.1"))
-    do_commit(repository)
-
-
-def test_deleted_index(repository):
-    make_auxiliary(repository)
-    os.unlink(os.path.join(repository.path, "index.1"))
-    do_commit(repository)
-
-
-def test_unreadable_hints(repository):
-    make_auxiliary(repository)
-    hints = os.path.join(repository.path, "hints.1")
-    os.unlink(hints)
-    os.mkdir(hints)
-    with pytest.raises(OSError):
-        do_commit(repository)
-
-
-def test_index(repository):
-    make_auxiliary(repository)
-    with open(os.path.join(repository.path, "index.1"), "wb") as fd:
-        fd.write(b"123456789")
-    do_commit(repository)
-
-
-def test_index_outside_transaction(repository):
-    make_auxiliary(repository)
-    with open(os.path.join(repository.path, "index.1"), "wb") as fd:
-        fd.write(b"123456789")
-    with repository:
-        assert len(repository) == 1
-
-
-def _corrupt_index(repository):
-    # HashIndex is able to detect incorrect headers and file lengths,
-    # but on its own it can't tell if the data is correct.
-    index_path = os.path.join(repository.path, "index.1")
-    with open(index_path, "r+b") as fd:
-        index_data = fd.read()
-        # Flip one bit in a key stored in the index
-        corrupted_key = (int.from_bytes(H(0), "little") ^ 1).to_bytes(32, "little")
-        corrupted_index_data = index_data.replace(H(0), corrupted_key)
-        assert corrupted_index_data != index_data
-        assert len(corrupted_index_data) == len(index_data)
-        fd.seek(0)
-        fd.write(corrupted_index_data)
-
-
-def test_index_corrupted(repository):
-    make_auxiliary(repository)
-    _corrupt_index(repository)
-    with repository:
-        # data corruption is detected due to mismatching checksums, and fixed by rebuilding the index.
-        assert len(repository) == 1
-        assert pdchunk(repository.get(H(0))) == b"foo"
-
-
-def test_index_corrupted_without_integrity(repository):
-    make_auxiliary(repository)
-    _corrupt_index(repository)
-    integrity_path = os.path.join(repository.path, "integrity.1")
-    os.unlink(integrity_path)
-    with repository:
-        # since the corrupted key is not noticed, the repository still thinks it contains one key...
-        assert len(repository) == 1
-        with pytest.raises(Repository.ObjectNotFound):
-            # ... but the real, uncorrupted key is not found in the corrupted index.
-            repository.get(H(0))
-
-
-def test_unreadable_index(repository):
-    make_auxiliary(repository)
-    index = os.path.join(repository.path, "index.1")
-    os.unlink(index)
-    os.mkdir(index)
-    with pytest.raises(OSError):
-        do_commit(repository)
-
-
-def test_unknown_integrity_version(repository):
-    make_auxiliary(repository)
-    # for now an unknown integrity data version is ignored and not an error.
-    integrity_path = os.path.join(repository.path, "integrity.1")
-    with open(integrity_path, "r+b") as fd:
-        msgpack.pack({b"version": 4.7}, fd)  # borg only understands version 2
-        fd.truncate()
-    with repository:
-        # no issues accessing the repository
-        assert len(repository) == 1
-        assert pdchunk(repository.get(H(0))) == b"foo"
-
-
-def _subtly_corrupted_hints_setup(repository):
-    with repository:
-        repository.append_only = True
-        assert len(repository) == 1
-        assert pdchunk(repository.get(H(0))) == b"foo"
-        repository.put(H(1), fchunk(b"bar"))
-        repository.put(H(2), fchunk(b"baz"))
-        repository.commit(compact=False)
-        repository.put(H(2), fchunk(b"bazz"))
-        repository.commit(compact=False)
-    hints_path = os.path.join(repository.path, "hints.5")
-    with open(hints_path, "r+b") as fd:
-        hints = msgpack.unpack(fd)
-        fd.seek(0)
-        # corrupt segment refcount
-        assert hints["segments"][2] == 1
-        hints["segments"][2] = 0
-        msgpack.pack(hints, fd)
-        fd.truncate()
-
-
-def test_subtly_corrupted_hints(repository):
-    make_auxiliary(repository)
-    _subtly_corrupted_hints_setup(repository)
-    with repository:
-        repository.append_only = False
-        repository.put(H(3), fchunk(b"1234"))
-        # do a compaction run, which succeeds since the failed checksum prompted a rebuild of the index+hints.
-        repository.commit(compact=True)
-        assert len(repository) == 4
-        assert pdchunk(repository.get(H(0))) == b"foo"
-        assert pdchunk(repository.get(H(1))) == b"bar"
-        assert pdchunk(repository.get(H(2))) == b"bazz"
-
-
-def test_subtly_corrupted_hints_without_integrity(repository):
-    make_auxiliary(repository)
-    _subtly_corrupted_hints_setup(repository)
-    integrity_path = os.path.join(repository.path, "integrity.5")
-    os.unlink(integrity_path)
-    with repository:
-        repository.append_only = False
-        repository.put(H(3), fchunk(b"1234"))
-        # do a compaction run, which fails since the corrupted refcount wasn't detected and causes an assertion failure.
-        with pytest.raises(AssertionError) as exc_info:
-            repository.commit(compact=True)
-        assert "Corrupted segment reference count" in str(exc_info.value)
-
-
-def list_indices(repo_path):
-    return [name for name in os.listdir(repo_path) if name.startswith("index.")]
-
-
 def check(repository, repo_path, repair=False, status=True):
     assert repository.check(repair=repair) == status
     # Make sure no tmp files are left behind
@@ -889,209 +153,6 @@ def check(repository, repo_path, repair=False, status=True):
     assert tmp_files == [], "Found tmp files"
 
 
-def get_objects(repository, *ids):
-    for id_ in ids:
-        pdchunk(repository.get(H(id_)))
-
-
-def add_objects(repository, segments):
-    for ids in segments:
-        for id_ in ids:
-            repository.put(H(id_), fchunk(b"data"))
-        repository.commit(compact=False)
-
-
-def get_head(repo_path):
-    return sorted(int(n) for n in os.listdir(os.path.join(repo_path, "data", "0")) if n.isdigit())[-1]
-
-
-def open_index(repo_path):
-    return NSIndex.read(os.path.join(repo_path, f"index.{get_head(repo_path)}"))
-
-
-def corrupt_object(repo_path, id_):
-    idx = open_index(repo_path)
-    segment, offset, _ = idx[H(id_)]
-    with open(os.path.join(repo_path, "data", "0", str(segment)), "r+b") as fd:
-        fd.seek(offset)
-        fd.write(b"BOOM")
-
-
-def delete_segment(repository, segment):
-    repository.io.delete_segment(segment)
-
-
-def delete_index(repo_path):
-    os.unlink(os.path.join(repo_path, f"index.{get_head(repo_path)}"))
-
-
-def rename_index(repo_path, new_name):
-    os.replace(os.path.join(repo_path, f"index.{get_head(repo_path)}"), os.path.join(repo_path, new_name))
-
-
-def list_objects(repository):
-    return {int(key) for key in repository.list()}
-
-
-def test_repair_corrupted_segment(repo_fixtures, request):
-    with get_repository_from_fixture(repo_fixtures, request) as repository:
-        repo_path = get_path(repository)
-        add_objects(repository, [[1, 2, 3], [4, 5], [6]])
-        assert {1, 2, 3, 4, 5, 6} == list_objects(repository)
-        check(repository, repo_path, status=True)
-        corrupt_object(repo_path, 5)
-        with pytest.raises(IntegrityError):
-            get_objects(repository, 5)
-        repository.rollback()
-        # make sure a regular check does not repair anything
-        check(repository, repo_path, status=False)
-        check(repository, repo_path, status=False)
-        # make sure a repair actually repairs the repo
-        check(repository, repo_path, repair=True, status=True)
-        get_objects(repository, 4)
-        check(repository, repo_path, status=True)
-        assert {1, 2, 3, 4, 6} == list_objects(repository)
-
-
-def test_repair_missing_segment(repository):
-    # only test on local repo - files in RemoteRepository cannot be deleted
-    with repository:
-        add_objects(repository, [[1, 2, 3], [4, 5, 6]])
-        assert {1, 2, 3, 4, 5, 6} == list_objects(repository)
-        check(repository, repository.path, status=True)
-        delete_segment(repository, 2)
-        repository.rollback()
-        check(repository, repository.path, repair=True, status=True)
-        assert {1, 2, 3} == list_objects(repository)
-
-
-def test_repair_missing_commit_segment(repository):
-    # only test on local repo - files in RemoteRepository cannot be deleted
-    with repository:
-        add_objects(repository, [[1, 2, 3], [4, 5, 6]])
-        delete_segment(repository, 3)
-        with pytest.raises(Repository.ObjectNotFound):
-            get_objects(repository, 4)
-        assert {1, 2, 3} == list_objects(repository)
-
-
-def test_repair_corrupted_commit_segment(repo_fixtures, request):
-    with get_repository_from_fixture(repo_fixtures, request) as repository:
-        repo_path = get_path(repository)
-        add_objects(repository, [[1, 2, 3], [4, 5, 6]])
-        with open(os.path.join(repo_path, "data", "0", "3"), "r+b") as fd:
-            fd.seek(-1, os.SEEK_END)
-            fd.write(b"X")
-        with pytest.raises(Repository.ObjectNotFound):
-            get_objects(repository, 4)
-        check(repository, repo_path, status=True)
-        get_objects(repository, 3)
-        assert {1, 2, 3} == list_objects(repository)
-
-
-def test_repair_no_commits(repo_fixtures, request):
-    with get_repository_from_fixture(repo_fixtures, request) as repository:
-        repo_path = get_path(repository)
-        add_objects(repository, [[1, 2, 3]])
-        with open(os.path.join(repo_path, "data", "0", "1"), "r+b") as fd:
-            fd.seek(-1, os.SEEK_END)
-            fd.write(b"X")
-        with pytest.raises(Repository.CheckNeeded):
-            get_objects(repository, 4)
-        check(repository, repo_path, status=False)
-        check(repository, repo_path, status=False)
-        assert list_indices(repo_path) == ["index.1"]
-        check(repository, repo_path, repair=True, status=True)
-        assert list_indices(repo_path) == ["index.2"]
-        check(repository, repo_path, status=True)
-        get_objects(repository, 3)
-        assert {1, 2, 3} == list_objects(repository)
-
-
-def test_repair_missing_index(repo_fixtures, request):
-    with get_repository_from_fixture(repo_fixtures, request) as repository:
-        repo_path = get_path(repository)
-        add_objects(repository, [[1, 2, 3], [4, 5, 6]])
-        delete_index(repo_path)
-        check(repository, repo_path, status=True)
-        get_objects(repository, 4)
-        assert {1, 2, 3, 4, 5, 6} == list_objects(repository)
-
-
-def test_repair_index_too_new(repo_fixtures, request):
-    with get_repository_from_fixture(repo_fixtures, request) as repository:
-        repo_path = get_path(repository)
-        add_objects(repository, [[1, 2, 3], [4, 5, 6]])
-        assert list_indices(repo_path) == ["index.3"]
-        rename_index(repo_path, "index.100")
-        check(repository, repo_path, status=True)
-        assert list_indices(repo_path) == ["index.3"]
-        get_objects(repository, 4)
-        assert {1, 2, 3, 4, 5, 6} == list_objects(repository)
-
-
-def test_crash_before_compact(repository):
-    # only test on local repo - we can't mock-patch a RemoteRepository class in another process!
-    with repository:
-        repository.put(H(0), fchunk(b"data"))
-        repository.put(H(0), fchunk(b"data2"))
-        # simulate a crash before compact
-        with patch.object(Repository, "compact_segments") as compact:
-            repository.commit(compact=True)
-            compact.assert_called_once_with(0.1)
-    with reopen(repository) as repository:
-        check(repository, repository.path, repair=True)
-        assert pdchunk(repository.get(H(0))) == b"data2"
-
-
-def test_hints_persistence(repository):
-    with repository:
-        repository.put(H(0), fchunk(b"data"))
-        repository.delete(H(0))
-        repository.commit(compact=False)
-        shadow_index_expected = repository.shadow_index
-        compact_expected = repository.compact
-        segments_expected = repository.segments
-    # close and re-open the repository (create fresh Repository instance) to
-    # check whether hints were persisted to / reloaded from disk
-    with reopen(repository) as repository:
-        repository.put(H(42), fchunk(b"foobar"))  # this will call prepare_txn() and load the hints data
-        # check if hints persistence worked:
-        assert shadow_index_expected == repository.shadow_index
-        assert compact_expected == repository.compact
-        del repository.segments[2]  # ignore the segment created by put(H(42), ...)
-        assert segments_expected == repository.segments
-    with reopen(repository) as repository:
-        check(repository, repository.path, repair=True)
-    with reopen(repository) as repository:
-        repository.put(H(42), fchunk(b"foobar"))  # this will call prepare_txn() and load the hints data
-        assert shadow_index_expected == repository.shadow_index
-        # sizes do not match, with vs. without header?
-        # assert compact_expected == repository.compact
-        del repository.segments[2]  # ignore the segment created by put(H(42), ...)
-        assert segments_expected == repository.segments
-
-
-def test_hints_behaviour(repository):
-    with repository:
-        repository.put(H(0), fchunk(b"data"))
-        assert repository.shadow_index == {}
-        assert len(repository.compact) == 0
-        repository.delete(H(0))
-        repository.commit(compact=False)
-        # now there should be an entry for H(0) in shadow_index
-        assert H(0) in repository.shadow_index
-        assert len(repository.shadow_index[H(0)]) == 1
-        assert 0 in repository.compact  # segment 0 can be compacted
-        repository.put(H(42), fchunk(b"foobar"))  # see also do_compact()
-        repository.commit(compact=True, threshold=0.0)  # compact completely!
-        # nothing to compact anymore! no info left about stuff that does not exist anymore:
-        assert H(0) not in repository.shadow_index
-        # segment 0 was compacted away, no info about it left:
-        assert 0 not in repository.compact
-        assert 0 not in repository.segments
-
-
 def _get_mock_args():
     class MockArgs:
         remote_path = "borg"
diff --git a/src/borg/testsuite/shellpattern.py b/src/borg/testsuite/shellpattern.py
index e8b1acd1af..7b89cfd6bd 100644
--- a/src/borg/testsuite/shellpattern.py
+++ b/src/borg/testsuite/shellpattern.py
@@ -124,9 +124,9 @@ def test_mismatch(path, patterns):
 def test_match_end():
     regex = shellpattern.translate("*-home")  # default is match_end == string end
     assert re.match(regex, "2017-07-03-home")
-    assert not re.match(regex, "2017-07-03-home.checkpoint")
+    assert not re.match(regex, "2017-07-03-home.xxx")
 
-    match_end = r"(%s)?\Z" % r"\.checkpoint(\.\d+)?"  # with/without checkpoint ending
+    match_end = r"(\.xxx)?\Z"  # with/without .xxx ending
     regex = shellpattern.translate("*-home", match_end=match_end)
     assert re.match(regex, "2017-07-03-home")
-    assert re.match(regex, "2017-07-03-home.checkpoint")
+    assert re.match(regex, "2017-07-03-home.xxx")
diff --git a/src/borg/testsuite/storelocking.py b/src/borg/testsuite/storelocking.py
new file mode 100644
index 0000000000..4fbf0be34a
--- /dev/null
+++ b/src/borg/testsuite/storelocking.py
@@ -0,0 +1,100 @@
+import time
+
+import pytest
+
+from borgstore.store import Store
+
+from ..storelocking import Lock, NotLocked, LockTimeout
+
+ID1 = "foo", 1, 1
+ID2 = "bar", 2, 2
+
+
+@pytest.fixture()
+def lockstore(tmpdir):
+    store = Store("file://" + str(tmpdir / "lockstore"))
+    store.create()
+    with store:
+        yield store
+    store.destroy()
+
+
+class TestLock:
+    def test_cm(self, lockstore):
+        with Lock(lockstore, exclusive=True, id=ID1) as lock:
+            assert lock.got_exclusive_lock()
+        with Lock(lockstore, exclusive=False, id=ID1) as lock:
+            assert not lock.got_exclusive_lock()
+
+    def test_got_exclusive_lock(self, lockstore):
+        lock = Lock(lockstore, exclusive=True, id=ID1)
+        assert not lock.got_exclusive_lock()
+        lock.acquire()
+        assert lock.got_exclusive_lock()
+        lock.release()
+        assert not lock.got_exclusive_lock()
+
+    def test_exclusive_lock(self, lockstore):
+        # there must not be 2 exclusive locks
+        with Lock(lockstore, exclusive=True, id=ID1):
+            with pytest.raises(LockTimeout):
+                Lock(lockstore, exclusive=True, id=ID2).acquire()
+        # acquiring an exclusive lock will time out if the non-exclusive does not go away
+        with Lock(lockstore, exclusive=False, id=ID1):
+            with pytest.raises(LockTimeout):
+                Lock(lockstore, exclusive=True, id=ID2).acquire()
+
+    def test_double_nonexclusive_lock_succeeds(self, lockstore):
+        with Lock(lockstore, exclusive=False, id=ID1):
+            with Lock(lockstore, exclusive=False, id=ID2):
+                pass
+
+    def test_not_locked(self, lockstore):
+        lock = Lock(lockstore, exclusive=True, id=ID1)
+        with pytest.raises(NotLocked):
+            lock.release()
+        lock = Lock(lockstore, exclusive=False, id=ID1)
+        with pytest.raises(NotLocked):
+            lock.release()
+
+    def test_break_lock(self, lockstore):
+        lock = Lock(lockstore, exclusive=True, id=ID1).acquire()
+        lock.break_lock()
+        with Lock(lockstore, exclusive=True, id=ID2):
+            pass
+        with Lock(lockstore, exclusive=True, id=ID1):
+            pass
+
+    def test_lock_refresh_stale_removal(self, lockstore):
+        # stale after 2s, refreshable after 1s
+        lock = Lock(lockstore, exclusive=True, id=ID1, stale=2)
+        lock.acquire()
+        lock_keys_a00 = set(lock._get_locks())
+        time.sleep(0.5)
+        lock.refresh()  # shouldn't change locks, existing lock too young
+        lock_keys_a05 = set(lock._get_locks())
+        time.sleep(0.6)
+        lock.refresh()  # that should refresh the lock!
+        lock_keys_b00 = set(lock._get_locks())
+        time.sleep(2.1)
+        lock_keys_b21 = set(lock._get_locks())  # now the lock should be stale & gone.
+        assert lock_keys_a00 == lock_keys_a05  # was too young, no refresh done
+        assert len(lock_keys_a00) == 1
+        assert lock_keys_a00 != lock_keys_b00  # refresh done, new lock has different key
+        assert len(lock_keys_b00) == 1
+        assert len(lock_keys_b21) == 0  # stale lock was ignored
+        assert len(list(lock.store.list("locks"))) == 0  # stale lock was removed from store
+
+    def test_migrate_lock(self, lockstore):
+        old_id, new_id = ID1, ID2
+        assert old_id[1] != new_id[1]  # different PIDs (like when doing daemonize())
+        lock = Lock(lockstore, id=old_id).acquire()
+        old_locks = lock._find_locks(only_mine=True)
+        assert lock.id == old_id  # lock is for old id / PID
+        lock.migrate_lock(old_id, new_id)  # fix the lock
+        assert lock.id == new_id  # lock corresponds to the new id / PID
+        new_locks = lock._find_locks(only_mine=True)
+        assert old_locks != new_locks
+        assert len(old_locks) == len(new_locks) == 1
+        assert old_locks[0]["hostid"] == old_id[0]
+        assert new_locks[0]["hostid"] == new_id[0]
diff --git a/src/borg/upgrade.py b/src/borg/upgrade.py
index 22a27c18c7..35d71bec2a 100644
--- a/src/borg/upgrade.py
+++ b/src/borg/upgrade.py
@@ -85,7 +85,7 @@ def upgrade_item(self, *, item):
             if chunks is not None:
                 item.chunks = chunks
                 for chunk_id, chunk_size in chunks:
-                    self.cache.chunk_incref(chunk_id, chunk_size, self.archive.stats)
+                    self.cache.reuse_chunk(chunk_id, chunk_size, self.archive.stats)
             if chunks_healthy is not None:
                 item.chunks_healthy = chunks
             del item.source  # not used for hardlinks any more, replaced by hlid
diff --git a/tox.ini b/tox.ini
index 10f3e7f30e..402186dcd2 100644
--- a/tox.ini
+++ b/tox.ini
@@ -42,7 +42,7 @@ deps =
     pytest
    mypy
    pkgconfig
-commands = mypy
+commands = mypy --ignore-missing-imports
 
 [testenv:docs]
 changedir = docs