From f2e2850bafd1178205bb081af43d791846747f88 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 18 Sep 2024 17:22:37 +0200 Subject: [PATCH] build ad-hoc files cache based on previous archive in series also: fix the tests, so they use a series. --- src/borg/archive.py | 2 +- src/borg/archiver/create_cmd.py | 1 + src/borg/cache.py | 31 ++++++++++++++++------- src/borg/testsuite/archiver/create_cmd.py | 28 ++++++++++---------- 4 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 77673ea92b..cb5ecba770 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -1345,7 +1345,7 @@ def process_file(self, *, path, parent_fd, name, st, cache, flags=flags_normal, item.chunks.append(chunk_entry) else: # normal case, no "2nd+" hardlink if not is_special_file: - hashed_path = safe_encode(os.path.join(self.cwd, path)) + hashed_path = safe_encode(item.path) # path as in archive item! started_hashing = time.monotonic() path_hash = self.key.id_hash(hashed_path) self.stats.hashing_time += time.monotonic() - started_hashing diff --git a/src/borg/archiver/create_cmd.py b/src/borg/archiver/create_cmd.py index e9f2b1afbd..24526dedf0 100644 --- a/src/borg/archiver/create_cmd.py +++ b/src/borg/archiver/create_cmd.py @@ -225,6 +225,7 @@ def create_inner(archive, cache, fso): lock_wait=self.lock_wait, cache_mode=args.files_cache_mode, iec=args.iec, + archive_name=args.name, ) as cache: archive = Archive( manifest, diff --git a/src/borg/cache.py b/src/borg/cache.py index 4d50fa2c00..78bf165cf6 100644 --- a/src/borg/cache.py +++ b/src/borg/cache.py @@ -15,8 +15,8 @@ from .hashindex import ChunkIndex, ChunkIndexEntry from .helpers import Error from .helpers import get_cache_dir, get_security_dir -from .helpers import hex_to_bin, parse_stringified_list -from .helpers import format_file_size +from .helpers import hex_to_bin, bin_to_hex, parse_stringified_list +from .helpers import format_file_size, safe_encode from .helpers import yes from .helpers import ProgressIndicatorMessage from .helpers import msgpack @@ -347,6 +347,7 @@ def __new__( lock_wait=None, cache_mode=FILES_CACHE_MODE_DISABLED, iec=False, + archive_name=None, ): return AdHocWithFilesCache( manifest=manifest, @@ -356,6 +357,7 @@ def __new__( iec=iec, lock_wait=lock_wait, cache_mode=cache_mode, + archive_name=archive_name, ) @@ -369,8 +371,8 @@ class FilesCacheMixin: If so, we use the cached chunks list and skip reading/chunking the file contents. """ - def __init__(self, cache_mode, previous_archive_id=None): - self.previous_archive_id = previous_archive_id + def __init__(self, cache_mode, archive_name=None): + self.archive_name = archive_name # ideally a SERIES name assert not ("c" in cache_mode and "m" in cache_mode) assert "d" in cache_mode or "c" in cache_mode or "m" in cache_mode self.cache_mode = cache_mode @@ -387,20 +389,30 @@ def files(self): def _build_files_cache(self): if "d" in self.cache_mode: # d(isabled) return {} - if self.previous_archive_id is None: + + if not self.archive_name: return {} from .archive import Archive + # get the latest archive with the IDENTICAL name, supporting archive series: + archives = self.manifest.archives.list(match=self.archive_name, sort_by=["ts"], last=1) + if not archives: + # nothing found + return {} + prev_archive = archives[0] + files = {} - logger.debug("Building files cache ...") + logger.debug( + f"Building files cache from {prev_archive.name} {prev_archive.ts} {bin_to_hex(prev_archive.id)} ..." + ) files_cache_logger.debug("FILES-CACHE-BUILD: starting...") - archive = Archive(self.manifest, self.previous_archive_id) + archive = Archive(self.manifest, prev_archive.id) for item in archive.iter_items(preload=False): # only put regular files' infos into the files cache: if stat.S_ISREG(item.mode): assert "chunks" in item # TODO: is item.chunks ready? - path_hash = self.key.id_hash(item.path) # TODO: NOT the full absolute path as it used to be! + path_hash = self.key.id_hash(safe_encode(item.path)) # keep track of the key(s) for the most recent timestamp(s): ctime_ns = item.ctime if ctime_ns > self._newest_cmtime: @@ -585,13 +597,14 @@ def __init__( lock_wait=None, cache_mode=FILES_CACHE_MODE_DISABLED, iec=False, + archive_name=None, ): """ :param warn_if_unencrypted: print warning if accessing unknown unencrypted repository :param lock_wait: timeout for lock acquisition (int [s] or None [wait forever]) :param cache_mode: what shall be compared in the file stat infos vs. cached stat infos comparison """ - FilesCacheMixin.__init__(self, cache_mode) # TODO: give previous_archive_id + FilesCacheMixin.__init__(self, cache_mode, archive_name) ChunksMixin.__init__(self) assert isinstance(manifest, Manifest) self.manifest = manifest diff --git a/src/borg/testsuite/archiver/create_cmd.py b/src/borg/testsuite/archiver/create_cmd.py index 739e43553a..ac9611359e 100644 --- a/src/borg/testsuite/archiver/create_cmd.py +++ b/src/borg/testsuite/archiver/create_cmd.py @@ -657,7 +657,7 @@ def test_file_status(archivers, request): assert "A input/file1" in output assert "A input/file2" in output # should find first file as unmodified - output = cmd(archiver, "create", "--list", "test2", "input") + output = cmd(archiver, "create", "--list", "test", "input") assert "U input/file1" in output # although surprising, this is expected. For why, see: # https://borgbackup.readthedocs.org/en/latest/faq.html#i-am-seeing-a-added-status-for-a-unchanged-file @@ -674,13 +674,13 @@ def test_file_status_cs_cache_mode(archivers, request): time.sleep(1) # file2 must have newer timestamps than file1 create_regular_file(archiver.input_path, "file2", size=10) cmd(archiver, "repo-create", RK_ENCRYPTION) - cmd(archiver, "create", "test1", "input", "--list", "--files-cache=ctime,size") + cmd(archiver, "create", "test", "input", "--list", "--files-cache=ctime,size") # modify file1, but cheat with the mtime (and atime) and also keep same size: st = os.stat("input/file1") create_regular_file(archiver.input_path, "file1", contents=b"321") os.utime("input/file1", ns=(st.st_atime_ns, st.st_mtime_ns)) # this mode uses ctime for change detection, so it should find file1 as modified - output = cmd(archiver, "create", "test2", "input", "--list", "--files-cache=ctime,size") + output = cmd(archiver, "create", "test", "input", "--list", "--files-cache=ctime,size") assert "M input/file1" in output @@ -691,12 +691,12 @@ def test_file_status_ms_cache_mode(archivers, request): time.sleep(1) # file2 must have newer timestamps than file1 create_regular_file(archiver.input_path, "file2", size=10) cmd(archiver, "repo-create", RK_ENCRYPTION) - cmd(archiver, "create", "--list", "--files-cache=mtime,size", "test1", "input") + cmd(archiver, "create", "--list", "--files-cache=mtime,size", "test", "input") # change mode of file1, no content change: st = os.stat("input/file1") os.chmod("input/file1", st.st_mode ^ stat.S_IRWXO) # this triggers a ctime change, but mtime is unchanged # this mode uses mtime for change detection, so it should find file1 as unmodified - output = cmd(archiver, "create", "--list", "--files-cache=mtime,size", "test2", "input") + output = cmd(archiver, "create", "--list", "--files-cache=mtime,size", "test", "input") assert "U input/file1" in output @@ -707,9 +707,9 @@ def test_file_status_rc_cache_mode(archivers, request): time.sleep(1) # file2 must have newer timestamps than file1 create_regular_file(archiver.input_path, "file2", size=10) cmd(archiver, "repo-create", RK_ENCRYPTION) - cmd(archiver, "create", "--list", "--files-cache=rechunk,ctime", "test1", "input") + cmd(archiver, "create", "--list", "--files-cache=rechunk,ctime", "test", "input") # no changes here, but this mode rechunks unconditionally - output = cmd(archiver, "create", "--list", "--files-cache=rechunk,ctime", "test2", "input") + output = cmd(archiver, "create", "--list", "--files-cache=rechunk,ctime", "test", "input") assert "A input/file1" in output @@ -729,7 +729,7 @@ def test_file_status_excluded(archivers, request): if has_lchflags: assert "- input/file3" in output # should find second file as excluded - output = cmd(archiver, "create", "test1", "input", "--list", "--exclude-nodump", "--exclude", "*/file2") + output = cmd(archiver, "create", "test", "input", "--list", "--exclude-nodump", "--exclude", "*/file2") assert "U input/file1" in output assert "- input/file2" in output if has_lchflags: @@ -762,14 +762,14 @@ def to_dict(borg_create_output): create_regular_file(archiver.input_path, "testfile1", contents=b"test1") time.sleep(1.0 if is_darwin else 0.01) # testfile2 must have newer timestamps than testfile1 create_regular_file(archiver.input_path, "testfile2", contents=b"test2") - result = cmd(archiver, "create", "--stats", "test_archive2", archiver.input_path) + result = cmd(archiver, "create", "--stats", "test_archive", archiver.input_path) result = to_dict(result) assert result["Added files"] == 2 assert result["Unchanged files"] == 0 assert result["Modified files"] == 0 # Archive a dir with 1 unmodified file and 1 modified create_regular_file(archiver.input_path, "testfile1", contents=b"new data") - result = cmd(archiver, "create", "--stats", "test_archive3", archiver.input_path) + result = cmd(archiver, "create", "--stats", "test_archive", archiver.input_path) result = to_dict(result) # Should process testfile2 as added because of # https://borgbackup.readthedocs.io/en/stable/faq.html#i-am-seeing-a-added-status-for-an-unchanged-file @@ -807,18 +807,18 @@ def test_create_topical(archivers, request): output = cmd(archiver, "create", "test", "input") assert "file1" not in output # shouldn't be listed even if unchanged - output = cmd(archiver, "create", "test0", "input") + output = cmd(archiver, "create", "test", "input") assert "file1" not in output # should list the file as unchanged - output = cmd(archiver, "create", "test1", "input", "--list", "--filter=U") + output = cmd(archiver, "create", "test", "input", "--list", "--filter=U") assert "file1" in output # should *not* list the file as changed - output = cmd(archiver, "create", "test2", "input", "--list", "--filter=AM") + output = cmd(archiver, "create", "test", "input", "--list", "--filter=AM") assert "file1" not in output # change the file create_regular_file(archiver.input_path, "file1", size=1024 * 100) # should list the file as changed - output = cmd(archiver, "create", "test3", "input", "--list", "--filter=AM") + output = cmd(archiver, "create", "test", "input", "--list", "--filter=AM") assert "file1" in output