Skip to content

Commit

Permalink
analyze: sum up plaintext sizes rather than count
Browse files Browse the repository at this point in the history
  • Loading branch information
ThomasWaldmann committed Sep 29, 2024
1 parent ff3b331 commit 7533f23
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 13 deletions.
17 changes: 10 additions & 7 deletions src/borg/archiver/analyze_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,22 +56,26 @@ def analyze_archives(self) -> None:
def analyze_archive(self, id):
"""compute the set of chunks for each directory in this archive"""
archive = Archive(self.manifest, id)
chunks_by_path = defaultdict(set) # collect all chunk IDs generated from files in this directory path
chunks_by_path = defaultdict(dict) # per directory path: chunk id -> plaintext size, for all chunks generated from files in that directory
for item in archive.iter_items():
if "chunks" in item:
item_chunks = set(id for id, size in item.chunks)
item_chunks = dict(item.chunks) # chunk id -> plaintext size
directory_path = os.path.dirname(item.path)
chunks_by_path[directory_path].update(item_chunks)
return chunks_by_path

def analyze_change(self, base, new):
"""for each directory path, count the chunks changed (removed or added chunks) between base and new."""
"""for each directory path, sum up the changed (removed or added) chunks' sizes between base and new."""

def analyze_path_change(path):
base_chunks = base[path]
new_chunks = new[path]
different_chunks = base_chunks.symmetric_difference(new_chunks) # removed or added chunks
self.difference_by_path[directory_path] += len(different_chunks)
# add up added chunks' sizes
for id in new_chunks.keys() - base_chunks.keys():
self.difference_by_path[directory_path] += new_chunks[id]
# add up removed chunks' sizes
for id in base_chunks.keys() - new_chunks.keys():
self.difference_by_path[directory_path] += base_chunks[id]

for directory_path in base:
analyze_path_change(directory_path)
Expand All @@ -85,8 +89,7 @@ def report(self):
print("=========================================")
for directory_path in sorted(self.difference_by_path, key=lambda p: self.difference_by_path[p], reverse=True):
difference = self.difference_by_path[directory_path]
if difference > 0:
print(f"{directory_path}: {difference}")
print(f"{directory_path}: {difference}")


class AnalyzeMixIn:
Expand Down
12 changes: 6 additions & 6 deletions src/borg/testsuite/archiver/analyze_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,23 +19,23 @@ def analyze_archives():
input_path = pathlib.Path(archiver.input_path)

# 1st archive
(input_path / "file1").write_text("foo")
(input_path / "file1").write_text("1")
create_archive()

# 2nd archive
(input_path / "file2").write_text("bar")
(input_path / "file2").write_text("22")
create_archive()

assert "/input: 1" in analyze_archives() # 2nd archive added 1 chunk for input path
assert "/input: 2" in analyze_archives() # 2nd archive added 1 chunk (file2, 2 bytes) for input path

# 3rd archive
(input_path / "file3").write_text("baz")
(input_path / "file3").write_text("333")
create_archive()

assert "/input: 2" in analyze_archives() # 2nd/3rd archives added 2 chunks for input path
assert "/input: 5" in analyze_archives() # 2nd/3rd archives added 2 chunks (2 + 3 = 5 bytes) for input path

# 4th archive
(input_path / "file2").unlink()
create_archive()

assert "/input: 3" in analyze_archives() # 2nd/3rd archives added 2, 4th archive removed 1
assert "/input: 7" in analyze_archives() # 2nd/3rd archives added 5 bytes, 4th archive removed file2's chunk (2 bytes): 5 + 2 = 7

0 comments on commit 7533f23

Please sign in to comment.