From baed0bc5d0fd52153b44750a6ab72fa98bd926ef Mon Sep 17 00:00:00 2001
From: Thomas Waldmann
Date: Tue, 10 Feb 2026 08:07:47 +0100
Subject: [PATCH] list --format: add fingerprint placeholder

This allows users to compare file content efficiently without reading the
full file data, by exposing a hash of the chunk IDs together with a hash of
the conditions that must match for such a comparison to be valid, like
chunker params, chunker seed/key, id key, key type, etc.

This is based on PR #5167 by @hrehfeld, code + discussion, with some changes:

- the conditions hash now includes more relevant input params
- returning a single value that is composed of 2 parts
- tests (including new buzhash64)

Example output (different files in same archive):

1e88bfb02d0a5320-a539587200c33b857f9827d01fcb7dabacf30501c83929e7308668d43f4a6302 file1
1e88bfb02d0a5320-9ed78a4c14d0506d9ae75d914cca90db64655ddea22647dd1c89f19e2fc080ae file2

The fingerprint has 2 parts:

First part: same hash for both files. This indicates the same chunking /
chunk id generation params, meaning that the second parts can be validly
compared.

Second part: different hash, because the file content is different.
The same hash here would mean the same content.

--- src/borg/helpers/parseformat.py | 24 +++++++- src/borg/testsuite/archiver/list_cmd_test.py | 58 ++++++++++++++++++++ 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index 9c829ebc7c..5132446b5a 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -15,6 +15,7 @@ from collections import OrderedDict from datetime import datetime, timezone from functools import partial +from hashlib import sha256 from string import Formatter from ..logger import create_logger @@ -876,6 +877,7 @@ class ItemFormatter(BaseFormatter): "isoctime": "file change time (ISO 8601 format)", "isoatime": "file access time (ISO 8601 format)", "xxh64": "XXH64 checksum of this file (note: this is NOT a cryptographic hash!)", + "fingerprint": "Fingerprint of the file content (may have false negatives), format: H(conditions)-H(chunk_ids)", "archiveid": "internal ID of the archive", "archivename": "name of the archive", } @@ -883,7 +885,7 @@ class ItemFormatter(BaseFormatter): ("type", "mode", "uid", "gid", "user", "group", "path", "target", "hlid", "inode", "flags"), ("size", "num_chunks"), ("mtime", "ctime", "atime", "isomtime", "isoctime", "isoatime"), - tuple(sorted(hash_algorithms)), + tuple(["fingerprint"] + sorted(hash_algorithms)), ("archiveid", "archivename", "extra"), ) @@ -903,6 +905,15 @@ def __init__(self, archive, format): self.archive = archive # track which keys were requested in the format string self.format_keys = {f[1] for f in Formatter().parse(format)} + + # we want a hash over the conditions that influence the chunk ID list for a given file content: + # - the id algorithm and key + # - the chunker seed (if any - buzhash64 derives seed from id_key) + # - the chunker params + key = archive.key + conditions = f"{key.TYPE_STR!r}{key.id_key!r}{key.chunk_seed!r}{archive.metadata.get('chunker_params')!r}" + self.conditions_hash = sha256(conditions.encode()).hexdigest() + 
self.call_keys = { "size": self.calculate_size, "num_chunks": self.calculate_num_chunks, @@ -912,6 +923,7 @@ def __init__(self, archive, format): "mtime": partial(self.format_time, "mtime"), "ctime": partial(self.format_time, "ctime"), "atime": partial(self.format_time, "atime"), + "fingerprint": self.calculate_fingerprint, } for hash_function in self.hash_algorithms: self.call_keys[hash_function] = partial(self.hash_item, hash_function) @@ -963,6 +975,16 @@ def calculate_size(self, item): # note: does not support hard link slaves, they will be size 0 return item.get_size() + def calculate_fingerprint(self, item): + # calculate a very fast file contents fingerprint + chunks = item.get("chunks") + if chunks is None: + return "" + chunks_hash = sha256(b"".join(c.id for c in chunks)).hexdigest() + # we do not encounter many different conditions hashes, so the collision probability is low. + # thus, we can keep it short and only return 64 bits from the conditions hash. + return f"{self.conditions_hash[:16]}-{chunks_hash}" + def hash_item(self, hash_function, item): if "chunks" not in item: return "" diff --git a/src/borg/testsuite/archiver/list_cmd_test.py b/src/borg/testsuite/archiver/list_cmd_test.py index c0f66ec5b1..20224b4ae5 100644 --- a/src/borg/testsuite/archiver/list_cmd_test.py +++ b/src/borg/testsuite/archiver/list_cmd_test.py @@ -201,3 +201,61 @@ def test_list_inode_hardlinks(archivers, request): assert inodes["input/fileA"] != inodes["input/fileC"] else: pytest.skip("Platform does not provide inode numbers for items") + + +def test_fingerprint(archivers, request): + archiver = request.getfixturevalue(archivers) + cmd(archiver, "repo-create", RK_ENCRYPTION) + create_regular_file(archiver.input_path, "file1", contents=b"content") + create_regular_file(archiver.input_path, "file2", contents=b"other") + cmd(archiver, "create", "test1", "input") + + output = cmd(archiver, "list", "test1", "--format={fingerprint} {path}{NL}") + fingerprints1 = {} + for line in 
output.splitlines(): + fp, path = line.split(" ", 1) + fingerprints1[path] = fp + + # Same content, same chunker params -> same fingerprint + cmd(archiver, "create", "test2", "input") + output = cmd(archiver, "list", "test2", "--format={fingerprint} {path}{NL}") + fingerprints2 = {} + for line in output.splitlines(): + fp, path = line.split(" ", 1) + fingerprints2[path] = fp + assert fingerprints1 == fingerprints2 + + # Modified content -> different fingerprint + create_regular_file(archiver.input_path, "file1", contents=b"modification") + cmd(archiver, "create", "test3", "input") + output = cmd(archiver, "list", "test3", "--format={fingerprint} {path}{NL}") + fingerprints3 = {} + for line in output.splitlines(): + fp, path = line.split(" ", 1) + fingerprints3[path] = fp + assert fingerprints1["input/file1"] != fingerprints3["input/file1"] + # Unmodified file should still match + assert fingerprints1["input/file2"] == fingerprints3["input/file2"] + + # Different chunker params -> different fingerprint + # We can use the same repo but specify different chunker params for a new archive + cmd(archiver, "create", "--chunker-params=fixed,4096", "test4", "input") + output = cmd(archiver, "list", "test4", "--format={fingerprint} {path}{NL}") + fingerprints4 = {} + for line in output.splitlines(): + fp, path = line.split(" ", 1) + fingerprints4[path] = fp + + # Even unmodified files should have different fingerprints because conditions_hash changed + assert fingerprints1["input/file2"] != fingerprints4["input/file2"] + + # Also try with buzhash64 + cmd(archiver, "create", "--chunker-params=buzhash64,10,23,16,4095", "test5", "input") + output = cmd(archiver, "list", "test5", "--format={fingerprint} {path}{NL}") + fingerprints5 = {} + for line in output.splitlines(): + fp, path = line.split(" ", 1) + fingerprints5[path] = fp + + # Even unmodified files should have different fingerprints because conditions_hash changed + assert fingerprints1["input/file2"] != 
fingerprints5["input/file2"]