From baed0bc5d0fd52153b44750a6ab72fa98bd926ef Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Tue, 10 Feb 2026 08:07:47 +0100
Subject: [PATCH] list --format: add fingerprint placeholder

This allows users to compare file content efficiently without reading the
full file data, by exposing a hash of the chunk IDs together with a hash of
the conditions that must match for a comparison to be valid, like chunker
params, chunker seed/key, id key, key type, etc.

This is based on PR #5167 by @hrehfeld, code + discussion, with some changes:
- the conditions hash now includes more relevant input params
- returning a single value that is composed of 2 parts
- tests (including new buzhash64)

Example output (different files in same archive):

1e88bfb02d0a5320-a539587200c33b857f9827d01fcb7dabacf30501c83929e7308668d43f4a6302 file1
1e88bfb02d0a5320-9ed78a4c14d0506d9ae75d914cca90db64655ddea22647dd1c89f19e2fc080ae file2

The fingerprint has 2 parts:

First part: same hash for both files, indicating the same chunking / chunk id
            generation params, meaning that the second parts are valid to be compared.

Second part: different hashes, because the file contents are different.
             The same hash here would mean the same content.
---
 src/borg/helpers/parseformat.py              | 24 +++++++-
 src/borg/testsuite/archiver/list_cmd_test.py | 58 ++++++++++++++++++++
 2 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py
index 9c829ebc7c..5132446b5a 100644
--- a/src/borg/helpers/parseformat.py
+++ b/src/borg/helpers/parseformat.py
@@ -15,6 +15,7 @@
 from collections import OrderedDict
 from datetime import datetime, timezone
 from functools import partial
+from hashlib import sha256
 from string import Formatter
 
 from ..logger import create_logger
@@ -876,6 +877,7 @@ class ItemFormatter(BaseFormatter):
         "isoctime": "file change time (ISO 8601 format)",
         "isoatime": "file access time (ISO 8601 format)",
         "xxh64": "XXH64 checksum of this file (note: this is NOT a cryptographic hash!)",
+        "fingerprint": "Fingerprint of the file content (may have false negatives), format: H(conditions)-H(chunk_ids)",
         "archiveid": "internal ID of the archive",
         "archivename": "name of the archive",
     }
@@ -883,7 +885,7 @@ class ItemFormatter(BaseFormatter):
         ("type", "mode", "uid", "gid", "user", "group", "path", "target", "hlid", "inode", "flags"),
         ("size", "num_chunks"),
         ("mtime", "ctime", "atime", "isomtime", "isoctime", "isoatime"),
-        tuple(sorted(hash_algorithms)),
+        tuple(["fingerprint"] + sorted(hash_algorithms)),
         ("archiveid", "archivename", "extra"),
     )
 
@@ -903,6 +905,15 @@ def __init__(self, archive, format):
         self.archive = archive
         # track which keys were requested in the format string
         self.format_keys = {f[1] for f in Formatter().parse(format)}
+
+        # we want a hash over the conditions that influence the chunk ID list for a given file content:
+        # - the id algorithm and key
+        # - the chunker seed (if any - buzhash64 derives seed from id_key)
+        # - the chunker params
+        key = archive.key
+        conditions = f"{key.TYPE_STR!r}{key.id_key!r}{key.chunk_seed!r}{archive.metadata.get('chunker_params')!r}"
+        self.conditions_hash = sha256(conditions.encode()).hexdigest()
+
         self.call_keys = {
             "size": self.calculate_size,
             "num_chunks": self.calculate_num_chunks,
@@ -912,6 +923,7 @@ def __init__(self, archive, format):
             "mtime": partial(self.format_time, "mtime"),
             "ctime": partial(self.format_time, "ctime"),
             "atime": partial(self.format_time, "atime"),
+            "fingerprint": self.calculate_fingerprint,
         }
         for hash_function in self.hash_algorithms:
             self.call_keys[hash_function] = partial(self.hash_item, hash_function)
@@ -963,6 +975,16 @@ def calculate_size(self, item):
         # note: does not support hard link slaves, they will be size 0
         return item.get_size()
 
+    def calculate_fingerprint(self, item):
+        """Return a very fast file contents fingerprint ("" for items without chunks)."""
+        chunk_list = item.get("chunks")
+        if chunk_list is None:
+            return ""
+        ids_hash = sha256(b"".join(entry.id for entry in chunk_list)).hexdigest()
+        # only few different conditions hashes are ever seen, so collisions are improbable:
+        # the first 16 hex digits (64 bits) of the conditions hash are sufficient here.
+        return f"{self.conditions_hash[:16]}-{ids_hash}"
+
     def hash_item(self, hash_function, item):
         if "chunks" not in item:
             return ""
diff --git a/src/borg/testsuite/archiver/list_cmd_test.py b/src/borg/testsuite/archiver/list_cmd_test.py
index c0f66ec5b1..20224b4ae5 100644
--- a/src/borg/testsuite/archiver/list_cmd_test.py
+++ b/src/borg/testsuite/archiver/list_cmd_test.py
@@ -201,3 +201,61 @@ def test_list_inode_hardlinks(archivers, request):
         assert inodes["input/fileA"] != inodes["input/fileC"]
     else:
         pytest.skip("Platform does not provide inode numbers for items")
+
+
+def test_fingerprint(archivers, request):
+    """
+    Check the {fingerprint} placeholder of the list command.
+
+    Fingerprints must stay the same for unchanged content and must change when
+    either the file content or the chunking conditions change.
+    """
+    archiver = request.getfixturevalue(archivers)
+
+    def fingerprints_of(archive_name):
+        # Map each listed path to the fingerprint printed in front of it.
+        # Split only at the first blank, as paths may contain blanks themselves.
+        output = cmd(archiver, "list", archive_name, "--format={fingerprint} {path}{NL}")
+        return {path: fp for fp, path in (line.split(" ", 1) for line in output.splitlines())}
+
+    cmd(archiver, "repo-create", RK_ENCRYPTION)
+    create_regular_file(archiver.input_path, "file1", contents=b"content")
+    create_regular_file(archiver.input_path, "file2", contents=b"other")
+    cmd(archiver, "create", "test1", "input")
+    fingerprints1 = fingerprints_of("test1")
+
+    # A regular file's fingerprint consists of 16 hex digits (conditions part), a dash
+    # and 64 hex digits (chunk IDs part)
+    conditions1, chunks1 = fingerprints1["input/file1"].split("-")
+    assert len(conditions1) == 16 and len(chunks1) == 64
+    # Different content in the same archive -> same conditions part, different chunk IDs part
+    assert fingerprints1["input/file2"].startswith(conditions1 + "-")
+    assert fingerprints1["input/file1"] != fingerprints1["input/file2"]
+
+    # Same content, same chunker params -> same fingerprint
+    cmd(archiver, "create", "test2", "input")
+    fingerprints2 = fingerprints_of("test2")
+    assert fingerprints1 == fingerprints2
+
+    # Modified content -> different fingerprint
+    create_regular_file(archiver.input_path, "file1", contents=b"modification")
+    cmd(archiver, "create", "test3", "input")
+    fingerprints3 = fingerprints_of("test3")
+    assert fingerprints1["input/file1"] != fingerprints3["input/file1"]
+    # Unmodified file should still match
+    assert fingerprints1["input/file2"] == fingerprints3["input/file2"]
+
+    # Different chunker params -> different fingerprint
+    # We can use the same repo but specify different chunker params for a new archive
+    cmd(archiver, "create", "--chunker-params=fixed,4096", "test4", "input")
+    fingerprints4 = fingerprints_of("test4")
+
+    # Even unmodified files should have different fingerprints because conditions_hash changed
+    assert fingerprints1["input/file2"] != fingerprints4["input/file2"]
+
+    # Also try with buzhash64
+    cmd(archiver, "create", "--chunker-params=buzhash64,10,23,16,4095", "test5", "input")
+    fingerprints5 = fingerprints_of("test5")
+
+    # Even unmodified files should have different fingerprints because conditions_hash changed
+    assert fingerprints1["input/file2"] != fingerprints5["input/file2"]