Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion src/borg/helpers/parseformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from collections import OrderedDict
from datetime import datetime, timezone
from functools import partial
from hashlib import sha256
from string import Formatter

from ..logger import create_logger
Expand Down Expand Up @@ -876,14 +877,15 @@ class ItemFormatter(BaseFormatter):
"isoctime": "file change time (ISO 8601 format)",
"isoatime": "file access time (ISO 8601 format)",
"xxh64": "XXH64 checksum of this file (note: this is NOT a cryptographic hash!)",
"fingerprint": "Fingerprint of the file content (may have false negatives), format: H(conditions)-H(chunk_ids)",
"archiveid": "internal ID of the archive",
"archivename": "name of the archive",
}
KEY_GROUPS = (
("type", "mode", "uid", "gid", "user", "group", "path", "target", "hlid", "inode", "flags"),
("size", "num_chunks"),
("mtime", "ctime", "atime", "isomtime", "isoctime", "isoatime"),
tuple(sorted(hash_algorithms)),
tuple(["fingerprint"] + sorted(hash_algorithms)),
("archiveid", "archivename", "extra"),
)

Expand All @@ -903,6 +905,15 @@ def __init__(self, archive, format):
self.archive = archive
# track which keys were requested in the format string
self.format_keys = {f[1] for f in Formatter().parse(format)}

# we want a hash over the conditions that influence the chunk ID list for a given file content:
# - the id algorithm and key
# - the chunker seed (if any - buzhash64 derives seed from id_key)
# - the chunker params
key = archive.key
conditions = f"{key.TYPE_STR!r}{key.id_key!r}{key.chunk_seed!r}{archive.metadata.get('chunker_params')!r}"
self.conditions_hash = sha256(conditions.encode()).hexdigest()

self.call_keys = {
"size": self.calculate_size,
"num_chunks": self.calculate_num_chunks,
Expand All @@ -912,6 +923,7 @@ def __init__(self, archive, format):
"mtime": partial(self.format_time, "mtime"),
"ctime": partial(self.format_time, "ctime"),
"atime": partial(self.format_time, "atime"),
"fingerprint": self.calculate_fingerprint,
}
for hash_function in self.hash_algorithms:
self.call_keys[hash_function] = partial(self.hash_item, hash_function)
Expand Down Expand Up @@ -963,6 +975,16 @@ def calculate_size(self, item):
# note: does not support hard link slaves, they will be size 0
return item.get_size()

def calculate_fingerprint(self, item):
    """Return a fast fingerprint of the item's file content.

    Format: ``<conditions_hash[:16]>-<sha256(chunk ids)>``. Items without
    a chunk list (directories, symlinks, ...) yield an empty string.
    """
    chunks = item.get("chunks")
    if chunks is None:
        # no content chunks -> nothing to fingerprint
        return ""
    content_digest = sha256()
    for chunk in chunks:
        content_digest.update(chunk.id)
    # we do not encounter many different conditions hashes, so the collision probability is low.
    # thus, we can keep it short and only return 64 bits from the conditions hash.
    return f"{self.conditions_hash[:16]}-{content_digest.hexdigest()}"

def hash_item(self, hash_function, item):
if "chunks" not in item:
return ""
Expand Down
58 changes: 58 additions & 0 deletions src/borg/testsuite/archiver/list_cmd_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,3 +201,61 @@ def test_list_inode_hardlinks(archivers, request):
assert inodes["input/fileA"] != inodes["input/fileC"]
else:
pytest.skip("Platform does not provide inode numbers for items")


def test_fingerprint(archivers, request):
    """The {fingerprint} format key is stable for identical content and chunking
    conditions, and changes when either the content or the chunker params change."""
    archiver = request.getfixturevalue(archivers)

    def list_fingerprints(archive_name):
        # parse "fingerprint path" lines from `borg list` into {path: fingerprint}
        output = cmd(archiver, "list", archive_name, "--format={fingerprint} {path}{NL}")
        fingerprints = {}
        for line in output.splitlines():
            fp, path = line.split(" ", 1)
            fingerprints[path] = fp
        return fingerprints

    cmd(archiver, "repo-create", RK_ENCRYPTION)
    create_regular_file(archiver.input_path, "file1", contents=b"content")
    create_regular_file(archiver.input_path, "file2", contents=b"other")
    cmd(archiver, "create", "test1", "input")
    fingerprints1 = list_fingerprints("test1")

    # Same content, same chunker params -> same fingerprint
    cmd(archiver, "create", "test2", "input")
    assert fingerprints1 == list_fingerprints("test2")

    # Modified content -> different fingerprint
    create_regular_file(archiver.input_path, "file1", contents=b"modification")
    cmd(archiver, "create", "test3", "input")
    fingerprints3 = list_fingerprints("test3")
    assert fingerprints1["input/file1"] != fingerprints3["input/file1"]
    # Unmodified file should still match
    assert fingerprints1["input/file2"] == fingerprints3["input/file2"]

    # Different chunker params -> different fingerprint
    # We can use the same repo but specify different chunker params for a new archive.
    # Even unmodified files should have different fingerprints because conditions_hash changed.
    cmd(archiver, "create", "--chunker-params=fixed,4096", "test4", "input")
    assert fingerprints1["input/file2"] != list_fingerprints("test4")["input/file2"]

    # Also try with buzhash64: conditions_hash changes, so unmodified files differ too.
    cmd(archiver, "create", "--chunker-params=buzhash64,10,23,16,4095", "test5", "input")
    assert fingerprints1["input/file2"] != list_fingerprints("test5")["input/file2"]
Loading