From 289400341cd8fb3a96da0653f840da4c45b078b5 Mon Sep 17 00:00:00 2001 From: Hernan Gelaf-Romer Date: Thu, 12 Feb 2026 13:57:18 -0500 Subject: [PATCH] BackupBoundaries global coverage can allow premature WAL deletion when backup roots have different host coverage --- .../hbase/backup/master/BackupLogCleaner.java | 12 ++++-- .../hbase/backup/util/BackupBoundaries.java | 38 +++++++++++++------ .../backup/master/TestBackupLogCleaner.java | 28 +++++++++++++- 3 files changed, 62 insertions(+), 16 deletions(-) diff --git a/hbase-backup/src/main/java/org/apache/hadoop/hbase/backup/master/BackupLogCleaner.java b/hbase-backup/src/main/java/org/apache/hadoop/hbase/backup/master/BackupLogCleaner.java index 7c62b7a681d8..4551a5df9de2 100644 --- a/hbase-backup/src/main/java/org/apache/hadoop/hbase/backup/master/BackupLogCleaner.java +++ b/hbase-backup/src/main/java/org/apache/hadoop/hbase/backup/master/BackupLogCleaner.java @@ -128,7 +128,8 @@ private BackupBoundaries serverToPreservationBoundaryTs(BackupSystemTable sysTab for (TableName table : backupInfo.getTableSetTimestampMap().keySet()) { for (Map.Entry entry : backupInfo.getTableSetTimestampMap().get(table) .entrySet()) { - builder.addBackupTimestamps(entry.getKey(), entry.getValue(), startCode); + builder.addBackupTimestamps(backupInfo.getBackupId(), entry.getKey(), entry.getValue(), + startCode); } } } @@ -137,9 +138,12 @@ private BackupBoundaries serverToPreservationBoundaryTs(BackupSystemTable sysTab if (LOG.isDebugEnabled()) { LOG.debug("Boundaries oldestStartCode: {}", boundaries.getOldestStartCode()); - for (Map.Entry entry : boundaries.getBoundaries().entrySet()) { - LOG.debug("Server: {}, WAL cleanup boundary: {}", entry.getKey().getHostName(), - entry.getValue()); + for (Map.Entry> entry : boundaries.getBoundaries().entrySet()) { + String backupId = entry.getKey(); + for (Map.Entry addressAndTs : entry.getValue().entrySet()) { + LOG.debug("Backup: {}, Server: {}, WAL cleanup boundary: {}", backupId, + addressAndTs.getKey().getHostName(), addressAndTs.getValue()); + } } } diff --git a/hbase-backup/src/main/java/org/apache/hadoop/hbase/backup/util/BackupBoundaries.java b/hbase-backup/src/main/java/org/apache/hadoop/hbase/backup/util/BackupBoundaries.java index eb08a0ef5e08..9c375e2c5ca9 100644 --- a/hbase-backup/src/main/java/org/apache/hadoop/hbase/backup/util/BackupBoundaries.java +++ b/hbase-backup/src/main/java/org/apache/hadoop/hbase/backup/util/BackupBoundaries.java @@ -38,22 +38,35 @@ public class BackupBoundaries { private static final BackupBoundaries EMPTY_BOUNDARIES = new BackupBoundaries(Collections.emptyMap(), Long.MAX_VALUE); - // This map tracks, for every RegionServer, the least recent (= oldest / lowest timestamp) - // inclusion in any backup. In other words, it is the timestamp boundary up to which all backup - // roots have included the WAL in their backup. - private final Map boundaries; + // Tracks WAL cleanup boundaries separately for each backup root to ensure WALs are only deleted + // when ALL backup roots no longer need them. The outer map key is the backup ID of the most + // recent backup from each backup root. For each backup root, the inner map stores the WAL + // timestamp boundary per RegionServer (the oldest WAL timestamp included in that backup root's + // most recent backup). A WAL file can only be deleted if it's older than the boundary for ALL + // backup roots, protecting WALs needed by any backup root even when other roots have already + // backed up that host at a later timestamp. + private final Map> boundaries; // The minimum WAL roll timestamp from the most recent backup of each backup root, used as a // fallback cleanup boundary for RegionServers without explicit backup boundaries (e.g., servers // that joined after backups began) private final long oldestStartCode; - private BackupBoundaries(Map boundaries, long oldestStartCode) { + private BackupBoundaries(Map> boundaries, long oldestStartCode) { this.boundaries = boundaries; this.oldestStartCode = oldestStartCode; } public boolean isDeletable(Path walLogPath) { + for (Map boundaries : this.boundaries.values()) { + if (!isDeletable(boundaries, walLogPath)) { + return false; + } + } + return true; + } + + private boolean isDeletable(Map boundaries, Path walLogPath) { try { String hostname = BackupUtils.parseHostNameFromLogFile(walLogPath); @@ -100,7 +113,7 @@ public boolean isDeletable(Path walLogPath) { } } - public Map getBoundaries() { + public Map> getBoundaries() { return boundaries; } @@ -113,7 +126,7 @@ public static BackupBoundariesBuilder builder(long tsCleanupBuffer) { } public static class BackupBoundariesBuilder { - private final Map boundaries = new HashMap<>(); + private final Map> boundaries = new HashMap<>(); private final long tsCleanupBuffer; private long oldestStartCode = Long.MAX_VALUE; @@ -122,12 +135,15 @@ private BackupBoundariesBuilder(long tsCleanupBuffer) { this.tsCleanupBuffer = tsCleanupBuffer; } - public BackupBoundariesBuilder addBackupTimestamps(String host, long hostLogRollTs, - long backupStartCode) { + public BackupBoundariesBuilder addBackupTimestamps(String backupId, String host, + long hostLogRollTs, long backupStartCode) { + Map backupBoundaries = + boundaries.computeIfAbsent(backupId, ignore -> new HashMap<>()); + Address address = Address.fromString(host); - Long storedTs = boundaries.get(address); + Long storedTs = backupBoundaries.get(address); if (storedTs == null || hostLogRollTs < storedTs) { - boundaries.put(address, hostLogRollTs); + backupBoundaries.put(address, hostLogRollTs); } if (oldestStartCode > backupStartCode) { diff --git a/hbase-backup/src/test/java/org/apache/hadoop/hbase/backup/master/TestBackupLogCleaner.java b/hbase-backup/src/test/java/org/apache/hadoop/hbase/backup/master/TestBackupLogCleaner.java index 41060347b1db..e26e4a335e47 100644 --- a/hbase-backup/src/test/java/org/apache/hadoop/hbase/backup/master/TestBackupLogCleaner.java +++ b/hbase-backup/src/test/java/org/apache/hadoop/hbase/backup/master/TestBackupLogCleaner.java @@ -300,7 +300,7 @@ public void testCanDeleteFileWithNewServerWALs() { Path oldWAL = new Path("/hbase/oldWALs/server1%2C60020%2C12345.500000"); String host = BackupUtils.parseHostNameFromLogFile(oldWAL); BackupBoundaries boundaries = BackupBoundaries.builder(0L) - .addBackupTimestamps(host, backupStartCode, backupStartCode).build(); + .addBackupTimestamps("backup1", host, backupStartCode, backupStartCode).build(); assertTrue("WAL older than backup should be deletable", BackupLogCleaner.canDeleteFile(boundaries, oldWAL)); @@ -334,6 +334,32 @@ public void testCleansUpArchivedHMasterWal() { assertTrue(BackupLogCleaner.canDeleteFile(empty, masterPath)); } + @Test + public void testBackupBoundariesProtectsWALsNeededByAnyRoot() { + long t1 = 1000L; + long t2 = 2000L; + + Path walBetweenBoundaries = new Path("/hbase/oldWALs/server1%2C60020%2C12345.1500"); + String host = BackupUtils.parseHostNameFromLogFile(walBetweenBoundaries); + + BackupBoundaries boundaries = + BackupBoundaries.builder(0L).addBackupTimestamps("backup-B1", host, t1, t1) + .addBackupTimestamps("backup-B2", host, t2, t2).build(); + + assertFalse( + "WAL at 1500 should NOT be deletable because backup B1 " + "(boundary=" + t1 + + ") still needs it, even though backup B2 (boundary=" + t2 + ") doesn't", + BackupLogCleaner.canDeleteFile(boundaries, walBetweenBoundaries)); + + Path walBeforeBoth = new Path("/hbase/oldWALs/server1%2C60020%2C12345.500"); + assertTrue("WAL at 500 should be deletable because it's before both boundaries", + BackupLogCleaner.canDeleteFile(boundaries, walBeforeBoth)); + + Path walAfterBoth = new Path("/hbase/oldWALs/server1%2C60020%2C12345.2500"); + assertFalse("WAL at 2500 should NOT be deletable because it's after both boundaries", + BackupLogCleaner.canDeleteFile(boundaries, walAfterBoth)); + } + private Set mergeAsSet(Collection toCopy, Collection toAdd) { Set result = new LinkedHashSet<>(toCopy); result.addAll(toAdd);