From fafafe89a9127919f8b1676bef47e6344885c3a3 Mon Sep 17 00:00:00 2001
From: Konstantin Bereznyakov
Date: Tue, 16 Dec 2025 09:02:57 -0800
Subject: [PATCH 1/3] HIVE-29367: preventing Long overflows in ConvertJoinMapJoin

---
 .../hive/ql/optimizer/ConvertJoinMapJoin.java |  26 +-
 .../ql/optimizer/TestConvertJoinMapJoin.java  | 164 +++++++++++
 .../clientpositive/mapjoin_stats_overflow.q   |  26 ++
 .../llap/mapjoin_stats_overflow.q.out         | 270 ++++++++++++++++++
 4 files changed, 475 insertions(+), 11 deletions(-)
 create mode 100644 ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestConvertJoinMapJoin.java
 create mode 100644 ql/src/test/queries/clientpositive/mapjoin_stats_overflow.q
 create mode 100644 ql/src/test/results/clientpositive/llap/mapjoin_stats_overflow.q.out

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java
index a622a0a7c02d..31f19d311dd3 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java
@@ -280,20 +280,20 @@ private boolean selectJoinForLlap(OptimizeTezProcContext context, JoinOperator j
         continue;
       }
       Operator parentOp = joinOp.getParentOperators().get(pos);
-      totalSize += computeOnlineDataSize(parentOp.getStatistics());
+      totalSize = StatsUtils.safeAdd(totalSize, computeOnlineDataSize(parentOp.getStatistics()));
     }
 
     // Size of bigtable
     long bigTableSize = computeOnlineDataSize(joinOp.getParentOperators().get(mapJoinConversionPos).getStatistics());
 
     // Network cost of DPHJ
-    long networkCostDPHJ = totalSize + bigTableSize;
+    long networkCostDPHJ = StatsUtils.safeAdd(totalSize, bigTableSize);
 
     LOG.info("Cost of dynamically partitioned hash join : total small table size = " + totalSize
         + " bigTableSize = " + bigTableSize + "networkCostDPHJ = " + networkCostDPHJ);
 
     // Network cost of map side join
-    long networkCostMJ = numNodes * totalSize;
+    long networkCostMJ = StatsUtils.safeMult(numNodes, totalSize);
     LOG.info("Cost of Bucket Map Join : numNodes = " + numNodes + " total small table size = "
         + totalSize + " networkCostMJ = " + networkCostMJ);
 
@@ -363,9 +363,13 @@ public long computeOnlineDataSizeGeneric(Statistics statistics, long overHeadPer
       numRows = 1;
     }
     long worstCaseNeededSlots = 1L << DoubleMath.log2(numRows / hashTableLoadFactor, RoundingMode.UP);
-    onlineDataSize += statistics.getDataSize() - hashTableDataSizeAdjustment(numRows, statistics.getColumnStats());
-    onlineDataSize += overHeadPerRow * statistics.getNumRows();
-    onlineDataSize += overHeadPerSlot * worstCaseNeededSlots;
+    long adjustedDataSize = Math.max(0L,
+        statistics.getDataSize() - hashTableDataSizeAdjustment(numRows, statistics.getColumnStats()));
+    onlineDataSize = StatsUtils.safeAdd(onlineDataSize, adjustedDataSize);
+    onlineDataSize = StatsUtils.safeAdd(onlineDataSize,
+        StatsUtils.safeMult(overHeadPerRow, statistics.getNumRows()));
+    onlineDataSize = StatsUtils.safeAdd(onlineDataSize,
+        StatsUtils.safeMult(overHeadPerSlot, worstCaseNeededSlots));
 
     return onlineDataSize;
   }
 
@@ -384,7 +388,7 @@ private static long hashTableDataSizeAdjustment(long numRows, List<ColStatistics>
-        long nonNullCount = cs.getNumNulls() > 0 ? numRows - cs.getNumNulls() + 1 : numRows;
+        long nonNullCount = cs.getNumNulls() > 0 ?
Math.max(1L, numRows - cs.getNumNulls() + 1) : numRows; double overhead = 0; if (colTypeLowerCase.equals(serdeConstants.STRING_TYPE_NAME) || colTypeLowerCase.startsWith(serdeConstants.VARCHAR_TYPE_NAME) @@ -1248,7 +1252,7 @@ public MapJoinConversion getMapJoinConversion(JoinOperator joinOp, OptimizeTezPr if (bigInputStat != null && selectedBigTable) { // We are replacing the current big table with a new one, thus // we need to count the current one as a map table then. - totalSize += computeOnlineDataSize(bigInputStat); + totalSize = StatsUtils.safeAdd(totalSize, computeOnlineDataSize(bigInputStat)); // Check if number of distinct keys is greater than given max number of entries // for HashMap if (checkMapJoinThresholds && !checkNumberOfEntriesForHashTable(joinOp, bigTablePosition, context)) { @@ -1257,7 +1261,7 @@ public MapJoinConversion getMapJoinConversion(JoinOperator joinOp, OptimizeTezPr } else if (!selectedBigTable) { // This is not the first table and we are not using it as big table, // in fact, we're adding this table as a map table - totalSize += inputSize; + totalSize = StatsUtils.safeAdd(totalSize, inputSize); // Check if number of distinct keys is greater than given max number of entries // for HashMap if (checkMapJoinThresholds && !checkNumberOfEntriesForHashTable(joinOp, pos, context)) { @@ -1342,7 +1346,7 @@ private static Long computeCumulativeCardinality(Operator= 0, "Result should not be negative due to overflow"); + } + + @Test + public void testComputeOnlineDataSizeGenericLargeNumRowsWithOverhead() { + ConvertJoinMapJoin converter = new ConvertJoinMapJoin(); + converter.hashTableLoadFactor = 0.75f; + Statistics stats = new Statistics(Long.MAX_VALUE / 2, 1000L, 0L, 0L); + + long result = converter.computeOnlineDataSizeGeneric(stats, Long.MAX_VALUE / 4, Long.MAX_VALUE / 4); + + assertTrue(result >= 0, "Result should not be negative due to overflow"); + assertEquals(Long.MAX_VALUE, result, "Result should saturate at Long.MAX_VALUE"); + } + + @Test + public void testComputeOnlineDataSizeGenericNumNullsLargerThanNumRows() { + ConvertJoinMapJoin converter = new ConvertJoinMapJoin(); + converter.hashTableLoadFactor = 0.75f; + Statistics stats = new Statistics(100L, 10000L, 0L, 0L); + List colStats = new ArrayList<>(); + ColStatistics cs = new ColStatistics("col1", "string"); + cs.setNumNulls(Long.MAX_VALUE); + colStats.add(cs); + stats.setColumnStats(colStats); + + long result = converter.computeOnlineDataSizeGeneric(stats, 10L, 8L); + + assertTrue(result >= 0, "Result should not be negative due to underflow in nonNullCount"); + } + + @Test + public void testComputeOnlineDataSizeGenericSmallDataSizeLargeAdjustment() { + ConvertJoinMapJoin converter = new ConvertJoinMapJoin(); + converter.hashTableLoadFactor = 0.75f; + Statistics stats = new Statistics(1000000L, 100L, 0L, 0L); + List colStats = new ArrayList<>(); + ColStatistics cs = new ColStatistics("col1", "string"); + cs.setNumNulls(0L); + colStats.add(cs); + stats.setColumnStats(colStats); + + long result = converter.computeOnlineDataSizeGeneric(stats, 10L, 8L); + + assertTrue(result >= 0, "Result should not be negative when adjustment > dataSize"); + } + + @Test + public void testComputeOnlineDataSizeGenericAllExtremeValues() { + ConvertJoinMapJoin converter = new ConvertJoinMapJoin(); + converter.hashTableLoadFactor = 0.75f; + Statistics stats = new Statistics(Long.MAX_VALUE, Long.MAX_VALUE, 0L, 0L); + List colStats = new ArrayList<>(); + ColStatistics cs = new ColStatistics("col1", "string"); + 
cs.setNumNulls(Long.MAX_VALUE); + colStats.add(cs); + stats.setColumnStats(colStats); + + long result = converter.computeOnlineDataSizeGeneric(stats, Long.MAX_VALUE, Long.MAX_VALUE); + + assertTrue(result >= 0, "Result should not be negative with extreme values"); + assertEquals(Long.MAX_VALUE, result, "Result should saturate at Long.MAX_VALUE"); + } + + @Test + public void testComputeCumulativeCardinalityWithParentsOverflow() { + Operator parent1 = createMockOperatorWithStats(Long.MAX_VALUE / 2); + when(parent1.getParentOperators()).thenReturn(Collections.emptyList()); + Operator parent2 = createMockOperatorWithStats(Long.MAX_VALUE / 2); + when(parent2.getParentOperators()).thenReturn(Collections.emptyList()); + Operator mockOp = createMockOperatorWithStats(Long.MAX_VALUE / 2); + when(mockOp.getParentOperators()).thenReturn(Arrays.asList(parent1, parent2)); + + Long result = invokeComputeCumulativeCardinality(mockOp); + + assertNotNull(result, "Result should not be null"); + assertTrue(result >= 0, "Result should not be negative due to overflow"); + assertEquals(Long.MAX_VALUE, result.longValue(), "Result should saturate at Long.MAX_VALUE"); + } + + @Test + public void testComputeCumulativeCardinalityDeepTreeOverflow() { + Operator leaf = createMockOperatorWithStats(Long.MAX_VALUE / 2); + when(leaf.getParentOperators()).thenReturn(Collections.emptyList()); + Operator mid1 = createMockOperatorWithStats(Long.MAX_VALUE / 2); + when(mid1.getParentOperators()).thenReturn(Collections.singletonList(leaf)); + Operator mid2 = createMockOperatorWithStats(Long.MAX_VALUE / 2); + when(mid2.getParentOperators()).thenReturn(Collections.singletonList(mid1)); + Operator root = createMockOperatorWithStats(Long.MAX_VALUE / 2); + when(root.getParentOperators()).thenReturn(Collections.singletonList(mid2)); + + Long result = invokeComputeCumulativeCardinality(root); + + assertNotNull(result, "Result should not be null"); + assertTrue(result >= 0, "Result should not be negative due to overflow"); + assertEquals(Long.MAX_VALUE, result.longValue(), "Result should saturate at Long.MAX_VALUE"); + } + + @SuppressWarnings("unchecked") + private Operator createMockOperatorWithStats(long numRows) { + Operator mockOp = mock(Operator.class); + Statistics stats = new Statistics(numRows, numRows * 100, 0L, 0L); + when(mockOp.getStatistics()).thenReturn(stats); + return mockOp; + } + + private Long invokeComputeCumulativeCardinality(Operator op) { + try { + Method method = ConvertJoinMapJoin.class.getDeclaredMethod( + "computeCumulativeCardinality", Operator.class); + method.setAccessible(true); + return (Long) method.invoke(null, op); + } catch (Exception e) { + throw new RuntimeException(e); + } + } +} diff --git a/ql/src/test/queries/clientpositive/mapjoin_stats_overflow.q b/ql/src/test/queries/clientpositive/mapjoin_stats_overflow.q new file mode 100644 index 000000000000..9867c8c44798 --- /dev/null +++ b/ql/src/test/queries/clientpositive/mapjoin_stats_overflow.q @@ -0,0 +1,26 @@ +-- Test overflow handling in computeOnlineDataSize with Long.MAX_VALUE statistics + +SET hive.auto.convert.join=true; +SET hive.auto.convert.join.noconditionaltask=true; +SET hive.auto.convert.join.noconditionaltask.size=10000000; + +CREATE TABLE t1 (k BIGINT, v STRING); +CREATE TABLE t2 (k BIGINT, v STRING); + +-- Case 1: Normal statistics - t1 fits in 10MB threshold, MapJoin expected +ALTER TABLE t1 UPDATE STATISTICS SET('numRows'='10000','rawDataSize'='100000'); +ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN k 
SET('numDVs'='10000','numNulls'='0'); +ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='10000','numNulls'='0','avgColLen'='10.0','maxColLen'='20'); + +ALTER TABLE t2 UPDATE STATISTICS SET('numRows'='1000000','rawDataSize'='10000000'); +ALTER TABLE t2 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='1000000','numNulls'='0'); +ALTER TABLE t2 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='1000000','numNulls'='0','avgColLen'='10.0','maxColLen'='20'); + +EXPLAIN SELECT t1.k, t2.v FROM t1 JOIN t2 ON t1.k = t2.k; + +-- Case 2: Long.MAX_VALUE numRows - without fix, overflow causes negative size and incorrect MapJoin +ALTER TABLE t1 UPDATE STATISTICS SET('numRows'='9223372036854775807','rawDataSize'='9223372036854775807'); +ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='1000','numNulls'='0'); +ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='1000','numNulls'='0','avgColLen'='10.0','maxColLen'='20'); + +EXPLAIN SELECT t1.k, t1.v, t2.v FROM t1 JOIN t2 ON t1.k = t2.k; diff --git a/ql/src/test/results/clientpositive/llap/mapjoin_stats_overflow.q.out b/ql/src/test/results/clientpositive/llap/mapjoin_stats_overflow.q.out new file mode 100644 index 000000000000..e7cdad56bdea --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/mapjoin_stats_overflow.q.out @@ -0,0 +1,270 @@ +PREHOOK: query: CREATE TABLE t1 (k BIGINT, v STRING) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t1 +POSTHOOK: query: CREATE TABLE t1 (k BIGINT, v STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE t2 (k BIGINT, v STRING) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t2 +POSTHOOK: query: CREATE TABLE t2 (k BIGINT, v STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t2 +PREHOOK: query: ALTER TABLE t1 UPDATE STATISTICS SET('numRows'='10000','rawDataSize'='100000') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t1 +PREHOOK: Output: default@t1 +POSTHOOK: query: ALTER TABLE t1 UPDATE STATISTICS SET('numRows'='10000','rawDataSize'='100000') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@t1 +PREHOOK: query: ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='10000','numNulls'='0') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t1 +PREHOOK: Output: default@t1 +POSTHOOK: query: ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='10000','numNulls'='0') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@t1 +PREHOOK: query: ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='10000','numNulls'='0','avgColLen'='10.0','maxColLen'='20') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t1 +PREHOOK: Output: default@t1 +POSTHOOK: query: ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='10000','numNulls'='0','avgColLen'='10.0','maxColLen'='20') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@t1 +PREHOOK: query: ALTER TABLE t2 UPDATE STATISTICS SET('numRows'='1000000','rawDataSize'='10000000') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t2 +PREHOOK: Output: default@t2 +POSTHOOK: query: ALTER TABLE t2 UPDATE STATISTICS SET('numRows'='1000000','rawDataSize'='10000000') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: 
default@t2 +POSTHOOK: Output: default@t2 +PREHOOK: query: ALTER TABLE t2 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='1000000','numNulls'='0') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t2 +PREHOOK: Output: default@t2 +POSTHOOK: query: ALTER TABLE t2 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='1000000','numNulls'='0') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t2 +POSTHOOK: Output: default@t2 +PREHOOK: query: ALTER TABLE t2 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='1000000','numNulls'='0','avgColLen'='10.0','maxColLen'='20') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t2 +PREHOOK: Output: default@t2 +POSTHOOK: query: ALTER TABLE t2 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='1000000','numNulls'='0','avgColLen'='10.0','maxColLen'='20') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t2 +POSTHOOK: Output: default@t2 +PREHOOK: query: EXPLAIN SELECT t1.k, t2.v FROM t1 JOIN t2 ON t1.k = t2.k +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT t1.k, t2.v FROM t1 JOIN t2 ON t1.k = t2.k +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 2 <- Map 1 (BROADCAST_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t1 + filterExpr: k is not null (type: boolean) + Statistics: Num rows: 10000 Data size: 80000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: k is not null (type: boolean) + Statistics: Num rows: 10000 Data size: 80000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: k (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data size: 80000 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: bigint) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: bigint) + Statistics: Num rows: 10000 Data size: 80000 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 2 + Map Operator Tree: + TableScan + alias: t2 + filterExpr: k is not null (type: boolean) + probeDecodeDetails: cacheKey:HASH_MAP_MAPJOIN_25_container, bigKeyColName:k, smallTablePos:0, keyRatio:0.01 + Statistics: Num rows: 1000000 Data size: 102000000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: k is not null (type: boolean) + Statistics: Num rows: 1000000 Data size: 102000000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: k (type: bigint), v (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1000000 Data size: 102000000 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: bigint) + 1 _col0 (type: bigint) + outputColumnNames: _col0, _col2 + input vertices: + 0 Map 1 + Statistics: Num rows: 10000 Data size: 1020000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: bigint), _col2 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 10000 Data size: 1020000 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + 
compressed: false + Statistics: Num rows: 10000 Data size: 1020000 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized, llap + LLAP IO: all inputs + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: ALTER TABLE t1 UPDATE STATISTICS SET('numRows'='9223372036854775807','rawDataSize'='9223372036854775807') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t1 +PREHOOK: Output: default@t1 +POSTHOOK: query: ALTER TABLE t1 UPDATE STATISTICS SET('numRows'='9223372036854775807','rawDataSize'='9223372036854775807') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@t1 +PREHOOK: query: ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='1000','numNulls'='0') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t1 +PREHOOK: Output: default@t1 +POSTHOOK: query: ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='1000','numNulls'='0') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@t1 +PREHOOK: query: ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='1000','numNulls'='0','avgColLen'='10.0','maxColLen'='20') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t1 +PREHOOK: Output: default@t1 +POSTHOOK: query: ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='1000','numNulls'='0','avgColLen'='10.0','maxColLen'='20') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@t1 +PREHOOK: query: EXPLAIN SELECT t1.k, t1.v, t2.v FROM t1 JOIN t2 ON t1.k = t2.k +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT t1.k, t1.v, t2.v FROM t1 JOIN t2 ON t1.k = t2.k +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t1 + filterExpr: k is not null (type: boolean) + Statistics: Num rows: 9223372036854775807 Data size: 9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: k is not null (type: boolean) + Statistics: Num rows: 9223372036854775807 Data size: 9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: k (type: bigint), v (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 9223372036854775807 Data size: 9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: bigint) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: bigint) + Statistics: Num rows: 9223372036854775807 Data size: 9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 3 + Map Operator Tree: + TableScan + alias: t2 + filterExpr: k is not null (type: boolean) + 
Statistics: Num rows: 1000000 Data size: 102000000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: k is not null (type: boolean) + Statistics: Num rows: 1000000 Data size: 102000000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: k (type: bigint), v (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1000000 Data size: 102000000 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: bigint) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: bigint) + Statistics: Num rows: 1000000 Data size: 102000000 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: bigint) + 1 _col0 (type: bigint) + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 9223372036854775807 Data size: 9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: bigint), _col1 (type: string), _col3 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 9223372036854775807 Data size: 9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 9223372036854775807 Data size: 9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + From b8bb1e96e4506855542e25d1f7f058a95af1fc19 Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Tue, 16 Dec 2025 18:26:35 -0800 Subject: [PATCH 2/3] HIVE-29367: attempt a rebuild From eee1347af1bd7c85490a63d22d858bc49341f82b Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Thu, 18 Dec 2025 11:01:09 -0800 Subject: [PATCH 3/3] HIVE-29367: quality gate feedback --- .../ql/optimizer/TestConvertJoinMapJoin.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestConvertJoinMapJoin.java b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestConvertJoinMapJoin.java index 00003773aa53..82005de35a2a 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestConvertJoinMapJoin.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestConvertJoinMapJoin.java @@ -35,10 +35,10 @@ import org.apache.hadoop.hive.ql.plan.Statistics; import org.junit.jupiter.api.Test; -public class TestConvertJoinMapJoin { +class TestConvertJoinMapJoin { @Test - public void testComputeOnlineDataSizeGenericLargeDataSize() { + void testComputeOnlineDataSizeGenericLargeDataSize() { ConvertJoinMapJoin converter = new ConvertJoinMapJoin(); converter.hashTableLoadFactor = 0.75f; Statistics stats = new Statistics(1000L, Long.MAX_VALUE, 0L, 0L); @@ -49,7 +49,7 @@ public void testComputeOnlineDataSizeGenericLargeDataSize() { } @Test - public void testComputeOnlineDataSizeGenericLargeNumRowsWithOverhead() { + void testComputeOnlineDataSizeGenericLargeNumRowsWithOverhead() { ConvertJoinMapJoin converter = new ConvertJoinMapJoin(); converter.hashTableLoadFactor = 0.75f; Statistics stats = new 
Statistics(Long.MAX_VALUE / 2, 1000L, 0L, 0L);
@@ -61,7 +61,7 @@ public void testComputeOnlineDataSizeGenericLargeNumRowsWithOverhead() {
   }
 
   @Test
-  public void testComputeOnlineDataSizeGenericNumNullsLargerThanNumRows() {
+  void testComputeOnlineDataSizeGenericNumNullsLargerThanNumRows() {
     ConvertJoinMapJoin converter = new ConvertJoinMapJoin();
     converter.hashTableLoadFactor = 0.75f;
     Statistics stats = new Statistics(100L, 10000L, 0L, 0L);
@@ -77,7 +77,7 @@ public void testComputeOnlineDataSizeGenericNumNullsLargerThanNumRows() {
   }
 
   @Test
-  public void testComputeOnlineDataSizeGenericSmallDataSizeLargeAdjustment() {
+  void testComputeOnlineDataSizeGenericSmallDataSizeLargeAdjustment() {
     ConvertJoinMapJoin converter = new ConvertJoinMapJoin();
     converter.hashTableLoadFactor = 0.75f;
     Statistics stats = new Statistics(1000000L, 100L, 0L, 0L);
@@ -93,7 +93,7 @@ public void testComputeOnlineDataSizeGenericSmallDataSizeLargeAdjustment() {
   }
 
   @Test
-  public void testComputeOnlineDataSizeGenericAllExtremeValues() {
+  void testComputeOnlineDataSizeGenericAllExtremeValues() {
     ConvertJoinMapJoin converter = new ConvertJoinMapJoin();
     converter.hashTableLoadFactor = 0.75f;
     Statistics stats = new Statistics(Long.MAX_VALUE, Long.MAX_VALUE, 0L, 0L);
@@ -110,7 +110,7 @@ public void testComputeOnlineDataSizeGenericAllExtremeValues() {
   }
 
   @Test
-  public void testComputeCumulativeCardinalityWithParentsOverflow() {
+  void testComputeCumulativeCardinalityWithParentsOverflow() {
     Operator parent1 = createMockOperatorWithStats(Long.MAX_VALUE / 2);
     when(parent1.getParentOperators()).thenReturn(Collections.emptyList());
     Operator parent2 = createMockOperatorWithStats(Long.MAX_VALUE / 2);
@@ -126,7 +126,7 @@ public void testComputeCumulativeCardinalityWithParentsOverflow() {
   }
 
   @Test
-  public void testComputeCumulativeCardinalityDeepTreeOverflow() {
+  void testComputeCumulativeCardinalityDeepTreeOverflow() {
     Operator leaf = createMockOperatorWithStats(Long.MAX_VALUE / 2);
     when(leaf.getParentOperators()).thenReturn(Collections.emptyList());
     Operator mid1 = createMockOperatorWithStats(Long.MAX_VALUE / 2);