Skip to content

Commit 876021b

Browse files
shubhluck and zabetak authored
HIVE-29516: NPE in StatsUtils.updateStats when removing semijoin by benefit and column statistics are missing (#6382)
Co-authored-by: Stamatis Zampetakis <zabetak@gmail.com>
1 parent 01d9111 commit 876021b

3 files changed

Lines changed: 180 additions & 1 deletion

File tree

ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1977,8 +1977,9 @@ private void removeSemijoinOptimizationByBenefit(OptimizeTezProcContext procCtx)
19771977
LOG.debug("Old stats for {}: {}", roi.filterOperator, roi.filterStats);
19781978
LOG.debug("Number of rows reduction: {}/{}", newNumRows, roi.filterStats.getNumRows());
19791979
}
1980+
boolean useColStats = roi.filterStats.getColumnStats() != null;
19801981
StatsUtils.updateStats(roi.filterStats, newNumRows,
1981-
true, roi.filterOperator, roi.colNames);
1982+
useColStats, roi.filterOperator, roi.colNames);
19821983
if (LOG.isDebugEnabled()) {
19831984
LOG.debug("New stats for {}: {}", roi.filterOperator, roi.filterStats);
19841985
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
-- HIVE-29516: NPE in StatsUtils.updateStats when removing semijoin by benefit and column statistics are missing
2+
set hive.stats.fetch.column.stats=false;
3+
4+
create table big (id int, val string) partitioned by (bday int);
5+
alter table big add partition (bday=20260410);
6+
alter table big partition (bday=20260410) update statistics set ('numRows' = '1000000000');
7+
8+
create table small (id int, val string);
9+
alter table small update statistics set ('numRows' = '1000');
10+
11+
explain select big.val, small.val from big join small on big.id = small.id;
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
PREHOOK: query: create table big (id int, val string) partitioned by (bday int)
2+
PREHOOK: type: CREATETABLE
3+
PREHOOK: Output: database:default
4+
PREHOOK: Output: default@big
5+
POSTHOOK: query: create table big (id int, val string) partitioned by (bday int)
6+
POSTHOOK: type: CREATETABLE
7+
POSTHOOK: Output: database:default
8+
POSTHOOK: Output: default@big
9+
PREHOOK: query: alter table big add partition (bday=20260410)
10+
PREHOOK: type: ALTERTABLE_ADDPARTS
11+
PREHOOK: Output: default@big
12+
POSTHOOK: query: alter table big add partition (bday=20260410)
13+
POSTHOOK: type: ALTERTABLE_ADDPARTS
14+
POSTHOOK: Output: default@big
15+
POSTHOOK: Output: default@big@bday=20260410
16+
PREHOOK: query: alter table big partition (bday=20260410) update statistics set ('numRows' = '1000000000')
17+
PREHOOK: type: ALTERTABLE_UPDATEPARTSTATS
18+
PREHOOK: Input: default@big
19+
PREHOOK: Output: default@big@bday=20260410
20+
POSTHOOK: query: alter table big partition (bday=20260410) update statistics set ('numRows' = '1000000000')
21+
POSTHOOK: type: ALTERTABLE_UPDATEPARTSTATS
22+
POSTHOOK: Input: default@big
23+
POSTHOOK: Input: default@big@bday=20260410
24+
POSTHOOK: Output: default@big@bday=20260410
25+
PREHOOK: query: create table small (id int, val string)
26+
PREHOOK: type: CREATETABLE
27+
PREHOOK: Output: database:default
28+
PREHOOK: Output: default@small
29+
POSTHOOK: query: create table small (id int, val string)
30+
POSTHOOK: type: CREATETABLE
31+
POSTHOOK: Output: database:default
32+
POSTHOOK: Output: default@small
33+
PREHOOK: query: alter table small update statistics set ('numRows' = '1000')
34+
PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
35+
PREHOOK: Input: default@small
36+
PREHOOK: Output: default@small
37+
POSTHOOK: query: alter table small update statistics set ('numRows' = '1000')
38+
POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
39+
POSTHOOK: Input: default@small
40+
POSTHOOK: Output: default@small
41+
PREHOOK: query: explain select big.val, small.val from big join small on big.id = small.id
42+
PREHOOK: type: QUERY
43+
PREHOOK: Input: default@big
44+
PREHOOK: Input: default@big@bday=20260410
45+
PREHOOK: Input: default@small
46+
#### A masked pattern was here ####
47+
POSTHOOK: query: explain select big.val, small.val from big join small on big.id = small.id
48+
POSTHOOK: type: QUERY
49+
POSTHOOK: Input: default@big
50+
POSTHOOK: Input: default@big@bday=20260410
51+
POSTHOOK: Input: default@small
52+
#### A masked pattern was here ####
53+
STAGE DEPENDENCIES:
54+
Stage-1 is a root stage
55+
Stage-0 depends on stages: Stage-1
56+
57+
STAGE PLANS:
58+
Stage: Stage-1
59+
Tez
60+
#### A masked pattern was here ####
61+
Edges:
62+
Map 1 <- Reducer 4 (BROADCAST_EDGE)
63+
Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE)
64+
Reducer 4 <- Map 3 (CUSTOM_SIMPLE_EDGE)
65+
#### A masked pattern was here ####
66+
Vertices:
67+
Map 1
68+
Map Operator Tree:
69+
TableScan
70+
alias: big
71+
filterExpr: (id is not null and id BETWEEN DynamicValue(RS_7_small_id_min) AND DynamicValue(RS_7_small_id_max) and in_bloom_filter(id, DynamicValue(RS_7_small_id_bloom_filter))) (type: boolean)
72+
Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE
73+
Filter Operator
74+
predicate: (id is not null and id BETWEEN DynamicValue(RS_7_small_id_min) AND DynamicValue(RS_7_small_id_max) and in_bloom_filter(id, DynamicValue(RS_7_small_id_bloom_filter))) (type: boolean)
75+
Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE
76+
Select Operator
77+
expressions: id (type: int), val (type: string)
78+
outputColumnNames: _col0, _col1
79+
Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE
80+
Reduce Output Operator
81+
key expressions: _col0 (type: int)
82+
null sort order: z
83+
sort order: +
84+
Map-reduce partition columns: _col0 (type: int)
85+
Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE
86+
value expressions: _col1 (type: string)
87+
Execution mode: vectorized, llap
88+
LLAP IO: all inputs
89+
Map 3
90+
Map Operator Tree:
91+
TableScan
92+
alias: small
93+
filterExpr: id is not null (type: boolean)
94+
Statistics: Num rows: 1000 Data size: 0 Basic stats: PARTIAL Column stats: NONE
95+
Filter Operator
96+
predicate: id is not null (type: boolean)
97+
Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
98+
Select Operator
99+
expressions: id (type: int), val (type: string)
100+
outputColumnNames: _col0, _col1
101+
Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
102+
Reduce Output Operator
103+
key expressions: _col0 (type: int)
104+
null sort order: z
105+
sort order: +
106+
Map-reduce partition columns: _col0 (type: int)
107+
Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
108+
value expressions: _col1 (type: string)
109+
Select Operator
110+
expressions: _col0 (type: int)
111+
outputColumnNames: _col0
112+
Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
113+
Group By Operator
114+
aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1000000)
115+
minReductionHashAggr: 0.99
116+
mode: hash
117+
outputColumnNames: _col0, _col1, _col2
118+
Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE
119+
Reduce Output Operator
120+
null sort order:
121+
sort order:
122+
Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE
123+
value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
124+
Execution mode: vectorized, llap
125+
LLAP IO: all inputs
126+
Reducer 2
127+
Execution mode: llap
128+
Reduce Operator Tree:
129+
Merge Join Operator
130+
condition map:
131+
Inner Join 0 to 1
132+
keys:
133+
0 _col0 (type: int)
134+
1 _col0 (type: int)
135+
outputColumnNames: _col1, _col3
136+
Statistics: Num rows: 1100000023 Data size: 325600007057 Basic stats: PARTIAL Column stats: NONE
137+
Select Operator
138+
expressions: _col1 (type: string), _col3 (type: string)
139+
outputColumnNames: _col0, _col1
140+
Statistics: Num rows: 1100000023 Data size: 325600007057 Basic stats: PARTIAL Column stats: NONE
141+
File Output Operator
142+
compressed: false
143+
Statistics: Num rows: 1100000023 Data size: 325600007057 Basic stats: PARTIAL Column stats: NONE
144+
table:
145+
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
146+
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
147+
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
148+
Reducer 4
149+
Execution mode: vectorized, llap
150+
Reduce Operator Tree:
151+
Group By Operator
152+
aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, 1, expectedEntries=1000000)
153+
mode: final
154+
outputColumnNames: _col0, _col1, _col2
155+
Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE
156+
Reduce Output Operator
157+
null sort order:
158+
sort order:
159+
Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE
160+
value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
161+
162+
Stage: Stage-0
163+
Fetch Operator
164+
limit: -1
165+
Processor Tree:
166+
ListSink
167+

0 commit comments

Comments
 (0)