Skip to content

Commit 876021b

Browse files
shubhluck and zabetak authored
HIVE-29516: NPE in StatsUtils.updateStats when removing semijoin by benefit and column statistics are missing (#6382)
Co-authored-by: Stamatis Zampetakis <zabetak@gmail.com>
1 parent 01d9111 commit 876021b

3 files changed

Lines changed: 180 additions & 1 deletion

File tree

ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1977,8 +1977,9 @@ private void removeSemijoinOptimizationByBenefit(OptimizeTezProcContext procCtx)
19771977
LOG.debug("Old stats for {}: {}", roi.filterOperator, roi.filterStats);
19781978
LOG.debug("Number of rows reduction: {}/{}", newNumRows, roi.filterStats.getNumRows());
19791979
}
1980+
boolean useColStats = roi.filterStats.getColumnStats() != null;
19801981
StatsUtils.updateStats(roi.filterStats, newNumRows,
1981-
true, roi.filterOperator, roi.colNames);
1982+
useColStats, roi.filterOperator, roi.colNames);
19821983
if (LOG.isDebugEnabled()) {
19831984
LOG.debug("New stats for {}: {}", roi.filterOperator, roi.filterStats);
19841985
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
-- HIVE-29516: NPE in StatsUtils.updateStats when removing semijoin by benefit and column statistics are missing
2+
set hive.stats.fetch.column.stats=false;
3+
4+
create table big (id int, val string) partitioned by (bday int);
5+
alter table big add partition (bday=20260410);
6+
alter table big partition (bday=20260410) update statistics set ('numRows' = '1000000000');
7+
8+
create table small (id int, val string);
9+
alter table small update statistics set ('numRows' = '1000');
10+
11+
explain select big.val, small.val from big join small on big.id = small.id;
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
PREHOOK: query: create table big (id int, val string) partitioned by (bday int)
2+
PREHOOK: type: CREATETABLE
3+
PREHOOK: Output: database:default
4+
PREHOOK: Output: default@big
5+
POSTHOOK: query: create table big (id int, val string) partitioned by (bday int)
6+
POSTHOOK: type: CREATETABLE
7+
POSTHOOK: Output: database:default
8+
POSTHOOK: Output: default@big
9+
PREHOOK: query: alter table big add partition (bday=20260410)
10+
PREHOOK: type: ALTERTABLE_ADDPARTS
11+
PREHOOK: Output: default@big
12+
POSTHOOK: query: alter table big add partition (bday=20260410)
13+
POSTHOOK: type: ALTERTABLE_ADDPARTS
14+
POSTHOOK: Output: default@big
15+
POSTHOOK: Output: default@big@bday=20260410
16+
PREHOOK: query: alter table big partition (bday=20260410) update statistics set ('numRows' = '1000000000')
17+
PREHOOK: type: ALTERTABLE_UPDATEPARTSTATS
18+
PREHOOK: Input: default@big
19+
PREHOOK: Output: default@big@bday=20260410
20+
POSTHOOK: query: alter table big partition (bday=20260410) update statistics set ('numRows' = '1000000000')
21+
POSTHOOK: type: ALTERTABLE_UPDATEPARTSTATS
22+
POSTHOOK: Input: default@big
23+
POSTHOOK: Input: default@big@bday=20260410
24+
POSTHOOK: Output: default@big@bday=20260410
25+
PREHOOK: query: create table small (id int, val string)
26+
PREHOOK: type: CREATETABLE
27+
PREHOOK: Output: database:default
28+
PREHOOK: Output: default@small
29+
POSTHOOK: query: create table small (id int, val string)
30+
POSTHOOK: type: CREATETABLE
31+
POSTHOOK: Output: database:default
32+
POSTHOOK: Output: default@small
33+
PREHOOK: query: alter table small update statistics set ('numRows' = '1000')
34+
PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
35+
PREHOOK: Input: default@small
36+
PREHOOK: Output: default@small
37+
POSTHOOK: query: alter table small update statistics set ('numRows' = '1000')
38+
POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
39+
POSTHOOK: Input: default@small
40+
POSTHOOK: Output: default@small
41+
PREHOOK: query: explain select big.val, small.val from big join small on big.id = small.id
42+
PREHOOK: type: QUERY
43+
PREHOOK: Input: default@big
44+
PREHOOK: Input: default@big@bday=20260410
45+
PREHOOK: Input: default@small
46+
#### A masked pattern was here ####
47+
POSTHOOK: query: explain select big.val, small.val from big join small on big.id = small.id
48+
POSTHOOK: type: QUERY
49+
POSTHOOK: Input: default@big
50+
POSTHOOK: Input: default@big@bday=20260410
51+
POSTHOOK: Input: default@small
52+
#### A masked pattern was here ####
53+
STAGE DEPENDENCIES:
54+
Stage-1 is a root stage
55+
Stage-0 depends on stages: Stage-1
56+
57+
STAGE PLANS:
58+
Stage: Stage-1
59+
Tez
60+
#### A masked pattern was here ####
61+
Edges:
62+
Map 1 <- Reducer 4 (BROADCAST_EDGE)
63+
Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE)
64+
Reducer 4 <- Map 3 (CUSTOM_SIMPLE_EDGE)
65+
#### A masked pattern was here ####
66+
Vertices:
67+
Map 1
68+
Map Operator Tree:
69+
TableScan
70+
alias: big
71+
filterExpr: (id is not null and id BETWEEN DynamicValue(RS_7_small_id_min) AND DynamicValue(RS_7_small_id_max) and in_bloom_filter(id, DynamicValue(RS_7_small_id_bloom_filter))) (type: boolean)
72+
Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE
73+
Filter Operator
74+
predicate: (id is not null and id BETWEEN DynamicValue(RS_7_small_id_min) AND DynamicValue(RS_7_small_id_max) and in_bloom_filter(id, DynamicValue(RS_7_small_id_bloom_filter))) (type: boolean)
75+
Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE
76+
Select Operator
77+
expressions: id (type: int), val (type: string)
78+
outputColumnNames: _col0, _col1
79+
Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE
80+
Reduce Output Operator
81+
key expressions: _col0 (type: int)
82+
null sort order: z
83+
sort order: +
84+
Map-reduce partition columns: _col0 (type: int)
85+
Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE
86+
value expressions: _col1 (type: string)
87+
Execution mode: vectorized, llap
88+
LLAP IO: all inputs
89+
Map 3
90+
Map Operator Tree:
91+
TableScan
92+
alias: small
93+
filterExpr: id is not null (type: boolean)
94+
Statistics: Num rows: 1000 Data size: 0 Basic stats: PARTIAL Column stats: NONE
95+
Filter Operator
96+
predicate: id is not null (type: boolean)
97+
Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
98+
Select Operator
99+
expressions: id (type: int), val (type: string)
100+
outputColumnNames: _col0, _col1
101+
Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
102+
Reduce Output Operator
103+
key expressions: _col0 (type: int)
104+
null sort order: z
105+
sort order: +
106+
Map-reduce partition columns: _col0 (type: int)
107+
Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
108+
value expressions: _col1 (type: string)
109+
Select Operator
110+
expressions: _col0 (type: int)
111+
outputColumnNames: _col0
112+
Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
113+
Group By Operator
114+
aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1000000)
115+
minReductionHashAggr: 0.99
116+
mode: hash
117+
outputColumnNames: _col0, _col1, _col2
118+
Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE
119+
Reduce Output Operator
120+
null sort order:
121+
sort order:
122+
Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE
123+
value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
124+
Execution mode: vectorized, llap
125+
LLAP IO: all inputs
126+
Reducer 2
127+
Execution mode: llap
128+
Reduce Operator Tree:
129+
Merge Join Operator
130+
condition map:
131+
Inner Join 0 to 1
132+
keys:
133+
0 _col0 (type: int)
134+
1 _col0 (type: int)
135+
outputColumnNames: _col1, _col3
136+
Statistics: Num rows: 1100000023 Data size: 325600007057 Basic stats: PARTIAL Column stats: NONE
137+
Select Operator
138+
expressions: _col1 (type: string), _col3 (type: string)
139+
outputColumnNames: _col0, _col1
140+
Statistics: Num rows: 1100000023 Data size: 325600007057 Basic stats: PARTIAL Column stats: NONE
141+
File Output Operator
142+
compressed: false
143+
Statistics: Num rows: 1100000023 Data size: 325600007057 Basic stats: PARTIAL Column stats: NONE
144+
table:
145+
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
146+
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
147+
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
148+
Reducer 4
149+
Execution mode: vectorized, llap
150+
Reduce Operator Tree:
151+
Group By Operator
152+
aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, 1, expectedEntries=1000000)
153+
mode: final
154+
outputColumnNames: _col0, _col1, _col2
155+
Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE
156+
Reduce Output Operator
157+
null sort order:
158+
sort order:
159+
Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE
160+
value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
161+
162+
Stage: Stage-0
163+
Fetch Operator
164+
limit: -1
165+
Processor Tree:
166+
ListSink
167+

0 commit comments

Comments
 (0)