Skip to content

Commit da9c253

Browse files
authored
HBASE-30073 Test fixes for some flappers and a reproducible error (#8057)
Signed-off by: Charles Connell <cconnell@apache.org>
1 parent 7bb4a57 commit da9c253

12 files changed

Lines changed: 153 additions & 67 deletions

File tree

hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -581,7 +581,6 @@ protected List<RegionPlan> balanceTable(TableName tableName,
581581
rackManager, regionCacheRatioOnOldServerMap);
582582

583583
long startTime = EnvironmentEdgeManager.currentTime();
584-
cluster.setStopRequestedAt(startTime + maxRunningTime);
585584

586585
initCosts(cluster);
587586
balancerConditionals.loadClusterState(cluster);
@@ -632,6 +631,10 @@ protected List<RegionPlan> balanceTable(TableName tableName,
632631
currentCost / sumMultiplier, functionCost(), computedMaxSteps);
633632

634633
final String initFunctionTotalCosts = totalCostsPerFunc();
634+
long searchStartTime = EnvironmentEdgeManager.currentTime();
635+
// Budget maxRunningTime for the stochastic walk only; initialization (cluster costs, etc.)
636+
// can be substantial on busy hosts and must not consume the search deadline.
637+
cluster.setStopRequestedAt(searchStartTime + maxRunningTime);
635638
// Perform a stochastic walk to see if we can get a good fit.
636639
long step;
637640
boolean planImprovedConditionals = false;

hbase-common/src/test/java/org/apache/hadoop/hbase/HBaseJupiterExtension.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ public class HBaseJupiterExtension implements InvocationInterceptor, BeforeAllCa
8484

8585
private static final Map<String, Duration> TAG_TO_TIMEOUT =
8686
ImmutableMap.of(SmallTests.TAG, Duration.ofMinutes(3), MediumTests.TAG, Duration.ofMinutes(6),
87-
LargeTests.TAG, Duration.ofMinutes(13), IntegrationTests.TAG, Duration.ZERO);
87+
LargeTests.TAG, Duration.ofMinutes(20), IntegrationTests.TAG, Duration.ZERO);
8888

8989
private static final String EXECUTOR = "executor";
9090

hbase-compression/pom.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@
4545
<artifactId>hbase-resource-bundle</artifactId>
4646
<optional>true</optional>
4747
</dependency>
48+
<dependency>
49+
<groupId>org.junit.jupiter</groupId>
50+
<artifactId>junit-jupiter-api</artifactId>
51+
<scope>test</scope>
52+
</dependency>
4853
</dependencies>
4954

5055
<build>

hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtil.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -434,7 +434,13 @@ private void createSubDirAndSystemProperty(String propertyName, Path parent, Str
434434

435435
String sysValue = System.getProperty(propertyName);
436436

437-
if (sysValue != null) {
437+
// Check if directory sharing should be disabled for this test.
438+
// Tests that run with high parallelism and don't need shared directories can set this
439+
// to avoid race conditions where one test's tearDown() deletes directories another test
440+
// is still using.
441+
boolean disableSharing = conf.getBoolean("hbase.test.disable-directory-sharing", false);
442+
443+
if (sysValue != null && !disableSharing) {
438444
// There is already a value set. So we do nothing but hope
439445
// that there will be no conflicts
440446
LOG.info("System.getProperty(\"" + propertyName + "\") already set to: " + sysValue
@@ -447,7 +453,7 @@ private void createSubDirAndSystemProperty(String propertyName, Path parent, Str
447453
}
448454
conf.set(propertyName, sysValue);
449455
} else {
450-
// Ok, it's not set, so we create it as a subdirectory
456+
// Ok, it's not set (or sharing is disabled), so we create it as a subdirectory
451457
createSubDir(propertyName, parent, subDirName);
452458
System.setProperty(propertyName, conf.get(propertyName));
453459
}

hbase-server/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@ public static void setUpBeforeClass() throws Exception {
7575
conf.setInt(HConstants.ZK_SESSION_TIMEOUT, 1000);
7676
conf.setClass(HConstants.HBASE_MASTER_LOADBALANCER_CLASS, MockLoadBalancer.class,
7777
LoadBalancer.class);
78-
TEST_UTIL.startMiniDFSCluster(2);
7978
}
8079

8180
@AfterAll

hbase-server/src/test/java/org/apache/hadoop/hbase/client/AbstractTestAsyncTableScan.java

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,19 @@
7373
public abstract class AbstractTestAsyncTableScan {
7474

7575
protected static final OpenTelemetryClassRule OTEL_CLASS_RULE = OpenTelemetryClassRule.create();
76-
protected static final MiniClusterRule MINI_CLUSTER_RULE = MiniClusterRule.newBuilder()
77-
.setMiniClusterOption(StartTestingClusterOption.builder().numWorkers(3).build()).build();
76+
77+
private static Configuration createConfiguration() {
78+
Configuration conf = new Configuration();
79+
// Disable directory sharing to prevent race conditions when tests run in parallel.
80+
// Each test instance gets its own isolated directories to avoid one test's tearDown()
81+
// deleting directories another parallel test is still using.
82+
conf.setBoolean("hbase.test.disable-directory-sharing", true);
83+
return conf;
84+
}
85+
86+
protected static final MiniClusterRule MINI_CLUSTER_RULE =
87+
MiniClusterRule.newBuilder().setConfiguration(createConfiguration())
88+
.setMiniClusterOption(StartTestingClusterOption.builder().numWorkers(3).build()).build();
7889

7990
protected static final ConnectionRule CONN_RULE =
8091
ConnectionRule.createAsyncConnectionRule(MINI_CLUSTER_RULE::createAsyncConnection);

hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnection.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,10 @@ public class TestConnection {
8787
@BeforeClass
8888
public static void setUpBeforeClass() throws Exception {
8989
ResourceLeakDetector.setLevel(Level.PARANOID);
90+
// Disable directory sharing to prevent race conditions when tests run in parallel.
91+
// Each test instance gets its own isolated directories to avoid one test's tearDown()
92+
// deleting directories another parallel test is still using.
93+
TEST_UTIL.getConfiguration().setBoolean("hbase.test.disable-directory-sharing", true);
9094
TEST_UTIL.getConfiguration().setBoolean(HConstants.STATUS_PUBLISHED, true);
9195
// Up the handlers; this test needs more than usual.
9296
TEST_UTIL.getConfiguration().setInt(HConstants.REGION_SERVER_HIGH_PRIORITY_HANDLER_COUNT, 10);

hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/bucket/TestPrefetchWithBucketCache.java

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
import static org.apache.hadoop.hbase.io.hfile.BlockCacheFactory.BUCKET_CACHE_BUCKETS_KEY;
2323
import static org.apache.hadoop.hbase.io.hfile.bucket.BucketCache.QUEUE_ADDITION_WAIT_TIME;
2424
import static org.junit.Assert.assertEquals;
25-
import static org.junit.Assert.assertNotEquals;
2625
import static org.junit.Assert.assertNotNull;
2726
import static org.junit.Assert.assertNull;
2827
import static org.junit.Assert.assertTrue;
@@ -328,7 +327,8 @@ public void testPrefetchRunTriggersEvictions() throws Exception {
328327
conf.setLong(QUEUE_ADDITION_WAIT_TIME, 0);
329328
blockCache = BlockCacheFactory.createBlockCache(conf);
330329
cacheConf = new CacheConfig(conf, blockCache);
331-
Path storeFile = writeStoreFile("testPrefetchInterruptOnCapacity", 10000);
330+
// Use 15000 KVs to ensure file reliably exceeds 1MB cache capacity even with size variance
331+
Path storeFile = writeStoreFile("testPrefetchRunTriggersEvictions", 15000);
332332
// Prefetches the file blocks
333333
createReaderAndWaitForPrefetchInterruption(storeFile);
334334
Waiter.waitFor(conf, (PrefetchExecutor.getPrefetchDelay() + 1000),
@@ -343,14 +343,16 @@ public void testPrefetchRunTriggersEvictions() throws Exception {
343343
}
344344
return true;
345345
});
346-
if (bc.getStats().getFailedInserts() == 0) {
347-
// With no wait time configuration, prefetch should trigger evictions once it reaches
348-
// cache capacity
349-
assertNotEquals(0, bc.getStats().getEvictedCount());
350-
} else {
351-
LOG.info("We had {} cache insert failures, which may cause cache usage "
352-
+ "to never reach capacity.", bc.getStats().getFailedInserts());
353-
}
346+
// With no wait time configuration, prefetch will either trigger evictions when reaching
347+
// cache capacity, or have failed inserts when the writer queue fills faster than it drains.
348+
// Both outcomes are valid - test should only fail if NEITHER happens, which would indicate
349+
// a problem with the capacity management logic.
350+
long evictions = bc.getStats().getEvictedCount();
351+
long failedInserts = bc.getStats().getFailedInserts();
352+
assertTrue(
353+
"Expected either evictions or failed inserts to demonstrate capacity management, "
354+
+ "but got evictions=" + evictions + ", failedInserts=" + failedInserts,
355+
evictions > 0 || failedInserts > 0);
354356
}
355357

356358
@Test

hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRollbackSCP.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,11 @@ public static void tearDownAfterClass() throws IOException {
133133
@BeforeEach
134134
public void setUp() throws IOException {
135135
UTIL.ensureSomeNonStoppedRegionServersAvailable(2);
136+
// Surefire reruns failed tests in the same JVM without re-running @BeforeClass. Reset injection
137+
// state so compareAndSet in persistToMeta can succeed again and kill-before-store flags clear.
138+
INJECTED.set(false);
139+
ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdateInRollback(
140+
UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(), false);
136141
}
137142

138143
private ServerCrashProcedure getSCPForServer(ServerName serverName) throws IOException {

hbase-server/src/test/java/org/apache/hadoop/hbase/quotas/TestBlockBytesScannedQuota.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ public void testBBSMultiGet() throws Exception {
260260

261261
private void testTraffic(Callable<Long> trafficCallable, long expectedSuccess, long marginOfError)
262262
throws Exception {
263-
TEST_UTIL.waitFor(5_000, () -> {
263+
TEST_UTIL.waitFor(30_000, () -> {
264264
long actualSuccess;
265265
try {
266266
actualSuccess = trafficCallable.call();

0 commit comments

Comments
 (0)