Skip to content

Commit 5c82af1

Browse files
fix(health): avoid blocking DB I/O under write lock and restore interrupt flag
1 parent 5be92a5 commit 5c82af1

1 file changed

Lines changed: 79 additions & 44 deletions

File tree

src/main/java/com/iemr/admin/service/health/HealthService.java

Lines changed: 79 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,13 @@
2929
import java.util.LinkedHashMap;
3030
import java.util.Map;
3131
import java.util.concurrent.ConcurrentHashMap;
32+
import java.util.concurrent.atomic.AtomicBoolean;
3233
import java.util.concurrent.ExecutorService;
3334
import java.util.concurrent.Executors;
3435
import java.util.concurrent.Future;
3536
import java.util.concurrent.TimeUnit;
3637
import java.util.concurrent.TimeoutException;
38+
import java.util.concurrent.ExecutionException;
3739
import java.util.function.Supplier;
3840
import jakarta.annotation.PreDestroy;
3941
import javax.sql.DataSource;
@@ -72,7 +74,6 @@ public class HealthService {
7274
private static final long RESPONSE_TIME_THRESHOLD_MS = 2000;
7375

7476
private static final String DIAGNOSTIC_LOCK_WAIT = "MYSQL_LOCK_WAIT";
75-
private static final String DIAGNOSTIC_DEADLOCK = "MYSQL_DEADLOCK";
7677
private static final String DIAGNOSTIC_SLOW_QUERIES = "MYSQL_SLOW_QUERIES";
7778
private static final String DIAGNOSTIC_POOL_EXHAUSTED = "MYSQL_POOL_EXHAUSTED";
7879
private static final String DIAGNOSTIC_LOG_TEMPLATE = "Diagnostic: {}";
@@ -84,14 +85,15 @@ public class HealthService {
8485
private volatile long lastAdvancedCheckTime = 0;
8586
private volatile AdvancedCheckResult cachedAdvancedCheckResult = null;
8687
private final ReentrantReadWriteLock advancedCheckLock = new ReentrantReadWriteLock();
88+
private final AtomicBoolean advancedCheckInProgress = new AtomicBoolean(false);
8789

8890
private static final boolean ADVANCED_HEALTH_CHECKS_ENABLED = true;
8991

9092
public HealthService(DataSource dataSource,
9193
@Autowired(required = false) RedisTemplate<String, Object> redisTemplate) {
9294
this.dataSource = dataSource;
9395
this.redisTemplate = redisTemplate;
94-
this.executorService = Executors.newFixedThreadPool(2);
96+
this.executorService = Executors.newFixedThreadPool(6);
9597
}
9698

9799
@PreDestroy
@@ -118,32 +120,8 @@ public Map<String, Object> checkHealth() {
118120
Map<String, Object> mysqlStatus = new ConcurrentHashMap<>();
119121
Map<String, Object> redisStatus = new ConcurrentHashMap<>();
120122

121-
Future<?> mysqlFuture = executorService.submit(
122-
() -> performHealthCheck("MySQL", mysqlStatus, this::checkMySQLHealthSync));
123-
Future<?> redisFuture = executorService.submit(
124-
() -> performHealthCheck("Redis", redisStatus, this::checkRedisHealthSync));
125-
126-
long maxTimeout = Math.max(MYSQL_TIMEOUT_SECONDS, REDIS_TIMEOUT_SECONDS) + 1;
127-
long deadlineNs = System.nanoTime() + TimeUnit.SECONDS.toNanos(maxTimeout);
128-
try {
129-
mysqlFuture.get(maxTimeout, TimeUnit.SECONDS);
130-
long remainingNs = deadlineNs - System.nanoTime();
131-
if (remainingNs > 0) {
132-
redisFuture.get(remainingNs, TimeUnit.NANOSECONDS);
133-
} else {
134-
redisFuture.cancel(true);
135-
}
136-
} catch (TimeoutException e) {
137-
logger.warn("Health check aggregate timeout after {} seconds", maxTimeout);
138-
mysqlFuture.cancel(true);
139-
redisFuture.cancel(true);
140-
} catch (InterruptedException e) {
141-
Thread.currentThread().interrupt();
142-
logger.warn("Health check was interrupted");
143-
mysqlFuture.cancel(true);
144-
redisFuture.cancel(true);
145-
} catch (Exception e) {
146-
logger.warn("Health check execution error: {}", e.getMessage());
123+
if (!executorService.isShutdown()) {
124+
performHealthChecks(mysqlStatus, redisStatus);
147125
}
148126

149127
ensurePopulated(mysqlStatus, "MySQL");
@@ -154,13 +132,56 @@ public Map<String, Object> checkHealth() {
154132
components.put("redis", redisStatus);
155133

156134
response.put("components", components);
157-
158-
String overallStatus = computeOverallStatus(components);
159-
response.put(STATUS_KEY, overallStatus);
135+
response.put(STATUS_KEY, computeOverallStatus(components));
160136

161137
return response;
162138
}
163139

140+
private void performHealthChecks(Map<String, Object> mysqlStatus, Map<String, Object> redisStatus) {
141+
Future<?> mysqlFuture = null;
142+
Future<?> redisFuture = null;
143+
try {
144+
mysqlFuture = executorService.submit(
145+
() -> performHealthCheck("MySQL", mysqlStatus, this::checkMySQLHealthSync));
146+
redisFuture = executorService.submit(
147+
() -> performHealthCheck("Redis", redisStatus, this::checkRedisHealthSync));
148+
149+
awaitHealthChecks(mysqlFuture, redisFuture);
150+
} catch (TimeoutException e) {
151+
logger.warn("Health check aggregate timeout after {} seconds", getMaxTimeout());
152+
cancelFutures(mysqlFuture, redisFuture);
153+
} catch (InterruptedException e) {
154+
Thread.currentThread().interrupt();
155+
logger.warn("Health check was interrupted");
156+
cancelFutures(mysqlFuture, redisFuture);
157+
} catch (Exception e) {
158+
logger.warn("Health check execution error: {}", e.getMessage());
159+
}
160+
}
161+
162+
private void awaitHealthChecks(Future<?> mysqlFuture, Future<?> redisFuture) throws TimeoutException, InterruptedException, ExecutionException {
163+
long maxTimeout = getMaxTimeout();
164+
long deadlineNs = System.nanoTime() + TimeUnit.SECONDS.toNanos(maxTimeout);
165+
166+
mysqlFuture.get(maxTimeout, TimeUnit.SECONDS);
167+
long remainingNs = deadlineNs - System.nanoTime();
168+
169+
if (remainingNs > 0) {
170+
redisFuture.get(remainingNs, TimeUnit.NANOSECONDS);
171+
} else {
172+
redisFuture.cancel(true);
173+
}
174+
}
175+
176+
private long getMaxTimeout() {
177+
return Math.max(MYSQL_TIMEOUT_SECONDS, REDIS_TIMEOUT_SECONDS) + 1;
178+
}
179+
180+
private void cancelFutures(Future<?> mysqlFuture, Future<?> redisFuture) {
181+
if (mysqlFuture != null) mysqlFuture.cancel(true);
182+
if (redisFuture != null) redisFuture.cancel(true);
183+
}
184+
164185
private void ensurePopulated(Map<String, Object> status, String componentName) {
165186
if (!status.containsKey(STATUS_KEY)) {
166187
status.put(STATUS_KEY, STATUS_DOWN);
@@ -319,26 +340,40 @@ private boolean performAdvancedMySQLChecksWithThrottle() {
319340
advancedCheckLock.readLock().unlock();
320341
}
321342

322-
advancedCheckLock.writeLock().lock();
343+
// Only one thread may submit; others fall back to the (stale) cache
344+
if (!advancedCheckInProgress.compareAndSet(false, true)) {
345+
advancedCheckLock.readLock().lock();
346+
try {
347+
return cachedAdvancedCheckResult != null && cachedAdvancedCheckResult.isDegraded;
348+
} finally {
349+
advancedCheckLock.readLock().unlock();
350+
}
351+
}
352+
323353
try {
324-
if (cachedAdvancedCheckResult != null &&
325-
(currentTime - lastAdvancedCheckTime) < ADVANCED_CHECKS_THROTTLE_SECONDS * 1000) {
326-
return cachedAdvancedCheckResult.isDegraded;
354+
// Perform DB I/O outside the write lock to avoid lock contention
355+
AdvancedCheckResult result;
356+
try (Connection connection = dataSource.getConnection()) {
357+
result = performAdvancedMySQLChecks(connection);
358+
} catch (Exception e) {
359+
if (e.getCause() instanceof InterruptedException) {
360+
Thread.currentThread().interrupt();
361+
}
362+
logger.debug("Failed to get connection for advanced checks: {}", e.getMessage());
363+
result = new AdvancedCheckResult(false);
327364
}
328365

329-
// Acquire a fresh connection for advanced checks to avoid pool exhaustion
330-
try (Connection connection = dataSource.getConnection()) {
331-
AdvancedCheckResult result = performAdvancedMySQLChecks(connection);
366+
// Re-acquire write lock only to update the cache atomically
367+
advancedCheckLock.writeLock().lock();
368+
try {
332369
lastAdvancedCheckTime = currentTime;
333370
cachedAdvancedCheckResult = result;
334371
return result.isDegraded;
372+
} finally {
373+
advancedCheckLock.writeLock().unlock();
335374
}
336-
} catch (Exception e) {
337-
logger.debug("Failed to get connection for advanced checks: {}", e.getMessage());
338-
// Return cached result or false if no cache
339-
return cachedAdvancedCheckResult != null && cachedAdvancedCheckResult.isDegraded;
340375
} finally {
341-
advancedCheckLock.writeLock().unlock();
376+
advancedCheckInProgress.set(false);
342377
}
343378
}
344379

@@ -392,7 +427,7 @@ private boolean hasLockWaits(Connection connection) {
392427
private boolean hasSlowQueries(Connection connection) {
393428
try (PreparedStatement stmt = connection.prepareStatement(
394429
"SELECT COUNT(*) FROM INFORMATION_SCHEMA.PROCESSLIST " +
395-
"WHERE command != 'Sleep' AND time > ? AND user NOT IN ('event_scheduler', 'system user')")) {
430+
"WHERE command != 'Sleep' AND time > ? AND user = SUBSTRING_INDEX(USER(), '@', 1)")) {
396431
stmt.setQueryTimeout(2);
397432
stmt.setInt(1, 10);
398433
try (ResultSet rs = stmt.executeQuery()) {

0 commit comments

Comments
 (0)