2929import java .util .LinkedHashMap ;
3030import java .util .Map ;
3131import java .util .concurrent .ConcurrentHashMap ;
32+ import java .util .concurrent .atomic .AtomicBoolean ;
3233import java .util .concurrent .ExecutorService ;
3334import java .util .concurrent .Executors ;
3435import java .util .concurrent .Future ;
3536import java .util .concurrent .TimeUnit ;
3637import java .util .concurrent .TimeoutException ;
38+ import java .util .concurrent .ExecutionException ;
3739import java .util .function .Supplier ;
3840import jakarta .annotation .PreDestroy ;
3941import javax .sql .DataSource ;
@@ -72,7 +74,6 @@ public class HealthService {
7274 private static final long RESPONSE_TIME_THRESHOLD_MS = 2000 ;
7375
7476 private static final String DIAGNOSTIC_LOCK_WAIT = "MYSQL_LOCK_WAIT" ;
75- private static final String DIAGNOSTIC_DEADLOCK = "MYSQL_DEADLOCK" ;
7677 private static final String DIAGNOSTIC_SLOW_QUERIES = "MYSQL_SLOW_QUERIES" ;
7778 private static final String DIAGNOSTIC_POOL_EXHAUSTED = "MYSQL_POOL_EXHAUSTED" ;
7879 private static final String DIAGNOSTIC_LOG_TEMPLATE = "Diagnostic: {}" ;
@@ -84,14 +85,15 @@ public class HealthService {
8485 private volatile long lastAdvancedCheckTime = 0 ;
8586 private volatile AdvancedCheckResult cachedAdvancedCheckResult = null ;
8687 private final ReentrantReadWriteLock advancedCheckLock = new ReentrantReadWriteLock ();
88+ private final AtomicBoolean advancedCheckInProgress = new AtomicBoolean (false );
8789
8890 private static final boolean ADVANCED_HEALTH_CHECKS_ENABLED = true ;
8991
9092 public HealthService (DataSource dataSource ,
9193 @ Autowired (required = false ) RedisTemplate <String , Object > redisTemplate ) {
9294 this .dataSource = dataSource ;
9395 this .redisTemplate = redisTemplate ;
94- this .executorService = Executors .newFixedThreadPool (2 );
96+ this .executorService = Executors .newFixedThreadPool (6 );
9597 }
9698
9799 @ PreDestroy
@@ -118,32 +120,8 @@ public Map<String, Object> checkHealth() {
118120 Map <String , Object > mysqlStatus = new ConcurrentHashMap <>();
119121 Map <String , Object > redisStatus = new ConcurrentHashMap <>();
120122
121- Future <?> mysqlFuture = executorService .submit (
122- () -> performHealthCheck ("MySQL" , mysqlStatus , this ::checkMySQLHealthSync ));
123- Future <?> redisFuture = executorService .submit (
124- () -> performHealthCheck ("Redis" , redisStatus , this ::checkRedisHealthSync ));
125-
126- long maxTimeout = Math .max (MYSQL_TIMEOUT_SECONDS , REDIS_TIMEOUT_SECONDS ) + 1 ;
127- long deadlineNs = System .nanoTime () + TimeUnit .SECONDS .toNanos (maxTimeout );
128- try {
129- mysqlFuture .get (maxTimeout , TimeUnit .SECONDS );
130- long remainingNs = deadlineNs - System .nanoTime ();
131- if (remainingNs > 0 ) {
132- redisFuture .get (remainingNs , TimeUnit .NANOSECONDS );
133- } else {
134- redisFuture .cancel (true );
135- }
136- } catch (TimeoutException e ) {
137- logger .warn ("Health check aggregate timeout after {} seconds" , maxTimeout );
138- mysqlFuture .cancel (true );
139- redisFuture .cancel (true );
140- } catch (InterruptedException e ) {
141- Thread .currentThread ().interrupt ();
142- logger .warn ("Health check was interrupted" );
143- mysqlFuture .cancel (true );
144- redisFuture .cancel (true );
145- } catch (Exception e ) {
146- logger .warn ("Health check execution error: {}" , e .getMessage ());
123+ if (!executorService .isShutdown ()) {
124+ performHealthChecks (mysqlStatus , redisStatus );
147125 }
148126
149127 ensurePopulated (mysqlStatus , "MySQL" );
@@ -154,13 +132,56 @@ public Map<String, Object> checkHealth() {
154132 components .put ("redis" , redisStatus );
155133
156134 response .put ("components" , components );
157-
158- String overallStatus = computeOverallStatus (components );
159- response .put (STATUS_KEY , overallStatus );
135+ response .put (STATUS_KEY , computeOverallStatus (components ));
160136
161137 return response ;
162138 }
163139
140+ private void performHealthChecks (Map <String , Object > mysqlStatus , Map <String , Object > redisStatus ) {
141+ Future <?> mysqlFuture = null ;
142+ Future <?> redisFuture = null ;
143+ try {
144+ mysqlFuture = executorService .submit (
145+ () -> performHealthCheck ("MySQL" , mysqlStatus , this ::checkMySQLHealthSync ));
146+ redisFuture = executorService .submit (
147+ () -> performHealthCheck ("Redis" , redisStatus , this ::checkRedisHealthSync ));
148+
149+ awaitHealthChecks (mysqlFuture , redisFuture );
150+ } catch (TimeoutException e ) {
151+ logger .warn ("Health check aggregate timeout after {} seconds" , getMaxTimeout ());
152+ cancelFutures (mysqlFuture , redisFuture );
153+ } catch (InterruptedException e ) {
154+ Thread .currentThread ().interrupt ();
155+ logger .warn ("Health check was interrupted" );
156+ cancelFutures (mysqlFuture , redisFuture );
157+ } catch (Exception e ) {
158+ logger .warn ("Health check execution error: {}" , e .getMessage ());
159+ }
160+ }
161+
162+ private void awaitHealthChecks (Future <?> mysqlFuture , Future <?> redisFuture ) throws TimeoutException , InterruptedException , ExecutionException {
163+ long maxTimeout = getMaxTimeout ();
164+ long deadlineNs = System .nanoTime () + TimeUnit .SECONDS .toNanos (maxTimeout );
165+
166+ mysqlFuture .get (maxTimeout , TimeUnit .SECONDS );
167+ long remainingNs = deadlineNs - System .nanoTime ();
168+
169+ if (remainingNs > 0 ) {
170+ redisFuture .get (remainingNs , TimeUnit .NANOSECONDS );
171+ } else {
172+ redisFuture .cancel (true );
173+ }
174+ }
175+
176+ private long getMaxTimeout () {
177+ return Math .max (MYSQL_TIMEOUT_SECONDS , REDIS_TIMEOUT_SECONDS ) + 1 ;
178+ }
179+
180+ private void cancelFutures (Future <?> mysqlFuture , Future <?> redisFuture ) {
181+ if (mysqlFuture != null ) mysqlFuture .cancel (true );
182+ if (redisFuture != null ) redisFuture .cancel (true );
183+ }
184+
164185 private void ensurePopulated (Map <String , Object > status , String componentName ) {
165186 if (!status .containsKey (STATUS_KEY )) {
166187 status .put (STATUS_KEY , STATUS_DOWN );
@@ -319,26 +340,40 @@ private boolean performAdvancedMySQLChecksWithThrottle() {
319340 advancedCheckLock .readLock ().unlock ();
320341 }
321342
322- advancedCheckLock .writeLock ().lock ();
343+ // Only one thread may submit; others fall back to the (stale) cache
344+ if (!advancedCheckInProgress .compareAndSet (false , true )) {
345+ advancedCheckLock .readLock ().lock ();
346+ try {
347+ return cachedAdvancedCheckResult != null && cachedAdvancedCheckResult .isDegraded ;
348+ } finally {
349+ advancedCheckLock .readLock ().unlock ();
350+ }
351+ }
352+
323353 try {
324- if (cachedAdvancedCheckResult != null &&
325- (currentTime - lastAdvancedCheckTime ) < ADVANCED_CHECKS_THROTTLE_SECONDS * 1000 ) {
326- return cachedAdvancedCheckResult .isDegraded ;
354+ // Perform DB I/O outside the write lock to avoid lock contention
355+ AdvancedCheckResult result ;
356+ try (Connection connection = dataSource .getConnection ()) {
357+ result = performAdvancedMySQLChecks (connection );
358+ } catch (Exception e ) {
359+ if (e .getCause () instanceof InterruptedException ) {
360+ Thread .currentThread ().interrupt ();
361+ }
362+ logger .debug ("Failed to get connection for advanced checks: {}" , e .getMessage ());
363+ result = new AdvancedCheckResult (false );
327364 }
328365
329- // Acquire a fresh connection for advanced checks to avoid pool exhaustion
330- try ( Connection connection = dataSource . getConnection ()) {
331- AdvancedCheckResult result = performAdvancedMySQLChecks ( connection );
366+ // Re-acquire write lock only to update the cache atomically
367+ advancedCheckLock . writeLock (). lock ();
368+ try {
332369 lastAdvancedCheckTime = currentTime ;
333370 cachedAdvancedCheckResult = result ;
334371 return result .isDegraded ;
372+ } finally {
373+ advancedCheckLock .writeLock ().unlock ();
335374 }
336- } catch (Exception e ) {
337- logger .debug ("Failed to get connection for advanced checks: {}" , e .getMessage ());
338- // Return cached result or false if no cache
339- return cachedAdvancedCheckResult != null && cachedAdvancedCheckResult .isDegraded ;
340375 } finally {
341- advancedCheckLock . writeLock (). unlock ( );
376+ advancedCheckInProgress . set ( false );
342377 }
343378 }
344379
@@ -392,7 +427,7 @@ private boolean hasLockWaits(Connection connection) {
392427 private boolean hasSlowQueries (Connection connection ) {
393428 try (PreparedStatement stmt = connection .prepareStatement (
394429 "SELECT COUNT(*) FROM INFORMATION_SCHEMA.PROCESSLIST " +
395- "WHERE command != 'Sleep' AND time > ? AND user NOT IN ('event_scheduler' , 'system user' )" )) {
430+ "WHERE command != 'Sleep' AND time > ? AND user = SUBSTRING_INDEX(USER() , '@', 1 )" )) {
396431 stmt .setQueryTimeout (2 );
397432 stmt .setInt (1 , 10 );
398433 try (ResultSet rs = stmt .executeQuery ()) {
0 commit comments