From 19e3546724e4f4d0aa5c61754f4d3194ec71e293 Mon Sep 17 00:00:00 2001 From: Mehul Gohil Date: Wed, 17 Jun 2026 11:39:41 +0530 Subject: [PATCH] Bound sitemap preload traversal --- src/Modules/Cache/PageCache.php | 33 +++++- tests/bootstrap.php | 81 +++++++++++++++ tests/unit-tests/tests-page-cache.php | 143 +++++++++++++++++++++++++- 3 files changed, 252 insertions(+), 5 deletions(-) diff --git a/src/Modules/Cache/PageCache.php b/src/Modules/Cache/PageCache.php index 168de96..be542da 100644 --- a/src/Modules/Cache/PageCache.php +++ b/src/Modules/Cache/PageCache.php @@ -74,6 +74,20 @@ class PageCache implements ModuleInterface { */ private $url_normalizer = null; + /** + * Maximum number of child sitemap documents to fetch per seed run. + * + * @var int + */ + private $preload_sitemap_child_limit = 20; + + /** + * Maximum number of unique sitemap URLs to collect per seed run. + * + * @var int + */ + private $preload_sitemap_url_cap = 500; + /** * Determine whether this module should be loaded. * @@ -656,9 +670,17 @@ private function fetch_urls_from_sitemap() { return []; } - $urls = []; + $urls = []; + $url_cap = max( 1, (int) $this->preload_sitemap_url_cap ); + $child_sitemap_limit = max( 1, (int) $this->preload_sitemap_child_limit ); + $child_sitemaps_fetched = 0; + if ( isset( $index->sitemap ) ) { foreach ( $index->sitemap as $sitemap ) { + if ( count( $urls ) >= $url_cap || $child_sitemaps_fetched >= $child_sitemap_limit ) { + break; + } + if ( empty( $sitemap->loc ) ) { continue; } @@ -668,6 +690,7 @@ private function fetch_urls_from_sitemap() { continue; } + ++$child_sitemaps_fetched; $child_response = wp_remote_get( $child_sitemap_url, [ 'timeout' => 8 ] ); if ( is_wp_error( $child_response ) ) { continue; @@ -684,17 +707,21 @@ private function fetch_urls_from_sitemap() { } foreach ( $child->url as $item ) { + if ( count( $urls ) >= $url_cap ) { + break; + } + if ( ! empty( $item->loc ) ) { $item_url = esc_url_raw( (string) $item->loc ); if ( $this->is_site_url( $item_url ) ) { - $urls[] = $item_url; + $urls[ $item_url ] = true; } } } } } - return array_slice( array_values( array_unique( array_filter( $urls ) ) ), 0, 500 ); + return array_keys( $urls ); } /** diff --git a/tests/bootstrap.php b/tests/bootstrap.php index a5bc27e..87f38eb 100644 --- a/tests/bootstrap.php +++ b/tests/bootstrap.php @@ -143,6 +143,22 @@ function admin_url( $path = '' ) { } } +if ( ! function_exists( 'home_url' ) ) { + function home_url( $path = '' ) { + $base = isset( $GLOBALS['perform_test_home_url'] ) ? (string) $GLOBALS['perform_test_home_url'] : 'https://example.com'; + + if ( '' === $path ) { + return $base; + } + + if ( 0 === strpos( (string) $path, 'http://' ) || 0 === strpos( (string) $path, 'https://' ) ) { + return (string) $path; + } + + return rtrim( $base, '/' ) . '/' . ltrim( (string) $path, '/' ); + } +} + if ( ! function_exists( 'esc_url' ) ) { function esc_url( $url ) { return (string) $url; @@ -223,6 +239,71 @@ function esc_url_raw( $url ) { } } +if ( ! class_exists( 'WP_Error' ) ) { + class WP_Error { + /** + * Error code. + * + * @var string + */ + public $code = ''; + + /** + * Error message. + * + * @var string + */ + public $message = ''; + + /** + * Constructor. + * + * @param string $code Error code. + * @param string $message Error message. + */ + public function __construct( $code = '', $message = '' ) { + $this->code = (string) $code; + $this->message = (string) $message; + } + } +} + +if ( ! function_exists( 'is_wp_error' ) ) { + function is_wp_error( $thing ) { + return $thing instanceof WP_Error; + } +} + +if ( ! function_exists( 'wp_remote_get' ) ) { + function wp_remote_get( $url, $args = [] ) { + if ( ! isset( $GLOBALS['perform_test_remote_get_calls'] ) || ! is_array( $GLOBALS['perform_test_remote_get_calls'] ) ) { + $GLOBALS['perform_test_remote_get_calls'] = []; + } + + $GLOBALS['perform_test_remote_get_calls'][] = [ + 'url' => $url, + 'args' => $args, + ]; + + $responses = isset( $GLOBALS['perform_test_remote_get_map'] ) && is_array( $GLOBALS['perform_test_remote_get_map'] ) ? $GLOBALS['perform_test_remote_get_map'] : []; + if ( array_key_exists( $url, $responses ) ) { + return $responses[ $url ]; + } + + return new WP_Error( 'missing_mock', 'No mocked response registered.' ); + } +} + +if ( ! function_exists( 'wp_remote_retrieve_body' ) ) { + function wp_remote_retrieve_body( $response ) { + if ( is_array( $response ) && isset( $response['body'] ) ) { + return (string) $response['body']; + } + + return ''; + } +} + if ( ! function_exists( 'sanitize_text_field' ) ) { function sanitize_text_field( $value ) { return is_scalar( $value ) ? trim( (string) $value ) : $value; diff --git a/tests/unit-tests/tests-page-cache.php b/tests/unit-tests/tests-page-cache.php index f7b136f..82f1ca4 100644 --- a/tests/unit-tests/tests-page-cache.php +++ b/tests/unit-tests/tests-page-cache.php @@ -5,14 +5,27 @@ final class Tests_Page_Cache extends TestCase { protected function setUp(): void { - $GLOBALS['perform_test_transients'] = [ + $GLOBALS['perform_test_transients'] = [ 'perform_cache_lock_test' => 'expected-token', ]; + $GLOBALS['perform_test_options'] = []; + $GLOBALS['perform_test_filters'] = []; + $GLOBALS['perform_test_home_url'] = 'https://example.com'; + $GLOBALS['perform_test_remote_get_map'] = []; + $GLOBALS['perform_test_remote_get_calls'] = []; unset( $_SERVER['HTTP_X_PERFORM_CACHE_REGEN'] ); } protected function tearDown(): void { - unset( $GLOBALS['perform_test_transients'], $_SERVER['HTTP_X_PERFORM_CACHE_REGEN'] ); + unset( + $GLOBALS['perform_test_filters'], + $GLOBALS['perform_test_home_url'], + $GLOBALS['perform_test_options'], + $GLOBALS['perform_test_remote_get_calls'], + $GLOBALS['perform_test_remote_get_map'], + $GLOBALS['perform_test_transients'], + $_SERVER['HTTP_X_PERFORM_CACHE_REGEN'] + ); } public function test_internal_regeneration_requires_matching_lock_token() { @@ -31,4 +44,130 @@ public function test_internal_regeneration_requires_matching_lock_token() { $_SERVER['HTTP_X_PERFORM_CACHE_REGEN'] = 'expected-token'; $this->assertTrue( $is_internal_regen_request->invoke( $page_cache ) ); } + + public function test_fetch_urls_from_sitemap_limits_child_sitemap_requests_per_run() { + $page_cache = new PageCache(); + + $this->set_private_property( $page_cache, 'preload_sitemap_child_limit', 3 ); + $this->set_private_property( $page_cache, 'preload_sitemap_url_cap', 20 ); + + $GLOBALS['perform_test_remote_get_map'] = [ + 'https://example.com/wp-sitemap.xml' => [ 'body' => $this->build_sitemap_index_xml( 5 ) ], + 'https://example.com/sitemap-1.xml' => [ 'body' => $this->build_urlset_xml( '/page-1', '/page-2' ) ], + 'https://example.com/sitemap-2.xml' => [ 'body' => $this->build_urlset_xml( '/page-3', '/page-4' ) ], + 'https://example.com/sitemap-3.xml' => [ 'body' => $this->build_urlset_xml( '/page-5', '/page-6' ) ], + 'https://example.com/sitemap-4.xml' => [ 'body' => $this->build_urlset_xml( '/page-7', '/page-8' ) ], + 'https://example.com/sitemap-5.xml' => [ 'body' => $this->build_urlset_xml( '/page-9', '/page-10' ) ], + ]; + + $method = new ReflectionMethod( $page_cache, 'fetch_urls_from_sitemap' ); + $method->setAccessible( true ); + + $this->assertSame( + [ + 'https://example.com/page-1', + 'https://example.com/page-2', + 'https://example.com/page-3', + 'https://example.com/page-4', + 'https://example.com/page-5', + 'https://example.com/page-6', + ], + $method->invoke( $page_cache ) + ); + + $this->assertSame( + [ + 'https://example.com/wp-sitemap.xml', + 'https://example.com/sitemap-1.xml', + 'https://example.com/sitemap-2.xml', + 'https://example.com/sitemap-3.xml', + ], + array_column( $GLOBALS['perform_test_remote_get_calls'], 'url' ) + ); + } + + public function test_fetch_urls_from_sitemap_stops_after_reaching_url_cap() { + $page_cache = new PageCache(); + + $this->set_private_property( $page_cache, 'preload_sitemap_child_limit', 5 ); + $this->set_private_property( $page_cache, 'preload_sitemap_url_cap', 3 ); + + $GLOBALS['perform_test_remote_get_map'] = [ + 'https://example.com/wp-sitemap.xml' => [ 'body' => $this->build_sitemap_index_xml( 2 ) ], + 'https://example.com/sitemap-1.xml' => [ 'body' => $this->build_urlset_xml( '/page-1', '/page-2', '/page-2', '/page-3', '/page-4' ) ], + 'https://example.com/sitemap-2.xml' => [ 'body' => $this->build_urlset_xml( '/page-5' ) ], + ]; + + $method = new ReflectionMethod( $page_cache, 'fetch_urls_from_sitemap' ); + $method->setAccessible( true ); + + $this->assertSame( + [ + 'https://example.com/page-1', + 'https://example.com/page-2', + 'https://example.com/page-3', + ], + $method->invoke( $page_cache ) + ); + + $this->assertSame( + [ + 'https://example.com/wp-sitemap.xml', + 'https://example.com/sitemap-1.xml', + ], + array_column( $GLOBALS['perform_test_remote_get_calls'], 'url' ) + ); + } + + public function test_seed_preload_queue_keeps_existing_queue_when_sitemap_fetch_fails() { + $GLOBALS['perform_test_options'] = [ + 'perform_settings' => [ + 'enable_cache_preload' => true, + ], + 'perform_cache_preload_queue' => [ + 'https://example.com/existing-page', + ], + ]; + $GLOBALS['perform_test_remote_get_map'] = [ + 'https://example.com/wp-sitemap.xml' => new WP_Error( 'http_request_failed', 'Timeout' ), + ]; + + $page_cache = new PageCache(); + $page_cache->seed_preload_queue_from_sitemap_and_logs(); + + $this->assertSame( + [ 'https://example.com/existing-page' ], + $GLOBALS['perform_test_options']['perform_cache_preload_queue'] + ); + $this->assertSame( + [ 'https://example.com/wp-sitemap.xml' ], + array_column( $GLOBALS['perform_test_remote_get_calls'], 'url' ) + ); + } + + private function set_private_property( PageCache $page_cache, string $property_name, int $value ): void { + $property = new ReflectionProperty( $page_cache, $property_name ); + $property->setAccessible( true ); + $property->setValue( $page_cache, $value ); + } + + private function build_sitemap_index_xml( int $child_count ): string { + $items = []; + + for ( $index = 1; $index <= $child_count; $index++ ) { + $items[] = sprintf( 'https://example.com/sitemap-%d.xml', $index ); + } + + return '' . implode( '', $items ) . ''; + } + + private function build_urlset_xml( string ...$paths ): string { + $items = []; + + foreach ( $paths as $path ) { + $items[] = 'https://example.com' . $path . ''; + } + + return '' . implode( '', $items ) . ''; + } }