From 5d971f453d217216607d922a8ce604bf811e00d5 Mon Sep 17 00:00:00 2001 From: shreyaskommuri Date: Mon, 29 Jun 2026 10:41:30 -0700 Subject: [PATCH] Allow external callers to locate hook configs Expose an optional --hook-dir across scenario commands and pass the resolved directory into Parser so tools such as CloudAI Autotune can invoke hook-bearing scenarios outside the CloudAI repository. Constraint: Preserve conf/hook as the parser default and keep hook-free external invocations backward compatible. Rejected: Require callers to change into the CloudAI checkout | output paths and orchestration should not depend on process working directory. Confidence: high Scope-risk: narrow Directive: Keep scenario config roots explicit at the CLI boundary when external orchestration depends on them. Tested: uv run pytest -q (1636 passed, 4 skipped); uv run pytest -q -m ci_only (490 passed); uv run pytest --dead-fixtures; uv run pre-commit run --all-files; external NCCL dry-run from CloudAI-Autotune cwd exits 0 with --hook-dir Not-tested: Live NCCL execution on a Slurm cluster. Signed-off-by: shreyaskommuri --- src/cloudai/cli/cli.py | 39 ++++++++++++++++++++++++++++++------- src/cloudai/cli/handlers.py | 6 +++--- src/cloudai/parser.py | 17 ++++++++++------ tests/test_cli.py | 9 +++++++++ tests/test_parser.py | 7 +++++++ 5 files changed, 62 insertions(+), 16 deletions(-) diff --git a/src/cloudai/cli/cli.py b/src/cloudai/cli/cli.py index 6df993218..2fe161144 100644 --- a/src/cloudai/cli/cli.py +++ b/src/cloudai/cli/cli.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -104,6 +104,13 @@ def common_options(f): type=click.Path(exists=True, resolve_path=True, path_type=Path), help="Scenario config path.", )(f) + f = click.option( + "--hook-dir", + required=False, + default=None, + type=click.Path(exists=True, resolve_path=True, path_type=Path, file_okay=False, dir_okay=True), + help="Directory with hook scenario and test configs.", + )(f) return f @@ -136,18 +143,28 @@ def main(log_file, log_level): @main.command() @common_options -def install(system_cfg: Path, tests_dir: Path, scenario_cfg: Path): +def install(system_cfg: Path, tests_dir: Path, scenario_cfg: Path, hook_dir: Path | None): """Install the necessary components for workloads.""" - args = argparse.Namespace(system_config=system_cfg, tests_dir=tests_dir, test_scenario=scenario_cfg, mode="install") + args = argparse.Namespace( + system_config=system_cfg, + tests_dir=tests_dir, + test_scenario=scenario_cfg, + hook_dir=hook_dir, + mode="install", + ) exit(handle_install_and_uninstall(args)) @main.command() @common_options -def uninstall(system_cfg: Path, tests_dir: Path, scenario_cfg: Path): +def uninstall(system_cfg: Path, tests_dir: Path, scenario_cfg: Path, hook_dir: Path | None): """Uninstall the components used by workloads.""" args = argparse.Namespace( - system_config=system_cfg, tests_dir=tests_dir, test_scenario=scenario_cfg, mode="uninstall" + system_config=system_cfg, + tests_dir=tests_dir, + test_scenario=scenario_cfg, + hook_dir=hook_dir, + mode="uninstall", ) exit(handle_install_and_uninstall(args)) @@ -161,6 +178,7 @@ def dry_run( system_cfg: Path, tests_dir: Path, scenario_cfg: Path, + hook_dir: Path | None, output_dir: Path, enable_cache_without_check: bool, single_sbatch: bool, @@ -170,6 +188,7 @@ def dry_run( system_config=system_cfg, tests_dir=tests_dir, test_scenario=scenario_cfg, + hook_dir=hook_dir, output_dir=output_dir, mode="dry-run", enable_cache_without_check=enable_cache_without_check, @@ -187,6 +206,7 @@ def run( system_cfg: Path, tests_dir: Path, scenario_cfg: Path, + hook_dir: Path | None, output_dir: Path, enable_cache_without_check: bool, single_sbatch: bool, @@ -200,6 +220,7 @@ def run( system_config=system_cfg, tests_dir=tests_dir, test_scenario=scenario_cfg, + hook_dir=hook_dir, output_dir=output_dir, mode="run", enable_cache_without_check=enable_cache_without_check, @@ -216,7 +237,7 @@ def run( type=click.Path(exists=True, resolve_path=True, path_type=Path, file_okay=False), help="Path to a scenario results directory.", ) -def generate_report(system_cfg: Path, tests_dir: Path, scenario_cfg: Path, result_dir: Path): +def generate_report(system_cfg: Path, tests_dir: Path, scenario_cfg: Path, hook_dir: Path | None, result_dir: Path): """ Generate a report from the results of a scenario. @@ -224,7 +245,11 @@ def generate_report(system_cfg: Path, tests_dir: Path, scenario_cfg: Path, resul command. """ args = argparse.Namespace( - system_config=system_cfg, tests_dir=tests_dir, test_scenario=scenario_cfg, result_dir=result_dir + system_config=system_cfg, + tests_dir=tests_dir, + test_scenario=scenario_cfg, + hook_dir=hook_dir, + result_dir=result_dir, ) exit(handle_generate_report(args)) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index 3fd099dc0..89cb02e71 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -62,7 +62,7 @@ def handle_install_and_uninstall(args: argparse.Namespace) -> int: Args: args (argparse.Namespace): The parsed command-line arguments. """ - parser = Parser(args.system_config) + parser = Parser(args.system_config, args.hook_dir or HOOK_ROOT) system, tests, scenario = parser.parse(args.tests_dir, args.test_scenario) system.update() @@ -249,7 +249,7 @@ def register_signal_handlers(signal_handler: Callable) -> None: def _setup_system_and_scenario( args: argparse.Namespace, ) -> tuple[System, TestScenario, list[TestDefinition]] | None: - parser = Parser(args.system_config) + parser = Parser(args.system_config, args.hook_dir or HOOK_ROOT) try: system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario) except MissingTestError as e: @@ -354,7 +354,7 @@ def handle_generate_report(args: argparse.Namespace) -> int: Args: args (argparse.Namespace): The parsed command-line arguments. """ - parser = Parser(args.system_config) + parser = Parser(args.system_config, args.hook_dir or HOOK_ROOT) system, _, test_scenario = parser.parse(args.tests_dir, args.test_scenario) assert test_scenario is not None diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 5dbe22c0b..d4e34a979 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -43,15 +43,18 @@ class Parser: """Main parser for parsing all types of configurations.""" - def __init__(self, system_config_path: Path) -> None: + def __init__(self, system_config_path: Path, hook_root: Path = HOOK_ROOT) -> None: """ Initialize a Parser instance. Args: system_config_path (str): The file path for system configurations. + hook_root (Path): Directory containing hook scenarios and tests. """ logging.debug(f"Initializing parser with: {system_config_path=}") self.system_config_path = system_config_path + self.hook_root = hook_root + self.hook_test_root = hook_root / "test" self._system: Optional[System] = None @property @@ -90,12 +93,14 @@ def parse( except TestConfigParsingError: exit(1) # exit right away to keep error message readable for users - if not HOOK_ROOT.exists(): - logging.debug(f"HOOK_ROOT path '{HOOK_ROOT}' does not exist.") + if not self.hook_root.exists(): + logging.debug(f"Hook root path '{self.hook_root}' does not exist.") try: hook_tests = ( - self.parse_tests(list(HOOK_TEST_ROOT.glob("*.toml")), self.system) if HOOK_TEST_ROOT.exists() else [] + self.parse_tests(list(self.hook_test_root.glob("*.toml")), self.system) + if self.hook_test_root.exists() + else [] ) except TestConfigParsingError: exit(1) # exit right away to keep error message readable for users @@ -106,10 +111,10 @@ def parse( test_mapping = {t.name: t for t in tests} hook_test_scenario_mapping = {} - if HOOK_ROOT.exists() and list(HOOK_ROOT.glob("*.toml")): + if self.hook_root.exists() and list(self.hook_root.glob("*.toml")): try: hook_test_scenario_mapping = self.parse_hooks( - list(HOOK_ROOT.glob("*.toml")), self.system, {t.name: t for t in hook_tests} + list(self.hook_root.glob("*.toml")), self.system, {t.name: t for t in hook_tests} ) except TestScenarioParsingError: exit(1) # exit right away to keep error message readable for users diff --git a/tests/test_cli.py b/tests/test_cli.py index 2a3f394ec..b20be445b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -46,6 +46,15 @@ def test_tests_dir_is_optional(tmp_path: Path): assert "Missing option '--tests-dir'" not in result.output +def test_hook_dir_is_available_on_run_commands(): + runner = CliRunner() + + result = runner.invoke(main, ["dry-run", "--help"]) + + assert result.exit_code == 0 + assert "--hook-dir DIRECTORY" in result.output + + @pytest.mark.parametrize( "subcommand", ["dry-run", "generate-report", "install", "list", "run", "uninstall", "verify-configs"] ) diff --git a/tests/test_parser.py b/tests/test_parser.py index 63a9413da..028216318 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -39,6 +39,13 @@ def test_no_tests_dir(self, parser: Parser): parser.parse(tests_dir, None) assert "Test path" in str(exc_info.value) + def test_custom_hook_root_is_used(self, parser: Parser, tmp_path: Path): + hook_root = tmp_path / "hooks" + parser = Parser(parser.system_config_path, hook_root) + + assert parser.hook_root == hook_root + assert parser.hook_test_root == hook_root / "test" + @patch("cloudai.test_parser.TestParser.parse_all") def test_no_scenario(self, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test"