diff --git a/scripts/backends/ompi.py b/scripts/backends/ompi.py index f0f337c..2e0dd65 100644 --- a/scripts/backends/ompi.py +++ b/scripts/backends/ompi.py @@ -1,4 +1,5 @@ import os + from backends.mpi_base import BaseMpiBackend diff --git a/scripts/icclrun.py b/scripts/icclrun.py index 026efc6..d882b98 100644 --- a/scripts/icclrun.py +++ b/scripts/icclrun.py @@ -1,41 +1,46 @@ #!/usr/bin/env python3 import argparse -import sys import os +import sys -SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) -sys.path.append(SCRIPT_DIR) - -# `CMAKE_INSTALL_PREFIX` -PREFIX_DIR = os.path.dirname(SCRIPT_DIR) -lib_found = False - -for lib_dir_name in ["lib64", "lib"]: - candidate_path = os.path.join(PREFIX_DIR, lib_dir_name, "infiniccl") - if os.path.exists(candidate_path): - sys.path.append(candidate_path) - lib_found = True - break - -# Fallback -if not lib_found: - if os.path.exists(os.path.join(PREFIX_DIR, "icclrun_logic.py")): - sys.path.append(PREFIX_DIR) - else: - print( - f"[Error]: Could not locate 'icclrun_logic.py' in system library paths or local workspace.", - file=sys.stderr, - ) - print( - f"Looked under: {os.path.join(PREFIX_DIR, 'lib64/infiniccl')}, {os.path.join(PREFIX_DIR, 'lib/infiniccl')}, and {PREFIX_DIR}", - file=sys.stderr, - ) - sys.exit(1) -from icclrun_logic import ICCLLauncher +def configure_system_paths(): + """Dynamically resolves and injects necessary framework search paths.""" + SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + sys.path.append(SCRIPT_DIR) + + # `CMAKE_INSTALL_PREFIX` + PREFIX_DIR = os.path.dirname(SCRIPT_DIR) + lib_found = False + + for lib_dir_name in ["lib64", "lib"]: + candidate_path = os.path.join(PREFIX_DIR, lib_dir_name, "infiniccl") + if os.path.exists(candidate_path): + sys.path.append(candidate_path) + lib_found = True + break + + # Fallback + if not lib_found: + if os.path.exists(os.path.join(PREFIX_DIR, "icclrun_logic.py")): + sys.path.append(PREFIX_DIR) + else: + print( + "[Error]: Could not locate 'icclrun_logic.py' in system library paths or local workspace.", + file=sys.stderr, + ) + print( + f"Looked under: {os.path.join(PREFIX_DIR, 'lib64/infiniccl')}, {os.path.join(PREFIX_DIR, 'lib/infiniccl')}, and {PREFIX_DIR}", + file=sys.stderr, + ) + sys.exit(1) def main(): + configure_system_paths() + + from icclrun_logic import ICCLLauncher + parser = argparse.ArgumentParser(description="InfiniCCL Unified Launcher") parser.add_argument("--config", "-c", dest="cluster", help="Path to cluster.yaml") parser.add_argument("--build", action="store_true", help="Compile remote nodes") diff --git a/scripts/icclrun_logic.py b/scripts/icclrun_logic.py index 5b95492..c06a378 100644 --- a/scripts/icclrun_logic.py +++ b/scripts/icclrun_logic.py @@ -39,7 +39,7 @@ def _is_local(self, ip): local_ips = socket.gethostbyname_ex(socket.gethostname())[2] local_ips += ["127.0.0.1", "localhost"] return ip in local_ips - except: + except Exception: return False def orchestrate_build(self): diff --git a/scripts/run_examples.py b/scripts/run_examples.py index 43fb86b..0fc79ef 100755 --- a/scripts/run_examples.py +++ b/scripts/run_examples.py @@ -80,7 +80,8 @@ def run_iccl_example( process.wait(timeout=timeout_duration) return_code = process.returncode print( - f"--- [VERBOSE OUTPUT END: `{example_name}`] ---\n" + " " * 56, end="" + f"--- [VERBOSE OUTPUT END: `{example_name}`] ---\n" + " " * 56, + end="", ) else: # Quiet mode: Redirect straight to the file handle. @@ -104,7 +105,9 @@ def run_iccl_example( except subprocess.TimeoutExpired: print(f" ❌ TIMEOUT (Exceeded {timeout_duration} seconds)") with open(log_file_path, "a") as f: - f.write(f"\n[RUNNER ERROR]: Distributed `icclrun` harness timed out after {timeout_duration} seconds.\n") + f.write( + f"\n[RUNNER ERROR]: Distributed `icclrun` harness timed out after {timeout_duration} seconds.\n" + ) return False except FileNotFoundError: print(" ❌ ERROR (`icclrun` executable not found in `PATH`)") diff --git a/src/nvidia/nccl/comm_instance.h b/src/nvidia/nccl/comm_instance.h index 12fe2fc..c8d8956 100644 --- a/src/nvidia/nccl/comm_instance.h +++ b/src/nvidia/nccl/comm_instance.h @@ -12,6 +12,6 @@ struct NcclInstance : public BackendCommInstance { NcclInstance() { type = BackendType::kNccl; } }; -} // namespace infini::ccl +} // namespace infini::ccl #endif