Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/cloudai/models/workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,15 @@ def __hash__(self) -> int:
return self.git_repo.__hash__()


class TrainingReportConfig(BaseModel):
    """
    Aggregation window for the training report.

    Holds the step counts that are excluded before per-metric statistics are computed. Both counts
    must be non-negative.
    """

    # Unknown keys are rejected rather than silently ignored.
    model_config = ConfigDict(extra="forbid")

    # NOTE(review): presumably the number of leading steps to drop - confirm against the report generator.
    exclude_start_steps: int = Field(ge=0, default=5)
    # NOTE(review): presumably the number of steps to drop after profiling ends - confirm against the report generator.
    exclude_post_profiling_steps: int = Field(ge=0, default=2)
Comment thread
blugassi marked this conversation as resolved.

Comment thread
coderabbitai[bot] marked this conversation as resolved.

class TestDefinition(BaseModel, ABC):
"""Base Test object."""

Expand All @@ -106,6 +115,7 @@ class TestDefinition(BaseModel, ABC):
git_repos: list[GitRepo] = []
nsys: Optional[NsysConfiguration] = None
predictor: Optional[PredictorConfig] = None
training_report: Optional[TrainingReportConfig] = None

agent: str = "grid_search"
agent_steps: int = 1
Expand Down
35 changes: 31 additions & 4 deletions src/cloudai/report_generator/training/mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,9 @@
}


# (world_size, num_nodes, model_name) and the computed data_parallel_size are not mapped here.
NEMO_CONFIG: dict[str, str] = {
# Framework's resolved config artifact. (world_size, num_nodes, model_name) and computed data_parallel_size are not
# mapped here.
NEMO_MODEL_CONFIG: dict[str, str] = {
"micro_batch_size": "data.micro_batch_size",
"global_batch_size": "data.global_batch_size",
"seq_length": "data.seq_length",
Expand All @@ -77,7 +78,7 @@
"moe_grouped_gemm": "model.moe_grouped_gemm",
}

MEGATRON_CONFIG: dict[str, str] = {
MEGATRON_MODEL_CONFIG: dict[str, str] = {
"micro_batch_size": "micro_batch_size",
"global_batch_size": "global_batch_size",
"seq_length": "seq_length",
Expand All @@ -101,7 +102,7 @@
"moe_grouped_gemm": "moe_grouped_gemm",
}

MEGATRON_BRIDGE_CONFIG: dict[str, str] = {
MEGATRON_BRIDGE_MODEL_CONFIG: dict[str, str] = {
"micro_batch_size": "train.micro_batch_size",
"global_batch_size": "train.global_batch_size",
"seq_length": "model.seq_length",
Expand All @@ -124,3 +125,29 @@
"moe_ffn_hidden_size": "model.moe_ffn_hidden_size",
"moe_grouped_gemm": "model.moe_grouped_gemm",
}


# CloudAI TestDefinition (user TOML + defaults). TrainingConfig field -> dotted path in TestDefinition.model_dump().
# Aggregation-window fields resolve to the same TestDefinition paths for every framework, so they are
# declared once and unpacked at the end of each mapping (merging copies the entries; nothing is aliased).
_TRAINING_REPORT_PATHS: dict[str, str] = {
    "exclude_start_steps": "training_report.exclude_start_steps",
    "exclude_post_profiling_steps": "training_report.exclude_post_profiling_steps",
}

NEMO_TEST_CONFIG: dict[str, str] = {
    "profiling_enabled": "nsys.enable",
    "profiling_start_step": "extra_cmd_args.*start_step",
    "profiling_stop_step": "extra_cmd_args.*end_step",
    **_TRAINING_REPORT_PATHS,
}

MEGATRON_TEST_CONFIG: dict[str, str] = {
    "profiling_enabled": "nsys.enable",
    "profiling_start_step": "cmd_args.profile_step_start",
    "profiling_stop_step": "cmd_args.profile_step_end",
    **_TRAINING_REPORT_PATHS,
}

MEGATRON_BRIDGE_TEST_CONFIG: dict[str, str] = {
    "profiling_enabled": "cmd_args.enable_nsys",
    "profiling_start_step": "cmd_args.profiling_start_step",
    "profiling_stop_step": "cmd_args.profiling_stop_step",
    **_TRAINING_REPORT_PATHS,
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
65 changes: 65 additions & 0 deletions src/cloudai/report_generator/training/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,43 @@

"""Data models for training parsers."""

import statistics
from collections.abc import Hashable
from dataclasses import MISSING, dataclass, fields
from typing import Any, List, Optional


@dataclass
class MetricStats:
    """
    Aggregated statistics for one metric over the filtered steps.

    Every field is stored as a ``float``, including when the samples are integers (for example byte
    counts), so the values always match the declared field types.
    """

    mean: float
    min: float
    max: float
    std: float
    t99: float
    t95: float

    @classmethod
    def from_values(cls, values: list[float]) -> "MetricStats":
        """
        Build stats from a non-empty list of values (population std; inclusive percentiles).

        Args:
            values: Samples of a single metric. Must contain at least one value.

        Returns:
            A ``MetricStats`` whose fields are all ``float``.

        Raises:
            statistics.StatisticsError: If ``values`` is empty.
        """
        # statistics.mean / min / max hand back an int for integer samples; coerce so the declared
        # float fields never end up holding ints.
        return cls(
            mean=float(statistics.mean(values)),
            min=float(min(values)),
            max=float(max(values)),
            std=float(statistics.pstdev(values)),
            t99=cls._percentile(values, 99),
            t95=cls._percentile(values, 95),
        )

    @staticmethod
    def _percentile(values: list[float], p: int) -> float:
        """
        Inclusive, linearly-interpolated p-th percentile; returns the sole value for a single sample.

        Args:
            values: Non-empty list of samples.
            p: Percentile to compute, an integer in ``[1, 99]``.

        Raises:
            ValueError: If ``p`` is outside ``[1, 99]``.
        """
        # n=100 yields exactly 99 cut points. Without this guard p=0 would index [-1] and silently
        # return the 99th percentile, and p=100 would fail with an opaque IndexError.
        if not 1 <= p <= 99:
            raise ValueError(f"percentile must be in [1, 99], got {p}")
        # statistics.quantiles needs at least two data points on older Python versions.
        if len(values) == 1:
            return float(values[0])
        return float(statistics.quantiles(values, n=100, method="inclusive")[p - 1])


@dataclass(frozen=True)
class Scalar:
"""A single scalar event from a training run (source-agnostic: TensorBoard today, others later)."""
Expand Down Expand Up @@ -51,6 +83,29 @@ class TrainingStep:
OPTIONAL_STEP_FIELDS = {f.name for f in fields(TrainingStep) if f.default is not MISSING}


@dataclass(kw_only=True)
class StepAggregation:
    """Statistics for each tracked metric, computed over the steps that survived filtering."""

    step_time_sec: MetricStats
    loss: MetricStats
    memory_reserved_bytes: MetricStats
    memory_allocated_bytes: MetricStats
    tflops_per_gpu: Optional[MetricStats] = None

    @classmethod
    def from_steps(cls, steps: list["TrainingStep"]) -> "StepAggregation":
        """
        Aggregate a non-empty list of already-filtered steps into per-metric stats.

        ``tflops_per_gpu`` is aggregated only over the steps that report it and is left as ``None``
        when no step does.
        """
        summarize = MetricStats.from_values

        # Steps without a TFLOPs reading are skipped for this metric only.
        reported_tflops = [step.tflops_per_gpu for step in steps if step.tflops_per_gpu is not None]

        step_time_stats = summarize([step.step_time_sec for step in steps])
        loss_stats = summarize([step.loss for step in steps])
        reserved_stats = summarize([step.memory_reserved_bytes for step in steps])
        allocated_stats = summarize([step.memory_allocated_bytes for step in steps])

        tflops_stats = None
        if reported_tflops:
            tflops_stats = summarize(reported_tflops)

        return cls(
            step_time_sec=step_time_stats,
            loss=loss_stats,
            memory_reserved_bytes=reserved_stats,
            memory_allocated_bytes=allocated_stats,
            tflops_per_gpu=tflops_stats,
        )


@dataclass(kw_only=True)
class TrainingConfig:
"""
Expand Down Expand Up @@ -95,6 +150,15 @@ class TrainingConfig:
world_size: Optional[int] = None # CloudAI-computed (None when gpus_per_node is unavailable)
num_nodes: int = 0 # CloudAI-computed

# Profiling (CloudAI-computed from the run's nsys/profiler settings)
profiling_enabled: bool = False
profiling_start_step: Optional[int] = None
profiling_stop_step: Optional[int] = None

# Aggregation window (steps dropped before computing the top-level aggregation)
exclude_start_steps: int = 5
exclude_post_profiling_steps: int = 2

# Identity
test_template_name: str = "" # CloudAI-computed

Expand All @@ -105,3 +169,4 @@ class TrainingResults:

config: TrainingConfig
steps: List[TrainingStep]
aggregation: Optional[StepAggregation] = None # None when no steps remain after exclusions
Loading
Loading