NVIDIA · blugassi · Jul 2, 2026 · Jul 2, 2026 · Jul 5, 2026
@@ -89,6 +89,15 @@ def __hash__(self) -> int:
         return self.git_repo.__hash__()
 
 
+class TrainingReportConfig(BaseModel):
+    """Training-report aggregation window: how many steps to skip before per-metric stats are computed."""
+
+    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected
+
+    exclude_start_steps: int = Field(5, ge=0)  # non-negative; defaults to 5
+    exclude_post_profiling_steps: int = Field(2, ge=0)  # non-negative; defaults to 2
+
+
 class TestDefinition(BaseModel, ABC):
     """Base Test object."""
 
@@ -106,6 +115,7 @@ class TestDefinition(BaseModel, ABC):
     git_repos: list[GitRepo] = []
     nsys: Optional[NsysConfiguration] = None
     predictor: Optional[PredictorConfig] = None
+    training_report: Optional[TrainingReportConfig] = None
 
     agent: str = "grid_search"
     agent_steps: int = 1

@@ -52,8 +52,9 @@
 }
 
 
-# (world_size, num_nodes, model_name) and the computed data_parallel_size are not mapped here.
-NEMO_CONFIG: dict[str, str] = {
+# Framework's resolved config artifact. (world_size, num_nodes, model_name) and computed data_parallel_size are not
+# mapped here.
+NEMO_MODEL_CONFIG: dict[str, str] = {
     "micro_batch_size": "data.micro_batch_size",
     "global_batch_size": "data.global_batch_size",
     "seq_length": "data.seq_length",
@@ -77,7 +78,7 @@
     "moe_grouped_gemm": "model.moe_grouped_gemm",
 }
 
-MEGATRON_CONFIG: dict[str, str] = {
+MEGATRON_MODEL_CONFIG: dict[str, str] = {
     "micro_batch_size": "micro_batch_size",
     "global_batch_size": "global_batch_size",
     "seq_length": "seq_length",
@@ -101,7 +102,7 @@
     "moe_grouped_gemm": "moe_grouped_gemm",
 }
 
-MEGATRON_BRIDGE_CONFIG: dict[str, str] = {
+MEGATRON_BRIDGE_MODEL_CONFIG: dict[str, str] = {
     "micro_batch_size": "train.micro_batch_size",
     "global_batch_size": "train.global_batch_size",
     "seq_length": "model.seq_length",
@@ -124,3 +125,29 @@
     "moe_ffn_hidden_size": "model.moe_ffn_hidden_size",
     "moe_grouped_gemm": "model.moe_grouped_gemm",
 }
+
+
+# CloudAI TestDefinition (user TOML + defaults). TrainingConfig field -> dotted path in TestDefinition.model_dump().
+# The aggregation-window paths are identical for every framework, so they are declared once and merged into each
+# per-framework mapping below (each mapping is still its own dict object).
+_TRAINING_REPORT_TEST_CONFIG: dict[str, str] = {
+    "exclude_start_steps": "training_report.exclude_start_steps",
+    "exclude_post_profiling_steps": "training_report.exclude_post_profiling_steps",
+}
+
+NEMO_TEST_CONFIG: dict[str, str] = {
+    "profiling_enabled": "nsys.enable",
+    "profiling_start_step": "extra_cmd_args.*start_step",
+    "profiling_stop_step": "extra_cmd_args.*end_step",
+    **_TRAINING_REPORT_TEST_CONFIG,
+}
+
+MEGATRON_TEST_CONFIG: dict[str, str] = {
+    "profiling_enabled": "nsys.enable",
+    "profiling_start_step": "cmd_args.profile_step_start",
+    "profiling_stop_step": "cmd_args.profile_step_end",
+    **_TRAINING_REPORT_TEST_CONFIG,
+}
+
+MEGATRON_BRIDGE_TEST_CONFIG: dict[str, str] = {
+    "profiling_enabled": "cmd_args.enable_nsys",
+    "profiling_start_step": "cmd_args.profiling_start_step",
+    "profiling_stop_step": "cmd_args.profiling_stop_step",
+    **_TRAINING_REPORT_TEST_CONFIG,
+}
@@ -16,11 +16,43 @@
 
 """Data models for training parsers."""
 
+import statistics
 from collections.abc import Hashable
 from dataclasses import MISSING, dataclass, fields
 from typing import Any, List, Optional
 
 
+@dataclass
+class MetricStats:
+    """Summary statistics of a single metric across the filtered steps."""
+
+    mean: float
+    min: float
+    max: float
+    std: float
+    t99: float
+    t95: float
+
+    @classmethod
+    def from_values(cls, values: list[float]) -> "MetricStats":
+        """Summarize a non-empty sample.
+
+        The spread is the population standard deviation; both tail percentiles use the inclusive method.
+        """
+        average = statistics.mean(values)
+        lowest, highest = min(values), max(values)
+        deviation = statistics.pstdev(values)
+        p99, p95 = (cls._percentile(values, rank) for rank in (99, 95))
+        return cls(mean=average, min=lowest, max=highest, std=deviation, t99=p99, t95=p95)
+
+    @staticmethod
+    def _percentile(values: list[float], p: int) -> float:
+        """Return the inclusive, linearly-interpolated p-th percentile; a lone sample is returned as a float."""
+        if len(values) != 1:
+            return statistics.quantiles(values, n=100, method="inclusive")[p - 1]
+        return float(values[0])
+
+
 @dataclass(frozen=True)
 class Scalar:
     """A single scalar event from a training run (source-agnostic: TensorBoard today, others later)."""
@@ -51,6 +83,29 @@ class TrainingStep:
 OPTIONAL_STEP_FIELDS = {f.name for f in fields(TrainingStep) if f.default is not MISSING}
 
 
+@dataclass(kw_only=True)
+class StepAggregation:
+    """Statistics for each tracked metric, computed across the filtered steps."""
+
+    step_time_sec: MetricStats
+    loss: MetricStats
+    memory_reserved_bytes: MetricStats
+    memory_allocated_bytes: MetricStats
+    tflops_per_gpu: Optional[MetricStats] = None
+
+    @classmethod
+    def from_steps(cls, steps: list["TrainingStep"]) -> "StepAggregation":
+        """Aggregate every metric over a non-empty list of already-filtered steps; TFLOPs only when reported."""
+        reported_tflops = [step.tflops_per_gpu for step in steps if step.tflops_per_gpu is not None]
+        stats = {
+            metric: MetricStats.from_values([getattr(step, metric) for step in steps])
+            for metric in ("step_time_sec", "loss", "memory_reserved_bytes", "memory_allocated_bytes")
+        }
+        if reported_tflops:
+            stats["tflops_per_gpu"] = MetricStats.from_values(reported_tflops)
+        return cls(**stats)
+
+
 @dataclass(kw_only=True)
 class TrainingConfig:
     """
@@ -95,6 +150,15 @@ class TrainingConfig:
     world_size: Optional[int] = None  # CloudAI-computed (None when gpus_per_node is unavailable)
     num_nodes: int = 0  # CloudAI-computed
 
+    # Profiling (CloudAI-computed from the run's nsys/profiler settings)
+    profiling_enabled: bool = False
+    profiling_start_step: Optional[int] = None
+    profiling_stop_step: Optional[int] = None
+
+    # Aggregation window (steps dropped before computing the top-level aggregation)
+    exclude_start_steps: int = 5
+    exclude_post_profiling_steps: int = 2
+
     # Identity
     test_template_name: str = ""  # CloudAI-computed
 
@@ -105,3 +169,4 @@ class TrainingResults:
 
     config: TrainingConfig
     steps: List[TrainingStep]
+    aggregation: Optional[StepAggregation] = None  # None when no steps remain after exclusions