From dad6927718d7618edacbbbd28cf7ca36516b2334 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Sun, 7 Jun 2026 14:02:05 -0400 Subject: [PATCH] Merge map/for schedule into a single `_resolve_loop_schedule` with an optimal parallel strategy --- src/gt4py/cartesian/gtc/dace/oir_to_treeir.py | 35 +++++++++---------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/src/gt4py/cartesian/gtc/dace/oir_to_treeir.py b/src/gt4py/cartesian/gtc/dace/oir_to_treeir.py index b75a5119f2..8f68341648 100644 --- a/src/gt4py/cartesian/gtc/dace/oir_to_treeir.py +++ b/src/gt4py/cartesian/gtc/dace/oir_to_treeir.py @@ -31,10 +31,18 @@ """Default dace residency types per device type.""" -def _resolve_default_map_schedule( - device_type: dtypes.DeviceType, +def _resolve_loop_schedule( + device_type: dtypes.DeviceType, loop_order: common.LoopOrder ) -> dtypes.ScheduleType: - """Default kernel target per device type.""" + """Optimal kernel schedule type based on syntax and target device. + + Current strategy: + - respect OIR syntax: sequential on all non-parallel keyword + - maximize local parallel usage per target + """ + if loop_order != common.LoopOrder.PARALLEL: + return dtypes.ScheduleType.Sequential + if device_type == dtypes.DeviceType.GPU: return dtypes.ScheduleType.GPU_Device @@ -44,7 +52,7 @@ def _resolve_default_map_schedule( if not gt_config.build_settings["openmp"]["use_openmp"]: return dtypes.ScheduleType.Sequential - return dtypes.ScheduleType.Default + return dtypes.ScheduleType.CPU_Multicore class OIRToTreeIR(eve.NodeVisitor): @@ -148,7 +156,9 @@ def visit_HorizontalExecution(self, node: oir.HorizontalExecution, ctx: tir.Cont loop = tir.HorizontalLoop( bounds_i=tir.Bounds(start=axis_start_i, end=axis_end_i), bounds_j=tir.Bounds(start=axis_start_j, end=axis_end_j), - schedule=_resolve_default_map_schedule(self._device_type), + schedule=_resolve_loop_schedule( + self._device_type, common.LoopOrder.PARALLEL + ), # Horizontal is always parallel children=[], parent=ctx.current_scope, ) @@ -258,19 +268,6 @@ def visit_Interval( return tir.Bounds(start=start, end=end) - def _vertical_loop_schedule(self) -> dtypes.ScheduleType: - """ - Defines the vertical loop schedule. - - Current strategy is to - - keep the vertical loop on the host for both, CPU and GPU targets - - and run it in parallel on CPU and sequential on GPU. - """ - if self._device_type == dtypes.DeviceType.GPU: - return dtypes.ScheduleType.Sequential - - return _resolve_default_map_schedule(self._device_type) - def visit_VerticalLoopSection( self, node: oir.VerticalLoopSection, ctx: tir.Context, loop_order: common.LoopOrder ) -> None: @@ -285,7 +282,7 @@ def visit_VerticalLoopSection( iteration_variable=eve.SymbolRef(f"{tir.Axis.K.iteration_symbol()}_{id(node)}"), loop_order=loop_order, bounds_k=bounds, - schedule=self._vertical_loop_schedule(), + schedule=_resolve_loop_schedule(self._device_type, loop_order), children=[], parent=ctx.current_scope, )