Skip to content

Commit 6caeace

Browse files
donettom-1 authored and alexdeucher committed
drm/amd: Fix MQD and control stack alignment for non-4K
For gfxV9, due to a hardware bug (based on the comments in the code here [1]), the control stack of a user-mode compute queue must be allocated immediately after the page boundary of its regular MQD buffer. To handle this, we allocate an enlarged MQD buffer where the first page is used as the MQD and the remaining pages store the control stack. Although these regions share the same BO, they require different memory types: the MQD must be UC (uncached), while the control stack must be NC (non-coherent), matching the behavior when the control stack is allocated in user space.

This logic works correctly on systems where the CPU page size matches the GPU page size (4K). However, the current implementation aligns both the MQD and the control stack to the CPU PAGE_SIZE. On systems with a larger CPU page size, the entire first CPU page is marked UC, even though that page may contain multiple GPU pages. The GPU treats the second 4K GPU page inside that CPU page as part of the control stack, but it is incorrectly mapped as UC.

This patch fixes the issue by aligning both the MQD and control stack sizes to the GPU page size (4K). The first 4K page is correctly marked as UC for the MQD, and the remaining GPU pages are marked NC for the control stack. This ensures proper memory type assignment on systems with larger CPU page sizes.

[1]: https://elixir.bootlin.com/linux/v6.18/source/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c#L118

Acked-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Donet Tom <donettom@linux.ibm.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 998d6781410de1c4b787fdbf6c56e851ea7fa553)
1 parent 68484a6 commit 6caeace

4 files changed

Lines changed: 64 additions & 21 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,50 @@ void amdgpu_gart_map_vram_range(struct amdgpu_device *adev, uint64_t pa,
403403
drm_dev_exit(idx);
404404
}
405405

406+
/**
407+
* amdgpu_gart_map_gfx9_mqd - map mqd and ctrl_stack dma_addresses into GART entries
408+
*
409+
* @adev: amdgpu_device pointer
410+
* @offset: offset into the GPU's gart aperture
411+
* @pages: number of pages to bind
412+
* @dma_addr: DMA addresses of pages
413+
* @flags: page table entry flags
414+
*
415+
* Map the MQD and control stack addresses into GART entries with the correct
416+
* memory types on gfxv9. The MQD occupies the first 4KB and is followed by
417+
* the control stack. The MQD uses UC (uncached) memory, while the control stack
418+
* uses NC (non-coherent) memory.
419+
*/
420+
void amdgpu_gart_map_gfx9_mqd(struct amdgpu_device *adev, uint64_t offset,
421+
int pages, dma_addr_t *dma_addr, uint64_t flags)
422+
{
423+
uint64_t page_base;
424+
unsigned int i, j, t;
425+
int idx;
426+
uint64_t ctrl_flags = AMDGPU_PTE_MTYPE_VG10(flags, AMDGPU_MTYPE_NC);
427+
void *dst;
428+
429+
if (!adev->gart.ptr)
430+
return;
431+
432+
if (!drm_dev_enter(adev_to_drm(adev), &idx))
433+
return;
434+
435+
t = offset / AMDGPU_GPU_PAGE_SIZE;
436+
dst = adev->gart.ptr;
437+
for (i = 0; i < pages; i++) {
438+
page_base = dma_addr[i];
439+
for (j = 0; j < AMDGPU_GPU_PAGES_IN_CPU_PAGE; j++, t++) {
440+
if ((i == 0) && (j == 0))
441+
amdgpu_gmc_set_pte_pde(adev, dst, t, page_base, flags);
442+
else
443+
amdgpu_gmc_set_pte_pde(adev, dst, t, page_base, ctrl_flags);
444+
page_base += AMDGPU_GPU_PAGE_SIZE;
445+
}
446+
}
447+
drm_dev_exit(idx);
448+
}
449+
406450
/**
407451
* amdgpu_gart_bind - bind pages into the gart page table
408452
*

drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ void amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
6262
void amdgpu_gart_map(struct amdgpu_device *adev, uint64_t offset,
6363
int pages, dma_addr_t *dma_addr, uint64_t flags,
6464
void *dst);
65+
void amdgpu_gart_map_gfx9_mqd(struct amdgpu_device *adev, uint64_t offset,
66+
int pages, dma_addr_t *dma_addr, uint64_t flags);
6567
void amdgpu_gart_bind(struct amdgpu_device *adev, uint64_t offset,
6668
int pages, dma_addr_t *dma_addr, uint64_t flags);
6769
void amdgpu_gart_map_vram_range(struct amdgpu_device *adev, uint64_t pa,

drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -853,25 +853,15 @@ static void amdgpu_ttm_gart_bind_gfx9_mqd(struct amdgpu_device *adev,
853853
int num_xcc = max(1U, adev->gfx.num_xcc_per_xcp);
854854
uint64_t page_idx, pages_per_xcc;
855855
int i;
856-
uint64_t ctrl_flags = AMDGPU_PTE_MTYPE_VG10(flags, AMDGPU_MTYPE_NC);
857856

858857
pages_per_xcc = total_pages;
859858
do_div(pages_per_xcc, num_xcc);
860859

861860
for (i = 0, page_idx = 0; i < num_xcc; i++, page_idx += pages_per_xcc) {
862-
/* MQD page: use default flags */
863-
amdgpu_gart_bind(adev,
861+
amdgpu_gart_map_gfx9_mqd(adev,
864862
gtt->offset + (page_idx << PAGE_SHIFT),
865-
1, &gtt->ttm.dma_address[page_idx], flags);
866-
/*
867-
* Ctrl pages - modify the memory type to NC (ctrl_flags) from
868-
* the second page of the BO onward.
869-
*/
870-
amdgpu_gart_bind(adev,
871-
gtt->offset + ((page_idx + 1) << PAGE_SHIFT),
872-
pages_per_xcc - 1,
873-
&gtt->ttm.dma_address[page_idx + 1],
874-
ctrl_flags);
863+
pages_per_xcc, &gtt->ttm.dma_address[page_idx],
864+
flags);
875865
}
876866
}
877867

drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,16 @@ static uint64_t mqd_stride_v9(struct mqd_manager *mm,
4242
struct queue_properties *q)
4343
{
4444
if (mm->dev->kfd->cwsr_enabled &&
45-
q->type == KFD_QUEUE_TYPE_COMPUTE)
46-
return ALIGN(q->ctl_stack_size, PAGE_SIZE) +
47-
ALIGN(sizeof(struct v9_mqd), PAGE_SIZE);
45+
q->type == KFD_QUEUE_TYPE_COMPUTE) {
46+
47+
/* On gfxv9, the MQD resides in the first 4K page,
48+
* followed by the control stack. Align both to
49+
* AMDGPU_GPU_PAGE_SIZE to maintain the required 4K boundary.
50+
*/
51+
52+
return ALIGN(ALIGN(q->ctl_stack_size, AMDGPU_GPU_PAGE_SIZE) +
53+
ALIGN(sizeof(struct v9_mqd), AMDGPU_GPU_PAGE_SIZE), PAGE_SIZE);
54+
}
4855

4956
return mm->mqd_size;
5057
}
@@ -151,8 +158,8 @@ static struct kfd_mem_obj *allocate_mqd(struct mqd_manager *mm,
151158
if (!mqd_mem_obj)
152159
return NULL;
153160
retval = amdgpu_amdkfd_alloc_kernel_mem(node->adev,
154-
(ALIGN(q->ctl_stack_size, PAGE_SIZE) +
155-
ALIGN(sizeof(struct v9_mqd), PAGE_SIZE)) *
161+
(ALIGN(ALIGN(q->ctl_stack_size, AMDGPU_GPU_PAGE_SIZE) +
162+
ALIGN(sizeof(struct v9_mqd), AMDGPU_GPU_PAGE_SIZE), PAGE_SIZE)) *
156163
NUM_XCC(node->xcc_mask),
157164
mqd_on_vram(node->adev) ? AMDGPU_GEM_DOMAIN_VRAM :
158165
AMDGPU_GEM_DOMAIN_GTT,
@@ -360,7 +367,7 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
360367
struct kfd_context_save_area_header header;
361368

362369
/* Control stack is located one page after MQD. */
363-
void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);
370+
void *mqd_ctl_stack = (void *)((uintptr_t)mqd + AMDGPU_GPU_PAGE_SIZE);
364371

365372
m = get_mqd(mqd);
366373

@@ -397,7 +404,7 @@ static void checkpoint_mqd(struct mqd_manager *mm, void *mqd, void *mqd_dst, voi
397404
{
398405
struct v9_mqd *m;
399406
/* Control stack is located one page after MQD. */
400-
void *ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);
407+
void *ctl_stack = (void *)((uintptr_t)mqd + AMDGPU_GPU_PAGE_SIZE);
401408

402409
m = get_mqd(mqd);
403410

@@ -443,7 +450,7 @@ static void restore_mqd(struct mqd_manager *mm, void **mqd,
443450
*gart_addr = addr;
444451

445452
/* Control stack is located one page after MQD. */
446-
ctl_stack = (void *)((uintptr_t)*mqd + PAGE_SIZE);
453+
ctl_stack = (void *)((uintptr_t)*mqd + AMDGPU_GPU_PAGE_SIZE);
447454
memcpy(ctl_stack, ctl_stack_src, ctl_stack_size);
448455

449456
m->cp_hqd_pq_doorbell_control =

0 commit comments

Comments
 (0)