Skip to content

Commit f80cb05

Browse files
committed
Implement the first version of a class that logs memory accesses on the GPU
1 parent bcc2e92 commit f80cb05

6 files changed

Lines changed: 513 additions & 4 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,6 @@
44
*.ptx
55
*.cubin
66
*.fatbin
7+
.idea
8+
cmake-build*/
9+
out/

CMakeLists.txt

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
cmake_minimum_required(VERSION 3.24)
2+
3+
#set(CMAKE_CUDA_STANDARD 14)
4+
5+
set(CMAKE_CUDA_ARCHITECTURES 61)
6+
7+
8+
project(GPU_memory_access_visualization LANGUAGES CUDA)
9+
10+
11+
12+
include_directories(.)
13+
include_directories(src)
14+
15+
16+
add_executable(GPU_memory_access_visualization
17+
src/cuda_mav.cu src/cuda_mav.h)

examples/basic/normal.cu

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include <vector>
22
#include <numeric>
33
#include <iostream>
4+
#include "../../src/cuda_mav.cu"
45

56
inline void checkCudaError(cudaError_t err) {
67
if (err != cudaSuccess) {
@@ -11,11 +12,26 @@ inline void checkCudaError(cudaError_t err) {
1112
}
1213

1314

14-
__global__ void kernel(int prob_size, int * const input, int * output){
15+
__global__ void kernel(int prob_size, CudaMav<int> * input, CudaMav<int> * output){
1516
int id = threadIdx.x + blockIdx.x * blockDim.x;
16-
if(id < prob_size){
17-
output[id] = input[id];
17+
if (id < prob_size) {
18+
// This works
19+
int temp = (int)(*input)[id];
20+
(*output)[id] = temp;
21+
22+
// This also works
23+
//(*output)[id] = (int)(*input)[id];
24+
25+
// Print to console
26+
//printf("input[%d] = %d\n", id, static_cast<int>((*input)[id]));
27+
28+
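// This does not work. NOTE(review): presumably operator[] returns a proxy object, so proxy = proxy selects the implicit copy assignment instead of the value-writing operator=(int) — confirm in cuda_mav.h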
29+
//(*output)[id] = (*input)[id];
30+
31+
// This also does not work
32+
//output->operator[](id) = input->operator[](id);
1833
}
34+
1935
}
2036

2137
int main(){
@@ -32,11 +48,20 @@ int main(){
3248

3349
checkCudaError(cudaMemcpy(d_input, h_input.data(), sizeof(int)* prob_size, cudaMemcpyHostToDevice));
3450

51+
CudaMav<int> input(d_input);
52+
CudaMav<int> output(d_output);
53+
3554
constexpr int threads = 32;
3655
constexpr int blocks = (prob_size/threads)+1;
3756

38-
kernel<<<blocks, threads>>>(prob_size, d_input, d_output);
57+
kernel<<<blocks, threads>>>(prob_size, input.getDevicePointer(), output.getDevicePointer());
3958
checkCudaError(cudaGetLastError());
59+
cudaDeviceSynchronize();
60+
61+
auto data = input.getGlobalSettings();
62+
63+
input.analyze("../../../html/basic_template.html", "../../../out/basic_input.html");
64+
output.analyze("../../../html/basic_template.html", "../../../out/basic_output.html");
4065

4166
checkCudaError(cudaMemcpy(h_output.data(), d_output, sizeof(int)*prob_size, cudaMemcpyDeviceToHost));
4267

0 commit comments

Comments
 (0)