11#include < vector>
22#include < numeric>
33#include < iostream>
4+ #include " ../../src/cuda_mav.cu"
45
56inline void checkCudaError (cudaError_t err) {
67 if (err != cudaSuccess) {
@@ -11,11 +12,26 @@ inline void checkCudaError(cudaError_t err) {
1112}
1213
1314
/**
 * Copies one element per thread from `input` to `output` through the
 * CudaMav element accessors.
 *
 * Launch layout: only the x-dimension of the thread/block coordinates is
 * used to form the flat element index. Threads whose index is not below
 * `prob_size` do nothing, so the grid may be larger than the problem.
 *
 * @param prob_size number of elements to copy
 * @param input     wrapper whose elements are read via operator[]
 * @param output    wrapper whose elements are written via operator[]
 *
 * NOTE(review): both pointers are dereferenced in device code, so they are
 * presumably device-accessible CudaMav objects (the host passes the result
 * of getDevicePointer()) -- confirm against the CudaMav implementation.
 */
__global__ void kernel(int prob_size, CudaMav<int>* input, CudaMav<int>* output) {
    const int id = threadIdx.x + blockIdx.x * blockDim.x;

    if (id < prob_size) {
        // Working form: read the element into a plain int first, then store
        // that int through the output accessor.
        const int temp = static_cast<int>((*input)[id]);
        (*output)[id] = temp;

        // Also works (explicit conversion on the right-hand side):
        // (*output)[id] = (int)(*input)[id];

        // Debug print of the value that was read:
        // printf("input[%d] = %d\n", id, static_cast<int>((*input)[id]));

        // Does NOT work (accessor result assigned directly to accessor result):
        // (*output)[id] = (*input)[id];
        // NOTE(review): presumably operator[] returns a proxy object, so this
        // selects the proxy's copy assignment instead of storing an int --
        // TODO confirm against CudaMav's operator[] return type.

        // Does NOT work either (same assignment, spelled with operator[] calls):
        // output->operator[](id) = input->operator[](id);
    }
}
2036
2137int main (){
@@ -32,11 +48,20 @@ int main(){
3248
3349 checkCudaError (cudaMemcpy (d_input, h_input.data (), sizeof (int )* prob_size, cudaMemcpyHostToDevice));
3450
51+ CudaMav<int > input (d_input);
52+ CudaMav<int > output (d_output);
53+
3554 constexpr int threads = 32 ;
3655 constexpr int blocks = (prob_size/threads)+1 ;
3756
38- kernel<<<blocks, threads>>> (prob_size, d_input, d_output );
57+ kernel<<<blocks, threads>>> (prob_size, input. getDevicePointer (), output. getDevicePointer () );
3958 checkCudaError (cudaGetLastError ());
59+ cudaDeviceSynchronize ();
60+
61+ auto data = input.getGlobalSettings ();
62+
63+ input.analyze (" ../../../html/basic_template.html" , " ../../../out/basic_input.html" );
64+ output.analyze (" ../../../html/basic_template.html" , " ../../../out/basic_output.html" );
4065
4166 checkCudaError (cudaMemcpy (h_output.data (), d_output, sizeof (int )*prob_size, cudaMemcpyDeviceToHost));
4267
0 commit comments