#include <cstddef>
#include <functional>
#include <iostream>
#include <new>
#include <string>
#include <tuple>
#include <vector>
1010
// Base class that places derived objects in CUDA unified (managed) memory so
// the same object instance can be accessed from both host and device code.
// source: https://developer.nvidia.com/blog/unified-memory-in-cuda-6/
class Managed
{
public:
    // Allocate the object in managed memory.
    // The previous version ignored the cudaMallocManaged return status and
    // would hand back an indeterminate pointer on failure; report allocation
    // failure as std::bad_alloc instead, as a replacement operator new should.
    void *operator new(size_t len) {
        void *ptr = nullptr;
        if (cudaMallocManaged(&ptr, len) != cudaSuccess || ptr == nullptr) {
            throw std::bad_alloc();
        }
        // Make sure no device work is in flight before the host constructs
        // and touches the freshly allocated object.
        cudaDeviceSynchronize();
        return ptr;
    }

    void operator delete(void *ptr) {
        // Wait for outstanding kernels that may still be using the object
        // before releasing its managed memory.
        cudaDeviceSynchronize();
        cudaFree(ptr);
    }
};
28+
1129// Define a custom template class, which takes care of storing the memory accesses of a given type.
1230template <typename T>
13- class CudaMemAccessStorage {
31+ class CudaMemAccessStorage : public Managed {
1432
1533 struct GlobalSettings {
1634 int gridDimX;
@@ -77,14 +95,10 @@ private:
7795
7896 // Have a array of ints on the device used to store grid dimensions, and block dimensions, and the warp size
7997 // This array will always have 10 elements (x,y,z) for grid dimensions, (x,y,z) for block dimensions, and 1 for warp size, 1 for status messages, 1 for the original log size, 1 for the current log size
80- GlobalSettings *d_constantData = nullptr ;
81- // Have the same array on the host
82- GlobalSettings *h_constantData = nullptr ;
98+ GlobalSettings constantData;
8399
84100 // Have a pointer to a list of memory access logs
85- MemoryAccessLog *d_memoryAccessLog = nullptr ;
86- // Have a pointer to a list of memory access logs on the host
87- MemoryAccessLog *h_memoryAccessLog = nullptr ;
101+ MemoryAccessLog* memoryAccessLog = nullptr ;
88102
89103 // Store the memory regions by storing the starting address, the amount of elements, the size of a single element and a name
90104 std::vector<std::tuple<T*, size_t , size_t , std::string>> memoryRegions;
@@ -256,95 +270,44 @@ public:
256270 }
257271
258272private:
259- // Define function to load back data
260- void fetchData () {
261- // First fetch the h_constantData from the device
262- checkCudaError (cudaMemcpy (h_constantData, d_constantData, sizeof (GlobalSettings), cudaMemcpyDeviceToHost),
263- " Could not copy constant data from device." );
264-
265- // Copy the data back from the device
266- checkCudaError (
267- cudaMemcpy (h_memoryAccessLog, d_memoryAccessLog, sizeof (MemoryAccessLog) * h_constantData->originalSize ,
268- cudaMemcpyDeviceToHost), " Could not copy memory access logs from device." );
269- }
270273
271274 __device__ int getStorageIndex () {// Atomically increase the currentSize by 1
272- int current_index = atomicAdd (&d_constantData-> currentSize , 1 );
275+ int current_index = atomicAdd (&constantData. currentSize , 1 );
273276
274277 // First check if the currentSize is zero, if so we need to initialize the additional data variables, needed later to restore the data
275278 if (current_index == 0 ) {
276279 // Store the grid dimensions
277- d_constantData-> gridDimX = gridDim .x ;
278- d_constantData-> gridDimY = gridDim .y ;
279- d_constantData-> gridDimZ = gridDim .z ;
280+ constantData. gridDimY = gridDim .y ;
281+ constantData. gridDimZ = gridDim .z ;
282+ constantData. gridDimX = gridDim .x ;
280283 // Store the block dimensions
281- d_constantData-> blockDimX = blockDim .x ;
282- d_constantData-> blockDimY = blockDim .y ;
283- d_constantData-> blockDimZ = blockDim .z ;
284+ constantData. blockDimX = blockDim .x ;
285+ constantData. blockDimY = blockDim .y ;
286+ constantData. blockDimZ = blockDim .z ;
284287 // Store the warp size
285- d_constantData-> warpSize = warpSize ;
288+ constantData. warpSize = warpSize ;
286289 }
287290 return current_index;
288291 }
289292
290- CudaMemAccessStorage<T> *d_this;
291-
292293public:
293294
294- __device__ CudaMemAccessStorage () {
295- // Set the default values
296- d_memoryAccessLog = nullptr ;
297- }
298-
299295 // Constructor which allocates the memory on the device
300296 __host__ CudaMemAccessStorage (unsigned int size) {
301297
302- h_constantData = new GlobalSettings {-1 , -1 , -1 , -1 , -1 , -1 , -1 , size, 0 };
298+ constantData = {-1 , -1 , -1 , -1 , -1 , -1 , -1 , size, 0 };
303299
304- // Allocate the memory on the device for the d_constantData and check if it was successful
305- checkCudaError (cudaMalloc (&d_constantData, sizeof (GlobalSettings)),
306- " Could not allocate array to store kernel data on device." );
307- // Copy over the host data to the device
308- checkCudaError (cudaMemcpy (d_constantData, h_constantData, sizeof (GlobalSettings), cudaMemcpyHostToDevice),
300+ // Allocate the memory for the memeoryAccessLog and check if it was successful
301+ checkCudaError (cudaMallocManaged (&memoryAccessLog, sizeof (MemoryAccessLog) * size),
309302 " Could not copy constant data to device." );
310-
311- // Allocate the memory on the device for the d_memoryAccessLog and check if it was successful
312- checkCudaError (cudaMalloc (&d_memoryAccessLog, sizeof (MemoryAccessLog) * size),
313- " Could not allocate array to store memory access logs on device. (reading)" );
314- // Also allocate the memory on the host for the h_memoryAccessLog
315- h_memoryAccessLog = new MemoryAccessLog[size];
316-
317- // Copy the empty data to the device
318- checkCudaError (cudaMemcpy (d_memoryAccessLog, h_memoryAccessLog, sizeof (MemoryAccessLog) * size,
319- cudaMemcpyHostToDevice), " Could not copy memory access logs to device." );
320-
321- // Now we finished initializing the class, so we need to create the copy of this class on the device
322- // Allocate the memory on the device for the d_this and check if it was successful
323- checkCudaError (cudaMalloc (&d_this, sizeof (CudaMemAccessStorage<T>)),
324- " Could not allocate array to store this class on device." );
325- // Copy the empty data to the device
326- checkCudaError (cudaMemcpy (d_this, this , sizeof (CudaMemAccessStorage<T>), cudaMemcpyHostToDevice),
327- " Could not copy this class to device." );
328-
329303 }
330304
331305 __host__ void registerArray (T *array, size_t size, std::string name = " " ) {
332306 memoryRegions.push_back (std::make_tuple (array, size, sizeof (array[0 ]), name));
333307 }
334308
335309 __host__ ~CudaMemAccessStorage () {
336- // Free the memory on the device
337- checkCudaError (cudaFree (d_memoryAccessLog), " Could not free memory access logs on device." );
338- checkCudaError (cudaFree (d_constantData), " Could not free constant data on device." );
339- checkCudaError (cudaFree (d_this), " Could not free this class on device." );
340-
341- // Free the memory on the host
342- delete[] h_memoryAccessLog;
343- delete h_constantData;
344- }
345-
346- __host__ __device__ CudaMemAccessStorage<T> *getDevicePointer () {
347- return d_this;
310+ checkCudaError (cudaFree (memoryAccessLog), " Could not free memory access logs." );
348311 }
349312
350313 __device__ void pushReadLog (T *address) {
@@ -355,9 +318,9 @@ public:
355318 unsigned int threadId = threadIdx .x + threadIdx .y * blockDim .x + blockDim .x * blockDim .y * threadIdx .z ;
356319
357320 // Check that our current index is less than the original size
358- if (current_index < d_constantData-> originalSize ) {
321+ if (current_index < constantData. originalSize ) {
359322 // Store the data in the memory access log
360- d_memoryAccessLog [current_index] = MemoryAccessLog (address, blockId, threadId, true );
323+ memoryAccessLog [current_index] = MemoryAccessLog (address, blockId, threadId, true );
361324 }
362325 }
363326
@@ -369,9 +332,9 @@ public:
369332 unsigned int threadId = threadIdx .x + threadIdx .y * blockDim .x + blockDim .x * blockDim .y * threadIdx .z ;
370333
371334 // Check that our current index is less than the original size
372- if (current_index < d_constantData-> originalSize ) {
335+ if (current_index < constantData. originalSize ) {
373336 // Store the data in the memory access log
374- d_memoryAccessLog [current_index] = MemoryAccessLog (address, blockId, threadId, false );
337+ memoryAccessLog [current_index] = MemoryAccessLog (address, blockId, threadId, false );
375338 }
376339 }
377340
@@ -380,8 +343,6 @@ public:
380343 void generateOutput (const std::string template_file, const std::string &output_path,
381344 std::function<std::tuple<std::string, std::string>(GlobalSettings settingsStruct, std::vector<std::tuple<T*, size_t , size_t , std::string>> memoryRegions,
382345 std::vector<MemoryAccessLog> accessLogs)> customGenerationFunction = nullptr) {
383- // Fetch the data back from the device
384- fetchData ();
385346
386347 // Data processing code here
387348
@@ -402,18 +363,18 @@ public:
402363 std::vector<MemoryAccessLog> accessLogs;
403364
404365 // Loop over the read logs
405- for (int i = 0 ; i < h_constantData-> currentSize ; i++) {
366+ for (int i = 0 ; i < constantData. currentSize ; i++) {
406367 // Add the log to the vector
407- accessLogs.push_back (h_memoryAccessLog [i]);
368+ accessLogs.push_back (memoryAccessLog [i]);
408369 }
409370
410371 // Run the custom generation function, if one was passed
411372 if (customGenerationFunction != nullptr ) {
412- placeholderReplacement = customGenerationFunction (*h_constantData , memoryRegions, accessLogs);
373+ placeholderReplacement = customGenerationFunction (constantData , memoryRegions, accessLogs);
413374 }
414375 // If none was passed, use a default function
415376 else {
416- placeholderReplacement = parseDataForStaticHTML (*h_constantData , memoryRegions, accessLogs);
377+ placeholderReplacement = parseDataForStaticHTML (constantData , memoryRegions, accessLogs);
417378 }
418379
419380 // Replace "<!-- HTML_TEMPLATE -->" with the HTML template
@@ -445,31 +406,23 @@ public:
445406 }
446407
447408 __host__ const GlobalSettings getGlobalSettings () {
448- // Fetch the data back from the device
449- fetchData ();
450-
451- return *h_constantData;
409+ return constantData;
452410 }
453411
454412};
455413
456414
457415// Define a custom template class which holds the data for the CUDA kernel
458416template <typename T>
459- class CudaMemAccessLogger {
417+ class CudaMemAccessLogger : public Managed {
460418
461419private:
462420
463421 // Have an internal pointer to the data, this pointer is a device pointer
464422 T *d_data;
465423
466424 // Have a pointer to the storage class we are using
467- CudaMemAccessStorage<T> *h_storage;
468- // We also need to store its device pointer
469- CudaMemAccessStorage<T> *d_storage;
470-
471- // Also have an instance of this class allocated on the device
472- CudaMemAccessLogger<T> *d_this;
425+ CudaMemAccessStorage<T> *storage;
473426
474427 // Implement a proxy class so we can both read and write from the array when accessing the array operator
475428 class AccessProxy {
@@ -524,37 +477,23 @@ public:
524477 }
525478
526479 // Constructor which allocates the memory on the device
527- __host__ CudaMemAccessLogger (T *array_data, size_t array_length, std::string description_name, CudaMemAccessStorage<T> *storage = nullptr ) {
480+ __host__ CudaMemAccessLogger (T *array_data, size_t array_length, std::string description_name, CudaMemAccessStorage<T> *cma_storage = nullptr ) {
528481
529482 // Store the passed data pointer within the class
530483 d_data = array_data;
531484
532485 // If the storage class is not null, we need to store it
533- if (storage != nullptr ) {
486+ if (cma_storage != nullptr ) {
534487 // Store the storage class
535- h_storage = storage ;
488+ storage = cma_storage ;
536489 }
537490 // If it is null, we need to create a new storage class
538491 else {
539492 // Create a new storage class
540- h_storage = new CudaMemAccessStorage<T>(10000 );
493+ storage = new CudaMemAccessStorage<T>(10000 );
541494 }
542495
543- // Also store the device pointer
544- d_storage = h_storage->getDevicePointer ();
545-
546- h_storage->registerArray (array_data, array_length, description_name);
547-
548-
549-
550- // We need to create the copy of this class on the device
551- // Allocate the memory on the device for the d_this and check if it was successful
552- checkCudaError (cudaMalloc (&d_this, sizeof (CudaMemAccessLogger<T>)),
553- " Could not allocate array to store this class on device." );
554- // Copy the empty data to the device
555- checkCudaError (cudaMemcpy (d_this, this , sizeof (CudaMemAccessLogger<T>), cudaMemcpyHostToDevice),
556- " Could not copy this class to device." );
557-
496+ storage->registerArray (array_data, array_length, description_name);
558497 }
559498
560499 __device__ T get (unsigned int index) {
@@ -563,7 +502,7 @@ public:
563502 T *address = &d_data[index];
564503
565504 // Push the read log
566- d_storage ->pushReadLog (address);
505+ storage ->pushReadLog (address);
567506
568507 // Print the accessed data
569508 // printf("Accessed data at address %p by thread %d in block %d\n", &d_data[index], threadId, blockId);
@@ -577,7 +516,7 @@ public:
577516 T *address = &d_data[index];
578517
579518 // Push the write log
580- d_storage ->pushWriteLog (address);
519+ storage ->pushWriteLog (address);
581520
582521 // Write the value to the data
583522 d_data[index] = value;
@@ -586,13 +525,6 @@ public:
586525 // printf("Wrote data at address %p by thread %d in block %d\n", &d_data[index], threadId, blockId);
587526 }
588527
589-
590- // Destructor to free the memory on the device
591- __host__ ~CudaMemAccessLogger () {
592- // Free up the memory on the device
593- checkCudaError (cudaFree (d_this), " Could not free class instance pointer on device." );
594- }
595-
596528 // Array operator overload on the device
597529 __device__ AccessProxy operator [](size_t i) {
598530 return AccessProxy (this , i);
@@ -610,11 +542,7 @@ public:
610542#endif
611543 }
612544
613- __host__ CudaMemAccessLogger<T> *getDevicePointer () {
614- return d_this;
615- }
616-
617545 CudaMemAccessStorage<T>* getStorage () {
618- return h_storage ;
546+ return storage ;
619547 }
620548};
0 commit comments