Skip to content

Commit 50f8928

Browse files
committed
Restructure to store access type within AccessLog itself, rename class
Additionally, move the memory freeing into the destructor, and refactor a code block that occurred twice into a single shared function
1 parent 58506bb commit 50f8928

2 files changed

Lines changed: 80 additions & 81 deletions

File tree

examples/basic/normal.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ inline void checkCudaError(cudaError_t err) {
1212
}
1313

1414

15-
__global__ void kernel(int prob_size, CudaMav<int> * input, CudaMav<int> * output){
15+
__global__ void kernel(int prob_size, CudaMemAccessLogger<int> * input, CudaMemAccessLogger<int> * output){
1616
int id = threadIdx.x + blockIdx.x * blockDim.x;
1717
if (id < prob_size) {
1818
(*output)[id] = (*input)[id];
@@ -34,8 +34,8 @@ int main(){
3434

3535
checkCudaError(cudaMemcpy(d_input, h_input.data(), sizeof(int)* prob_size, cudaMemcpyHostToDevice));
3636

37-
CudaMav<int> input(d_input);
38-
CudaMav<int> output(d_output);
37+
CudaMemAccessLogger<int> input(d_input);
38+
CudaMemAccessLogger<int> output(d_output);
3939

4040
constexpr int threads = 32;
4141
constexpr int blocks = (prob_size/threads)+1;

include/cuda_mav.cuh

Lines changed: 77 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
// Define a custom template class which holds the data for the CUDA kernel
1212
template <typename T>
13-
class CudaMav
13+
class CudaMemAccessLogger
1414
{
1515
struct GlobalSettings {
1616
int gridDimX;
@@ -20,28 +20,55 @@ class CudaMav
2020
int blockDimY;
2121
int blockDimZ;
2222
int warpSize;
23-
unsigned int originalSize_read;
24-
unsigned int currentSize_read;
25-
unsigned int originalSize_write;
26-
unsigned int currentSize_write;
27-
23+
unsigned int originalSize;
24+
unsigned int currentSize;
2825
};
2926

3027
// Have a struct to store logging data.
// Record of a single memory access: the address touched, the accessing
// thread/block ids, and the access type (read or write) packed into the
// uppermost bit of the thread-id word.
struct MemoryAccessLog {
private:
    // Store the address which was accessed.
    T* address;
    // Store the thread id which accessed the address; additionally the
    // uppermost bit stores the access type: 0 for read, 1 for write.
    unsigned int threadId_accessType;
    // Store the block id which accessed the address.
    unsigned int blockId;
public:

    // Constructor which packs the access type into the thread-id word.
    // `read` defaults to true; pass false to record a write access.
    // NOTE: the flag mask uses an unsigned literal (1u << 31) — shifting a
    // signed int into its sign bit (1 << 31) is undefined/implementation-
    // defined behaviour before C++20.
    __host__ __device__ MemoryAccessLog(T* address, int blockId, int threadId, bool read = true)
        : address(address), threadId_accessType(threadId), blockId(blockId) {
        // Set the uppermost bit to 1 if the access was a write, 0 if it was a read
        if (!read) {
            threadId_accessType |= 1u << 31;
        } else {
            threadId_accessType &= ~(1u << 31);
        }
    }

    // Empty constructor so arrays of entries can be default-constructed.
    __host__ __device__ MemoryAccessLog() : address(nullptr), threadId_accessType(0), blockId(0) {}

    // Getter for the accessed address.
    __host__ T* Address() const {
        return address;
    }

    // Getter for the thread id (access-type bit stripped off).
    __host__ int ThreadId() const {
        return static_cast<int>(threadId_accessType & ~(1u << 31));
    }

    // Getter for the block id.
    __host__ int BlockId() const {
        return static_cast<int>(blockId);
    }

    // Getter for the access type: true when bit 31 is clear (read access).
    __host__ bool IsRead() const {
        return (threadId_accessType & (1u << 31)) == 0;
    }

};
4673

4774
private:
@@ -55,32 +82,26 @@ private:
5582
T* d_data;
5683

5784
// Also have an instance of this class allocated on the device
58-
CudaMav<T>* d_this;
85+
CudaMemAccessLogger<T>* d_this;
5986

6087
// Store if memory was fetched from the device
6188
bool fetchedFromDevice = false;
6289

63-
// Have a pointer to a list of memory access logs for reading
64-
MemoryAccessLog* d_memoryAccessLog_reading = nullptr;
90+
// Have a pointer to a list of memory access logs
91+
MemoryAccessLog* d_memoryAccessLog = nullptr;
6592
// Have a pointer to a list of memory access logs on the host
66-
MemoryAccessLog* h_memoryAccessLog_reading = nullptr;
67-
68-
// Have a pointer to a list of memory access logs for writing
69-
MemoryAccessLog* d_memoryAccessLog_writing = nullptr;
70-
// Have a pointer to a list of memory access logs on the host
71-
MemoryAccessLog* h_memoryAccessLog_writing = nullptr;
72-
93+
MemoryAccessLog* h_memoryAccessLog = nullptr;
7394

7495
// Implement a proxy class so we can both read and write from the array when accessing the array operator
7596
class AccessProxy {
76-
// Have a reference to the CudaMav class
77-
CudaMav<T>* cudaMav;
97+
// Have a reference to the CudaMemAccessLogger class
98+
CudaMemAccessLogger<T>* cudaMav;
7899
// Have a reference to the index
79100
int index;
80101

81102
public:
82-
// Constructor which takes a reference to the CudaMav class and the index
83-
__device__ AccessProxy(CudaMav<T>* cudaMav, int index) : cudaMav(cudaMav), index(index) {}
103+
// Constructor which takes a reference to the CudaMemAccessLogger class and the index
104+
__device__ AccessProxy(CudaMemAccessLogger<T>* cudaMav, int index) : cudaMav(cudaMav), index(index) {}
84105
AccessProxy() = delete;
85106

86107
// Overload the assignment operator so we can write to the array
@@ -125,18 +146,10 @@ private:
125146
// First fetch the h_constantData from the device
126147
checkCudaError(cudaMemcpy(h_constantData, d_constantData, sizeof(GlobalSettings), cudaMemcpyDeviceToHost), "Could not copy constant data from device.");
127148

128-
// Copy the data back from the device for reading
129-
checkCudaError(cudaMemcpy(h_memoryAccessLog_reading, d_memoryAccessLog_reading, sizeof(MemoryAccessLog) * h_constantData->originalSize_read, cudaMemcpyDeviceToHost), "Could not copy memory access logs from device.");
130-
// Copy the data back from the device for writing
131-
checkCudaError(cudaMemcpy(h_memoryAccessLog_writing, d_memoryAccessLog_writing, sizeof(MemoryAccessLog) * h_constantData->originalSize_write, cudaMemcpyDeviceToHost), "Could not copy memory access logs from device.");
149+
// Copy the data back from the device
150+
checkCudaError(cudaMemcpy(h_memoryAccessLog, d_memoryAccessLog, sizeof(MemoryAccessLog) * h_constantData->originalSize_read, cudaMemcpyDeviceToHost), "Could not copy memory access logs from device.");
132151

133152

134-
// Free up the memory on the device
135-
checkCudaError(cudaFree(d_constantData), "Could not free constant data on device.");
136-
checkCudaError(cudaFree(d_memoryAccessLog_reading), "Could not free memory access logs (reading) on device.");
137-
checkCudaError(cudaFree(d_memoryAccessLog_writing), "Could not free memory access logs (writing) on device.");
138-
checkCudaError(cudaFree(d_this), "Could not free class instance pointer on device.");
139-
140153
// Set the fetched from device flag to true
141154
fetchedFromDevice = true;
142155
}
@@ -231,17 +244,15 @@ private:
231244
public:
232245

233246
// Constructor to create an empty class.
// Every owned pointer is explicitly nulled so that destroying a
// default-constructed instance is safe: `delete`/`delete[]` on nullptr is a
// no-op and cudaFree(nullptr) returns cudaSuccess. Without this, the
// destructor would free uninitialized pointers (d_this, h_constantData,
// d_constantData had no in-class initializer).
__device__ __host__ CudaMemAccessLogger() {
    // Set the data pointer to null
    d_data = nullptr;
    // Set the memory access log pointers (device and host) to null
    d_memoryAccessLog = nullptr;
    h_memoryAccessLog = nullptr;
    // Null the device-side copy of this instance
    d_this = nullptr;
    // Null the settings blocks on host and device
    // (declared elsewhere in this class; used by the other constructor and
    // the destructor — TODO confirm no other member needs nulling here)
    h_constantData = nullptr;
    d_constantData = nullptr;
}
242253

243254
// Constructor which allocates the memory on the device
244-
__host__ CudaMav(T* array_data, unsigned int size = 100000)
255+
__host__ CudaMemAccessLogger(T* array_data, unsigned int size = 100000)
245256
{
246257

247258
h_constantData = new GlobalSettings{ -1, -1, -1, -1, -1, -1, -1, size, 0, size, 0};
@@ -254,30 +265,24 @@ public:
254265
// Store the passed data pointer within the class
255266
d_data = array_data;
256267

257-
// Allocate the memory on the device for the d_memoryAccessLog_reading and check if it was successful
258-
checkCudaError(cudaMalloc(&d_memoryAccessLog_reading, sizeof(MemoryAccessLog) * size), "Could not allocate array to store memory access logs on device. (reading)");
259-
// Also allocate the memory on the host for the h_memoryAccessLog_reading and check if it was successful
260-
h_memoryAccessLog_reading = new MemoryAccessLog[size];
268+
// Allocate the memory on the device for the d_memoryAccessLog and check if it was successful
269+
checkCudaError(cudaMalloc(&d_memoryAccessLog, sizeof(MemoryAccessLog) * size), "Could not allocate array to store memory access logs on device. (reading)");
270+
// Also allocate the memory on the host for the h_memoryAccessLog
271+
h_memoryAccessLog = new MemoryAccessLog[size];
261272

262273
// Copy the empty data to the device
263-
checkCudaError(cudaMemcpy(d_memoryAccessLog_reading, h_memoryAccessLog_reading, sizeof(MemoryAccessLog) * size, cudaMemcpyHostToDevice), "Could not copy memory access logs to device. (reading)");
264-
265-
// To the same for writing
266-
checkCudaError(cudaMalloc(&d_memoryAccessLog_writing, sizeof(MemoryAccessLog) * size), "Could not allocate array to store memory access logs on device. (writing)");
267-
h_memoryAccessLog_writing = new MemoryAccessLog[size];
268-
checkCudaError(cudaMemcpy(d_memoryAccessLog_writing, h_memoryAccessLog_writing, sizeof(MemoryAccessLog) * size, cudaMemcpyHostToDevice), "Could not copy memory access logs to device. (writing)");
274+
checkCudaError(cudaMemcpy(d_memoryAccessLog, h_memoryAccessLog, sizeof(MemoryAccessLog) * size, cudaMemcpyHostToDevice), "Could not copy memory access logs to device.");
269275

270276
// Now we finished initializing the class, so we need to create the copy of this class on the device
271277
// Allocate the memory on the device for the d_this and check if it was successful
272-
checkCudaError(cudaMalloc(&d_this, sizeof(CudaMav<T>)), "Could not allocate array to store this class on device.");
278+
checkCudaError(cudaMalloc(&d_this, sizeof(CudaMemAccessLogger<T>)), "Could not allocate array to store this class on device.");
273279
// Copy the empty data to the device
274-
checkCudaError(cudaMemcpy(d_this, this, sizeof(CudaMav<T>), cudaMemcpyHostToDevice), "Could not copy this class to device.");
280+
checkCudaError(cudaMemcpy(d_this, this, sizeof(CudaMemAccessLogger<T>), cudaMemcpyHostToDevice), "Could not copy this class to device.");
275281

276282
}
277283

278-
__device__ T get(unsigned int index) {
279-
// Atomically increase the currentSize by 1
280-
int current_index = atomicAdd(&d_constantData->currentSize_read, 1);
284+
__device__ int getStorageIndex() {// Atomically increase the currentSize by 1
285+
int current_index = atomicAdd(&d_constantData->currentSize, 1);
281286

282287
// First check if the currentSize is zero, if so we need to initialize the additional data variables, needed later to restore the data
283288
if (current_index == 0) {
@@ -292,6 +297,11 @@ public:
292297
// Store the warp size
293298
d_constantData->warpSize = warpSize;
294299
}
300+
return current_index;
301+
}
302+
303+
__device__ T get(unsigned int index) {
304+
int current_index = getStorageIndex();
295305

296306
// Get the block and thread id
297307
unsigned int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;
@@ -303,7 +313,7 @@ public:
303313
// Check that our current index is less than the original size
304314
if (current_index < d_constantData->originalSize_read) {
305315
// Store the data in the memory access log
306-
d_memoryAccessLog_reading[current_index] = MemoryAccessLog(address, blockId, threadId);
316+
d_memoryAccessLog[current_index] = MemoryAccessLog(address, blockId, threadId);
307317
}
308318

309319

@@ -318,26 +328,11 @@ public:
318328
//printf("Writing to index %d \n", index);
319329

320330
// Atomically increase the currentSize by 1
321-
int current_index = atomicAdd(&d_constantData->currentSize_write, 1);
331+
int current_index = getStorageIndex();
322332

323333
// Write the value to the data
324334
d_data[index] = value;
325335

326-
327-
// First check if the currentSize is zero, if so we need to initialize the additional data variables, needed later to restore the data
328-
if (current_index == 0) {
329-
// Store the grid dimensions
330-
d_constantData->gridDimX = gridDim.x;
331-
d_constantData->gridDimY = gridDim.y;
332-
d_constantData->gridDimZ = gridDim.z;
333-
// Store the block dimensions
334-
d_constantData->blockDimX = blockDim.x;
335-
d_constantData->blockDimY = blockDim.y;
336-
d_constantData->blockDimZ = blockDim.z;
337-
// Store the warp size
338-
d_constantData->warpSize = warpSize;
339-
}
340-
341336
// Get the block and thread id
342337
unsigned int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;
343338
unsigned int threadId = threadIdx.x + threadIdx.y * blockDim.x + blockDim.x * blockDim.y * threadIdx.z;
@@ -348,7 +343,7 @@ public:
348343
// Check that our current index is less than the original size
349344
if (current_index < d_constantData->originalSize_write) {
350345
// Store the data in the memory access log
351-
d_memoryAccessLog_writing[current_index] = MemoryAccessLog(address, blockId, threadId);
346+
d_memoryAccessLog[current_index] = MemoryAccessLog(address, blockId, threadId, false);
352347
}
353348

354349
// Print the accessed data
@@ -357,12 +352,16 @@ public:
357352

358353

359354
// Destructor: releases everything this instance owns — the host-side
// settings/log copies and the device-side buffers allocated in the
// constructor. Each device free is routed through checkCudaError.
// NOTE(review): the class is copied to the device by value and has no
// deleted copy constructor — copying a host instance would double-free.
__host__ ~CudaMemAccessLogger()
{
    // Release host-side allocations first (independent of the device frees).
    delete h_constantData;
    delete[] h_memoryAccessLog;

    // Release device-side allocations.
    checkCudaError(cudaFree(d_constantData), "Could not free constant data on device.");
    checkCudaError(cudaFree(d_memoryAccessLog), "Could not free memory access logs on device.");
    checkCudaError(cudaFree(d_this), "Could not free class instance pointer on device.");
}
367366

368367
// Array operator overload on the device
@@ -391,7 +390,7 @@ public:
391390
#endif
392391
}
393392

// Accessor for the device-resident copy of this logger; the returned
// pointer is what should be handed to kernels.
__host__ CudaMemAccessLogger<T>* getDevicePointer() {
    CudaMemAccessLogger<T>* devicePtr = d_this;
    return devicePtr;
}
397396

0 commit comments

Comments
 (0)