Skip to content
10 changes: 6 additions & 4 deletions dub.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,21 +17,23 @@
},
{
"name": "unittest",
"dflags" : ["-mdcompute-targets=cuda-210" ,"-oq"],
"dflags": ["-mdcompute-targets=cuda-800", "-oq"],
"targetType": "executable",
"versions": ["DComputeTesting"],
"stringImportPaths": ["."],
"versions": ["DComputeTesting", "DComputeCUDA_800"],
},
{
"name": "test-cuda",
"dflags" : ["--mdcompute-targets=cuda-210", "-oq"],
"targetType": "executable",
"versions": ["DComputeTestCUDA"],
"stringImportPaths": ["."],
"versions": ["DComputeTestCUDA", "DComputeCUDA_210"],
},
{
"name": "test-ocl",
"dflags" : ["--mdcompute-targets=ocl-200", "-oq"],
"targetType": "executable",
"versions": ["DComputeTestOpenCL"],
},
]
]
}
2 changes: 2 additions & 0 deletions source/dcompute/driver/cuda/package.d
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ public import dcompute.driver.cuda.platform;
public import dcompute.driver.cuda.program;
public import dcompute.driver.cuda.queue;
public import dcompute.driver.cuda.unified_buffer;
public import dcompute.driver.cuda.runtime;

enum Copy
{
Expand Down Expand Up @@ -42,3 +43,4 @@ private template ReplaceTemplate(alias needle, alias replacement) {
}
}
}

53 changes: 52 additions & 1 deletion source/dcompute/driver/cuda/program.d
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,58 @@ struct Program
checkErrors();
return ret;
}


/**
* Load a program from a PTX global variable injected into the binary by LDC.
*
* Relies on LDC embedding the device PTX into the binary as
* `__dcompute_ptx_<arch>_<module>` globals. That support exists only when the
* D frontend `__VERSION__` is >= 2113 (LDC >= 1.43). NOTE: released LDC 1.42.0
* *and* pre-1.43 master both report `__VERSION__` == 2112, so 2113 is the
* lowest frontend version guaranteed to carry the PTX-embedding fix. On any
* older compiler the `else` overload below fails with a clear message on use.
*/
static if (__VERSION__ >= 2113)
static Program fromModule(string moduleName)()
{
version (DComputeCUDA_1200) enum _arch = "cuda1200";
else version (DComputeCUDA_900) enum _arch = "cuda900";
else version (DComputeCUDA_800) enum _arch = "cuda800";
else version (DComputeCUDA_750) enum _arch = "cuda750";
else version (DComputeCUDA_700) enum _arch = "cuda700";
else version (DComputeCUDA_600) enum _arch = "cuda600";
else version (DComputeCUDA_500) enum _arch = "cuda500";
else version (DComputeCUDA_300) enum _arch = "cuda300";
else version (DComputeCUDA_210) enum _arch = "cuda210";
else static assert(false,
"Add a DComputeCUDA_XXX version to your dub config " ~
"matching your --mdcompute-targets=cuda-XXX dflag. " ~
"Example: \"versions\": [\"DComputeCUDA_800\"]");

import std.array : replace;

enum mangledName = moduleName.replace(".", "_");
enum symbolName = "__dcompute_ptx_" ~ _arch ~ "_" ~ mangledName;

mixin("pragma(mangle, \"" ~ symbolName ~ "\") extern(C) extern __gshared const char " ~ symbolName ~ ";");

Program ret;
mixin("status = cast(Status)cuModuleLoadData(&ret.raw, &" ~ symbolName ~ ");");
checkErrors();
return ret;
}
else
static Program fromModule(string moduleName)()
{
static assert(false,
"Program.fromModule (used directly or via launch!) requires an LDC that " ~
"embeds device PTX into the binary as `__dcompute_ptx_*` globals: D " ~
"frontend __VERSION__ >= 2113 (LDC >= 1.43). This compiler is too old — " ~
"both released 1.42.0 and pre-1.43 master report __VERSION__ 2112. " ~
"Upgrade LDC to >= 1.43, or generate the .ptx file and load it via " ~
"Program.fromFile / Program.fromString.");
}

__gshared static Program globalProgram;
//cuModuleLoadDataEx
//cuModuleLoadFatBinary
Expand Down
159 changes: 159 additions & 0 deletions source/dcompute/driver/cuda/runtime.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
/**
* dcompute.driver.cuda.runtime
*
* initialisation of the CUDA runtime:
*
* shared static this() — runs once at program startup (before main()), on
* the main thread. Initialises Platform, Device,
* Context and the embedded Program.
*
* static this() — runs once per thread (including the main thread)
* when that thread starts. Creates the thread-local
* Queue and pushes the shared Context onto the
* thread's CUDA context stack.
*
*
* ensureInit() is kept as a defensive fallback. It is a no-op once the
* module constructors have run, so calling it is always safe and costs only
* a single bool check on the fast path.
*/
module dcompute.driver.cuda.runtime;

import dcompute.driver.cuda;
import std.experimental.allocator.mallocator : Mallocator;

// Global state (shared across every thread)
// __gshared: lives in a single memory location, accessible from all threads.
private __gshared Device _defaultDevice;
private __gshared Context _defaultContext;
private __gshared bool _platformReady = false; // safety-net flag

// Thread-local state
// plain `static`: D gives each thread its own copy automatically.
private static Queue _threadQueue;
private static bool _threadReady = false; // safety-net flag

// Primary init: shared static constructor
// D runtime calls this exactly once, before main(), on the main thread,
// in module-dependency order. No locking needed here.
shared static this()
{
_initPlatform();
}

// Per-thread init: thread-local static constructor
// D runtime calls this automatically for every thread (including main) when
// that thread begins. For the main thread it runs after shared static this().
static this()
{
_initThread();
}

// Public API

/**
* Defensive fallback: ensures the runtime is fully initialised for the
* calling thread.
*
* Under normal operation this is a no-op (both flags are already true before
* main() starts). It guards against unusual call sites — e.g. Buffer
* constructed at global scope, or code reached before module constructors
* have finished in a pathological import order.
*
* Cost on the fast path: two bool reads, no locking.
*/
void ensureInit()
{
if (!_platformReady) _initPlatform(); // global guard (synchronized inside)
if (!_threadReady) _initThread(); // per-thread guard (no lock, TLS)
}

/**
* Returns the default Device (same object from every thread).
*/
Device defaultDevice()
{
return _defaultDevice;
}

/**
* Returns the default Queue for the *calling* thread.
* Each thread gets its own independent Queue — no contention, no locking.
*/
Queue defaultQueue()
{
return _threadQueue;
}

// Private implementation

private void _initPlatform()
{
// Double-checked locking: fast no-lock path after first init.
if (_platformReady) return;

synchronized
{
if (_platformReady) return;

Platform.initialise();
// Device 0 is the default. Users needing a specific device can call
// Platform.getDevices() and manage their own Context + Queue.

// TODO : Make multi-device usage better and easy.
_defaultDevice = Platform.getDevices(Mallocator.instance)[0];

// cuCtxCreate creates the context AND pushes it onto the calling
// thread's CUDA context stack.
_defaultContext = Context(_defaultDevice);

// Compile-time PTX embedding
// LDC compiles @compute modules to PTX first, then compiles host code,
// so import() resolves in a single build pass with no file I/O at
// runtime.
// The version ladder is pure data — one line per SM level. Adding a
// new target only requires one new `else version` line here plus the
// matching "DComputeCUDA_XXX" entry in dub.json.

_platformReady = true;
}
}

private void _initThread()
{
if (_threadReady) return;

// The thread that ran _initPlatform() already has _defaultContext on its
// CUDA context stack (cuCtxCreate pushes automatically).
// Every other thread must push it explicitly before creating a stream.
if (Context.current.raw != _defaultContext.raw)
Context.push(_defaultContext);

_threadQueue = Queue(false); // non-async stream, bound to this thread
_threadReady = true;
}

/**
* Launch kernel `k` on the calling thread's default Queue using the globally
* loaded Program. Platform, Device, Context, Queue and Program are all
* initialised lazily on the first call — the user needs no boilerplate.
*
* Example:
* launch!saxpy([N,1,1], [1,1,1], b_res, alpha, b_x, b_y, N);
*
* Params:
* grid = Grid dimensions [x, y, z].
* block = Block dimensions [x, y, z].
* args = Kernel arguments (host types, Buffer/UnifiedBuffer ).
*/
auto launch(alias k)(uint[3] grid, uint[3] block,
HostArgsOf!(typeof(k)) args)
{
if (Program.globalProgram.raw is null)
{
import std.traits : moduleName;
ensureInit();
Program.globalProgram = Program.fromModule!(moduleName!(__traits(parent, k)))();
}
defaultQueue().enqueue!k(grid, block)(args);
}
Loading