fix tied/untied LM-head

ngc92 · ngc92 · commit 79c8c9ff00a2 · 2026-03-06T00:12:19.000+01:00
diff --git a/src/models/llama_gradients.cpp b/src/models/llama_gradients.cpp
@@ -11,8 +11,7 @@
 void LLamaGradientsUnsharded::on_first_micro_step(cudaStream_t stream) {
     using namespace LLamaWeightID;
     fill_zero(mNonBlockGradients.get_tensor(LNF_W), stream);
-    if(mNonBlockGradients.get_tensor(LM_HEAD).Data != mNonBlockGradients.get_tensor(EMBEDDING).Data) {
-        fill_zero(mNonBlockGradients.get_tensor(LM_HEAD), stream);// TODO superfluous?
+    if(mNonBlockGradients.get_tensor(LM_HEAD).Data != nullptr) {
         fill_zero(mNonBlockGradients.get_tensor(EMBEDDING), stream);
     } else {
         // embedding backward comes after LMHead backward; and LMHead backward *sets* the gradient
@@ -32,8 +31,8 @@ void LLamaGradientsUnsharded::on_first_micro_step(cudaStream_t stream) {
 // shard the transformer blocks, but not the embeddings and lmhead.
 
 void LLamaGradientsBlockShardedBase::on_first_micro_step(cudaStream_t stream) {
-    // if we have untied embeddings, we need to zero them out
-    if(mFullNonBlock.get_tensor(LLamaWeightID::EMBEDDING).Data != mFullNonBlock.get_tensor(LLamaWeightID::LM_HEAD).Data) {
+    // with untied embeddings (detected by a non-null LM_HEAD gradient), zero the embedding gradient, as in LLamaGradientsUnsharded::on_first_micro_step
+    if(mFullNonBlock.get_tensor(LLamaWeightID::LM_HEAD).Data != nullptr) {
         fill_zero(mFullNonBlock.get_tensor(LLamaWeightID::EMBEDDING), stream);
     }
     fill_zero(mFullNonBlock.get_tensor(LLamaWeightID::LNF_W), stream);
@@ -102,7 +101,6 @@ void LLamaGradientsBlockSharded_AllToAll::on_notify_block(int layer_idx, SimpleT
     }
 
     // make sure we've done the local accumulation before we allow communication to begin.
-
     CUDA_CHECK(cudaEventRecord(signal, stream));
     NvtxRange range("all-to-all-gradients", layer_idx);
 
diff --git a/src/models/llama_model.cpp b/src/models/llama_model.cpp
@@ -509,7 +509,16 @@ void LLamaModel::_backward_lmhead(long B, long T, float z_loss, int micro_step,
 
         // handle the LM-head. We run the d_lmhead matmul first, so that the gradient reduction can overlap with the DLNF matmul.
         bool accumulate;
-        auto& d_lmhead = Grads->get_non_block_full(LLamaWeightID::LM_HEAD, main_stream, comm, accumulate);
+        // get the correct matrix depending on whether we have tied embeddings
+        auto& d_lmhead = [&]() -> Tensor& {
+            if (Config.TiedWordEmbeddings) {
+                return Grads->get_non_block_full(LLamaWeightID::EMBEDDING, main_stream, comm, accumulate);
+            } else {
+                return Grads->get_non_block_full(LLamaWeightID::LM_HEAD, main_stream, comm, accumulate);
+            }
+        }();
+
+        // even if the first micro-batch overwrites the gradient, every nano-batch after the first must accumulate into it
         accumulate |= nano_step != 0;
         matmul(d_lmhead, lnf_slice, rs->Output, Tensor{}, nullptr, nullptr,
                rs->CublasLtHandle, rs->CuBlasWorkspace, C, V, nano_batch_size, EMMTranspose::NT, accumulate, main_stream, rs->MatmulBackend);
@@ -757,6 +766,8 @@ void LLamaModel::fill_non_block_shapes(GenericTensorContainer& target, const Tra
     create(target.get_tensor(LLamaWeightID::LNF_W), C, 0, other_dtype);
     if(!config.TiedWordEmbeddings) {
         create(target.get_tensor(LLamaWeightID::LM_HEAD), V, C, matrix_dtype);
+    } else {
+        create(target.get_tensor(LLamaWeightID::LM_HEAD), 0, 0, matrix_dtype);
     }
 }