All commands expect the following environment variable for common arguments:
export ARGS="--train-file=data/tiny-stories-qwen/train-*.bin --eval-file=data/tiny-stories-qwen/eval.bin \
--ckpt-interval=10000 --from-scratch --seq-length=1024 --model-dtype=bf16 --opt-m-dtype=bf16 \
--opt-v-dtype=bf16 --gpus=1 --use-cuda-graphs"

./build/train ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=e4m3 --batch-size=32 --grad-accumulation=16
# [T] step 0 [ 0.1%] | time: 5634 ms | norm 14.193143 | loss 12.119067 | tps 93057 | sol 37.3%
# [T] step 1 [ 0.2%] | time: 5456 ms | norm 16.201113 | loss 11.495615 | tps 96093 | sol 38.5%
# [T] step 2 [ 0.3%] | time: 5460 ms | norm 12.649597 | loss 10.924326 | tps 96023 | sol 38.5%
# [T] step 3 [ 0.5%] | time: 5465 ms | norm 9.578722 | loss 10.513724 | tps 95935 | sol 38.5%
# [T] step 4 [ 0.6%] | time: 5468 ms | norm 8.089824 | loss 10.235654 | tps 95882 | sol 38.5%
# [T] step 5 [ 0.7%] | time: 5471 ms | norm 6.755733 | loss 10.023256 | tps 95830 | sol 38.4%

./build/train ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=16
# [T] step 0 [ 0.1%] | time: 6225 ms | norm 14.358118 | loss 12.121218 | tps 84222 | sol 51.7%
# [T] step 1 [ 0.2%] | time: 6069 ms | norm 16.448149 | loss 11.495825 | tps 86387 | sol 53.1%
# [T] step 2 [ 0.3%] | time: 6080 ms | norm 12.630775 | loss 10.924068 | tps 86231 | sol 53.0%
# [T] step 3 [ 0.5%] | time: 6081 ms | norm 9.669825 | loss 10.513218 | tps 86217 | sol 53.0%
# [T] step 4 [ 0.6%] | time: 6083 ms | norm 8.126737 | loss 10.232552 | tps 86189 | sol 53.0%
# [T] step 5 [ 0.7%] | time: 6097 ms | norm 6.798275 | loss 10.017941 | tps 85991 | sol 52.8%

./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=e4m3 --batch-size=32 --grad-accumulation=16
# [T] step 0 [ 0.1%] | time: 13100 ms | norm 15.995070 | loss 12.243080 | tps 40021 | sol 44.4%
# [T] step 1 [ 0.2%] | time: 12955 ms | norm 20.734991 | loss 11.048519 | tps 40469 | sol 44.9%
# [T] step 2 [ 0.3%] | time: 12983 ms | norm 12.357675 | loss 9.907482 | tps 40382 | sol 44.8%
# [T] step 3 [ 0.5%] | time: 13001 ms | norm 11.488480 | loss 9.301374 | tps 40326 | sol 44.8%
# [T] step 4 [ 0.6%] | time: 13016 ms | norm 12.547009 | loss 8.981881 | tps 40280 | sol 44.7%
# [T] step 5 [ 0.7%] | time: 13021 ms | norm 10.865277 | loss 8.700561 | tps 40264 | sol 44.7%

./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=16
# [T] step 0 [ 0.1%] | time: 16220 ms | norm 16.096266 | loss 12.243530 | tps 32323 | sol 61.1%
# [T] step 1 [ 0.2%] | time: 16114 ms | norm 20.710337 | loss 11.035639 | tps 32536 | sol 61.5%
# [T] step 2 [ 0.3%] | time: 16152 ms | norm 12.318372 | loss 9.896132 | tps 32459 | sol 61.4%
# [T] step 3 [ 0.5%] | time: 16171 ms | norm 11.506572 | loss 9.289701 | tps 32421 | sol 61.3%
# [T] step 4 [ 0.6%] | time: 16173 ms | norm 12.408413 | loss 8.961591 | tps 32417 | sol 61.3%
# [T] step 5 [ 0.7%] | time: 16181 ms | norm 11.920678 | loss 8.689033 | tps 32401 | sol 61.3%

./build/train ${ARGS} --model=Qwen2.5-3B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=32
# [T] step 0 [ 0.1%] | time: 22771 ms | norm 17.133600 | loss 12.363778 | tps 23024 | sol 48.6%
# [T] step 1 [ 0.2%] | time: 22670 ms | norm 29.894756 | loss 10.841306 | tps 23126 | sol 48.9%
# [T] step 2 [ 0.3%] | time: 22700 ms | norm 150.413696 | loss 10.680067 | tps 23096 | sol 48.8%
# [T] step 3 [ 0.5%] | time: 22700 ms | norm 70.165176 | loss 9.605476 | tps 23096 | sol 48.8%
# [T] step 4 [ 0.6%] | time: 22694 ms | norm 38.271679 | loss 9.464457 | tps 23102 | sol 48.8%
# [T] step 5 [ 0.7%] | time: 22683 ms | norm 22.784163 | loss 9.420075 | tps 23113 | sol 48.8%

./build/train ${ARGS} --model=Qwen2.5-3B --matmul-dtype=bf16 --batch-size=16 --grad-accumulation=32
# [T] step 0 [ 0.1%] | time: 30499 ms | norm 17.293736 | loss 12.366375 | tps 17190 | sol 64.7%
# [T] step 1 [ 0.2%] | time: 30483 ms | norm 29.885283 | loss 10.833670 | tps 17199 | sol 64.7%
# [T] step 2 [ 0.3%] | time: 30514 ms | norm 148.883972 | loss 10.652156 | tps 17181 | sol 64.7%
# [T] step 3 [ 0.5%] | time: 30516 ms | norm 70.251312 | loss 9.592727 | tps 17180 | sol 64.7%
# [T] step 4 [ 0.6%] | time: 30512 ms | norm 38.035744 | loss 9.457141 | tps 17183 | sol 64.7%
# [T] step 5 [ 0.7%] | time: 30505 ms | norm 23.024145 | loss 9.407635 | tps 17186 | sol 64.7%

./build/train ${ARGS} --model=Qwen2.5-7B --matmul-dtype=e4m3 --batch-size=8 --grad-accumulation=64 --lmhead-chunks=2
# [T] step 0 [ 0.1%] | time: 45476 ms | norm 12.235020 | loss 12.650134 | tps 11528 | sol 53.7%
# [T] step 1 [ 0.2%] | time: 45493 ms | norm 23.953074 | loss 10.777575 | tps 11524 | sol 53.7%
# [T] step 2 [ 0.3%] | time: 45502 ms | norm 198.857437 | loss 15.783616 | tps 11522 | sol 53.7%
# [T] step 3 [ 0.5%] | time: 45459 ms | norm 145.786591 | loss 15.086870 | tps 11533 | sol 53.7%
# [T] step 4 [ 0.6%] | time: 45422 ms | norm 133.364182 | loss 15.064936 | tps 11542 | sol 53.7%
# [T] step 5 [ 0.7%] | time: 45416 ms | norm 109.239677 | loss 13.035682 | tps 11544 | sol 53.8%

./build/train ${ARGS} --model=Qwen2.5-7B --matmul-dtype=bf16 --batch-size=8 --grad-accumulation=64 --lmhead-chunks=2
# [T] step 0 [ 0.1%] | time: 64784 ms | norm 12.297297 | loss 12.650360 | tps 8092 | sol 69.1%
# [T] step 1 [ 0.2%] | time: 64888 ms | norm 23.911800 | loss 10.767729 | tps 8079 | sol 69.0%
# [T] step 2 [ 0.3%] | time: 64903 ms | norm 199.028992 | loss 15.786371 | tps 8078 | sol 69.0%
# [T] step 3 [ 0.5%] | time: 64898 ms | norm 146.806534 | loss 15.094817 | tps 8078 | sol 69.0%
# [T] step 4 [ 0.6%] | time: 64891 ms | norm 133.626602 | loss 15.091805 | tps 8079 | sol 69.0%
# [T] step 5 [ 0.7%] | time: 64881 ms | norm 113.991562 | loss 13.045942 | tps 8080 | sol 69.0%

On the RTX Pro 6000, --use-zero-copy appears to be slower than memcpy-based transfers.
./build/train ${ARGS} --model=Qwen2.5-14B --matmul-dtype=e4m3 --batch-size=8 --grad-accumulation=64 \
--lmhead-chunks=2 --offload-opt-m --offload-opt-v --offload-master
# [T] step 0 [ 0.1%] | time: 87705 ms | norm 17.875080 | loss 12.966641 | tps 5977 | sol 54.4%
# [T] step 1 [ 0.2%] | time: 86999 ms | norm 98.621193 | loss 11.152443 | tps 6026 | sol 54.8%
# [T] step 2 [ 0.3%] | time: 87008 ms | norm 46.631096 | loss 12.567963 | tps 6025 | sol 54.8%
# [T] step 3 [ 0.5%] | time: 87009 ms | norm 214.977768 | loss 16.865253 | tps 6025 | sol 54.8%
# [T] step 4 [ 0.6%] | time: 86996 ms | norm 175.693497 | loss 17.263870 | tps 6026 | sol 54.8%
# [T] step 5 [ 0.7%] | time: 87021 ms | norm 203.495667 | loss 15.400432 | tps 6024 | sol 54.8%

./build/train ${ARGS} --model=Qwen2.5-14B --matmul-dtype=bf16 --batch-size=4 --grad-accumulation=128 \
--lmhead-chunks=2 --offload-opt-m --offload-opt-v
# [T] step 0 [ 0.1%] | time: 129 s | norm 17.839781 | loss 12.968658 | tps 4043 | sol 68.6%
# [T] step 1 [ 0.2%] | time: 129 s | norm 96.474220 | loss 11.130053 | tps 4033 | sol 68.4%
# [T] step 2 [ 0.3%] | time: 129 s | norm 46.290825 | loss 12.560698 | tps 4034 | sol 68.4%
# [T] step 3 [ 0.5%] | time: 129 s | norm 215.747574 | loss 16.763985 | tps 4035 | sol 68.5%
# [T] step 4 [ 0.6%] | time: 129 s | norm 177.702576 | loss 17.259163 | tps 4037 | sol 68.5%
# [T] step 5 [ 0.7%] | time: 129 s | norm 211.524994 | loss 15.369626 | tps 4038 | sol 68.5%