Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions bench/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Download Bench

Three small benchmarks for the question behind dropping the `hf` CLI from
modelwrap: do we need the Xet stack for fast large-model downloads, or
would a stdlib-only Go downloader do?

Each is a standalone Go program with its own run script, and each writes a
TSV of results you can `rsync` off the bench host.

- `diskwrite/` — raw disk write throughput (no network). Writes zero-filled
blocks, fsyncs, reports write-only vs write+sync MiB/s.
- `netread/` — raw network download throughput (no disk). Streams every file
in a Hugging Face repo to `io.Discard`, reports per-file and total MiB/s.
- `naive/` — the real stdlib-only downloader, sequential, with per-file
network and disk timing separated. Compare against `netread` (no disk) and
`diskwrite` (no network) to see where time goes.

All three are Go standard library only — no Python, no `huggingface_hub`,
no `hf_xet`.

## Run

On a box with Go (e.g. `inf8.tinfoil.sh`, downloads to `/mnt/large`):

```bash
OUT_BASE=/mnt/large/modelwrap-bench bash bench/diskwrite/run.sh
OUT_BASE=/mnt/large/modelwrap-bench bash bench/netread/run.sh
OUT_BASE=/mnt/large/modelwrap-bench bash bench/naive/run.sh
```

Results land in `$OUT_BASE/{diskwrite,netread,naive}.tsv` (tab-separated).

`bench/xet_probe.py` is a separate one-off: it detects which download path
`huggingface_hub` actually takes (native Xet CAS vs plain HTTPS) for a file.
149 changes: 149 additions & 0 deletions bench/diskwrite/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
// Command diskwrite measures raw disk write throughput: it writes a file of
// arbitrary data in fixed-size blocks, then fsyncs it. It reports write-only
// and write+sync throughput so disk speed can be compared against network
// in isolation. No network, no HF, no Python — just the disk.
package main

import (
"flag"
"fmt"
"log"
"os"
"strconv"
"strings"
"time"
)

// main parses and validates the command-line flags, then runs the benchmark.
//
// Flags: --out is the scratch file to write (required), --size the total
// number of bytes to write, --bs the size of each write call, and --results an
// optional TSV file that receives one appended row per run.
//
// Any flag or I/O error is fatal and terminates the process with a non-zero
// exit status.
func main() {
	out := flag.String("out", "", "output file path")
	sizeStr := flag.String("size", "10GiB", "total bytes to write (e.g. 10GiB, 512MiB)")
	bsStr := flag.String("bs", "1MiB", "block size (e.g. 1MiB)")
	results := flag.String("results", "", "append a TSV row to this path")
	flag.Parse()

	if *out == "" {
		log.Fatal("usage: diskwrite --out <file> [--size 10GiB] [--bs 1MiB] [--results file]")
	}
	size, err := parseSize(*sizeStr)
	if err != nil {
		log.Fatalf("size: %v", err)
	}
	bs, err := parseSize(*bsStr)
	if err != nil {
		log.Fatalf("bs: %v", err)
	}
	if bs <= 0 || size <= 0 {
		log.Fatal("size and bs must be > 0")
	}

	// Everything that owns the scratch file lives in runBench: log.Fatal exits
	// via os.Exit, which skips deferred calls, so the cleanup has to finish
	// before we get back here.
	if err := runBench(*out, size, bs, *results); err != nil {
		log.Fatal(err)
	}
}

// runBench writes size bytes to out in blocks of at most bs bytes, fsyncs the
// file, prints the write-only and write+sync timings to stdout and, when
// results is non-empty, appends a TSV row to that path.
//
// The written data is a zero-filled buffer. The scratch file is removed before
// runBench returns, on success and on failure alike. A failure to append the
// results row is logged as a warning and does not fail the run.
func runBench(out string, size, bs int64, results string) error {
	buf := make([]byte, bs)
	f, err := os.Create(out)
	if err != nil {
		return err
	}
	closed := false
	defer func() {
		if !closed {
			f.Close() // already on an error path; the first error is what gets reported
		}
		os.Remove(out) // best-effort cleanup of the scratch file
	}()

	written := int64(0)
	writeStart := time.Now()
	for written < size {
		// The final block may be shorter than bs.
		n := bs
		if written+n > size {
			n = size - written
		}
		if _, err := f.Write(buf[:n]); err != nil {
			return fmt.Errorf("write at %d: %w", written, err)
		}
		written += n
	}
	writeElapsed := time.Since(writeStart)

	// Sync is timed separately so page-cache writes are not mistaken for disk
	// throughput.
	syncStart := time.Now()
	if err := f.Sync(); err != nil {
		return fmt.Errorf("sync: %w", err)
	}
	syncElapsed := time.Since(syncStart)

	closed = true
	if err := f.Close(); err != nil {
		return err
	}

	total := writeElapsed + syncElapsed
	writeMib := mib(written, writeElapsed)
	totalMib := mib(written, total)
	gib := float64(written) / (1 << 30)

	fmt.Printf("diskwrite: %d bytes (%.2f GiB)\n", written, gib)
	fmt.Printf(" write: %.3fs %.1f MiB/s\n", writeElapsed.Seconds(), writeMib)
	fmt.Printf(" sync: %.3fs\n", syncElapsed.Seconds())
	fmt.Printf(" total: %.3fs %.1f MiB/s\n", total.Seconds(), totalMib)

	if results != "" {
		if err := appendRow(results, written, writeElapsed, syncElapsed, total); err != nil {
			log.Printf("warning: write results: %v", err)
		}
	}
	return nil
}

// mib converts a byte count and an elapsed duration into a throughput in
// MiB per second. A zero or negative duration yields 0 rather than a division
// by zero or a negative rate.
func mib(b int64, d time.Duration) float64 {
	const bytesPerMiB = 1 << 20

	if d > 0 {
		perSecond := float64(b) / d.Seconds()
		return perSecond / bytesPerMiB
	}
	return 0
}

// appendRow appends one tab-separated result row to the file at path, creating
// the file if it does not exist. A header line is written first whenever the
// file is empty, which covers both a freshly created file and a pre-existing
// zero-length one.
//
// Columns: bytes, gib, write_s, sync_s, total_s, write_mib_s, total_mib_s.
//
// It returns the first error hit while opening, inspecting, writing or closing
// the file, so a short write (e.g. on a full disk) is reported to the caller
// instead of being dropped.
func appendRow(path string, bytes int64, write, sync, total time.Duration) (err error) {
	f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		return err
	}
	defer func() {
		// Close can surface a deferred write error; keep it unless an earlier
		// error is already being returned.
		if cerr := f.Close(); err == nil {
			err = cerr
		}
	}()

	// Decide on the header from the opened file itself rather than a separate
	// os.Stat beforehand, so the check and the write see the same file.
	info, err := f.Stat()
	if err != nil {
		return err
	}
	if info.Size() == 0 {
		if _, err := fmt.Fprintln(f, "bytes\tgib\twrite_s\tsync_s\ttotal_s\twrite_mib_s\ttotal_mib_s"); err != nil {
			return err
		}
	}

	_, err = fmt.Fprintf(f, "%d\t%.2f\t%.3f\t%.3f\t%.3f\t%.1f\t%.1f\n",
		bytes, float64(bytes)/(1<<30),
		write.Seconds(), sync.Seconds(), total.Seconds(),
		mib(bytes, write), mib(bytes, total))
	return err
}

// parseSize parses a human-readable byte count such as "10GiB", "512MiB" or
// "4096" into a number of bytes.
//
// The input is a non-negative decimal integer followed by an optional unit;
// surrounding whitespace and whitespace between number and unit are ignored,
// and the unit is case-insensitive. Binary units (KiB, MiB, GiB, TiB) are
// powers of 1024, decimal units (KB, MB, GB, TB) are powers of 1000, and a
// missing unit or "B" means bytes.
//
// It returns an error when the number is missing or malformed, the unit is
// unknown, or the result does not fit in an int64.
func parseSize(s string) (int64, error) {
	const maxInt64 = 1<<63 - 1

	s = strings.TrimSpace(s)
	i := 0
	for i < len(s) && s[i] >= '0' && s[i] <= '9' {
		i++
	}
	if i == 0 {
		return 0, fmt.Errorf("invalid size %q", s)
	}
	n, err := strconv.ParseInt(s[:i], 10, 64)
	if err != nil {
		return 0, fmt.Errorf("invalid size %q: %w", s, err)
	}

	var mult int64
	switch strings.ToLower(strings.TrimSpace(s[i:])) {
	case "", "b":
		mult = 1
	case "kib":
		mult = 1 << 10
	case "mib":
		mult = 1 << 20
	case "gib":
		mult = 1 << 30
	case "tib":
		mult = 1 << 40
	case "kb":
		mult = 1000
	case "mb":
		mult = 1000 * 1000
	case "gb":
		mult = 1000 * 1000 * 1000
	case "tb":
		mult = 1000 * 1000 * 1000 * 1000
	default:
		return 0, fmt.Errorf("unknown unit in %q", s)
	}

	// n is non-negative (only digits were accepted), so this single comparison
	// detects every product that would wrap around int64.
	if n > maxInt64/mult {
		return 0, fmt.Errorf("size %q overflows int64", s)
	}
	return n * mult, nil
}
12 changes: 12 additions & 0 deletions bench/diskwrite/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env bash
# diskwrite: raw disk write throughput (no network). Builds the diskwrite
# binary, writes SIZE bytes in BS-sized blocks to a scratch file, fsyncs, and
# reports write vs write+sync MiB/s. Results -> $OUT_BASE/diskwrite.tsv
#
# Environment (all optional):
#   OUT_BASE  directory for the binary, scratch file and results
#             (default /mnt/large/modelwrap-bench)
#   SIZE      total bytes to write (default 10GiB)
#   BS        block size per write (default 1MiB)
set -euo pipefail
OUT_BASE="${OUT_BASE:-/mnt/large/modelwrap-bench}"
SIZE="${SIZE:-10GiB}"
BS="${BS:-1MiB}"
mkdir -p "$OUT_BASE"
# Resolve to an absolute path: the build below runs from the module directory,
# so a relative OUT_BASE would otherwise name a different directory for
# `go build -o` than for mkdir and the benchmark run.
OUT_BASE="$(CDPATH= cd -- "$OUT_BASE" && pwd)"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
( cd "$SCRIPT_DIR/.." && go build -o "$OUT_BASE/diskwrite" ./diskwrite )
"$OUT_BASE/diskwrite" --out "$OUT_BASE/diskwrite.data" --size "$SIZE" --bs "$BS" --results "$OUT_BASE/diskwrite.tsv"
echo "results: $OUT_BASE/diskwrite.tsv"
3 changes: 3 additions & 0 deletions bench/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
module github.com/tinfoilsh/modelwrap/bench

go 1.22.0
Loading
Loading