diff --git a/bench/README.md b/bench/README.md new file mode 100644 index 0000000..a49779f --- /dev/null +++ b/bench/README.md @@ -0,0 +1,34 @@ +# Download Bench + +Three small benchmarks for the question behind dropping the `hf` CLI from +modelwrap: do we need the Xet stack for fast large-model downloads, or +would a stdlib-only Go downloader do? + +Each is a standalone Go program with its own run script, and each writes a +TSV of results you can `rsync` off the bench host. + +- `diskwrite/` — raw disk write throughput (no network). Writes arbitrary + data, fsyncs, reports write vs write+sync MiB/s. +- `netread/` — raw network download throughput (no disk). Streams every file + in a Hugging Face repo to `io.Discard`, reports per-file and total MiB/s. +- `naive/` — the real stdlib-only downloader, sequential, with per-file + network and disk timing separated. Compare against `netread` (no disk) and + `diskwrite` (no network) to see where time goes. + +All three are Go standard library only — no Python, no `huggingface_hub`, +no `hf_xet`. + +## Run + +On a box with Go (e.g. `inf8.tinfoil.sh`, downloads to `/mnt/large`): + +```bash +OUT_BASE=/mnt/large/modelwrap-bench bash bench/diskwrite/run.sh +OUT_BASE=/mnt/large/modelwrap-bench bash bench/netread/run.sh +OUT_BASE=/mnt/large/modelwrap-bench bash bench/naive/run.sh +``` + +Results land in `$OUT_BASE/{diskwrite,netread,naive}.tsv` (tab-separated). + +`bench/xet_probe.py` is a separate one-off: it detects which download path +`huggingface_hub` actually takes (native Xet CAS vs plain HTTPS) for a file. diff --git a/bench/diskwrite/main.go b/bench/diskwrite/main.go new file mode 100644 index 0000000..1af2761 --- /dev/null +++ b/bench/diskwrite/main.go @@ -0,0 +1,149 @@ +// Command diskwrite measures raw disk write throughput: it writes a file of +// arbitrary data in fixed-size blocks, then fsyncs it. It reports write-only +// and write+sync throughput so disk speed can be compared against network +// in isolation. 
// maxInt64 is the largest byte count parseSize can return; it is used to
// reject inputs whose unit multiplication would wrap around.
const maxInt64 = 1<<63 - 1

// main parses and validates the flags, runs the write+fsync benchmark, prints
// a human-readable summary to stdout and, when --results is set, appends one
// TSV row. All file handling lives in writeAndSync so that its deferred
// cleanup runs before main decides to exit.
func main() {
	out := flag.String("out", "", "output file path")
	sizeStr := flag.String("size", "10GiB", "total bytes to write (e.g. 10GiB, 512MiB)")
	bsStr := flag.String("bs", "1MiB", "block size (e.g. 1MiB)")
	results := flag.String("results", "", "append a TSV row to this path")
	flag.Parse()

	if *out == "" {
		log.Fatal("usage: diskwrite --out <file> [--size 10GiB] [--bs 1MiB] [--results file]")
	}
	size, err := parseSize(*sizeStr)
	if err != nil {
		log.Fatalf("size: %v", err)
	}
	bs, err := parseSize(*bsStr)
	if err != nil {
		log.Fatalf("bs: %v", err)
	}
	if bs <= 0 || size <= 0 {
		log.Fatal("size and bs must be > 0")
	}

	written, writeElapsed, syncElapsed, err := writeAndSync(*out, size, bs)
	if err != nil {
		// Safe to exit here: writeAndSync has already removed the scratch file.
		log.Fatal(err)
	}

	total := writeElapsed + syncElapsed
	fmt.Printf("diskwrite: %d bytes (%.2f GiB)\n", written, float64(written)/(1<<30))
	fmt.Printf(" write: %.3fs %.1f MiB/s\n", writeElapsed.Seconds(), mib(written, writeElapsed))
	fmt.Printf(" sync: %.3fs\n", syncElapsed.Seconds())
	fmt.Printf(" total: %.3fs %.1f MiB/s\n", total.Seconds(), mib(written, total))

	if *results != "" {
		if err := appendRow(*results, written, writeElapsed, syncElapsed, total); err != nil {
			log.Printf("warning: write results: %v", err)
		}
	}
}

// writeAndSync creates path, writes size zero bytes to it in blocks of at most
// bs bytes, fsyncs and closes it. It returns the number of bytes written, the
// time spent in the write loop and the time spent in Sync.
//
// The file is scratch data: it is removed on every return path, success or
// failure. Keeping the defer in this function (rather than next to a
// log.Fatal call) is what guarantees the removal actually runs.
func writeAndSync(path string, size, bs int64) (int64, time.Duration, time.Duration, error) {
	// A block larger than the whole file would only waste memory.
	if bs > size {
		bs = size
	}
	buf := make([]byte, bs)

	f, err := os.Create(path)
	if err != nil {
		return 0, 0, 0, err
	}
	defer os.Remove(path)

	var written int64
	writeStart := time.Now()
	for written < size {
		n := bs
		// Compare against the remainder instead of written+n so the check
		// itself cannot overflow for sizes close to the int64 limit.
		if rest := size - written; rest < n {
			n = rest
		}
		if _, err := f.Write(buf[:n]); err != nil {
			f.Close()
			return written, 0, 0, fmt.Errorf("write at %d: %w", written, err)
		}
		written += n
	}
	writeElapsed := time.Since(writeStart)

	syncStart := time.Now()
	if err := f.Sync(); err != nil {
		f.Close()
		return written, writeElapsed, 0, fmt.Errorf("sync: %w", err)
	}
	syncElapsed := time.Since(syncStart)

	if err := f.Close(); err != nil {
		return written, writeElapsed, syncElapsed, err
	}
	return written, writeElapsed, syncElapsed, nil
}

// mib converts b bytes transferred in d into MiB per second. A zero or
// negative duration yields 0 instead of dividing by zero.
func mib(b int64, d time.Duration) float64 {
	if d <= 0 {
		return 0
	}
	return float64(b) / d.Seconds() / (1 << 20)
}

// appendRow appends one tab-separated result row to path, creating the file if
// needed. The column header is written only when the opened file is empty.
// Write errors and the Close error are reported to the caller.
func appendRow(path string, bytes int64, write, sync, total time.Duration) (err error) {
	f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		return err
	}
	defer func() {
		// Close can surface a deferred write failure; keep the first error.
		if cerr := f.Close(); err == nil {
			err = cerr
		}
	}()

	// Inspect the file we actually opened rather than stat-ing the path
	// beforehand, so the header decision cannot race with the open.
	info, err := f.Stat()
	if err != nil {
		return err
	}
	if info.Size() == 0 {
		if _, err = fmt.Fprintln(f, "bytes\tgib\twrite_s\tsync_s\ttotal_s\twrite_mib_s\ttotal_mib_s"); err != nil {
			return err
		}
	}
	_, err = fmt.Fprintf(f, "%d\t%.2f\t%.3f\t%.3f\t%.3f\t%.1f\t%.1f\n",
		bytes, float64(bytes)/(1<<30),
		write.Seconds(), sync.Seconds(), total.Seconds(),
		mib(bytes, write), mib(bytes, total))
	return err
}

// parseSize parses a non-negative decimal integer followed by an optional,
// case-insensitive unit: b, kib, mib, gib, tib (powers of 1024) or kb, mb, gb,
// tb (powers of 1000). Surrounding whitespace and whitespace between number
// and unit are ignored. It returns an error for a missing number, an unknown
// unit, or a result that does not fit in an int64.
func parseSize(s string) (int64, error) {
	s = strings.TrimSpace(s)
	digits := 0
	for digits < len(s) && s[digits] >= '0' && s[digits] <= '9' {
		digits++
	}
	if digits == 0 {
		return 0, fmt.Errorf("invalid size %q", s)
	}
	n, err := strconv.ParseInt(s[:digits], 10, 64)
	if err != nil {
		return 0, fmt.Errorf("invalid size %q: %w", s, err)
	}

	var mult int64
	switch strings.ToLower(strings.TrimSpace(s[digits:])) {
	case "", "b":
		mult = 1
	case "kib":
		mult = 1 << 10
	case "mib":
		mult = 1 << 20
	case "gib":
		mult = 1 << 30
	case "tib":
		mult = 1 << 40
	case "kb":
		mult = 1000
	case "mb":
		mult = 1000 * 1000
	case "gb":
		mult = 1000 * 1000 * 1000
	case "tb":
		mult = 1000 * 1000 * 1000 * 1000
	default:
		return 0, fmt.Errorf("unknown unit in %q", s)
	}
	// n is never negative here (only digits were accepted), so this single
	// comparison is a complete overflow check for n*mult.
	if n > maxInt64/mult {
		return 0, fmt.Errorf("size %q overflows int64", s)
	}
	return n * mult, nil
}
// entry is one item of the Hub tree API response.
type entry struct {
	Type string `json:"type"`
	Path string `json:"path"`
	Size int64  `json:"size"`
}

// hub is the base URL every request is built from.
const hub = "https://huggingface.co"

// row is the per-file measurement kept for the TSV report.
type row struct {
	path  string
	bytes int64
	net   time.Duration
	disk  time.Duration
}

// main lists the repo tree, downloads every file sequentially with fetchFile,
// logs per-file timings, prints a totals line to stdout and optionally appends
// the per-file rows to a TSV. Any listing or download error is fatal. If
// HF_TOKEN is set it is sent as a bearer token.
func main() {
	repo := flag.String("repo", "", "Hugging Face repo id, e.g. Qwen/Qwen2.5-72B-Instruct")
	rev := flag.String("revision", "main", "revision (branch or commit)")
	out := flag.String("out", "", "output directory")
	sync := flag.Bool("sync", false, "fsync each file after writing (measures real disk, not page cache)")
	results := flag.String("results", "", "write per-file TSV results to this path")
	flag.Parse()

	if *repo == "" || *out == "" {
		log.Fatal("usage: naive --repo <id> --out <dir> [--revision main] [--sync] [--results file]")
	}
	token := os.Getenv("HF_TOKEN")

	ctx := context.Background()
	files, err := listTree(ctx, *repo, *rev, token)
	if err != nil {
		log.Fatalf("list tree: %v", err)
	}
	log.Printf("listed %d files", len(files))

	rows := make([]row, 0, len(files))
	var totalBytes int64
	var totalNet, totalDisk time.Duration
	start := time.Now()

	for i, f := range files {
		n, netT, diskT, err := fetchFile(ctx, *repo, *rev, f.Path, *out, token, *sync)
		if err != nil {
			log.Fatalf("[%d/%d] %s: %v", i+1, len(files), f.Path, err)
		}
		totalBytes += n
		totalNet += netT
		totalDisk += diskT
		log.Printf("[%d/%d] %s: %d bytes net=%.3fs (%.1f MiB/s) disk=%.3fs (%.1f MiB/s)",
			i+1, len(files), f.Path, n, netT.Seconds(), mib(n, netT), diskT.Seconds(), mib(n, diskT))
		rows = append(rows, row{f.Path, n, netT, diskT})
	}

	wall := time.Since(start)
	fmt.Printf("naive: files=%d bytes=%d (%.2f GiB) net=%.3fs disk=%.3fs wall=%.3fs | net=%.1f MiB/s disk=%.1f MiB/s wall=%.1f MiB/s\n",
		len(rows), totalBytes, float64(totalBytes)/(1<<30),
		totalNet.Seconds(), totalDisk.Seconds(), wall.Seconds(),
		mib(totalBytes, totalNet), mib(totalBytes, totalDisk), mib(totalBytes, wall))

	if *results != "" {
		if err := writeResults(*results, rows, totalBytes, totalNet, totalDisk); err != nil {
			log.Printf("warning: write results: %v", err)
		}
	}
}

// mib converts b bytes transferred in d into MiB per second. A zero or
// negative duration yields 0 instead of dividing by zero.
func mib(b int64, d time.Duration) float64 {
	if d <= 0 {
		return 0
	}
	return float64(b) / d.Seconds() / (1 << 20)
}

// escapeSegments percent-escapes every "/"-separated segment of p and joins
// them back with "/", so separators survive while spaces, "#", "?" and "%"
// inside a segment are encoded.
func escapeSegments(p string) string {
	segs := strings.Split(p, "/")
	for i, s := range segs {
		segs[i] = url.PathEscape(s)
	}
	return strings.Join(segs, "/")
}

// resolveURL builds the download URL for one file of repo at rev. The
// revision is escaped as a single segment, so a revision containing "/" is
// sent with %2F instead of being split into extra path components.
func resolveURL(repo, rev, path string) string {
	return fmt.Sprintf("%s/%s/resolve/%s/%s", hub, escapeSegments(repo), url.PathEscape(rev), escapeSegments(path))
}

// localDest maps a slash-separated repo path to a location under out. The
// path comes from a server response, so anything that is absolute, empty or
// climbs out of the output directory is rejected.
func localDest(out, path string) (string, error) {
	rel := filepath.FromSlash(path)
	if !filepath.IsLocal(rel) {
		return "", fmt.Errorf("%s: refusing path outside the output directory", path)
	}
	return filepath.Join(out, rel), nil
}

// fetchFile reads a file fully into memory (network time, isolated from
// disk) then writes it to a .part file and renames it (disk time, isolated
// from network). The two phases are sequential by design: this measures the
// components separately rather than overlapping them.
//
// It returns the byte count, the network time, the disk time, and an error
// prefixed with the repo path.
func fetchFile(ctx context.Context, repo, rev, path, out, token string, doSync bool) (n int64, netT, diskT time.Duration, err error) {
	dest, err := localDest(out, path)
	if err != nil {
		return 0, 0, 0, err
	}
	if err = os.MkdirAll(filepath.Dir(dest), 0o755); err != nil {
		return 0, 0, 0, err
	}

	buf, netT, err := download(ctx, resolveURL(repo, rev, path), token)
	if err != nil {
		return 0, netT, 0, fmt.Errorf("%s: %w", path, err)
	}
	n = int64(buf.Len())

	diskT, err = writeAtomic(dest, buf, doSync)
	if err != nil {
		return n, netT, diskT, fmt.Errorf("%s: %w", path, err)
	}
	return n, netT, diskT, nil
}

// download GETs u and buffers the whole body in memory. The returned duration
// runs from just before the request is sent until the body has been read.
// The response body is closed on every path.
func download(ctx context.Context, u, token string) (*bytes.Buffer, time.Duration, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return nil, 0, err
	}
	if token != "" {
		req.Header.Set("Authorization", "Bearer "+token)
	}

	start := time.Now()
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, 0, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, 0, fmt.Errorf("%s", resp.Status)
	}

	buf := &bytes.Buffer{}
	if resp.ContentLength > 0 {
		// Pre-size once so buffer growth does not pollute the timing.
		buf.Grow(int(resp.ContentLength))
	}
	_, err = io.Copy(buf, resp.Body)
	elapsed := time.Since(start)
	if err != nil {
		return nil, elapsed, fmt.Errorf("read: %w", err)
	}
	return buf, elapsed, nil
}

// writeAtomic writes buf to dest+".part", optionally fsyncs, closes, and
// renames the result onto dest. The returned duration covers create through
// close. The .part file is removed whenever the write or the rename fails.
func writeAtomic(dest string, buf *bytes.Buffer, doSync bool) (time.Duration, error) {
	tmp := dest + ".part"
	start := time.Now()
	f, err := os.Create(tmp)
	if err != nil {
		return 0, err
	}
	_, err = buf.WriteTo(f)
	if err == nil && doSync {
		err = f.Sync()
	}
	if cerr := f.Close(); err == nil {
		err = cerr
	}
	elapsed := time.Since(start)
	if err != nil {
		os.Remove(tmp)
		return elapsed, fmt.Errorf("write: %w", err)
	}
	if err := os.Rename(tmp, dest); err != nil {
		os.Remove(tmp)
		return elapsed, err
	}
	return elapsed, nil
}

// writeResults appends one TSV row per file plus a TOTAL row to path,
// creating the file if needed. The column header is written only when the
// opened file is empty. Write errors and the Close error are reported.
func writeResults(path string, rows []row, totalBytes int64, totalNet, totalDisk time.Duration) (err error) {
	f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		return err
	}
	defer func() {
		if cerr := f.Close(); err == nil {
			err = cerr
		}
	}()

	// Decide on the header from the opened file itself, not from a separate
	// stat of the path taken before the open.
	info, err := f.Stat()
	if err != nil {
		return err
	}
	if info.Size() == 0 {
		if _, err = fmt.Fprintln(f, "file\tbytes\tgib\tnet_s\tdisk_s\tnet_mib_s\tdisk_mib_s"); err != nil {
			return err
		}
	}
	for _, r := range rows {
		_, err = fmt.Fprintf(f, "%s\t%d\t%.2f\t%.3f\t%.3f\t%.1f\t%.1f\n",
			r.path, r.bytes, float64(r.bytes)/(1<<30),
			r.net.Seconds(), r.disk.Seconds(), mib(r.bytes, r.net), mib(r.bytes, r.disk))
		if err != nil {
			return err
		}
	}
	_, err = fmt.Fprintf(f, "TOTAL\t%d\t%.2f\t%.3f\t%.3f\t%.1f\t%.1f\n",
		totalBytes, float64(totalBytes)/(1<<30),
		totalNet.Seconds(), totalDisk.Seconds(), mib(totalBytes, totalNet), mib(totalBytes, totalDisk))
	return err
}

// listTree paginates the Hub tree API and returns leaf (non-directory) entries.
func listTree(ctx context.Context, repo, rev, token string) ([]entry, error) {
	u := fmt.Sprintf("%s/api/models/%s/tree/%s?recursive=true", hub, escapeSegments(repo), url.PathEscape(rev))

	var files []entry
	for u != "" {
		page, next, err := fetchPage(ctx, u, token)
		if err != nil {
			return nil, err
		}
		for _, e := range page {
			if e.Type != "directory" && e.Type != "tree" {
				files = append(files, e)
			}
		}
		u = next
	}
	return files, nil
}

// fetchPage GETs one page of the tree listing and returns its entries together
// with the URL of the next page ("" when the Link header names none). On a
// non-200 status the error carries at most 4 KiB of the response body.
func fetchPage(ctx context.Context, u, token string) ([]entry, string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return nil, "", err
	}
	if token != "" {
		req.Header.Set("Authorization", "Bearer "+token)
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body only decorates the error message.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 4<<10))
		return nil, "", fmt.Errorf("tree %s: %s: %s", u, resp.Status, body)
	}
	var page []entry
	if err := json.NewDecoder(resp.Body).Decode(&page); err != nil {
		return nil, "", err
	}
	return page, nextLink(resp.Header.Get("Link")), nil
}

// nextLink extracts the target of the rel="next" element from a Link header
// value, or returns "" when there is none. Only the parameters after the
// closing ">" are inspected, so a URL that merely contains the text
// rel="next" is not mistaken for the next page.
func nextLink(link string) string {
	for _, part := range strings.Split(link, ",") {
		target, params, ok := strings.Cut(strings.TrimSpace(part), ">")
		if !ok || !strings.HasPrefix(target, "<") {
			continue
		}
		if strings.Contains(params, `rel="next"`) {
			return strings.TrimPrefix(target, "<")
		}
	}
	return ""
}
// entry is one item of the Hub tree API response.
type entry struct {
	Type string `json:"type"`
	Path string `json:"path"`
	Size int64  `json:"size"`
}

// hub is the base URL every request is built from.
const hub = "https://huggingface.co"

// row is the per-file measurement kept for the TSV report.
type row struct {
	path    string
	bytes   int64
	elapsed time.Duration
}

// main lists the repo tree, streams every file to io.Discard sequentially,
// logs per-file timings, prints a totals line to stdout and optionally appends
// the per-file rows to a TSV. Any listing or download error is fatal. If
// HF_TOKEN is set it is sent as a bearer token.
func main() {
	repo := flag.String("repo", "", "Hugging Face repo id, e.g. Qwen/Qwen2.5-72B-Instruct")
	rev := flag.String("revision", "main", "revision (branch or commit)")
	results := flag.String("results", "", "write per-file TSV results to this path")
	flag.Parse()

	if *repo == "" {
		log.Fatal("usage: netread --repo <id> [--revision main] [--results file]")
	}
	token := os.Getenv("HF_TOKEN")

	ctx := context.Background()
	files, err := listTree(ctx, *repo, *rev, token)
	if err != nil {
		log.Fatalf("list tree: %v", err)
	}
	log.Printf("listed %d files", len(files))

	rows := make([]row, 0, len(files))
	var totalBytes int64
	var totalNet time.Duration
	start := time.Now()

	for i, f := range files {
		n, elapsed, err := readToDiscard(ctx, *repo, *rev, f.Path, token)
		if err != nil {
			log.Fatalf("[%d/%d] %s: %v", i+1, len(files), f.Path, err)
		}
		totalBytes += n
		totalNet += elapsed
		log.Printf("[%d/%d] %s: %d bytes %.3fs %.1f MiB/s", i+1, len(files), f.Path, n, elapsed.Seconds(), mib(n, elapsed))
		rows = append(rows, row{f.Path, n, elapsed})
	}

	wall := time.Since(start)
	fmt.Printf("netread: files=%d bytes=%d (%.2f GiB) net=%.3fs wall=%.3fs | net=%.1f MiB/s wall=%.1f MiB/s\n",
		len(rows), totalBytes, float64(totalBytes)/(1<<30),
		totalNet.Seconds(), wall.Seconds(),
		mib(totalBytes, totalNet), mib(totalBytes, wall))

	if *results != "" {
		if err := writeResults(*results, rows, totalBytes, totalNet); err != nil {
			log.Printf("warning: write results: %v", err)
		}
	}
}

// escapeSegments percent-escapes every "/"-separated segment of p and joins
// them back with "/", so separators survive while spaces, "#", "?" and "%"
// inside a segment are encoded.
func escapeSegments(p string) string {
	segs := strings.Split(p, "/")
	for i, s := range segs {
		segs[i] = url.PathEscape(s)
	}
	return strings.Join(segs, "/")
}

// resolveURL builds the download URL for one file of repo at rev. The
// revision is escaped as a single segment, so a revision containing "/" is
// sent with %2F instead of being split into extra path components.
func resolveURL(repo, rev, path string) string {
	return fmt.Sprintf("%s/%s/resolve/%s/%s", hub, escapeSegments(repo), url.PathEscape(rev), escapeSegments(path))
}

// readToDiscard GETs one file and streams its body to io.Discard. It returns
// the number of body bytes read and the time from just before the request is
// sent until the body is exhausted. The response body is always closed.
func readToDiscard(ctx context.Context, repo, rev, path, token string) (int64, time.Duration, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, resolveURL(repo, rev, path), nil)
	if err != nil {
		return 0, 0, err
	}
	if token != "" {
		req.Header.Set("Authorization", "Bearer "+token)
	}

	start := time.Now()
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return 0, 0, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return 0, 0, fmt.Errorf("%s: %s", path, resp.Status)
	}
	n, err := io.Copy(io.Discard, resp.Body)
	return n, time.Since(start), err
}

// writeResults appends one TSV row per file plus a TOTAL row to path,
// creating the file if needed. The column header is written only when the
// opened file is empty. Write errors and the Close error are reported.
func writeResults(path string, rows []row, totalBytes int64, totalNet time.Duration) (err error) {
	f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		return err
	}
	defer func() {
		if cerr := f.Close(); err == nil {
			err = cerr
		}
	}()

	// Decide on the header from the opened file itself, not from a separate
	// stat of the path taken before the open.
	info, err := f.Stat()
	if err != nil {
		return err
	}
	if info.Size() == 0 {
		if _, err = fmt.Fprintln(f, "file\tbytes\tgib\tnet_s\tnet_mib_s"); err != nil {
			return err
		}
	}
	for _, r := range rows {
		_, err = fmt.Fprintf(f, "%s\t%d\t%.2f\t%.3f\t%.1f\n",
			r.path, r.bytes, float64(r.bytes)/(1<<30), r.elapsed.Seconds(), mib(r.bytes, r.elapsed))
		if err != nil {
			return err
		}
	}
	_, err = fmt.Fprintf(f, "TOTAL\t%d\t%.2f\t%.3f\t%.1f\n",
		totalBytes, float64(totalBytes)/(1<<30), totalNet.Seconds(), mib(totalBytes, totalNet))
	return err
}

// mib converts b bytes transferred in d into MiB per second. A zero or
// negative duration yields 0 instead of dividing by zero.
func mib(b int64, d time.Duration) float64 {
	if d <= 0 {
		return 0
	}
	return float64(b) / d.Seconds() / (1 << 20)
}

// listTree paginates the Hub tree API and returns leaf (non-directory) entries.
func listTree(ctx context.Context, repo, rev, token string) ([]entry, error) {
	u := fmt.Sprintf("%s/api/models/%s/tree/%s?recursive=true", hub, escapeSegments(repo), url.PathEscape(rev))

	var files []entry
	for u != "" {
		page, next, err := fetchPage(ctx, u, token)
		if err != nil {
			return nil, err
		}
		for _, e := range page {
			if e.Type != "directory" && e.Type != "tree" {
				files = append(files, e)
			}
		}
		u = next
	}
	return files, nil
}

// fetchPage GETs one page of the tree listing and returns its entries together
// with the URL of the next page ("" when the Link header names none). On a
// non-200 status the error carries at most 4 KiB of the response body.
func fetchPage(ctx context.Context, u, token string) ([]entry, string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return nil, "", err
	}
	if token != "" {
		req.Header.Set("Authorization", "Bearer "+token)
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body only decorates the error message.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 4<<10))
		return nil, "", fmt.Errorf("tree %s: %s: %s", u, resp.Status, body)
	}
	var page []entry
	if err := json.NewDecoder(resp.Body).Decode(&page); err != nil {
		return nil, "", err
	}
	return page, nextLink(resp.Header.Get("Link")), nil
}

// nextLink extracts the target of the rel="next" element from a Link header
// value, or returns "" when there is none. Only the parameters after the
// closing ">" are inspected, so a URL that merely contains the text
// rel="next" is not mistaken for the next page.
func nextLink(link string) string {
	for _, part := range strings.Split(link, ",") {
		target, params, ok := strings.Cut(strings.TrimSpace(part), ">")
		if !ok || !strings.HasPrefix(target, "<") {
			continue
		}
		if strings.Contains(params, `rel="next"`) {
			return strings.TrimPrefix(target, "<")
		}
	}
	return ""
}
&& go build -o "$OUT_BASE/netread" ./netread ) +"$OUT_BASE/netread" --repo "$MODEL" --revision "$REVISION" --results "$OUT_BASE/netread.tsv" +echo "results: $OUT_BASE/netread.tsv" diff --git a/bench/results/diskwrite.tsv b/bench/results/diskwrite.tsv new file mode 100644 index 0000000..5e4b941 --- /dev/null +++ b/bench/results/diskwrite.tsv @@ -0,0 +1,2 @@ +bytes gib write_s sync_s total_s write_mib_s total_mib_s +10737418240 10.00 2.556 8.278 10.834 4007.0 945.2 diff --git a/bench/results/naive.tsv b/bench/results/naive.tsv new file mode 100644 index 0000000..19586ff --- /dev/null +++ b/bench/results/naive.tsv @@ -0,0 +1,49 @@ +file bytes gib net_s disk_s net_mib_s disk_mib_s +.gitattributes 1519 0.00 0.055 0.000 0.0 30.4 +LICENSE 6962 0.00 0.049 0.000 0.1 239.1 +README.md 6259 0.00 0.050 0.000 0.1 152.1 +config.json 663 0.00 0.049 0.000 0.0 31.4 +generation_config.json 242 0.00 0.054 0.000 0.0 3.3 +merges.txt 1671839 0.00 0.070 0.001 22.6 2778.1 +model-00001-of-00037.safetensors 3762345336 3.50 9.660 1.490 371.4 2408.1 +model-00002-of-00037.safetensors 3995200440 3.72 12.201 1.516 312.3 2513.4 +model-00003-of-00037.safetensors 3812769392 3.55 11.746 1.442 309.6 2522.0 +model-00004-of-00037.safetensors 3995183944 3.72 11.722 1.369 325.0 2782.4 +model-00005-of-00037.safetensors 3995183944 3.72 10.174 1.780 374.5 2140.4 +model-00006-of-00037.safetensors 3995200456 3.72 11.618 1.491 328.0 2555.0 +model-00007-of-00037.safetensors 3812769424 3.55 10.043 1.437 362.0 2529.7 +model-00008-of-00037.safetensors 3995183968 3.72 11.134 1.795 342.2 2123.1 +model-00009-of-00037.safetensors 3995183968 3.72 8.783 1.177 433.8 3238.5 +model-00010-of-00037.safetensors 3995200464 3.72 10.017 1.184 380.4 3217.2 +model-00011-of-00037.safetensors 3812769424 3.55 10.793 1.433 336.9 2538.3 +model-00012-of-00037.safetensors 3995183968 3.72 11.402 1.498 334.1 2543.9 +model-00013-of-00037.safetensors 3995183968 3.72 8.812 1.167 432.4 3264.8 +model-00014-of-00037.safetensors 3995200464 3.72 
11.351 1.345 335.7 2832.5 +model-00015-of-00037.safetensors 3812769424 3.55 10.628 1.710 342.1 2126.6 +model-00016-of-00037.safetensors 3995183968 3.72 10.991 1.494 346.7 2550.4 +model-00017-of-00037.safetensors 3995183968 3.72 9.048 1.491 421.1 2555.5 +model-00018-of-00037.safetensors 3995200464 3.72 11.361 2.854 335.4 1335.2 +model-00019-of-00037.safetensors 3812769424 3.55 9.965 1.617 364.9 2248.1 +model-00020-of-00037.safetensors 3995183968 3.72 38.757 1.505 98.3 2532.4 +model-00021-of-00037.safetensors 3995183968 3.72 9.741 1.537 391.1 2479.3 +model-00022-of-00037.safetensors 3995200464 3.72 9.919 1.490 384.1 2556.7 +model-00023-of-00037.safetensors 3812769424 3.55 7.871 1.441 462.0 2523.5 +model-00024-of-00037.safetensors 3995183968 3.72 9.773 1.497 389.8 2544.3 +model-00025-of-00037.safetensors 3995183968 3.72 9.940 1.492 383.3 2554.3 +model-00026-of-00037.safetensors 3995200464 3.72 10.196 1.174 373.7 3245.2 +model-00027-of-00037.safetensors 3812769424 3.55 7.749 1.335 469.3 2723.6 +model-00028-of-00037.safetensors 3995183968 3.72 9.940 1.138 383.3 3348.6 +model-00029-of-00037.safetensors 3995183968 3.72 10.419 1.627 365.7 2342.0 +model-00030-of-00037.safetensors 3995200464 3.72 10.123 1.484 376.4 2567.8 +model-00031-of-00037.safetensors 3812769424 3.55 8.347 1.409 435.6 2580.6 +model-00032-of-00037.safetensors 3995183968 3.72 9.513 1.498 400.5 2542.9 +model-00033-of-00037.safetensors 3995183968 3.72 9.012 1.477 422.8 2578.8 +model-00034-of-00037.safetensors 3995200464 3.72 10.911 1.183 349.2 3220.2 +model-00035-of-00037.safetensors 3812769424 3.55 9.072 1.530 400.8 2377.3 +model-00036-of-00037.safetensors 3995183968 3.72 10.212 1.153 373.1 3303.5 +model-00037-of-00037.safetensors 3460317640 3.22 10.150 1.293 325.1 2551.6 +model.safetensors.index.json 79025 0.00 0.082 0.000 0.9 768.9 +tokenizer.json 7031645 0.01 0.104 0.002 64.6 2977.2 +tokenizer_config.json 7305 0.00 0.049 0.000 0.1 202.2 +vocab.json 2776833 0.00 0.077 0.001 34.6 2821.2 +TOTAL 145424101604 
135.44 403.734 54.557 343.5 2542.1 diff --git a/bench/results/netread.tsv b/bench/results/netread.tsv new file mode 100644 index 0000000..50ab8b3 --- /dev/null +++ b/bench/results/netread.tsv @@ -0,0 +1,49 @@ +file bytes gib net_s net_mib_s +.gitattributes 1519 0.00 0.054 0.0 +LICENSE 6962 0.00 0.052 0.1 +README.md 6259 0.00 0.052 0.1 +config.json 663 0.00 0.051 0.0 +generation_config.json 242 0.00 0.054 0.0 +merges.txt 1671839 0.00 0.067 23.8 +model-00001-of-00037.safetensors 3762345336 3.50 9.284 386.5 +model-00002-of-00037.safetensors 3995200440 3.72 8.611 442.5 +model-00003-of-00037.safetensors 3812769392 3.55 9.153 397.3 +model-00004-of-00037.safetensors 3995183944 3.72 9.344 407.7 +model-00005-of-00037.safetensors 3995183944 3.72 9.703 392.7 +model-00006-of-00037.safetensors 3995200456 3.72 8.694 438.2 +model-00007-of-00037.safetensors 3812769424 3.55 8.934 407.0 +model-00008-of-00037.safetensors 3995183968 3.72 9.010 422.9 +model-00009-of-00037.safetensors 3995183968 3.72 9.717 392.1 +model-00010-of-00037.safetensors 3995200464 3.72 8.204 464.4 +model-00011-of-00037.safetensors 3812769424 3.55 8.792 413.6 +model-00012-of-00037.safetensors 3995183968 3.72 9.383 406.1 +model-00013-of-00037.safetensors 3995183968 3.72 9.704 392.6 +model-00014-of-00037.safetensors 3995200464 3.72 9.138 417.0 +model-00015-of-00037.safetensors 3812769424 3.55 8.983 404.8 +model-00016-of-00037.safetensors 3995183968 3.72 37.839 100.7 +model-00017-of-00037.safetensors 3995183968 3.72 9.246 412.1 +model-00018-of-00037.safetensors 3995200464 3.72 11.037 345.2 +model-00019-of-00037.safetensors 3812769424 3.55 9.158 397.1 +model-00020-of-00037.safetensors 3995183968 3.72 64.033 59.5 +model-00021-of-00037.safetensors 3995183968 3.72 9.266 411.2 +model-00022-of-00037.safetensors 3995200464 3.72 9.361 407.0 +model-00023-of-00037.safetensors 3812769424 3.55 8.369 434.5 +model-00024-of-00037.safetensors 3995183968 3.72 9.082 419.5 +model-00025-of-00037.safetensors 3995183968 3.72 8.912 427.5 
+model-00026-of-00037.safetensors 3995200464 3.72 8.468 449.9 +model-00027-of-00037.safetensors 3812769424 3.55 8.361 434.9 +model-00028-of-00037.safetensors 3995183968 3.72 8.180 465.8 +model-00029-of-00037.safetensors 3995183968 3.72 8.403 453.4 +model-00030-of-00037.safetensors 3995200464 3.72 8.496 448.5 +model-00031-of-00037.safetensors 3812769424 3.55 9.085 400.2 +model-00032-of-00037.safetensors 3995183968 3.72 9.307 409.4 +model-00033-of-00037.safetensors 3995183968 3.72 42.116 90.5 +model-00034-of-00037.safetensors 3995200464 3.72 9.877 385.8 +model-00035-of-00037.safetensors 3812769424 3.55 8.772 414.5 +model-00036-of-00037.safetensors 3995183968 3.72 8.523 447.0 +model-00037-of-00037.safetensors 3460317640 3.22 7.673 430.1 +model.safetensors.index.json 79025 0.00 0.050 1.5 +tokenizer.json 7031645 0.01 0.227 29.5 +tokenizer_config.json 7305 0.00 0.048 0.1 +vocab.json 2776833 0.00 0.073 36.2 +TOTAL 145424101604 135.44 450.948 307.5 diff --git a/bench/xet_probe.py b/bench/xet_probe.py new file mode 100644 index 0000000..11f2f4d --- /dev/null +++ b/bench/xet_probe.py @@ -0,0 +1,162 @@ +"""Definitively detect which download path huggingface_hub takes. + +Monkeypatches `xet_get` and `http_get` in file_download.py so we can see +exactly which one is called for a single file, plus snapshots established +TCP connections during the download to fingerprint the endpoint +(Xet native CAS -> cas-server.xethub.hf.co ; bridge/CDN -> *.cloudfront.net). + +Runs twice: Xet enabled, then HF_HUB_DISABLE_XET=1, to see if disabling +changes the path or the peers. 
"""Definitively detect which download path huggingface_hub takes.

Monkeypatches `xet_get` and `http_get` in file_download.py so we can see
exactly which one is called for a single file, plus snapshots established
TCP connections during the download to fingerprint the endpoint
(Xet native CAS -> cas-server.xethub.hf.co ; bridge/CDN -> *.cloudfront.net).

Runs twice: Xet enabled, then HF_HUB_DISABLE_XET=1, to see if disabling
changes the path or the peers.

Configuration (environment): PROBE_REPO, PROBE_FILE, PROBE_REV select the
file to probe; HF_TOKEN, when set, is sent with the tree API request.
"""

import json
import os
import re
import shutil
import socket
import subprocess
import sys
import threading
import time
import urllib.parse
import urllib.request

REPO = os.environ.get("PROBE_REPO", "Qwen/Qwen2.5-0.5B-Instruct")
FILE = os.environ.get("PROBE_FILE", "model.safetensors")
REV = os.environ.get("PROBE_REV", "main")

import huggingface_hub.file_download as fd  # noqa: E402

# xet_get is looked up defensively; when the attribute is absent only
# http_get is wrapped.
_orig_xet = getattr(fd, "xet_get", None)
_orig_http = fd.http_get

# Number of times each download entry point was invoked in the current run.
calls = {"xet_get": 0, "http_get": 0}


def spy_xet(*a, **k):
    """Count and announce a call to xet_get, then delegate to the original."""
    calls["xet_get"] += 1
    print(" >>> xet_get() CALLED (native Xet CAS protocol)", flush=True)
    return _orig_xet(*a, **k)


def spy_http(*a, **k):
    """Count and announce a call to http_get, then delegate to the original."""
    calls["http_get"] += 1
    print(" >>> http_get() CALLED (plain HTTPS / bridge redirect)", flush=True)
    return _orig_http(*a, **k)


if _orig_xet is not None:
    fd.xet_get = spy_xet
fd.http_get = spy_http

from huggingface_hub import hf_hub_download  # noqa: E402
from huggingface_hub.utils._runtime import is_xet_available  # noqa: E402


def api(path):
    """GET a Hub API path and return the decoded JSON body.

    The request has a 30 s timeout, the response is closed after reading,
    and HF_TOKEN (if set) is sent as a bearer token.
    """
    req = urllib.request.Request(f"https://huggingface.co{path}")
    token = os.environ.get("HF_TOKEN")
    if token:
        req.add_header("Authorization", f"Bearer {token}")
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.load(resp)


def xet_hash_present():
    """Look FILE up in the repo tree listing.

    Returns (lfs oid or None, whether the entry has a "xetHash" key, size).
    Returns (None, False, None) when FILE is not in the listing that came
    back. Only the first page of the listing is inspected.
    """
    repo = urllib.parse.quote(REPO, safe="/")
    rev = urllib.parse.quote(REV, safe="")
    tree = api(f"/api/models/{repo}/tree/{rev}?recursive=true")
    for e in tree:
        if e.get("path") == FILE:
            return (e.get("lfs") or {}).get("oid"), "xetHash" in e, e.get("size")
    return None, False, None


def snapshot_peers(pid):
    """Return {peer_ip: reverse_dns_name} for pid's established TCP sockets.

    Shells out to `ss -tnp`. Loopback peers are skipped. An empty dict is
    returned when `ss` fails or is not installed. Only dotted-quad (IPv4)
    addresses are matched by the pattern below.
    """
    try:
        out = subprocess.check_output(
            ["ss", "-tnp"], text=True, stderr=subprocess.DEVNULL
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing `ss` binary (FileNotFoundError).
        return {}
    peers = {}
    for line in out.splitlines():
        if "ESTAB" not in line or f"pid={pid}" not in line:
            continue
        m = re.search(r"([\d.]+):(\d+)\s+([\d.]+):(\d+)", line)
        if not m:
            continue
        peer = m.group(3)
        if peer.startswith("127."):
            continue
        if peer not in peers:
            try:
                peers[peer] = socket.gethostbyaddr(peer)[0]
            except OSError:
                # socket.herror and socket.gaierror are both OSError.
                peers[peer] = "?"
    return peers


def run_once(label, disable_xet):
    """Download FILE once and report which path was taken.

    Prints size, elapsed time, throughput, the xet_get/http_get call counts
    and the peers seen while downloading. Returns (elapsed_seconds, MiB/s).
    """
    outdir = f"/tmp/xetprobe-{label}"
    shutil.rmtree(outdir, ignore_errors=True)
    os.makedirs(outdir)
    cache = f"/tmp/xetprobe-cache-{label}"
    shutil.rmtree(cache, ignore_errors=True)

    os.environ["HF_HUB_DISABLE_XET"] = "1" if disable_xet else ""
    import huggingface_hub.constants as constants

    # The library is already imported, so the flag is flipped on the module
    # directly rather than relying on the environment variable alone.
    constants.HF_HUB_DISABLE_XET = bool(disable_xet)

    calls["xet_get"] = 0
    calls["http_get"] = 0
    print(f"\n=== {label} ===", flush=True)
    print(f" HF_HUB_DISABLE_XET={constants.HF_HUB_DISABLE_XET}", flush=True)

    pid = os.getpid()
    peers = {}
    done = threading.Event()

    def poll():
        while not done.is_set():
            peers.update(snapshot_peers(pid))
            time.sleep(0.02)

    poller = threading.Thread(target=poll, daemon=True)
    poller.start()
    start = time.time()
    try:
        # NOTE(review): cache_dir points the library at the fresh per-run
        # cache directory; previously HF_HOME was only set on a dict copy
        # that was never applied. Confirm against the installed
        # huggingface_hub version that cache_dir is honoured with local_dir.
        path = hf_hub_download(
            REPO, FILE, revision=REV, local_dir=outdir, cache_dir=cache
        )
        elapsed = time.time() - start
    finally:
        # Stop and wait for the poller so `peers` is no longer mutated while
        # it is iterated below, and so a failed download does not leave the
        # thread spinning.
        done.set()
        poller.join(timeout=5)

    size = os.path.getsize(path)
    mibs = size / (1 << 20) / elapsed
    print(f" size={size} time={elapsed:.2f}s {mibs:.0f} MiB/s", flush=True)
    print(
        f" calls: xet_get={calls['xet_get']} http_get={calls['http_get']}", flush=True
    )
    print(f" peers ({len(peers)}):", flush=True)
    for ip, host in sorted(peers.items()):
        tag = ""
        if "xethub" in host and "bridge" not in host:
            tag = " <-- XET NATIVE CAS"
        elif "xethub" in host:
            tag = " <-- XET BRIDGE"
        elif "cloudfront" in host:
            tag = " <-- CDN/CloudFront"
        print(f" {ip} {host}{tag}", flush=True)
    shutil.rmtree(outdir, ignore_errors=True)
    shutil.rmtree(cache, ignore_errors=True)
    return elapsed, mibs


def main():
    """Print the file's metadata, then probe with Xet enabled and disabled."""
    oid, has_xet, size = xet_hash_present()
    print(f"repo={REPO} file={FILE} rev={REV}")
    print(f" lfs.oid={oid}")
    print(f" size={size}")
    print(f" xetHash present: {has_xet}")
    print(f" hf_xet importable: {is_xet_available()}")
    print(f" huggingface_hub: {__import__('huggingface_hub').__version__}")
    run_once("xet-enabled", disable_xet=False)
    run_once("xet-disabled", disable_xet=True)


if __name__ == "__main__":
    main()