-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path.env
More file actions
90 lines (76 loc) · 3.19 KB
/
.env
File metadata and controls
90 lines (76 loc) · 3.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# =============================================================================
# vLLM CPU Deployment Configuration
# =============================================================================
# Optimized for macOS CPU-only inference with small footprint
# -----------------------------------------------------------------------------
# Model Configuration
# -----------------------------------------------------------------------------
# SmolLM2 model options (sizes are approximate):
# - HuggingFaceTB/SmolLM2-135M-Instruct (smallest, ~500MB)
# - HuggingFaceTB/SmolLM2-360M-Instruct (balanced, ~1.3GB)
# - HuggingFaceTB/SmolLM2-1.7B-Instruct (larger, ~6.5GB)
MODEL_NAME=HuggingFaceTB/SmolLM2-360M-Instruct
# Data type for inference (auto, float32, float16, bfloat16)
# Note: float16/bfloat16 support varies by CPU; float32 is the safe choice for CPU-only inference
DTYPE=float32
# Maximum sequence length (lower = less memory)
MAX_MODEL_LEN=2048
# Maximum number of concurrent sequences (lower = less memory, fewer concurrent requests)
MAX_NUM_SEQS=8
# -----------------------------------------------------------------------------
# Server Configuration
# -----------------------------------------------------------------------------
# Port to expose vLLM API
VLLM_PORT=8009
# -----------------------------------------------------------------------------
# Performance Tuning
# -----------------------------------------------------------------------------
# KV Cache space in GB (lower = less memory usage, fewer cached tokens)
KVCACHE_SPACE=1
# OpenMP threads (2-4 recommended for macOS, adjust based on your CPU)
OMP_THREADS=2
# -----------------------------------------------------------------------------
# Resource Limits (Docker Desktop for Mac)
# -----------------------------------------------------------------------------
# CPU limits (adjust based on your system)
# For M1/M2 Macs with 8 cores: 4.0-6.0
# For Intel Macs: 2.0-4.0
CPU_LIMIT=4.0
CPU_RESERVATION=2.0
# Memory limits (adjust based on available RAM)
# Minimum recommended: 4G for SmolLM2-360M
# For SmolLM2-1.7B: 8G-12G
MEMORY_LIMIT=8G
MEMORY_RESERVATION=4G
# -----------------------------------------------------------------------------
# Optional: HuggingFace Configuration
# -----------------------------------------------------------------------------
# Uncomment and set if you need to access gated models
# HF_TOKEN=your_huggingface_token_here
# -----------------------------------------------------------------------------
# Optimization Presets (uncomment exactly ONE preset block below)
# NOTE: in most .env parsers (including Docker Compose) the last assignment
# wins, so an uncommented preset overrides the defaults above — verify this
# behavior with your specific parser if unsure.
# -----------------------------------------------------------------------------
# PRESET: Minimal Footprint (SmolLM2-135M, ~2GB typical memory use)
# MODEL_NAME=HuggingFaceTB/SmolLM2-135M-Instruct
# MAX_MODEL_LEN=1024
# MAX_NUM_SEQS=4
# NOTE(review): some vLLM versions parse the KV cache space as an integer
# number of GiB — if 0.5 is rejected, use 1 instead.
# KVCACHE_SPACE=0.5
# OMP_THREADS=2
# CPU_LIMIT=2.0
# MEMORY_LIMIT=4G
# PRESET: Balanced (SmolLM2-360M, ~4GB typical memory use) - DEFAULT
# MODEL_NAME=HuggingFaceTB/SmolLM2-360M-Instruct
# MAX_MODEL_LEN=2048
# MAX_NUM_SEQS=8
# KVCACHE_SPACE=1
# OMP_THREADS=2
# CPU_LIMIT=4.0
# MEMORY_LIMIT=8G
# PRESET: Maximum Quality (SmolLM2-1.7B, ~10GB typical memory use)
# MODEL_NAME=HuggingFaceTB/SmolLM2-1.7B-Instruct
# MAX_MODEL_LEN=4096
# MAX_NUM_SEQS=16
# KVCACHE_SPACE=2
# OMP_THREADS=4
# CPU_LIMIT=6.0
# MEMORY_LIMIT=12G