Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lightllm/server/function_call_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from .api_models import Tool

logger = logging.getLogger(__name__)
ENABLE_TOOL_NAME_CHECK = os.getenv("LIGHTLLM_ENABLE_TOOL_NAME_CHECK", "True").upper() in ["ON", "TRUE", "1"]
ENABLE_TOOL_NAME_CHECK = os.getenv("LIGHTLLM_ENABLE_TOOL_NAME_CHECK", "False").upper() in ["ON", "TRUE", "1"]

TOOLS_TAG_LIST = [
"<|plugin|>",
Expand Down
20 changes: 14 additions & 6 deletions lightllm/server/httpserver/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -706,10 +706,10 @@ async def _wait_to_token_package(
if self.pd_mode.is_P() and is_first_token:
metadata["prompt_ids"] = prompt_ids

prompt_cache_len = metadata.pop("prompt_cache_len", 0)
gpu_prompt_cache_len = metadata.pop("prompt_cache_len", 0)
cpu_prompt_cache_len = metadata.pop("cpu_prompt_cache_len", 0)
disk_prompt_cache_len = metadata.pop("disk_prompt_cache_len", 0)
metadata["prompt_cache_len"] = prompt_cache_len + cpu_prompt_cache_len + disk_prompt_cache_len
metadata["prompt_cache_len"] = gpu_prompt_cache_len + cpu_prompt_cache_len + disk_prompt_cache_len
sub_req_id_to_mtp_accepted_token_num[sub_req_id] = metadata.get("mtp_accepted_token_num", 0)

if is_first_token:
Expand All @@ -733,9 +733,12 @@ async def _wait_to_token_package(
self.per_token_costs.add(mean_per_token_cost_time_ms)
x_request_id = request.headers.get("X-Request-Id", "") if request is not None else ""
x_session_id = request.headers.get("X-Session-Id", "") if request is not None else ""
prompt_cache_ratio = prompt_cache_len / prompt_tokens
gpu_prompt_cache_ratio = gpu_prompt_cache_len / prompt_tokens
cpu_prompt_cache_ratio = cpu_prompt_cache_len / prompt_tokens
disk_prompt_cache_ratio = disk_prompt_cache_len / prompt_tokens
prompt_cache_len = gpu_prompt_cache_len + cpu_prompt_cache_len + disk_prompt_cache_len
prompt_cache_ratio = prompt_cache_len / prompt_tokens
generation_throughput = out_token_counter / max(total_cost_time_ms / 1000.0, 1e-6)

mtp_avg_token_per_step = out_token_counter / max(
(out_token_counter - sum(sub_req_id_to_mtp_accepted_token_num.values())), 1
Expand All @@ -748,9 +751,9 @@ async def _wait_to_token_package(
f"total_cost_time:{total_cost_time_ms}ms,out_token_counter:{out_token_counter} "
f"mean_per_token_cost_time: {mean_per_token_cost_time_ms}ms "
f"prompt_token_num:{prompt_tokens} "
f"gpu cache hit: {prompt_cache_len > 0} "
f"gpu_prompt_cache_len:{prompt_cache_len} "
f"gpu_prompt_cache_ratio:{prompt_cache_ratio} "
f"gpu cache hit: {gpu_prompt_cache_ratio > 0} "
f"gpu_prompt_cache_len:{gpu_prompt_cache_len} "
f"gpu_prompt_cache_ratio:{gpu_prompt_cache_ratio} "
f"cpu cache hit: {cpu_prompt_cache_len > 0} "
f"cpu_prompt_cache_len:{cpu_prompt_cache_len} "
f"cpu_prompt_cache_ratio:{cpu_prompt_cache_ratio} "
Expand All @@ -759,8 +762,13 @@ async def _wait_to_token_package(
f"disk_prompt_cache_ratio:{disk_prompt_cache_ratio} "
f"mtp_avg_token_per_step:{mtp_avg_token_per_step} "
)

self.metric_client.histogram_observe("lightllm_cache_length", prompt_cache_len)
self.metric_client.histogram_observe("lightllm_cache_ratio", prompt_cache_ratio)
self.metric_client.counter_inc_by("lightllm_prompt_tokens_total", prompt_tokens)
self.metric_client.counter_inc_by("lightllm_generation_tokens_total", out_token_counter)
self.metric_client.gauge_set("lightllm_cache_hit_rate", prompt_cache_ratio)
self.metric_client.gauge_set("lightllm_gen_throughput", generation_throughput)
self.metric_client.histogram_observe(
"lightllm_request_inference_duration", total_cost_time_ms / 1000.0
)
Expand Down
10 changes: 10 additions & 0 deletions lightllm/server/metrics/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ def on_disconnect(self, conn):
def exposed_counter_inc(self, name: str, label: str = None) -> None:
return self.monitor.counter_inc(name, label)

def exposed_counter_inc_by(self, name: str, amount: float) -> None:
return self.monitor.counter_inc_by(name, amount)

def exposed_histogram_observe(self, name: str, value: float, label: str = None) -> None:
return self.monitor.histogram_observe(name, value, label)

Expand Down Expand Up @@ -106,6 +109,13 @@ def inner_func():
self._append_task(inner_func)
return

def counter_inc_by(self, *args, **kwargs):
def inner_func():
return self.conn.root.counter_inc_by(*args, **kwargs)

self._append_task(inner_func)
return

def histogram_observe(self, *args, **kwargs):
def inner_func():
return self.conn.root.histogram_observe(*args, **kwargs)
Expand Down
43 changes: 26 additions & 17 deletions lightllm/server/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@
"lightllm_cache_ratio": "cache length / input_length",
"lightllm_batch_current_max_tokens": "dynamic max token used for current batch",
"lightllm_request_mtp_avg_token_per_step": "Average number of tokens per step",
"lightllm_prompt_tokens_total": "Total number of prefill tokens processed",
"lightllm_generation_tokens_total": "Total number of generation tokens processed",
"lightllm_cache_hit_rate": "Prefix cache hit rate of latest completed request",
"lightllm_gen_throughput": "Generation throughput of latest completed request (tokens/s)",
"lightllm_num_running_reqs": "Number of running requests",
}


Expand Down Expand Up @@ -60,6 +65,7 @@ def __init__(self, args):
self.init_metrics(args)

def init_metrics(self, args):
self.model_name = args.model_name

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

If args.model_name is None (e.g., because the --model_name argument was not provided at startup), self.model_name will be None; if the attribute does not exist on args at all, the assignment itself will raise an AttributeError. In prometheus_client, passing None as a label value (e.g., model_name=None) will raise a ValueError: Invalid label value: None at runtime when any metric is updated, crashing the metric server or the background metric thread. To prevent both failures, read the attribute with getattr and fall back to a safe string such as 'unknown'.

Suggested change
self.model_name = args.model_name
self.model_name = getattr(args, "model_name", None) or "unknown"


self.create_histogram("lightllm_request_duration", self.duration_buckets)
self.create_histogram("lightllm_request_validation_duration", self.duration_buckets)
Expand Down Expand Up @@ -100,40 +106,43 @@ def init_metrics(self, args):
mtp_avg_token_per_step_buckets = [1.0, 2.0]
self.create_histogram("lightllm_request_mtp_avg_token_per_step", mtp_avg_token_per_step_buckets)

self.create_counter("lightllm_prompt_tokens_total")
self.create_counter("lightllm_generation_tokens_total")
self.create_gauge("lightllm_cache_hit_rate")
self.create_gauge("lightllm_gen_throughput")
self.create_gauge("lightllm_num_running_reqs")

def create_histogram(self, name, buckets, labelnames=None):
if labelnames is None:
histogram = Histogram(name, MONITOR_INFO[name], buckets=buckets, registry=self.registry)
else:
histogram = Histogram(
name, MONITOR_INFO[name], labelnames=labelnames, buckets=buckets, registry=self.registry
)
all_labels = ["model_name"] + (labelnames or [])
histogram = Histogram(name, MONITOR_INFO[name], labelnames=all_labels, buckets=buckets, registry=self.registry)
self.monitor_registry[name] = histogram

def create_counter(self, name, labelnames=None):
if labelnames is None:
histogram = Counter(name, MONITOR_INFO[name], registry=self.registry)
else:
histogram = Counter(name, MONITOR_INFO[name], labelnames=labelnames, registry=self.registry)
self.monitor_registry[name] = histogram
all_labels = ["model_name"] + (labelnames or [])
counter = Counter(name, MONITOR_INFO[name], labelnames=all_labels, registry=self.registry)
self.monitor_registry[name] = counter

def create_gauge(self, name):
gauge = Gauge(name, MONITOR_INFO[name], registry=self.registry)
gauge = Gauge(name, MONITOR_INFO[name], labelnames=["model_name"], registry=self.registry)
self.monitor_registry[name] = gauge

def counter_inc(self, name, label=None):
if label is None:
self.monitor_registry[name].inc()
self.monitor_registry[name].labels(model_name=self.model_name).inc()
else:
self.monitor_registry[name].labels(method=label).inc()
self.monitor_registry[name].labels(model_name=self.model_name, method=label).inc()

def counter_inc_by(self, name, amount):
self.monitor_registry[name].labels(model_name=self.model_name).inc(amount)

def histogram_observe(self, name, value, label=None):
if label is None:
self.monitor_registry[name].observe(value)
self.monitor_registry[name].labels(model_name=self.model_name).observe(value)
else:
self.monitor_registry[name].labels(method=label).observe(value)
self.monitor_registry[name].labels(model_name=self.model_name, method=label).observe(value)

def gauge_set(self, name, value):
self.monitor_registry[name].set(value)
self.monitor_registry[name].labels(model_name=self.model_name).set(value)

def push_metrices(self):
if self.gateway_url is not None:
Expand Down
5 changes: 3 additions & 2 deletions lightllm/server/router/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt
from .stats import RouterStatics


logger = init_logger(__name__)


Expand Down Expand Up @@ -241,10 +240,11 @@ async def loop_for_fwd(
f"dp_i {d_i} token used ratio: {token_ratio2} contain prompt cache tree unrefed token"
)
logger.debug(self.router_statics.log_str())
self.metric_client.gauge_set("lightllm_batch_pause_size", paused_req_num)
self.metric_client.gauge_set("lightllm_batch_pause_size", self._get_paused_req_num())
# pd decode mode need to update token_load more frequently
self.req_queue.update_token_load(self.running_batch, force_update=self.is_pd_decode_mode)
self.metric_client.gauge_set("lightllm_batch_current_size", len(self.running_batch.reqs))
self.metric_client.gauge_set("lightllm_num_running_reqs", len(self.running_batch.reqs))
self.metric_client.gauge_set("lightllm_queue_size", self.req_queue.get_wait_req_num())
self.metric_client.gauge_set(
"lightllm_batch_current_max_tokens",
Expand All @@ -257,6 +257,7 @@ async def loop_for_fwd(
self.req_queue.update_token_load(self.running_batch, force_update=True)
if counter_count % 300 == 0:
self.metric_client.gauge_set("lightllm_batch_current_size", 0.0)
self.metric_client.gauge_set("lightllm_num_running_reqs", 0.0)
self.metric_client.gauge_set("lightllm_batch_pause_size", 0.0)
self.metric_client.gauge_set("lightllm_queue_size", 0.0)
self.metric_client.gauge_set("lightllm_batch_current_max_tokens", 0.0)
Expand Down
Loading