ModelTC · shihaobai · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/lightllm/server/function_call_parser.py b/lightllm/server/function_call_parser.py
@@ -30,7 +30,7 @@
 from .api_models import Tool
 
 logger = logging.getLogger(__name__)
-ENABLE_TOOL_NAME_CHECK = os.getenv("LIGHTLLM_ENABLE_TOOL_NAME_CHECK", "True").upper() in ["ON", "TRUE", "1"]
+ENABLE_TOOL_NAME_CHECK = os.getenv("LIGHTLLM_ENABLE_TOOL_NAME_CHECK", "False").upper() in ["ON", "TRUE", "1"]
 
 TOOLS_TAG_LIST = [
     "<|plugin|>",

diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
@@ -706,10 +706,10 @@ async def _wait_to_token_package(
                     if self.pd_mode.is_P() and is_first_token:
                         metadata["prompt_ids"] = prompt_ids
 
-                    prompt_cache_len = metadata.pop("prompt_cache_len", 0)
+                    gpu_prompt_cache_len = metadata.pop("prompt_cache_len", 0)
                     cpu_prompt_cache_len = metadata.pop("cpu_prompt_cache_len", 0)
                     disk_prompt_cache_len = metadata.pop("disk_prompt_cache_len", 0)
-                    metadata["prompt_cache_len"] = prompt_cache_len + cpu_prompt_cache_len + disk_prompt_cache_len
+                    metadata["prompt_cache_len"] = gpu_prompt_cache_len + cpu_prompt_cache_len + disk_prompt_cache_len
                     sub_req_id_to_mtp_accepted_token_num[sub_req_id] = metadata.get("mtp_accepted_token_num", 0)
 
                     if is_first_token:
@@ -733,9 +733,12 @@ async def _wait_to_token_package(
                         self.per_token_costs.add(mean_per_token_cost_time_ms)
                         x_request_id = request.headers.get("X-Request-Id", "") if request is not None else ""
                         x_session_id = request.headers.get("X-Session-Id", "") if request is not None else ""
-                        prompt_cache_ratio = prompt_cache_len / prompt_tokens
+                        gpu_prompt_cache_ratio = gpu_prompt_cache_len / prompt_tokens
                         cpu_prompt_cache_ratio = cpu_prompt_cache_len / prompt_tokens
                         disk_prompt_cache_ratio = disk_prompt_cache_len / prompt_tokens
+                        prompt_cache_len = gpu_prompt_cache_len + cpu_prompt_cache_len + disk_prompt_cache_len
+                        prompt_cache_ratio = prompt_cache_len / prompt_tokens
+                        generation_throughput = out_token_counter / max(total_cost_time_ms / 1000.0, 1e-6)
 
                         mtp_avg_token_per_step = out_token_counter / max(
                             (out_token_counter - sum(sub_req_id_to_mtp_accepted_token_num.values())), 1
@@ -748,9 +751,9 @@ async def _wait_to_token_package(
                             f"total_cost_time:{total_cost_time_ms}ms,out_token_counter:{out_token_counter} "
                             f"mean_per_token_cost_time: {mean_per_token_cost_time_ms}ms "
                             f"prompt_token_num:{prompt_tokens} "
-                            f"gpu cache hit: {prompt_cache_len > 0} "
-                            f"gpu_prompt_cache_len:{prompt_cache_len} "
-                            f"gpu_prompt_cache_ratio:{prompt_cache_ratio} "
+                            f"gpu cache hit: {gpu_prompt_cache_ratio > 0} "
+                            f"gpu_prompt_cache_len:{gpu_prompt_cache_len} "
+                            f"gpu_prompt_cache_ratio:{gpu_prompt_cache_ratio} "
                             f"cpu cache hit: {cpu_prompt_cache_len > 0} "
                             f"cpu_prompt_cache_len:{cpu_prompt_cache_len} "
                             f"cpu_prompt_cache_ratio:{cpu_prompt_cache_ratio} "
@@ -759,8 +762,13 @@ async def _wait_to_token_package(
                             f"disk_prompt_cache_ratio:{disk_prompt_cache_ratio} "
                             f"mtp_avg_token_per_step:{mtp_avg_token_per_step} "
                         )
+
                         self.metric_client.histogram_observe("lightllm_cache_length", prompt_cache_len)
                         self.metric_client.histogram_observe("lightllm_cache_ratio", prompt_cache_ratio)
+                        self.metric_client.counter_inc_by("lightllm_prompt_tokens_total", prompt_tokens)
+                        self.metric_client.counter_inc_by("lightllm_generation_tokens_total", out_token_counter)
+                        self.metric_client.gauge_set("lightllm_cache_hit_rate", prompt_cache_ratio)
+                        self.metric_client.gauge_set("lightllm_gen_throughput", generation_throughput)
                         self.metric_client.histogram_observe(
                             "lightllm_request_inference_duration", total_cost_time_ms / 1000.0
                         )

diff --git a/lightllm/server/metrics/manager.py b/lightllm/server/metrics/manager.py
@@ -48,6 +48,9 @@ def on_disconnect(self, conn):
     def exposed_counter_inc(self, name: str, label: str = None) -> None:
         return self.monitor.counter_inc(name, label)
 
+    def exposed_counter_inc_by(self, name: str, amount: float) -> None:  # thin pass-through to self.monitor
+        return self.monitor.counter_inc_by(name, amount)
+
     def exposed_histogram_observe(self, name: str, value: float, label: str = None) -> None:
         return self.monitor.histogram_observe(name, value, label)
 
@@ -106,6 +109,13 @@ def inner_func():
         self._append_task(inner_func)
         return
 
+    def counter_inc_by(self, *args, **kwargs):
+        def inner_func():  # not called here; handed to _append_task below
+            return self.conn.root.counter_inc_by(*args, **kwargs)
+
+        self._append_task(inner_func)
+        return
+
     def histogram_observe(self, *args, **kwargs):
         def inner_func():
             return self.conn.root.histogram_observe(*args, **kwargs)

diff --git a/lightllm/server/metrics/metrics.py b/lightllm/server/metrics/metrics.py
@@ -27,6 +27,11 @@
     "lightllm_cache_ratio": "cache length / input_length",
     "lightllm_batch_current_max_tokens": "dynamic max token used for current batch",
     "lightllm_request_mtp_avg_token_per_step": "Average number of tokens per step",
+    "lightllm_prompt_tokens_total": "Total number of prefill tokens processed",
+    "lightllm_generation_tokens_total": "Total number of generation tokens processed",
+    "lightllm_cache_hit_rate": "Prefix cache hit rate of latest completed request",
+    "lightllm_gen_throughput": "Generation throughput of latest completed request (tokens/s)",
+    "lightllm_num_running_reqs": "Number of running requests",
 }
 
 
@@ -60,6 +65,7 @@ def __init__(self, args):
         self.init_metrics(args)
 
     def init_metrics(self, args):
+        self.model_name = getattr(args, "model_name", None) or "unknown"
 
         self.create_histogram("lightllm_request_duration", self.duration_buckets)
         self.create_histogram("lightllm_request_validation_duration", self.duration_buckets)
@@ -100,40 +106,43 @@ def init_metrics(self, args):
             mtp_avg_token_per_step_buckets = [1.0, 2.0]
         self.create_histogram("lightllm_request_mtp_avg_token_per_step", mtp_avg_token_per_step_buckets)
 
+        self.create_counter("lightllm_prompt_tokens_total")
+        self.create_counter("lightllm_generation_tokens_total")
+        self.create_gauge("lightllm_cache_hit_rate")
+        self.create_gauge("lightllm_gen_throughput")
+        self.create_gauge("lightllm_num_running_reqs")
+
     def create_histogram(self, name, buckets, labelnames=None):
-        if labelnames is None:
-            histogram = Histogram(name, MONITOR_INFO[name], buckets=buckets, registry=self.registry)
-        else:
-            histogram = Histogram(
-                name, MONITOR_INFO[name], labelnames=labelnames, buckets=buckets, registry=self.registry
-            )
+        all_labels = ["model_name", *(labelnames or ())]  # unpacking accepts list or tuple labelnames
+        histogram = Histogram(name, MONITOR_INFO[name], labelnames=all_labels, buckets=buckets, registry=self.registry)
         self.monitor_registry[name] = histogram
 
     def create_counter(self, name, labelnames=None):
-        if labelnames is None:
-            histogram = Counter(name, MONITOR_INFO[name], registry=self.registry)
-        else:
-            histogram = Counter(name, MONITOR_INFO[name], labelnames=labelnames, registry=self.registry)
-        self.monitor_registry[name] = histogram
+        all_labels = ["model_name", *(labelnames or ())]  # unpacking accepts list or tuple labelnames
+        counter = Counter(name, MONITOR_INFO[name], labelnames=all_labels, registry=self.registry)
+        self.monitor_registry[name] = counter
 
     def create_gauge(self, name):
-        gauge = Gauge(name, MONITOR_INFO[name], registry=self.registry)
+        gauge = Gauge(name, MONITOR_INFO[name], labelnames=["model_name"], registry=self.registry)
         self.monitor_registry[name] = gauge
 
     def counter_inc(self, name, label=None):
         if label is None:
-            self.monitor_registry[name].inc()
+            self.monitor_registry[name].labels(model_name=self.model_name).inc()
         else:
-            self.monitor_registry[name].labels(method=label).inc()
+            self.monitor_registry[name].labels(model_name=self.model_name, method=label).inc()
+
+    def counter_inc_by(self, name, amount):  # NOTE(review): binds only model_name; assumes no extra labelnames
+        self.monitor_registry[name].labels(model_name=self.model_name).inc(amount)
 
     def histogram_observe(self, name, value, label=None):
         if label is None:
-            self.monitor_registry[name].observe(value)
+            self.monitor_registry[name].labels(model_name=self.model_name).observe(value)
         else:
-            self.monitor_registry[name].labels(method=label).observe(value)
+            self.monitor_registry[name].labels(model_name=self.model_name, method=label).observe(value)
 
     def gauge_set(self, name, value):
-        self.monitor_registry[name].set(value)
+        self.monitor_registry[name].labels(model_name=self.model_name).set(value)
 
     def push_metrices(self):
         if self.gateway_url is not None:

diff --git a/lightllm/server/router/manager.py b/lightllm/server/router/manager.py
@@ -35,7 +35,6 @@
 from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt
 from .stats import RouterStatics
 
-
 logger = init_logger(__name__)
 
 
@@ -241,10 +240,11 @@ async def loop_for_fwd(
                             f"dp_i {d_i} token used ratio: {token_ratio2} contain prompt cache tree unrefed token"
                         )
                         logger.debug(self.router_statics.log_str())
-                        self.metric_client.gauge_set("lightllm_batch_pause_size", paused_req_num)
+                    self.metric_client.gauge_set("lightllm_batch_pause_size", self._get_paused_req_num())
                 # pd decode mode need to update token_load more frequently
                 self.req_queue.update_token_load(self.running_batch, force_update=self.is_pd_decode_mode)
                 self.metric_client.gauge_set("lightllm_batch_current_size", len(self.running_batch.reqs))
+                self.metric_client.gauge_set("lightllm_num_running_reqs", len(self.running_batch.reqs))
                 self.metric_client.gauge_set("lightllm_queue_size", self.req_queue.get_wait_req_num())
                 self.metric_client.gauge_set(
                     "lightllm_batch_current_max_tokens",
@@ -257,6 +257,7 @@ async def loop_for_fwd(
                 self.req_queue.update_token_load(self.running_batch, force_update=True)
                 if counter_count % 300 == 0:
                     self.metric_client.gauge_set("lightllm_batch_current_size", 0.0)
+                    self.metric_client.gauge_set("lightllm_num_running_reqs", 0.0)
                     self.metric_client.gauge_set("lightllm_batch_pause_size", 0.0)
                     self.metric_client.gauge_set("lightllm_queue_size", 0.0)
                     self.metric_client.gauge_set("lightllm_batch_current_max_tokens", 0.0)