diff --git a/lightllm/server/function_call_parser.py b/lightllm/server/function_call_parser.py index f204c154ed..dfcb2f8d9e 100644 --- a/lightllm/server/function_call_parser.py +++ b/lightllm/server/function_call_parser.py @@ -30,7 +30,7 @@ from .api_models import Tool logger = logging.getLogger(__name__) -ENABLE_TOOL_NAME_CHECK = os.getenv("LIGHTLLM_ENABLE_TOOL_NAME_CHECK", "True").upper() in ["ON", "TRUE", "1"] +ENABLE_TOOL_NAME_CHECK = os.getenv("LIGHTLLM_ENABLE_TOOL_NAME_CHECK", "False").upper() in ["ON", "TRUE", "1"] TOOLS_TAG_LIST = [ "<|plugin|>", diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 8fdd277f57..e47692d1b0 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -706,10 +706,10 @@ async def _wait_to_token_package( if self.pd_mode.is_P() and is_first_token: metadata["prompt_ids"] = prompt_ids - prompt_cache_len = metadata.pop("prompt_cache_len", 0) + gpu_prompt_cache_len = metadata.pop("prompt_cache_len", 0) cpu_prompt_cache_len = metadata.pop("cpu_prompt_cache_len", 0) disk_prompt_cache_len = metadata.pop("disk_prompt_cache_len", 0) - metadata["prompt_cache_len"] = prompt_cache_len + cpu_prompt_cache_len + disk_prompt_cache_len + metadata["prompt_cache_len"] = gpu_prompt_cache_len + cpu_prompt_cache_len + disk_prompt_cache_len sub_req_id_to_mtp_accepted_token_num[sub_req_id] = metadata.get("mtp_accepted_token_num", 0) if is_first_token: @@ -733,9 +733,12 @@ async def _wait_to_token_package( self.per_token_costs.add(mean_per_token_cost_time_ms) x_request_id = request.headers.get("X-Request-Id", "") if request is not None else "" x_session_id = request.headers.get("X-Session-Id", "") if request is not None else "" - prompt_cache_ratio = prompt_cache_len / prompt_tokens + gpu_prompt_cache_ratio = gpu_prompt_cache_len / prompt_tokens cpu_prompt_cache_ratio = cpu_prompt_cache_len / prompt_tokens disk_prompt_cache_ratio = disk_prompt_cache_len / prompt_tokens + prompt_cache_len = gpu_prompt_cache_len + cpu_prompt_cache_len + disk_prompt_cache_len + prompt_cache_ratio = prompt_cache_len / prompt_tokens + generation_throughput = out_token_counter / max(total_cost_time_ms / 1000.0, 1e-6) mtp_avg_token_per_step = out_token_counter / max( (out_token_counter - sum(sub_req_id_to_mtp_accepted_token_num.values())), 1 @@ -748,9 +751,9 @@ async def _wait_to_token_package( f"total_cost_time:{total_cost_time_ms}ms,out_token_counter:{out_token_counter} " f"mean_per_token_cost_time: {mean_per_token_cost_time_ms}ms " f"prompt_token_num:{prompt_tokens} " - f"gpu cache hit: {prompt_cache_len > 0} " - f"gpu_prompt_cache_len:{prompt_cache_len} " - f"gpu_prompt_cache_ratio:{prompt_cache_ratio} " + f"gpu cache hit: {gpu_prompt_cache_ratio > 0} " + f"gpu_prompt_cache_len:{gpu_prompt_cache_len} " + f"gpu_prompt_cache_ratio:{gpu_prompt_cache_ratio} " f"cpu cache hit: {cpu_prompt_cache_len > 0} " f"cpu_prompt_cache_len:{cpu_prompt_cache_len} " f"cpu_prompt_cache_ratio:{cpu_prompt_cache_ratio} " @@ -759,8 +762,13 @@ async def _wait_to_token_package( f"disk_prompt_cache_ratio:{disk_prompt_cache_ratio} " f"mtp_avg_token_per_step:{mtp_avg_token_per_step} " ) + self.metric_client.histogram_observe("lightllm_cache_length", prompt_cache_len) self.metric_client.histogram_observe("lightllm_cache_ratio", prompt_cache_ratio) + self.metric_client.counter_inc_by("lightllm_prompt_tokens_total", prompt_tokens) + self.metric_client.counter_inc_by("lightllm_generation_tokens_total", out_token_counter) + self.metric_client.gauge_set("lightllm_cache_hit_rate", prompt_cache_ratio) + self.metric_client.gauge_set("lightllm_gen_throughput", generation_throughput) self.metric_client.histogram_observe( "lightllm_request_inference_duration", total_cost_time_ms / 1000.0 ) diff --git a/lightllm/server/metrics/manager.py b/lightllm/server/metrics/manager.py index f3b1a5275b..a95ddc0236 100644 --- a/lightllm/server/metrics/manager.py +++ b/lightllm/server/metrics/manager.py @@ -48,6 +48,9 @@ def on_disconnect(self, conn): def exposed_counter_inc(self, name: str, label: str = None) -> None: return self.monitor.counter_inc(name, label) + def exposed_counter_inc_by(self, name: str, amount: float) -> None: + return self.monitor.counter_inc_by(name, amount) + def exposed_histogram_observe(self, name: str, value: float, label: str = None) -> None: return self.monitor.histogram_observe(name, value, label) @@ -106,6 +109,13 @@ def inner_func(): self._append_task(inner_func) return + def counter_inc_by(self, *args, **kwargs): + def inner_func(): + return self.conn.root.counter_inc_by(*args, **kwargs) + + self._append_task(inner_func) + return + def histogram_observe(self, *args, **kwargs): def inner_func(): return self.conn.root.histogram_observe(*args, **kwargs) diff --git a/lightllm/server/metrics/metrics.py b/lightllm/server/metrics/metrics.py index 130f32c7a7..0d42462c3f 100644 --- a/lightllm/server/metrics/metrics.py +++ b/lightllm/server/metrics/metrics.py @@ -27,6 +27,11 @@ "lightllm_cache_ratio": "cache length / input_length", "lightllm_batch_current_max_tokens": "dynamic max token used for current batch", "lightllm_request_mtp_avg_token_per_step": "Average number of tokens per step", + "lightllm_prompt_tokens_total": "Total number of prefill tokens processed", + "lightllm_generation_tokens_total": "Total number of generation tokens processed", + "lightllm_cache_hit_rate": "Prefix cache hit rate of latest completed request", + "lightllm_gen_throughput": "Generation throughput of latest completed request (tokens/s)", + "lightllm_num_running_reqs": "Number of running requests", } @@ -60,6 +65,7 @@ def __init__(self, args): self.init_metrics(args) def init_metrics(self, args): + self.model_name = args.model_name self.create_histogram("lightllm_request_duration", self.duration_buckets) self.create_histogram("lightllm_request_validation_duration", self.duration_buckets) @@ -100,40 +106,43 @@ def init_metrics(self, args): mtp_avg_token_per_step_buckets = [1.0, 2.0] self.create_histogram("lightllm_request_mtp_avg_token_per_step", mtp_avg_token_per_step_buckets) + self.create_counter("lightllm_prompt_tokens_total") + self.create_counter("lightllm_generation_tokens_total") + self.create_gauge("lightllm_cache_hit_rate") + self.create_gauge("lightllm_gen_throughput") + self.create_gauge("lightllm_num_running_reqs") + def create_histogram(self, name, buckets, labelnames=None): - if labelnames is None: - histogram = Histogram(name, MONITOR_INFO[name], buckets=buckets, registry=self.registry) - else: - histogram = Histogram( - name, MONITOR_INFO[name], labelnames=labelnames, buckets=buckets, registry=self.registry - ) + all_labels = ["model_name"] + (labelnames or []) + histogram = Histogram(name, MONITOR_INFO[name], labelnames=all_labels, buckets=buckets, registry=self.registry) self.monitor_registry[name] = histogram def create_counter(self, name, labelnames=None): - if labelnames is None: - histogram = Counter(name, MONITOR_INFO[name], registry=self.registry) - else: - histogram = Counter(name, MONITOR_INFO[name], labelnames=labelnames, registry=self.registry) - self.monitor_registry[name] = histogram + all_labels = ["model_name"] + (labelnames or []) + counter = Counter(name, MONITOR_INFO[name], labelnames=all_labels, registry=self.registry) + self.monitor_registry[name] = counter def create_gauge(self, name): - gauge = Gauge(name, MONITOR_INFO[name], registry=self.registry) + gauge = Gauge(name, MONITOR_INFO[name], labelnames=["model_name"], registry=self.registry) self.monitor_registry[name] = gauge def counter_inc(self, name, label=None): if label is None: - self.monitor_registry[name].inc() + self.monitor_registry[name].labels(model_name=self.model_name).inc() else: - self.monitor_registry[name].labels(method=label).inc() + self.monitor_registry[name].labels(model_name=self.model_name, method=label).inc() + + def counter_inc_by(self, name, amount): + self.monitor_registry[name].labels(model_name=self.model_name).inc(amount) def histogram_observe(self, name, value, label=None): if label is None: - self.monitor_registry[name].observe(value) + self.monitor_registry[name].labels(model_name=self.model_name).observe(value) else: - self.monitor_registry[name].labels(method=label).observe(value) + self.monitor_registry[name].labels(model_name=self.model_name, method=label).observe(value) def gauge_set(self, name, value): - self.monitor_registry[name].set(value) + self.monitor_registry[name].labels(model_name=self.model_name).set(value) def push_metrices(self): if self.gateway_url is not None: diff --git a/lightllm/server/router/manager.py b/lightllm/server/router/manager.py index a5419adb9e..1de4238a5c 100644 --- a/lightllm/server/router/manager.py +++ b/lightllm/server/router/manager.py @@ -35,7 +35,6 @@ from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt from .stats import RouterStatics - logger = init_logger(__name__) @@ -241,10 +240,11 @@ async def loop_for_fwd( f"dp_i {d_i} token used ratio: {token_ratio2} contain prompt cache tree unrefed token" ) logger.debug(self.router_statics.log_str()) - self.metric_client.gauge_set("lightllm_batch_pause_size", paused_req_num) + self.metric_client.gauge_set("lightllm_batch_pause_size", self._get_paused_req_num()) # pd decode mode need to update token_load more frequently self.req_queue.update_token_load(self.running_batch, force_update=self.is_pd_decode_mode) self.metric_client.gauge_set("lightllm_batch_current_size", len(self.running_batch.reqs)) + self.metric_client.gauge_set("lightllm_num_running_reqs", len(self.running_batch.reqs)) self.metric_client.gauge_set("lightllm_queue_size", self.req_queue.get_wait_req_num()) self.metric_client.gauge_set( "lightllm_batch_current_max_tokens", @@ -257,6 +257,7 @@ async def loop_for_fwd( self.req_queue.update_token_load(self.running_batch, force_update=True) if counter_count % 300 == 0: self.metric_client.gauge_set("lightllm_batch_current_size", 0.0) + self.metric_client.gauge_set("lightllm_num_running_reqs", 0.0) self.metric_client.gauge_set("lightllm_batch_pause_size", 0.0) self.metric_client.gauge_set("lightllm_queue_size", 0.0) self.metric_client.gauge_set("lightllm_batch_current_max_tokens", 0.0)