Skip to content

Commit 19f3e95

Browse files
committed
feat(observability): integrate OpenTelemetry and Prometheus for enhanced monitoring
- Added observability configuration to `main.py` to initialize metrics and tracing. - Introduced new settings in `config.py` to enable/disable observability features and configure OpenTelemetry parameters. - Created `observability.py` utility for setting up Prometheus metrics and OpenTelemetry tracing, including support for Loki logging. - Updated `pyproject.toml` to include necessary dependencies for observability tools. - Enhanced application logging and monitoring capabilities, allowing for better performance insights and error tracking.
1 parent a095009 commit 19f3e95

4 files changed

Lines changed: 272 additions & 0 deletions

File tree

app/core/config.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,10 @@ def _sanitize_bool_tokens(cls, data): # type: ignore[return-value]
136136
"INLINE_METADATA_DISABLED",
137137
"ENABLE_BACKGROUND_PROCESSING",
138138
"VECTOR_SEARCH_ENABLED",
139+
"OBSERVABILITY_ENABLED",
140+
"OTEL_TRACES_ENABLED",
141+
"OTEL_METRICS_ENABLED",
142+
"LOKI_ENABLED",
139143
):
140144
if key in data and isinstance(data[key], str):
141145
raw = data[key]
@@ -144,5 +148,50 @@ def _sanitize_bool_tokens(cls, data): # type: ignore[return-value]
144148
data[key] = token
145149
return data
146150

151+
# ------------------------------------------------------------------
152+
# Observability / Telemetry
153+
# ------------------------------------------------------------------
154+
155+
# Master on/off switch – when false no observability instrumentation is
156+
# initialised. This is useful for local development or extremely
157+
# resource-constrained environments where you don't want the overhead of
158+
# telemetry.
159+
OBSERVABILITY_ENABLED: bool = Field(default=True, description="Globally enable/disable all extra observability (metrics/traces/log shipping).")
160+
161+
# --- OpenTelemetry ---------------------------------------------------
162+
163+
# OTLP endpoint that traces / metrics will be sent to. For the Docker
164+
# Compose stack provided by the `mcp-observability` project this will be
165+
# the OpenTelemetry Collector service – typically http://otelcol:4317.
166+
OTEL_EXPORTER_OTLP_ENDPOINT: str | None = Field(
167+
default=None,
168+
description="Base OTLP gRPC endpoint, e.g. http://otelcol:4317. If unset OTLP export is disabled.",
169+
)
170+
171+
# Toggle exporting OpenTelemetry traces. Requires OTEL_EXPORTER_OTLP_ENDPOINT.
172+
OTEL_TRACES_ENABLED: bool = True
173+
174+
# Toggle exporting OpenTelemetry metrics via OTLP. The Prometheus scrape
175+
# endpoint (provided by prometheus-fastapi-instrumentator) remains active
176+
# regardless of this flag so that a local Prometheus instance can still
177+
# pull metrics if desired.
178+
OTEL_METRICS_ENABLED: bool = False
179+
180+
# Sample ratio (0.0-1.0) for traces – 1.0 = always.
181+
OTEL_TRACES_SAMPLER_RATIO: float = Field(default=1.0, ge=0.0, le=1.0)
182+
183+
# --- Centralised logging (Loki) --------------------------------------
184+
185+
LOKI_ENABLED: bool = Field(default=False, description="Enable structured log shipping to Loki.")
186+
LOKI_ENDPOINT: str | None = Field(
187+
default=None,
188+
description="Loki push API endpoint, e.g. http://loki:3100/loki/api/v1/push.",
189+
)
190+
191+
# Extra labels to attach to Loki log streams – provided as a comma-separated
192+
# list of key=value pairs so they can be conveniently set via environment
193+
# variables.
194+
LOKI_EXTRA_LABELS: str | None = Field(default=None)
195+
147196

148197
settings = Settings()

app/main.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,13 @@
3030
moderation,
3131
)
3232

33+
# --------------------------------------------------------------
34+
# Observability must be configured as early as possible so that
35+
# instrumentation picks up all subsequent application events.
36+
# --------------------------------------------------------------
37+
38+
from app.utils.observability import configure_observability # noqa: E402
39+
3340
logger = logging.getLogger(__name__)
3441

3542
app = FastAPI(title="KillrVideo v2 - Monolith Backend", version=settings.APP_VERSION)
@@ -63,6 +70,14 @@
6370

6471
app.include_router(api_router_v1)
6572

73+
# ---------------------------------------------------------------------------
74+
# Initialise observability (metrics / tracing / log shipping) if enabled.
75+
# This must run *after* the FastAPI instance is created so that instrumentors
76+
# can attach middleware and routes.
77+
# ---------------------------------------------------------------------------
78+
79+
configure_observability(app)
80+
6681
# Attempt to import httpx & httpcore connection error classes for fine-grained
6782
# exception handling. They may not be present in some lightweight test
6883
# environments – fall back gracefully.

app/utils/observability.py

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
# Observability utilities – OpenTelemetry instrumentation, Prometheus metrics
2+
# exposition and structured log shipping.
3+
#
4+
# This module is imported by main.py during application start-up. All
5+
# instrumentation is performed in a best-effort fashion: if a particular
6+
# dependency is missing or an endpoint is unreachable we log a warning but
7+
# allow the application to continue running.
8+
9+
from __future__ import annotations
10+
11+
import logging
12+
import os
13+
import time
14+
from typing import Any
15+
16+
from fastapi import FastAPI
17+
18+
from app.core.config import settings
19+
20+
_logger = logging.getLogger(__name__)
21+
22+
23+
# ---------------------------------------------------------------------------
24+
# Prometheus – latency histogram per route via prometheus-fastapi-instrumentator
25+
# ---------------------------------------------------------------------------
26+
27+
_PROMETHEUS_READY = False
28+
29+
try:
30+
from prometheus_fastapi_instrumentator import Instrumentator # type: ignore
31+
except ModuleNotFoundError: # pragma: no cover
32+
Instrumentator = None # type: ignore
33+
_logger.debug("prometheus_fastapi_instrumentator not available – metrics disabled")
34+
35+
36+
# ---------------------------------------------------------------------------
37+
# OpenTelemetry – traces + optional OTLP metric export
38+
# ---------------------------------------------------------------------------
39+
40+
_OTEL_READY = False
41+
try:
42+
from opentelemetry import trace
43+
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor # type: ignore
44+
from opentelemetry.instrumentation.logging import LoggingInstrumentor # type: ignore
45+
from opentelemetry.sdk.resources import Resource
46+
from opentelemetry.sdk.trace import TracerProvider
47+
from opentelemetry.sdk.trace.export import BatchSpanProcessor
48+
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter # type: ignore
49+
from opentelemetry.sdk.trace.sampling import TraceIdRatioBased
50+
51+
# Metrics are optional – only import if feature flag enabled
52+
if settings.OTEL_METRICS_ENABLED:
53+
from opentelemetry.sdk.metrics import MeterProvider # type: ignore
54+
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import ( # type: ignore
55+
OTLPMetricExporter,
56+
)
57+
from opentelemetry.sdk.metrics.export import (
58+
PeriodicExportingMetricReader,
59+
)
60+
61+
_OTEL_READY = True
62+
except ModuleNotFoundError as exc: # pragma: no cover
63+
_logger.debug("OpenTelemetry libraries missing – traces disabled (%s)", exc)
64+
65+
66+
# ---------------------------------------------------------------------------
67+
# Loki logging handler
68+
# ---------------------------------------------------------------------------
69+
70+
_LOKI_READY = False
71+
if settings.LOKI_ENABLED and settings.LOKI_ENDPOINT:
72+
try:
73+
import logging_loki # type: ignore
74+
75+
_LOKI_READY = True
76+
except ModuleNotFoundError: # pragma: no cover
77+
_logger.warning("LOKI_ENABLED but logging_loki dependency missing; skipping Loki handler")
78+
79+
80+
# ---------------------------------------------------------------------------
81+
# Public API
82+
# ---------------------------------------------------------------------------
83+
84+
def configure_observability(app: FastAPI) -> None: # noqa: D401
85+
"""Initialise optional observability integrations.
86+
87+
The function is safe to call multiple times – it keeps track of internal
88+
state to ensure instrumentation happens only once.
89+
"""
90+
91+
if not settings.OBSERVABILITY_ENABLED:
92+
_logger.info("Observability explicitly disabled via settings")
93+
return
94+
95+
_setup_prometheus(app)
96+
_setup_opentelemetry(app)
97+
_setup_loki_logging()
98+
99+
100+
# ---------------------------------------------------------------------------
101+
# Internal helpers
102+
# ---------------------------------------------------------------------------
103+
104+
105+
_prometheus_instrumented = False
106+
107+
def _setup_prometheus(app: FastAPI) -> None:
108+
global _prometheus_instrumented
109+
if _prometheus_instrumented or Instrumentator is None:
110+
return
111+
112+
try:
113+
start_time = time.perf_counter()
114+
Instrumentator().instrument(app).expose(app, include_in_schema=False, should_gzip=True)
115+
_prometheus_instrumented = True
116+
_logger.info("Prometheus instrumentation initialised in %.2f ms", (time.perf_counter() - start_time) * 1000)
117+
except Exception as exc: # pragma: no cover
118+
_logger.warning("Failed to initialise Prometheus instrumentation: %s", exc)
119+
120+
121+
_otel_instrumented = False
122+
123+
def _setup_opentelemetry(app: FastAPI) -> None:
124+
global _otel_instrumented
125+
if _otel_instrumented or not _OTEL_READY or not settings.OTEL_TRACES_ENABLED:
126+
return
127+
128+
if not settings.OTEL_EXPORTER_OTLP_ENDPOINT:
129+
_logger.info("OTEL_TRACES_ENABLED but no OTEL_EXPORTER_OTLP_ENDPOINT set – skipping")
130+
return
131+
132+
try:
133+
start = time.perf_counter()
134+
135+
resource_attrs: dict[str, Any] = {
136+
"service.name": settings.PROJECT_NAME,
137+
"service.version": settings.APP_VERSION,
138+
"deployment.environment": settings.ENVIRONMENT,
139+
}
140+
resource = Resource.create(resource_attrs)
141+
142+
sampler = TraceIdRatioBased(settings.OTEL_TRACES_SAMPLER_RATIO)
143+
provider = TracerProvider(resource=resource, sampler=sampler)
144+
trace.set_tracer_provider(provider)
145+
146+
span_exporter = OTLPSpanExporter(endpoint=settings.OTEL_EXPORTER_OTLP_ENDPOINT, insecure=True)
147+
span_processor = BatchSpanProcessor(span_exporter)
148+
provider.add_span_processor(span_processor)
149+
150+
# Instrument FastAPI request handling
151+
FastAPIInstrumentor().instrument_app(app, tracer_provider=provider)
152+
153+
# Correlate application logs with active spans
154+
LoggingInstrumentor().instrument(set_logging_format=True)
155+
156+
# Optional metrics export via OTLP
157+
if settings.OTEL_METRICS_ENABLED:
158+
metric_exporter = OTLPMetricExporter(endpoint=settings.OTEL_EXPORTER_OTLP_ENDPOINT, insecure=True)
159+
reader = PeriodicExportingMetricReader(metric_exporter)
160+
meter_provider = MeterProvider(resource=resource, metric_readers=[reader])
161+
from opentelemetry import metrics # local import to avoid missing module issue
162+
163+
metrics.set_meter_provider(meter_provider)
164+
165+
_otel_instrumented = True
166+
_logger.info("OpenTelemetry tracing initialised (%.2f ms)", (time.perf_counter() - start) * 1000)
167+
except Exception as exc: # pragma: no cover
168+
_logger.warning("Failed to initialise OpenTelemetry tracing: %s", exc)
169+
170+
171+
_loki_handler_added = False
172+
173+
def _setup_loki_logging() -> None:
174+
global _loki_handler_added
175+
if _loki_handler_added or not _LOKI_READY:
176+
return
177+
178+
try:
179+
import logging_loki # type: ignore
180+
181+
tags = {
182+
"service": settings.PROJECT_NAME,
183+
"environment": settings.ENVIRONMENT,
184+
}
185+
if settings.LOKI_EXTRA_LABELS:
186+
for pair in settings.LOKI_EXTRA_LABELS.split(","):
187+
if "=" in pair:
188+
k, v = pair.split("=", 1)
189+
tags[k.strip()] = v.strip()
190+
191+
handler = logging_loki.LokiHandler(
192+
url=settings.LOKI_ENDPOINT, # type: ignore[arg-type]
193+
tags=tags,
194+
version="1",
195+
)
196+
logging.getLogger().addHandler(handler)
197+
_loki_handler_added = True
198+
_logger.info("Loki logging handler attached (endpoint=%s)", settings.LOKI_ENDPOINT)
199+
except Exception as exc: # pragma: no cover
200+
_logger.warning("Failed to attach Loki logging handler: %s", exc)

pyproject.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,14 @@ astrapy = "^2.0.1"
1717
cassandra-driver = "^3.29.2"
1818
requests = "^2.31.0"
1919

20+
# Observability / telemetry
21+
prometheus-fastapi-instrumentator = "^6.2.1"
22+
opentelemetry-api = "^1.25.0"
23+
opentelemetry-sdk = "^1.25.0"
24+
opentelemetry-exporter-otlp = "^1.25.0"
25+
opentelemetry-instrumentation-fastapi = "^0.44b0"
26+
opentelemetry-instrumentation-logging = "^0.44b0"
27+
logging-loki = "^0.3.1"
2028

2129
[tool.poetry.group.dev.dependencies]
2230
pytest = "^8.3.5"

0 commit comments

Comments
 (0)