diff --git a/README.md b/README.md index 3f1d866..5fe22b7 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ A lightweight, self-hosted status page that monitors HTTP endpoints and displays - **Tick mark UI** — checkmarks (✓) and crosses (✗) colored by uptime percentage - **24-hour hourly** and **30-day daily** history rows per service - **Color-coded uptime**: green (100%) → yellow (≥95%) → orange (≥50%) → red (>0%) → purple (0%) +- **Telegram alerts** on detected downtime and recovery — one bot per instance, configurable failure threshold - **Single binary** — Rust backend serves the WASM frontend as static files - **SQLite** storage with WAL mode — no external database required - **TOML configuration** with sensible defaults and per-service overrides @@ -92,6 +93,11 @@ url = "sqlite://data/tickers.db" # SQLite database URL (default: "sqlite://data/ check_interval = 60 # Default check interval in seconds (default: 60) timeout = 10 # Default request timeout in seconds (default: 10) +[telegram] # Optional — omit the whole section to disable notifications +bot_token = "123456:ABC-DEF..." # Bot token from @BotFather (required to enable) +chat_ids = ["123456789"] # Chats to notify: user/group IDs or "@channel" (required to enable) +failure_threshold = 3 # Consecutive failed checks before a DOWN alert (default: 3) + [[services]] id = "my-api" # Unique service identifier (required) name = "My API" # Display name (required) @@ -117,6 +123,44 @@ Each `[[services]]` entry defines a monitored endpoint: | `check_interval` | No | from `[defaults]` | Check interval in seconds | | `timeout` | No | from `[defaults]` | Request timeout in seconds | +### Notifications (Telegram) + +Tickers can send a Telegram message when a service is detected down, and again when it recovers. One bot serves the whole instance, and every alert is delivered to all configured chats. Omit the `[telegram]` section to disable notifications entirely. + +**Setup:** + +1. 
Message [@BotFather](https://t.me/BotFather), send `/newbot`, and copy the **bot token** it gives you. +2. Find your **chat ID**: send any message to your bot (or add it to a group), open `https://api.telegram.org/bot<TOKEN>/getUpdates` (replace `<TOKEN>` with your bot token), and read `result[].message.chat.id`. Group IDs are negative (e.g. `-1001234567890`); channels can use `"@channelusername"` instead. +3. Add a `[telegram]` section to your `tickers.toml`: + + ```toml + [telegram] + bot_token = "123456:ABC-DEF1234ghIkl-zyx57W2v1u123ew11" + chat_ids = ["123456789", "-1001234567890"] + failure_threshold = 3 + ``` + +| Field | Required | Default | Description | +|---------------------|----------|---------|----------------------------------------------------------| +| `bot_token` | Yes | — | Bot token from @BotFather; empty disables notifications | +| `chat_ids` | Yes | — | Chats to notify — user/group IDs or `"@channel"` | +| `failure_threshold` | No | `3` | Consecutive failed checks before a DOWN alert | + +A **DOWN** alert fires after `failure_threshold` consecutive failed checks, so a single transient blip stays quiet. A **recovery** message fires on the first successful check afterward and includes how long the service was down. Notification state is restored from history on startup, so restarting Tickers won't re-alert an already-down service. 
+ +Example messages: + +``` +🔴 My API is DOWN +https://api.example.com/health +Timeout after 10000ms +2026-06-03 14:32 UTC + +✅ My API recovered +was down for 4m 12s +2026-06-03 14:36 UTC +``` + ## Development ### Prerequisites diff --git a/backend/Cargo.toml b/backend/Cargo.toml index 5fb4007..8c20887 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -20,7 +20,7 @@ sqlx = { version = "0.8", features = ["runtime-tokio", "sqlite", "migrate", "chr toml = "0.8" serde = { version = "1", features = ["derive"] } serde_json = "1" -reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } +reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json"] } chrono = { version = "0.4", features = ["serde"] } tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } diff --git a/backend/src/config.rs b/backend/src/config.rs index 2b86d48..cb6bce2 100644 --- a/backend/src/config.rs +++ b/backend/src/config.rs @@ -11,6 +11,8 @@ pub struct Config { #[serde(default)] pub database: DatabaseConfig, #[serde(default)] + pub telegram: TelegramConfig, + #[serde(default)] pub services: Vec<ServiceConfig>, } @@ -36,6 +38,20 @@ pub struct DatabaseConfig { pub url: String, } +#[derive(Debug, Clone, Deserialize)] +pub struct TelegramConfig { + /// Bot token from @BotFather. Empty disables notifications. + #[serde(default)] + pub bot_token: String, + /// Chat/group IDs to notify. Strings accept numeric IDs ("-1001234567890") + /// and channel usernames ("@mychannel"). Empty disables notifications. + #[serde(default)] + pub chat_ids: Vec<String>, + /// Consecutive failed checks required before a DOWN alert is sent. 
+ #[serde(default = "default_failure_threshold")] + pub failure_threshold: u32, +} + #[derive(Debug, Clone, Deserialize)] pub struct ServiceConfig { pub id: String, @@ -72,6 +88,10 @@ fn default_expected_status() -> u16 { 200 } +fn default_failure_threshold() -> u32 { + 3 +} + impl Default for ServerConfig { fn default() -> Self { Self { @@ -98,6 +118,23 @@ impl Default for DatabaseConfig { } } +impl Default for TelegramConfig { + fn default() -> Self { + Self { + bot_token: String::new(), + chat_ids: Vec::new(), + failure_threshold: default_failure_threshold(), + } + } +} + +impl TelegramConfig { + /// Notifications are active only when both a token and at least one chat are set. + pub fn is_enabled(&self) -> bool { + !self.bot_token.is_empty() && !self.chat_ids.is_empty() + } +} + pub enum BodyExpectation { Contains(String), Regex(Regex), @@ -152,6 +189,7 @@ impl Config { server: ServerConfig::default(), defaults: DefaultsConfig::default(), database: DatabaseConfig::default(), + telegram: TelegramConfig::default(), services: vec![], }) } @@ -170,6 +208,14 @@ impl Config { return Err(ConfigError::InvalidRegex(svc.id.clone(), e.to_string())); } } + + let tg = &self.telegram; + if tg.bot_token.is_empty() != tg.chat_ids.is_empty() { + tracing::warn!( + "Telegram is partially configured (need both bot_token and chat_ids); notifications disabled" + ); + } + Ok(()) } } diff --git a/backend/src/db.rs b/backend/src/db.rs index 27f098c..38b1959 100644 --- a/backend/src/db.rs +++ b/backend/src/db.rs @@ -91,6 +91,22 @@ pub async fn get_latest_per_service( .await } +/// Returns the `is_up` of the most recent check for a service, or `None` if it has +/// no history yet. Used to seed notification state across restarts so an already-down +/// service isn't re-alerted. Uses the `idx_check_results_lookup` index. 
+pub async fn get_last_is_up( + pool: &SqlitePool, + service_id: &str, +) -> Result<Option<bool>, sqlx::Error> { + let row = sqlx::query( + "SELECT is_up FROM check_results WHERE service_id = ? ORDER BY checked_at DESC LIMIT 1", + ) + .bind(service_id) + .fetch_optional(pool) + .await?; + Ok(row.map(|r| r.get("is_up"))) +} + pub async fn get_hourly_aggregation( pool: &SqlitePool, service_ids: &[String], diff --git a/backend/src/main.rs b/backend/src/main.rs index e870f23..032c2be 100644 --- a/backend/src/main.rs +++ b/backend/src/main.rs @@ -2,6 +2,7 @@ mod api; mod config; mod db; mod error; +mod notifier; mod worker; use std::sync::Arc; diff --git a/backend/src/notifier.rs b/backend/src/notifier.rs new file mode 100644 index 0000000..608b64a --- /dev/null +++ b/backend/src/notifier.rs @@ -0,0 +1,171 @@ +use crate::config::{ServiceConfig, TelegramConfig}; +use reqwest::Client; +use std::time::Duration; +use tracing::{error, warn}; + +/// Cap on a single Telegram API call so a slow/hung send can't stall a check loop. +const SEND_TIMEOUT: Duration = Duration::from_secs(10); + +/// Sends downtime/recovery messages to one or more Telegram chats via the Bot API. +/// One bot (token) per tickers instance; every notification fans out to all `chat_ids`. +pub struct Notifier { + client: Client, + bot_token: String, + chat_ids: Vec<String>, +} + +impl Notifier { + /// Returns `None` when Telegram isn't fully configured, so callers simply skip notifying. 
+ pub fn from_config(cfg: &TelegramConfig, client: Client) -> Option<Self> { + if !cfg.is_enabled() { + return None; + } + Some(Self { + client, + bot_token: cfg.bot_token.clone(), + chat_ids: cfg.chat_ids.clone(), + }) + } + + pub async fn notify_down(&self, service: &ServiceConfig, error: Option<&str>) { + self.send(&format_down(service, error, &now_utc())).await; + } + + pub async fn notify_recovery(&self, service: &ServiceConfig, down_for: Option<Duration>) { + self.send(&format_recovery(service, down_for, &now_utc())) + .await; + } + + /// Posts `text` to every configured chat. Sends are plain text (no `parse_mode`), + /// so message content needs no Markdown/HTML escaping. A failure on one chat is + /// logged and does not stop delivery to the others. + async fn send(&self, text: &str) { + let url = format!("https://api.telegram.org/bot{}/sendMessage", self.bot_token); + for chat_id in &self.chat_ids { + let body = serde_json::json!({ "chat_id": chat_id, "text": text }); + match self + .client + .post(&url) + .timeout(SEND_TIMEOUT) + .json(&body) + .send() + .await + { + Ok(resp) if resp.status().is_success() => {} + Ok(resp) => { + let status = resp.status(); + let detail = resp.text().await.unwrap_or_default(); + warn!(%chat_id, %status, detail = %detail, "Telegram API returned an error"); + } + Err(e) => error!(%chat_id, error = %e, "Failed to send Telegram notification"), + } + } + } +} + +fn now_utc() -> String { + chrono::Utc::now().format("%Y-%m-%d %H:%M UTC").to_string() +} + +fn format_down(service: &ServiceConfig, error: Option<&str>, timestamp: &str) -> String { + format!( + "🔴 {} is DOWN\n{}\n{}\n{}", + service.name, + service.url, + error.unwrap_or("Check failed"), + timestamp, + ) +} + +fn format_recovery(service: &ServiceConfig, down_for: Option<Duration>, timestamp: &str) -> String { + let mut msg = format!("✅ {} recovered\n", service.name); + if let Some(d) = down_for { + msg.push_str(&format!("was down for {}\n", humanize(d))); + } + msg.push_str(timestamp); + msg +} 
+ +/// Coarse, human-readable duration: "45s", "4m 12s", "1h 3m". +fn humanize(d: Duration) -> String { + let total = d.as_secs(); + let (h, m, s) = (total / 3600, (total % 3600) / 60, total % 60); + if h > 0 { + format!("{h}h {m}m") + } else if m > 0 { + format!("{m}m {s}s") + } else { + format!("{s}s") + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn sample_service() -> ServiceConfig { + ServiceConfig { + id: "example".into(), + name: "Example Service".into(), + url: "https://api.example.com/health".into(), + expected_status: 200, + check_interval: None, + timeout: None, + expected_body: None, + } + } + + #[test] + fn humanize_formats() { + assert_eq!(humanize(Duration::from_secs(0)), "0s"); + assert_eq!(humanize(Duration::from_secs(45)), "45s"); + assert_eq!(humanize(Duration::from_secs(4 * 60 + 12)), "4m 12s"); + assert_eq!(humanize(Duration::from_secs(3600 + 3 * 60)), "1h 3m"); + } + + #[test] + fn down_message_matches_format() { + let msg = format_down( + &sample_service(), + Some("Timeout after 10000ms"), + "2026-06-03 14:32 UTC", + ); + assert_eq!( + msg, + "🔴 Example Service is DOWN\n\ + https://api.example.com/health\n\ + Timeout after 10000ms\n\ + 2026-06-03 14:32 UTC" + ); + } + + #[test] + fn down_message_falls_back_without_error() { + let msg = format_down(&sample_service(), None, "2026-06-03 14:32 UTC"); + assert!(msg.contains("Check failed")); + } + + #[test] + fn recovery_message_with_duration() { + let msg = format_recovery( + &sample_service(), + Some(Duration::from_secs(4 * 60 + 12)), + "2026-06-03 14:36 UTC", + ); + assert_eq!( + msg, + "✅ Example Service recovered\n\ + was down for 4m 12s\n\ + 2026-06-03 14:36 UTC" + ); + } + + #[test] + fn recovery_message_omits_unknown_duration() { + let msg = format_recovery(&sample_service(), None, "2026-06-03 14:36 UTC"); + assert_eq!( + msg, + "✅ Example Service recovered\n2026-06-03 14:36 UTC" + ); + } +} diff --git a/backend/src/worker.rs b/backend/src/worker.rs index 17b7a9b..da175b0 100644 
--- a/backend/src/worker.rs +++ b/backend/src/worker.rs @@ -1,7 +1,9 @@ use crate::config::{BodyExpectation, Config, DefaultsConfig, ServiceConfig}; use crate::db; +use crate::notifier::Notifier; use reqwest::Client; use sqlx::SqlitePool; +use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; use tracing::{error, info, warn}; @@ -10,6 +12,7 @@ pub struct Worker { config: Config, pool: SqlitePool, client: Client, + notifier: Option<Arc<Notifier>>, cancel_token: CancellationToken, } @@ -21,10 +24,21 @@ impl Worker { .build() .expect("Failed to build HTTP client"); + // Reuse the same HTTP client for outbound Telegram calls. `None` when disabled. + let notifier = Notifier::from_config(&config.telegram, client.clone()).map(Arc::new); + if notifier.is_some() { + info!( + chats = config.telegram.chat_ids.len(), + failure_threshold = config.telegram.failure_threshold, + "Telegram notifications enabled" + ); + } + Self { config, pool, client, + notifier, cancel_token, } } @@ -32,6 +46,7 @@ impl Worker { pub fn spawn_all(self) -> Vec<tokio::task::JoinHandle<()>> { let mut handles = Vec::new(); let defaults = self.config.defaults.clone(); + let failure_threshold = self.config.telegram.failure_threshold.max(1); for service in &self.config.services { let pool = self.pool.clone(); @@ -39,9 +54,19 @@ impl Worker { let token = self.cancel_token.clone(); let service = service.clone(); let defaults = defaults.clone(); + let notifier = self.notifier.clone(); handles.push(tokio::spawn(async move { - run_check_loop(pool, client, service, defaults, token).await; + run_check_loop( + pool, + client, + service, + defaults, + notifier, + failure_threshold, + token, + ) + .await; })); } @@ -60,6 +85,8 @@ async fn run_check_loop( client: Client, service: ServiceConfig, defaults: DefaultsConfig, + notifier: Option<Arc<Notifier>>, + failure_threshold: u32, token: CancellationToken, ) { let interval = Duration::from_secs(service.effective_check_interval(&defaults)); @@ -71,12 +98,25 @@ async fn 
run_check_loop( "Starting check loop" ); - perform_check(&pool, &client, &service, timeout).await; + // Per-service transition tracking, seeded from history so a restart doesn't + // re-alert an already-down service. Only kept when notifications are enabled. + let mut notify_state = match &notifier { + Some(_) => Some(NotifyState::seed(&pool, &service.id).await), + None => None, + }; + + let outcome = perform_check(&pool, &client, &service, timeout).await; + if let (Some(n), Some(st)) = (&notifier, notify_state.as_mut()) { + process_outcome(n, &service, failure_threshold, &outcome, st).await; + } loop { tokio::select! { _ = tokio::time::sleep(interval) => { - perform_check(&pool, &client, &service, timeout).await; + let outcome = perform_check(&pool, &client, &service, timeout).await; + if let (Some(n), Some(st)) = (&notifier, notify_state.as_mut()) { + process_outcome(n, &service, failure_threshold, &outcome, st).await; + } } _ = token.cancelled() => { info!(service_id = %service.id, "Check loop shutting down"); @@ -86,17 +126,90 @@ async fn run_check_loop( } } +/// Outcome of a single check, surfaced to the notification state machine. +struct CheckOutcome { + is_up: bool, + error_message: Option<String>, +} + +/// Per-service notification state: how many checks have failed in a row, whether a +/// DOWN alert has already been sent for the current outage, and when it began. +struct NotifyState { + consecutive_failures: u32, + alerted_down: bool, + down_since: Option<Instant>, +} + +impl NotifyState { + /// Seeds `alerted_down` from the last persisted result so a restart mid-outage + /// doesn't fire a duplicate DOWN alert (a later recovery still fires). When seeded + /// down, `down_since` stays unknown, so that recovery omits the duration line. 
+ async fn seed(pool: &SqlitePool, service_id: &str) -> Self { + let alerted_down = match db::get_last_is_up(pool, service_id).await { + Ok(Some(is_up)) => !is_up, + Ok(None) => false, + Err(e) => { + warn!(service_id = %service_id, error = %e, "Failed to seed notification state"); + false + } + }; + Self { + consecutive_failures: 0, + alerted_down, + down_since: None, + } + } +} + +/// Drives DOWN/recovery notifications from a check outcome. Alerts after +/// `failure_threshold` consecutive failures; recovers on the first success. +async fn process_outcome( + notifier: &Notifier, + service: &ServiceConfig, + failure_threshold: u32, + outcome: &CheckOutcome, + state: &mut NotifyState, +) { + if outcome.is_up { + state.consecutive_failures = 0; + if state.alerted_down { + let down_for = state.down_since.map(|t| t.elapsed()); + notifier.notify_recovery(service, down_for).await; + state.alerted_down = false; + state.down_since = None; + info!(service_id = %service.id, "Sent recovery notification"); + } + } else { + // Mark the start of a fresh outage (not one we've already alerted/seeded). 
+ if state.consecutive_failures == 0 && !state.alerted_down { + state.down_since = Some(Instant::now()); + } + state.consecutive_failures += 1; + if !state.alerted_down && state.consecutive_failures >= failure_threshold { + notifier + .notify_down(service, outcome.error_message.as_deref()) + .await; + state.alerted_down = true; + info!( + service_id = %service.id, + failures = state.consecutive_failures, + "Sent downtime notification" + ); + } + } +} + async fn perform_check( pool: &SqlitePool, client: &Client, service: &ServiceConfig, timeout: Duration, -) { +) -> CheckOutcome { let start = Instant::now(); let result = client.get(&service.url).timeout(timeout).send().await; let elapsed_ms = start.elapsed().as_millis() as i64; - match result { + let (is_up, status_code, error_message) = match result { Ok(response) => { let status = response.status().as_u16() as i32; let status_ok = response.status().as_u16() == service.expected_status; @@ -135,18 +248,7 @@ async fn perform_check( } }; - if let Err(e) = db::insert_check_result( - pool, - &service.id, - is_up, - Some(status), - elapsed_ms, - error_message.as_deref(), - ) - .await - { - error!(service_id = %service.id, error = %e, "Failed to insert check result"); - } + (is_up, Some(status), error_message) } Err(err) => { let error_msg = if err.is_timeout() { @@ -159,19 +261,26 @@ async fn perform_check( warn!(service_id = %service.id, error = %error_msg, "Health check failed"); - if let Err(e) = db::insert_check_result( - pool, - &service.id, - false, - None, - elapsed_ms, - Some(&error_msg), - ) - .await - { - error!(service_id = %service.id, error = %e, "Failed to insert check result"); - } + (false, None, Some(error_msg)) } + }; + + if let Err(e) = db::insert_check_result( + pool, + &service.id, + is_up, + status_code, + elapsed_ms, + error_message.as_deref(), + ) + .await + { + error!(service_id = %service.id, error = %e, "Failed to insert check result"); + } + + CheckOutcome { + is_up, + error_message, } } diff 
--git a/tickers.toml b/tickers.toml index 426e70e..e25c6e6 100644 --- a/tickers.toml +++ b/tickers.toml @@ -9,6 +9,13 @@ timeout = 10 # seconds [database] url = "sqlite://data/tickers.db" +# Telegram downtime/recovery notifications (one bot per instance). +# Omit this whole section to disable. Both bot_token and chat_ids are required. +# [telegram] +# bot_token = "123456:ABC-DEF1234ghIkl-zyx57W2v1u123ew11" # from @BotFather +# chat_ids = ["123456789", "-1001234567890"] # users, groups, or @channel +# failure_threshold = 3 # consecutive failed checks before a DOWN alert (default 3) + [[services]] id = "example" name = "Example Service"