Skip to content

Commit 9de440c

Browse files
Copilot and Steake authored
Operationalize Production Infrastructure: Multi-Region, Monitoring, Chaos Testing (#117)
* Initial plan
* Add production infrastructure with multi-region deployment, monitoring, and chaos testing
  Co-authored-by: Steake <530040+Steake@users.noreply.github.com>
* Add infrastructure validation script and testing documentation
  Co-authored-by: Steake <530040+Steake@users.noreply.github.com>
* Address code review feedback: security improvements and configuration fixes
  Co-authored-by: Steake <530040+Steake@users.noreply.github.com>
* Add comprehensive implementation summary and finalize documentation
  Co-authored-by: Steake <530040+Steake@users.noreply.github.com>
* Fix PR review feedback: improve Python code quality and require Grafana password
  - Remove unused imports (random, Optional) from chaos_test.py
  - Fix type hint: change 'any' to 'Any'
  - Replace bare except clauses with specific exception types
  - Remove unused variable nodes_group_a
  - Require GRAFANA_ADMIN_PASSWORD (no default) for security
  - Document env var substitution requirements in alertmanager.yml
  - Add preprocessing notes to kubernetes deployment.yaml
  - Update all documentation to reflect required password
  Co-authored-by: Steake <530040+Steake@users.noreply.github.com>
---------
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: Steake <530040+Steake@users.noreply.github.com>
1 parent c8d1329 commit 9de440c

23 files changed

Lines changed: 5828 additions & 2 deletions

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,5 +35,11 @@ criterion/
3535
help_output.txt
3636
target/
3737

38+
# Infrastructure data
39+
/infra/docker/prometheus-data/
40+
/infra/docker/grafana-data/
41+
/infra/docker/alertmanager-data/
42+
/backups/
43+
3844
# SDK compiled bytecode
3945
sdk/**/*.bin

crates/bitcell-node/src/monitoring/metrics.rs

Lines changed: 76 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
//! Metrics collection and export
22
3+
use std::sync::Arc;
4+
use tokio::net::TcpListener;
5+
use tokio::io::{AsyncReadExt, AsyncWriteExt};
6+
37
pub use super::MetricsRegistry;
48

59
/// HTTP server for Prometheus metrics endpoint
@@ -22,8 +26,78 @@ impl MetricsServer {
2226
self.registry.export_prometheus()
2327
}
2428

25-
// Future: Actual HTTP server implementation would go here
26-
// For now, just expose the metrics getter
29+
/// Get health check status
30+
pub fn get_health(&self) -> String {
31+
// Basic health check - node is up if we can respond
32+
let chain_height = self.registry.get_chain_height();
33+
let peer_count = self.registry.get_peer_count();
34+
35+
format!(
36+
r#"{{"status":"ok","chain_height":{},"peer_count":{}}}"#,
37+
chain_height, peer_count
38+
)
39+
}
40+
41+
/// Start HTTP server for metrics and health endpoints
42+
pub async fn serve(self) -> Result<(), std::io::Error> {
43+
let addr = format!("0.0.0.0:{}", self.port);
44+
let listener = TcpListener::bind(&addr).await?;
45+
let registry = Arc::new(self.registry);
46+
47+
tracing::info!("Metrics server listening on {}", addr);
48+
49+
loop {
50+
match listener.accept().await {
51+
Ok((mut socket, _)) => {
52+
let registry_clone = Arc::clone(&registry);
53+
54+
tokio::spawn(async move {
55+
let mut buffer = [0; 1024];
56+
57+
match socket.read(&mut buffer).await {
58+
Ok(n) if n > 0 => {
59+
let request = String::from_utf8_lossy(&buffer[..n]);
60+
61+
let response = if request.starts_with("GET /health") {
62+
// Health check endpoint
63+
let chain_height = registry_clone.get_chain_height();
64+
let peer_count = registry_clone.get_peer_count();
65+
let body = format!(
66+
r#"{{"status":"ok","chain_height":{},"peer_count":{}}}"#,
67+
chain_height, peer_count
68+
);
69+
format!(
70+
"HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\n\r\n{}",
71+
body.len(), body
72+
)
73+
} else if request.starts_with("GET /metrics") {
74+
// Prometheus metrics endpoint
75+
let body = registry_clone.export_prometheus();
76+
format!(
77+
"HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nContent-Length: {}\r\n\r\n{}",
78+
body.len(), body
79+
)
80+
} else {
81+
// 404 for other paths
82+
let body = "Not Found";
83+
format!(
84+
"HTTP/1.1 404 Not Found\r\nContent-Length: {}\r\n\r\n{}",
85+
body.len(), body
86+
)
87+
};
88+
89+
let _ = socket.write_all(response.as_bytes()).await;
90+
}
91+
_ => {}
92+
}
93+
});
94+
}
95+
Err(e) => {
96+
tracing::error!("Failed to accept connection: {}", e);
97+
}
98+
}
99+
}
100+
}
27101
}
28102

29103
#[cfg(test)]

infra/.gitignore

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Temporary data directories
2+
/tmp/bitcell/
3+
4+
# Docker volumes data
5+
*.tar.gz
6+
7+
# Backup files
8+
/backups/
9+
10+
# Log files
11+
*.log
12+
13+
# Environment files with secrets
14+
.env.production
15+
.env.local
16+
17+
# Grafana data
18+
grafana-data/
19+
20+
# Prometheus data
21+
prometheus-data/
22+
23+
# Alertmanager data
24+
alertmanager-data/

0 commit comments

Comments
 (0)