synor/monitoring/alerts.yml
Gulshan Yadav 6b5a232a5e feat: Desktop wallet, gas estimator UI, and 30-day monitoring stack
Security (Desktop Wallet):
- Implement BIP39 mnemonic generation with cryptographic RNG
- Add Argon2id password-based key derivation (64MB, 3 iterations)
- Add ChaCha20-Poly1305 authenticated encryption for seed storage
- Add mnemonic auto-clear (60s timeout) and clipboard auto-clear (30s)
- Add sanitized error logging to prevent credential leaks
- Strengthen CSP with object-src, base-uri, form-action, frame-ancestors
- Clear sensitive state on component unmount

Explorer (Gas Estimator):
- Add Gas Estimation page with from/to/amount/data inputs
- Add bech32 address validation (synor1/tsynor1 prefix)
- Add BigInt-based amount parsing to avoid floating point errors
- Add production guard for mock mode (cannot enable in prod builds)

Monitoring (30-day Testnet):
- Add Prometheus config with 30-day retention
- Add comprehensive alert rules for node health, consensus, network, mempool
- Add Alertmanager with severity-based routing and inhibition rules
- Add Grafana with auto-provisioned datasource and dashboard
- Add Synor testnet dashboard with uptime SLA tracking

Docker:
- Update docker-compose.testnet.yml with monitoring profile
- Fix node-exporter for macOS Docker Desktop compatibility
- Change Grafana port to 3001 to avoid conflict
2026-01-10 04:38:09 +05:30

172 lines
6.1 KiB
YAML

# Synor Testnet Alert Rules
# For 30-day stability validation
# Loaded by Prometheus via rule_files:. Each group below declares its own
# evaluation interval; alerts route through Alertmanager by `severity` label
# (warning/critical), per the severity-based routing described in the commit.
groups:
# ==========================================================================
# Node Health Alerts
# ==========================================================================
- name: synor_node_health
interval: 30s
rules:
# Node Down Alert
- alert: SynorNodeDown
expr: up{job="synor-nodes"} == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Synor node {{ $labels.instance }} is down"
description: "Node {{ $labels.instance }} has been unreachable for more than 2 minutes."
# Node Restarted
- alert: SynorNodeRestarted
expr: changes(process_start_time_seconds{job="synor-nodes"}[5m]) > 0
labels:
severity: warning
annotations:
summary: "Synor node {{ $labels.instance }} restarted"
description: "Node has restarted in the last 5 minutes."
# ==========================================================================
# Consensus Alerts
# ==========================================================================
- name: synor_consensus
interval: 1m
rules:
# No new blocks for 10 minutes (at 10 BPS, this is critical)
- alert: SynorNoNewBlocks
expr: increase(synor_block_count_total[10m]) == 0
for: 5m
labels:
severity: critical
annotations:
summary: "No new blocks produced on {{ $labels.instance }}"
description: "No blocks have been produced in the last 10 minutes. Consensus may be stalled."
# Block rate too low (< 5 BPS when target is 10)
- alert: SynorLowBlockRate
expr: rate(synor_block_count_total[5m]) < 5
for: 10m
labels:
severity: warning
annotations:
summary: "Low block rate on {{ $labels.instance }}"
description: "Block rate is {{ $value | humanize }}/s (target: 10/s)"
# DAA Score not increasing
- alert: SynorDaaScoreStalled
expr: increase(synor_daa_score[5m]) == 0
for: 5m
labels:
severity: critical
annotations:
summary: "DAA score stalled on {{ $labels.instance }}"
description: "DAA score has not increased in 5 minutes."
# ==========================================================================
# Network Alerts
# ==========================================================================
- name: synor_network
interval: 1m
rules:
# Low peer count
- alert: SynorLowPeerCount
expr: synor_peer_count < 2
for: 5m
labels:
severity: warning
annotations:
summary: "Low peer count on {{ $labels.instance }}"
description: "Node has only {{ $value }} peers (minimum recommended: 3)"
# Network partition (node isolated)
- alert: SynorNetworkPartition
expr: synor_peer_count == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Node {{ $labels.instance }} is isolated"
description: "Node has 0 peers - possible network partition."
# ==========================================================================
# Mempool Alerts
# ==========================================================================
- name: synor_mempool
interval: 1m
rules:
# Mempool growing too large
- alert: SynorMempoolOverflow
expr: synor_mempool_size > 10000
for: 5m
labels:
severity: warning
annotations:
summary: "Mempool overflow on {{ $labels.instance }}"
description: "Mempool has {{ $value }} transactions (threshold: 10000)"
# Mempool not draining
- alert: SynorMempoolStale
expr: synor_mempool_size > 100 and increase(synor_mempool_txs_removed[10m]) == 0
for: 10m
labels:
severity: warning
annotations:
summary: "Mempool not draining on {{ $labels.instance }}"
description: "Mempool has {{ $value }} transactions but none are being processed."
# ==========================================================================
# Resource Alerts
# ==========================================================================
- name: synor_resources
interval: 30s
rules:
# High CPU usage
- alert: SynorHighCpuUsage
expr: rate(process_cpu_seconds_total{job="synor-nodes"}[5m]) > 0.9
for: 10m
labels:
severity: warning
annotations:
summary: "High CPU usage on {{ $labels.instance }}"
description: "CPU usage is {{ $value | humanizePercentage }}"
# High memory usage
- alert: SynorHighMemoryUsage
expr: process_resident_memory_bytes{job="synor-nodes"} > 4e9
for: 10m
labels:
severity: warning
annotations:
summary: "High memory usage on {{ $labels.instance }}"
description: "Memory usage is {{ $value | humanize1024 }}"
# Disk space low (host)
- alert: SynorLowDiskSpace
expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.1
for: 5m
labels:
severity: critical
annotations:
summary: "Low disk space on host"
description: "Only {{ $value | humanizePercentage }} disk space remaining"
# ==========================================================================
# Uptime Tracking (for 99.9% SLA)
# ==========================================================================
- name: synor_uptime
interval: 1m
rules:
# Record uptime for SLA calculation
- record: synor:uptime_ratio:30d
expr: avg_over_time(up{job="synor-nodes"}[30d])
# Alert if below 99.9% uptime target
- alert: SynorUptimeBelowSLA
expr: synor:uptime_ratio:30d < 0.999
for: 1h
labels:
severity: warning
annotations:
summary: "Uptime below SLA target"
description: "30-day uptime is {{ $value | humanizePercentage }} (target: 99.9%)"