# Synor Testnet Alert Rules
# For 30-day stability validation

groups:
# ==========================================================================
# Node Health Alerts
# ==========================================================================
  # Evaluated every 30s. Covers scrape reachability and process restarts
  # for targets in the "synor-nodes" job.
  - name: synor_node_health
    interval: 30s
    rules:
      # Node down: the target's `up` series equals 0 and stays that way
      # for 2 minutes.
      - alert: SynorNodeDown
        expr: 'up{job="synor-nodes"} == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Synor node {{ $labels.instance }} is down'
          description: >-
            Node {{ $labels.instance }} has been unreachable
            for more than 2 minutes.

      # Node restarted: process_start_time_seconds changed at least once
      # inside the 5-minute window. No `for:` hold period is configured.
      - alert: SynorNodeRestarted
        expr: 'changes(process_start_time_seconds{job="synor-nodes"}[5m]) > 0'
        labels:
          severity: warning
        annotations:
          summary: 'Synor node {{ $labels.instance }} restarted'
          description: 'Node has restarted in the last 5 minutes.'

# ==========================================================================
# Consensus Alerts
# ==========================================================================
  # Evaluated every 1m. Watches block production and DAA score progress.
  - name: synor_consensus
    interval: 1m
    rules:
      # No new blocks: the block counter shows zero increase over a
      # 10-minute window (target rate is 10 blocks per second, so this is
      # critical). The condition must additionally hold for 5 minutes.
      - alert: SynorNoNewBlocks
        expr: 'increase(synor_block_count_total[10m]) == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'No new blocks produced on {{ $labels.instance }}'
          description: >-
            No blocks have been produced in the last 10 minutes.
            Consensus may be stalled.

      # Low block rate: per-second rate over 5 minutes is under 5, i.e.
      # below half of the 10/s target, sustained for 10 minutes.
      - alert: SynorLowBlockRate
        expr: 'rate(synor_block_count_total[5m]) < 5'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Low block rate on {{ $labels.instance }}'
          description: 'Block rate is {{ $value | humanize }}/s (target: 10/s)'

      # DAA score stalled: no increase over a 5-minute window, held for
      # 5 minutes.
      # NOTE(review): increase() is meant for counters; confirm that
      # synor_daa_score is exported as a counter rather than a gauge.
      - alert: SynorDaaScoreStalled
        expr: 'increase(synor_daa_score[5m]) == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'DAA score stalled on {{ $labels.instance }}'
          description: 'DAA score has not increased in 5 minutes.'
# ==========================================================================
# Network Alerts
# ==========================================================================
  # Evaluated every 1m. Both rules read the synor_peer_count series.
  - name: synor_network
    interval: 1m
    rules:
      # Low peer count: fewer than 2 peers for 5 minutes.
      # NOTE(review): the expression fires below 2 peers while the
      # description cites a recommended minimum of 3 - confirm which
      # threshold is intended.
      - alert: SynorLowPeerCount
        expr: 'synor_peer_count < 2'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Low peer count on {{ $labels.instance }}'
          description: >-
            Node has only {{ $value }} peers
            (minimum recommended: 3)

      # Network partition: the node reports exactly zero peers for
      # 2 minutes, i.e. it is isolated.
      - alert: SynorNetworkPartition
        expr: 'synor_peer_count == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Node {{ $labels.instance }} is isolated'
          description: 'Node has 0 peers - possible network partition.'

# ==========================================================================
# Mempool Alerts
# ==========================================================================
  # Evaluated every 1m. Watches mempool size and whether it is draining.
  - name: synor_mempool
    interval: 1m
    rules:
      # Mempool overflow: more than 10000 entries for 5 minutes.
      - alert: SynorMempoolOverflow
        expr: 'synor_mempool_size > 10000'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Mempool overflow on {{ $labels.instance }}'
          description: >-
            Mempool has {{ $value }} transactions
            (threshold: 10000)

      # Mempool not draining: size is above 100 while the removed-tx
      # series shows no increase over 10 minutes; held for 10 minutes.
      # The left-hand side of `and` supplies the alert value, so
      # {{ $value }} is the mempool size.
      - alert: SynorMempoolStale
        expr: >-
          synor_mempool_size > 100
          and increase(synor_mempool_txs_removed[10m]) == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Mempool not draining on {{ $labels.instance }}'
          description: >-
            Mempool has {{ $value }} transactions
            but none are being processed.
# ==========================================================================
# Resource Alerts
# ==========================================================================
  # Evaluated every 30s. Process-level CPU and memory for the
  # "synor-nodes" job, plus root filesystem headroom.
  - name: synor_resources
    interval: 30s
    rules:
      # High CPU usage: the process consumes more than 0.9 CPU-seconds
      # per second (5-minute rate) for 10 minutes.
      - alert: SynorHighCpuUsage
        expr: 'rate(process_cpu_seconds_total{job="synor-nodes"}[5m]) > 0.9'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'High CPU usage on {{ $labels.instance }}'
          description: 'CPU usage is {{ $value | humanizePercentage }}'

      # High memory usage: resident memory above 4e9 bytes for 10 minutes.
      - alert: SynorHighMemoryUsage
        expr: 'process_resident_memory_bytes{job="synor-nodes"} > 4e9'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'High memory usage on {{ $labels.instance }}'
          description: 'Memory usage is {{ $value | humanize1024 }}'

      # Disk space low (host): available/size ratio on the "/" mountpoint
      # is under 0.1 for 5 minutes. The summary names the reporting
      # instance so the alert is attributable when several hosts are
      # scraped, matching every other alert in this file.
      - alert: SynorLowDiskSpace
        expr: >-
          (node_filesystem_avail_bytes{mountpoint="/"}
          / node_filesystem_size_bytes{mountpoint="/"}) < 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Low disk space on host {{ $labels.instance }}'
          description: >-
            Only {{ $value | humanizePercentage }} disk space remaining

# ==========================================================================
# Uptime Tracking (for 99.9% SLA)
# ==========================================================================
  # Evaluated every 1m. The recording rule is listed before the alert
  # that reads its output series.
  - name: synor_uptime
    interval: 1m
    rules:
      # Record uptime for SLA calculation: mean of `up` over the last
      # 30 days, one output series per input series of the job.
      - record: 'synor:uptime_ratio:30d'
        expr: 'avg_over_time(up{job="synor-nodes"}[30d])'

      # Alert when the recorded ratio is under the 99.9% target for 1 hour.
      # The ratio is per series, so the summary names the affected
      # instance.
      - alert: SynorUptimeBelowSLA
        expr: 'synor:uptime_ratio:30d < 0.999'
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: 'Uptime below SLA target on {{ $labels.instance }}'
          description: >-
            30-day uptime is {{ $value | humanizePercentage }}
            (target: 99.9%)