- Replace manual modulo checks with .is_multiple_of() - Use enumerate() instead of manual loop counters - Use iterator .take() instead of index-based loops - Use slice literals instead of unnecessary vec![] - Allow too_many_arguments in IBC and bridge crates (protocol requirements) - Allow assertions on constants in integration tests
1205 lines
40 KiB
Rust
1205 lines
40 KiB
Rust
//! Network partition detection for Synor blockchain.
|
|
//!
|
|
//! This module provides mechanisms to detect when a node might be partitioned
|
|
//! from the main network. Partition detection is critical for GHOSTDAG-based
|
|
//! blockchains to prevent the node from operating on a minority fork.
|
|
//!
|
|
//! # Detection Methods
|
|
//!
|
|
//! The detector uses multiple signals to identify potential partitions:
|
|
//!
|
|
//! - **Peer count drops**: Sudden loss of peers may indicate network issues
|
|
//! - **Tip divergence**: When our tips don't match what peers report
|
|
//! - **Block production stalls**: No new blocks received for extended periods
|
|
//! - **Peer diversity degradation**: Loss of geographic/subnet diversity
|
|
//! - **Protocol version skew**: Most peers on different protocol versions
|
|
//!
|
|
//! # Partition States
|
|
//!
|
|
//! - `Connected`: Normal operation, healthy network connectivity
|
|
//! - `Degraded`: Warning state, some metrics are concerning but not critical
|
|
//! - `Partitioned`: Node appears isolated from the main network
|
|
//!
|
|
//! # Usage
|
|
//!
|
|
//! ```ignore
|
|
//! use synor_network::partition::{PartitionDetector, PartitionConfig};
|
|
//!
|
|
//! let config = PartitionConfig::default();
|
|
//! let detector = PartitionDetector::new(config);
|
|
//!
|
|
//! // Periodically update metrics
|
|
//! detector.record_peer_connected(peer_id, Some(ip), is_outbound);
|
|
//! detector.record_block_received(block_hash);
|
|
//!
|
|
//! // Check partition status
|
|
//! match detector.status() {
|
|
//! PartitionStatus::Connected => { /* normal operation */ }
|
|
//! PartitionStatus::Degraded { reasons } => { /* log warnings */ }
|
|
//! PartitionStatus::Partitioned { reasons } => { /* halt mining, alert */ }
|
|
//! }
|
|
//! ```
|
|
|
|
use hashbrown::{HashMap, HashSet};
|
|
use libp2p::PeerId;
|
|
use parking_lot::RwLock;
|
|
use serde::{Deserialize, Serialize};
|
|
use std::collections::VecDeque;
|
|
use std::net::IpAddr;
|
|
use std::time::{Duration, Instant};
|
|
use synor_types::Hash256;
|
|
|
|
/// Tunable thresholds used by the partition detector.
///
/// Ratio-style fields (`max_subnet_concentration`, `min_tip_agreement`,
/// `block_rate_threshold`, `min_protocol_agreement`) are fractions, not
/// percentages; see the defaults for typical values.
#[derive(Debug, Clone)]
pub struct PartitionConfig {
    /// Peer count below which the node is reported as having too few peers.
    pub min_peers: usize,

    /// Number of distinct peer subnets below which diversity is reported as low.
    pub min_subnets: usize,

    /// Outbound peer count below which a warning is raised.
    pub min_outbound_peers: usize,

    /// Time without a new block after which a stall warning is raised.
    pub block_stall_warning: Duration,

    /// Time without a new block after which the stall is treated as critical.
    pub block_stall_critical: Duration,

    /// Largest share of peers (as a fraction) that a single subnet may hold.
    pub max_subnet_concentration: f32,

    /// Maximum tip age before the node is considered potentially partitioned.
    pub max_tip_age: Duration,

    /// Smallest fraction of tip-reporting peers that must share a tip with us.
    pub min_tip_agreement: f32,

    /// How long a cached status may be reused before it is re-evaluated.
    pub check_interval: Duration,

    /// Maximum number of block arrivals kept for rate calculation.
    pub block_time_window: usize,

    /// Fraction of the expected block rate below which a slowdown is reported.
    pub block_rate_threshold: f32,

    /// Target time between blocks (for GHOSTDAG, the target block time).
    pub expected_block_interval: Duration,

    /// Share of version-reporting peers (as a fraction) a foreign protocol
    /// version must exceed before version skew is reported.
    pub min_protocol_agreement: f32,
}
|
|
|
|
impl Default for PartitionConfig {
|
|
fn default() -> Self {
|
|
PartitionConfig {
|
|
min_peers: 3,
|
|
min_subnets: 2,
|
|
min_outbound_peers: 2,
|
|
block_stall_warning: Duration::from_secs(30),
|
|
block_stall_critical: Duration::from_secs(120),
|
|
max_subnet_concentration: 0.5, // No subnet should have >50% of peers
|
|
max_tip_age: Duration::from_secs(300),
|
|
min_tip_agreement: 0.3, // At least 30% of peers should agree on tips
|
|
check_interval: Duration::from_secs(10),
|
|
block_time_window: 100,
|
|
block_rate_threshold: 0.1, // Alert if block rate drops to 10% of expected
|
|
expected_block_interval: Duration::from_millis(1000), // 1 block per second
|
|
min_protocol_agreement: 0.5, // At least 50% should be on same protocol
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Reasons why the network might be degraded or partitioned.
|
|
///
|
|
/// Note: Percentage values are stored as u8 (0-100) for Eq/Hash compatibility.
|
|
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
|
pub enum PartitionReason {
|
|
/// Not enough connected peers.
|
|
InsufficientPeers { current: usize, required: usize },
|
|
/// Not enough outbound connections.
|
|
InsufficientOutbound { current: usize, required: usize },
|
|
/// Lack of subnet diversity.
|
|
LowSubnetDiversity { current: usize, required: usize },
|
|
/// Too many peers from a single subnet.
|
|
/// `percentage` is 0-100.
|
|
SubnetConcentration { subnet: u32, percentage: u8 },
|
|
/// No new blocks received recently.
|
|
/// `duration_secs` is the number of seconds since last block.
|
|
BlockProductionStalled { duration_secs: u64 },
|
|
/// Block production rate significantly lower than expected.
|
|
/// Rates are stored as millibps (blocks per 1000 seconds) for precision.
|
|
LowBlockRate {
|
|
current_rate_millibps: u32,
|
|
expected_rate_millibps: u32,
|
|
},
|
|
/// Our tips don't match peer tips.
|
|
/// `matching_peers_pct` is 0-100.
|
|
TipDivergence {
|
|
matching_peers_pct: u8,
|
|
threshold_pct: u8,
|
|
},
|
|
/// Tip is too old.
|
|
/// `age_secs` and `max_age_secs` are in seconds.
|
|
StaleTip { age_secs: u64, max_age_secs: u64 },
|
|
/// Protocol version mismatch with majority.
|
|
ProtocolVersionSkew {
|
|
our_version: u32,
|
|
majority_version: u32,
|
|
},
|
|
/// Most peers have higher blue scores, we may be on a minority fork.
|
|
BehindNetwork { our_score: u64, network_score: u64 },
|
|
/// All connections are inbound (potential eclipse attack).
|
|
NoOutboundConnections,
|
|
/// Lost significant portion of peers suddenly.
|
|
SuddenPeerLoss { lost: usize, remaining: usize },
|
|
}
|
|
|
|
impl PartitionReason {
|
|
/// Returns a human-readable description of the reason.
|
|
pub fn description(&self) -> String {
|
|
match self {
|
|
PartitionReason::InsufficientPeers { current, required } => {
|
|
format!(
|
|
"Only {} peers connected, need at least {}",
|
|
current, required
|
|
)
|
|
}
|
|
PartitionReason::InsufficientOutbound { current, required } => {
|
|
format!(
|
|
"Only {} outbound peers, need at least {}",
|
|
current, required
|
|
)
|
|
}
|
|
PartitionReason::LowSubnetDiversity { current, required } => {
|
|
format!(
|
|
"Only {} unique subnets, need at least {}",
|
|
current, required
|
|
)
|
|
}
|
|
PartitionReason::SubnetConcentration { subnet, percentage } => {
|
|
format!(
|
|
"Subnet {:X} has {}% of peers, max allowed 50%",
|
|
subnet, percentage
|
|
)
|
|
}
|
|
PartitionReason::BlockProductionStalled { duration_secs } => {
|
|
format!("No new blocks for {} seconds", duration_secs)
|
|
}
|
|
PartitionReason::LowBlockRate {
|
|
current_rate_millibps,
|
|
expected_rate_millibps,
|
|
} => {
|
|
format!(
|
|
"Block rate {:.2}/s, expected {:.2}/s",
|
|
*current_rate_millibps as f64 / 1000.0,
|
|
*expected_rate_millibps as f64 / 1000.0
|
|
)
|
|
}
|
|
PartitionReason::TipDivergence {
|
|
matching_peers_pct,
|
|
threshold_pct,
|
|
} => {
|
|
format!(
|
|
"Only {}% of peers agree on tips, need {}%",
|
|
matching_peers_pct, threshold_pct
|
|
)
|
|
}
|
|
PartitionReason::StaleTip {
|
|
age_secs,
|
|
max_age_secs,
|
|
} => {
|
|
format!(
|
|
"Best tip is {} seconds old, max allowed {} seconds",
|
|
age_secs, max_age_secs
|
|
)
|
|
}
|
|
PartitionReason::ProtocolVersionSkew {
|
|
our_version,
|
|
majority_version,
|
|
} => {
|
|
format!(
|
|
"Our protocol version {} differs from majority {}",
|
|
our_version, majority_version
|
|
)
|
|
}
|
|
PartitionReason::BehindNetwork {
|
|
our_score,
|
|
network_score,
|
|
} => {
|
|
format!(
|
|
"Our blue score {} is behind network {}",
|
|
our_score, network_score
|
|
)
|
|
}
|
|
PartitionReason::NoOutboundConnections => {
|
|
"No outbound connections (potential eclipse attack)".to_string()
|
|
}
|
|
PartitionReason::SuddenPeerLoss { lost, remaining } => {
|
|
format!("Lost {} peers suddenly, {} remaining", lost, remaining)
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Returns true if this reason indicates a critical issue.
|
|
pub fn is_critical(&self) -> bool {
|
|
match self {
|
|
PartitionReason::NoOutboundConnections => true,
|
|
PartitionReason::InsufficientPeers { current: 0, .. } => true,
|
|
PartitionReason::BlockProductionStalled { duration_secs } => *duration_secs >= 60,
|
|
_ => false,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Current partition status.
|
|
#[derive(Clone, Debug)]
|
|
pub enum PartitionStatus {
|
|
/// Node is well connected to the network.
|
|
Connected,
|
|
/// Node has some connectivity issues but is not partitioned.
|
|
Degraded {
|
|
/// Reasons for degraded status.
|
|
reasons: Vec<PartitionReason>,
|
|
},
|
|
/// Node appears to be partitioned from the main network.
|
|
Partitioned {
|
|
/// Reasons for partition detection.
|
|
reasons: Vec<PartitionReason>,
|
|
},
|
|
}
|
|
|
|
impl PartitionStatus {
|
|
/// Returns true if the node is fully connected.
|
|
pub fn is_connected(&self) -> bool {
|
|
matches!(self, PartitionStatus::Connected)
|
|
}
|
|
|
|
/// Returns true if the node is partitioned.
|
|
pub fn is_partitioned(&self) -> bool {
|
|
matches!(self, PartitionStatus::Partitioned { .. })
|
|
}
|
|
|
|
/// Returns true if the node is degraded or partitioned.
|
|
pub fn is_degraded(&self) -> bool {
|
|
!matches!(self, PartitionStatus::Connected)
|
|
}
|
|
|
|
/// Returns all reasons for current status.
|
|
pub fn reasons(&self) -> &[PartitionReason] {
|
|
match self {
|
|
PartitionStatus::Connected => &[],
|
|
PartitionStatus::Degraded { reasons } => reasons,
|
|
PartitionStatus::Partitioned { reasons } => reasons,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Information about a connected peer for partition detection.
|
|
#[derive(Clone, Debug)]
|
|
struct PeerPartitionInfo {
|
|
/// Peer ID.
|
|
peer_id: PeerId,
|
|
/// IP address if known.
|
|
ip: Option<IpAddr>,
|
|
/// Subnet identifier (/16 for IPv4).
|
|
subnet: Option<u32>,
|
|
/// Whether this is an outbound connection.
|
|
is_outbound: bool,
|
|
/// Protocol version.
|
|
protocol_version: Option<u32>,
|
|
/// Peer's reported tips.
|
|
tips: HashSet<Hash256>,
|
|
/// Peer's reported blue score.
|
|
blue_score: Option<u64>,
|
|
/// When the peer connected.
|
|
connected_at: Instant,
|
|
/// When we last heard from this peer.
|
|
last_seen: Instant,
|
|
}
|
|
|
|
impl PeerPartitionInfo {
|
|
fn new(peer_id: PeerId, ip: Option<IpAddr>, is_outbound: bool) -> Self {
|
|
let subnet = ip.and_then(|addr| Self::extract_subnet(&addr));
|
|
let now = Instant::now();
|
|
PeerPartitionInfo {
|
|
peer_id,
|
|
ip,
|
|
subnet,
|
|
is_outbound,
|
|
protocol_version: None,
|
|
tips: HashSet::new(),
|
|
blue_score: None,
|
|
connected_at: now,
|
|
last_seen: now,
|
|
}
|
|
}
|
|
|
|
/// Derives the subnet grouping key for an address.
///
/// * IPv4: the first two octets (the /16 prefix), packed as `a << 8 | b`.
/// * IPv4-mapped IPv6 (`::ffff:a.b.c.d`): treated as the embedded IPv4
///   address, so the same host yields the same key over either socket type
///   instead of every mapped address collapsing into key `0`.
/// * Other IPv6: the first 32 bits of the address.
///
/// Always returns `Some` for the address kinds that exist today.
fn extract_subnet(ip: &IpAddr) -> Option<u32> {
    // Packs the /16 prefix of an IPv4 address into the low 16 bits.
    fn ipv4_key(octets: [u8; 4]) -> u32 {
        (u32::from(octets[0]) << 8) | u32::from(octets[1])
    }

    match ip {
        IpAddr::V4(addr) => Some(ipv4_key(addr.octets())),
        IpAddr::V6(addr) => match addr.to_ipv4_mapped() {
            Some(mapped) => Some(ipv4_key(mapped.octets())),
            None => {
                // Use the first 32 bits for IPv6 subnet grouping.
                let segments = addr.segments();
                Some((u32::from(segments[0]) << 16) | u32::from(segments[1]))
            }
        },
    }
}
|
|
}
|
|
|
|
/// Block arrival record for rate calculation.
|
|
#[derive(Clone, Debug)]
|
|
struct BlockArrival {
|
|
/// Block hash.
|
|
hash: Hash256,
|
|
/// When the block was received.
|
|
received_at: Instant,
|
|
}
|
|
|
|
/// Snapshot of the detector's metrics for monitoring purposes.
#[derive(Debug, Clone, Default)]
pub struct PartitionStats {
    /// Total number of connected peers.
    pub peer_count: usize,

    /// How many of those peers are outbound.
    pub outbound_count: usize,

    /// How many of those peers are inbound.
    pub inbound_count: usize,

    /// Number of distinct peer subnets.
    pub unique_subnets: usize,

    /// Time elapsed since the last block, if any block was recorded.
    pub time_since_last_block: Option<Duration>,

    /// Current block rate in blocks per second.
    pub block_rate: f32,

    /// Fraction of tip-reporting peers that share a tip with us.
    pub tip_agreement: f32,

    /// Our own blue score.
    pub local_blue_score: u64,

    /// Highest blue score reported by any peer.
    pub network_blue_score: u64,

    /// Name of the current partition status.
    pub status: String,

    /// Number of non-critical reasons behind the status.
    pub warning_count: usize,

    /// Number of critical reasons behind the status.
    pub critical_count: usize,

    /// When the status was last evaluated.
    pub last_check: Option<Instant>,
}
|
|
|
|
/// Network partition detector.
|
|
pub struct PartitionDetector {
|
|
/// Configuration.
|
|
config: PartitionConfig,
|
|
/// Connected peers.
|
|
peers: RwLock<HashMap<PeerId, PeerPartitionInfo>>,
|
|
/// Our local tips.
|
|
local_tips: RwLock<HashSet<Hash256>>,
|
|
/// Our local blue score.
|
|
local_blue_score: RwLock<u64>,
|
|
/// Our protocol version.
|
|
our_protocol_version: u32,
|
|
/// Recent block arrivals for rate calculation.
|
|
block_arrivals: RwLock<VecDeque<BlockArrival>>,
|
|
/// Last block received time.
|
|
last_block_time: RwLock<Option<Instant>>,
|
|
/// Historical peer count for sudden loss detection.
|
|
peer_count_history: RwLock<VecDeque<(Instant, usize)>>,
|
|
/// Last status check time.
|
|
last_check: RwLock<Instant>,
|
|
/// Cached status.
|
|
cached_status: RwLock<Option<PartitionStatus>>,
|
|
/// Listeners for partition events.
|
|
alert_sent: RwLock<bool>,
|
|
}
|
|
|
|
impl PartitionDetector {
|
|
/// Creates a new partition detector.
|
|
pub fn new(config: PartitionConfig) -> Self {
|
|
PartitionDetector {
|
|
config,
|
|
peers: RwLock::new(HashMap::new()),
|
|
local_tips: RwLock::new(HashSet::new()),
|
|
local_blue_score: RwLock::new(0),
|
|
our_protocol_version: 1, // Default to version 1
|
|
block_arrivals: RwLock::new(VecDeque::with_capacity(100)),
|
|
last_block_time: RwLock::new(None),
|
|
peer_count_history: RwLock::new(VecDeque::with_capacity(60)),
|
|
last_check: RwLock::new(Instant::now()),
|
|
cached_status: RwLock::new(None),
|
|
alert_sent: RwLock::new(false),
|
|
}
|
|
}
|
|
|
|
/// Creates a new partition detector with custom protocol version.
|
|
pub fn with_protocol_version(config: PartitionConfig, protocol_version: u32) -> Self {
|
|
let mut detector = Self::new(config);
|
|
detector.our_protocol_version = protocol_version;
|
|
detector
|
|
}
|
|
|
|
/// Records a peer connection.
|
|
pub fn record_peer_connected(&self, peer_id: PeerId, ip: Option<IpAddr>, is_outbound: bool) {
|
|
let info = PeerPartitionInfo::new(peer_id, ip, is_outbound);
|
|
self.peers.write().insert(peer_id, info);
|
|
|
|
// Record peer count for sudden loss detection
|
|
let count = self.peers.read().len();
|
|
let mut history = self.peer_count_history.write();
|
|
history.push_back((Instant::now(), count));
|
|
if history.len() > 60 {
|
|
history.pop_front();
|
|
}
|
|
|
|
// Invalidate cached status
|
|
*self.cached_status.write() = None;
|
|
}
|
|
|
|
/// Records a peer disconnection.
|
|
pub fn record_peer_disconnected(&self, peer_id: &PeerId) {
|
|
self.peers.write().remove(peer_id);
|
|
|
|
// Record peer count
|
|
let count = self.peers.read().len();
|
|
let mut history = self.peer_count_history.write();
|
|
history.push_back((Instant::now(), count));
|
|
if history.len() > 60 {
|
|
history.pop_front();
|
|
}
|
|
|
|
// Invalidate cached status
|
|
*self.cached_status.write() = None;
|
|
}
|
|
|
|
/// Updates peer's protocol version.
|
|
pub fn update_peer_protocol_version(&self, peer_id: &PeerId, version: u32) {
|
|
if let Some(peer) = self.peers.write().get_mut(peer_id) {
|
|
peer.protocol_version = Some(version);
|
|
peer.last_seen = Instant::now();
|
|
}
|
|
}
|
|
|
|
/// Updates peer's reported tips.
|
|
pub fn update_peer_tips(&self, peer_id: &PeerId, tips: Vec<Hash256>) {
|
|
if let Some(peer) = self.peers.write().get_mut(peer_id) {
|
|
peer.tips = tips.into_iter().collect();
|
|
peer.last_seen = Instant::now();
|
|
}
|
|
}
|
|
|
|
/// Updates peer's blue score.
|
|
pub fn update_peer_blue_score(&self, peer_id: &PeerId, score: u64) {
|
|
if let Some(peer) = self.peers.write().get_mut(peer_id) {
|
|
peer.blue_score = Some(score);
|
|
peer.last_seen = Instant::now();
|
|
}
|
|
}
|
|
|
|
/// Records a new block received.
|
|
pub fn record_block_received(&self, hash: Hash256) {
|
|
let now = Instant::now();
|
|
|
|
// Update last block time
|
|
*self.last_block_time.write() = Some(now);
|
|
|
|
// Add to block arrivals for rate calculation
|
|
let mut arrivals = self.block_arrivals.write();
|
|
arrivals.push_back(BlockArrival {
|
|
hash,
|
|
received_at: now,
|
|
});
|
|
|
|
// Keep only recent blocks
|
|
while arrivals.len() > self.config.block_time_window {
|
|
arrivals.pop_front();
|
|
}
|
|
|
|
// Remove blocks older than 10 minutes
|
|
let cutoff = now - Duration::from_secs(600);
|
|
while let Some(front) = arrivals.front() {
|
|
if front.received_at < cutoff {
|
|
arrivals.pop_front();
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Invalidate cached status
|
|
*self.cached_status.write() = None;
|
|
}
|
|
|
|
/// Sets the local tips.
|
|
pub fn set_local_tips(&self, tips: Vec<Hash256>) {
|
|
*self.local_tips.write() = tips.into_iter().collect();
|
|
}
|
|
|
|
/// Sets the local blue score.
|
|
pub fn set_local_blue_score(&self, score: u64) {
|
|
*self.local_blue_score.write() = score;
|
|
}
|
|
|
|
/// Calculates the current block rate (blocks per second).
|
|
pub fn calculate_block_rate(&self) -> f32 {
|
|
let arrivals = self.block_arrivals.read();
|
|
if arrivals.len() < 2 {
|
|
return 0.0;
|
|
}
|
|
|
|
let first = arrivals.front().unwrap().received_at;
|
|
let last = arrivals.back().unwrap().received_at;
|
|
let duration = last.duration_since(first);
|
|
|
|
if duration.as_secs_f32() > 0.0 {
|
|
(arrivals.len() - 1) as f32 / duration.as_secs_f32()
|
|
} else {
|
|
0.0
|
|
}
|
|
}
|
|
|
|
/// Checks for sudden peer loss.
|
|
fn check_sudden_peer_loss(&self) -> Option<PartitionReason> {
|
|
let history = self.peer_count_history.read();
|
|
if history.len() < 2 {
|
|
return None;
|
|
}
|
|
|
|
// Look for significant drop in last 30 seconds
|
|
let now = Instant::now();
|
|
let cutoff = now - Duration::from_secs(30);
|
|
|
|
let mut max_count = 0;
|
|
let mut current_count = 0;
|
|
|
|
for (time, count) in history.iter() {
|
|
if *time < cutoff {
|
|
if *count > max_count {
|
|
max_count = *count;
|
|
}
|
|
} else {
|
|
current_count = *count;
|
|
}
|
|
}
|
|
|
|
if max_count == 0 {
|
|
return None;
|
|
}
|
|
|
|
// Alert if we lost more than 50% of peers
|
|
if current_count < max_count / 2 && max_count >= 4 {
|
|
let lost = max_count - current_count;
|
|
return Some(PartitionReason::SuddenPeerLoss {
|
|
lost,
|
|
remaining: current_count,
|
|
});
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
/// Evaluates all partition signals and returns current status.
|
|
pub fn evaluate(&self) -> PartitionStatus {
|
|
let mut warning_reasons = Vec::new();
|
|
let mut critical_reasons = Vec::new();
|
|
|
|
let peers = self.peers.read();
|
|
let peer_count = peers.len();
|
|
|
|
// === Peer Count Checks ===
|
|
|
|
// Check minimum peers
|
|
if peer_count < self.config.min_peers {
|
|
let reason = PartitionReason::InsufficientPeers {
|
|
current: peer_count,
|
|
required: self.config.min_peers,
|
|
};
|
|
if peer_count == 0 {
|
|
critical_reasons.push(reason);
|
|
} else {
|
|
warning_reasons.push(reason);
|
|
}
|
|
}
|
|
|
|
// Check outbound connections
|
|
let outbound_count = peers.values().filter(|p| p.is_outbound).count();
|
|
if outbound_count == 0 && peer_count > 0 {
|
|
critical_reasons.push(PartitionReason::NoOutboundConnections);
|
|
} else if outbound_count < self.config.min_outbound_peers {
|
|
warning_reasons.push(PartitionReason::InsufficientOutbound {
|
|
current: outbound_count,
|
|
required: self.config.min_outbound_peers,
|
|
});
|
|
}
|
|
|
|
// === Subnet Diversity Checks ===
|
|
|
|
let mut subnet_counts: HashMap<u32, usize> = HashMap::new();
|
|
for peer in peers.values() {
|
|
if let Some(subnet) = peer.subnet {
|
|
*subnet_counts.entry(subnet).or_insert(0) += 1;
|
|
}
|
|
}
|
|
|
|
let unique_subnets = subnet_counts.len();
|
|
if unique_subnets < self.config.min_subnets && peer_count >= self.config.min_peers {
|
|
warning_reasons.push(PartitionReason::LowSubnetDiversity {
|
|
current: unique_subnets,
|
|
required: self.config.min_subnets,
|
|
});
|
|
}
|
|
|
|
// Check for subnet concentration
|
|
if peer_count > 0 {
|
|
for (&subnet, &count) in &subnet_counts {
|
|
let percentage = count as f32 / peer_count as f32;
|
|
if percentage > self.config.max_subnet_concentration {
|
|
warning_reasons.push(PartitionReason::SubnetConcentration {
|
|
subnet,
|
|
percentage: (percentage * 100.0) as u8,
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
// === Block Production Checks ===
|
|
|
|
let last_block_time = *self.last_block_time.read();
|
|
if let Some(last_time) = last_block_time {
|
|
let elapsed = last_time.elapsed();
|
|
if elapsed > self.config.block_stall_critical {
|
|
critical_reasons.push(PartitionReason::BlockProductionStalled {
|
|
duration_secs: elapsed.as_secs(),
|
|
});
|
|
} else if elapsed > self.config.block_stall_warning {
|
|
warning_reasons.push(PartitionReason::BlockProductionStalled {
|
|
duration_secs: elapsed.as_secs(),
|
|
});
|
|
}
|
|
}
|
|
|
|
// Check block rate (only if we have enough samples to calculate)
|
|
let arrivals_count = self.block_arrivals.read().len();
|
|
if arrivals_count >= 2 {
|
|
let block_rate = self.calculate_block_rate();
|
|
let expected_rate = 1.0 / self.config.expected_block_interval.as_secs_f32();
|
|
if block_rate < expected_rate * self.config.block_rate_threshold {
|
|
warning_reasons.push(PartitionReason::LowBlockRate {
|
|
current_rate_millibps: (block_rate * 1000.0) as u32,
|
|
expected_rate_millibps: (expected_rate * 1000.0) as u32,
|
|
});
|
|
}
|
|
}
|
|
|
|
// === Tip Agreement Checks ===
|
|
|
|
let local_tips = self.local_tips.read();
|
|
if !local_tips.is_empty() && peer_count > 0 {
|
|
let mut matching_peers = 0;
|
|
for peer in peers.values() {
|
|
if !peer.tips.is_empty() {
|
|
// Check if any of our tips match any of peer's tips
|
|
if local_tips.iter().any(|t| peer.tips.contains(t)) {
|
|
matching_peers += 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
let peers_with_tips = peers.values().filter(|p| !p.tips.is_empty()).count();
|
|
if peers_with_tips > 0 {
|
|
let agreement = matching_peers as f32 / peers_with_tips as f32;
|
|
if agreement < self.config.min_tip_agreement {
|
|
warning_reasons.push(PartitionReason::TipDivergence {
|
|
matching_peers_pct: (agreement * 100.0) as u8,
|
|
threshold_pct: (self.config.min_tip_agreement * 100.0) as u8,
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
// === Blue Score Checks ===
|
|
|
|
let local_score = *self.local_blue_score.read();
|
|
let network_score = peers
|
|
.values()
|
|
.filter_map(|p| p.blue_score)
|
|
.max()
|
|
.unwrap_or(local_score);
|
|
|
|
// If we're significantly behind the network
|
|
if network_score > local_score + 100 {
|
|
warning_reasons.push(PartitionReason::BehindNetwork {
|
|
our_score: local_score,
|
|
network_score,
|
|
});
|
|
}
|
|
|
|
// === Protocol Version Checks ===
|
|
|
|
let mut version_counts: HashMap<u32, usize> = HashMap::new();
|
|
for peer in peers.values() {
|
|
if let Some(version) = peer.protocol_version {
|
|
*version_counts.entry(version).or_insert(0) += 1;
|
|
}
|
|
}
|
|
|
|
if let Some((&majority_version, &count)) =
|
|
version_counts.iter().max_by_key(|(_, count)| *count)
|
|
{
|
|
let peers_with_version = version_counts.values().sum::<usize>();
|
|
let percentage = count as f32 / peers_with_version as f32;
|
|
|
|
if majority_version != self.our_protocol_version
|
|
&& percentage > self.config.min_protocol_agreement
|
|
{
|
|
warning_reasons.push(PartitionReason::ProtocolVersionSkew {
|
|
our_version: self.our_protocol_version,
|
|
majority_version,
|
|
});
|
|
}
|
|
}
|
|
|
|
// === Sudden Peer Loss Check ===
|
|
|
|
drop(peers); // Release lock before checking history
|
|
if let Some(reason) = self.check_sudden_peer_loss() {
|
|
critical_reasons.push(reason);
|
|
}
|
|
|
|
// === Determine Status ===
|
|
|
|
// Update last check time
|
|
*self.last_check.write() = Instant::now();
|
|
|
|
if !critical_reasons.is_empty() {
|
|
// Any critical reason means we're partitioned
|
|
let mut all_reasons = critical_reasons;
|
|
all_reasons.extend(warning_reasons);
|
|
PartitionStatus::Partitioned {
|
|
reasons: all_reasons,
|
|
}
|
|
} else if !warning_reasons.is_empty() {
|
|
PartitionStatus::Degraded {
|
|
reasons: warning_reasons,
|
|
}
|
|
} else {
|
|
// Reset alert flag when we're connected
|
|
*self.alert_sent.write() = false;
|
|
PartitionStatus::Connected
|
|
}
|
|
}
|
|
|
|
/// Returns the current status, using cache if recent.
|
|
pub fn status(&self) -> PartitionStatus {
|
|
let last_check = *self.last_check.read();
|
|
if last_check.elapsed() < self.config.check_interval {
|
|
if let Some(status) = self.cached_status.read().clone() {
|
|
return status;
|
|
}
|
|
}
|
|
|
|
let status = self.evaluate();
|
|
*self.cached_status.write() = Some(status.clone());
|
|
status
|
|
}
|
|
|
|
/// Forces a fresh status evaluation, bypassing cache.
|
|
pub fn force_check(&self) -> PartitionStatus {
|
|
let status = self.evaluate();
|
|
*self.cached_status.write() = Some(status.clone());
|
|
status
|
|
}
|
|
|
|
/// Returns whether an alert should be sent (called once per partition event).
|
|
pub fn should_alert(&self) -> bool {
|
|
let status = self.status();
|
|
if status.is_partitioned() {
|
|
let mut alert_sent = self.alert_sent.write();
|
|
if !*alert_sent {
|
|
*alert_sent = true;
|
|
return true;
|
|
}
|
|
}
|
|
false
|
|
}
|
|
|
|
/// Clears the alert flag (call when recovering from partition).
|
|
pub fn clear_alert(&self) {
|
|
*self.alert_sent.write() = false;
|
|
}
|
|
|
|
/// Returns current statistics.
|
|
pub fn stats(&self) -> PartitionStats {
|
|
let peers = self.peers.read();
|
|
let status = self.status();
|
|
|
|
let (status_str, warning_count, critical_count) = match &status {
|
|
PartitionStatus::Connected => ("Connected".to_string(), 0, 0),
|
|
PartitionStatus::Degraded { reasons } => ("Degraded".to_string(), reasons.len(), 0),
|
|
PartitionStatus::Partitioned { reasons } => {
|
|
let critical = reasons.iter().filter(|r| r.is_critical()).count();
|
|
(
|
|
"Partitioned".to_string(),
|
|
reasons.len() - critical,
|
|
critical,
|
|
)
|
|
}
|
|
};
|
|
|
|
let mut subnet_set = HashSet::new();
|
|
for peer in peers.values() {
|
|
if let Some(subnet) = peer.subnet {
|
|
subnet_set.insert(subnet);
|
|
}
|
|
}
|
|
|
|
let network_score = peers
|
|
.values()
|
|
.filter_map(|p| p.blue_score)
|
|
.max()
|
|
.unwrap_or(0);
|
|
|
|
PartitionStats {
|
|
peer_count: peers.len(),
|
|
outbound_count: peers.values().filter(|p| p.is_outbound).count(),
|
|
inbound_count: peers.values().filter(|p| !p.is_outbound).count(),
|
|
unique_subnets: subnet_set.len(),
|
|
time_since_last_block: self.last_block_time.read().map(|t| t.elapsed()),
|
|
block_rate: self.calculate_block_rate(),
|
|
tip_agreement: self.calculate_tip_agreement(),
|
|
local_blue_score: *self.local_blue_score.read(),
|
|
network_blue_score: network_score,
|
|
status: status_str,
|
|
warning_count,
|
|
critical_count,
|
|
last_check: Some(*self.last_check.read()),
|
|
}
|
|
}
|
|
|
|
/// Calculates tip agreement percentage.
|
|
fn calculate_tip_agreement(&self) -> f32 {
|
|
let peers = self.peers.read();
|
|
let local_tips = self.local_tips.read();
|
|
|
|
if local_tips.is_empty() || peers.is_empty() {
|
|
return 1.0; // Assume agreement if we have no data
|
|
}
|
|
|
|
let peers_with_tips: Vec<_> = peers.values().filter(|p| !p.tips.is_empty()).collect();
|
|
if peers_with_tips.is_empty() {
|
|
return 1.0;
|
|
}
|
|
|
|
let matching = peers_with_tips
|
|
.iter()
|
|
.filter(|p| local_tips.iter().any(|t| p.tips.contains(t)))
|
|
.count();
|
|
|
|
matching as f32 / peers_with_tips.len() as f32
|
|
}
|
|
|
|
/// Returns the number of connected peers.
|
|
pub fn peer_count(&self) -> usize {
|
|
self.peers.read().len()
|
|
}
|
|
|
|
/// Returns the number of outbound peers.
|
|
pub fn outbound_count(&self) -> usize {
|
|
self.peers.read().values().filter(|p| p.is_outbound).count()
|
|
}
|
|
|
|
/// Returns the number of unique subnets.
|
|
pub fn unique_subnet_count(&self) -> usize {
|
|
let peers = self.peers.read();
|
|
let mut subnets = HashSet::new();
|
|
for peer in peers.values() {
|
|
if let Some(subnet) = peer.subnet {
|
|
subnets.insert(subnet);
|
|
}
|
|
}
|
|
subnets.len()
|
|
}
|
|
|
|
/// Clears all state (for testing or reset).
|
|
pub fn clear(&self) {
|
|
self.peers.write().clear();
|
|
self.local_tips.write().clear();
|
|
*self.local_blue_score.write() = 0;
|
|
self.block_arrivals.write().clear();
|
|
*self.last_block_time.write() = None;
|
|
self.peer_count_history.write().clear();
|
|
*self.cached_status.write() = None;
|
|
*self.alert_sent.write() = false;
|
|
}
|
|
}
|
|
|
|
impl Default for PartitionDetector {
|
|
fn default() -> Self {
|
|
Self::new(PartitionConfig::default())
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
use std::net::Ipv4Addr;
|
|
|
|
fn random_peer_id() -> PeerId {
|
|
PeerId::random()
|
|
}
|
|
|
|
fn create_detector() -> PartitionDetector {
|
|
PartitionDetector::new(PartitionConfig {
|
|
min_peers: 3,
|
|
min_subnets: 2,
|
|
min_outbound_peers: 1,
|
|
block_stall_warning: Duration::from_secs(5),
|
|
block_stall_critical: Duration::from_secs(10),
|
|
..Default::default()
|
|
})
|
|
}
|
|
|
|
/// Five outbound peers on distinct /16 subnets plus a fresh block should
/// yield a connected status.
#[test]
fn test_connected_with_good_peers() {
    let detector = create_detector();

    // Add diverse outbound peers, one per /16 subnet.
    for second_octet in 0..5u8 {
        let peer_id = random_peer_id();
        let address = IpAddr::V4(Ipv4Addr::new(10, second_octet, 1, 1));
        detector.record_peer_connected(peer_id, Some(address), true);
        detector.update_peer_protocol_version(&peer_id, 1);
    }

    // Record a recent block.
    detector.record_block_received(Hash256::from_bytes([1u8; 32]));

    assert!(detector.force_check().is_connected());
}
|
|
|
|
/// Two peers (below the minimum of three) should degrade the status with an
/// `InsufficientPeers` reason.
#[test]
fn test_degraded_with_low_peers() {
    let detector = create_detector();

    // Add only 2 peers (below minimum of 3).
    for second_octet in 0..2u8 {
        let address = IpAddr::V4(Ipv4Addr::new(10, second_octet, 1, 1));
        detector.record_peer_connected(random_peer_id(), Some(address), true);
    }

    detector.record_block_received(Hash256::from_bytes([1u8; 32]));

    let status = detector.force_check();
    assert!(status.is_degraded());

    let PartitionStatus::Degraded { reasons } = status else {
        panic!("Expected degraded status");
    };
    let has_insufficient_peers = reasons
        .iter()
        .any(|r| matches!(r, PartitionReason::InsufficientPeers { .. }));
    assert!(has_insufficient_peers);
}
|
|
|
|
/// Inbound-only peers should be reported as partitioned with a
/// `NoOutboundConnections` reason.
#[test]
fn test_partitioned_no_outbound() {
    let detector = create_detector();

    // Add only inbound peers.
    for second_octet in 0..5u8 {
        let address = IpAddr::V4(Ipv4Addr::new(10, second_octet, 1, 1));
        detector.record_peer_connected(random_peer_id(), Some(address), false);
    }

    detector.record_block_received(Hash256::from_bytes([1u8; 32]));

    let status = detector.force_check();
    assert!(status.is_partitioned());

    let PartitionStatus::Partitioned { reasons } = status else {
        panic!("Expected partitioned status");
    };
    let has_no_outbound = reasons
        .iter()
        .any(|r| matches!(r, PartitionReason::NoOutboundConnections));
    assert!(has_no_outbound);
}
|
|
|
|
/// Peers that all share one subnet should trigger a subnet-related reason.
#[test]
fn test_subnet_concentration() {
    let detector = create_detector();

    // Add all peers from the same subnet.
    for host in 1..=5u8 {
        let address = IpAddr::V4(Ipv4Addr::new(192, 168, 1, host));
        detector.record_peer_connected(random_peer_id(), Some(address), true);
    }

    detector.record_block_received(Hash256::from_bytes([1u8; 32]));

    let status = detector.force_check();
    assert!(status.is_degraded());

    // `is_degraded` already excluded the connected state, so the reasons
    // come from either a degraded or a partitioned status.
    let subnet_flagged = status.reasons().iter().any(|r| {
        matches!(
            r,
            PartitionReason::SubnetConcentration { .. }
                | PartitionReason::LowSubnetDiversity { .. }
        )
    });
    assert!(subnet_flagged);
}
|
|
|
|
/// Records ten blocks spaced about 10 ms apart and checks that the computed
/// block rate is strictly positive.
#[test]
fn test_block_rate_calculation() {
    let detector = create_detector();
    let gap = Duration::from_millis(10);

    // Each block gets a distinct hash built from its index byte.
    for fill in 0..10 {
        detector.record_block_received(Hash256::from_bytes([fill; 32]));
        std::thread::sleep(gap);
    }

    // Ten blocks in roughly 0.1 s would be about 100 blocks/second, but only
    // positivity is asserted so the test does not depend on timing.
    let rate = detector.calculate_block_rate();
    assert!(rate > 0.0);
}
|
|
|
|
/// Follows `peer_count` and `outbound_count` through two connects (one
/// outbound, one inbound) and the disconnect of the outbound peer.
#[test]
fn test_peer_count_tracking() {
    let detector = create_detector();

    let outbound_peer = random_peer_id();
    let inbound_peer = random_peer_id();

    // First peer: outbound, no known address.
    detector.record_peer_connected(outbound_peer, None, true);
    assert_eq!(detector.peer_count(), 1);

    // Second peer: inbound, so only the total grows.
    detector.record_peer_connected(inbound_peer, None, false);
    assert_eq!(detector.peer_count(), 2);
    assert_eq!(detector.outbound_count(), 1);

    // Dropping the outbound peer leaves just the inbound one.
    detector.record_peer_disconnected(&outbound_peer);
    assert_eq!(detector.peer_count(), 1);
    assert_eq!(detector.outbound_count(), 0);
}
|
|
|
|
/// With three of five peers reporting the local tip and two reporting a
/// different one, the computed tip agreement must be about 0.6.
#[test]
fn test_tip_agreement() {
    let detector = create_detector();

    // Local view of the DAG tips.
    let our_tip = Hash256::from_bytes([1u8; 32]);
    detector.set_local_tips(vec![our_tip]);

    for octet in 0..5 {
        let id = random_peer_id();
        let addr = IpAddr::V4(Ipv4Addr::new(10, octet, 1, 1));
        detector.record_peer_connected(id, Some(addr), true);

        // The first three peers share our tip; the rest report another hash.
        let reported = if octet < 3 {
            vec![our_tip]
        } else {
            vec![Hash256::from_bytes([2u8; 32])]
        };
        detector.update_peer_tips(&id, reported);
    }

    // 3 out of 5 = 60%
    let agreement = detector.calculate_tip_agreement();
    assert!((agreement - 0.6).abs() < 0.01);
}
|
|
|
|
/// Checks that `description()` returns non-empty text for a sample of
/// `PartitionReason` variants.
#[test]
fn test_partition_reason_descriptions() {
    // The samples are only iterated once, so a fixed-size array is enough;
    // a heap-allocated `Vec` is unnecessary here (clippy::useless_vec).
    let reasons = [
        PartitionReason::InsufficientPeers {
            current: 1,
            required: 3,
        },
        PartitionReason::NoOutboundConnections,
        PartitionReason::BlockProductionStalled { duration_secs: 60 },
    ];

    // Arrays iterate by value in `for` loops, matching the original `Vec` loop.
    for reason in reasons {
        let desc = reason.description();
        assert!(!desc.is_empty());
    }
}
|
|
|
|
/// Populates three peers with blue scores, sets a local blue score, and
/// checks the aggregated numbers returned by `stats()`.
#[test]
fn test_stats() {
    let detector = create_detector();

    // Peers 0 and 2 are outbound, peer 1 is inbound; blue scores are 1000..=1002.
    for i in 0u8..3 {
        let id = random_peer_id();
        let addr = IpAddr::V4(Ipv4Addr::new(10, i, 1, 1));
        let outbound = i.is_multiple_of(2);
        detector.record_peer_connected(id, Some(addr), outbound);
        detector.update_peer_blue_score(&id, 1000 + u64::from(i));
    }

    detector.set_local_blue_score(500);
    detector.record_block_received(Hash256::from_bytes([1u8; 32]));

    let snapshot = detector.stats();
    assert_eq!(snapshot.peer_count, 3);
    assert_eq!(snapshot.outbound_count, 2);
    assert_eq!(snapshot.inbound_count, 1);
    assert_eq!(snapshot.local_blue_score, 500);
    // Expected to equal the highest peer blue score recorded above.
    assert_eq!(snapshot.network_blue_score, 1002);
}
|
|
|
|
/// Exercises `should_alert`: it must fire once after a check with no peers,
/// stay quiet on the next call, and fire again after peers were added and
/// then cleared.
#[test]
fn test_should_alert() {
    let detector = PartitionDetector::new(PartitionConfig {
        min_peers: 3,
        ..Default::default()
    });

    // Check with no peers registered (the partitioned scenario).
    let _ = detector.force_check();

    // The first query reports the alert; the second must not repeat it.
    assert!(detector.should_alert());
    assert!(!detector.should_alert());

    // Bring in five outbound peers and a block so the next check can recover.
    for octet in 0..5 {
        let id = random_peer_id();
        let addr = IpAddr::V4(Ipv4Addr::new(10, octet, 1, 1));
        detector.record_peer_connected(id, Some(addr), true);
    }
    detector.record_block_received(Hash256::from_bytes([1u8; 32]));

    let _ = detector.force_check();

    // Wipe the detector state and check again.
    detector.clear();
    let _ = detector.force_check();

    // Should alert again after recovery
    assert!(detector.should_alert());
}
|
|
|
|
/// After `clear()`, the peer count and the computed block rate must both be
/// zero again.
#[test]
fn test_clear() {
    let detector = create_detector();
    let sample_hash = Hash256::from_bytes([1u8; 32]);

    // Populate peers, tips, blue score and block history.
    detector.record_peer_connected(random_peer_id(), None, true);
    detector.set_local_tips(vec![sample_hash]);
    detector.set_local_blue_score(1000);
    detector.record_block_received(Hash256::from_bytes([1u8; 32]));

    assert_eq!(detector.peer_count(), 1);

    // Reset everything.
    detector.clear();

    assert_eq!(detector.peer_count(), 0);
    assert_eq!(detector.calculate_block_rate(), 0.0);
}
|
|
}
|