From 4fb2ae658a8402e63b2163c1a9591bf656a5574f Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 21 May 2025 23:34:28 -0500 Subject: [PATCH 01/23] Implement reliable range sync for PeerDAS --- .../src/block_verification_types.rs | 8 +- beacon_node/beacon_chain/src/test_utils.rs | 3 +- .../src/service/api_types.rs | 22 +- .../lighthouse_network/src/types/globals.rs | 19 + .../network/src/sync/backfill_sync/mod.rs | 34 +- .../network/src/sync/block_lookups/mod.rs | 6 +- beacon_node/network/src/sync/manager.rs | 107 ++- beacon_node/network/src/sync/mod.rs | 1 - .../network/src/sync/network_context.rs | 587 ++++++------ .../block_components_by_range.rs | 550 +++++++++++ .../sync/network_context/custody_by_range.rs | 481 ++++++++++ .../{custody.rs => custody_by_root.rs} | 233 +++-- .../src/sync/network_context/requests.rs | 8 +- beacon_node/network/src/sync/peer_sampling.rs | 12 +- .../network/src/sync/range_sync/batch.rs | 17 +- .../network/src/sync/range_sync/chain.rs | 123 ++- .../src/sync/range_sync/chain_collection.rs | 7 + .../network/src/sync/range_sync/mod.rs | 5 +- .../network/src/sync/range_sync/range.rs | 20 +- beacon_node/network/src/sync/tests/lookups.rs | 122 ++- beacon_node/network/src/sync/tests/mod.rs | 9 +- beacon_node/network/src/sync/tests/range.rs | 901 +++++++++++++++--- consensus/types/src/signed_beacon_block.rs | 4 + 23 files changed, 2579 insertions(+), 700 deletions(-) create mode 100644 beacon_node/network/src/sync/network_context/block_components_by_range.rs create mode 100644 beacon_node/network/src/sync/network_context/custody_by_range.rs rename beacon_node/network/src/sync/network_context/{custody.rs => custody_by_root.rs} (70%) diff --git a/beacon_node/beacon_chain/src/block_verification_types.rs b/beacon_node/beacon_chain/src/block_verification_types.rs index 7abaf09e5e0..84011e23ff9 100644 --- a/beacon_node/beacon_chain/src/block_verification_types.rs +++ b/beacon_node/beacon_chain/src/block_verification_types.rs @@ -199,7 +199,7 @@ impl RpcBlock { custody_columns: Vec>, expected_custody_indices: Vec, spec: &ChainSpec, - ) -> Result { + ) -> Result { let block_root = block_root.unwrap_or_else(|| get_block_root(&block)); let custody_columns_count = expected_custody_indices.len(); @@ -209,11 +209,7 @@ impl RpcBlock { custody_columns, spec.number_of_columns as usize, ) - .map_err(|e| { - AvailabilityCheckError::Unexpected(format!( - "custody_columns len exceeds number_of_columns: {e:?}" - )) - })?, + .map_err(|e| format!("custody_columns len exceeds number_of_columns: {e:?}"))?, expected_custody_indices, }; Ok(Self { diff --git a/beacon_node/beacon_chain/src/test_utils.rs b/beacon_node/beacon_chain/src/test_utils.rs index 858aaafcf07..8f5a119fb5d 100644 --- a/beacon_node/beacon_chain/src/test_utils.rs +++ b/beacon_node/beacon_chain/src/test_utils.rs @@ -2418,7 +2418,8 @@ where columns, expected_custody_indices, &self.spec, - )? + ) + .map_err(BlockError::InternalError)? } else { RpcBlock::new_without_blobs(Some(block_root), block, sampling_column_count) } diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index b36f8cc2154..8300ad4bb89 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -59,6 +59,14 @@ pub struct BlobsByRangeRequestId { pub struct DataColumnsByRangeRequestId { /// Id to identify this attempt at a data_columns_by_range request for `parent_request_id` pub id: Id, + /// The Id of the parent custody by range request that issued this data_columns_by_range request + pub parent_request_id: CustodyByRangeRequestId, +} + +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub struct CustodyByRangeRequestId { + /// Id to identify this attempt at a meta custody by range request for `parent_request_id` + pub id: Id, /// The Id of the overall By Range request for block components. pub parent_request_id: ComponentsByRangeRequestId, } @@ -221,6 +229,7 @@ macro_rules! impl_display { impl_display!(BlocksByRangeRequestId, "{}/{}", id, parent_request_id); impl_display!(BlobsByRangeRequestId, "{}/{}", id, parent_request_id); impl_display!(DataColumnsByRangeRequestId, "{}/{}", id, parent_request_id); +impl_display!(CustodyByRangeRequestId, "{}/{}", id, parent_request_id); impl_display!(ComponentsByRangeRequestId, "{}/{}", id, requester); impl_display!(DataColumnsByRootRequestId, "{}/{}", id, requester); impl_display!(SingleLookupReqId, "{}/Lookup/{}", req_id, lookup_id); @@ -299,14 +308,17 @@ mod tests { fn display_id_data_columns_by_range() { let id = DataColumnsByRangeRequestId { id: 123, - parent_request_id: ComponentsByRangeRequestId { + parent_request_id: CustodyByRangeRequestId { id: 122, - requester: RangeRequestId::RangeSync { - chain_id: 54, - batch_id: Epoch::new(0), + parent_request_id: ComponentsByRangeRequestId { + id: 121, + requester: RangeRequestId::RangeSync { + chain_id: 54, + batch_id: Epoch::new(0), + }, }, }, }; - assert_eq!(format!("{id}"), "123/122/RangeSync/0/54"); + assert_eq!(format!("{id}"), "123/122/121/RangeSync/0/54"); } } diff --git a/beacon_node/lighthouse_network/src/types/globals.rs b/beacon_node/lighthouse_network/src/types/globals.rs index fd99d935890..7fa751a9057 100644 --- a/beacon_node/lighthouse_network/src/types/globals.rs +++ b/beacon_node/lighthouse_network/src/types/globals.rs @@ -245,6 +245,25 @@ impl NetworkGlobals { Self::new_test_globals_with_metadata(trusted_peers, metadata, config, spec) } + pub fn new_test_globals_as_supernode( + trusted_peers: Vec, + config: Arc, + spec: Arc, + is_supernode: bool, + ) -> NetworkGlobals { + let metadata = MetaData::V3(MetaDataV3 { + seq_number: 0, + attnets: Default::default(), + syncnets: Default::default(), + custody_group_count: if is_supernode { + spec.number_of_custody_groups + } else { + spec.custody_requirement + }, + }); + Self::new_test_globals_with_metadata(trusted_peers, metadata, config, spec) + } + pub(crate) fn new_test_globals_with_metadata( trusted_peers: Vec, metadata: MetaData, diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 7b5701cc8d2..45b9c61641b 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -21,11 +21,11 @@ use beacon_chain::{BeaconChain, BeaconChainTypes}; use itertools::Itertools; use lighthouse_network::service::api_types::Id; use lighthouse_network::types::{BackFillState, NetworkGlobals}; -use lighthouse_network::{PeerAction, PeerId}; +use lighthouse_network::PeerAction; use logging::crit; use std::collections::{ btree_map::{BTreeMap, Entry}, - HashSet, + HashMap, HashSet, }; use std::sync::Arc; use tracing::{debug, error, info, instrument, warn}; @@ -312,7 +312,6 @@ impl BackFillSync { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer_id: &PeerId, request_id: Id, err: RpcResponseError, ) -> Result<(), BackFillError> { @@ -326,11 +325,18 @@ impl BackFillSync { return Ok(()); } debug!(batch_epoch = %batch_id, error = ?err, "Batch download failed"); - match batch.download_failed(Some(*peer_id)) { + // TODO(das): Is it necessary for the batch to track failed peers? Can we make this + // mechanism compatible with PeerDAS and before PeerDAS? + match batch.download_failed(None) { Err(e) => self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)), - Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { - self.fail_sync(BackFillError::BatchDownloadFailed(batch_id)) - } + Ok(BatchOperationOutcome::Failed { blacklist: _ }) => self.fail_sync(match err { + RpcResponseError::RpcError(_) + | RpcResponseError::VerifyError(_) + | RpcResponseError::InternalError(_) => { + BackFillError::BatchDownloadFailed(batch_id) + } + RpcResponseError::RequestExpired(_) => BackFillError::Paused, + }), Ok(BatchOperationOutcome::Continue) => self.send_batch(network, batch_id), } } else { @@ -929,6 +935,8 @@ impl BackFillSync { RangeRequestId::BackfillSync { batch_id }, &synced_peers, &failed_peers, + // Does not track total requests per peers for now + &HashMap::new(), ) { Ok(request_id) => { // inform the batch about the new request @@ -940,15 +948,9 @@ impl BackFillSync { return Ok(()); } Err(e) => match e { - RpcRequestSendError::NoPeer(no_peer) => { - // If we are here the chain has no more synced peers - info!( - "reason" = format!("insufficient_synced_peers({no_peer:?})"), - "Backfill sync paused" - ); - self.set_state(BackFillState::Paused); - return Err(BackFillError::Paused); - } + // TODO(das): block_components_by_range requests can now hang out indefinitely. + // Is that fine? Maybe we should fail the requests from the network_context + // level without involving the BackfillSync itself. RpcRequestSendError::InternalError(e) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway warn!(%batch_id, error = ?e, %batch,"Could not send batch request"); diff --git a/beacon_node/network/src/sync/block_lookups/mod.rs b/beacon_node/network/src/sync/block_lookups/mod.rs index 8c884f644e1..2c59f710d04 100644 --- a/beacon_node/network/src/sync/block_lookups/mod.rs +++ b/beacon_node/network/src/sync/block_lookups/mod.rs @@ -494,7 +494,7 @@ impl BlockLookups { let Some(lookup) = self.single_block_lookups.get_mut(&id.lookup_id) else { // We don't have the ability to cancel in-flight RPC requests. So this can happen // if we started this RPC request, and later saw the block/blobs via gossip. - debug!(?id, "Block returned for single block lookup not present"); + debug!(%id, "Block returned for single block lookup not present"); return Err(LookupRequestError::UnknownLookup); }; @@ -507,7 +507,7 @@ impl BlockLookups { Ok((response, peer_group, seen_timestamp)) => { debug!( ?block_root, - ?id, + %id, ?peer_group, ?response_type, "Received lookup download success" @@ -540,7 +540,7 @@ impl BlockLookups { // the peer and the request ID which is linked to this `id` value here. debug!( ?block_root, - ?id, + %id, ?response_type, error = ?e, "Received lookup download failure" diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 3c94793941c..0cf17c7b899 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -36,7 +36,8 @@ use super::backfill_sync::{BackFillSync, ProcessResult, SyncStart}; use super::block_lookups::BlockLookups; use super::network_context::{ - CustodyByRootResult, RangeBlockComponent, RangeRequestId, RpcEvent, SyncNetworkContext, + CustodyByRangeResult, CustodyByRootResult, RangeBlockComponent, RangeRequestId, RpcEvent, + SyncNetworkContext, }; use super::peer_sampling::{Sampling, SamplingConfig, SamplingResult}; use super::peer_sync_info::{remote_sync_type, PeerSyncType}; @@ -58,9 +59,10 @@ use beacon_chain::{ use futures::StreamExt; use lighthouse_network::rpc::RPCError; use lighthouse_network::service::api_types::{ - BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, CustodyRequester, - DataColumnsByRangeRequestId, DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, - SamplingId, SamplingRequester, SingleLookupReqId, SyncRequestId, + BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, + CustodyByRangeRequestId, CustodyRequester, DataColumnsByRangeRequestId, + DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, SamplingId, SamplingRequester, + SingleLookupReqId, SyncRequestId, }; use lighthouse_network::types::{NetworkGlobals, SyncState}; use lighthouse_network::PeerId; @@ -336,23 +338,6 @@ impl SyncManager { .collect() } - #[cfg(test)] - pub(crate) fn get_range_sync_chains( - &self, - ) -> Result, &'static str> { - self.range_sync.state() - } - - #[cfg(test)] - pub(crate) fn range_sync_state(&self) -> super::range_sync::SyncChainStatus { - self.range_sync.state() - } - - #[cfg(test)] - pub(crate) fn __range_failed_chains(&mut self) -> Vec { - self.range_sync.__failed_chains() - } - #[cfg(test)] pub(crate) fn get_failed_chains(&mut self) -> Vec { self.block_lookups.get_failed_chains() @@ -377,6 +362,18 @@ impl SyncManager { self.sampling.get_request_status(block_root, index) } + // Leak the full network context to prevent having to add many cfg(test) methods here + #[cfg(test)] + pub(crate) fn network(&mut self) -> &mut SyncNetworkContext { + &mut self.network + } + + // Leak the full range_sync to prevent having to add many cfg(test) methods here + #[cfg(test)] + pub(crate) fn range_sync(&mut self) -> &mut RangeSync { + &mut self.range_sync + } + #[cfg(test)] pub(crate) fn update_execution_engine_state(&mut self, state: EngineState) { self.handle_new_execution_engine_state(state); @@ -442,6 +439,9 @@ impl SyncManager { for (id, result) in self.network.continue_custody_by_root_requests() { self.on_custody_by_root_result(id, result); } + for (id, result) in self.network.continue_custody_by_range_requests() { + self.on_custody_by_range_result(id, result); + } } /// Trigger range sync for a set of peers that claim to have imported a head unknown to us. @@ -545,6 +545,9 @@ impl SyncManager { for (id, result) in self.network.continue_custody_by_root_requests() { self.on_custody_by_root_result(id, result); } + for (id, result) in self.network.continue_custody_by_range_requests() { + self.on_custody_by_range_result(id, result); + } } /// Updates the syncing state of a peer. @@ -1186,10 +1189,9 @@ impl SyncManager { block: RpcEvent>>, ) { if let Some(resp) = self.network.on_blocks_by_range_response(id, peer_id, block) { - self.on_range_components_response( + self.on_block_components_by_range_response( id.parent_request_id, - peer_id, - RangeBlockComponent::Block(id, resp), + RangeBlockComponent::Block(id, resp, peer_id), ); } } @@ -1201,10 +1203,9 @@ impl SyncManager { blob: RpcEvent>>, ) { if let Some(resp) = self.network.on_blobs_by_range_response(id, peer_id, blob) { - self.on_range_components_response( + self.on_block_components_by_range_response( id.parent_request_id, - peer_id, - RangeBlockComponent::Blob(id, resp), + RangeBlockComponent::Blob(id, resp, peer_id), ); } } @@ -1215,18 +1216,46 @@ impl SyncManager { peer_id: PeerId, data_column: RpcEvent>>, ) { + // data_columns_by_range returns either an Ok list of data columns, or an RpcResponseError if let Some(resp) = self .network .on_data_columns_by_range_response(id, peer_id, data_column) { - self.on_range_components_response( - id.parent_request_id, - peer_id, - RangeBlockComponent::CustodyColumns(id, resp), - ); + // custody_by_range accumulates the results of multiple data_columns_by_range requests + // returning a bigger list of data columns across all the column indices this node has + // to custody + if let Some(result) = + self.network + .on_custody_by_range_response(id.parent_request_id, id, peer_id, resp) + { + self.on_custody_by_range_result(id.parent_request_id, result); + } } } + fn on_custody_by_range_result( + &mut self, + id: CustodyByRangeRequestId, + result: CustodyByRangeResult, + ) { + // TODO(das): Improve the type of RangeBlockComponent::CustodyColumns, not + // not have to pass a PeerGroup in case of error + let peers = match &result { + Ok((_, peers, _)) => peers.clone(), + // TODO(das): this PeerGroup with no peers incorrect + Err(_) => PeerGroup::from_set(<_>::default()), + }; + + self.on_block_components_by_range_response( + id.parent_request_id, + RangeBlockComponent::CustodyColumns( + id, + result.map(|(data, _peers, timestamp)| (data, timestamp)), + peers, + ), + ); + } + fn on_custody_by_root_result( &mut self, requester: CustodyRequester, @@ -1267,17 +1296,15 @@ impl SyncManager { /// Handles receiving a response for a range sync request that should have both blocks and /// blobs. - fn on_range_components_response( + fn on_block_components_by_range_response( &mut self, range_request_id: ComponentsByRangeRequestId, - peer_id: PeerId, range_block_component: RangeBlockComponent, ) { - if let Some(resp) = self.network.range_block_component_response( - range_request_id, - peer_id, - range_block_component, - ) { + if let Some(resp) = self + .network + .on_block_components_by_range_response(range_request_id, range_block_component) + { match resp { Ok((blocks, batch_peers)) => { match range_request_id.requester { @@ -1315,7 +1342,6 @@ impl SyncManager { RangeRequestId::RangeSync { chain_id, batch_id } => { self.range_sync.inject_error( &mut self.network, - peer_id, batch_id, chain_id, range_request_id.id, @@ -1327,7 +1353,6 @@ impl SyncManager { match self.backfill_sync.inject_error( &mut self.network, batch_id, - &peer_id, range_request_id.id, e, ) { diff --git a/beacon_node/network/src/sync/mod.rs b/beacon_node/network/src/sync/mod.rs index 0f5fd6fb9f1..97302df04e8 100644 --- a/beacon_node/network/src/sync/mod.rs +++ b/beacon_node/network/src/sync/mod.rs @@ -3,7 +3,6 @@ //! Stores the various syncing methods for the beacon chain. mod backfill_sync; mod block_lookups; -mod block_sidecar_coupling; pub mod manager; mod network_context; mod peer_sampling; diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 50b39fe72ef..d7ad9d3eb7a 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1,11 +1,11 @@ //! Provides network functionality for the Syncing thread. This fundamentally wraps a network //! channel and stores a global RPC ID to perform requests. -use self::custody::{ActiveCustodyRequest, Error as CustodyRequestError}; +use self::custody_by_range::{ActiveCustodyByRangeRequest, CustodyByRangeRequestResult}; +use self::custody_by_root::{ActiveCustodyByRootRequest, CustodyByRootRequestResult}; pub use self::requests::{BlocksByRootSingleRequest, DataColumnsByRootSingleBlockRequest}; -use super::block_sidecar_coupling::RangeBlockComponentsRequest; use super::manager::BlockProcessType; -use super::range_sync::{BatchPeers, ByRangeRequestType}; +use super::range_sync::BatchPeers; use super::SyncMessage; use crate::metrics; use crate::network_beacon_processor::NetworkBeaconProcessor; @@ -17,15 +17,17 @@ use crate::sync::block_lookups::SingleLookupId; use crate::sync::network_context::requests::BlobsByRootSingleBlockRequest; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessStatus, EngineState}; -use custody::CustodyRequestResult; +pub use block_components_by_range::BlockComponentsByRangeRequest; +#[cfg(test)] +pub use block_components_by_range::BlockComponentsByRangeRequestStep; use fnv::FnvHashMap; use lighthouse_network::rpc::methods::{BlobsByRangeRequest, DataColumnsByRangeRequest}; use lighthouse_network::rpc::{BlocksByRangeRequest, GoodbyeReason, RPCError, RequestType}; pub use lighthouse_network::service::api_types::RangeRequestId; use lighthouse_network::service::api_types::{ AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, - CustodyId, CustodyRequester, DataColumnsByRangeRequestId, DataColumnsByRootRequestId, - DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId, + CustodyByRangeRequestId, CustodyId, CustodyRequester, DataColumnsByRangeRequestId, + DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId, }; use lighthouse_network::{Client, NetworkGlobals, PeerAction, PeerId, ReportSource}; use parking_lot::RwLock; @@ -36,7 +38,6 @@ use requests::{ }; #[cfg(test)] use slot_clock::SlotClock; -use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::fmt::Debug; use std::sync::Arc; @@ -47,11 +48,13 @@ use tokio::sync::mpsc; use tracing::{debug, error, span, warn, Level}; use types::blob_sidecar::FixedBlobSidecarList; use types::{ - BlobSidecar, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, ForkContext, - Hash256, SignedBeaconBlock, Slot, + BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, + ForkContext, Hash256, SignedBeaconBlock, SignedBeaconBlockHeader, Slot, }; -pub mod custody; +pub mod block_components_by_range; +pub mod custody_by_range; +pub mod custody_by_root; mod requests; #[derive(Debug)] @@ -72,32 +75,29 @@ impl RpcEvent { pub type RpcResponseResult = Result<(T, Duration), RpcResponseError>; +pub type RpcResponseBatchResult = Result<(T, PeerGroup, Duration), RpcResponseError>; + /// Duration = latest seen timestamp of all received data columns -pub type CustodyByRootResult = - Result<(DataColumnSidecarList, PeerGroup, Duration), RpcResponseError>; +pub type CustodyByRootResult = RpcResponseBatchResult>; -#[derive(Debug)] +pub type CustodyByRangeResult = RpcResponseBatchResult>; + +#[derive(Debug, Clone)] pub enum RpcResponseError { RpcError(#[allow(dead_code)] RPCError), VerifyError(LookupVerifyError), - CustodyRequestError(#[allow(dead_code)] CustodyRequestError), - BlockComponentCouplingError(#[allow(dead_code)] String), + RequestExpired(String), + InternalError(#[allow(dead_code)] String), } #[derive(Debug, PartialEq, Eq)] pub enum RpcRequestSendError { - /// No peer available matching the required criteria - NoPeer(NoPeerError), /// These errors should never happen, including unreachable custody errors or network send /// errors. InternalError(String), -} - -/// Type of peer missing that caused a `RpcRequestSendError::NoPeers` -#[derive(Debug, PartialEq, Eq)] -pub enum NoPeerError { - BlockPeer, - CustodyPeer(ColumnIndex), + // If RpcRequestSendError has a single variant `InternalError` it's to signal to downstream + // consumers that sends are expected to be infallible. If this assumption changes in the future, + // add a new variant. } #[derive(Debug, PartialEq, Eq)] @@ -150,6 +150,17 @@ impl PeerGroup { } }) } + + pub fn as_reversed_map(&self) -> HashMap { + // TODO(das): should we change PeerGroup to hold this map? + let mut index_to_peer = HashMap::::new(); + for (peer, indices) in self.peers.iter() { + for &index in indices { + index_to_peer.insert(index as u64, *peer); + } + } + index_to_peer + } } /// Sequential ID that uniquely identifies ReqResp outgoing requests @@ -195,12 +206,15 @@ pub struct SyncNetworkContext { data_columns_by_range_requests: ActiveRequests>, - /// Mapping of active custody column requests for a block root - custody_by_root_requests: FnvHashMap>, + /// Mapping of active custody column by root requests for a block root + custody_by_root_requests: FnvHashMap>, + + /// Mapping of active custody column by range requests + custody_by_range_requests: FnvHashMap>, /// BlocksByRange requests paired with other ByRange requests for data components - components_by_range_requests: - FnvHashMap>, + block_components_by_range_requests: + FnvHashMap>, /// Whether the ee is online. If it's not, we don't allow access to the /// `beacon_processor_send`. @@ -219,14 +233,17 @@ pub enum RangeBlockComponent { Block( BlocksByRangeRequestId, RpcResponseResult>>>, + PeerId, ), Blob( BlobsByRangeRequestId, RpcResponseResult>>>, + PeerId, ), CustodyColumns( - DataColumnsByRangeRequestId, + CustodyByRangeRequestId, RpcResponseResult>>>, + PeerGroup, ), } @@ -283,7 +300,8 @@ impl SyncNetworkContext { blobs_by_range_requests: ActiveRequests::new("blobs_by_range"), data_columns_by_range_requests: ActiveRequests::new("data_columns_by_range"), custody_by_root_requests: <_>::default(), - components_by_range_requests: FnvHashMap::default(), + custody_by_range_requests: <_>::default(), + block_components_by_range_requests: <_>::default(), network_beacon_processor, chain, fork_context, @@ -297,6 +315,14 @@ impl SyncNetworkContext { /// Returns the ids of all the requests made to the given peer_id. pub fn peer_disconnected(&mut self, peer_id: &PeerId) -> Vec { + self.active_requests() + .filter(|(_, request_peer)| *request_peer == peer_id) + .map(|(id, _)| id) + .collect() + } + + /// Returns the ids of all active requests + pub fn active_requests(&mut self) -> impl Iterator { // Note: using destructuring pattern without a default case to make sure we don't forget to // add new request types to this function. Otherwise, lookup sync can break and lookups // will get stuck if a peer disconnects during an active requests. @@ -311,8 +337,9 @@ impl SyncNetworkContext { data_columns_by_range_requests, // custody_by_root_requests is a meta request of data_columns_by_root_requests custody_by_root_requests: _, + custody_by_range_requests: _, // components_by_range_requests is a meta request of various _by_range requests - components_by_range_requests: _, + block_components_by_range_requests: _, execution_engine_state: _, network_beacon_processor: _, chain: _, @@ -320,29 +347,23 @@ impl SyncNetworkContext { } = self; let blocks_by_root_ids = blocks_by_root_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|id| SyncRequestId::SingleBlock { id: *id }); + .active_requests() + .map(|(id, peer)| (SyncRequestId::SingleBlock { id: *id }, peer)); let blobs_by_root_ids = blobs_by_root_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|id| SyncRequestId::SingleBlob { id: *id }); + .active_requests() + .map(|(id, peer)| (SyncRequestId::SingleBlob { id: *id }, peer)); let data_column_by_root_ids = data_columns_by_root_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::DataColumnsByRoot(*req_id)); + .active_requests() + .map(|(id, peer)| (SyncRequestId::DataColumnsByRoot(*id), peer)); let blocks_by_range_ids = blocks_by_range_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::BlocksByRange(*req_id)); + .active_requests() + .map(|(id, peer)| (SyncRequestId::BlocksByRange(*id), peer)); let blobs_by_range_ids = blobs_by_range_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::BlobsByRange(*req_id)); + .active_requests() + .map(|(id, peer)| (SyncRequestId::BlobsByRange(*id), peer)); let data_column_by_range_ids = data_columns_by_range_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::DataColumnsByRange(*req_id)); + .active_requests() + .map(|(id, peer)| (SyncRequestId::DataColumnsByRange(*id), peer)); blocks_by_root_ids .chain(blobs_by_root_ids) @@ -350,6 +371,18 @@ impl SyncNetworkContext { .chain(blocks_by_range_ids) .chain(blobs_by_range_ids) .chain(data_column_by_range_ids) + } + + #[cfg(test)] + pub fn active_block_components_by_range_requests( + &self, + ) -> Vec<( + ComponentsByRangeRequestId, + BlockComponentsByRangeRequestStep, + )> { + self.block_components_by_range_requests + .iter() + .map(|(id, req)| (*id, req.state_step())) .collect() } @@ -362,6 +395,10 @@ impl SyncNetworkContext { &self.network_beacon_processor.network_globals } + pub fn spec(&self) -> &ChainSpec { + &self.chain.spec + } + /// Returns the Client type of the peer if known pub fn client_type(&self, peer_id: &PeerId) -> Client { self.network_globals() @@ -414,8 +451,9 @@ impl SyncNetworkContext { data_columns_by_range_requests, // custody_by_root_requests is a meta request of data_columns_by_root_requests custody_by_root_requests: _, + custody_by_range_requests: _, // components_by_range_requests is a meta request of various _by_range requests - components_by_range_requests: _, + block_components_by_range_requests: _, execution_engine_state: _, network_beacon_processor: _, chain: _, @@ -447,205 +485,95 @@ impl SyncNetworkContext { requester: RangeRequestId, peers: &HashSet, peers_to_deprioritize: &HashSet, + total_requests_per_peer: &HashMap, ) -> Result { - let batch_epoch = Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()); - let batch_type = self.batch_type(batch_epoch); - - let active_request_count_by_peer = self.active_request_count_by_peer(); - - let Some(block_peer) = peers - .iter() - .map(|peer| { - ( - // If contains -> 1 (order after), not contains -> 0 (order first) - peers_to_deprioritize.contains(peer), - // Prefer peers with less overall requests - active_request_count_by_peer.get(peer).copied().unwrap_or(0), - // Random factor to break ties, otherwise the PeerID breaks ties - rand::random::(), - peer, - ) - }) - .min() - .map(|(_, _, _, peer)| *peer) - else { - // Backfill and forward sync handle this condition gracefully. - // - Backfill sync: will pause waiting for more peers to join - // - Forward sync: can never happen as the chain is dropped when removing the last peer. - return Err(RpcRequestSendError::NoPeer(NoPeerError::BlockPeer)); - }; - - // Attempt to find all required custody peers before sending any request or creating an ID - let columns_by_range_peers_to_request = - if matches!(batch_type, ByRangeRequestType::BlocksAndColumns) { - let column_indexes = self.network_globals().sampling_columns.clone(); - Some(self.select_columns_by_range_peers_to_request( - &column_indexes, - peers, - active_request_count_by_peer, - peers_to_deprioritize, - )?) - } else { - None - }; - - // Create the overall components_by_range request ID before its individual components let id = ComponentsByRangeRequestId { id: self.next_id(), requester, }; - let blocks_req_id = self.send_blocks_by_range_request(block_peer, request.clone(), id)?; - - let blobs_req_id = if matches!(batch_type, ByRangeRequestType::BlocksAndBlobs) { - Some(self.send_blobs_by_range_request( - block_peer, - BlobsByRangeRequest { - start_slot: *request.start_slot(), - count: *request.count(), - }, - id, - )?) - } else { - None - }; - - let data_column_requests = columns_by_range_peers_to_request - .map(|columns_by_range_peers_to_request| { - let column_to_peer_map = columns_by_range_peers_to_request - .iter() - .flat_map(|(peer_id, columns)| columns.iter().map(|column| (*column, *peer_id))) - .collect::>(); - - let requests = columns_by_range_peers_to_request - .into_iter() - .map(|(peer_id, columns)| { - self.send_data_columns_by_range_request( - peer_id, - DataColumnsByRangeRequest { - start_slot: *request.start_slot(), - count: *request.count(), - columns, - }, - id, - ) - }) - .collect::, _>>()?; - - Ok((requests, column_to_peer_map)) - }) - .transpose()?; + let req = BlockComponentsByRangeRequest::new( + id, + request, + peers, + peers_to_deprioritize, + total_requests_per_peer, + self, + )?; - let info = - RangeBlockComponentsRequest::new(blocks_req_id, blobs_req_id, data_column_requests); - self.components_by_range_requests.insert(id, info); + self.block_components_by_range_requests.insert(id, req); + // TODO: use ID Ok(id.id) } - fn select_columns_by_range_peers_to_request( - &self, - custody_indexes: &HashSet, - peers: &HashSet, - active_request_count_by_peer: HashMap, - peers_to_deprioritize: &HashSet, - ) -> Result>, RpcRequestSendError> { - let mut columns_to_request_by_peer = HashMap::>::new(); - - for column_index in custody_indexes { - // Strictly consider peers that are custodials of this column AND are part of this - // syncing chain. If the forward range sync chain has few peers, it's likely that this - // function will not be able to find peers on our custody columns. - let Some(custody_peer) = peers - .iter() - .filter(|peer| { - self.network_globals() - .is_custody_peer_of(*column_index, peer) - }) - .map(|peer| { - ( - // If contains -> 1 (order after), not contains -> 0 (order first) - peers_to_deprioritize.contains(peer), - // Prefer peers with less overall requests - // Also account for requests that are not yet issued tracked in peer_id_to_request_map - // We batch requests to the same peer, so count existance in the - // `columns_to_request_by_peer` as a single 1 request. - active_request_count_by_peer.get(peer).copied().unwrap_or(0) - + columns_to_request_by_peer.get(peer).map(|_| 1).unwrap_or(0), - // Random factor to break ties, otherwise the PeerID breaks ties - rand::random::(), - peer, - ) - }) - .min() - .map(|(_, _, _, peer)| *peer) - else { - // TODO(das): this will be pretty bad UX. To improve we should: - // - Handle the no peers case gracefully, maybe add some timeout and give a few - // minutes / seconds to the peer manager to locate peers on this subnet before - // abandoing progress on the chain completely. - return Err(RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer( - *column_index, - ))); - }; - - columns_to_request_by_peer - .entry(custody_peer) - .or_default() - .push(*column_index); - } - - Ok(columns_to_request_by_peer) - } - - /// Received a _by_range response for a request that couples blocks and its data - /// - /// `peer_id` is the peer that served this individual RPC _by_range response. + /// Received a blocks by range or blobs by range response for a request that couples blocks ' + /// and blobs. #[allow(clippy::type_complexity)] - pub fn range_block_component_response( + pub fn on_block_components_by_range_response( &mut self, id: ComponentsByRangeRequestId, - peer_id: PeerId, range_block_component: RangeBlockComponent, ) -> Option>, BatchPeers), RpcResponseError>> { - let Entry::Occupied(mut entry) = self.components_by_range_requests.entry(id) else { - metrics::inc_counter_vec(&metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, &["range_blocks"]); + // Note: need to remove the request to borrow self again below. Otherwise we can't + // do nested requests + let Some(mut request) = self.block_components_by_range_requests.remove(&id) else { + metrics::inc_counter_vec( + &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, + &["block_components_by_range"], + ); return None; }; - if let Err(e) = { - let request = entry.get_mut(); - match range_block_component { - RangeBlockComponent::Block(req_id, resp) => resp.and_then(|(blocks, _)| { - request - .add_blocks(req_id, blocks, peer_id) - .map_err(RpcResponseError::BlockComponentCouplingError) - }), - RangeBlockComponent::Blob(req_id, resp) => resp.and_then(|(blobs, _)| { + let result = match range_block_component { + RangeBlockComponent::Block(req_id, resp, peer_id) => resp.and_then(|(blocks, _)| { + request + .on_blocks_by_range_result(req_id, blocks, peer_id, self) + .map_err(Into::::into) + }), + RangeBlockComponent::Blob(req_id, resp, peer_id) => resp.and_then(|(blobs, _)| { + request + .on_blobs_by_range_result(req_id, blobs, peer_id, self) + .map_err(Into::::into) + }), + RangeBlockComponent::CustodyColumns(req_id, resp, peers) => { + resp.and_then(|(custody_columns, _)| { request - .add_blobs(req_id, blobs, peer_id) - .map_err(RpcResponseError::BlockComponentCouplingError) - }), - RangeBlockComponent::CustodyColumns(req_id, resp) => { - resp.and_then(|(custody_columns, _)| { - request - .add_custody_columns(req_id, custody_columns, peer_id) - .map_err(RpcResponseError::BlockComponentCouplingError) - }) - } + .on_custody_by_range_result(req_id, custody_columns, peers, self) + .map_err(Into::::into) + }) } - } { - entry.remove(); - return Some(Err(e)); - } + }; + + let result = result.transpose(); - if let Some(blocks_result) = entry.get().responses(&self.chain.spec) { - entry.remove(); - // If the request is finished, dequeue everything - Some(blocks_result.map_err(RpcResponseError::BlockComponentCouplingError)) - } else { - None + // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to + // an Option first to use in an `if let Some() { act on result }` block. + match result.as_ref() { + Some(Ok((blocks, peer_group))) => { + let blocks_with_data = blocks + .iter() + .filter(|block| block.as_block().has_data()) + .count(); + // Don't log the peer_group here, it's very long (could be up to 128 peers). If you + // want to trace which peer sent the column at index X, search for the log: + // `Sync RPC request sent method="DataColumnsByRange" ...` + debug!( + %id, + blocks = blocks.len(), + blocks_with_data, + block_peer = ?peer_group.block(), + "Block components by range request success, removing" + ) + } + Some(Err(e)) => { + debug!(%id, error = ?e, "Block components by range request failure, removing" ) + } + None => { + self.block_components_by_range_requests.insert(id, request); + } } + result } /// Request block of `block_root` if necessary by checking: @@ -853,7 +781,7 @@ impl SyncNetworkContext { } /// Request to send a single `data_columns_by_root` request to the network. - pub fn data_column_lookup_request( + pub fn data_columns_by_root_request( &mut self, requester: DataColumnsByRootRequester, peer_id: PeerId, @@ -951,7 +879,7 @@ impl SyncNetworkContext { ); let requester = CustodyRequester(id); - let mut request = ActiveCustodyRequest::new( + let mut request = ActiveCustodyByRootRequest::new( block_root, CustodyId { requester }, &custody_indexes_to_fetch, @@ -967,25 +895,7 @@ impl SyncNetworkContext { self.custody_by_root_requests.insert(requester, request); Ok(LookupRequestResult::RequestSent(id.req_id)) } - Err(e) => Err(match e { - CustodyRequestError::NoPeer(column_index) => { - RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer(column_index)) - } - // - TooManyFailures: Should never happen, `request` has just been created, it's - // count of download_failures is 0 here - // - BadState: Should never happen, a bad state can only happen when handling a - // network response - // - UnexpectedRequestId: Never happens: this Err is only constructed handling a - // download or processing response - // - SendFailed: Should never happen unless in a bad drop sequence when shutting - // down the node - e @ (CustodyRequestError::TooManyFailures - | CustodyRequestError::BadState { .. } - | CustodyRequestError::UnexpectedRequestId { .. } - | CustodyRequestError::SendFailed { .. }) => { - RpcRequestSendError::InternalError(format!("{e:?}")) - } - }), + Err(e) => Err(e.into()), } } @@ -1073,8 +983,8 @@ impl SyncNetworkContext { &mut self, peer_id: PeerId, request: DataColumnsByRangeRequest, - parent_request_id: ComponentsByRangeRequestId, - ) -> Result { + parent_request_id: CustodyByRangeRequestId, + ) -> Result { let id = DataColumnsByRangeRequestId { id: self.next_id(), parent_request_id, @@ -1085,7 +995,7 @@ impl SyncNetworkContext { request: RequestType::DataColumnsByRange(request.clone()), app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), }) - .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; + .map_err(|_| "network send error")?; debug!( method = "DataColumnsByRange", @@ -1108,6 +1018,50 @@ impl SyncNetworkContext { Ok(id) } + /// Request to fetch all needed custody columns of a range of slot. This function may not send + /// any request to the network if no columns have to be fetched based on the import state of the + /// node. A custody request is a "super request" that may trigger 0 or more `data_columns_by_range` + /// requests. + pub fn send_custody_by_range_request( + &mut self, + parent_id: ComponentsByRangeRequestId, + blocks_with_data: Vec, + epoch: Epoch, + column_indices: Vec, + lookup_peers: Arc>>, + ) -> Result { + let id = CustodyByRangeRequestId { + id: self.next_id(), + parent_request_id: parent_id, + }; + + debug!( + indices = ?column_indices, + %id, + "Starting custody columns by range request" + ); + + let mut request = ActiveCustodyByRangeRequest::new( + id, + epoch, + blocks_with_data, + &column_indices, + lookup_peers, + ); + + // Note that you can only send, but not handle a response here + match request.continue_requests(self) { + Ok(_) => { + // Ignoring the result of `continue_requests` is okay. A request that has just been + // created cannot return data immediately, it must send some request to the network + // first. And there must exist some request, `custody_indexes_to_fetch` is not empty. + self.custody_by_range_requests.insert(id, request); + Ok(id) + } + Err(e) => Err(e.into()), + } + } + pub fn is_execution_engine_online(&self) -> bool { self.execution_engine_state == EngineState::Online } @@ -1212,34 +1166,6 @@ impl SyncNetworkContext { id } - /// Check whether a batch for this epoch (and only this epoch) should request just blocks or - /// blocks and blobs. - fn batch_type(&self, epoch: types::Epoch) -> ByRangeRequestType { - // Induces a compile time panic if this doesn't hold true. - #[allow(clippy::assertions_on_constants)] - const _: () = assert!( - super::backfill_sync::BACKFILL_EPOCHS_PER_BATCH == 1 - && super::range_sync::EPOCHS_PER_BATCH == 1, - "To deal with alignment with deneb boundaries, batches need to be of just one epoch" - ); - - if self - .chain - .data_availability_checker - .data_columns_required_for_epoch(epoch) - { - ByRangeRequestType::BlocksAndColumns - } else if self - .chain - .data_availability_checker - .blobs_required_for_epoch(epoch) - { - ByRangeRequestType::BlocksAndBlobs - } else { - ByRangeRequestType::Blocks - } - } - /// Attempt to make progress on all custody_by_root requests. Some request may be stale waiting /// for custody peers. Returns a Vec of results as zero or more requests may fail in this /// attempt. @@ -1266,6 +1192,32 @@ impl SyncNetworkContext { .collect() } + /// Attempt to make progress on all custody_by_range requests. Some request may be stale waiting + /// for custody peers. Returns a Vec of results as zero or more requests may fail in this + /// attempt. + pub fn continue_custody_by_range_requests( + &mut self, + ) -> Vec<(CustodyByRangeRequestId, CustodyByRangeResult)> { + let ids = self + .custody_by_range_requests + .keys() + .copied() + .collect::>(); + + // Need to collect ids and results in separate steps to re-borrow self. + ids.into_iter() + .filter_map(|id| { + let mut request = self + .custody_by_range_requests + .remove(&id) + .expect("key of hashmap"); + let result = request.continue_requests(self); + self.handle_custody_by_range_result(id, request, result) + .map(|result| (id, result)) + }) + .collect() + } + // Request handlers pub(crate) fn on_single_block_response( @@ -1425,8 +1377,10 @@ impl SyncNetworkContext { // Note: need to remove the request to borrow self again below. Otherwise we can't // do nested requests let Some(mut request) = self.custody_by_root_requests.remove(&id.requester) else { - // TOOD(das): This log can happen if the request is error'ed early and dropped - debug!(?id, "Custody column downloaded event for unknown request"); + metrics::inc_counter_vec( + &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, + &["custody_by_root"], + ); return None; }; @@ -1438,8 +1392,8 @@ impl SyncNetworkContext { fn handle_custody_by_root_result( &mut self, id: CustodyRequester, - request: ActiveCustodyRequest, - result: CustodyRequestResult, + request: ActiveCustodyByRootRequest, + result: CustodyByRootRequestResult, ) -> Option> { let span = span!( Level::INFO, @@ -1448,18 +1402,16 @@ impl SyncNetworkContext { ); let _enter = span.enter(); - let result = result - .map_err(RpcResponseError::CustodyRequestError) - .transpose(); + let result = result.map_err(Into::::into).transpose(); // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to // an Option first to use in an `if let Some() { act on result }` block. match result.as_ref() { Some(Ok((columns, peer_group, _))) => { - debug!(?id, count = columns.len(), peers = ?peer_group, "Custody request success, removing") + debug!(%id, count = columns.len(), peers = ?peer_group, "Custody by root request success, removing") } Some(Err(e)) => { - debug!(?id, error = ?e, "Custody request failure, removing" ) + debug!(%id, error = ?e, "Custody by root request failure, removing" ) } None => { self.custody_by_root_requests.insert(id, request); @@ -1468,6 +1420,61 @@ impl SyncNetworkContext { result } + /// Insert a downloaded column into an active custody request. Then make progress on the + /// entire request. + /// + /// ### Returns + /// + /// - `Some`: Request completed, won't make more progress. Expect requester to act on the result. + /// - `None`: Request still active, requester should do no action + #[allow(clippy::type_complexity)] + pub fn on_custody_by_range_response( + &mut self, + id: CustodyByRangeRequestId, + req_id: DataColumnsByRangeRequestId, + peer_id: PeerId, + resp: RpcResponseResult>>>, + ) -> Option> { + // Note: need to remove the request to borrow self again below. Otherwise we can't + // do nested requests + let Some(mut request) = self.custody_by_range_requests.remove(&id) else { + // TOOD(das): This log can happen if the request is error'ed early and dropped + debug!(%id, "Custody by range downloaded event for unknown request"); + return None; + }; + + let result = request.on_data_column_downloaded(peer_id, req_id, resp, self); + + self.handle_custody_by_range_result(id, request, result) + } + + fn handle_custody_by_range_result( + &mut self, + id: CustodyByRangeRequestId, + request: ActiveCustodyByRangeRequest, + result: CustodyByRangeRequestResult, + ) -> Option> { + let result = result.map_err(Into::::into).transpose(); + + // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to + // an Option first to use in an `if let Some() { act on result }` block. + match result.as_ref() { + Some(Ok((columns, _peer_group, _))) => { + // Don't log the peer_group here, it's very long (could be up to 128 peers). If you + // want to trace which peer sent the column at index X, search for the log: + // `Sync RPC request sent method="DataColumnsByRange" ...` + debug!(%id, count = columns.len(), "Custody by range request success, removing") + } + Some(Err(e)) => { + debug!(%id, error = ?e, "Custody by range request failure, removing" ) + } + None => { + self.custody_by_range_requests.insert(id, request); + } + } + result + } + pub fn send_block_for_processing( &self, id: Id, @@ -1529,7 +1536,7 @@ impl SyncNetworkContext { .beacon_processor_if_enabled() .ok_or(SendErrorProcessor::ProcessorNotAvailable)?; - debug!(?block_root, ?id, "Sending blobs for processing"); + debug!(?block_root, %id, "Sending blobs for processing"); // Lookup sync event safety: If `beacon_processor.send_rpc_blobs` returns Ok() sync // must receive a single `SyncMessage::BlockComponentProcessed` event with this process type beacon_processor @@ -1600,8 +1607,8 @@ impl SyncNetworkContext { ), ("custody_by_root", self.custody_by_root_requests.len()), ( - "components_by_range", - self.components_by_range_requests.len(), + "block_components_by_range", + self.block_components_by_range_requests.len(), ), ] { metrics::set_gauge_vec(&metrics::SYNC_ACTIVE_NETWORK_REQUESTS, &[id], count as i64); diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs new file mode 100644 index 00000000000..4545806a05e --- /dev/null +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -0,0 +1,550 @@ +use crate::sync::network_context::{ + PeerGroup, RpcRequestSendError, RpcResponseError, SyncNetworkContext, +}; +use crate::sync::range_sync::BatchPeers; +use beacon_chain::block_verification_types::RpcBlock; +use beacon_chain::data_column_verification::CustodyDataColumn; +use beacon_chain::{get_block_root, BeaconChainTypes}; +use lighthouse_network::rpc::methods::{BlobsByRangeRequest, BlocksByRangeRequest}; +use lighthouse_network::service::api_types::{ + BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, + CustodyByRangeRequestId, +}; +use lighthouse_network::PeerId; +use parking_lot::RwLock; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use types::{ + BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, EthSpec, Hash256, RuntimeVariableList, + SignedBeaconBlock, Slot, +}; + +pub struct BlockComponentsByRangeRequest { + id: ComponentsByRangeRequestId, + peers: Arc>>, + request: BlocksByRangeRequest, + state: State, +} + +enum State { + Base { + blocks_by_range_request: + ByRangeRequest>>>, + }, + // Two single concurrent requests for block + blobs + DenebEnabled { + blocks_by_range_request: + ByRangeRequest>>>, + blobs_by_range_request: ByRangeRequest>>>, + }, + // Request blocks first, then columns + FuluEnabled(FuluEnabledState), +} + +enum FuluEnabledState { + BlockRequest { + blocks_by_range_request: + ByRangeRequest>>>, + }, + CustodyRequest { + blocks: Vec>>, + block_peer: PeerId, + custody_by_range_request: + ByRangeRequest>>, PeerGroup>, + }, +} + +enum ByRangeRequest { + /// Active(RequestIndex) + Active(I), + /// Complete(DownloadedData, Peers) + Complete(T, P), +} + +pub type BlockComponentsByRangeRequestResult = + Result>, BatchPeers)>, Error>; + +pub enum Error { + InternalError(String), +} + +impl From for RpcResponseError { + fn from(e: Error) -> Self { + match e { + Error::InternalError(e) => RpcResponseError::InternalError(e), + } + } +} + +impl From for RpcRequestSendError { + fn from(e: Error) -> Self { + match e { + Error::InternalError(e) => RpcRequestSendError::InternalError(e), + } + } +} + +/// FOR TESTING ONLY +#[cfg(test)] +#[derive(Debug)] +pub enum BlockComponentsByRangeRequestStep { + BlocksRequest, + CustodyRequest, +} + +impl BlockComponentsByRangeRequest { + pub fn new( + id: ComponentsByRangeRequestId, + request: BlocksByRangeRequest, + peers: &HashSet, + peers_to_deprioritize: &HashSet, + total_requests_per_peer: &HashMap, + cx: &mut SyncNetworkContext, + ) -> Result { + // Induces a compile time panic if this doesn't hold true. + #[allow(clippy::assertions_on_constants)] + const _: () = assert!( + super::super::backfill_sync::BACKFILL_EPOCHS_PER_BATCH == 1 + && super::super::range_sync::EPOCHS_PER_BATCH == 1, + "To deal with alignment with deneb boundaries, batches need to be of just one epoch" + ); + // The assertion above ensures each batch is in one single epoch + let batch_epoch = Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()); + let batch_fork = cx.spec().fork_name_at_epoch(batch_epoch); + + // TODO(das): a change of behaviour here is that if the SyncingChain has a single peer we + // will request all blocks for the first 5 epochs to that same single peer. Before we would + // query only idle peers in the syncing chain. + let Some(block_peer) = peers + .iter() + .map(|peer| { + ( + // If contains -> 1 (order after), not contains -> 0 (order first) + peers_to_deprioritize.contains(peer), + // TODO(das): Should we use active_request_count_by_peer? + // Prefer peers with less overall requests + // active_request_count_by_peer.get(peer).copied().unwrap_or(0), + // Prefer peers with less total cummulative requests, so we fetch data from a + // diverse set of peers + total_requests_per_peer.get(peer).copied().unwrap_or(0), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::random::(), + peer, + ) + }) + .min() + .map(|(_, _, _, peer)| *peer) + else { + // When a peer disconnects and is removed from the SyncingChain peer set, if the set + // reaches zero the SyncingChain is removed. + // TODO(das): add test for this. + return Err(RpcRequestSendError::InternalError( + "A batch peer set should never be empty".to_string(), + )); + }; + + let blocks_req_id = cx.send_blocks_by_range_request(block_peer, request.clone(), id)?; + + let state = if batch_fork.fulu_enabled() { + State::FuluEnabled(FuluEnabledState::BlockRequest { + blocks_by_range_request: ByRangeRequest::Active(blocks_req_id), + }) + } else if batch_fork.deneb_enabled() { + // TODO(deneb): is it okay to send blobs_by_range requests outside the DA window? I + // would like the beacon processor / da_checker to be the one that decides if an + // RpcBlock is valid or not with respect to containing blobs. Having sync not even + // attempt a requests seems like an added limitation. + let blobs_req_id = cx.send_blobs_by_range_request( + block_peer, + BlobsByRangeRequest { + start_slot: *request.start_slot(), + count: *request.count(), + }, + id, + )?; + State::DenebEnabled { + blocks_by_range_request: ByRangeRequest::Active(blocks_req_id), + blobs_by_range_request: ByRangeRequest::Active(blobs_req_id), + } + } else { + State::Base { + blocks_by_range_request: ByRangeRequest::Active(blocks_req_id), + } + }; + + Ok(Self { + id, + // TODO(das): share the rwlock with the range sync batch. Are peers added to the batch + // after being created? + peers: Arc::new(RwLock::new(peers.clone())), + request, + state, + }) + } + + pub fn continue_requests( + &mut self, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRangeRequestResult { + match &mut self.state { + State::Base { + blocks_by_range_request, + } => { + if let Some((blocks, block_peer)) = blocks_by_range_request.to_finished() { + // TODO(das): use the peer group + let peer_group = BatchPeers::new_from_block_peer(*block_peer); + let rpc_blocks = couple_blocks_base( + blocks.to_vec(), + cx.network_globals().sampling_columns.len(), + ); + Ok(Some((rpc_blocks, peer_group))) + } else { + // Wait for blocks_by_range requests to complete + Ok(None) + } + } + State::DenebEnabled { + blocks_by_range_request, + blobs_by_range_request, + } => { + if let (Some((blocks, block_peer)), Some((blobs, _))) = ( + blocks_by_range_request.to_finished(), + blobs_by_range_request.to_finished(), + ) { + // We use the same block_peer for the blobs request + let peer_group = BatchPeers::new_from_block_peer(*block_peer); + let rpc_blocks = + couple_blocks_deneb(blocks.to_vec(), blobs.to_vec(), cx.spec())?; + Ok(Some((rpc_blocks, peer_group))) + } else { + // Wait for blocks_by_range and blobs_by_range requests to complete + Ok(None) + } + } + State::FuluEnabled(state) => match state { + FuluEnabledState::BlockRequest { + blocks_by_range_request, + } => { + if let Some((blocks, block_peer)) = blocks_by_range_request.to_finished() { + // TODO(das): use the peer group + let blocks_with_data = blocks + .iter() + .filter(|block| block.has_data()) + .map(|block| block.signed_block_header()) + .collect::>(); + + if blocks_with_data.is_empty() { + let custody_column_indices = cx + .network_globals() + .sampling_columns + .clone() + .iter() + .copied() + .collect(); + + // Done, we got blocks and no columns needed + let peer_group = BatchPeers::new_from_block_peer(*block_peer); + let rpc_blocks = couple_blocks_fulu( + blocks.to_vec(), + vec![], + custody_column_indices, + cx.spec(), + )?; + Ok(Some((rpc_blocks, peer_group))) + } else { + let mut column_indices = cx + .network_globals() + .sampling_columns + .clone() + .iter() + .copied() + .collect::>(); + column_indices.sort_unstable(); + + let req_id = cx + .send_custody_by_range_request( + self.id, + blocks_with_data, + Slot::new(*self.request.start_slot()) + .epoch(T::EthSpec::slots_per_epoch()), + column_indices, + self.peers.clone(), + ) + .map_err(|e| match e { + RpcRequestSendError::InternalError(e) => { + Error::InternalError(e) + } + })?; + + *state = FuluEnabledState::CustodyRequest { + blocks: blocks.to_vec(), + block_peer: *block_peer, + custody_by_range_request: ByRangeRequest::Active(req_id), + }; + + // Wait for the new custody_by_range request to complete + Ok(None) + } + } else { + // Wait for the block request to complete + Ok(None) + } + } + FuluEnabledState::CustodyRequest { + blocks, + block_peer, + custody_by_range_request, + } => { + if let Some((columns, column_peers)) = custody_by_range_request.to_finished() { + let custody_column_indices = cx + .network_globals() + .sampling_columns + .clone() + .iter() + .copied() + .collect(); + + let peer_group = + BatchPeers::new(*block_peer, column_peers.as_reversed_map()); + let rpc_blocks = couple_blocks_fulu( + blocks.to_vec(), + columns.to_vec(), + custody_column_indices, + cx.spec(), + )?; + Ok(Some((rpc_blocks, peer_group))) + } else { + // Wait for the custody_by_range request to complete + Ok(None) + } + } + }, + } + } + + pub fn on_blocks_by_range_result( + &mut self, + id: BlocksByRangeRequestId, + data: Vec>>, + peer_id: PeerId, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRangeRequestResult { + match &mut self.state { + State::Base { + blocks_by_range_request, + } + | State::DenebEnabled { + blocks_by_range_request, + .. + } + | State::FuluEnabled(FuluEnabledState::BlockRequest { + blocks_by_range_request, + }) => { + blocks_by_range_request.finish(id, data, peer_id)?; + } + State::FuluEnabled(FuluEnabledState::CustodyRequest { .. }) => { + return Err(Error::InternalError( + "Received blocks_by_range response expecting custody_by_range".to_string(), + )) + } + } + + self.continue_requests(cx) + } + + pub fn on_blobs_by_range_result( + &mut self, + id: BlobsByRangeRequestId, + data: Vec>>, + peer_id: PeerId, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRangeRequestResult { + match &mut self.state { + State::Base { .. } => { + return Err(Error::InternalError( + "Received blobs_by_range response before Deneb".to_string(), + )) + } + State::DenebEnabled { + blobs_by_range_request, + .. + } => { + blobs_by_range_request.finish(id, data, peer_id)?; + } + State::FuluEnabled(_) => { + return Err(Error::InternalError( + "Received blobs_by_range response after PeerDAS".to_string(), + )) + } + } + + self.continue_requests(cx) + } + + pub fn on_custody_by_range_result( + &mut self, + id: CustodyByRangeRequestId, + data: Vec>>, + peers: PeerGroup, + cx: &mut SyncNetworkContext, + ) -> BlockComponentsByRangeRequestResult { + match &mut self.state { + State::Base { .. } | State::DenebEnabled { .. } => { + return Err(Error::InternalError( + "Received custody_by_range response before PeerDAS".to_string(), + )) + } + State::FuluEnabled(state) => match state { + FuluEnabledState::BlockRequest { .. } => { + return Err(Error::InternalError( + "Received custody_by_range expecting blocks_by_range".to_string(), + )); + } + FuluEnabledState::CustodyRequest { + custody_by_range_request, + .. + } => { + custody_by_range_request.finish(id, data, peers)?; + } + }, + } + + self.continue_requests(cx) + } + + #[cfg(test)] + pub fn state_step(&self) -> BlockComponentsByRangeRequestStep { + match &self.state { + State::Base { .. } => BlockComponentsByRangeRequestStep::BlocksRequest, + State::DenebEnabled { .. } => BlockComponentsByRangeRequestStep::BlocksRequest, + State::FuluEnabled(state) => match state { + FuluEnabledState::BlockRequest { .. } => { + BlockComponentsByRangeRequestStep::BlocksRequest + } + FuluEnabledState::CustodyRequest { .. } => { + BlockComponentsByRangeRequestStep::CustodyRequest + } + }, + } + } +} + +fn couple_blocks_base( + blocks: Vec>>, + custody_columns_count: usize, +) -> Vec> { + blocks + .into_iter() + .map(|block| RpcBlock::new_without_blobs(None, block, custody_columns_count)) + .collect() +} + +fn couple_blocks_deneb( + blocks: Vec>>, + blobs: Vec>>, + spec: &ChainSpec, +) -> Result>, Error> { + let mut blobs_by_block = HashMap::>>>::new(); + for blob in blobs { + let block_root = blob.block_root(); + blobs_by_block.entry(block_root).or_default().push(blob); + } + + // Now collect all blobs that match to the block by block root. BlobsByRange request checks + // the inclusion proof so we know that the commitment is the expected. + // + // BlobsByRange request handler ensures that we don't receive more blobs than possible. + // If the peer serving the request sends us blobs that don't pair well we'll send to the + // processor blocks without expected blobs, resulting in a downscoring event. A serving peer + // could serve fake blobs for blocks that don't have data, but it would gain nothing by it + // wasting theirs and our bandwidth 1:1. Therefore blobs that don't pair well are just ignored. + // + // RpcBlock::new ensures that the count of blobs is consistent with the block + blocks + .into_iter() + .map(|block| { + let block_root = get_block_root(&block); + let max_blobs_per_block = spec.max_blobs_per_block(block.epoch()) as usize; + let blobs = blobs_by_block.remove(&block_root).unwrap_or_default(); + // BlobsByRange request handler enforces that blobs are sorted by index + let blobs = RuntimeVariableList::new(blobs, max_blobs_per_block).map_err(|_| { + Error::InternalError("Blobs returned exceeds max length".to_string()) + })?; + Ok(RpcBlock::new(Some(block_root), block, Some(blobs)) + .expect("TODO: don't do matching here")) + }) + .collect::>, Error>>() +} + +fn couple_blocks_fulu( + blocks: Vec>>, + data_columns: Vec>>, + custody_column_indices: Vec, + spec: &ChainSpec, +) -> Result>, Error> { + // Group data columns by block_root and index + let mut custody_columns_by_block = HashMap::>>::new(); + + for column in data_columns { + let block_root = column.block_root(); + + if custody_column_indices.contains(&column.index) { + custody_columns_by_block + .entry(block_root) + .or_default() + // Safe to convert to `CustodyDataColumn`: we have asserted that the index of + // this column is in the set of `expects_custody_columns` and with the expected + // block root, so for the expected epoch of this batch. + .push(CustodyDataColumn::from_asserted_custody(column)); + } + } + + // Now iterate all blocks ensuring that the block roots of each block and data column match, + blocks + .into_iter() + .map(|block| { + let block_root = get_block_root(&block); + let data_columns_with_block_root = custody_columns_by_block + // Remove to only use columns once + .remove(&block_root) + .unwrap_or_default(); + + // TODO(das): Change RpcBlock to holding a Vec of DataColumnSidecars so we don't need + // the spec here. + RpcBlock::new_with_custody_columns( + Some(block_root), + block, + data_columns_with_block_root, + custody_column_indices.clone(), + spec, + ) + .map_err(Error::InternalError) + }) + .collect::, _>>() +} + +impl ByRangeRequest { + fn finish(&mut self, id: I, data: T, peer_id: P) -> Result<(), Error> { + match self { + Self::Active(expected_id) => { + if expected_id != &id { + return Err(Error::InternalError(format!( + "unexpected req_id expected {expected_id} got {id}" + ))); + } + *self = Self::Complete(data, peer_id); + Ok(()) + } + Self::Complete(_, _) => Err(Error::InternalError(format!( + "request already complete {id}" + ))), + } + } + + fn to_finished(&self) -> Option<(&T, &P)> { + match self { + Self::Active(_) => None, + Self::Complete(data, peer_id) => Some((data, peer_id)), + } + } +} diff --git a/beacon_node/network/src/sync/network_context/custody_by_range.rs b/beacon_node/network/src/sync/network_context/custody_by_range.rs new file mode 100644 index 00000000000..9f8e163ba47 --- /dev/null +++ b/beacon_node/network/src/sync/network_context/custody_by_range.rs @@ -0,0 +1,481 @@ +use super::custody_by_root::{ColumnRequest, Error}; +use crate::sync::network_context::RpcResponseError; +use beacon_chain::validator_monitor::timestamp_now; +use beacon_chain::BeaconChainTypes; +use fnv::FnvHashMap; +use lighthouse_network::rpc::methods::DataColumnsByRangeRequest; +use lighthouse_network::service::api_types::{ + CustodyByRangeRequestId, DataColumnsByRangeRequestId, +}; +use lighthouse_network::{PeerAction, PeerId}; +use lru_cache::LRUTimeCache; +use parking_lot::RwLock; +use rand::Rng; +use std::collections::HashSet; +use std::time::{Duration, Instant}; +use std::{collections::HashMap, marker::PhantomData, sync::Arc}; +use tracing::{debug, warn}; +use types::{ + data_column_sidecar::ColumnIndex, DataColumnSidecar, Epoch, EthSpec, Hash256, + SignedBeaconBlockHeader, Slot, +}; + +use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; + +const TEMPORARY_FAULT_EXPIRY_SECONDS: u64 = 15; +const REQUEST_EXPIRY_SECONDS: u64 = 300; + +type DataColumnSidecarList = Vec>>; + +pub struct ActiveCustodyByRangeRequest { + start_time: Instant, + id: CustodyByRangeRequestId, + // TODO(das): Pass a better type for the by_range request + epoch: Epoch, + /// Blocks that we expect peers to serve data columns for + blocks_with_data: Vec, + /// List of column indices this request needs to download to complete successfully + column_requests: FnvHashMap< + ColumnIndex, + ColumnRequest>, + >, + /// Active requests for 1 or more columns each + active_batch_columns_requests: + FnvHashMap, + /// Peers that have recently failed to successfully respond to a columns by root request. + /// Having a LRUTimeCache allows this request to not have to track disconnecting peers. + peers_with_custody_failures: LRUTimeCache, + peers_with_temporary_faults: LRUTimeCache, + // TODO(das): does this HashSet has an OOM risk? We should either: make sure that this request + // structs are dropped after some time, that disconnected peers are pruned (but we may want to + // retain faulty information if they just disconnect and reconnect) or make this an LRUTimeCache + // with a long time (like 5 minutes). + peers_with_permanent_faults: HashSet, + /// Set of peers that claim to have imported this block and their custody columns + lookup_peers: Arc>>, + + _phantom: PhantomData, +} + +struct ActiveBatchColumnsRequest { + indices: Vec, +} + +pub type CustodyByRangeRequestResult = + Result, PeerGroup, Duration)>, Error>; + +enum ColumnResponseError { + NonMatchingColumn { + slot: Slot, + actual_block_root: Hash256, + expected_block_root: Hash256, + }, + MissingColumn(Slot), +} + +impl ActiveCustodyByRangeRequest { + pub(crate) fn new( + id: CustodyByRangeRequestId, + epoch: Epoch, + blocks_with_data: Vec, + column_indices: &[ColumnIndex], + lookup_peers: Arc>>, + ) -> Self { + Self { + start_time: Instant::now(), + id, + epoch, + blocks_with_data, + column_requests: HashMap::from_iter( + column_indices + .iter() + .map(|index| (*index, ColumnRequest::new())), + ), + active_batch_columns_requests: <_>::default(), + peers_with_custody_failures: LRUTimeCache::new(Duration::from_secs( + TEMPORARY_FAULT_EXPIRY_SECONDS, + )), + peers_with_temporary_faults: LRUTimeCache::new(Duration::from_secs( + TEMPORARY_FAULT_EXPIRY_SECONDS, + )), + peers_with_permanent_faults: HashSet::new(), + lookup_peers, + _phantom: PhantomData, + } + } + + /// Insert a downloaded column into an active custody request. Then make progress on the + /// entire request. + /// + /// ### Returns + /// + /// - `Err`: Custody request has failed and will be dropped + /// - `Ok(Some)`: Custody request has successfully completed and will be dropped + /// - `Ok(None)`: Custody request still active + pub(crate) fn on_data_column_downloaded( + &mut self, + peer_id: PeerId, + req_id: DataColumnsByRangeRequestId, + resp: RpcResponseResult>, + cx: &mut SyncNetworkContext, + ) -> CustodyByRangeRequestResult { + let Some(batch_request) = self.active_batch_columns_requests.get_mut(&req_id) else { + warn!( + id = %self.id, + %req_id, + "Received custody by range response for unrequested index" + ); + return Ok(None); + }; + + match resp { + Ok((data_columns, seen_timestamp)) => { + // Map columns by index as an optimization to not loop the returned list on each + // requested index. The worse case is 128 loops over a 128 item vec + mutation to + // drop the consumed columns. + let mut data_columns_by_index = + HashMap::<(ColumnIndex, Slot), Arc>>::new(); + for data_column in data_columns { + data_columns_by_index + .insert((data_column.index, data_column.slot()), data_column); + } + + // Accumulate columns that the peer does not have to issue a single log per request + let mut missing_column_indexes = vec![]; + let mut incorrect_column_indices = vec![]; + let mut imported_column_indices = vec![]; + + for index in &batch_request.indices { + let column_request = + self.column_requests + .get_mut(index) + .ok_or(Error::InternalError(format!( + "unknown column_index {index}" + )))?; + + let columns_at_index = self + .blocks_with_data + .iter() + .map(|block| { + let slot = block.message.slot; + if let Some(data_column) = data_columns_by_index.remove(&(*index, slot)) + { + let actual_block_root = + data_column.signed_block_header.message.canonical_root(); + let expected_block_root = block.message.canonical_root(); + if actual_block_root != expected_block_root { + Err(ColumnResponseError::NonMatchingColumn { + slot, + actual_block_root: data_column + .signed_block_header + .message + .canonical_root(), + expected_block_root: block.message.canonical_root(), + }) + } else { + Ok(data_column) + } + } else { + // The following three statements are true: + // - block at `slot` is not missed, and has data + // - peer custodies this column `index` + // - peer claims to be synced to at least `slot` + // + // Therefore not returning this column is an protocol violation that we + // penalize and mark the peer as failed to retry with another peer. + // + // TODO(das) do not consider this case a success. We know for sure the block has + // data. However we allow the peer to return empty as we can't attribute fault. + // TODO(das): Should track which columns are missing and eventually give up + // TODO(das): If the peer is in the lookup peer set it claims to have imported + // the block AND its custody columns. So in this case we can downscore + Err(ColumnResponseError::MissingColumn(slot)) + } + }) + .collect::, _>>(); + + match columns_at_index { + Ok(columns_at_index) => { + column_request.on_download_success( + req_id, + peer_id, + columns_at_index, + seen_timestamp, + )?; + + imported_column_indices.push(index); + } + Err(e) => { + column_request.on_download_error(req_id)?; + + match e { + ColumnResponseError::NonMatchingColumn { + slot, + actual_block_root, + expected_block_root, + } => { + incorrect_column_indices.push(( + index, + slot, + actual_block_root, + expected_block_root, + )); + } + ColumnResponseError::MissingColumn(slot) => { + missing_column_indexes.push((index, slot)); + } + } + } + } + } + + // Log missing_column_indexes and incorrect_column_indices here in batch per request + // to make this logs more compact and less noisy. + if !imported_column_indices.is_empty() { + // TODO(das): this log may be redundant. We already log on DataColumnsByRange + // completed, and on DataColumnsByRange sent we log the column indices + // ``` + // Sync RPC request sent method="DataColumnsByRange" slots=8 epoch=4 columns=[52] peer=16Uiu2HAmEooeoHzHDYS35TSHrJDSfmREecPyFskrLPYm9Gm1EURj id=493/399/10/RangeSync/4/1 + // Sync RPC request completed id=493/399/10/RangeSync/4/1 method="DataColumnsByRange" count=1 + // ``` + // Which can be traced to this custody by range request, and the initial log + debug!( + id = %self.id, + data_columns_by_range_req_id = %req_id, + %peer_id, + count = imported_column_indices.len(), + "Custody by range request download imported columns" + ); + } + + if !incorrect_column_indices.is_empty() { + // Note: Batch logging that columns are missing to not spam logger + debug!( + id = %self.id, + data_columns_by_range_req_id = %req_id, + %peer_id, + // TODO(das): this property can become very noisy, being the full range 0..128 + incorrect_columns = ?incorrect_column_indices, + "Custody by range peer returned non-matching columns" + ); + + // Returning a non-canonical column is not a permanent fault. We should not + // retry the peer for some time but the peer may return a canonical column in + // the future. + // TODO(das): if this finalized sync the fault is permanent + self.peers_with_temporary_faults.insert(peer_id); + cx.report_peer( + peer_id, + PeerAction::MidToleranceError, + "non-matching data column", + ); + } + + if !missing_column_indexes.is_empty() { + // Note: Batch logging that columns are missing to not spam logger + debug!( + id = %self.id, + data_columns_by_range_req_id = %req_id, + %peer_id, + // TODO(das): this property can become very noisy, being the full range 0..128 + ?missing_column_indexes, + "Custody by range peer claims to not have some data" + ); + + // Not having columns is not a permanent fault. The peer may be backfilling. + self.peers_with_custody_failures.insert(peer_id); + cx.report_peer(peer_id, PeerAction::MidToleranceError, "custody_failure"); + } + } + Err(err) => { + debug!( + id = %self.id, + %req_id, + %peer_id, + error = ?err, + "Custody by range download error" + ); + + // TODO(das): Should mark peer as failed and try from another peer + for column_index in &batch_request.indices { + self.column_requests + .get_mut(column_index) + .ok_or(Error::InternalError("unknown column_index".to_owned()))? + .on_download_error_and_mark_failure(req_id, err.clone())?; + } + + match err { + // Verify errors are correctness errors against our request or about the + // returned data itself. This peer is faulty or malicious, should not be + // retried. + RpcResponseError::VerifyError(_) => { + self.peers_with_permanent_faults.insert(peer_id); + } + // Network errors are not permanent faults and worth retrying + RpcResponseError::RpcError(_) => { + self.peers_with_temporary_faults.insert(peer_id); + } + // Do nothing for internal errors + RpcResponseError::InternalError(_) => {} + // unreachable + RpcResponseError::RequestExpired(_) => {} + } + } + }; + + self.continue_requests(cx) + } + + pub(crate) fn continue_requests( + &mut self, + cx: &mut SyncNetworkContext, + ) -> CustodyByRangeRequestResult { + if self.column_requests.values().all(|r| r.is_downloaded()) { + // All requests have completed successfully. + let mut peers = HashMap::>::new(); + let mut seen_timestamps = vec![]; + let columns = std::mem::take(&mut self.column_requests) + .into_values() + .map(|request| { + let (peer, data_columns, seen_timestamp) = request.complete()?; + + for data_column in &data_columns { + let columns_by_peer = peers.entry(peer).or_default(); + if !columns_by_peer.contains(&(data_column.index as usize)) { + columns_by_peer.push(data_column.index as usize); + } + } + + seen_timestamps.push(seen_timestamp); + + Ok(data_columns) + }) + .collect::, _>>()? + // Flatten Vec> to Vec + // TODO(das): maybe not optimal for the coupling logic later + .into_iter() + .flatten() + .collect(); + + let peer_group = PeerGroup::from_set(peers); + let max_seen_timestamp = seen_timestamps.into_iter().max().unwrap_or(timestamp_now()); + return Ok(Some((columns, peer_group, max_seen_timestamp))); + } + + let active_request_count_by_peer = cx.active_request_count_by_peer(); + let mut columns_to_request_by_peer = HashMap::>::new(); + let lookup_peers = self.lookup_peers.read(); + + // Need to: + // - track how many active requests a peer has for load balancing + // - which peers have failures to attempt others + // - which peer returned what to have PeerGroup attributability + + for (column_index, request) in self.column_requests.iter_mut() { + if request.is_awaiting_download() { + if let Some(last_error) = request.too_many_failures() { + return Err(Error::TooManyDownloadErrors(last_error)); + } + + // TODO(das): When is a fork and only a subset of your peers know about a block, we should + // only query the peers on that fork. Should this case be handled? How to handle it? + let custodial_peers = cx.get_custodial_peers(*column_index); + + // We draw from the total set of peers, but prioritize those peers who we have + // received an attestation / status / block message claiming to have imported the + // lookup. The frequency of those messages is low, so drawing only from lookup_peers + // could cause many lookups to take much longer or fail as they don't have enough + // custody peers on a given column + let mut priorized_peers = custodial_peers + .iter() + .filter(|peer| { + // Never request again peers with permanent faults + // Do not request peers with custody failures for some time + !self.peers_with_permanent_faults.contains(peer) + && !self.peers_with_custody_failures.contains(peer) + }) + .map(|peer| { + ( + // Prioritize peers that claim to know have imported this block + if lookup_peers.contains(peer) { 0 } else { 1 }, + // De-prioritize peers that have failed to successfully respond to + // requests recently, but allow to immediatelly request them again + self.peers_with_temporary_faults.contains(peer), + // Prefer peers with fewer requests to load balance across peers. + // We batch requests to the same peer, so count existence in the + // `columns_to_request_by_peer` as a single 1 request. + active_request_count_by_peer.get(peer).copied().unwrap_or(0) + + columns_to_request_by_peer.get(peer).map(|_| 1).unwrap_or(0), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::thread_rng().gen::(), + *peer, + ) + }) + .collect::>(); + priorized_peers.sort_unstable(); + + if let Some((_, _, _, _, peer_id)) = priorized_peers.first() { + columns_to_request_by_peer + .entry(*peer_id) + .or_default() + .push(*column_index); + } else { + // Do not issue requests if there is no custody peer on this column. The request + // will sit idle without making progress. The only way to make to progress is: + // - Add a new peer that custodies the missing columns + // - Call `continue_requests` + // + // Otherwise this request should be dropped and failed after some time. + // TODO(das): implement the above + } + } + } + + for (peer_id, indices) in columns_to_request_by_peer.into_iter() { + let req_id = cx + .send_data_columns_by_range_request( + peer_id, + DataColumnsByRangeRequest { + // TODO(das): generalize with constants from batch + start_slot: self + .epoch + .start_slot(T::EthSpec::slots_per_epoch()) + .as_u64(), + count: T::EthSpec::slots_per_epoch(), + columns: indices.clone(), + }, + self.id, + ) + .map_err(|e| Error::InternalError(format!("send failed {e}")))?; + + for column_index in &indices { + let column_request = self + .column_requests + .get_mut(column_index) + // Should never happen: column_index is iterated from column_requests + .ok_or(Error::InternalError(format!( + "Unknown column_request {column_index}" + )))?; + + column_request.on_download_start(req_id)?; + } + + self.active_batch_columns_requests + .insert(req_id, ActiveBatchColumnsRequest { indices }); + } + + if self.start_time.elapsed() > Duration::from_secs(REQUEST_EXPIRY_SECONDS) + && !self.column_requests.values().any(|r| r.is_downloading()) + { + let awaiting_peers_indicies = self + .column_requests + .iter() + .filter(|(_, r)| r.is_awaiting_download()) + .map(|(id, _)| *id) + .collect::>(); + return Err(Error::ExpiredNoCustodyPeers(awaiting_peers_indicies)); + } + + Ok(None) + } +} diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody_by_root.rs similarity index 70% rename from beacon_node/network/src/sync/network_context/custody.rs rename to beacon_node/network/src/sync/network_context/custody_by_root.rs index f4d010b881e..c547837fc7f 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_root.rs @@ -1,5 +1,6 @@ use crate::sync::network_context::{ - DataColumnsByRootRequestId, DataColumnsByRootSingleBlockRequest, + DataColumnsByRootRequestId, DataColumnsByRootSingleBlockRequest, RpcRequestSendError, + RpcResponseError, }; use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::BeaconChainTypes; @@ -12,22 +13,29 @@ use rand::Rng; use std::collections::HashSet; use std::time::{Duration, Instant}; use std::{collections::HashMap, marker::PhantomData, sync::Arc}; +use strum::IntoStaticStr; use tracing::{debug, warn}; -use types::EthSpec; use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, Hash256}; use super::{LookupRequestResult, PeerGroup, RpcResponseResult, SyncNetworkContext}; const FAILED_PEERS_CACHE_EXPIRY_SECONDS: u64 = 5; -const MAX_STALE_NO_PEERS_DURATION: Duration = Duration::from_secs(30); +const REQUEST_EXPIRY_SECONDS: u64 = 300; +/// TODO(das): this attempt count is nested into the existing lookup request count. +const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; type DataColumnSidecarList = Vec>>; -pub struct ActiveCustodyRequest { +pub struct ActiveCustodyByRootRequest { + start_time: Instant, block_root: Hash256, custody_id: CustodyId, /// List of column indices this request needs to download to complete successfully - column_requests: FnvHashMap>, + #[allow(clippy::type_complexity)] + column_requests: FnvHashMap< + ColumnIndex, + ColumnRequest>>, + >, /// Active requests for 1 or more columns each active_batch_columns_requests: FnvHashMap, @@ -40,29 +48,47 @@ pub struct ActiveCustodyRequest { _phantom: PhantomData, } -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug)] pub enum Error { - SendFailed(&'static str), - TooManyFailures, - BadState(String), - NoPeer(ColumnIndex), - /// Received a download result for a different request id than the in-flight request. - /// There should only exist a single request at a time. Having multiple requests is a bug and - /// can result in undefined state, so it's treated as a hard error and the lookup is dropped. - UnexpectedRequestId { - expected_req_id: DataColumnsByRootRequestId, - req_id: DataColumnsByRootRequestId, - }, + InternalError(String), + TooManyDownloadErrors(RpcResponseError), + ExpiredNoCustodyPeers(Vec), +} + +impl From for RpcResponseError { + fn from(e: Error) -> Self { + match e { + Error::InternalError(e) => RpcResponseError::InternalError(e), + Error::TooManyDownloadErrors(e) => e, + Error::ExpiredNoCustodyPeers(indices) => RpcResponseError::RequestExpired(format!( + "Expired waiting for custody peers {indices:?}" + )), + } + } +} + +impl From for RpcRequestSendError { + fn from(e: Error) -> Self { + match e { + Error::TooManyDownloadErrors(_) => { + RpcRequestSendError::InternalError("Download error in request send".to_string()) + } + Error::InternalError(e) => RpcRequestSendError::InternalError(e), + Error::ExpiredNoCustodyPeers(_) => RpcRequestSendError::InternalError( + "Request can not expire when requesting it".to_string(), + ), + } + } } struct ActiveBatchColumnsRequest { indices: Vec, } -pub type CustodyRequestResult = +pub type CustodyByRootRequestResult = Result, PeerGroup, Duration)>, Error>; -impl ActiveCustodyRequest { +impl ActiveCustodyByRootRequest { pub(crate) fn new( block_root: Hash256, custody_id: CustodyId, @@ -70,6 +96,7 @@ impl ActiveCustodyRequest { lookup_peers: Arc>>, ) -> Self { Self { + start_time: Instant::now(), block_root, custody_id, column_requests: HashMap::from_iter( @@ -98,7 +125,7 @@ impl ActiveCustodyRequest { req_id: DataColumnsByRootRequestId, resp: RpcResponseResult>, cx: &mut SyncNetworkContext, - ) -> CustodyRequestResult { + ) -> CustodyByRootRequestResult { let Some(batch_request) = self.active_batch_columns_requests.get_mut(&req_id) else { warn!( block_root = ?self.block_root, @@ -131,7 +158,7 @@ impl ActiveCustodyRequest { let column_request = self .column_requests .get_mut(column_index) - .ok_or(Error::BadState("unknown column_index".to_owned()))?; + .ok_or(Error::InternalError("unknown column_index".to_owned()))?; if let Some(data_column) = data_columns.remove(column_index) { column_request.on_download_success( @@ -182,8 +209,8 @@ impl ActiveCustodyRequest { for column_index in &batch_request.indices { self.column_requests .get_mut(column_index) - .ok_or(Error::BadState("unknown column_index".to_owned()))? - .on_download_error_and_mark_failure(req_id)?; + .ok_or(Error::InternalError("unknown column_index".to_owned()))? + .on_download_error_and_mark_failure(req_id, err.clone())?; } self.failed_peers.insert(peer_id); @@ -196,7 +223,7 @@ impl ActiveCustodyRequest { pub(crate) fn continue_requests( &mut self, cx: &mut SyncNetworkContext, - ) -> CustodyRequestResult { + ) -> CustodyByRootRequestResult { if self.column_requests.values().all(|r| r.is_downloaded()) { // All requests have completed successfully. let mut peers = HashMap::>::new(); @@ -222,6 +249,7 @@ impl ActiveCustodyRequest { let active_request_count_by_peer = cx.active_request_count_by_peer(); let mut columns_to_request_by_peer = HashMap::>::new(); let lookup_peers = self.lookup_peers.read(); + let mut indices_without_peers = vec![]; // Need to: // - track how many active requests a peer has for load balancing @@ -229,9 +257,9 @@ impl ActiveCustodyRequest { // - which peer returned what to have PeerGroup attributability for (column_index, request) in self.column_requests.iter_mut() { - if let Some(wait_duration) = request.is_awaiting_download() { - if request.download_failures > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { - return Err(Error::TooManyFailures); + if request.is_awaiting_download() { + if let Some(last_error) = request.too_many_failures() { + return Err(Error::TooManyDownloadErrors(last_error)); } // TODO(das): When is a fork and only a subset of your peers know about a block, we should @@ -270,21 +298,22 @@ impl ActiveCustodyRequest { .entry(*peer_id) .or_default() .push(*column_index); - } else if wait_duration > MAX_STALE_NO_PEERS_DURATION { - // Allow to request to sit stale in `NotStarted` state for at most - // `MAX_STALE_NO_PEERS_DURATION`, else error and drop the request. Note that - // lookup will naturally retry when other peers send us attestations for - // descendants of this un-available lookup. - return Err(Error::NoPeer(*column_index)); } else { - // Do not issue requests if there is no custody peer on this column + // Do not issue requests if there is no custody peer on this column. The request + // will sit idle without making progress. The only way to make to progress is: + // - Add a new peer that custodies the missing columns + // - Call `continue_requests` + // + // Otherwise this request should be dropped and failed after some time. + // TODO(das): implement the above + indices_without_peers.push(column_index); } } } for (peer_id, indices) in columns_to_request_by_peer.into_iter() { let request_result = cx - .data_column_lookup_request( + .data_columns_by_root_request( DataColumnsByRootRequester::Custody(self.custody_id), peer_id, DataColumnsByRootSingleBlockRequest { @@ -297,7 +326,9 @@ impl ActiveCustodyRequest { // columns. For the rest of peers, don't downscore if columns are missing. lookup_peers.contains(&peer_id), ) - .map_err(Error::SendFailed)?; + .map_err(|e| { + Error::InternalError(format!("Send failed data_columns_by_root {e:?}")) + })?; match request_result { LookupRequestResult::RequestSent(req_id) => { @@ -306,7 +337,7 @@ impl ActiveCustodyRequest { .column_requests .get_mut(column_index) // Should never happen: column_index is iterated from column_requests - .ok_or(Error::BadState("unknown column_index".to_owned()))?; + .ok_or(Error::InternalError("unknown column_index".to_owned()))?; column_request.on_download_start(req_id)?; } @@ -319,117 +350,149 @@ impl ActiveCustodyRequest { } } + if self.start_time.elapsed() > Duration::from_secs(REQUEST_EXPIRY_SECONDS) + && !self.column_requests.values().any(|r| r.is_downloading()) + { + let awaiting_peers_indicies = self + .column_requests + .iter() + .filter(|(_, r)| r.is_awaiting_download()) + .map(|(id, _)| *id) + .collect::>(); + return Err(Error::ExpiredNoCustodyPeers(awaiting_peers_indicies)); + } + Ok(None) } } -/// TODO(das): this attempt count is nested into the existing lookup request count. -const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; - -struct ColumnRequest { - status: Status, - download_failures: usize, +pub struct ColumnRequest { + status: Status, + download_failures: Vec, } -#[derive(Debug, Clone)] -enum Status { - NotStarted(Instant), - Downloading(DataColumnsByRootRequestId), - Downloaded(PeerId, Arc>, Duration), +#[derive(Debug, Clone, IntoStaticStr)] +pub enum Status { + NotStarted, + Downloading(I), + Downloaded(PeerId, T, Duration), } -impl ColumnRequest { - fn new() -> Self { +impl ColumnRequest { + pub fn new() -> Self { Self { - status: Status::NotStarted(Instant::now()), - download_failures: 0, + status: Status::NotStarted, + download_failures: vec![], } } - fn is_awaiting_download(&self) -> Option { + pub fn is_awaiting_download(&self) -> bool { match self.status { - Status::NotStarted(start_time) => Some(start_time.elapsed()), - Status::Downloading { .. } | Status::Downloaded { .. } => None, + Status::NotStarted => true, + Status::Downloading { .. } | Status::Downloaded { .. } => false, } } - fn is_downloaded(&self) -> bool { + pub fn is_downloading(&self) -> bool { match self.status { - Status::NotStarted { .. } | Status::Downloading { .. } => false, + Status::NotStarted => false, + Status::Downloading { .. } => true, + Status::Downloaded { .. } => false, + } + } + + pub fn is_downloaded(&self) -> bool { + match self.status { + Status::NotStarted | Status::Downloading { .. } => false, Status::Downloaded { .. } => true, } } - fn on_download_start(&mut self, req_id: DataColumnsByRootRequestId) -> Result<(), Error> { + pub fn too_many_failures(&self) -> Option { + if self.download_failures.len() > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { + Some( + self.download_failures + .last() + .cloned() + .expect("download_failures is not empty"), + ) + } else { + None + } + } + + pub fn on_download_start(&mut self, req_id: I) -> Result<(), Error> { match &self.status { - Status::NotStarted { .. } => { + Status::NotStarted => { self.status = Status::Downloading(req_id); Ok(()) } - other => Err(Error::BadState(format!( - "bad state on_download_start expected NotStarted got {other:?}" + other => Err(Error::InternalError(format!( + "bad state on_download_start expected NotStarted got {}", + Into::<&'static str>::into(other), ))), } } - fn on_download_error(&mut self, req_id: DataColumnsByRootRequestId) -> Result<(), Error> { + pub fn on_download_error(&mut self, req_id: I) -> Result<(), Error> { match &self.status { Status::Downloading(expected_req_id) => { if req_id != *expected_req_id { - return Err(Error::UnexpectedRequestId { - expected_req_id: *expected_req_id, - req_id, - }); + return Err(Error::InternalError(format!( + "Received download result for req_id {req_id} expecting {expected_req_id}" + ))); } - self.status = Status::NotStarted(Instant::now()); + self.status = Status::NotStarted; Ok(()) } - other => Err(Error::BadState(format!( - "bad state on_download_error expected Downloading got {other:?}" + other => Err(Error::InternalError(format!( + "bad state on_download_error expected Downloading got {}", + Into::<&'static str>::into(other), ))), } } - fn on_download_error_and_mark_failure( + pub fn on_download_error_and_mark_failure( &mut self, - req_id: DataColumnsByRootRequestId, + req_id: I, + e: RpcResponseError, ) -> Result<(), Error> { - // TODO(das): Should track which peers don't have data - self.download_failures += 1; + self.download_failures.push(e); self.on_download_error(req_id) } - fn on_download_success( + pub fn on_download_success( &mut self, - req_id: DataColumnsByRootRequestId, + req_id: I, peer_id: PeerId, - data_column: Arc>, + data_column: T, seen_timestamp: Duration, ) -> Result<(), Error> { match &self.status { Status::Downloading(expected_req_id) => { if req_id != *expected_req_id { - return Err(Error::UnexpectedRequestId { - expected_req_id: *expected_req_id, - req_id, - }); + return Err(Error::InternalError(format!( + "Received download result for req_id {req_id} expecting {expected_req_id}" + ))); } self.status = Status::Downloaded(peer_id, data_column, seen_timestamp); Ok(()) } - other => Err(Error::BadState(format!( - "bad state on_download_success expected Downloading got {other:?}" + other => Err(Error::InternalError(format!( + "bad state on_download_success expected Downloading got {}", + Into::<&'static str>::into(other), ))), } } - fn complete(self) -> Result<(PeerId, Arc>, Duration), Error> { + pub fn complete(self) -> Result<(PeerId, T, Duration), Error> { match self.status { Status::Downloaded(peer_id, data_column, seen_timestamp) => { Ok((peer_id, data_column, seen_timestamp)) } - other => Err(Error::BadState(format!( - "bad state complete expected Downloaded got {other:?}" + other => Err(Error::InternalError(format!( + "bad state complete expected Downloaded got {}", + Into::<&'static str>::into(other), ))), } } diff --git a/beacon_node/network/src/sync/network_context/requests.rs b/beacon_node/network/src/sync/network_context/requests.rs index cd70a2e7ebc..8228ea5d9d5 100644 --- a/beacon_node/network/src/sync/network_context/requests.rs +++ b/beacon_node/network/src/sync/network_context/requests.rs @@ -26,7 +26,7 @@ mod blocks_by_root; mod data_columns_by_range; mod data_columns_by_root; -#[derive(Debug, PartialEq, Eq, IntoStaticStr)] +#[derive(Debug, Clone, PartialEq, Eq, IntoStaticStr)] pub enum LookupVerifyError { NotEnoughResponsesReturned { actual: usize, @@ -177,12 +177,10 @@ impl ActiveRequests { } } - pub fn active_requests_of_peer(&self, peer_id: &PeerId) -> Vec<&K> { + pub fn active_requests(&self) -> impl Iterator { self.requests .iter() - .filter(|(_, request)| &request.peer_id == peer_id) - .map(|(id, _)| id) - .collect() + .map(|(id, request)| (id, &request.peer_id)) } pub fn iter_request_peers(&self) -> impl Iterator + '_ { diff --git a/beacon_node/network/src/sync/peer_sampling.rs b/beacon_node/network/src/sync/peer_sampling.rs index 59b751787e3..d76c7d2bbc2 100644 --- a/beacon_node/network/src/sync/peer_sampling.rs +++ b/beacon_node/network/src/sync/peer_sampling.rs @@ -98,13 +98,13 @@ impl Sampling { // TODO(das): Should track failed sampling request for some time? Otherwise there's // a risk of a loop with multiple triggers creating the request, then failing, // and repeat. - debug!(?id, "Ignoring duplicate sampling request"); + debug!(%id, "Ignoring duplicate sampling request"); return None; } }; debug!( - ?id, + %id, column_selection = ?request.column_selection(), "Created new sample request" ); @@ -138,7 +138,7 @@ impl Sampling { ) -> Option<(SamplingRequester, SamplingResult)> { let Some(request) = self.requests.get_mut(&id.id) else { // TOOD(das): This log can happen if the request is error'ed early and dropped - debug!(?id, "Sample downloaded event for unknown request"); + debug!(%id, "Sample downloaded event for unknown request"); return None; }; @@ -167,7 +167,7 @@ impl Sampling { ) -> Option<(SamplingRequester, SamplingResult)> { let Some(request) = self.requests.get_mut(&id.id) else { // TOOD(das): This log can happen if the request is error'ed early and dropped - debug!(?id, "Sample verified event for unknown request"); + debug!(%id, "Sample verified event for unknown request"); return None; }; @@ -191,7 +191,7 @@ impl Sampling { ) -> Option<(SamplingRequester, SamplingResult)> { let result = result.transpose(); if let Some(result) = result { - debug!(?id, ?result, "Sampling request completed, removing"); + debug!(%id, ?result, "Sampling request completed, removing"); metrics::inc_counter_vec( &metrics::SAMPLING_REQUEST_RESULT, &[metrics::from_result(&result)], @@ -570,7 +570,7 @@ impl ActiveSamplingRequest { // Send requests. let mut sent_request = false; for (peer_id, column_indexes) in column_indexes_to_request { - cx.data_column_lookup_request( + cx.data_columns_by_root_request( DataColumnsByRootRequester::Sampling(SamplingId { id: self.requester_id, sampling_request_id: self.current_sampling_request_id, diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 72598a25405..81f33352f50 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -1,4 +1,5 @@ use beacon_chain::block_verification_types::RpcBlock; +use itertools::Itertools; use lighthouse_network::rpc::methods::BlocksByRangeRequest; use lighthouse_network::service::api_types::Id; use lighthouse_network::PeerId; @@ -17,15 +18,7 @@ const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5; /// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty. const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3; -/// Type of expected batch. -#[derive(Debug, Copy, Clone, Display)] -#[strum(serialize_all = "snake_case")] -pub enum ByRangeRequestType { - BlocksAndColumns, - BlocksAndBlobs, - Blocks, -} - +// TODO(das): Consider merging with PeerGroup #[derive(Clone, Debug)] pub struct BatchPeers { block_peer: PeerId, @@ -53,6 +46,12 @@ impl BatchPeers { pub fn column(&self, index: &ColumnIndex) -> Option<&PeerId> { self.column_peers.get(index) } + + pub fn iter_unique_peers(&self) -> impl Iterator { + std::iter::once(&self.block_peer) + .chain(self.column_peers.values()) + .unique() + } } /// Allows customisation of the above constants used in other sync methods such as BackFillSync. diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index ba809a14ba1..abea407b0ed 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -10,7 +10,7 @@ use itertools::Itertools; use lighthouse_network::service::api_types::Id; use lighthouse_network::{PeerAction, PeerId}; use logging::crit; -use std::collections::{btree_map::Entry, BTreeMap, HashSet}; +use std::collections::{btree_map::Entry, BTreeMap, HashMap, HashSet}; use strum::IntoStaticStr; use tracing::{debug, instrument, warn}; use types::{Epoch, EthSpec, Hash256, Slot}; @@ -87,9 +87,11 @@ pub struct SyncingChain { batches: BTreeMap>, /// The peers that agree on the `target_head_slot` and `target_head_root` as a canonical chain - /// and thus available to download this chain from, as well as the batches we are currently - /// requesting. - peers: HashSet, + /// and thus available to download this chain from. + /// + /// Also, For each peer tracks the total requests done per peer as part of this SyncingChain + /// `HashMap` + peers: HashMap, /// Starting epoch of the next batch that needs to be downloaded. to_be_downloaded: BatchId, @@ -121,7 +123,40 @@ pub enum ChainSyncingState { Syncing, } +#[cfg(test)] +#[derive(Debug, Eq, PartialEq)] +pub enum BatchStateSummary { + Downloading, + Processing, + AwaitingProcessing, + AwaitingValidation, + Unexpected(&'static str), +} + impl SyncingChain { + /// Returns a summary of batch states for assertions in tests. + #[cfg(test)] + pub fn batches_state(&self) -> Vec<(BatchId, BatchStateSummary)> { + self.batches + .iter() + .map(|(id, batch)| { + let state = match batch.state() { + // A batch is never left in this state, it's only the initial value + BatchState::AwaitingDownload => { + BatchStateSummary::Unexpected("AwaitingDownload") + } + BatchState::Downloading { .. } => BatchStateSummary::Downloading, + BatchState::AwaitingProcessing { .. } => BatchStateSummary::AwaitingProcessing, + BatchState::Poisoned => BatchStateSummary::Unexpected("Poisoned"), + BatchState::Processing { .. } => BatchStateSummary::Processing, + BatchState::Failed => BatchStateSummary::Unexpected("Failed"), + BatchState::AwaitingValidation { .. } => BatchStateSummary::AwaitingValidation, + }; + (*id, state) + }) + .collect() + } + #[allow(clippy::too_many_arguments)] pub fn new( id: Id, @@ -138,7 +173,7 @@ impl SyncingChain { target_head_slot, target_head_root, batches: BTreeMap::new(), - peers: HashSet::from_iter([peer_id]), + peers: HashMap::from_iter([(peer_id, <_>::default())]), to_be_downloaded: start_epoch, processing_target: start_epoch, optimistic_start: None, @@ -168,7 +203,7 @@ impl SyncingChain { /// Peers currently syncing this chain. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] pub fn peers(&self) -> impl Iterator + '_ { - self.peers.iter().cloned() + self.peers.keys().cloned() } /// Progress in epochs made by the chain @@ -221,6 +256,12 @@ impl SyncingChain { request_id: Id, blocks: Vec>, ) -> ProcessingResult { + // Account for one more requests to this peer + // TODO(das): this code assumes that we do a single request per peer per RpcBlock + for peer in batch_peers.iter_unique_peers() { + *self.peers.entry(*peer).or_default() += 1; + } + // check if we have this batch let batch = match self.batches.get_mut(&batch_id) { None => { @@ -400,11 +441,6 @@ impl SyncingChain { self.request_batches(network)?; } } - } else if !self.good_peers_on_sampling_subnets(self.processing_target, network) { - // This is to handle the case where no batch was sent for the current processing - // target when there is no sampling peers available. This is a valid state and should not - // return an error. - return Ok(KeepChain); } else { return Err(RemoveChain::WrongChainState(format!( "Batch not found for current processing target {}", @@ -577,7 +613,7 @@ impl SyncingChain { "Batch failed to download. Dropping chain scoring peers" ); - for peer in self.peers.drain() { + for (peer, _) in self.peers.drain() { network.report_peer(peer, penalty, "faulty_chain"); } Err(RemoveChain::ChainFailed { @@ -842,7 +878,7 @@ impl SyncingChain { network: &mut SyncNetworkContext, peer_id: PeerId, ) -> ProcessingResult { - self.peers.insert(peer_id); + self.peers.insert(peer_id, <_>::default()); self.request_batches(network) } @@ -854,7 +890,6 @@ impl SyncingChain { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer_id: &PeerId, request_id: Id, err: RpcResponseError, ) -> ProcessingResult { @@ -869,7 +904,6 @@ impl SyncingChain { debug!( batch_epoch = %batch_id, batch_state = ?batch.state(), - %peer_id, %request_id, ?batch_state, "Batch not expecting block" @@ -880,12 +914,13 @@ impl SyncingChain { batch_epoch = %batch_id, batch_state = ?batch.state(), error = ?err, - %peer_id, %request_id, "Batch download error" ); if let BatchOperationOutcome::Failed { blacklist } = - batch.download_failed(Some(*peer_id))? + // TODO(das): Is it necessary for the batch to track failed peers? Can we make this + // mechanism compatible with PeerDAS and before PeerDAS? + batch.download_failed(None)? { return Err(RemoveChain::ChainFailed { blacklist, @@ -896,7 +931,6 @@ impl SyncingChain { } else { debug!( batch_epoch = %batch_id, - %peer_id, %request_id, batch_state, "Batch not found" @@ -937,6 +971,7 @@ impl SyncingChain { }, &synced_peers, &failed_peers, + &self.peers, ) { Ok(request_id) => { // inform the batch about the new request @@ -953,14 +988,7 @@ impl SyncingChain { return Ok(KeepChain); } Err(e) => match e { - // TODO(das): Handle the NoPeer case explicitly and don't drop the batch. For - // sync to work properly it must be okay to have "stalled" batches in - // AwaitingDownload state. Currently it will error with invalid state if - // that happens. Sync manager must periodicatlly prune stalled batches like - // we do for lookup sync. Then we can deprecate the redundant - // `good_peers_on_sampling_subnets` checks. - e - @ (RpcRequestSendError::NoPeer(_) | RpcRequestSendError::InternalError(_)) => { + RpcRequestSendError::InternalError(e) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway warn!(%batch_id, error = ?e, "batch_id" = %batch_id, %batch, "Could not send batch request"); // register the failed download and check if the batch can be retried @@ -1019,11 +1047,6 @@ impl SyncingChain { // check if we have the batch for our optimistic start. If not, request it first. // We wait for this batch before requesting any other batches. if let Some(epoch) = self.optimistic_start { - if !self.good_peers_on_sampling_subnets(epoch, network) { - debug!("Waiting for peers to be available on sampling column subnets"); - return Ok(KeepChain); - } - if let Entry::Vacant(entry) = self.batches.entry(epoch) { let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH); entry.insert(optimistic_batch); @@ -1046,35 +1069,6 @@ impl SyncingChain { Ok(KeepChain) } - /// Checks all sampling column subnets for peers. Returns `true` if there is at least one peer in - /// every sampling column subnet. - fn good_peers_on_sampling_subnets( - &self, - epoch: Epoch, - network: &SyncNetworkContext, - ) -> bool { - if network.chain.spec.is_peer_das_enabled_for_epoch(epoch) { - // Require peers on all sampling column subnets before sending batches - let peers_on_all_custody_subnets = network - .network_globals() - .sampling_subnets - .iter() - .all(|subnet_id| { - let peer_count = network - .network_globals() - .peers - .read() - .good_custody_subnet_peer(*subnet_id) - .count(); - - peer_count > 0 - }); - peers_on_all_custody_subnets - } else { - true - } - } - /// Creates the next required batch from the chain. If there are no more batches required, /// `false` is returned. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] @@ -1107,15 +1101,6 @@ impl SyncingChain { return None; } - // don't send batch requests until we have peers on sampling subnets - // TODO(das): this is a workaround to avoid sending out excessive block requests because - // block and data column requests are currently coupled. This can be removed once we find a - // way to decouple the requests and do retries individually, see issue #6258. - if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) { - debug!("Waiting for peers to be available on custody column subnets"); - return None; - } - // If no batch needs a retry, attempt to send the batch of the next epoch to download let next_batch_id = self.to_be_downloaded; // this batch could have been included already being an optimistic batch diff --git a/beacon_node/network/src/sync/range_sync/chain_collection.rs b/beacon_node/network/src/sync/range_sync/chain_collection.rs index 9f500c61e0b..454f7c02d15 100644 --- a/beacon_node/network/src/sync/range_sync/chain_collection.rs +++ b/beacon_node/network/src/sync/range_sync/chain_collection.rs @@ -54,6 +54,13 @@ pub struct ChainCollection { } impl ChainCollection { + #[cfg(test)] + pub(crate) fn iter(&self) -> impl Iterator> { + self.finalized_chains + .values() + .chain(self.head_chains.values()) + } + pub fn new(beacon_chain: Arc>) -> Self { ChainCollection { beacon_chain, diff --git a/beacon_node/network/src/sync/range_sync/mod.rs b/beacon_node/network/src/sync/range_sync/mod.rs index 1218e0cd09c..e9fb0219c45 100644 --- a/beacon_node/network/src/sync/range_sync/mod.rs +++ b/beacon_node/network/src/sync/range_sync/mod.rs @@ -9,10 +9,9 @@ mod sync_type; pub use batch::{ BatchConfig, BatchInfo, BatchOperationOutcome, BatchPeers, BatchProcessingResult, BatchState, - ByRangeRequestType, }; -pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; #[cfg(test)] -pub use chain_collection::SyncChainStatus; +pub use chain::BatchStateSummary; +pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; pub use range::RangeSync; pub use sync_type::RangeSyncType; diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index e2c076484a5..473e2066cee 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -39,6 +39,8 @@ //! Each chain is downloaded in batches of blocks. The batched blocks are processed sequentially //! and further batches are requested as current blocks are being processed. +#[cfg(test)] +use super::chain::BatchStateSummary; use super::chain::{BatchId, ChainId, RemoveChain, SyncingChain}; use super::chain_collection::{ChainCollection, SyncChainStatus}; use super::sync_type::RangeSyncType; @@ -100,10 +102,23 @@ where } #[cfg(test)] - pub(crate) fn __failed_chains(&mut self) -> Vec { + pub(crate) fn failed_chains(&mut self) -> Vec { self.failed_chains.keys().copied().collect() } + #[cfg(test)] + pub(crate) fn batches_state(&self) -> Vec<(ChainId, BatchId, BatchStateSummary)> { + self.chains + .iter() + .flat_map(|chain| { + chain + .batches_state() + .into_iter() + .map(|(batch_id, state)| (chain.id(), batch_id, state)) + }) + .collect() + } + #[instrument(parent = None, level = "info", fields(component = "range_sync"), @@ -344,7 +359,6 @@ where pub fn inject_error( &mut self, network: &mut SyncNetworkContext, - peer_id: PeerId, batch_id: BatchId, chain_id: ChainId, request_id: Id, @@ -352,7 +366,7 @@ where ) { // check that this request is pending match self.chains.call_by_id(chain_id, |chain| { - chain.inject_error(network, batch_id, &peer_id, request_id, err) + chain.inject_error(network, batch_id, request_id, err) }) { Ok((removed_chain, sync_type)) => { if let Some((removed_chain, remove_reason)) = removed_chain { diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index 5863091cf0e..3e83605a276 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -35,7 +35,7 @@ use lighthouse_network::{ SamplingRequester, SingleLookupReqId, SyncRequestId, }, types::SyncState, - NetworkConfig, NetworkGlobals, PeerId, + NetworkConfig, NetworkGlobals, PeerId, SyncInfo, }; use slot_clock::{SlotClock, TestingSlotClock}; use tokio::sync::mpsc; @@ -53,8 +53,21 @@ const SAMPLING_REQUIRED_SUCCESSES: usize = 2; type DCByRootIds = Vec; type DCByRootId = (SyncRequestId, Vec); +pub enum PeersConfig { + SupernodeAndRandom, + SupernodeOnly, +} + impl TestRig { pub fn test_setup() -> Self { + Self::test_setup_with_options(false) + } + + pub fn test_setup_as_supernode() -> Self { + Self::test_setup_with_options(true) + } + + fn test_setup_with_options(is_supernode: bool) -> Self { // Use `fork_from_env` logic to set correct fork epochs let spec = test_spec::(); @@ -83,10 +96,11 @@ impl TestRig { // TODO(das): make the generation of the ENR use the deterministic rng to have consistent // column assignments let network_config = Arc::new(NetworkConfig::default()); - let globals = Arc::new(NetworkGlobals::new_test_globals( + let globals = Arc::new(NetworkGlobals::new_test_globals_as_supernode( Vec::new(), network_config, chain.spec.clone(), + is_supernode, )); let (beacon_processor, beacon_processor_rx) = NetworkBeaconProcessor::null_for_testing( globals, @@ -113,6 +127,7 @@ impl TestRig { network_rx, network_rx_queue: vec![], sync_rx, + sent_blocks_by_range: <_>::default(), rng, network_globals: beacon_processor.network_globals.clone(), sync_manager: SyncManager::new( @@ -244,8 +259,8 @@ impl TestRig { self.sync_manager.active_parent_lookups().len() } - fn active_range_sync_chain(&self) -> (RangeSyncType, Slot, Slot) { - self.sync_manager.get_range_sync_chains().unwrap().unwrap() + fn active_range_sync_chain(&mut self) -> (RangeSyncType, Slot, Slot) { + self.sync_manager.range_sync().state().unwrap().unwrap() } fn assert_single_lookups_count(&self, count: usize) { @@ -355,29 +370,63 @@ impl TestRig { self.expect_empty_network(); } - pub fn new_connected_peer(&mut self) -> PeerId { + // Don't make pub, use `add_connected_peer_testing_only` + fn new_connected_peer(&mut self) -> PeerId { + self.add_connected_peer_testing_only(false) + } + + // Don't make pub, use `add_connected_peer_testing_only` + fn new_connected_supernode_peer(&mut self) -> PeerId { + self.add_connected_peer_testing_only(true) + } + + pub fn add_connected_peer_testing_only(&mut self, supernode: bool) -> PeerId { let key = self.determinstic_key(); let peer_id = self .network_globals .peers .write() - .__add_connected_peer_testing_only(false, &self.harness.spec, key); - self.log(&format!("Added new peer for testing {peer_id:?}")); + .__add_connected_peer_testing_only(supernode, &self.harness.spec, key); + let mut peer_custody_subnets = self + .network_globals + .peers + .read() + .peer_info(&peer_id) + .expect("peer was just added") + .custody_subnets_iter() + .map(|subnet| **subnet) + .collect::>(); + peer_custody_subnets.sort_unstable(); + self.log(&format!( + "Added new peer for testing {peer_id:?} custody subnets {peer_custody_subnets:?}" + )); peer_id } - pub fn new_connected_supernode_peer(&mut self) -> PeerId { - let key = self.determinstic_key(); - self.network_globals - .peers - .write() - .__add_connected_peer_testing_only(true, &self.harness.spec, key) + pub fn add_sync_peer(&mut self, supernode: bool, remote_info: SyncInfo) -> PeerId { + let peer_id = self.add_connected_peer_testing_only(supernode); + self.send_sync_message(SyncMessage::AddPeer(peer_id, remote_info)); + peer_id } fn determinstic_key(&mut self) -> CombinedKey { k256::ecdsa::SigningKey::random(&mut self.rng).into() } + pub fn add_sync_peers(&mut self, config: PeersConfig, remote_info: SyncInfo) { + match config { + PeersConfig::SupernodeAndRandom => { + for _ in 0..100 { + self.add_sync_peer(false, remote_info.clone()); + } + self.add_sync_peer(true, remote_info); + } + PeersConfig::SupernodeOnly => { + self.add_sync_peer(true, remote_info); + } + } + } + pub fn new_connected_peers_for_peerdas(&mut self) { // Enough sampling peers with few columns for _ in 0..100 { @@ -840,6 +889,19 @@ impl TestRig { } } + // Find, not pop + pub fn filter_received_network_events) -> Option>( + &mut self, + predicate_transform: F, + ) -> Vec { + self.drain_network_rx(); + + self.network_rx_queue + .iter() + .filter_map(predicate_transform) + .collect() + } + pub fn pop_received_processor_event) -> Option>( &mut self, predicate_transform: F, @@ -1088,6 +1150,21 @@ impl TestRig { } } + pub fn expect_no_penalty_for_anyone(&mut self) { + self.drain_network_rx(); + let downscore_events = self + .network_rx_queue + .iter() + .filter_map(|ev| match ev { + NetworkMessage::ReportPeer { peer_id, msg, .. } => Some((peer_id, msg)), + _ => None, + }) + .collect::>(); + if !downscore_events.is_empty() { + panic!("Expected no downscoring events but found: {downscore_events:?}"); + } + } + #[track_caller] fn expect_parent_chain_process(&mut self) { match self.beacon_processor_rx.try_recv() { @@ -1123,6 +1200,25 @@ impl TestRig { } } + #[track_caller] + pub fn expect_penalties(&mut self, expected_penalty_msg: &'static str) { + let all_penalties = self.filter_received_network_events(|ev| match ev { + NetworkMessage::ReportPeer { peer_id, msg, .. } => Some((*peer_id, *msg)), + _ => None, + }); + if all_penalties + .iter() + .any(|(_, msg)| *msg != expected_penalty_msg) + { + panic!( + "Expected penalties only of {expected_penalty_msg}, but found {all_penalties:?}" + ); + } + self.log(&format!( + "Found expected penalties {expected_penalty_msg}: {all_penalties:?}" + )); + } + #[track_caller] pub fn expect_penalty(&mut self, peer_id: PeerId, expect_penalty_msg: &'static str) { let penalty_msg = self diff --git a/beacon_node/network/src/sync/tests/mod.rs b/beacon_node/network/src/sync/tests/mod.rs index ec24ddb036a..a09313c5021 100644 --- a/beacon_node/network/src/sync/tests/mod.rs +++ b/beacon_node/network/src/sync/tests/mod.rs @@ -6,13 +6,17 @@ use beacon_chain::builder::Witness; use beacon_chain::eth1_chain::CachingEth1Backend; use beacon_chain::test_utils::{BeaconChainHarness, EphemeralHarnessType}; use beacon_processor::WorkEvent; +use lighthouse_network::service::api_types::ComponentsByRangeRequestId; use lighthouse_network::NetworkGlobals; use rand_chacha::ChaCha20Rng; use slot_clock::ManualSlotClock; +use std::collections::HashMap; use std::sync::Arc; use store::MemoryStore; use tokio::sync::mpsc; -use types::{ChainSpec, ForkName, MinimalEthSpec as E}; +use types::{ChainSpec, ForkName, MinimalEthSpec as E, SignedBeaconBlock}; + +pub use lookups::PeersConfig; mod lookups; mod range; @@ -64,4 +68,7 @@ struct TestRig { rng: ChaCha20Rng, fork_name: ForkName, spec: Arc, + + // Cache of sent blocks for PeerDAS responses + sent_blocks_by_range: HashMap>>>, } diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 06dca355e53..c82e4f97769 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -2,27 +2,33 @@ use super::*; use crate::network_beacon_processor::ChainSegmentProcessId; use crate::status::ToStatusMessage; use crate::sync::manager::SLOT_IMPORT_TOLERANCE; -use crate::sync::network_context::RangeRequestId; -use crate::sync::range_sync::RangeSyncType; -use crate::sync::SyncMessage; +use crate::sync::network_context::{BlockComponentsByRangeRequestStep, RangeRequestId}; +use crate::sync::range_sync::{BatchId, BatchStateSummary, RangeSyncType}; +use crate::sync::{ChainId, SyncMessage}; use beacon_chain::data_column_verification::CustodyDataColumn; -use beacon_chain::test_utils::{AttestationStrategy, BlockStrategy}; +use beacon_chain::test_utils::{test_spec, AttestationStrategy, BlockStrategy}; use beacon_chain::{block_verification_types::RpcBlock, EngineState, NotifyExecutionLayer}; use beacon_processor::WorkType; +use lighthouse_network::discovery::{peer_id_to_node_id, CombinedKey}; use lighthouse_network::rpc::methods::{ BlobsByRangeRequest, DataColumnsByRangeRequest, OldBlocksByRangeRequest, - OldBlocksByRangeRequestV2, }; use lighthouse_network::rpc::{RequestType, StatusMessage}; use lighthouse_network::service::api_types::{ - AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, DataColumnsByRangeRequestId, - SyncRequestId, + AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, + DataColumnsByRangeRequestId, SyncRequestId, }; -use lighthouse_network::{PeerId, SyncInfo}; +use lighthouse_network::types::SyncState; +use lighthouse_network::{Enr, EnrExt, PeerId, SyncInfo}; +use rand::SeedableRng; +use rand_chacha::ChaCha20Rng; +use std::collections::HashSet; use std::time::Duration; +use types::data_column_custody_group::compute_subnets_for_node; use types::{ - BlobSidecarList, BlockImportSource, Epoch, EthSpec, Hash256, MinimalEthSpec as E, - SignedBeaconBlock, SignedBeaconBlockHash, Slot, + BeaconBlock, BlobSidecarList, BlockImportSource, ColumnIndex, DataColumnSidecar, + DataColumnSubnetId, Epoch, EthSpec, Hash256, KzgCommitment, MinimalEthSpec as E, Signature, + SignedBeaconBlock, SignedBeaconBlockHash, Slot, VariableList, }; const D: Duration = Duration::new(0, 0); @@ -34,10 +40,43 @@ pub(crate) enum DataSidecars { enum ByRangeDataRequestIds { PreDeneb, - PrePeerDAS(BlobsByRangeRequestId, PeerId), - PostPeerDAS(Vec<(DataColumnsByRangeRequestId, PeerId)>), + PrePeerDAS(BlobsByRangeRequestId, PeerId, BlobsByRangeRequest), + PostPeerDAS( + Vec<( + DataColumnsByRangeRequestId, + PeerId, + DataColumnsByRangeRequest, + )>, + ), } +impl ByRangeDataRequestIds { + fn peer(&self) -> PeerId { + match self { + Self::PreDeneb => panic!("no requests PreDeneb"), + Self::PrePeerDAS(_, peer, _) => *peer, + Self::PostPeerDAS(reqs) => { + if reqs.len() != 1 { + panic!("Should have 1 PostPeerDAS request"); + } + reqs.first().expect("no PostPeerDAS requests").1 + } + } + } +} + +struct Config { + peers: PeersConfig, +} + +type BlocksByRangeRequestData = (BlocksByRangeRequestId, PeerId, OldBlocksByRangeRequest); + +type DataColumnsByRangeRequestData = ( + DataColumnsByRangeRequestId, + PeerId, + DataColumnsByRangeRequest, +); + /// Sync tests are usually written in the form: /// - Do some action /// - Expect a request to be sent @@ -46,10 +85,11 @@ enum ByRangeDataRequestIds { /// To make writting tests succint, the machinery in this testing rig automatically identifies /// _which_ request to complete. Picking the right request is critical for tests to pass, so this /// filter allows better expressivity on the criteria to identify the right request. -#[derive(Default, Debug, Clone)] +#[derive(Default, Debug, Clone, Copy)] struct RequestFilter { peer: Option, epoch: Option, + column_index: Option, } impl RequestFilter { @@ -62,13 +102,117 @@ impl RequestFilter { self.epoch = Some(epoch); self } + + fn column_index(mut self, index: u64) -> Self { + self.column_index = Some(index); + self + } + + fn blocks_by_range_requests( + &self, + ev: &NetworkMessage, + ) -> Option { + match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::BlocksByRange(req), + app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRange(id)), + } if self.matches_blocks_by_range(peer_id, req) => Some((*id, *peer_id, req.clone())), + _ => None, + } + } + + fn data_columns_by_range_requests( + &self, + ev: &NetworkMessage, + ) -> Option { + match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::DataColumnsByRange(req), + app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), + } if self.matches_data_columns_by_range(peer_id, req) => { + Some((*id, *peer_id, req.clone())) + } + _ => None, + } + } + + fn matches_blocks_by_range(&self, peer: &PeerId, req: &OldBlocksByRangeRequest) -> bool { + self.matches_common(peer, *req.start_slot()) + } + + fn matches_blobs_by_range(&self, peer: &PeerId, req: &BlobsByRangeRequest) -> bool { + self.matches_common(peer, req.start_slot) + } + + fn matches_data_columns_by_range( + &self, + peer: &PeerId, + req: &DataColumnsByRangeRequest, + ) -> bool { + if let Some(index) = self.column_index { + if !req.columns.contains(&index) { + return false; + } + } + self.matches_common(peer, req.start_slot) + } + + fn matches_common(&self, peer: &PeerId, start_slot: u64) -> bool { + if let Some(expected_epoch) = self.epoch { + let epoch = Slot::new(start_slot).epoch(E::slots_per_epoch()).as_u64(); + if epoch != expected_epoch { + return false; + } + } + if let Some(expected_peer) = self.peer { + if *peer != expected_peer { + return false; + } + } + true + } } fn filter() -> RequestFilter { RequestFilter::default() } +/// Instruct the testing rig how to complete requests for _by_range requests +#[derive(Debug, Clone, Copy)] +struct CompleteConfig { + block_count: usize, + with_data: bool, + custody_failure_at_index: Option, +} + +impl CompleteConfig { + // TODO(das): add tests where blocks don't have data + + fn custody_failure_at_index(mut self, index: u64) -> Self { + self.custody_failure_at_index = Some(index); + self + } +} + +fn complete() -> CompleteConfig { + CompleteConfig { + block_count: 1, + with_data: true, + custody_failure_at_index: None, + } +} + impl TestRig { + fn our_custody_indices(&self) -> Vec { + self.network_globals + .sampling_columns + .iter() + .copied() + .collect() + } + /// Produce a head peer with an advanced head fn add_head_peer(&mut self) -> PeerId { self.add_head_peer_with_root(Hash256::random()) @@ -77,7 +221,7 @@ impl TestRig { /// Produce a head peer with an advanced head fn add_head_peer_with_root(&mut self, head_root: Hash256) -> PeerId { let local_info = self.local_info(); - self.add_random_peer(SyncInfo { + self.add_connected_sync_random_peer(SyncInfo { head_root, head_slot: local_info.head_slot + 1 + Slot::new(SLOT_IMPORT_TOLERANCE as u64), ..local_info @@ -93,7 +237,7 @@ impl TestRig { fn add_finalized_peer_with_root(&mut self, finalized_root: Hash256) -> PeerId { let local_info = self.local_info(); let finalized_epoch = local_info.finalized_epoch + 2; - self.add_random_peer(SyncInfo { + self.add_connected_sync_random_peer(SyncInfo { finalized_epoch, finalized_root, head_slot: finalized_epoch.start_slot(E::slots_per_epoch()), @@ -128,37 +272,22 @@ impl TestRig { } } - fn add_random_peer_not_supernode(&mut self, remote_info: SyncInfo) -> PeerId { - let peer_id = self.new_connected_peer(); - self.send_sync_message(SyncMessage::AddPeer(peer_id, remote_info)); - peer_id + fn add_connected_sync_peer_not_supernode(&mut self, remote_info: SyncInfo) -> PeerId { + self.add_sync_peer(false, remote_info) } - fn add_random_peer(&mut self, remote_info: SyncInfo) -> PeerId { + fn add_connected_sync_random_peer(&mut self, remote_info: SyncInfo) -> PeerId { // Create valid peer known to network globals // TODO(fulu): Using supernode peers to ensure we have peer across all column // subnets for syncing. Should add tests connecting to full node peers. - let peer_id = self.new_connected_supernode_peer(); - // Send peer to sync - self.send_sync_message(SyncMessage::AddPeer(peer_id, remote_info)); - peer_id + self.add_sync_peer(true, remote_info) } - fn add_random_peers(&mut self, remote_info: SyncInfo, count: usize) { - for _ in 0..count { - let peer = self.new_connected_peer(); - self.add_peer(peer, remote_info.clone()); - } - } - - fn add_peer(&mut self, peer: PeerId, remote_info: SyncInfo) { - self.send_sync_message(SyncMessage::AddPeer(peer, remote_info)); - } - - fn assert_state(&self, state: RangeSyncType) { + fn assert_state(&mut self, state: RangeSyncType) { assert_eq!( self.sync_manager - .range_sync_state() + .range_sync() + .state() .expect("State is ok") .expect("Range should be syncing, there are no chains") .0, @@ -167,15 +296,28 @@ impl TestRig { ); } - fn assert_no_chains_exist(&self) { - if let Some(chain) = self.sync_manager.get_range_sync_chains().unwrap() { + fn get_sync_state(&mut self) -> SyncState { + self.sync_manager.network().network_globals().sync_state() + } + + fn get_batch_states(&mut self) -> Vec<(ChainId, BatchId, BatchStateSummary)> { + self.sync_manager.range_sync().batches_state() + } + + fn assert_sync_state(&mut self) { + let current_state = self.sync_manager.network().network_globals().sync_state(); + panic!("{:?}", current_state); + } + + fn assert_no_chains_exist(&mut self) { + if let Some(chain) = self.sync_manager.range_sync().state().unwrap() { panic!("There still exists a chain {chain:?}"); } } fn assert_no_failed_chains(&mut self) { assert_eq!( - self.sync_manager.__range_failed_chains(), + self.sync_manager.range_sync().failed_chains(), Vec::::new(), "Expected no failed chains" ) @@ -191,110 +333,359 @@ impl TestRig { } } + fn expect_blocks_by_range_requests(&mut self, request_filter: RequestFilter) { + let events = + self.filter_received_network_events(|ev| request_filter.blocks_by_range_requests(ev)); + if events.is_empty() { + panic!("Expected to find blocks_by_range requests {request_filter:?}") + } + } + + fn expect_no_data_columns_by_range_requests(&mut self, request_filter: RequestFilter) { + let events = self + .filter_received_network_events(|ev| request_filter.data_columns_by_range_requests(ev)); + if !events.is_empty() { + panic!("Expected to not find data_columns_by_range requests {request_filter:?} by found {events:?}") + } + } + + fn expect_active_block_components_by_range_request_on_custody_step(&mut self) { + let requests = self + .sync_manager + .network() + .active_block_components_by_range_requests(); + if requests.is_empty() { + panic!("No active block_components_by_range requests"); + } + for (id, step) in requests { + if !matches!(step, BlockComponentsByRangeRequestStep::CustodyRequest) { + panic!("block_components_by_range request {id} is not on CustodyRequest step: {step:?}"); + } + } + } + + fn expect_no_active_block_components_by_range_requests(&mut self) { + let requests = self + .sync_manager + .network() + .active_block_components_by_range_requests(); + if !requests.is_empty() { + panic!("Still active block_components_by_range requests {requests:?}"); + } + } + + fn expect_no_active_rpc_requests(&mut self) { + let requests = self + .sync_manager + .network() + .active_requests() + .collect::>(); + if !requests.is_empty() { + panic!("There are still active RPC requests {requests:?}"); + } + } + + fn expect_all_batches_in_state(&mut self, states: &[BatchStateSummary]) { + let batches = self.get_batch_states(); + if batches.is_empty() { + panic!("no batches"); + } + for batch in &batches { + if !states.contains(&batch.2) { + panic!("batch {batch:?} not in state {states:?}. Batches: {batches:?}"); + } + } + } + + fn expect_all_batches_downloading(&mut self) { + self.expect_all_batches_in_state(&[BatchStateSummary::Downloading]); + } + + fn expect_all_batches_processing_or_awaiting(&mut self) { + self.expect_all_batches_in_state(&[ + BatchStateSummary::Processing, + BatchStateSummary::AwaitingProcessing, + ]); + } + fn update_execution_engine_state(&mut self, state: EngineState) { self.log(&format!("execution engine state updated: {state:?}")); self.sync_manager.update_execution_engine_state(state); } - fn find_blocks_by_range_request( - &mut self, - request_filter: RequestFilter, - ) -> ((BlocksByRangeRequestId, PeerId), ByRangeDataRequestIds) { - let filter_f = |peer: PeerId, start_slot: u64| { - if let Some(expected_epoch) = request_filter.epoch { - let epoch = Slot::new(start_slot).epoch(E::slots_per_epoch()).as_u64(); - if epoch != expected_epoch { - return false; - } + fn zero_block_at_slot(&mut self, slot: Slot, with_data: bool) -> Arc> { + let mut block = BeaconBlock::empty(&self.spec); + if with_data { + if let Ok(blob_kzg_commitments) = block.body_mut().blob_kzg_commitments_mut() { + blob_kzg_commitments + .push(KzgCommitment([0; 48])) + .expect("pushed to empty kzg commitments"); } - if let Some(expected_peer) = request_filter.peer { - if peer != expected_peer { - return false; - } - } - true - }; + } + *block.slot_mut() = slot; + Arc::new(SignedBeaconBlock::from_block(block, Signature::empty())) + } - let block_req = self - .pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id, - request: - RequestType::BlocksByRange(OldBlocksByRangeRequest::V2( - OldBlocksByRangeRequestV2 { start_slot, .. }, - )), - app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRange(id)), - } if filter_f(*peer_id, *start_slot) => Some((*id, *peer_id)), - _ => None, - }) + fn last_sent_blocks_by_range( + &mut self, + id: ComponentsByRangeRequestId, + ) -> Vec>> { + self.sent_blocks_by_range + .get(&id) + .cloned() + .unwrap_or_else(|| panic!("No blocks for ComponentsByRangeRequestId {id}")) + } + + fn send_blocks_by_range_response( + &mut self, + req_id: BlocksByRangeRequestId, + peer_id: PeerId, + blocks: &[Arc>], + ) { + let slots = blocks.iter().map(|block| block.slot()).collect::>(); + self.log(&format!( + "Completing BlocksByRange request {req_id} to {peer_id} with blocks {slots:?}" + )); + + for block in blocks { + self.send_sync_message(SyncMessage::RpcBlock { + sync_request_id: SyncRequestId::BlocksByRange(req_id), + peer_id, + beacon_block: Some(block.clone()), + seen_timestamp: D, + }); + } + self.send_sync_message(SyncMessage::RpcBlock { + sync_request_id: SyncRequestId::BlocksByRange(req_id), + peer_id, + beacon_block: None, + seen_timestamp: D, + }); + + if self + .sent_blocks_by_range + .insert(req_id.parent_request_id, blocks.to_vec()) + .is_some() + { + panic!("Sent two blocks_by_range requests in the same epoch. We need better tracking"); + } + } + + fn send_data_columns_by_range_response( + &mut self, + id: DataColumnsByRangeRequestId, + peer_id: PeerId, + data_columns: &[Arc>], + ) { + let mut ids = data_columns + .iter() + .map(|d| (d.slot().as_u64(), d.index)) + .collect::>(); + ids.sort_unstable(); + self.log(&format!( + "Completing DataColumnsByRange request {id} to {peer_id} with data_columns {ids:?}" + )); + + for data_column in data_columns { + self.send_sync_message(SyncMessage::RpcDataColumn { + sync_request_id: SyncRequestId::DataColumnsByRange(id), + peer_id, + data_column: Some(data_column.clone()), + seen_timestamp: D, + }); + } + self.send_sync_message(SyncMessage::RpcDataColumn { + sync_request_id: SyncRequestId::DataColumnsByRange(id), + peer_id, + data_column: None, + seen_timestamp: D, + }); + } + + fn pop_blocks_by_range_request( + &mut self, + request_filter: RequestFilter, + ) -> (BlocksByRangeRequestId, PeerId, OldBlocksByRangeRequest) { + self.pop_received_network_event(|ev| request_filter.blocks_by_range_requests(ev)) .unwrap_or_else(|e| { panic!("Should have a BlocksByRange request, filter {request_filter:?}: {e:?}") - }); + }) + } - let by_range_data_requests = if self.after_fulu() { - let mut data_columns_requests = vec![]; - while let Ok(data_columns_request) = self.pop_received_network_event(|ev| match ev { - NetworkMessage::SendRequest { - peer_id, - request: - RequestType::DataColumnsByRange(DataColumnsByRangeRequest { - start_slot, .. - }), - app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), - } if filter_f(*peer_id, *start_slot) => Some((*id, *peer_id)), - _ => None, - }) { - data_columns_requests.push(data_columns_request); - } + fn pop_data_columns_by_range_requests( + &mut self, + request_filter: RequestFilter, + ) -> Vec<( + DataColumnsByRangeRequestId, + PeerId, + DataColumnsByRangeRequest, + )> { + let mut data_columns_requests = vec![]; + while let Ok(data_columns_request) = + self.pop_received_network_event(|ev| request_filter.data_columns_by_range_requests(ev)) + { + data_columns_requests.push(data_columns_request); + } + data_columns_requests + } + + fn find_data_by_range_request( + &mut self, + request_filter: RequestFilter, + ) -> ByRangeDataRequestIds { + if self.after_fulu() { + let data_columns_requests = self.pop_data_columns_by_range_requests(request_filter); if data_columns_requests.is_empty() { panic!("Found zero DataColumnsByRange requests, filter {request_filter:?}"); } ByRangeDataRequestIds::PostPeerDAS(data_columns_requests) } else if self.after_deneb() { - let (id, peer) = self + let (id, peer, req) = self .pop_received_network_event(|ev| match ev { NetworkMessage::SendRequest { peer_id, - request: RequestType::BlobsByRange(BlobsByRangeRequest { start_slot, .. }), + request: RequestType::BlobsByRange(req), app_request_id: AppRequestId::Sync(SyncRequestId::BlobsByRange(id)), - } if filter_f(*peer_id, *start_slot) => Some((*id, *peer_id)), + } if request_filter.matches_blobs_by_range(peer_id, req) => { + Some((*id, *peer_id, req.clone())) + } _ => None, }) .unwrap_or_else(|e| { panic!("Should have a blobs by range request, filter {request_filter:?}: {e:?}") }); - ByRangeDataRequestIds::PrePeerDAS(id, peer) + ByRangeDataRequestIds::PrePeerDAS(id, peer, req) } else { ByRangeDataRequestIds::PreDeneb - }; + } + } - (block_req, by_range_data_requests) + fn find_and_complete_block_components_by_range_request( + &mut self, + request_filter: RequestFilter, + complete_config: CompleteConfig, + ) -> RangeRequestId { + let id = self.find_and_complete_blocks_by_range_request(request_filter, complete_config); + self.find_and_complete_data_by_range_request(request_filter, complete_config); + id } fn find_and_complete_blocks_by_range_request( &mut self, request_filter: RequestFilter, + complete_config: CompleteConfig, ) -> RangeRequestId { - let ((blocks_req_id, block_peer), by_range_data_request_ids) = - self.find_blocks_by_range_request(request_filter); + let (blocks_req_id, block_peer, blocks_req) = + self.pop_blocks_by_range_request(request_filter); - // Complete the request with a single stream termination - self.log(&format!( - "Completing BlocksByRange request {blocks_req_id:?} with empty stream" - )); - self.send_sync_message(SyncMessage::RpcBlock { - sync_request_id: SyncRequestId::BlocksByRange(blocks_req_id), - peer_id: block_peer, - beacon_block: None, - seen_timestamp: D, - }); + let start_slot = Slot::new(*blocks_req.start_slot()); + let blocks = (0..complete_config.block_count) + .map(|i| { + self.zero_block_at_slot(start_slot + Slot::new(i as u64), complete_config.with_data) + }) + .collect::>(); + self.send_blocks_by_range_response(blocks_req_id, block_peer, &blocks); + + blocks_req_id.parent_request_id.requester + } + + fn complete_blocks_by_range_request( + &mut self, + request: BlocksByRangeRequestData, + complete_config: CompleteConfig, + ) -> RangeRequestId { + let (blocks_req_id, block_peer, blocks_req) = request; + let start_slot = Slot::new(*blocks_req.start_slot()); + let blocks = (0..complete_config.block_count) + .map(|i| { + self.zero_block_at_slot(start_slot + Slot::new(i as u64), complete_config.with_data) + }) + .collect::>(); + self.send_blocks_by_range_response(blocks_req_id, block_peer, &blocks); + + blocks_req_id.parent_request_id.requester + } + + fn complete_data_columns_by_range_request( + &mut self, + (id, peer_id, req): DataColumnsByRangeRequestData, + complete_config: CompleteConfig, + ) { + // To reply with a valid DataColumnsByRange we need to construct + // DataColumnsByRange for the block root that we requested the block peer, plus + // figure out which exact columns we requested this peer + + let components_by_range_req_id = id.parent_request_id.parent_request_id; + let blocks = self.last_sent_blocks_by_range(components_by_range_req_id); + + let data_columns = blocks + .iter() + .flat_map(|block| { + let kzg_commitments_inclusion_proof = block + .message() + .body() + .kzg_commitments_merkle_proof() + .unwrap(); + let kzg_commitments = block + .message() + .body() + .blob_kzg_commitments() + .unwrap() + .clone(); + let signed_block_header = block.signed_block_header(); + + req.columns.iter().filter_map(move |index| { + // Skip column generation if index is marked as failure + if complete_config.custody_failure_at_index == Some(*index) { + return None; + } + + // We need to produce a DataColumn with valid inclusion proof, but can + // be with random KZG proof and data as we won't send it for processing + Some(Arc::new(DataColumnSidecar { + index: *index, + column: VariableList::empty(), + kzg_commitments: kzg_commitments.clone(), + kzg_proofs: VariableList::from(vec![]), + signed_block_header: signed_block_header.clone(), + kzg_commitments_inclusion_proof: kzg_commitments_inclusion_proof.clone(), + })) + }) + }) + .collect::>(); + + // Need to log here because I can't capture &mut self inside the columns iter + if !blocks.is_empty() { + if let Some(index) = complete_config.custody_failure_at_index { + self.log(&format!( + "Forced custody failure at request {id} for peer {peer_id} index {index:?}" + )); + } + } + + self.send_data_columns_by_range_response(id, peer_id, &data_columns); + } + fn find_and_complete_data_by_range_request( + &mut self, + request_filter: RequestFilter, + complete_config: CompleteConfig, + ) { + let by_range_data_request_ids = self.find_data_by_range_request(request_filter); + self.complete_data_by_range_request(by_range_data_request_ids, complete_config); + } + + fn complete_data_by_range_request( + &mut self, + by_range_data_request_ids: ByRangeDataRequestIds, + complete_config: CompleteConfig, + ) { match by_range_data_request_ids { ByRangeDataRequestIds::PreDeneb => {} - ByRangeDataRequestIds::PrePeerDAS(id, peer_id) => { + ByRangeDataRequestIds::PrePeerDAS(id, peer_id, req) => { // Complete the request with a single stream termination self.log(&format!( - "Completing BlobsByRange request {id:?} with empty stream" + "Completing BlobsByRange request {id} {req:?} with empty stream" )); self.send_sync_message(SyncMessage::RpcBlob { sync_request_id: SyncRequestId::BlobsByRange(id), @@ -305,21 +696,89 @@ impl TestRig { } ByRangeDataRequestIds::PostPeerDAS(data_column_req_ids) => { // Complete the request with a single stream termination - for (id, peer_id) in data_column_req_ids { - self.log(&format!( - "Completing DataColumnsByRange request {id:?} with empty stream" - )); - self.send_sync_message(SyncMessage::RpcDataColumn { - sync_request_id: SyncRequestId::DataColumnsByRange(id), - peer_id, - data_column: None, - seen_timestamp: D, - }); + for (id, peer_id, req) in data_column_req_ids { + // To reply with a valid DataColumnsByRange we need to construct + // DataColumnsByRange for the block root that we requested the block peer, plus + // figure out which exact columns we requested this peer + + let components_by_range_req_id = id.parent_request_id.parent_request_id; + let blocks = self.last_sent_blocks_by_range(components_by_range_req_id); + + let data_columns = blocks + .iter() + .flat_map(|block| { + let kzg_commitments_inclusion_proof = block + .message() + .body() + .kzg_commitments_merkle_proof() + .unwrap(); + let kzg_commitments = block + .message() + .body() + .blob_kzg_commitments() + .unwrap() + .clone(); + let signed_block_header = block.signed_block_header(); + + req.columns.iter().filter_map(move |index| { + // Skip column generation if index is marked as failure + if complete_config.custody_failure_at_index == Some(*index) { + return None; + } + + // We need to produce a DataColumn with valid inclusion proof, but can + // be with random KZG proof and data as we won't send it for processing + Some(Arc::new(DataColumnSidecar { + index: *index, + column: VariableList::empty(), + kzg_commitments: kzg_commitments.clone(), + kzg_proofs: VariableList::from(vec![]), + signed_block_header: signed_block_header.clone(), + kzg_commitments_inclusion_proof: + kzg_commitments_inclusion_proof.clone(), + })) + }) + }) + .collect::>(); + + // Need to log here because I can't capture &mut self inside the columns iter + if !blocks.is_empty() { + if let Some(index) = complete_config.custody_failure_at_index { + self.log(&format!("Forced custody failure at request {id} for peer {peer_id} index {index:?}")); + } + } + + self.send_data_columns_by_range_response(id, peer_id, &data_columns); } } } + } - blocks_req_id.parent_request_id.requester + fn progress_until_no_events( + &mut self, + request_filter: RequestFilter, + complete_config: CompleteConfig, + ) { + loop { + if let Ok(request) = + self.pop_received_network_event(|ev| request_filter.blocks_by_range_requests(ev)) + { + self.complete_blocks_by_range_request(request, complete_config); + continue; + } + + if let Ok(request) = self + .pop_received_network_event(|ev| request_filter.data_columns_by_range_requests(ev)) + { + self.complete_data_columns_by_range_request(request, complete_config); + continue; + } + + let sync_state = self.get_sync_state(); + self.log(&format!("Progressed sync, current state: {:?}", sync_state,)); + + return; + } } fn find_and_complete_processing_chain_segment(&mut self, id: ChainSegmentProcessId) { @@ -344,15 +803,18 @@ impl TestRig { &mut self, last_epoch: u64, request_filter: RequestFilter, + complete_config: CompleteConfig, ) { for epoch in 0..last_epoch { // Note: In this test we can't predict the block peer - let id = - self.find_and_complete_blocks_by_range_request(request_filter.clone().epoch(epoch)); + let id = self.find_and_complete_block_components_by_range_request( + request_filter.epoch(epoch), + complete_config, + ); if let RangeRequestId::RangeSync { batch_id, .. } = id { assert_eq!(batch_id.as_u64(), epoch, "Unexpected batch_id"); } else { - panic!("unexpected RangeRequestId {id:?}"); + panic!("unexpected RangeRequestId {id}"); } let id = match id { @@ -476,14 +938,14 @@ fn head_chain_removed_while_finalized_syncing() { rig.assert_state(RangeSyncType::Head); // Sync should have requested a batch, grab the request. - let _ = rig.find_blocks_by_range_request(filter().peer(head_peer)); + let _ = rig.pop_blocks_by_range_request(filter().peer(head_peer)); // Now get a peer with an advanced finalized epoch. let finalized_peer = rig.add_finalized_peer(); rig.assert_state(RangeSyncType::Finalized); // Sync should have requested a batch, grab the request - let _ = rig.find_blocks_by_range_request(filter().peer(finalized_peer)); + let _ = rig.pop_blocks_by_range_request(filter().peer(finalized_peer)); // Fail the head chain by disconnecting the peer. rig.peer_disconnected(head_peer); @@ -510,14 +972,14 @@ async fn state_update_while_purging() { rig.assert_state(RangeSyncType::Head); // Sync should have requested a batch, grab the request. - let _ = rig.find_blocks_by_range_request(filter().peer(head_peer)); + let _ = rig.pop_blocks_by_range_request(filter().peer(head_peer)); // Now get a peer with an advanced finalized epoch. let finalized_peer = rig.add_finalized_peer_with_root(finalized_peer_root); rig.assert_state(RangeSyncType::Finalized); // Sync should have requested a batch, grab the request - let _ = rig.find_blocks_by_range_request(filter().peer(finalized_peer)); + let _ = rig.pop_blocks_by_range_request(filter().peer(finalized_peer)); // Now the chain knows both chains target roots. rig.remember_block(head_peer_block).await; @@ -536,7 +998,10 @@ fn pause_and_resume_on_ee_offline() { // make the ee offline rig.update_execution_engine_state(EngineState::Offline); // send the response to the request - rig.find_and_complete_blocks_by_range_request(filter().peer(peer1).epoch(0)); + rig.find_and_complete_block_components_by_range_request( + filter().peer(peer1).epoch(0), + complete(), + ); // the beacon processor shouldn't have received any work rig.expect_empty_processor(); @@ -547,7 +1012,7 @@ fn pause_and_resume_on_ee_offline() { // Don't filter requests and the columns requests may be sent to peer1 or peer2 // We need to filter by epoch, because the previous batch eagerly sent requests for the next // epoch for the other batch. So we can either filter by epoch of by sync type. - rig.find_and_complete_blocks_by_range_request(filter().epoch(0)); + rig.find_and_complete_block_components_by_range_request(filter().epoch(0), complete()); // the beacon processor shouldn't have received any work rig.expect_empty_processor(); // make the beacon processor available again. @@ -576,19 +1041,34 @@ fn finalized_sync_enough_global_custody_peers_few_chain_peers() { // Current priorization only sends batches to idle peers, so we need enough peers for each batch // TODO: Test this with a single peer in the chain, it should still work - r.add_random_peers( - remote_info, - (advanced_epochs + EXTRA_SYNCED_EPOCHS) as usize, - ); + r.add_sync_peer(false, remote_info); r.assert_state(RangeSyncType::Finalized); let last_epoch = advanced_epochs + EXTRA_SYNCED_EPOCHS; - r.complete_and_process_range_sync_until(last_epoch, filter()); + r.complete_and_process_range_sync_until(last_epoch, filter(), complete()); } +// Same test with different types of peers: +// - 100 peers +// - 1 supernode +// - perfectly distributed peer ids + #[test] -fn finalized_sync_not_enough_custody_peers_on_start() { - let mut r = TestRig::test_setup(); +fn finalized_sync_not_enough_custody_peers_on_start_supernode_only() { + finalized_sync_not_enough_custody_peers_on_start(Config { + peers: PeersConfig::SupernodeOnly, + }); +} + +#[test] +fn finalized_sync_not_enough_custody_peers_on_start_supernode_and_random() { + finalized_sync_not_enough_custody_peers_on_start(Config { + peers: PeersConfig::SupernodeAndRandom, + }); +} + +fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { + let mut r = TestRig::test_setup_as_supernode(); // Only run post-PeerDAS if !r.fork_name.fulu_enabled() { return; @@ -599,24 +1079,159 @@ fn finalized_sync_not_enough_custody_peers_on_start() { // Unikely that the single peer we added has enough columns for us. Tests are determinstic and // this error should never be hit - r.add_random_peer_not_supernode(remote_info.clone()); + r.add_connected_sync_peer_not_supernode(remote_info.clone()); r.assert_state(RangeSyncType::Finalized); - // Because we don't have enough peers on all columns we haven't sent any request. - // NOTE: There's a small chance that this single peer happens to custody exactly the set we - // expect, in that case the test will fail. Find a way to make the test deterministic. - r.expect_empty_network(); + // The SyncingChain has a single peer, so it can issue blocks_by_range requests. However, it + // doesn't have enough peers to cover all columns + r.progress_until_no_events(filter(), complete()); + r.expect_no_active_rpc_requests(); + + // Here we have a batch with partially completed block_components_by_range requests. The batch + // should not have failed, we are still syncing, and there are no downscoring events. + r.expect_no_penalty_for_anyone(); + r.expect_active_block_components_by_range_request_on_custody_step(); // Generate enough peers and supernodes to cover all custody columns - r.new_connected_peers_for_peerdas(); + r.add_sync_peers(config.peers, remote_info.clone()); // Note: not necessary to add this peers to the chain, as we draw from the global pool // We still need to add enough peers to trigger batch downloads with idle peers. Same issue as // the test above. - r.add_random_peers( - remote_info, - (advanced_epochs + EXTRA_SYNCED_EPOCHS - 1) as usize, + + r.progress_until_no_events(filter(), complete()); + r.expect_no_active_rpc_requests(); + r.expect_no_active_block_components_by_range_requests(); + // TOOD(das): For now this tests don't complete sync. We can't track beacon processor Work + // events from here easily. What we pop from the beacon processor queue is an opaque closure + // wihtout any information. We don't know what batch it is for. +} + +#[test] +fn finalized_sync_single_custody_peer_failure() { + let mut r = TestRig::test_setup(); + // Only run post-PeerDAS + if !r.fork_name.fulu_enabled() { + return; + } + + let advanced_epochs: u64 = 2; + let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); + let column_index_to_fail = r.our_custody_indices().first().copied().unwrap(); + + r.add_sync_peer(true, remote_info.clone()); + r.assert_state(RangeSyncType::Finalized); + + // Progress all blocks_by_range and columns_by_range requests but respond empty for a single + // column index + r.progress_until_no_events( + filter(), + complete().custody_failure_at_index(column_index_to_fail), + ); + r.expect_penalties("custody_failure"); + + // Some peer had a custody failure, but since there's a single peer in the batch we won't issue + // another request yet. + r.expect_no_active_rpc_requests(); + // Ensure that the block components by range request have not failed + r.expect_active_block_components_by_range_request_on_custody_step(); + r.expect_all_batches_downloading(); + + // After adding a new peer we will try to fetch from it + r.add_sync_peer(true, remote_info.clone()); + r.progress_until_no_events( + // Find the requests first to assert that this is the only request that exists + filter().column_index(column_index_to_fail), + // complete this one request without the custody failure now + complete(), ); - let last_epoch = advanced_epochs + EXTRA_SYNCED_EPOCHS; - r.complete_and_process_range_sync_until(last_epoch, filter()); + r.expect_no_active_rpc_requests(); + r.expect_no_active_block_components_by_range_requests(); + r.expect_all_batches_processing_or_awaiting(); +} + +#[test] +fn finalized_sync_permanent_custody_peer_failure() { + let mut r = TestRig::test_setup(); + // Only run post-PeerDAS + if !r.fork_name.fulu_enabled() { + return; + } + + let advanced_epochs: u64 = 2; + let remote_info = r.finalized_remote_info_advanced_by(advanced_epochs.into()); + let column_index_to_fail = r.our_custody_indices().first().copied().unwrap(); + const PEERS_IN_BATCH: usize = 4; + + for _ in 0..PEERS_IN_BATCH { + r.add_connected_sync_random_peer(remote_info.clone()); + } + r.assert_state(RangeSyncType::Finalized); + + // Some peer had a costudy failure at `column_index` so sync should do a single extra request + // for that index and epoch. + r.find_and_complete_block_components_by_range_request( + filter().epoch(0), + complete().custody_failure_at_index(column_index_to_fail), + ); + + let mut requested_peers = HashSet::new(); + + for i in 0..PEERS_IN_BATCH - 1 { + r.log(&format!("Loop {i} of custody failure round")); + + // Some peer had a costudy failure at `column_index` so sync should do a single extra request + // for that index and epoch. We want to make sure that the request goes to different peer + // than the attempts before. + let reqs = + r.find_data_by_range_request(filter().epoch(0).column_index(column_index_to_fail)); + let req_peer = reqs.peer(); + if requested_peers.contains(&req_peer) { + panic!("Re-requested the same peer {req_peer} again after a custody failure"); + } + requested_peers.insert(req_peer); + + // Find the requests first to assert that this is the only request that exists + r.expect_no_data_columns_by_range_requests(filter().epoch(0)); + // complete this one request without the custody failure now + r.complete_data_by_range_request( + reqs, + complete().custody_failure_at_index(column_index_to_fail), + ); + } + + // TODO(das): send batch 1 for completing processing and check that SyncingChain processed batch + // 1 successfully +} + +#[test] +#[ignore] +fn mine_peerids() { + let spec = test_spec::(); + let mut rng = ChaCha20Rng::from_seed([0u8; 32]); + + let expected_subnets = (0..3) + .map(|i| DataColumnSubnetId::new(i as u64)) + .collect::>(); + + for i in 0..usize::MAX { + let key: CombinedKey = k256::ecdsa::SigningKey::random(&mut rng).into(); + let enr = Enr::builder().build(&key).unwrap(); + let peer_id = enr.peer_id(); + // Use default custody groups count + let node_id = peer_id_to_node_id(&peer_id).expect("convert peer_id to node_id"); + let subnets = compute_subnets_for_node(node_id.raw(), spec.custody_requirement, &spec) + .expect("should compute custody subnets"); + if expected_subnets == subnets { + panic!("{:?}", subnets); + } else { + let matches = expected_subnets + .iter() + .filter(|index| subnets.contains(index)) + .count(); + if matches > 0 { + println!("{i} {:?}", matches); + } + } + } } diff --git a/consensus/types/src/signed_beacon_block.rs b/consensus/types/src/signed_beacon_block.rs index 85bed35a19c..de572014edc 100644 --- a/consensus/types/src/signed_beacon_block.rs +++ b/consensus/types/src/signed_beacon_block.rs @@ -321,6 +321,10 @@ impl> SignedBeaconBlock .unwrap_or(0) } + pub fn has_data(&self) -> bool { + self.num_expected_blobs() > 0 + } + /// Used for displaying commitments in logs. pub fn commitments_formatted(&self) -> String { let Ok(commitments) = self.message().body().blob_kzg_commitments() else { From 801659d4ae200600305787e0538b6ba0559ac98e Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 22 May 2025 01:06:57 -0500 Subject: [PATCH 02/23] Resolve some TODOs --- .../src/sync/network_context/block_components_by_range.rs | 2 -- .../network/src/sync/network_context/custody_by_range.rs | 3 +-- .../network/src/sync/network_context/custody_by_root.rs | 5 +---- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index 4545806a05e..45e5091665b 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -191,7 +191,6 @@ impl BlockComponentsByRangeRequest { blocks_by_range_request, } => { if let Some((blocks, block_peer)) = blocks_by_range_request.to_finished() { - // TODO(das): use the peer group let peer_group = BatchPeers::new_from_block_peer(*block_peer); let rpc_blocks = couple_blocks_base( blocks.to_vec(), @@ -226,7 +225,6 @@ impl BlockComponentsByRangeRequest { blocks_by_range_request, } => { if let Some((blocks, block_peer)) = blocks_by_range_request.to_finished() { - // TODO(das): use the peer group let blocks_with_data = blocks .iter() .filter(|block| block.has_data()) diff --git a/beacon_node/network/src/sync/network_context/custody_by_range.rs b/beacon_node/network/src/sync/network_context/custody_by_range.rs index 9f8e163ba47..22d0d02d984 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_range.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_range.rs @@ -425,8 +425,7 @@ impl ActiveCustodyByRangeRequest { // - Add a new peer that custodies the missing columns // - Call `continue_requests` // - // Otherwise this request should be dropped and failed after some time. - // TODO(das): implement the above + // Otherwise this request will be dropped and failed after some time. } } } diff --git a/beacon_node/network/src/sync/network_context/custody_by_root.rs b/beacon_node/network/src/sync/network_context/custody_by_root.rs index c547837fc7f..3b7b373790f 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_root.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_root.rs @@ -249,7 +249,6 @@ impl ActiveCustodyByRootRequest { let active_request_count_by_peer = cx.active_request_count_by_peer(); let mut columns_to_request_by_peer = HashMap::>::new(); let lookup_peers = self.lookup_peers.read(); - let mut indices_without_peers = vec![]; // Need to: // - track how many active requests a peer has for load balancing @@ -304,9 +303,7 @@ impl ActiveCustodyByRootRequest { // - Add a new peer that custodies the missing columns // - Call `continue_requests` // - // Otherwise this request should be dropped and failed after some time. - // TODO(das): implement the above - indices_without_peers.push(column_index); + // Otherwise this request will be dropped and failed after some time. } } } From b383f7af536329ef99989fe3390b83659d47339c Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 26 May 2025 18:37:20 -0500 Subject: [PATCH 03/23] More comments --- beacon_node/network/src/sync/manager.rs | 4 +- .../network/src/sync/network_context.rs | 158 ++++++++++-------- .../block_components_by_range.rs | 11 +- 3 files changed, 100 insertions(+), 73 deletions(-) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 0cf17c7b899..adcc177b8b2 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -1301,11 +1301,11 @@ impl SyncManager { range_request_id: ComponentsByRangeRequestId, range_block_component: RangeBlockComponent, ) { - if let Some(resp) = self + if let Some(result) = self .network .on_block_components_by_range_response(range_request_id, range_block_component) { - match resp { + match result { Ok((blocks, batch_peers)) => { match range_request_id.requester { RangeRequestId::RangeSync { chain_id, batch_id } => { diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index d7ad9d3eb7a..3bc81924378 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -507,75 +507,6 @@ impl SyncNetworkContext { Ok(id.id) } - /// Received a blocks by range or blobs by range response for a request that couples blocks ' - /// and blobs. - #[allow(clippy::type_complexity)] - pub fn on_block_components_by_range_response( - &mut self, - id: ComponentsByRangeRequestId, - range_block_component: RangeBlockComponent, - ) -> Option>, BatchPeers), RpcResponseError>> { - // Note: need to remove the request to borrow self again below. Otherwise we can't - // do nested requests - let Some(mut request) = self.block_components_by_range_requests.remove(&id) else { - metrics::inc_counter_vec( - &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, - &["block_components_by_range"], - ); - return None; - }; - - let result = match range_block_component { - RangeBlockComponent::Block(req_id, resp, peer_id) => resp.and_then(|(blocks, _)| { - request - .on_blocks_by_range_result(req_id, blocks, peer_id, self) - .map_err(Into::::into) - }), - RangeBlockComponent::Blob(req_id, resp, peer_id) => resp.and_then(|(blobs, _)| { - request - .on_blobs_by_range_result(req_id, blobs, peer_id, self) - .map_err(Into::::into) - }), - RangeBlockComponent::CustodyColumns(req_id, resp, peers) => { - resp.and_then(|(custody_columns, _)| { - request - .on_custody_by_range_result(req_id, custody_columns, peers, self) - .map_err(Into::::into) - }) - } - }; - - let result = result.transpose(); - - // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to - // an Option first to use in an `if let Some() { act on result }` block. - match result.as_ref() { - Some(Ok((blocks, peer_group))) => { - let blocks_with_data = blocks - .iter() - .filter(|block| block.as_block().has_data()) - .count(); - // Don't log the peer_group here, it's very long (could be up to 128 peers). If you - // want to trace which peer sent the column at index X, search for the log: - // `Sync RPC request sent method="DataColumnsByRange" ...` - debug!( - %id, - blocks = blocks.len(), - blocks_with_data, - block_peer = ?peer_group.block(), - "Block components by range request success, removing" - ) - } - Some(Err(e)) => { - debug!(%id, error = ?e, "Block components by range request failure, removing" ) - } - None => { - self.block_components_by_range_requests.insert(id, request); - } - } - result - } - /// Request block of `block_root` if necessary by checking: /// - If the da_checker has a pending block from gossip or a previous request /// @@ -1220,6 +1151,8 @@ impl SyncNetworkContext { // Request handlers + /// Processes a single `RpcEvent` blocks_by_root RPC request. + /// Same logic as [`on_blocks_by_range_response`] but it converts a `Vec` into a `Block` pub(crate) fn on_single_block_response( &mut self, id: SingleLookupReqId, @@ -1242,6 +1175,8 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "BlocksByRoot", resp, peer_id, |_| 1) } + /// Processes a single `RpcEvent` blobs_by_root RPC request. + /// Same logic as [`on_blocks_by_range_response`] pub(crate) fn on_single_blob_response( &mut self, id: SingleLookupReqId, @@ -1271,6 +1206,8 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "BlobsByRoot", resp, peer_id, |_| 1) } + /// Processes a single `RpcEvent` for a data_columns_by_root RPC request. + /// Same logic as [`on_blocks_by_range_response`] #[allow(clippy::type_complexity)] pub(crate) fn on_data_columns_by_root_response( &mut self, @@ -1284,6 +1221,10 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "DataColumnsByRoot", resp, peer_id, |_| 1) } + /// Processes a single `RpcEvent` for a blocks_by_range RPC request. + /// - If the event completes the request, it returns `Some(Ok)` with a vec of blocks + /// - If the event is an error it fails the request and returns `Some(Err)` + /// - else it appends the response chunk to the active request state and returns `None` #[allow(clippy::type_complexity)] pub(crate) fn on_blocks_by_range_response( &mut self, @@ -1295,6 +1236,8 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "BlocksByRange", resp, peer_id, |b| b.len()) } + /// Processes a single `RpcEvent` for a blobs_by_range RPC request. + /// Same logic as [`on_blocks_by_range_response`] #[allow(clippy::type_complexity)] pub(crate) fn on_blobs_by_range_response( &mut self, @@ -1306,6 +1249,8 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "BlobsByRangeRequest", resp, peer_id, |b| b.len()) } + /// Processes a single `RpcEvent` for a data_columns_by_range RPC request. + /// Same logic as [`on_blocks_by_range_response`] #[allow(clippy::type_complexity)] pub(crate) fn on_data_columns_by_range_response( &mut self, @@ -1319,6 +1264,8 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "DataColumnsByRange", resp, peer_id, |d| d.len()) } + /// Common logic for `on_*_response` handlers. Ensures we have consistent logging and metrics + /// and peer reporting for all request types. fn on_rpc_response_result usize>( &mut self, id: I, @@ -1475,6 +1422,79 @@ impl SyncNetworkContext { result } + /// Processes the result of an `*_by_range` RPC request issued by a + /// block_components_by_range_request. + /// + /// - If the result completes the request, it returns `Some(Ok)` with a vec of coupled RpcBlocks + /// - If the result fails the request, it returns `Some(Err)`. Note that a failed request may + /// not fail the block_components_by_range_request as it implements retries. + /// - else it appends the result to the active request state and returns `None` + #[allow(clippy::type_complexity)] + pub fn on_block_components_by_range_response( + &mut self, + id: ComponentsByRangeRequestId, + range_block_component: RangeBlockComponent, + ) -> Option>, BatchPeers), RpcResponseError>> { + // Note: need to remove the request to borrow self again below. Otherwise we can't + // do nested requests + let Some(mut request) = self.block_components_by_range_requests.remove(&id) else { + metrics::inc_counter_vec( + &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, + &["block_components_by_range"], + ); + return None; + }; + + let result = match range_block_component { + RangeBlockComponent::Block(req_id, resp, peer_id) => resp.and_then(|(blocks, _)| { + request + .on_blocks_by_range_result(req_id, blocks, peer_id, self) + .map_err(Into::::into) + }), + RangeBlockComponent::Blob(req_id, resp, peer_id) => resp.and_then(|(blobs, _)| { + request + .on_blobs_by_range_result(req_id, blobs, peer_id, self) + .map_err(Into::::into) + }), + RangeBlockComponent::CustodyColumns(req_id, resp, peers) => { + resp.and_then(|(custody_columns, _)| { + request + .on_custody_by_range_result(req_id, custody_columns, peers, self) + .map_err(Into::::into) + }) + } + } + // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to + // an Option first to use in an `if let Some() { act on result }` block. + .transpose(); + + match result.as_ref() { + Some(Ok((blocks, peer_group))) => { + let blocks_with_data = blocks + .iter() + .filter(|block| block.as_block().has_data()) + .count(); + // Don't log the peer_group here, it's very long (could be up to 128 peers). If you + // want to trace which peer sent the column at index X, search for the log: + // `Sync RPC request sent method="DataColumnsByRange" ...` + debug!( + %id, + blocks = blocks.len(), + blocks_with_data, + block_peer = ?peer_group.block(), + "Block components by range request success, removing" + ) + } + Some(Err(e)) => { + debug!(%id, error = ?e, "Block components by range request failure, removing" ) + } + None => { + self.block_components_by_range_requests.insert(id, request); + } + } + result + } + pub fn send_block_for_processing( &self, id: Id, diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index 45e5091665b..7c8e59eb970 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -19,6 +19,10 @@ use types::{ SignedBeaconBlock, Slot, }; +/// Given a `BlocksByRangeRequest` (a range of slots) fetches all necessary data to return +/// potentially available RpcBlocks. +/// +/// See [`State`] for the set of `*_by_range` it may issue depending on the fork. pub struct BlockComponentsByRangeRequest { id: ComponentsByRangeRequestId, peers: Arc>>, @@ -31,13 +35,16 @@ enum State { blocks_by_range_request: ByRangeRequest>>>, }, - // Two single concurrent requests for block + blobs + // Two single concurrent requests for block + blobs. As of now we request blocks and blobs to + // the same peer, so we can attribute coupling errors to the same unique peer. DenebEnabled { blocks_by_range_request: ByRangeRequest>>>, blobs_by_range_request: ByRangeRequest>>>, }, - // Request blocks first, then columns + // Request blocks first, then columns. Assuming the block peer is honest we can attribute + // custody failures to the peers serving us columns. We want to get rid of the honest block + // peer assumption in the future, see https://github.com/sigp/lighthouse/issues/6258 FuluEnabled(FuluEnabledState), } From 7d0fb93274cf5bf47e5fd662743ce1813106e497 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 26 May 2025 18:49:45 -0500 Subject: [PATCH 04/23] Reduce conversions --- .../network/src/sync/network_context.rs | 42 +++++++++---------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 3bc81924378..458ff755d2c 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1349,22 +1349,20 @@ impl SyncNetworkContext { ); let _enter = span.enter(); - let result = result.map_err(Into::::into).transpose(); - - // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to - // an Option first to use in an `if let Some() { act on result }` block. - match result.as_ref() { - Some(Ok((columns, peer_group, _))) => { + match &result { + Ok(Some((columns, peer_group, _))) => { debug!(%id, count = columns.len(), peers = ?peer_group, "Custody by root request success, removing") } - Some(Err(e)) => { - debug!(%id, error = ?e, "Custody by root request failure, removing" ) + Err(e) => { + debug!(%id, error = ?e, "Custody by root request failure, removing") } - None => { + Ok(None) => { self.custody_by_root_requests.insert(id, request); } } - result + // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to + // an Option first to use in an `if let Some() { act on result }` block. + result.map_err(Into::::into).transpose() } /// Insert a downloaded column into an active custody request. Then make progress on the @@ -1385,8 +1383,10 @@ impl SyncNetworkContext { // Note: need to remove the request to borrow self again below. Otherwise we can't // do nested requests let Some(mut request) = self.custody_by_range_requests.remove(&id) else { - // TOOD(das): This log can happen if the request is error'ed early and dropped - debug!(%id, "Custody by range downloaded event for unknown request"); + metrics::inc_counter_vec( + &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, + &["custody_by_range"], + ); return None; }; @@ -1401,25 +1401,23 @@ impl SyncNetworkContext { request: ActiveCustodyByRangeRequest, result: CustodyByRangeRequestResult, ) -> Option> { - let result = result.map_err(Into::::into).transpose(); - - // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to - // an Option first to use in an `if let Some() { act on result }` block. - match result.as_ref() { - Some(Ok((columns, _peer_group, _))) => { + match &result { + Ok(Some((columns, _peer_group, _))) => { // Don't log the peer_group here, it's very long (could be up to 128 peers). If you // want to trace which peer sent the column at index X, search for the log: // `Sync RPC request sent method="DataColumnsByRange" ...` debug!(%id, count = columns.len(), "Custody by range request success, removing") } - Some(Err(e)) => { - debug!(%id, error = ?e, "Custody by range request failure, removing" ) + Err(e) => { + debug!(%id, error = ?e, "Custody by range request failure, removing") } - None => { + Ok(None) => { self.custody_by_range_requests.insert(id, request); } } - result + // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to + // an Option first to use in an `if let Some() { act on result }` block. + result.map_err(Into::::into).transpose() } /// Processes the result of an `*_by_range` RPC request issued by a From c8a0c9e37932d67ee74da255a796d367725ea93b Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 26 May 2025 19:04:50 -0500 Subject: [PATCH 05/23] Remove CustodyByRoot and CustodyByRange types --- beacon_node/network/src/sync/manager.rs | 7 +- .../network/src/sync/network_context.rs | 68 +++++++++++-------- 2 files changed, 41 insertions(+), 34 deletions(-) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index adcc177b8b2..b5e936d7e80 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -36,8 +36,7 @@ use super::backfill_sync::{BackFillSync, ProcessResult, SyncStart}; use super::block_lookups::BlockLookups; use super::network_context::{ - CustodyByRangeResult, CustodyByRootResult, RangeBlockComponent, RangeRequestId, RpcEvent, - SyncNetworkContext, + CustodyRequestResult, RangeBlockComponent, RangeRequestId, RpcEvent, SyncNetworkContext, }; use super::peer_sampling::{Sampling, SamplingConfig, SamplingResult}; use super::peer_sync_info::{remote_sync_type, PeerSyncType}; @@ -1236,7 +1235,7 @@ impl SyncManager { fn on_custody_by_range_result( &mut self, id: CustodyByRangeRequestId, - result: CustodyByRangeResult, + result: CustodyRequestResult, ) { // TODO(das): Improve the type of RangeBlockComponent::CustodyColumns, not // not have to pass a PeerGroup in case of error @@ -1259,7 +1258,7 @@ impl SyncManager { fn on_custody_by_root_result( &mut self, requester: CustodyRequester, - response: CustodyByRootResult, + response: CustodyRequestResult, ) { self.block_lookups .on_download_response::>( diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 458ff755d2c..ce2f91a391d 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1,8 +1,8 @@ //! Provides network functionality for the Syncing thread. This fundamentally wraps a network //! channel and stores a global RPC ID to perform requests. -use self::custody_by_range::{ActiveCustodyByRangeRequest, CustodyByRangeRequestResult}; -use self::custody_by_root::{ActiveCustodyByRootRequest, CustodyByRootRequestResult}; +use self::custody_by_range::ActiveCustodyByRangeRequest; +use self::custody_by_root::ActiveCustodyByRootRequest; pub use self::requests::{BlocksByRootSingleRequest, DataColumnsByRootSingleBlockRequest}; use super::manager::BlockProcessType; use super::range_sync::BatchPeers; @@ -75,12 +75,12 @@ impl RpcEvent { pub type RpcResponseResult = Result<(T, Duration), RpcResponseError>; -pub type RpcResponseBatchResult = Result<(T, PeerGroup, Duration), RpcResponseError>; - /// Duration = latest seen timestamp of all received data columns -pub type CustodyByRootResult = RpcResponseBatchResult>; +pub type RpcResponseBatchResult = Result<(T, PeerGroup, Duration), RpcResponseError>; -pub type CustodyByRangeResult = RpcResponseBatchResult>; +/// Common result type for `custody_by_root` and `custody_by_range` requests. The peers are part of +/// the `Ok` response since they are not known until the entire request succeeds. +pub type CustodyRequestResult = RpcResponseBatchResult>; #[derive(Debug, Clone)] pub enum RpcResponseError { @@ -1102,7 +1102,7 @@ impl SyncNetworkContext { /// attempt. pub fn continue_custody_by_root_requests( &mut self, - ) -> Vec<(CustodyRequester, CustodyByRootResult)> { + ) -> Vec<(CustodyRequester, CustodyRequestResult)> { let ids = self .custody_by_root_requests .keys() @@ -1116,7 +1116,10 @@ impl SyncNetworkContext { .custody_by_root_requests .remove(&id) .expect("key of hashmap"); - let result = request.continue_requests(self); + let result = request + .continue_requests(self) + .map_err(Into::::into) + .transpose(); self.handle_custody_by_root_result(id, request, result) .map(|result| (id, result)) }) @@ -1128,7 +1131,7 @@ impl SyncNetworkContext { /// attempt. pub fn continue_custody_by_range_requests( &mut self, - ) -> Vec<(CustodyByRangeRequestId, CustodyByRangeResult)> { + ) -> Vec<(CustodyByRangeRequestId, CustodyRequestResult)> { let ids = self .custody_by_range_requests .keys() @@ -1142,7 +1145,10 @@ impl SyncNetworkContext { .custody_by_range_requests .remove(&id) .expect("key of hashmap"); - let result = request.continue_requests(self); + let result = request + .continue_requests(self) + .map_err(Into::::into) + .transpose(); self.handle_custody_by_range_result(id, request, result) .map(|result| (id, result)) }) @@ -1313,7 +1319,7 @@ impl SyncNetworkContext { req_id: DataColumnsByRootRequestId, peer_id: PeerId, resp: RpcResponseResult>>>, - ) -> Option> { + ) -> Option> { let span = span!( Level::INFO, "SyncNetworkContext", @@ -1331,7 +1337,10 @@ impl SyncNetworkContext { return None; }; - let result = request.on_data_column_downloaded(peer_id, req_id, resp, self); + let result = request + .on_data_column_downloaded(peer_id, req_id, resp, self) + .map_err(Into::::into) + .transpose(); self.handle_custody_by_root_result(id.requester, request, result) } @@ -1340,8 +1349,8 @@ impl SyncNetworkContext { &mut self, id: CustodyRequester, request: ActiveCustodyByRootRequest, - result: CustodyByRootRequestResult, - ) -> Option> { + result: Option>, + ) -> Option> { let span = span!( Level::INFO, "SyncNetworkContext", @@ -1350,19 +1359,17 @@ impl SyncNetworkContext { let _enter = span.enter(); match &result { - Ok(Some((columns, peer_group, _))) => { + Some(Ok((columns, peer_group, _))) => { debug!(%id, count = columns.len(), peers = ?peer_group, "Custody by root request success, removing") } - Err(e) => { + Some(Err(e)) => { debug!(%id, error = ?e, "Custody by root request failure, removing") } - Ok(None) => { + None => { self.custody_by_root_requests.insert(id, request); } } - // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to - // an Option first to use in an `if let Some() { act on result }` block. - result.map_err(Into::::into).transpose() + result } /// Insert a downloaded column into an active custody request. Then make progress on the @@ -1379,7 +1386,7 @@ impl SyncNetworkContext { req_id: DataColumnsByRangeRequestId, peer_id: PeerId, resp: RpcResponseResult>>>, - ) -> Option> { + ) -> Option> { // Note: need to remove the request to borrow self again below. Otherwise we can't // do nested requests let Some(mut request) = self.custody_by_range_requests.remove(&id) else { @@ -1390,7 +1397,10 @@ impl SyncNetworkContext { return None; }; - let result = request.on_data_column_downloaded(peer_id, req_id, resp, self); + let result = request + .on_data_column_downloaded(peer_id, req_id, resp, self) + .map_err(Into::::into) + .transpose(); self.handle_custody_by_range_result(id, request, result) } @@ -1399,25 +1409,23 @@ impl SyncNetworkContext { &mut self, id: CustodyByRangeRequestId, request: ActiveCustodyByRangeRequest, - result: CustodyByRangeRequestResult, - ) -> Option> { + result: Option>, + ) -> Option> { match &result { - Ok(Some((columns, _peer_group, _))) => { + Some(Ok((columns, _peer_group, _))) => { // Don't log the peer_group here, it's very long (could be up to 128 peers). If you // want to trace which peer sent the column at index X, search for the log: // `Sync RPC request sent method="DataColumnsByRange" ...` debug!(%id, count = columns.len(), "Custody by range request success, removing") } - Err(e) => { + Some(Err(e)) => { debug!(%id, error = ?e, "Custody by range request failure, removing") } - Ok(None) => { + None => { self.custody_by_range_requests.insert(id, request); } } - // Convert a result from internal format of `ActiveCustodyRequest` (error first to use ?) to - // an Option first to use in an `if let Some() { act on result }` block. - result.map_err(Into::::into).transpose() + result } /// Processes the result of an `*_by_range` RPC request issued by a From 01329ab2303fee85ef115fd0f745e47e7662ba47 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Mon, 26 May 2025 19:07:15 -0500 Subject: [PATCH 06/23] Improve RangeBlockComponent type --- beacon_node/network/src/sync/manager.rs | 14 +------------- beacon_node/network/src/sync/network_context.rs | 10 +++------- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index b5e936d7e80..5c72ac6d124 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -1237,21 +1237,9 @@ impl SyncManager { id: CustodyByRangeRequestId, result: CustodyRequestResult, ) { - // TODO(das): Improve the type of RangeBlockComponent::CustodyColumns, not - // not have to pass a PeerGroup in case of error - let peers = match &result { - Ok((_, peers, _)) => peers.clone(), - // TODO(das): this PeerGroup with no peers incorrect - Err(_) => PeerGroup::from_set(<_>::default()), - }; - self.on_block_components_by_range_response( id.parent_request_id, - RangeBlockComponent::CustodyColumns( - id, - result.map(|(data, _peers, timestamp)| (data, timestamp)), - peers, - ), + RangeBlockComponent::CustodyColumns(id, result), ); } diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index ce2f91a391d..3197fcf13e9 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -240,11 +240,7 @@ pub enum RangeBlockComponent { RpcResponseResult>>>, PeerId, ), - CustodyColumns( - CustodyByRangeRequestId, - RpcResponseResult>>>, - PeerGroup, - ), + CustodyColumns(CustodyByRangeRequestId, CustodyRequestResult), } #[cfg(test)] @@ -1462,8 +1458,8 @@ impl SyncNetworkContext { .on_blobs_by_range_result(req_id, blobs, peer_id, self) .map_err(Into::::into) }), - RangeBlockComponent::CustodyColumns(req_id, resp, peers) => { - resp.and_then(|(custody_columns, _)| { + RangeBlockComponent::CustodyColumns(req_id, resp) => { + resp.and_then(|(custody_columns, peers, _)| { request .on_custody_by_range_result(req_id, custody_columns, peers, self) .map_err(Into::::into) From 34b37b97ed5c0b5aa0665780e216ec3d3a169ad0 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 27 May 2025 00:37:12 -0500 Subject: [PATCH 07/23] Remove unused module --- .../src/sync/block_sidecar_coupling.rs | 588 ------------------ 1 file changed, 588 deletions(-) delete mode 100644 beacon_node/network/src/sync/block_sidecar_coupling.rs diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs deleted file mode 100644 index 68f15491256..00000000000 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ /dev/null @@ -1,588 +0,0 @@ -use beacon_chain::{ - block_verification_types::RpcBlock, data_column_verification::CustodyDataColumn, get_block_root, -}; -use lighthouse_network::{ - service::api_types::{ - BlobsByRangeRequestId, BlocksByRangeRequestId, DataColumnsByRangeRequestId, - }, - PeerId, -}; -use std::{ - collections::{HashMap, HashSet}, - sync::Arc, -}; -use types::{ - BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, - Hash256, RuntimeVariableList, SignedBeaconBlock, Slot, -}; - -use super::range_sync::BatchPeers; - -pub struct RangeBlockComponentsRequest { - /// Blocks we have received awaiting for their corresponding sidecar. - blocks_request: ByRangeRequest>>>, - /// Sidecars we have received awaiting for their corresponding block. - block_data_request: RangeBlockDataRequest, -} - -enum ByRangeRequest { - Active(I), - Complete(T, PeerId), -} - -enum RangeBlockDataRequest { - /// All pre-deneb blocks - NoData, - /// All post-Deneb blocks, regardless of if they have data or not - Blobs(ByRangeRequest>>>), - /// All post-Fulu blocks, regardless of if they have data or not - DataColumns { - requests: HashMap< - DataColumnsByRangeRequestId, - ByRangeRequest>, - >, - expected_column_to_peer: HashMap, - }, -} - -impl RangeBlockComponentsRequest { - pub fn new( - blocks_req_id: BlocksByRangeRequestId, - blobs_req_id: Option, - data_columns: Option<( - Vec, - HashMap, - )>, - ) -> Self { - let block_data_request = if let Some(blobs_req_id) = blobs_req_id { - RangeBlockDataRequest::Blobs(ByRangeRequest::Active(blobs_req_id)) - } else if let Some((requests, expected_column_to_peer)) = data_columns { - RangeBlockDataRequest::DataColumns { - requests: requests - .into_iter() - .map(|id| (id, ByRangeRequest::Active(id))) - .collect(), - expected_column_to_peer, - } - } else { - RangeBlockDataRequest::NoData - }; - - Self { - blocks_request: ByRangeRequest::Active(blocks_req_id), - block_data_request, - } - } - - pub fn add_blocks( - &mut self, - req_id: BlocksByRangeRequestId, - blocks: Vec>>, - peer_id: PeerId, - ) -> Result<(), String> { - self.blocks_request.finish(req_id, blocks, peer_id) - } - - pub fn add_blobs( - &mut self, - req_id: BlobsByRangeRequestId, - blobs: Vec>>, - peer_id: PeerId, - ) -> Result<(), String> { - match &mut self.block_data_request { - RangeBlockDataRequest::NoData => Err("received blobs but expected no data".to_owned()), - RangeBlockDataRequest::Blobs(ref mut req) => req.finish(req_id, blobs, peer_id), - RangeBlockDataRequest::DataColumns { .. } => { - Err("received blobs but expected data columns".to_owned()) - } - } - } - - pub fn add_custody_columns( - &mut self, - req_id: DataColumnsByRangeRequestId, - columns: Vec>>, - peer_id: PeerId, - ) -> Result<(), String> { - match &mut self.block_data_request { - RangeBlockDataRequest::NoData => { - Err("received data columns but expected no data".to_owned()) - } - RangeBlockDataRequest::Blobs(_) => { - Err("received data columns but expected blobs".to_owned()) - } - RangeBlockDataRequest::DataColumns { - ref mut requests, .. - } => { - let req = requests - .get_mut(&req_id) - .ok_or(format!("unknown data columns by range req_id {req_id}"))?; - req.finish(req_id, columns, peer_id) - } - } - } - - /// If all internal requests are complete returns a Vec of coupled RpcBlocks - #[allow(clippy::type_complexity)] - pub fn responses( - &self, - spec: &ChainSpec, - ) -> Option>, BatchPeers), String>> { - let Some((blocks, &block_peer)) = self.blocks_request.to_finished() else { - return None; - }; - - match &self.block_data_request { - RangeBlockDataRequest::NoData => Some( - Self::responses_with_blobs(blocks.to_vec(), vec![], spec) - .map(|blocks| (blocks, BatchPeers::new_from_block_peer(block_peer))), - ), - RangeBlockDataRequest::Blobs(request) => { - let Some((blobs, _blob_peer)) = request.to_finished() else { - return None; - }; - Some( - Self::responses_with_blobs(blocks.to_vec(), blobs.to_vec(), spec) - .map(|blocks| (blocks, BatchPeers::new_from_block_peer(block_peer))), - ) - } - RangeBlockDataRequest::DataColumns { - requests, - expected_column_to_peer, - } => { - let mut data_columns = vec![]; - let mut column_peers = HashMap::new(); - for req in requests.values() { - let Some((resp_columns, column_peer)) = req.to_finished() else { - return None; - }; - data_columns.extend(resp_columns.clone()); - for column in resp_columns { - column_peers.insert(column.index, *column_peer); - } - } - - Some( - Self::responses_with_custody_columns( - blocks.to_vec(), - data_columns, - expected_column_to_peer.clone(), - spec, - ) - .map(|blocks| (blocks, BatchPeers::new(block_peer, column_peers))), - ) - } - } - } - - fn responses_with_blobs( - blocks: Vec>>, - blobs: Vec>>, - spec: &ChainSpec, - ) -> Result>, String> { - // There can't be more more blobs than blocks. i.e. sending any blob (empty - // included) for a skipped slot is not permitted. - let mut responses = Vec::with_capacity(blocks.len()); - let mut blob_iter = blobs.into_iter().peekable(); - for block in blocks.into_iter() { - let max_blobs_per_block = spec.max_blobs_per_block(block.epoch()) as usize; - let mut blob_list = Vec::with_capacity(max_blobs_per_block); - while { - let pair_next_blob = blob_iter - .peek() - .map(|sidecar| sidecar.slot() == block.slot()) - .unwrap_or(false); - pair_next_blob - } { - blob_list.push(blob_iter.next().ok_or("Missing next blob".to_string())?); - } - - let mut blobs_buffer = vec![None; max_blobs_per_block]; - for blob in blob_list { - let blob_index = blob.index as usize; - let Some(blob_opt) = blobs_buffer.get_mut(blob_index) else { - return Err("Invalid blob index".to_string()); - }; - if blob_opt.is_some() { - return Err("Repeat blob index".to_string()); - } else { - *blob_opt = Some(blob); - } - } - let blobs = RuntimeVariableList::new( - blobs_buffer.into_iter().flatten().collect::>(), - max_blobs_per_block, - ) - .map_err(|_| "Blobs returned exceeds max length".to_string())?; - responses.push(RpcBlock::new(None, block, Some(blobs)).map_err(|e| format!("{e:?}"))?) - } - - // if accumulated sidecars is not empty, throw an error. - if blob_iter.next().is_some() { - return Err("Received sidecars that don't pair well".to_string()); - } - - Ok(responses) - } - - fn responses_with_custody_columns( - blocks: Vec>>, - data_columns: DataColumnSidecarList, - expected_custody_columns: HashMap, - spec: &ChainSpec, - ) -> Result>, String> { - // Group data columns by block_root and index - let mut custody_columns_by_block = HashMap::>>::new(); - let mut block_roots_by_slot = HashMap::>::new(); - let expected_custody_indices = expected_custody_columns.keys().cloned().collect::>(); - - for column in data_columns { - let block_root = column.block_root(); - let index = column.index; - - block_roots_by_slot - .entry(column.slot()) - .or_default() - .insert(block_root); - - // Sanity check before casting to `CustodyDataColumn`. But this should never happen - if !expected_custody_columns.contains_key(&index) { - return Err(format!( - "Received column not in expected custody indices {index}" - )); - } - - custody_columns_by_block - .entry(block_root) - .or_default() - .push(CustodyDataColumn::from_asserted_custody(column)); - } - - // Now iterate all blocks ensuring that the block roots of each block and data column match, - // plus we have columns for our custody requirements - let rpc_blocks = blocks - .into_iter() - .map(|block| { - let block_root = get_block_root(&block); - block_roots_by_slot - .entry(block.slot()) - .or_default() - .insert(block_root); - - let custody_columns = custody_columns_by_block - .remove(&block_root) - .unwrap_or_default(); - - RpcBlock::new_with_custody_columns( - Some(block_root), - block, - custody_columns, - expected_custody_indices.clone(), - spec, - ) - .map_err(|e| format!("{e:?}")) - }) - .collect::, _>>()?; - - // Assert that there are no columns left for other blocks - if !custody_columns_by_block.is_empty() { - let remaining_roots = custody_columns_by_block.keys().collect::>(); - return Err(format!("Not all columns consumed: {remaining_roots:?}")); - } - - for (_slot, block_roots) in block_roots_by_slot { - if block_roots.len() > 1 { - // TODO: Some peer(s) are faulty or malicious. This batch will fail processing but - // we want to send it to the process to better attribute fault. Maybe warn log for - // now and track it in a metric? - } - } - - Ok(rpc_blocks) - } -} - -impl ByRangeRequest { - fn finish(&mut self, id: I, data: T, peer_id: PeerId) -> Result<(), String> { - match self { - Self::Active(expected_id) => { - if expected_id != &id { - return Err(format!("unexpected req_id expected {expected_id} got {id}")); - } - *self = Self::Complete(data, peer_id); - Ok(()) - } - Self::Complete(_, _) => Err("request already complete".to_owned()), - } - } - - fn to_finished(&self) -> Option<(&T, &PeerId)> { - match self { - Self::Active(_) => None, - Self::Complete(data, peer_id) => Some((data, peer_id)), - } - } -} - -#[cfg(test)] -mod tests { - use super::RangeBlockComponentsRequest; - use beacon_chain::test_utils::{ - generate_rand_block_and_blobs, generate_rand_block_and_data_columns, test_spec, NumBlobs, - }; - use lighthouse_network::{ - service::api_types::{ - BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, - DataColumnsByRangeRequestId, Id, RangeRequestId, - }, - PeerId, - }; - use rand::SeedableRng; - use std::{collections::HashMap, sync::Arc}; - use types::{test_utils::XorShiftRng, Epoch, ForkName, MinimalEthSpec as E, SignedBeaconBlock}; - - fn components_id() -> ComponentsByRangeRequestId { - ComponentsByRangeRequestId { - id: 0, - requester: RangeRequestId::RangeSync { - chain_id: 1, - batch_id: Epoch::new(0), - }, - } - } - - fn blocks_id(parent_request_id: ComponentsByRangeRequestId) -> BlocksByRangeRequestId { - BlocksByRangeRequestId { - id: 1, - parent_request_id, - } - } - - fn blobs_id(parent_request_id: ComponentsByRangeRequestId) -> BlobsByRangeRequestId { - BlobsByRangeRequestId { - id: 1, - parent_request_id, - } - } - - fn columns_id( - id: Id, - parent_request_id: ComponentsByRangeRequestId, - ) -> DataColumnsByRangeRequestId { - DataColumnsByRangeRequestId { - id, - parent_request_id, - } - } - - fn is_finished(info: &RangeBlockComponentsRequest) -> bool { - let spec = test_spec::(); - info.responses(&spec).is_some() - } - - #[test] - fn no_blobs_into_responses() { - let spec = test_spec::(); - let peer = PeerId::random(); - let mut rng = XorShiftRng::from_seed([42; 16]); - let blocks = (0..4) - .map(|_| { - generate_rand_block_and_blobs::(ForkName::Base, NumBlobs::None, &mut rng, &spec) - .0 - .into() - }) - .collect::>>>(); - - let blocks_req_id = blocks_id(components_id()); - let mut info = RangeBlockComponentsRequest::::new(blocks_req_id, None, None); - - // Send blocks and complete terminate response - info.add_blocks(blocks_req_id, blocks, peer).unwrap(); - - // Assert response is finished and RpcBlocks can be constructed - info.responses(&test_spec::()).unwrap().unwrap(); - } - - #[test] - fn empty_blobs_into_responses() { - let spec = test_spec::(); - let peer = PeerId::random(); - let mut rng = XorShiftRng::from_seed([42; 16]); - let blocks = (0..4) - .map(|_| { - // Always generate some blobs. - generate_rand_block_and_blobs::( - ForkName::Deneb, - NumBlobs::Number(3), - &mut rng, - &spec, - ) - .0 - .into() - }) - .collect::>>>(); - - let components_id = components_id(); - let blocks_req_id = blocks_id(components_id); - let blobs_req_id = blobs_id(components_id); - let mut info = - RangeBlockComponentsRequest::::new(blocks_req_id, Some(blobs_req_id), None); - - // Send blocks and complete terminate response - info.add_blocks(blocks_req_id, blocks, peer).unwrap(); - // Expect no blobs returned - info.add_blobs(blobs_req_id, vec![], peer).unwrap(); - - // Assert response is finished and RpcBlocks can be constructed, even if blobs weren't returned. - // This makes sure we don't expect blobs here when they have expired. Checking this logic should - // be hendled elsewhere. - info.responses(&test_spec::()).unwrap().unwrap(); - } - - #[test] - fn rpc_block_with_custody_columns() { - let spec = test_spec::(); - let peer = PeerId::random(); - let expects_custody_columns = [1, 2, 3, 4]; - let mut rng = XorShiftRng::from_seed([42; 16]); - let blocks = (0..4) - .map(|_| { - generate_rand_block_and_data_columns::( - ForkName::Fulu, - NumBlobs::Number(1), - &mut rng, - &spec, - ) - }) - .collect::>(); - - let components_id = components_id(); - let blocks_req_id = blocks_id(components_id); - let columns_req_id = expects_custody_columns - .iter() - .enumerate() - .map(|(i, _)| columns_id(i as Id, components_id)) - .collect::>(); - - let column_to_peer = expects_custody_columns - .iter() - .map(|index| (*index, peer)) - .collect::>(); - - let mut info = RangeBlockComponentsRequest::::new( - blocks_req_id, - None, - Some((columns_req_id.clone(), column_to_peer)), - ); - // Send blocks and complete terminate response - info.add_blocks( - blocks_req_id, - blocks.iter().map(|b| b.0.clone().into()).collect(), - peer, - ) - .unwrap(); - // Assert response is not finished - assert!(!is_finished(&info)); - - // Send data columns - for (i, &column_index) in expects_custody_columns.iter().enumerate() { - info.add_custody_columns( - columns_req_id.get(i).copied().unwrap(), - blocks - .iter() - .flat_map(|b| b.1.iter().filter(|d| d.index == column_index).cloned()) - .collect(), - peer, - ) - .unwrap(); - - if i < expects_custody_columns.len() - 1 { - assert!( - !is_finished(&info), - "requested should not be finished at loop {i}" - ); - } - } - - // All completed construct response - info.responses(&spec).unwrap().unwrap(); - } - - #[test] - fn rpc_block_with_custody_columns_batched() { - let spec = test_spec::(); - let peer = PeerId::random(); - let batched_column_requests = [vec![1_u64, 2], vec![3, 4]]; - let expects_custody_columns = batched_column_requests - .iter() - .flatten() - .map(|index| (*index, peer)) - .collect::>(); - let custody_column_request_ids = - (0..batched_column_requests.len() as u32).collect::>(); - let num_of_data_column_requests = custody_column_request_ids.len(); - - let components_id = components_id(); - let blocks_req_id = blocks_id(components_id); - let columns_req_id = batched_column_requests - .iter() - .enumerate() - .map(|(i, _)| columns_id(i as Id, components_id)) - .collect::>(); - - let mut info = RangeBlockComponentsRequest::::new( - blocks_req_id, - None, - Some((columns_req_id.clone(), expects_custody_columns.clone())), - ); - - let mut rng = XorShiftRng::from_seed([42; 16]); - let blocks = (0..4) - .map(|_| { - generate_rand_block_and_data_columns::( - ForkName::Fulu, - NumBlobs::Number(1), - &mut rng, - &spec, - ) - }) - .collect::>(); - - // Send blocks and complete terminate response - info.add_blocks( - blocks_req_id, - blocks.iter().map(|b| b.0.clone().into()).collect(), - peer, - ) - .unwrap(); - // Assert response is not finished - assert!(!is_finished(&info)); - - for (i, column_indices) in batched_column_requests.iter().enumerate() { - // Send the set of columns in the same batch request - info.add_custody_columns( - columns_req_id.get(i).copied().unwrap(), - blocks - .iter() - .flat_map(|b| { - b.1.iter() - .filter(|d| column_indices.contains(&d.index)) - .cloned() - }) - .collect::>(), - peer, - ) - .unwrap(); - - if i < num_of_data_column_requests - 1 { - assert!( - !is_finished(&info), - "requested should not be finished at loop {i}" - ); - } - } - - // All completed construct response - info.responses(&spec).unwrap().unwrap(); - } -} From 8f74adc66f711790bd41ca423c2a454197ef72b6 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 27 May 2025 00:43:38 -0500 Subject: [PATCH 08/23] Use DataColumnSidecarList --- .../network/src/network_beacon_processor/mod.rs | 2 +- beacon_node/network/src/sync/network_context.rs | 6 +++--- .../network_context/block_components_by_range.rs | 10 +++++----- .../src/sync/network_context/custody_by_range.rs | 6 ++---- .../src/sync/network_context/custody_by_root.rs | 4 +--- .../requests/data_columns_by_range.rs | 4 ++-- beacon_node/network/src/sync/tests/lookups.rs | 12 +++++------- 7 files changed, 19 insertions(+), 25 deletions(-) diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index 7a4d6978800..e026d04776b 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -554,7 +554,7 @@ impl NetworkBeaconProcessor { pub fn send_rpc_validate_data_columns( self: &Arc, block_root: Hash256, - data_columns: Vec>>, + data_columns: DataColumnSidecarList, seen_timestamp: Duration, id: SamplingId, ) -> Result<(), Error> { diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 3197fcf13e9..f4db7e22566 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1216,7 +1216,7 @@ impl SyncNetworkContext { id: DataColumnsByRootRequestId, peer_id: PeerId, rpc_event: RpcEvent>>, - ) -> Option>>>> { + ) -> Option>> { let resp = self .data_columns_by_root_requests .on_response(id, rpc_event); @@ -1314,7 +1314,7 @@ impl SyncNetworkContext { id: CustodyId, req_id: DataColumnsByRootRequestId, peer_id: PeerId, - resp: RpcResponseResult>>>, + resp: RpcResponseResult>, ) -> Option> { let span = span!( Level::INFO, @@ -1381,7 +1381,7 @@ impl SyncNetworkContext { id: CustodyByRangeRequestId, req_id: DataColumnsByRangeRequestId, peer_id: PeerId, - resp: RpcResponseResult>>>, + resp: RpcResponseResult>, ) -> Option> { // Note: need to remove the request to borrow self again below. Otherwise we can't // do nested requests diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index 7c8e59eb970..00f64f2e39c 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -15,8 +15,8 @@ use parking_lot::RwLock; use std::collections::{HashMap, HashSet}; use std::sync::Arc; use types::{ - BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, EthSpec, Hash256, RuntimeVariableList, - SignedBeaconBlock, Slot, + BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecarList, EthSpec, Hash256, + RuntimeVariableList, SignedBeaconBlock, Slot, }; /// Given a `BlocksByRangeRequest` (a range of slots) fetches all necessary data to return @@ -57,7 +57,7 @@ enum FuluEnabledState { blocks: Vec>>, block_peer: PeerId, custody_by_range_request: - ByRangeRequest>>, PeerGroup>, + ByRangeRequest, PeerGroup>, }, } @@ -389,7 +389,7 @@ impl BlockComponentsByRangeRequest { pub fn on_custody_by_range_result( &mut self, id: CustodyByRangeRequestId, - data: Vec>>, + data: DataColumnSidecarList, peers: PeerGroup, cx: &mut SyncNetworkContext, ) -> BlockComponentsByRangeRequestResult { @@ -483,7 +483,7 @@ fn couple_blocks_deneb( fn couple_blocks_fulu( blocks: Vec>>, - data_columns: Vec>>, + data_columns: DataColumnSidecarList, custody_column_indices: Vec, spec: &ChainSpec, ) -> Result>, Error> { diff --git a/beacon_node/network/src/sync/network_context/custody_by_range.rs b/beacon_node/network/src/sync/network_context/custody_by_range.rs index 22d0d02d984..6b4d2331889 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_range.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_range.rs @@ -16,8 +16,8 @@ use std::time::{Duration, Instant}; use std::{collections::HashMap, marker::PhantomData, sync::Arc}; use tracing::{debug, warn}; use types::{ - data_column_sidecar::ColumnIndex, DataColumnSidecar, Epoch, EthSpec, Hash256, - SignedBeaconBlockHeader, Slot, + data_column_sidecar::ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, + Hash256, SignedBeaconBlockHeader, Slot, }; use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; @@ -25,8 +25,6 @@ use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; const TEMPORARY_FAULT_EXPIRY_SECONDS: u64 = 15; const REQUEST_EXPIRY_SECONDS: u64 = 300; -type DataColumnSidecarList = Vec>>; - pub struct ActiveCustodyByRangeRequest { start_time: Instant, id: CustodyByRangeRequestId, diff --git a/beacon_node/network/src/sync/network_context/custody_by_root.rs b/beacon_node/network/src/sync/network_context/custody_by_root.rs index 3b7b373790f..489b9c3b11b 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_root.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_root.rs @@ -15,7 +15,7 @@ use std::time::{Duration, Instant}; use std::{collections::HashMap, marker::PhantomData, sync::Arc}; use strum::IntoStaticStr; use tracing::{debug, warn}; -use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, Hash256}; +use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Hash256}; use super::{LookupRequestResult, PeerGroup, RpcResponseResult, SyncNetworkContext}; @@ -24,8 +24,6 @@ const REQUEST_EXPIRY_SECONDS: u64 = 300; /// TODO(das): this attempt count is nested into the existing lookup request count. const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; -type DataColumnSidecarList = Vec>>; - pub struct ActiveCustodyByRootRequest { start_time: Instant, block_root: Hash256, diff --git a/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs b/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs index 276ede93c12..54ff0c1c735 100644 --- a/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs +++ b/beacon_node/network/src/sync/network_context/requests/data_columns_by_range.rs @@ -1,13 +1,13 @@ use super::{ActiveRequestItems, LookupVerifyError}; use lighthouse_network::rpc::methods::DataColumnsByRangeRequest; use std::sync::Arc; -use types::{DataColumnSidecar, EthSpec, Slot}; +use types::{DataColumnSidecar, DataColumnSidecarList, EthSpec, Slot}; /// Accumulates results of a data_columns_by_range request. Only returns items after receiving the /// stream termination. pub struct DataColumnsByRangeRequestItems { request: DataColumnsByRangeRequest, - items: Vec>>, + items: DataColumnSidecarList, } impl DataColumnsByRangeRequestItems { diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index 3e83605a276..d85504d4654 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -43,8 +43,8 @@ use tracing::info; use types::{ data_column_sidecar::ColumnIndex, test_utils::{SeedableRng, TestRandom, XorShiftRng}, - BeaconState, BeaconStateBase, BlobSidecar, DataColumnSidecar, EthSpec, ForkContext, ForkName, - Hash256, MinimalEthSpec as E, SignedBeaconBlock, Slot, + BeaconState, BeaconStateBase, BlobSidecar, DataColumnSidecar, DataColumnSidecarList, EthSpec, + ForkContext, ForkName, Hash256, MinimalEthSpec as E, SignedBeaconBlock, Slot, }; const D: Duration = Duration::new(0, 0); @@ -216,9 +216,7 @@ impl TestRig { generate_rand_block_and_blobs::(fork_name, num_blobs, rng, &self.spec) } - fn rand_block_and_data_columns( - &mut self, - ) -> (SignedBeaconBlock, Vec>>) { + fn rand_block_and_data_columns(&mut self) -> (SignedBeaconBlock, DataColumnSidecarList) { let num_blobs = NumBlobs::Number(1); generate_rand_block_and_data_columns::( self.fork_name, @@ -721,7 +719,7 @@ impl TestRig { fn complete_valid_sampling_column_requests( &mut self, ids: DCByRootIds, - data_columns: Vec>>, + data_columns: DataColumnSidecarList, ) { for id in ids { self.log(&format!("return valid data column for {id:?}")); @@ -766,7 +764,7 @@ impl TestRig { fn complete_valid_custody_request( &mut self, ids: DCByRootIds, - data_columns: Vec>>, + data_columns: DataColumnSidecarList, missing_components: bool, ) { let lookup_id = if let SyncRequestId::DataColumnsByRoot(DataColumnsByRootRequestId { From 86ad87eced677bd1e045d7f4eb6be78441291fbd Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 27 May 2025 12:21:42 -0500 Subject: [PATCH 09/23] Lint tests --- beacon_node/network/src/sync/tests/range.rs | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index c82e4f97769..1fb19e15ef1 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -304,9 +304,16 @@ impl TestRig { self.sync_manager.range_sync().batches_state() } - fn assert_sync_state(&mut self) { + fn assert_sync_state(&mut self, expected_state: SyncState) { let current_state = self.sync_manager.network().network_globals().sync_state(); - panic!("{:?}", current_state); + assert_eq!(current_state, expected_state); + } + + fn assert_syncing_finalized(&mut self) { + self.assert_sync_state(SyncState::SyncingFinalized { + start_slot: Slot::new(0), + target_slot: Slot::new(0), + }); } fn assert_no_chains_exist(&mut self) { @@ -333,14 +340,6 @@ impl TestRig { } } - fn expect_blocks_by_range_requests(&mut self, request_filter: RequestFilter) { - let events = - self.filter_received_network_events(|ev| request_filter.blocks_by_range_requests(ev)); - if events.is_empty() { - panic!("Expected to find blocks_by_range requests {request_filter:?}") - } - } - fn expect_no_data_columns_by_range_requests(&mut self, request_filter: RequestFilter) { let events = self .filter_received_network_events(|ev| request_filter.data_columns_by_range_requests(ev)); @@ -1080,7 +1079,7 @@ fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { // Unikely that the single peer we added has enough columns for us. Tests are determinstic and // this error should never be hit r.add_connected_sync_peer_not_supernode(remote_info.clone()); - r.assert_state(RangeSyncType::Finalized); + r.assert_syncing_finalized(); // The SyncingChain has a single peer, so it can issue blocks_by_range requests. However, it // doesn't have enough peers to cover all columns From 52722b7b2ee627b76af82ee7357437b01e6ea2c0 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 27 May 2025 14:13:31 -0500 Subject: [PATCH 10/23] Resolve TODO(das) --- .../network/src/sync/backfill_sync/mod.rs | 3 -- .../network/src/sync/block_lookups/mod.rs | 2 +- .../network/src/sync/network_context.rs | 47 +++++++++---------- .../block_components_by_range.rs | 7 +-- .../sync/network_context/custody_by_range.rs | 27 +++++------ .../sync/network_context/custody_by_root.rs | 3 +- .../network/src/sync/range_sync/batch.rs | 13 ++--- beacon_node/network/src/sync/tests/range.rs | 7 +-- 8 files changed, 48 insertions(+), 61 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 45b9c61641b..e4bf1d93ef7 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -948,9 +948,6 @@ impl BackFillSync { return Ok(()); } Err(e) => match e { - // TODO(das): block_components_by_range requests can now hang out indefinitely. - // Is that fine? Maybe we should fail the requests from the network_context - // level without involving the BackfillSync itself. RpcRequestSendError::InternalError(e) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway warn!(%batch_id, error = ?e, %batch,"Could not send batch request"); diff --git a/beacon_node/network/src/sync/block_lookups/mod.rs b/beacon_node/network/src/sync/block_lookups/mod.rs index 2c59f710d04..f676068326b 100644 --- a/beacon_node/network/src/sync/block_lookups/mod.rs +++ b/beacon_node/network/src/sync/block_lookups/mod.rs @@ -724,7 +724,7 @@ impl BlockLookups { // Collect all peers that sent a column that was invalid. Must // run .unique as a single peer can send multiple invalid // columns. Penalize once to avoid insta-bans - .flat_map(|(index, _)| peer_group.of_index((*index) as usize)) + .flat_map(|(index, _)| peer_group.of_index(&(*index as usize))) .unique() .collect(), _ => peer_group.all().collect(), diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index f4db7e22566..61f223d938c 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -48,7 +48,7 @@ use tokio::sync::mpsc; use tracing::{debug, error, span, warn, Level}; use types::blob_sidecar::FixedBlobSidecarList; use types::{ - BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, + BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, ForkContext, Hash256, SignedBeaconBlock, SignedBeaconBlockHeader, Slot, }; @@ -124,42 +124,41 @@ pub struct PeerGroup { /// Peers group by which indexed section of the block component they served. For example: /// - PeerA served = [blob index 0, blob index 2] /// - PeerA served = [blob index 1] - peers: HashMap>, + peers: HashMap, } impl PeerGroup { + pub fn empty() -> Self { + Self { + peers: HashMap::new(), + } + } + /// Return a peer group where a single peer returned all parts of a block component. For /// example, a block has a single component (the block = index 0/1). pub fn from_single(peer: PeerId) -> Self { Self { - peers: HashMap::from_iter([(peer, vec![0])]), + peers: HashMap::from_iter([(0, peer)]), } } - pub fn from_set(peers: HashMap>) -> Self { + pub fn from_set(peer_to_indices: HashMap>) -> Self { + let mut peers = HashMap::new(); + for (peer, indices) in peer_to_indices { + for index in indices { + peers.insert(index, peer); + } + } Self { peers } } pub fn all(&self) -> impl Iterator + '_ { - self.peers.keys() + self.peers.values() } - pub fn of_index(&self, index: usize) -> impl Iterator + '_ { - self.peers.iter().filter_map(move |(peer, indices)| { - if indices.contains(&index) { - Some(peer) - } else { - None - } - }) + pub fn of_index(&self, index: &usize) -> Option<&PeerId> { + self.peers.get(index) } - pub fn as_reversed_map(&self) -> HashMap { - // TODO(das): should we change PeerGroup to hold this map? - let mut index_to_peer = HashMap::::new(); - for (peer, indices) in self.peers.iter() { - for &index in indices { - index_to_peer.insert(index as u64, *peer); - } - } - index_to_peer + pub fn as_map(&self) -> &HashMap { + &self.peers } } @@ -953,7 +952,7 @@ impl SyncNetworkContext { &mut self, parent_id: ComponentsByRangeRequestId, blocks_with_data: Vec, - epoch: Epoch, + request: BlocksByRangeRequest, column_indices: Vec, lookup_peers: Arc>>, ) -> Result { @@ -970,7 +969,7 @@ impl SyncNetworkContext { let mut request = ActiveCustodyByRangeRequest::new( id, - epoch, + request, blocks_with_data, &column_indices, lookup_peers, diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index 00f64f2e39c..fc08bcdb9c5 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -144,7 +144,6 @@ impl BlockComponentsByRangeRequest { else { // When a peer disconnects and is removed from the SyncingChain peer set, if the set // reaches zero the SyncingChain is removed. - // TODO(das): add test for this. return Err(RpcRequestSendError::InternalError( "A batch peer set should never be empty".to_string(), )); @@ -270,8 +269,7 @@ impl BlockComponentsByRangeRequest { .send_custody_by_range_request( self.id, blocks_with_data, - Slot::new(*self.request.start_slot()) - .epoch(T::EthSpec::slots_per_epoch()), + self.request.clone(), column_indices, self.peers.clone(), ) @@ -309,8 +307,7 @@ impl BlockComponentsByRangeRequest { .copied() .collect(); - let peer_group = - BatchPeers::new(*block_peer, column_peers.as_reversed_map()); + let peer_group = BatchPeers::new(*block_peer, column_peers.clone()); let rpc_blocks = couple_blocks_fulu( blocks.to_vec(), columns.to_vec(), diff --git a/beacon_node/network/src/sync/network_context/custody_by_range.rs b/beacon_node/network/src/sync/network_context/custody_by_range.rs index 6b4d2331889..18dea2070f2 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_range.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_range.rs @@ -3,7 +3,7 @@ use crate::sync::network_context::RpcResponseError; use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::BeaconChainTypes; use fnv::FnvHashMap; -use lighthouse_network::rpc::methods::DataColumnsByRangeRequest; +use lighthouse_network::rpc::{methods::DataColumnsByRangeRequest, BlocksByRangeRequest}; use lighthouse_network::service::api_types::{ CustodyByRangeRequestId, DataColumnsByRangeRequestId, }; @@ -16,8 +16,8 @@ use std::time::{Duration, Instant}; use std::{collections::HashMap, marker::PhantomData, sync::Arc}; use tracing::{debug, warn}; use types::{ - data_column_sidecar::ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, - Hash256, SignedBeaconBlockHeader, Slot, + data_column_sidecar::ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Hash256, + SignedBeaconBlockHeader, Slot, }; use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; @@ -28,8 +28,7 @@ const REQUEST_EXPIRY_SECONDS: u64 = 300; pub struct ActiveCustodyByRangeRequest { start_time: Instant, id: CustodyByRangeRequestId, - // TODO(das): Pass a better type for the by_range request - epoch: Epoch, + request: BlocksByRangeRequest, /// Blocks that we expect peers to serve data columns for blocks_with_data: Vec, /// List of column indices this request needs to download to complete successfully @@ -74,7 +73,7 @@ enum ColumnResponseError { impl ActiveCustodyByRangeRequest { pub(crate) fn new( id: CustodyByRangeRequestId, - epoch: Epoch, + request: BlocksByRangeRequest, blocks_with_data: Vec, column_indices: &[ColumnIndex], lookup_peers: Arc>>, @@ -82,7 +81,7 @@ impl ActiveCustodyByRangeRequest { Self { start_time: Instant::now(), id, - epoch, + request, blocks_with_data, column_requests: HashMap::from_iter( column_indices @@ -350,7 +349,6 @@ impl ActiveCustodyByRangeRequest { }) .collect::, _>>()? // Flatten Vec> to Vec - // TODO(das): maybe not optimal for the coupling logic later .into_iter() .flatten() .collect(); @@ -375,8 +373,9 @@ impl ActiveCustodyByRangeRequest { return Err(Error::TooManyDownloadErrors(last_error)); } - // TODO(das): When is a fork and only a subset of your peers know about a block, we should - // only query the peers on that fork. Should this case be handled? How to handle it? + // TODO(das): We should only query peers that are likely to know about this block. + // For by_range requests, only peers in the SyncingChain peer set. Else consider a + // fallback to the peers that are synced up to the epoch we want to query. let custodial_peers = cx.get_custodial_peers(*column_index); // We draw from the total set of peers, but prioritize those peers who we have @@ -433,12 +432,8 @@ impl ActiveCustodyByRangeRequest { .send_data_columns_by_range_request( peer_id, DataColumnsByRangeRequest { - // TODO(das): generalize with constants from batch - start_slot: self - .epoch - .start_slot(T::EthSpec::slots_per_epoch()) - .as_u64(), - count: T::EthSpec::slots_per_epoch(), + start_slot: *self.request.start_slot(), + count: *self.request.count(), columns: indices.clone(), }, self.id, diff --git a/beacon_node/network/src/sync/network_context/custody_by_root.rs b/beacon_node/network/src/sync/network_context/custody_by_root.rs index 489b9c3b11b..1ca2a55a13a 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_root.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_root.rs @@ -21,7 +21,8 @@ use super::{LookupRequestResult, PeerGroup, RpcResponseResult, SyncNetworkContex const FAILED_PEERS_CACHE_EXPIRY_SECONDS: u64 = 5; const REQUEST_EXPIRY_SECONDS: u64 = 300; -/// TODO(das): this attempt count is nested into the existing lookup request count. +/// TODO(das): Reconsider this retry count, it was choosen as a placeholder value. Each +/// `custody_by_*` request is already retried multiple inside of a lookup or batch const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; pub struct ActiveCustodyByRootRequest { diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 81f33352f50..8ee9748ebcc 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -1,9 +1,10 @@ +use crate::sync::network_context::PeerGroup; use beacon_chain::block_verification_types::RpcBlock; use itertools::Itertools; use lighthouse_network::rpc::methods::BlocksByRangeRequest; use lighthouse_network::service::api_types::Id; use lighthouse_network::PeerId; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::fmt; use std::hash::{Hash, Hasher}; use std::ops::Sub; @@ -22,17 +23,17 @@ const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3; #[derive(Clone, Debug)] pub struct BatchPeers { block_peer: PeerId, - column_peers: HashMap, + column_peers: PeerGroup, } impl BatchPeers { pub fn new_from_block_peer(block_peer: PeerId) -> Self { Self { block_peer, - column_peers: <_>::default(), + column_peers: PeerGroup::empty(), } } - pub fn new(block_peer: PeerId, column_peers: HashMap) -> Self { + pub fn new(block_peer: PeerId, column_peers: PeerGroup) -> Self { Self { block_peer, column_peers, @@ -44,12 +45,12 @@ impl BatchPeers { } pub fn column(&self, index: &ColumnIndex) -> Option<&PeerId> { - self.column_peers.get(index) + self.column_peers.of_index(&((*index) as usize)) } pub fn iter_unique_peers(&self) -> impl Iterator { std::iter::once(&self.block_peer) - .chain(self.column_peers.values()) + .chain(self.column_peers.all()) .unique() } } diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 1fb19e15ef1..09c99d07d8c 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -188,8 +188,6 @@ struct CompleteConfig { } impl CompleteConfig { - // TODO(das): add tests where blocks don't have data - fn custody_failure_at_index(mut self, index: u64) -> Self { self.custody_failure_at_index = Some(index); self @@ -1192,15 +1190,14 @@ fn finalized_sync_permanent_custody_peer_failure() { // Find the requests first to assert that this is the only request that exists r.expect_no_data_columns_by_range_requests(filter().epoch(0)); - // complete this one request without the custody failure now r.complete_data_by_range_request( reqs, complete().custody_failure_at_index(column_index_to_fail), ); } - // TODO(das): send batch 1 for completing processing and check that SyncingChain processed batch - // 1 successfully + // custody_by_range request is still active waiting for a new peer to connect + r.expect_active_block_components_by_range_request_on_custody_step(); } #[test] From fc3922f854113bdb6a4c449224229cdf10d39ec9 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 27 May 2025 15:28:03 -0500 Subject: [PATCH 11/23] Resolve more TODOs --- .../network/src/sync/backfill_sync/mod.rs | 33 ++++---- beacon_node/network/src/sync/manager.rs | 2 + .../network/src/sync/network_context.rs | 3 +- .../block_components_by_range.rs | 11 +-- .../sync/network_context/custody_by_range.rs | 82 +++++-------------- .../network/src/sync/range_sync/chain.rs | 47 ++++++----- 6 files changed, 66 insertions(+), 112 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index e4bf1d93ef7..47810d536e5 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -21,8 +21,9 @@ use beacon_chain::{BeaconChain, BeaconChainTypes}; use itertools::Itertools; use lighthouse_network::service::api_types::Id; use lighthouse_network::types::{BackFillState, NetworkGlobals}; -use lighthouse_network::PeerAction; +use lighthouse_network::{PeerAction, PeerId}; use logging::crit; +use parking_lot::RwLock; use std::collections::{ btree_map::{BTreeMap, Entry}, HashMap, HashSet, @@ -135,6 +136,8 @@ pub struct BackFillSync { /// This signifies that we are able to attempt to restart a failed chain. restart_failed_sync: bool, + peers: Arc>>, + /// Reference to the beacon chain to obtain initial starting points for the backfill sync. beacon_chain: Arc>, @@ -179,6 +182,7 @@ impl BackFillSync { current_processing_batch: None, validated_batches: 0, restart_failed_sync: false, + peers: <_>::default(), beacon_chain, }; @@ -218,14 +222,7 @@ impl BackFillSync { match self.state() { BackFillState::Syncing => {} // already syncing ignore. BackFillState::Paused => { - if self - .network_globals - .peers - .read() - .synced_peers() - .next() - .is_some() - { + if !self.peers.read().is_empty() { // If there are peers to resume with, begin the resume. debug!(start_epoch = ?self.current_start, awaiting_batches = self.batches.len(), processing_target = ?self.processing_target, "Resuming backfill sync"); self.set_state(BackFillState::Syncing); @@ -298,6 +295,14 @@ impl BackFillSync { } } + pub fn add_peer(&mut self, peer_id: PeerId) { + self.peers.write().insert(peer_id); + } + + pub fn peer_disconnected(&mut self, peer_id: &PeerId) { + self.peers.write().remove(peer_id); + } + /// An RPC error has occurred. /// /// If the batch exists it is re-requested. @@ -920,20 +925,12 @@ impl BackFillSync { batch_id: BatchId, ) -> Result<(), BackFillError> { if let Some(batch) = self.batches.get_mut(&batch_id) { - let synced_peers = self - .network_globals - .peers - .read() - .synced_peers() - .cloned() - .collect::>(); - let request = batch.to_blocks_by_range_request(); let failed_peers = batch.failed_block_peers(); match network.block_components_by_range_request( request, RangeRequestId::BackfillSync { batch_id }, - &synced_peers, + self.peers.clone(), &failed_peers, // Does not track total requests per peers for now &HashMap::new(), diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 5c72ac6d124..1fc46b95762 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -413,6 +413,7 @@ impl SyncManager { PeerSyncType::Advanced => { self.range_sync .add_peer(&mut self.network, local, peer_id, remote); + self.backfill_sync.add_peer(peer_id); } PeerSyncType::FullySynced => { // Sync considers this peer close enough to the head to not trigger range sync. @@ -530,6 +531,7 @@ impl SyncManager { // Remove peer from all data structures self.range_sync.peer_disconnect(&mut self.network, peer_id); + self.backfill_sync.peer_disconnected(peer_id); self.block_lookups.peer_disconnected(peer_id); // Regardless of the outcome, we update the sync status. diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 61f223d938c..7a4175f2708 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -478,7 +478,7 @@ impl SyncNetworkContext { &mut self, request: BlocksByRangeRequest, requester: RangeRequestId, - peers: &HashSet, + peers: Arc>>, peers_to_deprioritize: &HashSet, total_requests_per_peer: &HashMap, ) -> Result { @@ -498,7 +498,6 @@ impl SyncNetworkContext { self.block_components_by_range_requests.insert(id, req); - // TODO: use ID Ok(id.id) } diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index fc08bcdb9c5..bb981e31543 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -91,7 +91,7 @@ impl From for RpcRequestSendError { } } -/// FOR TESTING ONLY +/// Used to typesafe assertions of state in range sync tests #[cfg(test)] #[derive(Debug)] pub enum BlockComponentsByRangeRequestStep { @@ -103,7 +103,7 @@ impl BlockComponentsByRangeRequest { pub fn new( id: ComponentsByRangeRequestId, request: BlocksByRangeRequest, - peers: &HashSet, + peers: Arc>>, peers_to_deprioritize: &HashSet, total_requests_per_peer: &HashMap, cx: &mut SyncNetworkContext, @@ -123,6 +123,7 @@ impl BlockComponentsByRangeRequest { // will request all blocks for the first 5 epochs to that same single peer. Before we would // query only idle peers in the syncing chain. let Some(block_peer) = peers + .read() .iter() .map(|peer| { ( @@ -180,9 +181,7 @@ impl BlockComponentsByRangeRequest { Ok(Self { id, - // TODO(das): share the rwlock with the range sync batch. Are peers added to the batch - // after being created? - peers: Arc::new(RwLock::new(peers.clone())), + peers, request, state, }) @@ -511,8 +510,6 @@ fn couple_blocks_fulu( .remove(&block_root) .unwrap_or_default(); - // TODO(das): Change RpcBlock to holding a Vec of DataColumnSidecars so we don't need - // the spec here. RpcBlock::new_with_custody_columns( Some(block_root), block, diff --git a/beacon_node/network/src/sync/network_context/custody_by_range.rs b/beacon_node/network/src/sync/network_context/custody_by_range.rs index 18dea2070f2..ed796155e26 100644 --- a/beacon_node/network/src/sync/network_context/custody_by_range.rs +++ b/beacon_node/network/src/sync/network_context/custody_by_range.rs @@ -1,5 +1,4 @@ use super::custody_by_root::{ColumnRequest, Error}; -use crate::sync::network_context::RpcResponseError; use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::BeaconChainTypes; use fnv::FnvHashMap; @@ -22,7 +21,7 @@ use types::{ use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; -const TEMPORARY_FAULT_EXPIRY_SECONDS: u64 = 15; +const FAILED_PEERS_EXPIRY_SECONDS: u64 = 15; const REQUEST_EXPIRY_SECONDS: u64 = 300; pub struct ActiveCustodyByRangeRequest { @@ -41,13 +40,7 @@ pub struct ActiveCustodyByRangeRequest { FnvHashMap, /// Peers that have recently failed to successfully respond to a columns by root request. /// Having a LRUTimeCache allows this request to not have to track disconnecting peers. - peers_with_custody_failures: LRUTimeCache, - peers_with_temporary_faults: LRUTimeCache, - // TODO(das): does this HashSet has an OOM risk? We should either: make sure that this request - // structs are dropped after some time, that disconnected peers are pruned (but we may want to - // retain faulty information if they just disconnect and reconnect) or make this an LRUTimeCache - // with a long time (like 5 minutes). - peers_with_permanent_faults: HashSet, + failed_peers: LRUTimeCache, /// Set of peers that claim to have imported this block and their custody columns lookup_peers: Arc>>, @@ -89,13 +82,7 @@ impl ActiveCustodyByRangeRequest { .map(|index| (*index, ColumnRequest::new())), ), active_batch_columns_requests: <_>::default(), - peers_with_custody_failures: LRUTimeCache::new(Duration::from_secs( - TEMPORARY_FAULT_EXPIRY_SECONDS, - )), - peers_with_temporary_faults: LRUTimeCache::new(Duration::from_secs( - TEMPORARY_FAULT_EXPIRY_SECONDS, - )), - peers_with_permanent_faults: HashSet::new(), + failed_peers: LRUTimeCache::new(Duration::from_secs(FAILED_PEERS_EXPIRY_SECONDS)), lookup_peers, _phantom: PhantomData, } @@ -138,7 +125,7 @@ impl ActiveCustodyByRangeRequest { } // Accumulate columns that the peer does not have to issue a single log per request - let mut missing_column_indexes = vec![]; + let mut missing_column_indices = vec![]; let mut incorrect_column_indices = vec![]; let mut imported_column_indices = vec![]; @@ -178,14 +165,8 @@ impl ActiveCustodyByRangeRequest { // - peer custodies this column `index` // - peer claims to be synced to at least `slot` // - // Therefore not returning this column is an protocol violation that we - // penalize and mark the peer as failed to retry with another peer. - // - // TODO(das) do not consider this case a success. We know for sure the block has - // data. However we allow the peer to return empty as we can't attribute fault. - // TODO(das): Should track which columns are missing and eventually give up - // TODO(das): If the peer is in the lookup peer set it claims to have imported - // the block AND its custody columns. So in this case we can downscore + // Then we penalize the faulty peer, mark it as failed and try with + // another. Err(ColumnResponseError::MissingColumn(slot)) } }) @@ -219,15 +200,15 @@ impl ActiveCustodyByRangeRequest { )); } ColumnResponseError::MissingColumn(slot) => { - missing_column_indexes.push((index, slot)); + missing_column_indices.push((index, slot)); } } } } } - // Log missing_column_indexes and incorrect_column_indices here in batch per request - // to make this logs more compact and less noisy. + // Log `imported_column_indices`, `missing_column_indexes` and + // `incorrect_column_indices` once per request to make the logs less noisy. if !imported_column_indices.is_empty() { // TODO(das): this log may be redundant. We already log on DataColumnsByRange // completed, and on DataColumnsByRange sent we log the column indices @@ -246,21 +227,18 @@ impl ActiveCustodyByRangeRequest { } if !incorrect_column_indices.is_empty() { - // Note: Batch logging that columns are missing to not spam logger debug!( id = %self.id, data_columns_by_range_req_id = %req_id, %peer_id, - // TODO(das): this property can become very noisy, being the full range 0..128 - incorrect_columns = ?incorrect_column_indices, + ?incorrect_column_indices, "Custody by range peer returned non-matching columns" ); // Returning a non-canonical column is not a permanent fault. We should not // retry the peer for some time but the peer may return a canonical column in // the future. - // TODO(das): if this finalized sync the fault is permanent - self.peers_with_temporary_faults.insert(peer_id); + self.failed_peers.insert(peer_id); cx.report_peer( peer_id, PeerAction::MidToleranceError, @@ -268,19 +246,17 @@ impl ActiveCustodyByRangeRequest { ); } - if !missing_column_indexes.is_empty() { - // Note: Batch logging that columns are missing to not spam logger + if !missing_column_indices.is_empty() { debug!( id = %self.id, data_columns_by_range_req_id = %req_id, %peer_id, - // TODO(das): this property can become very noisy, being the full range 0..128 - ?missing_column_indexes, + ?missing_column_indices, "Custody by range peer claims to not have some data" ); // Not having columns is not a permanent fault. The peer may be backfilling. - self.peers_with_custody_failures.insert(peer_id); + self.failed_peers.insert(peer_id); cx.report_peer(peer_id, PeerAction::MidToleranceError, "custody_failure"); } } @@ -293,7 +269,6 @@ impl ActiveCustodyByRangeRequest { "Custody by range download error" ); - // TODO(das): Should mark peer as failed and try from another peer for column_index in &batch_request.indices { self.column_requests .get_mut(column_index) @@ -301,22 +276,8 @@ impl ActiveCustodyByRangeRequest { .on_download_error_and_mark_failure(req_id, err.clone())?; } - match err { - // Verify errors are correctness errors against our request or about the - // returned data itself. This peer is faulty or malicious, should not be - // retried. - RpcResponseError::VerifyError(_) => { - self.peers_with_permanent_faults.insert(peer_id); - } - // Network errors are not permanent faults and worth retrying - RpcResponseError::RpcError(_) => { - self.peers_with_temporary_faults.insert(peer_id); - } - // Do nothing for internal errors - RpcResponseError::InternalError(_) => {} - // unreachable - RpcResponseError::RequestExpired(_) => {} - } + // An RpcResponseError is already downscored in network_context + self.failed_peers.insert(peer_id); } }; @@ -386,18 +347,13 @@ impl ActiveCustodyByRangeRequest { let mut priorized_peers = custodial_peers .iter() .filter(|peer| { - // Never request again peers with permanent faults - // Do not request peers with custody failures for some time - !self.peers_with_permanent_faults.contains(peer) - && !self.peers_with_custody_failures.contains(peer) + // Do not request faulty peers for some time + !self.failed_peers.contains(peer) }) .map(|peer| { ( // Prioritize peers that claim to know have imported this block if lookup_peers.contains(peer) { 0 } else { 1 }, - // De-prioritize peers that have failed to successfully respond to - // requests recently, but allow to immediatelly request them again - self.peers_with_temporary_faults.contains(peer), // Prefer peers with fewer requests to load balance across peers. // We batch requests to the same peer, so count existence in the // `columns_to_request_by_peer` as a single 1 request. @@ -411,7 +367,7 @@ impl ActiveCustodyByRangeRequest { .collect::>(); priorized_peers.sort_unstable(); - if let Some((_, _, _, _, peer_id)) = priorized_peers.first() { + if let Some((_, _, _, peer_id)) = priorized_peers.first() { columns_to_request_by_peer .entry(*peer_id) .or_default() diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index abea407b0ed..b62ed2c9dd2 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -10,7 +10,9 @@ use itertools::Itertools; use lighthouse_network::service::api_types::Id; use lighthouse_network::{PeerAction, PeerId}; use logging::crit; +use parking_lot::RwLock; use std::collections::{btree_map::Entry, BTreeMap, HashMap, HashSet}; +use std::sync::Arc; use strum::IntoStaticStr; use tracing::{debug, instrument, warn}; use types::{Epoch, EthSpec, Hash256, Slot}; @@ -91,7 +93,11 @@ pub struct SyncingChain { /// /// Also, For each peer tracks the total requests done per peer as part of this SyncingChain /// `HashMap` - peers: HashMap, + peers: Arc>>, + + /// Tracks the total requests done to each peer for this SyncingChain. Forces us to fetch data + /// from all peers to prevent eclipse attacks + requests_per_peer: HashMap, /// Starting epoch of the next batch that needs to be downloaded. to_be_downloaded: BatchId, @@ -173,7 +179,8 @@ impl SyncingChain { target_head_slot, target_head_root, batches: BTreeMap::new(), - peers: HashMap::from_iter([(peer_id, <_>::default())]), + peers: Arc::new(RwLock::new(HashSet::from_iter([peer_id]))), + requests_per_peer: HashMap::from_iter([(peer_id, <_>::default())]), to_be_downloaded: start_epoch, processing_target: start_epoch, optimistic_start: None, @@ -191,7 +198,7 @@ impl SyncingChain { /// Check if the chain has peers from which to process batches. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] pub fn available_peers(&self) -> usize { - self.peers.len() + self.peers.read().len() } /// Get the chain's id. @@ -203,7 +210,12 @@ impl SyncingChain { /// Peers currently syncing this chain. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] pub fn peers(&self) -> impl Iterator + '_ { - self.peers.keys().cloned() + self.peers + .read() + .iter() + .copied() + .collect::>() + .into_iter() } /// Progress in epochs made by the chain @@ -227,9 +239,10 @@ impl SyncingChain { /// If the peer has active batches, those are considered failed and re-requested. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] pub fn remove_peer(&mut self, peer_id: &PeerId) -> ProcessingResult { - self.peers.remove(peer_id); + self.peers.write().remove(peer_id); + self.requests_per_peer.remove(peer_id); - if self.peers.is_empty() { + if self.peers.read().is_empty() { Err(RemoveChain::EmptyPeerPool) } else { Ok(KeepChain) @@ -259,7 +272,7 @@ impl SyncingChain { // Account for one more requests to this peer // TODO(das): this code assumes that we do a single request per peer per RpcBlock for peer in batch_peers.iter_unique_peers() { - *self.peers.entry(*peer).or_default() += 1; + *self.requests_per_peer.entry(*peer).or_default() += 1; } // check if we have this batch @@ -613,7 +626,7 @@ impl SyncingChain { "Batch failed to download. Dropping chain scoring peers" ); - for (peer, _) in self.peers.drain() { + for peer in self.peers.write().drain() { network.report_peer(peer, penalty, "faulty_chain"); } Err(RemoveChain::ChainFailed { @@ -878,7 +891,8 @@ impl SyncingChain { network: &mut SyncNetworkContext, peer_id: PeerId, ) -> ProcessingResult { - self.peers.insert(peer_id, <_>::default()); + self.peers.write().insert(peer_id); + self.requests_per_peer.insert(peer_id, <_>::default()); self.request_batches(network) } @@ -952,26 +966,15 @@ impl SyncingChain { let request = batch.to_blocks_by_range_request(); let failed_peers = batch.failed_block_peers(); - // TODO(das): we should request only from peers that are part of this SyncingChain. - // However, then we hit the NoPeer error frequently which causes the batch to fail and - // the SyncingChain to be dropped. We need to handle this case more gracefully. - let synced_peers = network - .network_globals() - .peers - .read() - .synced_peers() - .cloned() - .collect::>(); - match network.block_components_by_range_request( request, RangeRequestId::RangeSync { chain_id: self.id, batch_id, }, - &synced_peers, + self.peers.clone(), &failed_peers, - &self.peers, + &self.requests_per_peer, ) { Ok(request_id) => { // inform the batch about the new request From 0ef95dd7f834f67198ae8886ebf6f0754f0e3c37 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 27 May 2025 15:33:39 -0500 Subject: [PATCH 12/23] Remove stale TODO --- beacon_node/network/src/sync/range_sync/batch.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 8ee9748ebcc..99ee4fb6be2 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -19,7 +19,6 @@ const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5; /// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty. const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3; -// TODO(das): Consider merging with PeerGroup #[derive(Clone, Debug)] pub struct BatchPeers { block_peer: PeerId, From 144b83e6257918d3b66f9a078946cd43112d0b12 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 27 May 2025 15:52:14 -0500 Subject: [PATCH 13/23] Remove BatchStateSummary --- .../network/src/sync/range_sync/chain.rs | 30 ++-------------- .../network/src/sync/range_sync/mod.rs | 2 -- .../network/src/sync/range_sync/range.rs | 6 ++-- beacon_node/network/src/sync/tests/range.rs | 34 +++++++++++++------ 4 files changed, 29 insertions(+), 43 deletions(-) diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index b62ed2c9dd2..76721ec5aa3 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -129,37 +129,13 @@ pub enum ChainSyncingState { Syncing, } -#[cfg(test)] -#[derive(Debug, Eq, PartialEq)] -pub enum BatchStateSummary { - Downloading, - Processing, - AwaitingProcessing, - AwaitingValidation, - Unexpected(&'static str), -} - impl SyncingChain { - /// Returns a summary of batch states for assertions in tests. + /// Leaks the state of all active batches for assertions in tests. #[cfg(test)] - pub fn batches_state(&self) -> Vec<(BatchId, BatchStateSummary)> { + pub fn batches_state(&self) -> Vec<(BatchId, &BatchState)> { self.batches .iter() - .map(|(id, batch)| { - let state = match batch.state() { - // A batch is never left in this state, it's only the initial value - BatchState::AwaitingDownload => { - BatchStateSummary::Unexpected("AwaitingDownload") - } - BatchState::Downloading { .. } => BatchStateSummary::Downloading, - BatchState::AwaitingProcessing { .. } => BatchStateSummary::AwaitingProcessing, - BatchState::Poisoned => BatchStateSummary::Unexpected("Poisoned"), - BatchState::Processing { .. } => BatchStateSummary::Processing, - BatchState::Failed => BatchStateSummary::Unexpected("Failed"), - BatchState::AwaitingValidation { .. } => BatchStateSummary::AwaitingValidation, - }; - (*id, state) - }) + .map(|(id, batch)| (*id, batch.state())) .collect() } diff --git a/beacon_node/network/src/sync/range_sync/mod.rs b/beacon_node/network/src/sync/range_sync/mod.rs index e9fb0219c45..225b536d1de 100644 --- a/beacon_node/network/src/sync/range_sync/mod.rs +++ b/beacon_node/network/src/sync/range_sync/mod.rs @@ -10,8 +10,6 @@ mod sync_type; pub use batch::{ BatchConfig, BatchInfo, BatchOperationOutcome, BatchPeers, BatchProcessingResult, BatchState, }; -#[cfg(test)] -pub use chain::BatchStateSummary; pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; pub use range::RangeSync; pub use sync_type::RangeSyncType; diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index 473e2066cee..62d18252683 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -39,8 +39,6 @@ //! Each chain is downloaded in batches of blocks. The batched blocks are processed sequentially //! and further batches are requested as current blocks are being processed. -#[cfg(test)] -use super::chain::BatchStateSummary; use super::chain::{BatchId, ChainId, RemoveChain, SyncingChain}; use super::chain_collection::{ChainCollection, SyncChainStatus}; use super::sync_type::RangeSyncType; @@ -48,6 +46,8 @@ use super::BatchPeers; use crate::metrics; use crate::status::ToStatusMessage; use crate::sync::network_context::{RpcResponseError, SyncNetworkContext}; +#[cfg(test)] +use crate::sync::range_sync::BatchState; use crate::sync::BatchProcessResult; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; @@ -107,7 +107,7 @@ where } #[cfg(test)] - pub(crate) fn batches_state(&self) -> Vec<(ChainId, BatchId, BatchStateSummary)> { + pub(crate) fn batches_state(&self) -> Vec<(ChainId, BatchId, &BatchState)> { self.chains .iter() .flat_map(|chain| { diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 09c99d07d8c..75ad7d2767e 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -3,7 +3,7 @@ use crate::network_beacon_processor::ChainSegmentProcessId; use crate::status::ToStatusMessage; use crate::sync::manager::SLOT_IMPORT_TOLERANCE; use crate::sync::network_context::{BlockComponentsByRangeRequestStep, RangeRequestId}; -use crate::sync::range_sync::{BatchId, BatchStateSummary, RangeSyncType}; +use crate::sync::range_sync::{BatchId, BatchState, RangeSyncType}; use crate::sync::{ChainId, SyncMessage}; use beacon_chain::data_column_verification::CustodyDataColumn; use beacon_chain::test_utils::{test_spec, AttestationStrategy, BlockStrategy}; @@ -298,7 +298,7 @@ impl TestRig { self.sync_manager.network().network_globals().sync_state() } - fn get_batch_states(&mut self) -> Vec<(ChainId, BatchId, BatchStateSummary)> { + fn get_batch_states(&mut self) -> Vec<(ChainId, BatchId, &BatchState)> { self.sync_manager.range_sync().batches_state() } @@ -382,27 +382,39 @@ impl TestRig { } } - fn expect_all_batches_in_state(&mut self, states: &[BatchStateSummary]) { + fn expect_all_batches_in_state) -> bool>( + &mut self, + predicate: F, + expected_state: &'static str, + ) { let batches = self.get_batch_states(); if batches.is_empty() { panic!("no batches"); } - for batch in &batches { - if !states.contains(&batch.2) { - panic!("batch {batch:?} not in state {states:?}. Batches: {batches:?}"); + for (chain_id, batch_id, state) in &batches { + if !predicate(state) { + panic!("batch {chain_id} {batch_id} not in state {expected_state}, {state}"); } } } fn expect_all_batches_downloading(&mut self) { - self.expect_all_batches_in_state(&[BatchStateSummary::Downloading]); + self.expect_all_batches_in_state( + |state| matches!(state, BatchState::Downloading { .. }), + "Downloading", + ); } fn expect_all_batches_processing_or_awaiting(&mut self) { - self.expect_all_batches_in_state(&[ - BatchStateSummary::Processing, - BatchStateSummary::AwaitingProcessing, - ]); + self.expect_all_batches_in_state( + |state| { + matches!( + state, + BatchState::Processing { .. } | BatchState::AwaitingProcessing { .. } + ) + }, + "Processing or AwaitingProcessing", + ); } fn update_execution_engine_state(&mut self, state: EngineState) { From 02d97377a5510bd6659abe6c4836f994e62eec49 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 27 May 2025 16:07:45 -0500 Subject: [PATCH 14/23] Address review comments --- beacon_node/network/src/sync/manager.rs | 5 +--- .../network/src/sync/network_context.rs | 5 ++-- beacon_node/network/src/sync/tests/lookups.rs | 23 +++++++++---------- beacon_node/network/src/sync/tests/range.rs | 1 + 4 files changed, 15 insertions(+), 19 deletions(-) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 1fc46b95762..dfafc884050 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -1225,10 +1225,7 @@ impl SyncManager { // custody_by_range accumulates the results of multiple data_columns_by_range requests // returning a bigger list of data columns across all the column indices this node has // to custody - if let Some(result) = - self.network - .on_custody_by_range_response(id.parent_request_id, id, peer_id, resp) - { + if let Some(result) = self.network.on_custody_by_range_response(id, peer_id, resp) { self.on_custody_by_range_result(id.parent_request_id, result); } } diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 7a4175f2708..58eb3053034 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1376,14 +1376,13 @@ impl SyncNetworkContext { #[allow(clippy::type_complexity)] pub fn on_custody_by_range_response( &mut self, - id: CustodyByRangeRequestId, req_id: DataColumnsByRangeRequestId, peer_id: PeerId, resp: RpcResponseResult>, ) -> Option> { // Note: need to remove the request to borrow self again below. Otherwise we can't // do nested requests - let Some(mut request) = self.custody_by_range_requests.remove(&id) else { + let Some(mut request) = self.custody_by_range_requests.remove(&id.parent_request_id) else { metrics::inc_counter_vec( &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, &["custody_by_range"], @@ -1396,7 +1395,7 @@ impl SyncNetworkContext { .map_err(Into::::into) .transpose(); - self.handle_custody_by_range_result(id, request, result) + self.handle_custody_by_range_result(id.parent_request_id, request, result) } fn handle_custody_by_range_result( diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index d85504d4654..1a37c231861 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -368,16 +368,19 @@ impl TestRig { self.expect_empty_network(); } - // Don't make pub, use `add_connected_peer_testing_only` + // Note: prefer to use `add_connected_peer_testing_only`. This is currently extensively used in + // lookup tests. We should consolidate this "add peer" methods in a future refactor fn new_connected_peer(&mut self) -> PeerId { self.add_connected_peer_testing_only(false) } - // Don't make pub, use `add_connected_peer_testing_only` + // Note: prefer to use `add_connected_peer_testing_only`. This is currently extensively used in + // lookup tests. We should consolidate this "add peer" methods in a future refactor fn new_connected_supernode_peer(&mut self) -> PeerId { self.add_connected_peer_testing_only(true) } + /// Add a random connected peer that is not known by the sync module pub fn add_connected_peer_testing_only(&mut self, supernode: bool) -> PeerId { let key = self.determinstic_key(); let peer_id = self @@ -401,6 +404,7 @@ impl TestRig { peer_id } + /// Add a random connected peer + add it to sync with a specific remote Status pub fn add_sync_peer(&mut self, supernode: bool, remote_info: SyncInfo) -> PeerId { let peer_id = self.add_connected_peer_testing_only(supernode); self.send_sync_message(SyncMessage::AddPeer(peer_id, remote_info)); @@ -887,7 +891,7 @@ impl TestRig { } } - // Find, not pop + /// Similar to `pop_received_network_events` but finds matching events without removing them. pub fn filter_received_network_events) -> Option>( &mut self, predicate_transform: F, @@ -1149,15 +1153,10 @@ impl TestRig { } pub fn expect_no_penalty_for_anyone(&mut self) { - self.drain_network_rx(); - let downscore_events = self - .network_rx_queue - .iter() - .filter_map(|ev| match ev { - NetworkMessage::ReportPeer { peer_id, msg, .. } => Some((peer_id, msg)), - _ => None, - }) - .collect::>(); + let downscore_events = self.filter_received_network_events(|ev| match ev { + NetworkMessage::ReportPeer { peer_id, msg, .. } => Some((peer_id, msg)), + _ => None, + }); if !downscore_events.is_empty() { panic!("Expected no downscoring events but found: {downscore_events:?}"); } diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 75ad7d2767e..642f92ee664 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -51,6 +51,7 @@ enum ByRangeDataRequestIds { } impl ByRangeDataRequestIds { + /// If there's a single active request, returns its peer, else panics fn peer(&self) -> PeerId { match self { Self::PreDeneb => panic!("no requests PreDeneb"), From ae0ef8f92926e9c99189a271e23736f7cfa148d2 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 4 Jun 2025 22:20:45 -0600 Subject: [PATCH 15/23] Fix finalized_sync_permanent_custody_peer_failure --- beacon_node/network/src/sync/manager.rs | 5 +- .../network/src/sync/range_sync/chain.rs | 19 +++-- .../src/sync/range_sync/chain_collection.rs | 6 +- .../network/src/sync/range_sync/mod.rs | 2 +- .../network/src/sync/range_sync/range.rs | 4 +- beacon_node/network/src/sync/tests/lookups.rs | 23 +++++-- beacon_node/network/src/sync/tests/range.rs | 69 +++++++------------ 7 files changed, 66 insertions(+), 62 deletions(-) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index dfafc884050..f21576372d4 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -50,6 +50,7 @@ use crate::sync::block_lookups::{ BlobRequestState, BlockComponent, BlockRequestState, CustodyRequestState, DownloadResult, }; use crate::sync::network_context::PeerGroup; +use crate::sync::range_sync::BATCH_BUFFER_SIZE; use beacon_chain::block_verification_types::AsBlock; use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::{ @@ -280,6 +281,7 @@ pub fn spawn( sync_recv, SamplingConfig::Default, fork_context, + BATCH_BUFFER_SIZE, ); // spawn the sync manager thread @@ -302,6 +304,7 @@ impl SyncManager { sync_recv: mpsc::UnboundedReceiver>, sampling_config: SamplingConfig, fork_context: Arc, + batch_buffer_size: usize, ) -> Self { let network_globals = beacon_processor.network_globals.clone(); Self { @@ -313,7 +316,7 @@ impl SyncManager { beacon_chain.clone(), fork_context.clone(), ), - range_sync: RangeSync::new(beacon_chain.clone()), + range_sync: RangeSync::new(beacon_chain.clone(), batch_buffer_size), backfill_sync: BackFillSync::new(beacon_chain.clone(), network_globals), block_lookups: BlockLookups::new(), notified_unknown_roots: LRUTimeCache::new(Duration::from_secs( diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 76721ec5aa3..44b2b1937d6 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -26,7 +26,7 @@ use types::{Epoch, EthSpec, Hash256, Slot}; pub const EPOCHS_PER_BATCH: u64 = 1; /// The maximum number of batches to queue before requesting more. -const BATCH_BUFFER_SIZE: u8 = 5; +pub const BATCH_BUFFER_SIZE: usize = 5; /// A return type for functions that act on a `Chain` which informs the caller whether the chain /// has been completed and should be removed or to be kept if further processing is @@ -119,6 +119,9 @@ pub struct SyncingChain { /// The current processing batch, if any. current_processing_batch: Option, + + /// The maximum number of batches to queue before requesting more. + batch_buffer_size: usize, } #[derive(PartialEq, Debug)] @@ -147,6 +150,7 @@ impl SyncingChain { target_head_root: Hash256, peer_id: PeerId, chain_type: SyncingChainType, + batch_buffer_size: usize, ) -> Self { SyncingChain { id, @@ -163,6 +167,7 @@ impl SyncingChain { attempted_optimistic_starts: HashSet::default(), state: ChainSyncingState::Stopped, current_processing_batch: None, + batch_buffer_size, } } @@ -1075,7 +1080,7 @@ impl SyncingChain { .iter() .filter(|&(_epoch, batch)| in_buffer(batch)) .count() - > BATCH_BUFFER_SIZE as usize + >= self.batch_buffer_size as usize { return None; } @@ -1105,28 +1110,28 @@ impl SyncingChain { /// batch states. See [BatchState::visualize] for symbol definitions. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] fn visualize_batch_state(&self) -> String { - let mut visualization_string = String::with_capacity((BATCH_BUFFER_SIZE * 3) as usize); + let mut visualization_string = String::with_capacity((self.batch_buffer_size * 3) as usize); // Start of the block visualization_string.push('['); - for mut batch_index in 0..BATCH_BUFFER_SIZE { + for mut batch_index in 0..self.batch_buffer_size { if let Some(batch) = self .batches .get(&(self.processing_target + batch_index as u64 * EPOCHS_PER_BATCH)) { visualization_string.push(batch.visualize()); - if batch_index != BATCH_BUFFER_SIZE { + if batch_index != self.batch_buffer_size { // Add a comma in between elements visualization_string.push(','); } } else { // No batch exists, it is on our list to be downloaded // Fill in the rest of the gaps - while batch_index < BATCH_BUFFER_SIZE { + while batch_index < self.batch_buffer_size { visualization_string.push('E'); // Add a comma between the empty batches - if batch_index < BATCH_BUFFER_SIZE.saturating_sub(1) { + if batch_index < self.batch_buffer_size.saturating_sub(1) { visualization_string.push(',') } batch_index += 1; diff --git a/beacon_node/network/src/sync/range_sync/chain_collection.rs b/beacon_node/network/src/sync/range_sync/chain_collection.rs index 454f7c02d15..44ce43d56aa 100644 --- a/beacon_node/network/src/sync/range_sync/chain_collection.rs +++ b/beacon_node/network/src/sync/range_sync/chain_collection.rs @@ -51,6 +51,8 @@ pub struct ChainCollection { head_chains: FnvHashMap>, /// The current sync state of the process. state: RangeSyncState, + /// The maximum number of batches to queue before requesting more. + batch_buffer_size: usize, } impl ChainCollection { @@ -61,12 +63,13 @@ impl ChainCollection { .chain(self.head_chains.values()) } - pub fn new(beacon_chain: Arc>) -> Self { + pub fn new(beacon_chain: Arc>, batch_buffer_size: usize) -> Self { ChainCollection { beacon_chain, finalized_chains: FnvHashMap::default(), head_chains: FnvHashMap::default(), state: RangeSyncState::Idle, + batch_buffer_size, } } @@ -504,6 +507,7 @@ impl ChainCollection { target_head_root, peer, sync_type.into(), + self.batch_buffer_size, ); debug!( diff --git a/beacon_node/network/src/sync/range_sync/mod.rs b/beacon_node/network/src/sync/range_sync/mod.rs index 225b536d1de..67479f9a1e0 100644 --- a/beacon_node/network/src/sync/range_sync/mod.rs +++ b/beacon_node/network/src/sync/range_sync/mod.rs @@ -10,6 +10,6 @@ mod sync_type; pub use batch::{ BatchConfig, BatchInfo, BatchOperationOutcome, BatchPeers, BatchProcessingResult, BatchState, }; -pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; +pub use chain::{BatchId, ChainId, BATCH_BUFFER_SIZE, EPOCHS_PER_BATCH}; pub use range::RangeSync; pub use sync_type::RangeSyncType; diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index 62d18252683..8f52fa7a496 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -90,10 +90,10 @@ where name = "range_sync", skip_all )] - pub fn new(beacon_chain: Arc>) -> Self { + pub fn new(beacon_chain: Arc>, batch_buffer_size: usize) -> Self { RangeSync { beacon_chain: beacon_chain.clone(), - chains: ChainCollection::new(beacon_chain), + chains: ChainCollection::new(beacon_chain, batch_buffer_size), failed_chains: LRUTimeCache::new(std::time::Duration::from_secs( FAILED_CHAINS_EXPIRY_SECONDS, )), diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index f26a467f273..8477b46958f 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -2,6 +2,7 @@ use crate::network_beacon_processor::NetworkBeaconProcessor; use crate::sync::block_lookups::{ BlockLookupSummary, PARENT_DEPTH_TOLERANCE, SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS, }; +use crate::sync::range_sync::BATCH_BUFFER_SIZE; use crate::sync::{ manager::{BlockProcessType, BlockProcessingResult, SyncManager}, peer_sampling::SamplingConfig, @@ -59,16 +60,29 @@ pub enum PeersConfig { SupernodeOnly, } +pub struct TestOptions { + /// If the node created by this test harness is a supernode + pub is_supernode: bool, + /// The maximum number of batches to queue before requesting more. + pub batch_buffer_size: usize, +} + impl TestRig { pub fn test_setup() -> Self { - Self::test_setup_with_options(false) + Self::test_setup_with_options(TestOptions { + is_supernode: false, + batch_buffer_size: BATCH_BUFFER_SIZE, + }) } pub fn test_setup_as_supernode() -> Self { - Self::test_setup_with_options(true) + Self::test_setup_with_options(TestOptions { + is_supernode: true, + batch_buffer_size: BATCH_BUFFER_SIZE, + }) } - fn test_setup_with_options(is_supernode: bool) -> Self { + pub fn test_setup_with_options(options: TestOptions) -> Self { // Use `fork_from_env` logic to set correct fork epochs let spec = test_spec::(); @@ -101,7 +115,7 @@ impl TestRig { Vec::new(), network_config, chain.spec.clone(), - is_supernode, + options.is_supernode, )); let (beacon_processor, beacon_processor_rx) = NetworkBeaconProcessor::null_for_testing( globals, @@ -143,6 +157,7 @@ impl TestRig { required_successes: vec![SAMPLING_REQUIRED_SUCCESSES], }, fork_context, + options.batch_buffer_size, ), harness, fork_name, diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 642f92ee664..382965ec97b 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -4,12 +4,12 @@ use crate::status::ToStatusMessage; use crate::sync::manager::SLOT_IMPORT_TOLERANCE; use crate::sync::network_context::{BlockComponentsByRangeRequestStep, RangeRequestId}; use crate::sync::range_sync::{BatchId, BatchState, RangeSyncType}; +use crate::sync::tests::lookups::TestOptions; use crate::sync::{ChainId, SyncMessage}; use beacon_chain::data_column_verification::CustodyDataColumn; -use beacon_chain::test_utils::{test_spec, AttestationStrategy, BlockStrategy}; +use beacon_chain::test_utils::{AttestationStrategy, BlockStrategy}; use beacon_chain::{block_verification_types::RpcBlock, EngineState, NotifyExecutionLayer}; use beacon_processor::WorkType; -use lighthouse_network::discovery::{peer_id_to_node_id, CombinedKey}; use lighthouse_network::rpc::methods::{ BlobsByRangeRequest, DataColumnsByRangeRequest, OldBlocksByRangeRequest, }; @@ -19,16 +19,13 @@ use lighthouse_network::service::api_types::{ DataColumnsByRangeRequestId, SyncRequestId, }; use lighthouse_network::types::SyncState; -use lighthouse_network::{Enr, EnrExt, PeerId, SyncInfo}; -use rand::SeedableRng; -use rand_chacha::ChaCha20Rng; +use lighthouse_network::{PeerId, SyncInfo}; use std::collections::HashSet; use std::time::Duration; -use types::data_column_custody_group::compute_subnets_for_node; use types::{ - BeaconBlock, BlobSidecarList, BlockImportSource, ColumnIndex, DataColumnSidecar, - DataColumnSubnetId, Epoch, EthSpec, Hash256, KzgCommitment, MinimalEthSpec as E, Signature, - SignedBeaconBlock, SignedBeaconBlockHash, Slot, VariableList, + BeaconBlock, BlobSidecarList, BlockImportSource, ColumnIndex, DataColumnSidecar, Epoch, + EthSpec, Hash256, KzgCommitment, MinimalEthSpec as E, Signature, SignedBeaconBlock, + SignedBeaconBlockHash, Slot, VariableList, }; const D: Duration = Duration::new(0, 0); @@ -93,6 +90,12 @@ struct RequestFilter { column_index: Option, } +const NO_FILTER: RequestFilter = RequestFilter { + peer: None, + epoch: None, + column_index: None, +}; + impl RequestFilter { fn peer(mut self, peer: PeerId) -> Self { self.peer = Some(peer); @@ -1094,7 +1097,7 @@ fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { // The SyncingChain has a single peer, so it can issue blocks_by_range requests. However, it // doesn't have enough peers to cover all columns - r.progress_until_no_events(filter(), complete()); + r.progress_until_no_events(NO_FILTER, complete()); r.expect_no_active_rpc_requests(); // Here we have a batch with partially completed block_components_by_range requests. The batch @@ -1108,7 +1111,7 @@ fn finalized_sync_not_enough_custody_peers_on_start(config: Config) { // We still need to add enough peers to trigger batch downloads with idle peers. Same issue as // the test above. - r.progress_until_no_events(filter(), complete()); + r.progress_until_no_events(NO_FILTER, complete()); r.expect_no_active_rpc_requests(); r.expect_no_active_block_components_by_range_requests(); // TOOD(das): For now this tests don't complete sync. We can't track beacon processor Work @@ -1134,7 +1137,7 @@ fn finalized_sync_single_custody_peer_failure() { // Progress all blocks_by_range and columns_by_range requests but respond empty for a single // column index r.progress_until_no_events( - filter(), + NO_FILTER, complete().custody_failure_at_index(column_index_to_fail), ); r.expect_penalties("custody_failure"); @@ -1162,7 +1165,13 @@ fn finalized_sync_single_custody_peer_failure() { #[test] fn finalized_sync_permanent_custody_peer_failure() { - let mut r = TestRig::test_setup(); + let mut r = TestRig::test_setup_with_options(TestOptions { + is_supernode: false, + // The default buffer size is 5, but we want to manually complete only the batch for epoch + // 0. By setting this buffer to 1 sync will create a single batch until it completes. We can + // do better assertions of state assuming there's only one batch and logs are cleaner. + batch_buffer_size: 1, + }); // Only run post-PeerDAS if !r.fork_name.fulu_enabled() { return; @@ -1192,7 +1201,7 @@ fn finalized_sync_permanent_custody_peer_failure() { // Some peer had a costudy failure at `column_index` so sync should do a single extra request // for that index and epoch. We want to make sure that the request goes to different peer - // than the attempts before. + // than the attempted before. let reqs = r.find_data_by_range_request(filter().epoch(0).column_index(column_index_to_fail)); let req_peer = reqs.peer(); @@ -1212,35 +1221,3 @@ fn finalized_sync_permanent_custody_peer_failure() { // custody_by_range request is still active waiting for a new peer to connect r.expect_active_block_components_by_range_request_on_custody_step(); } - -#[test] -#[ignore] -fn mine_peerids() { - let spec = test_spec::(); - let mut rng = ChaCha20Rng::from_seed([0u8; 32]); - - let expected_subnets = (0..3) - .map(|i| DataColumnSubnetId::new(i as u64)) - .collect::>(); - - for i in 0..usize::MAX { - let key: CombinedKey = k256::ecdsa::SigningKey::random(&mut rng).into(); - let enr = Enr::builder().build(&key).unwrap(); - let peer_id = enr.peer_id(); - // Use default custody groups count - let node_id = peer_id_to_node_id(&peer_id).expect("convert peer_id to node_id"); - let subnets = compute_subnets_for_node(node_id.raw(), spec.custody_requirement, &spec) - .expect("should compute custody subnets"); - if expected_subnets == subnets { - panic!("{:?}", subnets); - } else { - let matches = expected_subnets - .iter() - .filter(|index| subnets.contains(index)) - .count(); - if matches > 0 { - println!("{i} {:?}", matches); - } - } - } -} From 28d9d8b8e2993b27f7cac279613b97858092afc8 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 11 Jun 2025 11:02:37 +0200 Subject: [PATCH 16/23] lint --- beacon_node/network/src/sync/range_sync/chain.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 44b2b1937d6..9e0363c379f 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -1080,7 +1080,7 @@ impl SyncingChain { .iter() .filter(|&(_epoch, batch)| in_buffer(batch)) .count() - >= self.batch_buffer_size as usize + >= self.batch_buffer_size { return None; } @@ -1110,7 +1110,7 @@ impl SyncingChain { /// batch states. See [BatchState::visualize] for symbol definitions. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] fn visualize_batch_state(&self) -> String { - let mut visualization_string = String::with_capacity((self.batch_buffer_size * 3) as usize); + let mut visualization_string = String::with_capacity(self.batch_buffer_size * 3); // Start of the block visualization_string.push('['); From 7a035787954e2406a4e1040c46f346783704fa4c Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 11 Jun 2025 11:21:12 +0200 Subject: [PATCH 17/23] Remove total_requests_per_peer --- .../network/src/sync/backfill_sync/mod.rs | 4 +--- .../network/src/sync/network_context.rs | 11 ++--------- .../block_components_by_range.rs | 9 +-------- .../network/src/sync/range_sync/batch.rs | 7 ------- .../network/src/sync/range_sync/chain.rs | 19 +------------------ 5 files changed, 5 insertions(+), 45 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 47810d536e5..0a68dc2ce8a 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -26,7 +26,7 @@ use logging::crit; use parking_lot::RwLock; use std::collections::{ btree_map::{BTreeMap, Entry}, - HashMap, HashSet, + HashSet, }; use std::sync::Arc; use tracing::{debug, error, info, instrument, warn}; @@ -932,8 +932,6 @@ impl BackFillSync { RangeRequestId::BackfillSync { batch_id }, self.peers.clone(), &failed_peers, - // Does not track total requests per peers for now - &HashMap::new(), ) { Ok(request_id) => { // inform the batch about the new request diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 5bb277d996c..f66f6668427 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -480,21 +480,14 @@ impl SyncNetworkContext { requester: RangeRequestId, peers: Arc>>, peers_to_deprioritize: &HashSet, - total_requests_per_peer: &HashMap, ) -> Result { let id = ComponentsByRangeRequestId { id: self.next_id(), requester, }; - let req = BlockComponentsByRangeRequest::new( - id, - request, - peers, - peers_to_deprioritize, - total_requests_per_peer, - self, - )?; + let req = + BlockComponentsByRangeRequest::new(id, request, peers, peers_to_deprioritize, self)?; self.block_components_by_range_requests.insert(id, req); diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index bb981e31543..07132f5ac1c 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -105,7 +105,6 @@ impl BlockComponentsByRangeRequest { request: BlocksByRangeRequest, peers: Arc>>, peers_to_deprioritize: &HashSet, - total_requests_per_peer: &HashMap, cx: &mut SyncNetworkContext, ) -> Result { // Induces a compile time panic if this doesn't hold true. @@ -129,19 +128,13 @@ impl BlockComponentsByRangeRequest { ( // If contains -> 1 (order after), not contains -> 0 (order first) peers_to_deprioritize.contains(peer), - // TODO(das): Should we use active_request_count_by_peer? - // Prefer peers with less overall requests - // active_request_count_by_peer.get(peer).copied().unwrap_or(0), - // Prefer peers with less total cummulative requests, so we fetch data from a - // diverse set of peers - total_requests_per_peer.get(peer).copied().unwrap_or(0), // Random factor to break ties, otherwise the PeerID breaks ties rand::random::(), peer, ) }) .min() - .map(|(_, _, _, peer)| *peer) + .map(|(_, _, peer)| *peer) else { // When a peer disconnects and is removed from the SyncingChain peer set, if the set // reaches zero the SyncingChain is removed. diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 99ee4fb6be2..ab9fd40babd 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -1,6 +1,5 @@ use crate::sync::network_context::PeerGroup; use beacon_chain::block_verification_types::RpcBlock; -use itertools::Itertools; use lighthouse_network::rpc::methods::BlocksByRangeRequest; use lighthouse_network::service::api_types::Id; use lighthouse_network::PeerId; @@ -46,12 +45,6 @@ impl BatchPeers { pub fn column(&self, index: &ColumnIndex) -> Option<&PeerId> { self.column_peers.of_index(&((*index) as usize)) } - - pub fn iter_unique_peers(&self) -> impl Iterator { - std::iter::once(&self.block_peer) - .chain(self.column_peers.all()) - .unique() - } } /// Allows customisation of the above constants used in other sync methods such as BackFillSync. diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 9e0363c379f..87e00bc91a2 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -11,7 +11,7 @@ use lighthouse_network::service::api_types::Id; use lighthouse_network::{PeerAction, PeerId}; use logging::crit; use parking_lot::RwLock; -use std::collections::{btree_map::Entry, BTreeMap, HashMap, HashSet}; +use std::collections::{btree_map::Entry, BTreeMap, HashSet}; use std::sync::Arc; use strum::IntoStaticStr; use tracing::{debug, instrument, warn}; @@ -90,15 +90,8 @@ pub struct SyncingChain { /// The peers that agree on the `target_head_slot` and `target_head_root` as a canonical chain /// and thus available to download this chain from. - /// - /// Also, For each peer tracks the total requests done per peer as part of this SyncingChain - /// `HashMap` peers: Arc>>, - /// Tracks the total requests done to each peer for this SyncingChain. Forces us to fetch data - /// from all peers to prevent eclipse attacks - requests_per_peer: HashMap, - /// Starting epoch of the next batch that needs to be downloaded. to_be_downloaded: BatchId, @@ -160,7 +153,6 @@ impl SyncingChain { target_head_root, batches: BTreeMap::new(), peers: Arc::new(RwLock::new(HashSet::from_iter([peer_id]))), - requests_per_peer: HashMap::from_iter([(peer_id, <_>::default())]), to_be_downloaded: start_epoch, processing_target: start_epoch, optimistic_start: None, @@ -221,7 +213,6 @@ impl SyncingChain { #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] pub fn remove_peer(&mut self, peer_id: &PeerId) -> ProcessingResult { self.peers.write().remove(peer_id); - self.requests_per_peer.remove(peer_id); if self.peers.read().is_empty() { Err(RemoveChain::EmptyPeerPool) @@ -250,12 +241,6 @@ impl SyncingChain { request_id: Id, blocks: Vec>, ) -> ProcessingResult { - // Account for one more requests to this peer - // TODO(das): this code assumes that we do a single request per peer per RpcBlock - for peer in batch_peers.iter_unique_peers() { - *self.requests_per_peer.entry(*peer).or_default() += 1; - } - // check if we have this batch let batch = match self.batches.get_mut(&batch_id) { None => { @@ -873,7 +858,6 @@ impl SyncingChain { peer_id: PeerId, ) -> ProcessingResult { self.peers.write().insert(peer_id); - self.requests_per_peer.insert(peer_id, <_>::default()); self.request_batches(network) } @@ -955,7 +939,6 @@ impl SyncingChain { }, self.peers.clone(), &failed_peers, - &self.requests_per_peer, ) { Ok(request_id) => { // inform the batch about the new request From 4e13b3be0f77d50eb5db04d4af335b4ee440f62e Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 11 Jun 2025 11:49:25 +0200 Subject: [PATCH 18/23] Fix failed_peers post fulu --- .../network/src/sync/backfill_sync/mod.rs | 11 +++- .../network/src/sync/range_sync/batch.rs | 58 +++++++++---------- .../network/src/sync/range_sync/chain.rs | 12 +++- 3 files changed, 43 insertions(+), 38 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 0a68dc2ce8a..5037cf48605 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -617,9 +617,12 @@ impl BackFillSync { error, } => { // TODO(sync): De-dup between back and forwards sync + let mut failed_peers = vec![]; + if let Some(penalty) = peer_action.block_peer { // Penalize the peer appropiately. network.report_peer(batch_peers.block(), penalty, "faulty_batch"); + failed_peers.push(batch_peers.block()); } // Penalize each peer only once. Currently a peer_action does not mix different @@ -635,9 +638,11 @@ impl BackFillSync { .unique() { network.report_peer(peer, penalty, "faulty_batch_column"); + failed_peers.push(peer); } - match batch.processing_completed(BatchProcessingResult::FaultyFailure) { + match batch.processing_completed(BatchProcessingResult::FaultyFailure(failed_peers)) + { Err(e) => { // Batch was in the wrong state self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)) @@ -926,12 +931,12 @@ impl BackFillSync { ) -> Result<(), BackFillError> { if let Some(batch) = self.batches.get_mut(&batch_id) { let request = batch.to_blocks_by_range_request(); - let failed_peers = batch.failed_block_peers(); + let failed_peers = batch.failed_peers(); match network.block_components_by_range_request( request, RangeRequestId::BackfillSync { batch_id }, self.peers.clone(), - &failed_peers, + failed_peers, ) { Ok(request_id) => { // inform the batch about the new request diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index ab9fd40babd..5267ba56ba5 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -112,7 +112,7 @@ pub enum BatchOperationOutcome { pub enum BatchProcessingResult { Success, - FaultyFailure, + FaultyFailure(Vec), NonFaultyFailure, } @@ -128,7 +128,9 @@ pub struct BatchInfo { /// Number of processing attempts that have failed but we do not count. non_faulty_processing_attempts: u8, /// The number of download retries this batch has undergone due to a failed request. - failed_download_attempts: Vec>, + failed_download_attempts: usize, + /// Peers that returned bad data, and we want to de-prioritize + failed_peers: HashSet, /// State of the batch. state: BatchState, /// Pin the generic @@ -197,7 +199,8 @@ impl BatchInfo { start_slot, end_slot, failed_processing_attempts: Vec::new(), - failed_download_attempts: Vec::new(), + failed_download_attempts: 0, + failed_peers: <_>::default(), non_faulty_processing_attempts: 0, state: BatchState::AwaitingDownload, marker: std::marker::PhantomData, @@ -206,23 +209,8 @@ impl BatchInfo { /// Gives a list of peers from which this batch has had a failed download or processing /// attempt. - /// - /// TODO(das): Returns only block peers to keep the mainnet path equivalent. The failed peers - /// mechanism is broken for PeerDAS and will be fixed with https://github.com/sigp/lighthouse/issues/6258 - pub fn failed_block_peers(&self) -> HashSet { - let mut peers = HashSet::with_capacity( - self.failed_processing_attempts.len() + self.failed_download_attempts.len(), - ); - - for attempt in &self.failed_processing_attempts { - peers.insert(attempt.peers.block()); - } - - for peer in self.failed_download_attempts.iter().flatten() { - peers.insert(*peer); - } - - peers + pub fn failed_peers(&self) -> &HashSet { + &self.failed_peers } /// Verifies if an incoming block belongs to this batch. @@ -272,8 +260,7 @@ impl BatchInfo { match self.state { BatchState::Poisoned => unreachable!("Poisoned batch"), BatchState::Failed => BatchOperationOutcome::Failed { - blacklist: self.failed_processing_attempts.len() - > self.failed_download_attempts.len(), + blacklist: self.failed_processing_attempts.len() > self.failed_download_attempts, }, _ => BatchOperationOutcome::Continue, } @@ -325,15 +312,19 @@ impl BatchInfo { match self.state.poison() { BatchState::Downloading(_request_id) => { // register the attempt and check if the batch can be tried again - self.failed_download_attempts.push(peer); - self.state = if self.failed_download_attempts.len() - >= B::max_batch_download_attempts() as usize - { - BatchState::Failed - } else { - // drop the blocks - BatchState::AwaitingDownload - }; + if let Some(peer) = peer { + self.failed_peers.insert(peer); + } + + self.failed_download_attempts += 1; + + self.state = + if self.failed_download_attempts >= B::max_batch_download_attempts() as usize { + BatchState::Failed + } else { + // drop the blocks + BatchState::AwaitingDownload + }; Ok(self.outcome()) } BatchState::Poisoned => unreachable!("Poisoned batch"), @@ -390,9 +381,12 @@ impl BatchInfo { BatchState::Processing(attempt) => { self.state = match procesing_result { BatchProcessingResult::Success => BatchState::AwaitingValidation(attempt), - BatchProcessingResult::FaultyFailure => { + BatchProcessingResult::FaultyFailure(failed_peers) => { // register the failed attempt self.failed_processing_attempts.push(attempt); + for peer in failed_peers { + self.failed_peers.insert(peer); + } // check if the batch can be downloaded again if self.failed_processing_attempts.len() diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 87e00bc91a2..17bce62a7c7 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -539,10 +539,13 @@ impl SyncingChain { // TODO(sync): propagate error in logs error: _, } => { + let mut failed_peers = vec![]; + // TODO(sync): De-dup between back and forwards sync if let Some(penalty) = peer_action.block_peer { // Penalize the peer appropiately. network.report_peer(batch_peers.block(), penalty, "faulty_batch"); + failed_peers.push(batch_peers.block()); } // Penalize each peer only once. Currently a peer_action does not mix different @@ -558,10 +561,13 @@ impl SyncingChain { .unique() { network.report_peer(peer, penalty, "faulty_batch_column"); + failed_peers.push(peer); } // Check if this batch is allowed to continue - match batch.processing_completed(BatchProcessingResult::FaultyFailure)? { + match batch + .processing_completed(BatchProcessingResult::FaultyFailure(failed_peers))? + { BatchOperationOutcome::Continue => { // Chain can continue. Check if it can be moved forward. if *imported_blocks > 0 { @@ -929,7 +935,7 @@ impl SyncingChain { let batch_state = self.visualize_batch_state(); if let Some(batch) = self.batches.get_mut(&batch_id) { let request = batch.to_blocks_by_range_request(); - let failed_peers = batch.failed_block_peers(); + let failed_peers = batch.failed_peers(); match network.block_components_by_range_request( request, @@ -938,7 +944,7 @@ impl SyncingChain { batch_id, }, self.peers.clone(), - &failed_peers, + failed_peers, ) { Ok(request_id) => { // inform the batch about the new request From e426e45455bd457a5735e722b08f11dd42630fc0 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 11 Jun 2025 12:38:55 +0200 Subject: [PATCH 19/23] Don't use failed_peers for download errors, rely on randomness to skip potentially faulty peers --- beacon_node/network/src/sync/backfill_sync/mod.rs | 6 ++---- beacon_node/network/src/sync/range_sync/batch.rs | 10 +--------- beacon_node/network/src/sync/range_sync/chain.rs | 8 ++------ 3 files changed, 5 insertions(+), 19 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 5037cf48605..70d6573264b 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -330,9 +330,7 @@ impl BackFillSync { return Ok(()); } debug!(batch_epoch = %batch_id, error = ?err, "Batch download failed"); - // TODO(das): Is it necessary for the batch to track failed peers? Can we make this - // mechanism compatible with PeerDAS and before PeerDAS? - match batch.download_failed(None) { + match batch.download_failed() { Err(e) => self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)), Ok(BatchOperationOutcome::Failed { blacklist: _ }) => self.fail_sync(match err { RpcResponseError::RpcError(_) @@ -956,7 +954,7 @@ impl BackFillSync { return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)); } - match batch.download_failed(None) { + match batch.download_failed() { Err(e) => { self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))? } diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 5267ba56ba5..8834c74c08b 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -305,17 +305,9 @@ impl BatchInfo { /// The `peer` parameter, when set to None, does not increment the failed attempts of /// this batch and register the peer, rather attempts a re-download. #[must_use = "Batch may have failed"] - pub fn download_failed( - &mut self, - peer: Option, - ) -> Result { + pub fn download_failed(&mut self) -> Result { match self.state.poison() { BatchState::Downloading(_request_id) => { - // register the attempt and check if the batch can be tried again - if let Some(peer) = peer { - self.failed_peers.insert(peer); - } - self.failed_download_attempts += 1; self.state = diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 17bce62a7c7..921d134c681 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -902,11 +902,7 @@ impl SyncingChain { %request_id, "Batch download error" ); - if let BatchOperationOutcome::Failed { blacklist } = - // TODO(das): Is it necessary for the batch to track failed peers? Can we make this - // mechanism compatible with PeerDAS and before PeerDAS? - batch.download_failed(None)? - { + if let BatchOperationOutcome::Failed { blacklist } = batch.download_failed()? { return Err(RemoveChain::ChainFailed { blacklist, failing_batch: batch_id, @@ -966,7 +962,7 @@ impl SyncingChain { warn!(%batch_id, error = ?e, "batch_id" = %batch_id, %batch, "Could not send batch request"); // register the failed download and check if the batch can be retried batch.start_downloading(1)?; // fake request_id = 1 is not relevant - match batch.download_failed(None)? { + match batch.download_failed()? { BatchOperationOutcome::Failed { blacklist } => { return Err(RemoveChain::ChainFailed { blacklist, From 82c8e82fe1a65eddcbb24734b2f1903e872fdc45 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 11 Jun 2025 16:46:18 +0200 Subject: [PATCH 20/23] Re-add NoPeers error --- .../network/src/sync/backfill_sync/mod.rs | 17 +++++++++++++++++ beacon_node/network/src/sync/network_context.rs | 1 + .../block_components_by_range.rs | 8 +++++--- .../network/src/sync/range_sync/chain.rs | 2 +- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 70d6573264b..0aaea4d65fd 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -301,6 +301,14 @@ impl BackFillSync { pub fn peer_disconnected(&mut self, peer_id: &PeerId) { self.peers.write().remove(peer_id); + + if self.peers.read().is_empty() { + info!( + "reason" = "insufficient_synced_peers", + "Backfill sync paused" + ); + self.set_state(BackFillState::Paused); + } } /// An RPC error has occurred. @@ -946,6 +954,15 @@ impl BackFillSync { return Ok(()); } Err(e) => match e { + RpcRequestSendError::NoPeers => { + // If we are here the chain has no more synced peers + info!( + "reason" = "insufficient_synced_peers", + "Backfill sync paused" + ); + self.set_state(BackFillState::Paused); + return Err(BackFillError::Paused); + } RpcRequestSendError::InternalError(e) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway warn!(%batch_id, error = ?e, %batch,"Could not send batch request"); diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index f66f6668427..a81591e58f0 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -98,6 +98,7 @@ pub enum RpcRequestSendError { // If RpcRequestSendError has a single variant `InternalError` it's to signal to downstream // consumers that sends are expected to be infallible. If this assumption changes in the future, // add a new variant. + NoPeers, } #[derive(Debug, PartialEq, Eq)] diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index 07132f5ac1c..913b798d8ed 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -138,9 +138,7 @@ impl BlockComponentsByRangeRequest { else { // When a peer disconnects and is removed from the SyncingChain peer set, if the set // reaches zero the SyncingChain is removed. - return Err(RpcRequestSendError::InternalError( - "A batch peer set should never be empty".to_string(), - )); + return Err(RpcRequestSendError::NoPeers); }; let blocks_req_id = cx.send_blocks_by_range_request(block_peer, request.clone(), id)?; @@ -269,6 +267,10 @@ impl BlockComponentsByRangeRequest { RpcRequestSendError::InternalError(e) => { Error::InternalError(e) } + RpcRequestSendError::NoPeers => Error::InternalError( + "send_custody_by_range_request does not error with NoPeers" + .to_owned(), + ), })?; *state = FuluEnabledState::CustodyRequest { diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 921d134c681..83a9dc07b71 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -957,7 +957,7 @@ impl SyncingChain { return Ok(KeepChain); } Err(e) => match e { - RpcRequestSendError::InternalError(e) => { + e @ (RpcRequestSendError::NoPeers | RpcRequestSendError::InternalError(_)) => { // NOTE: under normal conditions this shouldn't happen but we handle it anyway warn!(%batch_id, error = ?e, "batch_id" = %batch_id, %batch, "Could not send batch request"); // register the failed download and check if the batch can be retried From 56fcf289ec7bca2d9a77d776675108fad41a2900 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 12 Jun 2025 15:45:36 +0200 Subject: [PATCH 21/23] lint --- .../lighthouse_network/src/types/globals.rs | 4 ++++ .../block_components_by_range.rs | 22 +++++-------------- beacon_node/network/src/sync/tests/range.rs | 2 +- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/beacon_node/lighthouse_network/src/types/globals.rs b/beacon_node/lighthouse_network/src/types/globals.rs index 0c023442087..1c11e7aa1f0 100644 --- a/beacon_node/lighthouse_network/src/types/globals.rs +++ b/beacon_node/lighthouse_network/src/types/globals.rs @@ -248,6 +248,10 @@ impl NetworkGlobals { } } + pub fn sampling_columns_count(&self) -> usize { + self.sampling_columns.read().len() + } + pub fn sampling_columns(&self) -> HashSet { self.sampling_columns.read().clone() } diff --git a/beacon_node/network/src/sync/network_context/block_components_by_range.rs b/beacon_node/network/src/sync/network_context/block_components_by_range.rs index 913b798d8ed..f896589f85a 100644 --- a/beacon_node/network/src/sync/network_context/block_components_by_range.rs +++ b/beacon_node/network/src/sync/network_context/block_components_by_range.rs @@ -188,10 +188,7 @@ impl BlockComponentsByRangeRequest { } => { if let Some((blocks, block_peer)) = blocks_by_range_request.to_finished() { let peer_group = BatchPeers::new_from_block_peer(*block_peer); - let rpc_blocks = couple_blocks_base( - blocks.to_vec(), - cx.network_globals().sampling_columns.len(), - ); + let rpc_blocks = couple_blocks_base(blocks.to_vec()); Ok(Some((rpc_blocks, peer_group))) } else { // Wait for blocks_by_range requests to complete @@ -230,8 +227,7 @@ impl BlockComponentsByRangeRequest { if blocks_with_data.is_empty() { let custody_column_indices = cx .network_globals() - .sampling_columns - .clone() + .sampling_columns() .iter() .copied() .collect(); @@ -248,8 +244,7 @@ impl BlockComponentsByRangeRequest { } else { let mut column_indices = cx .network_globals() - .sampling_columns - .clone() + .sampling_columns() .iter() .copied() .collect::>(); @@ -295,8 +290,7 @@ impl BlockComponentsByRangeRequest { if let Some((columns, column_peers)) = custody_by_range_request.to_finished() { let custody_column_indices = cx .network_globals() - .sampling_columns - .clone() + .sampling_columns() .iter() .copied() .collect(); @@ -425,13 +419,10 @@ impl BlockComponentsByRangeRequest { } } -fn couple_blocks_base( - blocks: Vec>>, - custody_columns_count: usize, -) -> Vec> { +fn couple_blocks_base(blocks: Vec>>) -> Vec> { blocks .into_iter() - .map(|block| RpcBlock::new_without_blobs(None, block, custody_columns_count)) + .map(|block| RpcBlock::new_without_blobs(None, block)) .collect() } @@ -509,7 +500,6 @@ fn couple_blocks_fulu( Some(block_root), block, data_columns_with_block_root, - custody_column_indices.clone(), spec, ) .map_err(Error::InternalError) diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index 6de5902bb16..599c808befa 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -209,7 +209,7 @@ fn complete() -> CompleteConfig { impl TestRig { fn our_custody_indices(&self) -> Vec { self.network_globals - .sampling_columns + .sampling_columns() .iter() .copied() .collect() From aa726cc72cce7703073da8754abda794664d72a3 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 12 Jun 2025 19:29:14 +0200 Subject: [PATCH 22/23] lint --- beacon_node/beacon_chain/tests/store_tests.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index 9f8c14f3398..73e2a9025c7 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -2603,7 +2603,7 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { .deconstruct(); if wss_fork.fulu_enabled() { info!(block_slot = %block.slot(), ?block_root, "Corrupting data column KZG proof"); - let (mut data_columns, expected_column_indices) = cols.unwrap(); + let mut data_columns = cols.unwrap(); assert!( !data_columns.is_empty(), "data column sidecars shouldn't be empty" @@ -2618,7 +2618,6 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { Some(block_root), block, data_columns.to_vec(), - expected_column_indices, &harness.spec, ) .unwrap() @@ -3819,7 +3818,6 @@ fn available_to_rpc_block(block: AvailableBlock, spec: &ChainSpec .into_iter() .map(|d| CustodyDataColumn::from_asserted_custody(d)) .collect(), - vec![], spec, ) .unwrap(), From cb5f76f13700970ac9f87554369870236fada881 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 12 Jun 2025 19:41:20 +0200 Subject: [PATCH 23/23] Add peers to backfill if FullySynced --- beacon_node/network/src/sync/manager.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index c7d727c63fc..94599a072ee 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -416,7 +416,6 @@ impl SyncManager { PeerSyncType::Advanced => { self.range_sync .add_peer(&mut self.network, local, peer_id, remote); - self.backfill_sync.add_peer(peer_id); } PeerSyncType::FullySynced => { // Sync considers this peer close enough to the head to not trigger range sync. @@ -434,6 +433,13 @@ impl SyncManager { } } } + + match sync_type { + PeerSyncType::Behind => {} + PeerSyncType::Advanced | PeerSyncType::FullySynced => { + self.backfill_sync.add_peer(peer_id); + } + } } self.update_sync_state();