
Commit 5f07325

Authored by lexnv and dmitry-markin
kad: Improve robustness of addresses from the routing table (#369)
This PR improves the robustness of the Kademlia addresses stored in the routing table. The implementation uses an eventually consistent address store based on the information provided by the transport manager. This has the benefit of not sharing the transport manager's address store with the routing table, and therefore not paying the performance cost of locking the addresses (globally) on Kademlia queries.

- The transport manager exposes all dialed addresses properly to the installed protocols.
- The Kademlia peer contains an address store instead of a plain vector, bounded to 32 entries (note that the transport manager holds up to 64 entries).
- Similarly to the transport manager, the addresses are sorted by a score that reflects dial successes and failures.
- When an address score turns negative, the address is subject to removal.
- On a successful dial, the score of the dialed address is increased.
- On dial failures, the score of each failed address is decreased.
- Known-good addresses are always provided first.
- All Kademlia queries rely on this information, and the addresses are provided sorted by their score.

This resolves the following issue, with the caveat that undialed addresses, and dialed but failed addresses, are still provided as long as the address store has enough capacity. This ensures that addresses which fail due to temporary network errors, or due to errors specific to our node (e.g., being behind a firewall, while other peers may still reach the address), are still provided:

- #209

cc @paritytech/networking

---------

Signed-off-by: Alexandru Vasile <[email protected]>
Co-authored-by: Dmitry Markin <[email protected]>
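To make the scoring behavior concrete, here is a minimal, self-contained sketch of how such a score-based, bounded address store can work. This is an illustration only, not the crate's actual API: the type name `AddressStore`, the score constants, and the use of `String` in place of `Multiaddr` are assumptions made for the example.

```rust
use std::collections::HashMap;

/// Maximum number of addresses tracked per peer (the PR bounds the routing
/// table store to 32 entries; the transport manager holds up to 64).
const MAX_ADDRESSES: usize = 32;

/// Hypothetical score adjustments; the real values live in the crate.
const SCORE_DIAL_SUCCESS: i32 = 100;
const SCORE_DIAL_FAILURE: i32 = -100;

/// Minimal per-peer address store, keyed by address. `String` stands in
/// for `Multiaddr` so the example has no external dependencies.
#[derive(Default)]
struct AddressStore {
    by_address: HashMap<String, i32>,
}

impl AddressStore {
    /// Insert a previously unseen address with a neutral score.
    fn insert(&mut self, address: String) {
        if self.by_address.len() < MAX_ADDRESSES {
            self.by_address.entry(address).or_insert(0);
        }
    }

    /// Successful dial: increase the score of the dialed address.
    fn on_dial_success(&mut self, address: &str) {
        let score = self.by_address.entry(address.to_owned()).or_insert(0);
        *score = score.saturating_add(SCORE_DIAL_SUCCESS);
    }

    /// Failed dial: decrease the score. Negative-score addresses are
    /// "subject to removal", modeled here as eviction once the store is
    /// full, so temporarily failing addresses survive while there is room.
    fn on_dial_failure(&mut self, address: &str) {
        if let Some(score) = self.by_address.get_mut(address) {
            *score = score.saturating_add(SCORE_DIAL_FAILURE);
        }
        if self.by_address.len() >= MAX_ADDRESSES {
            self.by_address.retain(|_, score| *score >= 0);
        }
    }

    /// Addresses sorted by descending score: known-good addresses first,
    /// undialed (score 0) next, failing ones last.
    fn addresses(&self) -> Vec<String> {
        let mut entries: Vec<_> = self.by_address.iter().collect();
        entries.sort_by(|a, b| b.1.cmp(a.1));
        entries.into_iter().map(|(addr, _)| addr.clone()).collect()
    }
}

fn main() {
    let mut store = AddressStore::default();
    store.insert("/dns/domain1.com/tcp/30333".into());
    store.insert("/dns/domain1.com/tcp/30334".into());
    store.on_dial_success("/dns/domain1.com/tcp/30334");
    store.on_dial_failure("/dns/domain1.com/tcp/30333");
    // The successfully dialed address is ranked first.
    assert_eq!(store.addresses()[0], "/dns/domain1.com/tcp/30334");
}
```

Under these assumptions, queries always see known-good addresses first, while addresses that failed only transiently remain available until capacity pressure evicts them.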
1 parent 54a8e06 commit 5f07325

File tree: 15 files changed (+380, -100 lines changed)


src/protocol/libp2p/kademlia/bucket.rs

Lines changed: 3 additions & 3 deletions
```diff
@@ -27,7 +27,7 @@ use crate::{
 };
 
 /// K-bucket entry.
-#[derive(Debug, PartialEq, Eq)]
+#[derive(Debug)]
 pub enum KBucketEntry<'a> {
     /// Entry points to local node.
     LocalNode,
@@ -48,7 +48,7 @@ impl<'a> KBucketEntry<'a> {
         if let KBucketEntry::Vacant(old) = self {
             old.peer = new.peer;
             old.key = Key::from(new.peer);
-            old.addresses = new.addresses;
+            old.address_store = new.address_store;
             old.connection = new.connection;
         }
     }
@@ -105,7 +105,7 @@ impl KBucket {
     pub fn closest_iter<K: Clone>(&self, target: &Key<K>) -> impl Iterator<Item = &KademliaPeer> {
         let mut nodes: Vec<_> = self.nodes.iter().collect();
         nodes.sort_by(|a, b| target.distance(&a.key).cmp(&target.distance(&b.key)));
-        nodes.into_iter().filter(|peer| !peer.addresses.is_empty())
+        nodes.into_iter().filter(|peer| !peer.address_store.is_empty())
     }
 }
 
```
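For context on the `closest_iter` change above: the iterator orders a bucket's peers by XOR distance to the target key, and now additionally skips peers whose address store is empty. Below is a rough, self-contained illustration of the XOR-distance ordering; the fixed 32-byte key type and the free-standing `distance` helper are assumptions for the example, not the crate's `Key` API.

```rust
/// XOR distance between two fixed-size keys, compared as big-endian bytes.
fn distance(a: &[u8; 32], b: &[u8; 32]) -> [u8; 32] {
    let mut out = [0u8; 32];
    for i in 0..32 {
        out[i] = a[i] ^ b[i];
    }
    out
}

fn main() {
    let target = [0u8; 32];
    let mut peers: Vec<[u8; 32]> = vec![[3u8; 32], [1u8; 32], [2u8; 32]];
    // Sort by XOR distance to the target, closest first, mirroring what
    // `closest_iter` does before filtering out peers with no addresses.
    peers.sort_by(|a, b| distance(a, &target).cmp(&distance(b, &target)));
    assert_eq!(peers[0], [1u8; 32]);
}
```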

src/protocol/libp2p/kademlia/mod.rs

Lines changed: 137 additions & 21 deletions
```diff
@@ -36,6 +36,7 @@ use crate::{
         Direction, TransportEvent, TransportService,
     },
     substream::Substream,
+    transport::Endpoint,
     types::SubstreamId,
     PeerId,
 };
@@ -220,14 +221,18 @@ impl Kademlia {
     }
 
     /// Connection established to remote peer.
-    fn on_connection_established(&mut self, peer: PeerId) -> crate::Result<()> {
+    fn on_connection_established(&mut self, peer: PeerId, endpoint: Endpoint) -> crate::Result<()> {
         tracing::trace!(target: LOG_TARGET, ?peer, "connection established");
 
         match self.peers.entry(peer) {
             Entry::Vacant(entry) => {
-                if let KBucketEntry::Occupied(entry) = self.routing_table.entry(Key::from(peer)) {
-                    entry.connection = ConnectionType::Connected;
-                }
+                // Set the connection type to connected and potentially save the address in the
+                // table.
+                //
+                // Note: this happens regardless of the state of the kademlia managed peers, because
+                // an already occupied entry in the `self.peers` map does not mean that we are
+                // no longer interested in the address / connection type of the peer.
+                self.routing_table.on_connection_established(Key::from(peer), endpoint);
 
                 let Some(actions) = self.pending_dials.remove(&peer) else {
                     entry.insert(PeerContext::new());
@@ -262,7 +267,21 @@
                 entry.insert(context);
                 Ok(())
             }
-            Entry::Occupied(_) => Err(Error::PeerAlreadyExists(peer)),
+            Entry::Occupied(_) => {
+                tracing::warn!(
+                    target: LOG_TARGET,
+                    ?peer,
+                    ?endpoint,
+                    "connection already exists, discarding opening substreams, this is unexpected"
+                );
+
+                // Update the connection in the routing table, similar to the above. The function
+                // call happens in two places to avoid unnecessary cloning of the endpoint for
+                // logging purposes.
+                self.routing_table.on_connection_established(Key::from(peer), endpoint);
+
+                Err(Error::PeerAlreadyExists(peer))
+            }
         }
     }
 
@@ -395,12 +414,13 @@
             .await;
 
         for info in peers {
-            self.service.add_known_address(&info.peer, info.addresses.iter().cloned());
+            let addresses = info.addresses();
+            self.service.add_known_address(&info.peer, addresses.clone().into_iter());
 
             if std::matches!(self.update_mode, RoutingTableUpdateMode::Automatic) {
                 self.routing_table.add_known_peer(
                     info.peer,
-                    info.addresses.clone(),
+                    addresses,
                     self.peers
                         .get(&info.peer)
                         .map_or(ConnectionType::NotConnected, |_| ConnectionType::Connected),
@@ -529,13 +549,15 @@
         );
 
         match (providers.len(), providers.pop()) {
-            (1, Some(provider)) =>
+            (1, Some(provider)) => {
+                let addresses = provider.addresses();
+
                 if provider.peer == peer {
                     self.store.put_provider(
                         key.clone(),
                         ContentProvider {
                             peer,
-                            addresses: provider.addresses.clone(),
+                            addresses: addresses.clone(),
                         },
                     );
@@ -545,7 +567,7 @@
                             provided_key: key,
                             provider: ContentProvider {
                                 peer: provider.peer,
-                                addresses: provider.addresses,
+                                addresses,
                             },
                         })
                         .await;
@@ -556,7 +578,8 @@
                         provider = ?provider.peer,
                         "ignoring `ADD_PROVIDER` message with `publisher` != `provider`"
                     )
-            },
+                }
+            }
             (n, _) => {
                 tracing::trace!(
                     target: LOG_TARGET,
@@ -669,8 +692,10 @@
     }
 
     /// Handle dial failure.
-    fn on_dial_failure(&mut self, peer: PeerId, address: Multiaddr) {
-        tracing::trace!(target: LOG_TARGET, ?peer, ?address, "failed to dial peer");
+    fn on_dial_failure(&mut self, peer: PeerId, addresses: Vec<Multiaddr>) {
+        tracing::trace!(target: LOG_TARGET, ?peer, ?addresses, "failed to dial peer");
+
+        self.routing_table.on_dial_failure(Key::from(peer), &addresses);
 
         let Some(actions) = self.pending_dials.remove(&peer) else {
             return;
@@ -682,7 +707,7 @@
                     target: LOG_TARGET,
                     ?peer,
                     query = ?query_id,
-                    ?address,
+                    ?addresses,
                     "report failure for pending query",
                 );
 
@@ -774,7 +799,10 @@
                 .send(KademliaEvent::FindNodeSuccess {
                     target,
                     query_id: query,
-                    peers: peers.into_iter().map(|info| (info.peer, info.addresses)).collect(),
+                    peers: peers
+                        .into_iter()
+                        .map(|info| (info.peer, info.addresses()))
+                        .collect(),
                 })
                 .await;
             Ok(())
@@ -889,8 +917,8 @@
 
         tokio::select! {
             event = self.service.next() => match event {
-                Some(TransportEvent::ConnectionEstablished { peer, .. }) => {
-                    if let Err(error) = self.on_connection_established(peer) {
+                Some(TransportEvent::ConnectionEstablished { peer, endpoint }) => {
+                    if let Err(error) = self.on_connection_established(peer, endpoint) {
                         tracing::debug!(
                             target: LOG_TARGET,
                             ?error,
@@ -923,8 +951,8 @@
                 Some(TransportEvent::SubstreamOpenFailure { substream, error }) => {
                     self.on_substream_open_failure(substream, error).await;
                 }
-                Some(TransportEvent::DialFailure { peer, address, .. }) =>
-                    self.on_dial_failure(peer, address),
+                Some(TransportEvent::DialFailure { peer, addresses }) =>
+                    self.on_dial_failure(peer, addresses),
                 None => return Err(Error::EssentialTaskClosed),
             },
             context = self.executor.next() => {
@@ -1048,7 +1076,7 @@
 
         match self.routing_table.entry(Key::from(peer)) {
             KBucketEntry::Occupied(entry) => Some(entry.clone()),
-            KBucketEntry::Vacant(entry) if !entry.addresses.is_empty() =>
+            KBucketEntry::Vacant(entry) if !entry.address_store.is_empty() =>
                 Some(entry.clone()),
             _ => None,
         }
@@ -1237,8 +1265,11 @@
             KEEP_ALIVE_TIMEOUT,
         },
         types::protocol::ProtocolName,
-        BandwidthSink,
+        BandwidthSink, ConnectionId,
     };
+    use multiaddr::Protocol;
+    use multihash::Multihash;
+    use std::str::FromStr;
     use tokio::sync::mpsc::channel;
 
     #[allow(unused)]
@@ -1379,4 +1410,89 @@
         // Check the local storage should not get updated.
         assert!(kademlia.store.get(&key).is_none());
     }
+
+    #[tokio::test]
+    async fn check_address_store_routing_table_updates() {
+        let (mut kademlia, _context, _manager) = make_kademlia();
+
+        let peer = PeerId::random();
+        let address_a = Multiaddr::from_str("/dns/domain1.com/tcp/30333").unwrap().with(
+            Protocol::P2p(Multihash::from_bytes(&peer.to_bytes()).unwrap()),
+        );
+        let address_b = Multiaddr::from_str("/dns/domain1.com/tcp/30334").unwrap().with(
+            Protocol::P2p(Multihash::from_bytes(&peer.to_bytes()).unwrap()),
+        );
+        let address_c = Multiaddr::from_str("/dns/domain1.com/tcp/30339").unwrap().with(
+            Protocol::P2p(Multihash::from_bytes(&peer.to_bytes()).unwrap()),
+        );
+
+        // Added only with address A.
+        kademlia.routing_table.add_known_peer(
+            peer,
+            vec![address_a.clone()],
+            ConnectionType::NotConnected,
+        );
+
+        // Check peer addresses.
+        match kademlia.routing_table.entry(Key::from(peer)) {
+            KBucketEntry::Occupied(entry) => {
+                assert_eq!(entry.addresses(), vec![address_a.clone()]);
+            }
+            _ => panic!("Peer not found in routing table"),
+        };
+
+        // Report a successful connection with address B via a dialer endpoint.
+        let _ = kademlia.on_connection_established(
+            peer,
+            Endpoint::Dialer {
+                address: address_b.clone(),
+                connection_id: ConnectionId::from(0),
+            },
+        );
+
+        // Address B has a higher priority, as it was detected via the dialing mechanism of the
+        // transport manager, while address A has not been dialed yet.
+        match kademlia.routing_table.entry(Key::from(peer)) {
+            KBucketEntry::Occupied(entry) => {
+                assert_eq!(
+                    entry.addresses(),
+                    vec![address_b.clone(), address_a.clone()]
+                );
+            }
+            _ => panic!("Peer not found in routing table"),
+        };
+
+        // Report a successful connection with a random address via a listener endpoint.
+        let _ = kademlia.on_connection_established(
+            peer,
+            Endpoint::Listener {
+                address: address_c.clone(),
+                connection_id: ConnectionId::from(0),
+            },
+        );
+        // Address C was not added, as the peer has dialed us, possibly from an ephemeral port.
+        match kademlia.routing_table.entry(Key::from(peer)) {
+            KBucketEntry::Occupied(entry) => {
+                assert_eq!(
+                    entry.addresses(),
+                    vec![address_b.clone(), address_a.clone()]
+                );
+            }
+            _ => panic!("Peer not found in routing table"),
+        };
+
+        // Address B fails two times (which gives it a lower score than A) and
+        // makes it subject to removal.
+        kademlia.on_dial_failure(peer, vec![address_b.clone(), address_b.clone()]);
+
+        match kademlia.routing_table.entry(Key::from(peer)) {
+            KBucketEntry::Occupied(entry) => {
+                assert_eq!(
+                    entry.addresses(),
+                    vec![address_a.clone(), address_b.clone()]
+                );
+            }
+            _ => panic!("Peer not found in routing table"),
+        };
+    }
 }
```
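The new test above hinges on the direction of the connection: an address we learned by dialing out (`Endpoint::Dialer`) is recorded, while the remote's source address on an inbound connection (`Endpoint::Listener`) is ignored, since it is likely an ephemeral port that cannot be dialed back. A minimal sketch of that decision, with `Endpoint` reduced to an illustrative two-variant enum (the real type also carries a `ConnectionId`, and addresses are `Multiaddr`s rather than `String`s):

```rust
/// Simplified stand-in for the transport `Endpoint` seen in the diff.
enum Endpoint {
    /// We dialed the peer at this address.
    Dialer { address: String },
    /// The peer dialed us from this address.
    Listener { address: String },
}

/// Only outbound (dialer) addresses are worth recording in the routing
/// table: an inbound peer's source address is usually an ephemeral port.
fn address_to_record(endpoint: &Endpoint) -> Option<&str> {
    match endpoint {
        Endpoint::Dialer { address } => Some(address),
        Endpoint::Listener { .. } => None,
    }
}

fn main() {
    let outbound = Endpoint::Dialer { address: "/dns/domain1.com/tcp/30334".into() };
    let inbound = Endpoint::Listener { address: "/ip4/10.0.0.7/tcp/54321".into() };
    assert_eq!(address_to_record(&outbound), Some("/dns/domain1.com/tcp/30334"));
    assert_eq!(address_to_record(&inbound), None);
}
```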

src/protocol/libp2p/kademlia/query/find_node.rs

Lines changed: 31 additions & 6 deletions
```diff
@@ -317,7 +317,7 @@ mod tests {
         KademliaPeer {
             peer,
             key: Key::from(peer),
-            addresses: vec![],
+            address_store: Default::default(),
             connection: ConnectionType::Connected,
         }
     }
@@ -353,7 +353,12 @@
         let mut context = FindNodeContext::new(config, VecDeque::new());
         assert!(context.is_done());
         let event = context.next_action().unwrap();
-        assert_eq!(event, QueryAction::QueryFailed { query: QueryId(0) });
+        match event {
+            QueryAction::QueryFailed { query, .. } => {
+                assert_eq!(query, QueryId(0));
+            }
+            _ => panic!("Unexpected event"),
+        };
     }
 
     #[test]
@@ -522,7 +527,12 @@
 
         // Produces the result.
        let event = context.next_action().unwrap();
-        assert_eq!(event, QueryAction::QuerySucceeded { query: QueryId(0) });
+        match event {
+            QueryAction::QuerySucceeded { query, .. } => {
+                assert_eq!(query, QueryId(0));
+            }
+            _ => panic!("Unexpected event"),
+        };
     }
 
     #[test]
@@ -550,7 +560,12 @@
         context.register_response(closest, vec![]);
 
         let event = context.next_action().unwrap();
-        assert_eq!(event, QueryAction::QuerySucceeded { query: QueryId(0) });
+        match event {
+            QueryAction::QuerySucceeded { query } => {
+                assert_eq!(query, QueryId(0));
+            }
+            _ => panic!("Unexpected event"),
+        };
     }
 
     #[test]
@@ -602,7 +617,12 @@
         context.register_response(closest, vec![]);
 
         let event = context.next_action().unwrap();
-        assert_eq!(event, QueryAction::QuerySucceeded { query: QueryId(0) });
+        match event {
+            QueryAction::QuerySucceeded { query, .. } => {
+                assert_eq!(query, QueryId(0));
+            }
+            _ => panic!("Unexpected event"),
+        };
     }
 
     #[test]
@@ -665,7 +685,12 @@
 
         // Produces the result.
         let event = context.next_action().unwrap();
-        assert_eq!(event, QueryAction::QuerySucceeded { query: QueryId(0) });
+        match event {
+            QueryAction::QuerySucceeded { query } => {
+                assert_eq!(query, QueryId(0));
+            }
+            _ => panic!("Unexpected event"),
+        };
 
         // Because the FindNode query keeps a window of the best K (3 in this case) peers,
         // we expect to produce the best K peers. As opposed to having only the last entry
```
