@@ -78,33 +78,91 @@ impl<T: Transport + 'static, ER: RaftEngine> ProxyForwarder<T, ER> {
         )
     }
 
+    // Returns whether we should ignore the MsgSnapshot.
+    #[allow(clippy::collapsible_if)]
+    fn snapshot_filter(&self, msg: &RaftMessage) -> bool {
+        let inner_msg = msg.get_message();
+        let region_id = msg.get_region_id();
+        let new_peer_id = msg.get_to_peer().get_id();
+        let mut should_skip = false;
+        let f = |info: MapEntry<u64, Arc<CachedRegionInfo>>| {
+            match info {
+                MapEntry::Occupied(mut o) => {
+                    // If the peer is bootstrapped, we will accept the MsgSnapshot.
+                    if o.get().inited_or_fallback.load(Ordering::SeqCst) {
+                        return;
+                    }
+                    let has_already_inited = self.is_initialized(region_id);
+                    if has_already_inited {
+                        o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst);
+                    }
+                    if o.get().fast_add_peer_start.load(Ordering::SeqCst) != 0 {
+                        if o.get().snapshot_inflight.load(Ordering::SeqCst) == 0 {
+                            // If the FAP snapshot is still being built, skip this MsgSnapshot.
+                            // We will wait until FAP succeeds or falls back.
+                            info!("fast path: ongoing {}:{} {}, MsgSnapshot rejected",
+                                self.store_id, region_id, new_peer_id;
+                                "to_peer_id" => msg.get_to_peer().get_id(),
+                                "from_peer_id" => msg.get_from_peer().get_id(),
+                                "region_id" => region_id,
+                                "inner_msg" => Self::format_msg(inner_msg),
+                                "has_already_inited" => has_already_inited,
+                                "inited_or_fallback" => o.get().inited_or_fallback.load(Ordering::SeqCst),
+                                "snapshot_inflight" => o.get().snapshot_inflight.load(Ordering::SeqCst),
+                                "fast_add_peer_start" => o.get().fast_add_peer_start.load(Ordering::SeqCst),
+                            );
+                            should_skip = true;
+                        }
+                        // Otherwise, this snapshot could be either a FAP snapshot
+                        // or a normal snapshot. In either case, we should handle it.
+                    }
+                }
+                MapEntry::Vacant(_) => {}
+            }
+        };
+
+        match self.get_cached_manager().get_inited_or_fallback(region_id) {
+            Some(true) => {
+                // The common case: the peer is already inited.
+            }
+            None | Some(false) => self
+                .get_cached_manager()
+                .access_cached_region_info_mut(region_id, f)
+                .unwrap(),
+        };
+
+        should_skip
+    }
+
     // Returns whether we need to ignore this message and run fast path instead.
     pub fn maybe_fast_path_tick(&self, msg: &RaftMessage) -> bool {
         if !self.packed_envs.engine_store_cfg.enable_fast_add_peer {
             // fast path not enabled
             return false;
         }
         let inner_msg = msg.get_message();
+        let region_id = msg.get_region_id();
+        let new_peer_id = msg.get_to_peer().get_id();
         if inner_msg.get_commit() == 0 && inner_msg.get_msg_type() == MessageType::MsgHeartbeat {
             return false;
         } else if inner_msg.get_msg_type() == MessageType::MsgAppend {
+            // Go on to the following logic to see if we should filter.
+        } else if inner_msg.get_msg_type() == MessageType::MsgSnapshot {
+            return self.snapshot_filter(msg);
         } else {
+            // We only handle the first MsgAppend.
             return false;
         }
-        // We don't need to recover all region infomation from restart,
+        // We don't need to recover all region information from restart,
         // since we have `has_already_inited`.
-        let inner_msg = msg.get_message();
-        if inner_msg.get_msg_type() != MessageType::MsgAppend {
-            // we only handles the first MsgAppend
-            return false;
-        }
-        let region_id = msg.get_region_id();
-        let new_peer_id = msg.get_to_peer().get_id();
+
         let cached_manager = self.get_cached_manager();
         let mut is_first = false;
         let mut is_replicated = false;
         let mut has_already_inited = None;
         let mut early_skip = false;
+
         let f = |info: MapEntry<u64, Arc<CachedRegionInfo>>| {
             let current = SystemTime::now()
                 .duration_since(SystemTime::UNIX_EPOCH)
@@ -406,11 +464,15 @@ impl<T: Transport + 'static, ER: RaftEngine> ProxyForwarder<T, ER> {
                         self.store_id, region_id, new_peer_id, s;
                         "region_id" => region_id,
                     );
-                    // We don't fallback if the fap snapshot is persisted,
-                    // Because it has been sent, or has not been sent.
-                    // So we can't decide whether to use fallback to clean the previous
-                    // snapshot. Any later error will cause fap snapshot
-                    // mismatch.
+                    // We call fallback here even if the FAP snapshot is persisted and sent,
+                    // because the sent snapshot is only handled if its (index, term) matches,
+                    // even if there is another normal snapshot: both snapshots are identical
+                    // in that case.
+                    // TODO: we could retry FAP several times before failing. However, such
+                    // cases are rare; we have only observed the missing raft logs problem a
+                    // few times.
+                    let cached_manager = self.get_cached_manager();
+                    cached_manager.fallback_to_slow_path(region_id);
                     return false;
                 }
             };
@@ -421,6 +483,8 @@ impl<T: Transport + 'static, ER: RaftEngine> ProxyForwarder<T, ER> {
                         self.store_id, region_id, new_peer_id, e;
                         "region_id" => region_id,
                     );
+                    let cached_manager = self.get_cached_manager();
+                    cached_manager.fallback_to_slow_path(region_id);
                     return false;
                 }
             };
@@ -484,7 +548,11 @@ impl<T: Transport + 'static, ER: RaftEngine> ProxyForwarder<T, ER> {
         // Find term of entry at applied_index.
         let applied_index = apply_state.get_applied_index();
         let applied_term =
-            self.check_entry_at_index(region_id, applied_index, new_peer_id, "applied_index")?;
+            match self.check_entry_at_index(region_id, applied_index, new_peer_id, "applied_index")
+            {
+                Ok(x) => x,
+                Err(e) => return Err(e),
+            };
         // Will otherwise cause "got message with lower index than committed" loop.
         // Maybe this can be removed, since fb0917bfa44ec1fc55967 can pass if we remove
         // this constraint.