eth/protocols/snap: use ephemeral channels to avoid cross-sync delveries

This commit is contained in:
Péter Szilágyi 2021-04-15 21:01:16 +03:00
parent 1e207342b5
commit 9553c98de8
No known key found for this signature in database
GPG Key ID: E9AE538CEDF8293D
1 changed files with 112 additions and 101 deletions

View File

@ -106,9 +106,11 @@ type accountRequest struct {
peer string // Peer to which this request is assigned peer string // Peer to which this request is assigned
id uint64 // Request ID of this request id uint64 // Request ID of this request
cancel chan struct{} // Channel to track sync cancellation deliver chan *accountResponse // Channel to deliver successful response on
timeout *time.Timer // Timer to track delivery timeout revert chan *accountRequest // Channel to deliver request failure on
stale chan struct{} // Channel to signal the request was dropped cancel chan struct{} // Channel to track sync cancellation
timeout *time.Timer // Timer to track delivery timeout
stale chan struct{} // Channel to signal the request was dropped
origin common.Hash // First account requested to allow continuation checks origin common.Hash // First account requested to allow continuation checks
limit common.Hash // Last account requested to allow non-overlapping chunking limit common.Hash // Last account requested to allow non-overlapping chunking
@ -147,9 +149,11 @@ type bytecodeRequest struct {
peer string // Peer to which this request is assigned peer string // Peer to which this request is assigned
id uint64 // Request ID of this request id uint64 // Request ID of this request
cancel chan struct{} // Channel to track sync cancellation deliver chan *bytecodeResponse // Channel to deliver successful response on
timeout *time.Timer // Timer to track delivery timeout revert chan *bytecodeRequest // Channel to deliver request failure on
stale chan struct{} // Channel to signal the request was dropped cancel chan struct{} // Channel to track sync cancellation
timeout *time.Timer // Timer to track delivery timeout
stale chan struct{} // Channel to signal the request was dropped
hashes []common.Hash // Bytecode hashes to validate responses hashes []common.Hash // Bytecode hashes to validate responses
task *accountTask // Task which this request is filling (only access fields through the runloop!!) task *accountTask // Task which this request is filling (only access fields through the runloop!!)
@ -176,9 +180,11 @@ type storageRequest struct {
peer string // Peer to which this request is assigned peer string // Peer to which this request is assigned
id uint64 // Request ID of this request id uint64 // Request ID of this request
cancel chan struct{} // Channel to track sync cancellation deliver chan *storageResponse // Channel to deliver successful response on
timeout *time.Timer // Timer to track delivery timeout revert chan *storageRequest // Channel to deliver request failure on
stale chan struct{} // Channel to signal the request was dropped cancel chan struct{} // Channel to track sync cancellation
timeout *time.Timer // Timer to track delivery timeout
stale chan struct{} // Channel to signal the request was dropped
accounts []common.Hash // Account hashes to validate responses accounts []common.Hash // Account hashes to validate responses
roots []common.Hash // Storage roots to validate responses roots []common.Hash // Storage roots to validate responses
@ -224,9 +230,11 @@ type trienodeHealRequest struct {
peer string // Peer to which this request is assigned peer string // Peer to which this request is assigned
id uint64 // Request ID of this request id uint64 // Request ID of this request
cancel chan struct{} // Channel to track sync cancellation deliver chan *trienodeHealResponse // Channel to deliver successful response on
timeout *time.Timer // Timer to track delivery timeout revert chan *trienodeHealRequest // Channel to deliver request failure on
stale chan struct{} // Channel to signal the request was dropped cancel chan struct{} // Channel to track sync cancellation
timeout *time.Timer // Timer to track delivery timeout
stale chan struct{} // Channel to signal the request was dropped
hashes []common.Hash // Trie node hashes to validate responses hashes []common.Hash // Trie node hashes to validate responses
paths []trie.SyncPath // Trie node paths requested for rescheduling paths []trie.SyncPath // Trie node paths requested for rescheduling
@ -256,9 +264,11 @@ type bytecodeHealRequest struct {
peer string // Peer to which this request is assigned peer string // Peer to which this request is assigned
id uint64 // Request ID of this request id uint64 // Request ID of this request
cancel chan struct{} // Channel to track sync cancellation deliver chan *bytecodeHealResponse // Channel to deliver successful response on
timeout *time.Timer // Timer to track delivery timeout revert chan *bytecodeHealRequest // Channel to deliver request failure on
stale chan struct{} // Channel to signal the request was dropped cancel chan struct{} // Channel to track sync cancellation
timeout *time.Timer // Timer to track delivery timeout
stale chan struct{} // Channel to signal the request was dropped
hashes []common.Hash // Bytecode hashes to validate responses hashes []common.Hash // Bytecode hashes to validate responses
task *healTask // Task which this request is filling (only access fields through the runloop!!) task *healTask // Task which this request is filling (only access fields through the runloop!!)
@ -399,14 +409,6 @@ type Syncer struct {
bytecodeReqs map[uint64]*bytecodeRequest // Bytecode requests currently running bytecodeReqs map[uint64]*bytecodeRequest // Bytecode requests currently running
storageReqs map[uint64]*storageRequest // Storage requests currently running storageReqs map[uint64]*storageRequest // Storage requests currently running
accountReqFails chan *accountRequest // Failed account range requests to revert
bytecodeReqFails chan *bytecodeRequest // Failed bytecode requests to revert
storageReqFails chan *storageRequest // Failed storage requests to revert
accountResps chan *accountResponse // Account sub-tries to integrate into the database
bytecodeResps chan *bytecodeResponse // Bytecodes to integrate into the database
storageResps chan *storageResponse // Storage sub-tries to integrate into the database
accountSynced uint64 // Number of accounts downloaded accountSynced uint64 // Number of accounts downloaded
accountBytes common.StorageSize // Number of account trie bytes persisted to disk accountBytes common.StorageSize // Number of account trie bytes persisted to disk
bytecodeSynced uint64 // Number of bytecodes downloaded bytecodeSynced uint64 // Number of bytecodes downloaded
@ -421,12 +423,6 @@ type Syncer struct {
trienodeHealReqs map[uint64]*trienodeHealRequest // Trie node requests currently running trienodeHealReqs map[uint64]*trienodeHealRequest // Trie node requests currently running
bytecodeHealReqs map[uint64]*bytecodeHealRequest // Bytecode requests currently running bytecodeHealReqs map[uint64]*bytecodeHealRequest // Bytecode requests currently running
trienodeHealReqFails chan *trienodeHealRequest // Failed trienode requests to revert
bytecodeHealReqFails chan *bytecodeHealRequest // Failed bytecode requests to revert
trienodeHealResps chan *trienodeHealResponse // Trie nodes to integrate into the database
bytecodeHealResps chan *bytecodeHealResponse // Bytecodes to integrate into the database
trienodeHealSynced uint64 // Number of state trie nodes downloaded trienodeHealSynced uint64 // Number of state trie nodes downloaded
trienodeHealBytes common.StorageSize // Number of state trie bytes persisted to disk trienodeHealBytes common.StorageSize // Number of state trie bytes persisted to disk
trienodeHealDups uint64 // Number of state trie nodes already processed trienodeHealDups uint64 // Number of state trie nodes already processed
@ -464,26 +460,16 @@ func NewSyncer(db ethdb.KeyValueStore) *Syncer {
storageIdlers: make(map[string]struct{}), storageIdlers: make(map[string]struct{}),
bytecodeIdlers: make(map[string]struct{}), bytecodeIdlers: make(map[string]struct{}),
accountReqs: make(map[uint64]*accountRequest), accountReqs: make(map[uint64]*accountRequest),
storageReqs: make(map[uint64]*storageRequest), storageReqs: make(map[uint64]*storageRequest),
bytecodeReqs: make(map[uint64]*bytecodeRequest), bytecodeReqs: make(map[uint64]*bytecodeRequest),
accountReqFails: make(chan *accountRequest),
storageReqFails: make(chan *storageRequest),
bytecodeReqFails: make(chan *bytecodeRequest),
accountResps: make(chan *accountResponse),
storageResps: make(chan *storageResponse),
bytecodeResps: make(chan *bytecodeResponse),
trienodeHealIdlers: make(map[string]struct{}), trienodeHealIdlers: make(map[string]struct{}),
bytecodeHealIdlers: make(map[string]struct{}), bytecodeHealIdlers: make(map[string]struct{}),
trienodeHealReqs: make(map[uint64]*trienodeHealRequest), trienodeHealReqs: make(map[uint64]*trienodeHealRequest),
bytecodeHealReqs: make(map[uint64]*bytecodeHealRequest), bytecodeHealReqs: make(map[uint64]*bytecodeHealRequest),
trienodeHealReqFails: make(chan *trienodeHealRequest), stateWriter: db.NewBatch(),
bytecodeHealReqFails: make(chan *bytecodeHealRequest),
trienodeHealResps: make(chan *trienodeHealResponse),
bytecodeHealResps: make(chan *bytecodeHealResponse),
stateWriter: db.NewBatch(),
} }
} }
@ -611,6 +597,21 @@ func (s *Syncer) Sync(root common.Hash, cancel chan struct{}) error {
peerDropSub := s.peerDrop.Subscribe(peerDrop) peerDropSub := s.peerDrop.Subscribe(peerDrop)
defer peerDropSub.Unsubscribe() defer peerDropSub.Unsubscribe()
// Create a set of unique channels for this sync cycle. We need these to be
// ephemeral so a data race doesn't accidentally deliver something stale on
// a persistent channel across syncs (yup, this happened)
var (
accountReqFails = make(chan *accountRequest)
storageReqFails = make(chan *storageRequest)
bytecodeReqFails = make(chan *bytecodeRequest)
accountResps = make(chan *accountResponse)
storageResps = make(chan *storageResponse)
bytecodeResps = make(chan *bytecodeResponse)
trienodeHealReqFails = make(chan *trienodeHealRequest)
bytecodeHealReqFails = make(chan *bytecodeHealRequest)
trienodeHealResps = make(chan *trienodeHealResponse)
bytecodeHealResps = make(chan *bytecodeHealResponse)
)
for { for {
// Remove all completed tasks and terminate sync if everything's done // Remove all completed tasks and terminate sync if everything's done
s.cleanStorageTasks() s.cleanStorageTasks()
@ -619,14 +620,14 @@ func (s *Syncer) Sync(root common.Hash, cancel chan struct{}) error {
return nil return nil
} }
// Assign all the data retrieval tasks to any free peers // Assign all the data retrieval tasks to any free peers
s.assignAccountTasks(cancel) s.assignAccountTasks(accountResps, accountReqFails, cancel)
s.assignBytecodeTasks(cancel) s.assignBytecodeTasks(bytecodeResps, bytecodeReqFails, cancel)
s.assignStorageTasks(cancel) s.assignStorageTasks(storageResps, storageReqFails, cancel)
if len(s.tasks) == 0 { if len(s.tasks) == 0 {
// Sync phase done, run heal phase // Sync phase done, run heal phase
s.assignTrienodeHealTasks(cancel) s.assignTrienodeHealTasks(trienodeHealResps, trienodeHealReqFails, cancel)
s.assignBytecodeHealTasks(cancel) s.assignBytecodeHealTasks(bytecodeHealResps, bytecodeHealReqFails, cancel)
} }
// Wait for something to happen // Wait for something to happen
select { select {
@ -639,26 +640,26 @@ func (s *Syncer) Sync(root common.Hash, cancel chan struct{}) error {
case <-cancel: case <-cancel:
return ErrCancelled return ErrCancelled
case req := <-s.accountReqFails: case req := <-accountReqFails:
s.revertAccountRequest(req) s.revertAccountRequest(req)
case req := <-s.bytecodeReqFails: case req := <-bytecodeReqFails:
s.revertBytecodeRequest(req) s.revertBytecodeRequest(req)
case req := <-s.storageReqFails: case req := <-storageReqFails:
s.revertStorageRequest(req) s.revertStorageRequest(req)
case req := <-s.trienodeHealReqFails: case req := <-trienodeHealReqFails:
s.revertTrienodeHealRequest(req) s.revertTrienodeHealRequest(req)
case req := <-s.bytecodeHealReqFails: case req := <-bytecodeHealReqFails:
s.revertBytecodeHealRequest(req) s.revertBytecodeHealRequest(req)
case res := <-s.accountResps: case res := <-accountResps:
s.processAccountResponse(res) s.processAccountResponse(res)
case res := <-s.bytecodeResps: case res := <-bytecodeResps:
s.processBytecodeResponse(res) s.processBytecodeResponse(res)
case res := <-s.storageResps: case res := <-storageResps:
s.processStorageResponse(res) s.processStorageResponse(res)
case res := <-s.trienodeHealResps: case res := <-trienodeHealResps:
s.processTrienodeHealResponse(res) s.processTrienodeHealResponse(res)
case res := <-s.bytecodeHealResps: case res := <-bytecodeHealResps:
s.processBytecodeHealResponse(res) s.processBytecodeHealResponse(res)
} }
// Report stats if something meaningful happened // Report stats if something meaningful happened
@ -801,7 +802,7 @@ func (s *Syncer) cleanStorageTasks() {
// assignAccountTasks attempts to match idle peers to pending account range // assignAccountTasks attempts to match idle peers to pending account range
// retrievals. // retrievals.
func (s *Syncer) assignAccountTasks(cancel chan struct{}) { func (s *Syncer) assignAccountTasks(success chan *accountResponse, fail chan *accountRequest, cancel chan struct{}) {
s.lock.Lock() s.lock.Lock()
defer s.lock.Unlock() defer s.lock.Unlock()
@ -847,13 +848,15 @@ func (s *Syncer) assignAccountTasks(cancel chan struct{}) {
} }
// Generate the network query and send it to the peer // Generate the network query and send it to the peer
req := &accountRequest{ req := &accountRequest{
peer: idle, peer: idle,
id: reqid, id: reqid,
cancel: cancel, deliver: success,
stale: make(chan struct{}), revert: fail,
origin: task.Next, cancel: cancel,
limit: task.Last, stale: make(chan struct{}),
task: task, origin: task.Next,
limit: task.Last,
task: task,
} }
req.timeout = time.AfterFunc(requestTimeout, func() { req.timeout = time.AfterFunc(requestTimeout, func() {
peer.Log().Debug("Account range request timed out", "reqid", reqid) peer.Log().Debug("Account range request timed out", "reqid", reqid)
@ -879,7 +882,7 @@ func (s *Syncer) assignAccountTasks(cancel chan struct{}) {
} }
// assignBytecodeTasks attempts to match idle peers to pending code retrievals. // assignBytecodeTasks attempts to match idle peers to pending code retrievals.
func (s *Syncer) assignBytecodeTasks(cancel chan struct{}) { func (s *Syncer) assignBytecodeTasks(success chan *bytecodeResponse, fail chan *bytecodeRequest, cancel chan struct{}) {
s.lock.Lock() s.lock.Lock()
defer s.lock.Unlock() defer s.lock.Unlock()
@ -937,12 +940,14 @@ func (s *Syncer) assignBytecodeTasks(cancel chan struct{}) {
} }
} }
req := &bytecodeRequest{ req := &bytecodeRequest{
peer: idle, peer: idle,
id: reqid, id: reqid,
cancel: cancel, deliver: success,
stale: make(chan struct{}), revert: fail,
hashes: hashes, cancel: cancel,
task: task, stale: make(chan struct{}),
hashes: hashes,
task: task,
} }
req.timeout = time.AfterFunc(requestTimeout, func() { req.timeout = time.AfterFunc(requestTimeout, func() {
peer.Log().Debug("Bytecode request timed out", "reqid", reqid) peer.Log().Debug("Bytecode request timed out", "reqid", reqid)
@ -966,7 +971,7 @@ func (s *Syncer) assignBytecodeTasks(cancel chan struct{}) {
// assignStorageTasks attempts to match idle peers to pending storage range // assignStorageTasks attempts to match idle peers to pending storage range
// retrievals. // retrievals.
func (s *Syncer) assignStorageTasks(cancel chan struct{}) { func (s *Syncer) assignStorageTasks(success chan *storageResponse, fail chan *storageRequest, cancel chan struct{}) {
s.lock.Lock() s.lock.Lock()
defer s.lock.Unlock() defer s.lock.Unlock()
@ -1059,6 +1064,8 @@ func (s *Syncer) assignStorageTasks(cancel chan struct{}) {
req := &storageRequest{ req := &storageRequest{
peer: idle, peer: idle,
id: reqid, id: reqid,
deliver: success,
revert: fail,
cancel: cancel, cancel: cancel,
stale: make(chan struct{}), stale: make(chan struct{}),
accounts: accounts, accounts: accounts,
@ -1101,7 +1108,7 @@ func (s *Syncer) assignStorageTasks(cancel chan struct{}) {
// assignTrienodeHealTasks attempts to match idle peers to trie node requests to // assignTrienodeHealTasks attempts to match idle peers to trie node requests to
// heal any trie errors caused by the snap sync's chunked retrieval model. // heal any trie errors caused by the snap sync's chunked retrieval model.
func (s *Syncer) assignTrienodeHealTasks(cancel chan struct{}) { func (s *Syncer) assignTrienodeHealTasks(success chan *trienodeHealResponse, fail chan *trienodeHealRequest, cancel chan struct{}) {
s.lock.Lock() s.lock.Lock()
defer s.lock.Unlock() defer s.lock.Unlock()
@ -1179,13 +1186,15 @@ func (s *Syncer) assignTrienodeHealTasks(cancel chan struct{}) {
} }
} }
req := &trienodeHealRequest{ req := &trienodeHealRequest{
peer: idle, peer: idle,
id: reqid, id: reqid,
cancel: cancel, deliver: success,
stale: make(chan struct{}), revert: fail,
hashes: hashes, cancel: cancel,
paths: paths, stale: make(chan struct{}),
task: s.healer, hashes: hashes,
paths: paths,
task: s.healer,
} }
req.timeout = time.AfterFunc(requestTimeout, func() { req.timeout = time.AfterFunc(requestTimeout, func() {
peer.Log().Debug("Trienode heal request timed out", "reqid", reqid) peer.Log().Debug("Trienode heal request timed out", "reqid", reqid)
@ -1209,7 +1218,7 @@ func (s *Syncer) assignTrienodeHealTasks(cancel chan struct{}) {
// assignBytecodeHealTasks attempts to match idle peers to bytecode requests to // assignBytecodeHealTasks attempts to match idle peers to bytecode requests to
// heal any trie errors caused by the snap sync's chunked retrieval model. // heal any trie errors caused by the snap sync's chunked retrieval model.
func (s *Syncer) assignBytecodeHealTasks(cancel chan struct{}) { func (s *Syncer) assignBytecodeHealTasks(success chan *bytecodeHealResponse, fail chan *bytecodeHealRequest, cancel chan struct{}) {
s.lock.Lock() s.lock.Lock()
defer s.lock.Unlock() defer s.lock.Unlock()
@ -1280,12 +1289,14 @@ func (s *Syncer) assignBytecodeHealTasks(cancel chan struct{}) {
} }
} }
req := &bytecodeHealRequest{ req := &bytecodeHealRequest{
peer: idle, peer: idle,
id: reqid, id: reqid,
cancel: cancel, deliver: success,
stale: make(chan struct{}), revert: fail,
hashes: hashes, cancel: cancel,
task: s.healer, stale: make(chan struct{}),
hashes: hashes,
task: s.healer,
} }
req.timeout = time.AfterFunc(requestTimeout, func() { req.timeout = time.AfterFunc(requestTimeout, func() {
peer.Log().Debug("Bytecode heal request timed out", "reqid", reqid) peer.Log().Debug("Bytecode heal request timed out", "reqid", reqid)
@ -1366,7 +1377,7 @@ func (s *Syncer) revertRequests(peer string) {
// request and return all failed retrieval tasks to the scheduler for reassignment. // request and return all failed retrieval tasks to the scheduler for reassignment.
func (s *Syncer) scheduleRevertAccountRequest(req *accountRequest) { func (s *Syncer) scheduleRevertAccountRequest(req *accountRequest) {
select { select {
case s.accountReqFails <- req: case req.revert <- req:
// Sync event loop notified // Sync event loop notified
case <-req.cancel: case <-req.cancel:
// Sync cycle got cancelled // Sync cycle got cancelled
@ -1407,7 +1418,7 @@ func (s *Syncer) revertAccountRequest(req *accountRequest) {
// and return all failed retrieval tasks to the scheduler for reassignment. // and return all failed retrieval tasks to the scheduler for reassignment.
func (s *Syncer) scheduleRevertBytecodeRequest(req *bytecodeRequest) { func (s *Syncer) scheduleRevertBytecodeRequest(req *bytecodeRequest) {
select { select {
case s.bytecodeReqFails <- req: case req.revert <- req:
// Sync event loop notified // Sync event loop notified
case <-req.cancel: case <-req.cancel:
// Sync cycle got cancelled // Sync cycle got cancelled
@ -1448,7 +1459,7 @@ func (s *Syncer) revertBytecodeRequest(req *bytecodeRequest) {
// request and return all failed retrieval tasks to the scheduler for reassignment. // request and return all failed retrieval tasks to the scheduler for reassignment.
func (s *Syncer) scheduleRevertStorageRequest(req *storageRequest) { func (s *Syncer) scheduleRevertStorageRequest(req *storageRequest) {
select { select {
case s.storageReqFails <- req: case req.revert <- req:
// Sync event loop notified // Sync event loop notified
case <-req.cancel: case <-req.cancel:
// Sync cycle got cancelled // Sync cycle got cancelled
@ -1493,7 +1504,7 @@ func (s *Syncer) revertStorageRequest(req *storageRequest) {
// request and return all failed retrieval tasks to the scheduler for reassignment. // request and return all failed retrieval tasks to the scheduler for reassignment.
func (s *Syncer) scheduleRevertTrienodeHealRequest(req *trienodeHealRequest) { func (s *Syncer) scheduleRevertTrienodeHealRequest(req *trienodeHealRequest) {
select { select {
case s.trienodeHealReqFails <- req: case req.revert <- req:
// Sync event loop notified // Sync event loop notified
case <-req.cancel: case <-req.cancel:
// Sync cycle got cancelled // Sync cycle got cancelled
@ -1534,7 +1545,7 @@ func (s *Syncer) revertTrienodeHealRequest(req *trienodeHealRequest) {
// request and return all failed retrieval tasks to the scheduler for reassignment. // request and return all failed retrieval tasks to the scheduler for reassignment.
func (s *Syncer) scheduleRevertBytecodeHealRequest(req *bytecodeHealRequest) { func (s *Syncer) scheduleRevertBytecodeHealRequest(req *bytecodeHealRequest) {
select { select {
case s.bytecodeHealReqFails <- req: case req.revert <- req:
// Sync event loop notified // Sync event loop notified
case <-req.cancel: case <-req.cancel:
// Sync cycle got cancelled // Sync cycle got cancelled
@ -2147,7 +2158,7 @@ func (s *Syncer) OnAccounts(peer SyncPeer, id uint64, hashes []common.Hash, acco
cont: cont, cont: cont,
} }
select { select {
case s.accountResps <- response: case req.deliver <- response:
case <-req.cancel: case <-req.cancel:
case <-req.stale: case <-req.stale:
} }
@ -2253,7 +2264,7 @@ func (s *Syncer) onByteCodes(peer SyncPeer, id uint64, bytecodes [][]byte) error
codes: codes, codes: codes,
} }
select { select {
case s.bytecodeResps <- response: case req.deliver <- response:
case <-req.cancel: case <-req.cancel:
case <-req.stale: case <-req.stale:
} }
@ -2411,7 +2422,7 @@ func (s *Syncer) OnStorage(peer SyncPeer, id uint64, hashes [][]common.Hash, slo
cont: cont, cont: cont,
} }
select { select {
case s.storageResps <- response: case req.deliver <- response:
case <-req.cancel: case <-req.cancel:
case <-req.stale: case <-req.stale:
} }
@ -2505,7 +2516,7 @@ func (s *Syncer) OnTrieNodes(peer SyncPeer, id uint64, trienodes [][]byte) error
nodes: nodes, nodes: nodes,
} }
select { select {
case s.trienodeHealResps <- response: case req.deliver <- response:
case <-req.cancel: case <-req.cancel:
case <-req.stale: case <-req.stale:
} }
@ -2598,7 +2609,7 @@ func (s *Syncer) onHealByteCodes(peer SyncPeer, id uint64, bytecodes [][]byte) e
codes: codes, codes: codes,
} }
select { select {
case s.bytecodeHealResps <- response: case req.deliver <- response:
case <-req.cancel: case <-req.cancel:
case <-req.stale: case <-req.stale:
} }