Ci race detector handle failing tests (#19143)

* swarm/storage: increase mget timeout in common_test.go

 TestDbStoreCorrect_1k sometimes timed out with -race on Travis.

--- FAIL: TestDbStoreCorrect_1k (24.63s)
    common_test.go:194: testStore failed: timed out after 10s

* swarm: remove unused vars from TestSnapshotSyncWithServer

nodeCount and chunkCount are returned from setupSim, and those are the
values we use.

* swarm: move race/norace helpers from stream to testutil

As we will need to use the flag in other packages, too.

* swarm: refactor TestSwarmNetwork case

Extract long running test cases for better visibility.

* swarm/network: skip TestSyncingViaGlobalSync with -race

As it panics on Travis.

panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x0 pc=0x7e351b]

* swarm: run TestSwarmNetwork with fewer nodes with -race

As otherwise we always get a test failure with `network_test.go:374:
context deadline exceeded` even with raised `Timeout`.

* swarm/network: run TestDeliveryFromNodes with fewer nodes with -race

Test on Travis times out with 8 or more nodes if -race flag is present.

* swarm/network: smaller node count for discovery tests with -race

TestDiscoveryPersistenceSimulationSimAdapters failed on Travis with
`-race` flag present. The failure was due to extensive memory usage,
coming from the CGO runtime. Using a smaller node count resolves the
issue.

=== RUN   TestDiscoveryPersistenceSimulationSimAdapter
==7227==ERROR: ThreadSanitizer failed to allocate 0x80000 (524288) bytes of clock allocator (error code: 12)
FATAL: ThreadSanitizer CHECK failed: ./gotsan.cc:6976 "((0 && "unable to mmap")) != (0)" (0x0, 0x0)
FAIL    github.com/ethereum/go-ethereum/swarm/network/simulations/discovery     804.826s

* swarm/network: run TestFileRetrieval with fewer nodes with -race

Otherwise we get a failure due to extensive memory usage, as the CGO
runtime cannot allocate more bytes.

=== RUN   TestFileRetrieval
==7366==ERROR: ThreadSanitizer failed to allocate 0x80000 (524288) bytes of clock allocator (error code: 12)
FATAL: ThreadSanitizer CHECK failed: ./gotsan.cc:6976 "((0 && "unable to mmap")) != (0)" (0x0, 0x0)
FAIL	github.com/ethereum/go-ethereum/swarm/network/stream	155.165s

* swarm/network: run TestRetrieval with fewer nodes with -race

Otherwise we get a failure due to extensive memory usage, as the CGO
runtime cannot allocate more bytes ("ThreadSanitizer failed to
allocate").

* swarm/network: skip flaky TestGetSubscriptionsRPC on Travis w/ -race

Test fails a lot with something like:
 streamer_test.go:1332: Real subscriptions and expected amount don't match; real: 0, expected: 20

* swarm/storage: skip TestDB_SubscribePull* tests on Travis w/ -race

Travis just hangs...

ok  	github.com/ethereum/go-ethereum/swarm/storage/feed/lookup	1.307s
keepalive
keepalive
keepalive

or panics after a while.

Without these tests the race detector job is now stable. Let's
investigate these tests in a separate issue:
https://github.com/ethersphere/go-ethereum/issues/1245
This commit is contained in:
Ferenc Szabo 2019-02-20 22:57:42 +01:00 committed by Viktor Trón
parent d36e974ba3
commit e38b227ce6
13 changed files with 187 additions and 132 deletions

View File

@ -27,6 +27,8 @@ import (
"testing"
"time"
"github.com/ethereum/go-ethereum/swarm/testutil"
"github.com/ethereum/go-ethereum/common"
"github.com/ethereum/go-ethereum/log"
"github.com/ethereum/go-ethereum/node"
@ -82,12 +84,19 @@ func getDbStore(nodeID string) (*state.DBStore, error) {
}
var (
nodeCount = flag.Int("nodes", 32, "number of nodes to create (default 32)")
nodeCount = flag.Int("nodes", defaultNodeCount(), "number of nodes to create (default 32)")
initCount = flag.Int("conns", 1, "number of originally connected peers (default 1)")
loglevel = flag.Int("loglevel", 3, "verbosity of logs")
rawlog = flag.Bool("rawlog", false, "remove terminal formatting from logs")
)
// defaultNodeCount returns the default value for the "nodes" flag.
// The race detector raises memory usage considerably, so only 8 nodes
// are created when it is enabled; otherwise the default is 32.
func defaultNodeCount() int {
	count := 32
	if testutil.RaceEnabled {
		count = 8
	}
	return count
}
func init() {
flag.Parse()
// register the discovery service which will run as a devp2p

View File

@ -446,6 +446,12 @@ func TestDeliveryFromNodes(t *testing.T) {
testDeliveryFromNodes(t, 2, dataChunkCount, false)
testDeliveryFromNodes(t, 4, dataChunkCount, true)
testDeliveryFromNodes(t, 4, dataChunkCount, false)
if testutil.RaceEnabled {
// Travis cannot handle more nodes with -race; would time out.
return
}
testDeliveryFromNodes(t, 8, dataChunkCount, true)
testDeliveryFromNodes(t, 8, dataChunkCount, false)
testDeliveryFromNodes(t, 16, dataChunkCount, true)

View File

@ -22,6 +22,8 @@ import (
"testing"
"time"
"github.com/ethereum/go-ethereum/swarm/testutil"
"github.com/ethereum/go-ethereum/node"
"github.com/ethereum/go-ethereum/p2p/enode"
"github.com/ethereum/go-ethereum/p2p/simulations/adapters"
@ -43,23 +45,24 @@ const (
//Files are uploaded to nodes, other nodes try to retrieve the file
//Number of nodes can be provided via commandline too.
func TestFileRetrieval(t *testing.T) {
var nodeCount []int
if *nodes != 0 {
err := runFileRetrievalTest(*nodes)
if err != nil {
t.Fatal(err)
}
nodeCount = []int{*nodes}
} else {
nodeCnt := []int{16}
//if the `longrunning` flag has been provided
//run more test combinations
nodeCount = []int{16}
if *longrunning {
nodeCnt = append(nodeCnt, 32, 64, 128)
nodeCount = append(nodeCount, 32, 64, 128)
} else if testutil.RaceEnabled {
nodeCount = []int{4}
}
for _, n := range nodeCnt {
err := runFileRetrievalTest(n)
if err != nil {
t.Fatal(err)
}
for _, nc := range nodeCount {
if err := runFileRetrievalTest(nc); err != nil {
t.Error(err)
}
}
}
@ -79,18 +82,17 @@ func TestRetrieval(t *testing.T) {
t.Fatal(err)
}
} else {
var nodeCnt []int
var chnkCnt []int
//if the `longrunning` flag has been provided
//run more test combinations
nodeCnt := []int{16}
chnkCnt := []int{32}
if *longrunning {
nodeCnt = []int{16, 32, 128}
chnkCnt = []int{4, 32, 256}
} else {
//default test
nodeCnt = []int{16}
chnkCnt = []int{32}
} else if testutil.RaceEnabled {
nodeCnt = []int{4}
chnkCnt = []int{4}
}
for _, n := range nodeCnt {
for _, c := range chnkCnt {
t.Run(fmt.Sprintf("TestRetrieval_%d_%d", n, c), func(t *testing.T) {

View File

@ -19,7 +19,6 @@ import (
"context"
"errors"
"fmt"
"io/ioutil"
"os"
"runtime"
"sync"
@ -42,8 +41,6 @@ import (
"github.com/ethereum/go-ethereum/swarm/testutil"
)
const MaxTimeout = 600
type synctestConfig struct {
addrs [][]byte
hashes []storage.Address
@ -80,37 +77,31 @@ func TestSyncingViaGlobalSync(t *testing.T) {
if runtime.GOOS == "darwin" && os.Getenv("TRAVIS") == "true" {
t.Skip("Flaky on mac on travis")
}
if testutil.RaceEnabled {
t.Skip("Segfaults on Travis with -race")
}
//if nodes/chunks have been provided via commandline,
//run the tests with these values
if *nodes != 0 && *chunks != 0 {
log.Info(fmt.Sprintf("Running test with %d chunks and %d nodes...", *chunks, *nodes))
testSyncingViaGlobalSync(t, *chunks, *nodes)
} else {
var nodeCnt []int
var chnkCnt []int
chunkCounts := []int{4, 32}
nodeCounts := []int{32, 16}
//if the `longrunning` flag has been provided
//run more test combinations
if *longrunning {
chnkCnt = []int{1, 8, 32, 256, 1024}
nodeCnt = []int{16, 32, 64, 128, 256}
} else if raceTest {
// TestSyncingViaGlobalSync allocates a lot of memory
// with race detector. By reducing the number of chunks
// and nodes, memory consumption is lower and data races
// are still checked, while correctness of syncing is
// tested with more chunks and nodes in regular (!race)
// tests.
chnkCnt = []int{4}
nodeCnt = []int{16}
} else {
//default test
chnkCnt = []int{4, 32}
nodeCnt = []int{32, 16}
chunkCounts = []int{1, 8, 32, 256, 1024}
nodeCounts = []int{16, 32, 64, 128, 256}
}
for _, chnk := range chnkCnt {
for _, n := range nodeCnt {
log.Info(fmt.Sprintf("Long running test with %d chunks and %d nodes...", chnk, n))
testSyncingViaGlobalSync(t, chnk, n)
for _, chunkCount := range chunkCounts {
for _, n := range nodeCounts {
log.Info(fmt.Sprintf("Long running test with %d chunks and %d nodes...", chunkCount, n))
testSyncingViaGlobalSync(t, chunkCount, n)
}
}
}
@ -123,21 +114,7 @@ var simServiceMap = map[string]simulation.ServiceFunc{
return nil, nil, err
}
var dir string
var store *state.DBStore
if raceTest {
// Use on-disk DBStore to reduce memory consumption in race tests.
dir, err = ioutil.TempDir("", "swarm-stream-")
if err != nil {
return nil, nil, err
}
store, err = state.NewDBStore(dir)
if err != nil {
return nil, nil, err
}
} else {
store = state.NewInmemoryStore()
}
store := state.NewInmemoryStore()
r := NewRegistry(addr.ID(), delivery, netStore, store, &RegistryOptions{
Retrieval: RetrievalDisabled,

View File

@ -21,12 +21,15 @@ import (
"context"
"errors"
"fmt"
"os"
"strconv"
"strings"
"sync"
"testing"
"time"
"github.com/ethereum/go-ethereum/swarm/testutil"
"github.com/ethereum/go-ethereum/common"
"github.com/ethereum/go-ethereum/log"
"github.com/ethereum/go-ethereum/node"
@ -1178,6 +1181,11 @@ stream registration, then tests that there are subscriptions.
*/
func TestGetSubscriptionsRPC(t *testing.T) {
if testutil.RaceEnabled && os.Getenv("TRAVIS") == "true" {
t.Skip("flaky with -race on Travis")
// Note: related ticket https://github.com/ethersphere/go-ethereum/issues/1234
}
// arbitrarily set to 4
nodeCount := 4
// run with more nodes if `longrunning` flag is set

View File

@ -48,7 +48,7 @@ func TestSyncerSimulation(t *testing.T) {
// race detector. Allow it to finish successfully by
// reducing its scope, and still check for data races
// with the smallest number of nodes.
if !raceTest {
if !testutil.RaceEnabled {
testSyncBetweenNodes(t, 4, dataChunkCount, true, 1)
testSyncBetweenNodes(t, 8, dataChunkCount, true, 1)
testSyncBetweenNodes(t, 16, dataChunkCount, true, 1)
@ -88,7 +88,7 @@ func testSyncBetweenNodes(t *testing.T, nodes, chunkCount int, skipCheck bool, p
var dir string
var store *state.DBStore
if raceTest {
if testutil.RaceEnabled {
// Use on-disk DBStore to reduce memory consumption in race tests.
dir, err = ioutil.TempDir("", "swarm-stream-")
if err != nil {

View File

@ -136,16 +136,6 @@ func TestSnapshotSyncWithServer(t *testing.T) {
//define a wrapper object to be able to pass around data
wrapper := &netWrapper{}
nodeCount := *nodes
chunkCount := *chunks
if nodeCount == 0 || chunkCount == 0 {
nodeCount = 32
chunkCount = 1
}
log.Info(fmt.Sprintf("Running the simulation with %d nodes and %d chunks", nodeCount, chunkCount))
sim := simulation.New(map[string]simulation.ServiceFunc{
"streamer": func(ctx *adapters.ServiceContext, bucket *sync.Map) (s node.Service, cleanup func(), err error) {
addr, netStore, delivery, clean, err := newNetStoreAndDeliveryWithRequestFunc(ctx, bucket, dummyRequestFromPeers)
@ -178,6 +168,7 @@ func TestSnapshotSyncWithServer(t *testing.T) {
nodeCount, chunkCount, sim := setupSim(simServiceMap)
defer sim.Close()
log.Info(fmt.Sprintf("Running the simulation with %d nodes and %d chunks", nodeCount, chunkCount))
log.Info("Initializing test config")
conf := &synctestConfig{}

View File

@ -28,6 +28,8 @@ import (
"testing"
"time"
"github.com/ethereum/go-ethereum/swarm/testutil"
"github.com/ethereum/go-ethereum/crypto"
"github.com/ethereum/go-ethereum/log"
"github.com/ethereum/go-ethereum/node"
@ -36,7 +38,7 @@ import (
"github.com/ethereum/go-ethereum/swarm/api"
"github.com/ethereum/go-ethereum/swarm/network/simulation"
"github.com/ethereum/go-ethereum/swarm/storage"
colorable "github.com/mattn/go-colorable"
"github.com/mattn/go-colorable"
)
var (
@ -57,12 +59,7 @@ func init() {
// static and dynamic Swarm nodes in network simulation, by
// uploading files to every node and retrieving them.
func TestSwarmNetwork(t *testing.T) {
for _, tc := range []struct {
name string
steps []testSwarmNetworkStep
options *testSwarmNetworkOptions
disabled bool
}{
var tests = []testSwarmNetworkCase{
{
name: "10_nodes",
steps: []testSwarmNetworkStep{
@ -86,6 +83,61 @@ func TestSwarmNetwork(t *testing.T) {
SkipCheck: true,
},
},
{
name: "dec_inc_node_count",
steps: []testSwarmNetworkStep{
{
nodeCount: 3,
},
{
nodeCount: 1,
},
{
nodeCount: 5,
},
},
options: &testSwarmNetworkOptions{
Timeout: 90 * time.Second,
},
},
}
if *longrunning {
tests = append(tests, longRunningCases()...)
} else if testutil.RaceEnabled {
tests = shortCaseForRace()
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
testSwarmNetwork(t, tc.options, tc.steps...)
})
}
}
// testSwarmNetworkCase describes one scenario that TestSwarmNetwork
// runs as a subtest.
type testSwarmNetworkCase struct {
// name is used as the subtest name passed to t.Run
name string
// steps are forwarded, in order, to testSwarmNetwork
steps []testSwarmNetworkStep
// options are forwarded to testSwarmNetwork, which substitutes
// zero-value options when given nil
options *testSwarmNetworkOptions
}
// testSwarmNetworkStep is the configuration for one state of the
// simulation network. testSwarmNetwork accepts any number of steps
// as its variadic argument.
type testSwarmNetworkStep struct {
// number of swarm nodes that must be in the Up state
nodeCount int
}
// testSwarmNetworkOptions contains optional parameters for running
// testSwarmNetwork. A nil pointer is accepted by testSwarmNetwork and
// replaced with zero-value options.
type testSwarmNetworkOptions struct {
// Timeout is presumably the upper bound for one simulation run;
// the test cases set it between one and five minutes.
// NOTE(review): confirm against the body of testSwarmNetwork.
Timeout time.Duration
// SkipCheck is set by the "*_skip_check" test cases.
// NOTE(review): exact effect is not visible here; confirm in testSwarmNetwork.
SkipCheck bool
}
func longRunningCases() []testSwarmNetworkCase {
return []testSwarmNetworkCase{
{
name: "50_nodes",
steps: []testSwarmNetworkStep{
@ -96,7 +148,6 @@ func TestSwarmNetwork(t *testing.T) {
options: &testSwarmNetworkOptions{
Timeout: 3 * time.Minute,
},
disabled: !*longrunning,
},
{
name: "50_nodes_skip_check",
@ -109,7 +160,6 @@ func TestSwarmNetwork(t *testing.T) {
Timeout: 3 * time.Minute,
SkipCheck: true,
},
disabled: !*longrunning,
},
{
name: "inc_node_count",
@ -127,7 +177,6 @@ func TestSwarmNetwork(t *testing.T) {
options: &testSwarmNetworkOptions{
Timeout: 90 * time.Second,
},
disabled: !*longrunning,
},
{
name: "dec_node_count",
@ -145,24 +194,6 @@ func TestSwarmNetwork(t *testing.T) {
options: &testSwarmNetworkOptions{
Timeout: 90 * time.Second,
},
disabled: !*longrunning,
},
{
name: "dec_inc_node_count",
steps: []testSwarmNetworkStep{
{
nodeCount: 3,
},
{
nodeCount: 1,
},
{
nodeCount: 5,
},
},
options: &testSwarmNetworkOptions{
Timeout: 90 * time.Second,
},
},
{
name: "inc_dec_node_count",
@ -186,7 +217,6 @@ func TestSwarmNetwork(t *testing.T) {
options: &testSwarmNetworkOptions{
Timeout: 5 * time.Minute,
},
disabled: !*longrunning,
},
{
name: "inc_dec_node_count_skip_check",
@ -211,23 +241,25 @@ func TestSwarmNetwork(t *testing.T) {
Timeout: 5 * time.Minute,
SkipCheck: true,
},
disabled: !*longrunning,
},
} {
if tc.disabled {
continue
}
t.Run(tc.name, func(t *testing.T) {
testSwarmNetwork(t, tc.options, tc.steps...)
})
}
}
// testSwarmNetworkStep is the configuration
// for the state of the simulation network.
type testSwarmNetworkStep struct {
// number of swarm nodes that must be in the Up state
nodeCount int
// shortCaseForRace returns the reduced set of test cases (a single
// 8-node network with a one-minute timeout) that TestSwarmNetwork
// uses when the race detector is enabled.
func shortCaseForRace() []testSwarmNetworkCase {
	// As for now, Travis with -race can only run 8 nodes
	eightNodes := testSwarmNetworkCase{
		name:  "8_nodes",
		steps: []testSwarmNetworkStep{{nodeCount: 8}},
		options: &testSwarmNetworkOptions{
			Timeout: time.Minute,
		},
	}
	return []testSwarmNetworkCase{eightNodes}
}
// file represents the file uploaded on a particular node.
@ -244,13 +276,6 @@ type check struct {
nodeID enode.ID
}
// testSwarmNetworkOptions contains optional parameters for running
// testSwarmNetwork.
type testSwarmNetworkOptions struct {
Timeout time.Duration
SkipCheck bool
}
// testSwarmNetwork is a helper function used for testing different
// static and dynamic Swarm network simulations.
// It is responsible for:
@ -259,6 +284,7 @@ type testSwarmNetworkOptions struct {
// - May wait for Kademlia on every node to be healthy.
// - Checking if a file is retrievable from all nodes.
func testSwarmNetwork(t *testing.T, o *testSwarmNetworkOptions, steps ...testSwarmNetworkStep) {
t.Helper()
if o == nil {
o = new(testSwarmNetworkOptions)

View File

@ -142,7 +142,7 @@ func mget(store ChunkStore, hs []Address, f func(h Address, chunk Chunk) error)
close(errc)
}()
var err error
timeout := 10 * time.Second
timeout := 20 * time.Second
select {
case err = <-errc:
case <-time.NewTimer(timeout).C:

View File

@ -47,6 +47,8 @@ func TestDB_collectGarbageWorker_multipleBatches(t *testing.T) {
// testDB_collectGarbageWorker is a helper test function to test
// garbage collection runs by uploading and syncing a number of chunks.
func testDB_collectGarbageWorker(t *testing.T) {
t.Helper()
chunkCount := 150
testHookCollectGarbageChan := make(chan int64)

View File

@ -20,11 +20,13 @@ import (
"bytes"
"context"
"fmt"
"os"
"sync"
"testing"
"time"
"github.com/ethereum/go-ethereum/swarm/storage"
"github.com/ethereum/go-ethereum/swarm/testutil"
)
// TestDB_SubscribePull uploads some chunks before and after
@ -32,6 +34,12 @@ import (
// all addresses are received in the right order
// for expected proximity order bins.
func TestDB_SubscribePull(t *testing.T) {
if testutil.RaceEnabled && os.Getenv("TRAVIS") == "true" {
t.Skip("does not complete with -race on Travis")
// Note: related ticket TODO
}
db, cleanupFunc := newTestDB(t, nil)
defer cleanupFunc()
@ -79,6 +87,12 @@ func TestDB_SubscribePull(t *testing.T) {
// validates if all addresses are received in the right order
// for expected proximity order bins.
func TestDB_SubscribePull_multiple(t *testing.T) {
if testutil.RaceEnabled && os.Getenv("TRAVIS") == "true" {
t.Skip("does not complete with -race on Travis")
// Note: related ticket TODO
}
db, cleanupFunc := newTestDB(t, nil)
defer cleanupFunc()
@ -132,6 +146,12 @@ func TestDB_SubscribePull_multiple(t *testing.T) {
// and validates if all expected addresses are received in the
// right order for expected proximity order bins.
func TestDB_SubscribePull_since(t *testing.T) {
if testutil.RaceEnabled && os.Getenv("TRAVIS") == "true" {
t.Skip("does not complete with -race on Travis")
// Note: related ticket TODO
}
db, cleanupFunc := newTestDB(t, nil)
defer cleanupFunc()
@ -222,6 +242,12 @@ func TestDB_SubscribePull_since(t *testing.T) {
// and validates if all expected addresses are received in the
// right order for expected proximity order bins.
func TestDB_SubscribePull_until(t *testing.T) {
if testutil.RaceEnabled && os.Getenv("TRAVIS") == "true" {
t.Skip("does not complete with -race on Travis")
// Note: related ticket TODO
}
db, cleanupFunc := newTestDB(t, nil)
defer cleanupFunc()
@ -311,6 +337,12 @@ func TestDB_SubscribePull_until(t *testing.T) {
// and until arguments, and validates if all expected addresses
// are received in the right order for expected proximity order bins.
func TestDB_SubscribePull_sinceAndUntil(t *testing.T) {
if testutil.RaceEnabled && os.Getenv("TRAVIS") == "true" {
t.Skip("does not complete with -race on Travis")
// Note: related ticket TODO
}
db, cleanupFunc := newTestDB(t, nil)
defer cleanupFunc()

View File

@ -16,9 +16,11 @@
// +build !race
package stream
package testutil
// Provide a flag to reduce the scope of tests when running them
// with race detector. Some of the tests are doing a lot of allocations
// on the heap, and race detector uses much more memory to track them.
const raceTest = false
// RaceEnabled is true when -race flag is provided to the go tool. This const
// might be used in tests to skip some cases as the race detector may increase
// memory usage 5-10x and execution time by 2-20x. That might cause problems
// on Travis. Please, use this flag sparingly and keep your unit tests
// as light on resources as possible.
const RaceEnabled = false

View File

@ -16,8 +16,8 @@
// +build race
package stream
package testutil
// Reduce the scope of some tests when running with race detector,
// as it raises the memory consumption significantly.
const raceTest = true
// RaceEnabled is true when -race flag is provided to the go tool.
// See norace.go for more.
const RaceEnabled = true