Refactored the parallelized bits, still broken though.
parent e73de7985e
commit c8de36ad5e

Makefile | 7
@@ -1,6 +1,11 @@
-btc-crawl: **.go
+all: btc-crawl
+
+**/*.go:
 	go build ./...
 
+btc-crawl: **/*.go *.go
+	go build .
+
 build: btc-crawl
 
 clean:

cmd.go | 6
@@ -97,8 +97,6 @@ func main() {
 		return
 	}
 
-	resultChan := make(chan Result)
-
 	// Construct interrupt handler
 	sig := make(chan os.Signal, 1)
 	signal.Notify(sig, os.Interrupt)
@@ -109,11 +107,11 @@ func main() {
 
 		<-sig // Hurry up?
 		logger.Warningf("Urgent interrupt. Abandoning in-progress workers.")
-		close(resultChan) // FIXME: Could this cause stuff to asplode?
+		crawler.Shutdown() // FIXME: Could this cause stuff to asplode?
 	}()
 
 	// Launch crawler
-	go crawler.Run(resultChan, options.Concurrency)
+	resultChan := crawler.Run(options.Concurrency)
 	logger.Infof("Crawler started with %d concurrency limit.", options.Concurrency)
 
 	// Start processing results

crawler.go | 95
@@ -1,6 +1,7 @@
 package main
 
 import (
+	"sync"
 	"time"
 
 	"./queue"
@@ -20,6 +21,7 @@ type Crawler struct {
 	PeerAge        time.Duration
 	ConnectTimeout time.Duration
 	shutdown       chan struct{}
+	wait           sync.WaitGroup
 }
 
 type Result struct {
@@ -32,18 +34,17 @@ func NewCrawler(client *Client, seeds []string) *Crawler {
 		client:     client,
 		seenFilter: map[string]bool{},
 		shutdown:   make(chan struct{}, 1),
+		wait:       sync.WaitGroup{},
 	}
 	filter := func(address string) *string {
 		return c.filter(address)
 	}
-	c.queue = queue.NewQueue(filter, 10)
+	c.queue = queue.NewQueue(filter, &c.wait)
 
-	go func() {
-		// Prefill the queue
-		for _, address := range seeds {
-			c.queue.Input <- address
-		}
-	}()
+	// Prefill the queue
+	for _, address := range seeds {
+		c.queue.Add(address)
+	}
 
 	return &c
 }
@@ -139,7 +140,7 @@ func (c *Crawler) process(r *Result) *Result {
 			continue
 		}
 
-		c.queue.Input <- NetAddressKey(addr)
+		c.queue.Add(NetAddressKey(addr))
 	}
 
 	if len(r.Peers) > 0 {
@@ -150,53 +151,51 @@ func (c *Crawler) process(r *Result) *Result {
 	return nil
 }
 
-func (c *Crawler) Run(resultChan chan<- Result, numWorkers int) {
+func (c *Crawler) Run(numWorkers int) <-chan Result {
+	result := make(chan Result, 100)
 	workerChan := make(chan struct{}, numWorkers)
-	tempResult := make(chan Result)
-	numActive := 0
-	isActive := true
+	isDone := false
 
-	// Block until we get the first item
-	c.queue.Wait()
+	go func() {
+		// Queue handler
+		for address := range c.queue.Iter() {
+			// Reserve worker slot (block)
+			workerChan <- struct{}{}
 
-	// This is the main "event loop".
-	// FIXME: Feels like there should be a better way to manage the number of
-	// concurrent workers without limiting slots with workerChan and without
-	// using a numActive counter.
-	for {
-		select {
-		case workerChan <- struct{}{}:
-			if !isActive {
-				// Don't start any new workers, leave the slot filled.
+			if isDone {
 				break
-			} else if c.queue.IsEmpty() {
-				if numActive == 0 {
-					logger.Infof("Done after %d queued items.", c.queue.Count())
-					close(resultChan)
-					return
+			}
+
+			// Start worker
+			c.wait.Add(1)
+			go func() {
+				logger.Debugf("[%s] Work received.", address)
+				r := c.handleAddress(address)
+
+				// Process the result
+				if c.process(r) != nil {
+					result <- *r
 				}
+
+				// Clear worker slot
 				<-workerChan
-				break
-			}
-
-			numActive++
-			go func() {
-				address := <-c.queue.Output
-				logger.Debugf("[%s] Work received.", address)
-				tempResult <- *c.handleAddress(address)
+				c.wait.Done()
 				logger.Debugf("[%s] Work completed.", address)
 			}()
-
-		case r := <-tempResult:
-			if c.process(&r) != nil {
-				resultChan <- r
-			}
-			numActive--
-			<-workerChan
-
-		case <-c.shutdown:
-			logger.Infof("Shutting down after %d workers finish.", numActive)
-			isActive = false
 		}
-	}
+
+		logger.Infof("Done after %d queued items.", c.queue.Count())
+	}()
+
+	go func() {
+		<-c.shutdown
+		logger.Infof("Shutting down after workers finish.")
+		isDone = true
+
+		// Urgent.
+		<-c.shutdown
+		close(result)
+	}()
+
+	return result
 }
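
Aside on the pattern used by the new Run: concurrency is bounded with a buffered channel acting as a counting semaphore. Sending into workerChan claims a slot and blocks once numWorkers slots are taken; each worker frees its slot by receiving from the channel when it finishes. A minimal, self-contained sketch of that technique (the names jobs and slots are illustrative, not taken from this repository):

package main

import (
	"fmt"
	"sync"
)

func main() {
	const numWorkers = 3
	jobs := []string{"a", "b", "c", "d", "e", "f"}

	// Buffered channel as a counting semaphore: at most numWorkers sends
	// can be outstanding before the next send blocks.
	slots := make(chan struct{}, numWorkers)
	var wg sync.WaitGroup

	for _, job := range jobs {
		slots <- struct{}{} // reserve a slot (blocks when all are taken)
		wg.Add(1)
		go func(job string) {
			defer wg.Done()
			fmt.Println("working on", job)
			<-slots // release the slot
		}(job)
	}
	wg.Wait()
}

The sketch uses its own WaitGroup only to keep main alive until the workers drain; the commit's Run instead returns a buffered result channel and closes it from a separate shutdown goroutine.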

queue/queue.go | 131
@@ -1,94 +1,79 @@
 package queue
 
-// A single goroutine manages the overflow queue for thread-safety, funneling
-// data between the Input and Output channels through a specified filter.
+import "sync"
+
 // TODO: Make this an interface and multiple implementations (Redis etc?)
 type Queue struct {
-	Input    chan string
-	Output   chan string
-	overflow []string
-	filter   func(string) *string
-	count    int
+	sync.Mutex
+	storage   []string
+	filter    func(string) *string
+	count     int
+	cond      *sync.Cond
+	waitGroup *sync.WaitGroup
 }
 
-func NewQueue(filter func(string) *string, bufferSize int) *Queue {
+func NewQueue(filter func(string) *string, waitGroup *sync.WaitGroup) *Queue {
 	q := Queue{
-		Input:    make(chan string, bufferSize),
-		Output:   make(chan string, bufferSize),
-		overflow: []string{},
-		filter:   filter,
+		storage:   []string{},
+		filter:    filter,
+		waitGroup: waitGroup,
 	}
-
-	go func() {
-		// Block until we have a next item
-		nextItem := q.next()
-
-		for {
-			select {
-			case item := <-q.Input:
-				// New input
-				r := q.filter(item)
-				if r != nil {
-					// Store in the overflow
-					q.overflow = append(q.overflow, *r)
-					q.count++
-				}
-			case q.Output <- nextItem:
-				// Block until we have more inputs
-				nextItem = q.next()
-			}
-		}
-	}()
+	q.cond = sync.NewCond(&q)
+
 	return &q
 }
 
-func (q *Queue) next() string {
-	// Block until a next item is available.
-
-	if len(q.overflow) > 0 {
-		// Pop off the overflow queue.
-		r := q.overflow[0]
-		q.overflow = q.overflow[1:]
-		return r
-	}
-
-	for {
-		// Block until we have a viable output
-		r := q.filter(<-q.Input)
-
-		if r != nil {
-			q.count++
-			return *r
-		}
-	}
-}
-
-func (q *Queue) IsEmpty() bool {
-	// FIXME: This breaks everything, get rid of it.
-
-	if len(q.overflow) > 0 {
+func (q *Queue) Add(item string) bool {
+	r := q.filter(item)
+	if r == nil {
 		return false
 	}
 
-	select {
-	case r := <-q.Output:
-		go func() {
-			// Put it back
-			q.Output <- r
-		}()
-		return false
-	default:
-		return true
-	}
+	q.Lock()
+	q.storage = append(q.storage, *r)
+	q.count++
+	q.Unlock()
+
+	q.waitGroup.Add(1)
+	q.cond.Signal()
+
+	return true
 }
 
-func (q *Queue) Wait() {
-	// Wait until there is an Output. Useful for blocking until the queue is
-	// ramped up.
-	r := <-q.Output
+func (q *Queue) Iter() <-chan string {
+	ch := make(chan string)
+
 	go func() {
-		q.Output <- r
+		q.waitGroup.Wait()
+		q.cond.Signal() // Wake up to close the channel.
 	}()
+
+	go func() {
+		for {
+			q.Lock()
+
+			if len(q.storage) == 0 {
+				// Wait until next Add
+				q.cond.Wait()
+
+				if len(q.storage) == 0 {
+					// Queue is finished
+					close(ch)
+					return
+				}
+			}
+
+			r := q.storage[0]
+			q.storage = q.storage[1:]
+
+			q.waitGroup.Done()
+			q.Unlock()
+
+			ch <- r
+		}
+	}()
+
+	return ch
+}
 
 func (q *Queue) Count() int {
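
Aside on the new queue: the channel-plumbing implementation (Input/Output channels plus an overflow slice managed by a dedicated goroutine) is replaced by a mutex-guarded slice with a sync.Cond to wake the consumer and a shared sync.WaitGroup to decide when Iter's channel should close. For reference, here is a minimal self-contained sketch of the same slice-plus-Cond producer/consumer idea, using an explicit closed flag and the conventional loop-around-Wait rather than the WaitGroup signalling above; all names are illustrative and not from this repository:

package main

import (
	"fmt"
	"sync"
)

// condQueue: a slice guarded by a mutex, with a sync.Cond to wake the
// consumer when items arrive or when the producer is finished.
type condQueue struct {
	mu     sync.Mutex
	cond   *sync.Cond
	items  []string
	closed bool
}

func newCondQueue() *condQueue {
	q := &condQueue{}
	q.cond = sync.NewCond(&q.mu)
	return q
}

func (q *condQueue) Add(item string) {
	q.mu.Lock()
	q.items = append(q.items, item)
	q.mu.Unlock()
	q.cond.Signal()
}

func (q *condQueue) Close() {
	q.mu.Lock()
	q.closed = true
	q.mu.Unlock()
	q.cond.Signal()
}

// Iter drains the queue into a channel and closes it once the queue is
// both closed and empty.
func (q *condQueue) Iter() <-chan string {
	ch := make(chan string)
	go func() {
		for {
			q.mu.Lock()
			for len(q.items) == 0 && !q.closed {
				q.cond.Wait() // releases mu while waiting
			}
			if len(q.items) == 0 && q.closed {
				q.mu.Unlock()
				close(ch)
				return
			}
			item := q.items[0]
			q.items = q.items[1:]
			q.mu.Unlock()
			ch <- item
		}
	}()
	return ch
}

func main() {
	q := newCondQueue()
	go func() {
		for _, s := range []string{"x", "y", "z"} {
			q.Add(s)
		}
		q.Close()
	}()
	for item := range q.Iter() {
		fmt.Println(item)
	}
}

Looping around cond.Wait re-checks the close condition on every wakeup; the commit's Iter instead relies on a single Wait plus a WaitGroup-triggered Signal.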