Refactored the parallelized bits, still broken though.

Andrey Petrov 2014-05-17 19:44:33 -07:00
parent e73de7985e
commit c8de36ad5e
4 changed files with 113 additions and 126 deletions

Makefile

@@ -1,6 +1,11 @@
btc-crawl: **.go
all: btc-crawl
**/*.go:
go build ./...
btc-crawl: **/*.go *.go
go build .
build: btc-crawl
clean:

cmd.go

@@ -97,8 +97,6 @@ func main() {
return
}
resultChan := make(chan Result)
// Construct interrupt handler
sig := make(chan os.Signal, 1)
signal.Notify(sig, os.Interrupt)
@@ -109,11 +107,11 @@
<-sig // Hurry up?
logger.Warningf("Urgent interrupt. Abandoning in-progress workers.")
close(resultChan) // FIXME: Could this cause stuff to asplode?
crawler.Shutdown() // FIXME: Could this cause stuff to asplode?
}()
// Launch crawler
go crawler.Run(resultChan, options.Concurrency)
resultChan := crawler.Run(options.Concurrency)
logger.Infof("Crawler started with %d concurrency limit.", options.Concurrency)
// Start processing results

crawler.go

@@ -1,6 +1,7 @@
package main
import (
"sync"
"time"
"./queue"
@@ -20,6 +21,7 @@ type Crawler struct {
PeerAge time.Duration
ConnectTimeout time.Duration
shutdown chan struct{}
wait sync.WaitGroup
}
type Result struct {
@@ -32,18 +34,17 @@ func NewCrawler(client *Client, seeds []string) *Crawler {
client: client,
seenFilter: map[string]bool{},
shutdown: make(chan struct{}, 1),
wait: sync.WaitGroup{},
}
filter := func(address string) *string {
return c.filter(address)
}
c.queue = queue.NewQueue(filter, 10)
c.queue = queue.NewQueue(filter, &c.wait)
go func() {
// Prefill the queue
for _, address := range seeds {
c.queue.Input <- address
}
}()
// Prefill the queue
for _, address := range seeds {
c.queue.Add(address)
}
return &c
}
@@ -139,7 +140,7 @@ func (c *Crawler) process(r *Result) *Result {
continue
}
c.queue.Input <- NetAddressKey(addr)
c.queue.Add(NetAddressKey(addr))
}
if len(r.Peers) > 0 {
@@ -150,53 +151,51 @@ func (c *Crawler) process(r *Result) *Result {
return nil
}
func (c *Crawler) Run(resultChan chan<- Result, numWorkers int) {
func (c *Crawler) Run(numWorkers int) <-chan Result {
result := make(chan Result, 100)
workerChan := make(chan struct{}, numWorkers)
tempResult := make(chan Result)
numActive := 0
isActive := true
isDone := false
// Block until we get the first item
c.queue.Wait()
go func() {
// Queue handler
for address := range c.queue.Iter() {
// Reserve worker slot (block)
workerChan <- struct{}{}
// This is the main "event loop".
// FIXME: Feels like there should be a better way to manage the number of
// concurrent workers without limiting slots with workerChan and without
// using a numActive counter.
for {
select {
case workerChan <- struct{}{}:
if !isActive {
// Don't start any new workers, leave the slot filled.
if isDone {
break
} else if c.queue.IsEmpty() {
if numActive == 0 {
logger.Infof("Done after %d queued items.", c.queue.Count())
close(resultChan)
return
}
// Start worker
c.wait.Add(1)
go func() {
logger.Debugf("[%s] Work received.", address)
r := c.handleAddress(address)
// Process the result
if c.process(r) != nil {
result <- *r
}
// Clear worker slot
<-workerChan
break
}
numActive++
go func() {
address := <-c.queue.Output
logger.Debugf("[%s] Work received.", address)
tempResult <- *c.handleAddress(address)
c.wait.Done()
logger.Debugf("[%s] Work completed.", address)
}()
case r := <-tempResult:
if c.process(&r) != nil {
resultChan <- r
}
numActive--
<-workerChan
case <-c.shutdown:
logger.Infof("Shutting down after %d workers finish.", numActive)
isActive = false
}
}
logger.Infof("Done after %d queued items.", c.queue.Count())
}()
go func() {
<-c.shutdown
logger.Infof("Shutting down after workers finish.")
isDone = true
// Urgent.
<-c.shutdown
close(result)
}()
return result
}
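
Note, not part of the commit: a minimal sketch of how the refactored API is meant to be wired up, based on the cmd.go hunk above. The runCrawler name and its parameters are illustrative; client, seeds, and the interrupt channel come from the existing cmd.go setup, and since the commit message says things are still broken, this shows the intended contract rather than a verified run.

func runCrawler(client *Client, seeds []string, concurrency int, sig <-chan os.Signal) {
	crawler := NewCrawler(client, seeds)

	// Run now creates and returns the result channel itself and is
	// responsible for closing it; callers no longer pass one in.
	resultChan := crawler.Run(concurrency)

	go func() {
		<-sig
		crawler.Shutdown() // ask the crawler to wind down instead of closing the channel ourselves
	}()

	for r := range resultChan {
		// Each Result arrives as a worker finishes; persist or print it here.
		_ = r
	}
}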

queue/queue.go

@@ -1,94 +1,79 @@
package queue
// A single goroutine manages the overflow queue for thread-safety, funneling
// data between the Input and Output channels through a specified filter.
import "sync"
// TODO: Make this an interface and multiple implementations (Redis etc?)
type Queue struct {
Input chan string
Output chan string
overflow []string
filter func(string) *string
count int
sync.Mutex
storage []string
filter func(string) *string
count int
cond *sync.Cond
waitGroup *sync.WaitGroup
}
func NewQueue(filter func(string) *string, bufferSize int) *Queue {
func NewQueue(filter func(string) *string, waitGroup *sync.WaitGroup) *Queue {
q := Queue{
Input: make(chan string, bufferSize),
Output: make(chan string, bufferSize),
overflow: []string{},
filter: filter,
storage: []string{},
filter: filter,
waitGroup: waitGroup,
}
go func() {
// Block until we have a next item
nextItem := q.next()
for {
select {
case item := <-q.Input:
// New input
r := q.filter(item)
if r != nil {
// Store in the overflow
q.overflow = append(q.overflow, *r)
q.count++
}
case q.Output <- nextItem:
// Block until we have more inputs
nextItem = q.next()
}
}
}()
q.cond = sync.NewCond(&q)
return &q
}
func (q *Queue) next() string {
// Block until a next item is available.
if len(q.overflow) > 0 {
// Pop off the overflow queue.
r := q.overflow[0]
q.overflow = q.overflow[1:]
return r
}
for {
// Block until we have a viable output
r := q.filter(<-q.Input)
if r != nil {
q.count++
return *r
}
}
}
func (q *Queue) IsEmpty() bool {
// FIXME: This breaks everything, get rid of it.
if len(q.overflow) > 0 {
func (q *Queue) Add(item string) bool {
r := q.filter(item)
if r == nil {
return false
}
select {
case r := <-q.Output:
go func() {
// Put it back
q.Output <- r
}()
return false
default:
return true
}
q.Lock()
q.storage = append(q.storage, *r)
q.count++
q.Unlock()
q.waitGroup.Add(1)
q.cond.Signal()
return true
}
func (q *Queue) Wait() {
// Wait until there is an Output. Useful for blocking until the queue is
// ramped up.
r := <-q.Output
func (q *Queue) Iter() <-chan string {
ch := make(chan string)
go func() {
q.Output <- r
q.waitGroup.Wait()
q.cond.Signal() // Wake up to close the channel.
}()
go func() {
for {
q.Lock()
if len(q.storage) == 0 {
// Wait until next Add
q.cond.Wait()
if len(q.storage) == 0 {
// Queue is finished
close(ch)
return
}
}
r := q.storage[0]
q.storage = q.storage[1:]
q.waitGroup.Done()
q.Unlock()
ch <- r
}
}()
return ch
}
func (q *Queue) Count() int {
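
Note, not part of the commit: a rough sketch of the new Queue contract implied by the hunks above (Add filters and stores, Iter hands items out, the shared WaitGroup ties producer and consumer together). The exampleQueueUsage name, seed strings, and seen map are illustrative; it assumes the same "sync" and "./queue" imports crawler.go uses, and given the commit message, it illustrates intent rather than a verified working example.

func exampleQueueUsage() {
	var wg sync.WaitGroup
	seen := map[string]bool{}

	// Same role as Crawler.filter: return nil to drop an item, or a value
	// to enqueue it.
	filter := func(address string) *string {
		if seen[address] {
			return nil
		}
		seen[address] = true
		return &address
	}

	q := queue.NewQueue(filter, &wg)
	q.Add("seed-1:8333") // accepted: stored, wg.Add(1), cond.Signal()
	q.Add("seed-1:8333") // rejected by the filter; Add returns false

	for item := range q.Iter() {
		// Iter pops from storage and calls wg.Done() per item. If the work
		// here can Add more items (as the crawler's workers do), wrap it in
		// its own wg.Add(1)/wg.Done() so the queue does not drain early.
		_ = item
	}
}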