btc-crawl/crawler.go

227 lines
4.9 KiB
Go
Raw Normal View History

2014-04-22 18:28:10 -05:00
package main
import (
"time"
2014-04-25 17:13:45 -05:00
2014-05-10 20:48:33 -05:00
"./queue"
2014-04-25 17:13:45 -05:00
"github.com/conformal/btcwire"
2014-04-22 18:28:10 -05:00
)
// TODO: Break Client/Peer/Crawler into separate modules.

// Crawler holds the crawl state: the work queue of addresses to visit, a
// seen-address filter used to deduplicate queue input, and progress counters.
type Crawler struct {
	client       *Client      // shared client config used to create each Peer
	queue        *queue.Queue // work queue; filter() is its dedupe hook
	numSeen      int          // total addresses offered to the filter
	numUnique    int          // addresses that passed the filter (first sighting)
	numConnected int          // peers that connected, handshook, and accepted getaddr
	numAttempted int          // peers we attempted to crawl
	seenFilter   map[string]bool // TODO: Replace with bloom filter?
	peerAge      time.Duration   // discovered peers older than this are not re-queued
}
2014-04-25 18:24:27 -05:00
// Result is the outcome of crawling one node: the peer that was contacted
// and the addresses it reported via addr messages (empty on failure).
type Result struct {
	Node  *Peer
	Peers []*btcwire.NetAddress
}
2014-05-10 20:48:33 -05:00
func NewCrawler(client *Client, seeds []string, peerAge time.Duration) *Crawler {
2014-04-22 18:28:10 -05:00
c := Crawler{
2014-04-23 19:44:22 -05:00
client: client,
seenFilter: map[string]bool{},
peerAge: peerAge,
2014-04-22 18:28:10 -05:00
}
2014-05-10 20:48:33 -05:00
filter := func(address string) *string {
return c.filter(address)
2014-04-22 18:28:10 -05:00
}
2014-05-10 20:48:33 -05:00
c.queue = queue.NewQueue(filter, 10)
go func() {
// Prefill the queue
for _, address := range seeds {
c.queue.Input <- address
}
}()
2014-04-22 18:28:10 -05:00
return &c
}
2014-04-25 18:24:27 -05:00
func (c *Crawler) handleAddress(address string) *Result {
2014-05-10 20:48:33 -05:00
c.numAttempted++
2014-04-22 18:28:10 -05:00
client := c.client
peer := NewPeer(client, address)
2014-04-25 18:24:27 -05:00
r := Result{Node: peer}
2014-04-22 18:28:10 -05:00
err := peer.Connect()
if err != nil {
2014-04-24 21:13:33 -05:00
logger.Debugf("[%s] Connection failed: %v", address, err)
2014-04-22 18:28:10 -05:00
return &r
}
defer peer.Disconnect()
err = peer.Handshake()
if err != nil {
2014-04-24 21:13:33 -05:00
logger.Debugf("[%s] Handsake failed: %v", address, err)
2014-04-22 18:28:10 -05:00
return &r
}
// Send getaddr.
2014-04-25 17:13:45 -05:00
err = peer.WriteMessage(btcwire.NewMsgGetAddr())
if err != nil {
2014-04-24 21:13:33 -05:00
logger.Warningf("[%s] GetAddr failed: %v", address, err)
2014-04-22 18:28:10 -05:00
return &r
}
2014-05-10 20:48:33 -05:00
c.numConnected++
2014-04-22 18:28:10 -05:00
// Listen for tx inv messages.
firstReceived := -1
tolerateMessages := 3
otherMessages := []string{}
for {
// We can't really tell when we're done receiving peers, so we stop either
// when we get a smaller-than-normal set size or when we've received too
// many unrelated messages.
2014-04-25 17:13:45 -05:00
msg, _, err := peer.ReadMessage()
2014-04-22 18:28:10 -05:00
if err != nil {
2014-04-24 21:13:33 -05:00
logger.Warningf("[%s] Failed to read message: %v", address, err)
2014-04-22 18:28:10 -05:00
continue
}
switch tmsg := msg.(type) {
case *btcwire.MsgAddr:
2014-04-25 18:24:27 -05:00
r.Peers = append(r.Peers, tmsg.AddrList...)
2014-04-22 18:28:10 -05:00
if firstReceived == -1 {
firstReceived = len(tmsg.AddrList)
} else if firstReceived > len(tmsg.AddrList) || firstReceived == 0 {
// Probably done.
return &r
}
default:
otherMessages = append(otherMessages, tmsg.Command())
if len(otherMessages) > tolerateMessages {
2014-04-25 18:24:27 -05:00
logger.Debugf("[%s] Giving up with %d results after tolerating messages: %v.", address, len(r.Peers), otherMessages)
2014-04-22 18:28:10 -05:00
return &r
}
}
}
}
2014-05-10 20:48:33 -05:00
func (c *Crawler) filter(address string) *string {
2014-04-22 18:28:10 -05:00
// Returns true if not seen before, otherwise false
2014-05-10 20:48:33 -05:00
c.numSeen++
2014-04-22 18:28:10 -05:00
state, ok := c.seenFilter[address]
if ok == true && state == true {
2014-05-10 20:48:33 -05:00
return nil
2014-04-22 18:28:10 -05:00
}
c.seenFilter[address] = true
2014-05-10 20:48:33 -05:00
c.numUnique++
return &address
}
2014-04-22 18:28:10 -05:00
2014-05-10 20:48:33 -05:00
/*
func (c *Crawler) Run(resultChan chan<- Result, numWorkers int) {
workChan := make(chan string, numWorkers)
queueChan := make(chan string)
tempResult := make(chan Result)
go func(queueChan <-chan string) {
// Single thread to safely manage the queue
c.addAddress(<-queueChan)
nextAddress, _ := c.popAddress()
for {
select {
case address := <-queueChan:
// Enque address
c.addAddress(address)
case workChan <- nextAddress:
nextAddress, err := c.popAddress()
if err != nil {
// Block until we get more work
c.addAddress(<-queueChan)
nextAddress, _ = c.popAddress()
}
}
}
}(queueChan)
go func(tempResult <-chan Result, workChan chan<- string) {
// Convert from result to queue.
for {
select {
case r := <-tempResult:
}
}
}(tempResult, workChan)
for address := range workChan {
// Spawn more workers as we get buffered work
go func() {
logger.Debugf("[%s] Worker started.", address)
tempResult <- *c.handleAddress(address)
}()
}
2014-04-22 18:28:10 -05:00
}
2014-05-10 20:48:33 -05:00
*/
2014-04-22 18:28:10 -05:00
2014-04-25 18:24:27 -05:00
// Run drives the crawl event loop with up to numWorkers concurrent workers,
// stopping after stopAfter productive results (stopAfter == 0 means no limit;
// crawl until the queue drains). Returns the accumulated results.
func (c *Crawler) Run(numWorkers int, stopAfter int) *[]Result {
	numActive := 0

	resultChan := make(chan Result)
	workerChan := make(chan struct{}, numWorkers) // semaphore bounding concurrent workers
	results := []Result{}

	if stopAfter == 0 {
		// No stopping.
		stopAfter = -1
	}

	// This is the main "event loop". Feels like there may be a better way to
	// manage the number of concurrent workers but I can't think of it right now.
	for {
		select {
		case workerChan <- struct{}{}:
			// Acquired a worker slot: spawn a worker for the next queued address.
			// NOTE(review): numActive is decremented below but never
			// incremented anywhere, so the (IsEmpty && numActive == 0) exit
			// below can never fire once a result has arrived — the crawler
			// only terminates via stopAfter. Confirm and fix.
			go func() {
				address := <-c.queue.Output
				logger.Debugf("[%s] Worker started.", address)
				resultChan <- *c.handleAddress(address)
			}()
		case r := <-resultChan:
			// Re-queue discovered peers that are fresh enough (within peerAge).
			timestampSince := time.Now().Add(-c.peerAge)
			for _, addr := range r.Peers {
				if !addr.Timestamp.After(timestampSince) {
					continue
				}
				c.queue.Input <- NetAddressKey(addr)
			}
			numActive--

			// Only results that actually produced peers count toward stopAfter.
			if len(r.Peers) > 0 {
				stopAfter--
				results = append(results, r)

				logger.Infof("[%s] Returned %d peers. Total %d unique peers via %d connected (of %d attempted).", r.Node.Address, len(r.Peers), c.numUnique, c.numConnected, c.numAttempted)
			}

			if stopAfter == 0 || (c.queue.IsEmpty() && numActive == 0) {
				logger.Infof("Done.")
				return &results
			}

			// Release the worker slot so another worker may be spawned.
			<-workerChan
		}
	}
}