spidergo

package module
v0.0.0-...-7a0d02b Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 23, 2016 License: MIT Imports: 12 Imported by: 0

README

spidergo

Go Walker GoDoc

🪲 A high-performance spider (crawler) written in Go.

Feature

  • Concurrent
  • Distributed
  • Supports analysing HTML pages and downloading binary files

Installation

go get github.com/sakeven/spidergo

Example

There is an example in github/sakeven/spidergo/example. Below is a file from the example.

package main

import (
	"log"
	"net/http"
	"os"
	"regexp"
	"runtime"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/sakeven/spidergo"
	"github.com/sakeven/spidergo/lib/analyser"
	"github.com/sakeven/spidergo/lib/page"
	"github.com/sakeven/spidergo/lib/result"
)

// main configures and runs the spider against the HDU online judge
// problem list, using two analysers and four worker threads, with a
// 100ms delay between fetches and a maximum crawl depth of 4.
func main() {
	log.SetFlags(log.Ldate | log.Lshortfile | log.Ltime)
	// Explicit GOMAXPROCS was required before Go 1.5; harmless but redundant today.
	runtime.GOMAXPROCS(runtime.NumCPU())

	req, err := http.NewRequest("GET", "http://acm.hdu.edu.cn/listproblem.php?vol=1", nil)
	if err != nil {
		// A malformed seed URL means nothing to crawl; abort instead of
		// silently running an empty spider.
		log.Fatalln(err)
	}

	spidergo.New([]analyser.Analyser{NewAnalyser(), NewAnalyser()}).
		SetThreadNum(4).
		AddRequest(req).
		SetDelay(100).
		SetDepth(4).
		Run()
}

// Analyser implements the analyser.Analyser interface for the HDU
// online judge: it saves fetched JPEG images to disk and extracts
// follow-up links from fetched HTML pages. It is stateless.
type Analyser struct {
}

// titleRx matches the inline `p(...)` script calls on the problem-list
// page. Compiled once at package scope instead of on every Analyse call.
var titleRx = regexp.MustCompile(`p\((.*?)\);`)

// Analyse processes a fetched page. JPEG responses are written to the
// out/ directory; HTML responses are scanned for further list pages,
// images, and problem links, which are queued as new requests via
// pg.AddReq. It always returns nil: this analyser communicates only
// through side effects (files and queued requests).
func (a *Analyser) Analyse(pg *page.Page) *result.Result {
	// pg.Err appears to be a string description of a fetch error;
	// nothing useful can be done with a failed page.
	if pg.Err != "" {
		log.Println(pg.Err)
		return nil
	}

	if pg.ContentType == "image/jpeg" {
		saveImage(pg)
		return nil
	}

	// Anything that is neither a JPEG nor HTML is ignored.
	if pg.ContentType != "text/html" {
		return nil
	}

	queueListPages(pg)
	queueImages(pg)
	queueProblems(pg)
	return nil
}

// saveImage writes the raw JPEG body to out/<last URL path segment>.
// The out/ directory is assumed to exist — TODO confirm with caller.
func saveImage(pg *page.Page) {
	parts := strings.Split(pg.Req.Req.URL.String(), "/")
	f, err := os.Create("out/" + parts[len(parts)-1])
	if err != nil {
		log.Println(err)
		return
	}
	defer f.Close()
	// A short or failed write would otherwise be silently dropped.
	if _, err := f.Write(pg.Raw); err != nil {
		log.Println(err)
	}
}

// queueListPages queues anchors that point at further problem-list pages.
func queueListPages(pg *page.Page) {
	pg.Doc.Find("a").Each(func(_ int, se *goquery.Selection) {
		href, _ := se.Attr("href")
		// Only relative links like "listproblem.php?vol=2" are followed.
		if !strings.HasPrefix(href, "list") {
			return
		}
		req, err := http.NewRequest("GET", "http://acm.hdu.edu.cn/"+href, nil)
		if err != nil {
			log.Println(err)
			return
		}
		pg.AddReq(req)
	})
}

// queueImages queues every <img> src on the page for download.
func queueImages(pg *page.Page) {
	pg.Doc.Find("img").Each(func(_ int, se *goquery.Selection) {
		src, _ := se.Attr("src")
		href := page.FixUri("http://acm.hdu.edu.cn/" + src)
		req, err := http.NewRequest("GET", href, nil)
		if err != nil {
			log.Println("req", err)
			return
		}
		pg.AddReq(req)
	})
}

// queueProblems extracts problem IDs from the inline `p(...)` script
// calls and queues the corresponding problem pages.
func queueProblems(pg *page.Page) {
	text := pg.Doc.Find("script").Text()
	for _, m := range titleRx.FindAllString(text, -1) {
		// Each match looks like p(<vol>,<pid>,...); the second
		// comma-separated field is the problem ID.
		pro := strings.Split(m, ",")
		if len(pro) < 2 {
			continue
		}
		req, err := http.NewRequest("GET", "http://acm.hdu.edu.cn/showproblem.php?pid="+pro[1], nil)
		if err != nil {
			log.Println(err)
			continue
		}
		pg.AddReq(req)
	}
}

// NewAnalyser returns a ready-to-use Analyser. The type is stateless,
// so every returned instance behaves identically.
func NewAnalyser() *Analyser {
	a := new(Analyser)
	return a
}

License

Under MIT

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Result

type Result struct {
}

type Spider

type Spider struct {
	OnWatch bool
	// contains filtered or unexported fields
}

func New

func New(_analysers []analyser.Analyser) *Spider

func (*Spider) AddPipeline

func (s *Spider) AddPipeline(pipeline pipe.Piper) *Spider

func (*Spider) AddRequest

func (s *Spider) AddRequest(req *http.Request) *Spider

func (*Spider) CanStop

func (s *Spider) CanStop() bool

func (*Spider) RegisterDownload

func (s *Spider) RegisterDownload(downloaders []downloader.Downloader) *Spider

func (*Spider) RegisterScheduler

func (s *Spider) RegisterScheduler(_scheduler scheduler.Scheduler) *Spider

func (*Spider) Run

func (s *Spider) Run()

Run begins running the spider.

func (*Spider) SetDelay

func (s *Spider) SetDelay(delay uint) *Spider

SetDelay sets the delay applied after each fetched URL.

func (*Spider) SetDepth

func (s *Spider) SetDepth(depth uint) *Spider

SetDepth sets the maximum crawl depth.

func (*Spider) SetThreadNum

func (s *Spider) SetThreadNum(n uint) *Spider

func (*Spider) Stop

func (s *Spider) Stop()

func (*Spider) Watch

func (s *Spider) Watch()

Directories

Path Synopsis
lib
raw

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL