crawler

package module
v0.0.0-...-1db56e8 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 27, 2015 License: MIT Imports: 7 Imported by: 0

README

go-crawler Build Status Build Status

just an awesome crawler in go

configurable concurrency

Playground

Quick Glance

package main

import (
    "fmt"
    "github.com/ddo/go-crawler"
)

func main() {
    //counter, just for better log
    no := 0

    /*
        default limit:  10
        default client: timeout 10s
        default filter: http(s), no duplicated
        default scope:  http(s), no duplicated, same host only
    */
    c, err := crawler.New(&crawler.Config{
        Url: "http://facebook.com/",
    })

    //your url is invalid
    if err != nil {
        panic(err)
    }

    //url handler
    receiver_url := func(url string) {
        no++
        fmt.Println(no, "\t ", url)
    }

    //err handler
    receiver_err := func(err error) {
        fmt.Println("error\t", err)
    }

    //trigger
    c.Start(receiver_url, receiver_err)

    fmt.Println("done")
}

output

1     https://www.facebook.com/recover/initiate
2     http://facebook.com/legal/terms
3     http://facebook.com/about/privacy
4     http://facebook.com/help/cookies
5     http://facebook.com/pages/create/?ref_type=registration_form
6     https://vi-vn.facebook.com/
7     https://www.facebook.com/
8     https://zh-tw.facebook.com/
9     https://ko-kr.facebook.com/
10    https://ja-jp.facebook.com/
done

Todo

  • init with Filter
  • init with http.Client
  • crawler testing
  • travis-ci
  • coveralls.io
  • non utf-8 issue
  • init with Fetcher
  • mutex/chan limit/worker counter
  • delay
  • README advanced doc

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Config

type Config struct {
	Url     string
	Limit   int
	Client  *http.Client
	Filters []Filter
	Scopes  []Filter
}

type Crawler

type Crawler struct {
	// contains filtered or unexported fields
}

func New

func New(config *Config) (*Crawler, error)

func (*Crawler) Start

func (c *Crawler) Start(r_url receiver_url, r_err receiver_err)

type Fetcher

type Fetcher struct {
	Client *http.Client
	Picker Picker
}

func (*Fetcher) Fetch

func (f *Fetcher) Fetch(u *url.URL) (urls_obj []*url.URL, err error)

type Filter

type Filter interface {
	Filter(*url.URL) bool
}

type FilterSameHost

type FilterSameHost struct {
	U *url.URL
}

Filter that accepts URLs on the same hostname only

func (*FilterSameHost) Filter

func (f *FilterSameHost) Filter(u *url.URL) bool

type FilterUnique

type FilterUnique struct {
	Us []*url.URL
}

Filter that rejects duplicate URLs

func (*FilterUnique) Filter

func (f *FilterUnique) Filter(u *url.URL) bool

type FilterUrl

type FilterUrl struct{}

Filter that accepts only HTTP(S) URLs

func (*FilterUrl) Filter

func (f *FilterUrl) Filter(u *url.URL) bool

type Picker

type Picker interface {
	Pick(r io.Reader) ([]string, error)
}

type PickerAttr

type PickerAttr struct {
	TagName string
	Attr    string
}

default picker

func (*PickerAttr) Pick

func (p *PickerAttr) Pick(r io.Reader) (data []string, err error)

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL