down

command

v0.0.0-...-c4fa36d Latest Latest Go to latest Published: Jun 3, 2019 License: Apache-2.0 Imports: 2 Imported by: 0

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/venmotools/golearning

Links

Open Source Insights

README ¶

通道版爬虫

使用

使用时需要编写内容解析函数，解析函数的签名如下：

func (content []byte) core.Response

如果需要先提取页面中的所有 URL，再对每个 URL 指向的页面内容进行解析，可以使用如下方法：

// ParserIndex parses an index page and queues a follow-up request for every
// job link found on it.
//
// content is the raw HTML of the index page. Every <a> element is visited and
// each href that passes the filter below is appended to the result as a GET
// request whose response is handled by ParserURL. If the HTML cannot be
// parsed, the error is printed and an empty result is returned.
func ParserIndex(content []byte) core.Response {
	res := core.NewRequestResult()

	// Build a queryable document from the raw page bytes.
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(content))
	if err != nil {
		fmt.Println(err)
		// doc must not be used after a failed parse, so stop here.
		return res
	}

	// Collect the href attribute of every <a> tag.
	doc.Find("a").Each(func(_ int, selection *goquery.Selection) {
		href, ok := selection.Attr("href")
		// Skip anchors without an href, hrefs starting with "//" or "#",
		// hrefs containing "javascript", and anything that does not
		// contain "jobs".
		if !ok || strings.HasPrefix(href, "//") || strings.Contains(href, "javascript") ||
			strings.HasPrefix(href, "#") || !strings.Contains(href, "jobs") {
			return
		}
		// AppendRequest queues a new request; ParserURL extracts the
		// content of the page behind href.
		res.AppendRequest(core.NewGetRequest(href, ParserURL))
	})
	return res
}

// ParserURL parses a job detail page and stores the text of every
// <div class="job_msg"> element as an item.
//
// content is the raw HTML of the page. The text of each matching element is
// converted from gb2312 to utf-8, trimmed of surrounding whitespace, and
// appended to the result; text that is empty after trimming is skipped. If
// the HTML cannot be parsed, the error is printed and an empty result is
// returned.
func ParserURL(content []byte) core.Response {
	res := core.NewRequestResult()

	// Build a queryable document from the raw page bytes.
	doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(content))
	if err != nil {
		fmt.Println(err)
		// doc must not be used after a failed parse, so stop here.
		return res
	}

	doc.Find("div.job_msg").Each(func(_ int, selection *goquery.Selection) {
		c, err := iconv.ConvertString(selection.Text(), "gb2312", "utf-8")
		if err != nil {
			// Report the failure but keep going with whatever text was
			// returned, as the original best-effort handling did.
			fmt.Println(err)
		}
		// Trim before the emptiness check so whitespace-only text is not
		// stored as an empty item.
		item := strings.TrimSpace(c)
		if item == "" {
			return
		}
		// AppendItem stores the extracted item.
		res.AppendItem(item)
	})
	return res
}

main.go

// main builds the initial GET request for the index page, with
// project.ParserIndex as its parser, and runs a new engine on it.
func main() {
	seed := core.NewGetRequest("http://xxxx.com", project.ParserIndex)
	crawler := core.NewEngine()
	crawler.Run(seed)
}

engine 默认使用 10 个 worker。

Documentation ¶

There is no documentation for this package.

Source Files ¶

View all Source files

main.go

Directories ¶

Path	Synopsis
exception
project
saver

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL