spider_lib

package module
v0.0.0-...-322cc5b Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 29, 2016 License: Apache-2.0 Imports: 5 Imported by: 0

README

pholcus-goscript

Go scripts (spider rule definitions) for henrylee2cn/pholcus.

Documentation

Index

Constants

This section is empty.

Variables

View Source
// JOB51 is the spider definition for job postings found through the
// search.51job.com result pages.
//
// Flow of the rule tree:
//   - Root asks the "请求列表" rule (through its AidFunc) to enqueue the search
//     result pages.
//   - Each downloaded result page is parsed by "获取列表", which enqueues one
//     request per posting link.
//   - Each posting page is parsed by "output", which emits one record whose
//     columns follow ItemFields.
//
// The job_type column is always emitted as an empty string; no value is
// extracted for it from the page.
var JOB51 = &Spider{
	Name:        "JOB51",
	Description: "前程无忧招聘职务  [http://51job.com//]",

	EnableCookie: false,
	RuleTree: &RuleTree{
		Root: func(ctx *Context) {
			// "loop" is the half-open page range [1, 2); "Rule" names the rule
			// that must parse every downloaded result page.
			ctx.Aid(map[string]interface{}{"loop": [2]int{1, 2}, "Rule": "获取列表"}, "请求列表")
		},

		Trunk: map[string]*Rule{

			"请求列表": {
				// AidFunc enqueues one search-result request per page number in
				// aid["loop"] and always returns nil.
				AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
					const listURL = "http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=000000%2C00&district=000000&funtype=0000&industrytype=00&issuedate=9&providesalary=99&keyword=%E8%BD%AF%E4%BB%B6%E5%B7%A5%E7%A8%8B%E5%B8%88%28java%29&keywordtype=0&lang=c&stype=2&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&list_type=0&fromType=14&dibiaoid=0&confirmdate=9&curr_page="

					// This rule has no ParseFunc, so result pages must not be
					// routed back to it; fall back to the list parser when the
					// caller did not name a usable rule.
					parseRule, ok := aid["Rule"].(string)
					if !ok || parseRule == "" || parseRule == "请求列表" {
						parseRule = "获取列表"
					}

					for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
						ctx.AddQueue(&request.Request{
							Url:  listURL + strconv.Itoa(loop[0]),
							Rule: parseRule,
						})
					}
					return nil
				},
			},

			"获取列表": {
				// ParseFunc walks every ".t1" element of a result page and
				// enqueues the link of its first anchor for the "output" rule.
				ParseFunc: func(ctx *Context) {
					logs.Log.Informational("获取列表log")

					ctx.GetDom().
						Find(".t1").
						Each(func(_ int, s *goquery.Selection) {
							href, ok := s.Find("a").Attr("href")
							if !ok || href == "" {
								// No link in this element (e.g. a header row):
								// nothing to fetch.
								return
							}

							logs.Log.Informational("url:", href)

							ctx.AddQueue(&request.Request{
								Url:      href,
								Rule:     "output",
								Priority: 1,
							})
						})
				},
			},

			"output": {

				ItemFields: []string{
					"salary",
					"work_position",
					"publish_date",
					"job_type",
					"job_years",
					"education",
					"number",
					"job_category",
				},
				// ParseFunc extracts the posting details and outputs them keyed
				// by their index in ItemFields.
				ParseFunc: func(ctx *Context) {
					doc := ctx.GetDom()

					header := doc.Find(".tHjob").First()
					// The ".sp4" entries inside the first ".jtag" of the first
					// ".tCompany_main" are read by position: 0 job_years,
					// 1 education, 2 number, 3 publish_date.
					tags := doc.Find(".tCompany_main").First().Find(".jtag").First().Find(".sp4")

					salary := header.Find("strong").First().Text()
					workPosition := header.Find(".lname").First().Text()
					publishDate := tags.Eq(3).Text()
					jobType := ""
					jobYears := tags.Eq(0).Text()
					education := tags.Eq(1).Text()
					number := tags.Eq(2).Text()
					jobCategory := header.Find("h1").First().Text()

					ctx.Output(map[int]interface{}{
						0: salary,
						1: workPosition,
						2: publishDate,
						3: jobType,
						4: jobYears,
						5: education,
						6: number,
						7: jobCategory,
					})
				},
			},
		},
	},
}
View Source
// ZHILIAN is the spider definition for job postings found through the
// sou.zhaopin.com search result pages.
//
// Flow of the rule tree:
//   - Root asks the "请求列表" rule (through its AidFunc) to enqueue the search
//     result pages.
//   - Each downloaded result page goes to the ParseFunc of "请求列表", which
//     enqueues a follow-up page and then hands the same page to "获取列表".
//   - "获取列表" enqueues one request per posting link.
//   - Each posting page is parsed by "output", which emits one record whose
//     columns follow ItemFields.
var ZHILIAN = &Spider{
	Name:        "zhaopin",
	Description: "智联招聘职务  [http://sou.zhaopin.com/]",

	EnableCookie: false,
	RuleTree: &RuleTree{
		Root: func(ctx *Context) {
			// "loop" is the half-open page range [0, 1).
			ctx.Aid(map[string]interface{}{"loop": [2]int{0, 1}, "Rule": "请求列表"}, "请求列表")
		},

		Trunk: map[string]*Rule{

			"请求列表": {
				// AidFunc enqueues one search-result request per page number in
				// aid["loop"] and always returns nil.
				AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
					for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
						ctx.AddQueue(&request.Request{
							Url:  "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=java%E9%AB%98%E7%BA%A7%E5%B7%A5%E7%A8%8B%E5%B8%88&sm=0&p=" + strconv.Itoa(loop[0]),
							Rule: "请求列表",
						})
					}
					return nil
				},

				// ParseFunc enqueues the follow-up result page and then lets
				// "获取列表" parse the current page.
				ParseFunc: func(ctx *Context) {
					// NOTE(review): curr is never assigned, so it is always 0
					// and the follow-up request below always targets p=1. The
					// "p" value stored in Temp is not read back anywhere in
					// this spider. Presumably curr was meant to come from the
					// current request's Temp — confirm the accessor against
					// the pholcus Context API before changing pagination.
					var curr int
					next := curr + 1

					logs.Log.Informational("页码:", curr)
					logs.Log.Informational("页码:", strconv.Itoa(next))

					ctx.AddQueue(&request.Request{
						Url:  "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=java%E9%AB%98%E7%BA%A7%E5%B7%A5%E7%A8%8B%E5%B8%88&sm=0&p=" + strconv.Itoa(next),
						Rule: "请求列表",
						Temp: map[string]interface{}{"p": next},
					})

					ctx.Parse("获取列表")
				},
			},

			"获取列表": {
				// ParseFunc walks every ".zwmc" element of a result page and
				// enqueues the link of its first anchor for the "output" rule.
				ParseFunc: func(ctx *Context) {
					logs.Log.Informational("获取列表log")

					logs.Log.Informational("获取列表GetDom", ctx.GetDom())

					ctx.GetDom().
						Find(".zwmc").
						Each(func(_ int, s *goquery.Selection) {
							href, ok := s.Find("a").Attr("href")
							if !ok || href == "" {
								// No link in this element (e.g. a header row):
								// nothing to fetch.
								return
							}

							logs.Log.Informational("url:", href)

							ctx.AddQueue(&request.Request{
								Url:      href,
								Rule:     "output",
								Priority: 1,
							})
						})
				},
			},

			"output": {

				ItemFields: []string{
					"salary",
					"work_position",
					"publish_date",
					"job_type",
					"job_years",
					"education",
					"number",
					"job_category",
				},
				// ParseFunc extracts the posting details and outputs them keyed
				// by their index in ItemFields.
				ParseFunc: func(ctx *Context) {
					// The <li> entries of the first ".terminalpage-left" are
					// read by position; entry i supplies the value for
					// ItemFields[i] from its first <strong>.
					items := ctx.GetDom().Find(".terminalpage-left").First().Find("li")

					salary := items.First().Find("strong").First().Text()
					workPosition := items.Eq(1).Find("strong").First().Text()
					publishDate := items.Eq(2).Find("strong").First().Text()
					jobType := items.Eq(3).Find("strong").First().Text()
					jobYears := items.Eq(4).Find("strong").First().Text()
					education := items.Eq(5).Find("strong").First().Text()
					number := items.Eq(6).Find("strong").First().Text()
					jobCategory := items.Eq(7).Find("strong").First().Text()

					ctx.Output(map[int]interface{}{
						0: salary,
						1: workPosition,
						2: publishDate,
						3: jobType,
						4: jobYears,
						5: education,
						6: number,
						7: jobCategory,
					})
				},
			},
		},
	},
}

Functions

This section is empty.

Types

This section is empty.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL