wukongwenda

package
v1.0.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 22, 2023 License: Apache-2.0 Imports: 7 Imported by: 0

README

悟空问答每个专栏

抓取悟空问答每个专栏的内容;只要不手动停止,就会持续不断地抓取。

Documentation

Index

Constants

View Source
const (
	// WUKONG_NORMAL_URL is the feed endpoint used for every channel. It ends
	// with the "concern_id=" query key, so the channel identifier is appended
	// directly to it when a request URL is built.
	WUKONG_NORMAL_URL = "https://www.wukong.com/wenda/web/nativefeed/brow/?concern_id="

	// UA is the desktop browser User-Agent string attached to each request.
	UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
)

Variables

View Source
// WukongWenda is the spider definition for the Wukong Q&A (悟空问答) channel
// feeds.
//
// Root enqueues one feed request per entry in domains. The "获取结果" rule
// parses each JSON response, emits one output row per question, and, while
// the response reports has_more, enqueues the next page using the behot_time
// of the last item as the max_behot_time cursor.
var WukongWenda = &Spider{
	Name:        "悟空问答",
	Description: "悟空问答 各个频道专栏问题",

	EnableCookie: false,
	RuleTree: &RuleTree{
		// Root seeds the queue with the first page of every channel.
		Root: func(ctx *Context) {
			for _, domain := range domains {
				// The t query parameter carries the current time in milliseconds.
				url := WUKONG_NORMAL_URL + domain + "&t=" +
					strconv.FormatInt(time.Now().UnixNano()/1e6, 10)
				header := http.Header{}
				header.Add("User-Agent", UA)

				ctx.AddQueue(&request.Request{
					Url:    url,
					Header: header,
					Rule:   "获取结果",
				})
			}
		},

		Trunk: map[string]*Rule{
			"获取结果": {
				// Output columns, in the same order as the indexes passed to
				// ctx.Output below.
				ItemFields: []string{
					"问题标题",
					"问题描述",
					"问题回答",
					"问题url地址",
				},
				// ParseFunc extracts the questions of one feed page, schedules
				// the following page when there is one, and outputs the rows.
				ParseFunc: func(ctx *Context) {
					// question holds the fields read from one element of the
					// response's "data" array; offset is that element's
					// behot_time, used as the paging cursor.
					type question struct {
						title   string
						content string
						answer  string
						url     string
						offset  string
					}

					// Fetch the response text once and query it for both keys.
					body := ctx.GetText()
					more := gjson.Get(body, "has_more").String()

					var questionlist []question
					gjson.Get(body, "data").ForEach(func(_, value gjson.Result) bool {
						questionlist = append(questionlist, question{
							title:   value.Get("question.title").String(),
							content: value.Get("question.content.text").String(),
							answer:  value.Get("answer.content").String(),
							url:     "https://www.wukong.com/question/" + value.Get("question.qid").String() + "/",
							offset:  value.Get("behot_time").String(),
						})
						return true
					})

					// Only paginate when this page produced at least one item:
					// the cursor comes from the last item, and indexing an
					// empty slice would panic if has_more is true while data
					// is empty or missing.
					if more == "true" && len(questionlist) > 0 {
						newOffset := questionlist[len(questionlist)-1].offset
						header := http.Header{}
						header.Add("User-Agent", UA)

						// Drop any cursor left over from the previous page so
						// the parameter is not appended twice.
						visitURL := ctx.GetUrl()
						if i := strings.Index(visitURL, "&max_behot_time="); i >= 0 {
							visitURL = visitURL[:i]
						}

						ctx.AddQueue(&request.Request{
							Url:    visitURL + "&max_behot_time=" + newOffset,
							Header: header,
							Rule:   "获取结果",
						})
					}

					for _, v := range questionlist {
						ctx.Output(map[int]interface{}{
							0: v.title,
							1: v.content,
							2: v.answer,
							3: v.url,
						})
					}
				},
			},
		},
	},
}

Functions

This section is empty.

Types

This section is empty.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL