pttifierLib

package module
v0.0.0-...-78c658d Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 4, 2016 License: MIT Imports: 9 Imported by: 0

README

PttifierLib

GitHub license | Build Status

A library for crawling web posts from PTT, a Taiwanese BBS, and picking out the articles that match the rules you define.

Install

go get -u -x github.com/tommady/pttifierLib

Example

package main

import (
	"fmt"
	"log"

	ptlib "github.com/tommady/pttifierLib"
)

// main fetches the page that WrapBoardPageLink builds for board "WomenTalk"
// and page number "5014", collects the posts found on it, runs every
// configured parser over those posts concurrently, and prints the date,
// title, and author of each post the parsers returned.
//
// Any failure while fetching the page or crawling the board terminates the
// program through log.Fatalf.
func main() {
	link := ptlib.WrapBoardPageLink("WomenTalk", "5014")
	root, err := ptlib.GetNodeFromLink(link)
	if err != nil {
		log.Fatalf("GG on get page: %v", err)
	}

	board := ptlib.NewBoardCrawler(root)
	posts := board.GetPostsInfosAndArticles()
	// The crawler reports failures through Err rather than a return value,
	// so it has to be checked after the call.
	if err := board.Err(); err != nil {
		log.Fatalf("GG on board: %v", err)
	}

	// One parser with only a title rule, one with a title and an author rule.
	parserList := []*ptlib.Parser{
		ptlib.NewParser(
			ptlib.SetParserTitle("女"),
		),
		ptlib.NewParser(
			ptlib.SetParserTitle("男"),
			ptlib.SetParserAuthor("a2006lkk"),
		),
	}

	// The channel is buffered to the number of parsers so that no worker
	// goroutine ever blocks on its send.
	resultCh := make(chan []*ptlib.BoardInfoAndArticle, len(parserList))
	for _, parser := range parserList {
		go func(parser *ptlib.Parser) {
			resultCh <- parser.ParsingAll(posts)
		}(parser)
	}

	// Exactly one result arrives per parser; a plain receive is enough,
	// no select is needed for a single channel.
	var results []*ptlib.BoardInfoAndArticle
	for range parserList {
		results = append(results, <-resultCh...)
	}

	for _, r := range results {
		fmt.Println(r.Date, r.Title, r.Author)
	}
}

Documentation

Index

Constants

View Source
// Tag markers for tweets; the constant names label them as praise, normal,
// and boo respectively.
// NOTE(review): presumably these are the values stored in Tweet.Tag — confirm
// against the crawler implementation.
const (
	TweetTagPraise = "推"
	TweetTagNormal = "→"
	TweetTagBoo    = "噓"
)
View Source
// URL building blocks for the PTT web site.
const (
	// PttBaseURL is the site root, without a trailing slash.
	PttBaseURL = "https://www.ptt.cc"
	// PttBaseCrawlingURL is the site root followed by the "/bbs/" path,
	// including the trailing slash.
	PttBaseCrawlingURL = "https://www.ptt.cc/bbs/"
	// DefaultParsingPage is the "/index" path segment.
	// NOTE(review): presumably combined with a board name and page number by
	// WrapBoardPageLink — confirm against its implementation.
	DefaultParsingPage = "/index"
)

Variables

View Source
// Sentinel errors whose messages name the board crawler and a nil node.
// NOTE(review): these messages use the prefix "pttifier." while
// ErrMainContainerNodeNil uses "pttifierLib." — the prefixes are inconsistent.
var (
	// ErrActionBarNodeNil's message states that the action bar node is nil.
	ErrActionBarNodeNil = errors.New("pttifier.boardCrawler: action bar node is nil")
	// ErrRListNodeNil's message states that the R list node is nil.
	ErrRListNodeNil     = errors.New("pttifier.boardCrawler: R list node is nil")
)
View Source
// Package-level, mutable reconnect settings.
// NOTE(review): presumably consulted when fetching a page fails (e.g. by
// GetNodeFromLink) — confirm against the implementation.
var (
	// MaxReConnectTimes is an int that defaults to 5.
	MaxReConnectTimes                   = 5
	// MaxReConnectDelayTime is a time.Duration that defaults to 1. Taken as a
	// bare Duration that is one nanosecond.
	// NOTE(review): presumably multiplied by a unit such as time.Second at the
	// point of use — confirm before changing the value.
	MaxReConnectDelayTime time.Duration = 1
)
View Source
// Sentinel error whose message names the post crawler.
var (
	// ErrMainContainerNodeNil's message states that the main container node
	// is nil.
	ErrMainContainerNodeNil = errors.New("pttifierLib.postCrawler: main container node is nil")
)

Functions

func EqualToComparison

func EqualToComparison(n, comparison int) bool

func GetActionBarNode

func GetActionBarNode(root *html.Node) *html.Node
func GetNodeFromLink(targetURL string) (*html.Node, error)

func GetRListNode

func GetRListNode(root *html.Node) *html.Node

func GreaterThanComparison

func GreaterThanComparison(n, comparison int) bool

func GreaterThanOrEqualToComparison

func GreaterThanOrEqualToComparison(n, comparison int) bool

func LessThanComparison

func LessThanComparison(n, comparison int) bool

func LessThanOrEqualToComparison

func LessThanOrEqualToComparison(n, comparison int) bool

func RemoveBottumAnnouncements

func RemoveBottumAnnouncements(rListNode *html.Node)

func TweetAmountStringToInt

func TweetAmountStringToInt(strTweetAmount string) (intTweetAmount int)
func WrapBoardPageLink(targetBoard, pageNum string) string

Types

type BaseInfo

// BaseInfo groups a URL, title, author, and date, all held as strings.
// Each field is tagged with a lower-case JSON key. BoardInfo embeds this type.
type BaseInfo struct {
	URL    string `json:"url"`
	Title  string `json:"title"`
	Author string `json:"author"`
	// Date is kept as a plain string; its format is not specified here.
	Date   string `json:"date"`
}

type BoardCrawler

// BoardCrawler is an opaque type created by NewBoardCrawler from an
// *html.Node. Its methods return post infos, page links, and the current page
// link number; Err returns an error value.
// NOTE(review): presumably Err reports a failure of the preceding method
// call — confirm against the implementation.
type BoardCrawler struct {
	// contains filtered or unexported fields
}

func NewBoardCrawler

func NewBoardCrawler(root *html.Node) *BoardCrawler

func (*BoardCrawler) Err

func (b *BoardCrawler) Err() error

func (*BoardCrawler) GetCurrPageLinkNum

func (b *BoardCrawler) GetCurrPageLinkNum() int
func (b *BoardCrawler) GetNextPageLink() string

func (*BoardCrawler) GetPostsInfos

func (b *BoardCrawler) GetPostsInfos() (infos []*BoardInfo)

func (*BoardCrawler) GetPostsInfosAndArticles

func (b *BoardCrawler) GetPostsInfosAndArticles() []*BoardInfoAndArticle
func (b *BoardCrawler) GetPrevPageLink() string

type BoardInfo

// BoardInfo is a BaseInfo (embedded by value) extended with an integer tweet
// amount, tagged "tweet_amount" for JSON.
type BoardInfo struct {
	BaseInfo
	TweetAmount int `json:"tweet_amount"`
}

type BoardInfoAndArticle

// BoardInfoAndArticle combines a BoardInfo with a content string and a list
// of tweets.
type BoardInfoAndArticle struct {
	// BoardInfo is embedded as a pointer, so its promoted fields (URL, Title,
	// Author, Date, TweetAmount) may only be accessed when it is non-nil.
	*BoardInfo
	Content string   `json:"content"`
	Tweets  []*Tweet `json:"tweets"`
}

type IntMatcher

// IntMatcher is a predicate over two ints, n and comparison. The exported
// *Comparison functions (EqualToComparison, GreaterThanComparison, and so on)
// share this signature.
type IntMatcher func(n, comparison int) bool

type MatcherRule

// MatcherRule holds one matcher function per rule field: string matchers for
// title, author, and content, and an int matcher for the tweet amount.
// NOTE(review): presumably each matcher is applied to the like-named TextRule
// field — confirm against the Parser implementation.
type MatcherRule struct {
	TitleMatcher       StrMatcher
	AuthorMatcher      StrMatcher
	ContentMatcher     StrMatcher
	TweetAmountMatcher IntMatcher
}

type Parser

// Parser embeds both a TextRule and a MatcherRule, so the fields of each are
// promoted onto the Parser. NewParser builds one from a list of RuleSetting
// values; ParsingAll and ParsingBoard take a slice of posts and return a
// slice of the same element type.
type Parser struct {
	TextRule
	MatcherRule
	// contains filtered or unexported fields
}

func NewParser

func NewParser(settings ...RuleSetting) *Parser

func (*Parser) Err

func (p *Parser) Err() error

func (*Parser) ParsingAll

func (p *Parser) ParsingAll(posts []*BoardInfoAndArticle) (results []*BoardInfoAndArticle)

func (*Parser) ParsingBoard

func (p *Parser) ParsingBoard(posts []*BoardInfo) (results []*BoardInfo)

type PostCrawler

// PostCrawler is an opaque type created by NewPostCrawler from an *html.Node.
// Its getter methods return the author, content, date, IP, title, tweets, and
// URL; Err returns an error value.
type PostCrawler struct {
	// contains filtered or unexported fields
}

func NewPostCrawler

func NewPostCrawler(root *html.Node) *PostCrawler

func (*PostCrawler) Err

func (p *PostCrawler) Err() error

func (*PostCrawler) GetAuthor

func (p *PostCrawler) GetAuthor() string

func (*PostCrawler) GetContent

func (p *PostCrawler) GetContent() string

func (*PostCrawler) GetDate

func (p *PostCrawler) GetDate() string

func (*PostCrawler) GetIP

func (p *PostCrawler) GetIP() (ip net.IP)

func (*PostCrawler) GetTitle

func (p *PostCrawler) GetTitle() string

func (*PostCrawler) GetTweets

func (p *PostCrawler) GetTweets() (tweets []*Tweet)

func (*PostCrawler) GetURL

func (p *PostCrawler) GetURL() (url string)

type RuleSetting

// RuleSetting is a function that receives a *Parser. NewParser accepts a
// variadic list of them, and the SetParser* functions each return one.
type RuleSetting func(*Parser)

func SetParserAuthor

func SetParserAuthor(author string) RuleSetting

func SetParserAuthorMatcher

func SetParserAuthorMatcher(matcher StrMatcher) RuleSetting

func SetParserContent

func SetParserContent(content string) RuleSetting

func SetParserContentMatcher

func SetParserContentMatcher(matcher StrMatcher) RuleSetting

func SetParserTitle

func SetParserTitle(title string) RuleSetting

func SetParserTitleMatcher

func SetParserTitleMatcher(matcher StrMatcher) RuleSetting

func SetParserTweetAmount

func SetParserTweetAmount(tweetAmount string) RuleSetting

type StrMatcher

// StrMatcher is a predicate over two strings, s and chars.
// NOTE(review): presumably s is the text under test and chars the pattern to
// look for — confirm against the default matchers.
type StrMatcher func(s, chars string) bool

type TextRule

// TextRule holds the rule values of a Parser: three strings (title, author,
// content) and an integer tweet amount. It is embedded in Parser.
type TextRule struct {
	Title       string
	Author      string
	Content     string
	TweetAmount int
}

type Tweet

// Tweet holds an author, content, date, and tag, all as strings.
// The fields carry no JSON tags, unlike those of BaseInfo and BoardInfo, so
// encoding/json uses the Go field names as keys.
type Tweet struct {
	Author  string
	Content string
	Date    string
	// NOTE(review): presumably one of the TweetTag* constants — confirm.
	Tag     string
}

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL