protofiles

package
v1.0.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 11, 2018 License: Apache-2.0 Imports: 7 Imported by: 0

Documentation

Overview

Package protofiles is a generated protocol buffer package.

It is generated from these files:

protofiles/ideacrawler.proto

It has these top-level messages:

Status
KVP
DomainOpt
Subscription
PageRequest
PageHTML

Index

Constants

This section is empty.

Variables

View Source
// PageReqType_name maps each PageReqType numeric value to its enum name.
var PageReqType_name = map[int32]string{
	0: "GET",
	1: "HEAD",
	2: "BUILTINJS",
	3: "JSCRIPT",
}
View Source
// PageReqType_value maps each PageReqType enum name to its numeric value.
// It is the inverse of PageReqType_name.
var PageReqType_value = map[string]int32{
	"GET":       0,
	"HEAD":      1,
	"BUILTINJS": 2,
	"JSCRIPT":   3,
}
View Source
// SubType_name maps each SubType numeric value to its enum name.
var SubType_name = map[int32]string{
	0: "SEQNUM",
	1: "DATETIME",
}
View Source
// SubType_value maps each SubType enum name to its numeric value.
// It is the inverse of SubType_name.
var SubType_value = map[string]int32{
	"SEQNUM":   0,
	"DATETIME": 1,
}

Functions

func RegisterIdeaCrawlerServer

func RegisterIdeaCrawlerServer(s *grpc.Server, srv IdeaCrawlerServer)

Types

type DomainOpt

// DomainOpt is the protobuf message holding the options for one crawl job.
// It is the request type of the AddDomainAndListen RPC.
type DomainOpt struct {
	SeedUrl string `protobuf:"bytes,1,opt,name=seedUrl" json:"seedUrl,omitempty"`
	// crawl delay in seconds
	MinDelay int32 `protobuf:"varint,2,opt,name=minDelay" json:"minDelay,omitempty"`
	MaxDelay int32 `protobuf:"varint,3,opt,name=maxDelay" json:"maxDelay,omitempty"`
	// don't follow any pages, just send back responses for the received URLs.
	NoFollow bool `protobuf:"varint,4,opt,name=noFollow" json:"noFollow,omitempty"`
	// only pages matching this regexp will be shipped back to the client.
	// only matching pages will be saved to s3 as well.
	// NOTE(review): the original comment called this "reqUrlRegexp", which
	// looks like a stale name for callbackUrlRegexp — confirm.
	CallbackUrlRegexp string `protobuf:"bytes,5,opt,name=callbackUrlRegexp" json:"callbackUrlRegexp,omitempty"`
	// only pages matching followUrlRegexp will be followed and sublinks added to fetcher.
	FollowUrlRegexp       string `protobuf:"bytes,6,opt,name=followUrlRegexp" json:"followUrlRegexp,omitempty"`
	MaxConcurrentRequests int32  `protobuf:"varint,7,opt,name=maxConcurrentRequests" json:"maxConcurrentRequests,omitempty"`
	// TODO
	Useragent string `protobuf:"bytes,8,opt,name=useragent" json:"useragent,omitempty"`
	Impolite  bool   `protobuf:"varint,9,opt,name=impolite" json:"impolite,omitempty"`
	// TODO
	Depth int32 `protobuf:"varint,10,opt,name=depth" json:"depth,omitempty"`
	// TODO: maybe just remove all scheduling features, immediate jobs only
	Repeat bool `protobuf:"varint,11,opt,name=repeat" json:"repeat,omitempty"`
	// needs a minimum of 5 minutes, ideally 1 hour
	Frequency *google_protobuf1.Duration `protobuf:"bytes,12,opt,name=frequency" json:"frequency,omitempty"`
	// time of the first run; ideally at least 10 minutes away.
	// for example, if this is saturday 10pm and frequency is 2 weeks,
	// it will continue to run at that time every 2 weeks
	Firstrun *google_protobuf.Timestamp `protobuf:"bytes,13,opt,name=firstrun" json:"firstrun,omitempty"`
	// Callback check order -
	//   (1) - callbackUrlRegexp
	//   (2) - callbackXpathMatch
	//   (3) - callbackXpathRegexp
	//  Any one has to match.
	// provide multiple xpaths as keys and expected values as value.  Pages are
	// sent back to client only if all values are found in page.
	CallbackXpathMatch []*KVP `protobuf:"bytes,14,rep,name=callbackXpathMatch" json:"callbackXpathMatch,omitempty"`
	// TODO keepKeywords and followOtherDomains still need to be implemented
	// keep page only if these keywords are present
	//  repeated string keepKeywords = 14;
	// drop pages if these keywords are present
	// NOTE(review): the keyword comments above appear to be leftovers from the
	// .proto file and do not describe CallbackXpathRegexp; per the callback
	// check order above, this field is check (3).
	CallbackXpathRegexp []*KVP `protobuf:"bytes,15,rep,name=callbackXpathRegexp" json:"callbackXpathRegexp,omitempty"`
	// in seconds; the time to wait for a new page before stopping the job.
	// affects workerIdleTTL of fetchbot.
	// min value is 600, which is also the default.
	MaxIdleTime        int32    `protobuf:"varint,16,opt,name=maxIdleTime" json:"maxIdleTime,omitempty"`
	FollowOtherDomains bool     `protobuf:"varint,17,opt,name=followOtherDomains" json:"followOtherDomains,omitempty"`
	KeepDomains        []string `protobuf:"bytes,18,rep,name=keepDomains" json:"keepDomains,omitempty"`
	DropDomains        []string `protobuf:"bytes,19,rep,name=dropDomains" json:"dropDomains,omitempty"`
	DomainDropPriority bool     `protobuf:"varint,20,opt,name=domainDropPriority" json:"domainDropPriority,omitempty"`
	// safe url normalizations happen by default. below is only for a few unsafe ones.
	// for list of safe normalizations: https://github.com/PuerkitoBio/purell/blob/master/purell.go#L59
	// remove index.php, etc,  fragments #section, +FlagsUsuallySafeGreedy from above link
	UnsafeNormalizeURL bool `protobuf:"varint,21,opt,name=unsafeNormalizeURL" json:"unsafeNormalizeURL,omitempty"`
	Login              bool `protobuf:"varint,22,opt,name=login" json:"login,omitempty"`
	// currently not possible, assumes false
	LoginUsingSelenium bool   `protobuf:"varint,23,opt,name=loginUsingSelenium" json:"loginUsingSelenium,omitempty"`
	LoginUrl           string `protobuf:"bytes,24,opt,name=loginUrl" json:"loginUrl,omitempty"`
	// for username, password fields, other form data to send on post request
	LoginPayload []*KVP `protobuf:"bytes,25,rep,name=loginPayload" json:"loginPayload,omitempty"`
	// if there are hidden fields in the page that need to be scraped before login
	LoginParseFields bool `protobuf:"varint,26,opt,name=loginParseFields" json:"loginParseFields,omitempty"`
	// key is key of hidden fields to parse from form, value is the xpath of field to scrape.
	LoginParseXpath []*KVP `protobuf:"bytes,27,rep,name=loginParseXpath" json:"loginParseXpath,omitempty"`
	// to check if login succeeded, provide xpath as key, and expected value as value.
	// for example, after login, xpath of top right corner, and username as value.
	// if the xpath is not there or if there is no value match, then we probably didn't login.
	LoginSuccessCheck *KVP `protobuf:"bytes,28,opt,name=loginSuccessCheck" json:"loginSuccessCheck,omitempty"`
	// checks login state after downloading each page, using check defined in 'loginSuccessCheck'
	CheckLoginAfterEachPage bool `protobuf:"varint,29,opt,name=checkLoginAfterEachPage" json:"checkLoginAfterEachPage,omitempty"`
	// javascript for login in chrome browser.
	LoginJS string `protobuf:"bytes,30,opt,name=loginJS" json:"loginJS,omitempty"`
	// whether to use chrome, location of chrome binary
	Chrome       bool   `protobuf:"varint,31,opt,name=chrome" json:"chrome,omitempty"`
	ChromeBinary string `protobuf:"bytes,32,opt,name=chromeBinary" json:"chromeBinary,omitempty"`
	DomLoadTime  int32  `protobuf:"varint,33,opt,name=domLoadTime" json:"domLoadTime,omitempty"`
	// check if this network interface is still active before every request.
	NetworkIface string `protobuf:"bytes,34,opt,name=networkIface" json:"networkIface,omitempty"`
	// TODO
	CancelOnDisconnect bool `protobuf:"varint,35,opt,name=cancelOnDisconnect" json:"cancelOnDisconnect,omitempty"`
	// if true, sends a HEAD request first to ensure content is text/html before sending GET request.
	CheckContent bool `protobuf:"varint,36,opt,name=checkContent" json:"checkContent,omitempty"`
	// if prefetch flag is true, downloads resources like img, css, png, svg, js associated with the actual page to mimic browser behaviour.
	Prefetch bool `protobuf:"varint,37,opt,name=prefetch" json:"prefetch,omitempty"`
}

func (*DomainOpt) Descriptor

func (*DomainOpt) Descriptor() ([]byte, []int)

func (*DomainOpt) GetCallbackUrlRegexp

func (m *DomainOpt) GetCallbackUrlRegexp() string

func (*DomainOpt) GetCallbackXpathMatch

func (m *DomainOpt) GetCallbackXpathMatch() []*KVP

func (*DomainOpt) GetCallbackXpathRegexp

func (m *DomainOpt) GetCallbackXpathRegexp() []*KVP

func (*DomainOpt) GetCancelOnDisconnect

func (m *DomainOpt) GetCancelOnDisconnect() bool

func (*DomainOpt) GetCheckContent

func (m *DomainOpt) GetCheckContent() bool

func (*DomainOpt) GetCheckLoginAfterEachPage

func (m *DomainOpt) GetCheckLoginAfterEachPage() bool

func (*DomainOpt) GetChrome

func (m *DomainOpt) GetChrome() bool

func (*DomainOpt) GetChromeBinary

func (m *DomainOpt) GetChromeBinary() string

func (*DomainOpt) GetDepth

func (m *DomainOpt) GetDepth() int32

func (*DomainOpt) GetDomLoadTime

func (m *DomainOpt) GetDomLoadTime() int32

func (*DomainOpt) GetDomainDropPriority

func (m *DomainOpt) GetDomainDropPriority() bool

func (*DomainOpt) GetDropDomains

func (m *DomainOpt) GetDropDomains() []string

func (*DomainOpt) GetFirstrun

func (m *DomainOpt) GetFirstrun() *google_protobuf.Timestamp

func (*DomainOpt) GetFollowOtherDomains

func (m *DomainOpt) GetFollowOtherDomains() bool

func (*DomainOpt) GetFollowUrlRegexp

func (m *DomainOpt) GetFollowUrlRegexp() string

func (*DomainOpt) GetFrequency

func (m *DomainOpt) GetFrequency() *google_protobuf1.Duration

func (*DomainOpt) GetImpolite

func (m *DomainOpt) GetImpolite() bool

func (*DomainOpt) GetKeepDomains

func (m *DomainOpt) GetKeepDomains() []string

func (*DomainOpt) GetLogin

func (m *DomainOpt) GetLogin() bool

func (*DomainOpt) GetLoginJS

func (m *DomainOpt) GetLoginJS() string

func (*DomainOpt) GetLoginParseFields

func (m *DomainOpt) GetLoginParseFields() bool

func (*DomainOpt) GetLoginParseXpath

func (m *DomainOpt) GetLoginParseXpath() []*KVP

func (*DomainOpt) GetLoginPayload

func (m *DomainOpt) GetLoginPayload() []*KVP

func (*DomainOpt) GetLoginSuccessCheck

func (m *DomainOpt) GetLoginSuccessCheck() *KVP

func (*DomainOpt) GetLoginUrl

func (m *DomainOpt) GetLoginUrl() string

func (*DomainOpt) GetLoginUsingSelenium

func (m *DomainOpt) GetLoginUsingSelenium() bool

func (*DomainOpt) GetMaxConcurrentRequests

func (m *DomainOpt) GetMaxConcurrentRequests() int32

func (*DomainOpt) GetMaxDelay

func (m *DomainOpt) GetMaxDelay() int32

func (*DomainOpt) GetMaxIdleTime

func (m *DomainOpt) GetMaxIdleTime() int32

func (*DomainOpt) GetMinDelay

func (m *DomainOpt) GetMinDelay() int32

func (*DomainOpt) GetNetworkIface

func (m *DomainOpt) GetNetworkIface() string

func (*DomainOpt) GetNoFollow

func (m *DomainOpt) GetNoFollow() bool

func (*DomainOpt) GetPrefetch

func (m *DomainOpt) GetPrefetch() bool

func (*DomainOpt) GetRepeat

func (m *DomainOpt) GetRepeat() bool

func (*DomainOpt) GetSeedUrl

func (m *DomainOpt) GetSeedUrl() string

func (*DomainOpt) GetUnsafeNormalizeURL

func (m *DomainOpt) GetUnsafeNormalizeURL() bool

func (*DomainOpt) GetUseragent

func (m *DomainOpt) GetUseragent() string

func (*DomainOpt) ProtoMessage

func (*DomainOpt) ProtoMessage()

func (*DomainOpt) Reset

func (m *DomainOpt) Reset()

func (*DomainOpt) String

func (m *DomainOpt) String() string

type IdeaCrawlerClient

// IdeaCrawlerClient is the client-side API of the IdeaCrawler gRPC service.
// Obtain one with NewIdeaCrawlerClient.
type IdeaCrawlerClient interface {
	//  rpc AddDomain(DomainOpt) returns (Subscription) {}
	//  rpc AddDomains(stream DomainOpt) returns (stream Subscription) {}

	// AddDomainAndListen sends a DomainOpt and returns a stream client whose
	// Recv method yields *PageHTML messages.
	AddDomainAndListen(ctx context.Context, in *DomainOpt, opts ...grpc.CallOption) (IdeaCrawler_AddDomainAndListenClient, error)
	// AddPages returns a stream client used to Send *PageRequest messages;
	// CloseAndRecv on that stream returns the final *Status.
	AddPages(ctx context.Context, opts ...grpc.CallOption) (IdeaCrawler_AddPagesClient, error)
	// CancelJob takes a Subscription and returns a *Status.
	CancelJob(ctx context.Context, in *Subscription, opts ...grpc.CallOption) (*Status, error)
}

func NewIdeaCrawlerClient

func NewIdeaCrawlerClient(cc *grpc.ClientConn) IdeaCrawlerClient

type IdeaCrawlerServer

// IdeaCrawlerServer is the server-side API of the IdeaCrawler gRPC service.
// An implementation is registered with RegisterIdeaCrawlerServer.
type IdeaCrawlerServer interface {
	//  rpc AddDomain(DomainOpt) returns (Subscription) {}
	//  rpc AddDomains(stream DomainOpt) returns (stream Subscription) {}

	// AddDomainAndListen receives a DomainOpt and a stream on which the
	// implementation can Send *PageHTML messages.
	AddDomainAndListen(*DomainOpt, IdeaCrawler_AddDomainAndListenServer) error
	// AddPages receives a stream from which *PageRequest messages are read
	// with Recv and which is finished with SendAndClose(*Status).
	AddPages(IdeaCrawler_AddPagesServer) error
	// CancelJob receives a Subscription and returns a *Status.
	CancelJob(context.Context, *Subscription) (*Status, error)
}

type IdeaCrawler_AddDomainAndListenClient

// IdeaCrawler_AddDomainAndListenClient is the client end of the stream
// returned by IdeaCrawlerClient.AddDomainAndListen.
type IdeaCrawler_AddDomainAndListenClient interface {
	// Recv returns a *PageHTML message from the stream, or an error.
	Recv() (*PageHTML, error)
	grpc.ClientStream
}

type IdeaCrawler_AddDomainAndListenServer

// IdeaCrawler_AddDomainAndListenServer is the server end of the stream
// passed to IdeaCrawlerServer.AddDomainAndListen.
type IdeaCrawler_AddDomainAndListenServer interface {
	// Send writes a *PageHTML message to the stream.
	Send(*PageHTML) error
	grpc.ServerStream
}

type IdeaCrawler_AddPagesClient

// IdeaCrawler_AddPagesClient is the client end of the stream returned by
// IdeaCrawlerClient.AddPages.
type IdeaCrawler_AddPagesClient interface {
	// Send writes a *PageRequest message to the stream.
	Send(*PageRequest) error
	// CloseAndRecv returns the *Status reply for the stream, or an error.
	CloseAndRecv() (*Status, error)
	grpc.ClientStream
}

type IdeaCrawler_AddPagesServer

// IdeaCrawler_AddPagesServer is the server end of the stream passed to
// IdeaCrawlerServer.AddPages.
type IdeaCrawler_AddPagesServer interface {
	// SendAndClose sends the *Status reply for the stream.
	SendAndClose(*Status) error
	// Recv returns a *PageRequest message from the stream, or an error.
	Recv() (*PageRequest, error)
	grpc.ServerStream
}

type KVP

// KVP is a protobuf message holding one string key/value pair. DomainOpt uses
// it for xpath/expected-value pairs and for login form data.
type KVP struct {
	Key   string `protobuf:"bytes,1,opt,name=key" json:"key,omitempty"`
	Value string `protobuf:"bytes,2,opt,name=value" json:"value,omitempty"`
}

func (*KVP) Descriptor

func (*KVP) Descriptor() ([]byte, []int)

func (*KVP) GetKey

func (m *KVP) GetKey() string

func (*KVP) GetValue

func (m *KVP) GetValue() string

func (*KVP) ProtoMessage

func (*KVP) ProtoMessage()

func (*KVP) Reset

func (m *KVP) Reset()

func (*KVP) String

func (m *KVP) String() string

type PageHTML

// PageHTML is the protobuf message delivered on the AddDomainAndListen stream.
// NOTE(review): field meanings below are inferred from names only — confirm
// against the crawler implementation.
type PageHTML struct {
	Success        bool          `protobuf:"varint,1,opt,name=success" json:"success,omitempty"`
	Error          string        `protobuf:"bytes,2,opt,name=error" json:"error,omitempty"`
	Sub            *Subscription `protobuf:"bytes,3,opt,name=sub" json:"sub,omitempty"`
	Url            string        `protobuf:"bytes,4,opt,name=url" json:"url,omitempty"`
	// presumably the HTTP status code of the fetched page — TODO confirm
	Httpstatuscode int32         `protobuf:"varint,5,opt,name=httpstatuscode" json:"httpstatuscode,omitempty"`
	// presumably the raw page body — TODO confirm
	Content        []byte        `protobuf:"bytes,6,opt,name=content,proto3" json:"content,omitempty"`
	// presumably echoes PageRequest.MetaStr — TODO confirm
	MetaStr        string        `protobuf:"bytes,7,opt,name=metaStr" json:"metaStr,omitempty"`
}

func (*PageHTML) Descriptor

func (*PageHTML) Descriptor() ([]byte, []int)

func (*PageHTML) GetContent

func (m *PageHTML) GetContent() []byte

func (*PageHTML) GetError

func (m *PageHTML) GetError() string

func (*PageHTML) GetHttpstatuscode

func (m *PageHTML) GetHttpstatuscode() int32

func (*PageHTML) GetMetaStr

func (m *PageHTML) GetMetaStr() string

func (*PageHTML) GetSub

func (m *PageHTML) GetSub() *Subscription

func (*PageHTML) GetSuccess

func (m *PageHTML) GetSuccess() bool

func (*PageHTML) GetUrl

func (m *PageHTML) GetUrl() string

func (*PageHTML) ProtoMessage

func (*PageHTML) ProtoMessage()

func (*PageHTML) Reset

func (m *PageHTML) Reset()

func (*PageHTML) String

func (m *PageHTML) String() string

type PageReqType

// PageReqType is the request type carried in PageRequest.Reqtype.
type PageReqType int32
// Values of PageReqType; their names are listed in PageReqType_name.
const (
	PageReqType_GET PageReqType = 0
	// Sends a HEAD request first to check that the page is text/html before
	// downloading it. Use when unsure whether the link will send back a large
	// gzip file, etc., which we want to avoid.
	PageReqType_HEAD      PageReqType = 1
	PageReqType_BUILTINJS PageReqType = 2
	PageReqType_JSCRIPT   PageReqType = 3
)

func (PageReqType) EnumDescriptor

func (PageReqType) EnumDescriptor() ([]byte, []int)

func (PageReqType) String

func (x PageReqType) String() string

type PageRequest

// PageRequest is the protobuf message sent on the AddPages stream.
// NOTE(review): field meanings below are inferred from names only — confirm
// against the crawler implementation.
type PageRequest struct {
	Sub        *Subscription `protobuf:"bytes,1,opt,name=sub" json:"sub,omitempty"`
	// one of the PageReqType_* constants
	Reqtype    PageReqType   `protobuf:"varint,2,opt,name=reqtype,enum=protofiles.PageReqType" json:"reqtype,omitempty"`
	Url        string        `protobuf:"bytes,3,opt,name=url" json:"url,omitempty"`
	// presumably javascript to run for the JSCRIPT request type — TODO confirm
	Js         string        `protobuf:"bytes,4,opt,name=js" json:"js,omitempty"`
	NoCallback bool          `protobuf:"varint,5,opt,name=noCallback" json:"noCallback,omitempty"`
	MetaStr    string        `protobuf:"bytes,6,opt,name=metaStr" json:"metaStr,omitempty"`
}

func (*PageRequest) Descriptor

func (*PageRequest) Descriptor() ([]byte, []int)

func (*PageRequest) GetJs

func (m *PageRequest) GetJs() string

func (*PageRequest) GetMetaStr

func (m *PageRequest) GetMetaStr() string

func (*PageRequest) GetNoCallback

func (m *PageRequest) GetNoCallback() bool

func (*PageRequest) GetReqtype

func (m *PageRequest) GetReqtype() PageReqType

func (*PageRequest) GetSub

func (m *PageRequest) GetSub() *Subscription

func (*PageRequest) GetUrl

func (m *PageRequest) GetUrl() string

func (*PageRequest) ProtoMessage

func (*PageRequest) ProtoMessage()

func (*PageRequest) Reset

func (m *PageRequest) Reset()

func (*PageRequest) String

func (m *PageRequest) String() string

type Status

// Status is the protobuf reply message of the CancelJob RPC and the final
// reply of the AddPages stream. It carries a success flag and an error string.
type Status struct {
	Success bool   `protobuf:"varint,1,opt,name=success" json:"success,omitempty"`
	Error   string `protobuf:"bytes,2,opt,name=error" json:"error,omitempty"`
}

func (*Status) Descriptor

func (*Status) Descriptor() ([]byte, []int)

func (*Status) GetError

func (m *Status) GetError() string

func (*Status) GetSuccess

func (m *Status) GetSuccess() bool

func (*Status) ProtoMessage

func (*Status) ProtoMessage()

func (*Status) Reset

func (m *Status) Reset()

func (*Status) String

func (m *Status) String() string

type SubType

// SubType is the subscription type stored in Subscription.Subtype.
type SubType int32

Subscription type

// Values of SubType; their names are listed in SubType_name.
const (
	// crawler will remember the sequence number of each page stored, so we can
	// start back exactly where we left off
	SubType_SEQNUM SubType = 0
	// for when we know only the time at which we left off, or want to start
	// reading from a certain day's run
	SubType_DATETIME SubType = 1
)

func (SubType) EnumDescriptor

func (SubType) EnumDescriptor() ([]byte, []int)

func (SubType) String

func (x SubType) String() string

type Subscription

// Subscription is the protobuf message identifying a crawl job subscription.
// It is the argument of CancelJob and is embedded in PageRequest and PageHTML.
type Subscription struct {
	Subcode    string                     `protobuf:"bytes,1,opt,name=subcode" json:"subcode,omitempty"`
	Domainname string                     `protobuf:"bytes,2,opt,name=domainname" json:"domainname,omitempty"`
	// one of SubType_SEQNUM or SubType_DATETIME
	Subtype    SubType                    `protobuf:"varint,3,opt,name=subtype,enum=protofiles.SubType" json:"subtype,omitempty"`
	// presumably the resume position when Subtype is SubType_SEQNUM — TODO confirm
	Seqnum     int32                      `protobuf:"varint,4,opt,name=seqnum" json:"seqnum,omitempty"`
	// presumably the resume time when Subtype is SubType_DATETIME — TODO confirm
	Datetime   *google_protobuf.Timestamp `protobuf:"bytes,5,opt,name=datetime" json:"datetime,omitempty"`
}

func (*Subscription) Descriptor

func (*Subscription) Descriptor() ([]byte, []int)

func (*Subscription) GetDatetime

func (m *Subscription) GetDatetime() *google_protobuf.Timestamp

func (*Subscription) GetDomainname

func (m *Subscription) GetDomainname() string

func (*Subscription) GetSeqnum

func (m *Subscription) GetSeqnum() int32

func (*Subscription) GetSubcode

func (m *Subscription) GetSubcode() string

func (*Subscription) GetSubtype

func (m *Subscription) GetSubtype() SubType

func (*Subscription) ProtoMessage

func (*Subscription) ProtoMessage()

func (*Subscription) Reset

func (m *Subscription) Reset()

func (*Subscription) String

func (m *Subscription) String() string

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL