config

package
v0.62.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 30, 2024 License: Apache-2.0 Imports: 13 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
var (
	// Source returns an Option that stores v as the target source
	// in Config.Source.
	Source = func(v string) Option {
		return func(cfg *Config) { cfg.Source = v }
	}

	// Language returns an Option that stores v in Config.Lang.
	Language = func(v string) Option {
		return func(cfg *Config) { cfg.Lang = v }
	}

	// Query returns an Option that stores v, a CSS query for the target,
	// in Config.Query.
	Query = func(v string) Option {
		return func(cfg *Config) { cfg.Query = v }
	}

	// Timeout returns an Option that stores d, the overall deadline for the
	// operation, in Config.Timeout.
	Timeout = func(d time.Duration) Option {
		return func(cfg *Config) { cfg.Timeout = d }
	}

	// WaitFor returns an Option that sets Config.WaitFor to the given
	// CSS query.
	WaitFor = func(query string) Option {
		return func(cfg *Config) { cfg.WaitFor = query }
	}

	// WaitUntil returns an Option that stores d, the page load duration to
	// wait for, in Config.WaitUntil.
	WaitUntil = func(d time.Duration) Option {
		return func(cfg *Config) { cfg.WaitUntil = d }
	}

	// Screenshot returns an Option that sets Config.Screenshot. When enabled,
	// a screenshot is captured and the Reader will have ImgBytes populated
	// with the image.
	Screenshot = func(v bool) Option {
		return func(cfg *Config) { cfg.Screenshot = v }
	}

	// UserAgent returns an Option that stores ua in Config.UserAgent, the
	// custom User-Agent used for headless HTTP calls.
	UserAgent = func(ua string) Option {
		return func(cfg *Config) { cfg.UserAgent = ua }
	}

	// Content returns an Option that stores v as the content of the target
	// in Config.Content.
	Content = func(v string) Option {
		return func(cfg *Config) { cfg.Content = v }
	}

	// TargetType returns an Option that stores v as the content type of the
	// target in Config's embedded ContentType field.
	TargetType = func(v ContentType) Option {
		return func(cfg *Config) { cfg.ContentType = v }
	}

	// Limit returns an Option that stores v, the limit of tags for the
	// target, in Config.Limit.
	Limit = func(v int) Option {
		return func(cfg *Config) { cfg.Limit = v }
	}

	// Verbose returns an Option that sets Config.Verbose, which enables
	// high verbosity.
	Verbose = func(v bool) Option {
		return func(cfg *Config) { cfg.Verbose = v }
	}

	// NoStopWords returns an Option that sets Config.NoStopWords, which
	// enables exclusion of stop-words from the output.
	NoStopWords = func(v bool) Option {
		return func(cfg *Config) { cfg.NoStopWords = v }
	}

	// StopWords returns an Option that installs a custom set of stop-words:
	// v is wrapped with stopwords.WordsSlice, passed to stopwords.Setup, and
	// the result is stored in Config.StopWords.
	StopWords = func(v []string) Option {
		return func(cfg *Config) {
			words := stopwords.WordsSlice(v)
			cfg.StopWords = stopwords.Setup(words)
		}
	}

	// ContentOnly returns an Option that sets Config.ContentOnly, which
	// ignores all non-content-related parts of the HTML page.
	ContentOnly = func(v bool) Option {
		return func(cfg *Config) { cfg.ContentOnly = v }
	}

	// FullSite returns an Option that sets Config.FullSite, which tells the
	// parser to process the full site (HTML only).
	FullSite = func(v bool) Option {
		return func(cfg *Config) { cfg.FullSite = v }
	}

	// TagWeightsString returns an Option that parses v as String-type tag
	// weights input (<tagName1>:<tagScore1>|<tagName2>:<tagScore2>) and
	// stores the result in Config's embedded TagWeights field.
	TagWeightsString = func(v string) Option {
		return func(cfg *Config) {
			src := strings.NewReader(v)
			cfg.TagWeights = ParseTagWeights(src, String)
		}
	}

	// TagWeightsJSON returns an Option that opens the file at path v and
	// stores the tag weights parsed from it as JSON-type input in Config's
	// embedded TagWeights field. If the file cannot be opened, the error is
	// written to standard error and the configuration is left unchanged.
	TagWeightsJSON = func(v string) Option {
		return func(c *Config) {
			f, err := os.Open(v)
			if err != nil {
				// The builtin println does not format error values, so
				// write the message to stderr explicitly.
				fmt.Fprintf(os.Stderr, "error: can't open JSON file [%s]: %v\n", v, err)
				return
			}
			// Deferred so the file is released even if parsing panics.
			defer f.Close()
			c.TagWeights = ParseTagWeights(bufio.NewReader(f), JSON)
		}
	}

	// ExtraTagWeightsString returns an Option that parses v as String-type
	// tag weights input (<tagName1>:<tagScore1>|<tagName2>:<tagScore2>) and
	// stores the result in Config.ExtraTagWeights.
	ExtraTagWeightsString = func(v string) Option {
		return func(cfg *Config) {
			src := strings.NewReader(v)
			cfg.ExtraTagWeights = ParseTagWeights(src, String)
		}
	}

	// ExtraTagWeightsJSON returns an Option that opens the file at path v
	// and stores the tag weights parsed from it as JSON-type input in
	// Config.ExtraTagWeights. If the file cannot be opened, the error is
	// written to standard error and the configuration is left unchanged.
	ExtraTagWeightsJSON = func(v string) Option {
		return func(c *Config) {
			f, err := os.Open(v)
			if err != nil {
				// The builtin println does not format error values, so
				// write the message to stderr explicitly.
				fmt.Fprintf(os.Stderr, "error: can't open JSON file [%s]: %v\n", v, err)
				return
			}
			// Deferred so the file is released even if parsing panics.
			defer f.Close()
			c.ExtraTagWeights = ParseTagWeights(bufio.NewReader(f), JSON)
		}
	}

	// ExcludeTagsString returns an Option that parses v with ParseTagWeights
	// as String-type input and stores the result in Config.ExcludeTags.
	ExcludeTagsString = func(v string) Option {
		return func(cfg *Config) {
			src := strings.NewReader(v)
			cfg.ExcludeTags = ParseTagWeights(src, String)
		}
	}

	// AllTagWeights returns an Option that sets Config.AllTagWeights to v.
	AllTagWeights = func(v bool) Option {
		return func(cfg *Config) { cfg.AllTagWeights = v }
	}

	// AdjustScores returns an Option that sets Config.AdjustScores to v.
	AdjustScores = func(v bool) Option {
		return func(cfg *Config) { cfg.AdjustScores = v }
	}

	// Extensions returns an Option that stores a copy of v in
	// Config.Extensions, so later changes to the caller's slice do not
	// affect the configuration.
	Extensions = func(v []extension.Extension) Option {
		return func(cfg *Config) {
			dup := make([]extension.Extension, len(v))
			copy(dup, v)
			cfg.Extensions = dup
		}
	}
)
View Source
// ContentTypes lists the content type names in the same order as the
// ContentType constants: Unknown, Text, HTML, Markdown.
var ContentTypes = [...]string{
	0: "Unknown",
	1: "Text",
	2: "HTML",
	3: "Markdown",
}

Functions

func BytesToStrings added in v0.60.1

func BytesToStrings(txts [][]byte) []string

func DetectLang added in v0.59.0

func DetectLang(cfg *Config, controlStr string)

DetectLang detects the language and sets up the stop words for it.

func SetLang added in v0.61.0

func SetLang(cfg *Config, lang string)

SetLang - updates language in configuration & sets corresponding stop-words.

Types

type Config

// Config holds the settings that the package's Option functions populate.
type Config struct {
	// Source is the target source.
	Source string
	// Lang is the language value set by the Language option.
	Lang   string
	// ContentType (embedded) is the content type of the target.
	ContentType
	// Content is the content of the target.
	Content string

	// Timeout is the overall deadline for the operation.
	Timeout time.Duration

	// headless
	// Query is the CSS query for the target; WaitFor is a CSS query and
	// WaitUntil a page load duration to wait for. Screenshot enables
	// screenshot capture and UserAgent overrides the User-Agent of the
	// headless HTTP calls.
	Query      string
	WaitFor    string
	WaitUntil  time.Duration
	Screenshot bool
	UserAgent  string

	// misc
	// Limit is the limit of tags for the target. NoStopWords enables
	// stop-words exclusion and StopWords holds the stop-word register.
	// ContentOnly ignores non-content parts of an HTML page and FullSite
	// requests full-site processing (HTML only).
	// NOTE(review): SkipLang has no option in this listing; presumably it
	// skips language detection — confirm.
	Limit       int
	Verbose     bool
	NoStopWords bool
	SkipLang    bool
	StopWords   *stopwords.Register
	ContentOnly bool
	FullSite    bool

	// weighing
	// TagWeights (embedded), ExtraTagWeights and ExcludeTags are all
	// populated through ParseTagWeights by the corresponding options.
	AllTagWeights bool
	TagWeights
	ExtraTagWeights TagWeights
	ExcludeTags     TagWeights
	AdjustScores    bool

	// Extensions is set by the Extensions option, which stores a copy of
	// the slice it is given.
	Extensions []extension.Extension
	// contains filtered or unexported fields
}

Config ...

func New

func New(options ...Option) *Config

New ...

func (*Config) Segment added in v0.60.1

func (c *Config) Segment(text []byte) [][]byte

Segment ...

func (*Config) SetStopWords

func (c *Config) SetStopWords(lang string)

SetStopWords ...

type ContentType

// ContentType enumerates the supported content kinds
// (Unknown, Text, HTML, Markdown).
type ContentType byte

ContentType ...

// Content types. Unknown is the zero value; the others follow in
// declaration order.
const (
	Unknown ContentType = iota
	Text
	HTML
	Markdown
)

Content types

func ContentTypeOf

func ContentTypeOf(contentType string) ContentType

ContentTypeOf returns ContentType based on string value.

func (ContentType) String

func (ct ContentType) String() string

String ...

type DefaultSegmenter added in v0.60.1

// DefaultSegmenter is constructed with NewDefaultSegmenter and provides a
// Segment method with the same signature as the Segmenter interface.
type DefaultSegmenter struct {
	// contains filtered or unexported fields
}

func NewDefaultSegmenter added in v0.60.1

func NewDefaultSegmenter(c *Config) *DefaultSegmenter

func (*DefaultSegmenter) Segment added in v0.60.1

func (s *DefaultSegmenter) Segment(text []byte) [][]byte

type Option

// Option is a function that modifies a *Config. New accepts a variadic
// list of Options.
type Option func(*Config)

Option allows to customise configuration.

type Segmenter added in v0.60.1

// Segmenter is the interface that wraps the Segment method.
type Segmenter interface {
	// Segment takes text and returns a slice of byte slices.
	// NOTE(review): presumably the pieces are segments of text — confirm
	// against the implementations.
	Segment(text []byte) [][]byte
}

type TagWeights

// TagWeights maps a tag name to its score.
type TagWeights map[string]float64

TagWeights ...

func ParseTagWeights added in v0.50.0

func ParseTagWeights(reader io.Reader, readerType TagWeightsType) TagWeights

type TagWeightsType added in v0.50.0

// TagWeightsType selects the input format passed to ParseTagWeights
// (String or JSON).
type TagWeightsType byte

TagWeightsType ...

// Weight input types accepted by ParseTagWeights.
const (
	String TagWeightsType = iota // <tagName1>:<tagScore1>|<tagName2>:<tagScore2>
	JSON                         // { "<tagName1>": <tagScore1>, "<tagName2>": <tagScore2> }
)

Weight input types

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL