readability

package module
v0.1.1
Published: Dec 12, 2022 License: MIT Imports: 17 Imported by: 0

README

Go-Readability


Go-Readability is a Go package that cleans an HTML page of clutter such as buttons, ads, and background images, and adjusts the page's text size, contrast, and layout for better readability.

This package is a fork of readability and go-readability, which were in turn inspired by readability for Node.js and readability for Python.

Why fork?

There are several reasons why I created a new fork instead of sending a PR to the original repository. I needed to:

  • Extract images
  • Keep selected HTML tags (such as <img>) in the readable text
  • Use a custom line break

Example

package main

import (
	"fmt"
	nurl "net/url"
	"time"

	"github.com/importcjj/go-readability"
)

func main() {
	// Create URL
	url := "https://www.nytimes.com/2018/01/21/technology/inside-amazon-go-a-store-of-the-future.html"
	parsedURL, err := nurl.Parse(url)
	if err != nil {
		panic(err)
	}

	extractor := &readability.Extractor{
		TextLineBreak:  "<br/><br/>",
		TextWithImgTag: true,
	}

	// Fetch readable content
	article, err := extractor.FromURL(parsedURL, 5*time.Second)
	if err != nil {
		panic(err)
	}

	// Show results
	fmt.Println(article.Meta.Title)
	fmt.Println(article.Meta.Excerpt)
	fmt.Println(article.Meta.Author)
	// readable content
	fmt.Println(article.Text)
	// Tidy HTML
	fmt.Println(article.HTML)
	// Images
	fmt.Println(article.Images)
}

Documentation

Index

Constants

This section is empty.

Variables

var DefaultExtrator = &Extractor{
	TextLineBreak: fmt.Sprintln(),
}

DefaultExtrator ...

var LINEREAK = fmt.Sprintln()

Functions

func CleanDoc

func CleanDoc(doc *goquery.Document)

func GetHTMLContent

func GetHTMLContent(articleContent *goquery.Selection) string

GetHTMLContent fetches and cleans the raw HTML from the article content

func GetTextContent

func GetTextContent(articleContent *goquery.Selection, customRender *TextRenderers) string

GetTextContent fetches and cleans the text from the article content
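
These lower-level helpers operate on goquery documents directly. A minimal sketch of combining them, assuming goquery here is github.com/PuerkitoBio/goquery (the usual implementation) and that a renderer from NewTextRenderers is an acceptable argument to GetTextContent:

package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/importcjj/go-readability"
)

func main() {
	raw := `<html><body><article><h1>Hello</h1><p>Some readable text.</p></article></body></html>`

	// Parse the raw HTML into a goquery document.
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(raw))
	if err != nil {
		panic(err)
	}

	// Clean the document in place before extracting content.
	readability.CleanDoc(doc)

	// Render the article node as tidy HTML and as plain text.
	content := doc.Find("article")
	fmt.Println(readability.GetHTMLContent(content))
	fmt.Println(readability.GetTextContent(content, readability.NewTextRenderers("\n")))
}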

Types

type Article

type Article struct {
	URL    string   `json:"url"`
	Meta   Metadata `json:"meta"`
	Text   string   `json:"text"`
	HTML   string   `json:"html"`
	Images []string `json:"images"`
}

Article is the content of a URL
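
Because of the struct tags, an Article can be serialized straight to JSON. A small sketch with hypothetical field values:

package main

import (
	"encoding/json"
	"fmt"

	"github.com/importcjj/go-readability"
)

func main() {
	// Hypothetical values, just to show the JSON shape produced by the struct tags.
	article := readability.Article{
		URL:  "https://example.com/post",
		Meta: readability.Metadata{Title: "Example", Author: "Jane Doe"},
		Text: "Hello world.",
	}

	out, err := json.MarshalIndent(article, "", "  ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out))
}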

func FromReader

func FromReader(reader io.Reader, url *nurl.URL) (Article, error)

FromReader gets readable content from the specified io.Reader

func FromURL

func FromURL(url *nurl.URL, timeout time.Duration) (Article, error)

FromURL gets readable content from the specified URL
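
A minimal sketch of the package-level helpers; it uses FromReader so it runs without a network call, and it assumes these helpers apply the package defaults (DefaultExtrator):

package main

import (
	"fmt"
	nurl "net/url"
	"strings"

	"github.com/importcjj/go-readability"
)

func main() {
	raw := `<html><head><title>Hello</title></head><body><article><p>Some readable text.</p></article></body></html>`

	// The URL accompanies the content (presumably for metadata and relative links).
	u, err := nurl.Parse("https://example.com/post")
	if err != nil {
		panic(err)
	}

	// FromURL works the same way but fetches the page over HTTP,
	// taking a time.Duration timeout instead of a reader.
	article, err := readability.FromReader(strings.NewReader(raw), u)
	if err != nil {
		panic(err)
	}

	fmt.Println(article.Meta.Title)
	fmt.Println(article.Text)
}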

type Extractor

type Extractor struct {
	TextLineBreak       string
	CustomTextRenderers *TextRenderers
}

Extractor ...

func (*Extractor) FromReader

func (extractor *Extractor) FromReader(reader io.Reader, url *nurl.URL) (Article, error)

FromReader gets readable content from the specified io.Reader

func (*Extractor) FromReaderWithSelector

func (extractor *Extractor) FromReaderWithSelector(reader io.Reader, selector string, url *nurl.URL) (Article, error)
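
This method has no doc comment; a plausible reading is that selector restricts extraction to the matching element (a goquery/CSS selector such as "article"). A hedged sketch under that assumption:

package main

import (
	"fmt"
	nurl "net/url"
	"strings"

	"github.com/importcjj/go-readability"
)

func main() {
	raw := `<html><body><div id="sidebar">Ads</div><article class="post"><p>The story.</p></article></body></html>`

	u, err := nurl.Parse("https://example.com/story")
	if err != nil {
		panic(err)
	}

	extractor := &readability.Extractor{TextLineBreak: "\n"}

	// Assumption: the selector limits extraction to the matching element,
	// here the article with class "post".
	article, err := extractor.FromReaderWithSelector(strings.NewReader(raw), "article.post", u)
	if err != nil {
		panic(err)
	}

	fmt.Println(article.Text)
}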

func (*Extractor) FromURL

func (extractor *Extractor) FromURL(url *nurl.URL, timeout time.Duration) (Article, error)

FromURL gets readable content from the specified URL

type Metadata

type Metadata struct {
	Title       string `json:"title"`
	Image       string `json:"image"`
	Excerpt     string `json:"excerpt"`
	Author      string `json:"author"`
	MinReadTime int    `json:"min_read_time"`
	MaxReadTime int    `json:"max_read_time"`
}

Metadata is the metadata of an article

type RenderFunc

type RenderFunc func(node *html.Node, buf *bytes.Buffer)

type TextRenderers

type TextRenderers struct {
	LineBreak string
	// contains filtered or unexported fields
}

func NewNoobTextRenderers

func NewNoobTextRenderers(lineBreak string) *TextRenderers

func NewTextRenderers

func NewTextRenderers(lineBreak string) *TextRenderers

func (*TextRenderers) Register

func (r *TextRenderers) Register(tag string, before, after RenderFunc) error

func (*TextRenderers) WriteLineBreak

func (r *TextRenderers) WriteLineBreak(buf *bytes.Buffer)
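
A hedged sketch of wiring custom renderers into an Extractor. Register takes a tag name and before/after RenderFunc hooks, which presumably run around that tag's rendered content (an assumption), and *html.Node is taken to be golang.org/x/net/html:

package main

import (
	"bytes"
	"fmt"
	nurl "net/url"
	"strings"

	"golang.org/x/net/html"

	"github.com/importcjj/go-readability"
)

func main() {
	// Build a renderer set with a custom line break.
	renderers := readability.NewTextRenderers("\n")

	// Wrap <b> content in asterisks. Assumption: the before/after hooks run
	// around the tag's rendered content.
	star := func(node *html.Node, buf *bytes.Buffer) {
		buf.WriteString("*")
	}
	if err := renderers.Register("b", star, star); err != nil {
		panic(err)
	}

	extractor := &readability.Extractor{
		TextLineBreak:       "\n",
		CustomTextRenderers: renderers,
	}

	raw := `<html><body><article><p>This is <b>important</b> text.</p></article></body></html>`
	u, err := nurl.Parse("https://example.com/custom")
	if err != nil {
		panic(err)
	}

	article, err := extractor.FromReader(strings.NewReader(raw), u)
	if err != nil {
		panic(err)
	}
	fmt.Println(article.Text)
}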
