totext

package module
v0.12.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 27, 2024 License: MIT Imports: 19 Imported by: 0

README

totext

A Go wrapper library to convert different types of documents to plain text.

Dependencies

To convert MS Word doc files, install wv

For Ubuntu/Debian:

sudo apt install wv

For macOS:

brew install wv
To convert PDF files, install poppler

For Ubuntu/Debian:

sudo apt install poppler-utils

For macOS:

brew install poppler
To convert RTF files, install unrtf

For Ubuntu/Debian:

sudo apt install unrtf

For macOS:

brew install unrtf
To convert HTML files, prettier is required
npm init
npm install --save-dev --save-exact prettier
To fetch a remote web page and extract its text

When the application fetches a remote page for the first time, it automatically downloads the latest version of the Chromium browser.

prettier is required

npm init
npm install --save-dev --save-exact prettier

Building command line tool

go mod tidy
chmod +x compile.sh
./compile.sh

Documentation

Overview

Package totext uses https://github.com/sajari/docconv to extract text from different file types.

Index

Constants

This section is empty.

Variables

View Source
var (
	AppVersion string
	BuildDate  string
	CommitHash string
	Author     string
)

Build information

Functions

func CaptureHTML added in v0.3.0

func CaptureHTML(browser *rod.Browser, inputURL string) (content string, err error)

CaptureHTML fetches the HTML page at the URL given and returns the complete HTML content

func CleanUpHTML added in v0.3.0

func CleanUpHTML(content string) string

CleanUpHTML cleans up the HTML content and extracts the text content

func ConvertDocToText

func ConvertDocToText(filepath string) (content string, metadata map[string]string, err error)

ConvertDocToText receives an MS Word doc filepath as an argument and returns its text content and metadata

Dependencies:

Debian/Ubuntu: sudo apt install wv

macOS: brew install wv

func ConvertDocxToText

func ConvertDocxToText(filepath string) (content string, metadata map[string]string, err error)

ConvertDocxToText receives an MS Word docx filepath as an argument and returns its text content and metadata

func ConvertHTMLToText added in v0.3.0

func ConvertHTMLToText(filepath string, skipPrettifyError bool) (content string, metadata map[string]string, err error)

ConvertHTMLToText receives HTML filepath as an argument and returns its text content and metadata

func ConvertOdtToText

func ConvertOdtToText(filepath string) (content string, metadata map[string]string, err error)

ConvertOdtToText receives odt filepath as an argument and returns its text content and metadata

func ConvertPDFToText

func ConvertPDFToText(filepath string) (content string, metadata map[string]string, err error)

ConvertPDFToText receives pdf filepath as an argument and returns its text content and metadata

Dependencies:

Debian/Ubuntu: sudo apt install poppler-utils

macOS: brew install poppler

func ConvertPagesToText

func ConvertPagesToText(filepath string) (content string, metadata map[string]string, err error)

ConvertPagesToText receives pages filepath as an argument and returns its text content and metadata

func ConvertRTFToText

func ConvertRTFToText(filepath string) (content string, metadata map[string]string, err error)

ConvertRTFToText receives rtf filepath as an argument and returns its text content and metadata

Dependencies:

Debian/Ubuntu: sudo apt install unrtf

macOS: brew install unrtf

func ConvertURLToText added in v0.3.0

func ConvertURLToText(browser *rod.Browser, inputURL string, skipPrettifyError bool) (htmlFilename, content string, metadata map[string]string, err error)

ConvertURLToText fetches the HTML page at the URL given and returns its text content and metadata

func CreateHTMLFilename added in v0.3.0

func CreateHTMLFilename(u *url.URL) string

CreateHTMLFilename generates a filename for the HTML file from the URL

func DeleteFile

func DeleteFile(filepath string) error

DeleteFile deletes a file

func FilterNonReadableCharacter added in v0.9.0

func FilterNonReadableCharacter(input string) string

FilterNonReadableCharacter filters out non-readable characters

func GetFilename

func GetFilename(filepath string) string

GetFilename returns the filename of a file

func IsAbsPath

func IsAbsPath(filepath string) bool

IsAbsPath checks if the filepath is an absolute path

func IsContentTypeHTML added in v0.6.0

func IsContentTypeHTML(contentType string) bool

IsContentTypeHTML checks if the content type is HTML

func IsHostnameValid added in v0.3.0

func IsHostnameValid(hostname string) bool

IsHostnameValid validates the hostname

func IsMIMETypeMatched added in v0.2.0

func IsMIMETypeMatched(fileExt FileExtension, mime MIME) bool

IsMIMETypeMatched checks whether a MIME type (for example, one taken from a *multipart.FileHeader) matches the given file extension

func ParseURLAndValidate added in v0.3.0

func ParseURLAndValidate(inputURL string) (*url.URL, error)

ParseURLAndValidate parses the URL and validates the scheme, hostname and content type

func PrettifyHTML added in v0.3.0

func PrettifyHTML(filepath string) (err error)

PrettifyHTML prettifies the HTML content using the prettier library

Dependencies:

npm init

npm install --save-dev --save-exact prettier

func ReadText

func ReadText(filepath string) (string, error)

ReadText reads text content from a text file

func SetCwd

func SetCwd(filepath string) error

SetCwd sets current working directory

func Version

func Version(appName string) *cobra.Command

Version returns the version number, build date and commit hash

func WriteText

func WriteText(filepath string, content string) error

WriteText writes text content to a file

Types

type FileExtension

type FileExtension string

FileExtension is the file extension type

const (
	DOC   FileExtension = "doc"
	DOCX  FileExtension = "docx"
	HTML  FileExtension = "html"
	JSON  FileExtension = "json"
	MD    FileExtension = "md"
	ODT   FileExtension = "odt"
	PAGES FileExtension = "pages"
	PDF   FileExtension = "pdf"
	RTF   FileExtension = "rtf"
	TXT   FileExtension = "txt"
)

File types

func GetFileExtension

func GetFileExtension(filepath string) FileExtension

GetFileExtension returns the file extension of a file

type MIME added in v0.2.0

type MIME string

MIME types

const (
	MimeDOC   MIME = "application/msword"
	MimeDOCX  MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
	MimeHTML  MIME = "text/html"
	MimeJSON  MIME = "application/json"
	MimeMD    MIME = "text/markdown"
	MimeODT   MIME = "application/vnd.oasis.opendocument.text"
	MimePAGES MIME = "application/vnd.apple.pages"
	MimePDF   MIME = "application/pdf"
	MimeRTF   MIME = "application/rtf"
	MimeTXT   MIME = "text/plain"
)

MIME types

Directories

Path Synopsis
cli

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL