doclib

package
v0.0.0-...-a743e43 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 3, 2019 License: MIT Imports: 30 Imported by: 0

Documentation

Index

Constants

View Source
const (
	Inch2MM    = 25.4
	Inch2Point = 72.0
	MM2Point   = Inch2Point / Inch2MM
	Point2MM   = Inch2MM / Inch2Point
)
View Source
const BorderWidth = 3.0 // !@#$ For testing.
View Source
const ShadowWidth = BorderWidth + 0.5 // !@#$ For testing.

Variables

View Source
var (
	Debug bool
	Trace bool
	// ExposeErrors can be set to true to not recover from errors in library functions.
	ExposeErrors bool
)
View Source
var ErrNoMatch = errors.New("no match for hit")
View Source
var ErrRange = errors.New("out of range")
View Source
var FileHashSize = 10

Functions

func ChangePathDir

func ChangePathDir(inDir, inPath, outDir string) (string, error)

ChangePathDir returns `inPath` with its ancestor directory `inDir` replaced with `outDir`.

func ChangePathDirExt

func ChangePathDirExt(inDir, inPath, outDir, outExt string) (string, error)

ChangePathDirExt returns `inPath` with its ancestor directory `inDir` replaced with `outDir` and its extension replaced with `outExt`.

func ChangePathExt

func ChangePathExt(inPath, outExt string) string

ChangePathExt returns `inPath` with its extension replaced with `outExt`.

func CleanCorpus

func CleanCorpus(corpus []string) []string

CleanCorpus returns `corpus` with known bad files removed.

func CreateBleveIndex

func CreateBleveIndex(indexPath string, forceCreate, allowAppend bool) (bleve.Index, error)

CreateBleveIndex creates a new persistent Bleve index at `indexPath`. If `forceCreate` is true then an existing index will be deleted. If `allowAppend` is true then an existing index will be appended to. TODO: Remove `allowAppend` argument. Instead always append to an existing index if `forceCreate` is false.

func CreateBleveMemIndex

func CreateBleveMemIndex() (bleve.Index, error)

CreateBleveMemIndex creates a new in-memory (unpersisted) Bleve index.

func Describe

func Describe(pdfReader *pdf.PdfReader) (numPages int, width, height float64, err error)

Describe returns numPages, width, height for the PDF in `pdfReader`. Width and height are in mm.

func DocPageSize

func DocPageSize(pageSizes [][2]float64) (w, h float64)

DocPageSize returns the width and height of a document whose page sizes are `pageSizes`. This is a single source of truth for our definition of document page size. Currently the document width is defined as the longest page width in the document.

func Exists

func Exists(filename string) bool

Exists returns true if `filename` exists.

func ExpandUser

func ExpandUser(filename string) string

ExpandUser returns `filename` with ~ replaced with user's home directory.

func ExportBleveMem

func ExportBleveMem(index bleve.Index) ([]byte, error)

func ExtractPageText

func ExtractPageText(page *pdf.PdfPage) (string, error)

ExtractPageText returns the text on page `page`.

func ExtractPageTextLocation

func ExtractPageTextLocation(page *pdf.PdfPage) (string, []extractor.TextLocation, error)

ExtractPageTextLocation returns the locations of text on page `page`.

func ExtractPageTextObject

func ExtractPageTextObject(page *pdf.PdfPage) (*extractor.PageText, error)

ExtractPageTextObject returns the PageText on page `page`. PageText is an opaque UniDoc struct that describes the text marks on a PDF page. extractDocPages uses UniDoc to extract the text from all pages in PDF file `inPath` as a slice of PdfPage.

func FileHash

func FileHash(filename string) (string, error)

FileHash returns a hex encoded string of the SHA-256 digest of the contents of file `filename`.

func FileSize

func FileSize(filename string) (int64, error)

FileSize returns the size of file `filename` in bytes.

func GetPosition

func GetPosition(positions []serial.TextLocation, start, end uint32) serial.TextLocation

func ImportBleveMem

func ImportBleveMem(data []byte) (bleve.Index, error)

func IntRange

func IntRange(i0, i1 int) []int

IntRange returns slice [i0, i1).

func IntSetIntersection

func IntSetIntersection(a, b map[int]bool) map[int]bool

IntSetIntersection returns `a` ∩ `b`.

func IntSetToSlice

func IntSetToSlice(set map[int]bool) []int

IntSetToSlice returns keys of `set` as a slice.

func IntSetUnion

func IntSetUnion(a, b map[int]bool) map[int]bool

IntSetUnion returns `a` ∪ `b`.

func IntSliceDifference

func IntSliceDifference(a, b []int) []int

IntSliceDifference returns the elements in `a` that aren't in `b`.

func IntSliceIntersection

func IntSliceIntersection(a, b []int) []int

IntSliceIntersection returns `a` ∩ `b`.

func IntSliceSymmetricDifference

func IntSliceSymmetricDifference(a, b []int) []int

IntSliceSymmetricDifference returns the elements in `a` that aren't in `b` plus the elements in `b` that aren't in `a`.

func IntSliceToSet

func IntSliceToSet(arr []int) map[int]bool

IntSliceToSet returns a map whose keys are the elements of `arr`.

func IntSliceUnion

func IntSliceUnion(a, b []int) []int

IntSliceUnion returns `a` ∪ `b`.

func MMToPoint

func MMToPoint(x float64) float64

func MakeUsage

func MakeUsage(msg string)

MakeUsage updates flag.Usage to include usage message `msg`.

func MinMaxIntSlice

func MinMaxIntSlice(arr []int) (min, max int, valid bool)

MinMaxIntSlice returns min and max of `arr`. `valid` is true if `arr` contains values.

func MkDir

func MkDir(dir string) error

MkDir creates a directory called `dir` if it doesn't already exist.

func MkParentDir

func MkParentDir(filename string) error

MkParentDir creates the parent directory for `filename` if it doesn't already exist.

func PageSizeMm

func PageSizeMm(page *pdf.PdfPage) (width, height float64, err error)

PageSizeMm returns the width and height of `page` in mm.

func PageSizePt

func PageSizePt(page *pdf.PdfPage) (width, height float64, err error)

PageSizePt returns the width and height of `page` in points.

func PatternsToPaths

func PatternsToPaths(patternList []string, sortSize bool) ([]string, error)

PatternsToPaths returns a list of files matching the patterns in `patternList`.

func PdfOpenDescribe

func PdfOpenDescribe(inPath string) (numPages int, width, height float64, err error)

PdfOpenDescribe returns numPages, width, height for PDF file `inPath`. Width and height are in mm.

func PdfOpenFile

func PdfOpenFile(inPath string, lazy bool) (*pdf.PdfReader, error)

PdfOpenFile opens PDF file `inPath` and attempts to handle null encryption schemes.

func PdfOpenReader

func PdfOpenReader(f io.ReadSeeker, lazy bool) (*pdf.PdfReader, error)

func PointToMM

func PointToMM(x float64) float64

func ProcessPDFPagesFile

func ProcessPDFPagesFile(inPath string, processPage func(pageNum uint32, page *pdf.PdfPage) error) error

ProcessPDFPagesFile runs `processPage` on every page in PDF file `inPath`. It recovers from errors in the libraries it calls unless ExposeErrors is true.

func ProcessPDFPagesReader

func ProcessPDFPagesReader(inPath string, rs io.ReadSeeker,
	processPage func(pageNum uint32, page *pdf.PdfPage) error) error

func ReaderSizeHash

func ReaderSizeHash(rs io.ReadSeeker) (int64, string, error)

func RegularFile

func RegularFile(filename string) (bool, error)

RegularFile returns true if file `filename` is a regular file.

func RemoveDirectory

func RemoveDirectory(dir string) error

RemoveDirectory recursively removes directory `dir` and its contents from disk.

func Reverse

func Reverse(arr []string) []string

Reverse returns `arr` in reverse order.

func SetLogging

func SetLogging()

func SortFileSize

func SortFileSize(pathList []string, minSize, maxSize int64) ([]string, error)

SortFileSize returns the paths of the files in `pathList` sorted by ascending size. If `minSize` >= 0 then only files of this size or larger are returned. If `maxSize` >= 0 then only files of this size or smaller are returned.

func StringUniques

func StringUniques(arr []string) []string

StringUniques returns the unique strings in `arr`.

func TestRoundtripMem

func TestRoundtripMem(index bleve.Index) bleve.Index

func ToSerialTextLocation

func ToSerialTextLocation(loc extractor.TextLocation) serial.TextLocation

ToSerialTextLocation converts extractor.TextLocation `loc` to a more compact serial.TextLocation.

func WriteJsonSlice

func WriteJsonSlice(filename string, vals []string) error

WriteJsonSlice writes slice `vals` to JSON file `filename`, one line per string. NOTE: We write this JSON file in a human-readable way because we will be using it in development.

Types

type DocPageText

type DocPageText struct {
	DocIdx  uint64 // Doc index (0-offset) into PositionsState.fileList .
	PageIdx uint32 // Page index (0-offset) into DocPositions.index .
	PageNum uint32 // Page number in PDF file (1-offset)
	Text    string // Extracted page text.
}

DocPageText contains doc:page indexes, the PDF page number and the text extracted from a PDF page.

type DocPositions

type DocPositions struct {
	// contains filtered or unexported fields
}

DocPositions tracks the data that is used to index a PDF file.

func (*DocPositions) AddDocPage

func (lDoc *DocPositions) AddDocPage(pageNum uint32, dpl serial.DocPageLocations, text string) (uint32, error)

AddDocPage adds a page (with page number `pageNum` and contents `dpl`) to `lDoc`. !@#$ Remove `text` param.

func (*DocPositions) Close

func (lDoc *DocPositions) Close() error

func (*DocPositions) GetTextPath

func (lDoc *DocPositions) GetTextPath(pageIdx uint32) string

func (DocPositions) Len

func (d DocPositions) Len() int

func (*DocPositions) ReadPagePositions

func (lDoc *DocPositions) ReadPagePositions(pageIdx uint32) (uint32, serial.DocPageLocations, error)

ReadPagePositions returns the DocPageLocations of the text on the page with index `pageIdx` (0-offset) in document `lDoc`.

func (*DocPositions) ReadPageText

func (lDoc *DocPositions) ReadPageText(pageIdx uint32) (string, error)

func (*DocPositions) Save

func (lDoc *DocPositions) Save() error

func (DocPositions) String

func (d DocPositions) String() string

type Extract

type Extract struct {
	// contains filtered or unexported fields
}

type ExtractList

type ExtractList struct {
	// contains filtered or unexported fields
}

ExtractList is a list of document:page inputs that are to be combined in a specified order.

func CreateExtractList

func CreateExtractList(maxPages int) *ExtractList

func (*ExtractList) AddRect

func (l *ExtractList) AddRect(inPath string, pageNum uint32, llx, lly, urx, ury float32)

func (*ExtractList) NumPages

func (l *ExtractList) NumPages() int

func (*ExtractList) SaveOutputPdf

func (l *ExtractList) SaveOutputPdf(outPath string) error

SaveOutputPdf is called by position_search.go to markup a PDF file with the locations of text. `l` contains the input PDF names and the pages and coordinates to mark. The resulting PDF is written to `outPath`.

func (ExtractList) String

func (l ExtractList) String() string

type FileDesc

type FileDesc struct {
	InPath string  // Full path to PDF file.
	Hash   string  // SHA-256 hash of file contents.
	SizeMB float64 // Size of PDF file on disk.
}

FileDesc describes a PDF file.

func CreateFileDesc

func CreateFileDesc(inPath string, rs io.ReadSeeker) (FileDesc, error)

type FileFinder

type FileFinder struct {
	// contains filtered or unexported fields
}

FileFinder is a group of file paths.

func NewFileFinder

func NewFileFinder(pathList []string) FileFinder

NewFileFinder returns a FileFinder of all file paths in `pathList`.

func NewFileFinderFromCorpus

func NewFileFinderFromCorpus() (FileFinder, error)

NewFileFinderFromCorpus returns a FileFinder for all files in our main corpus directory.

func (*FileFinder) Find

func (ff *FileFinder) Find(fullpath string) string

Find finds the file path in `ff` that best matches `fullpath`.

type IDText

type IDText struct {
	ID   string
	Text string
}

type PdfMatch

type PdfMatch struct {
	InPath  string
	PageNum uint32
	LineNum int
	Line    string
	serial.DocPageLocations
	// contains filtered or unexported fields
}

PdfMatch describes a single search match in a PDF document. It is the analog of a bleve search.DocumentMatch.

func (PdfMatch) String

func (p PdfMatch) String() string

type PdfMatchSet

type PdfMatchSet struct {
	TotalMatches   int
	SearchDuration time.Duration
	Matches        []PdfMatch
}

func SearchIndex

func SearchIndex(lState *PositionsState, index bleve.Index, term string, maxResults int) (
	PdfMatchSet, error)

func SearchPdfIndex

func SearchPdfIndex(persistDir, term string, maxResults int) (PdfMatchSet, error)

func (PdfMatchSet) Files

func (s PdfMatchSet) Files() []string

Files returns the unique file names in `s`.

func (PdfMatchSet) Filter

func (s PdfMatchSet) Filter(maxResultsPerFile int) PdfMatchSet

Filter returns a filtered list of the results in `s` as a PdfMatchSet.

func (PdfMatchSet) String

func (s PdfMatchSet) String() string

type PositionsState

type PositionsState struct {
	// contains filtered or unexported fields
}

PositionsState is the global state of a writer or reader to the position indexes saved to disk.

func FromHIPDs

func FromHIPDs(hipds []serial.HashIndexPathDoc) PositionsState

func IndexPdfFiles

func IndexPdfFiles(pathList []string, persistDir string, forceCreate, allowAppend bool,
	report func(string)) (*PositionsState, bleve.Index, int, error)

IndexPdfFiles creates a bleve+PositionsState index for `pathList`. If `persistDir` is not empty, the index is written to this directory. If `forceCreate` is true and `persistDir` is not empty, a new directory is always created. If `allowAppend` is true and `persistDir` is not empty and a bleve index already exists on disk then the bleve index will be appended to. `report` is a supplied function that is called to report progress. TODO: Remove `allowAppend` argument. Instead always append to a bleve index if it exists and `forceCreate` is not set.

func IndexPdfReaders

func IndexPdfReaders(pathList []string, rsList []io.ReadSeeker, persistDir string, forceCreate,
	allowAppend bool, report func(string)) (*PositionsState, bleve.Index, int, error)

IndexPdfReaders returns a PositionsState and a bleve.Index over the PDF contents read by the io.ReadSeekers in `rsList`. The names of the PDFs are in the corresponding positions in `pathList`. If `persistDir` is empty, the index is stored in memory. If `persistDir` is not empty, the index is stored on disk in `persistDir`. `report` is a supplied function that is called to report progress.

func OpenPositionsState

func OpenPositionsState(root string, forceCreate bool) (*PositionsState, error)

OpenPositionsState loads indexes from an existing locations directory `root` or creates one if it doesn't exist. When opening for writing, do this to ensure final index is written to disk:

lState, err := doclib.OpenPositionsState(persistDir, forceCreate)
defer lState.Flush()

func (PositionsState) Check

func (l PositionsState) Check()

func (*PositionsState) CreatePositionsDoc

func (lState *PositionsState) CreatePositionsDoc(fd FileDesc) (*DocPositions, error)

CreatePositionsDoc creates a DocPositions for writing. CreatePositionsDoc always populates the DocPositions with base fields. In a persistent `lState`, necessary directories are created and files are opened.

func (*PositionsState) ExtractDocPagePositions

func (lState *PositionsState) ExtractDocPagePositions(inPath string) ([]DocPageText, error)

func (*PositionsState) ExtractDocPagePositionsReader

func (lState *PositionsState) ExtractDocPagePositionsReader(inPath string, rs io.ReadSeeker) (
	[]DocPageText, error)

ExtractDocPagePositionsReader extracts the text of the PDF file referenced by `rs`. It returns the text as a DocPageText per page. The []DocPageText refer to DocPositions which are stored in lState.hashDoc which is updated in this function.

func (*PositionsState) Flush

func (lState *PositionsState) Flush() error

func (*PositionsState) GetHashPath

func (lState *PositionsState) GetHashPath(docIdx uint64) (hash, inPath string)

func (PositionsState) Len

func (l PositionsState) Len() int

func (*PositionsState) OpenPositionsDoc

func (lState *PositionsState) OpenPositionsDoc(docIdx uint64) (*DocPositions, error)

OpenPositionsDoc opens a DocPositions for reading. In a persistent `lState`, necessary files are opened in lDoc.openDoc().

func (*PositionsState) ReadDocPagePositions

func (lState *PositionsState) ReadDocPagePositions(docIdx uint64, pageIdx uint32) (
	string, uint32, serial.DocPageLocations, error)

ReadDocPagePositions is inefficient. A DocPositions (a file) is opened and closed to read a page.

func (*PositionsState) ReadDocPageText

func (lState *PositionsState) ReadDocPageText(docIdx uint64, pageIdx uint32) (string, error)

func (PositionsState) String

func (l PositionsState) String() string

func (PositionsState) ToHIPDs

func (l PositionsState) ToHIPDs() []serial.HashIndexPathDoc

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL