doclib

package

v0.0.0-...-a743e43 Latest Latest Go to latest Published: Jun 3, 2019 License: MIT Imports: 30 Imported by: 0

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/peterwilliams97/pdf-search

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
Variables
func ChangePathDir(inDir, inPath, outDir string) (string, error)
func ChangePathDirExt(inDir, inPath, outDir, outExt string) (string, error)
func ChangePathExt(inPath, outExt string) string
func CleanCorpus(corpus []string) []string
func CreateBleveIndex(indexPath string, forceCreate, allowAppend bool) (bleve.Index, error)
func CreateBleveMemIndex() (bleve.Index, error)
func Describe(pdfReader *pdf.PdfReader) (numPages int, width, height float64, err error)
func DocPageSize(pageSizes [][2]float64) (w, h float64)
func Exists(filename string) bool
func ExpandUser(filename string) string
func ExportBleveMem(index bleve.Index) ([]byte, error)
func ExtractPageText(page *pdf.PdfPage) (string, error)
func ExtractPageTextLocation(page *pdf.PdfPage) (string, []extractor.TextLocation, error)
func ExtractPageTextObject(page *pdf.PdfPage) (*extractor.PageText, error)
func FileHash(filename string) (string, error)
func FileSize(filename string) (int64, error)
func GetPosition(positions []serial.TextLocation, start, end uint32) serial.TextLocation
func ImportBleveMem(data []byte) (bleve.Index, error)
func IntRange(i0, i1 int) []int
func IntSetIntersection(a, b map[int]bool) map[int]bool
func IntSetToSlice(set map[int]bool) []int
func IntSetUnion(a, b map[int]bool) map[int]bool
func IntSliceDifference(a, b []int) []int
func IntSliceIntersection(a, b []int) []int
func IntSliceSymmetricDifference(a, b []int) []int
func IntSliceToSet(arr []int) map[int]bool
func IntSliceUnion(a, b []int) []int
func MMToPoint(x float64) float64
func MakeUsage(msg string)
func MinMaxIntSlice(arr []int) (min, max int, valid bool)
func MkDir(dir string) error
func MkParentDir(filename string) error
func PageSizeMm(page *pdf.PdfPage) (width, height float64, err error)
func PageSizePt(page *pdf.PdfPage) (width, height float64, err error)
func PatternsToPaths(patternList []string, sortSize bool) ([]string, error)
func PdfOpenDescribe(inPath string) (numPages int, width, height float64, err error)
func PdfOpenFile(inPath string, lazy bool) (*pdf.PdfReader, error)
func PdfOpenReader(f io.ReadSeeker, lazy bool) (*pdf.PdfReader, error)
func PointToMM(x float64) float64
func ProcessPDFPagesFile(inPath string, processPage func(pageNum uint32, page *pdf.PdfPage) error) error
func ProcessPDFPagesReader(inPath string, rs io.ReadSeeker, ...) error
func ReaderSizeHash(rs io.ReadSeeker) (int64, string, error)
func RegularFile(filename string) (bool, error)
func RemoveDirectory(dir string) error
func Reverse(arr []string) []string
func SetLogging()
func SortFileSize(pathList []string, minSize, maxSize int64) ([]string, error)
func StringUniques(arr []string) []string
func TestRoundtripMem(index bleve.Index) bleve.Index
func ToSerialTextLocation(loc extractor.TextLocation) serial.TextLocation
func WriteJsonSlice(filename string, vals []string) error
type DocPageText
type DocPositions
- func (lDoc *DocPositions) AddDocPage(pageNum uint32, dpl serial.DocPageLocations, text string) (uint32, error)
- func (lDoc *DocPositions) Close() error
- func (lDoc *DocPositions) GetTextPath(pageIdx uint32) string
- func (d DocPositions) Len() int
- func (lDoc *DocPositions) ReadPagePositions(pageIdx uint32) (uint32, serial.DocPageLocations, error)
- func (lDoc *DocPositions) ReadPageText(pageIdx uint32) (string, error)
- func (lDoc *DocPositions) Save() error
- func (d DocPositions) String() string
type Extract
type ExtractList
- func CreateExtractList(maxPages int) *ExtractList
- func (l *ExtractList) AddRect(inPath string, pageNum uint32, llx, lly, urx, ury float32)
- func (l *ExtractList) NumPages() int
- func (l *ExtractList) SaveOutputPdf(outPath string) error
- func (l ExtractList) String() string
type FileDesc
- func CreateFileDesc(inPath string, rs io.ReadSeeker) (FileDesc, error)
type FileFinder
- func NewFileFinder(pathList []string) FileFinder
- func NewFileFinderFromCorpus() (FileFinder, error)
- func (ff *FileFinder) Find(fullpath string) string
type IDText
type PdfMatch
- func (p PdfMatch) String() string
type PdfMatchSet
- func SearchIndex(lState *PositionsState, index bleve.Index, term string, maxResults int) (PdfMatchSet, error)
- func SearchPdfIndex(persistDir, term string, maxResults int) (PdfMatchSet, error)
- func (s PdfMatchSet) Files() []string
- func (s PdfMatchSet) Filter(maxResultsPerFile int) PdfMatchSet
- func (s PdfMatchSet) String() string
type PositionsState
- func FromHIPDs(hipds []serial.HashIndexPathDoc) PositionsState
- func IndexPdfFiles(pathList []string, persistDir string, forceCreate, allowAppend bool, ...) (*PositionsState, bleve.Index, int, error)
- func IndexPdfReaders(pathList []string, rsList []io.ReadSeeker, persistDir string, ...) (*PositionsState, bleve.Index, int, error)
- func OpenPositionsState(root string, forceCreate bool) (*PositionsState, error)
- func (l PositionsState) Check()
- func (lState *PositionsState) CreatePositionsDoc(fd FileDesc) (*DocPositions, error)
- func (lState *PositionsState) ExtractDocPagePositions(inPath string) ([]DocPageText, error)
- func (lState *PositionsState) ExtractDocPagePositionsReader(inPath string, rs io.ReadSeeker) ([]DocPageText, error)
- func (lState *PositionsState) Flush() error
- func (lState *PositionsState) GetHashPath(docIdx uint64) (hash, inPath string)
- func (l PositionsState) Len() int
- func (lState *PositionsState) OpenPositionsDoc(docIdx uint64) (*DocPositions, error)
- func (lState *PositionsState) ReadDocPagePositions(docIdx uint64, pageIdx uint32) (string, uint32, serial.DocPageLocations, error)
- func (lState *PositionsState) ReadDocPageText(docIdx uint64, pageIdx uint32) (string, error)
- func (l PositionsState) String() string
- func (l PositionsState) ToHIPDs() []serial.HashIndexPathDoc

Constants ¶

View Source

const (
	Inch2MM    = 25.4
	Inch2Point = 72.0
	MM2Point   = Inch2Point / Inch2MM
	Point2MM   = Inch2MM / Inch2Point
)

View Source

const BorderWidth = 3.0 // !@#$ For testing.

View Source

const ShadowWidth = BorderWidth + 0.5 // !@#$ For testing.

Variables ¶

View Source

var (
	Debug bool
	Trace bool
	// ExposeErrors can be set to true to not recover from errors in library functions.
	ExposeErrors bool
)

View Source

var ErrNoMatch = errors.New("no match for hit")

View Source

var ErrRange = errors.New("out of range")

View Source

var FileHashSize = 10

Functions ¶

func ChangePathDir ¶

func ChangePathDir(inDir, inPath, outDir string) (string, error)

ChangePathDir returns `inPath` with its ancestor directory `inDir` replaced with `outDir`.

func ChangePathDirExt ¶

func ChangePathDirExt(inDir, inPath, outDir, outExt string) (string, error)

ChangePathDirExt returns `inPath` with its ancestor directory `inDir` replaced with `outDir` and its extension replaced with `outExt`.

func ChangePathExt ¶

func ChangePathExt(inPath, outExt string) string

ChangePathExt returns `inPath` with its extension replaced with `outExt`.

func CleanCorpus ¶

func CleanCorpus(corpus []string) []string

CleanCorpus returns `corpus` with known bad files removed.

func CreateBleveIndex ¶

func CreateBleveIndex(indexPath string, forceCreate, allowAppend bool) (bleve.Index, error)

CreateBleveIndex creates a new persistent Bleve index at `indexPath`. If `forceCreate` is true then an existing index will be deleted. If `allowAppend` is true then an existing index will be appended to. TODO: Remove `allowAppend` argument. Instead always append to an existing index if `forceCreate` is false.

func CreateBleveMemIndex ¶

func CreateBleveMemIndex() (bleve.Index, error)

CreateBleveMemIndex creates a new in-memory (unpersisted) Bleve index.

func Describe ¶

func Describe(pdfReader *pdf.PdfReader) (numPages int, width, height float64, err error)

Describe returns numPages, width, height for the PDF in `pdfReader`. Width and height are in mm.

func DocPageSize ¶

func DocPageSize(pageSizes [][2]float64) (w, h float64)

DocPageSize returns the width and height of a document whose page sizes are `pageSizes`. This is a single source of truth for our definition of document page size. Currently the document width is defined as the longest page width in the document.

func Exists ¶

func Exists(filename string) bool

Exists returns true if `filename` exists.

func ExpandUser ¶

func ExpandUser(filename string) string

ExpandUser returns `filename` with ~ replaced with user's home directory.

func ExportBleveMem ¶

func ExportBleveMem(index bleve.Index) ([]byte, error)

func ExtractPageText ¶

func ExtractPageText(page *pdf.PdfPage) (string, error)

ExtractPageText returns the text on page `page`.

func ExtractPageTextLocation ¶

func ExtractPageTextLocation(page *pdf.PdfPage) (string, []extractor.TextLocation, error)

ExtractPageTextLocation returns the locations of text on page `page`.

func ExtractPageTextObject ¶

func ExtractPageTextObject(page *pdf.PdfPage) (*extractor.PageText, error)

ExtractPageTextObject returns the PageText on page `page`. PageText is an opaque UniDoc struct that describes the text marks on a PDF page. extractDocPages uses UniDoc to extract the text from all pages in PDF file `inPath` as a slice of PdfPage.

func FileHash ¶

func FileHash(filename string) (string, error)

FileHash returns a hex encoded string of the SHA-256 digest of the contents of file `filename`.

func FileSize ¶

func FileSize(filename string) (int64, error)

FileSize returns the size of file `filename` in bytes.

func GetPosition ¶

func GetPosition(positions []serial.TextLocation, start, end uint32) serial.TextLocation

func ImportBleveMem ¶

func ImportBleveMem(data []byte) (bleve.Index, error)

func IntRange ¶

func IntRange(i0, i1 int) []int

IntRange returns slice [i0, i1).

func IntSetIntersection ¶

func IntSetIntersection(a, b map[int]bool) map[int]bool

IntSetIntersection returns `a` ∩ `b`.

func IntSetToSlice ¶

func IntSetToSlice(set map[int]bool) []int

IntSetToSlice returns keys of `set` as a slice.

func IntSetUnion ¶

func IntSetUnion(a, b map[int]bool) map[int]bool

IntSetUnion returns `a` ∪ `b`.

func IntSliceDifference ¶

func IntSliceDifference(a, b []int) []int

IntSliceDifference returns the elements in `a` that aren't in `b`.

func IntSliceIntersection ¶

func IntSliceIntersection(a, b []int) []int

IntSliceIntersection returns `a` ∩ `b`.

func IntSliceSymmetricDifference ¶

func IntSliceSymmetricDifference(a, b []int) []int

IntSliceSymmetricDifference returns the elements in `a` that aren't in `b` plus the elements in `b` that aren't in `a`.

func IntSliceToSet ¶

func IntSliceToSet(arr []int) map[int]bool

IntSliceToSet returns a map whose keys are the elements of `arr`.

func IntSliceUnion ¶

func IntSliceUnion(a, b []int) []int

IntSliceUnion returns `a` ∪ `b`.

func MMToPoint ¶

func MMToPoint(x float64) float64

func MakeUsage ¶

func MakeUsage(msg string)

MakeUsage updates flag.Usage to include usage message `msg`.

func MinMaxIntSlice ¶

func MinMaxIntSlice(arr []int) (min, max int, valid bool)

MinMaxIntSlice returns min and max of `arr`. `valid` is true if `arr` contains values.

func MkDir ¶

func MkDir(dir string) error

MkDir creates a directory called `dir` if it doesn't already exist.

func MkParentDir ¶

func MkParentDir(filename string) error

MkParentDir creates the parent directory for `filename` if it doesn't already exist.

func PageSizeMm ¶

func PageSizeMm(page *pdf.PdfPage) (width, height float64, err error)

PageSizeMm returns the width and height of `page` in mm.

func PageSizePt ¶

func PageSizePt(page *pdf.PdfPage) (width, height float64, err error)

PageSizePt returns the width and height of `page` in points.

func PatternsToPaths ¶

func PatternsToPaths(patternList []string, sortSize bool) ([]string, error)

PatternsToPaths returns a list of files matching the patterns in `patternList`.

func PdfOpenDescribe ¶

func PdfOpenDescribe(inPath string) (numPages int, width, height float64, err error)

PdfOpenDescribe returns numPages, width, height for PDF file `inPath`. Width and height are in mm.

func PdfOpenFile ¶

func PdfOpenFile(inPath string, lazy bool) (*pdf.PdfReader, error)

PdfOpenFile opens PDF file `inPath` and attempts to handle null encryption schemes.

func PdfOpenReader ¶

func PdfOpenReader(f io.ReadSeeker, lazy bool) (*pdf.PdfReader, error)

func PointToMM ¶

func PointToMM(x float64) float64

func ProcessPDFPagesFile ¶

func ProcessPDFPagesFile(inPath string, processPage func(pageNum uint32, page *pdf.PdfPage) error) error

ProcessPDFPagesFile runs `processPage` on every page in PDF file `inPath`. It recovers from errors in the libraries it calls unless ExposeErrors is true.

func ProcessPDFPagesReader ¶

func ProcessPDFPagesReader(inPath string, rs io.ReadSeeker,
	processPage func(pageNum uint32, page *pdf.PdfPage) error) error

func ReaderSizeHash ¶

func ReaderSizeHash(rs io.ReadSeeker) (int64, string, error)

func RegularFile ¶

func RegularFile(filename string) (bool, error)

RegularFile returns true if file `filename` is a regular file.

func RemoveDirectory ¶

func RemoveDirectory(dir string) error

RemoveDirectory recursively removes directory `dir` and its contents from disk.

func Reverse ¶

func Reverse(arr []string) []string

Reverse returns `arr` in reverse order.

func SetLogging ¶

func SetLogging()

func SortFileSize ¶

func SortFileSize(pathList []string, minSize, maxSize int64) ([]string, error)

SortFileSize returns the paths of the files in `pathList` sorted by ascending size. If `minSize` >= 0 then only files of this size or larger are returned. If `maxSize` >= 0 then only files of this size or smaller are returned.

func StringUniques ¶

func StringUniques(arr []string) []string

StringUniques returns the unique strings in `arr`.

func TestRoundtripMem ¶

func TestRoundtripMem(index bleve.Index) bleve.Index

func ToSerialTextLocation ¶

func ToSerialTextLocation(loc extractor.TextLocation) serial.TextLocation

ToSerialTextLocation converts extractor.TextLocation `loc` to a more compact serial.TextLocation.

func WriteJsonSlice ¶

func WriteJsonSlice(filename string, vals []string) error

WriteJsonSlice writes slice `vals` to json file `filename`, one line per string. NOTE: We write this json file in a human readable way because we will be using it in development.

Types ¶

type DocPageText ¶

type DocPageText struct {
	DocIdx  uint64 // Doc index (0-offset) into PositionsState.fileList .
	PageIdx uint32 // Page index (0-offset) into DocPositions.index .
	PageNum uint32 // Page number in PDF file (1-offset)
	Text    string // Extracted page text.
}

DocPageText contains doc:page indexes, the PDF page number and the text extracted from a PDF page.

type DocPositions ¶

type DocPositions struct {
	// contains filtered or unexported fields
}

DocPositions tracks the data that is used to index a PDF file.

func (*DocPositions) AddDocPage ¶

func (lDoc *DocPositions) AddDocPage(pageNum uint32, dpl serial.DocPageLocations, text string) (uint32, error)

AddDocPage adds a page (with page number `pageNum` and contents `dpl`) to `lDoc`. !@#$ Remove `text` param.

func (*DocPositions) Close ¶

func (lDoc *DocPositions) Close() error

func (*DocPositions) GetTextPath ¶

func (lDoc *DocPositions) GetTextPath(pageIdx uint32) string

func (DocPositions) Len ¶

func (d DocPositions) Len() int

func (*DocPositions) ReadPagePositions ¶

func (lDoc *DocPositions) ReadPagePositions(pageIdx uint32) (uint32, serial.DocPageLocations, error)

ReadPagePositions returns the DocPageLocations of the text on page `pageIdx` (0-offset) of document `lDoc`.

func (*DocPositions) ReadPageText ¶

func (lDoc *DocPositions) ReadPageText(pageIdx uint32) (string, error)

func (*DocPositions) Save ¶

func (lDoc *DocPositions) Save() error

func (DocPositions) String ¶

func (d DocPositions) String() string

type Extract ¶

type Extract struct {
	// contains filtered or unexported fields
}

type ExtractList ¶

type ExtractList struct {
	// contains filtered or unexported fields
}

ExtractList is a list of document:page inputs that are to be combined in a specified order.

func CreateExtractList ¶

func CreateExtractList(maxPages int) *ExtractList

func (*ExtractList) AddRect ¶

func (l *ExtractList) AddRect(inPath string, pageNum uint32, llx, lly, urx, ury float32)

func (*ExtractList) NumPages ¶

func (l *ExtractList) NumPages() int

func (*ExtractList) SaveOutputPdf ¶

func (l *ExtractList) SaveOutputPdf(outPath string) error

SaveOutputPdf is called by position_search.go to markup a PDF file with the locations of text. `l` contains the input PDF names and the pages and coordinates to mark. The resulting PDF is written to `outPath`.

func (ExtractList) String ¶

func (l ExtractList) String() string

type FileDesc ¶

type FileDesc struct {
	InPath string  // Full path to PDF file.
	Hash   string  // SHA-256 hash of file contents.
	SizeMB float64 // Size of PDF file on disk.
}

FileDesc describes a PDF file.

func CreateFileDesc ¶

func CreateFileDesc(inPath string, rs io.ReadSeeker) (FileDesc, error)

type FileFinder ¶

type FileFinder struct {
	// contains filtered or unexported fields
}

FileFinder is a group of file paths.

func NewFileFinder ¶

func NewFileFinder(pathList []string) FileFinder

NewFileFinder returns a FileFinder of all file paths in `pathList`.

func NewFileFinderFromCorpus ¶

func NewFileFinderFromCorpus() (FileFinder, error)

NewFileFinderFromCorpus returns a FileFinder for all files in our main corpus directory.

func (*FileFinder) Find ¶

func (ff *FileFinder) Find(fullpath string) string

Find finds the file path in `ff` that best matches `fullpath`.

type IDText ¶

type IDText struct {
	ID   string
	Text string
}

type PdfMatch ¶

type PdfMatch struct {
	InPath  string
	PageNum uint32
	LineNum int
	Line    string
	serial.DocPageLocations
	// contains filtered or unexported fields
}

PdfMatch describes a single search match in a PDF document. It is the analog of a bleve search.DocumentMatch.

func (PdfMatch) String ¶

func (p PdfMatch) String() string

type PdfMatchSet ¶

type PdfMatchSet struct {
	TotalMatches   int
	SearchDuration time.Duration
	Matches        []PdfMatch
}

func SearchIndex ¶

func SearchIndex(lState *PositionsState, index bleve.Index, term string, maxResults int) (
	PdfMatchSet, error)

func SearchPdfIndex ¶

func SearchPdfIndex(persistDir, term string, maxResults int) (PdfMatchSet, error)

func (PdfMatchSet) Files ¶

func (s PdfMatchSet) Files() []string

Files returns the unique file names in `s`.

func (PdfMatchSet) Filter ¶

func (s PdfMatchSet) Filter(maxResultsPerFile int) PdfMatchSet

Filter returns a filtered list of the results in `s` as a PdfMatchSet.

func (PdfMatchSet) String ¶

func (s PdfMatchSet) String() string

type PositionsState ¶

type PositionsState struct {
	// contains filtered or unexported fields
}

PositionsState is the global state of a writer or reader to the position indexes saved to disk.

func FromHIPDs ¶

func FromHIPDs(hipds []serial.HashIndexPathDoc) PositionsState

func IndexPdfFiles ¶

func IndexPdfFiles(pathList []string, persistDir string, forceCreate, allowAppend bool,
	report func(string)) (*PositionsState, bleve.Index, int, error)

IndexPdfFiles creates a bleve+PositionsState index for `pathList`. If `persistDir` is not empty, the index is written to this directory. If `forceCreate` is true and `persistDir` is not empty, a new directory is always created. If `allowAppend` is true and `persistDir` is not empty and a bleve index already exists on disk then the bleve index will be appended to. `report` is a supplied function that is called to report progress. TODO: Remove `allowAppend` argument. Instead always append to a bleve index if it exists and `forceCreate` is not set.

func IndexPdfReaders ¶

func IndexPdfReaders(pathList []string, rsList []io.ReadSeeker, persistDir string, forceCreate,
	allowAppend bool, report func(string)) (*PositionsState, bleve.Index, int, error)

IndexPdfReaders returns a PositionsState and a bleve.Index over the PDF contents read by the io.ReadSeekers in `rsList`. The names of the PDFs are in the corresponding positions in `pathList`. If `persistDir` is empty, the index is stored in memory. If `persistDir` is not empty, the index is stored on disk in `persistDir`. `report` is a supplied function that is called to report progress.

func OpenPositionsState ¶

func OpenPositionsState(root string, forceCreate bool) (*PositionsState, error)

OpenPositionsState loads indexes from an existing locations directory `root` or creates one if it doesn't exist. When opening for writing, do this to ensure final index is written to disk:

lState, err := doclib.OpenPositionsState(persistDir, forceCreate)
defer lState.Flush()

func (PositionsState) Check ¶

func (l PositionsState) Check()

func (*PositionsState) CreatePositionsDoc ¶

func (lState *PositionsState) CreatePositionsDoc(fd FileDesc) (*DocPositions, error)

CreatePositionsDoc creates a DocPositions for writing. CreatePositionsDoc always populates the DocPositions with base fields. In a persistent `lState`, necessary directories are created and files are opened.

func (*PositionsState) ExtractDocPagePositions ¶

func (lState *PositionsState) ExtractDocPagePositions(inPath string) ([]DocPageText, error)

func (*PositionsState) ExtractDocPagePositionsReader ¶

func (lState *PositionsState) ExtractDocPagePositionsReader(inPath string, rs io.ReadSeeker) (
	[]DocPageText, error)

ExtractDocPagePositionsReader extracts the text of the PDF file referenced by `rs`. It returns the text as a DocPageText per page. The []DocPageText refer to DocPositions which are stored in lState.hashDoc which is updated in this function.

func (*PositionsState) Flush ¶

func (lState *PositionsState) Flush() error

func (*PositionsState) GetHashPath ¶

func (lState *PositionsState) GetHashPath(docIdx uint64) (hash, inPath string)

func (PositionsState) Len ¶

func (l PositionsState) Len() int

func (*PositionsState) OpenPositionsDoc ¶

func (lState *PositionsState) OpenPositionsDoc(docIdx uint64) (*DocPositions, error)

OpenPositionsDoc opens a DocPositions for reading. In a persistent `lState`, necessary files are opened in lDoc.openDoc().

func (*PositionsState) ReadDocPagePositions ¶

func (lState *PositionsState) ReadDocPagePositions(docIdx uint64, pageIdx uint32) (
	string, uint32, serial.DocPageLocations, error)

ReadDocPagePositions is inefficient. A DocPositions (a file) is opened and closed to read a page.

func (*PositionsState) ReadDocPageText ¶

func (lState *PositionsState) ReadDocPageText(docIdx uint64, pageIdx uint32) (string, error)

func (PositionsState) String ¶

func (l PositionsState) String() string

func (PositionsState) ToHIPDs ¶

func (l PositionsState) ToHIPDs() []serial.HashIndexPathDoc

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL