doclib

package

v0.0.0 Latest Latest Go to latest Published: Oct 28, 2019 License: MIT Imports: 31 Imported by: 0

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/papercutsoftware/pdfsearch

Links

Open Source Insights

README ¶

doclib

doclib implements the bleve + unidoc interfaces

Documentation ¶

Overview ¶

* This source implements the main function IndexPdfReaders(). * IndexPdfFiles() is a convenience function that opens files and calls IndexPdfReaders().

* This source file implements the main doclib function IndexPdfReaders(). * IndexPdfFiles() is a convenience function that opens files and calls IndexPdfReaders().

Index ¶

Constants
Variables
func ExportBleveMem(index bleve.Index) ([]byte, error)
func ExtractPageTextMarks(page *model.PdfPage) (string, *extractor.TextMarkArray, error)
func ImportBleveMem(data []byte) (bleve.Index, error)
func PageSizePt(page *model.PdfPage) (width, height float64, err error)
func PdfOpenFile(inPath string) (*model.PdfReader, error)
func PdfOpenFileLazy(inPath string) (*os.File, *model.PdfReader, error)
func PdfOpenReader(rs io.ReadSeeker, lazy bool) (*model.PdfReader, error)
func ProcessPDFPagesFile(inPath string, processPage func(pageNum uint32, page *model.PdfPage) error) error
func ProcessPDFPagesReader(inPath string, rs io.ReadSeeker, ...) error
type BlevePdf
- func BlevePdfFromHIPDs(hipds []serial.HashIndexPathDoc) (BlevePdf, error)
- func IndexPdfFilesOrReaders(pathList []string, rsList []io.ReadSeeker, persistDir string, forceCreate bool, ...) (*BlevePdf, bleve.Index, int, int, time.Duration, time.Duration, error)
- func IndexPdfFilesUsingReaders(pathList []string, persistDir string, forceCreate bool, report func(string)) (*BlevePdf, bleve.Index, int, int, time.Duration, time.Duration, error)
- func (blevePdf *BlevePdf) Equals(other *BlevePdf) bool
- func (blevePdf BlevePdf) Len() int
- func (blevePdf *BlevePdf) SearchBleveIndex(index bleve.Index, term0 string, maxResults int) (PdfMatchSet, error)
- func (blevePdf BlevePdf) String() string
- func (blevePdf BlevePdf) ToHIPDs() ([]serial.HashIndexPathDoc, error)
type DocPageText
type DocPositions
- func (docPos *DocPositions) AddDocPage(pageNum uint32, ppos PagePositions, text string) (uint32, error)
- func (docPos *DocPositions) Close() error
- func (docPos *DocPositions) Equals(e *DocPositions) bool
- func (docPos DocPositions) Len() int
- func (docPos *DocPositions) Save() error
- func (docPos DocPositions) String() string
type ExtractList
- func CreateExtractList(maxPages, maxPerPage int) *ExtractList
- func (l *ExtractList) AddRect(inPath string, pageNum uint32, r model.PdfRectangle)
- func (l *ExtractList) SaveOutputPdf(outPath string) error
- func (l ExtractList) String() string
type IDText
type PDFPageProcessor
- func CreatePDFPageProcessorFile(inPath string) (*PDFPageProcessor, error)
- func CreatePDFPageProcessorReader(inPath string, rs io.ReadSeeker) (*PDFPageProcessor, error)
- func (p *PDFPageProcessor) Close() error
- func (p PDFPageProcessor) NumPages() (uint32, error)
- func (p *PDFPageProcessor) Process(processPage func(pageNum uint32, page *model.PdfPage) error) (err error)
type PagePositions
- func PagePositionsFromTextMarks(textMarks *extractor.TextMarkArray) PagePositions
- func (ppos PagePositions) BBox(start, end uint32) (model.PdfRectangle, bool)
- func (ppos PagePositions) Empty() bool
- func (ppos PagePositions) Equals(epl PagePositions) bool
- func (ppos PagePositions) String() string
type PdfMatchSet
- func SearchPersistentPdfIndex(persistDir, term string, maxResults int) (PdfMatchSet, error)
- func (p PdfMatchSet) Best() PdfMatchSet
- func (p PdfMatchSet) Equals(q PdfMatchSet) bool
- func (s PdfMatchSet) Files() []string
- func (s PdfMatchSet) String() string
type PdfPageMatch
- func (p PdfPageMatch) String() string
type Phrase
type Span

Constants ¶

View Source

const (
	// BorderWidth is the width of rectangle sides in points
	BorderWidth = 1.0
	// ShadowWidth is the width of the shadow on the inside and outside of the rectangles
	ShadowWidth = 0.2
)

Variables ¶

View Source

var (
	// Debug can be set true to enable debug level logging.
	Debug bool
	// Trace can be set true to enable trace level logging.
	Trace bool
	// ExposeErrors can be set to true to not recover from errors in library functions.
	ExposeErrors bool
)

View Source

var CheckConsistency = false

CheckConsistency should be set true to regularly check the BlevePdf consistency.

View Source

var ErrNoMatch = errors.New("no match for hit")

ErrNoMatch indicates there was no match for a bleve hit. It is not a real error.

View Source

var ErrNoPositions = errors.New("no match for hit")

ErrNoPositions indicates there was no match for a bleve hit. It is not a real error.

Functions ¶

func ExportBleveMem ¶

func ExportBleveMem(index bleve.Index) ([]byte, error)

ExportBleveMem serializes bleve index `index` to a byte slice.

func ExtractPageTextMarks ¶

func ExtractPageTextMarks(page *model.PdfPage) (string, *extractor.TextMarkArray, error)

ExtractPageTextMarks returns the extracted text and corresponding TextMarks on page `page`.

func ImportBleveMem ¶

func ImportBleveMem(data []byte) (bleve.Index, error)

ImportBleveMem deserializes `data` to a bleve.Index.

func PageSizePt ¶

func PageSizePt(page *model.PdfPage) (width, height float64, err error)

PageSizePt returns the width and height of `page` in points.

func PdfOpenFile ¶

func PdfOpenFile(inPath string) (*model.PdfReader, error)

PdfOpenFile opens PDF file `inPath` and attempts to handle null encryption schemes.

func PdfOpenFileLazy ¶

func PdfOpenFileLazy(inPath string) (*os.File, *model.PdfReader, error)

PdfOpenFileLazy opens PDF file `inPath` lazily and attempts to handle null encryption schemes. Caller must close the returned file handle if there are no errors.

func PdfOpenReader ¶

func PdfOpenReader(rs io.ReadSeeker, lazy bool) (*model.PdfReader, error)

PdfOpenReader opens the PDF file accessed by `rs` and attempts to handle null encryption schemes. If `lazy` is true, a lazy PDF reader is opened.

func ProcessPDFPagesFile ¶

func ProcessPDFPagesFile(inPath string, processPage func(pageNum uint32, page *model.PdfPage) error) error

ProcessPDFPagesFile runs `processPage` on every page in PDF file `inPath`. It is a convenience function.

func ProcessPDFPagesReader ¶

func ProcessPDFPagesReader(inPath string, rs io.ReadSeeker,
	processPage func(pageNum uint32, page *model.PdfPage) error) error

ProcessPDFPagesReader runs `processPage` on every page in PDF file opened in `rs`. It is a convenience function.

Types ¶

type BlevePdf ¶

type BlevePdf struct {
	// contains filtered or unexported fields
}

BlevePdf links a bleve index over texts to the PDF files that the texts were extracted from, using the hashDoc {file hash: DocPositions} map. For each PDF file, the DocPositions maps extracted text to the location of the text on the PDF page it was extracted from. A BlevePdf can be optionally saved to and retrieved from disk, in which case isMem() returns false. BlevePdf is intentionally opaque.

func BlevePdfFromHIPDs ¶

func BlevePdfFromHIPDs(hipds []serial.HashIndexPathDoc) (BlevePdf, error)

BlevePdfFromHIPDs creates a BlevePdf from its serialized form `hipds`. It is used to deserialize a BlevePdf. !@#$ Round trip test BlevePdfFromHIPDs + ToHIPDs

func IndexPdfFilesOrReaders ¶

func IndexPdfFilesOrReaders(pathList []string, rsList []io.ReadSeeker, persistDir string,
	forceCreate bool, report func(string)) (*BlevePdf, bleve.Index,
	int, int, time.Duration, time.Duration, error)

IndexPdfFilesOrReaders returns a BlevePdf and a bleve.Index over

the PDF contents referenced by the io.ReadSeeker's in `rsList` if `rsList` is not empty, or
the PDF filenames in `pathList` if `rsList` is empty.

If `persistDir` is empty, the index is stored in memory. If `persistDir` is not empty, the index is stored on disk in `persistDir`. `report` is a supplied function that is called to report progress. Returns: (blevePdf, index, numFiles, totalPages, dtPdf, dtBleve, err) where

blevePdf: mapping of a bleve index to PDF pages and text coordinates
index: a bleve index
numFiles: number of PDF files successfully indexed
totalPages: number of PDF pages successfully indexed
dtPdf: time spent building blevePdf
dtBleve: time spent building index
err: error, if one occurred

NOTE: If you have access to your PDF files then use `pathList` and set `rsList` to nil as a long

list of file handles may exhaust system resources.

func IndexPdfFilesUsingReaders ¶

func IndexPdfFilesUsingReaders(pathList []string, persistDir string, forceCreate bool,
	report func(string)) (*BlevePdf, bleve.Index, int, int, time.Duration, time.Duration, error)

IndexPdfFilesUsingReaders creates a bleve+BlevePdf index for `pathList`. If `persistDir` is not empty, the index is written to this directory. If `forceCreate` is true and `persistDir` is not empty, a new directory is always created. Otherwise, the bleve index will be appended to. `report` is a supplied function that is called to report progress. NOTE: This is for testing only. It doesn't make sense to access IndexPdfFilesOrReaders() with a

list of opened files as this can exhaust available file handles.

func (*BlevePdf) Equals ¶

func (blevePdf *BlevePdf) Equals(other *BlevePdf) bool

Equals returns true if `blevePdf` contains the same information as `other`.

func (BlevePdf) Len ¶

func (blevePdf BlevePdf) Len() int

Len returns the number of documents in `blevePdf`.

func (*BlevePdf) SearchBleveIndex ¶

func (blevePdf *BlevePdf) SearchBleveIndex(index bleve.Index, term0 string, maxResults int) (
	PdfMatchSet, error)

SearchBleveIndex performs a bleve search on `index` for `term0` and returns up to `maxResults` matches. It maps the results to PDF page names, page numbers, line numbers and page locations using `blevePdf`.

func (BlevePdf) String ¶

func (blevePdf BlevePdf) String() string

String returns a string describing `blevePdf`.

func (BlevePdf) ToHIPDs ¶

func (blevePdf BlevePdf) ToHIPDs() ([]serial.HashIndexPathDoc, error)

ToHIPDs converts `blevePdf` to a serial.HashIndexPathDoc. blevePdf.Check() is run before saving to avoid empty serializations.

type DocPageText ¶

type DocPageText struct {
	DocIdx  uint64 // Doc index (0-offset) into BlevePdf.fileList .
	PageIdx uint32 // Page index (0-offset) into DocPositions.index .
	PageNum uint32 // Page number in PDF file (1-offset)
	Text    string // Extracted page text.
}

DocPageText contains doc:page indexes, the PDF page number and the text extracted from a PDF page.

type DocPositions ¶

type DocPositions struct {
	// contains filtered or unexported fields
}

DocPositions is used to link the per-document data in a bleve index to the PDF file that the data was extracted from. There is one DocPositions per PDF file.

func (*DocPositions) AddDocPage ¶

func (docPos *DocPositions) AddDocPage(pageNum uint32, ppos PagePositions, text string) (
	uint32, error)

AddDocPage adds a page with (1-offset) page number `pageNum` and contents `ppos` to `docPos`. It returns the page index that can be used to access this page from ReadPagePositions(). !@#$ Remove `text` param. ^^^ !@#$ ^^^

func (*DocPositions) Close ¶

func (docPos *DocPositions) Close() error

Close closes `docPos`'s open files if it is persistent.

func (*DocPositions) Equals ¶

func (docPos *DocPositions) Equals(e *DocPositions) bool

Equals returns true if `docPos` contains the same information as `e`.

func (DocPositions) Len ¶

func (docPos DocPositions) Len() int

Len returns the number of pages in `docPos`.

func (*DocPositions) Save ¶

func (docPos *DocPositions) Save() error

Save saves `docPos` to disk if it is persistent.

func (DocPositions) String ¶

func (docPos DocPositions) String() string

String returns a human readable string describing `docPos`.

type ExtractList ¶

type ExtractList struct {
	// contains filtered or unexported fields
}

ExtractList is a list of PDF file:page inputs that are to be marked up then combined in a specified order. If i is the (0-offset) ith page, then content is the contents to be added to this page. src := sources[i] content := contents[src.inPath][src.pageNum]

func CreateExtractList ¶

func CreateExtractList(maxPages, maxPerPage int) *ExtractList

CreateExtractList returns an empty *ExtractList with `maxPages` maximum number of pages and `maxPerPage` maximum rectangles per page.

func (*ExtractList) AddRect ¶

func (l *ExtractList) AddRect(inPath string, pageNum uint32, r model.PdfRectangle)

AddRect adds to `l`, instructions to draw rectangle `r` on (1-offset) page number `pageNum` of PDF file `inPath`

func (*ExtractList) SaveOutputPdf ¶

func (l *ExtractList) SaveOutputPdf(outPath string) error

SaveOutputPdf is called to markup a PDF file with the locations of text. `l` contains the input PDF names and the pages and coordinates to mark. The resulting PDF is written to `outPath`.

func (ExtractList) String ¶

func (l ExtractList) String() string

String returns a string describing `l`.

type IDText ¶

type IDText struct {
	// ID identifies the document + page index.
	ID string
	// Text is the text that bleve indexes.
	Text string
}

IDText is what bleve sees for each page of a PDF file.

type PDFPageProcessor ¶

type PDFPageProcessor struct {
	// contains filtered or unexported fields
}

PDFPageProcessor is used for processing a PDF file one page at a time. It is an opaque struct.

func CreatePDFPageProcessorFile ¶

func CreatePDFPageProcessorFile(inPath string) (*PDFPageProcessor, error)

CreatePDFPageProcessorFile creates a PDFPageProcessor for reading the PDF file `inPath`.

func CreatePDFPageProcessorReader ¶

func CreatePDFPageProcessorReader(inPath string, rs io.ReadSeeker) (*PDFPageProcessor, error)

CreatePDFPageProcessorReader creates a PDFPageProcessor for reading the PDF file referenced by `rs`. `inPath` is provided for logging only. It is expected to be the path referenced by `rs`.

func (*PDFPageProcessor) Close ¶

func (p *PDFPageProcessor) Close() error

Close closes file handles opened by CreatePDFPageProcessorFile.

func (PDFPageProcessor) NumPages ¶

func (p PDFPageProcessor) NumPages() (uint32, error)

NumPages returns the number of pages in the PDF file referenced by `p`.

func (*PDFPageProcessor) Process ¶

func (p *PDFPageProcessor) Process(processPage func(pageNum uint32, page *model.PdfPage) error) (
	err error)

Process runs `processPage` on every page in PDF file `p.inPath`. It can recover from errors in the libraries it calls if `ExposeErrors` is false.

type PagePositions ¶

type PagePositions struct {
	// contains filtered or unexported fields
}

PagePositions is used to link per-document data in a bleve index to the PDF file the data was extracted from. There is one PagePositions per PDF page. PagePositions stores the locations of text fragments on a page. The search index includes a binary copy of PagePositions, so our goal is to make PagePositions compact. !@#$ Which search index?

func PagePositionsFromTextMarks ¶

func PagePositionsFromTextMarks(textMarks *extractor.TextMarkArray) PagePositions

PagePositionsFromTextMarks converts extractor.TextMarkArray `textMarks` to a more compact PagePositions. We do this because PagePositions is stored in our index which we want to be small.

func (PagePositions) BBox ¶

func (ppos PagePositions) BBox(start, end uint32) (model.PdfRectangle, bool)

BBox returns a rectangle that bounds the text with offsets `start` and `end`. ofs: `start` <= ofs < `end` on the PDF page indexed by `ppos`. Caller must check that ppos.offsetBBoxes is not empty.

func (PagePositions) Empty ¶

func (ppos PagePositions) Empty() bool

Empty returns true if `ppos` has no entries.

func (PagePositions) Equals ¶

func (ppos PagePositions) Equals(epl PagePositions) bool

Equals returns true if `ppos` contains the same information as `epl`.

func (PagePositions) String ¶

func (ppos PagePositions) String() string

String returns a string describing PagePositions `ppos`.

type PdfMatchSet ¶

type PdfMatchSet struct {
	TotalMatches   int            // Total number of matches.
	SearchDuration time.Duration  // The time it took to perform the search.
	Matches        []PdfPageMatch // The per-page matches which may come from different PDFs.
}

PdfMatchSet is the result of a search over a PdfIndex.

func SearchPersistentPdfIndex ¶

func SearchPersistentPdfIndex(persistDir, term string, maxResults int) (PdfMatchSet, error)

SearchPersistentPdfIndex performs a bleve search on the persistent index in `persistDir`/bleve for `term` and returns up to `maxResults` matches. It maps the results to PDF page names, page numbers, line numbers and page locations using the BlevePdf that was saved in directory `persistDir` by IndexPdfReaders().

func (PdfMatchSet) Best ¶

func (p PdfMatchSet) Best() PdfMatchSet

Best returns a copy of `p` trimmed to the results with the highest score.

func (PdfMatchSet) Equals ¶

func (p PdfMatchSet) Equals(q PdfMatchSet) bool

Equals returns true if `p` contains the same results as `q`.

func (PdfMatchSet) Files ¶

func (s PdfMatchSet) Files() []string

Files returns the PDF file names in PdfMatchSet `s`. These are all the PDFs that contained at least one match of the search term.

func (PdfMatchSet) String ¶

func (s PdfMatchSet) String() string

String returns a human readable description of `s`.

type PdfPageMatch ¶

type PdfPageMatch struct {
	InPath        string   // Path of the PDF file that was matched. (A name stored in the index.)
	PageNum       uint32   // 1-offset page number of the PDF page containing the matched text.
	LineNums      []int    // 1-offset line number of the matched text within the extracted page text.
	Lines         []string // The contents of the line containing the matched text.
	PagePositions          // This is used to find the bounding box of the match text on the PDF page.
	// contains filtered or unexported fields
}

PdfPageMatch describes the search results for a PDF page returned from a search over a PDF index. It is the analog of a bleve search.DocumentMatch.

func (PdfPageMatch) String ¶

func (p PdfPageMatch) String() string

type Phrase ¶

type Phrase struct {
	// contains filtered or unexported fields
}

type Span ¶

type Span struct {
	Start uint32  // Offset of the start of the bleve match in the page.
	End   uint32  // Offset of the end of the bleve match in the page.
	Score float64 // Score for this match
}

Span gives the offsets in extracted text that span a phrase.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL