cmds

package
v0.0.0-...-71b52ef Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 29, 2024 License: MIT Imports: 27 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func BatchPdfToPng

func BatchPdfToPng(pdfDir, imageDir string) error

func CleanupImages

func CleanupImages(commonDirs *config.CommonDirs) model.CliFunc

func DownloadPdfsCmd

func DownloadPdfsCmd(commonDirs *config.CommonDirs) model.CliFunc

func DownloadUrlsCmd

func DownloadUrlsCmd(commonDirs *config.CommonDirs) model.CliFunc

func OcrImages

func OcrImages(commonDirs *config.CommonDirs) model.CliFunc

OcrImages — outstanding TODOs:
TODO 1. Reuse the client; there is no need to re-create it. Just call SetImage for each image.
TODO 2. Use error groups (golang.org/x/sync/errgroup) instead of sending errors from goroutines to the main goroutine.
TODO 3. Batch or async I/O operations: batch the opening/closing of files, or do it asynchronously.
TODO 4. Use buffered channels for the done and failed channels.
TODO 5. Don't double-loop over the images.
TODO 6. Logging in a tight loop can be a performance hit. Aggregate logs and emit them less frequently, probably one log per n files completed.
TODO 7. Write files in parallel.

func PdfToPng

func PdfToPng(commonDirs *config.CommonDirs) model.CliFunc

PdfToPng converts PDF files to PNG files

func UpdateBucketItemIndex

func UpdateBucketItemIndex(commonDirs *config.CommonDirs) model.CliFunc

func UpdateFolders

func UpdateFolders(commonDirs *config.CommonDirs) model.CliFunc

func UploadPdfs

func UploadPdfs(commonDirs *config.CommonDirs) model.CliFunc

func WriteImage

func WriteImage(result ConversionResult) error

Types

type ConversionResult

type ConversionResult struct {
	Image     image.Image
	ImageName string
	ImageDir  string
}

type ImageExtractor

type ImageExtractor struct {
	TessClient *TessClient
	ImagePath  string
	CsvPath    string
	CommonDirs *config.CommonDirs
}

func NewImageExtractor

func NewImageExtractor(tessClient *TessClient, imagePath string, commonDirs *config.CommonDirs) *ImageExtractor

func (*ImageExtractor) CsvExists

func (i *ImageExtractor) CsvExists() (bool, error)

func (*ImageExtractor) ExtractIfNotExists

func (i *ImageExtractor) ExtractIfNotExists() ([]*model.OcrResult, error)

func (*ImageExtractor) WriteResults

func (i *ImageExtractor) WriteResults(results []*model.OcrResult) error

type ImageIterator

type ImageIterator struct {
	BaseDir       string
	SubDirs       []fs.DirEntry
	CurrentDir    string
	CurrentImages []fs.DirEntry
	CurrentIdx    int
	SubDirIdx     int
}

func NewImageIterator

func NewImageIterator(baseDir string) (*ImageIterator, error)

func (*ImageIterator) GetNext

func (it *ImageIterator) GetNext() string

func (*ImageIterator) HasNext

func (it *ImageIterator) HasNext() bool

type NestedIterator

type NestedIterator interface {
	HasNext() bool
	GetNext() string
}

type PdfConverter

type PdfConverter struct {
	PdfPath      string
	CommonDirs   *config.CommonDirs
	BaseFileName string
	ImageDir     string
}

func NewPdfConverter

func NewPdfConverter(pdfPath string, commonDirs *config.CommonDirs) *PdfConverter

func (*PdfConverter) ConvertIfNotPresent

func (p *PdfConverter) ConvertIfNotPresent(extension string) error

func (*PdfConverter) CreateImageDir

func (p *PdfConverter) CreateImageDir() error

func (*PdfConverter) CreateImageFile

func (p *PdfConverter) CreateImageFile(pageNumber int, extension string) (*os.File, error)

func (*PdfConverter) GetImageName

func (p *PdfConverter) GetImageName(pageNumber int, extension string) string

func (*PdfConverter) ImageDirExists

func (p *PdfConverter) ImageDirExists() (bool, error)

type PdfConverterV2

type PdfConverterV2 struct {
	PdfPath      string
	BaseFileName string
	ImageDir     string
}

func NewPdfConverterV2

func NewPdfConverterV2(pdfPath, imageDir string) *PdfConverterV2

NewPdfConverterV2 TODO: use a flat file structure instead of the nested structure. Images will be organized as {name}-{page no}.png, so they'll maintain order.

func (*PdfConverterV2) ConvertPagesToImages

func (p *PdfConverterV2) ConvertPagesToImages(extension string) ([]ConversionResult, error)

func (*PdfConverterV2) GetImageName

func (p *PdfConverterV2) GetImageName(pageNumber int, extension string) string

type TessClient

type TessClient struct {
	TessDataPrefix string
	Language       string
	ImagePath      string
	Client         *gosseract.Client
}

func NewTessClientDefault

func NewTessClientDefault() (*TessClient, error)

func (*TessClient) ExtractImageToResults

func (t *TessClient) ExtractImageToResults(imagePath string) ([]*model.OcrResult, error)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL