commoncrawl

package
v0.0.0-...-c0e28cb Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 3, 2024 License: MIT Imports: 21 Imported by: 0

Documentation

Overview

Package commoncrawl parses Common Crawl WAT files and saves the extracted links and pages to files, sorted and split for further processing.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func CountFilesInSegmentToProcess

func CountFilesInSegmentToProcess(segment WatSegment) int

CountFilesInSegmentToProcess - count files in segment that still need to be processed

func ExtractWatFileNumber

func ExtractWatFileNumber(filename string) (string, error)

ExtractWatFileNumber extracts the number before the .warc.wat.gz extension.

func IsCorrectArchiveFormat

func IsCorrectArchiveFormat(s string) bool

IsCorrectArchiveFormat checks if the archive name is in the correct format

func IsValidDomain

func IsValidDomain(domain string) bool

IsValidDomain - final verification of domain

func ParseWatByLine

func ParseWatByLine(filePath string, linkFile string, pageFile string, savePage bool) error

ParseWatByLine - parse wat file line by line and store links in file

func UpdateSegmentImportEnd

func UpdateSegmentImportEnd(segmentList *[]WatSegment, segmentName string) error

UpdateSegmentImportEnd - update segment import status

func UpdateSegmentImportStart

func UpdateSegmentImportStart(segmentList *[]WatSegment, segmentName string) error

UpdateSegmentImportStart - update segment import status

func UpdateSegmentLinkImportStatus

func UpdateSegmentLinkImportStatus(segmentList *[]WatSegment, segmentName string, filePath string) error

UpdateSegmentLinkImportStatus - update segment link import status

func ValidateSegmentImportEndAtStart

func ValidateSegmentImportEndAtStart(segmentList *[]WatSegment, dataDir DataDir, extensionTxtGz string)

ValidateSegmentImportEndAtStart - validate segment import status

Types

type DataDir

// DataDir holds four string fields, each mapped to a snake_case JSON key by
// its struct tag. The package docs describe them as the data directory and
// its tmp, links and pages folders.
// NOTE(review): whether the values are absolute or relative paths is not
// visible here — confirm against CreateDataDir.
type DataDir struct {
	DataDir  string `json:"data_dir"`
	TmpDir   string `json:"tmp_dir"`
	LinksDir string `json:"links_dir"`
	PagesDir string `json:"pages_dir"`
}

DataDir - Define a struct to represent a data directory, tmp, links, pages folders

func CreateDataDir

func CreateDataDir(defaultDir string) (DataDir, error)

CreateDataDir - create data directory and tmp, links, pages folders

type FileLink

// FileLink is the record for a single link as stored in a file (per the
// package docs). It carries no struct tags, so any serialization format is
// decided by code outside this declaration.
type FileLink struct {
	// URL components and anchor text of the link.
	LinkHost      string
	LinkPath      string
	LinkRawQuery  string
	LinkScheme    string
	LinkText      string
	// NoFollow and NoIndex are ints, not bools.
	// NOTE(review): presumably 0/1 flags — confirm against the writer.
	NoFollow      int
	NoIndex       int
	// Imported is a string here, unlike WatFile.Imported which is *time.Time.
	// NOTE(review): the expected date/time format is not visible — confirm.
	Imported      string
	IP            string
	// NOTE(review): presumably identifies the page the link was found on;
	// the hash algorithm and input are not visible here — confirm.
	PageHash      string
	LinkDomain    string
	LinkSubDomain string
}

FileLink - Define a struct to represent a link in file

type FilePage

// FilePage is the record for a single page as stored in a file (per the
// package docs). It carries no struct tags, so any serialization format is
// decided by code outside this declaration.
type FilePage struct {
	// URL components of the page.
	Host          string
	Path          string
	RawQuery      string
	Scheme        string
	Title         string
	IP            string
	// Imported is a string here, unlike WatFile.Imported which is *time.Time.
	// NOTE(review): the expected date/time format is not visible — confirm.
	Imported      string
	// NOTE(review): presumably counts of links found on the page — confirm.
	InternalLinks int
	ExternalLinks int
	// NoIndex is an int, not a bool.
	// NOTE(review): presumably a 0/1 flag — confirm.
	NoIndex       int
}

FilePage - Define a struct to represent a page in file

type SortFileLinkByFields

// SortFileLinkByFields is the structure used to sort links (per the package
// docs). All four fields are plain strings.
// NOTE(review): how Key relates to the other three fields, and the sort
// order applied, are not visible here — confirm against the sorting code.
type SortFileLinkByFields struct {
	Key       string
	Domain    string
	// Spelled "Subdomain" here, whereas URLRecord and FileLink use
	// "SubDomain"; the name is part of the exported API, so it is left as is.
	Subdomain string
	Path      string
}

SortFileLinkByFields - structure used to sort links

type URLRecord

// URLRecord represents a URL record (per the package docs): a URL string
// alongside its individual components, plus link text and a NoFollow value.
type URLRecord struct {
	// NOTE(review): presumably the full URL the fields below were split
	// from — confirm against the code that populates it.
	URL       string
	Scheme    string
	Host      string
	Path      string
	RawQuery  string
	Fragment  string
	Domain    string
	SubDomain string
	Text      string // optional text from link
	// NoFollow is an int, not a bool.
	// NOTE(review): presumably a 0/1 flag — confirm.
	NoFollow  int
}

URLRecord - Define a struct to represent a URL record

type WatFile

// WatFile represents a wat file (per the package docs). Each field is mapped
// to a JSON key by its struct tag.
type WatFile struct {
	// Number is kept as a string rather than an integer.
	Number   string     `json:"number"`
	Path     string     `json:"path"`
	// Imported is a pointer, so it can be nil; encoding/json writes a nil
	// pointer as null.
	// NOTE(review): presumably nil until the file has been imported — confirm.
	Imported *time.Time `json:"imported"`
}

WatFile - Define a struct to represent a wat file

type WatPage

// WatPage represents a wat page (per the package docs).
type WatPage struct {
	// The five fields below are pointers and can therefore be nil.
	// NOTE(review): presumably nil means the value was absent from the
	// record — confirm against the parser.
	IP            *string
	Imported      *string
	Title         *string
	NoIndex       *int
	NoFollow      *int
	// NOTE(review): presumably counts of links found on the page — confirm.
	InternalLinks int
	ExternalLinks int
	// URLRecord is a pointer to a single record and can be nil; Links is a
	// slice of records.
	URLRecord     *URLRecord
	Links         []URLRecord
}

WatPage - Define a struct to represent a wat page

type WatSegment

// WatSegment represents a segment (per the package docs). Each field is
// mapped to a JSON key by its struct tag.
type WatSegment struct {
	Archive       string     `json:"archive"`
	Segment       string     `json:"segment"`
	SegmentID     int        `json:"segment_id"`
	// WatFiles has no omitempty, so a nil slice is encoded as JSON null.
	WatFiles      []WatFile  `json:"wat_files"`
	// ImportStarted and ImportEnded are pointers, so they can be nil;
	// encoding/json writes a nil pointer as null.
	// NOTE(review): presumably nil until the import has started / ended —
	// confirm against UpdateSegmentImportStart and UpdateSegmentImportEnd.
	ImportStarted *time.Time `json:"import_started"`
	ImportEnded   *time.Time `json:"import_ended"`
}

WatSegment - Define a struct to represent a segment

func InitImport

func InitImport(archiveName string) ([]WatSegment, error)

InitImport - initialize import by downloading segments file and extracting segments into segmentList

func SelectSegmentByID

func SelectSegmentByID(segmentList []WatSegment, segmentID int) (WatSegment, error)

SelectSegmentByID - select segment to import by ID

func SelectSegmentToImport

func SelectSegmentToImport(segmentList []WatSegment) (WatSegment, error)

SelectSegmentToImport - select segment to import

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL