ia

package module
v0.0.0-...-ed0e677 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 9, 2021 License: MPL-2.0 Imports: 20 Imported by: 0

README

ia — Internet Archive library in Go

ia is a library for querying, downloading, and validating files from the Internet Archive.

License

This project is made available under the Mozilla Public License, v. 2.0.

Documentation

Overview

Package ia contains utilities for working with files from the Internet Archive.

Index

Constants

View Source
const TimestampFormat = "20060102150405"

Variables

This section is empty.

Functions

func DecodeDigest

func DecodeDigest(digest string) (*[20]byte, error)

DecodeDigest decodes a base32-encoded SHA-1 digest.

func DownloadFile

func DownloadFile(url, filename string) error

func DownloadFileChecked

func DownloadFileChecked(url, filename string, sha1Sum []byte) error

func DownloadTorrents

func DownloadTorrents(ids []string, dir string) error

DownloadTorrents downloads the named Internet Archive items via torrent.

func GetTimemap

func GetTimemap(pageURL string, options *TimemapOptions) ([][]string, error)

GetTimemap gets a list of Internet Archive captures of the given URL.

func NewReadValidator

func NewReadValidator(r io.Reader, name string, md5Sum, sha1Sum, crc32Sum []byte) io.Reader

func PageURL

func PageURL(url, timestamp string) string

func Save

func Save(pageURL string, options *SaveOptions) error

func Search

func Search(query string) ([]string, error)

Search queries the Internet Archive for the identifiers of all matching items.

func Validate

func Validate(dir string) error

func ValidateFile

func ValidateFile(filename string, md5Sum, sha1Sum, crc32Sum []byte) error

Types

type FileMeta

// FileMeta describes a single file entry as listed in the *_files.xml file
// in the root of an item. Each field is bound to the XML attribute or child
// element named in its struct tag.
type FileMeta struct {
	Name     string          `xml:"name,attr"`   // filename, relative to root
	Source   string          `xml:"source,attr"` // "original", "metadata", or "derivative"
	Format   string          `xml:"format"`      // e.g., "Text", "Metadata", "Unknown"
	Original string          `xml:"original"` // NOTE(review): presumably the name of the file a derivative was produced from — confirm
	BTIH     jsonutil.Hex    `xml:"btih"` // BitTorrent info-hash
	ModTime  timefmt.UnixSec `xml:"mtime"` // modification time; NOTE(review): type name suggests Unix seconds — confirm
	Size     int64           `xml:"size"` // NOTE(review): presumably the file size in bytes — confirm
	// MD5, CRC32, and SHA1 carry the checksums recorded for the file.
	// NOTE(review): jsonutil.Hex suggests they are hex-encoded in the XML — confirm.
	MD5      jsonutil.Hex    `xml:"md5"`
	CRC32    jsonutil.Hex    `xml:"crc32"`
	SHA1     jsonutil.Hex    `xml:"sha1"`
	Length   float64         `xml:"length"` // audio duration
	Height   int             `xml:"height"` // image height
	Width    int             `xml:"width"`  // image width
	Private  bool            `xml:"private"` // NOTE(review): meaning not shown here; presumably marks files that are not publicly downloadable — confirm
}

FileMeta contains file metadata listed in the *_files.xml file in the root of an item. This file is not included in torrent downloads.

func ReadFileMeta

func ReadFileMeta(dir string) ([]FileMeta, error)

func (*FileMeta) OpenValidator

func (fm *FileMeta) OpenValidator(dir string) (io.ReadCloser, error)

func (*FileMeta) Validator

func (fm *FileMeta) Validator(r io.Reader) io.Reader

type ItemMeta

// ItemMeta holds item-level metadata from the *_meta.xml file in the root of
// an item. Each field is bound to the XML child element named in its struct
// tag.
type ItemMeta struct {
	Identifier     string   `xml:"identifier"`
	Collections    []string `xml:"collection"` // one entry per repeated <collection> element
	Description    string   `xml:"description"`
	Mediatype      string   `xml:"mediatype"` // e.g., "software"
	Subject        string   `xml:"subject"`
	Title          string   `xml:"title"`
	Uploader       string   `xml:"uploader"`
	Publicdate     string   `xml:"publicdate"` // "2006-01-02 15:04:05" format
	Addeddate      string   `xml:"addeddate"`  // "2006-01-02 15:04:05" format
	Curation       string   `xml:"curation"`
	BackupLocation string   `xml:"backup_location"` // removed from meta in April 2020
}

ItemMeta contains item metadata in the *_meta.xml file in the root of an item.

func ReadItemMeta

func ReadItemMeta(dir string) (*ItemMeta, error)

type SaveOptions

// SaveOptions holds the optional flags passed to Save alongside the page URL.
// NOTE(review): how each flag is transmitted to the Internet Archive is not
// visible here — confirm against the Save implementation.
type SaveOptions struct {
	CaptureOutlinks    bool
	CaptureAll         bool // save error pages (HTTP status 400-599)
	CaptureScreenshot  bool
	SaveInMyWebArchive bool
	EmailResult        bool
}

type TimemapOptions

// TimemapOptions contains options for a timemap API call, as accepted by
// GetTimemap.
type TimemapOptions struct {
	MatchPrefix bool     // whether url is a prefix (* wildcard is appended)
	Collapse    string   // field to collapse by; the earliest capture for each unique value of the field is kept
	Fields      []string // e.g., urlkey,timestamp,endtimestamp,original,mimetype,statuscode,digest,redirect,robotflags,length,offset,filename,groupcount,uniqcount
	Limit       int      // e.g., 100000
}

TimemapOptions contains options for a timemap API call.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL