core

package module
Published: Oct 27, 2017 License: AGPL-3.0 Imports: 28 Imported by: 17

README

archive

Core Data Model definitions for archival work

archive is a Go implementation of standard data models for Data Together.

Services that import archive so far:

Notably absent from this package is the definition of a user; please see the identity service for that.

Documentation

Overview

Archive holds all common model definitions for archivers 2.0.

TODO - turn "Metadata" into github.com/datatogether/metablocks.Metablock

Index

Constants

This section is empty.

Variables

var (
	// how long before a url is considered stale. default is 72 hours.
	StaleDuration = time.Hour * 72
	// all these need to be set for file saving to work
	AwsRegion          string
	AwsAccessKeyId     string
	AwsSecretAccessKey string
	AwsS3BucketName    string
	AwsS3BucketPath    string
)

var (
	ErrNotFound        = fmt.Errorf("Not Found")
	ErrInvalidResponse = fmt.Errorf("Datastore returned an invalid response")
)
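
The Aws* variables have no defaults, so file saving won't work until they're all set. A minimal configuration sketch (the env var names here are this example's own convention; requires "os" and "time" imports):

func init() {
	AwsRegion = os.Getenv("AWS_REGION")
	AwsAccessKeyId = os.Getenv("AWS_ACCESS_KEY_ID")
	AwsSecretAccessKey = os.Getenv("AWS_SECRET_ACCESS_KEY")
	AwsS3BucketName = os.Getenv("AWS_S3_BUCKET_NAME")
	AwsS3BucketPath = os.Getenv("AWS_S3_BUCKET_PATH")
	// optionally tune the staleness window; the default is 72 hours
	StaleDuration = time.Hour * 24
}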

Functions

func CalcHash

func CalcHash(data []byte) (string, error)

CalcHash calculates the multihash key for a given slice of bytes. TODO - find a proper home for this
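
A minimal usage sketch:

data := []byte("hello, data together")
hash, err := CalcHash(data)
if err != nil {
	// handle error
}
fmt.Println(hash) // a multihash string derived from the input bytes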

func ContentUrlsCount

func ContentUrlsCount(db sqlutil.Queryable) (count int, err error)

func CountPrimers

func CountPrimers(db sqlutil.Queryable) (count int64, err error)

CountPrimers returns the total number of primers

func CountSources

func CountSources(db sqlutil.Queryable) (count int, err error)

CountSources grabs the total number of sources

func FileUrl

func FileUrl(url *Url) string

func MetadataCountByKey

func MetadataCountByKey(db sqlutil.Queryable, keyId string) (count int, err error)

func NormalizeURL

func NormalizeURL(u *url.URL) *url.URL

NormalizeURL removes inconsistencies from a given url

func NormalizeURLString

func NormalizeURLString(url string) (string, error)

NormalizeURLString removes inconsistencies from a given url string
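
For example, a sketch (the exact rules are whatever NormalizeURL applies; the output shown is illustrative):

normalized, err := NormalizeURLString("HTTP://Example.com/path")
if err != nil {
	// handle error
}
// normalized should now be in a consistent form, e.g. "http://example.com/path"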

func ValidArchivingUrl

func ValidArchivingUrl(db sqlutil.Queryable, url string) error

func WriteSnapshot

func WriteSnapshot(store datastore.Datastore, u *Url) error

WriteSnapshot creates a snapshot record in the DB from a given Url struct

Types

type Collection

type Collection struct {
	// version 4 uuid
	Id string `json:"id"`
	// Created timestamp rounded to seconds in UTC
	Created time.Time `json:"created"`
	// Updated timestamp rounded to seconds in UTC
	Updated time.Time `json:"updated"`
	// sha256 multihash of the public key that created this collection
	Creator string `json:"creator"`
	// human-readable title of the collection
	Title string `json:"title"`
	// description of the collection
	Description string `json:"description"`
	// url this collection originates from
	Url string `json:"url,omitempty"`
}

Collections are generic groupings of content. A collection can be thought of as a csv file listing content hashes in the first column, and whatever other information is necessary in subsequent columns
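
A sketch of creating & saving a collection, assuming store is a configured datastore.Datastore and the creator hash is a placeholder:

c := &Collection{
	Creator:     "<sha256-multihash-of-creator-key>", // placeholder
	Title:       "Example Records",
	Description: "a grouping of example content hashes",
}
if err := c.Save(store); err != nil {
	// handle error
}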

func CollectionsByCreator

func CollectionsByCreator(store datastore.Datastore, creator, orderby string, limit, offset int) ([]*Collection, error)

func ListCollections

func ListCollections(store datastore.Datastore, limit, offset int) ([]*Collection, error)

func (Collection) DatastoreType

func (c Collection) DatastoreType() string

func (*Collection) Delete

func (c *Collection) Delete(store datastore.Datastore) error

Delete a collection. This should only be done for erroneous additions.

func (*Collection) DeleteItems

func (c *Collection) DeleteItems(store datastore.Datastore, items []*CollectionItem) error

DeleteItems removes a given list of items from the collection

func (Collection) GetId

func (c Collection) GetId() string

func (*Collection) ItemCount

func (c *Collection) ItemCount(store datastore.Datastore) (count int, err error)

ItemCount gets the number of items in the collection

func (Collection) Key

func (c Collection) Key() datastore.Key

func (*Collection) NewSQLModel

func (c *Collection) NewSQLModel(key datastore.Key) sql_datastore.Model

func (*Collection) Read

func (c *Collection) Read(store datastore.Datastore) error

Read collection from db

func (*Collection) ReadItems

func (c *Collection) ReadItems(store datastore.Datastore, orderby string, limit, offset int) (items []*CollectionItem, err error)

ReadItems reads a bounded set of items from the collection. The orderby param currently only supports SQL-style input of a single property, eg: "index" or "index DESC"

func (*Collection) SQLParams

func (c *Collection) SQLParams(cmd sql_datastore.Cmd) []interface{}

func (Collection) SQLQuery

func (c Collection) SQLQuery(cmd sql_datastore.Cmd) string

func (*Collection) Save

func (c *Collection) Save(store datastore.Datastore) (err error)

Save a collection

func (*Collection) SaveItems

func (c *Collection) SaveItems(store datastore.Datastore, items []*CollectionItem) error

SaveItems saves a slice of items to the collection. It's up to you to ensure that the "index" param doesn't get all messed up. TODO - validate / automate the Index param?
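
A sketch, assigning Index values by hand since the package doesn't validate them yet:

items := []*CollectionItem{
	{Url: Url{Url: "https://example.com/a.csv"}, Index: 0},
	{Url: Url{Url: "https://example.com/b.csv"}, Index: 1},
}
if err := c.SaveItems(store, items); err != nil {
	// handle error
}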

func (*Collection) UnmarshalSQL

func (c *Collection) UnmarshalSQL(row sqlutil.Scannable) (err error)

UnmarshalSQL reads an sql response into the collection receiver. It expects the request to have used collectionCols() for selection.

type CollectionItem

type CollectionItem struct {
	// Collection Items are Url's at heart
	Url

	// this item's index in the collection
	Index int `json:"index"`
	// unique description of this item
	Description string `json:"description"`
	// contains filtered or unexported fields
}

CollectionItem is an item in a collection: a url with added collection-specific information. This has the effect of storing all of a collection item's "main properties" in the common list of urls

func (CollectionItem) DatastoreType

func (c CollectionItem) DatastoreType() string

DatastoreType is to satisfy sql_datastore.Model interface

func (*CollectionItem) Delete

func (c *CollectionItem) Delete(store datastore.Datastore) error

Delete a collection item

func (CollectionItem) GetId

func (c CollectionItem) GetId() string

GetId returns the Id of the collectionItem, which is the id of the underlying Url

func (CollectionItem) Key

func (c CollectionItem) Key() datastore.Key

Key is somewhat special as CollectionItems always have a Collection as their parent. This relationship is represented in directory-form: /Collection:[collection-id]/CollectionItem:[item-id]

func (*CollectionItem) NewSQLModel

func (c *CollectionItem) NewSQLModel(key datastore.Key) sql_datastore.Model

func (*CollectionItem) Read

func (c *CollectionItem) Read(store datastore.Datastore) error

Read collection item from db

func (*CollectionItem) SQLParams

func (c *CollectionItem) SQLParams(cmd sql_datastore.Cmd) []interface{}

SQLParams is to satisfy the sql_datastore.Model interface; it returns this CollectionItem's parameters for a given type of SQL command

func (CollectionItem) SQLQuery

func (c CollectionItem) SQLQuery(cmd sql_datastore.Cmd) string

SQLQuery is to satisfy the sql_datastore.Model interface, it returns the concrete query for a given type of SQL command

func (*CollectionItem) Save

func (c *CollectionItem) Save(store datastore.Datastore) (err error)

Save a collection item to a store

func (*CollectionItem) UnmarshalSQL

func (c *CollectionItem) UnmarshalSQL(row sqlutil.Scannable) (err error)

UnmarshalSQL reads an sql response into the collection item receiver. It expects the request to have used collectionCols() for selection.

type Consensus

type Consensus map[string]map[string]int

Consensus is an enumeration of Meta graph values arranged by key

func SumConsensus

func SumConsensus(subject string, blocks []*Metadata) (c Consensus, values map[string]interface{}, err error)

SumConsensus tallies the consensus around a given subject hash from a provided Metadata slice

func (Consensus) Metadata

func (c Consensus) Metadata(data map[string]interface{}) (map[string][]interface{}, error)

Metadata takes a store and gives back the actual metadata based on a provided stringMap. Any key present in the consensus that isn't found in data will write the hash value instead. The returned map should be valid for JSON encoding.
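
A sketch of tallying & resolving consensus, where blocks is a []*Metadata describing subject:

consensus, values, err := SumConsensus(subject, blocks)
if err != nil {
	// handle error
}
meta, err := consensus.Metadata(values)
if err != nil {
	// handle error
}
// meta maps each metadata key to its candidate values, ready for JSON encoding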

type CustomCrawl

type CustomCrawl struct {
	// version 4 uuid
	Id string `json:"id"`
	// Created timestamp rounded to seconds in UTC
	Created time.Time `json:"created"`
	// Updated timestamp rounded to seconds in UTC
	Updated time.Time `json:"updated"`
	// Json Web token that created this request
	Jwt string `json:"jwt"`
	// MorphRunId
	MorphRunId string `json:"morphRunId"`
	// timestamp this run was completed
	DateCompleted time.Time
	// repository for code that ran the crawl
	GithubRepo string `json:"githubRepo"`
	// OriginalUrl
	OriginalUrl string `json:"originalUrl"`
	// SqliteChecksum
	SqliteChecksum string `json:"sqliteChecksum"`
}

CustomCrawls are urls that contain content that cannot be extracted with traditional web crawling / scraping methods. This model classifies the nature of the custom crawl, setting the stage for writing custom scripts to extract the underlying content.

func ListCustomCrawls

func ListCustomCrawls(store datastore.Datastore, limit, offset int) ([]*CustomCrawl, error)

func (CustomCrawl) DatastoreType

func (CustomCrawl) DatastoreType() string

func (*CustomCrawl) Delete

func (c *CustomCrawl) Delete(store datastore.Datastore) error

Delete a custom crawl. This should only be done for erroneous additions.

func (CustomCrawl) GetId

func (c CustomCrawl) GetId() string

func (CustomCrawl) Key

func (u CustomCrawl) Key() datastore.Key

func (*CustomCrawl) NewSQLModel

func (c *CustomCrawl) NewSQLModel(key datastore.Key) sql_datastore.Model

func (*CustomCrawl) Read

func (c *CustomCrawl) Read(store datastore.Datastore) error

Read custom crawl from db

func (*CustomCrawl) SQLParams

func (c *CustomCrawl) SQLParams(cmd sql_datastore.Cmd) []interface{}

SQLParams formats a custom crawl struct for inserting / updating into postgres

func (*CustomCrawl) SQLQuery

func (c *CustomCrawl) SQLQuery(cmd sql_datastore.Cmd) string

func (*CustomCrawl) Save

func (c *CustomCrawl) Save(store datastore.Datastore) (err error)

Save a custom crawl

func (*CustomCrawl) UnmarshalSQL

func (c *CustomCrawl) UnmarshalSQL(row sqlutil.Scannable) (err error)

UnmarshalSQL reads an sql response into the custom crawl receiver. It expects the request to have used customCrawlCols() for selection.

type DataRepo

type DataRepo struct {
	// version 4 uuid
	Id string
	// Created timestamp rounded to seconds in UTC
	Created time.Time `json:"created"`
	// Updated timestamp rounded to seconds in UTC
	Updated time.Time `json:"updated"`
	// Title of this data repository
	Title string `json:"title"`
	// Human-readable description
	Description string `json:"description"`
	// Main url link to the DataRepository
	Url string `json:"url"`
}

DataRepo is a place that holds data in a structured format

func (*DataRepo) DatastoreType

func (d *DataRepo) DatastoreType() string

func (*DataRepo) Delete

func (d *DataRepo) Delete(store datastore.Datastore) error

Delete a dataRepo. This should only be done for erroneous additions.

func (*DataRepo) GetId

func (d *DataRepo) GetId() string

func (*DataRepo) Key

func (d *DataRepo) Key() datastore.Key

func (*DataRepo) NewSQLModel

func (d *DataRepo) NewSQLModel(key datastore.Key) sql_datastore.Model

func (*DataRepo) Read

func (d *DataRepo) Read(store datastore.Datastore) error

Read dataRepo from db

func (DataRepo) SQLParams

func (d DataRepo) SQLParams(cmd sql_datastore.Cmd) []interface{}

func (DataRepo) SQLQuery

func (d DataRepo) SQLQuery(cmd sql_datastore.Cmd) string

func (*DataRepo) Save

func (d *DataRepo) Save(store datastore.Datastore) (err error)

Save a dataRepo

func (*DataRepo) UnmarshalSQL

func (d *DataRepo) UnmarshalSQL(row sqlutil.Scannable) (err error)

UnmarshalSQL reads an sql response into the dataRepo receiver. It expects the request to have used dataRepoCols() for selection.

type File

type File struct {
	Url  string
	Data []byte
	Hash string
}

File is a buffered byte slice often made from a GET response body. It provides easy hash-calculation & storage to S3. TODO - deprecate; use s3-datastore, or, uh... the distributed web

func NewFileFromRes

func NewFileFromRes(url string, res *http.Response) (*File, error)

NewFileFromRes generates a new file by consuming & closing a given response body
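
A sketch of fetching a url & storing its body to S3 (the Aws* package variables must be set; requires "net/http"):

res, err := http.Get("https://example.com/data.csv")
if err != nil {
	// handle error
}
f, err := NewFileFromRes("https://example.com/data.csv", res) // consumes & closes res.Body
if err != nil {
	// handle error
}
if err := f.PutS3(); err != nil {
	// handle error
}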

func (*File) Delete

func (f *File) Delete() error

Delete a file from S3

func (*File) Filename

func (f *File) Filename() (string, error)

Filename returns the name of the file, which is its sha2-256 hash

func (*File) GetS3

func (f *File) GetS3() error

GetS3 reads a given file's hash from S3

func (*File) PutS3

func (f *File) PutS3() error

PutS3 puts the file on S3 if it doesn't already exist

type Link

type Link struct {
	// Calculated Hash for fixed ID purposes
	Hash string
	// created timestamp rounded to seconds in UTC
	Created time.Time `json:"created"`
	// updated timestamp rounded to seconds in UTC
	Updated time.Time `json:"updated"`
	// origin url of the linking document
	Src *Url `json:"src"`
	// absolute url of the <a> href property
	Dst *Url `json:"dst"`
}

A link represents an <a> tag in an html document src whose href attribute points to the url that resolves to dst. Both src & dst must be stored as urls.

func ReadDstContentLinks

func ReadDstContentLinks(db sqlutil.Queryable, src *Url) ([]*Link, error)

ReadDstContentLinks returns a list of links that specify a given url as src and are content urls

func ReadDstLinks

func ReadDstLinks(db sqlutil.Queryable, src *Url) ([]*Link, error)

ReadDstLinks returns all links that specify a given url as src

func ReadSrcLinks

func ReadSrcLinks(db sqlutil.Queryable, dst *Url) ([]*Link, error)

ReadSrcLinks returns all links that specify a given url as dst

func (*Link) DatastoreType

func (l *Link) DatastoreType() string

func (*Link) Delete

func (l *Link) Delete(store datastore.Datastore) error

func (*Link) GetId

func (l *Link) GetId() string

func (*Link) Insert

func (l *Link) Insert(store datastore.Datastore) error

func (*Link) Key

func (l *Link) Key() datastore.Key

func (*Link) NewSQLModel

func (l *Link) NewSQLModel(key datastore.Key) sql_datastore.Model

func (*Link) Read

func (l *Link) Read(store datastore.Datastore) (err error)

func (*Link) SQLParams

func (l *Link) SQLParams(cmd sql_datastore.Cmd) []interface{}

func (*Link) SQLQuery

func (l *Link) SQLQuery(cmd sql_datastore.Cmd) string

func (*Link) UnmarshalSQL

func (l *Link) UnmarshalSQL(row sqlutil.Scannable) error

func (*Link) Update

func (l *Link) Update(store datastore.Datastore) error

type Meta

type Meta struct {
	Url           string            `json:"url"`
	Date          *time.Time        `json:"date,omitempty"`
	HeadersTook   int               `json:"headersTook,omitempty"`
	Id            string            `json:"id"`
	Status        int               `json:"status"`
	ContentSniff  string            `json:"contentSniff,omitempty"`
	RawHeaders    []string          `json:"rawHeaders"`
	Headers       map[string]string `json:"headers"`
	DownloadTook  int               `json:"downloadTook,omitempty"`
	Sha256        string            `json:"sha256"`
	Multihash     string            `json:"multihash"`
	Consensus     *Consensus        `json:"consensus"`
	InboundLinks  []string          `json:"inboundLinks,omitempty"`
	OutboundLinks []string          `json:"outboundLinks,omitempty"`
}

Meta is a struct for sharing our knowledge of a url with other services

type Metadata

type Metadata struct {
	// Hash is the sha256 multihash of all other fields in metadata
	// as expressed by Metadata.HashableBytes()
	Hash string `json:"hash"`
	// Creation timestamp
	Timestamp time.Time `json:"timestamp"`
	// Sha256 multihash of the public key that signed this metadata
	KeyId string `json:"keyId"`
	// Sha256 multihash of the content this metadata is describing
	Subject string `json:"subject"`
	// Hash value of the metadata that came before this, if any
	Prev string `json:"prev"`
	// Actual metadata, a valid json Object
	Meta map[string]interface{} `json:"meta"`
}

Metadata is a record of structured data describing a subject hash. There can be many metadata entries for a given subject.

func LatestMetadata

func LatestMetadata(db sqlutil.Queryable, keyId, subject string) (m *Metadata, err error)

LatestMetadata gives the most recent metadata for a given keyId & subject combination, if one exists

func MetadataByKey

func MetadataByKey(db sqlutil.Queryable, keyId string, limit, offset int) ([]*Metadata, error)

func MetadataBySubject

func MetadataBySubject(db sqlutil.Queryable, subject string) ([]*Metadata, error)

MetadataBySubject returns all metadata for a given subject hash

func NextMetadata

func NextMetadata(db sqlutil.Queryable, keyId, subject string) (*Metadata, error)

NextMetadata returns the next metadata block for a given subject. If no metablock exists a new one is created
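
A sketch of the write flow, assuming db and store are configured and keyId / subject are sha256 multihash strings:

m, err := NextMetadata(db, keyId, subject)
if err != nil {
	// handle error
}
m.Meta = map[string]interface{}{
	"title": "an example title",
}
if err := m.Write(store); err != nil {
	// handle error
}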

func (Metadata) DatastoreType

func (m Metadata) DatastoreType() string

func (Metadata) GetId

func (m Metadata) GetId() string

func (*Metadata) HashMaps

func (m *Metadata) HashMaps() (keyMap map[string]string, valueMap map[string]interface{}, err error)

TODO - this is ripped from metablocks

func (*Metadata) HashableBytes

func (m *Metadata) HashableBytes() ([]byte, error)

HashableBytes returns the exact structure to be used for hash

func (Metadata) Key

func (m Metadata) Key() datastore.Key

func (Metadata) String

func (m Metadata) String() string

String is metadata's abbreviated string representation

func (*Metadata) UnmarshalSQL

func (m *Metadata) UnmarshalSQL(row sqlutil.Scannable) error

UnmarshalSQL reads an SQL result into the metadata receiver

func (*Metadata) Write

func (m *Metadata) Write(store datastore.Datastore) error

Write creates a metadata record in the DB

type Primer

type Primer struct {
	// version 4 uuid
	Id string `json:"id"`
	// Created timestamp rounded to seconds in UTC
	Created time.Time `json:"created"`
	// Updated timestamp rounded to seconds in UTC
	Updated time.Time `json:"updated"`
	// shortest possible expression of this primer's name, usually an acronym
	// called shortTitle b/c acronyms collide often & users should feel free to
	// expand on acronyms
	ShortTitle string `json:"shortTitle"`
	// human-readable title of this primer.
	Title string `json:"title"`
	// long-form description of this primer.
	// TODO - Maybe we should store this in markdown format?
	Description string `json:"description"`
	// parent primer (if any)
	Parent *Primer `json:"parent"`
	// child-primers list
	SubPrimers []*Primer `json:"subPrimers,omitempty"`
	// metadata to associate with this primer
	Meta map[string]interface{} `json:"meta"`
	// statistics about this primer
	Stats *PrimerStats `json:"stats"`
	// collection of child sources
	Sources []*Source `json:"sources,omitempty"`
}

Primer is tracking information about an abstract group of content. For example, a government agency is a primer.

func BasePrimers

func BasePrimers(db sqlutil.Queryable, limit, offset int) (primers []*Primer, err error)

BasePrimers lists primers that have no parent

func ListPrimers

func ListPrimers(store datastore.Datastore, limit, offset int) ([]*Primer, error)

ListPrimers lists primers from the store, paginated

func UnmarshalBoundedPrimers

func UnmarshalBoundedPrimers(rows *sql.Rows, limit int) (primers []*Primer, err error)

UnmarshalBoundedPrimers turns sql.Rows into primers, expecting len(rows) <= limit

func (*Primer) CalcStats

func (p *Primer) CalcStats(db *sql.DB) error

func (Primer) DatastoreType

func (p Primer) DatastoreType() string

func (*Primer) Delete

func (p *Primer) Delete(store datastore.Datastore) error

func (Primer) GetId

func (p Primer) GetId() string

func (Primer) Key

func (p Primer) Key() datastore.Key

func (*Primer) NewSQLModel

func (p *Primer) NewSQLModel(key datastore.Key) sql_datastore.Model

func (*Primer) Read

func (p *Primer) Read(store datastore.Datastore) error

func (*Primer) ReadSources

func (p *Primer) ReadSources(db sqlutil.Queryable) error

ReadSources reads child sources of this primer

func (*Primer) ReadSubPrimers

func (p *Primer) ReadSubPrimers(db sqlutil.Queryable) error

ReadSubPrimers reads child primers of this primer

func (*Primer) SQLParams

func (p *Primer) SQLParams(cmd sql_datastore.Cmd) []interface{}

func (*Primer) SQLQuery

func (p *Primer) SQLQuery(cmd sql_datastore.Cmd) string

func (*Primer) Save

func (p *Primer) Save(store datastore.Datastore) (err error)

func (*Primer) UnmarshalSQL

func (p *Primer) UnmarshalSQL(row sqlutil.Scannable) error

type PrimerStats

type PrimerStats struct {
	UrlCount                int `json:"urlCount"`
	ArchivedUrlCount        int `json:"archivedUrlCount"`
	ContentUrlCount         int `json:"contentUrlCount"`
	ContentMetadataCount    int `json:"contentMetadataCount"`
	SourcesUrlCount         int `json:"sourcesUrlCount"`
	SourcesArchivedUrlCount int `json:"sourcesArchivedUrlCount"`
}

TODO - finish

type Snapshot

type Snapshot struct {
	// The url that was requested
	Url string `json:"url"`
	// Time this request was issued
	Created time.Time `json:"date"`
	// Returned Status
	Status int `json:"status,omitempty"`
	// Time to complete response in milliseconds
	Duration int64 `json:"downloadTook,omitempty"`
	// Record of all returned headers in [key,value,key,value...]
	Headers []string `json:"headers,omitempty"`
	// Multihash of response body (if any)
	Hash string `json:"hash,omitempty"`
}

A snapshot is a record of a GET request to a url. There can be many snapshots of a given url.

func SnapshotsForUrl

func SnapshotsForUrl(db sqlutil.Queryable, url string) ([]*Snapshot, error)

SnapshotsForUrl returns all snapshots for a given url string

func (*Snapshot) UnmarshalSQL

func (s *Snapshot) UnmarshalSQL(row sqlutil.Scannable) error

UnmarshalSQL reads an SQL result into the snapshot receiver

type Source

type Source struct {
	// version 4 uuid
	Id string `json:"id"`
	// Created timestamp rounded to seconds in UTC
	Created time.Time `json:"created"`
	// Updated timestamp rounded to seconds in UTC
	Updated time.Time `json:"updated"`
	// human-readable title for this source
	Title string `json:"title"`
	// description of the source, ideally one paragraph
	Description string `json:"description"`
	// absolute url to serve as the root of the
	Url string `json:"url"`
	// primer this source is connected to
	Primer *Primer `json:"primer"`
	// whether or not this url should be crawled by a web crawler
	Crawl bool `json:"crawl"`
	// amount of time before a link within this tree is considered in need
	// of re-checking for changes. currently not in use, but planned.
	StaleDuration time.Duration `json:"staleDuration"`
	// yeah this'll probably get deprecated. Part of a half-baked alerts feature idea.
	LastAlertSent *time.Time `json:"lastAlertSent"`
	// Metadata associated with this source that should be added to all
	// child urls, currently not in use, but planned
	Meta map[string]interface{} `json:"meta"`
	// Stats about this source
	Stats *SourceStats `json:"stats"`
}

Source is a concrete handle for archiving. Crawlers use a source's url as the base of a link tree. Sources are connected to a parent Primer to provide context & organization.

func CrawlingSources

func CrawlingSources(db sqlutil.Queryable, limit, offset int) ([]*Source, error)

CrawlingSources lists sources with crawling = true, paginated

func ListSources

func ListSources(store datastore.Datastore, limit, offset int) ([]*Source, error)

ListSources lists all sources from most to least recent, paginated

func UnmarshalBoundedSources

func UnmarshalBoundedSources(rows *sql.Rows, limit int) ([]*Source, error)

UnmarshalBoundedSources turns a standard sql.Rows of Source results into a *Source slice

func (*Source) AsUrl

func (c *Source) AsUrl(db *sql.DB) (*Url, error)

AsUrl retrieves the url that corresponds to this source's url. If one doesn't exist, a new url is created & saved.

func (*Source) CalcStats

func (s *Source) CalcStats(db *sql.DB) error

func (Source) DatastoreType

func (s Source) DatastoreType() string

func (*Source) Delete

func (s *Source) Delete(store datastore.Datastore) error

func (*Source) DescribedContent

func (s *Source) DescribedContent(db sqlutil.Queryable, limit, offset int) ([]*Url, error)

DescribedContent returns a list of content-urls from this source that need work. TODO - this currently doesn't check the status of metadata; that still needs doing.

func (Source) GetId

func (s Source) GetId() string

func (Source) Key

func (s Source) Key() datastore.Key

func (*Source) MatchesUrl

func (s *Source) MatchesUrl(rawurl string) bool

MatchesUrl checks to see if the url pattern of Source is contained within the passed-in url string. TODO - make this more sophisticated, checking against the beginning of the url to avoid accidental matches, or urls in query params matching within rawurl
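
For example (illustrative, given the containment check described above):

s := &Source{Url: "https://www.epa.gov"}
s.MatchesUrl("https://www.epa.gov/datasets/page.html") // true
s.MatchesUrl("https://example.com/unrelated")          // false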

func (*Source) NewSQLModel

func (s *Source) NewSQLModel(key datastore.Key) sql_datastore.Model

func (*Source) Read

func (s *Source) Read(store datastore.Datastore) error

func (*Source) SQLParams

func (s *Source) SQLParams(cmd sql_datastore.Cmd) []interface{}

func (*Source) SQLQuery

func (s *Source) SQLQuery(cmd sql_datastore.Cmd) string

func (*Source) Save

func (s *Source) Save(store datastore.Datastore) (err error)

func (*Source) UndescribedContent

func (s *Source) UndescribedContent(db sqlutil.Queryable, limit, offset int) ([]*Url, error)

UndescribedContent returns a list of content-urls from this source that need work. TODO - this currently doesn't check the status of metadata; that still needs doing.

func (*Source) UnmarshalSQL

func (c *Source) UnmarshalSQL(row sqlutil.Scannable) error

type SourceStats

type SourceStats struct {
	UrlCount             int `json:"urlCount"`
	ArchivedUrlCount     int `json:"archivedUrlCount"`
	ContentUrlCount      int `json:"contentUrlCount"`
	ContentMetadataCount int `json:"contentMetadataCount"`
}

type Uncrawlable

type Uncrawlable struct {
	// version 4 uuid
	Id string `json:"id"`
	// url from urls table, must be unique
	Url string `json:"url"`
	// Created timestamp rounded to seconds in UTC
	Created time.Time `json:"created"`
	// Updated timestamp rounded to seconds in UTC
	Updated time.Time `json:"updated"`
	// sha256 multihash of the public key that created this uncrawlable
	Creator string `json:"creator"`
	// name of person making submission
	Name string `json:"name"`
	// email address of person making submission
	Email string `json:"email"`
	// name of data rescue event where uncrawlable was added
	EventName string `json:"eventName"`
	// agency name
	Agency string `json:"agency"`
	// EDGI agency Id
	AgencyId string `json:"agencyId"`
	// EDGI subagency Id
	SubagencyId string `json:"subagencyId"`
	// EDGI organization Id
	OrgId string `json:"orgId"`
	// EDGI Suborganization Id
	SuborgId string `json:"suborgId"`
	// EDGI subprimer Id
	SubprimerId string `json:"subprimerId"`
	// flag for ftp content
	Ftp bool `json:"ftp"`
	// flag for 'database'
	// TODO - refine this?
	Database bool `json:"database"`
	// flag for visualization / interactive content
	// obfuscating data
	Interactive bool `json:"interactive"`
	// flag for a page that links to many files
	ManyFiles bool `json:"manyFiles"`
	// uncrawlable comments
	Comments string `json:"comments"`
}

Uncrawlables are urls that contain content that cannot be extracted with traditional web crawling / scraping methods. This model classifies the nature of the uncrawlable, setting the stage for writing custom scripts to extract the underlying content.

func ListUncrawlables

func ListUncrawlables(store datastore.Datastore, limit, offset int) ([]*Uncrawlable, error)

func (Uncrawlable) DatastoreType

func (u Uncrawlable) DatastoreType() string

func (*Uncrawlable) Delete

func (u *Uncrawlable) Delete(store datastore.Datastore) error

Delete an uncrawlable. This should only be done for erroneous additions.

func (Uncrawlable) GetId

func (u Uncrawlable) GetId() string

func (Uncrawlable) Key

func (u Uncrawlable) Key() datastore.Key

func (*Uncrawlable) NewSQLModel

func (u *Uncrawlable) NewSQLModel(key datastore.Key) sql_datastore.Model

func (*Uncrawlable) Read

func (u *Uncrawlable) Read(store datastore.Datastore) error

Read uncrawlable from db

func (*Uncrawlable) SQLParams

func (u *Uncrawlable) SQLParams(cmd sql_datastore.Cmd) []interface{}

SQLParams formats an uncrawlable struct for inserting / updating into postgres

func (*Uncrawlable) SQLQuery

func (u *Uncrawlable) SQLQuery(cmd sql_datastore.Cmd) string

func (*Uncrawlable) Save

func (u *Uncrawlable) Save(store datastore.Datastore) (err error)

Save an uncrawlable

func (*Uncrawlable) UnmarshalSQL

func (u *Uncrawlable) UnmarshalSQL(row sqlutil.Scannable) (err error)

UnmarshalSQL reads an sql response into the uncrawlable receiver. It expects the request to have used uncrawlableCols() for selection.

type Url

type Url struct {
	// version 4 uuid
	// urls can/should/must also be uniquely identified by Url
	Id string `json:"id,omitempty"`
	// A Url is uniquely identified by URI string without
	// any normalization. Url strings must always be absolute.
	Url string `json:"url"`
	// Created timestamp rounded to seconds in UTC
	Created time.Time `json:"created,omitempty"`
	// Updated timestamp rounded to seconds in UTC
	Updated time.Time `json:"updated,omitempty"`

	// Timestamp for most recent GET request
	LastGet *time.Time `json:"lastGet,omitempty"`
	// Timestamp for most recent HEAD request
	LastHead *time.Time `json:"lastHead,omitempty"`

	// Returned HTTP status code
	Status int `json:"status,omitempty"`
	// Returned HTTP 'Content-Type' header
	ContentType string `json:"contentType,omitempty"`
	// Result of mime sniffing to GET response body, as detailed at https://mimesniff.spec.whatwg.org
	ContentSniff string `json:"contentSniff,omitempty"`
	// ContentLength in bytes, will be the header value if only a HEAD request has been issued
	// After a valid GET response, it will be set to the length of the returned response
	ContentLength int64 `json:"contentLength,omitempty"`

	// best guess at a filename based on url string analysis
	// if you just want to know what type of file this is, this is the field to use.
	FileName string `json:"fileName,omitempty"`

	// HTML Title tag attribute
	Title string `json:"title,omitempty"`

	// Time remote server took to transfer content in milliseconds.
	// TODO - currently not implemented
	DownloadTook int `json:"downloadTook,omitempty"`
	// Time taken to receive response headers, in milliseconds. Currently not implemented
	HeadersTook int `json:"headersTook,omitempty"`

	// key-value slice of returned headers from most recent HEAD or GET request
	// stored in the form [key,value,key,value...]
	Headers []string `json:"headers,omitempty"`
	// any associative metadata
	Meta map[string]interface{} `json:"meta,omitempty"`

	// Hash is a multihash sha-256 of res.Body
	Hash string `json:"hash,omitempty"`

	// Url to saved content
	ContentUrl string `json:"contentUrl,omitempty"`

	// Uncrawlable information
	Uncrawlable *Uncrawlable `json:"uncrawlable,omitempty"`
}

URL represents... a url. TODO - consider renaming to Resource

func ContentUrls

func ContentUrls(db sqlutil.Queryable, limit, skip int) ([]*Url, error)

func FetchedUrls

func FetchedUrls(db sqlutil.Queryable, limit, offset int) ([]*Url, error)

func ListUrls

func ListUrls(store datastore.Datastore, limit, offset int) ([]*Url, error)

func Search

func Search(db sqlutil.Queryable, q string, limit, offset int) ([]*Url, error)

func UnfetchedUrls

func UnfetchedUrls(db sqlutil.Queryable, limit, offset int) ([]*Url, error)

func UnmarshalBoundedUrls

func UnmarshalBoundedUrls(rows *sql.Rows, limit int) ([]*Url, error)

func UnmarshalUrls

func UnmarshalUrls(rows *sql.Rows) ([]*Url, error)

UnmarshalUrls takes an sql cursor & returns a slice of url pointers. Expects columns to match urlCols().

func UrlsForHash

func UrlsForHash(db sqlutil.Queryable, hash string) ([]*Url, error)

func (Url) DatastoreType

func (u Url) DatastoreType() string

func (*Url) Delete

func (u *Url) Delete(store datastore.Datastore) error

Delete a url. This should only be done for erroneous additions.

func (*Url) ExtractDocLinks

func (u *Url) ExtractDocLinks(store datastore.Datastore, doc *goquery.Document) ([]*Link, error)

ExtractDocLinks extracts & stores a page's linked documents by selecting all a[href] links from a given goquery document, using the receiver *Url as the base
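
A sketch, assuming body holds HTML fetched from the receiver url (requires "bytes" and goquery imports):

doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
if err != nil {
	// handle error
}
links, err := u.ExtractDocLinks(store, doc)
if err != nil {
	// handle error
}
// each returned *Link has u as its Src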

func (*Url) File

func (u *Url) File() (*File, error)

File leverages a url's hash to generate a file that can have its bytes read back

func (*Url) Get

func (u *Url) Get(store datastore.Datastore) (body []byte, links []*Link, err error)

Issue a GET request to this URL if it's eligible for one

func (Url) GetId

func (u Url) GetId() string

func (*Url) HandleGetResponse

func (u *Url) HandleGetResponse(store datastore.Datastore, res *http.Response) (body []byte, links []*Link, err error)

HandleGetResponse performs all necessary actions in response to a GET request, regardless of whether it came from a crawl or archive request

func (*Url) HeadersMap

func (u *Url) HeadersMap() (headers map[string]string)

HeadersMap formats u.Headers (a string slice) as a map[header]value
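
For example:

u := &Url{Headers: []string{"Content-Type", "text/html", "Server", "nginx"}}
headers := u.HeadersMap()
// headers["Content-Type"] == "text/html"
// headers["Server"] == "nginx"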

func (*Url) InboundLinks

func (u *Url) InboundLinks(db sqlutil.Queryable) ([]string, error)

InboundLinks returns a slice of url strings that link to this url

func (Url) Key

func (u Url) Key() datastore.Key

func (*Url) NewSQLModel

func (u *Url) NewSQLModel(key datastore.Key) sql_datastore.Model

func (*Url) OutboundLinks

func (u *Url) OutboundLinks(db sqlutil.Queryable) ([]string, error)

OutboundLinks returns a slice of url strings that this url links to

func (*Url) ParsedUrl

func (u *Url) ParsedUrl() (*url.URL, error)

ParsedUrl is a convenience wrapper around url.Parse

func (*Url) Read

func (u *Url) Read(store datastore.Datastore) error

Read url from db

func (*Url) SQLParams

func (u *Url) SQLParams(cmd sql_datastore.Cmd) []interface{}

SQLParams formats a url struct for inserting / updating into postgres

func (*Url) SQLQuery

func (u *Url) SQLQuery(cmd sql_datastore.Cmd) string

func (*Url) Save

func (u *Url) Save(store datastore.Datastore) (err error)

func (*Url) ShouldEnqueueGet

func (u *Url) ShouldEnqueueGet() bool

ShouldEnqueueGet returns whether the url can be added to the queue for a GET request. Keep in mind only urls whose domains are marked crawl: true in the domains list will be candidates for GET requests. It should return true if: * the url is of http / https scheme * the url has never been GET'd, or hasn't been GET'd for a period longer than the stale duration
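
The staleness half of that rule amounts to something like this sketch (illustrative; the real check lives inside ShouldEnqueueGet):

stale := u.LastGet == nil || time.Since(*u.LastGet) > StaleDuration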

func (*Url) ShouldEnqueueHead

func (u *Url) ShouldEnqueueHead() bool

ShouldEnqueueHead returns whether the url can be added to the queue for a HEAD request. It should return true if: * the url is of http / https scheme * the url has never been HEAD'd, or hasn't been HEAD'd for a period longer than the stale duration

func (*Url) ShouldPutS3

func (u *Url) ShouldPutS3() bool

ShouldPutS3 is a chance to override whether the content should be stored

func (*Url) SuspectedContentUrl

func (u *Url) SuspectedContentUrl() bool

SuspectedContentUrl examines the url string, returns true if there's a reasonable chance the url leads to content

func (*Url) UnmarshalSQL

func (u *Url) UnmarshalSQL(row sqlutil.Scannable) (err error)

UnmarshalSQL reads an sql response into the url receiver. It expects the request to have used urlCols() for selection.

func (*Url) WarcRequest

func (u *Url) WarcRequest() *warc.Request
