Documentation ¶
Overview ¶
Archive holds all common model definitions for archivers 2.0.
TODO - turn "Metadata" into github.com/datatogether/metablocks.Metablock
Index ¶
- Variables
- func CalcHash(data []byte) (string, error)
- func ContentUrlsCount(db sqlutil.Queryable) (count int, err error)
- func CountPrimers(db sqlutil.Queryable) (count int64, err error)
- func CountSources(db sqlutil.Queryable) (count int, err error)
- func FileUrl(url *Url) string
- func MetadataCountByKey(db sqlutil.Queryable, keyId string) (count int, err error)
- func NormalizeURL(u *url.URL) *url.URL
- func NormalizeURLString(url string) (string, error)
- func ValidArchivingUrl(db sqlutil.Queryable, url string) error
- func WriteSnapshot(store datastore.Datastore, u *Url) error
- type Collection
- func (c Collection) DatastoreType() string
- func (c *Collection) Delete(store datastore.Datastore) error
- func (c *Collection) DeleteItems(store datastore.Datastore, items []*CollectionItem) error
- func (c Collection) GetId() string
- func (c *Collection) ItemCount(store datastore.Datastore) (count int, err error)
- func (c Collection) Key() datastore.Key
- func (c *Collection) NewSQLModel(key datastore.Key) sql_datastore.Model
- func (c *Collection) Read(store datastore.Datastore) error
- func (c *Collection) ReadItems(store datastore.Datastore, orderby string, limit, offset int) (items []*CollectionItem, err error)
- func (c *Collection) SQLParams(cmd sql_datastore.Cmd) []interface{}
- func (c Collection) SQLQuery(cmd sql_datastore.Cmd) string
- func (c *Collection) Save(store datastore.Datastore) (err error)
- func (c *Collection) SaveItems(store datastore.Datastore, items []*CollectionItem) error
- func (c *Collection) UnmarshalSQL(row sqlutil.Scannable) (err error)
- type CollectionItem
- func (c CollectionItem) DatastoreType() string
- func (c *CollectionItem) Delete(store datastore.Datastore) error
- func (c CollectionItem) GetId() string
- func (c CollectionItem) Key() datastore.Key
- func (c *CollectionItem) NewSQLModel(key datastore.Key) sql_datastore.Model
- func (c *CollectionItem) Read(store datastore.Datastore) error
- func (c *CollectionItem) SQLParams(cmd sql_datastore.Cmd) []interface{}
- func (c CollectionItem) SQLQuery(cmd sql_datastore.Cmd) string
- func (c *CollectionItem) Save(store datastore.Datastore) (err error)
- func (c *CollectionItem) UnmarshalSQL(row sqlutil.Scannable) (err error)
- type Consensus
- type CustomCrawl
- func (CustomCrawl) DatastoreType() string
- func (c *CustomCrawl) Delete(store datastore.Datastore) error
- func (c CustomCrawl) GetId() string
- func (u CustomCrawl) Key() datastore.Key
- func (c *CustomCrawl) NewSQLModel(key datastore.Key) sql_datastore.Model
- func (c *CustomCrawl) Read(store datastore.Datastore) error
- func (c *CustomCrawl) SQLParams(cmd sql_datastore.Cmd) []interface{}
- func (c *CustomCrawl) SQLQuery(cmd sql_datastore.Cmd) string
- func (c *CustomCrawl) Save(store datastore.Datastore) (err error)
- func (c *CustomCrawl) UnmarshalSQL(row sqlutil.Scannable) (err error)
- type DataRepo
- func (d *DataRepo) DatastoreType() string
- func (d *DataRepo) Delete(store datastore.Datastore) error
- func (d *DataRepo) GetId() string
- func (d *DataRepo) Key() datastore.Key
- func (d *DataRepo) NewSQLModel(key datastore.Key) sql_datastore.Model
- func (d *DataRepo) Read(store datastore.Datastore) error
- func (d DataRepo) SQLParams(cmd sql_datastore.Cmd) []interface{}
- func (d DataRepo) SQLQuery(cmd sql_datastore.Cmd) string
- func (d *DataRepo) Save(store datastore.Datastore) (err error)
- func (d *DataRepo) UnmarshalSQL(row sqlutil.Scannable) (err error)
- type File
- type Link
- func (l *Link) DatastoreType() string
- func (l *Link) Delete(store datastore.Datastore) error
- func (l *Link) GetId() string
- func (l *Link) Insert(store datastore.Datastore) error
- func (l *Link) Key() datastore.Key
- func (l *Link) NewSQLModel(key datastore.Key) sql_datastore.Model
- func (l *Link) Read(store datastore.Datastore) (err error)
- func (l *Link) SQLParams(cmd sql_datastore.Cmd) []interface{}
- func (l *Link) SQLQuery(cmd sql_datastore.Cmd) string
- func (l *Link) UnmarshalSQL(row sqlutil.Scannable) error
- func (l *Link) Update(store datastore.Datastore) error
- type Meta
- type Metadata
- func LatestMetadata(db sqlutil.Queryable, keyId, subject string) (m *Metadata, err error)
- func MetadataByKey(db sqlutil.Queryable, keyId string, limit, offset int) ([]*Metadata, error)
- func MetadataBySubject(db sqlutil.Queryable, subject string) ([]*Metadata, error)
- func NextMetadata(db sqlutil.Queryable, keyId, subject string) (*Metadata, error)
- func (m Metadata) DatastoreType() string
- func (m Metadata) GetId() string
- func (m *Metadata) HashMaps() (keyMap map[string]string, valueMap map[string]interface{}, err error)
- func (m *Metadata) HashableBytes() ([]byte, error)
- func (m Metadata) Key() datastore.Key
- func (m Metadata) String() string
- func (m *Metadata) UnmarshalSQL(row sqlutil.Scannable) error
- func (m *Metadata) Write(store datastore.Datastore) error
- type Primer
- func (p *Primer) CalcStats(db *sql.DB) error
- func (p Primer) DatastoreType() string
- func (p *Primer) Delete(store datastore.Datastore) error
- func (p Primer) GetId() string
- func (p Primer) Key() datastore.Key
- func (p *Primer) NewSQLModel(key datastore.Key) sql_datastore.Model
- func (p *Primer) Read(store datastore.Datastore) error
- func (p *Primer) ReadSources(db sqlutil.Queryable) error
- func (p *Primer) ReadSubPrimers(db sqlutil.Queryable) error
- func (p *Primer) SQLParams(cmd sql_datastore.Cmd) []interface{}
- func (p *Primer) SQLQuery(cmd sql_datastore.Cmd) string
- func (p *Primer) Save(store datastore.Datastore) (err error)
- func (p *Primer) UnmarshalSQL(row sqlutil.Scannable) error
- type PrimerStats
- type Snapshot
- type Source
- func (c *Source) AsUrl(db *sql.DB) (*Url, error)
- func (s *Source) CalcStats(db *sql.DB) error
- func (s Source) DatastoreType() string
- func (s *Source) Delete(store datastore.Datastore) error
- func (s *Source) DescribedContent(db sqlutil.Queryable, limit, offset int) ([]*Url, error)
- func (s Source) GetId() string
- func (s Source) Key() datastore.Key
- func (s *Source) MatchesUrl(rawurl string) bool
- func (s *Source) NewSQLModel(key datastore.Key) sql_datastore.Model
- func (s *Source) Read(store datastore.Datastore) error
- func (s *Source) SQLParams(cmd sql_datastore.Cmd) []interface{}
- func (s *Source) SQLQuery(cmd sql_datastore.Cmd) string
- func (s *Source) Save(store datastore.Datastore) (err error)
- func (s *Source) UndescribedContent(db sqlutil.Queryable, limit, offset int) ([]*Url, error)
- func (c *Source) UnmarshalSQL(row sqlutil.Scannable) error
- type SourceStats
- type Uncrawlable
- func (u Uncrawlable) DatastoreType() string
- func (u *Uncrawlable) Delete(store datastore.Datastore) error
- func (u Uncrawlable) GetId() string
- func (u Uncrawlable) Key() datastore.Key
- func (u *Uncrawlable) NewSQLModel(key datastore.Key) sql_datastore.Model
- func (u *Uncrawlable) Read(store datastore.Datastore) error
- func (u *Uncrawlable) SQLParams(cmd sql_datastore.Cmd) []interface{}
- func (u *Uncrawlable) SQLQuery(cmd sql_datastore.Cmd) string
- func (u *Uncrawlable) Save(store datastore.Datastore) (err error)
- func (u *Uncrawlable) UnmarshalSQL(row sqlutil.Scannable) (err error)
- type Url
- func ContentUrls(db sqlutil.Queryable, limit, skip int) ([]*Url, error)
- func FetchedUrls(db sqlutil.Queryable, limit, offset int) ([]*Url, error)
- func ListUrls(store datastore.Datastore, limit, offset int) ([]*Url, error)
- func Search(db sqlutil.Queryable, q string, limit, offset int) ([]*Url, error)
- func UnfetchedUrls(db sqlutil.Queryable, limit, offset int) ([]*Url, error)
- func UnmarshalBoundedUrls(rows *sql.Rows, limit int) ([]*Url, error)
- func UnmarshalUrls(rows *sql.Rows) ([]*Url, error)
- func UrlsForHash(db sqlutil.Queryable, hash string) ([]*Url, error)
- func (u Url) DatastoreType() string
- func (u *Url) Delete(store datastore.Datastore) error
- func (u *Url) ExtractDocLinks(store datastore.Datastore, doc *goquery.Document) ([]*Link, error)
- func (u *Url) File() (*File, error)
- func (u *Url) Get(store datastore.Datastore) (body []byte, links []*Link, err error)
- func (u Url) GetId() string
- func (u *Url) HandleGetResponse(store datastore.Datastore, res *http.Response) (body []byte, links []*Link, err error)
- func (u *Url) HeadersMap() (headers map[string]string)
- func (u *Url) InboundLinks(db sqlutil.Queryable) ([]string, error)
- func (u Url) Key() datastore.Key
- func (u *Url) NewSQLModel(key datastore.Key) sql_datastore.Model
- func (u *Url) OutboundLinks(db sqlutil.Queryable) ([]string, error)
- func (u *Url) ParsedUrl() (*url.URL, error)
- func (u *Url) Read(store datastore.Datastore) error
- func (u *Url) SQLParams(cmd sql_datastore.Cmd) []interface{}
- func (u *Url) SQLQuery(cmd sql_datastore.Cmd) string
- func (u *Url) Save(store datastore.Datastore) (err error)
- func (u *Url) ShouldEnqueueGet() bool
- func (u *Url) ShouldEnqueueHead() bool
- func (u *Url) ShouldPutS3() bool
- func (u *Url) SuspectedContentUrl() bool
- func (u *Url) UnmarshalSQL(row sqlutil.Scannable) (err error)
- func (u *Url) WarcRequest() *warc.Request
Constants ¶
This section is empty.
Variables ¶
var ( // how long before a url is considered stale. default is 72 hours. StaleDuration = time.Hour * 72 // all these need to be set for file saving to work AwsRegion string AwsAccessKeyId string AwsSecretAccessKey string AwsS3BucketName string AwsS3BucketPath string )
var ( ErrNotFound = fmt.Errorf("Not Found") ErrInvalidResponse = fmt.Errorf("Datastore returned an invalid response") )
Functions ¶
func CalcHash ¶
CalcHash calculates the multihash key for a given slice of bytes TODO - find a proper home for this
func CountPrimers ¶
CountPrimers returns the total number of primers
func CountSources ¶
CountSources grabs the total number of sources
func MetadataCountByKey ¶
func NormalizeURL ¶
NormalizeURL removes inconsistencies from a given url
func NormalizeURLString ¶
NormalizeURLString removes inconsistencies from a given url string
func WriteSnapshot ¶
WriteSnapshot creates a snapshot record in the DB from a given Url struct
Types ¶
type Collection ¶
type Collection struct { // version 4 uuid Id string `json:"id"` // Created timestamp rounded to seconds in UTC Created time.Time `json:"created"` // Updated timestamp rounded to seconds in UTC Updated time.Time `json:"updated"` // sha256 multihash of the public key that created this collection Creator string `json:"creator"` // human-readable title of the collection Title string `json:"title"` // description of the collection Description string `json:"description"` // url this collection originates from Url string `json:"url,omitempty"` }
Collections are generic groupings of content collections can be thought of as a csv file listing content hashes as the first column, and whatever other information is necessary in subsequent columns
func CollectionsByCreator ¶
func CollectionsByCreator(store datastore.Datastore, creator, orderby string, limit, offset int) ([]*Collection, error)
func ListCollections ¶
func ListCollections(store datastore.Datastore, limit, offset int) ([]*Collection, error)
func (Collection) DatastoreType ¶
func (c Collection) DatastoreType() string
func (*Collection) Delete ¶
func (c *Collection) Delete(store datastore.Datastore) error
Delete a collection; should only be done for erroneous additions
func (*Collection) DeleteItems ¶
func (c *Collection) DeleteItems(store datastore.Datastore, items []*CollectionItem) error
DeleteItems removes a given list of items from the collection
func (Collection) GetId ¶
func (c Collection) GetId() string
func (*Collection) ItemCount ¶
func (c *Collection) ItemCount(store datastore.Datastore) (count int, err error)
ItemCount gets the number of items in the collection
func (Collection) Key ¶
func (c Collection) Key() datastore.Key
func (*Collection) NewSQLModel ¶
func (c *Collection) NewSQLModel(key datastore.Key) sql_datastore.Model
func (*Collection) Read ¶
func (c *Collection) Read(store datastore.Datastore) error
Read collection from db
func (*Collection) ReadItems ¶
func (c *Collection) ReadItems(store datastore.Datastore, orderby string, limit, offset int) (items []*CollectionItem, err error)
ReadItems reads a bounded set of items from the collection. The orderby param currently only supports SQL-style input of a single property, eg: "index" or "index DESC"
func (*Collection) SQLParams ¶
func (c *Collection) SQLParams(cmd sql_datastore.Cmd) []interface{}
func (Collection) SQLQuery ¶
func (c Collection) SQLQuery(cmd sql_datastore.Cmd) string
func (*Collection) Save ¶
func (c *Collection) Save(store datastore.Datastore) (err error)
Save a collection
func (*Collection) SaveItems ¶
func (c *Collection) SaveItems(store datastore.Datastore, items []*CollectionItem) error
SaveItems saves a slice of items to the collection. It's up to you to ensure that the "index" param doesn't get all messed up. TODO - validate / automate the Index param?
func (*Collection) UnmarshalSQL ¶
func (c *Collection) UnmarshalSQL(row sqlutil.Scannable) (err error)
UnmarshalSQL reads an sql response into the collection receiver it expects the request to have used collectionCols() for selection
type CollectionItem ¶
type CollectionItem struct { // Collection Items are Url's at heart Url // this item's index in the collection Index int `json:"index"` // unique description of this item Description string `json:"description"` // contains filtered or unexported fields }
CollectionItem is an item in a collection. They are urls with added collection-specific information. This has the effect of storing all of the "main properties" of a collection item in the common list of urls
func (CollectionItem) DatastoreType ¶
func (c CollectionItem) DatastoreType() string
DatastoreType is to satisfy sql_datastore.Model interface
func (*CollectionItem) Delete ¶
func (c *CollectionItem) Delete(store datastore.Datastore) error
Delete a collection item
func (CollectionItem) GetId ¶
func (c CollectionItem) GetId() string
GetId returns the Id of the collectionItem, which is the id of the underlying Url
func (CollectionItem) Key ¶
func (c CollectionItem) Key() datastore.Key
Key is somewhat special as CollectionItems always have a Collection as their parent. This relationship is represented in directory-form: /Collection:[collection-id]/CollectionItem:[item-id]
func (*CollectionItem) NewSQLModel ¶
func (c *CollectionItem) NewSQLModel(key datastore.Key) sql_datastore.Model
func (*CollectionItem) Read ¶
func (c *CollectionItem) Read(store datastore.Datastore) error
Read collection from db
func (*CollectionItem) SQLParams ¶
func (c *CollectionItem) SQLParams(cmd sql_datastore.Cmd) []interface{}
SQLParams is to satisfy the sql_datastore.Model interface, it returns this CollectionItem's parameters for a given type of SQL command
func (CollectionItem) SQLQuery ¶
func (c CollectionItem) SQLQuery(cmd sql_datastore.Cmd) string
SQLQuery is to satisfy the sql_datastore.Model interface, it returns the concrete query for a given type of SQL command
func (*CollectionItem) Save ¶
func (c *CollectionItem) Save(store datastore.Datastore) (err error)
Save a collection item to a store
func (*CollectionItem) UnmarshalSQL ¶
func (c *CollectionItem) UnmarshalSQL(row sqlutil.Scannable) (err error)
UnmarshalSQL reads an sql response into the collection receiver it expects the request to have used collectionCols() for selection
type Consensus ¶
Consensus is an enumeration of Meta graph values arranged by key
func SumConsensus ¶
func SumConsensus(subject string, blocks []*Metadata) (c Consensus, values map[string]interface{}, err error)
SumConsensus tallies the consensus around a given subject hash from a provided Metadata slice
type CustomCrawl ¶
type CustomCrawl struct { // version 4 uuid Id string `json:"id"` // Created timestamp rounded to seconds in UTC Created time.Time `json:"created"` // Updated timestamp rounded to seconds in UTC Updated time.Time `json:"updated"` // Json Web token that created this request Jwt string `json:"jwt"` // MorphRunId MorphRunId string `json:"morphRunId"` // timestamp this run was completed DateCompleted time.Time // repository for code that ran the crawl GithubRepo string `json:"githubRepo"` // OriginalUrl OriginalUrl string `json:"originalUrl"` // SqliteChecksum SqliteChecksum string `json:"sqliteChecksum"` }
CustomCrawls are urls that contain content that cannot be extracted with traditional web crawling / scraping methods. This model classifies the nature of the custom crawl, setting the stage for writing custom scripts to extract the underlying content.
func ListCustomCrawls ¶
func ListCustomCrawls(store datastore.Datastore, limit, offset int) ([]*CustomCrawl, error)
func (CustomCrawl) DatastoreType ¶
func (CustomCrawl) DatastoreType() string
func (*CustomCrawl) Delete ¶
func (c *CustomCrawl) Delete(store datastore.Datastore) error
Delete a custom crawl; should only be done for erroneous additions
func (CustomCrawl) GetId ¶
func (c CustomCrawl) GetId() string
func (CustomCrawl) Key ¶
func (u CustomCrawl) Key() datastore.Key
func (*CustomCrawl) NewSQLModel ¶
func (c *CustomCrawl) NewSQLModel(key datastore.Key) sql_datastore.Model
func (*CustomCrawl) Read ¶
func (c *CustomCrawl) Read(store datastore.Datastore) error
Read custom crawl from db
func (*CustomCrawl) SQLParams ¶
func (c *CustomCrawl) SQLParams(cmd sql_datastore.Cmd) []interface{}
SQLParams formats a custom crawl struct for inserting / updating into postgres
func (*CustomCrawl) SQLQuery ¶
func (c *CustomCrawl) SQLQuery(cmd sql_datastore.Cmd) string
func (*CustomCrawl) Save ¶
func (c *CustomCrawl) Save(store datastore.Datastore) (err error)
Save a custom crawl
func (*CustomCrawl) UnmarshalSQL ¶
func (c *CustomCrawl) UnmarshalSQL(row sqlutil.Scannable) (err error)
UnmarshalSQL reads an sql response into the custom crawl receiver it expects the request to have used custom crawlCols() for selection
type DataRepo ¶
type DataRepo struct { // version 4 uuid Id string // Created timestamp rounded to seconds in UTC Created time.Time `json:"created"` // Updated timestamp rounded to seconds in UTC Updated time.Time `json:"updated"` // Title of this data repository Title string `json:"title"` // Human-readable description Description string `json:"description"` // Main url link to the DataRepository Url string `json:"url"` }
DataRepo is a place that holds data in a structured format
func (*DataRepo) DatastoreType ¶
func (*DataRepo) NewSQLModel ¶
func (d *DataRepo) NewSQLModel(key datastore.Key) sql_datastore.Model
func (DataRepo) SQLParams ¶
func (d DataRepo) SQLParams(cmd sql_datastore.Cmd) []interface{}
type File ¶
File is a buffered byte slice often made from a GET response body. It provides easy hash-calculation & storage to S3 TODO - deprecate, use s3-datastore, or, uh... the distributed web
func NewFileFromRes ¶
NewFileFromRes generates a new file by consuming & closing a given response body
type Link ¶
type Link struct { // Calculated Hash for fixed ID purposes Hash string // created timestamp rounded to seconds in UTC Created time.Time `json:"created"` // updated timestamp rounded to seconds in UTC Updated time.Time `json:"updated"` // origin url of the linking document Src *Url `json:"src"` // absolute url of the <a> href property Dst *Url `json:"dst"` }
A Link represents an <a> tag in an html document src whose href attribute points to the url that resolves to dst. Both src & dst must be stored as urls
func ReadDstContentLinks ¶
ReadDstContentLinks returns a list of links that specify a given url as src that are content urls
func ReadDstLinks ¶
ReadDstLinks returns all links that specify a given url as src
func ReadSrcLinks ¶
ReadSrcLinks returns all links that specify a given url as dst
func (*Link) DatastoreType ¶
func (*Link) NewSQLModel ¶
func (l *Link) NewSQLModel(key datastore.Key) sql_datastore.Model
func (*Link) SQLParams ¶
func (l *Link) SQLParams(cmd sql_datastore.Cmd) []interface{}
type Meta ¶
type Meta struct { Url string `json:"url"` Date *time.Time `json:"date,omitempty"` HeadersTook int `json:"headersTook,omitempty"` Id string `json:"id"` Status int `json:"status"` ContentSniff string `json:"contentSniff,omitempty"` RawHeaders []string `json:"rawHeaders"` Headers map[string]string `json:"headers"` DownloadTook int `json:"downloadTook,omitempty"` Sha256 string `json:"sha256"` Multihash string `json:"multihash"` Consensus *Consensus `json:"consensus"` InboundLinks []string `json:"inboundLinks,omitempty"` OutboundLinks []string `json:"outboundLinks,omitempty"` }
Meta is a struct for sharing our knowledge of a url with other services
type Metadata ¶
type Metadata struct { // Hash is the sha256 multihash of all other fields in metadata // as expressed by Metadata.HashableBytes() Hash string `json:"hash"` // Creation timestamp Timestamp time.Time `json:"timestamp"` // Sha256 multihash of the public key that signed this metadata KeyId string `json:"keyId"` // Sha256 multihash of the content this metadata is describing Subject string `json:"subject"` // Hash value of the metadata that came before this, if any Prev string `json:"prev"` // Actual metadata, a valid json Object Meta map[string]interface{} `json:"meta"` }
Metadata is a signed block of structured data describing a subject hash. There can be many metadata of a given url
func LatestMetadata ¶
LatestMetadata gives the most recent metadata timestamp for a given keyId & subject combination if one exists
func MetadataByKey ¶
func MetadataBySubject ¶
MetadataBySubject returns all metadata for a given subject hash
func NextMetadata ¶
NextMetadata returns the next metadata block for a given subject. If no metablock exists a new one is created
func (Metadata) DatastoreType ¶
func (*Metadata) HashMaps ¶
func (m *Metadata) HashMaps() (keyMap map[string]string, valueMap map[string]interface{}, err error)
TODO - this is ripped from metablocks
func (*Metadata) HashableBytes ¶
HashableBytes returns the exact structure to be used for hash
func (*Metadata) UnmarshalSQL ¶
UnmarshalSQL reads an SQL result into the snapshot receiver
type Primer ¶
type Primer struct { // version 4 uuid Id string `json:"id"` // Created timestamp rounded to seconds in UTC Created time.Time `json:"created"` // Updated timestamp rounded to seconds in UTC Updated time.Time `json:"updated"` // shortest possible expression of this primer's name, usually an acronym // called shortTitle b/c acronyms collide often & users should feel free to // expand on acronyms ShortTitle string `json:"shortTitle"` // human-readable title of this primer. Title string `json:"title"` // long-form description of this primer. // TODO - Maybe we should store this in markdown format? Description string `json:"description"` // parent primer (if any) Parent *Primer `json:"parent"` // child-primers list SubPrimers []*Primer `json:"subPrimers,omitempty"` // metadata to associate with this primer Meta map[string]interface{} `json:"meta"` // statistics about this primer Stats *PrimerStats `json:"stats"` // collection of child sources Sources []*Source `json:"sources,omitempty"` }
Primer is tracking information about an abstract group of content. For example a government agency is a primer
func BasePrimers ¶
BasePrimers lists primers that have no parent
func ListPrimers ¶
ListPrimers
func UnmarshalBoundedPrimers ¶
UnmarshalBoundedPrimers turns sql.Rows into primers, expecting len(rows) <= limit
func (Primer) DatastoreType ¶
func (*Primer) NewSQLModel ¶
func (p *Primer) NewSQLModel(key datastore.Key) sql_datastore.Model
func (*Primer) ReadSources ¶
ReadSources reads child sources of this primer
func (*Primer) ReadSubPrimers ¶
ReadSubPrimers reads child primers of this primer
func (*Primer) SQLParams ¶
func (p *Primer) SQLParams(cmd sql_datastore.Cmd) []interface{}
type PrimerStats ¶
type PrimerStats struct { UrlCount int `json:"urlCount"` ArchivedUrlCount int `json:"archivedUrlCount"` ContentUrlCount int `json:"contentUrlCount"` ContentMetadataCount int `json:"contentMetadataCount"` SourcesUrlCount int `json:"sourcesUrlCount"` SourcesArchivedUrlCount int `json:"sourcesArchivedUrlCount"` }
TODO - finish
type Snapshot ¶
type Snapshot struct { // The url that was requested Url string `json:"url"` // Time this request was issued Created time.Time `json:"date"` // Returned Status Status int `json:"status,omitempty"` // Time to complete response in milliseconds Duration int64 `json:"downloadTook,omitempty"` // Record of all returned headers in [key,value,key,value...] Headers []string `json:"headers,omitempty"` // Multihash of response body (if any) Hash string `json:"hash,omitempty"` }
A snapshot is a record of a GET request to a url There can be many snapshots of a given url
func SnapshotsForUrl ¶
SnapshotsForUrl returns all snapshots for a given url string
type Source ¶
type Source struct { // version 4 uuid Id string `json:"id"` // Created timestamp rounded to seconds in UTC Created time.Time `json:"created"` // Updated timestamp rounded to seconds in UTC Updated time.Time `json:"updated"` // human-readable title for this source Title string `json:"title"` // description of the source, ideally one paragraph Description string `json:"description"` // absolute url to serve as the root of the link tree Url string `json:"url"` // primer this source is connected to Primer *Primer `json:"primer"` // whether or not this url should be crawled by a web crawler Crawl bool `json:"crawl"` // amount of time before a link within this tree is considered in need // of re-checking for changes. currently not in use, but planned. StaleDuration time.Duration `json:"staleDuration"` // yeah this'll probably get deprecated. Part of a half-baked alerts feature idea. LastAlertSent *time.Time `json:"lastAlertSent"` // Metadata associated with this source that should be added to all // child urls, currently not in use, but planned Meta map[string]interface{} `json:"meta"` // Stats about this source Stats *SourceStats `json:"stats"` }
Source is a concrete handle for archiving. Crawlers use source's url as a base of a link tree. Sources are connected to a parent Primer to provide context & organization.
func CrawlingSources ¶
CrawlingSources lists sources with crawling = true, paginated
func ListSources ¶
ListSources lists all sources from most to least recent, paginated
func UnmarshalBoundedSources ¶
UnmarshalBoundedSources turns a standard sql.Rows of Source results into a *Source slice
func (*Source) AsUrl ¶
AsUrl retrieves the url that corresponds to the crawlUrl. If one doesn't exist & the url is saved, a new url is created
func (Source) DatastoreType ¶
func (*Source) DescribedContent ¶
TODO - this currently doesn't check the status of metadata, gonna need to do that DescribedContent returns a list of content-urls from this subprimer that need work.
func (*Source) MatchesUrl ¶
MatchesUrl checks to see if the url pattern of Source is contained within the passed-in url string TODO - make this more sophisticated, checking against the beginning of the url to avoid things like accidental matches, or urls in query params matching within rawurl
func (*Source) NewSQLModel ¶
func (s *Source) NewSQLModel(key datastore.Key) sql_datastore.Model
func (*Source) SQLParams ¶
func (s *Source) SQLParams(cmd sql_datastore.Cmd) []interface{}
func (*Source) UndescribedContent ¶
TODO - this currently doesn't check the status of metadata, gonna need to do that UndescribedContent returns a list of content-urls from this subprimer that need work.
type SourceStats ¶
type Uncrawlable ¶
type Uncrawlable struct { // version 4 uuid Id string `json:"id"` // url from urls table, must be unique Url string `json:"url"` // Created timestamp rounded to seconds in UTC Created time.Time `json:"created"` // Updated timestamp rounded to seconds in UTC Updated time.Time `json:"updated"` // sha256 multihash of the public key that created this uncrawlable Creator string `json:"creator"` // name of person making submission Name string `json:"name"` // email address of person making submission Email string `json:"email"` // name of data rescue event where uncrawlable was added EventName string `json:"eventName"` // agency name Agency string `json:"agency"` // EDGI agency Id AgencyId string `json:"agencyId"` // EDGI subagency Id SubagencyId string `json:"subagencyId"` // EDGI organization Id OrgId string `json:"orgId"` // EDGI Suborganization Id SuborgId string `json:"suborgId"` // EDGI subprimer Id SubprimerId string `json:"subprimerId"` // flag for ftp content Ftp bool `json:"ftp"` // flag for 'database' // TODO - refine this? Database bool `json:"database"` // flag for visualization / interactive content // obfuscating data Interactive bool `json:"interactive"` // flag for a page that links to many files ManyFiles bool `json:"manyFiles"` // uncrawlable comments Comments string `json:"comments"` }
Uncrawlables are urls that contain content that cannot be extracted with traditional web crawling / scraping methods. This model classifies the nature of the uncrawlable, setting the stage for writing custom scripts to extract the underlying content.
func ListUncrawlables ¶
func ListUncrawlables(store datastore.Datastore, limit, offset int) ([]*Uncrawlable, error)
func (Uncrawlable) DatastoreType ¶
func (u Uncrawlable) DatastoreType() string
func (*Uncrawlable) Delete ¶
func (u *Uncrawlable) Delete(store datastore.Datastore) error
Delete an uncrawlable; should only be done for erroneous additions
func (Uncrawlable) GetId ¶
func (u Uncrawlable) GetId() string
func (Uncrawlable) Key ¶
func (u Uncrawlable) Key() datastore.Key
func (*Uncrawlable) NewSQLModel ¶
func (u *Uncrawlable) NewSQLModel(key datastore.Key) sql_datastore.Model
func (*Uncrawlable) Read ¶
func (u *Uncrawlable) Read(store datastore.Datastore) error
Read uncrawlable from db
func (*Uncrawlable) SQLParams ¶
func (u *Uncrawlable) SQLParams(cmd sql_datastore.Cmd) []interface{}
SQLParams formats an uncrawlable struct for inserting / updating into postgres
func (*Uncrawlable) SQLQuery ¶
func (u *Uncrawlable) SQLQuery(cmd sql_datastore.Cmd) string
func (*Uncrawlable) Save ¶
func (u *Uncrawlable) Save(store datastore.Datastore) (err error)
Save an uncrawlable
func (*Uncrawlable) UnmarshalSQL ¶
func (u *Uncrawlable) UnmarshalSQL(row sqlutil.Scannable) (err error)
UnmarshalSQL reads an sql response into the uncrawlable receiver it expects the request to have used uncrawlableCols() for selection
type Url ¶
type Url struct { // version 4 uuid // urls can/should/must also be be uniquely identified by Url Id string `json:"id,omitempty"` // A Url is uniquely identified by URI string without // any normalization. Url strings must always be absolute. Url string `json:"url"` // Created timestamp rounded to seconds in UTC Created time.Time `json:"created,omitempty"` // Updated timestamp rounded to seconds in UTC Updated time.Time `json:"updated,omitempty"` // Timestamp for most recent GET request LastGet *time.Time `json:"lastGet,omitempty"` // Timestamp for most recent HEAD request LastHead *time.Time `json:"lastHead,omitempty"` // Returned HTTP status code Status int `json:"status,omitempty"` // Returned HTTP 'Content-Type' header ContentType string `json:"contentType,omitempty"` // Result of mime sniffing to GET response body, as detailed at https://mimesniff.spec.whatwg.org ContentSniff string `json:"contentSniff,omitempty"` // ContentLength in bytes, will be the header value if only a HEAD request has been issued // After a valid GET response, it will be set to the length of the returned response ContentLength int64 `json:"contentLength,omitempty"` // best guess at a filename based on url string analysis // if you just want to know what type of file this is, this is the field to use. FileName string `json:"fileName,omitempty"` // HTML Title tag attribute Title string `json:"title,omitempty"` // Time remote server took to transfer content in milliseconds. // TODO - currently not implemented DownloadTook int `json:"downloadTook,omitempty"` // Time taken to fetch headers in milliseconds. currently not implemented HeadersTook int `json:"headersTook,omitempty"` // key-value slice of returned headers from most recent HEAD or GET request // stored in the form [key,value,key,value...] 
Headers []string `json:"headers,omitempty"` // any associative metadata Meta map[string]interface{} `json:"meta,omitempty"` // Hash is a multihash sha-256 of res.Body Hash string `json:"hash,omitempty"` // Url to saved content ContentUrl string `json:"contentUrl,omitempty"` // Uncrawlable information Uncrawlable *Uncrawlable `json:"uncrawlable,omitempty"` }
URL represents... a url. TODO - consider renaming to Resource
func UnmarshalUrls ¶
UnmarshalUrls takes an sql cursor & returns a slice of url pointers expects columns to match urlCols()
func (Url) DatastoreType ¶
func (*Url) ExtractDocLinks ¶
ExtractDocLinks extracts & stores a page's linked documents by selecting all a[href] links from a given goquery document, using the receiver *Url as the base
func (*Url) File ¶
File leverages a url's hash to generate a file that can have it's bytes read back
func (*Url) HandleGetResponse ¶
func (u *Url) HandleGetResponse(store datastore.Datastore, res *http.Response) (body []byte, links []*Link, err error)
HandleGetResponse performs all necessary actions in response to a GET request, regardless of whether it came from a crawl or archive request
func (*Url) HeadersMap ¶
HeadersMap formats u.Headers (a string slice) as a map[header]value
func (*Url) InboundLinks ¶
InboundLinks returns a slice of url strings that link to this url
func (*Url) NewSQLModel ¶
func (u *Url) NewSQLModel(key datastore.Key) sql_datastore.Model
func (*Url) OutboundLinks ¶
OutboundLinks returns a slice of url strings that this url links to
func (*Url) SQLParams ¶
func (u *Url) SQLParams(cmd sql_datastore.Cmd) []interface{}
SQLParams formats a url struct for inserting / updating into postgres
func (*Url) ShouldEnqueueGet ¶
ShouldEnqueueGet returns whether the url can be added to the queue for a GET request. Keep in mind only urls whose domain is marked crawl : true in the domains list will be candidates for GET requests. It should return true if: * the url is of http / https scheme * has never been GET'd or hasn't been GET'd for a period longer than the stale duration
func (*Url) ShouldEnqueueHead ¶
ShouldEnqueueHead returns whether the url can be added to the queue for a HEAD request. It should return true if: * the url is of http / https scheme * has never been GET'd or hasn't been GET'd for a period longer than the stale duration
func (*Url) ShouldPutS3 ¶
ShouldPutS3 is a chance to override whether the content should be stored
func (*Url) SuspectedContentUrl ¶
SuspectedContentUrl examines the url string, returns true if there's a reasonable chance the url leads to content
func (*Url) UnmarshalSQL ¶
UnmarshalSQL reads an sql response into the url receiver it expects the request to have used urlCols() for selection
func (*Url) WarcRequest ¶
Source Files ¶
- archive.go
- collection.go
- collection_item.go
- collection_items.go
- collections.go
- consensus.go
- custom_crawl.go
- custom_crawls.go
- datarepo.go
- errors.go
- file.go
- link.go
- links.go
- meta.go
- metadata.go
- normalize.go
- primer.go
- primers.go
- queries.go
- search.go
- snapshot.go
- source.go
- sources.go
- uncrawlable.go
- uncrawlables.go
- url.go
- urls.go