wikipedia

package
v0.3.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 22, 2024 License: Apache-2.0 Imports: 32 Imported by: 0

Documentation

Index

Constants

View Source
// String labels, one per kind of reference named by the constant: Wikidata,
// Wikimedia Commons entity, file, category and template, and Wikipedia
// category and template.
// NOTE(review): where these labels are consumed is not visible here — confirm
// against the callers before changing any value.
const (
	WikidataReference                 = "Wikidata"
	WikimediaCommonsEntityReference   = "CommonsEntity"
	WikimediaCommonsFileReference     = "CommonsFile"
	WikipediaCategoryReference        = "WikipediaCategory"
	WikipediaTemplateReference        = "WikipediaTemplate"
	WikimediaCommonsCategoryReference = "CommonsCategory"
	WikimediaCommonsTemplateReference = "CommonsTemplate"
)
View Source
const (
	// APILimit is the limit of the all pages API. The limit does not depend on
	// the token used.
	APILimit = 500
)

Variables

View Source
var (
	// ErrSkipped is a base error with the message "skipped".
	ErrSkipped = errors.Base("skipped")
	// ErrSilentSkipped is a base error with the message "silent skipped" which
	// is built on top of ErrSkipped.
	// NOTE(review): presumably errors.Is(err, ErrSkipped) therefore also matches
	// ErrSilentSkipped — confirm against the errors package documentation.
	ErrSilentSkipped = errors.BaseWrap(ErrSkipped, "silent skipped")
)
View Source
var (
	// NameSpaceWikidata is a fixed UUID parsed from its string form.
	// NOTE(review): presumably the namespace used when deriving identifiers for
	// Wikidata documents — confirm against its uses.
	NameSpaceWikidata = uuid.MustParse("8f8ba777-bcce-4e45-8dd4-a328e6722c82")

	// ErrNotFound is a base error with the message "not found".
	ErrNotFound = errors.Base("not found")
)
View Source
var (
	// NameSpaceWikipediaFile is a fixed UUID parsed from its string form.
	// NOTE(review): presumably the namespace used when deriving identifiers for
	// Wikipedia files — confirm against its uses.
	//nolint:gochecknoglobals
	NameSpaceWikipediaFile = uuid.MustParse("94b1c372-bc28-454c-a45a-2e4d29d15146")

	// ErrWikimediaCommonsFile is a base error with the message
	// "file is from Wikimedia Commons error".
	ErrWikimediaCommonsFile = errors.Base("file is from Wikimedia Commons error")
)
View Source
var (
	// NameSpaceWikimediaCommonsFile is a fixed UUID parsed from its string form.
	// NOTE(review): presumably the namespace used when deriving identifiers for
	// Wikimedia Commons files — confirm against its uses.
	NameSpaceWikimediaCommonsFile = uuid.MustParse("31974ea8-ab0c-466d-9aaa-e1bf3c959edc")
)

Functions

func ConvertArticleInCategories

func ConvertArticleInCategories(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, article mediawiki.Article, doc *peerdb.Document) errors.E

TODO: How to remove categories which have previously been added but are later removed?

func ConvertArticleRedirects

func ConvertArticleRedirects(logger zerolog.Logger, namespace uuid.UUID, id string, article mediawiki.Article, doc *peerdb.Document) errors.E

TODO: How to remove redirects which have previously been added but are later removed?

func ConvertArticleUsedTemplates

func ConvertArticleUsedTemplates(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, article mediawiki.Article, doc *peerdb.Document) errors.E

TODO: How to remove templates which have previously been added but are later removed?

func ConvertCategoryDescription

func ConvertCategoryDescription(id, from, html string, doc *peerdb.Document) errors.E

func ConvertEntity

func ConvertEntity(
	ctx context.Context, index string, logger zerolog.Logger, esClient *elastic.Client, cache *es.Cache,
	namespace uuid.UUID, entity mediawiki.Entity,
) (*peerdb.Document, errors.E)

ConvertEntity converts both Wikidata entities and Wikimedia Commons entities. Entities can reference only Wikimedia Commons files and not Wikipedia files.

func ConvertFileDescription

func ConvertFileDescription(namespace uuid.UUID, id, from, html string, doc *peerdb.Document) errors.E

func ConvertPageInCategories

func ConvertPageInCategories(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, page AllPagesPage, doc *peerdb.Document) errors.E

TODO: How to remove categories which have previously been added but are later removed?

func ConvertPageRedirects

func ConvertPageRedirects(logger zerolog.Logger, namespace uuid.UUID, id string, page AllPagesPage, doc *peerdb.Document) errors.E

TODO: How to remove redirects which have previously been added but are later removed?

func ConvertPageUsedTemplates

func ConvertPageUsedTemplates(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, page AllPagesPage, doc *peerdb.Document) errors.E

TODO: How to remove templates which have previously been added but are later removed?

func ConvertTemplateDescription

func ConvertTemplateDescription(id, from string, html string, doc *peerdb.Document) errors.E

func ConvertWikimediaCommonsImage

func ConvertWikimediaCommonsImage(
	ctx context.Context, logger zerolog.Logger, httpClient *retryablehttp.Client, token string, apiLimit int, image Image,
) (*peerdb.Document, errors.E)

func ConvertWikipediaArticle

func ConvertWikipediaArticle(id, html string, doc *peerdb.Document) errors.E

TODO: Store the revision, license, and source used for the HTML into a meta claim. TODO: Investigate how to make use of additional entities metadata. See: https://www.mediawiki.org/wiki/Topic:Wotwu75akwx2wnsb TODO: Make internal links to other articles work in HTML (link to PeerDB documents instead). TODO: Remove links to other articles which do not exist, if there are any. TODO: Clean custom tags and attributes used in HTML to add metadata into HTML, potentially extract and store that. See: https://www.mediawiki.org/wiki/Specs/HTML/2.4.0 TODO: Remove some templates (e.g., infobox, top-level notices) and convert them to claims. TODO: Extract all links pointing out of the article into claims and reverse claims (so if they point to other documents, they should have backlink as claim).

func ConvertWikipediaImage

func ConvertWikipediaImage(
	ctx context.Context, logger zerolog.Logger, httpClient *retryablehttp.Client, token string, apiLimit int, image Image,
) (*peerdb.Document, errors.E)

func ExtractArticle

func ExtractArticle(input string) (string, *goquery.Document, errors.E)

func ExtractArticleSummary

func ExtractArticleSummary(doc *goquery.Document) (string, errors.E)

ExtractArticleSummary should be called on the output of ExtractArticle.

func ExtractCategoryDescription

func ExtractCategoryDescription(input string) (string, errors.E)

func ExtractFileDescriptions

func ExtractFileDescriptions(input string) ([]string, errors.E)

func ExtractTemplateDescription

func ExtractTemplateDescription(input string) (string, errors.E)

func FirstUpperCase

func FirstUpperCase(str string) string

The implementation changes the case of ASCII characters only. Using unicode.ToUpper sometimes changes the case of characters for which MediaWiki does not change it. If we change case when MediaWiki does not, the corresponding file is not found. On the other hand, if we do not change case when MediaWiki does, the API returns a "normalized" field which fails JSON decoding, so we detect such cases if and when they happen. See: https://phabricator.wikimedia.org/T301758

func GetMediawikiFilePrefix

func GetMediawikiFilePrefix(filename string) string

func GetPageHTML

func GetPageHTML(ctx context.Context, httpClient *retryablehttp.Client, site, title string) (string, errors.E)

func GetWikidataDocumentID

func GetWikidataDocumentID(id string) identifier.Identifier

func GetWikidataItem

func GetWikidataItem(ctx context.Context, index string, esClient *elastic.Client, id string) (*peerdb.Document, *elastic.SearchHit, errors.E)

func GetWikimediaCommonsFile

func GetWikimediaCommonsFile(ctx context.Context, index string, esClient *elastic.Client, name string) (*peerdb.Document, *elastic.SearchHit, errors.E)

func GetWikipediaFile

func GetWikipediaFile(ctx context.Context, index string, esClient *elastic.Client, name string) (*peerdb.Document, *elastic.SearchHit, errors.E)

func ListAllPages

func ListAllPages(
	ctx context.Context, httpClient *retryablehttp.Client, namespaces []int, site string, limiter *rate.Limiter, output chan<- AllPagesPage,
) errors.E

func SetPageID

func SetPageID(namespace uuid.UUID, mnemonicPrefix string, id string, pageID int64, doc *peerdb.Document) errors.E

func UpdateEmbeddedDocuments

func UpdateEmbeddedDocuments(
	ctx context.Context, index string, logger zerolog.Logger, esClient *elastic.Client, cache *es.Cache,
	skippedWikidataEntities *sync.Map, skippedWikimediaCommonsFiles *sync.Map, doc *peerdb.Document,
) (bool, errors.E)

Types

type AllPagesPage

// AllPagesPage is the element type of the output channel passed to
// ListAllPages. Each field is mapped to the JSON key given in its struct tag:
// the page identifier ("pageid"), namespace number ("ns"), title ("title") and
// a string-to-string map of page properties ("pageprops").
//
// Categories, Templates and Redirects are lists of PageReference values. They
// carry the omitempty option, so an empty list is left out of encoded JSON.
type AllPagesPage struct {
	Identifier int64             `json:"pageid"`
	Namespace  int               `json:"ns"`
	Title      string            `json:"title"`
	Properties map[string]string `json:"pageprops"`
	Categories []PageReference   `json:"categories,omitempty"`
	Templates  []PageReference   `json:"templates,omitempty"`
	Redirects  []PageReference   `json:"redirects,omitempty"`
}

type Image

// Image describes one image record. It is the input taken by
// ConvertWikipediaImage and ConvertWikimediaCommonsImage. Every JSON key named
// in the struct tags carries an "img_" prefix.
//
// Metadata and Timestamp are tagged `json:"-"`, so the default encoding/json
// field mapping ignores them.
// NOTE(review): presumably (*Image).UnmarshalJSON fills these two fields in;
// its body is not visible here — confirm.
type Image struct {
	Name          string                 `json:"img_name"`
	Size          int64                  `json:"img_size"`
	Width         int64                  `json:"img_width"`
	Height        int64                  `json:"img_height"`
	Metadata      map[string]interface{} `json:"-"`
	Bits          int64                  `json:"img_bits"`
	MediaType     string                 `json:"img_media_type"`
	MajorMIME     string                 `json:"img_major_mime"`
	MinorMIME     string                 `json:"img_minor_mime"`
	DescriptionID int64                  `json:"img_description_id"`
	ActorID       int64                  `json:"img_actor"`
	Timestamp     time.Time              `json:"-"`
	SHA1          string                 `json:"img_sha1"`
}

func (*Image) UnmarshalJSON

func (i *Image) UnmarshalJSON(b []byte) error

type ImageInfo

// ImageInfo is the value returned by GetImageInfo. Except for Redirect, every
// field is mapped to the lower-case JSON key given in its struct tag.
// Redirect is tagged `json:"-"`, so the default encoding/json field mapping
// ignores it.
type ImageInfo struct {
	Mime                string  `json:"mime"`
	Size                int     `json:"size"`
	Width               int     `json:"width"`
	Height              int     `json:"height"`
	PageCount           int     `json:"pagecount"`
	Duration            float64 `json:"duration"`
	URL                 string  `json:"url"`
	DescriptionURL      string  `json:"descriptionurl"`
	DescriptionShortURL string  `json:"descriptionshorturl"`
	// Set if the requested page redirected to another page and info is from that other page.
	Redirect string `json:"-"`
}

func GetImageInfo

func GetImageInfo(ctx context.Context, httpClient *retryablehttp.Client, site, token string, apiLimit int, title string) (ImageInfo, errors.E)

type PageReference

// PageReference refers to a page by identifier ("pageid"), namespace number
// ("ns") and title ("title"). It is the element type of the Categories,
// Templates and Redirects lists of AllPagesPage.
//
// Identifier carries the omitempty option, so a zero identifier is left out of
// encoded JSON.
type PageReference struct {
	Identifier int64  `json:"pageid,omitempty"`
	Namespace  int    `json:"ns"`
	Title      string `json:"title"`
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL