wikipedia

package

v0.3.0 Latest Latest Go to latest Published: Mar 22, 2024 License: Apache-2.0 Imports: 32 Imported by: 0

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

gitlab.com/peerdb/peerdb

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
Variables
func ConvertArticleInCategories(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, ...) errors.E
func ConvertArticleRedirects(logger zerolog.Logger, namespace uuid.UUID, id string, ...) errors.E
func ConvertArticleUsedTemplates(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, ...) errors.E
func ConvertCategoryDescription(id, from, html string, doc *peerdb.Document) errors.E
func ConvertEntity(ctx context.Context, index string, logger zerolog.Logger, ...) (*peerdb.Document, errors.E)
func ConvertFileDescription(namespace uuid.UUID, id, from, html string, doc *peerdb.Document) errors.E
func ConvertPageInCategories(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, ...) errors.E
func ConvertPageRedirects(logger zerolog.Logger, namespace uuid.UUID, id string, page AllPagesPage, ...) errors.E
func ConvertPageUsedTemplates(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, ...) errors.E
func ConvertTemplateDescription(id, from string, html string, doc *peerdb.Document) errors.E
func ConvertWikimediaCommonsImage(ctx context.Context, logger zerolog.Logger, httpClient *retryablehttp.Client, ...) (*peerdb.Document, errors.E)
func ConvertWikipediaArticle(id, html string, doc *peerdb.Document) errors.E
func ConvertWikipediaImage(ctx context.Context, logger zerolog.Logger, httpClient *retryablehttp.Client, ...) (*peerdb.Document, errors.E)
func ExtractArticle(input string) (string, *goquery.Document, errors.E)
func ExtractArticleSummary(doc *goquery.Document) (string, errors.E)
func ExtractCategoryDescription(input string) (string, errors.E)
func ExtractFileDescriptions(input string) ([]string, errors.E)
func ExtractTemplateDescription(input string) (string, errors.E)
func FirstUpperCase(str string) string
func GetMediawikiFilePrefix(filename string) string
func GetPageHTML(ctx context.Context, httpClient *retryablehttp.Client, site, title string) (string, errors.E)
func GetWikidataDocumentID(id string) identifier.Identifier
func GetWikidataItem(ctx context.Context, index string, esClient *elastic.Client, id string) (*peerdb.Document, *elastic.SearchHit, errors.E)
func GetWikimediaCommonsFile(ctx context.Context, index string, esClient *elastic.Client, name string) (*peerdb.Document, *elastic.SearchHit, errors.E)
func GetWikipediaFile(ctx context.Context, index string, esClient *elastic.Client, name string) (*peerdb.Document, *elastic.SearchHit, errors.E)
func ListAllPages(ctx context.Context, httpClient *retryablehttp.Client, namespaces []int, ...) errors.E
func SetPageID(namespace uuid.UUID, mnemonicPrefix string, id string, pageID int64, ...) errors.E
func UpdateEmbeddedDocuments(ctx context.Context, index string, logger zerolog.Logger, ...) (bool, errors.E)
type AllPagesPage
type Image
- func (i *Image) UnmarshalJSON(b []byte) error
type ImageInfo
- func GetImageInfo(ctx context.Context, httpClient *retryablehttp.Client, site, token string, ...) (ImageInfo, errors.E)
type PageReference

Constants ¶

View Source

// String names for the kinds of references this package distinguishes,
// covering Wikidata, Wikimedia Commons (entity, file, category, template),
// and Wikipedia (category, template).
// NOTE(review): where these strings are consumed is not visible here —
// confirm against usages before relying on their exact values.
const (
	WikidataReference                 = "Wikidata"
	WikimediaCommonsEntityReference   = "CommonsEntity"
	WikimediaCommonsFileReference     = "CommonsFile"
	WikipediaCategoryReference        = "WikipediaCategory"
	WikipediaTemplateReference        = "WikipediaTemplate"
	WikimediaCommonsCategoryReference = "CommonsCategory"
	WikimediaCommonsTemplateReference = "CommonsTemplate"
)

View Source

const (
	// APILimit is the limit imposed by the all pages API.
	// It does not depend on the token used.
	APILimit = 500
)

Variables ¶

View Source

// Sentinel errors related to skipping.
var (
	// ErrSkipped is a base error with the message "skipped".
	ErrSkipped = errors.Base("skipped")
	// ErrSilentSkipped is a base error with the message "silent skipped",
	// built on top of ErrSkipped.
	// NOTE(review): presumably a check against ErrSkipped also matches
	// ErrSilentSkipped — confirm against the errors package in use.
	ErrSilentSkipped = errors.BaseWrap(ErrSkipped, "silent skipped")
)

View Source

var (
	// NameSpaceWikidata is a fixed UUID parsed from its string literal when
	// the package is initialized.
	// NOTE(review): presumably the namespace for deriving identifiers of
	// Wikidata documents — confirm against GetWikidataDocumentID.
	NameSpaceWikidata = uuid.MustParse("8f8ba777-bcce-4e45-8dd4-a328e6722c82")

	// ErrNotFound is a base error with the message "not found".
	ErrNotFound = errors.Base("not found")
)

View Source

var (
	// NameSpaceWikipediaFile is a fixed UUID parsed from its string literal
	// when the package is initialized.
	// NOTE(review): presumably the namespace for deriving identifiers of
	// Wikipedia file documents — confirm against usages.
	//nolint:gochecknoglobals
	NameSpaceWikipediaFile = uuid.MustParse("94b1c372-bc28-454c-a45a-2e4d29d15146")

	// ErrWikimediaCommonsFile is a base error with the message
	// "file is from Wikimedia Commons error".
	ErrWikimediaCommonsFile = errors.Base("file is from Wikimedia Commons error")
)

View Source

var (
	// NameSpaceWikimediaCommonsFile is a fixed UUID parsed from its string
	// literal when the package is initialized.
	// NOTE(review): presumably the namespace for deriving identifiers of
	// Wikimedia Commons file documents — confirm against usages.
	NameSpaceWikimediaCommonsFile = uuid.MustParse("31974ea8-ab0c-466d-9aaa-e1bf3c959edc")
)

Functions ¶

func ConvertArticleInCategories ¶

func ConvertArticleInCategories(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, article mediawiki.Article, doc *peerdb.Document) errors.E

TODO: How to remove categories which have previously been added but are later removed?

func ConvertArticleRedirects ¶

func ConvertArticleRedirects(logger zerolog.Logger, namespace uuid.UUID, id string, article mediawiki.Article, doc *peerdb.Document) errors.E

TODO: How to remove redirects which have previously been added but are later removed?

func ConvertArticleUsedTemplates ¶

func ConvertArticleUsedTemplates(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, article mediawiki.Article, doc *peerdb.Document) errors.E

TODO: How to remove templates which have previously been added but are later removed?

func ConvertCategoryDescription ¶

func ConvertCategoryDescription(id, from, html string, doc *peerdb.Document) errors.E

func ConvertEntity ¶

func ConvertEntity(
	ctx context.Context, index string, logger zerolog.Logger, esClient *elastic.Client, cache *es.Cache,
	namespace uuid.UUID, entity mediawiki.Entity,
) (*peerdb.Document, errors.E)

ConvertEntity converts both Wikidata entities and Wikimedia Commons entities. Entities can reference only Wikimedia Commons files and not Wikipedia files.

func ConvertFileDescription ¶

func ConvertFileDescription(namespace uuid.UUID, id, from, html string, doc *peerdb.Document) errors.E

func ConvertPageInCategories ¶

func ConvertPageInCategories(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, page AllPagesPage, doc *peerdb.Document) errors.E

TODO: How to remove categories which have previously been added but are later removed?

func ConvertPageRedirects ¶

func ConvertPageRedirects(logger zerolog.Logger, namespace uuid.UUID, id string, page AllPagesPage, doc *peerdb.Document) errors.E

TODO: How to remove redirects which have previously been added but are later removed?

func ConvertPageUsedTemplates ¶

func ConvertPageUsedTemplates(logger zerolog.Logger, namespace uuid.UUID, mnemonicPrefix, id string, page AllPagesPage, doc *peerdb.Document) errors.E

TODO: How to remove templates which have previously been added but are later removed?

func ConvertTemplateDescription ¶

func ConvertTemplateDescription(id, from string, html string, doc *peerdb.Document) errors.E

func ConvertWikimediaCommonsImage ¶

func ConvertWikimediaCommonsImage(
	ctx context.Context, logger zerolog.Logger, httpClient *retryablehttp.Client, token string, apiLimit int, image Image,
) (*peerdb.Document, errors.E)

func ConvertWikipediaArticle ¶

func ConvertWikipediaArticle(id, html string, doc *peerdb.Document) errors.E

TODO: Store the revision, license, and source used for the HTML into a meta claim.
TODO: Investigate how to make use of additional entities metadata. See: https://www.mediawiki.org/wiki/Topic:Wotwu75akwx2wnsb
TODO: Make internal links to other articles work in HTML (link to PeerDB documents instead).
TODO: Remove links to other articles which do not exist, if there are any.
TODO: Clean custom tags and attributes used in HTML to add metadata into HTML, potentially extract and store that. See: https://www.mediawiki.org/wiki/Specs/HTML/2.4.0
TODO: Remove some templates (e.g., infobox, top-level notices) and convert them to claims.
TODO: Extract all links pointing out of the article into claims and reverse claims (so if they point to other documents, they should have a backlink as a claim).

func ConvertWikipediaImage ¶

func ConvertWikipediaImage(
	ctx context.Context, logger zerolog.Logger, httpClient *retryablehttp.Client, token string, apiLimit int, image Image,
) (*peerdb.Document, errors.E)

func ExtractArticle ¶

func ExtractArticle(input string) (string, *goquery.Document, errors.E)

func ExtractArticleSummary ¶

func ExtractArticleSummary(doc *goquery.Document) (string, errors.E)

ExtractArticleSummary should be called on the output of ExtractArticle.

func ExtractCategoryDescription ¶

func ExtractCategoryDescription(input string) (string, errors.E)

func ExtractFileDescriptions ¶

func ExtractFileDescriptions(input string) ([]string, errors.E)

func ExtractTemplateDescription ¶

func ExtractTemplateDescription(input string) (string, errors.E)

func FirstUpperCase ¶

func FirstUpperCase(str string) string

The implementation changes the case of ASCII characters only. Using unicode.ToUpper sometimes changes the case of characters for which Mediawiki does not change it. If we do change case when Mediawiki does not, a corresponding file is not found. On the other hand, if we do not change case when Mediawiki does, then the API returns a "normalized" field which fails JSON decoding, so we detect such cases if and when they happen. See: https://phabricator.wikimedia.org/T301758

func GetMediawikiFilePrefix ¶

func GetMediawikiFilePrefix(filename string) string

func GetPageHTML ¶

func GetPageHTML(ctx context.Context, httpClient *retryablehttp.Client, site, title string) (string, errors.E)

func GetWikidataDocumentID ¶

func GetWikidataDocumentID(id string) identifier.Identifier

func GetWikidataItem ¶

func GetWikidataItem(ctx context.Context, index string, esClient *elastic.Client, id string) (*peerdb.Document, *elastic.SearchHit, errors.E)

func GetWikimediaCommonsFile ¶

func GetWikimediaCommonsFile(ctx context.Context, index string, esClient *elastic.Client, name string) (*peerdb.Document, *elastic.SearchHit, errors.E)

func GetWikipediaFile ¶

func GetWikipediaFile(ctx context.Context, index string, esClient *elastic.Client, name string) (*peerdb.Document, *elastic.SearchHit, errors.E)

func ListAllPages ¶

func ListAllPages(
	ctx context.Context, httpClient *retryablehttp.Client, namespaces []int, site string, limiter *rate.Limiter, output chan<- AllPagesPage,
) errors.E

func SetPageID ¶

func SetPageID(namespace uuid.UUID, mnemonicPrefix string, id string, pageID int64, doc *peerdb.Document) errors.E

func UpdateEmbeddedDocuments ¶

func UpdateEmbeddedDocuments(
	ctx context.Context, index string, logger zerolog.Logger, esClient *elastic.Client, cache *es.Cache,
	skippedWikidataEntities *sync.Map, skippedWikimediaCommonsFiles *sync.Map, doc *peerdb.Document,
) (bool, errors.E)

Types ¶

type AllPagesPage ¶

// AllPagesPage describes one page in its JSON representation. It is the
// element type of the output channel of ListAllPages and the page argument
// of the ConvertPage* functions.
type AllPagesPage struct {
	// Identifier is stored under the "pageid" JSON key.
	Identifier int64             `json:"pageid"`
	// Namespace is stored under the "ns" JSON key.
	Namespace  int               `json:"ns"`
	Title      string            `json:"title"`
	// Properties is stored under the "pageprops" JSON key.
	Properties map[string]string `json:"pageprops"`
	// Categories, Templates, and Redirects are omitted from encoded JSON
	// when empty.
	Categories []PageReference   `json:"categories,omitempty"`
	Templates  []PageReference   `json:"templates,omitempty"`
	Redirects  []PageReference   `json:"redirects,omitempty"`
}

type Image ¶

// Image describes one image record whose JSON keys carry the "img_" prefix.
// It is the input of ConvertWikipediaImage and ConvertWikimediaCommonsImage.
//
// The type has a custom UnmarshalJSON method, so the struct tags alone do
// not fully describe how it is decoded.
type Image struct {
	Name          string                 `json:"img_name"`
	Size          int64                  `json:"img_size"`
	Width         int64                  `json:"img_width"`
	Height        int64                  `json:"img_height"`
	// Metadata is excluded from tag-based JSON handling ("-").
	// NOTE(review): presumably populated by UnmarshalJSON — confirm.
	Metadata      map[string]interface{} `json:"-"`
	Bits          int64                  `json:"img_bits"`
	MediaType     string                 `json:"img_media_type"`
	MajorMIME     string                 `json:"img_major_mime"`
	MinorMIME     string                 `json:"img_minor_mime"`
	DescriptionID int64                  `json:"img_description_id"`
	ActorID       int64                  `json:"img_actor"`
	// Timestamp is excluded from tag-based JSON handling ("-").
	// NOTE(review): presumably populated by UnmarshalJSON — confirm.
	Timestamp     time.Time              `json:"-"`
	SHA1          string                 `json:"img_sha1"`
}

func (*Image) UnmarshalJSON ¶

func (i *Image) UnmarshalJSON(b []byte) error

type ImageInfo ¶

// ImageInfo is the value returned by GetImageInfo. All fields except
// Redirect map to lowercase JSON keys of the same name.
type ImageInfo struct {
	Mime                string  `json:"mime"`
	Size                int     `json:"size"`
	Width               int     `json:"width"`
	Height              int     `json:"height"`
	PageCount           int     `json:"pagecount"`
	// NOTE(review): the unit of Duration is not visible here — confirm.
	Duration            float64 `json:"duration"`
	URL                 string  `json:"url"`
	DescriptionURL      string  `json:"descriptionurl"`
	DescriptionShortURL string  `json:"descriptionshorturl"`
	// Set if the requested page redirected to another page and info is from that other page.
	// Excluded from JSON ("-").
	Redirect string `json:"-"`
}

func GetImageInfo ¶

func GetImageInfo(ctx context.Context, httpClient *retryablehttp.Client, site, token string, apiLimit int, title string) (ImageInfo, errors.E)

type PageReference ¶

// PageReference identifies a page. It is the element type of the
// Categories, Templates, and Redirects fields of AllPagesPage.
type PageReference struct {
	// Identifier is stored under the "pageid" JSON key and is omitted from
	// encoded JSON when zero.
	Identifier int64  `json:"pageid,omitempty"`
	// Namespace is stored under the "ns" JSON key.
	Namespace  int    `json:"ns"`
	Title      string `json:"title"`
}

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL