db

package
v0.1.7 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 30, 2024 License: MIT Imports: 10 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func GetValue

func GetValue(kv *badger.DB, key string) int

func GetValues

func GetValues(kv *badger.DB, keys []string) (map[string][]byte, error)

func InitKeyVal

func InitKeyVal(dir string) *badger.DB

InitKeyVal finds and initializes a connection to a badger key-value store. If the store does not exist, InitKeyVal creates it.

func NewDB

func NewDB(cnf config.Config) *sql.DB

func NewDbGorm

func NewDbGorm(cnf config.Config) *gorm.DB

func QuoteString

func QuoteString(s string) string

QuoteString makes a string value compatible with SQL syntax by wrapping it in quotes and escaping internal quotes.

func ResetKeyVal

func ResetKeyVal(dir string) error

func RunQuery

func RunQuery(d *sql.DB, q string) *sql.Rows

func Truncate

func Truncate(d *sql.DB, tables []string) error

Types

type ColBhlRefs

// ColBhlRefs connects a Catalogue of Life record with a location in BHL
// (item, part, page) and stores the odds and quality of that connection.
type ColBhlRefs struct {
	// RecordID is the Catalogue of Life identifier of a name-string.
	RecordID string `gorm:"type:varchar(100);index:record_id_bhl"`

	// ItemID is an automatically generated identifier from the BHL database.
	// It corresponds to the ID field in Item.
	ItemID uint

	// PartID is an automatically generated identifier from the BHL database.
	PartID uint

	// PageID is the identifier autogenerated by the BHL database.
	PageID uint

	// Odds is calculated by a Naive Bayes algorithm. Odds of 0.01 and higher
	// are considered.
	// This field holds the Odds of the best result.
	Odds float64

	// Quality is a category that reflects the probability that a reference
	// is 'real':
	// 0 - nothing is found
	// 1 - 15% (Odds > 0.01)
	// 2 - 50% (Odds > 0.1)
	// 3 - 80% (Odds > 1)
	// 4 - 98% (Odds > 10)
	Quality int
}

type ColNomenRef

// ColNomenRef holds a Catalogue of Life name with its nomenclatural
// reference, its classification, and the best BHL location (item, part, page)
// found for that reference.
type ColNomenRef struct {
	// ID is automatically generated. Together with RecordID it is tagged as
	// a primary key.
	ID uint `gorm:"primary_key"`

	// RecordID is the Catalogue of Life identifier of a name-string.
	RecordID string `gorm:"type:varchar(100);primary_key;auto_increment:false"`

	// Name is the verbatim name-string from the CoL.
	Name string `sql:"type:CHARACTER VARYING(255) COLLATE \"C\" NOT NULL"`

	// Ref is a nomenclatural reference from the Catalogue of Life.
	Ref string

	// Kingdom is the kingdom name of the record.
	Kingdom string `gorm:"type:varchar(100)"`

	// Phylum is the phylum name of the record.
	Phylum string `gorm:"type:varchar(100)"`

	// Class is the class name of the record.
	Class string `gorm:"type:varchar(100)"`

	// Ordr is the order name of the record.
	Ordr string `gorm:"type:varchar(100)"`

	// Family is the family name of the record.
	Family string `gorm:"type:varchar(100);index"`

	// Genus is the genus name of the record.
	Genus string `gorm:"type:varchar(100);index"`

	// CanonicalSimple is a canonical form without hybrid signs, ranks etc.
	CanonicalSimple string `gorm:"type:varchar(255);index:canonical_simple" sql:"type:CHARACTER VARYING(255) COLLATE \"C\" NOT NULL"`

	// CanonicalStem is a canonical form after removal of suffixes and
	// substitution of some characters.
	CanonicalStem string `gorm:"type:varchar(255);index:canonical_stem" sql:"type:CHARACTER VARYING(255) COLLATE \"C\" NOT NULL"`

	// ItemID is an automatically generated identifier from the BHL database.
	// It corresponds to the ID field in Item.
	ItemID uint

	// PartID is an automatically generated identifier from the BHL database.
	PartID uint

	// PageID is the identifier autogenerated by the BHL database.
	PageID uint

	// RefsNum is the number of found links to potential nomenclatural
	// references.
	RefsNum uint

	// Odds is calculated by a Naive Bayes algorithm. Odds of 0.01 and higher
	// are considered.
	// This field holds the Odds of the best result.
	Odds float64

	// Quality is a category that reflects the probability that a reference
	// is 'real':
	// 0 - nothing is found
	// 1 - 15% (Odds > 0.01)
	// 2 - 50% (Odds > 0.1)
	// 3 - 80% (Odds > 1)
	// 4 - 98% (Odds > 10)
	Quality int `gorm:"index"`
}

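ColNomenRef contains a Catalogue of Life name-string with its nomenclatural reference and classification, together with the best-matching BHL item, part and page found for that reference.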

type Item

// Item describes a BHL item (for example a journal volume or a book) together
// with metadata of its parent title.
type Item struct {
	// ID is the identifier autogenerated by the BHL database.
	ID uint `gorm:"primary_key;auto_increment:false"`

	// BarCode is the identifier generated by Internet Archive for the Item.
	BarCode string `gorm:"type:varchar(100);unique_index;not null"`

	// Vol contains the non-normalized volume field from the BHL database.
	Vol string `gorm:"type:varchar(100)"`

	// YearStart contains the earliest year of publication. For a journal
	// volume it would be the publication of the first journal issue, for a
	// book it would be the date of publication.
	YearStart sql.NullInt32

	// YearEnd contains the latest year of publication. The field is often
	// empty, if the Item was published at once.
	YearEnd sql.NullInt32

	// TitleID contains the automatically generated id of the parent title of
	// the item.
	TitleID uint `gorm:"not null"`

	// TitleDOI is the DOI of an item.
	// NOTE(review): the field name suggests this is the DOI of the parent
	// title rather than of the item itself — confirm.
	TitleDOI string `gorm:"type:varchar(100)"`

	// TitleName is the name of a journal or a book.
	TitleName string `gorm:"type:varchar(255)"`

	// TitleAbbr1 is an acronym of a title where the first letter of each
	// word is used.
	TitleAbbr1 string `gorm:"type:varchar(10)"`

	// TitleAbbr2 is an acronym of a title where 'common' words like 'and',
	// 'the' etc. are omitted.
	TitleAbbr2 string `gorm:"type:varchar(10)"`

	// TitleYearStart is the first year when a title was published.
	TitleYearStart sql.NullInt32

	// TitleYearEnd is the last year when a title was published.
	TitleYearEnd sql.NullInt32

	// TitleLang is the most prevalent language of a title.
	TitleLang string `gorm:"type:varchar(20)"`
}

Item is a physical entity digitized and aggregated by Internet Archive and BHL. It can be a volume of a journal, a book etc.

type ItemStats

// ItemStats holds taxonomic statistics of a BHL item: the total number of
// names, per-kingdom counts, and the prevalent taxon at several ranks.
type ItemStats struct {
	// ID is the Item identifier autogenerated by the BHL database.
	ID uint `gorm:"primary_key;auto_increment:false"`

	// NamesTotal is the number of unique names in the item (rank genus and
	// lower) verified by the Catalogue of Life and used in statistics
	// calculations.
	NamesTotal uint `gorm:"not null"`

	// MainTaxon is the taxon containing more than 50% of all taxa in the
	// item.
	MainTaxon string `gorm:"type:varchar(100)"`

	// MainTaxonRank is the rank of the MainTaxon.
	MainTaxonRank string `gorm:"type:varchar(100)"`

	// MainTaxonPercent is the percentage of taxa belonging to the MainTaxon.
	// NOTE(review): this uint field is tagged as varchar(100), unlike the
	// other *Percent fields, which have no type tag. It looks like a
	// copy-paste leftover — confirm before changing, because the tag
	// determines the column type.
	MainTaxonPercent uint `gorm:"type:varchar(100)"`

	// MainKingdom is the kingdom that contains most of the taxa in the item.
	MainKingdom string `gorm:"type:varchar(100)"`

	// MainKingdomPercent is the percentage of taxa associated with the
	// MainKingdom.
	MainKingdomPercent uint

	// AnimaliaNum is the number of unique names in the item associated with
	// Animalia by the Catalogue of Life.
	AnimaliaNum uint `gorm:"not null"`

	// PlantaeNum is the number of unique names in the item associated with
	// Plantae by the Catalogue of Life.
	PlantaeNum uint `gorm:"not null"`

	// FungiNum is the number of unique names in the item associated with Fungi
	// by the Catalogue of Life.
	FungiNum uint `gorm:"not null"`

	// BacteriaNum is the number of unique names in the item associated with
	// Bacteria by the Catalogue of Life.
	BacteriaNum uint `gorm:"not null"`

	// MainPhylum is the phylum that contains most of the taxa in the item.
	MainPhylum string `gorm:"type:varchar(100)"`

	// MainPhylumPercent is the percentage of taxa associated with the
	// MainPhylum.
	MainPhylumPercent uint

	// MainClass is the class that contains most of the taxa in the item.
	MainClass string `gorm:"type:varchar(100)"`

	// MainClassPercent is the percentage of taxa associated with the
	// MainClass.
	MainClassPercent uint

	// MainOrder is the order that contains most of the taxa in the item.
	MainOrder string `gorm:"type:varchar(100)"`

	// MainOrderPercent is the percentage of taxa associated with the
	// MainOrder.
	MainOrderPercent uint

	// MainFamily is the family that contains most of the taxa in the item.
	MainFamily string `gorm:"type:varchar(100)"`

	// MainFamilyPercent is the percentage of taxa associated with the
	// MainFamily.
	MainFamilyPercent uint

	// MainGenus is the genus that contains most of the taxa in the item.
	MainGenus string `gorm:"type:varchar(100)"`

	// MainGenusPercent is the percentage of taxa associated with the
	// MainGenus.
	MainGenusPercent uint
}

ItemStats contains taxonomical statistics for items.

type NameOccurrence

// NameOccurrence records a single detection of a name-string on a BHL page:
// which name, which page, where on the page, and with what odds.
type NameOccurrence struct {
	// NameStringID corresponds to the ID field in NameString.
	// It is a UUID v5 generated from the normalized version of
	// a detected name.
	NameStringID string `sql:"type:uuid;index:name_string"`

	// PageID corresponds to the ID field in Page. It is a number
	// automatically generated by the BHL database.
	PageID uint

	// OffsetStart is the starting position of a detected name on the page.
	// It is calculated using UTF-8 characters.
	OffsetStart uint

	// OffsetEnd is the ending position of a detected name on the page.
	// It is calculated using UTF-8 characters.
	OffsetEnd uint

	// OddsLog10 is the base-10 logarithm of the odds that a detected string
	// is actually a scientific name according to a Naive Bayes algorithm.
	OddsLog10 float64

	// AnnotNomen is a normalized nomenclatural annotation detected in the
	// vicinity of the occurrence. Examples of annotations are `NO_ANNOT`,
	// `SP_NOV` etc.
	AnnotNomen string `gorm:"type:varchar(50);index:annot"`
}

NameOccurrence is the occurrence of a name-string in BHL.

type NameString

// NameString holds a normalized name-string detected in BHL together with the
// result of its verification against the Catalogue of Life.
type NameString struct {
	// ID is a UUID v5 generated from the Name field. There is always a
	// 1:1 relationship between Name and ID.
	ID string `gorm:"type:uuid;primary_key"`

	// Name is the normalized version of a name detected in BHL.
	Name string `sql:"type:CHARACTER VARYING(255) COLLATE \"C\" NOT NULL"`

	// RecordID is the Catalogue of Life identifier of a matched taxon.
	RecordID string `gorm:"type:varchar(100);index:record_id"`

	// MatchType describes the resulting kind of a name-string match.
	// The following match types are possible:
	//
	// NoMatch - GNverifier did not find a match for the name-string.
	// Exact - Canonical form of a name matched exactly.
	// PartialExact - Canonical form matched exactly after removal of some words.
	// Fuzzy - Canonical form matched, but with some differences.
	// PartialFuzzy - Canonical form matched with differences after removal of some words.
	// Virus - Name-string matched as a virus name.
	MatchType string `gorm:"type:varchar(100)"`

	// MatchSortOrder is used when verification returns not only the best
	// result, but all results. The best match always has MatchSortOrder = 0;
	// the higher the number, the lower the quality assigned to the match.
	MatchSortOrder int

	// EditDistance shows how much difference exists between the name-string
	// and a match according to the Levenshtein algorithm.
	EditDistance uint

	// StemEditDistance shows how much difference exists between the
	// name-string and a match according to the Levenshtein algorithm.
	// NOTE(review): presumably calculated on the stemmed forms, in contrast
	// to EditDistance — confirm.
	StemEditDistance uint

	// MatchedName provides the complete matched name-string.
	MatchedName string `gorm:"type:varchar(255)"`

	// MatchedCanonical provides the canonical form of the matched name-string.
	MatchedCanonical string `gorm:"type:varchar(255);index:canonical" sql:"type:CHARACTER VARYING(255) COLLATE \"C\" NOT NULL"`

	// CurrentName is the full currently accepted name of the match
	// provided by the DataSource.
	CurrentName string `gorm:"type:varchar(255)"`

	// CurrentCanonical is the canonical form of the currently accepted name
	// of the match.
	CurrentCanonical string `gorm:"type:varchar(255);index:current_canonical" sql:"type:CHARACTER VARYING(255) COLLATE \"C\" NOT NULL"`

	// Classification contains the classification of the name provided by the
	// Catalogue of Life.
	Classification string

	// ClassificationRanks provides rank information for the classification
	// path.
	ClassificationRanks string

	// ClassificationIDs provides RankIDs for the classification path.
	ClassificationIDs string

	// DataSourceId is the ID of the data-source according to GNverifier.
	// The mapping of IDs to data-sources can be found at
	// https://verifier.globalnames.org/data_sources
	// site. In this case it should always be 1.
	DataSourceId sql.NullInt32

	// DataSourceTitle provides the title of the data-source that matched the
	// name-string. In this case it should always be `The Catalogue of Life`.
	DataSourceTitle string `gorm:"type:varchar(255)"`

	// DataSourcesNumber is the number of data-sources that matched the name.
	DataSourcesNumber uint

	// Curation provides information about the level of curation according to
	// GNverifier. The following categories are supported:
	//
	// NotCurated -- None of data-sources that matched a name-string are marked as curated.
	// Curated -- Some data-sources with a match are marked as curated.
	// AutoCurated -- Some data-sources have automatic quality control, but not much human curation.
	//
	// NOTE(review): the field is a bool, which cannot represent the three
	// categories listed above — confirm how they are mapped.
	Curation bool `gorm:"index:curation"`

	// Occurences is the number of times this name appeared in BHL texts.
	// The field name keeps its historical spelling.
	Occurences uint

	// OddsLog10 is the base-10 logarithm of the odds that a detected string
	// is actually a scientific name according to a Naive Bayes algorithm.
	// NOTE(review): this is float32, while NameOccurrence.OddsLog10 is
	// float64 — confirm the difference is intended.
	OddsLog10 float32

	// Error contains the error that happened during verification. If this
	// field is empty, verification completed successfully for the
	// name-string.
	Error string `gorm:"type:varchar(255)"`
}

NameString is a unique normalized name-string that has been matched, at least partially, to the Catalogue of Life.

type Page

// Page holds metadata of a BHL page: its identifier, the item it belongs to,
// and its position and number within that item.
type Page struct {
	// ID is the identifier autogenerated by the BHL database.
	ID uint `gorm:"primary_key;auto_increment:false"`

	// ItemID is an automatically generated identifier from the BHL database.
	// It corresponds to the ID field in Item.
	ItemID uint `gorm:"index:item;not null"`

	// SequenceOrder corresponds to the ordered position of a page in an item.
	// For example, an item page that is preceded by 4 other pages should
	// have SequenceOrder 5.
	SequenceOrder uint `gorm:"not null"`

	// PageNum corresponds to the page number/label assigned by the publisher
	// of the item.
	PageNum sql.NullInt64
}

Page contains metadata about a page file from BHL archive.

type Part

// Part holds metadata of a distinct part of a BHL item (a chapter, an
// article, etc.), including its citation details and page range.
type Part struct {
	// ID is an automatically generated identifier from the BHL database.
	ID uint `gorm:"primary_key;auto_increment:false"`

	// PageID is an automatically generated identifier for a page. It comes
	// from the BHL database.
	PageID sql.NullInt32

	// ItemID is an automatically generated identifier for an item. It comes
	// from the BHL database.
	ItemID sql.NullInt32

	// Length is the length of a part in pages.
	Length sql.NullInt32

	// DOI is the DOI assigned to the part.
	DOI string `gorm:"type:varchar(100)"`

	// ContributorName is the name of a project/person which provided
	// information about a part.
	ContributorName string `gorm:"type:varchar(255)"`

	// SequenceOrder is the sequential position of a part in the item. For
	// example, the second scientific paper in a journal will have
	// the SequenceOrder 2.
	SequenceOrder sql.NullInt32

	// SegmentType describes the type of a part. For example chapter, article,
	// etc.
	SegmentType string `gorm:"type:varchar(100)"`

	// Title is the title of the part.
	Title string `gorm:"type:text"`

	// ContainerTitle is the title of a parent unit (item's title?).
	ContainerTitle string `gorm:"type:text"`

	// PublicationDetails describes information about the publisher.
	PublicationDetails string `gorm:"type:text"`

	// Volume is the volume of a citation.
	Volume string `gorm:"type:varchar(100)"`

	// Series is the series of a citation.
	Series string `gorm:"type:varchar(100)"`

	// Issue is the issue of a citation.
	Issue string `gorm:"type:varchar(100)"`

	// Date is the date of the part publication.
	Date string `gorm:"type:varchar(100)"`

	// Year is the year of a part.
	Year sql.NullInt32 `gorm:"index:year"`

	// YearEnd is the year when a part finished its publication.
	YearEnd sql.NullInt32

	// Month is the month when a part was published.
	Month sql.NullInt32

	// Day is the day when a part was published.
	Day sql.NullInt32

	// PageNumStart is the page number where a part starts.
	PageNumStart sql.NullInt32

	// PageNumEnd is the page number where a part ends.
	PageNumEnd sql.NullInt32

	// Language is the prevalent language of a part.
	Language string `gorm:"type:varchar(20)"`
}

Part is a distinct part of an item. It can be a chapter, an article, a scientific paper.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL