slo

package
v0.12.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 17, 2023 License: Apache-2.0 Imports: 46 Imported by: 0

Documentation

Overview

Reference : https://github.dev/slok/sloth/tree/main/internal/alert/window.go

Index

Constants

View Source
const (

	// alert rule names
	RecordingRuleSuffix = "-recording"
	MetadataRuleSuffix  = "-metadata"
	AlertRuleSuffix     = "-alerts"
)

Variables

View Source
var EnabledFilters = map[string]embed.FS{"metricgroups": MetricGroups, "servicegroups": ServiceGroups}

EnabledFilters maps each filter directory name to its embed.FS.

View Source
var MetricGroups embed.FS
View Source
var ServiceGroups embed.FS

Functions

func ApplyFiltersToCortexEvents added in v0.6.0

func ApplyFiltersToCortexEvents(seriesInfo *cortexadmin.SeriesInfoList) (*sloapi.MetricGroupList, error)

func DetectActiveWindows added in v0.6.0

func DetectActiveWindows(severity string, matrix *prommodel.Matrix) ([]*sloapi.AlertFiringWindows, error)

DetectActiveWindows

Warning: this function expects the timestamps to be ordered when traversing matrix → sample streams → values, but this may not always be the case.

func LeftJoinSlice added in v0.6.0

func LeftJoinSlice[T comparable](arr1, arr2 []T) []T

func LeftJoinSliceAbstract added in v0.6.0

func LeftJoinSliceAbstract[T any, S comparable](arr1, arr2 []T, getId func(T) S) []T

func MergeLabels

func MergeLabels(ms ...map[string]string) map[string]string

func MergeRuleGroups added in v0.6.0

func MergeRuleGroups(left rulefmt.RuleGroup, right *rulefmt.RuleGroup) *rulefmt.RuleGroup

func NewWindowRange added in v0.6.0

func NewWindowRange(sloPeriod string) []string

func QuerySLOComponentByRawQuery added in v0.6.0

func QuerySLOComponentByRawQuery(
	ctx context.Context,
	client cortexadmin.CortexAdminClient,
	rawQuery string,
	clusterId string,
) (*model.Vector, error)

func QuerySLOComponentByRawQueryRange added in v0.6.0

func QuerySLOComponentByRawQueryRange(
	ctx context.Context,
	client cortexadmin.CortexAdminClient,
	rawQuery string,
	clusterId string,
	start time.Time,
	end time.Time,
	step time.Duration,
) (*model.Matrix, error)

func QuerySLOComponentByRecordName added in v0.6.0

func QuerySLOComponentByRecordName(
	ctx context.Context,
	client cortexadmin.CortexAdminClient,
	recordName string,
	clusterId string,
) (*model.Vector, error)

func RegisterDatasource

func RegisterDatasource(datasource string, sloImpl SLOStore, serviceImpl ServiceBackend)

func Scheme

func Scheme(ctx context.Context) meta.Scheme

func TimeDurationToPromStr added in v0.6.0

func TimeDurationToPromStr(t time.Duration) string

Converts a time.Duration into a simple duration string for Prometheus.

func ToMatchingSubsetIdenticalMetric added in v0.6.0

func ToMatchingSubsetIdenticalMetric(goodEvents, totalEvents []*sloapi.Event) (good, total []*sloapi.Event)

ToMatchingSubsetIdenticalMetric only applies when the good metric ID and the total metric ID are the same.

Types

type Filter added in v0.6.0

type Filter struct {
	Name    string        `yaml:"name"`
	Filters []FilterValue `yaml:"filters"`
	Ignore  []FilterValue `yaml:"ignore"`
}

func GetGroupConfigsFromEmbed added in v0.6.0

func GetGroupConfigsFromEmbed(lg *slog.Logger, dirName string, dir embed.FS) []Filter

type FilterValue added in v0.6.0

type FilterValue struct {
	Value Regexp `yaml:"value"`
	Score int    `yaml:"score"`
}

type IdentificationLabels added in v0.6.0

type IdentificationLabels map[string]string

type LabelPair added in v0.6.0

type LabelPair struct {
	Key  string
	Vals []string
}

type LabelPairs added in v0.6.0

type LabelPairs []LabelPair

func (LabelPairs) Construct added in v0.6.0

func (l LabelPairs) Construct() string

type Metric added in v0.6.0

type Metric string

type MetricIds

type MetricIds struct {
	Good  string
	Total string
}

type MonitoringServiceBackend

type MonitoringServiceBackend struct {
	RequestBase
}

func (MonitoringServiceBackend) ListEvents added in v0.6.0

func (m MonitoringServiceBackend) ListEvents() (*sloapi.EventList, error)

func (MonitoringServiceBackend) ListMetrics added in v0.6.0

func (MonitoringServiceBackend) ListServices added in v0.6.0

func (m MonitoringServiceBackend) ListServices() (*sloapi.ServiceList, error)

func (*MonitoringServiceBackend) WithCurrentRequest

func (m *MonitoringServiceBackend) WithCurrentRequest(ctx context.Context, req proto.Message) ServiceBackend

type Plugin

type Plugin struct {
	slo.UnsafeSLOServer
	system.UnimplementedSystemPluginClient
	// contains filtered or unexported fields
}

func NewPlugin

func NewPlugin(ctx context.Context) *Plugin

func (*Plugin) CloneSLO

func (p *Plugin) CloneSLO(ctx context.Context, ref *corev1.Reference) (*sloapi.SLOData, error)

func (*Plugin) CloneToClusters added in v0.6.0

func (p *Plugin) CloneToClusters(ctx context.Context, req *sloapi.MultiClusterSLO) (*sloapi.MultiClusterFailures, error)

func (*Plugin) CreateSLO

func (p *Plugin) CreateSLO(ctx context.Context, slorequest *sloapi.CreateSLORequest) (*corev1.Reference, error)

func (*Plugin) DeleteSLO

func (p *Plugin) DeleteSLO(ctx context.Context, req *corev1.Reference) (*emptypb.Empty, error)

func (*Plugin) GetSLO

func (p *Plugin) GetSLO(ctx context.Context, ref *corev1.Reference) (*sloapi.SLOData, error)

func (*Plugin) ListEvents added in v0.6.0

func (p *Plugin) ListEvents(ctx context.Context, req *sloapi.ListEventsRequest) (*sloapi.EventList, error)

func (*Plugin) ListMetrics

func (*Plugin) ListSLOs

func (*Plugin) ListServices

func (p *Plugin) ListServices(ctx context.Context, req *sloapi.ListServicesRequest) (*sloapi.ServiceList, error)

func (*Plugin) Preview added in v0.6.0

func (*Plugin) Status

func (p *Plugin) Status(ctx context.Context, ref *corev1.Reference) (*sloapi.SLOStatus, error)

func (*Plugin) UpdateSLO

func (p *Plugin) UpdateSLO(ctx context.Context, req *sloapi.SLOData) (*emptypb.Empty, error)

func (*Plugin) UseAPIExtensions

func (p *Plugin) UseAPIExtensions(intf system.ExtensionClientInterface)

func (*Plugin) UseKeyValueStore

func (p *Plugin) UseKeyValueStore(client system.KeyValueStoreClient)

func (*Plugin) UseManagementAPI

func (p *Plugin) UseManagementAPI(client managementv1.ManagementClient)

type Regexp added in v0.6.0

type Regexp struct {
	*regexp.Regexp
}

Regexp adds unmarshalling from json for regexp.Regexp

func (*Regexp) MarshalText added in v0.6.0

func (r *Regexp) MarshalText() ([]byte, error)

MarshalText marshals regexp.Regexp as string

func (*Regexp) UnmarshalText added in v0.6.0

func (r *Regexp) UnmarshalText(b []byte) error

UnmarshalText unmarshals json into a regexp.Regexp

type RequestBase

type RequestBase struct {
	// contains filtered or unexported fields
}

type SLO added in v0.6.0

type SLO struct {
	// contains filtered or unexported fields
}

func CreateSLORequestToStruct added in v0.6.0

func CreateSLORequestToStruct(c *sloapi.CreateSLORequest) *SLO

func NewSLO added in v0.6.0

func NewSLO(
	sloName string,
	sloPeriod string,
	objective float64,
	svc Service,
	goodMetric Metric,
	totalMetric Metric,
	userLabels map[string]string,
	goodEvents []LabelPair,
	totalEvents []LabelPair,
) *SLO

func SLODataToStruct added in v0.6.0

func SLODataToStruct(s *sloapi.SLOData) *SLO

func SLOFromId added in v0.6.0

func SLOFromId(
	sloName string,
	sloPeriod string,
	objective float64,
	svc Service,
	goodMetric Metric,
	totalMetric Metric,
	userLabels map[string]string,
	goodEvents []LabelPair,
	totalEvents []LabelPair,
	id string,
) *SLO

func (*SLO) AlertPageThreshold added in v0.6.0

func (s *SLO) AlertPageThreshold() float64

func (*SLO) ConstructAlertingRuleGroup added in v0.6.0

func (s *SLO) ConstructAlertingRuleGroup(interval *time.Duration) rulefmt.RuleGroup

ConstructAlertingRuleGroup

Note: the first two rules are expected to be the recording rules.

Note: the second two rules are expected to be the alerting rules.

func (*SLO) ConstructCortexRules added in v0.6.0

func (s *SLO) ConstructCortexRules(interval *time.Duration) (sli, metadata, alerts rulefmt.RuleGroup)

func (*SLO) ConstructMetadataRules added in v0.6.0

func (s *SLO) ConstructMetadataRules(interval *time.Duration) rulefmt.RuleGroup

func (*SLO) ConstructRawAlertQueries added in v0.6.0

func (s *SLO) ConstructRawAlertQueries() (yaml.Node, yaml.Node)

func (*SLO) ConstructRecordingRuleGroup added in v0.6.0

func (s *SLO) ConstructRecordingRuleGroup(interval *time.Duration) rulefmt.RuleGroup

func (*SLO) GetId added in v0.6.0

func (s *SLO) GetId() string

func (*SLO) GetName added in v0.6.0

func (s *SLO) GetName() string

func (*SLO) GetObjective added in v0.6.0

func (s *SLO) GetObjective() float64

func (*SLO) GetPeriod added in v0.6.0

func (s *SLO) GetPeriod() string

func (*SLO) GetPrometheusRuleFilterByIdLabels added in v0.6.0

func (s *SLO) GetPrometheusRuleFilterByIdLabels() (string, error)

func (*SLO) RawBudgetRemainingQuery added in v0.6.0

func (s *SLO) RawBudgetRemainingQuery() string

func (*SLO) RawCurrentBurnRateQuery added in v0.6.0

func (s *SLO) RawCurrentBurnRateQuery() string

RawCurrentBurnRateQuery: the ratioRate metric has the form slo:sli_error:ratio_rate<some-period-string>.

func (*SLO) RawDashboardInfoQuery added in v0.6.0

func (s *SLO) RawDashboardInfoQuery() string

func (*SLO) RawErrorBudgetQuery added in v0.6.0

func (s *SLO) RawErrorBudgetQuery() string

func (*SLO) RawGoodEventsQuery added in v0.6.0

func (s *SLO) RawGoodEventsQuery(w string) (string, error)

func (*SLO) RawObjectiveQuery added in v0.6.0

func (s *SLO) RawObjectiveQuery() string

func (*SLO) RawPeriodBurnRateQuery added in v0.6.0

func (s *SLO) RawPeriodBurnRateQuery() string

func (*SLO) RawPeriodDurationQuery added in v0.6.0

func (s *SLO) RawPeriodDurationQuery() string

func (*SLO) RawSLIQuery added in v0.6.0

func (s *SLO) RawSLIQuery(w string) (string, error)

func (*SLO) RawTotalEventsQuery added in v0.6.0

func (s *SLO) RawTotalEventsQuery(w string) (string, error)

func (*SLO) SetId added in v0.6.0

func (s *SLO) SetId(id string)

func (*SLO) SetName added in v0.6.0

func (s *SLO) SetName(input string)

type SLOLogging

type SLOLogging struct {
	RequestBase
}

type SLOMonitoring

type SLOMonitoring struct {
	RequestBase
}

func (SLOMonitoring) Clone

func (SLOMonitoring) Create

func (s SLOMonitoring) Create() (*corev1.Reference, error)

func (SLOMonitoring) Delete

func (s SLOMonitoring) Delete(existing *sloapi.SLOData) error

func (SLOMonitoring) MultiClusterClone added in v0.6.0

func (s SLOMonitoring) MultiClusterClone(
	base *sloapi.SLOData,
	inputClusters []*corev1.Reference,
	svcBackend ServiceBackend,
) ([]*corev1.Reference, []*sloapi.SLOData, []error)

func (SLOMonitoring) Preview added in v0.6.0

func (s SLOMonitoring) Preview(slo *SLO) (*sloapi.SLOPreviewResponse, error)

func (SLOMonitoring) Status

func (s SLOMonitoring) Status(existing *sloapi.SLOData) (*sloapi.SLOStatus, error)

Status: only return errors here that should be considered severe InternalServerErrors. The checks are:

- Check whether enough time has passed to evaluate the rules.
- First, check whether the SLO has NoData.
- If it has data, check whether it is within budget.
- If it is within budget, check whether any alerts are firing.

func (SLOMonitoring) Update

func (s SLOMonitoring) Update(existing *sloapi.SLOData) (*sloapi.SLOData, error)

func (*SLOMonitoring) WithCurrentRequest

func (s *SLOMonitoring) WithCurrentRequest(ctx context.Context, req proto.Message) SLOStore

type SLOStore

type SLOStore interface {
	// This method has to handle storage of the SLO in the KVStore itself
	// since there can be partial successes inside the method
	Create() (*corev1.Reference, error)
	Update(existing *sloapi.SLOData) (*sloapi.SLOData, error)
	Delete(existing *sloapi.SLOData) error
	Clone(clone *sloapi.SLOData) (*corev1.Reference, *sloapi.SLOData, error)
	MultiClusterClone(
		base *sloapi.SLOData,
		clusters []*corev1.Reference,
		svcBackend ServiceBackend,
	) ([]*corev1.Reference, []*sloapi.SLOData, []error)
	Status(existing *sloapi.SLOData) (*sloapi.SLOStatus, error)
	Preview(s *SLO) (*sloapi.SLOPreviewResponse, error)
	WithCurrentRequest(ctx context.Context, req proto.Message) SLOStore
}

func NewSLOMonitoringStore

func NewSLOMonitoringStore(p *Plugin, lg *slog.Logger) SLOStore

type Service added in v0.6.0

type Service string

type ServiceBackend

type ServiceBackend interface {
	ListServices() (*sloapi.ServiceList, error)
	ListMetrics() (*sloapi.MetricGroupList, error)
	ListEvents() (*sloapi.EventList, error)
	WithCurrentRequest(ctx context.Context, req proto.Message) ServiceBackend
}

func NewMonitoringServiceBackend

func NewMonitoringServiceBackend(p *Plugin, lg *slog.Logger) ServiceBackend

type SliQueryInfo added in v0.6.0

type SliQueryInfo struct {
	GoodQuery  string
	TotalQuery string
}

SliQueryInfo used for filling sli query templates

type SloFiltersInfo added in v0.6.0

type SloFiltersInfo struct {
	SloIdLabel      string
	SloServiceLabel string
	SloNameLabel    string
	SloId           string
	SloService      string
	SloName         string
}

type StorageAPIs

type StorageAPIs struct {
	SLOs     storage.KeyValueStoreT[*slo.SLOData]
	Services storage.KeyValueStoreT[*slo.Service]
	Metrics  storage.KeyValueStoreT[*slo.Metric]
}

type UserLabels added in v0.6.0

type UserLabels []string

type Window

type Window struct {
	// ErrorBudgetPercent is the error budget % consumed for a full time window.
	// Google gives us some defaults in its SRE workbook that work correctly most of the time:
	// - Page quick:   2%
	// - Page slow:    5%
	// - Ticket quick: 10%
	// - Ticket slow:  10%
	ErrorBudgetPercent float64
	// ShortWindow is the small window used on the alerting part to stop alerting
	// during a long window because we consumed a lot of error budget but the problem
	// is already gone.
	ShortWindow time.Duration
	// LongWindow is the long window used to alert based on the errors that happened in that
	// long window.
	LongWindow time.Duration
}

func (Window) Validate

func (w Window) Validate() error

type Windows

type Windows struct {
	SLOPeriod   time.Duration
	PageQuick   Window
	PageSlow    Window
	TicketQuick Window
	TicketSlow  Window
}

func GenerateGoogleWindows

func GenerateGoogleWindows(budgetingInterval time.Duration) *Windows

https://sre.google/workbook/alerting-on-slos/

budgeting interval is the shortest interval to monitor in a window

func (Windows) GetBurnRateFactor

func (w Windows) GetBurnRateFactor(totalWindow time.Duration, errorBudgetPercent float64, consumptionWindow time.Duration) float64

GetBurnRateFactor calculates the burn rate factor (speed) needed to consume the given percentage of the available error budget within a specific time window, taking into account the total time window.

func (Windows) GetSpeedPageQuick

func (w Windows) GetSpeedPageQuick() float64

Error budget speeds are based on a full time window; however, once we have the factor (speed), the value can be used with any time window.

func (Windows) GetSpeedPageSlow

func (w Windows) GetSpeedPageSlow() float64

func (Windows) GetSpeedTicketQuick

func (w Windows) GetSpeedTicketQuick() float64

func (Windows) GetSpeedTicketSlow

func (w Windows) GetSpeedTicketSlow() float64

func (Windows) Validate

func (w Windows) Validate() error

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL