aurorabridge

package
v0.0.0-...-c0686e8 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 10, 2022 License: Apache-2.0 Imports: 48 Imported by: 0

Documentation

Index

Constants

View Source
// Procedure names for the implemented ServiceHandler methods. Each value is a
// lowercase prefix and method name joined by "__".
// NOTE(review): the prefix looks like the Aurora Thrift service name
// (AuroraSchedulerManager / ReadOnlyScheduler) — confirm against the
// handler registration.
const (
	// Procedure names
	ProcedureAbortJobUpdate         = "auroraschedulermanager__abortjobupdate"
	ProcedureGetConfigSummary       = "readonlyscheduler__getconfigsummary"
	ProcedureGetJobSummary          = "readonlyscheduler__getjobsummary"
	ProcedureGetJobUpdateDetails    = "readonlyscheduler__getjobupdatedetails"
	ProcedureGetJobUpdateDiff       = "readonlyscheduler__getjobupdatediff"
	ProcedureGetJobUpdateSummaries  = "readonlyscheduler__getjobupdatesummaries"
	ProcedureGetJobs                = "readonlyscheduler__getjobs"
	ProcedureGetTasksWithoutConfigs = "readonlyscheduler__gettaskswithoutconfigs"
	ProcedureGetTierConfigs         = "readonlyscheduler__gettierconfigs"
	ProcedureKillTasks              = "auroraschedulermanager__killtasks"
	ProcedurePauseJobUpdate         = "auroraschedulermanager__pausejobupdate"
	ProcedurePulseJobUpdate         = "auroraschedulermanager__pulsejobupdate"
	ProcedureResumeJobUpdate        = "auroraschedulermanager__resumejobupdate"
	ProcedureRollbackJobUpdate      = "auroraschedulermanager__rollbackjobupdate"
	ProcedureStartJobUpdate         = "auroraschedulermanager__startjobupdate"

	// Metric tag names
	TagProcedure    = "procedure"     // handler procedure name
	TagResponseCode = "responsecode"  // handler response code
	TagService      = "updateservice" // input service name

	// Metric names
	MetricNameCalls       = "calls"
	MetricNameCallLatency = "call_latency"
)

Variables

This section is empty.

Functions

This section is empty.

Types

type DefaultRespoolSpec

// DefaultRespoolSpec defines parameters used to create a default respool for
// bridge when bootstrapping a new cluster. All fields are loaded from YAML
// under the keys given in the struct tags.
type DefaultRespoolSpec struct {
	OwningTeam      string                    `yaml:"owning_team"`
	LDAPGroups      []string                  `yaml:"ldap_groups"`
	Description     string                    `yaml:"description"`
	Resources       []*respool.ResourceConfig `yaml:"resources"`
	Policy          respool.SchedulingPolicy  `yaml:"policy"`
	ControllerLimit *respool.ControllerLimit  `yaml:"controller_limit"`
	SlackLimit      *respool.SlackLimit       `yaml:"slack_limit"`
}

DefaultRespoolSpec defines parameters used to create a default respool for bridge when bootstrapping a new cluster.

type EventPublisher

// EventPublisher sets up a watch on pod state change events and publishes
// them to a Kafka stream.
type EventPublisher interface {
	// Start the watch on pod state changes and
	// publishing to kafka
	Start()

	// Stop the watch on pod state changes and
	// publishing to kafka
	Stop()
}

EventPublisher sets up a watch on pod state change events and then publishes them to a Kafka stream.

func NewEventPublisher

func NewEventPublisher(
	kafkaURL string,
	jobClient statelesssvc.JobServiceYARPCClient,
	podClient podsvc.PodServiceYARPCClient,
	watchClient watchsvc.WatchServiceYARPCClient,
	client *http.Client,
	publishEvents bool,
) EventPublisher

NewEventPublisher returns an event publisher that streams pod state changes to Kafka.

type EventPublisherConfig

// EventPublisherConfig represents config for publishing task state change
// events to Kafka.
type EventPublisherConfig struct {
	// KafkaURL represents the stream on which task state changes are published
	KafkaURL string `yaml:"kafka_url"`

	// PublishEvents defines whether to publish task state changes to kafka
	PublishEvents bool `yaml:"publish_events"`

	// GRPCMsgSize defines the max payload size that can be sent and received
	GRPCMsgSize int `yaml:"grpc_msg_size"`
}

EventPublisherConfig represents config for publishing task state change events to Kafka.

type Metrics

// Metrics is the struct containing all metrics relevant for aurora api parity.
type Metrics struct {
	// Procedures maps a string key to that procedure's metrics.
	// NOTE(review): keys are presumably the Procedure* constants — confirm
	// against NewMetrics.
	Procedures map[string]*PerProcedureMetrics
}

Metrics is the struct containing all metrics relevant for Aurora API parity.

func NewMetrics

func NewMetrics(scope tally.Scope) *Metrics

NewMetrics returns a new Metrics struct, with all metrics initialized and rooted at the given tally.Scope

type PerProcedureMetrics

// PerProcedureMetrics holds the metrics of a single procedure, broken down
// by response code.
type PerProcedureMetrics struct {
	// ResponseCodes maps each api.ResponseCode to its metrics.
	ResponseCodes map[api.ResponseCode]*PerResponseCodeMetrics
}

type PerResponseCodeMetrics

// PerResponseCodeMetrics bundles a tally scope with a call counter and a
// call latency timer.
// NOTE(review): Calls and CallLatency are presumably emitted under
// MetricNameCalls and MetricNameCallLatency within Scope — confirm against
// NewMetrics.
type PerResponseCodeMetrics struct {
	Scope       tally.Scope
	Calls       tally.Counter
	CallLatency tally.Timer
}

type RespoolLoader

// RespoolLoader lazily loads a resource pool. If the resource pool does not
// exist, it bootstraps one with provided defaults.
type RespoolLoader interface {
	// Load returns the ID of the resource pool, or an error.
	// NOTE(review): the bool parameter is unnamed; presumably it selects the
	// GPU respool (see RespoolLoaderConfig.GPURespoolPath) — confirm against
	// the implementation.
	Load(context.Context, bool) (*v1peloton.ResourcePoolID, error)
}

RespoolLoader lazily loads a resource pool. If the resource pool does not exist, it bootstraps one with the provided defaults.

func NewRespoolLoader

func NewRespoolLoader(
	config RespoolLoaderConfig,
	client respool.ResourceManagerYARPCClient,
) RespoolLoader

NewRespoolLoader creates a new RespoolLoader.

type RespoolLoaderConfig

// RespoolLoaderConfig defines RespoolLoader configuration. All fields are
// loaded from YAML under the keys given in the struct tags.
type RespoolLoaderConfig struct {
	RetryInterval      time.Duration      `yaml:"retry_interval"`
	RespoolPath        string             `yaml:"respool_path"`
	GPURespoolPath     string             `yaml:"gpu_respool_path"`
	DefaultRespoolSpec DefaultRespoolSpec `yaml:"default_respool_spec"`
}

RespoolLoaderConfig defines RespoolLoader configuration.

type Server

// Server contains all structs necessary to run an aurorabridge server. This
// struct also implements the leader.Node interface so that it can perform
// leader election among multiple aurorabridge server instances.
//
// Server embeds sync.Mutex, so it must not be copied; use *Server.
type Server struct {
	sync.Mutex

	ID string
	// contains filtered or unexported fields
}

Server contains all structs necessary to run an aurorabridge server. This struct also implements the leader.Node interface so that it can perform leader election among multiple aurorabridge server instances.

func NewServer

func NewServer(
	httpPort int,
	cfg leader.ElectionConfig,
	eventPublisher EventPublisher,
	role string) (*Server, error)

NewServer creates an aurorabridge Server instance.

func (*Server) GainedLeadershipCallback

func (s *Server) GainedLeadershipCallback() (err error)

GainedLeadershipCallback is the callback when the current node becomes the leader

func (*Server) GetID

func (s *Server) GetID() string

GetID function returns jobmgr app address. This implements leader.Nomination.

func (*Server) HasGainedLeadership

func (s *Server) HasGainedLeadership() bool

HasGainedLeadership returns true if and only if GainedLeadershipCallback has completed.

func (*Server) LostLeadershipCallback

func (s *Server) LostLeadershipCallback() error

LostLeadershipCallback is the callback when the current node lost leadership

func (*Server) ShutDownCallback

func (s *Server) ShutDownCallback() error

ShutDownCallback is the callback to shut down gracefully if possible

type ServiceHandler

// ServiceHandler implements a partial Aurora API. Various unneeded methods
// have been left intentionally unimplemented.
type ServiceHandler struct {
	// contains filtered or unexported fields
}

ServiceHandler implements a partial Aurora API. Various unneeded methods have been left intentionally unimplemented.

func NewServiceHandler

func NewServiceHandler(
	config ServiceHandlerConfig,
	parent tally.Scope,
	jobClient statelesssvc.JobServiceYARPCClient,
	jobmgrClient jobmgrsvc.JobManagerServiceYARPCClient,
	podClient podsvc.PodServiceYARPCClient,
	respoolLoader RespoolLoader,
	random common.Random,
	jobIdCache cache.JobIDCache,
) (*ServiceHandler, error)

NewServiceHandler creates a new ServiceHandler.

func (*ServiceHandler) AbortJobUpdate

func (h *ServiceHandler) AbortJobUpdate(
	ctx context.Context,
	key *api.JobUpdateKey,
	message *string,
) (*api.Response, error)

AbortJobUpdate permanently aborts the job update. Does not remove the update history.

func (*ServiceHandler) AddInstances

func (h *ServiceHandler) AddInstances(
	ctx context.Context,
	key *api.InstanceKey,
	count *int32) (*api.Response, error)

AddInstances will remain unimplemented.

func (*ServiceHandler) CreateJob

func (h *ServiceHandler) CreateJob(
	ctx context.Context,
	description *api.JobConfiguration) (*api.Response, error)

CreateJob will remain unimplemented.

func (*ServiceHandler) DescheduleCronJob

func (h *ServiceHandler) DescheduleCronJob(
	ctx context.Context,
	job *api.JobKey) (*api.Response, error)

DescheduleCronJob will remain unimplemented.

func (*ServiceHandler) GetConfigSummary

func (h *ServiceHandler) GetConfigSummary(
	ctx context.Context,
	job *api.JobKey) (*api.Response, error)

GetConfigSummary fetches the configuration summary of active tasks for the specified job.

func (*ServiceHandler) GetJobSummary

func (h *ServiceHandler) GetJobSummary(
	ctx context.Context,
	role *string,
) (*api.Response, error)

GetJobSummary returns a summary of jobs, optionally only those owned by a specific role.

func (*ServiceHandler) GetJobUpdateDetails

func (h *ServiceHandler) GetJobUpdateDetails(
	ctx context.Context,
	key *api.JobUpdateKey,
	query *api.JobUpdateQuery,
) (*api.Response, error)

GetJobUpdateDetails gets job update details. jobUpdateKey is marked as deprecated in Aurora and is not used by Aggregator, so it is ignored when fetching job update details.

func (*ServiceHandler) GetJobUpdateDiff

func (h *ServiceHandler) GetJobUpdateDiff(
	ctx context.Context,
	request *api.JobUpdateRequest) (*api.Response, error)

GetJobUpdateDiff gets the diff between client (desired) and server (current) job states. TaskConfig is not set in GetJobUpdateDiffResult, since the caller does not use it and fetching the previous podspec is expensive.

func (*ServiceHandler) GetJobUpdateSummaries

func (h *ServiceHandler) GetJobUpdateSummaries(
	ctx context.Context,
	query *api.JobUpdateQuery,
) (*api.Response, error)

GetJobUpdateSummaries gets job update summaries.

func (*ServiceHandler) GetJobs

func (h *ServiceHandler) GetJobs(
	ctx context.Context,
	ownerRole *string,
) (*api.Response, error)

GetJobs fetches the status of jobs. ownerRole is optional, in which case all jobs are returned.

func (*ServiceHandler) GetPendingReason

func (h *ServiceHandler) GetPendingReason(
	ctx context.Context,
	query *api.TaskQuery) (*api.Response, error)

GetPendingReason will remain unimplemented.

func (*ServiceHandler) GetQuota

func (h *ServiceHandler) GetQuota(
	ctx context.Context,
	ownerRole *string) (*api.Response, error)

GetQuota will remain unimplemented.

func (*ServiceHandler) GetRoleSummary

func (h *ServiceHandler) GetRoleSummary(
	ctx context.Context) (*api.Response, error)

GetRoleSummary will remain unimplemented.

func (*ServiceHandler) GetTasksStatus

func (h *ServiceHandler) GetTasksStatus(
	ctx context.Context,
	query *api.TaskQuery) (*api.Response, error)

GetTasksStatus will remain unimplemented.

func (*ServiceHandler) GetTasksWithoutConfigs

func (h *ServiceHandler) GetTasksWithoutConfigs(
	ctx context.Context,
	query *api.TaskQuery,
) (*api.Response, error)

GetTasksWithoutConfigs is the same as getTasksStatus but without the TaskConfig.ExecutorConfig data set.

func (*ServiceHandler) GetTierConfigs

func (h *ServiceHandler) GetTierConfigs(
	ctx context.Context,
) (*api.Response, error)

GetTierConfigs is a no-op. It is only used to determine liveness of the scheduler.

func (*ServiceHandler) KillTasks

func (h *ServiceHandler) KillTasks(
	ctx context.Context,
	job *api.JobKey,
	instances map[int32]struct{},
	message *string,
) (*api.Response, error)

KillTasks initiates a kill on tasks.

func (*ServiceHandler) PauseJobUpdate

func (h *ServiceHandler) PauseJobUpdate(
	ctx context.Context,
	key *api.JobUpdateKey,
	message *string,
) (*api.Response, error)

PauseJobUpdate pauses the specified job update. Can be resumed by resumeUpdate call.

func (*ServiceHandler) PopulateJobConfig

func (h *ServiceHandler) PopulateJobConfig(
	ctx context.Context,
	description *api.JobConfiguration) (*api.Response, error)

PopulateJobConfig will remain unimplemented.

func (*ServiceHandler) PulseJobUpdate

func (h *ServiceHandler) PulseJobUpdate(
	ctx context.Context,
	key *api.JobUpdateKey,
) (*api.Response, error)

PulseJobUpdate allows progress of the job update in case blockIfNoPulsesAfterMs is specified in JobUpdateSettings. Unblocks progress if the update was previously blocked. Responds with ResponseCode.INVALID_REQUEST in case an unknown update key is specified.

func (*ServiceHandler) ReplaceCronTemplate

func (h *ServiceHandler) ReplaceCronTemplate(
	ctx context.Context,
	config *api.JobConfiguration) (*api.Response, error)

ReplaceCronTemplate will remain unimplemented.

func (*ServiceHandler) RestartShards

func (h *ServiceHandler) RestartShards(
	ctx context.Context,
	job *api.JobKey,
	shardIds map[int32]struct{}) (*api.Response, error)

RestartShards will remain unimplemented.

func (*ServiceHandler) ResumeJobUpdate

func (h *ServiceHandler) ResumeJobUpdate(
	ctx context.Context,
	key *api.JobUpdateKey,
	message *string,
) (*api.Response, error)

ResumeJobUpdate resumes progress of a previously paused job update.

func (*ServiceHandler) RollbackJobUpdate

func (h *ServiceHandler) RollbackJobUpdate(
	ctx context.Context,
	key *api.JobUpdateKey,
	message *string,
) (*api.Response, error)

RollbackJobUpdate rolls back the specified active job update to the initial state.

func (*ServiceHandler) ScheduleCronJob

func (h *ServiceHandler) ScheduleCronJob(
	ctx context.Context,
	description *api.JobConfiguration) (*api.Response, error)

ScheduleCronJob will remain unimplemented.

func (*ServiceHandler) StartCronJob

func (h *ServiceHandler) StartCronJob(
	ctx context.Context,
	job *api.JobKey) (*api.Response, error)

StartCronJob will remain unimplemented.

func (*ServiceHandler) StartJobUpdate

func (h *ServiceHandler) StartJobUpdate(
	ctx context.Context,
	request *api.JobUpdateRequest,
	message *string,
) (*api.Response, error)

StartJobUpdate starts update of the existing service job.

type ServiceHandlerConfig

// ServiceHandlerConfig defines ServiceHandler configuration. All fields are
// loaded from YAML under the keys given in the struct tags.
type ServiceHandlerConfig struct {
	// Worker counts for the named operations.
	// NOTE(review): presumably the size of each concurrent worker pool —
	// confirm against the handler implementation.
	GetJobUpdateWorkers           int `yaml:"get_job_update_workers"`
	GetJobsWorkers                int `yaml:"get_jobs_workers"`
	GetJobSummaryWorkers          int `yaml:"get_job_summary_workers"`
	StopPodWorkers                int `yaml:"stop_pod_workers"`
	CreateJobSpecForUpdateWorkers int `yaml:"create_job_spec_for_update_workers"`

	// Config for number of workers for getTasksWithoutConfigs endpoint.
	// NOTE(review): the Medium/Large thresholds presumably select which
	// worker count applies based on result size — confirm.
	GetTasksWithoutConfigsWorkers         int `yaml:"get_tasks_without_configs_workers"`
	GetTasksWithoutConfigsMediumWorkers   int `yaml:"get_tasks_without_configs_medium_workers"`
	GetTasksWithoutConfigsMediumThreshold int `yaml:"get_tasks_without_configs_medium_threshold"`
	GetTasksWithoutConfigsLargeWorkers    int `yaml:"get_tasks_without_configs_large_workers"`
	GetTasksWithoutConfigsLargeThreshold  int `yaml:"get_tasks_without_configs_large_threshold"`

	// getTasksWithoutConfigs task querying depth. It limits the number
	// of pods to be included in the return result - return pods from
	// current run if set to 1, return pods from current and previous one
	// run if set to 2, etc.
	// In Aurora, getTasksWithoutConfigs will return all the current and
	// completed tasks stored in the DB. However, in Peloton, since we
	// keep the complete history of pods, this parameter is used to limit
	// the number of pods returned.
	PodRunsDepth int `yaml:"pod_runs_depth"`

	// Maximum number of pods that will get returned, while meeting
	// minPodRunsDepth requirement.
	GetTasksPodMax int `yaml:"get_tasks_pod_max"`

	// QueryJobsLimit specifies Limit parameter passed to QueryJobs request
	QueryJobsLimit uint32 `yaml:"query_jobs_limit"`

	// InstanceEventsLimit specifies the limit on number of events per instance
	InstanceEventsLimit uint32 `yaml:"instance_events_limit"`

	// UpdatesLimit specifies the limit on number of updates to include per job
	UpdatesLimit uint32 `yaml:"updates_limit"`

	// ThermosExecutor is config used to generate mesos CommandInfo / ExecutorInfo
	// for Thermos executor
	ThermosExecutor config.ThermosExecutorConfig `yaml:"thermos_executor"`

	// Enable Peloton inplace update.
	// NOTE: the YAML key is hyphenated ("enable-inplace-update"), unlike the
	// snake_case keys used by every other field in this struct.
	EnableInPlace bool `yaml:"enable-inplace-update"`
}

ServiceHandlerConfig defines ServiceHandler configuration.

Directories

Path Synopsis
Package fixture provides testing fixtures for aurorabridge.
Package fixture provides testing fixtures for aurorabridge.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL