v1alpha1

package
v0.0.0-...-e139c8d Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 17, 2020 License: Apache-2.0 Imports: 9 Imported by: 0

Documentation

Overview

Package v1alpha1 is the v1alpha1 version of the API. +groupName=paddlepaddle.org

Index

Constants

View Source
// Identity of the TrainingJob custom resource definition.
const (
	// CRDGroup is the API group the CRD is registered under.
	CRDGroup = "paddlepaddle.org"
	// CRDVersion is the API version of the CRD.
	CRDVersion = "v1alpha1"

	// CRDKind is the kind of the K8s CRD.
	CRDKind = "TrainingJob"
	// CRDKindPlural is the plural form of CRDKind.
	CRDKindPlural = "trainingjobs"
	// CRDShortName is the short name of the CRD.
	CRDShortName = "tj"
)
View Source
// Phases a TrainingJob can be in. Every constant is explicitly typed as
// TrainingJobPhase; without the explicit type only the first constant of the
// group would be typed and the rest would be untyped string constants.
const (
	// TrainingJobPhaseNone is the empty TrainingJobPhase.
	TrainingJobPhaseNone TrainingJobPhase = ""
	// TrainingJobPhaseCreating is the creating TrainingJobPhase.
	TrainingJobPhaseCreating TrainingJobPhase = "Creating"
	// TrainingJobPhaseRunning is the running TrainingJobPhase.
	TrainingJobPhaseRunning TrainingJobPhase = "Running"
	// TrainingJobPhaseScaling is the scaling TrainingJobPhase.
	TrainingJobPhaseScaling TrainingJobPhase = "Scaling"
	// TrainingJobPhaseSucceeded is the succeeded TrainingJobPhase.
	// The stored value is "Succeed" (not "Succeeded"); it is kept as-is so
	// that already persisted statuses still match.
	TrainingJobPhaseSucceeded TrainingJobPhase = "Succeed"
	// TrainingJobPhaseFailed is the failed TrainingJobPhase.
	TrainingJobPhaseFailed TrainingJobPhase = "Failed"
	// TrainingJobPhaseTimeout is the timeout TrainingJobPhase.
	TrainingJobPhaseTimeout TrainingJobPhase = "Timeout"
)
View Source
// States a type of resource can be in. Every constant is explicitly typed as
// ResourceState; without the explicit type only the first constant of the
// group would be typed and the rest would be untyped string constants.
const (
	// ResourceStateNone is the initial state of training job.
	ResourceStateNone ResourceState = ""
	// ResourceStateStarting is the starting state of ResourceState.
	ResourceStateStarting ResourceState = "starting"
	// ResourceStateRunning is the running state of ResourceState.
	ResourceStateRunning ResourceState = "running"
	// ResourceStateFailed is the failed state of ResourceState.
	ResourceStateFailed ResourceState = "failed"
	// ResourceStateSucceeded is the succeeded state of ResourceState.
	ResourceStateSucceeded ResourceState = "succeeded"
)
View Source
// Job type constants. Every constant is explicitly typed as JobType; without
// the explicit type only Local would be typed and the others would be untyped
// string constants.
const (
	// Local is the "local" JobType.
	Local JobType = "local"
	// Nccl2 is the "nccl2" JobType.
	Nccl2 JobType = "nccl2"
	// Multi is the "multi" JobType.
	Multi JobType = "multi"
)

Job type const.

Variables

View Source
// SchemeBuilder is built from addKnownTypes and is what performs the
// registration of this package's types.
var SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes)

// AddToScheme applies all the functions stored in SchemeBuilder to a scheme.
var AddToScheme = SchemeBuilder.AddToScheme
View Source
// SchemeGroupVersion is the group version used to register these objects;
// it pairs CRDGroup with CRDVersion.
var SchemeGroupVersion = schema.GroupVersion{
	Group:   CRDGroup,
	Version: CRDVersion,
}

SchemeGroupVersion is the group version used to register these objects.

Functions

func CRDName

func CRDName() string

CRDName returns the name of the CRD.

func Resource

func Resource(resource string) schema.GroupResource

Resource takes an unqualified resource and returns a Group-qualified GroupResource.

Types

type Annotations

// Annotations offers additional metadata; it is carried by TrainingJobSpec
// under the "annotations" JSON key. Every field is a plain value serialized
// under a lower-case JSON key. How the values are interpreted is not visible
// in this file.
type Annotations struct {
	Usergroupid string `json:"usergroupid"`
	Userid      string `json:"userid"`
	Priority    string `json:"priority"`
	Scheduler   string `json:"scheduler"`
	Walltime    int    `json:"walltime"` // NOTE(review): unit is not stated in this file — TODO confirm
}

Annotations offers additional metadata.

func (*Annotations) DeepCopy

func (in *Annotations) DeepCopy() *Annotations

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Annotations.

func (*Annotations) DeepCopyInto

func (in *Annotations) DeepCopyInto(out *Annotations)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.

type Framework

// Framework describes a framework supported by the operator: its name
// (a FrameworkName) and the type of job (a JobType).
type Framework struct {
	Name FrameworkName `json:"name"`
	Type JobType       `json:"type"`
}

Framework is a framework that the operator supports.

func (*Framework) DeepCopy

func (in *Framework) DeepCopy() *Framework

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Framework.

func (*Framework) DeepCopyInto

func (in *Framework) DeepCopyInto(out *Framework)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.

type FrameworkName

// FrameworkName is the name of a framework that the operator supports.
type FrameworkName string

FrameworkName is the name of a framework that the operator supports.

// Framework name constants. Both constants are explicitly typed as
// FrameworkName; without the explicit type TensorFlow would be an untyped
// string constant.
const (
	// Paddle is the "paddle" FrameworkName.
	Paddle FrameworkName = "paddle"
	// TensorFlow is the "tensorflow" FrameworkName.
	TensorFlow FrameworkName = "tensorflow"
)

Framework name const.

type JobType

// JobType is a type of job that the operator supports.
type JobType string

JobType is a type of job that the operator supports.

type MasterSpec

// MasterSpec is the spec for a master in the paddle job. It holds the etcd
// endpoint, resource requirements, a pointer to a v1beta1.ReplicaSet,
// environment variables, preStop-related settings, tolerations, a node
// selector and liveness/readiness probes.
type MasterSpec struct {
	EtcdEndpoint string                      `json:"etcd-endpoint"`
	Resources    corev1.ResourceRequirements `json:"resources"`
	ReplicaSpec  *v1beta1.ReplicaSet         `json:"replicaSpec"`
	Envs         map[string]string           `json:"envs"`

	// For preStop.
	// NOTE(review): presumably covers GracePeriodSeconds and PreStopCmd only — confirm.
	GracePeriodSeconds *int64              `json:"grace_period_seconds"`
	PreStopCmd         []string            `json:"pre_stop_cmd"`
	Tolerations        []corev1.Toleration `json:"tolerations"`
	NodeSelector       map[string]string   `json:"node_selector"`
	LivenessProbe      *corev1.Probe       `json:"liveness_probe"`
	ReadinessProbe     *corev1.Probe       `json:"readiness_probe"`
}

MasterSpec is the spec for a master in the paddle job

func (*MasterSpec) DeepCopy

func (in *MasterSpec) DeepCopy() *MasterSpec

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MasterSpec.

func (*MasterSpec) DeepCopyInto

func (in *MasterSpec) DeepCopyInto(out *MasterSpec)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.

type PserverSpec

// PserverSpec is the spec for pservers in the paddle job. It holds the
// entrypoint, minimum and maximum instance counts, resource requirements,
// a pointer to a v1beta1.ReplicaSet, image pull secrets, environment
// variables, preStop-related settings, tolerations, a node selector and
// liveness/readiness probes.
type PserverSpec struct {
	Entrypoint       string                        `json:"entrypoint"`
	MinInstance      int                           `json:"min-instance"`
	MaxInstance      int                           `json:"max-instance"`
	Resources        corev1.ResourceRequirements   `json:"resources"`
	ReplicaSpec      *v1beta1.ReplicaSet           `json:"replicaSpec"`
	ImagePullSecrets []corev1.LocalObjectReference `json:"imagePullSecrets"`
	Envs             map[string]string             `json:"envs"`
	// For preStop.
	// NOTE(review): presumably covers GracePeriodSeconds and PreStopCmd only — confirm.
	GracePeriodSeconds *int64              `json:"grace_period_seconds"`
	PreStopCmd         []string            `json:"pre_stop_cmd"`
	Tolerations        []corev1.Toleration `json:"tolerations"`
	NodeSelector       map[string]string   `json:"node_selector"`
	// IndexSucceed marks whether the operator has successfully added labels
	// to the pservers in the initial phase.
	IndexSucceed   bool          `json:"index_succeed"`
	LivenessProbe  *corev1.Probe `json:"liveness_probe"`
	ReadinessProbe *corev1.Probe `json:"readiness_probe"`
}

PserverSpec is the spec for pservers in the paddle job

func (*PserverSpec) DeepCopy

func (in *PserverSpec) DeepCopy() *PserverSpec

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PserverSpec.

func (*PserverSpec) DeepCopyInto

func (in *PserverSpec) DeepCopyInto(out *PserverSpec)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.

type ResourceState

// ResourceState is the state of a type of resource.
type ResourceState string

ResourceState is the state of a type of resource

type ScaleResults

// ScaleResults is the result of a scale.
type ScaleResults string

ScaleResults is the result of scale

// Possible outcomes of a scale.
const (
	// ScaleTrue reports that the scale succeeded.
	ScaleTrue = ScaleResults("True")
	// ScaleFalse reports that the scale failed.
	ScaleFalse = ScaleResults("False")
	// ScaleUnknown reports that kubernetes can't decide whether the scale
	// succeeded or not.
	ScaleUnknown = ScaleResults("Unknown")
)

type TrainerJobScaleRecord

// TrainerJobScaleRecord is the record of a single scale of trainer jobs:
// when it happened, by how much, its result and, optionally, a reason.
type TrainerJobScaleRecord struct {
	// ScaleTimestamp is the time at which the TrainingJob was scaled.
	ScaleTimestamp metav1.Time `json:"scaleTimestamp"`
	// Additional is the additional amount for the job to scale.
	// NOTE(review): presumably the number of trainer instances added — confirm.
	Additional int32 `json:"additional"`
	// Status is the result of the scale.
	Status ScaleResults `json:"status"`
	// Reason is the reason the scale failed.
	// +optional
	Reason string `json:"reason,omitempty"`
}

TrainerJobScaleRecord is record of trainer jobs.

func (*TrainerJobScaleRecord) DeepCopy

func (in *TrainerJobScaleRecord) DeepCopy() *TrainerJobScaleRecord

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainerJobScaleRecord.

func (*TrainerJobScaleRecord) DeepCopyInto

func (in *TrainerJobScaleRecord) DeepCopyInto(out *TrainerJobScaleRecord)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.

type TrainerJobScaleRecords

// TrainerJobScaleRecords is the collection of scale records of trainer jobs.
type TrainerJobScaleRecords struct {
	// ScaleRecords holds pointers to the individual scale records.
	// The field has no json tag, so encoding/json uses the Go field name
	// "ScaleRecords" as its key.
	ScaleRecords []*TrainerJobScaleRecord
}

TrainerJobScaleRecords is records of trainer jobs.

func (*TrainerJobScaleRecords) DeepCopy

func (in *TrainerJobScaleRecords) DeepCopy() *TrainerJobScaleRecords

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainerJobScaleRecords.

func (*TrainerJobScaleRecords) DeepCopyInto

func (in *TrainerJobScaleRecords) DeepCopyInto(out *TrainerJobScaleRecords)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.

type TrainerSpec

// TrainerSpec is the spec for trainers in the paddle job. It holds the etcd
// endpoint, entrypoint, workspace, minimum and maximum instance counts,
// resource requirements, a pointer to a batchv1.Job, image pull secrets,
// environment variables, preStop-related settings, tolerations, a node
// selector and liveness/readiness probes.
type TrainerSpec struct {
	EtcdEndpoint     string                        `json:"etcd-endpoint"`
	Entrypoint       string                        `json:"entrypoint"`
	Workspace        string                        `json:"workspace"`
	MinInstance      int                           `json:"min-instance"`
	MaxInstance      int                           `json:"max-instance"`
	Resources        corev1.ResourceRequirements   `json:"resources"`
	ReplicaSpec      *batchv1.Job                  `json:"replicaSpec"`
	ImagePullSecrets []corev1.LocalObjectReference `json:"imagePullSecrets"`
	Envs             map[string]string             `json:"envs"`
	// For preStop.
	// NOTE(review): presumably covers GracePeriodSeconds and PreStopCmd only — confirm.
	GracePeriodSeconds *int64              `json:"grace_period_seconds"`
	PreStopCmd         []string            `json:"pre_stop_cmd"`
	Tolerations        []corev1.Toleration `json:"tolerations"`
	NodeSelector       map[string]string   `json:"node_selector"`
	// IndexSucceed marks whether the operator has successfully added labels
	// to the trainers in the initial phase.
	IndexSucceed   bool          `json:"index_succeed"`
	LivenessProbe  *corev1.Probe `json:"liveness_probe"`
	ReadinessProbe *corev1.Probe `json:"readiness_probe"`
}

TrainerSpec is the spec for trainers in the paddle job

func (*TrainerSpec) DeepCopy

func (in *TrainerSpec) DeepCopy() *TrainerSpec

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainerSpec.

func (*TrainerSpec) DeepCopyInto

func (in *TrainerSpec) DeepCopyInto(out *TrainerSpec)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.

type TrainingJob

// TrainingJob is a specification for a TrainingJob resource. It embeds
// metav1.TypeMeta (inlined in JSON) and metav1.ObjectMeta (under "metadata")
// and carries the job's Spec and Status.
type TrainingJob struct {
	metav1.TypeMeta   `json:",inline"`
	metav1.ObjectMeta `json:"metadata,omitempty"`
	Spec              TrainingJobSpec   `json:"spec"`
	Status            TrainingJobStatus `json:"status"`
}

TrainingJob is a specification for a TrainingJob resource

func (*TrainingJob) DeepCopy

func (in *TrainingJob) DeepCopy() *TrainingJob

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingJob.

func (*TrainingJob) DeepCopyInto

func (in *TrainingJob) DeepCopyInto(out *TrainingJob)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.

func (*TrainingJob) DeepCopyObject

func (in *TrainingJob) DeepCopyObject() runtime.Object

DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.

func (*TrainingJob) Elastic

func (s *TrainingJob) Elastic() bool

Elastic returns true if the job can scale to more workers.

func (*TrainingJob) Fulfillment

func (s *TrainingJob) Fulfillment() float64

Fulfillment returns the fulfillment of a trainingjob

func (*TrainingJob) GPU

func (s *TrainingJob) GPU() int

GPU converts the resource limit quantity to an int.

func (*TrainingJob) NeedGPU

func (s *TrainingJob) NeedGPU() bool

NeedGPU returns true if the job needs GPU resources to run.

func (*TrainingJob) String

func (s *TrainingJob) String() string

String returns marshal string of TrainingJob

func (*TrainingJob) TrainerCPURequestMilli

func (s *TrainingJob) TrainerCPURequestMilli() int64

TrainerCPURequestMilli returns cpu request of each trainer instance

func (*TrainingJob) TrainerGPULimit

func (s *TrainingJob) TrainerGPULimit() int

TrainerGPULimit returns gpu limit of each trainer instance

func (*TrainingJob) TrainerMemRequestMega

func (s *TrainingJob) TrainerMemRequestMega() int64

TrainerMemRequestMega returns memory request of each trainer instance

type TrainingJobList

// TrainingJobList is a list of TrainingJob resources. It embeds
// metav1.TypeMeta (inlined in JSON) and metav1.ListMeta (under "metadata").
type TrainingJobList struct {
	metav1.TypeMeta `json:",inline"`
	metav1.ListMeta `json:"metadata"`
	// Items is the list of paddle jobs (TrainingJob values).
	Items []TrainingJob `json:"items"`
}

TrainingJobList is a list of TrainingJob resources

func (*TrainingJobList) DeepCopy

func (in *TrainingJobList) DeepCopy() *TrainingJobList

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingJobList.

func (*TrainingJobList) DeepCopyInto

func (in *TrainingJobList) DeepCopyInto(out *TrainingJobList)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.

func (*TrainingJobList) DeepCopyObject

func (in *TrainingJobList) DeepCopyObject() runtime.Object

DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.

type TrainingJobPhase

// TrainingJobPhase is the phase of a TrainingJob.
type TrainingJobPhase string

TrainingJobPhase is the phase of TrainingJob

type TrainingJobSpec

// TrainingJobSpec is the spec for a TrainingJob resource. It groups general
// job attributes, annotations, the master/pserver/trainer component specs,
// framework selection and scheduling settings.
//
// Note that the JSON keys are not uniformly styled: most are snake_case,
// SchedulerName and PodGroupName use camelCase, and VolumeMounts is
// serialized under the capitalized key "VolumeMounts".
type TrainingJobSpec struct {
	// General job attributes.
	Image             string               `json:"image"`
	HostNetwork       bool                 `json:"host_network"`
	Port              int                  `json:"port"`
	PortsNum          int                  `json:"ports_num"`
	PortsNumForSparse int                  `json:"ports_num_for_sparse"`
	TrainerPort       int                  `json:"trainer_port"`
	TrainerPortsNum   int                  `json:"trainer_ports_num"`
	FaultTolerant     bool                 `json:"fault_tolerant"`
	LocalJob          bool                 `json:"local_job"` // LocalJob indicates whether the job is a local job or a cluster job.
	Passes            int                  `json:"passes"`
	Volumes           []corev1.Volume      `json:"volumes"`
	VolumeMounts      []corev1.VolumeMount `json:"VolumeMounts"`

	// TODO: how to use these two params in matrix.
	Mountpath string `json:"mountpath"`
	Nfsmount  string `json:"nfsmount"`

	// Annotations holds additional metadata for the job.
	Annotations Annotations `json:"annotations"`

	// TrainingJob components.
	Master  MasterSpec  `json:"master"`
	Pserver PserverSpec `json:"pserver"`
	Trainer TrainerSpec `json:"trainer"`

	IsNccl    bool       `json:"is_nccl"`
	FrameWork *Framework `json:"frame_work"`
	// Scheduling components; both are omitted from JSON when empty.
	SchedulerName string `json:"schedulerName,omitempty"`
	PodGroupName  string `json:"podGroupName,omitempty"`

	// Matrix indicates whether the backend container is matrix.
	Matrix bool `json:"matrix"`
}

TrainingJobSpec is the spec for a TrainingJob resource

func (*TrainingJobSpec) DeepCopy

func (in *TrainingJobSpec) DeepCopy() *TrainingJobSpec

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingJobSpec.

func (*TrainingJobSpec) DeepCopyInto

func (in *TrainingJobSpec) DeepCopyInto(out *TrainingJobSpec)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.

type TrainingJobStatus

// TrainingJobStatus is the status for a TrainingJob resource.
type TrainingJobStatus struct {
	// Phase is the phase of the TrainingJob.
	Phase TrainingJobPhase `json:"phase"`
	// Reason is the reason the job phase failed.
	Reason string `json:"reason"`
	// ScaleRecords is the autoscale status of trainer jobs.
	// TODO(ZhengQi): this will be used in autoscale mode in the future.
	ScaleRecords TrainerJobScaleRecords `json:"scale_records"`
	// ReplicaStatuses is the detailed status of resources.
	// TODO(ZhengQi): should we only consider the trainer job now?
	ReplicaStatuses []*TrainingResourceStatus `json:"replica_statuses"`
	// StartTime marks when the TrainingJob is Running.
	StartTime metav1.Time `json:"startTime"`
	// Released marks that the resources have been released.
	Released bool `json:"released"`
}

TrainingJobStatus is the status for a TrainingJob resource.

func (*TrainingJobStatus) DeepCopy

func (in *TrainingJobStatus) DeepCopy() *TrainingJobStatus

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingJobStatus.

func (*TrainingJobStatus) DeepCopyInto

func (in *TrainingJobStatus) DeepCopyInto(out *TrainingJobStatus)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.

type TrainingResourceStatus

// TrainingResourceStatus is the status of every resource.
type TrainingResourceStatus struct {
	// TrainingResourceType is the type of TrainingJob resource: MASTER,
	// PSERVER or TRAINER. It is an embedded field serialized under the
	// "training_resource_type" key.
	TrainingResourceType `json:"training_resource_type"`
	// State is the state of a type of resource.
	State ResourceState `json:"state"`
	// ResourceStates is the number of resources in each state, keyed by
	// ResourceState.
	ResourceStates map[ResourceState]int `json:"resource_states"`
}

TrainingResourceStatus is the status of every resource

func (*TrainingResourceStatus) DeepCopy

func (in *TrainingResourceStatus) DeepCopy() *TrainingResourceStatus

DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingResourceStatus.

func (*TrainingResourceStatus) DeepCopyInto

func (in *TrainingResourceStatus) DeepCopyInto(out *TrainingResourceStatus)

DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.

type TrainingResourceType

// TrainingResourceType is the type of a TrainingJob resource; the values
// include MASTER, PSERVER and TRAINER.
type TrainingResourceType string

TrainingResourceType is the type of a TrainingJob resource; the values include MASTER, PSERVER and TRAINER.

// Kinds of TrainingJob resource. The upper-case identifiers are part of the
// package's public interface and are kept as they are.
const (
	// MASTER names the master TrainingResourceType.
	MASTER = TrainingResourceType("master")
	// PSERVER names the pserver TrainingResourceType.
	PSERVER = TrainingResourceType("pserver")
	// TRAINER names the trainer TrainingResourceType.
	TRAINER = TrainingResourceType("trainer")
)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL