Documentation ¶
Index ¶
- func GetPodRequestsFromPodTemplate(template *corev1.PodTemplateSpec) *cluster.PodRequest
- func GetPodRequestsFromTFJobReplica(replica *common.ReplicaSpec) *cluster.PodRequest
- func InitClientSets(itk kubeclientset.Interface, itt tfjobclientset.Interface, ...)
- func NewWorkerID(n int) string
- func ScheduleJob(requestsGroups *[]*cluster.PodRequests, constNodeRes cluster.NodeResources) (okNum []int, placementPlansPtr *[]*JobPlacementPlan)
- func SchedulingAlgorithm(waitingQueue *JobQueue, runningQueue *JobQueue, ...)
- func SortNodeFromJob(job *TrainingJob) (sortedNodes []string)
- func SortNodeFromNodeRes(nodes cluster.NodeResources, maxNum string) (sortedNodes []string)
- type JobPlacementPlan
- type JobQueue
- type JobsPlacementPlan
- type NodeResPlacePlan
- type TrainingJob
- type WorkerResources
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func GetPodRequestsFromPodTemplate ¶
func GetPodRequestsFromPodTemplate(template *corev1.PodTemplateSpec) *cluster.PodRequest
func GetPodRequestsFromTFJobReplica ¶
func GetPodRequestsFromTFJobReplica(replica *common.ReplicaSpec) *cluster.PodRequest
func InitClientSets ¶
func InitClientSets(itk kubeclientset.Interface, itt tfjobclientset.Interface, itm kubeshareclientset.Interface, op *options.ServerOption)
func NewWorkerID ¶
func ScheduleJob ¶
func ScheduleJob(requestsGroups *[]*cluster.PodRequests, constNodeRes cluster.NodeResources) (okNum []int, placementPlansPtr *[]*JobPlacementPlan)
ScheduleJob returns: * okNum: the maximum number of workers that can be scheduled, * placementPlan: the placement plan of the workers, * PSPlace: the nodeName of the parameter server.
func SchedulingAlgorithm ¶
func SchedulingAlgorithm( waitingQueue *JobQueue, runningQueue *JobQueue, highPrioritySharePodsQueue *[]*kubesharev1.SharePod, highPrioritySharePodsQueueMutex *sync.Mutex, nodeRes cluster.NodeResources, )
func SortNodeFromJob ¶
func SortNodeFromJob(job *TrainingJob) (sortedNodes []string)
SortNodeFromJob sorts nodes by priority based on the job's placement plan, from the least important to the most important.
func SortNodeFromNodeRes ¶
func SortNodeFromNodeRes(nodes cluster.NodeResources, maxNum string) (sortedNodes []string)
SortNodeFromNodeRes sorts nodes by priority based on cluster.NodeResources, from the most important to the least important.
Types ¶
type JobPlacementPlan ¶
type JobPlacementPlan map[string]*NodeResPlacePlan
NodeName => Node Placement Resource
func (*JobPlacementPlan) Count ¶
func (this *JobPlacementPlan) Count() (sum int)
func (*JobPlacementPlan) DeepCopy ¶
func (this *JobPlacementPlan) DeepCopy() *JobPlacementPlan
func (*JobPlacementPlan) PrintMe ¶
func (this *JobPlacementPlan) PrintMe()
type JobQueue ¶
type JobQueue []*TrainingJob
func (*JobQueue) Add ¶
func (this *JobQueue) Add(job *TrainingJob)
func (*JobQueue) Remove ¶
func (this *JobQueue) Remove(job *TrainingJob) error
type JobsPlacementPlan ¶
type JobsPlacementPlan map[*TrainingJob]*JobPlacementPlan
Job NS/Name => Job's Placement Plan
func ScaleDown ¶
func ScaleDown(highPriorityJob *cluster.PodRequests, runningQueue JobQueue, constNodeRes cluster.NodeResources) (can bool, scaleDownTarget JobsPlacementPlan, highPriorityJobPlacementPlan *[]*JobPlacementPlan)
ScaleDown scales down other jobs to let the high-priority job run. ScaleDown is only called if a high-priority job exists.
func ScaleUp ¶
func ScaleUp(runningQueue JobQueue, constNodeRes cluster.NodeResources) (can bool, scaleUpTarget JobsPlacementPlan)
func (*JobsPlacementPlan) DeepCopy ¶
func (this *JobsPlacementPlan) DeepCopy() *JobsPlacementPlan
func (*JobsPlacementPlan) PrintMe ¶
func (this *JobsPlacementPlan) PrintMe()
type NodeResPlacePlan ¶
type NodeResPlacePlan map[string]*WorkerResources
Worker ID => Worker Resources
func (*NodeResPlacePlan) DeepCopy ¶
func (this *NodeResPlacePlan) DeepCopy() *NodeResPlacePlan
func (*NodeResPlacePlan) PrintMe ¶
func (this *NodeResPlacePlan) PrintMe(prefix string)
type TrainingJob ¶
type TrainingJob struct { *tfv1.TFJob ReplicasPlacementPlan map[tfv1.TFReplicaType]*JobPlacementPlan ReplicaRequest map[tfv1.TFReplicaType]*cluster.PodRequest }
func NewTrainingJob ¶
func NewTrainingJob(tfjob *tfv1.TFJob) *TrainingJob
func (*TrainingJob) GetMinInstanceWorkerPodRequests ¶
func (this *TrainingJob) GetMinInstanceWorkerPodRequests() *cluster.PodRequests
func (*TrainingJob) GetPodRequests ¶
func (this *TrainingJob) GetPodRequests(rt tfv1.TFReplicaType) *cluster.PodRequests
func (*TrainingJob) UpdateTFJobTime ¶
func (this *TrainingJob) UpdateTFJobTime() error
type WorkerResources ¶
type WorkerResources struct { // ResourceName => ResourceId Workers map[string]string Critical bool }
func (*WorkerResources) DeepCopy ¶
func (this *WorkerResources) DeepCopy() *WorkerResources
Click to show internal directories.
Click to hide internal directories.