tensorflow

package
v1.7.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 1, 2023 License: Apache-2.0 Imports: 37 Imported by: 0

Documentation

Overview

Package tensorflow provides a Kubernetes controller for the TFJob resource.

Index

Constants

View Source
const (
	FailedDeleteJobReason     = "FailedDeleteJob"
	SuccessfulDeleteJobReason = "SuccessfulDeleteJob"
)
View Source
const (
	// EnvCustomClusterDomain is the custom defined cluster domain, such as "svc.cluster.local".
	// Ref: https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/#a-records
	EnvCustomClusterDomain = "CUSTOM_CLUSTER_DOMAIN"
)

Variables

This section is empty.

Functions

func ContainsChiefOrMasterSpec added in v1.4.0

func ContainsChiefOrMasterSpec(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) bool

ContainsChiefOrMasterSpec returns true if the tfjob contains chief or master spec.

func GetPortFromTFJob

func GetPortFromTFJob(tfJob *kubeflowv1.TFJob, rtype kubeflowv1.ReplicaType) (int32, error)

GetPortFromTFJob gets the port of tensorflow container.

Types

type ClusterSpec

type ClusterSpec map[string][]string

ClusterSpec represents a TensorFlow cluster specification. It is a map from job names to lists of network addresses. See https://www.tensorflow.org/deploy/distributed#create_a_tftrainclusterspec_to_describe_the_cluster

type SparseClusterSpec added in v1.4.0

type SparseClusterSpec struct {
	Worker map[int32]string `json:"worker"`
	PS     []string         `json:"ps"`
}

SparseClusterSpec enables a server to be configured without needing to know the identity of (for example) all other worker tasks. See https://www.tensorflow.org/api_docs/python/tf/train/ClusterSpec

type SparseTFConfig added in v1.4.0

type SparseTFConfig struct {
	Cluster SparseClusterSpec `json:"cluster"`
	Task    TaskSpec          `json:"task"`
}

type TFConfig

type TFConfig struct {
	// Cluster represents a TensorFlow ClusterSpec.
	// See: https://www.tensorflow.org/api_docs/python/tf/train/ClusterSpec
	Cluster ClusterSpec `json:"cluster"`
	Task    TaskSpec    `json:"task"`
	// Environment is used by tensorflow.contrib.learn.python.learn in versions <= 1.3
	// TODO(jlewi): I don't think it is used in TF versions >= 1.4, so we can eventually get rid of it.
	Environment string `json:"environment"`
}

TFConfig is a struct representing the distributed TensorFlow config. This struct is turned into the TF_CONFIG environment variable, which TensorFlow processes use to configure themselves. See https://www.tensorflow.org/api_docs/python/tf/estimator/RunConfig#methods and https://cloud.google.com/ml-engine/docs/tensorflow/distributed-training-details

type TFJobReconciler added in v1.4.0

type TFJobReconciler struct {
	common.JobController
	client.Client
	Scheme *runtime.Scheme

	Log logr.Logger
	// contains filtered or unexported fields
}

TFJobReconciler reconciles a TFJob object

func NewReconciler added in v1.4.0

func NewReconciler(mgr manager.Manager, gangSchedulingSetupFunc common.GangSchedulingSetupFunc) *TFJobReconciler

func (*TFJobReconciler) ControllerName added in v1.4.0

func (r *TFJobReconciler) ControllerName() string

func (*TFJobReconciler) DeleteJob added in v1.4.0

func (r *TFJobReconciler) DeleteJob(job interface{}) error

func (*TFJobReconciler) GetAPIGroupVersion added in v1.4.0

func (r *TFJobReconciler) GetAPIGroupVersion() schema.GroupVersion

func (*TFJobReconciler) GetAPIGroupVersionKind added in v1.4.0

func (r *TFJobReconciler) GetAPIGroupVersionKind() schema.GroupVersionKind

func (*TFJobReconciler) GetDefaultContainerName added in v1.4.0

func (r *TFJobReconciler) GetDefaultContainerName() string

func (*TFJobReconciler) GetDefaultContainerPortName added in v1.4.0

func (r *TFJobReconciler) GetDefaultContainerPortName() string

func (*TFJobReconciler) GetFrameworkName added in v1.7.0

func (r *TFJobReconciler) GetFrameworkName() string

func (*TFJobReconciler) GetGroupNameLabelValue added in v1.4.0

func (r *TFJobReconciler) GetGroupNameLabelValue() string

func (*TFJobReconciler) GetJobFromAPIClient added in v1.4.0

func (r *TFJobReconciler) GetJobFromAPIClient(namespace, name string) (metav1.Object, error)

func (*TFJobReconciler) GetJobFromInformerCache added in v1.4.0

func (r *TFJobReconciler) GetJobFromInformerCache(namespace, name string) (metav1.Object, error)

func (*TFJobReconciler) GetPodsForJob added in v1.4.0

func (r *TFJobReconciler) GetPodsForJob(jobObject interface{}) ([]*corev1.Pod, error)

GetPodsForJob returns the set of pods that this job should manage. It also reconciles ControllerRef by adopting/orphaning. Note that the returned Pods are pointers into the cache.

func (*TFJobReconciler) GetServicesForJob added in v1.4.0

func (r *TFJobReconciler) GetServicesForJob(jobObject interface{}) ([]*corev1.Service, error)

GetServicesForJob returns the set of services that this job should manage. It also reconciles ControllerRef by adopting/orphaning. Note that the returned services are pointers into the cache.

func (*TFJobReconciler) IsMasterRole added in v1.4.0

func (r *TFJobReconciler) IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec,
	rtype kubeflowv1.ReplicaType, index int) bool

func (*TFJobReconciler) IsWorker0Completed added in v1.4.0

func (r *TFJobReconciler) IsWorker0Completed(tfJob *kubeflowv1.TFJob, replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) (bool, error)

IsWorker0Completed returns true if the worker 0 pod succeeded and exited with code 0.

func (*TFJobReconciler) Reconcile added in v1.4.0

func (r *TFJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)

Reconcile is part of the main kubernetes reconciliation loop which aims to move the current state of the cluster closer to the desired state.

func (*TFJobReconciler) SetClusterSpec added in v1.4.0

func (r *TFJobReconciler) SetClusterSpec(job interface{}, podTemplate *corev1.PodTemplateSpec, rtype, index string) error

SetClusterSpec is the same as func (tc *TFController) SetClusterSpec(...) in pod.go.

func (*TFJobReconciler) SetupWithManager added in v1.4.0

func (r *TFJobReconciler) SetupWithManager(mgr ctrl.Manager, controllerThreads int) error

SetupWithManager sets up the controller with the Manager.

func (*TFJobReconciler) UpdateJobStatus added in v1.4.0

func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, jobStatus *kubeflowv1.JobStatus) error

func (*TFJobReconciler) UpdateJobStatusInApiServer added in v1.4.0

func (r *TFJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *kubeflowv1.JobStatus) error

type TaskSpec

type TaskSpec struct {
	Type  string `json:"type"`
	Index int    `json:"index"`
}

TaskSpec is the specification for a task (PS or worker) of the TFJob.

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL