Documentation ¶
Index ¶
- Constants
- Variables
- func CreateGroupFromSystemInfo(sysInfo SystemInfo) (dcgm.GroupHandle, func(), error)
- func CreateLinkGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []func(), error)
- func FormatMetrics(t *template.Template, m [][]Metric) (string, error)
- func GetGpuInstanceIdentifier(sysInfo SystemInfo, gpuuuid string, gpuInstanceId uint) string
- func GpuIdExists(sysInfo *SystemInfo, gpuId int) bool
- func GpuInstanceIdExists(sysInfo *SystemInfo, gpuInstanceId int) bool
- func IsLinkWatched(linkId uint, switchId uint, sysInfo SystemInfo) bool
- func IsSwitchWatched(switchId uint, sysInfo SystemInfo) bool
- func LinkIdExists(sysInfo *SystemInfo, linkId int) bool
- func ListPods(conn *grpc.ClientConn) (*podresourcesapi.ListPodResourcesResponse, error)
- func NewDeviceFields(counters []Counter, entityType dcgm.Field_Entity_Group) []dcgm.Short
- func NewFieldGroup(deviceFields []dcgm.Short) (dcgm.FieldHandle, func(), error)
- func NewGroup() (dcgm.GroupHandle, func(), error)
- func PopulateMigProfileNames(sysInfo *SystemInfo, entities []dcgm.GroupEntityPair) error
- func ReadCSVFile(filename string) ([][]string, error)
- func SetGpuInstanceProfileName(sysInfo *SystemInfo, entityId uint, profileName string) bool
- func SetMigProfileNames(sysInfo *SystemInfo, values []dcgm.FieldValue_v2) error
- func SetupDcgmFieldsWatch(deviceFields []dcgm.Short, sysInfo SystemInfo, collectIntervalUsec int64) ([]func(), error)
- func SwitchIdExists(sysInfo *SystemInfo, switchId int) bool
- func ToDeviceToPod(devicePods *podresourcesapi.ListPodResourcesResponse, sysInfo SystemInfo) map[string]PodInfo
- func ToString(value dcgm.FieldValue_v1) string
- func VerifyDevicePresence(sysInfo *SystemInfo, gOpt DeviceOptions) error
- func VerifySwitchDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error
- func WaitWithTimeout(wg *sync.WaitGroup, timeout time.Duration) error
- func WatchFieldGroup(group dcgm.GroupHandle, field dcgm.FieldHandle, updateFreq int64, ...) error
- type ComputeInstanceInfo
- type Config
- type Counter
- type DCGMCollector
- type DeviceOptions
- type GpuInfo
- type GpuInstanceInfo
- type GroupInfo
- type KubernetesGPUIDType
- type Metric
- type MetricsPipeline
- type MetricsServer
- type MonitoringInfo
- func AddAllGpuInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo
- func AddAllGpus(sysInfo SystemInfo) []MonitoringInfo
- func AddAllLinks(sysInfo SystemInfo) []MonitoringInfo
- func AddAllSwitches(sysInfo SystemInfo) []MonitoringInfo
- func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo
- func GetMonitoringInfoForGpu(sysInfo SystemInfo, gpuId int) *MonitoringInfo
- func GetMonitoringInfoForGpuInstance(sysInfo SystemInfo, gpuInstanceId int) *MonitoringInfo
- type PodInfo
- type PodMapper
- type SwitchInfo
- type SystemInfo
- type Transform
Constants ¶
View Source
const PARENT_ID_IGNORED = 0
Variables ¶
View Source
var ( SkipDCGMValue = "SKIPPING DCGM VALUE" FailedToConvert = "ERROR - FAILED TO CONVERT TO STRING" MIG_UUID_PREFIX = "MIG-" )
Functions ¶
func CreateGroupFromSystemInfo ¶
func CreateGroupFromSystemInfo(sysInfo SystemInfo) (dcgm.GroupHandle, func(), error)
func CreateLinkGroupsFromSystemInfo ¶
func CreateLinkGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []func(), error)
func FormatMetrics ¶
Template is passed here so that it isn't recompiled at each iteration
func GetGpuInstanceIdentifier ¶
func GetGpuInstanceIdentifier(sysInfo SystemInfo, gpuuuid string, gpuInstanceId uint) string
func GpuIdExists ¶
func GpuIdExists(sysInfo *SystemInfo, gpuId int) bool
func GpuInstanceIdExists ¶
func GpuInstanceIdExists(sysInfo *SystemInfo, gpuInstanceId int) bool
func IsLinkWatched ¶
func IsLinkWatched(linkId uint, switchId uint, sysInfo SystemInfo) bool
func IsSwitchWatched ¶
func IsSwitchWatched(switchId uint, sysInfo SystemInfo) bool
func LinkIdExists ¶
func LinkIdExists(sysInfo *SystemInfo, linkId int) bool
func ListPods ¶
func ListPods(conn *grpc.ClientConn) (*podresourcesapi.ListPodResourcesResponse, error)
func NewDeviceFields ¶
func NewDeviceFields(counters []Counter, entityType dcgm.Field_Entity_Group) []dcgm.Short
func NewFieldGroup ¶
func NewFieldGroup(deviceFields []dcgm.Short) (dcgm.FieldHandle, func(), error)
func NewGroup ¶
func NewGroup() (dcgm.GroupHandle, func(), error)
func PopulateMigProfileNames ¶
func PopulateMigProfileNames(sysInfo *SystemInfo, entities []dcgm.GroupEntityPair) error
func ReadCSVFile ¶
func SetGpuInstanceProfileName ¶
func SetGpuInstanceProfileName(sysInfo *SystemInfo, entityId uint, profileName string) bool
func SetMigProfileNames ¶
func SetMigProfileNames(sysInfo *SystemInfo, values []dcgm.FieldValue_v2) error
func SetupDcgmFieldsWatch ¶
func SetupDcgmFieldsWatch(deviceFields []dcgm.Short, sysInfo SystemInfo, collectIntervalUsec int64) ([]func(), error)
func SwitchIdExists ¶
func SwitchIdExists(sysInfo *SystemInfo, switchId int) bool
func ToDeviceToPod ¶
func ToDeviceToPod(devicePods *podresourcesapi.ListPodResourcesResponse, sysInfo SystemInfo) map[string]PodInfo
func ToString ¶
func ToString(value dcgm.FieldValue_v1) string
func VerifyDevicePresence ¶
func VerifyDevicePresence(sysInfo *SystemInfo, gOpt DeviceOptions) error
func VerifySwitchDevicePresence ¶
func VerifySwitchDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error
func WatchFieldGroup ¶
func WatchFieldGroup(group dcgm.GroupHandle, field dcgm.FieldHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32) error
Types ¶
type ComputeInstanceInfo ¶
type ComputeInstanceInfo struct { InstanceInfo dcgm.MigEntityInfo ProfileName string EntityId uint }
type Config ¶
type Config struct { CollectorsFile string Address string CollectInterval int Kubernetes bool KubernetesGPUIdType KubernetesGPUIDType CollectDCP bool UseOldNamespace bool UseRemoteHE bool RemoteHEInfo string GPUDevices DeviceOptions SwitchDevices DeviceOptions NoHostname bool UseFakeGpus bool ConfigMapData string MetricGroups []dcgm.MetricGroup }
type Counter ¶
func ExtractCounters ¶
type DCGMCollector ¶
type DCGMCollector struct { Counters []Counter DeviceFields []dcgm.Short Cleanups []func() UseOldNamespace bool SysInfo SystemInfo Hostname string }
func NewDCGMCollector ¶
func NewDCGMCollector(c []Counter, config *Config, entityType dcgm.Field_Entity_Group) (*DCGMCollector, func(), error)
func (*DCGMCollector) Cleanup ¶
func (c *DCGMCollector) Cleanup()
func (*DCGMCollector) GetMetrics ¶
func (c *DCGMCollector) GetMetrics() ([][]Metric, error)
type DeviceOptions ¶
type DeviceOptions struct {
	Flex       bool  // If true, monitor all GPUs when MIG mode is disabled, or all GPU instances when MIG is enabled.
	MajorRange []int // The indices of each GPU/NvSwitch to monitor, or -1 to monitor all
	MinorRange []int // The indices of each GPUInstance/NvLink to monitor, or -1 to monitor all
}
type GpuInfo ¶
type GpuInfo struct { DeviceInfo dcgm.Device GpuInstances []GpuInstanceInfo MigEnabled bool }
type GpuInstanceInfo ¶
type GpuInstanceInfo struct { Info dcgm.MigEntityInfo ProfileName string EntityId uint ComputeInstances []ComputeInstanceInfo }
type KubernetesGPUIDType ¶
type KubernetesGPUIDType string
const ( GPUUID KubernetesGPUIDType = "uid" DeviceName KubernetesGPUIDType = "device-name" )
type Metric ¶
type Metric struct { Counter *Counter Value string GPU string GPUUUID string GPUDevice string GPUModelName string UUID string MigProfile string GPUInstanceID string Hostname string Labels *map[string]string Attributes map[string]string }
func ToMetric ¶
func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceInfo *GpuInstanceInfo, useOld bool, hostname string) []Metric
func ToSwitchMetric ¶
func ToSwitchMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) []Metric
type MetricsPipeline ¶
type MetricsPipeline struct {
// contains filtered or unexported fields
}
func NewMetricsPipeline ¶
func NewMetricsPipeline(c *Config) (*MetricsPipeline, func(), error)
func NewMetricsPipelineWithGPUCollector ¶
func NewMetricsPipelineWithGPUCollector(c *Config, collector *DCGMCollector) (*MetricsPipeline, func(), error)
Primarily for testing; the caller is expected to clean up the collector.
type MetricsServer ¶
func NewMetricsServer ¶
func NewMetricsServer(c *Config, metrics chan string) (*MetricsServer, func(), error)
func (*MetricsServer) Health ¶
func (s *MetricsServer) Health(w http.ResponseWriter, r *http.Request)
func (*MetricsServer) Metrics ¶
func (s *MetricsServer) Metrics(w http.ResponseWriter, r *http.Request)
func (*MetricsServer) Run ¶
func (s *MetricsServer) Run(stop chan interface{}, wg *sync.WaitGroup)
type MonitoringInfo ¶
type MonitoringInfo struct { Entity dcgm.GroupEntityPair DeviceInfo dcgm.Device InstanceInfo *GpuInstanceInfo ParentId uint }
func AddAllGpuInstances ¶
func AddAllGpuInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo
func AddAllGpus ¶
func AddAllGpus(sysInfo SystemInfo) []MonitoringInfo
func AddAllLinks ¶
func AddAllLinks(sysInfo SystemInfo) []MonitoringInfo
func AddAllSwitches ¶
func AddAllSwitches(sysInfo SystemInfo) []MonitoringInfo
func GetMonitoredEntities ¶
func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo
func GetMonitoringInfoForGpu ¶
func GetMonitoringInfoForGpu(sysInfo SystemInfo, gpuId int) *MonitoringInfo
func GetMonitoringInfoForGpuInstance ¶
func GetMonitoringInfoForGpuInstance(sysInfo SystemInfo, gpuInstanceId int) *MonitoringInfo
type SwitchInfo ¶
type SwitchInfo struct { EntityId uint NvLinks []dcgm.NvLinkStatus }
type SystemInfo ¶
type SystemInfo struct {
	GpuCount uint
	Gpus     [dcgm.MAX_NUM_DEVICES]GpuInfo
	InfoType dcgm.Field_Entity_Group
	Switches []SwitchInfo
	// contains filtered or unexported fields
}
func InitializeGpuInfo ¶
func InitializeGpuInfo(sysInfo SystemInfo, gOpt DeviceOptions, useFakeGpus bool) (SystemInfo, error)
func InitializeNvSwitchInfo ¶
func InitializeNvSwitchInfo(sysInfo SystemInfo, sOpt DeviceOptions) (SystemInfo, error)
func InitializeSystemInfo ¶
func InitializeSystemInfo(gOpt DeviceOptions, sOpt DeviceOptions, useFakeGpus bool, entityType dcgm.Field_Entity_Group) (SystemInfo, error)
Click to show internal directories.
Click to hide internal directories.