dcgmexporter

package
v0.0.0-...-c3bd378 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 23, 2023 License: Apache-2.0 Imports: 23 Imported by: 0

Documentation

Index

Constants

View Source
// PARENT_ID_IGNORED is the untyped constant 0.
// NOTE(review): presumably the value placed in MonitoringInfo.ParentId when an
// entity's parent is irrelevant — confirm against the callers.
const PARENT_ID_IGNORED = 0

Variables

View Source
// Package-level string values. They are declared with var, not const, so they
// are assignable at runtime.
var (
	// NOTE(review): presumably a marker string for a value that should be
	// skipped rather than exported — confirm where it is produced and consumed.
	SkipDCGMValue   = "SKIPPING DCGM VALUE"
	// NOTE(review): presumably a marker string for a value that could not be
	// converted to a string (see ToString) — confirm.
	FailedToConvert = "ERROR - FAILED TO CONVERT TO STRING"

	// NOTE(review): presumably the prefix that identifies a MIG device UUID —
	// confirm against the code that matches on it.
	MIG_UUID_PREFIX = "MIG-"
)

Functions

func CreateGroupFromSystemInfo

func CreateGroupFromSystemInfo(sysInfo SystemInfo) (dcgm.GroupHandle, func(), error)

func CreateLinkGroupsFromSystemInfo

func CreateLinkGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []func(), error)

func FormatMetrics

func FormatMetrics(t *template.Template, m [][]Metric) (string, error)

Template is passed here so that it isn't recompiled at each iteration

func GetGpuInstanceIdentifier

func GetGpuInstanceIdentifier(sysInfo SystemInfo, gpuuuid string, gpuInstanceId uint) string

func GpuIdExists

func GpuIdExists(sysInfo *SystemInfo, gpuId int) bool

func GpuInstanceIdExists

func GpuInstanceIdExists(sysInfo *SystemInfo, gpuInstanceId int) bool

func IsLinkWatched

func IsLinkWatched(linkId uint, switchId uint, sysInfo SystemInfo) bool

func IsSwitchWatched

func IsSwitchWatched(switchId uint, sysInfo SystemInfo) bool

func LinkIdExists

func LinkIdExists(sysInfo *SystemInfo, linkId int) bool

func NewDeviceFields

func NewDeviceFields(counters []Counter, entityType dcgm.Field_Entity_Group) []dcgm.Short

func NewFieldGroup

func NewFieldGroup(deviceFields []dcgm.Short) (dcgm.FieldHandle, func(), error)

func NewGroup

func NewGroup() (dcgm.GroupHandle, func(), error)

func PopulateMigProfileNames

func PopulateMigProfileNames(sysInfo *SystemInfo, entities []dcgm.GroupEntityPair) error

func ReadCSVFile

func ReadCSVFile(filename string) ([][]string, error)

func SetGpuInstanceProfileName

func SetGpuInstanceProfileName(sysInfo *SystemInfo, entityId uint, profileName string) bool

func SetMigProfileNames

func SetMigProfileNames(sysInfo *SystemInfo, values []dcgm.FieldValue_v2) error

func SetupDcgmFieldsWatch

func SetupDcgmFieldsWatch(deviceFields []dcgm.Short, sysInfo SystemInfo, collectIntervalUsec int64) ([]func(), error)

func SwitchIdExists

func SwitchIdExists(sysInfo *SystemInfo, switchId int) bool

func ToDeviceToPod

func ToDeviceToPod(devicePods *podresourcesapi.ListPodResourcesResponse, sysInfo SystemInfo) map[string]PodInfo

func ToString

func ToString(value dcgm.FieldValue_v1) string

func VerifyDevicePresence

func VerifyDevicePresence(sysInfo *SystemInfo, gOpt DeviceOptions) error

func VerifySwitchDevicePresence

func VerifySwitchDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error

func WaitWithTimeout

func WaitWithTimeout(wg *sync.WaitGroup, timeout time.Duration) error

func WatchFieldGroup

func WatchFieldGroup(group dcgm.GroupHandle, field dcgm.FieldHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32) error

Types

type ComputeInstanceInfo

// ComputeInstanceInfo bundles a dcgm.MigEntityInfo with a profile name and an
// entity ID. GpuInstanceInfo holds a slice of these in its ComputeInstances
// field.
type ComputeInstanceInfo struct {
	InstanceInfo dcgm.MigEntityInfo
	ProfileName  string
	EntityId     uint
}

type Config

// Config carries the settings passed (as *Config) to ExtractCounters,
// NewDCGMCollector, NewMetricsPipeline, NewMetricsPipelineWithGPUCollector,
// NewMetricsServer and NewPodMapper.
type Config struct {
	CollectorsFile      string
	Address             string
	// NOTE(review): the unit is not visible here; SetupDcgmFieldsWatch takes
	// collectIntervalUsec as int64 — confirm the unit and any conversion.
	CollectInterval     int
	Kubernetes          bool
	// Typed as KubernetesGPUIDType, for which the constants GPUUID and
	// DeviceName are declared.
	KubernetesGPUIdType KubernetesGPUIDType
	CollectDCP          bool
	UseOldNamespace     bool
	UseRemoteHE         bool
	RemoteHEInfo        string
	// GPUDevices and SwitchDevices share the DeviceOptions type.
	// NOTE(review): presumably forwarded as gOpt / sOpt to
	// InitializeSystemInfo — confirm.
	GPUDevices          DeviceOptions
	SwitchDevices       DeviceOptions
	NoHostname          bool
	UseFakeGpus         bool
	ConfigMapData       string
	MetricGroups        []dcgm.MetricGroup
}

type Counter

// Counter describes one field by its dcgm.Short ID together with name, type
// and help strings. ExtractCounters returns a slice of Counter and
// FindCounterField looks one up in such a slice.
type Counter struct {
	// FieldID is a dcgm.Short, whereas FindCounterField takes its fieldId
	// argument as uint.
	FieldID   dcgm.Short
	FieldName string
	// NOTE(review): presumably the Prometheus metric type — confirm.
	PromType  string
	Help      string
}

func ExtractCounters

func ExtractCounters(c *Config) ([]Counter, error)

func FindCounterField

func FindCounterField(c []Counter, fieldId uint) (*Counter, error)

type DCGMCollector

// DCGMCollector is the type returned by NewDCGMCollector. It has a GetMetrics
// method returning [][]Metric and a Cleanup method.
type DCGMCollector struct {
	Counters        []Counter
	DeviceFields    []dcgm.Short
	// NOTE(review): presumably the functions invoked by Cleanup — confirm.
	Cleanups        []func()
	UseOldNamespace bool
	SysInfo         SystemInfo
	Hostname        string
}

func NewDCGMCollector

func NewDCGMCollector(c []Counter, config *Config, entityType dcgm.Field_Entity_Group) (*DCGMCollector, func(), error)

func (*DCGMCollector) Cleanup

func (c *DCGMCollector) Cleanup()

func (*DCGMCollector) GetMetrics

func (c *DCGMCollector) GetMetrics() ([][]Metric, error)

type DeviceOptions

// DeviceOptions selects which devices to monitor. The same type is used for
// GPUs and NvSwitches: Config has both a GPUDevices and a SwitchDevices field
// of this type.
type DeviceOptions struct {
	Flex       bool  // If true, then monitor all GPUs if MIG mode is disabled or all GPU instances if MIG is enabled.
	MajorRange []int // The indices of each GPU/NvSwitch to monitor, or -1 to monitor all
	MinorRange []int // The indices of each GPUInstance/NvLink to monitor, or -1 to monitor all
}

type GpuInfo

// GpuInfo pairs a dcgm.Device with a slice of GpuInstanceInfo and a MigEnabled
// flag. SystemInfo.Gpus is a fixed-size array of GpuInfo.
type GpuInfo struct {
	DeviceInfo   dcgm.Device
	GpuInstances []GpuInstanceInfo
	MigEnabled   bool
}

type GpuInstanceInfo

// GpuInstanceInfo bundles a dcgm.MigEntityInfo with a profile name, an entity
// ID and the ComputeInstanceInfo values belonging to it. It is the element
// type of GpuInfo.GpuInstances and is referenced by pointer from
// MonitoringInfo.InstanceInfo.
type GpuInstanceInfo struct {
	Info             dcgm.MigEntityInfo
	// NOTE(review): presumably the field written by SetGpuInstanceProfileName
	// — confirm.
	ProfileName      string
	EntityId         uint
	ComputeInstances []ComputeInstanceInfo
}

type GroupInfo

type GroupInfo struct {
	// contains filtered or unexported fields
}

type KubernetesGPUIDType

// KubernetesGPUIDType is a string type; it is the type of
// Config.KubernetesGPUIdType.
type KubernetesGPUIDType string
// The KubernetesGPUIDType values declared by this package.
const (
	GPUUID     KubernetesGPUIDType = "uid"
	DeviceName KubernetesGPUIDType = "device-name"
)

type Metric

// Metric is a single sample: a pointer to its Counter, the value as a string,
// and a set of identifying strings. ToMetric and ToSwitchMetric return slices
// of Metric, and FormatMetrics and Transform.Process take [][]Metric.
type Metric struct {
	Counter *Counter
	Value   string

	GPU          string
	GPUUUID      string
	GPUDevice    string
	GPUModelName string

	UUID string

	MigProfile    string
	GPUInstanceID string
	Hostname      string

	// Labels is a pointer to a map, unlike Attributes, which is a plain map.
	// NOTE(review): presumably so several Metric values can share one label
	// map — confirm; check for nil before dereferencing.
	Labels     *map[string]string
	Attributes map[string]string
}

func ToMetric

func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceInfo *GpuInstanceInfo, useOld bool, hostname string) []Metric

func ToSwitchMetric

func ToSwitchMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) []Metric

type MetricsPipeline

type MetricsPipeline struct {
	// contains filtered or unexported fields
}

func NewMetricsPipeline

func NewMetricsPipeline(c *Config) (*MetricsPipeline, func(), error)

func NewMetricsPipelineWithGPUCollector

func NewMetricsPipelineWithGPUCollector(c *Config, collector *DCGMCollector) (*MetricsPipeline, func(), error)

Primarily for testing; the caller is expected to clean up the collector.

func (*MetricsPipeline) Run

func (m *MetricsPipeline) Run(out chan string, stop chan interface{}, wg *sync.WaitGroup)

type MetricsServer

type MetricsServer struct {
	sync.Mutex
	// contains filtered or unexported fields
}

func NewMetricsServer

func NewMetricsServer(c *Config, metrics chan string) (*MetricsServer, func(), error)

func (*MetricsServer) Health

func (s *MetricsServer) Health(w http.ResponseWriter, r *http.Request)

func (*MetricsServer) Metrics

func (s *MetricsServer) Metrics(w http.ResponseWriter, r *http.Request)

func (*MetricsServer) Run

func (s *MetricsServer) Run(stop chan interface{}, wg *sync.WaitGroup)

type MonitoringInfo

// MonitoringInfo describes one monitored entity. It is the element type
// returned by AddAllGpus, AddAllGpuInstances, AddAllLinks, AddAllSwitches and
// GetMonitoredEntities, and is returned by pointer from
// GetMonitoringInfoForGpu and GetMonitoringInfoForGpuInstance.
type MonitoringInfo struct {
	Entity       dcgm.GroupEntityPair
	DeviceInfo   dcgm.Device
	// NOTE(review): pointer field; presumably nil when the entity is not a
	// GPU instance — confirm.
	InstanceInfo *GpuInstanceInfo
	// NOTE(review): presumably set to PARENT_ID_IGNORED (0) when no parent
	// applies — confirm.
	ParentId     uint
}

func AddAllGpuInstances

func AddAllGpuInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo

func AddAllGpus

func AddAllGpus(sysInfo SystemInfo) []MonitoringInfo

func AddAllLinks

func AddAllLinks(sysInfo SystemInfo) []MonitoringInfo

func AddAllSwitches

func AddAllSwitches(sysInfo SystemInfo) []MonitoringInfo

func GetMonitoredEntities

func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo

func GetMonitoringInfoForGpu

func GetMonitoringInfoForGpu(sysInfo SystemInfo, gpuId int) *MonitoringInfo

func GetMonitoringInfoForGpuInstance

func GetMonitoringInfoForGpuInstance(sysInfo SystemInfo, gpuInstanceId int) *MonitoringInfo

type PodInfo

// PodInfo holds three strings: Name, Namespace and Container. ToDeviceToPod
// returns a map[string]PodInfo.
type PodInfo struct {
	Name      string
	Namespace string
	Container string
}

type PodMapper

// PodMapper wraps a *Config and is created by NewPodMapper. Its Name and
// Process methods have the same signatures as the Transform interface's
// methods.
type PodMapper struct {
	Config *Config
}

func NewPodMapper

func NewPodMapper(c *Config) (*PodMapper, error)

func (*PodMapper) Name

func (p *PodMapper) Name() string

func (*PodMapper) Process

func (p *PodMapper) Process(metrics [][]Metric, sysInfo SystemInfo) error

type SwitchInfo

// SwitchInfo holds an entity ID and a slice of dcgm.NvLinkStatus.
// SystemInfo.Switches is a slice of SwitchInfo.
type SwitchInfo struct {
	EntityId uint
	NvLinks  []dcgm.NvLinkStatus
}

type SystemInfo

type SystemInfo struct {
	GpuCount uint
	Gpus     [dcgm.MAX_NUM_DEVICES]GpuInfo

	InfoType dcgm.Field_Entity_Group
	Switches []SwitchInfo
	// contains filtered or unexported fields
}

func InitializeGpuInfo

func InitializeGpuInfo(sysInfo SystemInfo, gOpt DeviceOptions, useFakeGpus bool) (SystemInfo, error)

func InitializeNvSwitchInfo

func InitializeNvSwitchInfo(sysInfo SystemInfo, sOpt DeviceOptions) (SystemInfo, error)

func InitializeSystemInfo

func InitializeSystemInfo(gOpt DeviceOptions, sOpt DeviceOptions, useFakeGpus bool, entityType dcgm.Field_Entity_Group) (SystemInfo, error)

type Transform

// Transform is a named processing step over metrics. PodMapper declares
// methods with these exact signatures.
type Transform interface {
	// Process receives the metrics and the SystemInfo and returns only an
	// error.
	// NOTE(review): presumably implementations modify metrics in place, since
	// nothing else is returned — confirm.
	Process(metrics [][]Metric, sysInfo SystemInfo) error
	// Name returns a string identifying the transform.
	Name() string
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL