Add fail count metrics for memory and swap

This commit is contained in:
Trey Dockendorf 2020-02-14 12:08:22 -05:00
parent a40dc18d63
commit 88c000ca82
2 changed files with 60 additions and 26 deletions

View File

@@ -45,35 +45,39 @@ var (
) )
type CgroupMetric struct { type CgroupMetric struct {
name string name string
cpuUser float64 cpuUser float64
cpuSystem float64 cpuSystem float64
cpuTotal float64 cpuTotal float64
cpus int cpus int
memoryUsed float64 memoryUsed float64
memoryTotal float64 memoryTotal float64
swapUsed float64 memoryFailCount float64
swapTotal float64 swapUsed float64
userslice bool swapTotal float64
job bool swapFailCount float64
uid string userslice bool
username string job bool
jobid string uid string
username string
jobid string
} }
type Exporter struct { type Exporter struct {
paths []string paths []string
cpuUser *prometheus.Desc cpuUser *prometheus.Desc
cpuSystem *prometheus.Desc cpuSystem *prometheus.Desc
cpuTotal *prometheus.Desc cpuTotal *prometheus.Desc
cpus *prometheus.Desc cpus *prometheus.Desc
memoryUsed *prometheus.Desc memoryUsed *prometheus.Desc
memoryTotal *prometheus.Desc memoryTotal *prometheus.Desc
swapUsed *prometheus.Desc memoryFailCount *prometheus.Desc
swapTotal *prometheus.Desc swapUsed *prometheus.Desc
userslice *prometheus.Desc swapTotal *prometheus.Desc
jobinfo *prometheus.Desc swapFailCount *prometheus.Desc
success *prometheus.Desc userslice *prometheus.Desc
jobinfo *prometheus.Desc
success *prometheus.Desc
} }
func fileExists(filename string) bool { func fileExists(filename string) bool {
@@ -223,10 +227,14 @@ func NewExporter(paths []string) *Exporter {
"Memory used in bytes", []string{"cgroup"}, nil), "Memory used in bytes", []string{"cgroup"}, nil),
memoryTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "total_bytes"), memoryTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "total_bytes"),
"Memory total given to cgroup in bytes", []string{"cgroup"}, nil), "Memory total given to cgroup in bytes", []string{"cgroup"}, nil),
memoryFailCount: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "fail_count"),
"Memory fail count", []string{"cgroup"}, nil),
swapUsed: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "used_bytes"), swapUsed: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "used_bytes"),
"Swap used in bytes", []string{"cgroup"}, nil), "Swap used in bytes", []string{"cgroup"}, nil),
swapTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "total_bytes"), swapTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "total_bytes"),
"Swap total given to cgroup in bytes", []string{"cgroup"}, nil), "Swap total given to cgroup in bytes", []string{"cgroup"}, nil),
swapFailCount: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "fail_count"),
"Swap fail count", []string{"cgroup"}, nil),
userslice: prometheus.NewDesc(prometheus.BuildFQName(namespace, "userslice", "info"), userslice: prometheus.NewDesc(prometheus.BuildFQName(namespace, "userslice", "info"),
"User slice information", []string{"cgroup", "username", "uid"}, nil), "User slice information", []string{"cgroup", "username", "uid"}, nil),
jobinfo: prometheus.NewDesc(prometheus.BuildFQName(namespace, "job", "info"), jobinfo: prometheus.NewDesc(prometheus.BuildFQName(namespace, "job", "info"),
@@ -277,8 +285,10 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
metric.cpuTotal = float64(stats.CPU.Usage.Total) / 1000000000.0 metric.cpuTotal = float64(stats.CPU.Usage.Total) / 1000000000.0
metric.memoryUsed = float64(stats.Memory.Usage.Usage) metric.memoryUsed = float64(stats.Memory.Usage.Usage)
metric.memoryTotal = float64(stats.Memory.Usage.Limit) metric.memoryTotal = float64(stats.Memory.Usage.Limit)
metric.memoryFailCount = float64(stats.Memory.Usage.Failcnt)
metric.swapUsed = float64(stats.Memory.Swap.Usage) - metric.memoryUsed metric.swapUsed = float64(stats.Memory.Swap.Usage) - metric.memoryUsed
metric.swapTotal = float64(stats.Memory.Swap.Limit) - metric.memoryTotal metric.swapTotal = float64(stats.Memory.Swap.Limit) - metric.memoryTotal
metric.swapFailCount = float64(stats.Memory.Swap.Failcnt) - metric.memoryFailCount
if cpus, err := getCPUs(name); err == nil { if cpus, err := getCPUs(name); err == nil {
metric.cpus = cpus metric.cpus = cpus
} }
@@ -297,6 +307,10 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
ch <- e.cpus ch <- e.cpus
ch <- e.memoryUsed ch <- e.memoryUsed
ch <- e.memoryTotal ch <- e.memoryTotal
ch <- e.memoryFailCount
ch <- e.swapUsed
ch <- e.swapTotal
ch <- e.swapFailCount
ch <- e.success ch <- e.success
} }
@@ -315,8 +329,10 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(e.cpus, prometheus.GaugeValue, float64(m.cpus), m.name) ch <- prometheus.MustNewConstMetric(e.cpus, prometheus.GaugeValue, float64(m.cpus), m.name)
ch <- prometheus.MustNewConstMetric(e.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.name) ch <- prometheus.MustNewConstMetric(e.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.name)
ch <- prometheus.MustNewConstMetric(e.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.name) ch <- prometheus.MustNewConstMetric(e.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.name)
ch <- prometheus.MustNewConstMetric(e.memoryFailCount, prometheus.GaugeValue, m.memoryFailCount, m.name)
ch <- prometheus.MustNewConstMetric(e.swapUsed, prometheus.GaugeValue, m.swapUsed, m.name) ch <- prometheus.MustNewConstMetric(e.swapUsed, prometheus.GaugeValue, m.swapUsed, m.name)
ch <- prometheus.MustNewConstMetric(e.swapTotal, prometheus.GaugeValue, m.swapTotal, m.name) ch <- prometheus.MustNewConstMetric(e.swapTotal, prometheus.GaugeValue, m.swapTotal, m.name)
ch <- prometheus.MustNewConstMetric(e.swapFailCount, prometheus.GaugeValue, m.swapFailCount, m.name)
if m.userslice { if m.userslice {
ch <- prometheus.MustNewConstMetric(e.userslice, prometheus.GaugeValue, 1, m.name, m.username, m.uid) ch <- prometheus.MustNewConstMetric(e.userslice, prometheus.GaugeValue, 1, m.name, m.username, m.uid)
} }

View File

@@ -76,12 +76,18 @@ func TestCollectUserSlice(t *testing.T) {
if val := metrics[0].memoryTotal; val != 68719476736 { if val := metrics[0].memoryTotal; val != 68719476736 {
t.Errorf("Unexpected value for memoryTotal, got %v", val) t.Errorf("Unexpected value for memoryTotal, got %v", val)
} }
if val := metrics[0].memoryFailCount; val != 0 {
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
}
if val := metrics[0].swapUsed; val != 0 { if val := metrics[0].swapUsed; val != 0 {
t.Errorf("Unexpected value for swapUsed, got %v", val) t.Errorf("Unexpected value for swapUsed, got %v", val)
} }
if val := metrics[0].swapTotal; val != 9223371968135295000 { if val := metrics[0].swapTotal; val != 9223371968135295000 {
t.Errorf("Unexpected value for swapTotal, got %v", val) t.Errorf("Unexpected value for swapTotal, got %v", val)
} }
if val := metrics[0].swapFailCount; val != 0 {
t.Errorf("Unexpected value for swapFailCount, got %v", val)
}
if val := metrics[0].uid; val != "20821" { if val := metrics[0].uid; val != "20821" {
t.Errorf("Unexpected value for uid, got %v", val) t.Errorf("Unexpected value for uid, got %v", val)
} }
@@ -125,12 +131,18 @@ func TestCollectSLURM(t *testing.T) {
if val := metrics[0].memoryTotal; val != 2147483648 { if val := metrics[0].memoryTotal; val != 2147483648 {
t.Errorf("Unexpected value for memoryTotal, got %v", val) t.Errorf("Unexpected value for memoryTotal, got %v", val)
} }
if val := metrics[0].memoryFailCount; val != 0 {
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
}
if val := metrics[0].swapUsed; val != 0 { if val := metrics[0].swapUsed; val != 0 {
t.Errorf("Unexpected value for swapUsed, got %v", val) t.Errorf("Unexpected value for swapUsed, got %v", val)
} }
if val := metrics[0].swapTotal; val != 0 { if val := metrics[0].swapTotal; val != 0 {
t.Errorf("Unexpected value for swapTotal, got %v", val) t.Errorf("Unexpected value for swapTotal, got %v", val)
} }
if val := metrics[0].swapFailCount; val != 0 {
t.Errorf("Unexpected value for swapFailCount, got %v", val)
}
if val := metrics[0].uid; val != "20821" { if val := metrics[0].uid; val != "20821" {
t.Errorf("Unexpected value for uid, got %v", val) t.Errorf("Unexpected value for uid, got %v", val)
} }
@@ -177,12 +189,18 @@ func TestCollectTorque(t *testing.T) {
if val := metrics[0].memoryTotal; val != 196755132416 { if val := metrics[0].memoryTotal; val != 196755132416 {
t.Errorf("Unexpected value for memoryTotal, got %v", val) t.Errorf("Unexpected value for memoryTotal, got %v", val)
} }
if val := metrics[0].memoryFailCount; val != 0 {
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
}
if val := metrics[0].swapUsed; val != 0 { if val := metrics[0].swapUsed; val != 0 {
t.Errorf("Unexpected value for swapUsed, got %v", val) t.Errorf("Unexpected value for swapUsed, got %v", val)
} }
if val := metrics[0].swapTotal; val != 0 { if val := metrics[0].swapTotal; val != 0 {
t.Errorf("Unexpected value for swapTotal, got %v", val) t.Errorf("Unexpected value for swapTotal, got %v", val)
} }
if val := metrics[0].swapFailCount; val != 0 {
t.Errorf("Unexpected value for swapFailCount, got %v", val)
}
if val := metrics[0].uid; val != "" { if val := metrics[0].uid; val != "" {
t.Errorf("Unexpected value for uid, got %v", val) t.Errorf("Unexpected value for uid, got %v", val)
} }