Add fail count metrics for memory and swap

This commit is contained in:
Trey Dockendorf 2020-02-14 12:08:22 -05:00
parent a40dc18d63
commit 88c000ca82
2 changed files with 60 additions and 26 deletions

View File

@ -52,8 +52,10 @@ type CgroupMetric struct {
cpus int cpus int
memoryUsed float64 memoryUsed float64
memoryTotal float64 memoryTotal float64
memoryFailCount float64
swapUsed float64 swapUsed float64
swapTotal float64 swapTotal float64
swapFailCount float64
userslice bool userslice bool
job bool job bool
uid string uid string
@ -69,8 +71,10 @@ type Exporter struct {
cpus *prometheus.Desc cpus *prometheus.Desc
memoryUsed *prometheus.Desc memoryUsed *prometheus.Desc
memoryTotal *prometheus.Desc memoryTotal *prometheus.Desc
memoryFailCount *prometheus.Desc
swapUsed *prometheus.Desc swapUsed *prometheus.Desc
swapTotal *prometheus.Desc swapTotal *prometheus.Desc
swapFailCount *prometheus.Desc
userslice *prometheus.Desc userslice *prometheus.Desc
jobinfo *prometheus.Desc jobinfo *prometheus.Desc
success *prometheus.Desc success *prometheus.Desc
@ -223,10 +227,14 @@ func NewExporter(paths []string) *Exporter {
"Memory used in bytes", []string{"cgroup"}, nil), "Memory used in bytes", []string{"cgroup"}, nil),
memoryTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "total_bytes"), memoryTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "total_bytes"),
"Memory total given to cgroup in bytes", []string{"cgroup"}, nil), "Memory total given to cgroup in bytes", []string{"cgroup"}, nil),
memoryFailCount: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "fail_count"),
"Memory fail count", []string{"cgroup"}, nil),
swapUsed: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "used_bytes"), swapUsed: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "used_bytes"),
"Swap used in bytes", []string{"cgroup"}, nil), "Swap used in bytes", []string{"cgroup"}, nil),
swapTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "total_bytes"), swapTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "total_bytes"),
"Swap total given to cgroup in bytes", []string{"cgroup"}, nil), "Swap total given to cgroup in bytes", []string{"cgroup"}, nil),
swapFailCount: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "fail_count"),
"Swap fail count", []string{"cgroup"}, nil),
userslice: prometheus.NewDesc(prometheus.BuildFQName(namespace, "userslice", "info"), userslice: prometheus.NewDesc(prometheus.BuildFQName(namespace, "userslice", "info"),
"User slice information", []string{"cgroup", "username", "uid"}, nil), "User slice information", []string{"cgroup", "username", "uid"}, nil),
jobinfo: prometheus.NewDesc(prometheus.BuildFQName(namespace, "job", "info"), jobinfo: prometheus.NewDesc(prometheus.BuildFQName(namespace, "job", "info"),
@ -277,8 +285,10 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
metric.cpuTotal = float64(stats.CPU.Usage.Total) / 1000000000.0 metric.cpuTotal = float64(stats.CPU.Usage.Total) / 1000000000.0
metric.memoryUsed = float64(stats.Memory.Usage.Usage) metric.memoryUsed = float64(stats.Memory.Usage.Usage)
metric.memoryTotal = float64(stats.Memory.Usage.Limit) metric.memoryTotal = float64(stats.Memory.Usage.Limit)
metric.memoryFailCount = float64(stats.Memory.Usage.Failcnt)
metric.swapUsed = float64(stats.Memory.Swap.Usage) - metric.memoryUsed metric.swapUsed = float64(stats.Memory.Swap.Usage) - metric.memoryUsed
metric.swapTotal = float64(stats.Memory.Swap.Limit) - metric.memoryTotal metric.swapTotal = float64(stats.Memory.Swap.Limit) - metric.memoryTotal
metric.swapFailCount = float64(stats.Memory.Swap.Failcnt) - metric.memoryFailCount
if cpus, err := getCPUs(name); err == nil { if cpus, err := getCPUs(name); err == nil {
metric.cpus = cpus metric.cpus = cpus
} }
@ -297,6 +307,10 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
ch <- e.cpus ch <- e.cpus
ch <- e.memoryUsed ch <- e.memoryUsed
ch <- e.memoryTotal ch <- e.memoryTotal
ch <- e.memoryFailCount
ch <- e.swapUsed
ch <- e.swapTotal
ch <- e.swapFailCount
ch <- e.success ch <- e.success
} }
@ -315,8 +329,10 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(e.cpus, prometheus.GaugeValue, float64(m.cpus), m.name) ch <- prometheus.MustNewConstMetric(e.cpus, prometheus.GaugeValue, float64(m.cpus), m.name)
ch <- prometheus.MustNewConstMetric(e.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.name) ch <- prometheus.MustNewConstMetric(e.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.name)
ch <- prometheus.MustNewConstMetric(e.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.name) ch <- prometheus.MustNewConstMetric(e.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.name)
ch <- prometheus.MustNewConstMetric(e.memoryFailCount, prometheus.GaugeValue, m.memoryFailCount, m.name)
ch <- prometheus.MustNewConstMetric(e.swapUsed, prometheus.GaugeValue, m.swapUsed, m.name) ch <- prometheus.MustNewConstMetric(e.swapUsed, prometheus.GaugeValue, m.swapUsed, m.name)
ch <- prometheus.MustNewConstMetric(e.swapTotal, prometheus.GaugeValue, m.swapTotal, m.name) ch <- prometheus.MustNewConstMetric(e.swapTotal, prometheus.GaugeValue, m.swapTotal, m.name)
ch <- prometheus.MustNewConstMetric(e.swapFailCount, prometheus.GaugeValue, m.swapFailCount, m.name)
if m.userslice { if m.userslice {
ch <- prometheus.MustNewConstMetric(e.userslice, prometheus.GaugeValue, 1, m.name, m.username, m.uid) ch <- prometheus.MustNewConstMetric(e.userslice, prometheus.GaugeValue, 1, m.name, m.username, m.uid)
} }

View File

@ -76,12 +76,18 @@ func TestCollectUserSlice(t *testing.T) {
if val := metrics[0].memoryTotal; val != 68719476736 { if val := metrics[0].memoryTotal; val != 68719476736 {
t.Errorf("Unexpected value for memoryTotal, got %v", val) t.Errorf("Unexpected value for memoryTotal, got %v", val)
} }
if val := metrics[0].memoryFailCount; val != 0 {
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
}
if val := metrics[0].swapUsed; val != 0 { if val := metrics[0].swapUsed; val != 0 {
t.Errorf("Unexpected value for swapUsed, got %v", val) t.Errorf("Unexpected value for swapUsed, got %v", val)
} }
if val := metrics[0].swapTotal; val != 9223371968135295000 { if val := metrics[0].swapTotal; val != 9223371968135295000 {
t.Errorf("Unexpected value for swapTotal, got %v", val) t.Errorf("Unexpected value for swapTotal, got %v", val)
} }
if val := metrics[0].swapFailCount; val != 0 {
t.Errorf("Unexpected value for swapFailCount, got %v", val)
}
if val := metrics[0].uid; val != "20821" { if val := metrics[0].uid; val != "20821" {
t.Errorf("Unexpected value for uid, got %v", val) t.Errorf("Unexpected value for uid, got %v", val)
} }
@ -125,12 +131,18 @@ func TestCollectSLURM(t *testing.T) {
if val := metrics[0].memoryTotal; val != 2147483648 { if val := metrics[0].memoryTotal; val != 2147483648 {
t.Errorf("Unexpected value for memoryTotal, got %v", val) t.Errorf("Unexpected value for memoryTotal, got %v", val)
} }
if val := metrics[0].memoryFailCount; val != 0 {
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
}
if val := metrics[0].swapUsed; val != 0 { if val := metrics[0].swapUsed; val != 0 {
t.Errorf("Unexpected value for swapUsed, got %v", val) t.Errorf("Unexpected value for swapUsed, got %v", val)
} }
if val := metrics[0].swapTotal; val != 0 { if val := metrics[0].swapTotal; val != 0 {
t.Errorf("Unexpected value for swapTotal, got %v", val) t.Errorf("Unexpected value for swapTotal, got %v", val)
} }
if val := metrics[0].swapFailCount; val != 0 {
t.Errorf("Unexpected value for swapFailCount, got %v", val)
}
if val := metrics[0].uid; val != "20821" { if val := metrics[0].uid; val != "20821" {
t.Errorf("Unexpected value for uid, got %v", val) t.Errorf("Unexpected value for uid, got %v", val)
} }
@ -177,12 +189,18 @@ func TestCollectTorque(t *testing.T) {
if val := metrics[0].memoryTotal; val != 196755132416 { if val := metrics[0].memoryTotal; val != 196755132416 {
t.Errorf("Unexpected value for memoryTotal, got %v", val) t.Errorf("Unexpected value for memoryTotal, got %v", val)
} }
if val := metrics[0].memoryFailCount; val != 0 {
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
}
if val := metrics[0].swapUsed; val != 0 { if val := metrics[0].swapUsed; val != 0 {
t.Errorf("Unexpected value for swapUsed, got %v", val) t.Errorf("Unexpected value for swapUsed, got %v", val)
} }
if val := metrics[0].swapTotal; val != 0 { if val := metrics[0].swapTotal; val != 0 {
t.Errorf("Unexpected value for swapTotal, got %v", val) t.Errorf("Unexpected value for swapTotal, got %v", val)
} }
if val := metrics[0].swapFailCount; val != 0 {
t.Errorf("Unexpected value for swapFailCount, got %v", val)
}
if val := metrics[0].uid; val != "" { if val := metrics[0].uid; val != "" {
t.Errorf("Unexpected value for uid, got %v", val) t.Errorf("Unexpected value for uid, got %v", val)
} }