Add fail count metrics for memory and swap

This commit is contained in:
Trey Dockendorf 2020-02-14 12:08:22 -05:00
parent a40dc18d63
commit 88c000ca82
2 changed files with 60 additions and 26 deletions

View File

@ -52,8 +52,10 @@ type CgroupMetric struct {
cpus int
memoryUsed float64
memoryTotal float64
memoryFailCount float64
swapUsed float64
swapTotal float64
swapFailCount float64
userslice bool
job bool
uid string
@ -69,8 +71,10 @@ type Exporter struct {
cpus *prometheus.Desc
memoryUsed *prometheus.Desc
memoryTotal *prometheus.Desc
memoryFailCount *prometheus.Desc
swapUsed *prometheus.Desc
swapTotal *prometheus.Desc
swapFailCount *prometheus.Desc
userslice *prometheus.Desc
jobinfo *prometheus.Desc
success *prometheus.Desc
@ -223,10 +227,14 @@ func NewExporter(paths []string) *Exporter {
"Memory used in bytes", []string{"cgroup"}, nil),
memoryTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "total_bytes"),
"Memory total given to cgroup in bytes", []string{"cgroup"}, nil),
memoryFailCount: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "fail_count"),
"Memory fail count", []string{"cgroup"}, nil),
swapUsed: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "used_bytes"),
"Swap used in bytes", []string{"cgroup"}, nil),
swapTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "total_bytes"),
"Swap total given to cgroup in bytes", []string{"cgroup"}, nil),
swapFailCount: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "fail_count"),
"Swap fail count", []string{"cgroup"}, nil),
userslice: prometheus.NewDesc(prometheus.BuildFQName(namespace, "userslice", "info"),
"User slice information", []string{"cgroup", "username", "uid"}, nil),
jobinfo: prometheus.NewDesc(prometheus.BuildFQName(namespace, "job", "info"),
@ -277,8 +285,10 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
metric.cpuTotal = float64(stats.CPU.Usage.Total) / 1000000000.0
metric.memoryUsed = float64(stats.Memory.Usage.Usage)
metric.memoryTotal = float64(stats.Memory.Usage.Limit)
metric.memoryFailCount = float64(stats.Memory.Usage.Failcnt)
metric.swapUsed = float64(stats.Memory.Swap.Usage) - metric.memoryUsed
metric.swapTotal = float64(stats.Memory.Swap.Limit) - metric.memoryTotal
metric.swapFailCount = float64(stats.Memory.Swap.Failcnt) - metric.memoryFailCount
if cpus, err := getCPUs(name); err == nil {
metric.cpus = cpus
}
@ -297,6 +307,10 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
ch <- e.cpus
ch <- e.memoryUsed
ch <- e.memoryTotal
ch <- e.memoryFailCount
ch <- e.swapUsed
ch <- e.swapTotal
ch <- e.swapFailCount
ch <- e.success
}
@ -315,8 +329,10 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(e.cpus, prometheus.GaugeValue, float64(m.cpus), m.name)
ch <- prometheus.MustNewConstMetric(e.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.name)
ch <- prometheus.MustNewConstMetric(e.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.name)
ch <- prometheus.MustNewConstMetric(e.memoryFailCount, prometheus.GaugeValue, m.memoryFailCount, m.name)
ch <- prometheus.MustNewConstMetric(e.swapUsed, prometheus.GaugeValue, m.swapUsed, m.name)
ch <- prometheus.MustNewConstMetric(e.swapTotal, prometheus.GaugeValue, m.swapTotal, m.name)
ch <- prometheus.MustNewConstMetric(e.swapFailCount, prometheus.GaugeValue, m.swapFailCount, m.name)
if m.userslice {
ch <- prometheus.MustNewConstMetric(e.userslice, prometheus.GaugeValue, 1, m.name, m.username, m.uid)
}

View File

@ -76,12 +76,18 @@ func TestCollectUserSlice(t *testing.T) {
if val := metrics[0].memoryTotal; val != 68719476736 {
t.Errorf("Unexpected value for memoryTotal, got %v", val)
}
if val := metrics[0].memoryFailCount; val != 0 {
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
}
if val := metrics[0].swapUsed; val != 0 {
t.Errorf("Unexpected value for swapUsed, got %v", val)
}
if val := metrics[0].swapTotal; val != 9223371968135295000 {
t.Errorf("Unexpected value for swapTotal, got %v", val)
}
if val := metrics[0].swapFailCount; val != 0 {
t.Errorf("Unexpected value for swapFailCount, got %v", val)
}
if val := metrics[0].uid; val != "20821" {
t.Errorf("Unexpected value for uid, got %v", val)
}
@ -125,12 +131,18 @@ func TestCollectSLURM(t *testing.T) {
if val := metrics[0].memoryTotal; val != 2147483648 {
t.Errorf("Unexpected value for memoryTotal, got %v", val)
}
if val := metrics[0].memoryFailCount; val != 0 {
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
}
if val := metrics[0].swapUsed; val != 0 {
t.Errorf("Unexpected value for swapUsed, got %v", val)
}
if val := metrics[0].swapTotal; val != 0 {
t.Errorf("Unexpected value for swapTotal, got %v", val)
}
if val := metrics[0].swapFailCount; val != 0 {
t.Errorf("Unexpected value for swapFailCount, got %v", val)
}
if val := metrics[0].uid; val != "20821" {
t.Errorf("Unexpected value for uid, got %v", val)
}
@ -177,12 +189,18 @@ func TestCollectTorque(t *testing.T) {
if val := metrics[0].memoryTotal; val != 196755132416 {
t.Errorf("Unexpected value for memoryTotal, got %v", val)
}
if val := metrics[0].memoryFailCount; val != 0 {
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
}
if val := metrics[0].swapUsed; val != 0 {
t.Errorf("Unexpected value for swapUsed, got %v", val)
}
if val := metrics[0].swapTotal; val != 0 {
t.Errorf("Unexpected value for swapTotal, got %v", val)
}
if val := metrics[0].swapFailCount; val != 0 {
t.Errorf("Unexpected value for swapFailCount, got %v", val)
}
if val := metrics[0].uid; val != "" {
t.Errorf("Unexpected value for uid, got %v", val)
}