Add fail count metrics for memory and swap

This commit is contained in:
Trey Dockendorf 2020-02-14 12:08:22 -05:00
parent a40dc18d63
commit 88c000ca82
2 changed files with 60 additions and 26 deletions

View File

@ -45,35 +45,39 @@ var (
)
type CgroupMetric struct {
name string
cpuUser float64
cpuSystem float64
cpuTotal float64
cpus int
memoryUsed float64
memoryTotal float64
swapUsed float64
swapTotal float64
userslice bool
job bool
uid string
username string
jobid string
name string
cpuUser float64
cpuSystem float64
cpuTotal float64
cpus int
memoryUsed float64
memoryTotal float64
memoryFailCount float64
swapUsed float64
swapTotal float64
swapFailCount float64
userslice bool
job bool
uid string
username string
jobid string
}
type Exporter struct {
paths []string
cpuUser *prometheus.Desc
cpuSystem *prometheus.Desc
cpuTotal *prometheus.Desc
cpus *prometheus.Desc
memoryUsed *prometheus.Desc
memoryTotal *prometheus.Desc
swapUsed *prometheus.Desc
swapTotal *prometheus.Desc
userslice *prometheus.Desc
jobinfo *prometheus.Desc
success *prometheus.Desc
paths []string
cpuUser *prometheus.Desc
cpuSystem *prometheus.Desc
cpuTotal *prometheus.Desc
cpus *prometheus.Desc
memoryUsed *prometheus.Desc
memoryTotal *prometheus.Desc
memoryFailCount *prometheus.Desc
swapUsed *prometheus.Desc
swapTotal *prometheus.Desc
swapFailCount *prometheus.Desc
userslice *prometheus.Desc
jobinfo *prometheus.Desc
success *prometheus.Desc
}
func fileExists(filename string) bool {
@ -223,10 +227,14 @@ func NewExporter(paths []string) *Exporter {
"Memory used in bytes", []string{"cgroup"}, nil),
memoryTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "total_bytes"),
"Memory total given to cgroup in bytes", []string{"cgroup"}, nil),
memoryFailCount: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "fail_count"),
"Memory fail count", []string{"cgroup"}, nil),
swapUsed: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "used_bytes"),
"Swap used in bytes", []string{"cgroup"}, nil),
swapTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "total_bytes"),
"Swap total given to cgroup in bytes", []string{"cgroup"}, nil),
swapFailCount: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "fail_count"),
"Swap fail count", []string{"cgroup"}, nil),
userslice: prometheus.NewDesc(prometheus.BuildFQName(namespace, "userslice", "info"),
"User slice information", []string{"cgroup", "username", "uid"}, nil),
jobinfo: prometheus.NewDesc(prometheus.BuildFQName(namespace, "job", "info"),
@ -277,8 +285,10 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
metric.cpuTotal = float64(stats.CPU.Usage.Total) / 1000000000.0
metric.memoryUsed = float64(stats.Memory.Usage.Usage)
metric.memoryTotal = float64(stats.Memory.Usage.Limit)
metric.memoryFailCount = float64(stats.Memory.Usage.Failcnt)
metric.swapUsed = float64(stats.Memory.Swap.Usage) - metric.memoryUsed
metric.swapTotal = float64(stats.Memory.Swap.Limit) - metric.memoryTotal
metric.swapFailCount = float64(stats.Memory.Swap.Failcnt) - metric.memoryFailCount
if cpus, err := getCPUs(name); err == nil {
metric.cpus = cpus
}
@ -297,6 +307,10 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
ch <- e.cpus
ch <- e.memoryUsed
ch <- e.memoryTotal
ch <- e.memoryFailCount
ch <- e.swapUsed
ch <- e.swapTotal
ch <- e.swapFailCount
ch <- e.success
}
@ -315,8 +329,10 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(e.cpus, prometheus.GaugeValue, float64(m.cpus), m.name)
ch <- prometheus.MustNewConstMetric(e.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.name)
ch <- prometheus.MustNewConstMetric(e.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.name)
ch <- prometheus.MustNewConstMetric(e.memoryFailCount, prometheus.GaugeValue, m.memoryFailCount, m.name)
ch <- prometheus.MustNewConstMetric(e.swapUsed, prometheus.GaugeValue, m.swapUsed, m.name)
ch <- prometheus.MustNewConstMetric(e.swapTotal, prometheus.GaugeValue, m.swapTotal, m.name)
ch <- prometheus.MustNewConstMetric(e.swapFailCount, prometheus.GaugeValue, m.swapFailCount, m.name)
if m.userslice {
ch <- prometheus.MustNewConstMetric(e.userslice, prometheus.GaugeValue, 1, m.name, m.username, m.uid)
}

View File

@ -76,12 +76,18 @@ func TestCollectUserSlice(t *testing.T) {
if val := metrics[0].memoryTotal; val != 68719476736 {
t.Errorf("Unexpected value for memoryTotal, got %v", val)
}
if val := metrics[0].memoryFailCount; val != 0 {
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
}
if val := metrics[0].swapUsed; val != 0 {
t.Errorf("Unexpected value for swapUsed, got %v", val)
}
if val := metrics[0].swapTotal; val != 9223371968135295000 {
t.Errorf("Unexpected value for swapTotal, got %v", val)
}
if val := metrics[0].swapFailCount; val != 0 {
t.Errorf("Unexpected value for swapFailCount, got %v", val)
}
if val := metrics[0].uid; val != "20821" {
t.Errorf("Unexpected value for uid, got %v", val)
}
@ -125,12 +131,18 @@ func TestCollectSLURM(t *testing.T) {
if val := metrics[0].memoryTotal; val != 2147483648 {
t.Errorf("Unexpected value for memoryTotal, got %v", val)
}
if val := metrics[0].memoryFailCount; val != 0 {
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
}
if val := metrics[0].swapUsed; val != 0 {
t.Errorf("Unexpected value for swapUsed, got %v", val)
}
if val := metrics[0].swapTotal; val != 0 {
t.Errorf("Unexpected value for swapTotal, got %v", val)
}
if val := metrics[0].swapFailCount; val != 0 {
t.Errorf("Unexpected value for swapFailCount, got %v", val)
}
if val := metrics[0].uid; val != "20821" {
t.Errorf("Unexpected value for uid, got %v", val)
}
@ -177,12 +189,18 @@ func TestCollectTorque(t *testing.T) {
if val := metrics[0].memoryTotal; val != 196755132416 {
t.Errorf("Unexpected value for memoryTotal, got %v", val)
}
if val := metrics[0].memoryFailCount; val != 0 {
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
}
if val := metrics[0].swapUsed; val != 0 {
t.Errorf("Unexpected value for swapUsed, got %v", val)
}
if val := metrics[0].swapTotal; val != 0 {
t.Errorf("Unexpected value for swapTotal, got %v", val)
}
if val := metrics[0].swapFailCount; val != 0 {
t.Errorf("Unexpected value for swapFailCount, got %v", val)
}
if val := metrics[0].uid; val != "" {
t.Errorf("Unexpected value for uid, got %v", val)
}