Add fail count metrics for memory and swap
This commit is contained in:
parent
a40dc18d63
commit
88c000ca82
|
@ -52,8 +52,10 @@ type CgroupMetric struct {
|
||||||
cpus int
|
cpus int
|
||||||
memoryUsed float64
|
memoryUsed float64
|
||||||
memoryTotal float64
|
memoryTotal float64
|
||||||
|
memoryFailCount float64
|
||||||
swapUsed float64
|
swapUsed float64
|
||||||
swapTotal float64
|
swapTotal float64
|
||||||
|
swapFailCount float64
|
||||||
userslice bool
|
userslice bool
|
||||||
job bool
|
job bool
|
||||||
uid string
|
uid string
|
||||||
|
@ -69,8 +71,10 @@ type Exporter struct {
|
||||||
cpus *prometheus.Desc
|
cpus *prometheus.Desc
|
||||||
memoryUsed *prometheus.Desc
|
memoryUsed *prometheus.Desc
|
||||||
memoryTotal *prometheus.Desc
|
memoryTotal *prometheus.Desc
|
||||||
|
memoryFailCount *prometheus.Desc
|
||||||
swapUsed *prometheus.Desc
|
swapUsed *prometheus.Desc
|
||||||
swapTotal *prometheus.Desc
|
swapTotal *prometheus.Desc
|
||||||
|
swapFailCount *prometheus.Desc
|
||||||
userslice *prometheus.Desc
|
userslice *prometheus.Desc
|
||||||
jobinfo *prometheus.Desc
|
jobinfo *prometheus.Desc
|
||||||
success *prometheus.Desc
|
success *prometheus.Desc
|
||||||
|
@ -223,10 +227,14 @@ func NewExporter(paths []string) *Exporter {
|
||||||
"Memory used in bytes", []string{"cgroup"}, nil),
|
"Memory used in bytes", []string{"cgroup"}, nil),
|
||||||
memoryTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "total_bytes"),
|
memoryTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "total_bytes"),
|
||||||
"Memory total given to cgroup in bytes", []string{"cgroup"}, nil),
|
"Memory total given to cgroup in bytes", []string{"cgroup"}, nil),
|
||||||
|
memoryFailCount: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "fail_count"),
|
||||||
|
"Memory fail count", []string{"cgroup"}, nil),
|
||||||
swapUsed: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "used_bytes"),
|
swapUsed: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "used_bytes"),
|
||||||
"Swap used in bytes", []string{"cgroup"}, nil),
|
"Swap used in bytes", []string{"cgroup"}, nil),
|
||||||
swapTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "total_bytes"),
|
swapTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "total_bytes"),
|
||||||
"Swap total given to cgroup in bytes", []string{"cgroup"}, nil),
|
"Swap total given to cgroup in bytes", []string{"cgroup"}, nil),
|
||||||
|
swapFailCount: prometheus.NewDesc(prometheus.BuildFQName(namespace, "swap", "fail_count"),
|
||||||
|
"Swap fail count", []string{"cgroup"}, nil),
|
||||||
userslice: prometheus.NewDesc(prometheus.BuildFQName(namespace, "userslice", "info"),
|
userslice: prometheus.NewDesc(prometheus.BuildFQName(namespace, "userslice", "info"),
|
||||||
"User slice information", []string{"cgroup", "username", "uid"}, nil),
|
"User slice information", []string{"cgroup", "username", "uid"}, nil),
|
||||||
jobinfo: prometheus.NewDesc(prometheus.BuildFQName(namespace, "job", "info"),
|
jobinfo: prometheus.NewDesc(prometheus.BuildFQName(namespace, "job", "info"),
|
||||||
|
@ -277,8 +285,10 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
|
||||||
metric.cpuTotal = float64(stats.CPU.Usage.Total) / 1000000000.0
|
metric.cpuTotal = float64(stats.CPU.Usage.Total) / 1000000000.0
|
||||||
metric.memoryUsed = float64(stats.Memory.Usage.Usage)
|
metric.memoryUsed = float64(stats.Memory.Usage.Usage)
|
||||||
metric.memoryTotal = float64(stats.Memory.Usage.Limit)
|
metric.memoryTotal = float64(stats.Memory.Usage.Limit)
|
||||||
|
metric.memoryFailCount = float64(stats.Memory.Usage.Failcnt)
|
||||||
metric.swapUsed = float64(stats.Memory.Swap.Usage) - metric.memoryUsed
|
metric.swapUsed = float64(stats.Memory.Swap.Usage) - metric.memoryUsed
|
||||||
metric.swapTotal = float64(stats.Memory.Swap.Limit) - metric.memoryTotal
|
metric.swapTotal = float64(stats.Memory.Swap.Limit) - metric.memoryTotal
|
||||||
|
metric.swapFailCount = float64(stats.Memory.Swap.Failcnt) - metric.memoryFailCount
|
||||||
if cpus, err := getCPUs(name); err == nil {
|
if cpus, err := getCPUs(name); err == nil {
|
||||||
metric.cpus = cpus
|
metric.cpus = cpus
|
||||||
}
|
}
|
||||||
|
@ -297,6 +307,10 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
|
||||||
ch <- e.cpus
|
ch <- e.cpus
|
||||||
ch <- e.memoryUsed
|
ch <- e.memoryUsed
|
||||||
ch <- e.memoryTotal
|
ch <- e.memoryTotal
|
||||||
|
ch <- e.memoryFailCount
|
||||||
|
ch <- e.swapUsed
|
||||||
|
ch <- e.swapTotal
|
||||||
|
ch <- e.swapFailCount
|
||||||
ch <- e.success
|
ch <- e.success
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -315,8 +329,10 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
|
||||||
ch <- prometheus.MustNewConstMetric(e.cpus, prometheus.GaugeValue, float64(m.cpus), m.name)
|
ch <- prometheus.MustNewConstMetric(e.cpus, prometheus.GaugeValue, float64(m.cpus), m.name)
|
||||||
ch <- prometheus.MustNewConstMetric(e.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.name)
|
ch <- prometheus.MustNewConstMetric(e.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.name)
|
||||||
ch <- prometheus.MustNewConstMetric(e.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.name)
|
ch <- prometheus.MustNewConstMetric(e.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.name)
|
||||||
|
ch <- prometheus.MustNewConstMetric(e.memoryFailCount, prometheus.GaugeValue, m.memoryFailCount, m.name)
|
||||||
ch <- prometheus.MustNewConstMetric(e.swapUsed, prometheus.GaugeValue, m.swapUsed, m.name)
|
ch <- prometheus.MustNewConstMetric(e.swapUsed, prometheus.GaugeValue, m.swapUsed, m.name)
|
||||||
ch <- prometheus.MustNewConstMetric(e.swapTotal, prometheus.GaugeValue, m.swapTotal, m.name)
|
ch <- prometheus.MustNewConstMetric(e.swapTotal, prometheus.GaugeValue, m.swapTotal, m.name)
|
||||||
|
ch <- prometheus.MustNewConstMetric(e.swapFailCount, prometheus.GaugeValue, m.swapFailCount, m.name)
|
||||||
if m.userslice {
|
if m.userslice {
|
||||||
ch <- prometheus.MustNewConstMetric(e.userslice, prometheus.GaugeValue, 1, m.name, m.username, m.uid)
|
ch <- prometheus.MustNewConstMetric(e.userslice, prometheus.GaugeValue, 1, m.name, m.username, m.uid)
|
||||||
}
|
}
|
||||||
|
|
|
@ -76,12 +76,18 @@ func TestCollectUserSlice(t *testing.T) {
|
||||||
if val := metrics[0].memoryTotal; val != 68719476736 {
|
if val := metrics[0].memoryTotal; val != 68719476736 {
|
||||||
t.Errorf("Unexpected value for memoryTotal, got %v", val)
|
t.Errorf("Unexpected value for memoryTotal, got %v", val)
|
||||||
}
|
}
|
||||||
|
if val := metrics[0].memoryFailCount; val != 0 {
|
||||||
|
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
|
||||||
|
}
|
||||||
if val := metrics[0].swapUsed; val != 0 {
|
if val := metrics[0].swapUsed; val != 0 {
|
||||||
t.Errorf("Unexpected value for swapUsed, got %v", val)
|
t.Errorf("Unexpected value for swapUsed, got %v", val)
|
||||||
}
|
}
|
||||||
if val := metrics[0].swapTotal; val != 9223371968135295000 {
|
if val := metrics[0].swapTotal; val != 9223371968135295000 {
|
||||||
t.Errorf("Unexpected value for swapTotal, got %v", val)
|
t.Errorf("Unexpected value for swapTotal, got %v", val)
|
||||||
}
|
}
|
||||||
|
if val := metrics[0].swapFailCount; val != 0 {
|
||||||
|
t.Errorf("Unexpected value for swapFailCount, got %v", val)
|
||||||
|
}
|
||||||
if val := metrics[0].uid; val != "20821" {
|
if val := metrics[0].uid; val != "20821" {
|
||||||
t.Errorf("Unexpected value for uid, got %v", val)
|
t.Errorf("Unexpected value for uid, got %v", val)
|
||||||
}
|
}
|
||||||
|
@ -125,12 +131,18 @@ func TestCollectSLURM(t *testing.T) {
|
||||||
if val := metrics[0].memoryTotal; val != 2147483648 {
|
if val := metrics[0].memoryTotal; val != 2147483648 {
|
||||||
t.Errorf("Unexpected value for memoryTotal, got %v", val)
|
t.Errorf("Unexpected value for memoryTotal, got %v", val)
|
||||||
}
|
}
|
||||||
|
if val := metrics[0].memoryFailCount; val != 0 {
|
||||||
|
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
|
||||||
|
}
|
||||||
if val := metrics[0].swapUsed; val != 0 {
|
if val := metrics[0].swapUsed; val != 0 {
|
||||||
t.Errorf("Unexpected value for swapUsed, got %v", val)
|
t.Errorf("Unexpected value for swapUsed, got %v", val)
|
||||||
}
|
}
|
||||||
if val := metrics[0].swapTotal; val != 0 {
|
if val := metrics[0].swapTotal; val != 0 {
|
||||||
t.Errorf("Unexpected value for swapTotal, got %v", val)
|
t.Errorf("Unexpected value for swapTotal, got %v", val)
|
||||||
}
|
}
|
||||||
|
if val := metrics[0].swapFailCount; val != 0 {
|
||||||
|
t.Errorf("Unexpected value for swapFailCount, got %v", val)
|
||||||
|
}
|
||||||
if val := metrics[0].uid; val != "20821" {
|
if val := metrics[0].uid; val != "20821" {
|
||||||
t.Errorf("Unexpected value for uid, got %v", val)
|
t.Errorf("Unexpected value for uid, got %v", val)
|
||||||
}
|
}
|
||||||
|
@ -177,12 +189,18 @@ func TestCollectTorque(t *testing.T) {
|
||||||
if val := metrics[0].memoryTotal; val != 196755132416 {
|
if val := metrics[0].memoryTotal; val != 196755132416 {
|
||||||
t.Errorf("Unexpected value for memoryTotal, got %v", val)
|
t.Errorf("Unexpected value for memoryTotal, got %v", val)
|
||||||
}
|
}
|
||||||
|
if val := metrics[0].memoryFailCount; val != 0 {
|
||||||
|
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
|
||||||
|
}
|
||||||
if val := metrics[0].swapUsed; val != 0 {
|
if val := metrics[0].swapUsed; val != 0 {
|
||||||
t.Errorf("Unexpected value for swapUsed, got %v", val)
|
t.Errorf("Unexpected value for swapUsed, got %v", val)
|
||||||
}
|
}
|
||||||
if val := metrics[0].swapTotal; val != 0 {
|
if val := metrics[0].swapTotal; val != 0 {
|
||||||
t.Errorf("Unexpected value for swapTotal, got %v", val)
|
t.Errorf("Unexpected value for swapTotal, got %v", val)
|
||||||
}
|
}
|
||||||
|
if val := metrics[0].swapFailCount; val != 0 {
|
||||||
|
t.Errorf("Unexpected value for swapFailCount, got %v", val)
|
||||||
|
}
|
||||||
if val := metrics[0].uid; val != "" {
|
if val := metrics[0].uid; val != "" {
|
||||||
t.Errorf("Unexpected value for uid, got %v", val)
|
t.Errorf("Unexpected value for uid, got %v", val)
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue