From 9bfac687d6ba743005c7bf0fe019c7712fefc827 Mon Sep 17 00:00:00 2001 From: Trey Dockendorf Date: Thu, 20 Feb 2020 13:48:02 -0500 Subject: [PATCH] Add metric to indicate collect failures --- cgroup_exporter.go | 17 ++++++++++++----- cgroup_exporter_test.go | 11 ++++++++--- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/cgroup_exporter.go b/cgroup_exporter.go index 710ceb2..02a0552 100644 --- a/cgroup_exporter.go +++ b/cgroup_exporter.go @@ -65,6 +65,7 @@ type CgroupMetric struct { type Exporter struct { paths []string + collectError *prometheus.Desc cpuUser *prometheus.Desc cpuSystem *prometheus.Desc cpuTotal *prometheus.Desc @@ -215,6 +216,8 @@ func getName(p cgroups.Process, path string) (string, error) { func NewExporter(paths []string) *Exporter { return &Exporter{ paths: paths, + collectError: prometheus.NewDesc(prometheus.BuildFQName(namespace, "exporter", "collect_error"), + "Indicates exporter error, 0=no error, 1=error", []string{"path", "error"}, nil), cpuUser: prometheus.NewDesc(prometheus.BuildFQName(namespace, "cpu", "user_seconds"), "Cumalitive CPU user seconds for cgroup", []string{"cgroup"}, nil), cpuSystem: prometheus.NewDesc(prometheus.BuildFQName(namespace, "cpu", "kernel_seconds"), @@ -244,7 +247,7 @@ func NewExporter(paths []string) *Exporter { } } -func (e *Exporter) collect() ([]CgroupMetric, error) { +func (e *Exporter) collect(ch chan<- prometheus.Metric) ([]CgroupMetric, error) { var names []string var metrics []CgroupMetric for _, path := range e.paths { @@ -252,12 +255,14 @@ func (e *Exporter) collect() ([]CgroupMetric, error) { control, err := cgroups.Load(subsystem, cgroups.StaticPath(path)) if err != nil { log.Errorf("Error loading cgroup subsystem path %s: %s", path, err.Error()) - return nil, err + ch <- prometheus.MustNewConstMetric(e.collectError, prometheus.GaugeValue, 1, path, "load-subsystem") + continue } processes, err := control.Processes(cgroups.Cpuacct, true) if err != nil { log.Errorf("Error loading cgroup processes for path %s: %s", path, err.Error()) - return nil, err + ch <- prometheus.MustNewConstMetric(e.collectError, prometheus.GaugeValue, 1, path, "load-processes") + continue } log.Debugf("Found %d processes", len(processes)) for _, p := range processes { @@ -277,7 +282,8 @@ func (e *Exporter) collect() ([]CgroupMetric, error) { }) if err != nil { log.Errorf("Failed to load cgroups for %s: %s", name, err.Error()) - return nil, err + ch <- prometheus.MustNewConstMetric(e.collectError, prometheus.GaugeValue, 1, name, "load-subsystem") + continue } stats, _ := ctrl.Stat(cgroups.IgnoreNotExist) metric.cpuUser = float64(stats.CPU.Usage.User) / 1000000000.0 @@ -311,11 +317,12 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) { ch <- e.swapUsed ch <- e.swapTotal ch <- e.swapFailCount + ch <- e.collectError ch <- e.success } func (e *Exporter) Collect(ch chan<- prometheus.Metric) { - metrics, err := e.collect() + metrics, err := e.collect(ch) if err != nil { log.Errorf("Exporter error: %s", err.Error()) ch <- prometheus.MustNewConstMetric(e.success, prometheus.GaugeValue, 0) diff --git a/cgroup_exporter_test.go b/cgroup_exporter_test.go index e0a438d..beebc7c 100644 --- a/cgroup_exporter_test.go +++ b/cgroup_exporter_test.go @@ -14,6 +14,7 @@ package main import ( + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/log" kingpin "gopkg.in/alecthomas/kingpin.v2" "path/filepath" @@ -21,6 +22,10 @@ import ( "testing" ) +var ( + ch = make(chan prometheus.Metric) +) + func TestParseCpuSet(t *testing.T) { if cpus, err := parseCpuSet("0-2"); err != nil { t.Errorf("Unexpected error: %s", err.Error()) @@ -49,7 +54,7 @@ func TestCollectUserSlice(t *testing.T) { cgroupRoot = &fixture exporter := NewExporter([]string{"/user.slice"}) - metrics, err := exporter.collect() + metrics, err := exporter.collect(ch) if err != nil { t.Errorf("Unexpected error: %s", err.Error()) return @@ -104,7 +109,7 @@ func TestCollectSLURM(t *testing.T) { cgroupRoot = &fixture exporter := NewExporter([]string{"/slurm"}) - metrics, err := exporter.collect() + metrics, err := exporter.collect(ch) if err != nil { t.Errorf("Unexpected error: %s", err.Error()) return @@ -162,7 +167,7 @@ func TestCollectTorque(t *testing.T) { cgroupRoot = &fixture exporter := NewExporter([]string{"/torque"}) - metrics, err := exporter.collect() + metrics, err := exporter.collect(ch) if err != nil { t.Errorf("Unexpected error: %s", err.Error()) return