Merge pull request #1 from treydock/errors

Better error handling
This commit is contained in:
treydock 2020-02-20 14:04:55 -05:00 committed by GitHub
commit 252ed00465
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 20 additions and 18 deletions

View File

@ -65,6 +65,7 @@ type CgroupMetric struct {
type Exporter struct { type Exporter struct {
paths []string paths []string
collectError *prometheus.Desc
cpuUser *prometheus.Desc cpuUser *prometheus.Desc
cpuSystem *prometheus.Desc cpuSystem *prometheus.Desc
cpuTotal *prometheus.Desc cpuTotal *prometheus.Desc
@ -77,7 +78,6 @@ type Exporter struct {
swapFailCount *prometheus.Desc swapFailCount *prometheus.Desc
userslice *prometheus.Desc userslice *prometheus.Desc
jobinfo *prometheus.Desc jobinfo *prometheus.Desc
success *prometheus.Desc
} }
func fileExists(filename string) bool { func fileExists(filename string) bool {
@ -215,6 +215,8 @@ func getName(p cgroups.Process, path string) (string, error) {
func NewExporter(paths []string) *Exporter { func NewExporter(paths []string) *Exporter {
return &Exporter{ return &Exporter{
paths: paths, paths: paths,
collectError: prometheus.NewDesc(prometheus.BuildFQName(namespace, "exporter", "collect_error"),
"Indicates exporter error, 0=no error, 1=error", []string{"path", "error"}, nil),
cpuUser: prometheus.NewDesc(prometheus.BuildFQName(namespace, "cpu", "user_seconds"), cpuUser: prometheus.NewDesc(prometheus.BuildFQName(namespace, "cpu", "user_seconds"),
"Cumalitive CPU user seconds for cgroup", []string{"cgroup"}, nil), "Cumalitive CPU user seconds for cgroup", []string{"cgroup"}, nil),
cpuSystem: prometheus.NewDesc(prometheus.BuildFQName(namespace, "cpu", "kernel_seconds"), cpuSystem: prometheus.NewDesc(prometheus.BuildFQName(namespace, "cpu", "kernel_seconds"),
@ -239,12 +241,10 @@ func NewExporter(paths []string) *Exporter {
"User slice information", []string{"cgroup", "username", "uid"}, nil), "User slice information", []string{"cgroup", "username", "uid"}, nil),
jobinfo: prometheus.NewDesc(prometheus.BuildFQName(namespace, "job", "info"), jobinfo: prometheus.NewDesc(prometheus.BuildFQName(namespace, "job", "info"),
"User slice information", []string{"cgroup", "username", "uid", "jobid"}, nil), "User slice information", []string{"cgroup", "username", "uid", "jobid"}, nil),
success: prometheus.NewDesc(prometheus.BuildFQName(namespace, "exporter", "success"),
"Exporter status, 1=successful 0=errors", nil, nil),
} }
} }
func (e *Exporter) collect() ([]CgroupMetric, error) { func (e *Exporter) collect(ch chan<- prometheus.Metric) ([]CgroupMetric, error) {
var names []string var names []string
var metrics []CgroupMetric var metrics []CgroupMetric
for _, path := range e.paths { for _, path := range e.paths {
@ -252,12 +252,14 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
control, err := cgroups.Load(subsystem, cgroups.StaticPath(path)) control, err := cgroups.Load(subsystem, cgroups.StaticPath(path))
if err != nil { if err != nil {
log.Errorf("Error loading cgroup subsystem path %s: %s", path, err.Error()) log.Errorf("Error loading cgroup subsystem path %s: %s", path, err.Error())
return nil, err ch <- prometheus.MustNewConstMetric(e.collectError, prometheus.GaugeValue, 1, path, "load-subsystem")
continue
} }
processes, err := control.Processes(cgroups.Cpuacct, true) processes, err := control.Processes(cgroups.Cpuacct, true)
if err != nil { if err != nil {
log.Errorf("Error loading cgroup processes for path %s: %s", path, err.Error()) log.Errorf("Error loading cgroup processes for path %s: %s", path, err.Error())
return nil, err ch <- prometheus.MustNewConstMetric(e.collectError, prometheus.GaugeValue, 1, path, "load-processes")
continue
} }
log.Debugf("Found %d processes", len(processes)) log.Debugf("Found %d processes", len(processes))
for _, p := range processes { for _, p := range processes {
@ -277,7 +279,8 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
}) })
if err != nil { if err != nil {
log.Errorf("Failed to load cgroups for %s: %s", name, err.Error()) log.Errorf("Failed to load cgroups for %s: %s", name, err.Error())
return nil, err ch <- prometheus.MustNewConstMetric(e.collectError, prometheus.GaugeValue, 1, name, "load-subsystem")
continue
} }
stats, _ := ctrl.Stat(cgroups.IgnoreNotExist) stats, _ := ctrl.Stat(cgroups.IgnoreNotExist)
metric.cpuUser = float64(stats.CPU.Usage.User) / 1000000000.0 metric.cpuUser = float64(stats.CPU.Usage.User) / 1000000000.0
@ -311,17 +314,11 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
ch <- e.swapUsed ch <- e.swapUsed
ch <- e.swapTotal ch <- e.swapTotal
ch <- e.swapFailCount ch <- e.swapFailCount
ch <- e.success ch <- e.collectError
} }
func (e *Exporter) Collect(ch chan<- prometheus.Metric) { func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
metrics, err := e.collect() metrics, _ := e.collect(ch)
if err != nil {
log.Errorf("Exporter error: %s", err.Error())
ch <- prometheus.MustNewConstMetric(e.success, prometheus.GaugeValue, 0)
} else {
ch <- prometheus.MustNewConstMetric(e.success, prometheus.GaugeValue, 1)
}
for _, m := range metrics { for _, m := range metrics {
ch <- prometheus.MustNewConstMetric(e.cpuUser, prometheus.GaugeValue, m.cpuUser, m.name) ch <- prometheus.MustNewConstMetric(e.cpuUser, prometheus.GaugeValue, m.cpuUser, m.name)
ch <- prometheus.MustNewConstMetric(e.cpuSystem, prometheus.GaugeValue, m.cpuSystem, m.name) ch <- prometheus.MustNewConstMetric(e.cpuSystem, prometheus.GaugeValue, m.cpuSystem, m.name)

View File

@ -14,6 +14,7 @@
package main package main
import ( import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/log" "github.com/prometheus/common/log"
kingpin "gopkg.in/alecthomas/kingpin.v2" kingpin "gopkg.in/alecthomas/kingpin.v2"
"path/filepath" "path/filepath"
@ -21,6 +22,10 @@ import (
"testing" "testing"
) )
var (
	// ch is the metric sink handed to Exporter.collect by the tests in this
	// file. collect sends one collectError metric on it for every failure it
	// encounters, so the channel is buffered: with an unbuffered channel and
	// no receiver, the first such send would block the test forever instead
	// of letting collect return and the test report the problem.
	// NOTE(review): the buffer is shared by every test and is never drained
	// in the visible tests; a test that expects many error metrics should
	// read from ch (or use its own channel) rather than rely on this slack.
	ch = make(chan prometheus.Metric, 64)
)
func TestParseCpuSet(t *testing.T) { func TestParseCpuSet(t *testing.T) {
if cpus, err := parseCpuSet("0-2"); err != nil { if cpus, err := parseCpuSet("0-2"); err != nil {
t.Errorf("Unexpected error: %s", err.Error()) t.Errorf("Unexpected error: %s", err.Error())
@ -49,7 +54,7 @@ func TestCollectUserSlice(t *testing.T) {
cgroupRoot = &fixture cgroupRoot = &fixture
exporter := NewExporter([]string{"/user.slice"}) exporter := NewExporter([]string{"/user.slice"})
metrics, err := exporter.collect() metrics, err := exporter.collect(ch)
if err != nil { if err != nil {
t.Errorf("Unexpected error: %s", err.Error()) t.Errorf("Unexpected error: %s", err.Error())
return return
@ -104,7 +109,7 @@ func TestCollectSLURM(t *testing.T) {
cgroupRoot = &fixture cgroupRoot = &fixture
exporter := NewExporter([]string{"/slurm"}) exporter := NewExporter([]string{"/slurm"})
metrics, err := exporter.collect() metrics, err := exporter.collect(ch)
if err != nil { if err != nil {
t.Errorf("Unexpected error: %s", err.Error()) t.Errorf("Unexpected error: %s", err.Error())
return return
@ -162,7 +167,7 @@ func TestCollectTorque(t *testing.T) {
cgroupRoot = &fixture cgroupRoot = &fixture
exporter := NewExporter([]string{"/torque"}) exporter := NewExporter([]string{"/torque"})
metrics, err := exporter.collect() metrics, err := exporter.collect(ch)
if err != nil { if err != nil {
t.Errorf("Unexpected error: %s", err.Error()) t.Errorf("Unexpected error: %s", err.Error())
return return