SLURM support

This commit is contained in:
Trey Dockendorf 2020-02-13 10:42:16 -05:00
parent 2c8d40b431
commit fb1bf083dc
2 changed files with 133 additions and 19 deletions

View File

@ -50,12 +50,15 @@ type CgroupMetric struct {
cpuSystem float64 cpuSystem float64
cpuTotal float64 cpuTotal float64
cpus int cpus int
uid string
username string
memoryUsed float64 memoryUsed float64
memoryTotal float64 memoryTotal float64
swapUsed float64 swapUsed float64
swapTotal float64 swapTotal float64
userslice bool
job bool
uid string
username string
jobid string
} }
type Exporter struct { type Exporter struct {
@ -69,6 +72,7 @@ type Exporter struct {
swapUsed *prometheus.Desc swapUsed *prometheus.Desc
swapTotal *prometheus.Desc swapTotal *prometheus.Desc
userslice *prometheus.Desc userslice *prometheus.Desc
jobinfo *prometheus.Desc
success *prometheus.Desc success *prometheus.Desc
} }
@ -107,7 +111,7 @@ func getCPUs(name string) (int, error) {
log.Errorf("Error reading %s: %s", cpusPath, err.Error()) log.Errorf("Error reading %s: %s", cpusPath, err.Error())
return 0, err return 0, err
} }
cpus, err := parseCpuSet(string(cpusData)) cpus, err := parseCpuSet(strings.TrimSuffix(string(cpusData), "\n"))
if err != nil { if err != nil {
log.Errorf("Error parsing cpu set %s", err.Error()) log.Errorf("Error parsing cpu set %s", err.Error())
return 0, err return 0, err
@ -142,6 +146,61 @@ func parseCpuSet(cpuset string) (int, error) {
return cpus, nil return cpus, nil
} }
// userSliceRe matches the base name of a user slice cgroup such as
// "user-1000.slice" and captures the numeric uid. The dot is escaped so that
// only a literal "." is accepted before "slice".
var userSliceRe = regexp.MustCompile(`^user-([0-9]+)\.slice$`)

// slurmJobRe matches a full SLURM job cgroup name such as
// "/slurm/uid_1000/job_42" and captures the numeric uid and the job id.
var slurmJobRe = regexp.MustCompile(`^/slurm/uid_([0-9]+)/job_([0-9]+)$`)

// getInfo classifies the cgroup called name and records the result on metric.
//
// If the base name of name is a user slice, metric.userslice and metric.uid
// are set. Otherwise, if name is a SLURM job cgroup, metric.job, metric.uid
// and metric.jobid are set. In both cases the uid is resolved to a username;
// a failed lookup is logged and leaves metric.username unchanged. When name
// matches neither pattern, metric is not modified.
func getInfo(name string, metric *CgroupMetric) {
	if m := userSliceRe.FindStringSubmatch(filepath.Base(name)); m != nil {
		metric.userslice = true
		metric.uid = m[1]
		setUsername(metric, "user slice")
		return
	}
	if m := slurmJobRe.FindStringSubmatch(name); m != nil {
		metric.job = true
		metric.uid = m[1]
		metric.jobid = m[2]
		setUsername(metric, "slurm")
	}
}

// setUsername resolves metric.uid to a username and stores it on metric.
// kind names the cgroup type and is only used in the error log message.
func setUsername(metric *CgroupMetric, kind string) {
	u, err := user.LookupId(metric.uid)
	if err != nil {
		log.Errorf("Error looking up %s uid %s: %s", kind, metric.uid, err.Error())
		return
	}
	metric.username = u.Username
}
// getName derives the cgroup name to report for process p.
//
// The name is p.Path with the "<cgroupRoot>/cpuacct" prefix and any trailing
// "/" removed. A name that splits into exactly three "/"-separated elements
// is returned as is. Otherwise the name is cut after the first element that
// starts with "job_", so processes in nested cgroups below a job are reported
// under the job-level cgroup. Names with no "job_" element are returned
// unchanged.
//
// The path argument is not used, and the returned error is always nil.
func getName(p cgroups.Process, path string) (string, error) {
	prefix := filepath.Join(*cgroupRoot, "cpuacct")
	name := strings.TrimSuffix(strings.TrimPrefix(p.Path, prefix), "/")
	parts := strings.Split(name, "/")
	log.Debugf("cgroup name dirs %v", parts)

	// Handle user.slice, system.slice and torque
	if len(parts) == 3 {
		return name, nil
	}

	// Handle deeper cgroup where we want higher level, mainly SLURM
	for idx := range parts {
		if strings.HasPrefix(parts[idx], "job_") {
			return strings.Join(parts[:idx+1], "/"), nil
		}
	}
	return name, nil
}
func NewExporter(paths []string) *Exporter { func NewExporter(paths []string) *Exporter {
return &Exporter{ return &Exporter{
paths: paths, paths: paths,
@ -163,6 +222,8 @@ func NewExporter(paths []string) *Exporter {
"Swap total given to cgroup in bytes", []string{"cgroup"}, nil), "Swap total given to cgroup in bytes", []string{"cgroup"}, nil),
userslice: prometheus.NewDesc(prometheus.BuildFQName(namespace, "userslice", "info"), userslice: prometheus.NewDesc(prometheus.BuildFQName(namespace, "userslice", "info"),
"User slice information", []string{"cgroup", "username", "uid"}, nil), "User slice information", []string{"cgroup", "username", "uid"}, nil),
jobinfo: prometheus.NewDesc(prometheus.BuildFQName(namespace, "job", "info"),
"User slice information", []string{"cgroup", "username", "uid", "jobid"}, nil),
success: prometheus.NewDesc(prometheus.BuildFQName(namespace, "exporter", "success"), success: prometheus.NewDesc(prometheus.BuildFQName(namespace, "exporter", "success"),
"Exporter status, 1=successful 0=errors", nil, nil), "Exporter status, 1=successful 0=errors", nil, nil),
} }
@ -172,6 +233,7 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
var names []string var names []string
var metrics []CgroupMetric var metrics []CgroupMetric
for _, path := range e.paths { for _, path := range e.paths {
log.Debugf("Loading cgroup path %v", path)
control, err := cgroups.Load(subsystem, cgroups.StaticPath(path)) control, err := cgroups.Load(subsystem, cgroups.StaticPath(path))
if err != nil { if err != nil {
log.Errorf("Error loading cgroup subsystem path %s: %s", path, err.Error()) log.Errorf("Error loading cgroup subsystem path %s: %s", path, err.Error())
@ -182,15 +244,19 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
log.Errorf("Error loading cgroup processes for path %s: %s", path, err.Error()) log.Errorf("Error loading cgroup processes for path %s: %s", path, err.Error())
return nil, err return nil, err
} }
log.Debugf("Found %d processes", len(processes))
for _, p := range processes { for _, p := range processes {
cpuacctPath := filepath.Join(*cgroupRoot, "cpuacct") name, err := getName(p, path)
name := strings.TrimPrefix(p.Path, cpuacctPath) if err != nil {
name = strings.TrimSuffix(name, "/") log.Errorf("Error getting cgroup name for for process %s at path %s: %s", p.Path, path, err.Error())
continue
}
if sliceContains(names, name) { if sliceContains(names, name) {
continue continue
} }
names = append(names, name) names = append(names, name)
metric := CgroupMetric{name: name} metric := CgroupMetric{name: name}
log.Debugf("Loading cgroup path %s", name)
ctrl, err := cgroups.Load(subsystem, func(subsystem cgroups.Name) (string, error) { ctrl, err := cgroups.Load(subsystem, func(subsystem cgroups.Name) (string, error) {
return name, nil return name, nil
}) })
@ -209,18 +275,7 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
if cpus, err := getCPUs(name); err == nil { if cpus, err := getCPUs(name); err == nil {
metric.cpus = cpus metric.cpus = cpus
} }
pathBase := filepath.Base(name) getInfo(name, &metric)
userSlicePattern := regexp.MustCompile("^user-([0-9]+).slice$")
match := userSlicePattern.FindStringSubmatch(pathBase)
if len(match) == 2 {
metric.uid = match[1]
user, err := user.LookupId(metric.uid)
if err != nil {
log.Errorf("Error looking up user slice uid %s: %s", metric.uid, err.Error())
} else {
metric.username = user.Username
}
}
metrics = append(metrics, metric) metrics = append(metrics, metric)
} }
} }
@ -255,9 +310,12 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(e.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.name) ch <- prometheus.MustNewConstMetric(e.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.name)
ch <- prometheus.MustNewConstMetric(e.swapUsed, prometheus.GaugeValue, m.swapUsed, m.name) ch <- prometheus.MustNewConstMetric(e.swapUsed, prometheus.GaugeValue, m.swapUsed, m.name)
ch <- prometheus.MustNewConstMetric(e.swapTotal, prometheus.GaugeValue, m.swapTotal, m.name) ch <- prometheus.MustNewConstMetric(e.swapTotal, prometheus.GaugeValue, m.swapTotal, m.name)
if m.username != "" { if m.userslice {
ch <- prometheus.MustNewConstMetric(e.userslice, prometheus.GaugeValue, 1, m.name, m.username, m.uid) ch <- prometheus.MustNewConstMetric(e.userslice, prometheus.GaugeValue, 1, m.name, m.username, m.uid)
} }
if m.job {
ch <- prometheus.MustNewConstMetric(e.jobinfo, prometheus.GaugeValue, 1, m.name, m.username, m.uid, m.jobid)
}
} }
} }

View File

@ -14,6 +14,7 @@
package main package main
import ( import (
"github.com/prometheus/common/log"
kingpin "gopkg.in/alecthomas/kingpin.v2" kingpin "gopkg.in/alecthomas/kingpin.v2"
"path/filepath" "path/filepath"
"runtime" "runtime"
@ -81,4 +82,59 @@ func TestCollectUserSlice(t *testing.T) {
if val := metrics[0].swapTotal; val != 9223372036854771712 { if val := metrics[0].swapTotal; val != 9223372036854771712 {
t.Errorf("Unexpected value for swapTotal, got %v", val) t.Errorf("Unexpected value for swapTotal, got %v", val)
} }
if val := metrics[0].uid; val != "20821" {
t.Errorf("Unexpected value for uid, got %v", val)
}
}
// TestCollectSLURM runs the exporter's collect() against the "/slurm" cgroup
// path under the local fixture directory and checks that exactly one metric
// is produced with the expected CPU, memory, swap, uid and jobid values.
func TestCollectSLURM(t *testing.T) {
// Parse the command line so that --config.paths is set to /slurm.
if _, err := kingpin.CommandLine.Parse([]string{"--config.paths=/slurm"}); err != nil {
t.Fatal(err)
}
// Enable debug logging for this test.
log.Base().SetLevel("debug")
// Locate the directory containing this test file and use its "test"
// subdirectory as the cgroup root.
_, filename, _, _ := runtime.Caller(0)
dir := filepath.Dir(filename)
fixture := filepath.Join(dir, "test")
cgroupRoot = &fixture
exporter := NewExporter([]string{"/slurm"})
metrics, err := exporter.collect()
if err != nil {
t.Errorf("Unexpected error: %s", err.Error())
return
}
// The remaining checks index metrics[0], so stop here unless exactly one
// metric was collected.
if val := len(metrics); val != 1 {
t.Errorf("Unexpected number of metrics, got %d expected 1", val)
return
}
if val := metrics[0].cpuUser; val != 0 {
t.Errorf("Unexpected value for cpuUser, got %v", val)
}
if val := metrics[0].cpuSystem; val != 0 {
t.Errorf("Unexpected value for cpuSystem, got %v", val)
}
// cpuTotal is compared with exact floating point equality.
if val := metrics[0].cpuTotal; val != 0.007710215 {
t.Errorf("Unexpected value for cpuTotal, got %v", val)
}
if val := metrics[0].cpus; val != 2 {
t.Errorf("Unexpected value for cpus, got %v", val)
}
if val := metrics[0].memoryUsed; val != 356352 {
t.Errorf("Unexpected value for memoryUsed, got %v", val)
}
if val := metrics[0].memoryTotal; val != 2147483648 {
t.Errorf("Unexpected value for memoryTotal, got %v", val)
}
if val := metrics[0].swapUsed; val != 356352 {
t.Errorf("Unexpected value for swapUsed, got %v", val)
}
if val := metrics[0].swapTotal; val != 2147483648 {
t.Errorf("Unexpected value for swapTotal, got %v", val)
}
// uid and jobid are expected to be filled in for the SLURM job cgroup.
if val := metrics[0].uid; val != "20821" {
t.Errorf("Unexpected value for uid, got %v", val)
}
if val := metrics[0].jobid; val != "10" {
t.Errorf("Unexpected value for jobid, got %v", val)
}
} }