SLURM support

This commit is contained in:
Trey Dockendorf 2020-02-13 10:42:16 -05:00
parent 2c8d40b431
commit fb1bf083dc
2 changed files with 133 additions and 19 deletions

View File

@ -50,12 +50,15 @@ type CgroupMetric struct {
cpuSystem float64
cpuTotal float64
cpus int
uid string
username string
memoryUsed float64
memoryTotal float64
swapUsed float64
swapTotal float64
userslice bool
job bool
uid string
username string
jobid string
}
type Exporter struct {
@ -69,6 +72,7 @@ type Exporter struct {
swapUsed *prometheus.Desc
swapTotal *prometheus.Desc
userslice *prometheus.Desc
jobinfo *prometheus.Desc
success *prometheus.Desc
}
@ -107,7 +111,7 @@ func getCPUs(name string) (int, error) {
log.Errorf("Error reading %s: %s", cpusPath, err.Error())
return 0, err
}
cpus, err := parseCpuSet(string(cpusData))
cpus, err := parseCpuSet(strings.TrimSuffix(string(cpusData), "\n"))
if err != nil {
log.Errorf("Error parsing cpu set %s", err.Error())
return 0, err
@ -142,6 +146,61 @@ func parseCpuSet(cpuset string) (int, error) {
return cpus, nil
}
// Patterns used by getInfo to classify a cgroup. They are compiled once at
// package initialisation rather than on every getInfo call.
var (
	// userSliceRegexp matches the base name of a user slice cgroup such as
	// "user-1000.slice"; the dot is escaped so it only matches a literal ".".
	userSliceRegexp = regexp.MustCompile(`^user-([0-9]+)\.slice$`)
	// slurmJobRegexp matches a complete SLURM job cgroup name such as
	// "/slurm/uid_1000/job_10".
	slurmJobRegexp = regexp.MustCompile(`^/slurm/uid_([0-9]+)/job_([0-9]+)$`)
)

// getInfo fills in the identity fields of metric based on the cgroup name.
//
// If the base name of name looks like a user slice ("user-<uid>.slice"),
// metric.userslice is set and metric.uid is taken from the name. Otherwise,
// if the whole name looks like a SLURM job ("/slurm/uid_<uid>/job_<jobid>"),
// metric.job is set and metric.uid / metric.jobid are taken from the name.
// In both cases the uid is resolved to a username; a failed lookup is logged
// and leaves metric.username unchanged. Names matching neither pattern leave
// metric untouched.
func getInfo(name string, metric *CgroupMetric) {
	if m := userSliceRegexp.FindStringSubmatch(filepath.Base(name)); m != nil {
		metric.userslice = true
		metric.uid = m[1]
		setUsername(metric, "user slice")
		return
	}
	if m := slurmJobRegexp.FindStringSubmatch(name); m != nil {
		metric.job = true
		metric.uid = m[1]
		metric.jobid = m[2]
		setUsername(metric, "slurm")
	}
}

// setUsername resolves metric.uid to a username and stores it in
// metric.username. On lookup failure the error is logged, with kind naming
// the cgroup flavour in the message, and metric.username is left as it was.
func setUsername(metric *CgroupMetric, kind string) {
	// The local is named u so it does not shadow the os/user package.
	u, err := user.LookupId(metric.uid)
	if err != nil {
		log.Errorf("Error looking up %s uid %s: %s", kind, metric.uid, err.Error())
		return
	}
	metric.username = u.Username
}
// getName derives the cgroup name to report for process cgroup p.
//
// The cpuacct root (*cgroupRoot joined with "cpuacct") is stripped from the
// front of p.Path and a trailing "/" is removed. If the result splits into
// exactly three "/"-separated elements it is returned unchanged. Otherwise
// the name is cut after the first element that starts with "job_", so deeper
// cgroups collapse to their job-level cgroup. When no such element exists the
// stripped name is returned as is.
//
// The path argument is not used by the body, and the returned error is
// always nil.
func getName(p cgroups.Process, path string) (string, error) {
	prefix := filepath.Join(*cgroupRoot, "cpuacct")
	cgroupName := strings.TrimSuffix(strings.TrimPrefix(p.Path, prefix), "/")
	parts := strings.Split(cgroupName, "/")
	log.Debugf("cgroup name dirs %v", parts)

	// Three elements: nothing to shorten (the original comment lists
	// user.slice, system.slice and torque as the cases handled here).
	if len(parts) == 3 {
		return cgroupName, nil
	}

	// Deeper cgroups, mainly SLURM: keep everything up to and including the
	// first "job_" element.
	for idx := range parts {
		if strings.HasPrefix(parts[idx], "job_") {
			return strings.Join(parts[:idx+1], "/"), nil
		}
	}
	return cgroupName, nil
}
func NewExporter(paths []string) *Exporter {
return &Exporter{
paths: paths,
@ -163,6 +222,8 @@ func NewExporter(paths []string) *Exporter {
"Swap total given to cgroup in bytes", []string{"cgroup"}, nil),
userslice: prometheus.NewDesc(prometheus.BuildFQName(namespace, "userslice", "info"),
"User slice information", []string{"cgroup", "username", "uid"}, nil),
jobinfo: prometheus.NewDesc(prometheus.BuildFQName(namespace, "job", "info"),
"User slice information", []string{"cgroup", "username", "uid", "jobid"}, nil),
success: prometheus.NewDesc(prometheus.BuildFQName(namespace, "exporter", "success"),
"Exporter status, 1=successful 0=errors", nil, nil),
}
@ -172,6 +233,7 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
var names []string
var metrics []CgroupMetric
for _, path := range e.paths {
log.Debugf("Loading cgroup path %v", path)
control, err := cgroups.Load(subsystem, cgroups.StaticPath(path))
if err != nil {
log.Errorf("Error loading cgroup subsystem path %s: %s", path, err.Error())
@ -182,15 +244,19 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
log.Errorf("Error loading cgroup processes for path %s: %s", path, err.Error())
return nil, err
}
log.Debugf("Found %d processes", len(processes))
for _, p := range processes {
cpuacctPath := filepath.Join(*cgroupRoot, "cpuacct")
name := strings.TrimPrefix(p.Path, cpuacctPath)
name = strings.TrimSuffix(name, "/")
name, err := getName(p, path)
if err != nil {
log.Errorf("Error getting cgroup name for for process %s at path %s: %s", p.Path, path, err.Error())
continue
}
if sliceContains(names, name) {
continue
}
names = append(names, name)
metric := CgroupMetric{name: name}
log.Debugf("Loading cgroup path %s", name)
ctrl, err := cgroups.Load(subsystem, func(subsystem cgroups.Name) (string, error) {
return name, nil
})
@ -209,18 +275,7 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
if cpus, err := getCPUs(name); err == nil {
metric.cpus = cpus
}
pathBase := filepath.Base(name)
userSlicePattern := regexp.MustCompile("^user-([0-9]+).slice$")
match := userSlicePattern.FindStringSubmatch(pathBase)
if len(match) == 2 {
metric.uid = match[1]
user, err := user.LookupId(metric.uid)
if err != nil {
log.Errorf("Error looking up user slice uid %s: %s", metric.uid, err.Error())
} else {
metric.username = user.Username
}
}
getInfo(name, &metric)
metrics = append(metrics, metric)
}
}
@ -255,9 +310,12 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(e.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.name)
ch <- prometheus.MustNewConstMetric(e.swapUsed, prometheus.GaugeValue, m.swapUsed, m.name)
ch <- prometheus.MustNewConstMetric(e.swapTotal, prometheus.GaugeValue, m.swapTotal, m.name)
if m.username != "" {
if m.userslice {
ch <- prometheus.MustNewConstMetric(e.userslice, prometheus.GaugeValue, 1, m.name, m.username, m.uid)
}
if m.job {
ch <- prometheus.MustNewConstMetric(e.jobinfo, prometheus.GaugeValue, 1, m.name, m.username, m.uid, m.jobid)
}
}
}

View File

@ -14,6 +14,7 @@
package main
import (
"github.com/prometheus/common/log"
kingpin "gopkg.in/alecthomas/kingpin.v2"
"path/filepath"
"runtime"
@ -81,4 +82,59 @@ func TestCollectUserSlice(t *testing.T) {
if val := metrics[0].swapTotal; val != 9223372036854771712 {
t.Errorf("Unexpected value for swapTotal, got %v", val)
}
if val := metrics[0].uid; val != "20821" {
t.Errorf("Unexpected value for uid, got %v", val)
}
}
// TestCollectSLURM runs the exporter's collect step against the "test"
// fixture directory next to this file, using the "/slurm" cgroup path, and
// checks that exactly one metric comes back with the expected CPU, memory,
// swap, uid and jobid values.
func TestCollectSLURM(t *testing.T) {
	_, parseErr := kingpin.CommandLine.Parse([]string{"--config.paths=/slurm"})
	if parseErr != nil {
		t.Fatal(parseErr)
	}
	log.Base().SetLevel("debug")

	// Point the package-level cgroup root at the fixture tree located
	// relative to this source file.
	_, thisFile, _, _ := runtime.Caller(0)
	fixtureDir := filepath.Join(filepath.Dir(thisFile), "test")
	cgroupRoot = &fixtureDir

	metrics, err := NewExporter([]string{"/slurm"}).collect()
	if err != nil {
		t.Errorf("Unexpected error: %s", err.Error())
		return
	}
	if count := len(metrics); count != 1 {
		t.Errorf("Unexpected number of metrics, got %d expected 1", count)
		return
	}

	// Checks run in a fixed order so failure output stays stable.
	m := metrics[0]
	if m.cpuUser != 0 {
		t.Errorf("Unexpected value for cpuUser, got %v", m.cpuUser)
	}
	if m.cpuSystem != 0 {
		t.Errorf("Unexpected value for cpuSystem, got %v", m.cpuSystem)
	}
	if m.cpuTotal != 0.007710215 {
		t.Errorf("Unexpected value for cpuTotal, got %v", m.cpuTotal)
	}
	if m.cpus != 2 {
		t.Errorf("Unexpected value for cpus, got %v", m.cpus)
	}
	if m.memoryUsed != 356352 {
		t.Errorf("Unexpected value for memoryUsed, got %v", m.memoryUsed)
	}
	if m.memoryTotal != 2147483648 {
		t.Errorf("Unexpected value for memoryTotal, got %v", m.memoryTotal)
	}
	if m.swapUsed != 356352 {
		t.Errorf("Unexpected value for swapUsed, got %v", m.swapUsed)
	}
	if m.swapTotal != 2147483648 {
		t.Errorf("Unexpected value for swapTotal, got %v", m.swapTotal)
	}
	if m.uid != "20821" {
		t.Errorf("Unexpected value for uid, got %v", m.uid)
	}
	if m.jobid != "10" {
		t.Errorf("Unexpected value for jobid, got %v", m.jobid)
	}
}