SLURM support
This commit is contained in:
parent
2c8d40b431
commit
fb1bf083dc
|
@ -50,12 +50,15 @@ type CgroupMetric struct {
|
||||||
cpuSystem float64
|
cpuSystem float64
|
||||||
cpuTotal float64
|
cpuTotal float64
|
||||||
cpus int
|
cpus int
|
||||||
uid string
|
|
||||||
username string
|
|
||||||
memoryUsed float64
|
memoryUsed float64
|
||||||
memoryTotal float64
|
memoryTotal float64
|
||||||
swapUsed float64
|
swapUsed float64
|
||||||
swapTotal float64
|
swapTotal float64
|
||||||
|
userslice bool
|
||||||
|
job bool
|
||||||
|
uid string
|
||||||
|
username string
|
||||||
|
jobid string
|
||||||
}
|
}
|
||||||
|
|
||||||
type Exporter struct {
|
type Exporter struct {
|
||||||
|
@ -69,6 +72,7 @@ type Exporter struct {
|
||||||
swapUsed *prometheus.Desc
|
swapUsed *prometheus.Desc
|
||||||
swapTotal *prometheus.Desc
|
swapTotal *prometheus.Desc
|
||||||
userslice *prometheus.Desc
|
userslice *prometheus.Desc
|
||||||
|
jobinfo *prometheus.Desc
|
||||||
success *prometheus.Desc
|
success *prometheus.Desc
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -107,7 +111,7 @@ func getCPUs(name string) (int, error) {
|
||||||
log.Errorf("Error reading %s: %s", cpusPath, err.Error())
|
log.Errorf("Error reading %s: %s", cpusPath, err.Error())
|
||||||
return 0, err
|
return 0, err
|
||||||
}
|
}
|
||||||
cpus, err := parseCpuSet(string(cpusData))
|
cpus, err := parseCpuSet(strings.TrimSuffix(string(cpusData), "\n"))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Errorf("Error parsing cpu set %s", err.Error())
|
log.Errorf("Error parsing cpu set %s", err.Error())
|
||||||
return 0, err
|
return 0, err
|
||||||
|
@ -142,6 +146,61 @@ func parseCpuSet(cpuset string) (int, error) {
|
||||||
return cpus, nil
|
return cpus, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getInfo(name string, metric *CgroupMetric) {
|
||||||
|
pathBase := filepath.Base(name)
|
||||||
|
userSlicePattern := regexp.MustCompile("^user-([0-9]+).slice$")
|
||||||
|
userSliceMatch := userSlicePattern.FindStringSubmatch(pathBase)
|
||||||
|
if len(userSliceMatch) == 2 {
|
||||||
|
metric.userslice = true
|
||||||
|
metric.uid = userSliceMatch[1]
|
||||||
|
user, err := user.LookupId(metric.uid)
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("Error looking up user slice uid %s: %s", metric.uid, err.Error())
|
||||||
|
} else {
|
||||||
|
metric.username = user.Username
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
slurmPattern := regexp.MustCompile("^/slurm/uid_([0-9]+)/job_([0-9]+)$")
|
||||||
|
slurmMatch := slurmPattern.FindStringSubmatch(name)
|
||||||
|
if len(slurmMatch) == 3 {
|
||||||
|
metric.job = true
|
||||||
|
metric.uid = slurmMatch[1]
|
||||||
|
metric.jobid = slurmMatch[2]
|
||||||
|
user, err := user.LookupId(metric.uid)
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("Error looking up slurm uid %s: %s", metric.uid, err.Error())
|
||||||
|
} else {
|
||||||
|
metric.username = user.Username
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func getName(p cgroups.Process, path string) (string, error) {
|
||||||
|
cpuacctPath := filepath.Join(*cgroupRoot, "cpuacct")
|
||||||
|
name := strings.TrimPrefix(p.Path, cpuacctPath)
|
||||||
|
name = strings.TrimSuffix(name, "/")
|
||||||
|
dirs := strings.Split(name, "/")
|
||||||
|
log.Debugf("cgroup name dirs %v", dirs)
|
||||||
|
// Handle user.slice, system.slice and torque
|
||||||
|
if len(dirs) == 3 {
|
||||||
|
return name, nil
|
||||||
|
}
|
||||||
|
// Handle deeper cgroup where we want higher level, mainly SLURM
|
||||||
|
var keepDirs []string
|
||||||
|
for i, d := range dirs {
|
||||||
|
if strings.HasPrefix(d, "job_") {
|
||||||
|
keepDirs = dirs[0 : i+1]
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if keepDirs == nil {
|
||||||
|
return name, nil
|
||||||
|
}
|
||||||
|
name = strings.Join(keepDirs, "/")
|
||||||
|
return name, nil
|
||||||
|
}
|
||||||
|
|
||||||
func NewExporter(paths []string) *Exporter {
|
func NewExporter(paths []string) *Exporter {
|
||||||
return &Exporter{
|
return &Exporter{
|
||||||
paths: paths,
|
paths: paths,
|
||||||
|
@ -163,6 +222,8 @@ func NewExporter(paths []string) *Exporter {
|
||||||
"Swap total given to cgroup in bytes", []string{"cgroup"}, nil),
|
"Swap total given to cgroup in bytes", []string{"cgroup"}, nil),
|
||||||
userslice: prometheus.NewDesc(prometheus.BuildFQName(namespace, "userslice", "info"),
|
userslice: prometheus.NewDesc(prometheus.BuildFQName(namespace, "userslice", "info"),
|
||||||
"User slice information", []string{"cgroup", "username", "uid"}, nil),
|
"User slice information", []string{"cgroup", "username", "uid"}, nil),
|
||||||
|
jobinfo: prometheus.NewDesc(prometheus.BuildFQName(namespace, "job", "info"),
|
||||||
|
"User slice information", []string{"cgroup", "username", "uid", "jobid"}, nil),
|
||||||
success: prometheus.NewDesc(prometheus.BuildFQName(namespace, "exporter", "success"),
|
success: prometheus.NewDesc(prometheus.BuildFQName(namespace, "exporter", "success"),
|
||||||
"Exporter status, 1=successful 0=errors", nil, nil),
|
"Exporter status, 1=successful 0=errors", nil, nil),
|
||||||
}
|
}
|
||||||
|
@ -172,6 +233,7 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
|
||||||
var names []string
|
var names []string
|
||||||
var metrics []CgroupMetric
|
var metrics []CgroupMetric
|
||||||
for _, path := range e.paths {
|
for _, path := range e.paths {
|
||||||
|
log.Debugf("Loading cgroup path %v", path)
|
||||||
control, err := cgroups.Load(subsystem, cgroups.StaticPath(path))
|
control, err := cgroups.Load(subsystem, cgroups.StaticPath(path))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Errorf("Error loading cgroup subsystem path %s: %s", path, err.Error())
|
log.Errorf("Error loading cgroup subsystem path %s: %s", path, err.Error())
|
||||||
|
@ -182,15 +244,19 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
|
||||||
log.Errorf("Error loading cgroup processes for path %s: %s", path, err.Error())
|
log.Errorf("Error loading cgroup processes for path %s: %s", path, err.Error())
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
log.Debugf("Found %d processes", len(processes))
|
||||||
for _, p := range processes {
|
for _, p := range processes {
|
||||||
cpuacctPath := filepath.Join(*cgroupRoot, "cpuacct")
|
name, err := getName(p, path)
|
||||||
name := strings.TrimPrefix(p.Path, cpuacctPath)
|
if err != nil {
|
||||||
name = strings.TrimSuffix(name, "/")
|
log.Errorf("Error getting cgroup name for for process %s at path %s: %s", p.Path, path, err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
if sliceContains(names, name) {
|
if sliceContains(names, name) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
names = append(names, name)
|
names = append(names, name)
|
||||||
metric := CgroupMetric{name: name}
|
metric := CgroupMetric{name: name}
|
||||||
|
log.Debugf("Loading cgroup path %s", name)
|
||||||
ctrl, err := cgroups.Load(subsystem, func(subsystem cgroups.Name) (string, error) {
|
ctrl, err := cgroups.Load(subsystem, func(subsystem cgroups.Name) (string, error) {
|
||||||
return name, nil
|
return name, nil
|
||||||
})
|
})
|
||||||
|
@ -209,18 +275,7 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
|
||||||
if cpus, err := getCPUs(name); err == nil {
|
if cpus, err := getCPUs(name); err == nil {
|
||||||
metric.cpus = cpus
|
metric.cpus = cpus
|
||||||
}
|
}
|
||||||
pathBase := filepath.Base(name)
|
getInfo(name, &metric)
|
||||||
userSlicePattern := regexp.MustCompile("^user-([0-9]+).slice$")
|
|
||||||
match := userSlicePattern.FindStringSubmatch(pathBase)
|
|
||||||
if len(match) == 2 {
|
|
||||||
metric.uid = match[1]
|
|
||||||
user, err := user.LookupId(metric.uid)
|
|
||||||
if err != nil {
|
|
||||||
log.Errorf("Error looking up user slice uid %s: %s", metric.uid, err.Error())
|
|
||||||
} else {
|
|
||||||
metric.username = user.Username
|
|
||||||
}
|
|
||||||
}
|
|
||||||
metrics = append(metrics, metric)
|
metrics = append(metrics, metric)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -255,9 +310,12 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
|
||||||
ch <- prometheus.MustNewConstMetric(e.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.name)
|
ch <- prometheus.MustNewConstMetric(e.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.name)
|
||||||
ch <- prometheus.MustNewConstMetric(e.swapUsed, prometheus.GaugeValue, m.swapUsed, m.name)
|
ch <- prometheus.MustNewConstMetric(e.swapUsed, prometheus.GaugeValue, m.swapUsed, m.name)
|
||||||
ch <- prometheus.MustNewConstMetric(e.swapTotal, prometheus.GaugeValue, m.swapTotal, m.name)
|
ch <- prometheus.MustNewConstMetric(e.swapTotal, prometheus.GaugeValue, m.swapTotal, m.name)
|
||||||
if m.username != "" {
|
if m.userslice {
|
||||||
ch <- prometheus.MustNewConstMetric(e.userslice, prometheus.GaugeValue, 1, m.name, m.username, m.uid)
|
ch <- prometheus.MustNewConstMetric(e.userslice, prometheus.GaugeValue, 1, m.name, m.username, m.uid)
|
||||||
}
|
}
|
||||||
|
if m.job {
|
||||||
|
ch <- prometheus.MustNewConstMetric(e.jobinfo, prometheus.GaugeValue, 1, m.name, m.username, m.uid, m.jobid)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -14,6 +14,7 @@
|
||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"github.com/prometheus/common/log"
|
||||||
kingpin "gopkg.in/alecthomas/kingpin.v2"
|
kingpin "gopkg.in/alecthomas/kingpin.v2"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"runtime"
|
"runtime"
|
||||||
|
@ -81,4 +82,59 @@ func TestCollectUserSlice(t *testing.T) {
|
||||||
if val := metrics[0].swapTotal; val != 9223372036854771712 {
|
if val := metrics[0].swapTotal; val != 9223372036854771712 {
|
||||||
t.Errorf("Unexpected value for swapTotal, got %v", val)
|
t.Errorf("Unexpected value for swapTotal, got %v", val)
|
||||||
}
|
}
|
||||||
|
if val := metrics[0].uid; val != "20821" {
|
||||||
|
t.Errorf("Unexpected value for uid, got %v", val)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCollectSLURM(t *testing.T) {
|
||||||
|
if _, err := kingpin.CommandLine.Parse([]string{"--config.paths=/slurm"}); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
log.Base().SetLevel("debug")
|
||||||
|
_, filename, _, _ := runtime.Caller(0)
|
||||||
|
dir := filepath.Dir(filename)
|
||||||
|
fixture := filepath.Join(dir, "test")
|
||||||
|
cgroupRoot = &fixture
|
||||||
|
|
||||||
|
exporter := NewExporter([]string{"/slurm"})
|
||||||
|
metrics, err := exporter.collect()
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("Unexpected error: %s", err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if val := len(metrics); val != 1 {
|
||||||
|
t.Errorf("Unexpected number of metrics, got %d expected 1", val)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if val := metrics[0].cpuUser; val != 0 {
|
||||||
|
t.Errorf("Unexpected value for cpuUser, got %v", val)
|
||||||
|
}
|
||||||
|
if val := metrics[0].cpuSystem; val != 0 {
|
||||||
|
t.Errorf("Unexpected value for cpuSystem, got %v", val)
|
||||||
|
}
|
||||||
|
if val := metrics[0].cpuTotal; val != 0.007710215 {
|
||||||
|
t.Errorf("Unexpected value for cpuTotal, got %v", val)
|
||||||
|
}
|
||||||
|
if val := metrics[0].cpus; val != 2 {
|
||||||
|
t.Errorf("Unexpected value for cpus, got %v", val)
|
||||||
|
}
|
||||||
|
if val := metrics[0].memoryUsed; val != 356352 {
|
||||||
|
t.Errorf("Unexpected value for memoryUsed, got %v", val)
|
||||||
|
}
|
||||||
|
if val := metrics[0].memoryTotal; val != 2147483648 {
|
||||||
|
t.Errorf("Unexpected value for memoryTotal, got %v", val)
|
||||||
|
}
|
||||||
|
if val := metrics[0].swapUsed; val != 356352 {
|
||||||
|
t.Errorf("Unexpected value for swapUsed, got %v", val)
|
||||||
|
}
|
||||||
|
if val := metrics[0].swapTotal; val != 2147483648 {
|
||||||
|
t.Errorf("Unexpected value for swapTotal, got %v", val)
|
||||||
|
}
|
||||||
|
if val := metrics[0].uid; val != "20821" {
|
||||||
|
t.Errorf("Unexpected value for uid, got %v", val)
|
||||||
|
}
|
||||||
|
if val := metrics[0].jobid; val != "10" {
|
||||||
|
t.Errorf("Unexpected value for jobid, got %v", val)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue