parent 74271ef07c
commit f60e0b3804

@@ -17,13 +17,7 @@ import (
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/user"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/alecthomas/kingpin/v2"
|
||||
"github.com/containerd/cgroups"
|
||||
|
@@ -34,462 +28,28 @@ import (
"github.com/prometheus/common/promlog"
|
||||
"github.com/prometheus/common/promlog/flag"
|
||||
"github.com/prometheus/common/version"
|
||||
"github.com/prometheus/procfs"
|
||||
)
|
||||
|
||||
const (
|
||||
namespace = "cgroup"
|
||||
"github.com/treydock/cgroup_exporter/collector"
|
||||
)
|
||||
|
||||
var (
|
||||
defCgroupRoot = "/sys/fs/cgroup"
|
||||
defProcRoot = "/proc"
|
||||
configPaths = kingpin.Flag("config.paths", "Comma separated list of cgroup paths to check, eg /user.slice,/system.slice,/slurm").Required().String()
|
||||
listenAddress = kingpin.Flag("web.listen-address", "Address to listen on for web interface and telemetry.").Default(":9306").String()
|
||||
disableExporterMetrics = kingpin.Flag("web.disable-exporter-metrics", "Exclude metrics about the exporter (promhttp_*, process_*, go_*)").Default("false").Bool()
|
||||
cgroupRoot = kingpin.Flag("path.cgroup.root", "Root path to cgroup fs").Default(defCgroupRoot).String()
|
||||
procRoot = kingpin.Flag("path.proc.root", "Root path to proc fs").Default(defProcRoot).String()
|
||||
collectProc = kingpin.Flag("collect.proc", "Boolean that sets if to collect proc information").Default("false").Bool()
|
||||
collectProcMaxExec = kingpin.Flag("collect.proc.max-exec", "Max length of process executable to record").Default("100").Int()
|
||||
metricLock = sync.RWMutex{}
|
||||
)
|
||||
|
||||
type CgroupMetric struct {
|
||||
name string
|
||||
cpuUser float64
|
||||
cpuSystem float64
|
||||
cpuTotal float64
|
||||
cpus int
|
||||
cpu_list string
|
||||
memoryRSS float64
|
||||
memoryCache float64
|
||||
memoryUsed float64
|
||||
memoryTotal float64
|
||||
memoryFailCount float64
|
||||
memswUsed float64
|
||||
memswTotal float64
|
||||
memswFailCount float64
|
||||
userslice bool
|
||||
job bool
|
||||
uid string
|
||||
username string
|
||||
jobid string
|
||||
processExec map[string]float64
|
||||
err bool
|
||||
}
|
||||
|
||||
type Exporter struct {
|
||||
paths []string
|
||||
collectError *prometheus.Desc
|
||||
cpuUser *prometheus.Desc
|
||||
cpuSystem *prometheus.Desc
|
||||
cpuTotal *prometheus.Desc
|
||||
cpus *prometheus.Desc
|
||||
cpu_info *prometheus.Desc
|
||||
memoryRSS *prometheus.Desc
|
||||
memoryCache *prometheus.Desc
|
||||
memoryUsed *prometheus.Desc
|
||||
memoryTotal *prometheus.Desc
|
||||
memoryFailCount *prometheus.Desc
|
||||
memswUsed *prometheus.Desc
|
||||
memswTotal *prometheus.Desc
|
||||
memswFailCount *prometheus.Desc
|
||||
info *prometheus.Desc
|
||||
processExec *prometheus.Desc
|
||||
logger log.Logger
|
||||
}
|
||||
|
||||
func fileExists(filename string) bool {
|
||||
info, err := os.Stat(filename)
|
||||
if os.IsNotExist(err) {
|
||||
return false
|
||||
}
|
||||
return !info.IsDir()
|
||||
}
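A minimal defensive sketch (illustrative only; the name fileExistsSafe is made up and not part of this code): os.Stat can also fail with errors other than NotExist, such as permission errors, in which case info above is nil and info.IsDir() would panic, so a variant can check err directly.

func fileExistsSafe(filename string) bool {
	info, err := os.Stat(filename)
	if err != nil {
		// covers NotExist as well as permission and I/O errors
		return false
	}
	return !info.IsDir()
}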
|
||||
|
||||
func sliceContains(s interface{}, v interface{}) bool {
|
||||
slice := reflect.ValueOf(s)
|
||||
for i := 0; i < slice.Len(); i++ {
|
||||
if slice.Index(i).Interface() == v {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
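For illustration only (assumes Go 1.18+ generics, which this code does not require): the reflection-based sliceContains could also be expressed as a type-safe helper.

func contains[T comparable](s []T, v T) bool {
	for _, e := range s {
		if e == v {
			return true
		}
	}
	return false
}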
|
||||
|
||||
func subsystem() ([]cgroups.Subsystem, error) {
|
||||
s := []cgroups.Subsystem{
|
||||
cgroups.NewCpuacct(*cgroupRoot),
|
||||
cgroups.NewMemory(*cgroupRoot),
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
func getCPUs(name string, logger log.Logger) ([]string, error) {
|
||||
cpusPath := fmt.Sprintf("%s/cpuset%s/cpuset.cpus", *cgroupRoot, name)
|
||||
if !fileExists(cpusPath) {
|
||||
return nil, nil
|
||||
}
|
||||
cpusData, err := os.ReadFile(cpusPath)
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Error reading cpuset", "cpuset", cpusPath, "err", err)
|
||||
return nil, err
|
||||
}
|
||||
cpus, err := parseCpuSet(strings.TrimSuffix(string(cpusData), "\n"))
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Error parsing cpu set", "cpuset", cpusPath, "err", err)
|
||||
return nil, err
|
||||
}
|
||||
return cpus, nil
|
||||
}
|
||||
|
||||
func parseCpuSet(cpuset string) ([]string, error) {
|
||||
var cpus []string
|
||||
var start, end int
|
||||
var err error
|
||||
if cpuset == "" {
|
||||
return nil, nil
|
||||
}
|
||||
ranges := strings.Split(cpuset, ",")
|
||||
for _, r := range ranges {
|
||||
boundaries := strings.Split(r, "-")
|
||||
if len(boundaries) == 1 {
|
||||
start, err = strconv.Atoi(boundaries[0])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
end = start
|
||||
} else if len(boundaries) == 2 {
|
||||
start, err = strconv.Atoi(boundaries[0])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
end, err = strconv.Atoi(boundaries[1])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
for e := start; e <= end; e++ {
|
||||
cpu := strconv.Itoa(e)
|
||||
cpus = append(cpus, cpu)
|
||||
}
|
||||
}
|
||||
return cpus, nil
|
||||
}
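A short usage sketch (illustrative input): parseCpuSet expands a kernel cpuset list of comma-separated single CPUs and ranges into individual CPU IDs.

cpus, err := parseCpuSet("0-1,4-5,8")
if err != nil {
	// malformed cpuset string
}
// cpus is now []string{"0", "1", "4", "5", "8"}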
|
||||
|
||||
func getInfo(name string, metric *CgroupMetric, logger log.Logger) {
|
||||
pathBase := filepath.Base(name)
|
||||
userSlicePattern := regexp.MustCompile("^user-([0-9]+).slice$")
|
||||
userSliceMatch := userSlicePattern.FindStringSubmatch(pathBase)
|
||||
if len(userSliceMatch) == 2 {
|
||||
metric.userslice = true
|
||||
metric.uid = userSliceMatch[1]
|
||||
user, err := user.LookupId(metric.uid)
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Error looking up user slice uid", "uid", metric.uid, "err", err)
|
||||
} else {
|
||||
metric.username = user.Username
|
||||
}
|
||||
return
|
||||
}
|
||||
slurmPattern := regexp.MustCompile("^/slurm/uid_([0-9]+)/job_([0-9]+)$")
|
||||
slurmMatch := slurmPattern.FindStringSubmatch(name)
|
||||
if len(slurmMatch) == 3 {
|
||||
metric.job = true
|
||||
metric.uid = slurmMatch[1]
|
||||
metric.jobid = slurmMatch[2]
|
||||
user, err := user.LookupId(metric.uid)
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Error looking up slurm uid", "uid", metric.uid, "err", err)
|
||||
} else {
|
||||
metric.username = user.Username
|
||||
}
|
||||
return
|
||||
}
|
||||
if strings.HasPrefix(name, "/torque") {
|
||||
metric.job = true
|
||||
pathBaseSplit := strings.Split(pathBase, ".")
|
||||
metric.jobid = pathBaseSplit[0]
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
func getProcInfo(pids []int, metric *CgroupMetric, logger log.Logger) {
|
||||
executables := make(map[string]float64)
|
||||
procFS, err := procfs.NewFS(*procRoot)
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Unable to open procfs", "path", *procRoot)
|
||||
return
|
||||
}
|
||||
wg := &sync.WaitGroup{}
|
||||
wg.Add(len(pids))
|
||||
for _, pid := range pids {
|
||||
go func(p int) {
|
||||
proc, err := procFS.Proc(p)
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Unable to read PID", "pid", p)
|
||||
wg.Done()
|
||||
return
|
||||
}
|
||||
executable, err := proc.Executable()
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Unable to get executable for PID", "pid", p)
|
||||
wg.Done()
|
||||
return
|
||||
}
|
||||
if len(executable) > *collectProcMaxExec {
|
||||
level.Debug(logger).Log("msg", "Executable will be truncated", "executable", executable, "len", len(executable), "pid", p)
|
||||
trim := *collectProcMaxExec / 2
|
||||
executable_prefix := executable[0:trim]
|
||||
executable_suffix := executable[len(executable)-trim:]
|
||||
executable = fmt.Sprintf("%s...%s", executable_prefix, executable_suffix)
|
||||
}
|
||||
metricLock.Lock()
|
||||
executables[executable] += 1
|
||||
metricLock.Unlock()
|
||||
wg.Done()
|
||||
}(pid)
|
||||
}
|
||||
wg.Wait()
|
||||
metric.processExec = executables
|
||||
}
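An illustrative note on the truncation above: executables longer than collect.proc.max-exec are shortened to a prefix and suffix of max-exec/2 characters joined by "...".

// Example: with collect.proc.max-exec=6, "/bin/bash" (9 chars) becomes
// "/bi" + "..." + "ash" = "/bi...ash", which is what the tests below expect.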
|
||||
|
||||
func getName(p cgroups.Process, path string, logger log.Logger) (string, error) {
|
||||
cpuacctPath := filepath.Join(*cgroupRoot, "cpuacct")
|
||||
name := strings.TrimPrefix(p.Path, cpuacctPath)
|
||||
name = strings.TrimSuffix(name, "/")
|
||||
dirs := strings.Split(name, "/")
|
||||
level.Debug(logger).Log("msg", "cgroup name", "dirs", fmt.Sprintf("%v", dirs))
|
||||
// Handle user.slice, system.slice and torque
|
||||
if len(dirs) == 3 {
|
||||
return name, nil
|
||||
}
|
||||
// Handle deeper cgroup where we want higher level, mainly SLURM
|
||||
var keepDirs []string
|
||||
for i, d := range dirs {
|
||||
if strings.HasPrefix(d, "job_") {
|
||||
keepDirs = dirs[0 : i+1]
|
||||
break
|
||||
}
|
||||
}
|
||||
if keepDirs == nil {
|
||||
return name, nil
|
||||
}
|
||||
name = strings.Join(keepDirs, "/")
|
||||
return name, nil
|
||||
}
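An illustrative mapping for getName (example paths): a three-component path such as /user.slice/user-20821.slice is returned unchanged, while deeper SLURM paths are truncated at the job_* component.

// "/slurm/uid_20821/job_10/step_batch" -> "/slurm/uid_20821/job_10"
// "/user.slice/user-20821.slice"       -> "/user.slice/user-20821.slice"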
|
||||
|
||||
func NewExporter(paths []string, logger log.Logger) *Exporter {
|
||||
return &Exporter{
|
||||
paths: paths,
|
||||
cpuUser: prometheus.NewDesc(prometheus.BuildFQName(namespace, "cpu", "user_seconds"),
|
||||
"Cumalitive CPU user seconds for cgroup", []string{"cgroup"}, nil),
|
||||
cpuSystem: prometheus.NewDesc(prometheus.BuildFQName(namespace, "cpu", "system_seconds"),
|
||||
"Cumalitive CPU system seconds for cgroup", []string{"cgroup"}, nil),
|
||||
cpuTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "cpu", "total_seconds"),
|
||||
"Cumalitive CPU total seconds for cgroup", []string{"cgroup"}, nil),
|
||||
cpus: prometheus.NewDesc(prometheus.BuildFQName(namespace, "", "cpus"),
|
||||
"Number of CPUs in the cgroup", []string{"cgroup"}, nil),
|
||||
cpu_info: prometheus.NewDesc(prometheus.BuildFQName(namespace, "", "cpu_info"),
|
||||
"Information about the cgroup CPUs", []string{"cgroup", "cpus"}, nil),
|
||||
memoryRSS: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "rss_bytes"),
|
||||
"Memory RSS used in bytes", []string{"cgroup"}, nil),
|
||||
memoryCache: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "cache_bytes"),
|
||||
"Memory cache used in bytes", []string{"cgroup"}, nil),
|
||||
memoryUsed: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "used_bytes"),
|
||||
"Memory used in bytes", []string{"cgroup"}, nil),
|
||||
memoryTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "total_bytes"),
|
||||
"Memory total given to cgroup in bytes", []string{"cgroup"}, nil),
|
||||
memoryFailCount: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "fail_count"),
|
||||
"Memory fail count", []string{"cgroup"}, nil),
|
||||
memswUsed: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memsw", "used_bytes"),
|
||||
"Swap used in bytes", []string{"cgroup"}, nil),
|
||||
memswTotal: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memsw", "total_bytes"),
|
||||
"Swap total given to cgroup in bytes", []string{"cgroup"}, nil),
|
||||
memswFailCount: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memsw", "fail_count"),
|
||||
"Swap fail count", []string{"cgroup"}, nil),
|
||||
info: prometheus.NewDesc(prometheus.BuildFQName(namespace, "", "info"),
|
||||
"User slice information", []string{"cgroup", "username", "uid", "jobid"}, nil),
|
||||
processExec: prometheus.NewDesc(prometheus.BuildFQName(namespace, "", "process_exec_count"),
|
||||
"Count of instances of a given process", []string{"cgroup", "exec"}, nil),
|
||||
collectError: prometheus.NewDesc(prometheus.BuildFQName(namespace, "exporter", "collect_error"),
|
||||
"Indicates collection error, 0=no error, 1=error", []string{"cgroup"}, nil),
|
||||
logger: logger,
|
||||
}
|
||||
}
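For reference, the BuildFQName calls above with namespace "cgroup" produce series names of the form:

// cgroup_cpu_user_seconds{cgroup="..."}
// cgroup_cpus{cgroup="..."}
// cgroup_memory_rss_bytes{cgroup="..."}
// cgroup_memsw_total_bytes{cgroup="..."}
// cgroup_process_exec_count{cgroup="...",exec="..."}
// cgroup_exporter_collect_error{cgroup="..."}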
|
||||
|
||||
func (e *Exporter) getMetrics(name string, pids map[string][]int) (CgroupMetric, error) {
|
||||
metric := CgroupMetric{name: name}
|
||||
level.Debug(e.logger).Log("msg", "Loading cgroup", "path", name)
|
||||
ctrl, err := cgroups.Load(subsystem, func(subsystem cgroups.Name) (string, error) {
|
||||
return name, nil
|
||||
})
|
||||
if err != nil {
|
||||
level.Error(e.logger).Log("msg", "Failed to load cgroups", "path", name, "err", err)
|
||||
metric.err = true
|
||||
return metric, err
|
||||
}
|
||||
stats, err := ctrl.Stat(cgroups.IgnoreNotExist)
|
||||
if err != nil {
|
||||
level.Error(e.logger).Log("msg", "Failed to stat cgroups", "path", name, "err", err)
|
||||
return metric, err
|
||||
}
|
||||
if stats == nil {
|
||||
level.Error(e.logger).Log("msg", "Cgroup stats are nil", "path", name)
|
||||
return metric, err
|
||||
}
|
||||
if stats.CPU != nil {
|
||||
if stats.CPU.Usage != nil {
|
||||
metric.cpuUser = float64(stats.CPU.Usage.User) / 1000000000.0
|
||||
metric.cpuSystem = float64(stats.CPU.Usage.Kernel) / 1000000000.0
|
||||
metric.cpuTotal = float64(stats.CPU.Usage.Total) / 1000000000.0
|
||||
}
|
||||
}
|
||||
if stats.Memory != nil {
|
||||
metric.memoryRSS = float64(stats.Memory.TotalRSS)
|
||||
metric.memoryCache = float64(stats.Memory.TotalCache)
|
||||
if stats.Memory.Usage != nil {
|
||||
metric.memoryUsed = float64(stats.Memory.Usage.Usage)
|
||||
metric.memoryTotal = float64(stats.Memory.Usage.Limit)
|
||||
metric.memoryFailCount = float64(stats.Memory.Usage.Failcnt)
|
||||
}
|
||||
if stats.Memory.Swap != nil {
|
||||
metric.memswUsed = float64(stats.Memory.Swap.Usage)
|
||||
metric.memswTotal = float64(stats.Memory.Swap.Limit)
|
||||
metric.memswFailCount = float64(stats.Memory.Swap.Failcnt)
|
||||
}
|
||||
}
|
||||
if cpus, err := getCPUs(name, e.logger); err == nil {
|
||||
metric.cpus = len(cpus)
|
||||
metric.cpu_list = strings.Join(cpus, ",")
|
||||
}
|
||||
getInfo(name, &metric, e.logger)
|
||||
if *collectProc {
|
||||
if val, ok := pids[name]; ok {
|
||||
level.Debug(e.logger).Log("msg", "Get process info", "pids", fmt.Sprintf("%v", val))
|
||||
getProcInfo(val, &metric, e.logger)
|
||||
} else {
|
||||
level.Error(e.logger).Log("msg", "Unable to get PIDs", "path", name)
|
||||
}
|
||||
}
|
||||
return metric, nil
|
||||
}
|
||||
|
||||
func (e *Exporter) collect() ([]CgroupMetric, error) {
|
||||
var names []string
|
||||
var metrics []CgroupMetric
|
||||
for _, path := range e.paths {
|
||||
level.Debug(e.logger).Log("msg", "Loading cgroup", "path", path)
|
||||
control, err := cgroups.Load(subsystem, cgroups.StaticPath(path))
|
||||
if err != nil {
|
||||
level.Error(e.logger).Log("msg", "Error loading cgroup subsystem", "path", path, "err", err)
|
||||
metric := CgroupMetric{name: path, err: true}
|
||||
metrics = append(metrics, metric)
|
||||
continue
|
||||
}
|
||||
processes, err := control.Processes(cgroups.Cpuacct, true)
|
||||
if err != nil {
|
||||
level.Error(e.logger).Log("msg", "Error loading cgroup processes", "path", path, "err", err)
|
||||
metric := CgroupMetric{name: path, err: true}
|
||||
metrics = append(metrics, metric)
|
||||
continue
|
||||
}
|
||||
level.Debug(e.logger).Log("msg", "Found processes", "processes", len(processes))
|
||||
pids := make(map[string][]int)
|
||||
for _, p := range processes {
|
||||
level.Debug(e.logger).Log("msg", "Get Name", "process", p.Path, "pid", p.Pid, "path", path)
|
||||
name, err := getName(p, path, e.logger)
|
||||
if err != nil {
|
||||
level.Error(e.logger).Log("msg", "Error getting cgroup name for process", "process", p.Path, "path", path, "err", err)
|
||||
continue
|
||||
}
|
||||
if !sliceContains(names, name) {
|
||||
names = append(names, name)
|
||||
}
|
||||
if val, ok := pids[name]; ok {
|
||||
if !sliceContains(val, p.Pid) {
|
||||
val = append(val, p.Pid)
|
||||
}
|
||||
pids[name] = val
|
||||
} else {
|
||||
pids[name] = []int{p.Pid}
|
||||
}
|
||||
}
|
||||
wg := &sync.WaitGroup{}
|
||||
wg.Add(len(names))
|
||||
for _, name := range names {
|
||||
go func(n string, p map[string][]int) {
|
||||
metric, _ := e.getMetrics(n, p)
|
||||
metricLock.Lock()
|
||||
metrics = append(metrics, metric)
|
||||
metricLock.Unlock()
|
||||
wg.Done()
|
||||
}(name, pids)
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
return metrics, nil
|
||||
}
|
||||
|
||||
func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
|
||||
ch <- e.cpuUser
|
||||
ch <- e.cpuSystem
|
||||
ch <- e.cpuTotal
|
||||
ch <- e.cpus
|
||||
ch <- e.cpu_info
|
||||
ch <- e.memoryRSS
|
||||
ch <- e.memoryCache
|
||||
ch <- e.memoryUsed
|
||||
ch <- e.memoryTotal
|
||||
ch <- e.memoryFailCount
|
||||
ch <- e.memswUsed
|
||||
ch <- e.memswTotal
|
||||
ch <- e.memswFailCount
|
||||
ch <- e.info
|
||||
if *collectProc {
|
||||
ch <- e.processExec
|
||||
}
|
||||
}
|
||||
|
||||
func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
|
||||
metrics, _ := e.collect()
|
||||
for _, m := range metrics {
|
||||
if m.err {
|
||||
ch <- prometheus.MustNewConstMetric(e.collectError, prometheus.GaugeValue, 1, m.name)
|
||||
}
|
||||
ch <- prometheus.MustNewConstMetric(e.cpuUser, prometheus.GaugeValue, m.cpuUser, m.name)
|
||||
ch <- prometheus.MustNewConstMetric(e.cpuSystem, prometheus.GaugeValue, m.cpuSystem, m.name)
|
||||
ch <- prometheus.MustNewConstMetric(e.cpuTotal, prometheus.GaugeValue, m.cpuTotal, m.name)
|
||||
ch <- prometheus.MustNewConstMetric(e.cpus, prometheus.GaugeValue, float64(m.cpus), m.name)
|
||||
ch <- prometheus.MustNewConstMetric(e.cpu_info, prometheus.GaugeValue, 1, m.name, m.cpu_list)
|
||||
ch <- prometheus.MustNewConstMetric(e.memoryRSS, prometheus.GaugeValue, m.memoryRSS, m.name)
|
||||
ch <- prometheus.MustNewConstMetric(e.memoryCache, prometheus.GaugeValue, m.memoryCache, m.name)
|
||||
ch <- prometheus.MustNewConstMetric(e.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.name)
|
||||
ch <- prometheus.MustNewConstMetric(e.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.name)
|
||||
ch <- prometheus.MustNewConstMetric(e.memoryFailCount, prometheus.GaugeValue, m.memoryFailCount, m.name)
|
||||
ch <- prometheus.MustNewConstMetric(e.memswUsed, prometheus.GaugeValue, m.memswUsed, m.name)
|
||||
ch <- prometheus.MustNewConstMetric(e.memswTotal, prometheus.GaugeValue, m.memswTotal, m.name)
|
||||
ch <- prometheus.MustNewConstMetric(e.memswFailCount, prometheus.GaugeValue, m.memswFailCount, m.name)
|
||||
if m.userslice || m.job {
|
||||
ch <- prometheus.MustNewConstMetric(e.info, prometheus.GaugeValue, 1, m.name, m.username, m.uid, m.jobid)
|
||||
}
|
||||
if *collectProc {
|
||||
for exec, count := range m.processExec {
|
||||
ch <- prometheus.MustNewConstMetric(e.processExec, prometheus.GaugeValue, count, m.name, exec)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func metricsHandler(logger log.Logger) http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
registry := prometheus.NewRegistry()
|
||||
|
||||
paths := strings.Split(*configPaths, ",")
|
||||
|
||||
exporter := NewExporter(paths, logger)
|
||||
registry.MustRegister(exporter)
|
||||
registry.MustRegister(version.NewCollector(fmt.Sprintf("%s_exporter", namespace)))
|
||||
var cgroupV2 bool
|
||||
if cgroups.Mode() == cgroups.Unified {
|
||||
cgroupV2 = true
|
||||
}
|
||||
cgroupCollector := collector.NewCgroupCollector(cgroupV2, paths, logger)
|
||||
registry.MustRegister(cgroupCollector)
|
||||
registry.MustRegister(version.NewCollector(fmt.Sprintf("%s_exporter", collector.Namespace)))
|
||||
|
||||
gatherers := prometheus.Gatherers{registry}
|
||||
if !*disableExporterMetrics {
|
||||
|
|
|
@@ -19,7 +19,6 @@ import (
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"runtime"
|
||||
"strings"
|
||||
"testing"
|
||||
|
@@ -27,6 +26,7 @@ import (
|
||||
kingpin "github.com/alecthomas/kingpin/v2"
|
||||
"github.com/go-kit/log"
|
||||
"github.com/treydock/cgroup_exporter/collector"
|
||||
)
|
||||
|
||||
const (
|
||||
|
@@ -40,12 +40,11 @@ func TestMain(m *testing.M) {
_, filename, _, _ := runtime.Caller(0)
|
||||
dir := filepath.Dir(filename)
|
||||
fixture := filepath.Join(dir, "fixtures")
|
||||
cgroupRoot = &fixture
|
||||
collector.CgroupRoot = &fixture
|
||||
procFixture := filepath.Join(fixture, "proc")
|
||||
procRoot = &procFixture
|
||||
collector.ProcRoot = &procFixture
|
||||
varTrue := true
|
||||
disableExporterMetrics = &varTrue
|
||||
collectProc = &varTrue
|
||||
w := log.NewSyncWriter(os.Stderr)
|
||||
logger := log.NewLogfmtLogger(w)
|
||||
go func() {
|
||||
|
@@ -62,246 +61,6 @@ func TestMain(m *testing.M) {
os.Exit(exitVal)
|
||||
}
|
||||
|
||||
func TestParseCpuSet(t *testing.T) {
|
||||
expected := []string{"0", "1", "2"}
|
||||
if cpus, err := parseCpuSet("0-2"); err != nil {
|
||||
t.Errorf("Unexpected error: %s", err.Error())
|
||||
} else if !reflect.DeepEqual(cpus, expected) {
|
||||
t.Errorf("Unexpected cpus, expected %v got %v", expected, cpus)
|
||||
}
|
||||
expected = []string{"0", "1", "4", "5", "8", "9"}
|
||||
if cpus, err := parseCpuSet("0-1,4-5,8-9"); err != nil {
|
||||
t.Errorf("Unexpected error: %s", err.Error())
|
||||
} else if !reflect.DeepEqual(cpus, expected) {
|
||||
t.Errorf("Unexpected cpus, expected %v got %v", expected, cpus)
|
||||
}
|
||||
expected = []string{"1", "3", "5", "7"}
|
||||
if cpus, err := parseCpuSet("1,3,5,7"); err != nil {
|
||||
t.Errorf("Unexpected error: %s", err.Error())
|
||||
} else if !reflect.DeepEqual(cpus, expected) {
|
||||
t.Errorf("Unexpected cpus, expected %v got %v", expected, cpus)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetProcInfo(t *testing.T) {
|
||||
metric := CgroupMetric{}
|
||||
w := log.NewSyncWriter(os.Stderr)
|
||||
logger := log.NewLogfmtLogger(w)
|
||||
getProcInfo([]int{95521, 95525}, &metric, logger)
|
||||
if val, ok := metric.processExec["/bin/bash"]; !ok {
|
||||
t.Errorf("Process /bin/bash not in metrics")
|
||||
return
|
||||
} else {
|
||||
if val != 2 {
|
||||
t.Errorf("Expected 2 /bin/bash processes, got %v", val)
|
||||
}
|
||||
}
|
||||
varLen := 6
|
||||
collectProcMaxExec = &varLen
|
||||
getProcInfo([]int{95521, 95525}, &metric, logger)
|
||||
if val, ok := metric.processExec["/bi...ash"]; !ok {
|
||||
t.Errorf("Process /bin/bash not in metrics, found: %v", metric.processExec)
|
||||
return
|
||||
} else {
|
||||
if val != 2 {
|
||||
t.Errorf("Expected 2 /b...sh processes, got %v", val)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollectUserSlice(t *testing.T) {
|
||||
varFalse := false
|
||||
collectProc = &varFalse
|
||||
w := log.NewSyncWriter(os.Stderr)
|
||||
logger := log.NewLogfmtLogger(w)
|
||||
exporter := NewExporter([]string{"/user.slice"}, logger)
|
||||
metrics, err := exporter.collect()
|
||||
if err != nil {
|
||||
t.Errorf("Unexpected error: %s", err.Error())
|
||||
return
|
||||
}
|
||||
if val := len(metrics); val != 1 {
|
||||
t.Errorf("Unexpected number of metrics, got %d expected 1", val)
|
||||
return
|
||||
}
|
||||
if val := metrics[0].cpuUser; val != 0.41 {
|
||||
t.Errorf("Unexpected value for cpuUser, got %v", val)
|
||||
}
|
||||
if val := metrics[0].cpuSystem; val != 0.39 {
|
||||
t.Errorf("Unexpected value for cpuSystem, got %v", val)
|
||||
}
|
||||
if val := metrics[0].cpuTotal; val != 0.831825022 {
|
||||
t.Errorf("Unexpected value for cpuTotal, got %v", val)
|
||||
}
|
||||
if val := metrics[0].cpus; val != 0 {
|
||||
t.Errorf("Unexpected value for cpus, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryRSS; val != 5378048 {
|
||||
t.Errorf("Unexpected value for memoryRSS, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryCache; val != 2322432 {
|
||||
t.Errorf("Unexpected value for memoryCache, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryUsed; val != 8081408 {
|
||||
t.Errorf("Unexpected value for memoryUsed, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryTotal; val != 68719476736 {
|
||||
t.Errorf("Unexpected value for memoryTotal, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryFailCount; val != 0 {
|
||||
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memswUsed; val != 8081408 {
|
||||
t.Errorf("Unexpected value for swapUsed, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memswTotal; val != 9.223372036854772e+18 {
|
||||
t.Errorf("Unexpected value for swapTotal, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memswFailCount; val != 0 {
|
||||
t.Errorf("Unexpected value for swapFailCount, got %v", val)
|
||||
}
|
||||
if val := metrics[0].uid; val != "20821" {
|
||||
t.Errorf("Unexpected value for uid, got %v", val)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollectSLURM(t *testing.T) {
|
||||
varTrue := true
|
||||
collectProc = &varTrue
|
||||
varLen := 100
|
||||
collectProcMaxExec = &varLen
|
||||
w := log.NewSyncWriter(os.Stderr)
|
||||
logger := log.NewLogfmtLogger(w)
|
||||
exporter := NewExporter([]string{"/slurm"}, logger)
|
||||
metrics, err := exporter.collect()
|
||||
if err != nil {
|
||||
t.Errorf("Unexpected error: %s", err.Error())
|
||||
return
|
||||
}
|
||||
if val := len(metrics); val != 2 {
|
||||
t.Errorf("Unexpected number of metrics, got %d expected 2", val)
|
||||
return
|
||||
}
|
||||
var m CgroupMetric
|
||||
for _, metric := range metrics {
|
||||
if metric.jobid == "10" {
|
||||
m = metric
|
||||
}
|
||||
}
|
||||
if m.jobid == "" {
|
||||
t.Errorf("Metrics with jobid=10 not found")
|
||||
return
|
||||
}
|
||||
if val := m.cpuUser; val != 0 {
|
||||
t.Errorf("Unexpected value for cpuUser, got %v", val)
|
||||
}
|
||||
if val := m.cpuSystem; val != 0 {
|
||||
t.Errorf("Unexpected value for cpuSystem, got %v", val)
|
||||
}
|
||||
if val := m.cpuTotal; val != 0.007710215 {
|
||||
t.Errorf("Unexpected value for cpuTotal, got %v", val)
|
||||
}
|
||||
if val := m.cpus; val != 2 {
|
||||
t.Errorf("Unexpected value for cpus, got %v", val)
|
||||
}
|
||||
if val := m.memoryRSS; val != 311296 {
|
||||
t.Errorf("Unexpected value for memoryRSS, got %v", val)
|
||||
}
|
||||
if val := m.memoryCache; val != 4096 {
|
||||
t.Errorf("Unexpected value for memoryCache, got %v", val)
|
||||
}
|
||||
if val := m.memoryUsed; val != 356352 {
|
||||
t.Errorf("Unexpected value for memoryUsed, got %v", val)
|
||||
}
|
||||
if val := m.memoryTotal; val != 2147483648 {
|
||||
t.Errorf("Unexpected value for memoryTotal, got %v", val)
|
||||
}
|
||||
if val := m.memoryFailCount; val != 0 {
|
||||
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
|
||||
}
|
||||
if val := m.memswUsed; val != 356352 {
|
||||
t.Errorf("Unexpected value for swapUsed, got %v", val)
|
||||
}
|
||||
if val := m.memswTotal; val != 2147483648 {
|
||||
t.Errorf("Unexpected value for swapTotal, got %v", val)
|
||||
}
|
||||
if val := m.memswFailCount; val != 0 {
|
||||
t.Errorf("Unexpected value for swapFailCount, got %v", val)
|
||||
}
|
||||
if val := m.uid; val != "20821" {
|
||||
t.Errorf("Unexpected value for uid, got %v", val)
|
||||
}
|
||||
if val := m.jobid; val != "10" {
|
||||
t.Errorf("Unexpected value for jobid, got %v", val)
|
||||
}
|
||||
if val, ok := m.processExec["/bin/bash"]; !ok {
|
||||
t.Errorf("processExec does not contain /bin/bash")
|
||||
} else {
|
||||
if val != 2 {
|
||||
t.Errorf("Unexpected 2 values for processExec /bin/bash, got %v", val)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollectTorque(t *testing.T) {
|
||||
varFalse := false
|
||||
collectProc = &varFalse
|
||||
w := log.NewSyncWriter(os.Stderr)
|
||||
logger := log.NewLogfmtLogger(w)
|
||||
exporter := NewExporter([]string{"/torque"}, logger)
|
||||
metrics, err := exporter.collect()
|
||||
if err != nil {
|
||||
t.Errorf("Unexpected error: %s", err.Error())
|
||||
return
|
||||
}
|
||||
if val := len(metrics); val != 1 {
|
||||
t.Errorf("Unexpected number of metrics, got %d expected 1", val)
|
||||
return
|
||||
}
|
||||
if val := metrics[0].cpuUser; val != 153146.31 {
|
||||
t.Errorf("Unexpected value for cpuUser, got %v", val)
|
||||
}
|
||||
if val := metrics[0].cpuSystem; val != 260.77 {
|
||||
t.Errorf("Unexpected value for cpuSystem, got %v", val)
|
||||
}
|
||||
if val := metrics[0].cpuTotal; val != 152995.785583781 {
|
||||
t.Errorf("Unexpected value for cpuTotal, got %v", val)
|
||||
}
|
||||
if val := metrics[0].cpus; val != 40 {
|
||||
t.Errorf("Unexpected value for cpus, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryRSS; val != 82444320768 {
|
||||
t.Errorf("Unexpected value for memoryRSS, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryCache; val != 109678592 {
|
||||
t.Errorf("Unexpected value for memoryCache, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryUsed; val != 82553999360 {
|
||||
t.Errorf("Unexpected value for memoryUsed, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryTotal; val != 196755132416 {
|
||||
t.Errorf("Unexpected value for memoryTotal, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryFailCount; val != 0 {
|
||||
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memswUsed; val != 82553999360 {
|
||||
t.Errorf("Unexpected value for swapUsed, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memswTotal; val != 196755132416 {
|
||||
t.Errorf("Unexpected value for swapTotal, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memswFailCount; val != 0 {
|
||||
t.Errorf("Unexpected value for swapFailCount, got %v", val)
|
||||
}
|
||||
if val := metrics[0].uid; val != "" {
|
||||
t.Errorf("Unexpected value for uid, got %v", val)
|
||||
}
|
||||
if val := metrics[0].jobid; val != "1182724" {
|
||||
t.Errorf("Unexpected value for jobid, got %v", val)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsHandler(t *testing.T) {
|
||||
body, err := queryExporter()
|
||||
if err != nil {
|
||||
|
|
|
@@ -0,0 +1,218 @@
// Copyright 2020 Trey Dockendorf
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os/user"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/containerd/cgroups"
|
||||
"github.com/go-kit/log"
|
||||
"github.com/go-kit/log/level"
|
||||
)
|
||||
|
||||
func NewCgroupV1Collector(paths []string, logger log.Logger) Collector {
|
||||
return NewExporter(paths, logger, false)
|
||||
}
|
||||
|
||||
func subsystem() ([]cgroups.Subsystem, error) {
|
||||
s := []cgroups.Subsystem{
|
||||
cgroups.NewCpuacct(*CgroupRoot),
|
||||
cgroups.NewMemory(*CgroupRoot),
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
func getInfov1(name string, metric *CgroupMetric, logger log.Logger) {
|
||||
pathBase := filepath.Base(name)
|
||||
userSlicePattern := regexp.MustCompile("^user-([0-9]+).slice$")
|
||||
userSliceMatch := userSlicePattern.FindStringSubmatch(pathBase)
|
||||
if len(userSliceMatch) == 2 {
|
||||
metric.userslice = true
|
||||
metric.uid = userSliceMatch[1]
|
||||
user, err := user.LookupId(metric.uid)
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Error looking up user slice uid", "uid", metric.uid, "err", err)
|
||||
} else {
|
||||
metric.username = user.Username
|
||||
}
|
||||
return
|
||||
}
|
||||
slurmPattern := regexp.MustCompile("^/slurm/uid_([0-9]+)/job_([0-9]+)$")
|
||||
slurmMatch := slurmPattern.FindStringSubmatch(name)
|
||||
if len(slurmMatch) == 3 {
|
||||
metric.job = true
|
||||
metric.uid = slurmMatch[1]
|
||||
metric.jobid = slurmMatch[2]
|
||||
user, err := user.LookupId(metric.uid)
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Error looking up slurm uid", "uid", metric.uid, "err", err)
|
||||
} else {
|
||||
metric.username = user.Username
|
||||
}
|
||||
return
|
||||
}
|
||||
if strings.HasPrefix(name, "/torque") {
|
||||
metric.job = true
|
||||
pathBaseSplit := strings.Split(pathBase, ".")
|
||||
metric.jobid = pathBaseSplit[0]
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
func getNamev1(p cgroups.Process, path string, logger log.Logger) (string, error) {
|
||||
cpuacctPath := filepath.Join(*CgroupRoot, "cpuacct")
|
||||
name := strings.TrimPrefix(p.Path, cpuacctPath)
|
||||
name = strings.TrimSuffix(name, "/")
|
||||
dirs := strings.Split(name, "/")
|
||||
level.Debug(logger).Log("msg", "cgroup name", "dirs", fmt.Sprintf("%v", dirs))
|
||||
// Handle user.slice, system.slice and torque
|
||||
if len(dirs) == 3 {
|
||||
return name, nil
|
||||
}
|
||||
// Handle deeper cgroup where we want higher level, mainly SLURM
|
||||
var keepDirs []string
|
||||
for i, d := range dirs {
|
||||
if strings.HasPrefix(d, "job_") {
|
||||
keepDirs = dirs[0 : i+1]
|
||||
break
|
||||
}
|
||||
}
|
||||
if keepDirs == nil {
|
||||
return name, nil
|
||||
}
|
||||
name = strings.Join(keepDirs, "/")
|
||||
return name, nil
|
||||
}
|
||||
|
||||
func (e *Exporter) getMetricsv1(name string, pids map[string][]int) (CgroupMetric, error) {
|
||||
metric := CgroupMetric{name: name}
|
||||
level.Debug(e.logger).Log("msg", "Loading cgroup", "root", *CgroupRoot, "path", name)
|
||||
ctrl, err := cgroups.Load(subsystem, func(subsystem cgroups.Name) (string, error) {
|
||||
return name, nil
|
||||
})
|
||||
if err != nil {
|
||||
level.Error(e.logger).Log("msg", "Failed to load cgroups", "path", name, "err", err)
|
||||
metric.err = true
|
||||
return metric, err
|
||||
}
|
||||
stats, err := ctrl.Stat(cgroups.IgnoreNotExist)
|
||||
if err != nil {
|
||||
level.Error(e.logger).Log("msg", "Failed to stat cgroups", "path", name, "err", err)
|
||||
metric.err = true
|
||||
return metric, err
|
||||
}
|
||||
if stats == nil {
|
||||
level.Error(e.logger).Log("msg", "Cgroup stats are nil", "path", name)
|
||||
metric.err = true
|
||||
return metric, err
|
||||
}
|
||||
if stats.CPU != nil {
|
||||
if stats.CPU.Usage != nil {
|
||||
metric.cpuUser = float64(stats.CPU.Usage.User) / 1000000000.0
|
||||
metric.cpuSystem = float64(stats.CPU.Usage.Kernel) / 1000000000.0
|
||||
metric.cpuTotal = float64(stats.CPU.Usage.Total) / 1000000000.0
|
||||
}
|
||||
}
|
||||
if stats.Memory != nil {
|
||||
metric.memoryRSS = float64(stats.Memory.TotalRSS)
|
||||
metric.memoryCache = float64(stats.Memory.TotalCache)
|
||||
if stats.Memory.Usage != nil {
|
||||
metric.memoryUsed = float64(stats.Memory.Usage.Usage)
|
||||
metric.memoryTotal = float64(stats.Memory.Usage.Limit)
|
||||
metric.memoryFailCount = float64(stats.Memory.Usage.Failcnt)
|
||||
}
|
||||
if stats.Memory.Swap != nil {
|
||||
metric.memswUsed = float64(stats.Memory.Swap.Usage)
|
||||
metric.memswTotal = float64(stats.Memory.Swap.Limit)
|
||||
metric.memswFailCount = float64(stats.Memory.Swap.Failcnt)
|
||||
}
|
||||
}
|
||||
cpusPath := fmt.Sprintf("%s/cpuset%s/cpuset.cpus", *CgroupRoot, name)
|
||||
if cpus, err := getCPUs(cpusPath, e.logger); err == nil {
|
||||
metric.cpus = len(cpus)
|
||||
metric.cpu_list = strings.Join(cpus, ",")
|
||||
}
|
||||
getInfov1(name, &metric, e.logger)
|
||||
if *collectProc {
|
||||
if val, ok := pids[name]; ok {
|
||||
level.Debug(e.logger).Log("msg", "Get process info", "pids", fmt.Sprintf("%v", val))
|
||||
getProcInfo(val, &metric, e.logger)
|
||||
} else {
|
||||
level.Error(e.logger).Log("msg", "Unable to get PIDs", "path", name)
|
||||
metric.err = true
|
||||
}
|
||||
}
|
||||
return metric, nil
|
||||
}
|
||||
|
||||
func (e *Exporter) collectv1() ([]CgroupMetric, error) {
|
||||
var names []string
|
||||
var metrics []CgroupMetric
|
||||
for _, path := range e.paths {
|
||||
level.Debug(e.logger).Log("msg", "Loading cgroup", "root", *CgroupRoot, "path", path)
|
||||
control, err := cgroups.Load(subsystem, cgroups.StaticPath(path))
|
||||
if err != nil {
|
||||
level.Error(e.logger).Log("msg", "Error loading cgroup subsystem", "root", *CgroupRoot, "path", path, "err", err)
|
||||
metric := CgroupMetric{name: path, err: true}
|
||||
metrics = append(metrics, metric)
|
||||
continue
|
||||
}
|
||||
processes, err := control.Processes(cgroups.Cpuacct, true)
|
||||
if err != nil {
|
||||
level.Error(e.logger).Log("msg", "Error loading cgroup processes", "path", path, "err", err)
|
||||
metric := CgroupMetric{name: path, err: true}
|
||||
metrics = append(metrics, metric)
|
||||
continue
|
||||
}
|
||||
level.Debug(e.logger).Log("msg", "Found processes", "processes", len(processes))
|
||||
pids := make(map[string][]int)
|
||||
for _, p := range processes {
|
||||
level.Debug(e.logger).Log("msg", "Get Name", "process", p.Path, "pid", p.Pid, "path", path)
|
||||
name, err := getNamev1(p, path, e.logger)
|
||||
if err != nil {
|
||||
level.Error(e.logger).Log("msg", "Error getting cgroup name for process", "process", p.Path, "path", path, "err", err)
|
||||
continue
|
||||
}
|
||||
if !sliceContains(names, name) {
|
||||
names = append(names, name)
|
||||
}
|
||||
if val, ok := pids[name]; ok {
|
||||
if !sliceContains(val, p.Pid) {
|
||||
val = append(val, p.Pid)
|
||||
}
|
||||
pids[name] = val
|
||||
} else {
|
||||
pids[name] = []int{p.Pid}
|
||||
}
|
||||
}
|
||||
wg := &sync.WaitGroup{}
|
||||
wg.Add(len(names))
|
||||
for _, name := range names {
|
||||
go func(n string, p map[string][]int) {
|
||||
defer wg.Done()
|
||||
metric, _ := e.getMetricsv1(n, p)
|
||||
metricLock.Lock()
|
||||
metrics = append(metrics, metric)
|
||||
metricLock.Unlock()
|
||||
}(name, pids)
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
return metrics, nil
|
||||
}
|
|
@@ -0,0 +1,214 @@
// Copyright 2020 Trey Dockendorf
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/go-kit/log"
|
||||
)
|
||||
|
||||
func TestCollectUserSlice(t *testing.T) {
|
||||
varFalse := false
|
||||
collectProc = &varFalse
|
||||
w := log.NewSyncWriter(os.Stderr)
|
||||
logger := log.NewLogfmtLogger(w)
|
||||
exporter := NewExporter([]string{"/user.slice"}, logger, false)
|
||||
metrics, err := exporter.collectv1()
|
||||
if err != nil {
|
||||
t.Errorf("Unexpected error: %s", err.Error())
|
||||
return
|
||||
}
|
||||
if val := len(metrics); val != 1 {
|
||||
t.Errorf("Unexpected number of metrics, got %d expected 1", val)
|
||||
return
|
||||
}
|
||||
if val := metrics[0].cpuUser; val != 0.41 {
|
||||
t.Errorf("Unexpected value for cpuUser, got %v", val)
|
||||
}
|
||||
if val := metrics[0].cpuSystem; val != 0.39 {
|
||||
t.Errorf("Unexpected value for cpuSystem, got %v", val)
|
||||
}
|
||||
if val := metrics[0].cpuTotal; val != 0.831825022 {
|
||||
t.Errorf("Unexpected value for cpuTotal, got %v", val)
|
||||
}
|
||||
if val := metrics[0].cpus; val != 0 {
|
||||
t.Errorf("Unexpected value for cpus, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryRSS; val != 5378048 {
|
||||
t.Errorf("Unexpected value for memoryRSS, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryCache; val != 2322432 {
|
||||
t.Errorf("Unexpected value for memoryCache, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryUsed; val != 8081408 {
|
||||
t.Errorf("Unexpected value for memoryUsed, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryTotal; val != 68719476736 {
|
||||
t.Errorf("Unexpected value for memoryTotal, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryFailCount; val != 0 {
|
||||
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memswUsed; val != 8081408 {
|
||||
t.Errorf("Unexpected value for swapUsed, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memswTotal; val != 9.223372036854772e+18 {
|
||||
t.Errorf("Unexpected value for swapTotal, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memswFailCount; val != 0 {
|
||||
t.Errorf("Unexpected value for swapFailCount, got %v", val)
|
||||
}
|
||||
if val := metrics[0].uid; val != "20821" {
|
||||
t.Errorf("Unexpected value for uid, got %v", val)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollectSLURM(t *testing.T) {
|
||||
varTrue := true
|
||||
collectProc = &varTrue
|
||||
varLen := 100
|
||||
collectProcMaxExec = &varLen
|
||||
w := log.NewSyncWriter(os.Stderr)
|
||||
logger := log.NewLogfmtLogger(w)
|
||||
exporter := NewExporter([]string{"/slurm"}, logger, false)
|
||||
metrics, err := exporter.collectv1()
|
||||
if err != nil {
|
||||
t.Errorf("Unexpected error: %s", err.Error())
|
||||
return
|
||||
}
|
||||
if val := len(metrics); val != 2 {
|
||||
t.Errorf("Unexpected number of metrics, got %d expected 2", val)
|
||||
return
|
||||
}
|
||||
var m CgroupMetric
|
||||
for _, metric := range metrics {
|
||||
if metric.jobid == "10" {
|
||||
m = metric
|
||||
}
|
||||
}
|
||||
if m.jobid == "" {
|
||||
t.Errorf("Metrics with jobid=10 not found")
|
||||
return
|
||||
}
|
||||
if val := m.cpuUser; val != 0 {
|
||||
t.Errorf("Unexpected value for cpuUser, got %v", val)
|
||||
}
|
||||
if val := m.cpuSystem; val != 0 {
|
||||
t.Errorf("Unexpected value for cpuSystem, got %v", val)
|
||||
}
|
||||
if val := m.cpuTotal; val != 0.007710215 {
|
||||
t.Errorf("Unexpected value for cpuTotal, got %v", val)
|
||||
}
|
||||
if val := m.cpus; val != 2 {
|
||||
t.Errorf("Unexpected value for cpus, got %v", val)
|
||||
}
|
||||
if val := m.memoryRSS; val != 311296 {
|
||||
t.Errorf("Unexpected value for memoryRSS, got %v", val)
|
||||
}
|
||||
if val := m.memoryCache; val != 4096 {
|
||||
t.Errorf("Unexpected value for memoryCache, got %v", val)
|
||||
}
|
||||
if val := m.memoryUsed; val != 356352 {
|
||||
t.Errorf("Unexpected value for memoryUsed, got %v", val)
|
||||
}
|
||||
if val := m.memoryTotal; val != 2147483648 {
|
||||
t.Errorf("Unexpected value for memoryTotal, got %v", val)
|
||||
}
|
||||
if val := m.memoryFailCount; val != 0 {
|
||||
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
|
||||
}
|
||||
if val := m.memswUsed; val != 356352 {
|
||||
t.Errorf("Unexpected value for swapUsed, got %v", val)
|
||||
}
|
||||
if val := m.memswTotal; val != 2147483648 {
|
||||
t.Errorf("Unexpected value for swapTotal, got %v", val)
|
||||
}
|
||||
if val := m.memswFailCount; val != 0 {
|
||||
t.Errorf("Unexpected value for swapFailCount, got %v", val)
|
||||
}
|
||||
if val := m.uid; val != "20821" {
|
||||
t.Errorf("Unexpected value for uid, got %v", val)
|
||||
}
|
||||
if val := m.jobid; val != "10" {
|
||||
t.Errorf("Unexpected value for jobid, got %v", val)
|
||||
}
|
||||
if val, ok := m.processExec["/bin/bash"]; !ok {
|
||||
t.Errorf("processExec does not contain /bin/bash")
|
||||
} else {
|
||||
if val != 2 {
|
||||
t.Errorf("Unexpected 2 values for processExec /bin/bash, got %v", val)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollectTorque(t *testing.T) {
|
||||
varFalse := false
|
||||
collectProc = &varFalse
|
||||
w := log.NewSyncWriter(os.Stderr)
|
||||
logger := log.NewLogfmtLogger(w)
|
||||
exporter := NewExporter([]string{"/torque"}, logger, false)
|
||||
metrics, err := exporter.collectv1()
|
||||
if err != nil {
|
||||
t.Errorf("Unexpected error: %s", err.Error())
|
||||
return
|
||||
}
|
||||
if val := len(metrics); val != 1 {
|
||||
t.Errorf("Unexpected number of metrics, got %d expected 1", val)
|
||||
return
|
||||
}
|
||||
if val := metrics[0].cpuUser; val != 153146.31 {
|
||||
t.Errorf("Unexpected value for cpuUser, got %v", val)
|
||||
}
|
||||
if val := metrics[0].cpuSystem; val != 260.77 {
|
||||
t.Errorf("Unexpected value for cpuSystem, got %v", val)
|
||||
}
|
||||
if val := metrics[0].cpuTotal; val != 152995.785583781 {
|
||||
t.Errorf("Unexpected value for cpuTotal, got %v", val)
|
||||
}
|
||||
if val := metrics[0].cpus; val != 40 {
|
||||
t.Errorf("Unexpected value for cpus, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryRSS; val != 82444320768 {
|
||||
t.Errorf("Unexpected value for memoryRSS, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryCache; val != 109678592 {
|
||||
t.Errorf("Unexpected value for memoryCache, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryUsed; val != 82553999360 {
|
||||
t.Errorf("Unexpected value for memoryUsed, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryTotal; val != 196755132416 {
|
||||
t.Errorf("Unexpected value for memoryTotal, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memoryFailCount; val != 0 {
|
||||
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memswUsed; val != 82553999360 {
|
||||
t.Errorf("Unexpected value for swapUsed, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memswTotal; val != 196755132416 {
|
||||
t.Errorf("Unexpected value for swapTotal, got %v", val)
|
||||
}
|
||||
if val := metrics[0].memswFailCount; val != 0 {
|
||||
t.Errorf("Unexpected value for swapFailCount, got %v", val)
|
||||
}
|
||||
if val := metrics[0].uid; val != "" {
|
||||
t.Errorf("Unexpected value for uid, got %v", val)
|
||||
}
|
||||
if val := metrics[0].jobid; val != "1182724" {
|
||||
t.Errorf("Unexpected value for jobid, got %v", val)
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,274 @@
// Copyright 2020 Trey Dockendorf
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/user"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/containerd/cgroups/v3/cgroup2"
|
||||
"github.com/go-kit/log"
|
||||
"github.com/go-kit/log/level"
|
||||
"github.com/prometheus/procfs"
|
||||
)
|
||||
|
||||
var (
|
||||
// Use this hack to allow unit tests to override /proc location
|
||||
pidGroupPath = cgroup2.PidGroupPath
|
||||
)
|
||||
|
||||
func NewCgroupV2Collector(paths []string, logger log.Logger) Collector {
|
||||
return NewExporter(paths, logger, true)
|
||||
}
|
||||
|
||||
func getInfov2(name string, pids []int, metric *CgroupMetric, logger log.Logger) {
|
||||
pathBase := filepath.Base(name)
|
||||
userSlicePattern := regexp.MustCompile("^user-([0-9]+).slice$")
|
||||
userSliceMatch := userSlicePattern.FindStringSubmatch(pathBase)
|
||||
if len(userSliceMatch) == 2 {
|
||||
metric.userslice = true
|
||||
metric.uid = userSliceMatch[1]
|
||||
user, err := user.LookupId(metric.uid)
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Error looking up user slice uid", "uid", metric.uid, "err", err)
|
||||
} else {
|
||||
metric.username = user.Username
|
||||
}
|
||||
return
|
||||
}
|
||||
slurmPattern := regexp.MustCompile("/job_([0-9]+)$")
|
||||
slurmMatch := slurmPattern.FindStringSubmatch(name)
|
||||
if len(slurmMatch) == 2 {
|
||||
metric.job = true
|
||||
metric.jobid = slurmMatch[1]
|
||||
procFS, err := procfs.NewFS(*ProcRoot)
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Unable to get procfs", "root", *ProcRoot, "err", err)
|
||||
return
|
||||
}
|
||||
var proc procfs.Proc
|
||||
for _, pid := range pids {
|
||||
proc, err = procFS.Proc(pid)
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Unable to read PID", "pid", pid, "err", err)
|
||||
return
|
||||
}
|
||||
exec, err := proc.Executable()
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Unable to read process executable", "pid", pid, "err", err)
|
||||
return
|
||||
}
|
||||
if filepath.Base(exec) != "sleep" && filepath.Base(exec) != "slurmstepd" {
|
||||
break
|
||||
}
|
||||
}
|
||||
procStat, err := proc.NewStatus()
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Unable to get proc status for PID", "pid", proc.PID, "err", err)
|
||||
return
|
||||
}
|
||||
// effective UID
|
||||
uid := procStat.UIDs[1]
|
||||
metric.uid = uid
|
||||
user, err := user.LookupId(metric.uid)
|
||||
if err != nil {
|
||||
level.Error(logger).Log("msg", "Error looking up slurm uid", "uid", metric.uid, "err", err)
|
||||
return
|
||||
}
|
||||
metric.username = user.Username
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
func getNamev2(pidPath string, path string, logger log.Logger) string {
|
||||
dirs := strings.Split(pidPath, "/")
|
||||
var name string
|
||||
if strings.Contains(path, "slurm") {
|
||||
keepDirs := dirs[0:4]
|
||||
name = strings.Join(keepDirs, "/")
|
||||
} else {
|
||||
keepDirs := dirs[0:3]
|
||||
name = strings.Join(keepDirs, "/")
|
||||
}
|
||||
level.Debug(logger).Log("msg", "Get name from path", "name", name, "pidPath", pidPath, "path", path, "dirs", fmt.Sprintf("+%v", dirs))
|
||||
return name
|
||||
}
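Illustrative examples for getNamev2 (paths are made up): when the configured path contains "slurm" the first four components of the PID's cgroup path are kept, otherwise the first three.

// "/system.slice/slurmstepd.scope/job_10/step_0" with path "/slurm"       -> "/system.slice/slurmstepd.scope/job_10"
// "/user.slice/user-20821.slice/session-157.scope" with path "/user.slice" -> "/user.slice/user-20821.slice"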
|
||||
|
||||
func getStatv2(name string, path string, logger log.Logger) (float64, error) {
|
||||
if !fileExists(path) {
|
||||
return 0, fmt.Errorf("Path %s does not exist", path)
|
||||
}
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
s := bufio.NewScanner(f)
|
||||
for s.Scan() {
|
||||
parts := strings.Fields(s.Text())
|
||||
if len(parts) != 2 {
|
||||
return 0, cgroup2.ErrInvalidFormat
|
||||
}
|
||||
v, err := strconv.ParseUint(parts[1], 10, 64)
|
||||
if err != nil {
|
||||
return 0, cgroup2.ErrInvalidFormat
|
||||
}
|
||||
if parts[0] == name {
|
||||
return float64(v), nil
|
||||
}
|
||||
}
|
||||
return 0, fmt.Errorf("Unable to find stat key %s in %s", name, path)
|
||||
}
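A usage sketch (illustrative): getStatv2 scans a flat key/value cgroup v2 file, where each line is "<key> <value>", and returns the value for one key.

// memory.stat contains lines such as "anon 21635072" and "swapcached 0"
swapcached, err := getStatv2("swapcached", filepath.Join(*CgroupRoot, name, "memory.stat"), logger)
if err != nil {
	// missing key, unreadable file, or a line that is not "<key> <value>"
}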
|
||||
|
||||
func (e *Exporter) getMetricsv2(name string, pids []int, opts cgroup2.InitOpts) (CgroupMetric, error) {
|
||||
metric := CgroupMetric{name: name}
|
||||
level.Debug(e.logger).Log("msg", "Loading cgroup", "path", name)
|
||||
ctrl, err := cgroup2.Load(name, opts)
|
||||
if err != nil {
|
||||
level.Error(e.logger).Log("msg", "Failed to load cgroups", "path", name, "err", err)
|
||||
metric.err = true
|
||||
return metric, err
|
||||
}
|
||||
stats, err := ctrl.Stat()
|
||||
if err != nil {
|
||||
level.Error(e.logger).Log("msg", "Failed to get cgroup stats", "path", name)
|
||||
metric.err = true
|
||||
return metric, err
|
||||
}
|
||||
if stats == nil {
|
||||
level.Error(e.logger).Log("msg", "Cgroup stats are nil", "path", name)
|
||||
metric.err = true
|
||||
return metric, err
|
||||
}
|
||||
if stats.CPU != nil {
|
||||
metric.cpuUser = float64(stats.CPU.UserUsec) / 1000000.0
|
||||
metric.cpuSystem = float64(stats.CPU.SystemUsec) / 1000000.0
|
||||
metric.cpuTotal = float64(stats.CPU.UsageUsec) / 1000000.0
|
||||
}
|
||||
// TODO: Move to https://github.com/containerd/cgroups/blob/d131035c7599c51ff4aed27903c45eb3b2cc29d0/cgroup2/manager.go#L593
|
||||
memoryStatPath := filepath.Join(*CgroupRoot, name, "memory.stat")
|
||||
swapcached, err := getStatv2("swapcached", memoryStatPath, e.logger)
|
||||
if err != nil {
|
||||
level.Error(e.logger).Log("msg", "Unable to get swapcached", "path", name, "err", err)
|
||||
metric.err = true
|
||||
return metric, err
|
||||
}
|
||||
if stats.Memory != nil {
|
||||
metric.memoryRSS = float64(stats.Memory.Anon) + swapcached + float64(stats.Memory.AnonThp)
|
||||
metric.memoryUsed = float64(stats.Memory.Usage)
|
||||
metric.memoryTotal = float64(stats.Memory.UsageLimit)
|
||||
metric.memoryCache = float64(stats.Memory.File)
|
||||
metric.memswUsed = float64(stats.Memory.SwapUsage)
|
||||
metric.memswTotal = float64(stats.Memory.SwapLimit)
|
||||
if stats.MemoryEvents != nil {
|
||||
metric.memoryFailCount = float64(stats.MemoryEvents.Oom)
|
||||
}
|
||||
}
|
||||
// TODO: cpuset.cpus.effective?
|
||||
cpusPath := filepath.Join(*CgroupRoot, name, "cpuset.cpus")
|
||||
if cpus, err := getCPUs(cpusPath, e.logger); err == nil {
|
||||
metric.cpus = len(cpus)
|
||||
metric.cpu_list = strings.Join(cpus, ",")
|
||||
}
|
||||
getInfov2(name, pids, &metric, e.logger)
|
||||
if *collectProc {
|
||||
level.Debug(e.logger).Log("msg", "Get process info", "pids", fmt.Sprintf("%v", pids))
|
||||
getProcInfo(pids, &metric, e.logger)
|
||||
}
|
||||
return metric, nil
|
||||
}
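Unit note derived from the code: cgroup v1 cpuacct usage is reported in nanoseconds (divided by 1e9 in getMetricsv1), while cgroup v2 cpu.stat reports microseconds (divided by 1e6 here); both are exported as seconds.

// v1: 831825022 ns / 1e9 = 0.831825022 s    v2: 17975873 us / 1e6 = 17.975873 s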
|
||||
|
||||
func (e *Exporter) collectv2() ([]CgroupMetric, error) {
|
||||
var names []string
|
||||
var metrics []CgroupMetric
|
||||
for _, path := range e.paths {
|
||||
var group string
|
||||
if strings.Contains(path, "slurm") {
|
||||
group = "/system.slice/slurmstepd.scope"
|
||||
} else {
|
||||
group = path
|
||||
}
|
||||
level.Debug(e.logger).Log("msg", "Loading cgroup", "path", path, "group", group)
|
||||
//TODO
|
||||
//control, err := cgroup2.LoadSystemd(path, group)
|
||||
opts := cgroup2.WithMountpoint(*CgroupRoot)
|
||||
control, err := cgroup2.Load(group, opts)
|
||||
if err != nil {
|
||||
level.Error(e.logger).Log("msg", "Error loading cgroup", "path", path, "group", group, "err", err)
|
||||
metric := CgroupMetric{name: path, err: true}
|
||||
metrics = append(metrics, metric)
|
||||
continue
|
||||
}
|
||||
processes, err := control.Procs(true)
|
||||
if err != nil {
|
||||
level.Error(e.logger).Log("msg", "Error loading cgroup processes", "path", path, "group", group, "err", err)
|
||||
metric := CgroupMetric{name: path, err: true}
|
||||
metrics = append(metrics, metric)
|
||||
continue
|
||||
}
|
||||
level.Debug(e.logger).Log("msg", "Found processes", "path", path, "group", group, "processes", len(processes))
|
||||
pids := make(map[string][]int)
|
||||
for _, p := range processes {
|
||||
pid := int(p)
|
||||
pidPath, err := pidGroupPath(pid)
|
||||
if err != nil {
|
||||
level.Error(e.logger).Log("msg", "Error getting PID group path", "path", path, "group", group, "pid", pid, "err", err)
|
||||
continue
|
||||
}
|
||||
level.Debug(e.logger).Log("msg", "Get Name", "pid", pid, "path", path)
|
||||
name := getNamev2(pidPath, path, e.logger)
|
||||
if strings.Contains(path, "slurm") && filepath.Base(name) == "system" {
|
||||
level.Debug(e.logger).Log("msg", "Skip system cgroup", "name", name)
|
||||
continue
|
||||
}
|
||||
if !sliceContains(names, name) {
|
||||
names = append(names, name)
|
||||
}
|
||||
if val, ok := pids[name]; ok {
|
||||
if !sliceContains(val, pid) {
|
||||
val = append(val, pid)
|
||||
}
|
||||
pids[name] = val
|
||||
} else {
|
||||
pids[name] = []int{pid}
|
||||
}
|
||||
}
|
||||
wg := &sync.WaitGroup{}
|
||||
wg.Add(len(names))
|
||||
for _, name := range names {
|
||||
go func(n string, p map[string][]int) {
|
||||
defer wg.Done()
|
||||
var pids []int
|
||||
if val, ok := p[n]; ok {
|
||||
pids = val
|
||||
} else {
|
||||
level.Error(e.logger).Log("msg", "Unable to get PIDs for name", "name", n)
|
||||
return
|
||||
}
|
||||
metric, _ := e.getMetricsv2(n, pids, opts)
|
||||
metricLock.Lock()
|
||||
metrics = append(metrics, metric)
|
||||
metricLock.Unlock()
|
||||
}(name, pids)
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
return metrics, nil
|
||||
}
|
|
@@ -0,0 +1,220 @@
// Copyright 2020 Trey Dockendorf
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package collector

import (
"fmt"
"os"
"path/filepath"
"testing"

"github.com/go-kit/log"
)

func TestGetStatv2(t *testing.T) {
w := log.NewSyncWriter(os.Stderr)
logger := log.NewLogfmtLogger(w)
_, err := getStatv2("swapcached", "/dne", logger)
if err == nil {
t.Errorf("Expected error with /dne but none given")
}
path := filepath.Join(*CgroupRoot, "system.slice")
_, err = getStatv2("swapcached", path, logger)
if err == nil {
t.Errorf("Expected error with system.slice but none given")
}
path = filepath.Join(*CgroupRoot, "user.slice/user-20821.slice/memory.max")
_, err = getStatv2("swapcached", path, logger)
if err == nil {
t.Errorf("Expected error with single value file but none given")
}
path = filepath.Join(*CgroupRoot, "stat.invalid")
_, err = getStatv2("nan", path, logger)
if err == nil {
t.Errorf("Expected error with stat.invalid but none given")
}
path = filepath.Join(*CgroupRoot, "user.slice/user-20821.slice/memory.stat")
_, err = getStatv2("dne", path, logger)
if err == nil {
t.Errorf("Expected error when stat key missing but none given")
}
stat, err := getStatv2("swapcached", path, logger)
if err != nil {
t.Errorf("Unexpected error: %s", err)
}
if stat != 0 {
t.Errorf("Unexpected value: %v", stat)
}
}

func TestCollectv2Error(t *testing.T) {
w := log.NewSyncWriter(os.Stderr)
logger := log.NewLogfmtLogger(w)
exporter := NewExporter([]string{"/dne"}, logger, true)
metrics, err := exporter.collectv2()
if err != nil {
t.Errorf("Unexpected error: %s", err.Error())
return
}
if val := len(metrics); val != 1 {
t.Errorf("Unexpected number of metrics, got %d expected 1", val)
return
}
if val := metrics[0].err; val != true {
t.Errorf("Unexpected value for err, got %v", val)
}
}

func TestCollectv2UserSlice(t *testing.T) {
varFalse := false
collectProc = &varFalse
pidGroupPath = func(pid int) (string, error) {
if pid == 67998 {
return "/user.slice/user-20821.slice/session-157.scope", nil
}
return "", fmt.Errorf("Could not find cgroup path for %d", pid)
}
w := log.NewSyncWriter(os.Stderr)
logger := log.NewLogfmtLogger(w)
exporter := NewExporter([]string{"/user.slice"}, logger, true)
metrics, err := exporter.collectv2()
if err != nil {
t.Errorf("Unexpected error: %s", err.Error())
return
}
if val := len(metrics); val != 1 {
t.Errorf("Unexpected number of metrics, got %d expected 1", val)
return
}
if val := metrics[0].name; val != "/user.slice/user-20821.slice" {
t.Errorf("Unexpected value for name, got %v", val)
}
if val := metrics[0].cpuUser; val != 15.270449 {
t.Errorf("Unexpected value for cpuUser, got %v", val)
}
if val := metrics[0].cpuSystem; val != 2.705424 {
t.Errorf("Unexpected value for cpuSystem, got %v", val)
}
if val := metrics[0].cpuTotal; val != 17.975873 {
t.Errorf("Unexpected value for cpuTotal, got %v", val)
}
if val := metrics[0].cpus; val != 0 {
t.Errorf("Unexpected value for cpus, got %v", val)
}
if val := metrics[0].memoryRSS; val != 22048768 {
t.Errorf("Unexpected value for memoryRSS, got %v", val)
}
if val := metrics[0].memoryUsed; val != 27115520 {
t.Errorf("Unexpected value for memoryUsed, got %v", val)
}
if val := metrics[0].memoryTotal; val != 2147483648 {
t.Errorf("Unexpected value for memoryTotal, got %v", val)
}
if val := metrics[0].memoryFailCount; val != 0 {
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
}
if val := metrics[0].memswUsed; val != 0 {
t.Errorf("Unexpected value for swapUsed, got %v", val)
}
if val := metrics[0].memswTotal; val != 1.8446744073709552e+19 {
t.Errorf("Unexpected value for swapTotal, got %v", val)
}
if val := metrics[0].uid; val != "20821" {
t.Errorf("Unexpected value for uid, got %v", val)
}
}

func TestCollectv2SLURM(t *testing.T) {
varTrue := true
collectProc = &varTrue
varLen := 100
collectProcMaxExec = &varLen
pidGroupPath = func(pid int) (string, error) {
if pid == 49276 {
return "/system.slice/slurmstepd.scope/job_4/step_0/user/task_0", nil
}
if pid == 43310 {
return "/system.slice/slurmstepd.scope/system", nil
}
return "", fmt.Errorf("Could not find cgroup path for %d", pid)
}
w := log.NewSyncWriter(os.Stderr)
logger := log.NewLogfmtLogger(w)
exporter := NewExporter([]string{"/slurm"}, logger, true)
metrics, err := exporter.collectv2()
if err != nil {
t.Errorf("Unexpected error: %s", err.Error())
return
}
if val := len(metrics); val != 1 {
t.Errorf("Unexpected number of metrics, got %d expected 1", val)
return
}
var m CgroupMetric
for _, metric := range metrics {
if metric.jobid == "4" {
m = metric
}
}
if m.jobid == "" {
t.Errorf("Metric with jobid=4 not found")
return
}
if val := m.name; val != "/system.slice/slurmstepd.scope/job_4" {
t.Errorf("Unexpected value for name, got %v", val)
}
if val := m.cpuUser; val != 0.049043 {
t.Errorf("Unexpected value for cpuUser, got %v", val)
}
if val := m.cpuSystem; val != 0.077642 {
t.Errorf("Unexpected value for cpuSystem, got %v", val)
}
if val := m.cpuTotal; val != 0.126686 {
t.Errorf("Unexpected value for cpuTotal, got %v", val)
}
if val := m.cpus; val != 1 {
t.Errorf("Unexpected value for cpus, got %v", val)
}
if val := m.memoryRSS; val != 2260992 {
t.Errorf("Unexpected value for memoryRSS, got %v", val)
}
if val := m.memoryUsed; val != 5660672 {
t.Errorf("Unexpected value for memoryUsed, got %v", val)
}
if val := m.memoryTotal; val != 1835008000 {
t.Errorf("Unexpected value for memoryTotal, got %v", val)
}
if val := m.memoryFailCount; val != 0 {
t.Errorf("Unexpected value for memoryFailCount, got %v", val)
}
if val := m.memswUsed; val != 0 {
t.Errorf("Unexpected value for swapUsed, got %v", val)
}
if val := m.memswTotal; val != 1835008000 {
t.Errorf("Unexpected value for swapTotal, got %v", val)
}
if val := m.uid; val != "20821" {
t.Errorf("Unexpected value for uid, got %v", val)
}
if val := m.jobid; val != "4" {
t.Errorf("Unexpected value for jobid, got %v", val)
}
if val, ok := m.processExec["/usr/bin/bash"]; !ok {
t.Errorf("processExec does not contain /usr/bin/bash")
} else {
if val != 1 {
t.Errorf("Expected 1 for processExec /usr/bin/bash, got %v", val)
}
}
}

@@ -0,0 +1,313 @@
// Copyright 2020 Trey Dockendorf
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package collector

import (
"fmt"
"os"
"reflect"
"strconv"
"strings"
"sync"

"github.com/alecthomas/kingpin/v2"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/procfs"
)

var (
collectProc = kingpin.Flag("collect.proc", "Boolean that sets if to collect proc information").Default("false").Bool()
CgroupRoot = kingpin.Flag("path.cgroup.root", "Root path to cgroup fs").Default(defCgroupRoot).String()
collectProcMaxExec = kingpin.Flag("collect.proc.max-exec", "Max length of process executable to record").Default("100").Int()
ProcRoot = kingpin.Flag("path.proc.root", "Root path to proc fs").Default(defProcRoot).String()
metricLock = sync.RWMutex{}
)

const (
Namespace = "cgroup"
defCgroupRoot = "/sys/fs/cgroup"
defProcRoot = "/proc"
)

type Collector interface {
// Get new metrics and expose them via prometheus registry.
Describe(ch chan<- *prometheus.Desc)
Collect(ch chan<- prometheus.Metric)
}

type Exporter struct {
paths []string
collectError *prometheus.Desc
cpuUser *prometheus.Desc
cpuSystem *prometheus.Desc
cpuTotal *prometheus.Desc
cpus *prometheus.Desc
cpu_info *prometheus.Desc
memoryRSS *prometheus.Desc
memoryCache *prometheus.Desc
memoryUsed *prometheus.Desc
memoryTotal *prometheus.Desc
memoryFailCount *prometheus.Desc
memswUsed *prometheus.Desc
memswTotal *prometheus.Desc
memswFailCount *prometheus.Desc
info *prometheus.Desc
processExec *prometheus.Desc
logger log.Logger
cgroupv2 bool
}

type CgroupMetric struct {
name string
cpuUser float64
cpuSystem float64
cpuTotal float64
cpus int
cpu_list string
memoryRSS float64
memoryCache float64
memoryUsed float64
memoryTotal float64
memoryFailCount float64
memswUsed float64
memswTotal float64
memswFailCount float64
userslice bool
job bool
uid string
username string
jobid string
processExec map[string]float64
err bool
}

func NewCgroupCollector(cgroupV2 bool, paths []string, logger log.Logger) Collector {
var collector Collector
if cgroupV2 {
collector = NewCgroupV2Collector(paths, logger)
} else {
collector = NewCgroupV1Collector(paths, logger)
}
return collector
}

func NewExporter(paths []string, logger log.Logger, cgroupv2 bool) *Exporter {
return &Exporter{
paths: paths,
cpuUser: prometheus.NewDesc(prometheus.BuildFQName(Namespace, "cpu", "user_seconds"),
"Cumulative CPU user seconds for cgroup", []string{"cgroup"}, nil),
cpuSystem: prometheus.NewDesc(prometheus.BuildFQName(Namespace, "cpu", "system_seconds"),
"Cumulative CPU system seconds for cgroup", []string{"cgroup"}, nil),
cpuTotal: prometheus.NewDesc(prometheus.BuildFQName(Namespace, "cpu", "total_seconds"),
"Cumulative CPU total seconds for cgroup", []string{"cgroup"}, nil),
cpus: prometheus.NewDesc(prometheus.BuildFQName(Namespace, "", "cpus"),
"Number of CPUs in the cgroup", []string{"cgroup"}, nil),
cpu_info: prometheus.NewDesc(prometheus.BuildFQName(Namespace, "", "cpu_info"),
"Information about the cgroup CPUs", []string{"cgroup", "cpus"}, nil),
memoryRSS: prometheus.NewDesc(prometheus.BuildFQName(Namespace, "memory", "rss_bytes"),
"Memory RSS used in bytes", []string{"cgroup"}, nil),
memoryCache: prometheus.NewDesc(prometheus.BuildFQName(Namespace, "memory", "cache_bytes"),
"Memory cache used in bytes", []string{"cgroup"}, nil),
memoryUsed: prometheus.NewDesc(prometheus.BuildFQName(Namespace, "memory", "used_bytes"),
"Memory used in bytes", []string{"cgroup"}, nil),
memoryTotal: prometheus.NewDesc(prometheus.BuildFQName(Namespace, "memory", "total_bytes"),
"Memory total given to cgroup in bytes", []string{"cgroup"}, nil),
memoryFailCount: prometheus.NewDesc(prometheus.BuildFQName(Namespace, "memory", "fail_count"),
"Memory fail count", []string{"cgroup"}, nil),
memswUsed: prometheus.NewDesc(prometheus.BuildFQName(Namespace, "memsw", "used_bytes"),
"Swap used in bytes", []string{"cgroup"}, nil),
memswTotal: prometheus.NewDesc(prometheus.BuildFQName(Namespace, "memsw", "total_bytes"),
"Swap total given to cgroup in bytes", []string{"cgroup"}, nil),
memswFailCount: prometheus.NewDesc(prometheus.BuildFQName(Namespace, "memsw", "fail_count"),
"Swap fail count", []string{"cgroup"}, nil),
info: prometheus.NewDesc(prometheus.BuildFQName(Namespace, "", "info"),
"User slice information", []string{"cgroup", "username", "uid", "jobid"}, nil),
processExec: prometheus.NewDesc(prometheus.BuildFQName(Namespace, "", "process_exec_count"),
"Count of instances of a given process", []string{"cgroup", "exec"}, nil),
collectError: prometheus.NewDesc(prometheus.BuildFQName(Namespace, "exporter", "collect_error"),
"Indicates collection error, 0=no error, 1=error", []string{"cgroup"}, nil),
logger: logger,
cgroupv2: cgroupv2,
}
}

func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
ch <- e.cpuUser
ch <- e.cpuSystem
ch <- e.cpuTotal
ch <- e.cpus
ch <- e.cpu_info
ch <- e.memoryRSS
ch <- e.memoryCache
ch <- e.memoryUsed
ch <- e.memoryTotal
ch <- e.memoryFailCount
ch <- e.memswUsed
ch <- e.memswTotal
ch <- e.memswFailCount
ch <- e.info
if *collectProc {
ch <- e.processExec
}
}

func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
var metrics []CgroupMetric
if e.cgroupv2 {
metrics, _ = e.collectv2()
} else {
metrics, _ = e.collectv1()
}

for _, m := range metrics {
if m.err {
ch <- prometheus.MustNewConstMetric(e.collectError, prometheus.GaugeValue, 1, m.name)
}
ch <- prometheus.MustNewConstMetric(e.cpuUser, prometheus.GaugeValue, m.cpuUser, m.name)
ch <- prometheus.MustNewConstMetric(e.cpuSystem, prometheus.GaugeValue, m.cpuSystem, m.name)
ch <- prometheus.MustNewConstMetric(e.cpuTotal, prometheus.GaugeValue, m.cpuTotal, m.name)
ch <- prometheus.MustNewConstMetric(e.cpus, prometheus.GaugeValue, float64(m.cpus), m.name)
ch <- prometheus.MustNewConstMetric(e.cpu_info, prometheus.GaugeValue, 1, m.name, m.cpu_list)
ch <- prometheus.MustNewConstMetric(e.memoryRSS, prometheus.GaugeValue, m.memoryRSS, m.name)
ch <- prometheus.MustNewConstMetric(e.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.name)
ch <- prometheus.MustNewConstMetric(e.memoryTotal, prometheus.GaugeValue, m.memoryTotal, m.name)
ch <- prometheus.MustNewConstMetric(e.memoryCache, prometheus.GaugeValue, m.memoryCache, m.name)
ch <- prometheus.MustNewConstMetric(e.memoryFailCount, prometheus.GaugeValue, m.memoryFailCount, m.name)
ch <- prometheus.MustNewConstMetric(e.memswUsed, prometheus.GaugeValue, m.memswUsed, m.name)
ch <- prometheus.MustNewConstMetric(e.memswTotal, prometheus.GaugeValue, m.memswTotal, m.name)
// These metrics currently have no cgroup v2 information
if !e.cgroupv2 {
ch <- prometheus.MustNewConstMetric(e.memswFailCount, prometheus.GaugeValue, m.memswFailCount, m.name)
}
if m.userslice || m.job {
ch <- prometheus.MustNewConstMetric(e.info, prometheus.GaugeValue, 1, m.name, m.username, m.uid, m.jobid)
}
if *collectProc {
for exec, count := range m.processExec {
ch <- prometheus.MustNewConstMetric(e.processExec, prometheus.GaugeValue, count, m.name, exec)
}
}
}
}

func getProcInfo(pids []int, metric *CgroupMetric, logger log.Logger) {
executables := make(map[string]float64)
procFS, err := procfs.NewFS(*ProcRoot)
if err != nil {
level.Error(logger).Log("msg", "Unable to open procfs", "path", *ProcRoot)
return
}
wg := &sync.WaitGroup{}
wg.Add(len(pids))
for _, pid := range pids {
go func(p int) {
proc, err := procFS.Proc(p)
if err != nil {
level.Error(logger).Log("msg", "Unable to read PID", "pid", p)
wg.Done()
return
}
executable, err := proc.Executable()
if err != nil {
level.Error(logger).Log("msg", "Unable to get executable for PID", "pid", p)
wg.Done()
return
}
if len(executable) > *collectProcMaxExec {
level.Debug(logger).Log("msg", "Executable will be truncated", "executable", executable, "len", len(executable), "pid", p)
trim := *collectProcMaxExec / 2
executable_prefix := executable[0:trim]
executable_suffix := executable[len(executable)-trim:]
executable = fmt.Sprintf("%s...%s", executable_prefix, executable_suffix)
}
metricLock.Lock()
executables[executable] += 1
metricLock.Unlock()
wg.Done()
}(pid)
}
wg.Wait()
metric.processExec = executables
}

func parseCpuSet(cpuset string) ([]string, error) {
var cpus []string
var start, end int
var err error
if cpuset == "" {
return nil, nil
}
ranges := strings.Split(cpuset, ",")
for _, r := range ranges {
boundaries := strings.Split(r, "-")
if len(boundaries) == 1 {
start, err = strconv.Atoi(boundaries[0])
if err != nil {
return nil, err
}
end = start
} else if len(boundaries) == 2 {
start, err = strconv.Atoi(boundaries[0])
if err != nil {
return nil, err
}
end, err = strconv.Atoi(boundaries[1])
if err != nil {
return nil, err
}
}
for e := start; e <= end; e++ {
cpu := strconv.Itoa(e)
cpus = append(cpus, cpu)
}
}
return cpus, nil
}

func getCPUs(path string, logger log.Logger) ([]string, error) {
if !fileExists(path) {
return nil, nil
}
cpusData, err := os.ReadFile(path)
if err != nil {
level.Error(logger).Log("msg", "Error reading cpuset", "cpuset", path, "err", err)
return nil, err
}
cpus, err := parseCpuSet(strings.TrimSuffix(string(cpusData), "\n"))
if err != nil {
level.Error(logger).Log("msg", "Error parsing cpu set", "cpuset", path, "err", err)
return nil, err
}
return cpus, nil
}

func fileExists(filename string) bool {
info, err := os.Stat(filename)
if os.IsNotExist(err) {
return false
}
return !info.IsDir()
}

func sliceContains(s interface{}, v interface{}) bool {
slice := reflect.ValueOf(s)
for i := 0; i < slice.Len(); i++ {
if slice.Index(i).Interface() == v {
return true
}
}
return false
}
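The hunk above ends the new collector package. As context for how a refactor like this is typically consumed, below is a minimal sketch of wiring the exporter into a main package; it assumes the usual client_golang registration pattern and is not taken from this commit. The paths, cgroup-version argument, and listen address are placeholders, and kingpin flag parsing (which sets the collector's path flags) is omitted.

package main

import (
	"net/http"
	"os"

	"github.com/go-kit/log"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"

	"github.com/treydock/cgroup_exporter/collector"
)

func main() {
	// Illustrative only: paths, cgroup version, and listen address are placeholders.
	logger := log.NewLogfmtLogger(log.NewSyncWriter(os.Stderr))
	exporter := collector.NewExporter([]string{"/user.slice", "/system.slice"}, logger, true)

	// Register the exporter (it satisfies prometheus.Collector via Describe/Collect)
	// and expose the metrics over HTTP.
	registry := prometheus.NewRegistry()
	registry.MustRegister(exporter)
	http.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{}))
	_ = http.ListenAndServe(":9306", nil)
}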

@@ -0,0 +1,86 @@
// Copyright 2020 Trey Dockendorf
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package collector

import (
"os"
"path/filepath"
"reflect"
"runtime"
"testing"

"github.com/go-kit/log"
)

func TestMain(m *testing.M) {
_, filename, _, _ := runtime.Caller(0)
dir := filepath.Dir(filename)
fixture := filepath.Join(dir, "..", "fixtures")
CgroupRoot = &fixture
procFixture := filepath.Join(fixture, "proc")
ProcRoot = &procFixture
varTrue := true
collectProc = &varTrue

exitVal := m.Run()

os.Exit(exitVal)
}

func TestParseCpuSet(t *testing.T) {
expected := []string{"0", "1", "2"}
if cpus, err := parseCpuSet("0-2"); err != nil {
t.Errorf("Unexpected error: %s", err.Error())
} else if !reflect.DeepEqual(cpus, expected) {
t.Errorf("Unexpected cpus, expected %v got %v", expected, cpus)
}
expected = []string{"0", "1", "4", "5", "8", "9"}
if cpus, err := parseCpuSet("0-1,4-5,8-9"); err != nil {
t.Errorf("Unexpected error: %s", err.Error())
} else if !reflect.DeepEqual(cpus, expected) {
t.Errorf("Unexpected cpus, expected %v got %v", expected, cpus)
}
expected = []string{"1", "3", "5", "7"}
if cpus, err := parseCpuSet("1,3,5,7"); err != nil {
t.Errorf("Unexpected error: %s", err.Error())
} else if !reflect.DeepEqual(cpus, expected) {
t.Errorf("Unexpected cpus, expected %v got %v", expected, cpus)
}
}

func TestGetProcInfo(t *testing.T) {
metric := CgroupMetric{}
w := log.NewSyncWriter(os.Stderr)
logger := log.NewLogfmtLogger(w)
getProcInfo([]int{95521, 95525}, &metric, logger)
if val, ok := metric.processExec["/bin/bash"]; !ok {
t.Errorf("Process /bin/bash not in metrics")
return
} else {
if val != 2 {
t.Errorf("Expected 2 /bin/bash processes, got %v", val)
}
}
varLen := 6
collectProcMaxExec = &varLen
getProcInfo([]int{95521, 95525}, &metric, logger)
if val, ok := metric.processExec["/bi...ash"]; !ok {
t.Errorf("Process /bi...ash not in metrics, found: %v", metric.processExec)
return
} else {
if val != 2 {
t.Errorf("Expected 2 /bi...ash processes, got %v", val)
}
}
}
5671 fixtures.ttar
File diff suppressed because it is too large
4 go.mod
@@ -5,6 +5,7 @@ go 1.20
require (
github.com/alecthomas/kingpin/v2 v2.3.2
github.com/containerd/cgroups v1.1.0
github.com/containerd/cgroups/v3 v3.0.3
github.com/go-kit/log v0.2.1
github.com/prometheus/client_golang v1.15.1
github.com/prometheus/common v0.43.0
@@ -15,6 +16,7 @@ require (
github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/cilium/ebpf v0.11.0 // indirect
github.com/coreos/go-systemd/v22 v22.5.0 // indirect
github.com/docker/go-units v0.5.0 // indirect
github.com/go-logfmt/logfmt v0.6.0 // indirect
@@ -24,7 +26,9 @@ require (
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
github.com/opencontainers/runtime-spec v1.0.2 // indirect
github.com/prometheus/client_model v0.4.0 // indirect
github.com/sirupsen/logrus v1.9.0 // indirect
github.com/xhit/go-str2duration/v2 v2.1.0 // indirect
golang.org/x/exp v0.0.0-20230224173230-c95f2b4c22f2 // indirect
golang.org/x/sys v0.8.0 // indirect
google.golang.org/protobuf v1.30.0 // indirect
)
17 go.sum
@@ -6,14 +6,20 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cilium/ebpf v0.11.0 h1:V8gS/bTCCjX9uUnkUFUpPsksM8n1lXBAvHcpiFk1X2Y=
github.com/cilium/ebpf v0.11.0/go.mod h1:WE7CZAnqOL2RouJ4f1uyNhqr2P4CCvXFIqdRDUgWsVs=
github.com/containerd/cgroups v1.1.0 h1:v8rEWFl6EoqHB+swVNjVoCJE8o3jX7e8nqBGPLaDFBM=
github.com/containerd/cgroups v1.1.0/go.mod h1:6ppBcbh/NOOUU+dMKrykgaBnK9lCIBxHqJDGwsa1mIw=
github.com/containerd/cgroups/v3 v3.0.3 h1:S5ByHZ/h9PMe5IOQoN7E+nMc2UcLEM/V48DGDJ9kip0=
github.com/containerd/cgroups/v3 v3.0.3/go.mod h1:8HBe7V3aWGLFPd/k03swSIsGjZhHI2WzJmticMgVuz0=
github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs=
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
github.com/frankban/quicktest v1.14.5 h1:dfYrrRyLtiqT9GyKXgdh+k4inNeTvmGbuSgZ3lx3GhA=
github.com/go-kit/log v0.2.1 h1:MRVx0/zhvdseW+Gza6N9rVzU/IVzaeE1SFI4raAhmBU=
github.com/go-kit/log v0.2.1/go.mod h1:NwTd00d/i8cPZ3xOwwiv2PO5MOcx78fFErGNcVmBjv0=
github.com/go-logfmt/logfmt v0.6.0 h1:wGYYu3uicYdqXVgoYbvnkrPVXkuLM1p1ifugDMEdRi4=
@@ -31,6 +37,8 @@ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo=
github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4=
github.com/opencontainers/runtime-spec v1.0.2 h1:UfAcuLBJB9Coz72x1hgl8O5RVzTdNiaglX6v2DM6FI0=
@@ -45,16 +53,23 @@ github.com/prometheus/common v0.43.0 h1:iq+BVjvYLei5f27wiuNiB1DN6DYQkp1c8Bx0Vykh
github.com/prometheus/common v0.43.0/go.mod h1:NCvr5cQIh3Y/gy73/RdVtC9r8xxrxwJnB+2lB3BxrFc=
github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI=
github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0=
github.com/sirupsen/logrus v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8=
github.com/xhit/go-str2duration/v2 v2.1.0 h1:lxklc02Drh6ynqX+DdPyp5pCKLUQpRT8bp8Ydu2Bstc=
github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go.uber.org/goleak v1.1.12 h1:gZAh5/EyT/HQwlpkCy6wTpqfH9H8Lz8zbm3dZh+OyzA=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/exp v0.0.0-20230224173230-c95f2b4c22f2 h1:Jvc7gsqn21cJHCmAWx0LiimpP18LZmUxkT5Mp7EZ1mI=
golang.org/x/exp v0.0.0-20230224173230-c95f2b4c22f2/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
@@ -68,6 +83,7 @@ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
@@ -87,4 +103,5 @@ google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqw
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=