Merge pull request #9 from treydock/processes
Add cgroup_process_exec_count metric
This commit is contained in:
commit
ea267dd2fd
|
@ -1,3 +1,4 @@
|
|||
/fixtures/
|
||||
/.tarballs
|
||||
/.build
|
||||
/cgroup_exporter
|
||||
|
|
12
Makefile
12
Makefile
|
@ -8,3 +8,15 @@ DOCKER_IMAGE_NAME ?= cgroup_exporter
|
|||
|
||||
coverage:
|
||||
go test -race -coverpkg=./... -coverprofile=coverage.txt -covermode=atomic ./...
|
||||
|
||||
%/.unpacked: %.ttar
|
||||
@echo ">> extracting fixtures"
|
||||
./ttar -C $(dir $*) -x -f $*.ttar
|
||||
touch $@
|
||||
|
||||
update_fixtures:
|
||||
rm -vf fixtures/.unpacked
|
||||
./ttar -c -f fixtures.ttar fixtures/
|
||||
|
||||
.PHONY: test
|
||||
test: fixtures/.unpacked common-test
|
||||
|
|
|
@ -41,6 +41,14 @@ Or
|
|||
go get github.com/treydock/cgroup_exporter
|
||||
```
|
||||
|
||||
## Process metrics
|
||||
|
||||
If you wish to collect process information for a cgroup, pass the `--collect.proc` flag. If this exporter is not running as root, it must be granted the `cap_sys_ptrace` capability so that the user running the exporter can read everything under procfs:
|
||||
|
||||
```
|
||||
setcap cap_sys_ptrace=eip /usr/bin/cgroup_exporter
|
||||
```
|
||||
|
||||
## Metrics
|
||||
|
||||
Example of metrics exposed by this exporter when looking at `/user.slice` paths:
|
||||
|
|
|
@ -20,6 +20,7 @@ import (
|
|||
"os"
|
||||
"os/user"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
@ -29,6 +30,7 @@ import (
|
|||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
"github.com/prometheus/common/log"
|
||||
"github.com/prometheus/common/version"
|
||||
"github.com/prometheus/procfs"
|
||||
"gopkg.in/alecthomas/kingpin.v2"
|
||||
)
|
||||
|
||||
|
@ -38,10 +40,14 @@ const (
|
|||
|
||||
var (
|
||||
defCgroupRoot = "/sys/fs/cgroup"
|
||||
defProcRoot = "/proc"
|
||||
configPaths = kingpin.Flag("config.paths", "Comma separated list of cgroup paths to check, eg /user.slice,/system.slice,/slurm").Required().String()
|
||||
listenAddress = kingpin.Flag("web.listen-address", "Address to listen on for web interface and telemetry.").Default(":9306").String()
|
||||
disableExporterMetrics = kingpin.Flag("web.disable-exporter-metrics", "Exclude metrics about the exporter (promhttp_*, process_*, go_*)").Default("false").Bool()
|
||||
cgroupRoot = kingpin.Flag("path.cgroup.root", "Root path to cgroup fs").Default(defCgroupRoot).String()
|
||||
procRoot = kingpin.Flag("path.proc.root", "Root path to proc fs").Default(defProcRoot).String()
|
||||
collectProc = kingpin.Flag("collect.proc", "Boolean that sets if to collect proc information").Default("false").Bool()
|
||||
collectProcMaxExec = kingpin.Flag("collect.proc.max-exec", "Max length of process executable to record").Default("100").Int()
|
||||
)
|
||||
|
||||
type CgroupMetric struct {
|
||||
|
@ -64,6 +70,7 @@ type CgroupMetric struct {
|
|||
uid string
|
||||
username string
|
||||
jobid string
|
||||
processExec map[string]float64
|
||||
err bool
|
||||
}
|
||||
|
||||
|
@ -84,6 +91,7 @@ type Exporter struct {
|
|||
memswTotal *prometheus.Desc
|
||||
memswFailCount *prometheus.Desc
|
||||
info *prometheus.Desc
|
||||
processExec *prometheus.Desc
|
||||
}
|
||||
|
||||
func fileExists(filename string) bool {
|
||||
|
@ -94,9 +102,10 @@ func fileExists(filename string) bool {
|
|||
return !info.IsDir()
|
||||
}
|
||||
|
||||
func sliceContains(slice []string, str string) bool {
|
||||
for _, s := range slice {
|
||||
if str == s {
|
||||
func sliceContains(s interface{}, v interface{}) bool {
|
||||
slice := reflect.ValueOf(s)
|
||||
for i := 0; i < slice.Len(); i++ {
|
||||
if slice.Index(i).Interface() == v {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
@ -200,6 +209,34 @@ func getInfo(name string, metric *CgroupMetric) {
|
|||
}
|
||||
}
|
||||
|
||||
func getProcInfo(pids []int, metric *CgroupMetric) {
|
||||
executables := make(map[string]float64)
|
||||
procFS, err := procfs.NewFS(*procRoot)
|
||||
if err != nil {
|
||||
log.Errorf("Unable to open procfs at %s", *procRoot)
|
||||
return
|
||||
}
|
||||
for _, pid := range pids {
|
||||
proc, err := procFS.Proc(pid)
|
||||
if err != nil {
|
||||
log.Errorf("Unable to read PID=%d", pid)
|
||||
continue
|
||||
}
|
||||
executable, err := proc.Executable()
|
||||
if err != nil {
|
||||
log.Errorf("Unable to get executable for PID=%d", pid)
|
||||
continue
|
||||
}
|
||||
if len(executable) > *collectProcMaxExec {
|
||||
log.Debugf("Executable will be truncated executable=%s len=%d pid=%d", executable, len(executable), pid)
|
||||
executable = executable[len(executable)-*collectProcMaxExec:]
|
||||
executable = fmt.Sprintf("...%s", executable)
|
||||
}
|
||||
executables[executable] += 1
|
||||
}
|
||||
metric.processExec = executables
|
||||
}
|
||||
|
||||
func getName(p cgroups.Process, path string) (string, error) {
|
||||
cpuacctPath := filepath.Join(*cgroupRoot, "cpuacct")
|
||||
name := strings.TrimPrefix(p.Path, cpuacctPath)
|
||||
|
@ -256,6 +293,8 @@ func NewExporter(paths []string) *Exporter {
|
|||
"Swap fail count", []string{"cgroup"}, nil),
|
||||
info: prometheus.NewDesc(prometheus.BuildFQName(namespace, "", "info"),
|
||||
"User slice information", []string{"cgroup", "username", "uid", "jobid"}, nil),
|
||||
processExec: prometheus.NewDesc(prometheus.BuildFQName(namespace, "", "process_exec_count"),
|
||||
"Count of instances of a given process", []string{"cgroup", "exec"}, nil),
|
||||
collectError: prometheus.NewDesc(prometheus.BuildFQName(namespace, "exporter", "collect_error"),
|
||||
"Indicates collection error, 0=no error, 1=error", []string{"cgroup"}, nil),
|
||||
}
|
||||
|
@ -281,16 +320,27 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
|
|||
continue
|
||||
}
|
||||
log.Debugf("Found %d processes", len(processes))
|
||||
pids := make(map[string][]int)
|
||||
for _, p := range processes {
|
||||
log.Debugf("Get name of process=%s pid=%d path=%s", p.Path, p.Pid, path)
|
||||
name, err := getName(p, path)
|
||||
if err != nil {
|
||||
log.Errorf("Error getting cgroup name for for process %s at path %s: %s", p.Path, path, err.Error())
|
||||
continue
|
||||
}
|
||||
if sliceContains(names, name) {
|
||||
continue
|
||||
if !sliceContains(names, name) {
|
||||
names = append(names, name)
|
||||
}
|
||||
names = append(names, name)
|
||||
if val, ok := pids[name]; ok {
|
||||
if !sliceContains(val, p.Pid) {
|
||||
val = append(val, p.Pid)
|
||||
}
|
||||
pids[name] = val
|
||||
} else {
|
||||
pids[name] = []int{p.Pid}
|
||||
}
|
||||
}
|
||||
for _, name := range names {
|
||||
metric := CgroupMetric{name: name}
|
||||
log.Debugf("Loading cgroup path %s", name)
|
||||
ctrl, err := cgroups.Load(subsystem, func(subsystem cgroups.Name) (string, error) {
|
||||
|
@ -319,6 +369,14 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
|
|||
metric.cpu_list = strings.Join(cpus, ",")
|
||||
}
|
||||
getInfo(name, &metric)
|
||||
if *collectProc {
|
||||
if val, ok := pids[name]; ok {
|
||||
log.Debugf("Get process info for pids=%v", val)
|
||||
getProcInfo(val, &metric)
|
||||
} else {
|
||||
log.Errorf("Unable to get PIDs for %s", name)
|
||||
}
|
||||
}
|
||||
metrics = append(metrics, metric)
|
||||
}
|
||||
}
|
||||
|
@ -340,6 +398,10 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
|
|||
ch <- e.memswUsed
|
||||
ch <- e.memswTotal
|
||||
ch <- e.memswFailCount
|
||||
ch <- e.info
|
||||
if *collectProc {
|
||||
ch <- e.processExec
|
||||
}
|
||||
}
|
||||
|
||||
func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
|
||||
|
@ -364,6 +426,11 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
|
|||
if m.userslice || m.job {
|
||||
ch <- prometheus.MustNewConstMetric(e.info, prometheus.GaugeValue, 1, m.name, m.username, m.uid, m.jobid)
|
||||
}
|
||||
if *collectProc {
|
||||
for exec, count := range m.processExec {
|
||||
ch <- prometheus.MustNewConstMetric(e.processExec, prometheus.GaugeValue, count, m.name, exec)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -38,10 +38,14 @@ func TestMain(m *testing.M) {
|
|||
}
|
||||
_, filename, _, _ := runtime.Caller(0)
|
||||
dir := filepath.Dir(filename)
|
||||
fixture := filepath.Join(dir, "test")
|
||||
fixture := filepath.Join(dir, "fixtures")
|
||||
cgroupRoot = &fixture
|
||||
procFixture := filepath.Join(fixture, "proc")
|
||||
procRoot = &procFixture
|
||||
varTrue := true
|
||||
disableExporterMetrics = &varTrue
|
||||
collectProc = &varTrue
|
||||
_ = log.Base().SetLevel("debug")
|
||||
go func() {
|
||||
http.Handle("/metrics", metricsHandler())
|
||||
log.Fatal(http.ListenAndServe(address, nil))
|
||||
|
@ -74,7 +78,33 @@ func TestParseCpuSet(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestGetProcInfo(t *testing.T) {
|
||||
metric := CgroupMetric{}
|
||||
getProcInfo([]int{95521, 95525}, &metric)
|
||||
if val, ok := metric.processExec["/bin/bash"]; !ok {
|
||||
t.Errorf("Process /bin/bash not in metrics")
|
||||
return
|
||||
} else {
|
||||
if val != 2 {
|
||||
t.Errorf("Expected 2 /bin/bash processes, got %v", val)
|
||||
}
|
||||
}
|
||||
varLen := 4
|
||||
collectProcMaxExec = &varLen
|
||||
getProcInfo([]int{95521, 95525}, &metric)
|
||||
if val, ok := metric.processExec["...bash"]; !ok {
|
||||
t.Errorf("Process /bin/bash not in metrics, found: %v", metric.processExec)
|
||||
return
|
||||
} else {
|
||||
if val != 2 {
|
||||
t.Errorf("Expected 2 .../bash processes, got %v", val)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollectUserSlice(t *testing.T) {
|
||||
varFalse := false
|
||||
collectProc = &varFalse
|
||||
exporter := NewExporter([]string{"/user.slice"})
|
||||
metrics, err := exporter.collect()
|
||||
if err != nil {
|
||||
|
@ -127,15 +157,18 @@ func TestCollectUserSlice(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestCollectSLURM(t *testing.T) {
|
||||
_ = log.Base().SetLevel("debug")
|
||||
varTrue := true
|
||||
collectProc = &varTrue
|
||||
varLen := 100
|
||||
collectProcMaxExec = &varLen
|
||||
exporter := NewExporter([]string{"/slurm"})
|
||||
metrics, err := exporter.collect()
|
||||
if err != nil {
|
||||
t.Errorf("Unexpected error: %s", err.Error())
|
||||
return
|
||||
}
|
||||
if val := len(metrics); val != 1 {
|
||||
t.Errorf("Unexpected number of metrics, got %d expected 1", val)
|
||||
if val := len(metrics); val != 2 {
|
||||
t.Errorf("Unexpected number of metrics, got %d expected 2", val)
|
||||
return
|
||||
}
|
||||
if val := metrics[0].cpuUser; val != 0 {
|
||||
|
@ -180,10 +213,18 @@ func TestCollectSLURM(t *testing.T) {
|
|||
if val := metrics[0].jobid; val != "10" {
|
||||
t.Errorf("Unexpected value for jobid, got %v", val)
|
||||
}
|
||||
if val, ok := metrics[0].processExec["/bin/bash"]; !ok {
|
||||
t.Errorf("processExec does not contain /bin/bash")
|
||||
} else {
|
||||
if val != 2 {
|
||||
t.Errorf("Unexpected 2 values for processExec /bin/bash, got %v", val)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollectTorque(t *testing.T) {
|
||||
_ = log.Base().SetLevel("debug")
|
||||
varFalse := false
|
||||
collectProc = &varFalse
|
||||
exporter := NewExporter([]string{"/torque"})
|
||||
metrics, err := exporter.collect()
|
||||
if err != nil {
|
||||
|
@ -239,7 +280,6 @@ func TestCollectTorque(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestMetricsHandler(t *testing.T) {
|
||||
_ = log.Base().SetLevel("debug")
|
||||
body, err := queryExporter()
|
||||
if err != nil {
|
||||
t.Fatalf("Unexpected error GET /metrics: %s", err.Error())
|
||||
|
|
File diff suppressed because it is too large
Load Diff
2
go.mod
2
go.mod
|
@ -7,7 +7,7 @@ require (
|
|||
github.com/coreos/go-systemd/v22 v22.1.0 // indirect
|
||||
github.com/prometheus/client_golang v1.7.1
|
||||
github.com/prometheus/common v0.14.0
|
||||
github.com/prometheus/procfs v0.2.0 // indirect
|
||||
github.com/prometheus/procfs v0.2.0
|
||||
github.com/sirupsen/logrus v1.7.0 // indirect
|
||||
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f // indirect
|
||||
google.golang.org/protobuf v1.25.0 // indirect
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
0
|
|
@ -1 +0,0 @@
|
|||
100000
|
|
@ -1 +0,0 @@
|
|||
-1
|
|
@ -1 +0,0 @@
|
|||
1000000
|
|
@ -1 +0,0 @@
|
|||
0
|
|
@ -1 +0,0 @@
|
|||
1024
|
|
@ -1,3 +0,0 @@
|
|||
nr_periods 0
|
||||
nr_throttled 0
|
||||
throttled_time 0
|
|
@ -1,2 +0,0 @@
|
|||
user 2
|
||||
system 4
|
|
@ -1 +0,0 @@
|
|||
65297599
|
|
@ -1 +0,0 @@
|
|||
27864801 33969662 1099301 2363835
|
|
@ -1 +0,0 @@
|
|||
0
|
|
@ -1 +0,0 @@
|
|||
0
|
|
@ -1 +0,0 @@
|
|||
100000
|
|
@ -1 +0,0 @@
|
|||
-1
|
|
@ -1 +0,0 @@
|
|||
1000000
|
|
@ -1 +0,0 @@
|
|||
0
|
|
@ -1 +0,0 @@
|
|||
1024
|
|
@ -1,3 +0,0 @@
|
|||
nr_periods 0
|
||||
nr_throttled 0
|
||||
throttled_time 0
|
|
@ -1,2 +0,0 @@
|
|||
user 0
|
||||
system 0
|
|
@ -1 +0,0 @@
|
|||
7710215
|
|
@ -1 +0,0 @@
|
|||
3710825 3999390 0 0
|
|
@ -1 +0,0 @@
|
|||
0
|
|
@ -1 +0,0 @@
|
|||
100000
|
|
@ -1 +0,0 @@
|
|||
-1
|
|
@ -1 +0,0 @@
|
|||
1000000
|
|
@ -1 +0,0 @@
|
|||
0
|
|
@ -1 +0,0 @@
|
|||
1024
|
|
@ -1,3 +0,0 @@
|
|||
nr_periods 0
|
||||
nr_throttled 0
|
||||
throttled_time 0
|
|
@ -1,2 +0,0 @@
|
|||
user 0
|
||||
system 0
|
|
@ -1 +0,0 @@
|
|||
7710215
|
|
@ -1 +0,0 @@
|
|||
3710825 3999390 0 0
|
|
@ -1 +0,0 @@
|
|||
0
|
|
@ -1 +0,0 @@
|
|||
100000
|
|
@ -1 +0,0 @@
|
|||
-1
|
|
@ -1 +0,0 @@
|
|||
1000000
|
|
@ -1 +0,0 @@
|
|||
1024
|
|
@ -1,3 +0,0 @@
|
|||
nr_periods 0
|
||||
nr_throttled 0
|
||||
throttled_time 0
|
|
@ -1,2 +0,0 @@
|
|||
user 0
|
||||
system 0
|
|
@ -1 +0,0 @@
|
|||
7710215
|
|
@ -1 +0,0 @@
|
|||
3710825 3999390 0 0
|
|
@ -1 +0,0 @@
|
|||
0
|
|
@ -1,2 +0,0 @@
|
|||
95521
|
||||
95525
|
|
@ -1 +0,0 @@
|
|||
100000
|
|
@ -1 +0,0 @@
|
|||
1000000
|
|
@ -1 +0,0 @@
|
|||
1024
|
|
@ -1,3 +0,0 @@
|
|||
nr_periods 0
|
||||
nr_throttled 0
|
||||
throttled_time 0
|
|
@ -1,2 +0,0 @@
|
|||
user 0
|
||||
system 0
|
|
@ -1 +0,0 @@
|
|||
7710215
|
|
@ -1 +0,0 @@
|
|||
3710825 3999390 0 0
|
|
@ -1,2 +0,0 @@
|
|||
95521
|
||||
95525
|
|
@ -1 +0,0 @@
|
|||
0
|
|
@ -1 +0,0 @@
|
|||
0
|
|
@ -1,7 +0,0 @@
|
|||
280687
|
||||
280755
|
||||
280942
|
||||
280943
|
||||
280944
|
||||
280948
|
||||
280949
|
|
@ -1 +0,0 @@
|
|||
100000
|
|
@ -1 +0,0 @@
|
|||
-1
|
|
@ -1 +0,0 @@
|
|||
1000000
|
|
@ -1 +0,0 @@
|
|||
0
|
|
@ -1 +0,0 @@
|
|||
1024
|
|
@ -1,3 +0,0 @@
|
|||
nr_periods 0
|
||||
nr_throttled 0
|
||||
throttled_time 0
|
|
@ -1,2 +0,0 @@
|
|||
user 15314631
|
||||
system 26077
|
|
@ -1 +0,0 @@
|
|||
152995785583781
|
|
@ -1 +0,0 @@
|
|||
3741446525236 3763871961512 3800004207303 3795925066130 3802666529070 3800304192183 3802555292152 3804795218295 4160254191665 3803812739329 3805914555196 3803924757760 3804157062388 3804434454497 3799628022547 3805746895327 3801777709215 3807932330725 3804793579197 4281745115690 3804595709989 3805045301831 3803044002744 3805835045559 3804952857992 3797882249643 3805434134051 3805050988133 3805773462792 3805422181921 3926304892533 3793095726993 3804059814194 3805599780955 3804553512591 3805412437282 3805505596539 3803162401816 3804398646859 3804887475895
|
|
@ -1 +0,0 @@
|
|||
0
|
|
@ -1,47 +0,0 @@
|
|||
280687
|
||||
280755
|
||||
280942
|
||||
280943
|
||||
280944
|
||||
280948
|
||||
280949
|
||||
281004
|
||||
281005
|
||||
281006
|
||||
281007
|
||||
281008
|
||||
281009
|
||||
281010
|
||||
281011
|
||||
281012
|
||||
281013
|
||||
281014
|
||||
281015
|
||||
281016
|
||||
281017
|
||||
281018
|
||||
281019
|
||||
281020
|
||||
281021
|
||||
281022
|
||||
281023
|
||||
281024
|
||||
281025
|
||||
281026
|
||||
281027
|
||||
281028
|
||||
281029
|
||||
281030
|
||||
281031
|
||||
281032
|
||||
281033
|
||||
281034
|
||||
281035
|
||||
281036
|
||||
281037
|
||||
281038
|
||||
281039
|
||||
281040
|
||||
281041
|
||||
281042
|
||||
281043
|
|
@ -1 +0,0 @@
|
|||
0
|
|
@ -1 +0,0 @@
|
|||
100000
|
|
@ -1 +0,0 @@
|
|||
-1
|
|
@ -1 +0,0 @@
|
|||
1000000
|
|
@ -1 +0,0 @@
|
|||
0
|
|
@ -1 +0,0 @@
|
|||
1024
|
|
@ -1,3 +0,0 @@
|
|||
nr_periods 0
|
||||
nr_throttled 0
|
||||
throttled_time 0
|
|
@ -1,2 +0,0 @@
|
|||
user 30648565
|
||||
system 51983
|
|
@ -1 +0,0 @@
|
|||
306181522324683
|
|
@ -1 +0,0 @@
|
|||
7507822956061 7528656764703 7592506792141 7602715606951 7599085684977 7609294979494 7604660323578 7618618653931 7959815191466 7616203824147 7610638786639 7616682686571 7608820260166 7619712348886 7598884212794 7621924199684 7604738465824 7623627374038 7608781761607 8574847626011 7608343551357 7617770917092 7605183264978 7621886223773 7607865073606 7613846648300 7605125778530 7624608985662 7607385714373 7620503587308 8199008007781 7609171662251 7605928891278 7621536734593 7606951115289 7620745765085 7609675504308 7620580738305 7607492605427 7619806854280
|
|
@ -1 +0,0 @@
|
|||
0
|
|
@ -1 +0,0 @@
|
|||
0
|
|
@ -1,4 +0,0 @@
|
|||
99062
|
||||
99080
|
||||
99081
|
||||
100190
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue