Merge pull request #9 from treydock/processes

Add cgroup_process_exec_count metric
This commit is contained in:
treydock 2020-10-02 14:49:57 -04:00 committed by GitHub
commit ea267dd2fd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
438 changed files with 5710 additions and 875 deletions

1
.gitignore vendored
View File

@ -1,3 +1,4 @@
/fixtures/
/.tarballs
/.build
/cgroup_exporter

View File

@ -8,3 +8,15 @@ DOCKER_IMAGE_NAME ?= cgroup_exporter
coverage:
go test -race -coverpkg=./... -coverprofile=coverage.txt -covermode=atomic ./...
%/.unpacked: %.ttar
@echo ">> extracting fixtures"
./ttar -C $(dir $*) -x -f $*.ttar
touch $@
update_fixtures:
rm -vf fixtures/.unpacked
./ttar -c -f fixtures.ttar fixtures/
.PHONY: test
test: fixtures/.unpacked common-test

View File

@ -41,6 +41,14 @@ Or
go get github.com/treydock/cgroup_exporter
```
## Process metrics
If you wish to collect process information for a cgroup, pass the `--collect.proc` flag. If the exporter is not running as root, it must be granted the following capability so that the user running it can read everything under procfs:
```
setcap cap_sys_ptrace=eip /usr/bin/cgroup_exporter
```
## Metrics
Example of metrics exposed by this exporter when looking at `/user.slice` paths:

View File

@ -20,6 +20,7 @@ import (
"os"
"os/user"
"path/filepath"
"reflect"
"regexp"
"strconv"
"strings"
@ -29,6 +30,7 @@ import (
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/prometheus/common/log"
"github.com/prometheus/common/version"
"github.com/prometheus/procfs"
"gopkg.in/alecthomas/kingpin.v2"
)
@ -38,10 +40,14 @@ const (
var (
defCgroupRoot = "/sys/fs/cgroup"
defProcRoot = "/proc"
configPaths = kingpin.Flag("config.paths", "Comma separated list of cgroup paths to check, eg /user.slice,/system.slice,/slurm").Required().String()
listenAddress = kingpin.Flag("web.listen-address", "Address to listen on for web interface and telemetry.").Default(":9306").String()
disableExporterMetrics = kingpin.Flag("web.disable-exporter-metrics", "Exclude metrics about the exporter (promhttp_*, process_*, go_*)").Default("false").Bool()
cgroupRoot = kingpin.Flag("path.cgroup.root", "Root path to cgroup fs").Default(defCgroupRoot).String()
procRoot = kingpin.Flag("path.proc.root", "Root path to proc fs").Default(defProcRoot).String()
collectProc = kingpin.Flag("collect.proc", "Boolean that sets if to collect proc information").Default("false").Bool()
collectProcMaxExec = kingpin.Flag("collect.proc.max-exec", "Max length of process executable to record").Default("100").Int()
)
type CgroupMetric struct {
@ -64,6 +70,7 @@ type CgroupMetric struct {
uid string
username string
jobid string
processExec map[string]float64
err bool
}
@ -84,6 +91,7 @@ type Exporter struct {
memswTotal *prometheus.Desc
memswFailCount *prometheus.Desc
info *prometheus.Desc
processExec *prometheus.Desc
}
func fileExists(filename string) bool {
@ -94,9 +102,10 @@ func fileExists(filename string) bool {
return !info.IsDir()
}
func sliceContains(slice []string, str string) bool {
for _, s := range slice {
if str == s {
func sliceContains(s interface{}, v interface{}) bool {
slice := reflect.ValueOf(s)
for i := 0; i < slice.Len(); i++ {
if slice.Index(i).Interface() == v {
return true
}
}
@ -200,6 +209,34 @@ func getInfo(name string, metric *CgroupMetric) {
}
}
// getProcInfo counts the processes listed in pids by executable path and
// stores the resulting map (executable -> count) in metric.processExec.
//
// Each PID is looked up in the procfs rooted at *procRoot. A PID whose process
// entry or executable cannot be read is logged and skipped. If procfs itself
// cannot be opened, the error is logged and metric.processExec is left
// untouched.
//
// Executables longer than *collectProcMaxExec bytes are shortened to their
// trailing bytes and prefixed with "...".
func getProcInfo(pids []int, metric *CgroupMetric) {
	procFS, err := procfs.NewFS(*procRoot)
	if err != nil {
		log.Errorf("Unable to open procfs at %s: %s", *procRoot, err.Error())
		return
	}

	// A negative limit would place the slice start beyond the end of the
	// string and panic, so treat it the same as zero.
	maxExec := *collectProcMaxExec
	if maxExec < 0 {
		maxExec = 0
	}

	executables := make(map[string]float64)
	for _, pid := range pids {
		proc, err := procFS.Proc(pid)
		if err != nil {
			log.Errorf("Unable to read PID=%d: %s", pid, err.Error())
			continue
		}
		executable, err := proc.Executable()
		if err != nil {
			log.Errorf("Unable to get executable for PID=%d: %s", pid, err.Error())
			continue
		}
		if len(executable) > maxExec {
			log.Debugf("Executable will be truncated executable=%s len=%d pid=%d", executable, len(executable), pid)
			cut := len(executable) - maxExec
			// Never start the kept suffix on a UTF-8 continuation byte
			// (10xxxxxx): splitting a multi-byte character would turn a valid
			// path into invalid UTF-8, and the value is later used as a
			// Prometheus label value.
			for cut < len(executable) && executable[cut]&0xC0 == 0x80 {
				cut++
			}
			executable = fmt.Sprintf("...%s", executable[cut:])
		}
		executables[executable]++
	}
	metric.processExec = executables
}
func getName(p cgroups.Process, path string) (string, error) {
cpuacctPath := filepath.Join(*cgroupRoot, "cpuacct")
name := strings.TrimPrefix(p.Path, cpuacctPath)
@ -256,6 +293,8 @@ func NewExporter(paths []string) *Exporter {
"Swap fail count", []string{"cgroup"}, nil),
info: prometheus.NewDesc(prometheus.BuildFQName(namespace, "", "info"),
"User slice information", []string{"cgroup", "username", "uid", "jobid"}, nil),
processExec: prometheus.NewDesc(prometheus.BuildFQName(namespace, "", "process_exec_count"),
"Count of instances of a given process", []string{"cgroup", "exec"}, nil),
collectError: prometheus.NewDesc(prometheus.BuildFQName(namespace, "exporter", "collect_error"),
"Indicates collection error, 0=no error, 1=error", []string{"cgroup"}, nil),
}
@ -281,16 +320,27 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
continue
}
log.Debugf("Found %d processes", len(processes))
pids := make(map[string][]int)
for _, p := range processes {
log.Debugf("Get name of process=%s pid=%d path=%s", p.Path, p.Pid, path)
name, err := getName(p, path)
if err != nil {
log.Errorf("Error getting cgroup name for for process %s at path %s: %s", p.Path, path, err.Error())
continue
}
if sliceContains(names, name) {
continue
if !sliceContains(names, name) {
names = append(names, name)
}
names = append(names, name)
if val, ok := pids[name]; ok {
if !sliceContains(val, p.Pid) {
val = append(val, p.Pid)
}
pids[name] = val
} else {
pids[name] = []int{p.Pid}
}
}
for _, name := range names {
metric := CgroupMetric{name: name}
log.Debugf("Loading cgroup path %s", name)
ctrl, err := cgroups.Load(subsystem, func(subsystem cgroups.Name) (string, error) {
@ -319,6 +369,14 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
metric.cpu_list = strings.Join(cpus, ",")
}
getInfo(name, &metric)
if *collectProc {
if val, ok := pids[name]; ok {
log.Debugf("Get process info for pids=%v", val)
getProcInfo(val, &metric)
} else {
log.Errorf("Unable to get PIDs for %s", name)
}
}
metrics = append(metrics, metric)
}
}
@ -340,6 +398,10 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
ch <- e.memswUsed
ch <- e.memswTotal
ch <- e.memswFailCount
ch <- e.info
if *collectProc {
ch <- e.processExec
}
}
func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
@ -364,6 +426,11 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
if m.userslice || m.job {
ch <- prometheus.MustNewConstMetric(e.info, prometheus.GaugeValue, 1, m.name, m.username, m.uid, m.jobid)
}
if *collectProc {
for exec, count := range m.processExec {
ch <- prometheus.MustNewConstMetric(e.processExec, prometheus.GaugeValue, count, m.name, exec)
}
}
}
}

View File

@ -38,10 +38,14 @@ func TestMain(m *testing.M) {
}
_, filename, _, _ := runtime.Caller(0)
dir := filepath.Dir(filename)
fixture := filepath.Join(dir, "test")
fixture := filepath.Join(dir, "fixtures")
cgroupRoot = &fixture
procFixture := filepath.Join(fixture, "proc")
procRoot = &procFixture
varTrue := true
disableExporterMetrics = &varTrue
collectProc = &varTrue
_ = log.Base().SetLevel("debug")
go func() {
http.Handle("/metrics", metricsHandler())
log.Fatal(http.ListenAndServe(address, nil))
@ -74,7 +78,33 @@ func TestParseCpuSet(t *testing.T) {
}
}
func TestGetProcInfo(t *testing.T) {
metric := CgroupMetric{}
getProcInfo([]int{95521, 95525}, &metric)
if val, ok := metric.processExec["/bin/bash"]; !ok {
t.Errorf("Process /bin/bash not in metrics")
return
} else {
if val != 2 {
t.Errorf("Expected 2 /bin/bash processes, got %v", val)
}
}
varLen := 4
collectProcMaxExec = &varLen
getProcInfo([]int{95521, 95525}, &metric)
if val, ok := metric.processExec["...bash"]; !ok {
t.Errorf("Process /bin/bash not in metrics, found: %v", metric.processExec)
return
} else {
if val != 2 {
t.Errorf("Expected 2 .../bash processes, got %v", val)
}
}
}
func TestCollectUserSlice(t *testing.T) {
varFalse := false
collectProc = &varFalse
exporter := NewExporter([]string{"/user.slice"})
metrics, err := exporter.collect()
if err != nil {
@ -127,15 +157,18 @@ func TestCollectUserSlice(t *testing.T) {
}
func TestCollectSLURM(t *testing.T) {
_ = log.Base().SetLevel("debug")
varTrue := true
collectProc = &varTrue
varLen := 100
collectProcMaxExec = &varLen
exporter := NewExporter([]string{"/slurm"})
metrics, err := exporter.collect()
if err != nil {
t.Errorf("Unexpected error: %s", err.Error())
return
}
if val := len(metrics); val != 1 {
t.Errorf("Unexpected number of metrics, got %d expected 1", val)
if val := len(metrics); val != 2 {
t.Errorf("Unexpected number of metrics, got %d expected 2", val)
return
}
if val := metrics[0].cpuUser; val != 0 {
@ -180,10 +213,18 @@ func TestCollectSLURM(t *testing.T) {
if val := metrics[0].jobid; val != "10" {
t.Errorf("Unexpected value for jobid, got %v", val)
}
if val, ok := metrics[0].processExec["/bin/bash"]; !ok {
t.Errorf("processExec does not contain /bin/bash")
} else {
if val != 2 {
t.Errorf("Unexpected 2 values for processExec /bin/bash, got %v", val)
}
}
}
func TestCollectTorque(t *testing.T) {
_ = log.Base().SetLevel("debug")
varFalse := false
collectProc = &varFalse
exporter := NewExporter([]string{"/torque"})
metrics, err := exporter.collect()
if err != nil {
@ -239,7 +280,6 @@ func TestCollectTorque(t *testing.T) {
}
func TestMetricsHandler(t *testing.T) {
_ = log.Base().SetLevel("debug")
body, err := queryExporter()
if err != nil {
t.Fatalf("Unexpected error GET /metrics: %s", err.Error())

5156
fixtures.ttar Normal file

File diff suppressed because it is too large Load Diff

2
go.mod
View File

@ -7,7 +7,7 @@ require (
github.com/coreos/go-systemd/v22 v22.1.0 // indirect
github.com/prometheus/client_golang v1.7.1
github.com/prometheus/common v0.14.0
github.com/prometheus/procfs v0.2.0 // indirect
github.com/prometheus/procfs v0.2.0
github.com/sirupsen/logrus v1.7.0 // indirect
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f // indirect
google.golang.org/protobuf v1.25.0 // indirect

View File

@ -1 +0,0 @@
0

View File

@ -1 +0,0 @@
100000

View File

@ -1 +0,0 @@
-1

View File

@ -1 +0,0 @@
1000000

View File

@ -1 +0,0 @@
0

View File

@ -1 +0,0 @@
1024

View File

@ -1,3 +0,0 @@
nr_periods 0
nr_throttled 0
throttled_time 0

View File

@ -1,2 +0,0 @@
user 2
system 4

View File

@ -1 +0,0 @@
65297599

View File

@ -1 +0,0 @@
27864801 33969662 1099301 2363835

View File

@ -1 +0,0 @@
0

View File

@ -1 +0,0 @@
100000

View File

@ -1 +0,0 @@
-1

View File

@ -1 +0,0 @@
1000000

View File

@ -1 +0,0 @@
1024

View File

@ -1,3 +0,0 @@
nr_periods 0
nr_throttled 0
throttled_time 0

View File

@ -1,2 +0,0 @@
user 0
system 0

View File

@ -1 +0,0 @@
7710215

View File

@ -1 +0,0 @@
3710825 3999390 0 0

View File

@ -1 +0,0 @@
1000000

View File

@ -1 +0,0 @@
1024

View File

@ -1,3 +0,0 @@
nr_periods 0
nr_throttled 0
throttled_time 0

View File

@ -1,2 +0,0 @@
user 0
system 0

View File

@ -1 +0,0 @@
7710215

View File

@ -1 +0,0 @@
3710825 3999390 0 0

View File

@ -1,3 +0,0 @@
nr_periods 0
nr_throttled 0
throttled_time 0

View File

@ -1,2 +0,0 @@
user 0
system 0

View File

@ -1 +0,0 @@
3710825 3999390 0 0

View File

@ -1,3 +0,0 @@
nr_periods 0
nr_throttled 0
throttled_time 0

View File

@ -1,2 +0,0 @@
user 0
system 0

View File

@ -1,2 +0,0 @@
95521
95525

View File

@ -1,7 +0,0 @@
280687
280755
280942
280943
280944
280948
280949

View File

@ -1 +0,0 @@
100000

View File

@ -1 +0,0 @@
1000000

View File

@ -1 +0,0 @@
1024

View File

@ -1,3 +0,0 @@
nr_periods 0
nr_throttled 0
throttled_time 0

View File

@ -1,2 +0,0 @@
user 15314631
system 26077

View File

@ -1 +0,0 @@
152995785583781

View File

@ -1 +0,0 @@
3741446525236 3763871961512 3800004207303 3795925066130 3802666529070 3800304192183 3802555292152 3804795218295 4160254191665 3803812739329 3805914555196 3803924757760 3804157062388 3804434454497 3799628022547 3805746895327 3801777709215 3807932330725 3804793579197 4281745115690 3804595709989 3805045301831 3803044002744 3805835045559 3804952857992 3797882249643 3805434134051 3805050988133 3805773462792 3805422181921 3926304892533 3793095726993 3804059814194 3805599780955 3804553512591 3805412437282 3805505596539 3803162401816 3804398646859 3804887475895

View File

@ -1,47 +0,0 @@
280687
280755
280942
280943
280944
280948
280949
281004
281005
281006
281007
281008
281009
281010
281011
281012
281013
281014
281015
281016
281017
281018
281019
281020
281021
281022
281023
281024
281025
281026
281027
281028
281029
281030
281031
281032
281033
281034
281035
281036
281037
281038
281039
281040
281041
281042
281043

View File

@ -1 +0,0 @@
0

View File

@ -1 +0,0 @@
100000

View File

@ -1 +0,0 @@
-1

View File

@ -1 +0,0 @@
1000000

View File

@ -1 +0,0 @@
0

View File

@ -1 +0,0 @@
1024

View File

@ -1,3 +0,0 @@
nr_periods 0
nr_throttled 0
throttled_time 0

View File

@ -1,2 +0,0 @@
user 30648565
system 51983

View File

@ -1 +0,0 @@
306181522324683

View File

@ -1 +0,0 @@
7507822956061 7528656764703 7592506792141 7602715606951 7599085684977 7609294979494 7604660323578 7618618653931 7959815191466 7616203824147 7610638786639 7616682686571 7608820260166 7619712348886 7598884212794 7621924199684 7604738465824 7623627374038 7608781761607 8574847626011 7608343551357 7617770917092 7605183264978 7621886223773 7607865073606 7613846648300 7605125778530 7624608985662 7607385714373 7620503587308 8199008007781 7609171662251 7605928891278 7621536734593 7606951115289 7620745765085 7609675504308 7620580738305 7607492605427 7619806854280

View File

@ -1 +0,0 @@
0

View File

@ -1,4 +0,0 @@
99062
99080
99081
100190

Some files were not shown because too many files have changed in this diff Show More