Merge pull request #7 from treydock/cpulist

Add cgroup_cpu_info metric
This commit is contained in:
treydock 2020-10-01 11:05:40 -04:00 committed by GitHub
commit cd0c8008d8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 44 additions and 23 deletions

View File

@ -50,6 +50,7 @@ cgroup_cpu_system_seconds{cgroup="/user.slice/user-20821.slice"} 1.96
cgroup_cpu_total_seconds{cgroup="/user.slice/user-20821.slice"} 3.817500568 cgroup_cpu_total_seconds{cgroup="/user.slice/user-20821.slice"} 3.817500568
cgroup_cpu_user_seconds{cgroup="/user.slice/user-20821.slice"} 1.61 cgroup_cpu_user_seconds{cgroup="/user.slice/user-20821.slice"} 1.61
cgroup_cpus{cgroup="/user.slice/user-20821.slice"} 0 cgroup_cpus{cgroup="/user.slice/user-20821.slice"} 0
cgroup_cpu_info{cgroup="/user.slice/user-20821.slice",cpus=""} 1
cgroup_info{cgroup="/user.slice/user-20821.slice",uid="20821",username="tdockendorf",jobid=""} 1 cgroup_info{cgroup="/user.slice/user-20821.slice",uid="20821",username="tdockendorf",jobid=""} 1
cgroup_memory_cache_bytes{cgroup="/user.slice/user-20821.slice"} 2.322432e+06 cgroup_memory_cache_bytes{cgroup="/user.slice/user-20821.slice"} 2.322432e+06
cgroup_memory_fail_count{cgroup="/user.slice/user-20821.slice"} 0 cgroup_memory_fail_count{cgroup="/user.slice/user-20821.slice"} 0
@ -68,6 +69,7 @@ cgroup_cpu_system_seconds{cgroup="/slurm/uid_20821/job_12"} 0
cgroup_cpu_total_seconds{cgroup="/slurm/uid_20821/job_12"} 0.007840451 cgroup_cpu_total_seconds{cgroup="/slurm/uid_20821/job_12"} 0.007840451
cgroup_cpu_user_seconds{cgroup="/slurm/uid_20821/job_12"} 0 cgroup_cpu_user_seconds{cgroup="/slurm/uid_20821/job_12"} 0
cgroup_cpus{cgroup="/slurm/uid_20821/job_12"} 2 cgroup_cpus{cgroup="/slurm/uid_20821/job_12"} 2
cgroup_cpu_info{cgroup="/slurm/uid_20821/job_12",cpus="0,1"} 1
cgroup_info{cgroup="/slurm/uid_20821/job_12",jobid="12",uid="20821",username="tdockendorf"} 1 cgroup_info{cgroup="/slurm/uid_20821/job_12",jobid="12",uid="20821",username="tdockendorf"} 1
cgroup_memory_cache_bytes{cgroup="/slurm/uid_20821/job_12"} 4.096e+03 cgroup_memory_cache_bytes{cgroup="/slurm/uid_20821/job_12"} 4.096e+03
cgroup_memory_fail_count{cgroup="/slurm/uid_20821/job_12"} 0 cgroup_memory_fail_count{cgroup="/slurm/uid_20821/job_12"} 0
@ -85,7 +87,8 @@ Example of metrics exposed by this exporter when looking at `/torque` paths:
cgroup_cpu_system_seconds{cgroup="/torque/1182958.batch.example.com"} 26.35 cgroup_cpu_system_seconds{cgroup="/torque/1182958.batch.example.com"} 26.35
cgroup_cpu_total_seconds{cgroup="/torque/1182958.batch.example.com"} 939.568245515 cgroup_cpu_total_seconds{cgroup="/torque/1182958.batch.example.com"} 939.568245515
cgroup_cpu_user_seconds{cgroup="/torque/1182958.batch.example.com"} 915.61 cgroup_cpu_user_seconds{cgroup="/torque/1182958.batch.example.com"} 915.61
cgroup_cpus{cgroup="/torque/1182958.batch.example.com"} 40 cgroup_cpus{cgroup="/torque/1182958.batch.example.com"} 8
cgroup_cpu_info{cgroup="/torque/1182958.batch.example.com",cpus="0,1,2,3,4,5,6,7"} 1
cgroup_info{cgroup="/torque/1182958.batch.example.com",jobid="1182958",uid="",username=""} 1 cgroup_info{cgroup="/torque/1182958.batch.example.com",jobid="1182958",uid="",username=""} 1
cgroup_memory_cache_bytes{cgroup="/torque/1182958.batch.example.com"} 1.09678592e+08 cgroup_memory_cache_bytes{cgroup="/torque/1182958.batch.example.com"} 1.09678592e+08
cgroup_memory_fail_count{cgroup="/torque/1182958.batch.example.com"} 0 cgroup_memory_fail_count{cgroup="/torque/1182958.batch.example.com"} 0

View File

@ -50,6 +50,7 @@ type CgroupMetric struct {
cpuSystem float64 cpuSystem float64
cpuTotal float64 cpuTotal float64
cpus int cpus int
cpu_list string
memoryRSS float64 memoryRSS float64
memoryCache float64 memoryCache float64
memoryUsed float64 memoryUsed float64
@ -73,6 +74,7 @@ type Exporter struct {
cpuSystem *prometheus.Desc cpuSystem *prometheus.Desc
cpuTotal *prometheus.Desc cpuTotal *prometheus.Desc
cpus *prometheus.Desc cpus *prometheus.Desc
cpu_info *prometheus.Desc
memoryRSS *prometheus.Desc memoryRSS *prometheus.Desc
memoryCache *prometheus.Desc memoryCache *prometheus.Desc
memoryUsed *prometheus.Desc memoryUsed *prometheus.Desc
@ -109,47 +111,54 @@ func subsystem() ([]cgroups.Subsystem, error) {
return s, nil return s, nil
} }
func getCPUs(name string) (int, error) { func getCPUs(name string) ([]string, error) {
cpusPath := fmt.Sprintf("%s/cpuset%s/cpuset.cpus", *cgroupRoot, name) cpusPath := fmt.Sprintf("%s/cpuset%s/cpuset.cpus", *cgroupRoot, name)
if !fileExists(cpusPath) { if !fileExists(cpusPath) {
return 0, nil return nil, nil
} }
cpusData, err := ioutil.ReadFile(cpusPath) cpusData, err := ioutil.ReadFile(cpusPath)
if err != nil { if err != nil {
log.Errorf("Error reading %s: %s", cpusPath, err.Error()) log.Errorf("Error reading %s: %s", cpusPath, err.Error())
return 0, err return nil, err
} }
cpus, err := parseCpuSet(strings.TrimSuffix(string(cpusData), "\n")) cpus, err := parseCpuSet(strings.TrimSuffix(string(cpusData), "\n"))
if err != nil { if err != nil {
log.Errorf("Error parsing cpu set %s", err.Error()) log.Errorf("Error parsing cpu set %s", err.Error())
return 0, err return nil, err
} }
return cpus, nil return cpus, nil
} }
// parseCpuSet expands a comma-separated cpuset list such as "0-2,7" into the
// individual CPU IDs it names, returned as decimal strings in the order they
// appear in the input ("0", "1", "2", "7").
//
// Each element must be either a single integer ("7") or an inclusive
// "start-end" pair of integers ("0-2"). An empty cpuset yields a nil slice
// and a nil error. An element that is not in one of those two forms, or whose
// bounds are not integers, yields a nil slice and a non-nil error. A pair
// whose start is greater than its end contributes no CPUs.
func parseCpuSet(cpuset string) ([]string, error) {
	if cpuset == "" {
		return nil, nil
	}
	var cpus []string
	for _, r := range strings.Split(cpuset, ",") {
		boundaries := strings.Split(r, "-")
		// strings.Split always yields at least one piece, so only the
		// "too many dashes" case needs rejecting. Without this check such an
		// element would be expanded using bounds left over from the previous
		// element and silently duplicate CPUs.
		if len(boundaries) > 2 {
			return nil, fmt.Errorf("invalid cpu range %q", r)
		}
		// Bounds are scoped to the element so nothing leaks between
		// iterations.
		start, err := strconv.Atoi(boundaries[0])
		if err != nil {
			return nil, err
		}
		end := start
		if len(boundaries) == 2 {
			end, err = strconv.Atoi(boundaries[1])
			if err != nil {
				return nil, err
			}
		}
		for e := start; e <= end; e++ {
			cpus = append(cpus, strconv.Itoa(e))
		}
	}
	return cpus, nil
}
@ -227,6 +236,8 @@ func NewExporter(paths []string) *Exporter {
"Cumalitive CPU total seconds for cgroup", []string{"cgroup"}, nil), "Cumalitive CPU total seconds for cgroup", []string{"cgroup"}, nil),
cpus: prometheus.NewDesc(prometheus.BuildFQName(namespace, "", "cpus"), cpus: prometheus.NewDesc(prometheus.BuildFQName(namespace, "", "cpus"),
"Number of CPUs in the cgroup", []string{"cgroup"}, nil), "Number of CPUs in the cgroup", []string{"cgroup"}, nil),
cpu_info: prometheus.NewDesc(prometheus.BuildFQName(namespace, "", "cpu_info"),
"Information about the cgroup CPUs", []string{"cgroup", "cpus"}, nil),
memoryRSS: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "rss_bytes"), memoryRSS: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "rss_bytes"),
"Memory RSS used in bytes", []string{"cgroup"}, nil), "Memory RSS used in bytes", []string{"cgroup"}, nil),
memoryCache: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "cache_bytes"), memoryCache: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "cache_bytes"),
@ -304,7 +315,8 @@ func (e *Exporter) collect() ([]CgroupMetric, error) {
metric.memswTotal = float64(stats.Memory.Swap.Limit) metric.memswTotal = float64(stats.Memory.Swap.Limit)
metric.memswFailCount = float64(stats.Memory.Swap.Failcnt) metric.memswFailCount = float64(stats.Memory.Swap.Failcnt)
if cpus, err := getCPUs(name); err == nil { if cpus, err := getCPUs(name); err == nil {
metric.cpus = cpus metric.cpus = len(cpus)
metric.cpu_list = strings.Join(cpus, ",")
} }
getInfo(name, &metric) getInfo(name, &metric)
metrics = append(metrics, metric) metrics = append(metrics, metric)
@ -319,6 +331,7 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
ch <- e.cpuSystem ch <- e.cpuSystem
ch <- e.cpuTotal ch <- e.cpuTotal
ch <- e.cpus ch <- e.cpus
ch <- e.cpu_info
ch <- e.memoryRSS ch <- e.memoryRSS
ch <- e.memoryCache ch <- e.memoryCache
ch <- e.memoryUsed ch <- e.memoryUsed
@ -339,6 +352,7 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(e.cpuSystem, prometheus.GaugeValue, m.cpuSystem, m.name) ch <- prometheus.MustNewConstMetric(e.cpuSystem, prometheus.GaugeValue, m.cpuSystem, m.name)
ch <- prometheus.MustNewConstMetric(e.cpuTotal, prometheus.GaugeValue, m.cpuTotal, m.name) ch <- prometheus.MustNewConstMetric(e.cpuTotal, prometheus.GaugeValue, m.cpuTotal, m.name)
ch <- prometheus.MustNewConstMetric(e.cpus, prometheus.GaugeValue, float64(m.cpus), m.name) ch <- prometheus.MustNewConstMetric(e.cpus, prometheus.GaugeValue, float64(m.cpus), m.name)
ch <- prometheus.MustNewConstMetric(e.cpu_info, prometheus.GaugeValue, 1, m.name, m.cpu_list)
ch <- prometheus.MustNewConstMetric(e.memoryRSS, prometheus.GaugeValue, m.memoryRSS, m.name) ch <- prometheus.MustNewConstMetric(e.memoryRSS, prometheus.GaugeValue, m.memoryRSS, m.name)
ch <- prometheus.MustNewConstMetric(e.memoryCache, prometheus.GaugeValue, m.memoryCache, m.name) ch <- prometheus.MustNewConstMetric(e.memoryCache, prometheus.GaugeValue, m.memoryCache, m.name)
ch <- prometheus.MustNewConstMetric(e.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.name) ch <- prometheus.MustNewConstMetric(e.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.name)

View File

@ -21,6 +21,7 @@ import (
"net/http" "net/http"
"os" "os"
"path/filepath" "path/filepath"
"reflect"
"runtime" "runtime"
"strings" "strings"
"testing" "testing"
@ -53,20 +54,23 @@ func TestMain(m *testing.M) {
} }
func TestParseCpuSet(t *testing.T) { func TestParseCpuSet(t *testing.T) {
expected := []string{"0", "1", "2"}
if cpus, err := parseCpuSet("0-2"); err != nil { if cpus, err := parseCpuSet("0-2"); err != nil {
t.Errorf("Unexpected error: %s", err.Error()) t.Errorf("Unexpected error: %s", err.Error())
} else if cpus != 3 { } else if !reflect.DeepEqual(cpus, expected) {
t.Errorf("Unexpected cpus, expected 3 got %d", cpus) t.Errorf("Unexpected cpus, expected %v got %v", expected, cpus)
} }
expected = []string{"0", "1", "4", "5", "8", "9"}
if cpus, err := parseCpuSet("0-1,4-5,8-9"); err != nil { if cpus, err := parseCpuSet("0-1,4-5,8-9"); err != nil {
t.Errorf("Unexpected error: %s", err.Error()) t.Errorf("Unexpected error: %s", err.Error())
} else if cpus != 6 { } else if !reflect.DeepEqual(cpus, expected) {
t.Errorf("Unexpected cpus, expected 6 got %d", cpus) t.Errorf("Unexpected cpus, expected %v got %v", expected, cpus)
} }
expected = []string{"1", "3", "5", "7"}
if cpus, err := parseCpuSet("1,3,5,7"); err != nil { if cpus, err := parseCpuSet("1,3,5,7"); err != nil {
t.Errorf("Unexpected error: %s", err.Error()) t.Errorf("Unexpected error: %s", err.Error())
} else if cpus != 4 { } else if !reflect.DeepEqual(cpus, expected) {
t.Errorf("Unexpected cpus, expected 4 got %d", cpus) t.Errorf("Unexpected cpus, expected %v got %v", expected, cpus)
} }
} }