diff --git a/README.md b/README.md index 49edbe4..3855dca 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ cgroup_cpu_system_seconds{cgroup="/user.slice/user-20821.slice"} 1.96 cgroup_cpu_total_seconds{cgroup="/user.slice/user-20821.slice"} 3.817500568 cgroup_cpu_user_seconds{cgroup="/user.slice/user-20821.slice"} 1.61 cgroup_cpus{cgroup="/user.slice/user-20821.slice"} 0 +cgroup_cpu_info{cgroup="/user.slice/user-20821.slice",cpus=""} 1 cgroup_info{cgroup="/user.slice/user-20821.slice",uid="20821",username="tdockendorf",jobid=""} 1 cgroup_memory_cache_bytes{cgroup="/user.slice/user-20821.slice"} 2.322432e+06 cgroup_memory_fail_count{cgroup="/user.slice/user-20821.slice"} 0 @@ -68,6 +69,7 @@ cgroup_cpu_system_seconds{cgroup="/slurm/uid_20821/job_12"} 0 cgroup_cpu_total_seconds{cgroup="/slurm/uid_20821/job_12"} 0.007840451 cgroup_cpu_user_seconds{cgroup="/slurm/uid_20821/job_12"} 0 cgroup_cpus{cgroup="/slurm/uid_20821/job_12"} 2 +cgroup_cpu_info{cgroup="/slurm/uid_20821/job_12",cpus="0,1"} 1 cgroup_info{cgroup="/slurm/uid_20821/job_12",jobid="12",uid="20821",username="tdockendorf"} 1 cgroup_memory_cache_bytes{cgroup="/slurm/uid_20821/job_12"} 4.096e+03 cgroup_memory_fail_count{cgroup="/slurm/uid_20821/job_12"} 0 @@ -85,7 +87,8 @@ Example of metrics exposed by this exporter when looking at `/torque` paths: cgroup_cpu_system_seconds{cgroup="/torque/1182958.batch.example.com"} 26.35 cgroup_cpu_total_seconds{cgroup="/torque/1182958.batch.example.com"} 939.568245515 cgroup_cpu_user_seconds{cgroup="/torque/1182958.batch.example.com"} 915.61 -cgroup_cpus{cgroup="/torque/1182958.batch.example.com"} 40 +cgroup_cpus{cgroup="/torque/1182958.batch.example.com"} 8 +cgroup_cpu_info{cgroup="/torque/1182958.batch.example.com",cpus="0,1,2,3,4,5,6,7"} 1 cgroup_info{cgroup="/torque/1182958.batch.example.com",jobid="1182958",uid="",username=""} 1 cgroup_memory_cache_bytes{cgroup="/torque/1182958.batch.example.com"} 1.09678592e+08 
cgroup_memory_fail_count{cgroup="/torque/1182958.batch.example.com"} 0 diff --git a/cgroup_exporter.go b/cgroup_exporter.go index b42aefb..b8e00a8 100644 --- a/cgroup_exporter.go +++ b/cgroup_exporter.go @@ -50,6 +50,7 @@ type CgroupMetric struct { cpuSystem float64 cpuTotal float64 cpus int + cpu_list string memoryRSS float64 memoryCache float64 memoryUsed float64 @@ -73,6 +74,7 @@ type Exporter struct { cpuSystem *prometheus.Desc cpuTotal *prometheus.Desc cpus *prometheus.Desc + cpu_info *prometheus.Desc memoryRSS *prometheus.Desc memoryCache *prometheus.Desc memoryUsed *prometheus.Desc @@ -109,47 +111,54 @@ func subsystem() ([]cgroups.Subsystem, error) { return s, nil } -func getCPUs(name string) (int, error) { +func getCPUs(name string) ([]string, error) { cpusPath := fmt.Sprintf("%s/cpuset%s/cpuset.cpus", *cgroupRoot, name) if !fileExists(cpusPath) { - return 0, nil + return nil, nil } cpusData, err := ioutil.ReadFile(cpusPath) if err != nil { log.Errorf("Error reading %s: %s", cpusPath, err.Error()) - return 0, err + return nil, err } cpus, err := parseCpuSet(strings.TrimSuffix(string(cpusData), "\n")) if err != nil { log.Errorf("Error parsing cpu set %s", err.Error()) - return 0, err + return nil, err } return cpus, nil } -func parseCpuSet(cpuset string) (int, error) { - var cpus int +func parseCpuSet(cpuset string) ([]string, error) { + var cpus []string + var start, end int + var err error if cpuset == "" { - return 0, nil + return nil, nil } ranges := strings.Split(cpuset, ",") for _, r := range ranges { boundaries := strings.Split(r, "-") if len(boundaries) == 1 { - cpus++ + start, err = strconv.Atoi(boundaries[0]) + if err != nil { + return nil, err + } + end = start } else if len(boundaries) == 2 { - start, err := strconv.Atoi(boundaries[0]) + start, err = strconv.Atoi(boundaries[0]) if err != nil { - return 0, err + return nil, err } - end, err := strconv.Atoi(boundaries[1]) + end, err = strconv.Atoi(boundaries[1]) if err != nil { - return 0, err - } 
- for e := start; e <= end; e++ { - cpus++ + return nil, err } } + for e := start; e <= end; e++ { + cpu := strconv.Itoa(e) + cpus = append(cpus, cpu) + } } return cpus, nil } @@ -227,6 +236,8 @@ func NewExporter(paths []string) *Exporter { "Cumalitive CPU total seconds for cgroup", []string{"cgroup"}, nil), cpus: prometheus.NewDesc(prometheus.BuildFQName(namespace, "", "cpus"), "Number of CPUs in the cgroup", []string{"cgroup"}, nil), + cpu_info: prometheus.NewDesc(prometheus.BuildFQName(namespace, "", "cpu_info"), + "Information about the cgroup CPUs", []string{"cgroup", "cpus"}, nil), memoryRSS: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "rss_bytes"), "Memory RSS used in bytes", []string{"cgroup"}, nil), memoryCache: prometheus.NewDesc(prometheus.BuildFQName(namespace, "memory", "cache_bytes"), @@ -304,7 +315,8 @@ func (e *Exporter) collect() ([]CgroupMetric, error) { metric.memswTotal = float64(stats.Memory.Swap.Limit) metric.memswFailCount = float64(stats.Memory.Swap.Failcnt) if cpus, err := getCPUs(name); err == nil { - metric.cpus = cpus + metric.cpus = len(cpus) + metric.cpu_list = strings.Join(cpus, ",") } getInfo(name, &metric) metrics = append(metrics, metric) @@ -319,6 +331,7 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) { ch <- e.cpuSystem ch <- e.cpuTotal ch <- e.cpus + ch <- e.cpu_info ch <- e.memoryRSS ch <- e.memoryCache ch <- e.memoryUsed @@ -339,6 +352,7 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) { ch <- prometheus.MustNewConstMetric(e.cpuSystem, prometheus.GaugeValue, m.cpuSystem, m.name) ch <- prometheus.MustNewConstMetric(e.cpuTotal, prometheus.GaugeValue, m.cpuTotal, m.name) ch <- prometheus.MustNewConstMetric(e.cpus, prometheus.GaugeValue, float64(m.cpus), m.name) + ch <- prometheus.MustNewConstMetric(e.cpu_info, prometheus.GaugeValue, 1, m.name, m.cpu_list) ch <- prometheus.MustNewConstMetric(e.memoryRSS, prometheus.GaugeValue, m.memoryRSS, m.name) ch <- 
prometheus.MustNewConstMetric(e.memoryCache, prometheus.GaugeValue, m.memoryCache, m.name) ch <- prometheus.MustNewConstMetric(e.memoryUsed, prometheus.GaugeValue, m.memoryUsed, m.name) diff --git a/cgroup_exporter_test.go b/cgroup_exporter_test.go index 686f311..1da097a 100644 --- a/cgroup_exporter_test.go +++ b/cgroup_exporter_test.go @@ -21,6 +21,7 @@ import ( "net/http" "os" "path/filepath" + "reflect" "runtime" "strings" "testing" @@ -53,20 +54,23 @@ func TestMain(m *testing.M) { } func TestParseCpuSet(t *testing.T) { + expected := []string{"0", "1", "2"} if cpus, err := parseCpuSet("0-2"); err != nil { t.Errorf("Unexpected error: %s", err.Error()) - } else if cpus != 3 { - t.Errorf("Unexpected cpus, expected 3 got %d", cpus) + } else if !reflect.DeepEqual(cpus, expected) { + t.Errorf("Unexpected cpus, expected %v got %v", expected, cpus) } + expected = []string{"0", "1", "4", "5", "8", "9"} if cpus, err := parseCpuSet("0-1,4-5,8-9"); err != nil { t.Errorf("Unexpected error: %s", err.Error()) - } else if cpus != 6 { - t.Errorf("Unexpected cpus, expected 6 got %d", cpus) + } else if !reflect.DeepEqual(cpus, expected) { + t.Errorf("Unexpected cpus, expected %v got %v", expected, cpus) } + expected = []string{"1", "3", "5", "7"} if cpus, err := parseCpuSet("1,3,5,7"); err != nil { t.Errorf("Unexpected error: %s", err.Error()) - } else if cpus != 4 { - t.Errorf("Unexpected cpus, expected 4 got %d", cpus) + } else if !reflect.DeepEqual(cpus, expected) { + t.Errorf("Unexpected cpus, expected %v got %v", expected, cpus) } }