Skip to content

Commit a0a3916

Browse files
nikola-jokicTingluoHuangLink-
authored
Provide scale-set listener metrics (#2559)
Co-authored-by: Tingluo Huang <tingluohuang@github.com> Co-authored-by: Bassem Dghaidi <568794+Link-@users.noreply.github.com>
1 parent 1c360d7 commit a0a3916

20 files changed

Lines changed: 975 additions & 427 deletions

File tree

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Build the manager binary
2-
FROM --platform=$BUILDPLATFORM golang:1.19.4 as builder
2+
FROM --platform=$BUILDPLATFORM golang:1.20.7 as builder
33

44
WORKDIR /workspace
55

charts/gha-runner-scale-set-controller/templates/_helpers.tpl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ Selector labels
5151
*/}}
5252
{{- define "gha-runner-scale-set-controller.selectorLabels" -}}
5353
app.kubernetes.io/name: {{ include "gha-runner-scale-set-controller.name" . }}
54+
app.kubernetes.io/namespace: {{ .Release.Namespace }}
5455
app.kubernetes.io/instance: {{ .Release.Name }}
5556
{{- end }}
5657

@@ -119,3 +120,7 @@ Create the name of the service account to use
119120
{{- end }}
120121
{{- $names | join ","}}
121122
{{- end }}
123+
124+
{{/*
Name of the service monitor: the chart's fullname followed by "-service-monitor".
*/}}
{{- define "gha-runner-scale-set-controller.serviceMonitorName" -}}
{{- printf "%s-service-monitor" (include "gha-runner-scale-set-controller.fullname" .) }}
{{- end }}

charts/gha-runner-scale-set-controller/templates/deployment.yaml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,25 @@ spec:
6565
{{- with .Values.flags.updateStrategy }}
6666
- "--update-strategy={{ . }}"
6767
{{- end }}
68+
{{- if .Values.metrics }}
69+
{{- with .Values.metrics }}
70+
- "--listener-metrics-addr={{ .listenerAddr }}"
71+
- "--listener-metrics-endpoint={{ .listenerEndpoint }}"
72+
- "--metrics-addr={{ .controllerManagerAddr }}"
73+
{{- end }}
74+
{{- else }}
75+
- "--listener-metrics-addr=0"
76+
- "--listener-metrics-endpoint="
77+
- "--metrics-addr=0"
78+
{{- end }}
6879
command:
6980
- "/manager"
81+
{{- with .Values.metrics }}
82+
ports:
83+
- containerPort: {{regexReplaceAll ":([0-9]+)" .controllerManagerAddr "${1}"}}
84+
protocol: TCP
85+
name: metrics
86+
{{- end }}
7087
env:
7188
- name: CONTROLLER_MANAGER_CONTAINER_IMAGE
7289
value: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"

charts/gha-runner-scale-set-controller/tests/template_test.go

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package tests
22

33
import (
4+
"fmt"
45
"os"
56
"path/filepath"
67
"strings"
@@ -361,6 +362,9 @@ func TestTemplate_ControllerDeployment_Defaults(t *testing.T) {
361362
"--log-level=debug",
362363
"--log-format=text",
363364
"--update-strategy=immediate",
365+
"--metrics-addr=0",
366+
"--listener-metrics-addr=0",
367+
"--listener-metrics-endpoint=",
364368
}
365369
assert.ElementsMatch(t, expectedArgs, deployment.Spec.Template.Spec.Containers[0].Args)
366370

@@ -495,6 +499,9 @@ func TestTemplate_ControllerDeployment_Customize(t *testing.T) {
495499
"--log-level=info",
496500
"--log-format=json",
497501
"--update-strategy=eventual",
502+
"--listener-metrics-addr=0",
503+
"--listener-metrics-endpoint=",
504+
"--metrics-addr=0",
498505
}
499506

500507
assert.ElementsMatch(t, expectArgs, deployment.Spec.Template.Spec.Containers[0].Args)
@@ -621,6 +628,9 @@ func TestTemplate_EnableLeaderElection(t *testing.T) {
621628
"--log-level=debug",
622629
"--log-format=text",
623630
"--update-strategy=immediate",
631+
"--listener-metrics-addr=0",
632+
"--listener-metrics-endpoint=",
633+
"--metrics-addr=0",
624634
}
625635

626636
assert.ElementsMatch(t, expectedArgs, deployment.Spec.Template.Spec.Containers[0].Args)
@@ -658,6 +668,9 @@ func TestTemplate_ControllerDeployment_ForwardImagePullSecrets(t *testing.T) {
658668
"--log-level=debug",
659669
"--log-format=text",
660670
"--update-strategy=immediate",
671+
"--listener-metrics-addr=0",
672+
"--listener-metrics-endpoint=",
673+
"--metrics-addr=0",
661674
}
662675

663676
assert.ElementsMatch(t, expectedArgs, deployment.Spec.Template.Spec.Containers[0].Args)
@@ -744,6 +757,9 @@ func TestTemplate_ControllerDeployment_WatchSingleNamespace(t *testing.T) {
744757
"--log-format=text",
745758
"--watch-single-namespace=demo",
746759
"--update-strategy=immediate",
760+
"--listener-metrics-addr=0",
761+
"--listener-metrics-endpoint=",
762+
"--metrics-addr=0",
747763
}
748764

749765
assert.ElementsMatch(t, expectedArgs, deployment.Spec.Template.Spec.Containers[0].Args)
@@ -934,3 +950,75 @@ func TestTemplate_ManagerSingleNamespaceRoleBinding(t *testing.T) {
934950
assert.Equal(t, "test-arc-gha-rs-controller", managerSingleNamespaceWatchRoleBinding.Subjects[0].Name)
935951
assert.Equal(t, namespaceName, managerSingleNamespaceWatchRoleBinding.Subjects[0].Namespace)
936952
}
953+
954+
func TestControllerDeployment_MetricsPorts(t *testing.T) {
955+
t.Parallel()
956+
957+
// Path to the helm chart we will test
958+
helmChartPath, err := filepath.Abs("../../gha-runner-scale-set-controller")
959+
require.NoError(t, err)
960+
961+
chartContent, err := os.ReadFile(filepath.Join(helmChartPath, "Chart.yaml"))
962+
require.NoError(t, err)
963+
964+
chart := new(Chart)
965+
err = yaml.Unmarshal(chartContent, chart)
966+
require.NoError(t, err)
967+
968+
releaseName := "test-arc"
969+
namespaceName := "test-" + strings.ToLower(random.UniqueId())
970+
971+
options := &helm.Options{
972+
Logger: logger.Discard,
973+
SetValues: map[string]string{
974+
"image.tag": "dev",
975+
"metrics.controllerManagerAddr": ":8080",
976+
"metrics.listenerAddr": ":8081",
977+
"metrics.listenerEndpoint": "/metrics",
978+
},
979+
KubectlOptions: k8s.NewKubectlOptions("", "", namespaceName),
980+
}
981+
982+
output := helm.RenderTemplate(t, options, helmChartPath, releaseName, []string{"templates/deployment.yaml"})
983+
984+
var deployment appsv1.Deployment
985+
helm.UnmarshalK8SYaml(t, output, &deployment)
986+
987+
require.Len(t, deployment.Spec.Template.Spec.Containers, 1, "Expected one container")
988+
container := deployment.Spec.Template.Spec.Containers[0]
989+
assert.Len(t, container.Ports, 1)
990+
port := container.Ports[0]
991+
assert.Equal(t, corev1.Protocol("TCP"), port.Protocol)
992+
assert.Equal(t, int32(8080), port.ContainerPort)
993+
994+
metricsFlags := map[string]*struct {
995+
expect string
996+
frequency int
997+
}{
998+
"--listener-metrics-addr": {
999+
expect: ":8081",
1000+
},
1001+
"--listener-metrics-endpoint": {
1002+
expect: "/metrics",
1003+
},
1004+
"--metrics-addr": {
1005+
expect: ":8080",
1006+
},
1007+
}
1008+
for _, cmd := range container.Args {
1009+
s := strings.Split(cmd, "=")
1010+
if len(s) != 2 {
1011+
continue
1012+
}
1013+
flag, ok := metricsFlags[s[0]]
1014+
if !ok {
1015+
continue
1016+
}
1017+
flag.frequency++
1018+
assert.Equal(t, flag.expect, s[1])
1019+
}
1020+
1021+
for key, value := range metricsFlags {
1022+
assert.Equal(t, value.frequency, 1, fmt.Sprintf("frequency of %q is not 1", key))
1023+
}
1024+
}

charts/gha-runner-scale-set-controller/values.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,3 +102,14 @@ flags:
102102
## This can lead to a longer time to apply the change but it will ensure
103103
## that you don't have any overprovisioning of runners.
104104
updateStrategy: "immediate"
105+
106+
## If `metrics:` object is not provided, or commented out, the following flags
107+
## will be applied to the controller-manager and listener pods with empty values:
108+
## `--metrics-addr`, `--listener-metrics-addr`, `--listener-metrics-endpoint`.
109+
## This will disable metrics.
110+
##
111+
## To enable metrics, uncomment the following lines.
112+
# metrics:
113+
# controllerManagerAddr: ":8080"
114+
# listenerAddr: ":8080"
115+
# listenerEndpoint: "/metrics"

cmd/githubrunnerscalesetlistener/autoScalerService.go

Lines changed: 70 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package main
33
import (
44
"context"
55
"encoding/json"
6+
"errors"
67
"fmt"
78
"math"
89
"strings"
@@ -25,6 +26,31 @@ type Service struct {
2526
kubeManager KubernetesManager
2627
settings *ScaleSettings
2728
currentRunnerCount int
29+
metricsExporter metricsExporter
30+
errs []error
31+
}
32+
33+
func WithPrometheusMetrics(conf RunnerScaleSetListenerConfig) func(*Service) {
34+
return func(svc *Service) {
35+
parsedURL, err := actions.ParseGitHubConfigFromURL(conf.ConfigureUrl)
36+
if err != nil {
37+
svc.errs = append(svc.errs, err)
38+
}
39+
40+
svc.metricsExporter.withBaseLabels(baseLabels{
41+
scaleSetName: conf.EphemeralRunnerSetName,
42+
scaleSetNamespace: conf.EphemeralRunnerSetNamespace,
43+
enterprise: parsedURL.Enterprise,
44+
organization: parsedURL.Organization,
45+
repository: parsedURL.Repository,
46+
})
47+
}
48+
}
49+
50+
func WithLogger(logger logr.Logger) func(*Service) {
51+
return func(s *Service) {
52+
s.logger = logger.WithName("service")
53+
}
2854
}
2955

3056
func NewService(
@@ -33,7 +59,7 @@ func NewService(
3359
manager KubernetesManager,
3460
settings *ScaleSettings,
3561
options ...func(*Service),
36-
) *Service {
62+
) (*Service, error) {
3763
s := &Service{
3864
ctx: ctx,
3965
rsClient: rsClient,
@@ -47,7 +73,11 @@ func NewService(
4773
option(s)
4874
}
4975

50-
return s
76+
if len(s.errs) > 0 {
77+
return nil, errors.Join(s.errs...)
78+
}
79+
80+
return s, nil
5181
}
5282

5383
func (s *Service) Start() error {
@@ -81,6 +111,8 @@ func (s *Service) processMessage(message *actions.RunnerScaleSetMessage) error {
81111
"busy runners", message.Statistics.TotalBusyRunners,
82112
"idle runners", message.Statistics.TotalIdleRunners)
83113

114+
s.metricsExporter.publishStatistics(message.Statistics)
115+
84116
if message.MessageType != "RunnerScaleSetJobMessages" {
85117
s.logger.Info("skip message with unknown message type.", "messageType", message.MessageType)
86118
return nil
@@ -110,27 +142,54 @@ func (s *Service) processMessage(message *actions.RunnerScaleSetMessage) error {
110142
if err := json.Unmarshal(message, &jobAvailable); err != nil {
111143
return fmt.Errorf("could not decode job available message. %w", err)
112144
}
113-
s.logger.Info("job available message received.", "RequestId", jobAvailable.RunnerRequestId)
145+
s.logger.Info(
146+
"job available message received.",
147+
"RequestId",
148+
jobAvailable.RunnerRequestId,
149+
)
114150
availableJobs = append(availableJobs, jobAvailable.RunnerRequestId)
115151
case "JobAssigned":
116152
var jobAssigned actions.JobAssigned
117153
if err := json.Unmarshal(message, &jobAssigned); err != nil {
118154
return fmt.Errorf("could not decode job assigned message. %w", err)
119155
}
120-
s.logger.Info("job assigned message received.", "RequestId", jobAssigned.RunnerRequestId)
156+
s.logger.Info(
157+
"job assigned message received.",
158+
"RequestId",
159+
jobAssigned.RunnerRequestId,
160+
)
161+
// s.metricsExporter.publishJobAssigned(&jobAssigned)
121162
case "JobStarted":
122163
var jobStarted actions.JobStarted
123164
if err := json.Unmarshal(message, &jobStarted); err != nil {
124165
return fmt.Errorf("could not decode job started message. %w", err)
125166
}
126-
s.logger.Info("job started message received.", "RequestId", jobStarted.RunnerRequestId, "RunnerId", jobStarted.RunnerId)
167+
s.logger.Info(
168+
"job started message received.",
169+
"RequestId",
170+
jobStarted.RunnerRequestId,
171+
"RunnerId",
172+
jobStarted.RunnerId,
173+
)
174+
s.metricsExporter.publishJobStarted(&jobStarted)
127175
s.updateJobInfoForRunner(jobStarted)
128176
case "JobCompleted":
129177
var jobCompleted actions.JobCompleted
130178
if err := json.Unmarshal(message, &jobCompleted); err != nil {
131179
return fmt.Errorf("could not decode job completed message. %w", err)
132180
}
133-
s.logger.Info("job completed message received.", "RequestId", jobCompleted.RunnerRequestId, "Result", jobCompleted.Result, "RunnerId", jobCompleted.RunnerId, "RunnerName", jobCompleted.RunnerName)
181+
s.logger.Info(
182+
"job completed message received.",
183+
"RequestId",
184+
jobCompleted.RunnerRequestId,
185+
"Result",
186+
jobCompleted.Result,
187+
"RunnerId",
188+
jobCompleted.RunnerId,
189+
"RunnerName",
190+
jobCompleted.RunnerName,
191+
)
192+
s.metricsExporter.publishJobCompleted(&jobCompleted)
134193
default:
135194
s.logger.Info("unknown job message type.", "messageType", messageType.MessageType)
136195
}
@@ -146,13 +205,15 @@ func (s *Service) processMessage(message *actions.RunnerScaleSetMessage) error {
146205

147206
func (s *Service) scaleForAssignedJobCount(count int) error {
148207
targetRunnerCount := int(math.Max(math.Min(float64(s.settings.MaxRunners), float64(count)), float64(s.settings.MinRunners)))
208+
s.metricsExporter.publishDesiredRunners(targetRunnerCount)
149209
if targetRunnerCount != s.currentRunnerCount {
150210
s.logger.Info("try scale runner request up/down base on assigned job count",
151211
"assigned job", count,
152212
"decision", targetRunnerCount,
153213
"min", s.settings.MinRunners,
154214
"max", s.settings.MaxRunners,
155-
"currentRunnerCount", s.currentRunnerCount)
215+
"currentRunnerCount", s.currentRunnerCount,
216+
)
156217
err := s.kubeManager.ScaleEphemeralRunnerSet(s.ctx, s.settings.Namespace, s.settings.ResourceName, targetRunnerCount)
157218
if err != nil {
158219
return fmt.Errorf("could not scale ephemeral runner set (%s/%s). %w", s.settings.Namespace, s.settings.ResourceName, err)
@@ -173,7 +234,8 @@ func (s *Service) updateJobInfoForRunner(jobInfo actions.JobStarted) {
173234
"workflowRef", jobInfo.JobWorkflowRef,
174235
"workflowRunId", jobInfo.WorkflowRunId,
175236
"jobDisplayName", jobInfo.JobDisplayName,
176-
"requestId", jobInfo.RunnerRequestId)
237+
"requestId", jobInfo.RunnerRequestId,
238+
)
177239
err := s.kubeManager.UpdateEphemeralRunnerWithJobInfo(s.ctx, s.settings.Namespace, jobInfo.RunnerName, jobInfo.OwnerName, jobInfo.RepositoryName, jobInfo.JobWorkflowRef, jobInfo.JobDisplayName, jobInfo.WorkflowRunId, jobInfo.RunnerRequestId)
178240
if err != nil {
179241
s.logger.Error(err, "could not update ephemeral runner with job info", "runnerName", jobInfo.RunnerName, "requestId", jobInfo.RunnerRequestId)

0 commit comments

Comments
 (0)