diff --git a/.github/workflows/rebuild-postgres.yaml b/.github/workflows/rebuild-postgres.yaml new file mode 100644 index 000000000..a55c72433 --- /dev/null +++ b/.github/workflows/rebuild-postgres.yaml @@ -0,0 +1,103 @@ +# Copyright SAP SE +# SPDX-License-Identifier: Apache-2.0 + +name: Rebuild Postgres on CVE +on: + schedule: + # Run daily at 6:00 UTC (scheduled workflows always run on default branch) + - cron: "0 6 * * *" + workflow_dispatch: + inputs: + dry_run: + description: "Only scan and compare, skip PR creation" + type: boolean + default: false + +env: + REGISTRY: ghcr.io + IMAGE: ghcr.io/${{ github.repository }}-postgres + +jobs: + check: + runs-on: ubuntu-latest + permissions: + contents: read + outputs: + rebuild_fixes_cves: ${{ steps.compare.outputs.rebuild_fixes_cves }} + steps: + - uses: actions/checkout@v6 + with: + persist-credentials: false + + - name: Scan published image + uses: aquasecurity/trivy-action@v0.36.0 + with: + scan-type: image + image-ref: ${{ env.IMAGE }}:latest + scanners: vuln + ignore-unfixed: true + severity: "CRITICAL,HIGH,MEDIUM" + format: json + output: published-scan.json + continue-on-error: true + + - name: Build fresh image + run: docker build -t cortex-postgres:rebuilt -f postgres/Dockerfile postgres/ + + - name: Scan rebuilt image + uses: aquasecurity/trivy-action@v0.36.0 + with: + scan-type: image + image-ref: cortex-postgres:rebuilt + scanners: vuln + ignore-unfixed: true + severity: "CRITICAL,HIGH,MEDIUM" + format: json + output: rebuilt-scan.json + continue-on-error: true + + - name: Compare CVE counts + id: compare + run: | + published_cves=$(jq '[.Results[]?.Vulnerabilities // [] | length] | add // 0' published-scan.json) + rebuilt_cves=$(jq '[.Results[]?.Vulnerabilities // [] | length] | add // 0' rebuilt-scan.json) + echo "Published image CVEs: $published_cves" + echo "Rebuilt image CVEs: $rebuilt_cves" + if [ "$published_cves" -gt 0 ] && [ "$rebuilt_cves" -lt "$published_cves" ]; then + echo 
"rebuild_fixes_cves=true" >> "$GITHUB_OUTPUT" + else + echo "rebuild_fixes_cves=false" >> "$GITHUB_OUTPUT" + fi + + open-pr: + needs: check + if: needs.check.outputs.rebuild_fixes_cves == 'true' && !(inputs.dry_run || false) + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + steps: + - uses: actions/checkout@v6 + with: + ref: main + persist-credentials: false + - name: Update rebuild trigger + run: | + echo "${{ github.run_id }}" > postgres/rebuild-trigger + - name: Create Pull Request + uses: peter-evans/create-pull-request@v8 + with: + base: main + commit-message: "fix(postgres): rebuild image to resolve CVEs" + title: "fix(postgres): rebuild image to resolve CVEs" + body: | + The daily CVE scan detected fixable vulnerabilities in the published + `cortex-postgres` image. A test rebuild confirms that rebuilding + reduces the CVE count (via `apt-get upgrade` picking up security patches). + + Merging this PR triggers the image rebuild and publish pipeline. + + This PR was created automatically by the `rebuild-postgres` workflow. 
+ branch: fix/postgres-cve-rebuild + delete-branch: true + labels: security diff --git a/cmd/manager/main.go b/cmd/manager/main.go index 28e4cf93d..5630964ad 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -581,19 +581,20 @@ func main() { } crControllerConf := commitmentsConfig.CommittedResourceController - crControllerConf.ApplyDefaults() + + crControllerMonitor := commitments.NewCRControllerMonitor(multiclusterClient) + metrics.Registry.MustRegister(&crControllerMonitor) + if err := (&commitments.CommittedResourceController{ - Client: multiclusterClient, - Scheme: mgr.GetScheme(), - Conf: crControllerConf, + Client: multiclusterClient, + Scheme: mgr.GetScheme(), + Conf: crControllerConf, + Monitor: &crControllerMonitor, }).SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", "controller", "CommittedResource") os.Exit(1) } - crControllerMonitor := commitments.NewCRControllerMonitor(multiclusterClient) - metrics.Registry.MustRegister(&crControllerMonitor) - usageReconcilerMonitor := commitments.NewUsageReconcilerMonitor() metrics.Registry.MustRegister(&usageReconcilerMonitor) if commitmentsUsageDB == nil { diff --git a/go.mod b/go.mod index b52dbc921..2a71db918 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ require ( github.com/ironcore-dev/ironcore v0.3.0 github.com/prometheus/client_golang v1.23.2 github.com/prometheus/client_model v0.6.2 - github.com/sapcc/go-bits v0.0.0-20260519090007-308851876285 + github.com/sapcc/go-bits v0.0.0-20260526084158-fcb8a0bff0a3 go.xyrillian.de/gg v1.7.0 k8s.io/api v0.36.1 k8s.io/apimachinery v0.36.1 @@ -109,10 +109,10 @@ require ( go.yaml.in/yaml/v3 v3.0.4 // indirect go4.org/netipx v0.0.0-20231129151722-fdeea329fbba // indirect golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93 // indirect - golang.org/x/net v0.54.0 // indirect + golang.org/x/net v0.55.0 // indirect golang.org/x/oauth2 v0.35.0 // indirect golang.org/x/sync v0.20.0 - golang.org/x/sys v0.44.0 // 
indirect + golang.org/x/sys v0.45.0 // indirect golang.org/x/term v0.43.0 golang.org/x/text v0.37.0 // indirect golang.org/x/time v0.15.0 // indirect diff --git a/go.sum b/go.sum index b3b68c983..13a221377 100644 --- a/go.sum +++ b/go.sum @@ -204,8 +204,8 @@ github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sapcc/go-api-declarations v1.22.0 h1:nU/eJ6OO54Z9YSo1gWinD0A2etrfZObCwYdB9xA0VWE= github.com/sapcc/go-api-declarations v1.22.0/go.mod h1:x3V8bzg7Y4kmbA+DeWWwKteFEdCCSiVQdwRXj4fGAYY= -github.com/sapcc/go-bits v0.0.0-20260519090007-308851876285 h1:m3WoORYUK0nIxLKxi27HMQEdKXENkoQLiQnkjfEYyKs= -github.com/sapcc/go-bits v0.0.0-20260519090007-308851876285/go.mod h1:AbFZEVaSrwyhx7x4ZNExokEVIEWdXFOIJ/WHQzs4Y3I= +github.com/sapcc/go-bits v0.0.0-20260526084158-fcb8a0bff0a3 h1:uf2Szgyh5z4mh4pCp8ZAHdDSaUJfRiKFUr1lU1Fs3oo= +github.com/sapcc/go-bits v0.0.0-20260526084158-fcb8a0bff0a3/go.mod h1:tlX0d8TvLgEikNWwFbB1SxnW0q/6XybpXjt8mr97Qzg= github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw= github.com/sergi/go-diff v1.4.0/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= @@ -271,16 +271,16 @@ golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93 h1:fQsdNF2N+/YewlRZiricy4P1i golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93/go.mod h1:EPRbTFwzwjXj9NpYyyrvenVh9Y+GFeEvMNh7Xuz7xgU= golang.org/x/mod v0.35.0 h1:Ww1D637e6Pg+Zb2KrWfHQUnH2dQRLBQyAtpr/haaJeM= golang.org/x/mod v0.35.0/go.mod h1:+GwiRhIInF8wPm+4AoT6L0FA1QWAad3OMdTRx4tFYlU= -golang.org/x/net v0.54.0 h1:2zJIZAxAHV/OHCDTCOHAYehQzLfSXuf/5SoL/Dv6w/w= -golang.org/x/net v0.54.0/go.mod h1:Sj4oj8jK6XmHpBZU/zWHw3BV3abl4Kvi+Ut7cQcY+cQ= +golang.org/x/net v0.55.0 h1:bcvxaJn3e1U6InsFWt1JUq1aSjnRxLzT2rtD2KfkDF8= +golang.org/x/net v0.55.0/go.mod 
h1:L5U2KuzuOe1lY7Z+aWVIKK6qEeJXnXV9yzGA+WCHJww= golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ= golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ= -golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY= +golang.org/x/sys v0.45.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4= golang.org/x/term v0.43.0/go.mod h1:lrhlHNdQJHO+1qVYiHfFKVuVioJIheAc3fBSMFYEIsk= golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc= diff --git a/helm/bundles/cortex-cinder/Chart.yaml b/helm/bundles/cortex-cinder/Chart.yaml index 934ab6c85..c0c58a69d 100644 --- a/helm/bundles/cortex-cinder/Chart.yaml +++ b/helm/bundles/cortex-cinder/Chart.yaml @@ -5,23 +5,23 @@ apiVersion: v2 name: cortex-cinder description: A Helm chart deploying Cortex for Cinder. 
type: application -version: 0.0.71 +version: 0.0.72 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres - name: cortex-postgres repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.6.3 + version: 0.6.4 # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.58 + version: 0.0.59 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.58 + version: 0.0.59 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-crds/Chart.yaml b/helm/bundles/cortex-crds/Chart.yaml index c569c7692..4e841e56f 100644 --- a/helm/bundles/cortex-crds/Chart.yaml +++ b/helm/bundles/cortex-crds/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-crds description: A Helm chart deploying Cortex CRDs. type: application -version: 0.0.71 +version: 0.0.72 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.58 + version: 0.0.59 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/bundles/cortex-ironcore/Chart.yaml b/helm/bundles/cortex-ironcore/Chart.yaml index 14cba8c87..5ffa84cbe 100644 --- a/helm/bundles/cortex-ironcore/Chart.yaml +++ b/helm/bundles/cortex-ironcore/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-ironcore description: A Helm chart deploying Cortex for IronCore. 
type: application -version: 0.0.71 +version: 0.0.72 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.58 + version: 0.0.59 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/bundles/cortex-manila/Chart.yaml b/helm/bundles/cortex-manila/Chart.yaml index faa77fdad..3ad0a358e 100644 --- a/helm/bundles/cortex-manila/Chart.yaml +++ b/helm/bundles/cortex-manila/Chart.yaml @@ -5,23 +5,23 @@ apiVersion: v2 name: cortex-manila description: A Helm chart deploying Cortex for Manila. type: application -version: 0.0.71 +version: 0.0.72 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres - name: cortex-postgres repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.6.3 + version: 0.6.4 # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.58 + version: 0.0.59 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.58 + version: 0.0.59 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-nova/Chart.yaml b/helm/bundles/cortex-nova/Chart.yaml index e5a9efcf4..e5b8da70f 100644 --- a/helm/bundles/cortex-nova/Chart.yaml +++ b/helm/bundles/cortex-nova/Chart.yaml @@ -5,23 +5,23 @@ apiVersion: v2 name: cortex-nova description: A Helm chart deploying Cortex for Nova. 
type: application -version: 0.0.71 +version: 0.0.72 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex-postgres - name: cortex-postgres repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.6.3 + version: 0.6.4 # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.58 + version: 0.0.59 alias: cortex-knowledge-controllers # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.58 + version: 0.0.59 alias: cortex-scheduling-controllers # Owner info adds a configmap to the kubernetes cluster with information on diff --git a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml index 8b6f803ab..3fbb6f59e 100644 --- a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml +++ b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml @@ -624,6 +624,7 @@ spec: VM allocations and all reservation types are ignored to represent an empty datacenter scenario. 
params: + - {key: ignoreAllocations, boolValue: true} - {key: ignoredReservationTypes, stringListValue: ["CommittedResourceReservation", "FailoverReservation"]} - name: filter_has_requested_traits description: | diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index 3ffd30f0f..4a194ae50 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -173,6 +173,10 @@ cortex-scheduling-controllers: requeueIntervalRetry: "1m" # Maximum back-off interval cap for the exponential retry delay maxRequeueInterval: "30m" + # Pause between consecutive Reservation CRD creates to spread scheduler load; 0 disables + slotCreationDelay: "0ms" + # Max Reservation CRDs per CommittedResource on the API path; 0 disables the limit + maxSlotsPerCommitment: 0 committedResourceAPI: # Timeout for watching CommittedResource CRDs before rolling back watchTimeout: "15s" diff --git a/helm/bundles/cortex-pods/Chart.yaml b/helm/bundles/cortex-pods/Chart.yaml index 223126d9e..822ca5942 100644 --- a/helm/bundles/cortex-pods/Chart.yaml +++ b/helm/bundles/cortex-pods/Chart.yaml @@ -5,13 +5,13 @@ apiVersion: v2 name: cortex-pods description: A Helm chart deploying Cortex for Pods. type: application -version: 0.0.71 +version: 0.0.72 appVersion: 0.1.0 dependencies: # from: file://../../library/cortex - name: cortex repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.0.58 + version: 0.0.59 # Owner info adds a configmap to the kubernetes cluster with information on # the service owner. This makes it easier to find out who to contact in case diff --git a/helm/library/cortex-postgres/Chart.yaml b/helm/library/cortex-postgres/Chart.yaml index 0113c7b63..445045d3c 100644 --- a/helm/library/cortex-postgres/Chart.yaml +++ b/helm/library/cortex-postgres/Chart.yaml @@ -5,5 +5,5 @@ apiVersion: v2 name: cortex-postgres description: Postgres setup for Cortex. 
type: application -version: 0.6.3 -appVersion: "sha-b012ae82" +version: 0.6.4 +appVersion: "sha-8cc792c5" diff --git a/helm/library/cortex/Chart.yaml b/helm/library/cortex/Chart.yaml index e6c9ce416..ebb5f4f49 100644 --- a/helm/library/cortex/Chart.yaml +++ b/helm/library/cortex/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: cortex description: A Helm chart to distribute cortex. type: application -version: 0.0.58 -appVersion: "sha-71f4552a" +version: 0.0.59 +appVersion: "sha-6bc914a6" icon: "https://example.com/icon.png" dependencies: [] diff --git a/internal/knowledge/datasources/plugins/openstack/nova/nova_api.go b/internal/knowledge/datasources/plugins/openstack/nova/nova_api.go index 03298e9db..906a46a1e 100644 --- a/internal/knowledge/datasources/plugins/openstack/nova/nova_api.go +++ b/internal/knowledge/datasources/plugins/openstack/nova/nova_api.go @@ -20,15 +20,18 @@ import ( "github.com/gophercloud/gophercloud/v2/openstack" "github.com/gophercloud/gophercloud/v2/openstack/compute/v2/aggregates" "github.com/gophercloud/gophercloud/v2/openstack/compute/v2/flavors" + "github.com/gophercloud/gophercloud/v2/openstack/compute/v2/servers" glanceimages "github.com/gophercloud/gophercloud/v2/openstack/image/v2/images" "github.com/gophercloud/gophercloud/v2/pagination" "github.com/prometheus/client_golang/prometheus" + "github.com/sapcc/go-bits/liquidapi" ) type NovaAPI interface { // Init the nova API. Init(ctx context.Context) error // Get all nova servers that are NOT deleted. (Includes ERROR, SHUTOFF etc) + // For KVM flavors, os_type is probed using the OSTypeProber. GetAllServers(ctx context.Context) ([]Server, error) // Get all deleted nova servers since the timestamp. GetDeletedServers(ctx context.Context, since time.Time) ([]DeletedServer, error) @@ -56,6 +59,8 @@ type novaAPI struct { sc *gophercloud.ServiceClient // Authenticated Glance image service client (only used for NovaDatasourceTypeImages). 
glance *gophercloud.ServiceClient + // OS type prober for determining VM operating system type (only for NovaDatasourceTypeServers). + osTypeProber *liquidapi.OSTypeProber } func NewNovaAPI(mon datasources.Monitor, k keystone.KeystoneClient, conf v1alpha1.NovaDatasource) NovaAPI { @@ -95,6 +100,11 @@ func (api *novaAPI) Init(ctx context.Context) error { } api.glance = glanceClient } + // Initialize the OS type prober only for the servers datasource. + if api.conf.Type == v1alpha1.NovaDatasourceTypeServers { + eo := gophercloud.EndpointOpts{Availability: gophercloud.Availability(sameAsKeystone)} + api.osTypeProber = initOSTypeProber(provider, eo) + } return nil } @@ -157,10 +167,56 @@ func (api *novaAPI) GetAllServers(ctx context.Context) ([]Server, error) { } } + // Probe OS type for KVM servers. + api.probeOSTypes(ctx, allServers) + slog.Info("fetched", "label", label, "count", len(allServers)) return allServers, nil } +// probeOSTypes determines the OS type for all KVM servers sequentially. +// The prober caches by image ID internally, so repeated images are instant. +func (api *novaAPI) probeOSTypes(ctx context.Context, allServers []Server) { + if api.osTypeProber == nil { + slog.Info("os_type prober not initialized, skipping") + return + } + var probed, resolved, unknown, rootdiskMissing int + for i := range allServers { + if isKVMFlavor(allServers[i].FlavorName) { + probed++ + osType := api.probeOSType(ctx, allServers[i]) + switch osType { + case "unknown": + unknown++ + case "rootdisk-missing": + rootdiskMissing++ + default: + if osType != "" { + resolved++ + } + } + allServers[i].OSType = osType + } + } + slog.Info("probed os_type for KVM servers", + "total", len(allServers), + "kvm", probed, + "resolved", resolved, + "unknown", unknown, + "rootdiskMissing", rootdiskMissing, + ) +} + +// probeOSType determines the OS type for a single server. 
+func (api *novaAPI) probeOSType(ctx context.Context, s Server) string { + var imageMap map[string]any + if s.ImageRef != "" { + imageMap = map[string]any{"id": s.ImageRef} + } + return api.osTypeProber.Get(ctx, servers.Server{ID: s.ID, Image: imageMap}) +} + // Get all deleted Nova servers. // Note on Nova terminology: Nova uses "instance" internally in its database and code, // but exposes these as "server" objects through the public API. @@ -522,3 +578,19 @@ func deriveOSType(properties map[string]any, tags []string) string { } return "unknown" } + +// initOSTypeProber safely creates an OSTypeProber, returning nil on any error or panic. +func initOSTypeProber(provider *gophercloud.ProviderClient, eo gophercloud.EndpointOpts) (prober *liquidapi.OSTypeProber) { + defer func() { + if r := recover(); r != nil { + slog.Warn("panic during OS type prober initialization - os_type will be empty", "panic", r) + prober = nil + } + }() + p, err := liquidapi.NewOSTypeProber(provider, eo) + if err != nil { + slog.Warn("failed to initialize OS type prober - os_type will be empty", "error", err) + return nil + } + return p +} diff --git a/internal/knowledge/datasources/plugins/openstack/nova/nova_sync.go b/internal/knowledge/datasources/plugins/openstack/nova/nova_sync.go index a2c466c42..ec0533f84 100644 --- a/internal/knowledge/datasources/plugins/openstack/nova/nova_sync.go +++ b/internal/knowledge/datasources/plugins/openstack/nova/nova_sync.go @@ -5,6 +5,7 @@ package nova import ( "context" + "strings" "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" @@ -75,6 +76,11 @@ func (s *NovaSyncer) Sync(ctx context.Context) (int64, error) { return nResults, err } +// isKVMFlavor returns true if the flavor name indicates a KVM-based VM. +func isKVMFlavor(flavorName string) bool { + return strings.Contains(flavorName, "_k_") +} + // Sync all the active OpenStack servers into the database. (Includes ERROR, SHUTOFF, etc. 
state) func (s *NovaSyncer) SyncAllServers(ctx context.Context) (int64, error) { allServers, err := s.API.GetAllServers(ctx) diff --git a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go index 5fef71d6e..17c422194 100644 --- a/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go +++ b/internal/knowledge/datasources/plugins/openstack/nova/nova_types.go @@ -116,6 +116,10 @@ type Server struct { // Empty string for volume-booted servers. ImageRef string `json:"-" db:"image_ref"` + // OSType is the operating system type determined by the OSTypeProber at sync time. + // Only populated for KVM servers (flavor name contains "_k_"). + OSType string `json:"-" db:"os_type"` + // From nested server.fault JSON // The error response code. @@ -234,7 +238,7 @@ func (s *Server) MarshalJSON() ([]byte, error) { } // Table in which the openstack model is stored. -func (Server) TableName() string { return "openstack_servers_v3" } +func (Server) TableName() string { return "openstack_servers_v4" } // Index for the openstack model. 
func (Server) Indexes() map[string][]string { return nil } diff --git a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql index 56b20a980..cf2f3ca50 100644 --- a/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql +++ b/internal/knowledge/extractor/plugins/compute/libvirt_domain_cpu_steal_pct.sql @@ -3,6 +3,6 @@ SELECT os.os_ext_srv_attr_host AS host, MAX(value) AS max_steal_time_pct FROM kvm_libvirt_domain_metrics kvm -JOIN openstack_servers_v3 os ON os.os_ext_srv_attr_instance_name = kvm.domain +JOIN openstack_servers_v4 os ON os.os_ext_srv_attr_instance_name = kvm.domain WHERE kvm.name = 'kvm_libvirt_domain_steal_pct' AND os.id IS NOT NULL GROUP BY os.os_ext_srv_attr_host, os.id; \ No newline at end of file diff --git a/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql b/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql index 190f2da19..69987328e 100644 --- a/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql +++ b/internal/knowledge/extractor/plugins/compute/vm_host_residency.sql @@ -21,7 +21,7 @@ WITH durations AS ( )) AS BIGINT) ) AS duration FROM openstack_migrations AS migrations - LEFT JOIN openstack_servers_v3 AS servers ON servers.id = migrations.instance_uuid + LEFT JOIN openstack_servers_v4 AS servers ON servers.id = migrations.instance_uuid LEFT JOIN openstack_flavors_v2 AS flavors ON flavors.name = servers.flavor_name ) SELECT diff --git a/internal/knowledge/extractor/plugins/compute/vm_life_span.sql b/internal/knowledge/extractor/plugins/compute/vm_life_span.sql index 38b8762ba..ba2fc4d27 100644 --- a/internal/knowledge/extractor/plugins/compute/vm_life_span.sql +++ b/internal/knowledge/extractor/plugins/compute/vm_life_span.sql @@ -13,7 +13,7 @@ running_servers AS ( EXTRACT(EPOCH FROM (NOW()::timestamp - servers.created::timestamp))::BIGINT AS duration, 
COALESCE(flavors.name, 'unknown')::TEXT AS flavor_name, false::BOOLEAN AS deleted - FROM openstack_servers_v3 servers + FROM openstack_servers_v4 servers LEFT JOIN openstack_flavors_v2 flavors ON flavors.name = servers.flavor_name WHERE servers.created IS NOT NULL ) diff --git a/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql b/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql index 21f3104fd..39705d585 100644 --- a/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql +++ b/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_resolver.sql @@ -3,5 +3,5 @@ SELECT DISTINCT m.hostsystem AS vrops_hostsystem, s.os_ext_srv_attr_host AS nova_compute_host FROM vrops_vm_metrics m -LEFT JOIN openstack_servers_v3 s ON m.instance_uuid = s.id +LEFT JOIN openstack_servers_v4 s ON m.instance_uuid = s.id WHERE s.os_ext_srv_attr_host IS NOT NULL; diff --git a/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql b/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql index 850cbbca1..e539263e4 100644 --- a/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql +++ b/internal/knowledge/extractor/plugins/compute/vrops_project_noisiness.sql @@ -19,7 +19,7 @@ host_cpu_usage AS ( s.tenant_id, h.service_host, AVG(p.avg_cpu) AS avg_cpu_of_project - FROM openstack_servers_v3 s + FROM openstack_servers_v4 s JOIN vrops_vm_metrics m ON s.id = m.instance_uuid JOIN projects_avg_cpu p ON s.tenant_id = p.tenant_id JOIN openstack_hypervisors h ON s.os_ext_srv_attr_hypervisor_hostname = h.hostname diff --git a/internal/scheduling/reservations/capacity/controller.go b/internal/scheduling/reservations/capacity/controller.go index 1746a3377..8f7992ca1 100644 --- a/internal/scheduling/reservations/capacity/controller.go +++ b/internal/scheduling/reservations/capacity/controller.go @@ -162,8 +162,8 @@ func (c *Controller) reconcileOne( cur := 
existingByName[flavor.Name] cur.FlavorName = flavor.Name - totalVMSlots, totalHosts, totalErr := c.probeScheduler(ctx, flavor, az, c.config.TotalPipeline, hvByName) - placeableVMs, placeableHosts, placeableErr := c.probeScheduler(ctx, flavor, az, c.config.PlaceablePipeline, hvByName) + totalVMSlots, totalHosts, totalErr := c.probeScheduler(ctx, flavor, az, c.config.TotalPipeline, hvByName, true) + placeableVMs, placeableHosts, placeableErr := c.probeScheduler(ctx, flavor, az, c.config.PlaceablePipeline, hvByName, false) if totalErr != nil { allFresh = false @@ -257,11 +257,15 @@ func (c *Controller) reconcileOne( // probeScheduler calls the scheduler with the given pipeline and returns VM slots + host count. // Capacity is computed as sum of floor(hostMemory / flavorMemory) across returned hosts. +// When ignoreAllocations is true (total/empty-datacenter probe), raw effective capacity is used. +// When false (placeable probe), hv.Status.Allocation is subtracted first so that slots reflect +// remaining capacity after running VMs. 
func (c *Controller) probeScheduler( ctx context.Context, flavor compute.FlavorInGroup, az, pipeline string, hvByName map[string]hv1.Hypervisor, + ignoreAllocations bool, ) (capacity, hosts int64, err error) { flavorBytes := int64(flavor.MemoryMB) * 1024 * 1024 //nolint:gosec @@ -309,7 +313,16 @@ func (c *Controller) probeScheduler( if !ok { continue } - if capBytes := memCap.Value(); capBytes > 0 { + capBytes := memCap.Value() + if !ignoreAllocations { + if alloc, ok := hv.Status.Allocation[hv1.ResourceMemory]; ok { + capBytes -= alloc.Value() + } + if capBytes < 0 { + capBytes = 0 + } + } + if capBytes > 0 { capacity += capBytes / flavorBytes } } diff --git a/internal/scheduling/reservations/capacity/controller_test.go b/internal/scheduling/reservations/capacity/controller_test.go index 69a4e80bb..8938b8564 100644 --- a/internal/scheduling/reservations/capacity/controller_test.go +++ b/internal/scheduling/reservations/capacity/controller_test.go @@ -429,7 +429,7 @@ func TestProbeScheduler_CapacityCalculation(t *testing.T) { } flavor := compute.FlavorInGroup{Name: "test-flavor", MemoryMB: memMB} - capacity, hosts, err := c.probeScheduler(context.Background(), flavor, "az-a", "test-pipeline", hvByName) + capacity, hosts, err := c.probeScheduler(context.Background(), flavor, "az-a", "test-pipeline", hvByName, true) if err != nil { t.Fatalf("probeScheduler failed: %v", err) } @@ -442,6 +442,49 @@ func TestProbeScheduler_CapacityCalculation(t *testing.T) { } } +// TestProbeScheduler_SubtractsAllocationsWhenNotIgnored verifies that placeable-probe slot +// counting uses remaining capacity (effectiveCapacity − allocation) while the total-probe uses +// raw capacity. This is the regression test for the bug where both probes used raw capacity, +// making running VMs invisible in the usage = total − placeable calculation. 
+func TestProbeScheduler_SubtractsAllocationsWhenNotIgnored(t *testing.T) { + const memMB = 4096 + const memBytes = int64(memMB) * 1024 * 1024 + + scheme := newTestScheme(t) + + // Host has 2-slot capacity (2 × flavor), with 1 slot already used by a running VM. + hv := newHypervisor("host-1", "az-a", memBytes*2) + hv.Status.Allocation = map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: *resource.NewQuantity(memBytes, resource.BinarySI), + } + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() + srv := newMockSchedulerServer(t, []string{"host-1"}) + defer srv.Close() + + c := NewController(fakeClient, Config{SchedulerURL: srv.URL}) + hvByName := map[string]hv1.Hypervisor{"host-1": *hv} + flavor := compute.FlavorInGroup{Name: "test-flavor", MemoryMB: memMB} + + // Total probe (ignoreAllocations=true): raw capacity → 2 slots. + totalCap, _, err := c.probeScheduler(context.Background(), flavor, "az-a", "total-pipeline", hvByName, true) + if err != nil { + t.Fatalf("probeScheduler (total) failed: %v", err) + } + if totalCap != 2 { + t.Errorf("total capacity = %d, want 2 (raw slots)", totalCap) + } + + // Placeable probe (ignoreAllocations=false): capacity − allocation → 1 slot. 
+ placeableCap, _, err := c.probeScheduler(context.Background(), flavor, "az-a", "placeable-pipeline", hvByName, false) + if err != nil { + t.Fatalf("probeScheduler (placeable) failed: %v", err) + } + if placeableCap != 1 { + t.Errorf("placeable capacity = %d, want 1 (remaining slot after running VM)", placeableCap) + } +} + func TestReconcileAll_MultipleGroupsAndAZs(t *testing.T) { scheme := newTestScheme(t) diff --git a/internal/scheduling/reservations/commitments/committed_resource_controller.go b/internal/scheduling/reservations/commitments/committed_resource_controller.go index 657a9b22d..235cf6cdc 100644 --- a/internal/scheduling/reservations/commitments/committed_resource_controller.go +++ b/internal/scheduling/reservations/commitments/committed_resource_controller.go @@ -5,6 +5,7 @@ package commitments import ( "context" + "errors" "fmt" "time" @@ -34,8 +35,9 @@ const ( // CommittedResourceController reconciles CommittedResource CRDs and owns all child Reservation CRUD. type CommittedResourceController struct { client.Client - Scheme *runtime.Scheme - Conf CommittedResourceControllerConfig + Scheme *runtime.Scheme + Conf CommittedResourceControllerConfig + Monitor *CRControllerMonitor } func (r *CommittedResourceController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { @@ -313,8 +315,17 @@ func (r *CommittedResourceController) applyReservationState(ctx context.Context, state.CreatorRequestID = reservations.GlobalRequestIDFromContext(ctx) state.ParentGeneration = cr.Generation - result, err := NewReservationManager(r.Client).ApplyCommitmentState(ctx, logger, state, flavorGroups, "committed-resource-controller") + mgr := NewReservationManager(r.Client) + mgr.SlotCreationDelay = r.Conf.SlotCreationDelay.Duration + if cr.Spec.AllowRejection { + mgr.MaxSlots = r.Conf.MaxSlotsPerCommitment + } + result, err := mgr.ApplyCommitmentState(ctx, logger, state, flavorGroups, "committed-resource-controller") if err != nil { + var limitErr 
*SlotLimitExceededError + if errors.As(err, &limitErr) && r.Monitor != nil { + r.Monitor.RecordSlotLimitRejection(cr.Spec.FlavorGroupName, cr.Spec.AvailabilityZone) + } return nil, err } logger.Info("commitment state applied", "created", result.Created, "deleted", result.Deleted, "repaired", result.Repaired) diff --git a/internal/scheduling/reservations/commitments/committed_resource_controller_monitor.go b/internal/scheduling/reservations/commitments/committed_resource_controller_monitor.go index 116764f7f..57c6dbd77 100644 --- a/internal/scheduling/reservations/commitments/committed_resource_controller_monitor.go +++ b/internal/scheduling/reservations/commitments/committed_resource_controller_monitor.go @@ -23,8 +23,9 @@ var crControllerMonitorLog = ctrl.Log.WithName("committed-resource-controller-mo // after a failure. API-originated dry-run probes (AllowRejection=true) are excluded. // The metric is absent for a given label set when no CRs match — absence means zero. type CRControllerMonitor struct { - client client.Client - unfulfilled *prometheus.Desc + client client.Client + unfulfilled *prometheus.Desc + slotLimitRejections *prometheus.CounterVec } func NewCRControllerMonitor(c client.Client) CRControllerMonitor { @@ -36,12 +37,22 @@ func NewCRControllerMonitor(c client.Client) CRControllerMonitor { []string{"flavor_group", "resource_type", "availability_zone"}, nil, ), + slotLimitRejections: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_committed_resource_slot_limit_rejections_total", + Help: "Number of times a commitment was rejected because the requested slot count exceeded the configured limit.", + }, []string{"flavor_group", "availability_zone"}), } } +// RecordSlotLimitRejection increments the slot-limit rejection counter for the given flavor group and AZ. 
+func (m *CRControllerMonitor) RecordSlotLimitRejection(flavorGroup, az string) { + m.slotLimitRejections.WithLabelValues(flavorGroup, az).Inc() +} + // Describe implements prometheus.Collector. func (m *CRControllerMonitor) Describe(ch chan<- *prometheus.Desc) { ch <- m.unfulfilled + m.slotLimitRejections.Describe(ch) } // Collect implements prometheus.Collector. Lists all CommittedResource CRDs and counts @@ -80,4 +91,5 @@ func (m *CRControllerMonitor) Collect(ch chan<- prometheus.Metric) { k.flavorGroup, k.resourceType, k.az, ) } + m.slotLimitRejections.Collect(ch) } diff --git a/internal/scheduling/reservations/commitments/config.go b/internal/scheduling/reservations/commitments/config.go index c53ffd55e..8e18c2f9f 100644 --- a/internal/scheduling/reservations/commitments/config.go +++ b/internal/scheduling/reservations/commitments/config.go @@ -79,24 +79,18 @@ type CommittedResourceControllerConfig struct { // MaxRequeueInterval caps the exponential backoff delay. // Once this ceiling is reached, every subsequent retry fires after exactly this interval. MaxRequeueInterval metav1.Duration `json:"maxRequeueInterval"` -} - -func DefaultCommittedResourceControllerConfig() CommittedResourceControllerConfig { - return CommittedResourceControllerConfig{ - RequeueIntervalRetry: metav1.Duration{Duration: 30 * time.Second}, - MaxRequeueInterval: metav1.Duration{Duration: 30 * time.Minute}, - } -} -// ApplyDefaults fills in zero-value fields from the defaults, leaving explicitly configured values intact. -func (c *CommittedResourceControllerConfig) ApplyDefaults() { - d := DefaultCommittedResourceControllerConfig() - if c.RequeueIntervalRetry.Duration == 0 { - c.RequeueIntervalRetry = d.RequeueIntervalRetry - } - if c.MaxRequeueInterval.Duration == 0 { - c.MaxRequeueInterval = d.MaxRequeueInterval - } + // SlotCreationDelay is the pause inserted between consecutive Reservation CRD creates. + // Spreads scheduler calls over time instead of bursting them all at once. 
+ // 0 disables the delay. + SlotCreationDelay metav1.Duration `json:"slotCreationDelay"` + + // MaxSlotsPerCommitment caps the number of Reservation CRDs that may be created for a single + // CommittedResource on the AllowRejection=true (API) path. Requests that would exceed this + // limit are rejected immediately before any slots are created. + // Has no effect on the AllowRejection=false (syncer) path. + // 0 disables the cap. + MaxSlotsPerCommitment int `json:"maxSlotsPerCommitment"` } // ResourceTypeConfig holds per-resource flags for a single resource type within a flavor group. diff --git a/internal/scheduling/reservations/commitments/reservation_manager.go b/internal/scheduling/reservations/commitments/reservation_manager.go index 23316fdb8..01f3b2f74 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager.go +++ b/internal/scheduling/reservations/commitments/reservation_manager.go @@ -6,6 +6,7 @@ package commitments import ( "context" "fmt" + "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" @@ -34,9 +35,28 @@ type ApplyResult struct { RemovedReservations []v1alpha1.Reservation } +// SlotLimitExceededError is returned by ApplyCommitmentState when the number of new reservation +// slots would exceed MaxSlots. It is a distinct type so callers can detect and metric it separately +// from ordinary capacity failures. +type SlotLimitExceededError struct { + NewSlots int + Limit int +} + +func (e *SlotLimitExceededError) Error() string { + return fmt.Sprintf("commitment would create %d new reservation slots, exceeds limit of %d", e.NewSlots, e.Limit) +} + // ReservationManager handles CRUD operations for Reservation CRDs. type ReservationManager struct { client.Client + // SlotCreationDelay adds a pause between consecutive Reservation CRD creates to spread + // scheduler load across time rather than bursting all creates at once. 
+ SlotCreationDelay time.Duration + // MaxSlots caps the total number of Reservation CRDs for a single commitment. + // When non-zero, ApplyCommitmentState returns an error if the desired slot count would + // exceed this limit. Only set by the caller on the AllowRejection=true (API) path. + MaxSlots int } func NewReservationManager(k8sClient client.Client) *ReservationManager { @@ -176,6 +196,17 @@ func (m *ReservationManager) ApplyCommitmentState( } // Phase 5 (CREATE): Create new reservations (capacity increased) + if deltaMemoryBytes > 0 { + newSlots := countNewSlots(deltaMemoryBytes, flavorGroup) + if m.MaxSlots > 0 && newSlots > m.MaxSlots { + return nil, &SlotLimitExceededError{NewSlots: newSlots, Limit: m.MaxSlots} + } + log.Info("creating reservation slots", + "commitmentUUID", desiredState.CommitmentUUID, + "slots", newSlots, + "slotCreationDelay", m.SlotCreationDelay, + ) + } for deltaMemoryBytes > 0 { // Select the largest flavor that fits the remaining delta (flavors sorted descending by memory). reservation := m.newReservation(desiredState, nextSlotIndex, deltaMemoryBytes, flavorGroup, creator) @@ -196,6 +227,18 @@ func (m *ReservationManager) ApplyCommitmentState( } nextSlotIndex++ + + // Throttle: pause between consecutive creates to spread scheduler load. + // Skip after the last slot (deltaMemoryBytes <= 0) — no follow-up create to defer. + if m.SlotCreationDelay > 0 && deltaMemoryBytes > 0 { + timer := time.NewTimer(m.SlotCreationDelay) + select { + case <-ctx.Done(): + timer.Stop() + return result, ctx.Err() + case <-timer.C: + } + } } // Phase 6 (UPDATE): Sync metadata for remaining reservations @@ -274,6 +317,35 @@ func (m *ReservationManager) syncReservationMetadata( } } +// selectFlavor picks the largest flavor whose memory fits within deltaMemoryBytes. +// Returns the selected flavor and its memory in bytes. If no flavor fits, returns the +// smallest flavor with memoryBytes = deltaMemoryBytes (consumes the full remainder). 
+func selectFlavor(deltaMemoryBytes int64, flavorGroup compute.FlavorGroupFeature) (flavor compute.FlavorInGroup, memoryBytes int64) { + flavor = flavorGroup.Flavors[len(flavorGroup.Flavors)-1] + memoryBytes = deltaMemoryBytes + for _, f := range flavorGroup.Flavors { + flavorBytes := int64(f.MemoryMB) * 1024 * 1024 //nolint:gosec // flavor memory from specs, realistically bounded + if flavorBytes <= deltaMemoryBytes { + flavor = f + memoryBytes = flavorBytes + break + } + } + return +} + +// countNewSlots returns how many Reservation slots would be created to cover deltaMemoryBytes. +// Used to pre-check MaxSlots before creating any slots, so a limit violation never leaves partial state. +func countNewSlots(deltaMemoryBytes int64, flavorGroup compute.FlavorGroupFeature) int { + count := 0 + for deltaMemoryBytes > 0 { + _, memoryBytes := selectFlavor(deltaMemoryBytes, flavorGroup) + deltaMemoryBytes -= memoryBytes + count++ + } + return count +} + func (m *ReservationManager) newReservation( state *CommitmentState, slotIndex int, @@ -290,20 +362,9 @@ func (m *ReservationManager) newReservation( // Select largest flavor that fits remaining memory (flavors sorted descending by memory then vCPUs). // This works for both fixed and varying CPU:RAM ratio groups. 
- flavorInGroup := flavorGroup.Flavors[len(flavorGroup.Flavors)-1] // default to smallest - memoryBytes := deltaMemoryBytes + flavorInGroup, memoryBytes := selectFlavor(deltaMemoryBytes, flavorGroup) cpus := int64(flavorInGroup.VCPUs) //nolint:gosec // VCPUs from flavor specs, realistically bounded - for _, flavor := range flavorGroup.Flavors { - flavorMemoryBytes := int64(flavor.MemoryMB) * 1024 * 1024 //nolint:gosec // flavor memory from specs, realistically bounded - if flavorMemoryBytes <= deltaMemoryBytes { - flavorInGroup = flavor - memoryBytes = flavorMemoryBytes - cpus = int64(flavorInGroup.VCPUs) //nolint:gosec // VCPUs from flavor specs, realistically bounded - break - } - } - spec := v1alpha1.ReservationSpec{ Type: v1alpha1.ReservationTypeCommittedResource, SchedulingDomain: v1alpha1.SchedulingDomainNova, diff --git a/internal/scheduling/reservations/commitments/reservation_manager_test.go b/internal/scheduling/reservations/commitments/reservation_manager_test.go index e2ce2f26c..d890bd496 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager_test.go +++ b/internal/scheduling/reservations/commitments/reservation_manager_test.go @@ -76,6 +76,7 @@ func TestApplyCommitmentState(t *testing.T) { desiredAZ string desiredDomainID string flavorGroupOverride map[string]compute.FlavorGroupFeature // nil = testFlavorGroups() + maxSlots int // 0 = no limit wantError bool wantRemovedCount int // exact count; -1 = at least one validateRemoved func(t *testing.T, removed []v1alpha1.Reservation) @@ -317,6 +318,45 @@ func TestApplyCommitmentState(t *testing.T) { } }, }, + // ---------------------------------------------------------------- + // MaxSlots limit + // ---------------------------------------------------------------- + { + name: "max slots: rejects when new slots exceed limit", + desiredMemoryGiB: 56, // 32+16+8 = 3 slots with testFlavorGroup + maxSlots: 2, + wantError: true, + }, + { + name: "max slots: allows when new slots are within 
limit", + desiredMemoryGiB: 56, // 32+16+8 = 3 slots + maxSlots: 3, + validateTouched: func(t *testing.T, touched []v1alpha1.Reservation) { + if len(touched) != 3 { + t.Errorf("expected 3 slots created, got %d", len(touched)) + } + }, + }, + { + name: "max slots: only new slots counted, existing do not contribute", + existingSlots: []v1alpha1.Reservation{ + newTestCRSlot("commitment-abc123-0", 8, "", "test-group", nil), + newTestCRSlot("commitment-abc123-1", 8, "", "test-group", nil), + }, + desiredMemoryGiB: 56, // existing=16GiB, delta=40GiB → 32+8 = 2 new slots; maxSlots=2 allows it + maxSlots: 2, + validateTouched: func(t *testing.T, touched []v1alpha1.Reservation) { + created := 0 + for _, r := range touched { + if r.Name == "commitment-abc123-2" || r.Name == "commitment-abc123-3" { + created++ + } + } + if created != 2 { + t.Errorf("expected 2 new slots created, got %d touched: %v", created, touched) + } + }, + }, } scheme := newCRTestScheme(t) @@ -329,6 +369,7 @@ func TestApplyCommitmentState(t *testing.T) { } k8sClient := newCRTestClient(scheme, objects...) 
manager := NewReservationManager(k8sClient) + manager.MaxSlots = tt.maxSlots flavorGroups := testFlavorGroups() if tt.flavorGroupOverride != nil { @@ -534,3 +575,57 @@ func TestNewReservation_VariableRatioGroup_SelectsLargestByMemory(t *testing.T) }) } } + +// ============================================================================ +// Tests: selectFlavor +// ============================================================================ + +func TestSelectFlavor(t *testing.T) { + fg := testFlavorGroup() // small=8GiB/4c, medium=16GiB/8c, large=32GiB/16c + + tests := []struct { + name string + deltaGiB int64 + wantFlavor string + wantMemoryGiB int64 + }{ + { + name: "exact fit: picks that flavor", + deltaGiB: 8, + wantFlavor: "small", + wantMemoryGiB: 8, + }, + { + name: "delta between small and medium: picks small", + deltaGiB: 12, + wantFlavor: "small", + wantMemoryGiB: 8, + }, + { + name: "delta larger than all flavors: picks largest, memory = largest flavor size", + deltaGiB: 100, + wantFlavor: "large", + wantMemoryGiB: 32, + }, + { + name: "delta smaller than smallest flavor: falls back, memory = full delta", + deltaGiB: 3, + wantFlavor: "small", // smallest flavor returned as fallback + wantMemoryGiB: 3, // but memory = full delta (remainder consumed) + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + deltaBytes := tt.deltaGiB * 1024 * 1024 * 1024 + flavor, memoryBytes := selectFlavor(deltaBytes, fg) + if flavor.Name != tt.wantFlavor { + t.Errorf("flavor: want %s, got %s", tt.wantFlavor, flavor.Name) + } + wantBytes := tt.wantMemoryGiB * 1024 * 1024 * 1024 + if memoryBytes != wantBytes { + t.Errorf("memoryBytes: want %d, got %d", wantBytes, memoryBytes) + } + }) + } +} diff --git a/internal/scheduling/reservations/commitments/usage.go b/internal/scheduling/reservations/commitments/usage.go index a6d360bd1..fb9984a75 100644 --- a/internal/scheduling/reservations/commitments/usage.go +++ 
b/internal/scheduling/reservations/commitments/usage.go @@ -707,10 +707,9 @@ func (c *dbUsageClient) ListProjectVMs(ctx context.Context, projectID string) ([ COALESCE(f.vcpus, 0) AS flavor_vcpus, COALESCE(f.disk, 0) AS flavor_disk, COALESCE(f.extra_specs, '') AS flavor_extras, - COALESCE(NULLIF(i.os_type, ''), 'unknown') AS os_type + COALESCE(NULLIF(s.os_type, ''), 'unknown') AS os_type FROM ` + nova.Server{}.TableName() + ` s LEFT JOIN ` + nova.Flavor{}.TableName() + ` f ON f.name = s.flavor_name - LEFT JOIN ` + nova.Image{}.TableName() + ` i ON i.id = s.image_ref WHERE s.tenant_id = $1` var rows []vmQueryRow diff --git a/postgres/rebuild-trigger b/postgres/rebuild-trigger new file mode 100644 index 000000000..0b6201c6a --- /dev/null +++ b/postgres/rebuild-trigger @@ -0,0 +1 @@ +26455164324