fix: use maxSurge=0 rolling update to avoid CPU pressure on small cluster
Some checks failed
scrum-manager/pipeline/head There was a failure building this commit
During rolling updates with the default maxSurge=1, an extra surge pod was created temporarily (3 pods instead of 2), causing all 3 nodes to report "Insufficient CPU" and delaying scheduling past the Jenkins rollout timeout.

With maxSurge=0 / maxUnavailable=1, one old pod terminates before a new one starts, so the pod count stays at 2 throughout and no extra CPU is needed. Also increase the Jenkins rollout timeout from 300s to 600s as a safety net for CPU-constrained nodes that may still need extra scheduling time.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
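The pod-count arithmetic behind the strategy change can be sketched quickly (a sketch only; replica and surge values are taken from the message and diff above):

```shell
# During a rolling update the controller may run up to replicas + maxSurge pods
# and must keep at least replicas - maxUnavailable pods available.
replicas=2
default_peak=$(( replicas + 1 ))   # maxSurge=1: a third pod that a full cluster cannot schedule
new_peak=$(( replicas + 0 ))       # maxSurge=0: pod count never exceeds 2, no extra CPU requested
new_floor=$(( replicas - 1 ))      # maxUnavailable=1: one pod keeps serving while the other is replaced
echo "$default_peak $new_peak $new_floor"
```

The trade-off is capacity for schedulability: the update briefly runs on one pod instead of two, but it never asks the cluster for CPU it does not have.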
Jenkinsfile (12 lines changed)
@@ -137,16 +137,14 @@ pipeline {
         // Show pod state immediately after apply so we can see pull/init status in logs
         sh "kubectl get pods -n scrum-manager -o wide"

-        // MySQL uses Recreate strategy: old pod terminates (~30s) before
-        // new pod starts. Readiness probe initialDelaySeconds=30 + up to
-        // 10 retries × 5s = 80s. Total worst-case: ~110s → 300s is safe.
+        // MySQL uses Recreate strategy: old pod terminates then new starts.
         sh "kubectl rollout status deployment/mysql -n scrum-manager --timeout=300s"

-        // Backend initContainer sleeps 15s after MySQL TCP is up before
-        // starting the Node process. 512Mi memory limit avoids OOMKill.
-        sh "kubectl rollout status deployment/backend -n scrum-manager --timeout=300s"
+        // maxSurge=0: old pod terminates first, new pod starts after.
+        // CPU-constrained nodes may delay scheduling — 600s covers this.
+        sh "kubectl rollout status deployment/backend -n scrum-manager --timeout=600s"

-        sh "kubectl rollout status deployment/frontend -n scrum-manager --timeout=180s"
+        sh "kubectl rollout status deployment/frontend -n scrum-manager --timeout=600s"

         echo "All deployments rolled out."
     }

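The timeout budget in the removed MySQL comment works out as follows (a sketch using only the numbers stated in that comment, not part of the commit itself):

```shell
# Worst-case MySQL Recreate rollout, per the old Jenkinsfile comment:
# ~30s for the old pod to terminate, then initialDelaySeconds=30 before
# the first readiness probe, plus up to 10 retries at 5s intervals.
terminate=30
initial_delay=30
retries=10
period=5
worst_case=$(( terminate + initial_delay + retries * period ))
echo "${worst_case}s"   # comfortably inside the 300s rollout timeout
```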
@@ -7,6 +7,11 @@ metadata:
     app.kubernetes.io/component: api
 spec:
   replicas: 2
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 0        # Don't create extra pods during update — avoids CPU pressure
+      maxUnavailable: 1  # Terminate one old pod first, then start new one
   selector:
     matchLabels:
       app.kubernetes.io/name: backend

@@ -7,6 +7,11 @@ metadata:
     app.kubernetes.io/component: web
 spec:
   replicas: 2
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 0        # Don't create extra pods during update — avoids CPU pressure
+      maxUnavailable: 1  # Terminate one old pod first, then start new one
   selector:
     matchLabels:
       app.kubernetes.io/name: frontend