From 245301450cbb8779671c3ce1215665d1d82fcf94 Mon Sep 17 00:00:00 2001
From: tusuii <tusuii764@gmail.com>
Date: Sat, 28 Feb 2026 00:10:04 +0530
Subject: [PATCH] fix: use maxSurge=0 rolling update to avoid CPU pressure on
 small cluster
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

During rolling updates with the default strategy (maxSurge=25%, which rounds
up to 1 pod for 2 replicas), an extra surge pod was created temporarily
(3 pods instead of 2), causing all 3 nodes to report "Insufficient CPU" and
delaying scheduling past the Jenkins rollout timeout.

With maxSurge=0 / maxUnavailable=1, one old pod is terminated before its
replacement is created — the pod count never exceeds 2, so no extra CPU is
needed. The trade-off is that only 1 pod is available while each
replacement starts.

Also increase the Jenkins rollout timeouts for backend (300s) and frontend
(180s) to 600s as a safety net for CPU-constrained nodes that may still
need extra scheduling time.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 Jenkinsfile                       | 12 +++++-------
 k8s/base/backend/deployment.yaml  |  5 +++++
 k8s/base/frontend/deployment.yaml |  5 +++++
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 16dd557..1208721 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -137,16 +137,14 @@ pipeline {
                     // Show pod state immediately after apply so we can see pull/init status in logs
                     sh "kubectl get pods -n scrum-manager -o wide"
 
-                    // MySQL uses Recreate strategy: old pod terminates (~30s) before
-                    // new pod starts. Readiness probe initialDelaySeconds=30 + up to
-                    // 10 retries × 5s = 80s. Total worst-case: ~110s → 300s is safe.
+                    // MySQL uses Recreate strategy: old pod terminates then new starts.
                     sh "kubectl rollout status deployment/mysql   -n scrum-manager --timeout=300s"
 
-                    // Backend initContainer sleeps 15s after MySQL TCP is up before
-                    // starting the Node process. 512Mi memory limit avoids OOMKill.
-                    sh "kubectl rollout status deployment/backend  -n scrum-manager --timeout=300s"
+                    // maxSurge=0: old pod terminates first, new pod starts after.
+                    // CPU-constrained nodes may delay scheduling — 600s covers this.
+                    sh "kubectl rollout status deployment/backend  -n scrum-manager --timeout=600s"
 
-                    sh "kubectl rollout status deployment/frontend -n scrum-manager --timeout=180s"
+                    sh "kubectl rollout status deployment/frontend -n scrum-manager --timeout=600s"
 
                     echo "All deployments rolled out."
                 }
diff --git a/k8s/base/backend/deployment.yaml b/k8s/base/backend/deployment.yaml
index 1d6b3eb..bf11394 100644
--- a/k8s/base/backend/deployment.yaml
+++ b/k8s/base/backend/deployment.yaml
@@ -7,6 +7,11 @@ metadata:
     app.kubernetes.io/component: api
 spec:
   replicas: 2
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 0       # Don't create extra pods during update — avoids CPU pressure
+      maxUnavailable: 1 # Terminate one old pod first, then start new one
   selector:
     matchLabels:
       app.kubernetes.io/name: backend
diff --git a/k8s/base/frontend/deployment.yaml b/k8s/base/frontend/deployment.yaml
index f435d99..4c59fe3 100644
--- a/k8s/base/frontend/deployment.yaml
+++ b/k8s/base/frontend/deployment.yaml
@@ -7,6 +7,11 @@ metadata:
     app.kubernetes.io/component: web
 spec:
   replicas: 2
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 0       # Don't create extra pods during update — avoids CPU pressure
+      maxUnavailable: 1 # Terminate one old pod first, then start new one
   selector:
     matchLabels:
       app.kubernetes.io/name: frontend