From 55287c6f1d1dd6a11cf5ab4cc3792d4bb2f65062 Mon Sep 17 00:00:00 2001
From: tusuii <tusuii764@gmail.com>
Date: Fri, 27 Feb 2026 23:24:19 +0530
Subject: [PATCH] fix: increase backend memory limit and add rollout failure
 diagnostics

The backend was OOMKilled while starting up during a rolling update:
Node.js + Socket.io + the MySQL pool together exceed the 256Mi limit.
Raised the memory limit from 256Mi to 512Mi and the memory request from
128Mi to 256Mi.

Jenkinsfile: run kubectl get pods immediately after apply so the pod
state is visible in the build log. On failure, the post block now prints
pod status, describe output for the backend, frontend, and MySQL pods,
and the last 50 backend log lines, so the root cause of a future rollout
failure can be found without needing to SSH into the cluster.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 Jenkinsfile                      | 24 ++++++++++++++++++++++--
 k8s/base/backend/deployment.yaml |  4 ++--
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 43ee608..0425224 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -115,13 +115,16 @@ pipeline {
                 withKubeConfig([credentialsId: "${K8S_CRED_ID}"]) {
                     sh "kubectl apply -k ${K8S_OVERLAY}"
 
+                    // Show pod state immediately after apply so we can see pull/init status in logs
+                    sh "kubectl get pods -n scrum-manager -o wide"
+
                     // MySQL uses Recreate strategy: old pod terminates (~30s) before
                     // new pod starts. Readiness probe initialDelaySeconds=30 + up to
                     // 10 retries × 5s = 80s. Total worst-case: ~110s → 300s is safe.
                     sh "kubectl rollout status deployment/mysql   -n scrum-manager --timeout=300s"
 
                     // Backend initContainer sleeps 15s after MySQL TCP is up before
-                    // starting the Node process. 300s covers slow-start scenarios.
+                    // starting the Node process. 300s covers slow starts; 512Mi limit avoids OOMKill.
                     sh "kubectl rollout status deployment/backend  -n scrum-manager --timeout=300s"
 
                     sh "kubectl rollout status deployment/frontend -n scrum-manager --timeout=180s"
@@ -170,7 +173,24 @@ pipeline {
             echo "✅  Build #${env.BUILD_NUMBER} deployed → http://scrum.local"
         }
         failure {
-            echo "❌  Pipeline failed. Check stage logs above."
+            withKubeConfig([credentialsId: "${K8S_CRED_ID}"]) {
+                sh """
+                    echo '=== Pod Status ==='
+                    kubectl get pods -n scrum-manager -o wide || true
+
+                    echo '=== Backend Pod Events ==='
+                    kubectl describe pods -l app.kubernetes.io/name=backend -n scrum-manager || true
+
+                    echo '=== Backend Logs (last 50 lines) ==='
+                    kubectl logs -l app.kubernetes.io/name=backend -n scrum-manager --tail=50 --all-containers=true || true
+
+                    echo '=== Frontend Pod Events ==='
+                    kubectl describe pods -l app.kubernetes.io/name=frontend -n scrum-manager || true
+
+                    echo '=== MySQL Pod Events ==='
+                    kubectl describe pods -l app.kubernetes.io/name=mysql -n scrum-manager || true
+                """
+            }
         }
         always {
             sh "docker logout ${HARBOR_URL} || true"
diff --git a/k8s/base/backend/deployment.yaml b/k8s/base/backend/deployment.yaml
index 8406612..31645d0 100644
--- a/k8s/base/backend/deployment.yaml
+++ b/k8s/base/backend/deployment.yaml
@@ -64,10 +64,10 @@ spec:
           resources:
             requests:
               cpu: 100m
-              memory: 128Mi
+              memory: 256Mi
             limits:
               cpu: 500m
-              memory: 256Mi
+              memory: 512Mi
           livenessProbe:
             httpGet:
               path: /api/health