From 55287c6f1d1dd6a11cf5ab4cc3792d4bb2f65062 Mon Sep 17 00:00:00 2001 From: tusuii Date: Fri, 27 Feb 2026 23:24:19 +0530 Subject: [PATCH] fix: increase backend memory limit and add rollout failure diagnostics Backend was OOMKilled during rolling update startup (Node.js + Socket.io + MySQL pool exceeds 256Mi). Raised limit to 512Mi and request to 256Mi. Jenkinsfile: show kubectl get pods immediately after apply so pod state is visible in build logs. Added full diagnostics (describe + logs) in post.failure block so the root cause of any future rollout failure is visible without needing to SSH into the cluster. Co-Authored-By: Claude Sonnet 4.6 --- Jenkinsfile | 24 ++++++++++++++++++++++-- k8s/base/backend/deployment.yaml | 4 ++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 43ee608..0425224 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -115,13 +115,16 @@ pipeline { withKubeConfig([credentialsId: "${K8S_CRED_ID}"]) { sh "kubectl apply -k ${K8S_OVERLAY}" + // Show pod state immediately after apply so we can see pull/init status in logs + sh "kubectl get pods -n scrum-manager -o wide" + // MySQL uses Recreate strategy: old pod terminates (~30s) before // new pod starts. Readiness probe initialDelaySeconds=30 + up to // 10 retries × 5s = 80s. Total worst-case: ~110s → 300s is safe. sh "kubectl rollout status deployment/mysql -n scrum-manager --timeout=300s" // Backend initContainer sleeps 15s after MySQL TCP is up before - // starting the Node process. 300s covers slow-start scenarios. + // starting the Node process. 300s covers slow-start scenarios; the 512Mi memory limit avoids OOMKill. sh "kubectl rollout status deployment/backend -n scrum-manager --timeout=300s" sh "kubectl rollout status deployment/frontend -n scrum-manager --timeout=180s" @@ -170,7 +173,24 @@ pipeline { echo "✅ Build #${env.BUILD_NUMBER} deployed → http://scrum.local" } failure { - echo "❌ Pipeline failed. Check stage logs above." 
+ withKubeConfig([credentialsId: "${K8S_CRED_ID}"]) { + sh """ + echo '=== Pod Status ===' + kubectl get pods -n scrum-manager -o wide || true + + echo '=== Backend Pod Events ===' + kubectl describe pods -l app.kubernetes.io/name=backend -n scrum-manager || true + + echo '=== Backend Logs (last 50 lines) ===' + kubectl logs -l app.kubernetes.io/name=backend -n scrum-manager --tail=50 --all-containers=true || true + + echo '=== Frontend Pod Events ===' + kubectl describe pods -l app.kubernetes.io/name=frontend -n scrum-manager || true + + echo '=== MySQL Pod Events ===' + kubectl describe pods -l app.kubernetes.io/name=mysql -n scrum-manager || true + """ + } } always { sh "docker logout ${HARBOR_URL} || true" diff --git a/k8s/base/backend/deployment.yaml b/k8s/base/backend/deployment.yaml index 8406612..31645d0 100644 --- a/k8s/base/backend/deployment.yaml +++ b/k8s/base/backend/deployment.yaml @@ -64,10 +64,10 @@ spec: resources: requests: cpu: 100m - memory: 128Mi + memory: 256Mi limits: cpu: 500m - memory: 256Mi + memory: 512Mi livenessProbe: httpGet: path: /api/health