0-downtime collab deploys? (#8926)

Before this change, Kubernetes would send a SIGTERM to the old server before the new one was ready. Now it will wait. From my reading, it seems like startupProbe should not be necessary if we have a readinessProbe, but from testing it seems that without startupProbe we still drop requests when using `rollout restart`.

Release Notes:

- Fixed connectivity issues during Zed deploys.
2025-02-12 05:27:07 +00:00 · 2024-03-05 21:58:00 -07:00 · 2024-03-05 21:58:00 -07:00 · 6d53846824
commit 6d53846824
parent 01e5e4224a
2 changed files with 16 additions and 1 deletions
--- a/crates/collab/k8s/collab.template.yml
+++ b/crates/collab/k8s/collab.template.yml
@@ -33,6 +33,11 @@ metadata:

 spec:
  replicas: 1
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 1
+      maxUnavailable: 0
  selector:
    matchLabels:
      app: ${ZED_SERVICE_NAME}
@@ -78,6 +83,13 @@ spec:
              port: 8080
            initialDelaySeconds: 1
            periodSeconds: 1
+          startupProbe:
+            httpGet:
+              path: /
+              port: 8080
+            initialDelaySeconds: 1
+            periodSeconds: 1
+            failureThreshold: 15
          env:
            - name: HTTP_PORT
              value: "8080"
@@ -173,6 +185,7 @@ spec:
              value: "true"
            - name: ZED_ENVIRONMENT
              value: ${ZED_ENVIRONMENT}
+          terminationGracePeriodSeconds: 10
          securityContext:
            capabilities:
              # FIXME - Switch to the more restrictive `PERFMON` capability.
--- a/crates/collab/src/rpc.rs
+++ b/crates/collab/src/rpc.rs
@@ -67,7 +67,9 @@ use tracing::{field, info_span, instrument, Instrument};
 use util::SemanticVersion;

 pub const RECONNECT_TIMEOUT: Duration = Duration::from_secs(30);
-pub const CLEANUP_TIMEOUT: Duration = Duration::from_secs(10);
+
+// kubernetes gives terminated pods 10s to shutdown gracefully. After they're gone, we can clean up old resources.
+pub const CLEANUP_TIMEOUT: Duration = Duration::from_secs(15);

 const MESSAGE_COUNT_PER_PAGE: usize = 100;
 const MAX_MESSAGE_LEN: usize = 1024;