diff --git a/crates/collab/k8s/collab.template.yml b/crates/collab/k8s/collab.template.yml index f0484bf57e..41584e94a7 100644 --- a/crates/collab/k8s/collab.template.yml +++ b/crates/collab/k8s/collab.template.yml @@ -33,6 +33,11 @@ metadata: spec: replicas: 1 + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 selector: matchLabels: app: ${ZED_SERVICE_NAME} @@ -78,6 +83,13 @@ spec: port: 8080 initialDelaySeconds: 1 periodSeconds: 1 + startupProbe: + httpGet: + path: / + port: 8080 + initialDelaySeconds: 1 + periodSeconds: 1 + failureThreshold: 15 env: - name: HTTP_PORT value: "8080" @@ -173,6 +185,7 @@ spec: value: "true" - name: ZED_ENVIRONMENT value: ${ZED_ENVIRONMENT} + terminationGracePeriodSeconds: 10 securityContext: capabilities: # FIXME - Switch to the more restrictive `PERFMON` capability. diff --git a/crates/collab/src/rpc.rs b/crates/collab/src/rpc.rs index 2cb69c264b..c3df8d12fd 100644 --- a/crates/collab/src/rpc.rs +++ b/crates/collab/src/rpc.rs @@ -67,7 +67,9 @@ use tracing::{field, info_span, instrument, Instrument}; use util::SemanticVersion; pub const RECONNECT_TIMEOUT: Duration = Duration::from_secs(30); -pub const CLEANUP_TIMEOUT: Duration = Duration::from_secs(10); + +// Kubernetes gives terminated pods 10s to shut down gracefully (`terminationGracePeriodSeconds` in collab.template.yml). After they're gone, we can clean up old resources. +pub const CLEANUP_TIMEOUT: Duration = Duration::from_secs(15); const MESSAGE_COUNT_PER_PAGE: usize = 100; const MAX_MESSAGE_LEN: usize = 1024;