0-downtime collab deploys? (#8926)

Before this change Kubernetes would send a SIGTERM to the old server
before the new one was ready. Now it will wait.

From my reading it seems like startupProbe should not be necessary if we
have a readinessProbe; but from testing it seems like without
startupProbe we still drop requests when using `rollout restart`.

Release Notes:

- Fixed connectivity issues during Zed deploys.
This commit is contained in:
Conrad Irwin 2024-03-05 21:58:00 -07:00 committed by GitHub
parent 01e5e4224a
commit 6d53846824
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 16 additions and 1 deletions

View file

@ -33,6 +33,11 @@ metadata:
spec:
replicas: 1
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
maxUnavailable: 0
selector:
matchLabels:
app: ${ZED_SERVICE_NAME}
@ -78,6 +83,13 @@ spec:
port: 8080
initialDelaySeconds: 1
periodSeconds: 1
startupProbe:
httpGet:
path: /
port: 8080
initialDelaySeconds: 1
periodSeconds: 1
failureThreshold: 15
env:
- name: HTTP_PORT
value: "8080"
@ -173,6 +185,7 @@ spec:
value: "true"
- name: ZED_ENVIRONMENT
value: ${ZED_ENVIRONMENT}
terminationGracePeriodSeconds: 10
securityContext:
capabilities:
# FIXME - Switch to the more restrictive `PERFMON` capability.

View file

@ -67,7 +67,9 @@ use tracing::{field, info_span, instrument, Instrument};
use util::SemanticVersion;
pub const RECONNECT_TIMEOUT: Duration = Duration::from_secs(30);
pub const CLEANUP_TIMEOUT: Duration = Duration::from_secs(10);
// Kubernetes gives terminated pods 10s to shut down gracefully. After they're gone, we can clean up old resources.
pub const CLEANUP_TIMEOUT: Duration = Duration::from_secs(15);
const MESSAGE_COUNT_PER_PAGE: usize = 100;
const MAX_MESSAGE_LEN: usize = 1024;