Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sagemathinc
GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/packages/hub/health-checks.ts
1496 views
1
/*
2
* This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
3
* License: MS-RSL – see LICENSE.md for details
4
*/
5
6
// endpoints for various health checks
7
8
import getLogger from "@cocalc/backend/logger";
9
import { new_counter } from "@cocalc/server/metrics/metrics-recorder";
10
import { howLongDisconnectedMins } from "@cocalc/database/postgres/record-connect-error";
11
import type { PostgreSQL } from "@cocalc/database/postgres/types";
12
import { seconds2hms } from "@cocalc/util/misc";
13
import express, { Response } from "express";
14
import { createServer, Server } from "net";
15
import { isFloat } from "validator";
16
import { database_is_working } from "@cocalc/server/metrics/hub_register";
17
// Module logger; `L` is shorthand for its debug level, used throughout this file.
const logger = getLogger("hub:healthcheck");
const { debug: L } = logger;
19
20
// Prometheus counter of healthcheck outcomes, labeled by status
// ("OK", "FAIL", or "ERROR" — see process_health_check below).
const HEALTHCHECKS = new_counter(
  "healthchecks_total",
  "test healthcheck counter",
  ["status"],
);
25
26
// Result of a health check, rendered as an HTTP response.
interface HealthcheckData {
  code: 200 | 404; // 200 = healthy, 404 = unhealthy (interpreted by haproxy/kubernetes)
  txt: string; // human-readable explanation, sent as the plain-text body
}
30
31
// self termination is only activated, if there is a COCALC_HUB_SELF_TERMINATE environment variable
// it's value is an interval in hours, minimum and maximum, for how long it should be alive
// and a drain period in minutes at the end.
// e.g. "24,48,15" for an uptime between 1 and 2 days and 15 minutes of draining
function init_self_terminate(): {
  startup: number; // ms epoch when this hub started
  shutdown?: number; // when to shutdown (causes a failed health check)
  drain?: number; // when to start draining, causes a proxy server to no longer send traffic
} {
  const D = logger.extend("init_self_terminate").debug;
  const startup = Date.now();
  const conf = process.env.COCALC_HUB_SELF_TERMINATE;
  if (conf == null) {
    D("COCALC_HUB_SELF_TERMINATE env var not set, hence no self-termination");
    return { startup };
  }
  // expected format: "<from_hours>,<to_hours>,<drain_minutes>"
  const [from_str, to_str, drain_str] = conf.trim().split(",");
  // validator's isFloat checks the *string* form; { gt: 0 } requires strictly positive
  if (!isFloat(from_str, { gt: 0 }))
    throw new Error("COCALC_HUB_SELF_TERMINATE/from not a positive float");
  if (!isFloat(to_str, { gt: 0 }))
    throw new Error("COCALC_HUB_SELF_TERMINATE/to not a positive float");
  if (!isFloat(drain_str, { gt: 0 }))
    throw new Error("COCALC_HUB_SELF_TERMINATE/drain not a positive float");
  const from = parseFloat(from_str);
  const to = parseFloat(to_str);
  const drain_h = parseFloat(drain_str) / 60; // minutes to hours
  D("parsed data:", { from, to, drain_h });
  if (from > to)
    throw Error(
      "COCALC_HUB_SELF_TERMINATE 'from' must be smaller than 'to', e.g. '24,48,15'",
    );
  // pick a random uptime in [from, to] so a fleet of hubs doesn't restart in lockstep
  const uptime = Math.random() * (to - from); // hours
  const hours2ms = 1000 * 60 * 60;
  const shutdown = startup + (from + uptime) * hours2ms;
  // draining starts drain_h hours before the shutdown deadline
  const drain = shutdown - drain_h * hours2ms;
  if (startup > drain) {
    // drain period longer than the minimum uptime — configuration error
    throw new Error(
      `COCALC_HUB_SELF_TERMINATE: startup must be smaller than drain – ${startup}>${drain}`,
    );
  }
  D({
    startup: new Date(startup).toISOString(),
    drain: new Date(drain).toISOString(),
    shutdown: new Date(shutdown).toISOString(),
    uptime: seconds2hms((hours2ms * uptime) / 1000),
    draintime: seconds2hms((drain_h * hours2ms) / 1000),
  });
  return { startup, shutdown, drain };
}
80
81
// Module-wide timing: when this hub started, and (only if self-termination is
// configured via COCALC_HUB_SELF_TERMINATE) when to start draining and shut down.
const { startup, shutdown, drain } = init_self_terminate();
82
83
// Endpoint for the HAProxy agent check; port 0 means the check is disabled.
let agent_port = 0;
let agent_host = "0.0.0.0";

/**
 * Configure where the HAProxy agent-check TCP server listens.
 * Must be called before setup_health_checks for the agent check to start.
 */
export function set_agent_endpoint(port: number, host: string) {
  agent_port = port;
  agent_host = host;
  // log after assignment: previously this logged *before* updating the
  // variables, so it always reported the stale (initial) endpoint
  L(`set_agent_endpoint ${agent_host}:${agent_port}`);
}
90
91
// TCP server answering HAProxy agent checks; undefined until setup_agent_check runs.
let agent_check_server: Server | undefined;
92
93
// HAProxy agent-check TCP endpoint
// https://cbonte.github.io/haproxy-dconv/2.0/configuration.html#5.2-agent-check
// for development, set the env var in your startup script or terminal init file
// export COCALC_HUB_SELF_TERMINATE=.1,.2,1
// and then query it like that
// $ telnet 0.0.0.0 $(cat $COCALC_ROOT/dev/project/ports/agent-port)
function setup_agent_check() {
  if (agent_port == 0 || drain == null) {
    // fixed log message: the old one only mentioned agent_port, which was
    // misleading when the actual reason was that self-termination (drain)
    // is not configured
    L("setup_agent_check: agent_port or drain not set, no agent checks");
    return;
  }

  // TODO this could also return a "weight" for this server, based on load values
  // there is also "drain", but we set it to "10%" to avoid a nasty situation, when all endpoints are draining.
  // ATTN: weight must be set as well, which is poorly documented here:
  // https://cbonte.github.io/haproxy-dconv/2.0/configuration.html#5.2-weight
  agent_check_server = createServer((c) => {
    c.on("error", (err) => {
      L(`agent_check: connection error`, err);
    });
    // before the drain deadline: fully available; afterwards reduce the
    // weight to 10% instead of hard-draining, so not all hubs drain at once
    const msg = Date.now() < drain ? "ready up 100%" : "10%";
    c.write(msg + "\r\n");
    c.destroy();
  });

  agent_check_server.listen(agent_port, agent_host);
  L(`setup_agent_check: listening on ${agent_host}:${agent_port}`);
}
121
122
// Outcome of a single health check.
export interface Check {
  status: string; // human-readable description of the outcome
  abort?: boolean; // if true, the overall healthcheck fails (HTTP 404)
}
126
127
// Options for setup_health_checks.
interface Opts {
  db: PostgreSQL; // database, used for concurrency and connectivity checks
  router: express.Router; // router the health endpoints are mounted on
  extra?: (() => Promise<Check>)[]; // additional health checks
}
132
133
// this could be directly in setup_health_checks, but we also need it in proxy.coffee
// proxy.coffee must be rewritten and restructured first – just wrapping it with a router
// didn't work at all for me
export function process_alive(): HealthcheckData {
  // The hub counts as dead if the DB connection is broken, or if the
  // (optional) self-termination deadline has passed.
  const db_broken = !database_is_working();
  const past_shutdown = shutdown != null && Date.now() > shutdown;
  let txt: string;
  if (db_broken) {
    // this will stop haproxy from routing traffic to us
    // until db connection starts working again.
    txt = "alive: NO – database not working";
  } else if (past_shutdown) {
    txt = "alive: NO – shutdown initiated";
  } else {
    txt = "alive: YES";
  }
  const code = db_broken || past_shutdown ? 404 : 200;
  return { txt, code };
}
151
152
function checkConcurrent(db: PostgreSQL): Check {
153
const c = db.concurrent();
154
if (c >= db._concurrent_warn) {
155
return {
156
status: `hub not healthy, since concurrent ${c} >= ${db._concurrent_warn}`,
157
abort: true,
158
};
159
} else {
160
return { status: `concurrent ${c} < ${db._concurrent_warn}` };
161
}
162
}
163
164
function checkUptime(): Check {
165
const now = Date.now();
166
const uptime = seconds2hms((now - startup) / 1000);
167
if (shutdown != null && drain != null) {
168
if (now >= shutdown) {
169
const msg = `uptime ${uptime} – expired, terminating now`;
170
L(msg);
171
return { status: msg, abort: true };
172
} else {
173
const until = seconds2hms((shutdown - now) / 1000);
174
const drain_str =
175
drain > now
176
? `draining in ${seconds2hms((drain - now) / 1000)}`
177
: "draining now";
178
const msg = `uptime ${uptime} – ${drain_str} – terminating in ${until}`;
179
L(msg);
180
return { status: msg };
181
}
182
} else {
183
const msg = `uptime ${uptime} – no self-termination`;
184
L(msg);
185
return { status: msg };
186
}
187
}
188
189
// if there are is no connection to the database for that many minutes,
190
// declare the hub unhealthy
191
const DB_ERRORS_THRESHOLD_MIN = parseInt(
192
process.env.COCALC_DB_ERRORS_THRESHOLD_MIN ?? "5",
193
);
194
195
function checkDBConnectivity(): Check {
196
if (DB_ERRORS_THRESHOLD_MIN <= 0) {
197
return { status: "db connectivity check disabled" };
198
}
199
const num = howLongDisconnectedMins();
200
if (num == null) {
201
return { status: "no DB connection problems", abort: false };
202
}
203
// round num to 2 decimal places
204
const numStr = num.toFixed(2);
205
const above = num >= DB_ERRORS_THRESHOLD_MIN;
206
const status = above
207
? `DB problems for ${numStr} >= ${DB_ERRORS_THRESHOLD_MIN} mins`
208
: `DB problems for ${numStr} < ${DB_ERRORS_THRESHOLD_MIN} mins`;
209
return { status, abort: above };
210
}
211
212
// same note as above for process_alive()
213
async function process_health_check(
214
db: PostgreSQL,
215
extra: (() => Promise<Check>)[] = [],
216
): Promise<HealthcheckData> {
217
let any_abort = false;
218
let txt = "healthchecks:\n";
219
for (const test of [
220
() => checkConcurrent(db),
221
checkUptime,
222
checkDBConnectivity,
223
...extra,
224
]) {
225
try {
226
const { status, abort = false } = await test();
227
const statusTxt = abort ? "FAIL" : "OK";
228
txt += `${status} – ${statusTxt}\n`;
229
any_abort = any_abort || abort;
230
L(`process_health_check: ${status} – ${statusTxt}`);
231
} catch (err) {
232
L(`process_health_check ERRROR: ${err}`);
233
HEALTHCHECKS.labels("ERROR").inc();
234
}
235
}
236
const code = any_abort ? 404 : 200;
237
HEALTHCHECKS.labels(any_abort ? "FAIL" : "OK").inc();
238
return { code, txt };
239
}
240
241
/**
 * Mount all health-related HTTP endpoints on the given router and start the
 * optional HAProxy agent-check TCP server.
 */
export async function setup_health_checks(opts: Opts): Promise<void> {
  const { db, extra, router } = opts;
  setup_agent_check();

  // used by HAPROXY for testing that this hub is OK to receive traffic
  router.get("/alive", (_, res: Response) => {
    const alive = process_alive();
    res.type("txt");
    res.status(alive.code);
    res.send(alive.txt);
  });

  // this is a more general check than concurrent-warn
  // additionally to checking the database condition, it also self-terminates
  // this hub if it is running for quite some time. beyond that, in the future
  // there could be even more checks on top of that.
  router.get("/healthcheck", async (_, res: Response) => {
    const result = await process_health_check(db, extra);
    res.status(result.code);
    res.type("txt");
    res.send(result.txt);
  });

  // /concurrent-warn -- could be used by kubernetes to decide whether or not to kill the container; if
  // below the warn thresh, returns number of concurrent connection; if hits warn, then
  // returns 404 error, meaning hub may be unhealthy. Kubernetes will try a few times before
  // killing the container. Will also return 404 if there is no working database connection.
  router.get("/concurrent-warn", (_, res) => {
    res.type("txt");
    if (!database_is_working()) {
      L("/concurrent-warn: not healthy, since database connection not working");
      res.status(404).end();
      return;
    }

    const concurrent = db.concurrent();
    if (concurrent >= db._concurrent_warn) {
      L(
        `/concurrent-warn: not healthy, since concurrent ${concurrent} >= ${db._concurrent_warn}`,
      );
      res.status(404).end();
      return;
    }
    res.send(`${concurrent}`);
  });

  // Return number of concurrent connections (could be useful)
  router.get("/concurrent", (_, res) => {
    res.type("txt");
    res.send(`${db.concurrent()}`);
  });
}
293
294