Path: blob/master/src/packages/file-server/zfs/pull.ts
/*
Use zfs replication over ssh to pull recent filesystems from
one file-server to another one.

This will be used for:

- backup
- moving a filesystem from one region/cluster to another
*/

import {
  type Filesystem,
  type RawFilesystem,
  primaryKey,
  PrimaryKey,
} from "./types";
import { exec } from "./util";
import {
  databaseFilename,
  filesystemDataset,
  filesystemMountpoint,
} from "./names";
import { filesystemExists, getRecent, get, set } from "./db";
import getLogger from "@cocalc/backend/logger";
import { getSnapshots } from "./snapshots";
import { createFilesystem, deleteFilesystem } from "./create";
import { context } from "./config";
import { archiveFilesystem, dearchiveFilesystem } from "./archive";
import { deleteSnapshot } from "./snapshots";
import { isEqual } from "lodash";
import { join } from "path";
import { readdir, unlink } from "fs/promises";

const logger = getLogger("file-server:zfs:pull");

// number of remote backups of the db sqlite file to keep.
const NUM_DB_TO_KEEP = 10;

// This is used for unit testing. It's what fields should match
// after doing a sync, except snapshots where local is a superset,
// unless you pull with deleteSnapshots set to true.
export const SYNCED_FIELDS = [
  // these four fields identify the filesystem, so they better get sync'd:
  "namespace",
  "owner_type",
  "owner_id",
  "name",
  // snapshots -- reflects that we replicated properly.
  "snapshots",

  // last_edited is useful for targeting sync work and making decisions, e.g., should we delete
  "last_edited",
  // these just get directly sync'd. They aren't used unless somehow local were to actually serve
  // data directly.
  "affinity",
  "nfs",
];

interface Remote {
  // remote = user@hostname that you can ssh to
  remote: string;
  // filesystem prefix of the remote server, so {prefix}/database.sqlite3 has the
  // database that defines the state of the remote server.
  prefix: string;
}

// Copy from remote to here every filesystem that has changed since cutoff.
export async function pull({
  cutoff,
  filesystem,
  remote,
  prefix,
  deleteFilesystemCutoff,
  deleteSnapshots,
  dryRun,
}: Remote & {
  // pulls everything that's changed with remote last_edited >= cutoff.
  cutoff?: Date;
  // alternatively -- if given, only pull this filesystem and nothing else:
  filesystem?: PrimaryKey;

  // DANGER: if set, any local filesystem with
  //   cutoff <= last_edited <= deleteFilesystemCutoff
  // gets actually deleted. This makes it possible, e.g., to delete every filesystem
  // that was deleted on the main server in the last 6 months and deleted at least 1
  // month ago, so we have a bit of time before destroying backups.
  deleteFilesystemCutoff?: Date;
  // if true, delete local snapshots if they were deleted on the remote.
  deleteSnapshots?: boolean;
  // just say how much will happen, but don't do anything.
  dryRun?: boolean;
}): Promise<{
  toUpdate: { remoteFs: Filesystem; localFs?: Filesystem }[];
  toDelete: RawFilesystem[];
}> {
  logger.debug("pull: from ", { remote, prefix, cutoff, filesystem });
  if (prefix.startsWith("/")) {
    throw Error("prefix should not start with /");
  }
  if (cutoff == null) {
    cutoff = new Date(Date.now() - 1000 * 60 * 60 * 24 * 7);
  }
  logger.debug("pull: get the remote sqlite database");
  await exec({ command: "mkdir", args: ["-p", context.PULL] });
  const remoteDatabase = join(
    context.PULL,
    `${remote}:${prefix}---${new Date().toISOString()}.sqlite3`,
  );
  // delete all but the most recent NUM_DB_TO_KEEP remote database files for this remote/prefix.
  const oldDbFiles = (await readdir(context.PULL))
    .sort()
    .filter((x) => x.startsWith(`${remote}:${prefix}---`))
    .slice(0, -NUM_DB_TO_KEEP);
  for (const path of oldDbFiles) {
    await unlink(join(context.PULL, path));
  }

  await exec({
    command: "scp",
    args: [`${remote}:/${databaseFilename(prefix)}`, remoteDatabase],
  });

  logger.debug("pull: compare state");
  const recent =
    filesystem != null
      ? [get(filesystem, remoteDatabase)]
      : getRecent({ cutoff, databaseFile: remoteDatabase });
  const toUpdate: { remoteFs: Filesystem; localFs?: Filesystem }[] = [];
  for (const fs of recent) {
    const remoteFs = get(fs, remoteDatabase);
    if (!filesystemExists(fs)) {
      toUpdate.push({ remoteFs });
    } else {
      const localFs = get(fs);
      if (remoteFs.archived != localFs.archived) {
        // different archive state, so needs an update to resolve this (either way)
        toUpdate.push({ remoteFs, localFs });
        continue;
      }
      if (deleteSnapshots) {
        // sync if *any* snapshots differ
        if (!isEqual(remoteFs.snapshots, localFs.snapshots)) {
          toUpdate.push({ remoteFs, localFs });
        }
      } else {
        // only sync if newest snapshots are different
        const newestRemoteSnapshot =
          remoteFs.snapshots[remoteFs.snapshots.length - 1];
        if (!newestRemoteSnapshot) {
          // no snapshots yet, so nothing to do.
          continue;
        }
        const newestLocalSnapshot =
          localFs.snapshots[localFs.snapshots.length - 1];
        if (
          !newestLocalSnapshot ||
          newestRemoteSnapshot > newestLocalSnapshot
        ) {
          toUpdate.push({ remoteFs, localFs });
        }
      }
    }
  }

  logger.debug(`pull: toUpdate.length = ${toUpdate.length}`);
  if (!dryRun) {
    for (const x of toUpdate) {
      logger.debug("pull: updating ", x);
      await pullOne({ ...x, remote, deleteSnapshots });
    }
  }

  const toDelete: RawFilesystem[] = [];
  if (deleteFilesystemCutoff) {
    for (const fs of getRecent({ cutoff })) {
      if (!filesystemExists(fs, remoteDatabase)) {
        if (new Date(fs.last_edited ?? 0) <= deleteFilesystemCutoff) {
          // it's old enough to delete:
          toDelete.push(fs);
        }
      }
    }
  }
  logger.debug(`pull: toDelete.length = ${toDelete.length}`);
  if (!dryRun) {
    for (const fs of toDelete) {
      logger.debug("pull: deleting", fs);
      await deleteFilesystem(fs);
    }
  }

  return { toUpdate, toDelete };
}

async function pullOne({
  remoteFs,
  localFs,
  remote,
  deleteSnapshots,
}: {
  remoteFs: Filesystem;
  localFs?: Filesystem;
  remote?: string;
  deleteSnapshots?: boolean;
}) {
  logger.debug("pull:", { remoteFs, localFs, remote, deleteSnapshots });
  if (localFs == null) {
    localFs = await createFilesystem(remoteFs);
  }

  // sync last_edited, affinity and nfs fields in all cases
  set({
    ...primaryKey(localFs),
    last_edited: remoteFs.last_edited,
    affinity: remoteFs.affinity,
    nfs: remoteFs.nfs,
  });

  if (localFs.archived && !remoteFs.archived) {
    // it's back in use:
    await dearchiveFilesystem(localFs);
    // don't return -- will then possibly sync more below, in case of new changes
  } else if (!localFs.archived && remoteFs.archived) {
    // we just archive ours. Note in theory there is a chance
    // that our local version is not up-to-date with the remote
    // version. However, the point of archiving is that it should only happen
    // many weeks after a filesystem stopped being used, and by that
    // point we should have already pulled the latest version.
    // Don't bother worrying about deleting snapshots.
    await archiveFilesystem(localFs);
    return;
  }
  if (localFs.archived && remoteFs.archived) {
    // nothing to do
    // Also, don't bother worrying about deleting snapshots, since we can't.
    return;
  }
  const snapshot = newestCommonSnapshot(localFs.snapshots, remoteFs.snapshots);
  const newest_snapshot = remoteFs.snapshots[remoteFs.snapshots.length - 1];
  if (!newest_snapshot || snapshot == newest_snapshot) {
    logger.debug("pull: already have the newest snapshot locally");
  } else {
    const mountpoint = filesystemMountpoint(localFs);
    try {
      if (!snapshot) {
        // full replication with nothing local
        await exec({
          verbose: true,
          command: `ssh ${remote} "zfs send -e -c -R ${filesystemDataset(remoteFs)}@${newest_snapshot}" | sudo zfs recv -o mountpoint=${mountpoint} -F ${filesystemDataset(localFs)}`,
          what: {
            ...localFs,
            desc: "pull: doing a full receive from remote",
          },
        });
      } else {
        // incremental based on the last common snapshot
        const force =
          localFs.snapshots[localFs.snapshots.length - 1] == snapshot
            ? ""
            : " -F ";
        await exec({
          verbose: true,
          command: `ssh ${remote} "zfs send -e -c -I @${snapshot} ${filesystemDataset(remoteFs)}@${newest_snapshot}" | sudo zfs recv -o mountpoint=${mountpoint} -F ${filesystemDataset(localFs)} ${force}`,
          what: {
            ...localFs,
            desc: "pull: doing an incremental replication from remote",
          },
        });
      }
    } finally {
      // even if there was an error, update local snapshots, since we likely have some new
      // ones (e.g., even if there was a partial receive, interrupted by a network drop).
      await getSnapshots(localFs);
    }
  }

  if (deleteSnapshots) {
    // In general due to snapshot trimming, the
    // list of snapshots on local might NOT match remote, but after replication
    // local will always have a *superset* of remote. We thus may have to
    // trim some snapshots:
    const remoteSnapshots = new Set(remoteFs.snapshots);
    const localSnapshots = get(localFs).snapshots;
    for (const snapshot of localSnapshots) {
      if (!remoteSnapshots.has(snapshot)) {
        await deleteSnapshot({ ...localFs, snapshot });
      }
    }
  }
}

// s0 and s1 are sorted oldest-to-newest lists of names of snapshots.
// Return the newest snapshot in common between the two, or undefined if nothing is in common.
function newestCommonSnapshot(s0: string[], s1: string[]) {
  const t1 = new Set(s1);
  for (let i = s0.length - 1; i >= 0; i--) {
    if (t1.has(s0[i])) {
      return s0[i];
    }
  }
}
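/*
Example usage (a sketch, not part of this module): pull everything that changed
on a remote file-server during the last week (the default cutoff), as a dry run
that only reports what would be updated or deleted. The remote host and prefix
below are hypothetical placeholders.

  import { pull } from "./pull";

  const { toUpdate, toDelete } = await pull({
    remote: "root@fs-remote", // hypothetical ssh target (user@hostname)
    prefix: "data/zfs",       // hypothetical remote prefix (must not start with "/")
    dryRun: true,             // just report; don't replicate or delete anything
  });
*/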