diff --git a/main.py b/main.py
index ce2d00a..f747cbe 100755
--- a/main.py
+++ b/main.py
@@ -6,7 +6,7 @@
 import os.path
 import shutil
 import sys
-from spqr.curateipsum.backup import initiate_backup
+import spqr.curateipsum.backup as backup

 _lg = logging.getLogger("spqr.curateipsum")
 SUPPORTED_PLATFORMS = ("linux", "darwin")
@@ -56,7 +56,8 @@ def main():
     _lg.info("Starting %s: %s", parser.prog, args)

     if sys.platform not in SUPPORTED_PLATFORMS:
-        _lg.error(f"Not supported platform: {sys.platform}. Supported platforms: {SUPPORTED_PLATFORMS}")
+        _lg.error("Unsupported platform: %s. Supported platforms: %s",
+                  sys.platform, SUPPORTED_PLATFORMS)
         return 1

     if args.external_rsync and not shutil.which("rsync"):
@@ -65,7 +66,8 @@ def main():
     cp_program = "gcp" if sys.platform == "darwin" else "cp"

     if args.external_hardlink and not shutil.which(cp_program):
-        _lg.error(f"{cp_program} should be installed to use --external-hardlink option.")
+        _lg.error("%s should be installed to use --external-hardlink option.",
+                  cp_program)
         return 1

     backup_dir_abs = os.path.abspath(args.backup_dir)
@@ -78,7 +80,9 @@ def main():
             _lg.error("Source directory %s does not exist", src_dir)
             return 1

-    initiate_backup(
+    backup.cleanup_old_backups(backup_dir=backup_dir_abs, dry_run=args.dry_run)
+
+    backup.initiate_backup(
         sources=args.sources,
         backup_dir=backup_dir_abs,
         dry_run=args.dry_run,
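Note on the logging changes above (the same pattern recurs in fs.py below): replacing f-strings with %-style arguments defers string interpolation to the logging framework, so the message is only built when a handler actually emits the record. A minimal standalone illustration, not project code:

    import logging

    logging.basicConfig(level=logging.INFO)
    _lg = logging.getLogger("demo")
    users = ["alice", "bob"]

    # f-string: the message is formatted eagerly, before debug() even decides
    # whether the record passes the level filter
    _lg.debug(f"user count: {len(users)}")

    # %-style args: len(users) is still evaluated, but the string itself is
    # only interpolated if the record is emitted
    _lg.debug("user count: %s", len(users))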
""" - backups = sorted(os.listdir(backup_dir), reverse=True) - - for b_ent in backups: - b_ent_abs = os.path.join(backup_dir, b_ent) - if not _is_backup_entity(b_ent_abs): - continue - if not os.listdir(b_ent_abs): - _lg.info("Removing empty backup entity: %s", os.path.basename(b_ent_abs)) - os.rmdir(b_ent_abs) - continue - return b_ent_abs - + all_backups = sorted(_iterate_backups(backup_dir), key=lambda e: e.name) + if all_backups: + return all_backups[-1] return None +def _date_from_backup(backup: os.DirEntry) -> datetime: + return datetime.strptime(backup.name, BACKUP_ENT_FMT) + + +def cleanup_old_backups( + backup_dir: str, + dry_run: bool = False, + keep_all: int = 7, + keep_daily: int = 30, + keep_weekly: int = 52, + keep_monthly: int = 12, + keep_yearly: int = 5, + min_free_space: int = 0 +): + """ + Delete old backups. Never deletes the only backup. + :param backup_dir: full path to backup directory. + :param dry_run: don't do anything actually. + :param keep_all: the number of days that all backups must be kept. + :param keep_daily: the number of days that all daily backups must be kept. + :param keep_weekly: the number of weeks of which one weekly backup must be kept. + :param keep_monthly: the number of months (1 month = 30 days) of which + one monthly backup must be kept. + :param keep_yearly: the number of years of which one yearly backup must be kept. + :param min_free_space: not used right now + :return: + """ + all_backups = sorted(_iterate_backups(backup_dir), + key=lambda e: e.name, reverse=True) + if dry_run: + _lg.info("Dry-run, no backups will be actually removed") + if not all_backups: + _lg.debug("No backups, exiting") + return + elif len(all_backups) == 1: + _lg.debug("Only one backup (%s) exists, will not remove it", + all_backups[0].name) + return + + now = datetime.now() + thresholds = {k: now.strftime(BACKUP_ENT_FMT) + for k in ("all", "daily", "weekly", "monthly", "yearly")} + if keep_all is not None: + thresholds["all"] = ((now - timedelta(days=keep_all)) + .replace(hour=0, minute=0, second=0) + .strftime(BACKUP_ENT_FMT)) + if keep_daily is not None: + thresholds["daily"] = ((now - timedelta(days=keep_daily)) + .replace(hour=0, minute=0, second=0) + .strftime(BACKUP_ENT_FMT)) + if keep_weekly is not None: + thresholds["weekly"] = ( + (now - timedelta(weeks=keep_weekly, days=now.weekday())) + .strftime(BACKUP_ENT_FMT) + ) + if keep_monthly is not None: + thresholds["monthly"] = ((now - timedelta(days=30*keep_monthly)) + .replace(day=1, hour=0, minute=0, second=0) + .strftime(BACKUP_ENT_FMT)) + if keep_yearly is not None: + thresholds["yearly"] = ( + (now - timedelta(days=365*keep_yearly)) + .replace(month=1, day=1, hour=0, minute=0, second=0) + .strftime(BACKUP_ENT_FMT) + ) + + prev_backup = all_backups[0] + to_remove = {b: False for b in all_backups} + + for backup in all_backups[1:]: + # skip all backups made after threshold + if backup.name > thresholds["all"]: + prev_backup = backup + continue + + # leave only one backup per day for backups made after threshold + if backup.name > thresholds["daily"]: + if (_date_from_backup(prev_backup).date() + == _date_from_backup(backup).date()): + to_remove[prev_backup] = True + prev_backup = backup + continue + + # leave only one backup per week for backups made after threshold + if backup.name > thresholds["weekly"]: + if (_date_from_backup(prev_backup).isocalendar()[1] + == _date_from_backup(backup).isocalendar()[1]): + to_remove[prev_backup] = True + prev_backup = backup + continue + + # leave only one backup per 
+def cleanup_old_backups(
+        backup_dir: str,
+        dry_run: bool = False,
+        keep_all: int = 7,
+        keep_daily: int = 30,
+        keep_weekly: int = 52,
+        keep_monthly: int = 12,
+        keep_yearly: int = 5,
+        min_free_space: int = 0
+):
+    """
+    Delete old backups. Never deletes the only remaining backup.
+    :param backup_dir: full path to the backup directory.
+    :param dry_run: log what would be removed, but do not actually remove anything.
+    :param keep_all: the number of days within which every backup is kept.
+    :param keep_daily: the number of days within which one backup per day is kept.
+    :param keep_weekly: the number of weeks within which one backup per week is kept.
+    :param keep_monthly: the number of months (1 month = 30 days) within which
+        one backup per month is kept.
+    :param keep_yearly: the number of years within which one backup per year is kept.
+    :param min_free_space: not used right now
+    """
+    all_backups = sorted(_iterate_backups(backup_dir),
+                         key=lambda e: e.name, reverse=True)
+    if dry_run:
+        _lg.info("Dry-run, no backups will actually be removed")
+    if not all_backups:
+        _lg.debug("No backups, exiting")
+        return
+    elif len(all_backups) == 1:
+        _lg.debug("Only one backup (%s) exists, will not remove it",
+                  all_backups[0].name)
+        return
+
+    now = datetime.now()
+    thresholds = {k: now.strftime(BACKUP_ENT_FMT)
+                  for k in ("all", "daily", "weekly", "monthly", "yearly")}
+    if keep_all is not None:
+        thresholds["all"] = ((now - timedelta(days=keep_all))
+                             .replace(hour=0, minute=0, second=0)
+                             .strftime(BACKUP_ENT_FMT))
+    if keep_daily is not None:
+        thresholds["daily"] = ((now - timedelta(days=keep_daily))
+                               .replace(hour=0, minute=0, second=0)
+                               .strftime(BACKUP_ENT_FMT))
+    if keep_weekly is not None:
+        thresholds["weekly"] = (
+            (now - timedelta(weeks=keep_weekly, days=now.weekday()))
+            .strftime(BACKUP_ENT_FMT)
+        )
+    if keep_monthly is not None:
+        thresholds["monthly"] = ((now - timedelta(days=30*keep_monthly))
+                                 .replace(day=1, hour=0, minute=0, second=0)
+                                 .strftime(BACKUP_ENT_FMT))
+    if keep_yearly is not None:
+        thresholds["yearly"] = (
+            (now - timedelta(days=365*keep_yearly))
+            .replace(month=1, day=1, hour=0, minute=0, second=0)
+            .strftime(BACKUP_ENT_FMT)
+        )
+
+    prev_backup = all_backups[0]
+    to_remove = {b: False for b in all_backups}
+
+    for backup in all_backups[1:]:
+        # skip all backups made after the "all" threshold
+        if backup.name > thresholds["all"]:
+            prev_backup = backup
+            continue
+
+        # leave only one backup per day for backups made after the threshold
+        if backup.name > thresholds["daily"]:
+            if (_date_from_backup(prev_backup).date()
+                    == _date_from_backup(backup).date()):
+                to_remove[prev_backup] = True
+            prev_backup = backup
+            continue
+
+        # leave only one backup per week for backups made after the threshold
+        if backup.name > thresholds["weekly"]:
+            if (_date_from_backup(prev_backup).isocalendar()[1]
+                    == _date_from_backup(backup).isocalendar()[1]):
+                to_remove[prev_backup] = True
+            prev_backup = backup
+            continue
+
+        # leave only one backup per month for backups made after the threshold
+        if backup.name > thresholds["monthly"]:
+            if (_date_from_backup(prev_backup).date().replace(day=1)
+                    == _date_from_backup(backup).date().replace(day=1)):
+                to_remove[prev_backup] = True
+            prev_backup = backup
+            continue
+
+        # leave only one backup per year for backups made after the threshold
+        if backup.name > thresholds["yearly"]:
+            if (_date_from_backup(prev_backup).date().replace(month=1, day=1)
+                    == _date_from_backup(backup).date().replace(month=1, day=1)):
+                to_remove[prev_backup] = True
+            prev_backup = backup
+            continue
+
+        # older than every threshold
+        to_remove[backup] = True
+
+    for backup, do_delete in to_remove.items():
+        if not do_delete:
+            continue
+        _lg.info("Removing old backup %s", backup.name)
+        if not dry_run:
+            shutil.rmtree(backup.path)
+
+
 def process_backed_entry(backup_dir: str, entry_relpath: str, action: fs.Actions):
     _lg.debug("%s %s", action, entry_relpath)
     if action is not fs.Actions.delete:
@@ -60,33 +182,37 @@ def initiate_backup(sources,
     start_time = time.time()
     start_time_fmt = datetime.fromtimestamp(start_time).strftime(BACKUP_ENT_FMT)

-    cur_backup = os.path.join(backup_dir, start_time_fmt)
-    cur_backup_name = os.path.basename(cur_backup)
-    _lg.debug("Current backup dir: %s", cur_backup)
+    cur_backup = fs.PseudoDirEntry(os.path.join(backup_dir, start_time_fmt))
+    _lg.debug("Current backup dir: %s", cur_backup.path)

     latest_backup = _get_latest_backup(backup_dir)
-    if cur_backup == latest_backup:
-        _lg.warning("Latest backup %s was created less than minute ago, exiting",
-                    os.path.basename(latest_backup))
-        return

     if latest_backup is None:
-        _lg.info("Creating empty directory for current backup: %s", cur_backup_name)
-        os.mkdir(cur_backup)
-    else:
-        _lg.info("Copying data from latest backup %s to current backup %s",
-                 os.path.basename(latest_backup), cur_backup_name)
+        _lg.info("Creating empty directory for current backup: %s",
+                 cur_backup.name)
+        os.mkdir(cur_backup.path)

-        hl_res = fs.hardlink_dir(src_dir=latest_backup, dst_dir=cur_backup,
+    else:
+        # TODO check last backup is finalized
+        if cur_backup.name == latest_backup.name:
+            _lg.warning("Latest backup %s was created less than a minute ago, exiting",
+                        latest_backup.name)
+            return
+
+        _lg.info("Copying data from latest backup %s to current backup %s",
+                 latest_backup.name, cur_backup.name)
+
+        hl_res = fs.hardlink_dir(src_dir=latest_backup.path,
+                                 dst_dir=cur_backup.path,
                                  use_external=external_hardlink)
         if not hl_res:
             _lg.error("Something went wrong during copying data from latest backup,"
-                      " removing created %s", cur_backup_name)
-            shutil.rmtree(cur_backup, ignore_errors=True)
+                      " removing created %s", cur_backup.name)
+            shutil.rmtree(cur_backup.path, ignore_errors=True)
             return

         # clean up delta dir from copied backup
-        shutil.rmtree(os.path.join(cur_backup, DELTA_DIR), ignore_errors=True)
+        shutil.rmtree(os.path.join(cur_backup.path, DELTA_DIR), ignore_errors=True)

     rsync_func = fs.rsync_ext if external_rsync else fs.rsync
@@ -94,12 +220,12 @@ def initiate_backup(sources,
     for src in sources:
         src_abs = os.path.abspath(src)
         src_name = os.path.basename(src_abs)
-        dst_abs = os.path.join(cur_backup, src_name)
-        _lg.info("Backing up directory %s to %s backup", src_abs, cur_backup_name)
+        dst_abs = os.path.join(cur_backup.path, src_name)
+        _lg.info("Backing up directory %s to %s backup", src_abs, cur_backup.name)
         for entry_relpath, action in rsync_func(src_abs, dst_abs, dry_run=dry_run):
             if latest_backup is not None:
                 process_backed_entry(
-                    backup_dir=cur_backup,
+                    backup_dir=cur_backup.path,
                     entry_relpath=os.path.join(src_name, entry_relpath),
                     action=action
                 )
@@ -107,15 +233,15 @@ def initiate_backup(sources,
     # do not create backup on dry-run
     if dry_run:
-        _lg.info("Dry-run, removing created backup: %s", cur_backup_name)
-        shutil.rmtree(cur_backup, ignore_errors=True)
+        _lg.info("Dry-run, removing created backup: %s", cur_backup.name)
+        shutil.rmtree(cur_backup.path, ignore_errors=True)
     # do not create backup if no change from previous one
     elif latest_backup is not None and not backup_changed:
         _lg.info("Newly created backup %s is the same as previous one %s, removing",
-                 cur_backup_name, os.path.basename(latest_backup))
-        shutil.rmtree(cur_backup, ignore_errors=True)
+                 cur_backup.name, latest_backup.name)
+        shutil.rmtree(cur_backup.path, ignore_errors=True)
     else:
-        _lg.info("Backup created: %s", cur_backup_name)
+        _lg.info("Backup created: %s", cur_backup.name)

     end_time = time.time()
     spend_time = end_time - start_time
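The retention pass above never parses dates for the threshold tests: each threshold is rendered through strftime into the same zero-padded BACKUP_ENT_FMT shape the directory names use, so plain string comparison is equivalent to comparing timestamps. A minimal sketch of that trick, using the same expressions as cleanup_old_backups with a fixed clock (values are illustrative):

    from datetime import datetime, timedelta

    BACKUP_ENT_FMT = "%Y%m%d_%H%M"
    now = datetime(2021, 10, 20)

    # keep_all=7: everything newer than this string is kept unconditionally
    keep_all_threshold = ((now - timedelta(days=7))
                          .replace(hour=0, minute=0, second=0)
                          .strftime(BACKUP_ENT_FMT))

    assert keep_all_threshold == "20211013_0000"
    assert "20211019_0300" > keep_all_threshold  # kept as-is
    assert "20211012_2300" < keep_all_threshold  # falls through to daily rules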
diff --git a/spqr/curateipsum/fs.py b/spqr/curateipsum/fs.py
index 23491af..1316965 100644
--- a/spqr/curateipsum/fs.py
+++ b/spqr/curateipsum/fs.py
@@ -22,6 +22,27 @@ class Actions(enum.Enum):
     update_owner = enum.auto()
     create = enum.auto()

+
+class PseudoDirEntry:
+    """ Minimal os.DirEntry-like wrapper for a path that may not exist yet. """
+
+    def __init__(self, path):
+        self.path = os.path.realpath(path)
+        self.name = os.path.basename(self.path)
+        self._is_dir = None
+        self._stat = None
+
+    def __str__(self):
+        return self.name
+
+    def is_dir(self) -> bool:
+        if self._is_dir is None:
+            self._is_dir = os.path.isdir(self.path)
+        return self._is_dir
+
+    def stat(self):
+        if self._stat is None:
+            self._stat = os.lstat(self.path)
+        return self._stat
+
 # *deleting will_be_deleted
 # >f.st.... .gitignore
 # >f+++++++ LICENSE
@@ -79,7 +100,9 @@ def rsync_ext(src, dst, dry_run=False):
     rsync_args.append(str(dst))

     _lg.info("Executing external command: %s", " ".join(rsync_args))
-    process = subprocess.Popen(rsync_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    process = subprocess.Popen(rsync_args,
+                               stdout=subprocess.PIPE,
+                               stderr=subprocess.STDOUT)
     with process.stdout:
         prev_line = None
         for line in iter(process.stdout.readline, b""):
@@ -246,10 +269,12 @@ def rsync(src_dir, dst_dir, dry_run=False) -> Iterable[tuple]:
     dst_root_abs = os.path.abspath(dst_dir)

     if not os.path.isdir(src_root_abs):
-        raise RuntimeError(f"Error during reading source directory: {src_root_abs}")
+        raise RuntimeError("Error reading source directory: %s"
+                           % src_root_abs)
     if os.path.exists(dst_root_abs):
         if not os.path.isdir(dst_root_abs):
-            raise RuntimeError("Destination path is not a directory: %s" % dst_root_abs)
+            raise RuntimeError("Destination path is not a directory: %s"
+                               % dst_root_abs)
     else:
         os.mkdir(dst_root_abs)
@@ -278,19 +303,22 @@ def rsync(src_dir, dst_dir, dry_run=False) -> Iterable[tuple]:
         # rewrite dst if it has different than src type
         if src_entry.is_file(follow_symlinks=False):
             if not dst_entry.is_file(follow_symlinks=False):
-                _lg.debug("Rewriting (src is a file, dst is not a file): %s", rel_path)
+                _lg.debug("Rewriting (src is a file, dst is not a file): %s",
+                          rel_path)
                 update_direntry(src_entry, dst_entry)
                 yield rel_path, Actions.rewrite
                 continue
         if src_entry.is_dir(follow_symlinks=False):
             if not dst_entry.is_dir(follow_symlinks=False):
-                _lg.debug("Rewriting (src is a dir, dst is not a dir): %s", rel_path)
+                _lg.debug("Rewriting (src is a dir, dst is not a dir): %s",
+                          rel_path)
                 update_direntry(src_entry, dst_entry)
                 yield rel_path, Actions.rewrite
                 continue
         if src_entry.is_symlink():
             if not dst_entry.is_symlink():
-                _lg.debug("Rewriting (src is a symlink, dst is not a symlink): %s", rel_path)
+                _lg.debug("Rewriting (src is a symlink, dst is not a symlink): %s",
+                          rel_path)
                 update_direntry(src_entry, dst_entry)
                 yield rel_path, Actions.rewrite
                 continue
@@ -379,7 +407,9 @@ def _recursive_hardlink_ext(src: str, dst: str) -> bool:
     src_content = glob.glob(f"{src}/*")
     cmd = [cp, "--archive", "--verbose", "--link", *src_content, dst]
     _lg.info("Executing external command: %s", " ".join(cmd))
-    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    process = subprocess.Popen(cmd,
+                               stdout=subprocess.PIPE,
+                               stderr=subprocess.STDOUT)
     with process.stdout:
         for line in iter(process.stdout.readline, b""):
             _lg.debug("%s: %s", cp, line.decode("utf-8").strip())
@@ -458,10 +488,11 @@ def nest_hardlink(src_dir: str, src_relpath: str, dst_dir: str):
     # check source entity and destination directory
     if not os.path.exists(src_full_path):
-        raise RuntimeError(f"Error reading source entity: {src_full_path}")
+        raise RuntimeError("Error reading source entity: %s" % src_full_path)
     if os.path.exists(dst_dir_abs):
         if not os.path.isdir(dst_dir_abs):
-            raise RuntimeError("Destination path is not a directory: %s" % dst_dir_abs)
+            raise RuntimeError("Destination path is not a directory: %s"
+                               % dst_dir_abs)
     else:
         os.mkdir(dst_dir_abs)
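PseudoDirEntry exists so initiate_backup can handle the not-yet-created current backup through the same small interface (.path, .name, is_dir(), stat()) that os.scandir provides for existing entries. A rough usage sketch; the path is illustrative:

    from spqr.curateipsum.fs import PseudoDirEntry

    cur = PseudoDirEntry("/backups/20211020_0100")
    print(cur.path)  # absolute path with symlinks resolved
    print(cur.name)  # "20211020_0100"
    # Like os.DirEntry, is_dir() and stat() are lazy and cache their first
    # result, so they are meaningful once the path actually exists on disk.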
_lg.debug("Rewriting (src is a symlink, dst is not a symlink): %s", rel_path) + _lg.debug("Rewriting (src is a symlink, dst is not a symlink): %s", + rel_path) update_direntry(src_entry, dst_entry) yield rel_path, Actions.rewrite continue @@ -379,7 +407,9 @@ def _recursive_hardlink_ext(src: str, dst: str) -> bool: src_content = glob.glob(f"{src}/*") cmd = [cp, "--archive", "--verbose", "--link", *src_content, dst] _lg.info("Executing external command: %s", " ".join(cmd)) - process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + process = subprocess.Popen(cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) with process.stdout: for line in iter(process.stdout.readline, b""): _lg.debug("%s: %s", cp, line.decode("utf-8").strip()) @@ -458,10 +488,11 @@ def nest_hardlink(src_dir: str, src_relpath: str, dst_dir: str): # check source entity and destination directory if not os.path.exists(src_full_path): - raise RuntimeError(f"Error reading source entity: {src_full_path}") + raise RuntimeError("Error reading source entity: %s" % src_full_path) if os.path.exists(dst_dir_abs): if not os.path.isdir(dst_dir_abs): - raise RuntimeError("Destination path is not a directory: %s" % dst_dir_abs) + raise RuntimeError("Destination path is not a directory: %s" + % dst_dir_abs) else: os.mkdir(dst_dir_abs) diff --git a/tests/test_backups.py b/tests/test_backups.py new file mode 100644 index 0000000..f52ba8c --- /dev/null +++ b/tests/test_backups.py @@ -0,0 +1,203 @@ +import os +import random +import string +import tempfile +from unittest import TestCase, mock +from datetime import datetime + +import spqr.curateipsum.backup as bk +import spqr.curateipsum.fs as fs + + +class TestBackupCleanup(TestCase): + def setUp(self) -> None: + self.backup_dir = tempfile.TemporaryDirectory(prefix="backup_") + + def tearDown(self) -> None: + self.backup_dir.cleanup() + + def _add_backup(self, backup_name: str) -> fs.PseudoDirEntry: + backup = fs.PseudoDirEntry(os.path.join(self.backup_dir.name, backup_name)) + os.mkdir(backup.path) + + fd, path = tempfile.mkstemp(prefix="backup_file_", dir=backup.path) + with open(fd, "w") as f: + f.write(''.join(random.choices(string.printable, k=128))) + return backup + + @staticmethod + def _check_backup_not_empty(backup: fs.PseudoDirEntry) -> bool: + return bool(os.listdir(backup.path)) + + def _check_backups(self, expected_backups): + backups_list = os.listdir(self.backup_dir.name) + self.assertEqual(sorted(b.name for b in expected_backups), + sorted(backups_list)) + for b in expected_backups: + self.assertTrue(self._check_backup_not_empty(b)) + + def _run_cleanup(self, **kwargs): + """ Run cleanup_old_backups with null parameters. 
""" + cleanup_kwargs = { + "backup_dir": self.backup_dir.name, + "dry_run": False, + "keep_all": None, + "keep_daily": None, + "keep_weekly": None, + "keep_monthly": None, + "keep_yearly": None, + } + cleanup_kwargs.update(**kwargs) + bk.cleanup_old_backups(**cleanup_kwargs) + + def test_no_backups(self): + """ Test behaviour with no available backups """ + bk.cleanup_old_backups(self.backup_dir.name) + self.assertFalse(os.listdir(self.backup_dir.name)) + + @mock.patch(f"{bk.__name__}.datetime", wraps=datetime) + def test_only_one_backup(self, mock_datetime): + """ Test the only backup will not be removed in any case """ + mock_datetime.now.return_value = datetime(2021, 10, 20) + only_backup = self._add_backup("20010101_0000") + self._run_cleanup(keep_all=1) + self._check_backups([only_backup]) + + @mock.patch(f"{bk.__name__}.datetime", wraps=datetime) + def test_at_least_one_should_be_left(self, mock_datetime): + """ Test at least one backup should be left """ + mock_datetime.now.return_value = datetime(2021, 10, 20) + backups = [ + self._add_backup("20211103_0300"), + self._add_backup("20201216_0100"), + self._add_backup("20200716_0100"), + self._add_backup("20181116_0100"), + ] + expected_backups = [backups[0]] + self._run_cleanup() + self._check_backups(expected_backups) + + @mock.patch(f"{bk.__name__}.datetime", wraps=datetime) + def test_keep_all_threshold_only(self, mock_datetime): + """ Test threshold for keeping all backups """ + mock_datetime.now.return_value = datetime(2021, 10, 20) + backups = [ + self._add_backup("20211019_0300"), + self._add_backup("20211017_0100"), + self._add_backup("20211016_2300"), + ] + expected_backups = backups[:2] + self._run_cleanup(keep_all=3) + self._check_backups(expected_backups) + + @mock.patch(f"{bk.__name__}.datetime", wraps=datetime) + def test_keep_daily_threshold_only(self, mock_datetime): + """ Test threshold for keeping daily backups """ + mock_datetime.now.return_value = datetime(2021, 10, 20) + backups = [ + self._add_backup("20211019_0300"), + self._add_backup("20211017_2100"), + self._add_backup("20211017_0100"), + self._add_backup("20211017_0030"), + self._add_backup("20211016_2300"), + self._add_backup("20211016_0100"), + ] + expected_backups = [backups[0], backups[3]] + self._run_cleanup(keep_daily=3) + self._check_backups(expected_backups) + + @mock.patch(f"{bk.__name__}.datetime", wraps=datetime) + def test_keep_all_and_daily_thresholds(self, mock_datetime): + """ Test threshold for keeping all and daily backups """ + mock_datetime.now.return_value = datetime(2021, 10, 20) + backups = [ + self._add_backup("20211019_0300"), + self._add_backup("20211017_0200"), + self._add_backup("20211017_0100"), + self._add_backup("20211016_2300"), + self._add_backup("20211016_2200"), + self._add_backup("20211015_2200"), + self._add_backup("20211015_1500"), + self._add_backup("20211015_0200"), + self._add_backup("20211014_2200"), + self._add_backup("20211014_2000"), + self._add_backup("20211014_1232"), + ] + expected_backups = backups[0:3] + [backups[4]] + [backups[7]] + self._run_cleanup(keep_all=3, keep_daily=5) + self._check_backups(expected_backups) + + @mock.patch(f"{bk.__name__}.datetime", wraps=datetime) + def test_keep_weekly_threshold_only(self, mock_datetime): + """ Test threshold for keeping weekly backups """ + mock_datetime.now.return_value = datetime(2021, 11, 11) + backups = [ + self._add_backup("20211111_0300"), + self._add_backup("20211110_0300"), + self._add_backup("20211108_0100"), + self._add_backup("20211107_2300"), + 
self._add_backup("20211107_0100"), + self._add_backup("20211031_0100"), + self._add_backup("20211025_0100"), + self._add_backup("20211024_0100"), + self._add_backup("20211023_0100"), + self._add_backup("20211022_0100"), + self._add_backup("20211008_0100"), + self._add_backup("20211007_0100"), + self._add_backup("20211004_0100"), + self._add_backup("20211003_0100"), + ] + expected_backups = [backups[2], backups[4], backups[6], + backups[9], backups[12]] + self._run_cleanup(keep_weekly=5) + self._check_backups(expected_backups) + + @mock.patch(f"{bk.__name__}.datetime", wraps=datetime) + def test_keep_monthly_threshold_only(self, mock_datetime): + """ Test threshold for keeping monthly backups """ + mock_datetime.now.return_value = datetime(2021, 11, 11) + backups = [ + self._add_backup("20211103_0300"), + self._add_backup("20211019_0300"), + self._add_backup("20211017_2100"), + self._add_backup("20211017_0100"), + self._add_backup("20210916_2300"), + self._add_backup("20210916_0100"), + self._add_backup("20210816_0100"), + self._add_backup("20210810_0000"), + self._add_backup("20210716_0100"), + ] + expected_backups = [backups[0], backups[3], backups[5], backups[7]] + self._run_cleanup(keep_monthly=3) + self._check_backups(expected_backups) + + @mock.patch(f"{bk.__name__}.datetime", wraps=datetime) + def test_keep_yearly_threshold_only(self, mock_datetime): + """ Test threshold for keeping yearly backups """ + mock_datetime.now.return_value = datetime(2021, 11, 11) + backups = [ + self._add_backup("20211103_0300"), + self._add_backup("20210810_0000"), + self._add_backup("20210716_0100"), + self._add_backup("20201216_0100"), + self._add_backup("20200716_0100"), + self._add_backup("20190316_0100"), + self._add_backup("20181116_0100"), + ] + expected_backups = [backups[2], backups[4], backups[5], backups[6]] + self._run_cleanup(keep_yearly=3) + self._check_backups(expected_backups) + + @mock.patch(f"{bk.__name__}.datetime", wraps=datetime) + def test_dry_run(self, mock_datetime): + """ Test dry run does not remove anything """ + mock_datetime.now.return_value = datetime(2021, 11, 11) + backups = [ + self._add_backup("20211103_0300"), + self._add_backup("20210810_0000"), + self._add_backup("20210716_0100"), + self._add_backup("20200716_0100"), + self._add_backup("20181116_0100"), + ] + self._run_cleanup(keep_all=2, dry_run=True) + self._check_backups(backups)