Add backups cleanup

Maks Snegov 2021-11-12 02:53:21 +03:00
parent d521ef3c89
commit c64955362a
4 changed files with 420 additions and 56 deletions
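
In short: main.py now imports the backup module as a whole and prunes old backups before taking a new one, backup.py gains the cleanup_old_backups retention logic, fs.py gains a PseudoDirEntry helper, and tests/test_backups.py covers the cleanup. A minimal sketch of the new call order (the run() wrapper is hypothetical; in the commit this logic lives in main(), with the argument names used in the diff below):

import os.path
import spqr.curateipsum.backup as backup

def run(args):  # hypothetical wrapper; in the commit this happens inside main()
    backup_dir_abs = os.path.abspath(args.backup_dir)
    # prune stale backups first, then create the new one
    backup.cleanup_old_backups(backup_dir=backup_dir_abs, dry_run=args.dry_run)
    backup.initiate_backup(sources=args.sources,
                           backup_dir=backup_dir_abs,
                           dry_run=args.dry_run)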

main.py

@@ -6,7 +6,7 @@ import os.path
import shutil
import sys
from spqr.curateipsum.backup import initiate_backup
import spqr.curateipsum.backup as backup
_lg = logging.getLogger("spqr.curateipsum")
SUPPORTED_PLATFORMS = ("linux", "darwin")
@@ -56,7 +56,8 @@ def main():
_lg.info("Starting %s: %s", parser.prog, args)
if sys.platform not in SUPPORTED_PLATFORMS:
_lg.error(f"Not supported platform: {sys.platform}. Supported platforms: {SUPPORTED_PLATFORMS}")
_lg.error("Not supported platform: %s. Supported platforms: %s",
sys.platform, SUPPORTED_PLATFORMS)
return 1
if args.external_rsync and not shutil.which("rsync"):
@@ -65,7 +66,8 @@ def main():
cp_program = "gcp" if sys.platform == "darwin" else "cp"
if args.external_hardlink and not shutil.which(cp_program):
_lg.error(f"{cp_program} should be installed to use --external-hardlink option.")
_lg.error("%s should be installed to use --external-hardlink option.",
cp_program)
return 1
backup_dir_abs = os.path.abspath(args.backup_dir)
@@ -78,7 +80,9 @@ def main():
_lg.error("Source directory %s does not exist", src_dir)
return 1
initiate_backup(
backup.cleanup_old_backups(backup_dir=backup_dir_abs, dry_run=args.dry_run)
backup.initiate_backup(
sources=args.sources,
backup_dir=backup_dir_abs,
dry_run=args.dry_run,

spqr/curateipsum/backup.py

@@ -6,44 +6,166 @@ import logging
import os
import shutil
import time
from datetime import datetime
from typing import Optional
from datetime import datetime, timedelta
from typing import Optional, Iterable
import spqr.curateipsum.fs as fs
BACKUP_ENT_FMT = "%y%m%d_%H%M"
BACKUP_ENT_FMT = "%Y%m%d_%H%M"
DELTA_DIR = "_delta"
_lg = logging.getLogger(__name__)
def _is_backup_entity(entity_path: str) -> bool:
def _is_backup_entity(backup_entry: os.DirEntry) -> bool:
""" Check if entity_path is a single backup dir. """
if not os.path.isdir(entity_path):
if not backup_entry.is_dir():
return False
try:
datetime.strptime(os.path.basename(entity_path), BACKUP_ENT_FMT)
datetime.strptime(backup_entry.name, BACKUP_ENT_FMT)
return True
except ValueError:
return False
def _get_latest_backup(backup_dir: str) -> Optional[str]:
def _iterate_backups(backup_dir: str) -> Iterable[os.DirEntry]:
b_iter = os.scandir(backup_dir)
b_ent: os.DirEntry
for b_ent in b_iter:
if not _is_backup_entity(b_ent):
continue
if not os.listdir(b_ent.path):
_lg.info("Removing empty backup entity: %s", b_ent.name)
os.rmdir(b_ent.path)
continue
yield b_ent
b_iter.close()
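# NB: _iterate_backups is a generator, so the explicit close() on the scandir
# iterator only runs once the generator is exhausted; both callers below
# consume it fully via sorted().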
def _get_latest_backup(backup_dir: str) -> Optional[os.DirEntry]:
""" Returns path to latest backup created in backup_dir or None. """
backups = sorted(os.listdir(backup_dir), reverse=True)
for b_ent in backups:
b_ent_abs = os.path.join(backup_dir, b_ent)
if not _is_backup_entity(b_ent_abs):
continue
if not os.listdir(b_ent_abs):
_lg.info("Removing empty backup entity: %s", os.path.basename(b_ent_abs))
os.rmdir(b_ent_abs)
continue
return b_ent_abs
all_backups = sorted(_iterate_backups(backup_dir), key=lambda e: e.name)
if all_backups:
return all_backups[-1]
return None
def _date_from_backup(backup: os.DirEntry) -> datetime:
return datetime.strptime(backup.name, BACKUP_ENT_FMT)
def cleanup_old_backups(
backup_dir: str,
dry_run: bool = False,
keep_all: int = 7,
keep_daily: int = 30,
keep_weekly: int = 52,
keep_monthly: int = 12,
keep_yearly: int = 5,
min_free_space: int = 0
):
"""
Delete old backups. Never deletes the only backup.
:param backup_dir: full path to backup directory.
:param dry_run: don't do anything actually.
:param keep_all: the number of days that all backups must be kept.
:param keep_daily: the number of days that all daily backups must be kept.
:param keep_weekly: the number of weeks of which one weekly backup must be kept.
:param keep_monthly: the number of months (1 month = 30 days) of which
one monthly backup must be kept.
:param keep_yearly: the number of years of which one yearly backup must be kept.
:param min_free_space: not used right now
:return:
"""
all_backups = sorted(_iterate_backups(backup_dir),
key=lambda e: e.name, reverse=True)
if dry_run:
_lg.info("Dry-run, no backups will be actually removed")
if not all_backups:
_lg.debug("No backups, exiting")
return
elif len(all_backups) == 1:
_lg.debug("Only one backup (%s) exists, will not remove it",
all_backups[0].name)
return
now = datetime.now()
thresholds = {k: now.strftime(BACKUP_ENT_FMT)
for k in ("all", "daily", "weekly", "monthly", "yearly")}
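# Backup names follow BACKUP_ENT_FMT ("%Y%m%d_%H%M"), which is zero-padded, so
# comparing names against these formatted thresholds as plain strings is
# equivalent to comparing the underlying datetimes. Defaulting every threshold
# to "now" makes a disabled level fall straight through to the next,
# coarser check.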
if keep_all is not None:
thresholds["all"] = ((now - timedelta(days=keep_all))
.replace(hour=0, minute=0, second=0)
.strftime(BACKUP_ENT_FMT))
if keep_daily is not None:
thresholds["daily"] = ((now - timedelta(days=keep_daily))
.replace(hour=0, minute=0, second=0)
.strftime(BACKUP_ENT_FMT))
if keep_weekly is not None:
thresholds["weekly"] = (
(now - timedelta(weeks=keep_weekly, days=now.weekday()))
.strftime(BACKUP_ENT_FMT)
)
if keep_monthly is not None:
thresholds["monthly"] = ((now - timedelta(days=30*keep_monthly))
.replace(day=1, hour=0, minute=0, second=0)
.strftime(BACKUP_ENT_FMT))
if keep_yearly is not None:
thresholds["yearly"] = (
(now - timedelta(days=365*keep_yearly))
.replace(month=1, day=1, hour=0, minute=0, second=0)
.strftime(BACKUP_ENT_FMT)
)
prev_backup = all_backups[0]
to_remove = {b: False for b in all_backups}
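# Sweep from newest to oldest: within each retention window, when two adjacent
# backups fall into the same calendar unit, the newer one is marked for
# removal, so the oldest backup of every day/week/month/year survives.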
for backup in all_backups[1:]:
# skip all backups made after threshold
if backup.name > thresholds["all"]:
prev_backup = backup
continue
# leave only one backup per day for backups made after threshold
if backup.name > thresholds["daily"]:
if (_date_from_backup(prev_backup).date()
== _date_from_backup(backup).date()):
to_remove[prev_backup] = True
prev_backup = backup
continue
# leave only one backup per week for backups made after threshold
if backup.name > thresholds["weekly"]:
if (_date_from_backup(prev_backup).isocalendar()[1]
== _date_from_backup(backup).isocalendar()[1]):
to_remove[prev_backup] = True
prev_backup = backup
continue
# leave only one backup per month for backups made after threshold
if backup.name > thresholds["monthly"]:
if (_date_from_backup(prev_backup).date().replace(day=1)
== _date_from_backup(backup).date().replace(day=1)):
to_remove[prev_backup] = True
prev_backup = backup
continue
# leave only one backup per year for backups made after threshold
if backup.name > thresholds["yearly"]:
if (_date_from_backup(prev_backup).date().replace(month=1, day=1)
== _date_from_backup(backup).date().replace(month=1, day=1)):
to_remove[prev_backup] = True
prev_backup = backup
continue
to_remove[backup] = True
for backup, do_delete in to_remove.items():
if not do_delete:
continue
_lg.info("Removing old backup %s", backup.name)
if not dry_run:
shutil.rmtree(backup.path)
def process_backed_entry(backup_dir: str, entry_relpath: str, action: fs.Actions):
_lg.debug("%s %s", action, entry_relpath)
if action is not fs.Actions.delete:
@@ -60,33 +182,37 @@ def initiate_backup(sources,
start_time = time.time()
start_time_fmt = datetime.fromtimestamp(start_time).strftime(BACKUP_ENT_FMT)
cur_backup = os.path.join(backup_dir, start_time_fmt)
cur_backup_name = os.path.basename(cur_backup)
_lg.debug("Current backup dir: %s", cur_backup)
cur_backup = fs.PseudoDirEntry(os.path.join(backup_dir, start_time_fmt))
_lg.debug("Current backup dir: %s", cur_backup.path)
latest_backup = _get_latest_backup(backup_dir)
if cur_backup == latest_backup:
_lg.warning("Latest backup %s was created less than minute ago, exiting",
os.path.basename(latest_backup))
return
if latest_backup is None:
_lg.info("Creating empty directory for current backup: %s", cur_backup_name)
os.mkdir(cur_backup)
else:
_lg.info("Copying data from latest backup %s to current backup %s",
os.path.basename(latest_backup), cur_backup_name)
_lg.info("Creating empty directory for current backup: %s",
cur_backup.name)
os.mkdir(cur_backup.path)
hl_res = fs.hardlink_dir(src_dir=latest_backup, dst_dir=cur_backup,
else:
# TODO check last backup is finalized
if cur_backup.name == latest_backup.name:
_lg.warning("Latest backup %s was created less than minute ago, exiting",
latest_backup.name)
return
_lg.info("Copying data from latest backup %s to current backup %s",
latest_backup.name, cur_backup.name)
hl_res = fs.hardlink_dir(src_dir=latest_backup.path,
dst_dir=cur_backup.path,
use_external=external_hardlink)
if not hl_res:
_lg.error("Something went wrong during copying data from latest backup,"
" removing created %s", cur_backup_name)
shutil.rmtree(cur_backup, ignore_errors=True)
" removing created %s", cur_backup.name)
shutil.rmtree(cur_backup.path, ignore_errors=True)
return
# clean up delta dir from copied backup
shutil.rmtree(os.path.join(cur_backup, DELTA_DIR), ignore_errors=True)
shutil.rmtree(os.path.join(cur_backup.path, DELTA_DIR), ignore_errors=True)
rsync_func = fs.rsync_ext if external_rsync else fs.rsync
@@ -94,12 +220,12 @@ def initiate_backup(sources,
for src in sources:
src_abs = os.path.abspath(src)
src_name = os.path.basename(src_abs)
dst_abs = os.path.join(cur_backup, src_name)
_lg.info("Backing up directory %s to %s backup", src_abs, cur_backup_name)
dst_abs = os.path.join(cur_backup.path, src_name)
_lg.info("Backing up directory %s to %s backup", src_abs, cur_backup.name)
for entry_relpath, action in rsync_func(src_abs, dst_abs, dry_run=dry_run):
if latest_backup is not None:
process_backed_entry(
backup_dir=cur_backup,
backup_dir=cur_backup.path,
entry_relpath=os.path.join(src_name, entry_relpath),
action=action
)
@@ -107,15 +233,15 @@ def initiate_backup(sources,
# do not create backup on dry-run
if dry_run:
_lg.info("Dry-run, removing created backup: %s", cur_backup_name)
shutil.rmtree(cur_backup, ignore_errors=True)
_lg.info("Dry-run, removing created backup: %s", cur_backup.name)
shutil.rmtree(cur_backup.path, ignore_errors=True)
# do not create backup if no change from previous one
elif latest_backup is not None and not backup_changed:
_lg.info("Newly created backup %s is the same as previous one %s, removing",
cur_backup_name, os.path.basename(latest_backup))
shutil.rmtree(cur_backup, ignore_errors=True)
cur_backup.name, latest_backup.name)
shutil.rmtree(cur_backup.path, ignore_errors=True)
else:
_lg.info("Backup created: %s", cur_backup_name)
_lg.info("Backup created: %s", cur_backup.name)
end_time = time.time()
spend_time = end_time - start_time
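
For reference, a hedged usage sketch of the new retention entry point; the directory path is illustrative, and the keep_* values spell out the commit's own defaults:

import logging
import spqr.curateipsum.backup as backup

logging.basicConfig(level=logging.INFO)
# Dry run: log which backups would be pruned without touching anything.
backup.cleanup_old_backups(backup_dir="/backups", dry_run=True,
                           keep_all=7, keep_daily=30, keep_weekly=52,
                           keep_monthly=12, keep_yearly=5)
# Real pass, same policy via the defaults.
backup.cleanup_old_backups(backup_dir="/backups")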

spqr/curateipsum/fs.py

@@ -22,6 +22,27 @@ class Actions(enum.Enum):
update_owner = enum.auto()
create = enum.auto()
class PseudoDirEntry:
def __init__(self, path):
self.path = os.path.realpath(path)
self.name = os.path.basename(self.path)
self._is_dir = None
self._stat = None
def __str__(self):
return self.name
def is_dir(self) -> bool:
if self._is_dir is None:
self._is_dir = os.path.isdir(self.path)
return self._is_dir
def stat(self):
if self._stat is None:
self._stat = os.lstat(self.path)
return self._stat
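# PseudoDirEntry mirrors the subset of the os.DirEntry interface used by the
# backup code (path, name, is_dir, stat), so a path that did not come from
# os.scandir() can be handled uniformly alongside real scandir entries.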
# *deleting will_be_deleted
# >f.st.... .gitignore
# >f+++++++ LICENSE
@@ -79,7 +100,9 @@ def rsync_ext(src, dst, dry_run=False):
rsync_args.append(str(dst))
_lg.info("Executing external command: %s", " ".join(rsync_args))
process = subprocess.Popen(rsync_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
process = subprocess.Popen(rsync_args,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
with process.stdout:
prev_line = None
for line in iter(process.stdout.readline, b""):
@@ -246,10 +269,12 @@ def rsync(src_dir, dst_dir, dry_run=False) -> Iterable[tuple]:
dst_root_abs = os.path.abspath(dst_dir)
if not os.path.isdir(src_root_abs):
raise RuntimeError(f"Error during reading source directory: {src_root_abs}")
raise RuntimeError("Error during reading source directory: %s"
% src_root_abs)
if os.path.exists(dst_root_abs):
if not os.path.isdir(dst_root_abs):
raise RuntimeError("Destination path is not a directory: %s" % dst_root_abs)
raise RuntimeError("Destination path is not a directory: %s"
% dst_root_abs)
else:
os.mkdir(dst_root_abs)
@@ -278,19 +303,22 @@ def rsync(src_dir, dst_dir, dry_run=False) -> Iterable[tuple]:
# rewrite dst if it has different than src type
if src_entry.is_file(follow_symlinks=False):
if not dst_entry.is_file(follow_symlinks=False):
_lg.debug("Rewriting (src is a file, dst is not a file): %s", rel_path)
_lg.debug("Rewriting (src is a file, dst is not a file): %s",
rel_path)
update_direntry(src_entry, dst_entry)
yield rel_path, Actions.rewrite
continue
if src_entry.is_dir(follow_symlinks=False):
if not dst_entry.is_dir(follow_symlinks=False):
_lg.debug("Rewriting (src is a dir, dst is not a dir): %s", rel_path)
_lg.debug("Rewriting (src is a dir, dst is not a dir): %s",
rel_path)
update_direntry(src_entry, dst_entry)
yield rel_path, Actions.rewrite
continue
if src_entry.is_symlink():
if not dst_entry.is_symlink():
_lg.debug("Rewriting (src is a symlink, dst is not a symlink): %s", rel_path)
_lg.debug("Rewriting (src is a symlink, dst is not a symlink): %s",
rel_path)
update_direntry(src_entry, dst_entry)
yield rel_path, Actions.rewrite
continue
@@ -379,7 +407,9 @@ def _recursive_hardlink_ext(src: str, dst: str) -> bool:
src_content = glob.glob(f"{src}/*")
cmd = [cp, "--archive", "--verbose", "--link", *src_content, dst]
_lg.info("Executing external command: %s", " ".join(cmd))
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
process = subprocess.Popen(cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
with process.stdout:
for line in iter(process.stdout.readline, b""):
_lg.debug("%s: %s", cp, line.decode("utf-8").strip())
@@ -458,10 +488,11 @@ def nest_hardlink(src_dir: str, src_relpath: str, dst_dir: str):
# check source entity and destination directory
if not os.path.exists(src_full_path):
raise RuntimeError(f"Error reading source entity: {src_full_path}")
raise RuntimeError("Error reading source entity: %s" % src_full_path)
if os.path.exists(dst_dir_abs):
if not os.path.isdir(dst_dir_abs):
raise RuntimeError("Destination path is not a directory: %s" % dst_dir_abs)
raise RuntimeError("Destination path is not a directory: %s"
% dst_dir_abs)
else:
os.mkdir(dst_dir_abs)
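
A short, hedged sketch of consuming fs.rsync the way initiate_backup does: it yields a (path relative to the source, fs.Actions member) pair for every entry it handles. The directory paths here are illustrative:

import spqr.curateipsum.fs as fs

# Sync a source tree into a destination, reporting a per-entry action.
for rel_path, action in fs.rsync("/data/src", "/backups/tmp/src", dry_run=True):
    print(action.name, rel_path)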

tests/test_backups.py

@@ -0,0 +1,203 @@
import os
import random
import string
import tempfile
from unittest import TestCase, mock
from datetime import datetime
import spqr.curateipsum.backup as bk
import spqr.curateipsum.fs as fs
class TestBackupCleanup(TestCase):
def setUp(self) -> None:
self.backup_dir = tempfile.TemporaryDirectory(prefix="backup_")
def tearDown(self) -> None:
self.backup_dir.cleanup()
def _add_backup(self, backup_name: str) -> fs.PseudoDirEntry:
backup = fs.PseudoDirEntry(os.path.join(self.backup_dir.name, backup_name))
os.mkdir(backup.path)
fd, path = tempfile.mkstemp(prefix="backup_file_", dir=backup.path)
with open(fd, "w") as f:
f.write(''.join(random.choices(string.printable, k=128)))
return backup
@staticmethod
def _check_backup_not_empty(backup: fs.PseudoDirEntry) -> bool:
return bool(os.listdir(backup.path))
def _check_backups(self, expected_backups):
backups_list = os.listdir(self.backup_dir.name)
self.assertEqual(sorted(b.name for b in expected_backups),
sorted(backups_list))
for b in expected_backups:
self.assertTrue(self._check_backup_not_empty(b))
def _run_cleanup(self, **kwargs):
""" Run cleanup_old_backups with null parameters. """
cleanup_kwargs = {
"backup_dir": self.backup_dir.name,
"dry_run": False,
"keep_all": None,
"keep_daily": None,
"keep_weekly": None,
"keep_monthly": None,
"keep_yearly": None,
}
cleanup_kwargs.update(**kwargs)
bk.cleanup_old_backups(**cleanup_kwargs)
def test_no_backups(self):
""" Test behaviour with no available backups """
bk.cleanup_old_backups(self.backup_dir.name)
self.assertFalse(os.listdir(self.backup_dir.name))
@mock.patch(f"{bk.__name__}.datetime", wraps=datetime)
def test_only_one_backup(self, mock_datetime):
""" Test the only backup will not be removed in any case """
mock_datetime.now.return_value = datetime(2021, 10, 20)
only_backup = self._add_backup("20010101_0000")
self._run_cleanup(keep_all=1)
self._check_backups([only_backup])
@mock.patch(f"{bk.__name__}.datetime", wraps=datetime)
def test_at_least_one_should_be_left(self, mock_datetime):
""" Test at least one backup should be left """
mock_datetime.now.return_value = datetime(2021, 10, 20)
backups = [
self._add_backup("20211103_0300"),
self._add_backup("20201216_0100"),
self._add_backup("20200716_0100"),
self._add_backup("20181116_0100"),
]
expected_backups = [backups[0]]
self._run_cleanup()
self._check_backups(expected_backups)
@mock.patch(f"{bk.__name__}.datetime", wraps=datetime)
def test_keep_all_threshold_only(self, mock_datetime):
""" Test threshold for keeping all backups """
mock_datetime.now.return_value = datetime(2021, 10, 20)
backups = [
self._add_backup("20211019_0300"),
self._add_backup("20211017_0100"),
self._add_backup("20211016_2300"),
]
expected_backups = backups[:2]
self._run_cleanup(keep_all=3)
self._check_backups(expected_backups)
@mock.patch(f"{bk.__name__}.datetime", wraps=datetime)
def test_keep_daily_threshold_only(self, mock_datetime):
""" Test threshold for keeping daily backups """
mock_datetime.now.return_value = datetime(2021, 10, 20)
backups = [
self._add_backup("20211019_0300"),
self._add_backup("20211017_2100"),
self._add_backup("20211017_0100"),
self._add_backup("20211017_0030"),
self._add_backup("20211016_2300"),
self._add_backup("20211016_0100"),
]
expected_backups = [backups[0], backups[3]]
self._run_cleanup(keep_daily=3)
self._check_backups(expected_backups)
@mock.patch(f"{bk.__name__}.datetime", wraps=datetime)
def test_keep_all_and_daily_thresholds(self, mock_datetime):
""" Test threshold for keeping all and daily backups """
mock_datetime.now.return_value = datetime(2021, 10, 20)
backups = [
self._add_backup("20211019_0300"),
self._add_backup("20211017_0200"),
self._add_backup("20211017_0100"),
self._add_backup("20211016_2300"),
self._add_backup("20211016_2200"),
self._add_backup("20211015_2200"),
self._add_backup("20211015_1500"),
self._add_backup("20211015_0200"),
self._add_backup("20211014_2200"),
self._add_backup("20211014_2000"),
self._add_backup("20211014_1232"),
]
expected_backups = backups[0:3] + [backups[4]] + [backups[7]]
self._run_cleanup(keep_all=3, keep_daily=5)
self._check_backups(expected_backups)
@mock.patch(f"{bk.__name__}.datetime", wraps=datetime)
def test_keep_weekly_threshold_only(self, mock_datetime):
""" Test threshold for keeping weekly backups """
mock_datetime.now.return_value = datetime(2021, 11, 11)
backups = [
self._add_backup("20211111_0300"),
self._add_backup("20211110_0300"),
self._add_backup("20211108_0100"),
self._add_backup("20211107_2300"),
self._add_backup("20211107_0100"),
self._add_backup("20211031_0100"),
self._add_backup("20211025_0100"),
self._add_backup("20211024_0100"),
self._add_backup("20211023_0100"),
self._add_backup("20211022_0100"),
self._add_backup("20211008_0100"),
self._add_backup("20211007_0100"),
self._add_backup("20211004_0100"),
self._add_backup("20211003_0100"),
]
expected_backups = [backups[2], backups[4], backups[6],
backups[9], backups[12]]
self._run_cleanup(keep_weekly=5)
self._check_backups(expected_backups)
@mock.patch(f"{bk.__name__}.datetime", wraps=datetime)
def test_keep_monthly_threshold_only(self, mock_datetime):
""" Test threshold for keeping monthly backups """
mock_datetime.now.return_value = datetime(2021, 11, 11)
backups = [
self._add_backup("20211103_0300"),
self._add_backup("20211019_0300"),
self._add_backup("20211017_2100"),
self._add_backup("20211017_0100"),
self._add_backup("20210916_2300"),
self._add_backup("20210916_0100"),
self._add_backup("20210816_0100"),
self._add_backup("20210810_0000"),
self._add_backup("20210716_0100"),
]
expected_backups = [backups[0], backups[3], backups[5], backups[7]]
self._run_cleanup(keep_monthly=3)
self._check_backups(expected_backups)
@mock.patch(f"{bk.__name__}.datetime", wraps=datetime)
def test_keep_yearly_threshold_only(self, mock_datetime):
""" Test threshold for keeping yearly backups """
mock_datetime.now.return_value = datetime(2021, 11, 11)
backups = [
self._add_backup("20211103_0300"),
self._add_backup("20210810_0000"),
self._add_backup("20210716_0100"),
self._add_backup("20201216_0100"),
self._add_backup("20200716_0100"),
self._add_backup("20190316_0100"),
self._add_backup("20181116_0100"),
]
expected_backups = [backups[2], backups[4], backups[5], backups[6]]
self._run_cleanup(keep_yearly=3)
self._check_backups(expected_backups)
@mock.patch(f"{bk.__name__}.datetime", wraps=datetime)
def test_dry_run(self, mock_datetime):
""" Test dry run does not remove anything """
mock_datetime.now.return_value = datetime(2021, 11, 11)
backups = [
self._add_backup("20211103_0300"),
self._add_backup("20210810_0000"),
self._add_backup("20210716_0100"),
self._add_backup("20200716_0100"),
self._add_backup("20181116_0100"),
]
self._run_cleanup(keep_all=2, dry_run=True)
self._check_backups(backups)
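
A note on the mocking pattern used throughout these tests: patching the datetime name inside the backup module with wraps=datetime pins now() to a fixed instant while strptime/strftime keep working through the wrapped class. A standalone sketch of the same pattern:

from datetime import datetime
from unittest import mock

import spqr.curateipsum.backup as bk

with mock.patch(f"{bk.__name__}.datetime", wraps=datetime) as mock_dt:
    mock_dt.now.return_value = datetime(2021, 11, 11)
    # code under test sees the frozen clock
    assert bk.datetime.now() == datetime(2021, 11, 11)
    # parsing still works, delegated to the real datetime class
    bk.datetime.strptime("20211111_0000", bk.BACKUP_ENT_FMT)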