Add backups cleanup

This commit is contained in:
2021-11-12 02:53:21 +03:00
parent d521ef3c89
commit c64955362a
4 changed files with 420 additions and 56 deletions

View File

@@ -6,44 +6,166 @@ import logging
import os
import shutil
import time
from datetime import datetime
from typing import Optional
from datetime import datetime, timedelta
from typing import Optional, Iterable
import spqr.curateipsum.fs as fs
BACKUP_ENT_FMT = "%y%m%d_%H%M"
BACKUP_ENT_FMT = "%Y%m%d_%H%M"
DELTA_DIR = "_delta"
_lg = logging.getLogger(__name__)
def _is_backup_entity(entity_path: str) -> bool:
def _is_backup_entity(backup_entry: os.DirEntry) -> bool:
""" Check if entity_path is a single backup dir. """
if not os.path.isdir(entity_path):
if not backup_entry.is_dir():
return False
try:
datetime.strptime(os.path.basename(entity_path), BACKUP_ENT_FMT)
datetime.strptime(backup_entry.name, BACKUP_ENT_FMT)
return True
except ValueError:
return False
def _get_latest_backup(backup_dir: str) -> Optional[str]:
def _iterate_backups(backup_dir: str) -> Iterable[os.DirEntry]:
    """ Yield valid backup entries found in backup_dir.

    Empty backup directories are pruned (rmdir'd) as a side effect while
    scanning. Entries that don't parse as backup names are skipped.

    :param backup_dir: full path to backup directory.
    :return: generator of os.DirEntry objects for non-empty backups.
    """
    # Use the context manager so the scandir handle is closed even when
    # the generator is abandoned before exhaustion; the original explicit
    # b_iter.close() after the loop only ran on full iteration.
    with os.scandir(backup_dir) as b_iter:
        b_ent: os.DirEntry
        for b_ent in b_iter:
            if not _is_backup_entity(b_ent):
                continue
            if not os.listdir(b_ent.path):
                _lg.info("Removing empty backup entity: %s", b_ent.name)
                os.rmdir(b_ent.path)
                continue
            yield b_ent
def _get_latest_backup(backup_dir: str) -> Optional[os.DirEntry]:
""" Returns path to latest backup created in backup_dir or None. """
backups = sorted(os.listdir(backup_dir), reverse=True)
for b_ent in backups:
b_ent_abs = os.path.join(backup_dir, b_ent)
if not _is_backup_entity(b_ent_abs):
continue
if not os.listdir(b_ent_abs):
_lg.info("Removing empty backup entity: %s", os.path.basename(b_ent_abs))
os.rmdir(b_ent_abs)
continue
return b_ent_abs
all_backups = sorted(_iterate_backups(backup_dir), key=lambda e: e.name)
if all_backups:
return all_backups[-1]
return None
def _date_from_backup(backup: os.DirEntry) -> datetime:
    """ Parse the creation timestamp encoded in the backup's directory name. """
    stamp = backup.name
    return datetime.strptime(stamp, BACKUP_ENT_FMT)
def cleanup_old_backups(
        backup_dir: str,
        dry_run: bool = False,
        keep_all: int = 7,
        keep_daily: int = 30,
        keep_weekly: int = 52,
        keep_monthly: int = 12,
        keep_yearly: int = 5,
        min_free_space: int = 0
):
    """
    Delete old backups. Never deletes the only backup.

    Backups are walked newest-to-oldest; inside each retention band only
    one backup per period (day/week/month/year) survives — the newer
    duplicate of a period is marked for removal, the oldest one is kept.

    :param backup_dir: full path to backup directory.
    :param dry_run: don't do anything actually.
    :param keep_all: the number of days that all backups must be kept.
    :param keep_daily: the number of days that all daily backups must be kept.
    :param keep_weekly: the number of weeks of which one weekly backup must be kept.
    :param keep_monthly: the number of months (1 month = 30 days) of which
        one monthly backup must be kept.
    :param keep_yearly: the number of years of which one yearly backup must be kept.
    :param min_free_space: not used right now
    :return:
    """
    all_backups = sorted(_iterate_backups(backup_dir),
                         key=lambda e: e.name, reverse=True)
    if dry_run:
        _lg.info("Dry-run, no backups will be actually removed")
    if not all_backups:
        _lg.debug("No backups, exiting")
        return
    if len(all_backups) == 1:
        _lg.debug("Only one backup (%s) exists, will not remove it",
                  all_backups[0].name)
        return

    now = datetime.now()
    # Thresholds default to "now": with a keep_* of None no backup name
    # compares greater than the current timestamp, so that band matches
    # nothing and processing falls through to the next rule.
    thresholds = {k: now.strftime(BACKUP_ENT_FMT)
                  for k in ("all", "daily", "weekly", "monthly", "yearly")}
    if keep_all is not None:
        thresholds["all"] = ((now - timedelta(days=keep_all))
                             .replace(hour=0, minute=0, second=0)
                             .strftime(BACKUP_ENT_FMT))
    if keep_daily is not None:
        thresholds["daily"] = ((now - timedelta(days=keep_daily))
                               .replace(hour=0, minute=0, second=0)
                               .strftime(BACKUP_ENT_FMT))
    if keep_weekly is not None:
        # now.weekday() extra days snap the threshold to the week start.
        thresholds["weekly"] = (
            (now - timedelta(weeks=keep_weekly, days=now.weekday()))
            .strftime(BACKUP_ENT_FMT)
        )
    if keep_monthly is not None:
        thresholds["monthly"] = ((now - timedelta(days=30*keep_monthly))
                                 .replace(day=1, hour=0, minute=0, second=0)
                                 .strftime(BACKUP_ENT_FMT))
    if keep_yearly is not None:
        thresholds["yearly"] = (
            (now - timedelta(days=365*keep_yearly))
            .replace(month=1, day=1, hour=0, minute=0, second=0)
            .strftime(BACKUP_ENT_FMT)
        )

    prev_backup = all_backups[0]
    to_remove = {b: False for b in all_backups}
    for backup in all_backups[1:]:
        # skip all backups made after threshold
        if backup.name > thresholds["all"]:
            prev_backup = backup
            continue
        # leave only one backup per day for backups made after threshold
        if backup.name > thresholds["daily"]:
            if (_date_from_backup(prev_backup).date()
                    == _date_from_backup(backup).date()):
                to_remove[prev_backup] = True
            prev_backup = backup
            continue
        # leave only one backup per week for backups made after threshold;
        # FIX: compare (ISO year, ISO week), not just the week number —
        # the bare week number collides across different years
        if backup.name > thresholds["weekly"]:
            if (tuple(_date_from_backup(prev_backup).isocalendar())[:2]
                    == tuple(_date_from_backup(backup).isocalendar())[:2]):
                to_remove[prev_backup] = True
            prev_backup = backup
            continue
        # leave only one backup per month for backups made after threshold
        if backup.name > thresholds["monthly"]:
            if (_date_from_backup(prev_backup).date().replace(day=1)
                    == _date_from_backup(backup).date().replace(day=1)):
                to_remove[prev_backup] = True
            prev_backup = backup
            continue
        # leave only one backup per year for backups made after threshold
        if backup.name > thresholds["yearly"]:
            if (_date_from_backup(prev_backup).date().replace(month=1, day=1)
                    == _date_from_backup(backup).date().replace(month=1, day=1)):
                to_remove[prev_backup] = True
            prev_backup = backup
            continue
        # older than every threshold: remove unconditionally
        to_remove[backup] = True

    for backup, do_delete in to_remove.items():
        if not do_delete:
            continue
        # FIX: previously the "Removing old backup" message was logged for
        # every backup, including the ones being kept; log only real removals.
        _lg.info("Removing old backup %s", backup.name)
        if not dry_run:
            shutil.rmtree(backup.path)
def process_backed_entry(backup_dir: str, entry_relpath: str, action: fs.Actions):
_lg.debug("%s %s", action, entry_relpath)
if action is not fs.Actions.delete:
@@ -60,33 +182,37 @@ def initiate_backup(sources,
start_time = time.time()
start_time_fmt = datetime.fromtimestamp(start_time).strftime(BACKUP_ENT_FMT)
cur_backup = os.path.join(backup_dir, start_time_fmt)
cur_backup_name = os.path.basename(cur_backup)
_lg.debug("Current backup dir: %s", cur_backup)
cur_backup = fs.PseudoDirEntry(os.path.join(backup_dir, start_time_fmt))
_lg.debug("Current backup dir: %s", cur_backup.path)
latest_backup = _get_latest_backup(backup_dir)
if cur_backup == latest_backup:
_lg.warning("Latest backup %s was created less than minute ago, exiting",
os.path.basename(latest_backup))
return
if latest_backup is None:
_lg.info("Creating empty directory for current backup: %s", cur_backup_name)
os.mkdir(cur_backup)
else:
_lg.info("Copying data from latest backup %s to current backup %s",
os.path.basename(latest_backup), cur_backup_name)
_lg.info("Creating empty directory for current backup: %s",
cur_backup.name)
os.mkdir(cur_backup.path)
hl_res = fs.hardlink_dir(src_dir=latest_backup, dst_dir=cur_backup,
else:
# TODO check last backup is finalized
if cur_backup.name == latest_backup.name:
_lg.warning("Latest backup %s was created less than minute ago, exiting",
latest_backup.name)
return
_lg.info("Copying data from latest backup %s to current backup %s",
latest_backup.name, cur_backup.name)
hl_res = fs.hardlink_dir(src_dir=latest_backup.path,
dst_dir=cur_backup.path,
use_external=external_hardlink)
if not hl_res:
_lg.error("Something went wrong during copying data from latest backup,"
" removing created %s", cur_backup_name)
shutil.rmtree(cur_backup, ignore_errors=True)
" removing created %s", cur_backup.name)
shutil.rmtree(cur_backup.path, ignore_errors=True)
return
# clean up delta dir from copied backup
shutil.rmtree(os.path.join(cur_backup, DELTA_DIR), ignore_errors=True)
shutil.rmtree(os.path.join(cur_backup.path, DELTA_DIR), ignore_errors=True)
rsync_func = fs.rsync_ext if external_rsync else fs.rsync
@@ -94,12 +220,12 @@ def initiate_backup(sources,
for src in sources:
src_abs = os.path.abspath(src)
src_name = os.path.basename(src_abs)
dst_abs = os.path.join(cur_backup, src_name)
_lg.info("Backing up directory %s to %s backup", src_abs, cur_backup_name)
dst_abs = os.path.join(cur_backup.path, src_name)
_lg.info("Backing up directory %s to %s backup", src_abs, cur_backup.name)
for entry_relpath, action in rsync_func(src_abs, dst_abs, dry_run=dry_run):
if latest_backup is not None:
process_backed_entry(
backup_dir=cur_backup,
backup_dir=cur_backup.path,
entry_relpath=os.path.join(src_name, entry_relpath),
action=action
)
@@ -107,15 +233,15 @@ def initiate_backup(sources,
# do not create backup on dry-run
if dry_run:
_lg.info("Dry-run, removing created backup: %s", cur_backup_name)
shutil.rmtree(cur_backup, ignore_errors=True)
_lg.info("Dry-run, removing created backup: %s", cur_backup.name)
shutil.rmtree(cur_backup.path, ignore_errors=True)
# do not create backup if no change from previous one
elif latest_backup is not None and not backup_changed:
_lg.info("Newly created backup %s is the same as previous one %s, removing",
cur_backup_name, os.path.basename(latest_backup))
shutil.rmtree(cur_backup, ignore_errors=True)
cur_backup.name, latest_backup.name)
shutil.rmtree(cur_backup.path, ignore_errors=True)
else:
_lg.info("Backup created: %s", cur_backup_name)
_lg.info("Backup created: %s", cur_backup.name)
end_time = time.time()
spend_time = end_time - start_time

View File

@@ -22,6 +22,27 @@ class Actions(enum.Enum):
update_owner = enum.auto()
create = enum.auto()
class PseudoDirEntry:
    """Lightweight stand-in for os.DirEntry built from a plain path.

    Resolves the path once up front and lazily caches is_dir() and stat()
    results, mirroring the os.DirEntry interface used by the backup code.
    """

    def __init__(self, path):
        # Canonicalize immediately so .path and .name are stable.
        self.path = os.path.realpath(path)
        self.name = os.path.basename(self.path)
        # Lazy caches; None means "not computed yet".
        self._is_dir = None
        self._stat = None

    def __str__(self):
        return self.name

    def is_dir(self) -> bool:
        """Return (and cache) whether the resolved path is a directory."""
        cached = self._is_dir
        if cached is None:
            cached = self._is_dir = os.path.isdir(self.path)
        return cached

    def stat(self):
        """Return (and cache) the lstat result for the resolved path."""
        cached = self._stat
        if cached is None:
            cached = self._stat = os.lstat(self.path)
        return cached
# *deleting will_be_deleted
# >f.st.... .gitignore
# >f+++++++ LICENSE
@@ -79,7 +100,9 @@ def rsync_ext(src, dst, dry_run=False):
rsync_args.append(str(dst))
_lg.info("Executing external command: %s", " ".join(rsync_args))
process = subprocess.Popen(rsync_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
process = subprocess.Popen(rsync_args,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
with process.stdout:
prev_line = None
for line in iter(process.stdout.readline, b""):
@@ -246,10 +269,12 @@ def rsync(src_dir, dst_dir, dry_run=False) -> Iterable[tuple]:
dst_root_abs = os.path.abspath(dst_dir)
if not os.path.isdir(src_root_abs):
raise RuntimeError(f"Error during reading source directory: {src_root_abs}")
raise RuntimeError("Error during reading source directory: %s"
% src_root_abs)
if os.path.exists(dst_root_abs):
if not os.path.isdir(dst_root_abs):
raise RuntimeError("Destination path is not a directory: %s" % dst_root_abs)
raise RuntimeError("Destination path is not a directory: %s"
% dst_root_abs)
else:
os.mkdir(dst_root_abs)
@@ -278,19 +303,22 @@ def rsync(src_dir, dst_dir, dry_run=False) -> Iterable[tuple]:
# rewrite dst if it has different than src type
if src_entry.is_file(follow_symlinks=False):
if not dst_entry.is_file(follow_symlinks=False):
_lg.debug("Rewriting (src is a file, dst is not a file): %s", rel_path)
_lg.debug("Rewriting (src is a file, dst is not a file): %s",
rel_path)
update_direntry(src_entry, dst_entry)
yield rel_path, Actions.rewrite
continue
if src_entry.is_dir(follow_symlinks=False):
if not dst_entry.is_dir(follow_symlinks=False):
_lg.debug("Rewriting (src is a dir, dst is not a dir): %s", rel_path)
_lg.debug("Rewriting (src is a dir, dst is not a dir): %s",
rel_path)
update_direntry(src_entry, dst_entry)
yield rel_path, Actions.rewrite
continue
if src_entry.is_symlink():
if not dst_entry.is_symlink():
_lg.debug("Rewriting (src is a symlink, dst is not a symlink): %s", rel_path)
_lg.debug("Rewriting (src is a symlink, dst is not a symlink): %s",
rel_path)
update_direntry(src_entry, dst_entry)
yield rel_path, Actions.rewrite
continue
@@ -379,7 +407,9 @@ def _recursive_hardlink_ext(src: str, dst: str) -> bool:
src_content = glob.glob(f"{src}/*")
cmd = [cp, "--archive", "--verbose", "--link", *src_content, dst]
_lg.info("Executing external command: %s", " ".join(cmd))
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
process = subprocess.Popen(cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
with process.stdout:
for line in iter(process.stdout.readline, b""):
_lg.debug("%s: %s", cp, line.decode("utf-8").strip())
@@ -458,10 +488,11 @@ def nest_hardlink(src_dir: str, src_relpath: str, dst_dir: str):
# check source entity and destination directory
if not os.path.exists(src_full_path):
raise RuntimeError(f"Error reading source entity: {src_full_path}")
raise RuntimeError("Error reading source entity: %s" % src_full_path)
if os.path.exists(dst_dir_abs):
if not os.path.isdir(dst_dir_abs):
raise RuntimeError("Destination path is not a directory: %s" % dst_dir_abs)
raise RuntimeError("Destination path is not a directory: %s"
% dst_dir_abs)
else:
os.mkdir(dst_dir_abs)