From 862c20ed24cffc5ebf9378288f8749dce1ae00a8 Mon Sep 17 00:00:00 2001
From: Maks Snegov
Date: Sat, 19 Jun 2021 15:28:42 +0300
Subject: [PATCH] Add rsync support

Replace the stub rsync() in spqr/curateipsum/fs.py with a working
pure-Python directory sync and cover it with tests.

fs.py:
 * scantree() takes dir_first; with dir_first=False a directory is
   yielded after its contents, so a tree can be deleted bottom-up.
 * New helpers: rm_direntry() (recursive delete), copyfile() (chunked
   os.read/os.write copy), copy_direntry() (create dir / symlink / file
   and copy owner, mode and times), update_direntry() (delete, then copy).
 * rsync() walks the destination tree: entries missing from the source
   are deleted; entries whose type differs, that share an inode with the
   source, whose size or mtime differ (regular files) or whose link
   target differs (symlinks) are rewritten; otherwise mode and owner are
   updated. Source entries not seen in the destination are then created.
 * Actions enum is added (not used yet in this patch).

backup.py:
 * Only a commented-out sketch of the future rsync() call is added.

tests/test_fs.py:
 * CommonFSTestCase provides separate src/dst temp dirs and helpers;
   TestHardlinkDir is rebased on it; TestRsync is added (unfinished).
---
Review notes (ignored by git-am, not part of the commit message):

 * This patch was rebuilt from a whitespace-collapsed copy. Line breaks
   and indentation are reconstructed; hunk line counts and the diffstat
   were re-checked against the reconstruction. Indentation of the
   backup.py context lines and the position of one blank context line in
   the rsync() hunk are a best guess - verify before applying.
 * rsync(): the permission update called
   os.chmod(dst_entry.path, dst_stat.st_mode), i.e. re-applied the
   destination's own mode. It now applies src_stat.st_mode.
 * copy_direntry(): entry.is_dir() follows symlinks, so a symlink to a
   directory was recreated as an empty real directory. It now uses
   is_dir(follow_symlinks=False), matching scantree() and rsync().
 * test_common_file: src_stat and dst_stat both pointed at the source
   file, so samestat() compared a file with itself. dst_stat now reads
   the file from dst_dir (diffstat adjusted by +1/-1 accordingly).
 * The index lines below still carry the original blob ids.

 spqr/curateipsum/backup.py |   7 ++
 spqr/curateipsum/fs.py     | 185 +++++++++++++++++++++++++++------
 tests/test_fs.py           | 207 ++++++++++++++++++++++++++++++++-----
 3 files changed, 341 insertions(+), 58 deletions(-)

diff --git a/spqr/curateipsum/backup.py b/spqr/curateipsum/backup.py
index 5d505c6..51ee2a6 100644
--- a/spqr/curateipsum/backup.py
+++ b/spqr/curateipsum/backup.py
@@ -69,4 +69,11 @@ def initiate_backup(sources, backup_dir: pathlib.Path):
             latest_backup.name,
             cur_backup.name,
         )
+
         hardlink_dir(latest_backup, cur_backup)
+
+    # for src in sources:
+    #     src_abs = pathlib.Path(os.path.abspath(src))
+    #     dst_abs = pathlib.Path(os.path.join(cur_backup, src_abs.name))
+    #     _lg.info("Backing up directory %s to %s backup", src_abs, cur_backup.name)
+    #     rsync(src_abs, cur_backup)
diff --git a/spqr/curateipsum/fs.py b/spqr/curateipsum/fs.py
index ccf2940..2e5f162 100644
--- a/spqr/curateipsum/fs.py
+++ b/spqr/curateipsum/fs.py
@@ -2,6 +2,7 @@
 Module with filesystem-related functions.
 """
 
+import enum
 import logging
 import os
 import subprocess
@@ -28,15 +29,83 @@ def rsync_ext(src, dst, dry_run=False):
     return res
 
 
-def scantree(path) -> Iterable[os.DirEntry]:
+def scantree(path, dir_first=True) -> Iterable[os.DirEntry]:
     """Recursively yield DirEntry file objects for given directory."""
     entry: os.DirEntry
-    for entry in os.scandir(path):
-        if entry.is_dir(follow_symlinks=False):
-            yield entry
-            yield from scantree(entry.path)
-        else:
-            yield entry
+    """Recursively yield DirEntry objects for given directory."""
+    with os.scandir(path) as scan_it:
+        for entry in scan_it:
+            if entry.is_dir(follow_symlinks=False):
+                if dir_first:
+                    yield entry
+                yield from scantree(entry.path, dir_first)
+                if not dir_first:
+                    yield entry
+            else:
+                yield entry
+
+
+def rm_direntry(entry: os.DirEntry):
+    """ Recursively delete DirEntry (dir, file or symlink). """
+    if entry.is_file(follow_symlinks=False) or entry.is_symlink():
+        os.unlink(entry.path)
+        return
+    if entry.is_dir(follow_symlinks=False):
+        with os.scandir(entry.path) as it:
+            child_entry: os.DirEntry
+            for child_entry in it:
+                rm_direntry(child_entry)
+        os.rmdir(entry.path)
+
+
+try:
+    O_BINARY = os.O_BINARY  # Windows only
+except AttributeError:
+    O_BINARY = 0
+READ_FLAGS = os.O_RDONLY | O_BINARY
+WRITE_FLAGS = os.O_WRONLY | os.O_CREAT | os.O_TRUNC | O_BINARY
+BUFFER_SIZE = 128 * 1024
+
+
+def copyfile(src, dst):
+    fin = os.open(src, READ_FLAGS)
+    stat = os.fstat(fin)
+    fout = os.open(dst, WRITE_FLAGS, stat.st_mode)
+    for x in iter(lambda: os.read(fin, BUFFER_SIZE), b""):
+        os.write(fout, x)
+    os.close(fout)
+    os.close(fin)
+
+
+def copy_direntry(entry: os.DirEntry, dst_path):
+    if entry.is_dir(follow_symlinks=False):
+        os.mkdir(dst_path)
+
+    elif entry.is_symlink():
+        link_target = os.readlink(entry.path)
+        os.symlink(link_target, dst_path)
+
+    else:
+        copyfile(entry.path, dst_path)
+
+    src_stat = entry.stat(follow_symlinks=False)
+    os.chown(dst_path, src_stat.st_uid, src_stat.st_gid, follow_symlinks=False)
+    os.chmod(dst_path, src_stat.st_mode, follow_symlinks=False)
+    os.utime(dst_path, (src_stat.st_atime, src_stat.st_mtime), follow_symlinks=False)
+
+
+def update_direntry(src_entry: os.DirEntry, dst_entry: os.DirEntry):
+    rm_direntry(dst_entry)
+    copy_direntry(src_entry, dst_entry.path)
+
+
+class Actions(enum.Enum):
+    nothing = enum.auto()
+    delete = enum.auto()
+    rewrite = enum.auto()
+    update_perm = enum.auto()
+    update_owner = enum.auto()
+    create = enum.auto()
 
 
 def rsync(src_dir, dst_dir=None):
@@ -48,37 +117,93 @@ def rsync(src_dir, dst_dir=None):
     """
 
     _lg.info(f"Rsync: {src_dir} -> {dst_dir}")
-    src_abs = os.path.abspath(src_dir)
-    dst_abs = os.path.abspath(dst_dir)
+    src_root_abs = os.path.abspath(src_dir)
+    dst_root_abs = os.path.abspath(dst_dir)
 
-    if not os.path.isdir(src_abs):
-        raise RuntimeError(f"Error during reading source directory: {src_abs}")
-    if os.path.exists(dst_abs):
-        if not os.path.isdir(dst_abs):
-            raise RuntimeError(f"Destination path is not a directory: {dst_abs}")
+    if not os.path.isdir(src_root_abs):
+        raise RuntimeError(f"Error during reading source directory: {src_root_abs}")
+    if os.path.exists(dst_root_abs):
+        if not os.path.isdir(dst_root_abs):
+            raise RuntimeError(f"Destination path is not a directory: {dst_root_abs}")
     else:
-        os.mkdir(dst_abs)
+        os.mkdir(dst_root_abs)
 
-    for src_entry in scantree(src_abs):
-        rel_path = src_entry.path[len(src_abs)+1:]
-        dst_path = os.path.join(dst_abs, rel_path)
-        src_stat = src_entry.stat(follow_symlinks=False)
+    # {rel_path: dir_entry} map
+    src_files_map = {
+        ent.path[len(src_root_abs) + 1 :]: ent for ent in scantree(src_root_abs)
+    }
 
-        dst_stat = os.lstat(dst_path)
+    # process dst tree
+    for dst_entry in scantree(dst_root_abs, dir_first=False):
+        rel_path = dst_entry.path[len(dst_root_abs) + 1 :]
+        src_entry = src_files_map.get(rel_path)
+
+        # remove dst entries not existing in source
+        if src_entry is None:
+            _lg.debug("deleting %s", rel_path)
+            rm_direntry(dst_entry)
+            continue
+
+        # mark src entry as taken for processing
+        del src_files_map[rel_path]
+
+        src_entry: os.DirEntry
+        # rewrite dst if it has different than src type
+        if src_entry.is_file(follow_symlinks=False):
+            if not dst_entry.is_file(follow_symlinks=False):
+                _lg.info("rewriting %s", rel_path)
+                update_direntry(src_entry, dst_entry)
+                continue
 
         if src_entry.is_dir(follow_symlinks=False):
-            pass
+            if not dst_entry.is_dir(follow_symlinks=False):
+                _lg.info("rewriting %s", rel_path)
+                update_direntry(src_entry, dst_entry)
+                continue
+        if src_entry.is_symlink():
+            if not dst_entry.is_symlink():
+                _lg.info("rewriting %s", rel_path)
+                update_direntry(src_entry, dst_entry)
+                continue
 
-        do_update = False
-        # check file size
-        if src_stat.st_size != dst_stat.st_size:
-            do_update = True
-        # check modification time (mtime)
-        if src_stat.st_mtime > dst_stat.st_mtime:
-            do_update = True
+        # rewrite dst if it is hard link to src (bad for backups)
+        if src_entry.inode() == dst_entry.inode():
+            _lg.info("rewriting %s", rel_path)
+            update_direntry(src_entry, dst_entry)
+            continue
 
-        if do_update:
-            _lg.info("Updating %s", src_entry)
+        src_stat = src_entry.stat(follow_symlinks=False)
+        dst_stat = dst_entry.stat(follow_symlinks=False)
+
+        # rewrite dst file/symlink which have different with src size or mtime
+        if src_entry.is_file(follow_symlinks=False):
+            same_size = src_stat.st_size == dst_stat.st_size
+            same_mtime = src_stat.st_mtime == dst_stat.st_mtime
+            if not (same_size and same_mtime):
+                _lg.info("rewriting %s", rel_path)
+                update_direntry(src_entry, dst_entry)
+                continue
+
+        # rewrite dst symlink if it points somewhere else than src
+        if src_entry.is_symlink():
+            if os.readlink(src_entry.path) != os.readlink(dst_entry.path):
+                _lg.info("rewriting %s", rel_path)
+                update_direntry(src_entry, dst_entry)
+                continue
+
+        # update permissions and ownership
+        if src_stat.st_mode != dst_stat.st_mode:
+            _lg.info("updating permissions %s", rel_path)
+            os.chmod(dst_entry.path, src_stat.st_mode)
+
+        if src_stat.st_uid != dst_stat.st_uid or src_stat.st_gid != dst_stat.st_gid:
+            _lg.info("updating owners %s", rel_path)
+            os.chown(dst_entry.path, src_stat.st_uid, src_stat.st_gid)
+
+    for rel_path, src_entry in src_files_map.items():
+        dst_path = os.path.join(dst_root_abs, rel_path)
+        _lg.info("creating %s", rel_path)
+        copy_direntry(src_entry, dst_path)
 
 
 def _hardlink_dir_ext(src, dst):
diff --git a/tests/test_fs.py b/tests/test_fs.py
index 41e9c63..fea6f2b 100644
--- a/tests/test_fs.py
+++ b/tests/test_fs.py
@@ -8,31 +8,58 @@ import unittest
 from spqr.curateipsum import fs
 
 
-class TestHardlinkDir(unittest.TestCase):
+class CommonFSTestCase(unittest.TestCase):
     def setUp(self):
-        self.tmp_dir = tempfile.TemporaryDirectory()
+        self.tmp_dir_src = tempfile.TemporaryDirectory(prefix="source_")
+        self.tmp_dir_dst = tempfile.TemporaryDirectory(prefix="dest_")
+        self.src_dir = self.tmp_dir_src.name
+        self.dst_dir = self.tmp_dir_dst.name
+
+    def tearDown(self):
+        self.tmp_dir_src.cleanup()
+        self.tmp_dir_dst.cleanup()
+
+    @staticmethod
+    def create_file(parent_dir, prefix=None):
+        fd, path = tempfile.mkstemp(prefix=prefix, dir=parent_dir)
+        with open(fd, "w") as f:
+            f.write(string.printable)
+        return path
+
+    @staticmethod
+    def create_dir(parent_dir, prefix=None):
+        return tempfile.mkdtemp(prefix=prefix, dir=parent_dir)
+
+    def relpath(self, full_path):
+        if full_path.startswith(self.src_dir):
+            p_dir = self.src_dir
+        elif full_path.startswith(self.dst_dir):
+            p_dir = self.dst_dir
+        else:
+            raise RuntimeError(f"Path {full_path} is not src_dir nor dst_dir")
+
+        return full_path[len(p_dir) + 1 :]
+
+
+class TestHardlinkDir(CommonFSTestCase):
+    def setUp(self):
+        self.tmp_dir = tempfile.TemporaryDirectory(prefix="source_")
         self.src_dir = self.tmp_dir.name
         self.dst_dir = self.src_dir + ".copy"
 
-    def _create_common_file(self):
-        cf_relpath = "common_file"
-        cf_path = os.path.join(self.src_dir, cf_relpath)
-        with open(cf_path, "w") as f:
-            f.write(string.printable)
-        return cf_relpath
-
     def test_common_file(self):
-        cf_relpath = self._create_common_file()
+        cf_path = self.create_file(self.src_dir)
+        cf_relpath = self.relpath(cf_path)
 
         fs.hardlink_dir(self.src_dir, self.dst_dir)
 
-        src_stat = os.lstat(os.path.join(self.dst_dir, cf_relpath))
-        dst_stat = os.lstat(os.path.join(self.src_dir, cf_relpath))
-        self.assertTrue(os.path.samestat(src_stat, dst_stat))
-        self.assertEqual(src_stat.st_nlink, 2)
+        src_stat = os.lstat(cf_path)
+        dst_stat = os.lstat(os.path.join(self.dst_dir, cf_relpath))
+        assert os.path.samestat(src_stat, dst_stat)
+        assert src_stat.st_nlink == 2
 
     def test_relative_symlink_to_common_file(self):
-        cf_relpath = self._create_common_file()
+        cf_relpath = self.relpath(self.create_file(self.src_dir))
         sl2cf_relpath = "symlink_to_common_file"
         os.chdir(self.src_dir)
         os.symlink(cf_relpath, sl2cf_relpath)
@@ -41,17 +68,16 @@ class TestHardlinkDir(unittest.TestCase):
 
         # check link
         dst_sl2cf_path = os.path.join(self.dst_dir, sl2cf_relpath)
-        self.assertEqual(os.readlink(dst_sl2cf_path), cf_relpath)
+        assert os.readlink(dst_sl2cf_path) == cf_relpath
 
         # check stats
         src_stat = os.lstat(os.path.join(self.dst_dir, sl2cf_relpath))
         dst_stat = os.lstat(dst_sl2cf_path)
-        self.assertTrue(os.path.samestat(src_stat, dst_stat))
-        self.assertEqual(src_stat.st_nlink, 2)
+        assert os.path.samestat(src_stat, dst_stat)
+        assert src_stat.st_nlink == 2
 
     def test_absolute_symlink_to_common_file(self):
-        cf_relpath = self._create_common_file()
-        cf_path = os.path.join(self.src_dir, cf_relpath)
+        cf_path = self.create_file(self.src_dir)
         sl2cf_relpath = "symlink_to_common_file"
         sl2cf_path = os.path.join(self.src_dir, sl2cf_relpath)
         os.symlink(cf_path, sl2cf_path)
@@ -60,17 +86,16 @@ class TestHardlinkDir(unittest.TestCase):
 
         # check link
         dst_sl2cf_path = os.path.join(self.dst_dir, sl2cf_relpath)
-        self.assertEqual(os.readlink(dst_sl2cf_path), cf_path)
+        assert os.readlink(dst_sl2cf_path) == cf_path
 
         # check stats
         src_stat = os.lstat(os.path.join(self.dst_dir, sl2cf_relpath))
         dst_stat = os.lstat(dst_sl2cf_path)
-        self.assertTrue(os.path.samestat(src_stat, dst_stat))
-        self.assertEqual(src_stat.st_nlink, 2)
+        assert os.path.samestat(src_stat, dst_stat)
+        assert src_stat.st_nlink == 2
 
     def test_hardlink_to_common_file(self):
-        cf_relpath = self._create_common_file()
-        cf_path = os.path.join(self.src_dir, cf_relpath)
+        cf_path = self.create_file(self.src_dir)
         hl2cf_relpath = "hardlink_to_common_file"
         hl2cf_path = os.path.join(self.src_dir, hl2cf_relpath)
         os.link(cf_path, hl2cf_path)
@@ -81,10 +106,136 @@ class TestHardlinkDir(unittest.TestCase):
         src_hl_stat = os.lstat(hl2cf_path)
         dst_hl_stat = os.lstat(os.path.join(self.dst_dir, hl2cf_relpath))
 
-        self.assertTrue(os.path.samestat(src_cf_stat, dst_hl_stat))
-        self.assertTrue(os.path.samestat(src_hl_stat, dst_hl_stat))
-        self.assertEqual(src_cf_stat.st_nlink, 4)
+        assert os.path.samestat(src_cf_stat, dst_hl_stat)
+        assert os.path.samestat(src_hl_stat, dst_hl_stat)
+        assert src_cf_stat.st_nlink == 4
 
     def tearDown(self):
         self.tmp_dir.cleanup()
         shutil.rmtree(self.dst_dir, ignore_errors=True)
+
+
+# TODO not finished
+class TestRsync(CommonFSTestCase):
+    @staticmethod
+    def check_identical_file(file1, file2):
+        st1 = os.lstat(file1)
+        st2 = os.lstat(file2)
+
+        assert st1.st_uid == st2.st_uid
+        assert st1.st_gid == st2.st_gid
+        assert st1.st_mode == st2.st_mode
+        assert st1.st_mtime == st2.st_mtime
+        assert st1.st_size == st2.st_size
+
+    def test_dst_has_excess_file(self):
+        dst_fpath = self.create_file(self.dst_dir)
+
+        fs.rsync(self.src_dir, self.dst_dir)
+        assert not os.path.lexists(dst_fpath)
+
+    def test_dst_has_excess_symlink(self):
+        dst_lpath = os.path.join(self.dst_dir, 'broken_symlink')
+        os.symlink('broken_symlink', dst_lpath)
+
+        fs.rsync(self.src_dir, self.dst_dir)
+        assert not os.path.lexists(dst_lpath)
+
+    def test_dst_has_excess_empty_dir(self):
+        dst_dpath = self.create_dir(self.dst_dir)
+
+        fs.rsync(self.src_dir, self.dst_dir)
+        assert not os.path.lexists(dst_dpath)
+
+    def test_dst_has_excess_nonempty_dir(self):
+        dst_dpath = self.create_dir(self.dst_dir)
+        self.create_file(dst_dpath)
+
+        fs.rsync(self.src_dir, self.dst_dir)
+        assert not os.path.lexists(dst_dpath)
+
+    def test_dst_has_excess_nonempty_recursive_dir(self):
+        dst_dpath = self.create_dir(self.dst_dir)
+        nested_dpath = self.create_dir(dst_dpath)
+        self.create_file(nested_dpath)
+
+        fs.rsync(self.src_dir, self.dst_dir)
+        assert not os.path.lexists(dst_dpath)
+
+    def test_different_types_src_file_dst_dir(self):
+        src_fpath = self.create_file(self.src_dir)
+        dst_path = os.path.join(self.dst_dir, self.relpath(src_fpath))
+        os.mkdir(dst_path)
+
+        fs.rsync(self.src_dir, self.dst_dir)
+        assert os.path.lexists(dst_path)
+        assert os.path.isfile(dst_path)
+
+    def test_different_types_src_file_dst_symlink(self):
+        src_fpath = self.create_file(self.src_dir)
+        dst_path = os.path.join(self.dst_dir, self.relpath(src_fpath))
+        os.symlink('broken_link', dst_path)
+
+        fs.rsync(self.src_dir, self.dst_dir)
+        assert os.path.lexists(dst_path)
+        assert os.path.isfile(dst_path)
+
+    def test_different_types_src_symlink_dst_file(self):
+        dst_path = self.create_file(self.dst_dir)
+        src_lpath = os.path.join(self.src_dir, self.relpath(dst_path))
+        os.symlink('broken_link', src_lpath)
+
+        fs.rsync(self.src_dir, self.dst_dir)
+        assert os.path.lexists(dst_path)
+        assert os.path.islink(dst_path)
+
+    def test_different_types_src_symlink_dst_dir(self):
+        dst_path = self.create_dir(self.dst_dir)
+        src_lpath = os.path.join(self.src_dir, self.relpath(dst_path))
+        os.symlink('broken_link', src_lpath)
+
+        fs.rsync(self.src_dir, self.dst_dir)
+        assert os.path.lexists(dst_path)
+        assert os.path.islink(dst_path)
+
+    def test_different_types_src_dir_dst_file(self):
+        src_dpath = self.create_dir(self.src_dir)
+        dst_path = os.path.join(self.dst_dir, self.relpath(src_dpath))
+        with open(dst_path, "w") as f:
+            f.write(string.printable)
+
+        fs.rsync(self.src_dir, self.dst_dir)
+        assert os.path.lexists(dst_path)
+        assert os.path.isdir(dst_path)
+
+    def test_different_types_src_dir_dst_symlink(self):
+        src_dpath = self.create_dir(self.src_dir)
+        dst_path = os.path.join(self.dst_dir, self.relpath(src_dpath))
+        os.symlink('broken_link', dst_path)
+
+        fs.rsync(self.src_dir, self.dst_dir)
+        assert os.path.lexists(dst_path)
+        assert os.path.isdir(dst_path)
+
+    def test_src_dst_same_inode(self):
+        src_fpath = self.create_file(self.src_dir)
+        dst_fpath = os.path.join(self.dst_dir, self.relpath(src_fpath))
+        os.link(src_fpath, dst_fpath)
+
+        fs.rsync(self.src_dir, self.dst_dir)
+        assert os.path.lexists(dst_fpath)
+        src_stat = os.lstat(src_fpath)
+        dst_stat = os.lstat(dst_fpath)
+        assert src_stat.st_nlink == 1
+        assert dst_stat.st_nlink == 1
+        assert src_stat.st_ino != dst_stat.st_ino
+
+    def test_src_dst_diff_size(self):
+        src_fpath = self.create_file(self.src_dir)
+        dst_fpath = os.path.join(self.dst_dir, self.relpath(src_fpath))
+        with open(dst_fpath, "w") as df:
+            df.write(string.printable * 2)
+
+        fs.rsync(self.src_dir, self.dst_dir)
+        assert os.path.lexists(dst_fpath)
+        self.check_identical_file(src_fpath, dst_fpath)