Files
cura-te-ipsum/curateipsum/fs.py
Maks Snegov 7c54bf3823 Fix PseudoDirEntry follow_symlinks handling and add documentation
- Fix follow_symlinks parameter being ignored in is_dir(), is_file()
- Change from realpath() to abspath() to preserve symlinks
- Add separate caches for stat() and lstat() results
- Remove incorrect follow_symlinks param from is_symlink()
- Add comprehensive docstring explaining purpose and design

When follow_symlinks=False, methods now correctly return False for
symlinks instead of following them. Previously all symlinks were
resolved, breaking symlink-aware backup operations.

Fixes #8
2026-02-04 23:00:06 -08:00

596 lines
22 KiB
Python

"""
Module with filesystem-related functions.
"""
import enum
import glob
import logging
import os
import subprocess
import sys
from typing import Iterable, Tuple, Union
# Module-level logger, named after this module, used by the functions below.
_lg = logging.getLogger(__name__)
class BackupCreationError(Exception):
    """Signals that a backup could not be created or synchronised."""
@enum.unique
class Actions(enum.Enum):
    """Kinds of change reported for a single path by the sync routines."""

    # Declared but not produced by the code visible in this module.
    NOTHING = enum.auto()
    # Destination entry was removed.
    DELETE = enum.auto()
    # Destination entry was replaced with a fresh copy of the source.
    REWRITE = enum.auto()
    # Only the timestamp changed.
    UPDATE_TIME = enum.auto()
    # Only the permission bits changed.
    UPDATE_PERM = enum.auto()
    # Only the owner and/or group changed.
    UPDATE_OWNER = enum.auto()
    # Entry was newly created in the destination.
    CREATE = enum.auto()
    # The operation failed; details travel in the tuple's message field.
    ERROR = enum.auto()
class PseudoDirEntry:
    """
    Duck-typed os.DirEntry for paths that don't exist yet or when you need
    DirEntry-like interface for arbitrary paths.

    Problem: os.DirEntry is created by os.scandir() and cannot be manually
    constructed. But we need DirEntry-compatible objects for:
    - Paths that will exist soon (new backup directories)
    - Constructed paths (marker files)
    - Uniform interface in functions accepting both real and future entries

    Why not just use strings? Functions like rm_direntry(), copy_direntry()
    accept Union[os.DirEntry, PseudoDirEntry] and call .is_dir(), .stat()
    methods. Using this class avoids branching on type throughout the codebase.

    Why not pathlib.Path? We heavily use os.scandir() which returns DirEntry
    objects with cached stat info. PseudoDirEntry maintains API consistency
    with minimal overhead.

    Example usage:
        # Create entry for future backup directory
        cur_backup = PseudoDirEntry("/backups/20260204_120000")
        os.mkdir(cur_backup.path)
        set_backup_marker(cur_backup)  # accepts DirEntry-like object

    Caches stat results like real DirEntry to avoid repeated syscalls.
    Only the symlink-following answers of is_dir()/is_file() are cached;
    the follow_symlinks=False variants query the filesystem on every call.

    Like os.DirEntry, instances provide inode() and implement os.PathLike
    (os.fspath(entry) returns entry.path).
    """

    def __init__(self, path):
        # Use abspath, not realpath - realpath resolves symlinks
        self.path = os.path.abspath(path)
        self.name = os.path.basename(self.path)
        self._is_dir = None
        self._is_file = None
        self._is_symlink = None
        # Cache both stat and lstat separately
        self._stat_follow = None
        self._stat_nofollow = None

    def __str__(self):
        return self.name

    def __repr__(self):
        return f"{type(self).__name__}({self.path!r})"

    def __fspath__(self):
        """Return the absolute path, making the entry usable as os.PathLike."""
        return self.path

    def is_dir(self, follow_symlinks: bool = True) -> bool:
        """Tell whether the path is a directory (False if it is missing)."""
        if follow_symlinks:
            if self._is_dir is None:
                self._is_dir = os.path.isdir(self.path)
            return self._is_dir
        # When not following symlinks, must return False if path is symlink
        return os.path.isdir(self.path) and not os.path.islink(self.path)

    def is_file(self, follow_symlinks: bool = True) -> bool:
        """Tell whether the path is a regular file (False if it is missing)."""
        if follow_symlinks:
            if self._is_file is None:
                self._is_file = os.path.isfile(self.path)
            return self._is_file
        # When not following symlinks, must return False if path is symlink
        return os.path.isfile(self.path) and not os.path.islink(self.path)

    def is_symlink(self) -> bool:
        """Tell whether the path itself is a symbolic link."""
        if self._is_symlink is None:
            self._is_symlink = os.path.islink(self.path)
        return self._is_symlink

    def stat(self, follow_symlinks: bool = True):
        """
        Return os.stat() (or os.lstat() when follow_symlinks is False) for
        the path. Successful results are cached; OSError from the underlying
        call propagates and nothing is cached in that case.
        """
        if follow_symlinks:
            if self._stat_follow is None:
                self._stat_follow = os.stat(self.path)
            return self._stat_follow
        if self._stat_nofollow is None:
            self._stat_nofollow = os.lstat(self.path)
        return self._stat_nofollow

    def inode(self) -> int:
        """Return the inode number of the entry itself (symlinks not followed)."""
        return self.stat(follow_symlinks=False).st_ino
def _parse_rsync_output(line: str) -> Tuple[str, Actions, str]:
    """
    Translate one line of ``rsync --itemize-changes`` output into an action.

    :param line: decoded output line: change string, one space, then path.
    :return: (relative path, action, error message) tuple. The error message
        is always an empty string here.
    :raises RuntimeError: if the line does not look like an itemized change
        or describes a change this function does not recognise.
    """
    change_string, separator, relpath = line.partition(' ')
    if not separator:
        # no "<flags> <path>" structure at all
        raise RuntimeError("Not parsed string: %s" % line)

    if change_string == "*deleting":
        # rsync pads "*deleting" with two spaces to the width of the regular
        # 11-character flag field; drop exactly that padding so the path is
        # not returned with spurious leading blanks.
        if relpath.startswith("  "):
            relpath = relpath[2:]
        return relpath, Actions.DELETE, ""

    if len(change_string) < 2:
        # too short to carry the update-type and entity-type characters
        raise RuntimeError("Not parsed string: %s" % line)

    update_type = change_string[0]
    entity_type = change_string[1]
    change_type = change_string[2:]

    # The order of the checks matters: the first matching rule wins.
    action = None
    if update_type == "c" and entity_type in {"d", "L"} and "+" in change_type:
        action = Actions.CREATE
    elif update_type == ">" and entity_type == "f" and "+" in change_type:
        action = Actions.CREATE
    elif entity_type == "f" and ("s" in change_type or "t" in change_type):
        action = Actions.REWRITE
    elif entity_type == "d" and "t" in change_type:
        action = Actions.UPDATE_TIME
    elif "p" in change_type:
        action = Actions.UPDATE_PERM
    elif "o" in change_type or "g" in change_type:
        action = Actions.UPDATE_OWNER

    if action is None:
        raise RuntimeError("Not parsed string: %s" % line)
    return relpath, action, ""
def rsync_ext(src, dst, dry_run=False) -> Iterable[Tuple[str, Actions, str]]:
    """
    Call external rsync command for syncing files from src to dst.

    Yield (path, action, error message) tuples, one per itemized line that
    rsync prints. Lines that are not valid UTF-8 are logged and skipped.
    Nothing is yielded when rsync produces no output.

    :param src: source directory; its content (not the directory itself)
        is synced, because a trailing slash is appended.
    :param dst: destination path handed to rsync.
    :param dry_run: add ``--dry-run`` so rsync only reports changes.
    :raises RuntimeError: propagated from _parse_rsync_output() for lines
        it cannot interpret.
    """
    rsync_args = ["rsync"]
    if dry_run:
        rsync_args.append("--dry-run")
    # --compress and --inplace are currently not passed.
    rsync_args.extend([
        "--archive",
        "--whole-file",
        "--human-readable",
        "--delete-during",
        "--itemize-changes",
        f"{src}/",
        str(dst),
    ])

    _lg.info("Executing external command: %s", " ".join(rsync_args))
    process = subprocess.Popen(rsync_args,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)
    with process.stdout:
        for raw_line in iter(process.stdout.readline, b""):
            _lg.debug("Rsync current line: %s", raw_line)
            try:
                # Strip only the line terminator: trailing blanks can be a
                # legitimate part of a file name.
                line = raw_line.decode("utf-8").rstrip("\r\n")
            # some issues with cyrillic in filenames
            except UnicodeDecodeError:
                _lg.error("Can't process rsync line: %s", raw_line)
                # skip just this line and keep reading the following ones
                continue
            _lg.debug("Rsync itemize line: %s", line)
            yield _parse_rsync_output(line)
    process.wait()
def scantree(path, dir_first=True) -> Iterable[os.DirEntry]:
    """
    Walk the directory tree rooted at path and yield every DirEntry in it.

    Symlinks are yielded as entries and never descended into. With
    dir_first=True a directory precedes its content, otherwise it follows it.
    """
    with os.scandir(path) as listing:
        item: os.DirEntry
        for item in listing:
            if not item.is_dir(follow_symlinks=False):
                # plain file, symlink or anything else that is not a real dir
                yield item
                continue

            if dir_first:
                yield item
            yield from scantree(item.path, dir_first)
            if not dir_first:
                yield item
def rm_direntry(entry: Union[os.DirEntry, PseudoDirEntry]):
    """
    Remove the given entry from the filesystem.

    Regular files and symlinks are unlinked; real directories are emptied
    recursively and then removed. Entries of any other kind are left alone.
    """
    if entry.is_file(follow_symlinks=False) or entry.is_symlink():
        os.unlink(entry.path)
        return

    if not entry.is_dir(follow_symlinks=False):
        return

    with os.scandir(entry.path) as children:
        child: os.DirEntry
        for child in children:
            rm_direntry(child)
    os.rmdir(entry.path)
# os.O_BINARY exists on Windows only; elsewhere it contributes no flag bits.
O_BINARY = getattr(os, "O_BINARY", 0)

# Flags and chunk size used by copy_file().
READ_FLAGS = os.O_RDONLY | O_BINARY
WRITE_FLAGS = os.O_WRONLY | os.O_CREAT | os.O_TRUNC | O_BINARY
BUFFER_SIZE = 128 * 1024
def copy_file(src, dst):
    """
    Copy file content from src to dst using raw file descriptors.

    dst is created (or truncated) with the permission bits of src. Data is
    moved in BUFFER_SIZE chunks. Ownership and timestamps are not copied.

    :param src: path of the file to read.
    :param dst: path of the file to write.
    :raises OSError: if opening, reading or writing fails.
    """
    fin = None
    fout = None
    try:
        fin = os.open(src, READ_FLAGS)
        fstat = os.fstat(fin)
        fout = os.open(dst, WRITE_FLAGS, fstat.st_mode)
        for chunk in iter(lambda: os.read(fin, BUFFER_SIZE), b""):
            # os.write() may accept only part of the buffer; keep writing
            # until the whole chunk is stored, otherwise the copy would be
            # silently truncated.
            pending = memoryview(chunk)
            while len(pending) > 0:
                written = os.write(fout, pending)
                pending = pending[written:]
    finally:
        # Closing is best effort: a failure here must not mask an error
        # raised by the copy itself.
        if fout is not None:
            try:
                os.close(fout)
            except OSError:
                pass
        if fin is not None:
            try:
                os.close(fin)
            except OSError:
                pass
def copy_direntry(entry: Union[os.DirEntry, PseudoDirEntry], dst_path):
    """
    Non-recursive DirEntry (file/dir/symlink) copy.

    Creates dst_path as the same kind of object as entry: a symlink with the
    same target, an empty directory, or a copy of the file content. Owner,
    mode and access/modification times are then taken from the entry itself
    (lstat), not from a symlink's target.

    :param entry: source entry.
    :param dst_path: path to create; must not exist yet.
    :raises OSError: if any of the filesystem operations fails, except that
        a PermissionError from chown is only logged.
    """
    src_stat = entry.stat(follow_symlinks=False)
    is_link = entry.is_symlink()

    # Test for a symlink first and never follow it: is_dir() with default
    # arguments is True for a symlink pointing at a directory, which would
    # turn such a link into an empty real directory in the destination.
    if is_link:
        link_target = os.readlink(entry.path)
        os.symlink(link_target, dst_path)
    elif entry.is_dir(follow_symlinks=False):
        os.mkdir(dst_path)
    else:
        copy_file(entry.path, dst_path)

    if is_link:
        # change symlink attributes only if supported by OS
        if os.chown in os.supports_follow_symlinks:
            try:
                os.chown(dst_path, src_stat.st_uid, src_stat.st_gid,
                         follow_symlinks=False)
            except PermissionError:
                _lg.debug("Cannot change ownership (not root): %s", dst_path)
        if os.chmod in os.supports_follow_symlinks:
            os.chmod(dst_path, src_stat.st_mode, follow_symlinks=False)
        if os.utime in os.supports_follow_symlinks:
            os.utime(dst_path, (src_stat.st_atime, src_stat.st_mtime),
                     follow_symlinks=False)
    else:
        try:
            os.chown(dst_path, src_stat.st_uid, src_stat.st_gid)
        except PermissionError:
            _lg.debug("Cannot change ownership (not root): %s", dst_path)
        os.chmod(dst_path, src_stat.st_mode)
        os.utime(dst_path, (src_stat.st_atime, src_stat.st_mtime))
def update_direntry(src_entry: os.DirEntry, dst_entry: os.DirEntry):
    """
    Replace dst entry (file/dir/symlink) with a fresh copy of src entry.

    The destination is deleted first, including everything below it when it
    is a directory. The copy is not recursive, so a source directory arrives
    in the destination empty.
    """
    target_path = dst_entry.path
    rm_direntry(dst_entry)
    copy_direntry(src_entry, target_path)
def rsync(src_dir,
          dst_dir,
          dry_run=False) -> Iterable[Tuple[str, Actions, str]]:
    """
    Sync files/dirs/symlinks from src_dir to dst_dir.

    Yield (path, action, error message) tuples.
    Entries in dst_dir will be removed if not present in src_dir.
    Analog of 'rsync --delete -irltpog'.

    :param src_dir: source directory; must exist.
    :param dst_dir: destination directory; created when missing.
    :param dry_run: NOTE(review): this argument is never read by the
        function, so changes are applied regardless of its value -
        confirm whether that is intended.
    :raises BackupCreationError: if src_dir is not a directory, dst_dir
        exists but is not a directory, or an obsolete destination entry
        cannot be removed.
    """
    _lg.debug("Rsync: %s -> %s", src_dir, dst_dir)
    src_root_abs = os.path.abspath(src_dir)
    dst_root_abs = os.path.abspath(dst_dir)

    if not os.path.isdir(src_root_abs):
        raise BackupCreationError(
            "Error during reading source directory: %s" % src_root_abs
        )
    if os.path.exists(dst_root_abs):
        if not os.path.isdir(dst_root_abs):
            raise BackupCreationError(
                "Destination path is not a directory: %s" % dst_root_abs
            )
    else:
        os.mkdir(dst_root_abs)

    # Create source map {rel_path: dir_entry}
    src_files_map = {
        ent.path[len(src_root_abs) + 1:]: ent for ent in scantree(src_root_abs)
    }

    # process dst tree (children before their directory, so that obsolete
    # directories are already empty when they are reached)
    for dst_entry in scantree(dst_root_abs, dir_first=False):
        rel_path = dst_entry.path[len(dst_root_abs) + 1:]
        src_entry = src_files_map.get(rel_path)

        # remove dst entries not existing in source
        if src_entry is None:
            _lg.debug("Rsync, deleting: %s", rel_path)
            try:
                rm_direntry(dst_entry)
            except OSError as exc:
                raise BackupCreationError(exc) from exc
            yield rel_path, Actions.DELETE, ""
            continue

        # mark src entry as taken for processing
        del src_files_map[rel_path]

        src_entry: os.DirEntry
        # rewrite dst if it has different type from src
        if (src_entry.is_file(follow_symlinks=False)
                and not dst_entry.is_file(follow_symlinks=False)):
            _lg.debug("Rsync, rewriting"
                      " (src is a file, dst is not a file): %s",
                      rel_path)
            yield from _rsync_rewrite(src_entry, dst_entry, rel_path)
            continue

        if (src_entry.is_dir(follow_symlinks=False)
                and not dst_entry.is_dir(follow_symlinks=False)):
            _lg.debug("Rsync, rewriting"
                      " (src is a dir, dst is not a dir): %s",
                      rel_path)
            yield from _rsync_rewrite(src_entry, dst_entry, rel_path)
            continue

        if src_entry.is_symlink() and not dst_entry.is_symlink():
            _lg.debug("Rsync, rewriting"
                      " (src is a symlink, dst is not a symlink): %s",
                      rel_path)
            yield from _rsync_rewrite(src_entry, dst_entry, rel_path)
            continue

        # rewrite dst if it is hard link to src (bad for backups)
        if src_entry.inode() == dst_entry.inode():
            _lg.debug("Rsync, rewriting (dst is hard link to src): %s",
                      rel_path)
            yield from _rsync_rewrite(src_entry, dst_entry, rel_path)
            continue

        src_stat = src_entry.stat(follow_symlinks=False)
        dst_stat = dst_entry.stat(follow_symlinks=False)

        # rewrite dst file/symlink which have different size or mtime than src
        if src_entry.is_file(follow_symlinks=False):
            same_size = src_stat.st_size == dst_stat.st_size
            same_mtime = src_stat.st_mtime == dst_stat.st_mtime
            if not (same_size and same_mtime):
                reason = "size" if not same_size else "time"
                _lg.debug("Rsync, rewriting (different %s): %s",
                          reason, rel_path)
                yield from _rsync_rewrite(src_entry, dst_entry, rel_path)
                continue

        # rewrite dst symlink if it points somewhere else than src
        entry_is_link = src_entry.is_symlink()
        if entry_is_link:
            if os.readlink(src_entry.path) != os.readlink(dst_entry.path):
                _lg.debug("Rsync, rewriting (different symlink target): %s",
                          rel_path)
                yield from _rsync_rewrite(src_entry, dst_entry, rel_path)
                continue

        # update permissions and ownership
        # For a symlink the attributes of the link itself were compared, so
        # the link itself must be changed: following it would modify the
        # link target, which may live outside the destination tree or may
        # not exist at all.
        nofollow = {"follow_symlinks": False} if entry_is_link else {}

        if src_stat.st_mode != dst_stat.st_mode:
            if entry_is_link and os.chmod not in os.supports_follow_symlinks:
                _lg.debug("Rsync, cannot set permissions of a symlink"
                          " on this platform: %s", rel_path)
            else:
                _lg.debug("Rsync, updating permissions: %s", rel_path)
                os.chmod(dst_entry.path, src_stat.st_mode, **nofollow)
                yield rel_path, Actions.UPDATE_PERM, ""

        if (src_stat.st_uid != dst_stat.st_uid
                or src_stat.st_gid != dst_stat.st_gid):
            if entry_is_link and os.chown not in os.supports_follow_symlinks:
                _lg.debug("Rsync, cannot set owner of a symlink"
                          " on this platform: %s", rel_path)
            else:
                _lg.debug("Rsync, updating owners: %s", rel_path)
                try:
                    os.chown(dst_entry.path, src_stat.st_uid,
                             src_stat.st_gid, **nofollow)
                except PermissionError:
                    _lg.debug("Cannot change ownership (not root): %s",
                              rel_path)
                else:
                    yield rel_path, Actions.UPDATE_OWNER, ""

    # process remained source entries (new files/dirs/symlinks)
    # The map keeps scantree() order, so a directory precedes its content.
    for rel_path, src_entry in src_files_map.items():
        dst_path = os.path.join(dst_root_abs, rel_path)
        _lg.debug("Rsync, creating: %s", rel_path)
        try:
            copy_direntry(src_entry, dst_path)
        except OSError as exc:
            yield rel_path, Actions.ERROR, str(exc)
        else:
            yield rel_path, Actions.CREATE, ""

    # restore dir mtimes in dst, updated by updating files
    for src_entry in scantree(src_root_abs, dir_first=True):
        if not src_entry.is_dir():
            continue
        rel_path = src_entry.path[len(src_root_abs) + 1:]
        dst_path = os.path.join(dst_root_abs, rel_path)
        src_stat = src_entry.stat(follow_symlinks=False)
        dst_stat = os.lstat(dst_path)
        if src_stat.st_mtime != dst_stat.st_mtime:
            _lg.debug("Rsync, restoring directory mtime: %s", dst_path)
            os.utime(dst_path,
                     (src_stat.st_atime, src_stat.st_mtime),
                     follow_symlinks=False)

    # restore dst_root dir mtime
    src_root_stat = os.lstat(src_root_abs)
    dst_root_stat = os.lstat(dst_root_abs)
    if src_root_stat.st_mtime != dst_root_stat.st_mtime:
        _lg.debug("Rsync, restoring root directory mtime: %s", dst_root_abs)
        os.utime(dst_root_abs,
                 (src_root_stat.st_atime, src_root_stat.st_mtime),
                 follow_symlinks=False)


def _rsync_rewrite(src_entry, dst_entry,
                   rel_path) -> Iterable[Tuple[str, Actions, str]]:
    """Replace dst_entry with src_entry and yield the REWRITE/ERROR result."""
    try:
        update_direntry(src_entry, dst_entry)
    except OSError as exc:
        yield rel_path, Actions.ERROR, str(exc)
    else:
        yield rel_path, Actions.REWRITE, ""
def _recursive_hardlink_ext(src: str, dst: str) -> bool:
    """
    Make hardlink for a directory using cp -al. Both src and dst should exist.

    Every entry found directly in src (hidden ones included) is handed to
    ``cp --archive --link`` with dst as the target directory. On macOS the
    command is ``gcp`` instead of ``cp``.

    :param src: absolute path to source directory.
    :param dst: absolute path to target directory.
    :return: True if cp exited with status 0 or src had nothing to link,
        False otherwise.
    """
    cp = "gcp" if sys.platform == "darwin" else "cp"

    # List the directory instead of globbing "src/*": the glob pattern skips
    # names starting with a dot and misinterprets characters such as "[" in
    # src itself.
    src_content = sorted(os.path.join(src, name) for name in os.listdir(src))
    if not src_content:
        # cp would treat dst as the only operand and fail
        return True

    cmd = [cp, "--archive", "--verbose", "--link", *src_content, dst]
    _lg.info("Executing external command: %s", " ".join(cmd))
    process = subprocess.Popen(cmd,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)
    with process.stdout:
        for line in iter(process.stdout.readline, b""):
            # the output is only logged, so undecodable bytes are replaced
            # rather than allowed to abort the operation
            _lg.debug("%s: %s", cp,
                      line.decode("utf-8", errors="replace").strip())
    exitcode = process.wait()
    return not bool(exitcode)
def _recursive_hardlink(src: str, dst: str) -> bool:
    """
    Mirror src into dst with hard links, in pure Python.

    Directories are re-created (owner, mode and times are copied once their
    content is in place); files and symlinks are hard-linked. Any other kind
    of entry raises NotImplementedError. Both directories must exist.

    :param src: absolute path to source directory.
    :param dst: absolute path to target directory.
    :return: True if success, False otherwise.
    """
    with os.scandir(src) as listing:
        item: os.DirEntry
        for item in listing:
            target = os.path.join(dst, item.name)

            if item.is_dir(follow_symlinks=False):
                _lg.debug("Hardlink, copying directory: %s -> %s",
                          item.path, target)
                os.mkdir(target)

                # children first, metadata afterwards
                _recursive_hardlink(item.path, target)

                meta = item.stat(follow_symlinks=False)
                try:
                    os.chown(target, meta.st_uid, meta.st_gid)
                except PermissionError:
                    _lg.debug("Cannot change ownership (not root): %s", target)
                os.chmod(target, meta.st_mode)
                os.utime(target, (meta.st_atime, meta.st_mtime))

            elif item.is_file(follow_symlinks=False) or item.is_symlink():
                _lg.debug("Hardlink, creating link for file: %s -> %s",
                          item.path, target)
                os.link(item.path, target, follow_symlinks=False)

            else:
                # neither a file, nor a symlink, nor a directory
                raise NotImplementedError(item.path)

    return True
def hardlink_dir(src_dir, dst_dir, use_external: bool = False) -> bool:
    """
    Create dst_dir and fill it with hard links to everything in src_dir.

    :param src_dir: path to source directory
    :param dst_dir: path to target directory; must not exist yet
    :param use_external: whether to use external cp -al command
    :return: True if success, False otherwise.
    :raises RuntimeError: if src_dir is not a directory or dst_dir exists.
    """
    _lg.debug("Recursive hardlinking: %s -> %s", src_dir, dst_dir)
    source_root = os.path.abspath(src_dir)
    target_root = os.path.abspath(dst_dir)

    if not os.path.isdir(source_root):
        raise RuntimeError(f"Error reading source directory: {src_dir}")
    if os.path.exists(target_root):
        raise RuntimeError(f"Destination already exists: {dst_dir}")

    _lg.debug("Hardlink, creating directory: %s", target_root)
    os.mkdir(target_root)

    if use_external:
        return _recursive_hardlink_ext(source_root, target_root)
    return _recursive_hardlink(source_root, target_root)
def nest_hardlink(src_dir: str, src_relpath: str, dst_dir: str):
    """
    Hardlink entity from (src_dir + src_relpath) to dst_dir preserving dir
    structure of src_relpath.

    NOTE(review): the visible code creates every missing path component,
    the final one included, with copy_direntry() and never calls os.link() -
    confirm that copying (rather than hard-linking) is intended.

    :param src_dir: directory containing the source entity.
    :param src_relpath: path of the entity relative to src_dir.
    :param dst_dir: directory to place the entity in; created when missing.
    :raises RuntimeError: if the source entity does not exist or dst_dir
        exists but is not a directory.
    """
    _lg.debug("Nested hardlinking: %s%s%s -> %s",
              src_dir, os.path.sep, src_relpath, dst_dir)
    src_dir_abs = os.path.abspath(src_dir)
    src_full_path = os.path.join(src_dir_abs, src_relpath)
    dst_dir_abs = os.path.abspath(dst_dir)
    dst_full_path = os.path.join(dst_dir_abs, src_relpath)

    # check source entity and destination directory
    if not os.path.lexists(src_full_path):
        raise RuntimeError("Error reading source entity: %s" % src_full_path)
    if os.path.lexists(dst_dir_abs):
        if not os.path.isdir(dst_dir_abs):
            raise RuntimeError("Destination path is not a directory: %s"
                               % dst_dir_abs)
    else:
        os.mkdir(dst_dir_abs)

    # if destination entity exists, check it points to source entity
    dst_entry = PseudoDirEntry(dst_full_path)
    if os.path.lexists(dst_entry.path):
        src_stat = os.lstat(src_full_path)
        # Compare the entries themselves on both sides: following the
        # destination would raise for a dangling symlink and would never
        # match a symlink that is already hard-linked to the source.
        if os.path.samestat(src_stat, dst_entry.stat(follow_symlinks=False)):
            return
        # remove otherwise
        rm_direntry(dst_entry)

    # walk src_relpath component by component, creating what is missing
    src_cur_path = src_dir_abs
    dst_cur_path = dst_dir_abs
    for rel_part in src_relpath.split(sep=os.path.sep):
        src_cur_path = os.path.join(src_cur_path, rel_part)
        dst_cur_path = os.path.join(dst_cur_path, rel_part)
        if os.path.exists(dst_cur_path):
            continue
        copy_direntry(PseudoDirEntry(src_cur_path), dst_cur_path)