movie-renamer/renamer.py

289 lines
9.5 KiB
Python
Raw Normal View History

2022-01-03 07:56:15 +00:00
#!/usr/bin/env python3
import argparse
import collections
2022-02-09 15:08:06 +00:00
import enum
2022-01-03 07:56:15 +00:00
import logging
import os
import os.path
2022-02-09 15:08:06 +00:00
import pprint
2022-01-03 07:56:15 +00:00
import re
import sys
# File extensions (without the leading dot) that process_path accepts.
PROCESSED_FILETYPES = (
    "mkv",
    "avi",
    "ts",
)

# Character class of the separators a title is split on.
SEPARATORS = r"[() .!,_\[\]]"
# The same character class with the hyphen added.
SEPARATORS_HYPHEN = r"[\-" + SEPARATORS[1:]
# Three-letter language codes, reused by the audio/subtitles/language patterns.
LANGUAGES = r"(rus|eng|ukr|jap|ita|chi|kor|ger|fre|spa|pol)"

# Ordered (chunk_type, regex) pairs.  guess_part() tries them top to bottom,
# case-insensitively, and anchors each one as "^" + pattern + "$"; because of
# that anchoring every pattern with alternatives must keep them inside a group,
# otherwise the anchors bind to the first/last alternative only.
# The trailing "unknown" entry matches anything and acts as the fallback.
PATTERNS = (
    ("episode", r"s\d{1,2}(e\d{1,2})?"),
    ("year", r"(19|20)\d{2}"),
    ("edition", r"((theatrical|director'*s|extended|un)[-.]?cut"
                r"|imax[-.]edition"
                r"|noir[-.]edition"
                r"|black[-.]chrome[-.]edition"
                r"|extended[-.]edition"
                r"|hq[-.]edition"
                r"|theatrical)"),
    ("restrictions", r"(unrated)"),
    ("resolution", r"[0-9]{3,4}[pi]"),
    ("quality", r"((blu[-.]?ray|bd)[-.]?remux"
                r"|(blu[-.]?ray|bd|uhd|hd(dvd|tv)?|web([-.]?dl)?|dvd)[-.]?rip"
                r"|web[-.]?dl|blu[-.]?ray|hdtv|hddvd|dvd(9)?|f-hd|uhd|remastered"
                r"|amzn)"),
    ("codec", r"([hx]\.?26[45]|(mpeg4-)?avc|hevc(10)?|xvid|divx)"),
    ("hdr", r"(hdr(10)?|10bit)"),
    ("audio", r"%s?(dts(-es)?|ac3|flac|dd5\.1|aac2\.0|dub-line)" % LANGUAGES),
    ("subtitles", r"%s?sub" % LANGUAGES),
    ("language", r"(\d{1,2}x)?%s" % LANGUAGES),
    # grouped so that the anchors apply to both alternatives; ungrouped,
    # "^mkv|avi$" accepted every chunk that merely starts with "mkv"
    ("file_extension", r"(mkv|avi)"),
    ("unknown", r".*"),
)
2022-02-09 15:08:06 +00:00
# noinspection PyInterpreter
class EnumAction(argparse.Action):
    """Argparse action that turns a command-line string into an Enum member.

    The Enum class is passed through the ``type`` keyword.  Unless the caller
    supplies ``choices``, the values of the Enum members are used as choices.
    """

    def __init__(self, **kwargs):
        # ``type`` is consumed here and never reaches argparse.Action, so
        # argparse hands the raw string to __call__
        enum_cls = kwargs.pop("type", None)

        if enum_cls is None:
            raise ValueError("type must be assigned an Enum when using EnumAction")
        if not issubclass(enum_cls, enum.Enum):
            raise TypeError("type must be an Enum when using EnumAction")

        # derive the accepted strings from the Enum, unless given explicitly
        if "choices" not in kwargs:
            kwargs["choices"] = tuple(member.value for member in enum_cls)

        super().__init__(**kwargs)
        self._enum = enum_cls

    def __call__(self, parser, namespace, values, option_string=None):
        # look the member up by value and store it under the destination name
        setattr(namespace, self.dest, self._enum(values))
class CliAction(enum.Enum):
    """Actions the tool can perform; each value is the string of the same name."""

    parse = "parse"
    rename = "rename"
2022-01-03 07:56:15 +00:00
_lg = logging.getLogger("spqr.movie-renamer")


def main():
    """Parse the command line, configure logging and process the target path.

    Returns 0 unconditionally; the script entry point passes it to sys.exit().
    """
    cli = argparse.ArgumentParser(description="Rename media files.")
    cli.add_argument(
        "action",
        action=EnumAction,
        type=CliAction,
        metavar="ACTION",
        help="what to do with media file/directory (%(choices)s)",
    )
    cli.add_argument(
        "target",
        metavar="TARGET",
        type=str,
        help="path to the media file/directory",
    )
    cli.add_argument(
        "-v", "--verbose",
        default=False,
        action="store_true",
        help="verbose output",
    )
    options = cli.parse_args()

    # --verbose switches the root logger from INFO to DEBUG
    logging.basicConfig(level=logging.DEBUG if options.verbose else logging.INFO)

    process_path(options.action, options.target)
    return 0
2022-02-09 15:08:06 +00:00
def process_path(action: CliAction, path):
    """Apply ``action`` to a media file, or to every entry below a directory.

    A directory is walked recursively in sorted order and is never parsed
    itself.  A file is handled only when its extension (text after the last
    dot, compared as-is) is listed in PROCESSED_FILETYPES; other files are
    skipped with a debug message.

    For CliAction.parse the parsed title is printed.  For CliAction.rename the
    proposed new name is only logged as a warning when it differs from the
    current file name; nothing is renamed on disk by this function.
    """
    if os.path.isdir(path):
        for child_path in sorted(os.listdir(path)):
            process_path(action, os.path.join(path, child_path))
        # stop here: without this the directory itself fell through and was
        # handled like a file (e.g. a directory named "Show.mkv" got parsed)
        return

    # split the file name into title and extension (without the dot)
    fname = os.path.basename(path)
    title, ext = os.path.splitext(fname)
    ext = ext[1:]
    if ext not in PROCESSED_FILETYPES:
        _lg.debug("Extension is not supported: %s", path)
        return

    parsed_title = parse_title(title)

    if action == CliAction.parse:
        print_parsed_title(title, parsed_title)
        return

    if action == CliAction.rename:
        pretty_title = generate_pretty_name(parsed_title)
        pretty_title += ".%s" % ext
        if pretty_title != fname:
            _lg.warning("%s -> %s", fname, pretty_title)
def print_parsed_title(title, parsed):
    """Print the title, then the pretty-printed parsed chunks (indent of 4)."""
    print(title)
    print(pprint.pformat(parsed, indent=4))
2022-01-03 07:56:15 +00:00
2022-02-08 20:45:55 +00:00
def generate_pretty_name(parsed_title):
    """Build a dot-separated file name from parsed chunks.

    Chunk types are emitted in this order: "name", then the types in PATTERNS
    order, with "episode_name" placed directly after "episode".  Types that
    are missing or empty in ``parsed_title`` are skipped; the values of one
    type are joined with dots, as are the resulting groups.
    """
    ordering = ["name"]
    ordering.extend(kind for kind, _ in PATTERNS)
    ordering.insert(ordering.index("episode") + 1, "episode_name")

    groups = [
        ".".join(parsed_title[kind])
        for kind in ordering
        if parsed_title.get(kind, [])
    ]
    return ".".join(groups)
2022-01-16 06:11:00 +00:00
def _get_parsed_title_dict(chunk_list, chunk_map):
    """Group chunk values by chunk type.

    ``chunk_map[i]`` is the type of ``chunk_list[i]``.  Returns a
    defaultdict(list) of {chunk_type: [value, ...]} that keeps the values in
    their original order.
    """
    grouped = collections.defaultdict(list)
    for position, value in enumerate(chunk_list):
        kind = chunk_map[position]
        grouped[kind].append(value)
    return grouped
def _guess_combined(chunk_values, chunk_map):
    """Join runs of adjacent unknown chunks with dots and re-parse them.

    Starting at every position, runs of two or more neighbouring "unknown"
    chunks are joined with "." and passed to guess_part(); the shortest run
    that is recognised replaces its members with one combined chunk.

    Returns (is_changed, chunk_values, chunk_map).  When fewer than two chunks
    are unknown the input lists are returned untouched; otherwise the input
    lists are modified in place and cleaned copies are returned.
    """
    unknown_count = len(_get_parsed_title_dict(chunk_values, chunk_map)["unknown"])
    if unknown_count < 2:
        return False, chunk_values, chunk_map

    is_changed = False
    total = len(chunk_map)
    start = 0
    # the last element cannot open a run of at least two chunks
    while start < total - 1:
        stop = start + 2
        # grow the window [start, stop) while it holds unknown chunks only
        while stop <= total and set(chunk_map[start:stop]) == {"unknown"}:
            candidate = ".".join(chunk_values[start:stop])
            candidate_type = guess_part(candidate)
            if candidate_type == "unknown":
                # not recognised yet: take one more chunk into the window
                stop += 1
                continue

            is_changed = True
            # the first slot receives the combined chunk, the other slots are
            # blanked with None and dropped after the scan
            chunk_values[start] = candidate
            chunk_map[start] = candidate_type
            for position in range(start + 1, stop):
                chunk_values[position] = None
                chunk_map[position] = None
            # resume scanning right behind the merged window
            start = stop - 1
            break
        start += 1

    chunk_values = [value for value in chunk_values if value]
    chunk_map = [kind for kind in chunk_map if kind]
    return is_changed, chunk_values, chunk_map
2022-01-03 07:56:15 +00:00
def parse_title(title):
    """Split a media title into typed components.

    Steps:
      1. split on SEPARATORS and drop chunks without a word character or "&";
      2. classify every chunk with guess_part(), then let _guess_combined()
         merge adjacent unknown chunks;
      3. if unknown chunks remain, split the hyphenated ones on "-" and
         classify the pieces; the split is kept only when a piece was
         recognised or a merge succeeded;
      4. if at least one chunk was recognised: leading unknown chunks become
         "name", unknown chunks right after the first "episode" become
         "episode_name", and remaining unknown chunks lose their leading and
         trailing hyphens.

    Returns a plain dict of {chunk_type: [chunk_value, ...]}.
    """
    pieces = [piece for piece in re.split(SEPARATORS, title) if piece]
    # single hyphens and the like are dropped, ampersands (&) are kept
    chunk_values = [piece for piece in pieces if re.search(r"(\w|&)+", piece)]
    chunk_map = [guess_part(piece) for piece in chunk_values]
    _, chunk_values, chunk_map = _guess_combined(chunk_values, chunk_map)

    # second attempt on unknown chunks: treat hyphens as separators as well
    if _get_parsed_title_dict(chunk_values, chunk_map).get("unknown"):
        split_values = []
        split_map = []
        recognised = False
        for value, kind in zip(chunk_values, chunk_map):
            if kind != "unknown" or "-" not in value:
                split_values.append(value)
                split_map.append(kind)
                continue
            for part in value.split("-"):
                if not part:
                    continue
                part_kind = guess_part(part)
                if part_kind != "unknown":
                    recognised = True
                split_values.append(part)
                split_map.append(part_kind)
        combined, split_values, split_map = _guess_combined(split_values, split_map)
        if recognised or combined:
            chunk_values, chunk_map = split_values, split_map

    # naming and hyphen stripping only make sense when something was recognised
    if any(kind != "unknown" for kind in chunk_map):
        # leading unknown chunks form the name
        for position, kind in enumerate(chunk_map):
            if kind != "unknown":
                break
            chunk_map[position] = "name"

        # unknown chunks directly behind the episode number are its name
        if "episode" in chunk_map:
            position = chunk_map.index("episode") + 1
            while position < len(chunk_map) and chunk_map[position] == "unknown":
                chunk_map[position] = "episode_name"
                position += 1

        # whatever is still unknown loses its leading/trailing hyphens
        for position, kind in enumerate(chunk_map):
            if kind != "unknown":
                continue
            value = chunk_values[position]
            if value[0] == "-" or value[-1] == "-":
                chunk_values[position] = value.strip("-")

    return dict(_get_parsed_title_dict(chunk_values, chunk_map))
2022-01-03 07:56:15 +00:00
2022-02-08 20:45:55 +00:00
def guess_part(chunk_value):
    """Return the chunk type for the given chunk value.

    PATTERNS is scanned in order and the type of the first pattern that
    matches the *whole* chunk (case-insensitively) is returned.

    Raises RuntimeError when no pattern matches at all.
    """
    for chunk_type, pattern in PATTERNS:
        # fullmatch() instead of "^" + pattern + "$": the anchors bind tighter
        # than a top-level "|" (so "^mkv|avi$" accepted any chunk starting
        # with "mkv"), and "$" also matches before a trailing newline
        if re.fullmatch(pattern, chunk_value, flags=re.I):
            return chunk_type
    raise RuntimeError("unhandled pattern type")
if __name__ == "__main__":
    # use main()'s return value as the process exit status
    raise SystemExit(main())