#!/usr/bin/env python3
"""Propose normalized file names for media files.

A title is split into chunks, every chunk is classified against PATTERNS
(year, resolution, codec, ...), and the chunks are re-joined with dots in a
fixed order.  The proposed name is only logged; no file is renamed.
"""

import argparse
import collections
import logging
import os
import os.path
import re
import sys

# File extensions (lower case, without the dot) that are processed.
PROCESSED_FILETYPES = (
    "mkv",
    "avi",
    "ts",
)
# Character class used to split a title into chunks.
SEPARATORS = r"[() .!,_\[\]]"
# Same character class with the hyphen added.
SEPARATORS_HYPHEN = r"[\-" + SEPARATORS[1:]
LANGUAGES = r"(rus|eng|ukr|jap|ita|chi|kor|ger|fre|spa|pol)"
# (chunk type, regex) pairs tried in order; the first regex that matches the
# whole chunk wins.  The trailing "unknown" entry is the catch-all.
PATTERNS = (
    ("episode", r"s\d{1,2}(e\d{1,2})?"),
    ("year", r"(19|20)\d{2}"),
    ("edition", r"((theatrical|director'*s|extended|un)[-.]?cut"
                r"|imax[-.]edition"
                r"|noir[-.]edition"
                r"|black[-.]chrome[-.]edition"
                r"|extended[-.]edition"
                r"|hq[-.]edition"
                r"|theatrical)"),
    ("restrictions", r"(unrated)"),
    ("resolution", r"[0-9]{3,4}[pi]"),
    ("quality", r"((blu[-.]?ray|bd)[-.]?remux"
                r"|(blu[-.]?ray|bd|uhd|hd(dvd|tv)?|web([-.]?dl)?|dvd)[-.]?rip"
                r"|web[-.]?dl|blu[-.]?ray|hdtv|hddvd|dvd(9)?|f-hd|uhd|remastered"
                r"|amzn)"),
    ("codec", r"([hx]\.?26[45]|(mpeg4-)?avc|hevc(10)?|xvid|divx)"),
    ("hdr", r"(hdr(10)?|10bit)"),
    ("audio", r"%s?(dts(-es)?|ac3|flac|dd5\.1|aac2\.0|dub-line)" % LANGUAGES),
    ("subtitles", r"%s?sub" % LANGUAGES),
    ("language", r"(\d{1,2}x)?%s" % LANGUAGES),
    ("file_extension", r"mkv|avi"),
    ("unknown", r".*")
)

# Patterns compiled once; guess_part() applies them with fullmatch().
_COMPILED_PATTERNS = tuple(
    (pat_type, re.compile(pattern, flags=re.I))
    for pat_type, pattern in PATTERNS
)


def _build_chunk_order():
    """Return the chunk types in the order they appear in a file name."""
    order = ["name"]
    for pat_type, _ in PATTERNS:
        order.append(pat_type)
        # parse_title() labels the chunks that follow an episode marker as
        # "episode_name"; keep them right after the marker.
        if pat_type == "episode":
            order.append("episode_name")
    return tuple(order)


CHUNK_ORDER = _build_chunk_order()

_lg = logging.getLogger("spqr.movie-renamer")


def main():
    """Parse the command line and process the target file or directory.

    Returns:
        0, used as the process exit status.
    """
    parser = argparse.ArgumentParser(description="Rename media files.")
    parser.add_argument("target", type=str,
                        help="path to the media file/directory")
    parser.add_argument("-v", "--verbose", action="store_true", default=False,
                        help="verbose output")
    args = parser.parse_args()

    loglevel = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=loglevel)

    if os.path.isdir(args.target):
        process_dir(args.target)
    else:
        process_file(args.target)
    return 0


def process_dir(dir_path):
    """Run process_file() on every direct entry of *dir_path*.

    Sub-directories are not descended into; process_file() skips anything
    that is not a regular file.
    """
    # Sorted so that the log output does not depend on listdir() order.
    for fname in sorted(os.listdir(dir_path)):
        process_file(os.path.join(dir_path, fname))


def format_filename(parsed_title, ext):
    """Join parsed chunks and *ext* into a dot-separated file name.

    Args:
        parsed_title: mapping of chunk type to list of chunk strings, as
            returned by parse_title().
        ext: file extension without the leading dot.

    Returns:
        The chunks of every non-empty type in CHUNK_ORDER joined with dots,
        followed by *ext*.
    """
    parts = [
        ".".join(parsed_title[chunk_type])
        for chunk_type in CHUNK_ORDER
        if parsed_title.get(chunk_type)
    ]
    parts.append(ext)
    return ".".join(parts)


def process_file(fpath):
    """Log the normalized name proposed for the media file at *fpath*.

    Paths that are not regular files, or whose extension is not listed in
    PROCESSED_FILETYPES, are skipped with a debug message.  A warning is
    logged only when the proposed name differs from the current one.
    """
    # process only files
    if not os.path.isfile(fpath):
        _lg.debug("Not a file: %s", fpath)
        return

    # split file name to title and extension
    fname = os.path.basename(fpath)
    title, ext = os.path.splitext(fname)
    ext = ext[1:]
    # Compare in lower case so that e.g. "MKV" is accepted too; the original
    # spelling is kept in the proposed name.
    if ext.lower() not in PROCESSED_FILETYPES:
        _lg.debug("Extension is not supported: %s", fpath)
        return

    result = format_filename(parse_title(title), ext)
    if result != fname:
        _lg.warning("%s -> %s", fname, result)


def _get_parsed_title_dict(chunk_list, chunk_map):
    """Group the chunks of *chunk_list* by their type in *chunk_map*.

    Both lists are index-aligned.  Returns a defaultdict(list) that maps a
    chunk type to its chunks in their original order.
    """
    p_title = collections.defaultdict(list)
    for chunk, chunk_type in zip(chunk_list, chunk_map):
        p_title[chunk_type].append(chunk)
    return p_title


def _guess_combined(chunk_values, chunk_map):
    """Try to combine runs of adjacent unknown chunks and parse them.

    Consecutive "unknown" chunks are joined with dots (two chunks first,
    then progressively more) and the joined string is classified with
    guess_part().  On the first recognized combination the run is replaced
    by the single combined chunk.

    Args:
        chunk_values: list of chunk strings.
        chunk_map: list of chunk types, index-aligned with *chunk_values*.

    Returns:
        Tuple (is_changed, chunk_values, chunk_map) holding new lists; the
        arguments are left unmodified.
    """
    # Work on copies so the caller's lists are never mutated.
    chunk_values = list(chunk_values)
    chunk_map = list(chunk_map)
    is_changed = False
    if chunk_map.count("unknown") < 2:
        return is_changed, chunk_values, chunk_map

    total = len(chunk_map)
    # i - begin of slice, j - end of slice
    i = 0
    # process up to second-to-last element
    while i < total - 1:
        # we need slice with at least two elements
        j = i + 2
        # we need only unknown elements
        while j <= total and set(chunk_map[i:j]) == {"unknown"}:
            cmb_chunk = ".".join(chunk_values[i:j])
            cmb_chunk_type = guess_part(cmb_chunk)
            if cmb_chunk_type != "unknown":
                is_changed = True
                # first element gets the combined chunk, the rest are
                # marked with None and removed below
                chunk_values[i] = cmb_chunk
                chunk_map[i] = cmb_chunk_type
                for idx in range(i + 1, j):
                    chunk_values[idx] = None
                    chunk_map[idx] = None
                # continue right after the end of the consumed slice
                i = j - 1
                break
            # try to add one more element to the combined chunk
            j += 1
        i += 1

    # clean up the None placeholders
    chunk_values = [value for value in chunk_values if value]
    chunk_map = [chunk_type for chunk_type in chunk_map if chunk_type]
    return is_changed, chunk_values, chunk_map


def _split_hyphenated(chunk_values, chunk_map):
    """Re-parse unknown chunks after splitting them on hyphens.

    Every unknown chunk that contains a hyphen is split and each fragment
    classified on its own; the fragments are then passed through
    _guess_combined().  The split version is returned only if a fragment
    was recognized or a combination succeeded; otherwise the input lists
    are returned unchanged.
    """
    if "unknown" not in chunk_map:
        return chunk_values, chunk_map

    is_changed = False
    spl_values = []
    spl_map = []
    for ch_value, ch_type in zip(chunk_values, chunk_map):
        if ch_type != "unknown" or "-" not in ch_value:
            spl_values.append(ch_value)
            spl_map.append(ch_type)
            continue
        for fragment in ch_value.split("-"):
            if not fragment:
                continue
            fragment_type = guess_part(fragment)
            if fragment_type != "unknown":
                is_changed = True
            spl_values.append(fragment)
            spl_map.append(fragment_type)

    is_combined, spl_values, spl_map = _guess_combined(spl_values, spl_map)
    if is_changed or is_combined:
        return spl_values, spl_map
    return chunk_values, chunk_map


def _label_names(chunk_map):
    """Relabel unknown chunks as "name" / "episode_name", in place.

    Nothing is relabelled when every chunk is unknown.  Otherwise the
    leading run of unknown chunks becomes "name", and the run of unknown
    chunks directly after the first "episode" chunk becomes "episode_name".
    """
    total = len(chunk_map)
    if chunk_map.count("unknown") == total:
        return

    idx = 0
    while idx < total and chunk_map[idx] == "unknown":
        chunk_map[idx] = "name"
        idx += 1

    if "episode" in chunk_map:
        idx = chunk_map.index("episode") + 1
        while idx < total and chunk_map[idx] == "unknown":
            chunk_map[idx] = "episode_name"
            idx += 1


def _strip_unknown_hyphens(chunk_values, chunk_map):
    """Strip leading/trailing hyphens from unknown chunks, in place.

    Nothing is stripped when every chunk is unknown.
    """
    if chunk_map.count("unknown") == len(chunk_map):
        return
    for idx, chunk_type in enumerate(chunk_map):
        if chunk_type == "unknown":
            chunk_values[idx] = chunk_values[idx].strip("-")


def parse_title(title):
    """Split media title to components.

    Args:
        title: file name without its extension.

    Returns:
        dict mapping a chunk type ("name", "episode_name" or one of the
        PATTERNS types) to the list of chunks of that type, in title order.
        Types without chunks are absent.
    """
    # split on separators, dropping empty and non-word chunks
    # (like single hyphens)
    chunk_values = [
        chunk for chunk in re.split(SEPARATORS, title)
        if chunk and re.search(r"\w+", chunk)
    ]
    # parse each chunk
    chunk_map = [guess_part(chunk) for chunk in chunk_values]
    _, chunk_values, chunk_map = _guess_combined(chunk_values, chunk_map)

    # try to parse unknown chunks, splitting them on hyphens
    chunk_values, chunk_map = _split_hyphenated(chunk_values, chunk_map)
    # parse name and episode name
    _label_names(chunk_map)
    # at last, strip hyphens from the remaining unknown chunks
    _strip_unknown_hyphens(chunk_values, chunk_map)

    return dict(_get_parsed_title_dict(chunk_values, chunk_map))


def guess_part(fname_part):
    """Return the type of the first pattern matching all of *fname_part*.

    Matching is case-insensitive and must cover the whole string.  Using
    fullmatch() instead of wrapping the pattern in "^...$" keeps ungrouped
    alternations such as "mkv|avi" from matching a mere prefix.

    Raises:
        RuntimeError: if no pattern matches, which can only happen when the
            catch-all ".*" fails (e.g. the string contains a newline).
    """
    for pat_type, pattern in _COMPILED_PATTERNS:
        if pattern.fullmatch(fname_part):
            return pat_type
    raise RuntimeError("unhandled pattern type")


if __name__ == "__main__":
    sys.exit(main())