# movie-renamer/renamer.py

#!/usr/bin/env python3

import argparse
import collections
import logging
import os
import os.path
import re
import sys


# File extensions (without the leading dot) that process_file() accepts.
PROCESSED_FILETYPES = (
    "mkv",
    "avi",
    "ts",
)
# Character class of the characters a title is split on.
SEPARATORS = r"[() .!,_\[\]]"
# The same character class with the hyphen added.
SEPARATORS_HYPHEN = r"[\-" + SEPARATORS[1:]
# Three-letter language codes used inside the audio/subtitles/language patterns.
LANGUAGES = r"(rus|eng|ukr|jap|ita|chi|kor|ger|fre|spa|pol)"
# Ordered (chunk type, regex) pairs. guess_part() tries them top to bottom,
# anchored to the whole chunk and case-insensitively, and returns the first
# type that matches, so more specific patterns must come before more general
# ones and the catch-all "unknown" must stay last.
# Every pattern with a top-level alternation is wrapped in a group, so that
# anchors placed around it apply to the whole pattern and not just to the
# first/last alternative.
PATTERNS = (
    ("episode", r"s\d{1,2}(e\d{1,2})?"),
    ("year", r"(19|20)\d{2}"),
    ("edition", r"((theatrical|director'*s|extended|un)[-.]?cut"
                r"|imax[-.]edition"
                r"|noir[-.]edition"
                r"|black[-.]chrome[-.]edition"
                r"|extended[-.]edition"
                r"|hq[-.]edition"
                r"|theatrical)"),
    ("restrictions", r"(unrated)"),
    ("resolution", r"[0-9]{3,4}[pi]"),
    ("quality", r"((blu[-.]?ray|bd)[-.]?remux"
                r"|(blu[-.]?ray|bd|uhd|hd(dvd|tv)?|web([-.]?dl)?|dvd)[-.]?rip"
                r"|web[-.]?dl|blu[-.]?ray|hdtv|hddvd|dvd(9)?|f-hd|uhd|remastered"
                r"|amzn)"),
    ("codec", r"([hx]\.?26[45]|(mpeg4-)?avc|hevc(10)?|xvid|divx)"),
    ("hdr", r"(hdr(10)?|10bit)"),
    ("audio", r"%s?(dts(-es)?|ac3|flac|dd5\.1|aac2\.0|dub-line)" % LANGUAGES),
    ("subtitles", r"%s?sub" % LANGUAGES),
    ("language", r"(\d{1,2}x)?%s" % LANGUAGES),
    # grouped: a bare "mkv|avi" between ^ and $ would also match "mkvfoo"
    ("file_extension", r"(mkv|avi)"),
    ("unknown", r".*")
)

# Module-wide logger.
_lg = logging.getLogger("spqr.movie-renamer")


def main():
    """ Command-line entry point.

    Parses the command line, configures logging (DEBUG with -v/--verbose,
    INFO otherwise) and processes the target path. Always returns 0.
    """
    cli = argparse.ArgumentParser(description="Rename media files.")
    cli.add_argument("target", type=str,
                     help="path to the media file/directory")
    cli.add_argument("-v", "--verbose", action="store_true", default=False,
                     help="verbose output")
    options = cli.parse_args()

    if options.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(level=level)

    # a directory is walked entry by entry, anything else is handed over
    # as a single file
    handler = process_dir if os.path.isdir(options.target) else process_file
    handler(options.target)

    return 0


def process_dir(dir_path):
    """ Call process_file() for every entry directly inside dir_path.

    Subdirectories are not descended into here; each entry is simply passed
    on to process_file().
    """
    for entry in os.listdir(dir_path):
        process_file(os.path.join(dir_path, entry))


def process_file(fpath):
    """ Work out the normalised name of one media file and log the change.

    Paths that are not regular files, or whose extension is not listed in
    PROCESSED_FILETYPES, are skipped with a debug message. The proposed name
    is only logged (as a warning, when it differs from the current name);
    nothing is renamed on disk by this function.
    """
    if not os.path.isfile(fpath):
        _lg.debug("Not a file: %s", fpath)
        return

    # separate the bare title from the extension (without its dot)
    fname = os.path.basename(fpath)
    title, dotted_ext = os.path.splitext(fname)
    ext = dotted_ext[1:]
    if ext not in PROCESSED_FILETYPES:
        _lg.debug("Extension is not supported: %s", fpath)
        return

    parsed_title = parse_title(title)

    # output order: name first, then the pattern types in declaration order,
    # with the episode name placed right behind the episode number
    chunk_order = ["name"]
    chunk_order.extend(pat_type for pat_type, _ in PATTERNS)
    chunk_order.insert(chunk_order.index("episode") + 1, "episode_name")

    # every non-empty group contributes its chunks joined by dots
    pieces = [
        ".".join(parsed_title[chunk_type])
        for chunk_type in chunk_order
        if parsed_title.get(chunk_type, [])
    ]
    pieces.append(ext)
    result = ".".join(pieces)

    if result != fname:
        _lg.warning("%s -> %s", fname, result)


def _get_parsed_title_dict(chunk_list, chunk_map):
    """ Group chunks by their type.

    chunk_map[i] is the type of chunk_list[i]. Returns a defaultdict(list)
    mapping each type to its chunks, kept in their original order.
    """
    grouped = collections.defaultdict(list)
    for position, value in enumerate(chunk_list):
        grouped[chunk_map[position]].append(value)
    return grouped


def _guess_combined(chunk_values, chunk_map):
    """ Merge runs of adjacent unknown chunks that are recognised once joined.

    Starting at each position, a window of two or more consecutive unknown
    chunks is joined with dots and passed to guess_part(); the window grows
    until the joined text gets a known type or the run of unknowns ends.
    A recognised window is collapsed into a single chunk of that type.

    The input lists are modified in place while scanning. Returns a tuple
    (is_changed, chunk_values, chunk_map); when fewer than two unknown
    chunks exist the input lists are returned untouched.
    """
    grouped = _get_parsed_title_dict(chunk_values, chunk_map)
    if len(grouped["unknown"]) < 2:
        # nothing to pair up
        return False, chunk_values, chunk_map

    merged_any = False
    total = len(chunk_map)
    start = 0
    # the last element alone can never open a window of two
    while start < total - 1:
        # where scanning continues unless a merge moves it further
        resume_at = start + 1
        for stop in range(start + 2, total + 1):
            # the whole window must consist of unknown chunks
            if set(chunk_map[start:stop]) != {"unknown"}:
                break

            joined = ".".join(chunk_values[start:stop])
            joined_type = guess_part(joined)
            if joined_type == "unknown":
                # not recognised yet, widen the window by one chunk
                continue

            # the first slot takes the merged chunk, the remaining slots are
            # blanked with None and dropped after the scan
            merged_any = True
            chunk_values[start] = joined
            chunk_map[start] = joined_type
            for pos in range(start + 1, stop):
                chunk_values[pos] = None
                chunk_map[pos] = None
            # continue right behind the merged window
            resume_at = stop
            break

        start = resume_at

    # drop the blanked slots
    kept_values = [value for value in chunk_values if value]
    kept_map = [chunk_type for chunk_type in chunk_map if chunk_type]

    return merged_any, kept_values, kept_map


def parse_title(title):
    """ Split a media title into typed groups of chunks.

    Returns a plain dict mapping a chunk type (the types from PATTERNS plus
    "name" and "episode_name") to the list of chunks of that type, in the
    order they appear in the title.
    """
    # cut the title at separators; keep only chunks that carry at least one
    # word character or an ampersand (drops e.g. lone hyphens)
    chunk_values = [
        piece for piece in re.split(SEPARATORS, title)
        if piece and re.search(r"(\w|&)+", piece)
    ]

    # classify every chunk on its own, then try merging neighbouring unknowns
    chunk_map = [guess_part(piece) for piece in chunk_values]
    _, chunk_values, chunk_map = _guess_combined(chunk_values, chunk_map)

    # second attempt for unknown chunks: break them at hyphens and classify
    # the parts; the split version is adopted only if it recognised something
    if "unknown" in chunk_map:
        split_values = []
        split_map = []
        found_known = False
        for value, chunk_type in zip(chunk_values, chunk_map):
            if chunk_type != "unknown" or "-" not in value:
                split_values.append(value)
                split_map.append(chunk_type)
                continue
            for part in value.split("-"):
                if not part:
                    continue
                part_type = guess_part(part)
                if part_type != "unknown":
                    found_known = True
                split_values.append(part)
                split_map.append(part_type)

        was_merged, split_values, split_map = _guess_combined(split_values, split_map)
        if found_known or was_merged:
            chunk_values, chunk_map = split_values, split_map

    # names are assigned only when at least one chunk was recognised
    if any(chunk_type != "unknown" for chunk_type in chunk_map):
        # leading unknown chunks form the name
        for pos, chunk_type in enumerate(chunk_map):
            if chunk_type != "unknown":
                break
            chunk_map[pos] = "name"
        # unknown chunks right behind the first episode number form the
        # episode name
        if "episode" in chunk_map:
            pos = chunk_map.index("episode") + 1
            while pos < len(chunk_map) and chunk_map[pos] == "unknown":
                chunk_map[pos] = "episode_name"
                pos += 1

    # finally trim leading/trailing hyphens off the chunks still unknown,
    # again only when at least one chunk was recognised
    if any(chunk_type != "unknown" for chunk_type in chunk_map):
        for pos, chunk_type in enumerate(chunk_map):
            if chunk_type != "unknown":
                continue
            value = chunk_values[pos]
            if value[0] == "-" or value[-1] == "-":
                chunk_values[pos] = value.strip("-")

    return dict(_get_parsed_title_dict(chunk_values, chunk_map))


def guess_part(fname_part):
    """ Return the type of the first PATTERNS entry matching the whole chunk.

    Patterns are tried in declaration order and matched case-insensitively
    against the complete text of fname_part.

    Raises RuntimeError when no pattern matches at all.
    """
    for pat_type, pattern in PATTERNS:
        # Wrap the pattern in a non-capturing group before anchoring it:
        # "|" binds weaker than "^" and "$", so a pattern with a top-level
        # alternation (such as "mkv|avi") would otherwise be anchored only
        # on its first and last alternative and accept partial matches.
        full_match_pat = r"^(?:" + pattern + r")$"
        if re.match(full_match_pat, fname_part, flags=re.I):
            return pat_type
    raise RuntimeError("unhandled pattern type")


# Run the command-line interface when the file is executed as a script;
# the value returned by main() is passed to sys.exit() as the exit status.
if __name__ == "__main__":
    sys.exit(main())