movie-renamer/renamer.py

#!/usr/bin/env python3

import argparse
import collections
import enum
import logging
import os
import os.path
import pprint
import re
import sys


# File extensions (without the leading dot) that process_path() handles;
# files with any other extension are skipped.
PROCESSED_FILETYPES = (
    "mkv",
    "avi",
    "ts",
)
# Character class of the separators a title is split on (see parse_title()).
SEPARATORS = r"[() .!,_\[\]]"
# The same character class with the hyphen added to it.
SEPARATORS_HYPHEN = r"[\-" + SEPARATORS[1:]
# Three-letter language codes, reused by the audio/subtitles/language patterns.
LANGUAGES = r"(rus|eng|ukr|jap|ita|chi|kor|ger|fre|spa|pol)"
# Ordered (chunk_type, regex) pairs.
# guess_part() tries them top to bottom, anchored to the whole chunk and
# case-insensitively, so the order defines the priority; "unknown" matches
# anything and must stay last. generate_pretty_name() also uses this order
# to arrange chunks in the generated file name.
# Every pattern with alternatives keeps them inside a group, so that anchors
# added around the pattern apply to all alternatives.
PATTERNS = (
    ("episode", r"s\d{1,2}(e\d{1,2})?"),
    ("year", r"(19|20)\d{2}"),
    ("edition", r"((theatrical|director'*s|extended|un)[-.]?cut"
                r"|imax[-.]edition"
                r"|noir[-.]edition"
                r"|black[-.]chrome[-.]edition"
                r"|extended[-.]edition"
                r"|hq[-.]edition"
                r"|theatrical)"),
    ("restrictions", r"(unrated)"),
    ("resolution", r"[0-9]{3,4}[pi]"),
    ("quality", r"((blu[-.]?ray|bd)[-.]?remux"
                r"|(blu[-.]?ray|bd|uhd|hd(dvd|tv)?|web([-.]?dl)?|dvd)[-.]?rip"
                r"|web[-.]?dl|blu[-.]?ray|hdtv|hddvd|dvd(9)?|f-hd|uhd|remastered"
                r"|amzn)"),
    ("codec", r"([hx]\.?26[45]|(mpeg4-)?avc|hevc(10)?|xvid|divx)"),
    ("hdr", r"(hdr(10)?|10bit)"),
    ("audio", r"%s?(dts(-es)?|ac3|flac|dd5\.1|aac2\.0|dub-line)" % LANGUAGES),
    ("subtitles", r"%s?sub" % LANGUAGES),
    ("language", r"(\d{1,2}x)?%s" % LANGUAGES),
    # grouped: a bare "mkv|avi" turns into "^mkv|avi$" once anchored, which
    # matches any chunk that merely starts with "mkv"
    ("file_extension", r"(mkv|avi)"),
    ("unknown", r".*")
)


# noinspection PyInterpreter
class EnumAction(argparse.Action):
    """
    Argparse action that stores an Enum member instead of the raw string.

    The Enum class is handed over through the ``type`` keyword of
    ``add_argument``. It is removed from the keyword arguments before they
    reach ``argparse.Action``, and the Enum values become the allowed
    ``choices`` unless the caller supplied its own.
    """
    def __init__(self, **kwargs):
        # "type" carries the Enum class and is not forwarded to argparse
        member_cls = kwargs.pop("type", None)

        if member_cls is None:
            raise ValueError("type must be assigned an Enum when using EnumAction")
        if not issubclass(member_cls, enum.Enum):
            raise TypeError("type must be an Enum when using EnumAction")

        # explicit choices given by the caller take precedence
        if "choices" not in kwargs:
            kwargs["choices"] = tuple(member.value for member in member_cls)

        super().__init__(**kwargs)
        self._enum = member_cls

    def __call__(self, parser, namespace, values, option_string=None):
        # look the member up by value and store it under the destination name
        setattr(namespace, self.dest, self._enum(values))


class CliAction(enum.Enum):
    """ Actions selectable through the ACTION command-line argument. """
    # print the chunks the title was parsed into
    parse = "parse"
    # build the prettified file name and log it when it differs from the current one
    rename = "rename"


# Module-level logger; process_path() reports skipped files and proposed
# new names through it.
_lg = logging.getLogger("spqr.movie-renamer")


def main(argv=None):
    """ Parse command-line arguments and process the requested target.

    :param argv: list of argument strings to parse; when ``None`` (default)
        argparse takes them from ``sys.argv``. Allows calling the tool
        programmatically without touching ``sys.argv``.
    :return: 0, which the script entry point passes to ``sys.exit``.
    """
    parser = argparse.ArgumentParser(description="Rename media files.")
    # EnumAction consumes "type" itself and derives the choices from the Enum
    parser.add_argument("action", type=CliAction, action=EnumAction, metavar="ACTION",
                        help="what to do with media file/directory (%(choices)s)")
    parser.add_argument("target", type=str, metavar="TARGET",
                        help="path to the media file/directory")
    parser.add_argument("-v", "--verbose", action="store_true", default=False,
                        help="verbose output")
    args = parser.parse_args(argv)

    # debug messages (e.g. skipped files) are shown only in verbose mode
    loglevel = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=loglevel)

    process_path(args.action, args.target)

    return 0


def process_path(action: CliAction, path):
    """ Apply ``action`` to the media file at ``path``.

    Directories are walked recursively (entries in sorted order) and are
    never parsed themselves. Files whose extension is not listed in
    PROCESSED_FILETYPES are skipped with a debug message.

    :param action: CliAction.parse prints the parsed title;
        CliAction.rename logs the generated file name when it differs from
        the current one (no file is renamed on disk by this function).
    :param path: path to a media file or to a directory with media files.
    """
    if os.path.isdir(path):
        for child_path in sorted(os.listdir(path)):
            process_path(action, os.path.join(path, child_path))
        # only files are processed: without this return a directory
        # named like "Movie.2020.mkv" would be parsed as a media file
        return

    # split file name to title and extension (without the leading dot)
    fname = os.path.basename(path)
    title, ext = os.path.splitext(fname)
    ext = ext[1:]
    if ext not in PROCESSED_FILETYPES:
        _lg.debug("Extension is not supported: %s", path)
        return

    parsed_title = parse_title(title)
    if action == CliAction.parse:
        print_parsed_title(title, parsed_title)
        return

    if action == CliAction.rename:
        pretty_title = generate_pretty_name(parsed_title)
        pretty_title += ".%s" % ext
        if pretty_title != fname:
            _lg.warning("%s -> %s", fname, pretty_title)


def print_parsed_title(title, parsed):
    """ Print the raw title followed by its pretty-printed parsed form. """
    formatted = pprint.pformat(parsed, indent=4)
    print(title)
    print(formatted)


def generate_pretty_name(parsed_title):
    """ Build a dot-separated name from parsed chunks.

    Chunk types are emitted in a fixed order: "name" first, then the types
    in PATTERNS order, with "episode_name" placed right after "episode".
    Types that are missing or empty in ``parsed_title`` are left out.
    """
    ordering = ["name", *(kind for kind, _ in PATTERNS)]
    after_episode = ordering.index("episode") + 1
    ordering.insert(after_episode, "episode_name")

    pieces = [
        ".".join(parsed_title[kind])
        for kind in ordering
        if parsed_title.get(kind, [])
    ]
    return ".".join(pieces)


def _get_parsed_title_dict(chunk_list, chunk_map):
    """ Group chunk values by their type.

    ``chunk_map[n]`` is the type of ``chunk_list[n]``. The result is a
    defaultdict, so looking up a type that has no chunks gives an empty list.
    """
    by_type = collections.defaultdict(list)
    for position, value in enumerate(chunk_list):
        by_type[chunk_map[position]].append(value)
    return by_type


def _guess_combined(chunk_values, chunk_map):
    """ Try to join runs of adjacent unknown chunks and parse the result.

    Two or more consecutive "unknown" chunks are joined with "." and passed
    to guess_part(). When the joined text gets a type other than "unknown",
    it replaces the whole run as a single chunk of that type.

    Note: while merging, the lists passed in are modified in place (merged
    slots are set to None); use the returned lists.

    :param chunk_values: list of chunk strings.
    :param chunk_map: list of chunk types, ``chunk_map[n]`` being the type
        of ``chunk_values[n]``.
    :return: ``(is_changed, chunk_values, chunk_map)``; ``is_changed`` is
        True when at least one run was merged.
    """
    is_changed = False
    # nothing to combine with fewer than two unknown chunks
    p_title = _get_parsed_title_dict(chunk_values, chunk_map)
    if len(p_title["unknown"]) < 2:
        return is_changed, chunk_values, chunk_map

    # i - begin of slice, j - end of slice (exclusive)
    i = 0
    # process up to second-to-last element
    while i < len(chunk_map) - 1:
        # we need slice with at least two elements
        j = i + 2
        # we need only unknown elements; stop when the slice end passes
        # the end of the list
        while set(chunk_map[i:j]) == {"unknown"} and j <= len(chunk_map):
            # create combined chunk
            cmb_chunk = ".".join(chunk_values[i:j])
            cmb_chunk_type = guess_part(cmb_chunk)

            # add new combined chunk in lists
            # first subelement gets new chunk, rest - None
            # (will be removed later)
            if cmb_chunk_type != "unknown":
                is_changed = True
                chunk_values[i] = cmb_chunk
                chunk_map[i] = cmb_chunk_type
                for idx in range(i+1, j):
                    chunk_values[idx] = None
                    chunk_map[idx] = None
                    # to start checking next chunks right after the end of slice
                    # (i ends up at j - 1, the increment below moves it to j)
                    i = idx
                break
            # try add more elements to combined chunk
            else:
                j += 1

        # start checking next value
        i += 1

    # clean up from None values left in place of merged chunks
    chunk_values = list(filter(None, chunk_values))
    chunk_map = list(filter(None, chunk_map))

    return is_changed, chunk_values, chunk_map


def parse_title(title):
    """ Split media title to typed components.

    Returns a plain dict ``{chunk_type: [chunk_value, ...]}``. When at least
    one chunk is recognized, the leading unknown chunks are reported as
    "name" and the unknown chunks right after the first "episode" chunk as
    "episode_name".
    """
    # split on separators; keep only chunks with a word character or "&"
    # (drops things like single hyphens)
    values = [
        piece
        for piece in re.split(SEPARATORS, title)
        if piece and re.search(r"(\w|&)+", piece)
    ]
    kinds = [guess_part(piece) for piece in values]

    # some tokens are split by separators (e.g. by a dot), try to glue them
    _, values, kinds = _guess_combined(values, kinds)

    # second attempt for unknown chunks: split them on hyphens, guess every
    # part on its own, then try to glue adjacent unknown parts again
    if "unknown" in kinds:
        split_values, split_kinds = [], []
        recognized = False
        for piece, kind in zip(values, kinds):
            if kind != "unknown" or "-" not in piece:
                split_values.append(piece)
                split_kinds.append(kind)
                continue
            for part in piece.split("-"):
                if not part:
                    continue
                part_kind = guess_part(part)
                recognized = recognized or part_kind != "unknown"
                split_values.append(part)
                split_kinds.append(part_kind)

        combined, split_values, split_kinds = _guess_combined(split_values, split_kinds)
        # the split version is used only if it recognized something new
        if recognized or combined:
            values, kinds = split_values, split_kinds

    # name detection and hyphen clean-up make sense only when something
    # besides unknown chunks was found
    if kinds.count("unknown") != len(values):
        # leading unknown chunks form the name
        for pos, kind in enumerate(kinds):
            if kind != "unknown":
                break
            kinds[pos] = "name"

        # unknown chunks following the episode number form the episode name
        if "episode" in kinds:
            for pos in range(kinds.index("episode") + 1, len(kinds)):
                if kinds[pos] != "unknown":
                    break
                kinds[pos] = "episode_name"

        # at last, strip leading/trailing hyphens from still-unknown chunks
        values = [
            piece.strip("-") if kind == "unknown" else piece
            for piece, kind in zip(values, kinds)
        ]

    return dict(_get_parsed_title_dict(values, kinds))


def guess_part(chunk_value):
    """ Return chunk type for given chunk value.

    The patterns from PATTERNS are tried in order, case-insensitively, and
    each must match the whole chunk; the type of the first match is returned.

    :raises RuntimeError: if no pattern matches at all.
    """
    for chunk_type, pattern in PATTERNS:
        # The non-capturing group makes the anchors apply to the whole
        # pattern: "^a|b$" would otherwise mean "starts with a" OR
        # "ends with b" for patterns with a top-level alternation.
        full_match_pat = r"^(?:" + pattern + r")$"
        if re.match(full_match_pat, chunk_value, flags=re.I):
            return chunk_type
    raise RuntimeError("unhandled pattern type")


# Run the CLI only when executed as a script; the value returned by main()
# becomes the process exit code.
if __name__ == "__main__":
    sys.exit(main())
Add python parse implementation 2022-01-03 07:56:15 +00:00			`#!/usr/bin/env python3`

			`import argparse`
			`import collections`
Add parse/rename CLI options 2022-02-09 15:08:06 +00:00			`import enum`
Add python parse implementation 2022-01-03 07:56:15 +00:00			`import logging`
			`import os`
			`import os.path`
Add parse/rename CLI options 2022-02-09 15:08:06 +00:00			`import pprint`
Add python parse implementation 2022-01-03 07:56:15 +00:00			`import re`
			`import sys`


			`PROCESSED_FILETYPES = (`
			`"mkv",`
			`"avi",`
			`"ts",`
			`)`
			`SEPARATORS = r"[() .!,_\[\]]"`
			`SEPARATORS_HYPHEN = r"[\-" + SEPARATORS[1:]`
			`LANGUAGES = r"(rus\|eng\|ukr\|jap\|ita\|chi\|kor\|ger\|fre\|spa\|pol)"`
			`PATTERNS = (`
			`("episode", r"s\d{1,2}(e\d{1,2})?"),`
			`("year", r"(19\|20)\d{2}"),`
			`("edition", r"((theatrical\|director'*s\|extended\|un)[-.]?cut"`
			`r"\|imax[-.]edition"`
			`r"\|noir[-.]edition"`
Add tests 2022-01-12 20:24:10 +00:00			`r"\|black[-.]chrome[-.]edition"`
Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00			`r"\|extended[-.]edition"`
Add support for episode names 2022-01-16 06:11:00 +00:00			`r"\|hq[-.]edition"`
Add python parse implementation 2022-01-03 07:56:15 +00:00			`r"\|theatrical)"),`
			`("restrictions", r"(unrated)"),`
			`("resolution", r"[0-9]{3,4}[pi]"),`
			`("quality", r"((blu[-.]?ray\|bd)[-.]?remux"`
			`r"\|(blu[-.]?ray\|bd\|uhd\|hd(dvd\|tv)?\|web([-.]?dl)?\|dvd)[-.]?rip"`
			`r"\|web[-.]?dl\|blu[-.]?ray\|hdtv\|hddvd\|dvd(9)?\|f-hd\|uhd\|remastered"`
			`r"\|amzn)"),`
			`("codec", r"([hx]\.?26[45]\|(mpeg4-)?avc\|hevc(10)?\|xvid\|divx)"),`
			`("hdr", r"(hdr(10)?\|10bit)"),`
			`("audio", r"%s?(dts(-es)?\|ac3\|flac\|dd5\.1\|aac2\.0\|dub-line)" % LANGUAGES),`
			`("subtitles", r"%s?sub" % LANGUAGES),`
			`("language", r"(\d{1,2}x)?%s" % LANGUAGES),`
Add support for episode names 2022-01-16 06:11:00 +00:00			`("file_extension", r"mkv\|avi"),`
Add python parse implementation 2022-01-03 07:56:15 +00:00			`("unknown", r".*")`
			`)`

Add parse/rename CLI options 2022-02-09 15:08:06 +00:00
			`# noinspection PyInterpreter`
			`class EnumAction(argparse.Action):`
			`"""`
			`Argparse action for handling Enums`
			`"""`
			`def __init__(self, **kwargs):`
			`# Pop off the type value`
			`enum_type = kwargs.pop("type", None)`

			`# Ensure an Enum subclass is provided`
			`if enum_type is None:`
			`raise ValueError("type must be assigned an Enum when using EnumAction")`
			`if not issubclass(enum_type, enum.Enum):`
			`raise TypeError("type must be an Enum when using EnumAction")`

			`# Generate choices from the Enum`
			`kwargs.setdefault("choices", tuple(e.value for e in enum_type))`

			`super(EnumAction, self).__init__(**kwargs)`

			`self._enum = enum_type`

			`def __call__(self, parser, namespace, values, option_string=None):`
			`# Convert value back into an Enum`
			`value = self._enum(values)`
			`setattr(namespace, self.dest, value)`


			`class CliAction(enum.Enum):`
			`parse = "parse"`
			`rename = "rename"`


Add python parse implementation 2022-01-03 07:56:15 +00:00			`_lg = logging.getLogger("spqr.movie-renamer")`


			`def main():`
			`parser = argparse.ArgumentParser(description="Rename media files.")`
Add parse/rename CLI options 2022-02-09 15:08:06 +00:00			`parser.add_argument("action", type=CliAction, action=EnumAction, metavar="ACTION",`
			`help="what to do with media file/directory (%(choices)s)")`
			`parser.add_argument("target", type=str, metavar="TARGET",`
Add python parse implementation 2022-01-03 07:56:15 +00:00			`help="path to the media file/directory")`
			`parser.add_argument("-v", "--verbose", action="store_true", default=False,`
			`help="verbose output")`
			`args = parser.parse_args()`

			`loglevel = logging.DEBUG if args.verbose else logging.INFO`
			`logging.basicConfig(level=loglevel)`

Add parse/rename CLI options 2022-02-09 15:08:06 +00:00			`process_path(args.action, args.target)`
Add python parse implementation 2022-01-03 07:56:15 +00:00
			`return 0`


Add parse/rename CLI options 2022-02-09 15:08:06 +00:00			`def process_path(action: CliAction, path):`
			`# process only files`
			`if os.path.isdir(path):`
			`for child_path in sorted(os.listdir(path)):`
			`process_path(action, os.path.join(path, child_path))`

			`# split filepath to dir path, title, and extension`
			`dir_path, fname = os.path.split(path)`
			`title, ext = os.path.splitext(fname)`
			`ext = ext[1:]`
			`if ext not in PROCESSED_FILETYPES:`
			`_lg.debug("Extension is not supported: %s", path)`
			`return`

			`parsed_title = parse_title(title)`
			`if action == CliAction.parse:`
			`print_parsed_title(title, parsed_title)`
			`return`

			`if action == CliAction.rename:`
			`pretty_title = generate_pretty_name(parsed_title)`
			`pretty_title += ".%s" % ext`
			`if pretty_title != fname:`
			`_lg.warning("%s -> %s", fname, pretty_title)`
			`return`


			`def print_parsed_title(title, parsed):`
			`print(title)`
			`pprint.pprint(parsed, indent=4)`
Add python parse implementation 2022-01-03 07:56:15 +00:00

Refactoring 2022-02-08 20:45:55 +00:00			`def generate_pretty_name(parsed_title):`
			`""" Create file name from parsed chunks. """`
			`chunk_order = [k for k, _ in PATTERNS]`
			`chunk_order = ["name"] + chunk_order`
			`ep_idx = chunk_order.index("episode") + 1`
			`chunk_order = chunk_order[:ep_idx] + ["episode_name"] + chunk_order[ep_idx:]`

			`result = []`
			`for chunk_type in chunk_order:`
			`if not parsed_title.get(chunk_type, []):`
			`continue`
			`result.append(".".join(parsed_title[chunk_type]))`
			`result = ".".join(result)`
			`return result`


Add support for episode names 2022-01-16 06:11:00 +00:00			`def _get_parsed_title_dict(chunk_list, chunk_map):`
Refactoring 2022-02-08 20:45:55 +00:00			`""" Get {chunk_type: [chunk_value_1, ..., chunk_value_n]} dictionary. """`
Add support for episode names 2022-01-16 06:11:00 +00:00			`p_title = collections.defaultdict(list)`
			`for idx, chunk in enumerate(chunk_list):`
			`chunk_type = chunk_map[idx]`
			`p_title[chunk_type].append(chunk)`
			`return p_title`


			`def _guess_combined(chunk_values, chunk_map):`
Refactoring 2022-02-08 20:45:55 +00:00			`""" Try to combine unknown chunks in pairs and parse them. """`
Add support for episode names 2022-01-16 06:11:00 +00:00			`is_changed = False`
			`p_title = _get_parsed_title_dict(chunk_values, chunk_map)`
			`if len(p_title["unknown"]) < 2:`
			`return is_changed, chunk_values, chunk_map`

			`# i - begin of slice, j - end of slice`
			`i = 0`
			`# process up to second-to-last element`
			`while i < len(chunk_map) - 1:`
			`# we need slice with at least two elements`
			`j = i + 2`
			`# we need only unknown elements`
			`while set(chunk_map[i:j]) == {"unknown"} and j <= len(chunk_map):`
			`# create combined chunk`
			`cmb_chunk = ".".join(chunk_values[i:j])`
			`cmb_chunk_type = guess_part(cmb_chunk)`

			`# add new combined chunk in lists`
			`# first subelement gets new chunk, rest - None`
			`# (will be removed later)`
			`if cmb_chunk_type != "unknown":`
			`is_changed = True`
			`chunk_values[i] = cmb_chunk`
			`chunk_map[i] = cmb_chunk_type`
			`for idx in range(i+1, j):`
			`chunk_values[idx] = None`
			`chunk_map[idx] = None`
			`# to start checking next chunks right after the end of slice`
			`i = idx`
			`break`
			`# try add more elements to combined chunk`
			`else:`
			`j += 1`

			`# start checking next value`
			`i += 1`

			`# clean up from None values`
			`chunk_values = list(filter(None, chunk_values))`
			`chunk_map = list(filter(None, chunk_map))`

			`return is_changed, chunk_values, chunk_map`


Add python parse implementation 2022-01-03 07:56:15 +00:00			`def parse_title(title):`
			`""" Split media title to components. """`

Add support for episode names 2022-01-16 06:11:00 +00:00			`chunk_values = filter(None, re.split(SEPARATORS, title))`
Add python parse implementation 2022-01-03 07:56:15 +00:00
Fix bug with missing ampersand 2022-01-16 06:16:28 +00:00			`# remove non-word chunks (like single hyphens), but leave ampersands (&)`
			`chunk_values = list(filter(lambda ch: re.search(r"(\w\|&)+", ch), chunk_values))`
Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00
Refactoring 2022-02-08 20:45:55 +00:00			`chunk_map = [] # list of chunk_types`
Add python parse implementation 2022-01-03 07:56:15 +00:00			`# parse each chunk`
Add support for episode names 2022-01-16 06:11:00 +00:00			`for ch_value in chunk_values:`
			`chunk_map.append(guess_part(ch_value))`

			`_, chunk_values, chunk_map = _guess_combined(chunk_values, chunk_map)`

Refactoring 2022-02-08 20:45:55 +00:00			`# try to parse unknown chunks, replacing all hyphens in them with dots`
Add support for episode names 2022-01-16 06:11:00 +00:00			`p_title = _get_parsed_title_dict(chunk_values, chunk_map)`
			`is_changed = False`
			`if p_title.get("unknown"):`
			`spl_ch_values = []`
			`spl_ch_map = []`
			`for idx, ch_value in enumerate(chunk_values):`
			`ch_type = chunk_map[idx]`
			`if ch_type == "unknown" and "-" in ch_value:`
			`spl_values = ch_value.split("-")`
			`for spl_val in spl_values:`
			`if not spl_val:`
			`continue`
			`spl_type = guess_part(spl_val)`
			`if spl_type != "unknown":`
			`is_changed = True`
			`spl_ch_values.append(spl_val)`
			`spl_ch_map.append(spl_type)`
			`else:`
			`spl_ch_values.append(ch_value)`
			`spl_ch_map.append(ch_type)`

			`is_combined, spl_ch_values, spl_ch_map = _guess_combined(spl_ch_values, spl_ch_map)`
			`if is_changed or is_combined:`
			`chunk_values = spl_ch_values`
			`chunk_map = spl_ch_map`

			`# parse name and episode name`
			`# only if there is something except unknown chunks`
			`p_title = _get_parsed_title_dict(chunk_values, chunk_map)`
			`if len(p_title["unknown"]) != len(chunk_values):`
			`idx = 0`
			`while idx < len(chunk_map) and chunk_map[idx] == "unknown":`
			`chunk_map[idx] = "name"`
			`idx += 1`
			`# if episode number is found, next unknown chunks are episode name`
			`if p_title.get("episode"):`
			`idx = chunk_map.index("episode") + 1`
			`while idx < len(chunk_map) and chunk_map[idx] == "unknown":`
			`chunk_map[idx] = "episode_name"`
			`idx += 1`

			`# at last, strip hyphens from unknown chunks`
			`# only if there is something except unknown chunks`
			`p_title = _get_parsed_title_dict(chunk_values, chunk_map)`
			`if len(p_title["unknown"]) != len(chunk_values):`
			`for idx, chunk_type in enumerate(chunk_map):`
			`if chunk_type != "unknown":`
Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00			`continue`
Add support for episode names 2022-01-16 06:11:00 +00:00			`chunk_value = chunk_values[idx]`
			`if chunk_value[0] != "-" and chunk_value[-1] != "-":`
Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00			`continue`
Add support for episode names 2022-01-16 06:11:00 +00:00			`chunk_values[idx] = chunk_value.strip("-")`
Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00
Add support for episode names 2022-01-16 06:11:00 +00:00			`p_title = _get_parsed_title_dict(chunk_values, chunk_map)`
Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00			`return dict(p_title)`
Add python parse implementation 2022-01-03 07:56:15 +00:00

Refactoring 2022-02-08 20:45:55 +00:00			`def guess_part(chunk_value):`
			`""" Return chunk type for given chunk value. """`
			`for chunk_type, pattern in PATTERNS:`
Add python parse implementation 2022-01-03 07:56:15 +00:00			`full_match_pat = r"^" + pattern + r"$"`
Refactoring 2022-02-08 20:45:55 +00:00			`if re.match(full_match_pat, chunk_value, flags=re.I):`
			`return chunk_type`
Add python parse implementation 2022-01-03 07:56:15 +00:00			`raise RuntimeError("unhandled pattern type")`


			`if __name__ == "__main__":`
			`sys.exit(main())`