movie-renamer/renamer.py

#!/usr/bin/env python3

import argparse
import collections
import logging
import os
import os.path
import re
import sys


# File extensions (without the leading dot) that process_file() accepts;
# files with any other extension are skipped.
PROCESSED_FILETYPES = (
    "mkv",
    "avi",
    "ts",
)
# Regex character class of the separators used to split a title into chunks.
SEPARATORS = r"[() .!,_\[\]]"
# The same character class with the hyphen added: the opening "[" of
# SEPARATORS is replaced by "[\-".
SEPARATORS_HYPHEN = r"[\-" + SEPARATORS[1:]
# Alternation of three-letter language codes, interpolated into the "audio",
# "subtitles" and "language" patterns below.
LANGUAGES = r"(rus|eng|ukr|jap|ita|chi|kor|ger|fre|spa|pol)"
# Ordered (chunk type, regex) pairs.
# guess_part() tries them top to bottom against a whole chunk, ignoring case,
# and returns the first type that matches, so the catch-all "unknown" entry
# has to stay last. process_file() also writes the recognised chunk groups in
# this order, after the "name" group.
PATTERNS = (
    ("episode", r"s\d{1,2}(e\d{1,2})?"),
    ("year", r"(19|20)\d{2}"),
    ("edition", r"((theatrical|director'*s|extended|un)[-.]?cut"
                r"|imax[-.]edition"
                r"|noir[-.]edition"
                r"|black[-.]chrome[-.]edition"
                r"|extended[-.]edition"
                r"|theatrical)"),
    ("restrictions", r"(unrated)"),
    ("resolution", r"[0-9]{3,4}[pi]"),
    ("quality", r"((blu[-.]?ray|bd)[-.]?remux"
                r"|(blu[-.]?ray|bd|uhd|hd(dvd|tv)?|web([-.]?dl)?|dvd)[-.]?rip"
                r"|web[-.]?dl|blu[-.]?ray|hdtv|hddvd|dvd(9)?|f-hd|uhd|remastered"
                r"|amzn)"),
    ("codec", r"([hx]\.?26[45]|(mpeg4-)?avc|hevc(10)?|xvid|divx)"),
    ("hdr", r"(hdr(10)?|10bit)"),
    # an optional language code may be glued in front of the audio format
    ("audio", r"%s?(dts(-es)?|ac3|flac|dd5\.1|aac2\.0|dub-line)" % LANGUAGES),
    ("subtitles", r"%s?sub" % LANGUAGES),
    # optional "<N>x" prefix followed by a language code
    ("language", r"(\d{1,2}x)?%s" % LANGUAGES),
    # catch-all entry for chunks that match nothing above
    ("unknown", r".*")
)

# Module-level logger used by the functions in this file.
_lg = logging.getLogger("spqr.movie-renamer")


def main():
    """Command-line entry point.

    Parses the positional ``target`` path and the ``-v/--verbose`` flag,
    configures logging (DEBUG when verbose, INFO otherwise) and hands the
    target to process_dir() if it is a directory, to process_file()
    otherwise.

    :return: always 0
    """
    cli = argparse.ArgumentParser(description="Rename media files.")
    cli.add_argument(
        "target",
        type=str,
        help="path to the media file/directory",
    )
    cli.add_argument(
        "-v", "--verbose",
        action="store_true",
        default=False,
        help="verbose output",
    )
    options = cli.parse_args()

    if options.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(level=level)

    # pick the handler by target kind, then run it on the target
    handler = process_dir if os.path.isdir(options.target) else process_file
    handler(options.target)

    return 0


def process_dir(dir_path):
    """Run process_file() on every entry listed directly in *dir_path*.

    The directory is listed once with os.listdir(); the listing is not
    recursive, and each entry name is joined with *dir_path* before being
    passed on.
    """
    entry_paths = (
        os.path.join(dir_path, entry_name)
        for entry_name in os.listdir(dir_path)
    )
    for entry_path in entry_paths:
        process_file(entry_path)


def process_file(fpath):
    """Build the normalised name for one media file and log the rename.

    Paths that are not regular files, and files whose extension is not in
    PROCESSED_FILETYPES, are skipped with a DEBUG message. The extension is
    compared case-insensitively, so "Movie.MKV" is handled like "Movie.mkv";
    the extension is written to the new name exactly as it was found.

    No file is renamed here: when the computed name differs from the current
    one, the "old -> new" pair is logged at WARNING level.

    :param fpath: path to the candidate file
    :return: None
    """
    # process only files
    if not os.path.isfile(fpath):
        _lg.debug("Not a file: %s", fpath)
        return

    # split the file name into title and extension (without the dot)
    fname = os.path.basename(fpath)
    title, ext = os.path.splitext(fname)
    ext = ext[1:]
    # PROCESSED_FILETYPES is lower-case, so normalise before the lookup
    if ext.lower() not in PROCESSED_FILETYPES:
        _lg.debug("Extension is not supported: %s", fpath)
        return

    parsed_title = parse_title(title)

    # create file name from parsed chunks:
    # "name" goes first, then the recognised groups in PATTERNS order
    chunk_order = ["name"] + [chunk_type for chunk_type, _ in PATTERNS]
    parts = [
        ".".join(parsed_title[chunk_type])
        for chunk_type in chunk_order
        if parsed_title.get(chunk_type)
    ]
    parts.append(ext)
    result = ".".join(parts)

    if result != fname:
        _lg.warning("%s -> %s", fname, result)


def parse_title(title):
    """Split a media title into typed components.

    The title is cut on SEPARATORS and every chunk is classified with
    guess_part(). Chunks that stay "unknown" get further passes:

    1. two adjacent unknown chunks are joined with "." and classified again
       (e.g. "DD5" + "1" -> "DD5.1", which matches the "audio" pattern);
    2. the remaining unknown chunks are joined, split again on
       SEPARATORS_HYPHEN (hyphens become separators too) and parsed by a
       recursive call; its result is merged in if it found anything other
       than "unknown";
    3. the run of unknown chunks starting at chunk 0 becomes "name", unless
       every chunk of the title is unknown.

    :param title: file name without its extension
    :return: plain dict mapping a chunk type ("name", a PATTERNS type, or
             "unknown") to the list of chunks of that type; types for which
             nothing was found are absent
    """

    chunks = list(filter(None, re.split(SEPARATORS, title)))
    p_title = collections.defaultdict(list)

    # remove non-word chunks (like single hyphens)
    chunks = [ch for ch in chunks if re.search(r"\w+", ch)]

    # parse each chunk; unknown ones are kept by position so that adjacency
    # and the "leading run" can be checked later
    unknown_chunks = {}
    for idx, chunk in enumerate(chunks):
        pat_type = guess_part(chunk)
        if pat_type != "unknown":
            p_title[pat_type].append(chunk)
        else:
            unknown_chunks[idx] = chunk

    # try to combine unknown chunks in pairs and parse them
    if len(unknown_chunks) > 1:
        prev_idx = -1
        # sorted() returns a new list, so deleting from the dict is safe here
        for idx in sorted(unknown_chunks):

            # first unknown chunk, skip
            if prev_idx < 0:
                prev_idx = idx
                continue
            # previous unknown chunk does not border with current, skip
            if (prev_idx + 1) != idx:
                prev_idx = idx
                continue

            # create combined chunk
            cmb_chunk = ".".join([unknown_chunks[prev_idx], unknown_chunks[idx]])
            cmb_chunk_type = guess_part(cmb_chunk)

            # check next pair if nothing
            if cmb_chunk_type == "unknown":
                prev_idx = idx
                continue

            # if combined chunk matches pattern, add it to found type
            # and remove from unknown chunks its parts
            p_title[cmb_chunk_type].append(cmb_chunk)
            del unknown_chunks[prev_idx]
            del unknown_chunks[idx]
            # both halves are consumed; start over with the next unknown chunk
            prev_idx = -1

    # try to parse unknown chunks, replacing all hyphens in them with dots
    if unknown_chunks:
        # create string from unknown_chunks with dots instead of hyphens
        u_chunks_str = ".".join(unknown_chunks.values())
        uc_title = ".".join(filter(None, re.split(SEPARATORS_HYPHEN, u_chunks_str)))
        # recursion exit condition
        if uc_title != title:
            p_uc_title = parse_title(uc_title)
            # if parsed uc_title has smth else than "unknown", merge it in
            if list(p_uc_title.keys()) != ["unknown"]:
                # extend instead of dict.update(): update() would replace the
                # chunks already collected for a type found by both passes
                # (e.g. "Movie.x264.h265-GRP" would lose "x264")
                for chunk_type, found in p_uc_title.items():
                    p_title[chunk_type].extend(found)
                # unknown_chunks should be cleared,
                # because it was processed in nested function call
                unknown_chunks = {}

    # cut name from unknown chunks
    # name is the first n consecutive chunks
    # only if amount of unknown chunks differs from overall amount of chunks
    if len(unknown_chunks) != len(chunks):
        i = 0
        for idx in sorted(unknown_chunks):
            if idx != i:
                break
            p_title["name"].append(unknown_chunks[idx])
            del unknown_chunks[idx]
            i += 1

    # whatever is left stays "unknown", in its original order
    for idx in sorted(unknown_chunks):
        p_title["unknown"].append(unknown_chunks[idx])
    return dict(p_title)


def guess_part(fname_part):
    """Return the type of the first PATTERNS entry that matches the chunk.

    Patterns are tried in PATTERNS order and must match the whole chunk,
    ignoring case.

    :param fname_part: one chunk of a file title
    :return: the chunk type name of the first matching pattern
    :raises RuntimeError: if no pattern matches at all; not expected while
                          PATTERNS ends with its catch-all entry
    """
    for pat_type, pattern in PATTERNS:
        # fullmatch() anchors the whole pattern to the whole chunk; unlike
        # "^...$" with match() it does not tolerate a trailing newline.
        # DOTALL lets the catch-all ".*" cover chunks containing a newline,
        # so they are reported as "unknown" instead of raising below.
        if re.fullmatch(pattern, fname_part, flags=re.I | re.S):
            return pat_type
    raise RuntimeError("unhandled pattern type")


# Run main() only when executed as a script and use its return value as the
# process exit status.
if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)
Add python parse implementation 2022-01-03 07:56:15 +00:00			`#!/usr/bin/env python3`

			`import argparse`
			`import collections`
			`import logging`
			`import os`
			`import os.path`
			`import re`
			`import sys`


			`PROCESSED_FILETYPES = (`
			`"mkv",`
			`"avi",`
			`"ts",`
			`)`
			`SEPARATORS = r"[() .!,_\[\]]"`
			`SEPARATORS_HYPHEN = r"[\-" + SEPARATORS[1:]`
			`LANGUAGES = r"(rus\|eng\|ukr\|jap\|ita\|chi\|kor\|ger\|fre\|spa\|pol)"`
			`PATTERNS = (`
			`("episode", r"s\d{1,2}(e\d{1,2})?"),`
			`("year", r"(19\|20)\d{2}"),`
			`("edition", r"((theatrical\|director'*s\|extended\|un)[-.]?cut"`
			`r"\|imax[-.]edition"`
			`r"\|noir[-.]edition"`
Add tests 2022-01-12 20:24:10 +00:00			`r"\|black[-.]chrome[-.]edition"`
Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00			`r"\|extended[-.]edition"`
Add python parse implementation 2022-01-03 07:56:15 +00:00			`r"\|theatrical)"),`
			`("restrictions", r"(unrated)"),`
			`("resolution", r"[0-9]{3,4}[pi]"),`
			`("quality", r"((blu[-.]?ray\|bd)[-.]?remux"`
			`r"\|(blu[-.]?ray\|bd\|uhd\|hd(dvd\|tv)?\|web([-.]?dl)?\|dvd)[-.]?rip"`
			`r"\|web[-.]?dl\|blu[-.]?ray\|hdtv\|hddvd\|dvd(9)?\|f-hd\|uhd\|remastered"`
			`r"\|amzn)"),`
			`("codec", r"([hx]\.?26[45]\|(mpeg4-)?avc\|hevc(10)?\|xvid\|divx)"),`
			`("hdr", r"(hdr(10)?\|10bit)"),`
			`("audio", r"%s?(dts(-es)?\|ac3\|flac\|dd5\.1\|aac2\.0\|dub-line)" % LANGUAGES),`
			`("subtitles", r"%s?sub" % LANGUAGES),`
			`("language", r"(\d{1,2}x)?%s" % LANGUAGES),`
			`("unknown", r".*")`
			`)`

			`_lg = logging.getLogger("spqr.movie-renamer")`


			`def main():`
			`parser = argparse.ArgumentParser(description="Rename media files.")`
			`parser.add_argument("target", type=str,`
			`help="path to the media file/directory")`
			`parser.add_argument("-v", "--verbose", action="store_true", default=False,`
			`help="verbose output")`
			`args = parser.parse_args()`

			`loglevel = logging.DEBUG if args.verbose else logging.INFO`
			`logging.basicConfig(level=loglevel)`

			`if os.path.isdir(args.target):`
			`process_dir(args.target)`
			`else:`
			`process_file(args.target)`

			`return 0`


			`def process_dir(dir_path):`
			`for fname in os.listdir(dir_path):`
			`fpath = os.path.join(dir_path, fname)`
			`process_file(fpath)`


			`def process_file(fpath):`
			`# process only files`
			`if not os.path.isfile(fpath):`
			`_lg.debug("Not a file: %s", fpath)`
			`return`

			`# split filepath to dir path, title, and extension`
			`dir_path, fname = os.path.split(fpath)`
			`title, ext = os.path.splitext(fname)`
			`ext = ext[1:]`
			`if ext not in PROCESSED_FILETYPES:`
			`_lg.debug("Extension is not supported: %s", fpath)`
			`return`

			`parsed_title = parse_title(title)`

			`# create file name from parsed chunks`
			`chunk_order = [k for k, _ in PATTERNS]`
			`chunk_order = ["name"] + chunk_order`
			`result = []`
			`for chunk_type in chunk_order:`
Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00			`if not parsed_title.get(chunk_type, []):`
Add python parse implementation 2022-01-03 07:56:15 +00:00			`continue`
			`result.append(".".join(parsed_title[chunk_type]))`
			`result.append(ext)`
			`result = ".".join(result)`

			`if result != fname:`
			`_lg.warning("%s -> %s", fname, result)`


			`def parse_title(title):`
			`""" Split media title to components. """`

			`chunks = list(filter(None, re.split(SEPARATORS, title)))`
			`p_title = collections.defaultdict(list)`

Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00			`# remove non-word chunks (like single hyphens)`
			`chunks = list(filter(lambda ch: re.search(r"\w+", ch), chunks))`

Add python parse implementation 2022-01-03 07:56:15 +00:00			`# parse each chunk`
Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00			`unknown_chunks = {}`
			`for idx, chunk in enumerate(chunks):`
Add python parse implementation 2022-01-03 07:56:15 +00:00			`pat_type = guess_part(chunk)`
Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00			`if pat_type != "unknown":`
			`p_title[pat_type].append(chunk)`
			`else:`
			`unknown_chunks[idx] = chunk`
Add python parse implementation 2022-01-03 07:56:15 +00:00
			`# try to combine unknown chunks in pairs and parse them`
Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00			`if len(unknown_chunks) > 1:`
			`prev_idx = -1`
			`for idx in sorted(unknown_chunks.keys()):`

			`# first unknown chunk, skip`
			`if prev_idx < 0:`
			`prev_idx = idx`
			`continue`
			`# previous unknown chunk does not border with current, skip`
			`if (prev_idx + 1) != idx:`
			`prev_idx = idx`
			`continue`

Add python parse implementation 2022-01-03 07:56:15 +00:00			`# create combined chunk`
Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00			`cmb_chunk = ".".join([unknown_chunks[prev_idx], unknown_chunks[idx]])`
Add python parse implementation 2022-01-03 07:56:15 +00:00			`cmb_chunk_type = guess_part(cmb_chunk)`

Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00			`# check next pair if nothing`
Add python parse implementation 2022-01-03 07:56:15 +00:00			`if cmb_chunk_type == "unknown":`
Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00			`prev_idx = idx`
Add python parse implementation 2022-01-03 07:56:15 +00:00			`continue`

Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00			`# if combined chunk matches pattern, add it to found type`
			`# and remove from unknown chunks its parts`
Add python parse implementation 2022-01-03 07:56:15 +00:00			`p_title[cmb_chunk_type].append(cmb_chunk)`
Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00			`del unknown_chunks[prev_idx]`
			`del unknown_chunks[idx]`
			`prev_idx = -1`
Add python parse implementation 2022-01-03 07:56:15 +00:00
			`# try to parse unknown chunks, replacing all hyphens in them with dots`
Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00			`if unknown_chunks:`
			`# create string from unknown_chunks with dots instead of hyphens`
			`u_chunks_str = ".".join(unknown_chunks.values())`
			`uc_title = ".".join(filter(None, re.split(SEPARATORS_HYPHEN, u_chunks_str)))`
Add python parse implementation 2022-01-03 07:56:15 +00:00			`# recursion exit condition`
			`if uc_title != title:`
			`p_uc_title = parse_title(uc_title)`
Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00			`# if parsed uc_title has smth else than "unknown", update p_title`
Add python parse implementation 2022-01-03 07:56:15 +00:00			`if list(p_uc_title.keys()) != ["unknown"]:`
			`p_title.update(p_uc_title)`
Rework parsing title Better guessing of movie name. 2022-01-09 19:09:02 +00:00			`# unknown_chunks should be cleared,`
			`# because it was processed in nested function call`
			`unknown_chunks = {}`

			`# cut name from unknown chunks`
			`# name is the first n consequent chunks`
			`# only if amount of unknown chunks differs from overall amount of chunks`
			`if len(unknown_chunks) != len(chunks):`
			`i = 0`
			`for idx in sorted(unknown_chunks.keys()):`
			`if idx != i:`
			`break`
			`p_title["name"].append(unknown_chunks[idx])`
			`del unknown_chunks[idx]`
			`i += 1`

			`for idx in sorted(unknown_chunks.keys()):`
			`p_title["unknown"].append(unknown_chunks[idx])`
			`return dict(p_title)`
Add python parse implementation 2022-01-03 07:56:15 +00:00

			`def guess_part(fname_part):`
			`for pat_type, pattern in PATTERNS:`
			`full_match_pat = r"^" + pattern + r"$"`
			`if re.match(full_match_pat, fname_part, flags=re.I):`
			`return pat_type`
			`raise RuntimeError("unhandled pattern type")`


			`if __name__ == "__main__":`
			`sys.exit(main())`