From 1b3af219d2f858d12f706fecf2235fd9a0981234 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 9 Jan 2022 22:09:02 +0300 Subject: [PATCH] Rework parsing title Better guessing of movie name. --- renamer.py | 102 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 58 insertions(+), 44 deletions(-) diff --git a/renamer.py b/renamer.py index eb03b71..42362f8 100755 --- a/renamer.py +++ b/renamer.py @@ -6,7 +6,6 @@ import logging import os import os.path import re -import string import sys @@ -24,6 +23,7 @@ PATTERNS = ( ("edition", r"((theatrical|director'*s|extended|un)[-.]?cut" r"|imax[-.]edition" r"|noir[-.]edition" + r"|extended[-.]edition" r"|theatrical)"), ("restrictions", r"(unrated)"), ("resolution", r"[0-9]{3,4}[pi]"), @@ -88,7 +88,7 @@ def process_file(fpath): chunk_order = ["name"] + chunk_order result = [] for chunk_type in chunk_order: - if not parsed_title[chunk_type]: + if not parsed_title.get(chunk_type, []): continue result.append(".".join(parsed_title[chunk_type])) result.append(ext) @@ -104,64 +104,78 @@ def parse_title(title): chunks = list(filter(None, re.split(SEPARATORS, title))) p_title = collections.defaultdict(list) + # remove non-word chunks (like single hyphens) + chunks = list(filter(lambda ch: re.search(r"\w+", ch), chunks)) + # parse each chunk - is_name = True - for chunk in chunks: + unknown_chunks = {} + for idx, chunk in enumerate(chunks): pat_type = guess_part(chunk) - # consider chunk as part of the name until meta info is found - if is_name: - if pat_type == "unknown": - pat_type = "name" - else: - is_name = False - p_title[pat_type].append(chunk) - - # if name is the only thing we have, then we parsed nothing - if is_name: - p_title["unknown"] = p_title["name"] - del p_title["name"] - - # remove unknown chunks without alphanumerals (like single hyphens) - u_chunks = p_title.get("unknown", []) - clean_u_chunks = [] - for u_chunk in u_chunks: - acceptable_chars = set(string.digits + string.ascii_lowercase) - if set(u_chunk.lower()) & acceptable_chars: - clean_u_chunks.append(u_chunk) - p_title["unknown"] = clean_u_chunks + if pat_type != "unknown": + p_title[pat_type].append(chunk) + else: + unknown_chunks[idx] = chunk # try to combine unknown chunks in pairs and parse them - u_chunks = p_title.get("unknown", []) - if len(u_chunks) > 1: - i = 0 - while i < (len(u_chunks) - 1): - # create combined chunk - cmb_chunk = ".".join(u_chunks[i:i+2]) - cmb_chunk_type = guess_part(cmb_chunk) + if len(unknown_chunks) > 1: + prev_idx = -1 + for idx in sorted(unknown_chunks.keys()): - # go to next pair if nothing - if cmb_chunk_type == "unknown": - i += 1 + # first unknown chunk, skip + if prev_idx < 0: + prev_idx = idx + continue + # previous unknown chunk does not border with current, skip + if (prev_idx + 1) != idx: + prev_idx = idx continue - # if combined chunk matches pattern, add to found type - # and remove from unknown its parts + # create combined chunk + cmb_chunk = ".".join([unknown_chunks[prev_idx], unknown_chunks[idx]]) + cmb_chunk_type = guess_part(cmb_chunk) + + # check next pair if nothing + if cmb_chunk_type == "unknown": + prev_idx = idx + continue + + # if combined chunk matches pattern, add it to found type + # and remove from unknown chunks its parts p_title[cmb_chunk_type].append(cmb_chunk) - del u_chunks[i:i+2] + del unknown_chunks[prev_idx] + del unknown_chunks[idx] + prev_idx = -1 # try to parse unknown chunks, replacing all hyphens in them with dots - u_chunks = p_title.get("unknown", []) - if u_chunks: - # create string from u_chunks with dots instead of hyphens - uc_title = ".".join(filter(None, re.split(SEPARATORS_HYPHEN, ".".join(u_chunks)))) + if unknown_chunks: + # create string from unknown_chunks with dots instead of hyphens + u_chunks_str = ".".join(unknown_chunks.values()) + uc_title = ".".join(filter(None, re.split(SEPARATORS_HYPHEN, u_chunks_str))) # recursion exit condition if uc_title != title: p_uc_title = parse_title(uc_title) - # if parsed uc_title has something else than "unknown", update p_title + # if parsed uc_title has smth else than "unknown", update p_title if list(p_uc_title.keys()) != ["unknown"]: p_title.update(p_uc_title) + # unknown_chunks should be cleared, + # because it was processed in nested function call + unknown_chunks = {} - return p_title + # cut name from unknown chunks + # name is the first n consequent chunks + # only if amount of unknown chunks differs from overall amount of chunks + if len(unknown_chunks) != len(chunks): + i = 0 + for idx in sorted(unknown_chunks.keys()): + if idx != i: + break + p_title["name"].append(unknown_chunks[idx]) + del unknown_chunks[idx] + i += 1 + + for idx in sorted(unknown_chunks.keys()): + p_title["unknown"].append(unknown_chunks[idx]) + return dict(p_title) def guess_part(fname_part):