Rework parsing title

Better guessing of movie name.
2022-01-09 22:09:02 +03:00
parent 368fc7e624
commit 1b3af219d2
1 changed files with 58 additions and 44 deletions
--- a/renamer.py
+++ b/renamer.py
@@ -6,7 +6,6 @@ import logging
 import os
 import os.path
 import re
 import string
 import sys
@@ -24,6 +23,7 @@ PATTERNS = (
    ("edition", r"((theatrical|director'*s|extended|un)[-.]?cut"
                r"|imax[-.]edition"
                r"|noir[-.]edition"
                r"|extended[-.]edition"
                r"|theatrical)"),
    ("restrictions", r"(unrated)"),
    ("resolution", r"[0-9]{3,4}[pi]"),
@@ -88,7 +88,7 @@ def process_file(fpath):
    chunk_order = ["name"] + chunk_order
    result = []
    for chunk_type in chunk_order:
-        if not parsed_title[chunk_type]:
+        if not parsed_title.get(chunk_type, []):
            continue
        result.append(".".join(parsed_title[chunk_type]))
    result.append(ext)
@@ -104,64 +104,78 @@ def parse_title(title):
    chunks = list(filter(None, re.split(SEPARATORS, title)))
    p_title = collections.defaultdict(list)
    # remove non-word chunks (like single hyphens)
    chunks = list(filter(lambda ch: re.search(r"\w+", ch), chunks))
    # parse each chunk
-    is_name = True
+    unknown_chunks = {}
-    for chunk in chunks:
+    for idx, chunk in enumerate(chunks):
        pat_type = guess_part(chunk)
-        # consider chunk as part of the name until meta info is found
+        if pat_type != "unknown":
-        if is_name:
+            p_title[pat_type].append(chunk)
-            if pat_type == "unknown":
+        else:
-                pat_type = "name"
+            unknown_chunks[idx] = chunk
            else:
                is_name = False
        p_title[pat_type].append(chunk)
    # if name is the only thing we have, then we parsed nothing
    if is_name:
        p_title["unknown"] = p_title["name"]
        del p_title["name"]
    # remove unknown chunks without alphanumeric characters (like single hyphens)
    u_chunks = p_title.get("unknown", [])
    clean_u_chunks = []
    for u_chunk in u_chunks:
        acceptable_chars = set(string.digits + string.ascii_lowercase)
        if set(u_chunk.lower()) & acceptable_chars:
            clean_u_chunks.append(u_chunk)
    p_title["unknown"] = clean_u_chunks
    # try to combine unknown chunks in pairs and parse them
-    u_chunks = p_title.get("unknown", [])
+    if len(unknown_chunks) > 1:
-    if len(u_chunks) > 1:
+        prev_idx = -1
-        i = 0
+        for idx in sorted(unknown_chunks.keys()):
        while i < (len(u_chunks) - 1):
            # create combined chunk
            cmb_chunk = ".".join(u_chunks[i:i+2])
            cmb_chunk_type = guess_part(cmb_chunk)
-            # go to next pair if nothing
+            # first unknown chunk, skip
-            if cmb_chunk_type == "unknown":
+            if prev_idx < 0:
-                i += 1
+                prev_idx = idx
                continue
            # previous unknown chunk is not adjacent to the current one, skip
            if (prev_idx + 1) != idx:
                prev_idx = idx
                continue
-            # if combined chunk matches pattern, add to found type
+            # create combined chunk
-            # and remove from unknown its parts
+            cmb_chunk = ".".join([unknown_chunks[prev_idx], unknown_chunks[idx]])
            cmb_chunk_type = guess_part(cmb_chunk)
            # check next pair if nothing
            if cmb_chunk_type == "unknown":
                prev_idx = idx
                continue
            # if combined chunk matches pattern, add it to found type
            # and remove from unknown chunks its parts
            p_title[cmb_chunk_type].append(cmb_chunk)
-            del u_chunks[i:i+2]
+            del unknown_chunks[prev_idx]
            del unknown_chunks[idx]
            prev_idx = -1
    # try to parse unknown chunks, replacing all hyphens in them with dots
-    u_chunks = p_title.get("unknown", [])
+    if unknown_chunks:
-    if u_chunks:
+        # create string from unknown_chunks with dots instead of hyphens
-        # create string from u_chunks with dots instead of hyphens
+        u_chunks_str = ".".join(unknown_chunks.values())
-        uc_title = ".".join(filter(None, re.split(SEPARATORS_HYPHEN, ".".join(u_chunks))))
+        uc_title = ".".join(filter(None, re.split(SEPARATORS_HYPHEN, u_chunks_str)))
        # recursion exit condition
        if uc_title != title:
            p_uc_title = parse_title(uc_title)
-            # if parsed uc_title has something else than "unknown", update p_title
+            # if parsed uc_title has anything but "unknown", update p_title
            if list(p_uc_title.keys()) != ["unknown"]:
                p_title.update(p_uc_title)
                # unknown_chunks should be cleared,
                # because it was processed in nested function call
                unknown_chunks = {}
-    return p_title
+    # cut name from unknown chunks
    # name is the first n consecutive chunks
    # only if the number of unknown chunks differs from the overall number of chunks
    if len(unknown_chunks) != len(chunks):
        i = 0
        for idx in sorted(unknown_chunks.keys()):
            if idx != i:
                break
            p_title["name"].append(unknown_chunks[idx])
            del unknown_chunks[idx]
            i += 1
    for idx in sorted(unknown_chunks.keys()):
        p_title["unknown"].append(unknown_chunks[idx])
    return dict(p_title)
 def guess_part(fname_part):