Rework parsing title

Better guessing of movie name.
2022-01-09 22:09:02 +03:00
parent 368fc7e624
commit 1b3af219d2
1 changed files with 58 additions and 44 deletions
--- a/renamer.py
+++ b/renamer.py
@@ -6,7 +6,6 @@ import logging
 import os
 import os.path
 import re
-import string
 import sys


@@ -24,6 +23,7 @@ PATTERNS = (
    ("edition", r"((theatrical|director'*s|extended|un)[-.]?cut"
                r"|imax[-.]edition"
                r"|noir[-.]edition"
+                r"|extended[-.]edition"
                r"|theatrical)"),
    ("restrictions", r"(unrated)"),
    ("resolution", r"[0-9]{3,4}[pi]"),
@@ -88,7 +88,7 @@ def process_file(fpath):
    chunk_order = ["name"] + chunk_order
    result = []
    for chunk_type in chunk_order:
-        if not parsed_title[chunk_type]:
+        if not parsed_title.get(chunk_type, []):
            continue
        result.append(".".join(parsed_title[chunk_type]))
    result.append(ext)
@@ -104,64 +104,78 @@ def parse_title(title):
    chunks = list(filter(None, re.split(SEPARATORS, title)))
    p_title = collections.defaultdict(list)

+    # remove non-word chunks (like single hyphens)
+    chunks = list(filter(lambda ch: re.search(r"\w+", ch), chunks))
+
    # parse each chunk
-    is_name = True
-    for chunk in chunks:
+    unknown_chunks = {}
+    for idx, chunk in enumerate(chunks):
        pat_type = guess_part(chunk)
-        # consider chunk as part of the name until meta info is found
-        if is_name:
-            if pat_type == "unknown":
-                pat_type = "name"
-            else:
-                is_name = False
-        p_title[pat_type].append(chunk)
-
-    # if name is the only thing we have, then we parsed nothing
-    if is_name:
-        p_title["unknown"] = p_title["name"]
-        del p_title["name"]
-
-    # remove unknown chunks without alphanumerals (like single hyphens)
-    u_chunks = p_title.get("unknown", [])
-    clean_u_chunks = []
-    for u_chunk in u_chunks:
-        acceptable_chars = set(string.digits + string.ascii_lowercase)
-        if set(u_chunk.lower()) & acceptable_chars:
-            clean_u_chunks.append(u_chunk)
-    p_title["unknown"] = clean_u_chunks
+        if pat_type != "unknown":
+            p_title[pat_type].append(chunk)
+        else:
+            unknown_chunks[idx] = chunk

    # try to combine unknown chunks in pairs and parse them
-    u_chunks = p_title.get("unknown", [])
-    if len(u_chunks) > 1:
-        i = 0
-        while i < (len(u_chunks) - 1):
-            # create combined chunk
-            cmb_chunk = ".".join(u_chunks[i:i+2])
-            cmb_chunk_type = guess_part(cmb_chunk)
+    if len(unknown_chunks) > 1:
+        prev_idx = -1
+        for idx in sorted(unknown_chunks.keys()):

-            # go to next pair if nothing
-            if cmb_chunk_type == "unknown":
-                i += 1
+            # first unknown chunk, skip
+            if prev_idx < 0:
+                prev_idx = idx
+                continue
+            # previous unknown chunk does not border with current, skip
+            if (prev_idx + 1) != idx:
+                prev_idx = idx
                continue

-            # if combined chunk matches pattern, add to found type
-            # and remove from unknown its parts
+            # create combined chunk
+            cmb_chunk = ".".join([unknown_chunks[prev_idx], unknown_chunks[idx]])
+            cmb_chunk_type = guess_part(cmb_chunk)
+
+            # check next pair if nothing
+            if cmb_chunk_type == "unknown":
+                prev_idx = idx
+                continue
+
+            # if combined chunk matches pattern, add it to found type
+            # and remove from unknown chunks its parts
            p_title[cmb_chunk_type].append(cmb_chunk)
-            del u_chunks[i:i+2]
+            del unknown_chunks[prev_idx]
+            del unknown_chunks[idx]
+            prev_idx = -1

    # try to parse unknown chunks, replacing all hyphens in them with dots
-    u_chunks = p_title.get("unknown", [])
-    if u_chunks:
-        # create string from u_chunks with dots instead of hyphens
-        uc_title = ".".join(filter(None, re.split(SEPARATORS_HYPHEN, ".".join(u_chunks))))
+    if unknown_chunks:
+        # create string from unknown_chunks with dots instead of hyphens
+        u_chunks_str = ".".join(unknown_chunks.values())
+        uc_title = ".".join(filter(None, re.split(SEPARATORS_HYPHEN, u_chunks_str)))
        # recursion exit condition
        if uc_title != title:
            p_uc_title = parse_title(uc_title)
-            # if parsed uc_title has something else than "unknown", update p_title
+            # if parsed uc_title has smth else than "unknown", update p_title
            if list(p_uc_title.keys()) != ["unknown"]:
                p_title.update(p_uc_title)
+                # unknown_chunks should be cleared,
+                # because it was processed in nested function call
+                unknown_chunks = {}

-    return p_title
+    # cut name from unknown chunks
+    # name is the first n consecutive chunks
+    # only if amount of unknown chunks differs from overall amount of chunks
+    if len(unknown_chunks) != len(chunks):
+        i = 0
+        for idx in sorted(unknown_chunks.keys()):
+            if idx != i:
+                break
+            p_title["name"].append(unknown_chunks[idx])
+            del unknown_chunks[idx]
+            i += 1
+
+    for idx in sorted(unknown_chunks.keys()):
+        p_title["unknown"].append(unknown_chunks[idx])
+    return dict(p_title)


 def guess_part(fname_part):