From 1b3af219d2f858d12f706fecf2235fd9a0981234 Mon Sep 17 00:00:00 2001
From: Maks Snegov <snegov@spqr.link>
Date: Sun, 9 Jan 2022 22:09:02 +0300
Subject: [PATCH] Rework parsing title

Better guessing of movie name.
---
 renamer.py | 102 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 58 insertions(+), 44 deletions(-)

diff --git a/renamer.py b/renamer.py
index eb03b71..42362f8 100755
--- a/renamer.py
+++ b/renamer.py
@@ -6,7 +6,6 @@ import logging
 import os
 import os.path
 import re
-import string
 import sys
 
 
@@ -24,6 +23,7 @@ PATTERNS = (
     ("edition", r"((theatrical|director'*s|extended|un)[-.]?cut"
                 r"|imax[-.]edition"
                 r"|noir[-.]edition"
+                r"|extended[-.]edition"
                 r"|theatrical)"),
     ("restrictions", r"(unrated)"),
     ("resolution", r"[0-9]{3,4}[pi]"),
@@ -88,7 +88,7 @@ def process_file(fpath):
     chunk_order = ["name"] + chunk_order
     result = []
     for chunk_type in chunk_order:
-        if not parsed_title[chunk_type]:
+        if not parsed_title.get(chunk_type, []):
             continue
         result.append(".".join(parsed_title[chunk_type]))
     result.append(ext)
@@ -104,64 +104,78 @@ def parse_title(title):
     chunks = list(filter(None, re.split(SEPARATORS, title)))
     p_title = collections.defaultdict(list)
 
+    # remove non-word chunks (like single hyphens)
+    chunks = list(filter(lambda ch: re.search(r"\w+", ch), chunks))
+
     # parse each chunk
-    is_name = True
-    for chunk in chunks:
+    unknown_chunks = {}
+    for idx, chunk in enumerate(chunks):
         pat_type = guess_part(chunk)
-        # consider chunk as part of the name until meta info is found
-        if is_name:
-            if pat_type == "unknown":
-                pat_type = "name"
-            else:
-                is_name = False
-        p_title[pat_type].append(chunk)
-
-    # if name is the only thing we have, then we parsed nothing
-    if is_name:
-        p_title["unknown"] = p_title["name"]
-        del p_title["name"]
-
-    # remove unknown chunks without alphanumerals (like single hyphens)
-    u_chunks = p_title.get("unknown", [])
-    clean_u_chunks = []
-    for u_chunk in u_chunks:
-        acceptable_chars = set(string.digits + string.ascii_lowercase)
-        if set(u_chunk.lower()) & acceptable_chars:
-            clean_u_chunks.append(u_chunk)
-    p_title["unknown"] = clean_u_chunks
+        if pat_type != "unknown":
+            p_title[pat_type].append(chunk)
+        else:
+            unknown_chunks[idx] = chunk
 
     # try to combine unknown chunks in pairs and parse them
-    u_chunks = p_title.get("unknown", [])
-    if len(u_chunks) > 1:
-        i = 0
-        while i < (len(u_chunks) - 1):
-            # create combined chunk
-            cmb_chunk = ".".join(u_chunks[i:i+2])
-            cmb_chunk_type = guess_part(cmb_chunk)
+    if len(unknown_chunks) > 1:
+        prev_idx = -1
+        for idx in sorted(unknown_chunks.keys()):
 
-            # go to next pair if nothing
-            if cmb_chunk_type == "unknown":
-                i += 1
+            # first unknown chunk, skip
+            if prev_idx < 0:
+                prev_idx = idx
+                continue
+            # previous unknown chunk is not adjacent to the current one, skip
+            if (prev_idx + 1) != idx:
+                prev_idx = idx
                 continue
 
-            # if combined chunk matches pattern, add to found type
-            # and remove from unknown its parts
+            # create combined chunk
+            cmb_chunk = ".".join([unknown_chunks[prev_idx], unknown_chunks[idx]])
+            cmb_chunk_type = guess_part(cmb_chunk)
+
+            # check next pair if nothing
+            if cmb_chunk_type == "unknown":
+                prev_idx = idx
+                continue
+
+            # if combined chunk matches pattern, add it to found type
+            # and remove from unknown chunks its parts
             p_title[cmb_chunk_type].append(cmb_chunk)
-            del u_chunks[i:i+2]
+            del unknown_chunks[prev_idx]
+            del unknown_chunks[idx]
+            prev_idx = -1
 
     # try to parse unknown chunks, replacing all hyphens in them with dots
-    u_chunks = p_title.get("unknown", [])
-    if u_chunks:
-        # create string from u_chunks with dots instead of hyphens
-        uc_title = ".".join(filter(None, re.split(SEPARATORS_HYPHEN, ".".join(u_chunks))))
+    if unknown_chunks:
+        # create string from unknown_chunks with dots instead of hyphens
+        u_chunks_str = ".".join(unknown_chunks.values())
+        uc_title = ".".join(filter(None, re.split(SEPARATORS_HYPHEN, u_chunks_str)))
         # recursion exit condition
         if uc_title != title:
             p_uc_title = parse_title(uc_title)
-            # if parsed uc_title has something else than "unknown", update p_title
+            # if parsed uc_title has anything besides "unknown", update p_title
             if list(p_uc_title.keys()) != ["unknown"]:
                 p_title.update(p_uc_title)
+                # unknown_chunks should be cleared,
+                # because it was processed in nested function call
+                unknown_chunks = {}
 
-    return p_title
+    # cut name from unknown chunks:
+    # name is the first n consecutive chunks (starting at index 0), taken
+    # only if the number of unknown chunks differs from the total chunk count
+    if len(unknown_chunks) != len(chunks):
+        i = 0
+        for idx in sorted(unknown_chunks.keys()):
+            if idx != i:
+                break
+            p_title["name"].append(unknown_chunks[idx])
+            del unknown_chunks[idx]
+            i += 1
+
+    for idx in sorted(unknown_chunks.keys()):
+        p_title["unknown"].append(unknown_chunks[idx])
+    return dict(p_title)
 
 
 def guess_part(fname_part):