Rework title parsing

Better guessing of movie name.
This commit is contained in:
Maks Snegov 2022-01-09 22:09:02 +03:00
parent 368fc7e624
commit 1b3af219d2

View File

@ -6,7 +6,6 @@ import logging
import os import os
import os.path import os.path
import re import re
import string
import sys import sys
@ -24,6 +23,7 @@ PATTERNS = (
("edition", r"((theatrical|director'*s|extended|un)[-.]?cut" ("edition", r"((theatrical|director'*s|extended|un)[-.]?cut"
r"|imax[-.]edition" r"|imax[-.]edition"
r"|noir[-.]edition" r"|noir[-.]edition"
r"|extended[-.]edition"
r"|theatrical)"), r"|theatrical)"),
("restrictions", r"(unrated)"), ("restrictions", r"(unrated)"),
("resolution", r"[0-9]{3,4}[pi]"), ("resolution", r"[0-9]{3,4}[pi]"),
@ -88,7 +88,7 @@ def process_file(fpath):
chunk_order = ["name"] + chunk_order chunk_order = ["name"] + chunk_order
result = [] result = []
for chunk_type in chunk_order: for chunk_type in chunk_order:
if not parsed_title[chunk_type]: if not parsed_title.get(chunk_type, []):
continue continue
result.append(".".join(parsed_title[chunk_type])) result.append(".".join(parsed_title[chunk_type]))
result.append(ext) result.append(ext)
@ -104,64 +104,78 @@ def parse_title(title):
chunks = list(filter(None, re.split(SEPARATORS, title))) chunks = list(filter(None, re.split(SEPARATORS, title)))
p_title = collections.defaultdict(list) p_title = collections.defaultdict(list)
# remove non-word chunks (like single hyphens)
chunks = list(filter(lambda ch: re.search(r"\w+", ch), chunks))
# parse each chunk # parse each chunk
is_name = True unknown_chunks = {}
for chunk in chunks: for idx, chunk in enumerate(chunks):
pat_type = guess_part(chunk) pat_type = guess_part(chunk)
# consider chunk as part of the name until meta info is found if pat_type != "unknown":
if is_name: p_title[pat_type].append(chunk)
if pat_type == "unknown": else:
pat_type = "name" unknown_chunks[idx] = chunk
else:
is_name = False
p_title[pat_type].append(chunk)
# if name is the only thing we have, then we parsed nothing
if is_name:
p_title["unknown"] = p_title["name"]
del p_title["name"]
# remove unknown chunks without alphanumerals (like single hyphens)
u_chunks = p_title.get("unknown", [])
clean_u_chunks = []
for u_chunk in u_chunks:
acceptable_chars = set(string.digits + string.ascii_lowercase)
if set(u_chunk.lower()) & acceptable_chars:
clean_u_chunks.append(u_chunk)
p_title["unknown"] = clean_u_chunks
# try to combine unknown chunks in pairs and parse them # try to combine unknown chunks in pairs and parse them
u_chunks = p_title.get("unknown", []) if len(unknown_chunks) > 1:
if len(u_chunks) > 1: prev_idx = -1
i = 0 for idx in sorted(unknown_chunks.keys()):
while i < (len(u_chunks) - 1):
# create combined chunk
cmb_chunk = ".".join(u_chunks[i:i+2])
cmb_chunk_type = guess_part(cmb_chunk)
# go to next pair if nothing # first unknown chunk, skip
if cmb_chunk_type == "unknown": if prev_idx < 0:
i += 1 prev_idx = idx
continue
# previous unknown chunk does not border with current, skip
if (prev_idx + 1) != idx:
prev_idx = idx
continue continue
# if combined chunk matches pattern, add to found type # create combined chunk
# and remove from unknown its parts cmb_chunk = ".".join([unknown_chunks[prev_idx], unknown_chunks[idx]])
cmb_chunk_type = guess_part(cmb_chunk)
# check next pair if nothing
if cmb_chunk_type == "unknown":
prev_idx = idx
continue
# if combined chunk matches pattern, add it to found type
# and remove from unknown chunks its parts
p_title[cmb_chunk_type].append(cmb_chunk) p_title[cmb_chunk_type].append(cmb_chunk)
del u_chunks[i:i+2] del unknown_chunks[prev_idx]
del unknown_chunks[idx]
prev_idx = -1
# try to parse unknown chunks, replacing all hyphens in them with dots # try to parse unknown chunks, replacing all hyphens in them with dots
u_chunks = p_title.get("unknown", []) if unknown_chunks:
if u_chunks: # create string from unknown_chunks with dots instead of hyphens
# create string from u_chunks with dots instead of hyphens u_chunks_str = ".".join(unknown_chunks.values())
uc_title = ".".join(filter(None, re.split(SEPARATORS_HYPHEN, ".".join(u_chunks)))) uc_title = ".".join(filter(None, re.split(SEPARATORS_HYPHEN, u_chunks_str)))
# recursion exit condition # recursion exit condition
if uc_title != title: if uc_title != title:
p_uc_title = parse_title(uc_title) p_uc_title = parse_title(uc_title)
# if parsed uc_title has something else than "unknown", update p_title # if parsed uc_title has smth else than "unknown", update p_title
if list(p_uc_title.keys()) != ["unknown"]: if list(p_uc_title.keys()) != ["unknown"]:
p_title.update(p_uc_title) p_title.update(p_uc_title)
# unknown_chunks should be cleared,
# because it was processed in nested function call
unknown_chunks = {}
return p_title # cut name from unknown chunks
# name is the first n consecutive chunks
# only if the number of unknown chunks differs from the total number of chunks
if len(unknown_chunks) != len(chunks):
i = 0
for idx in sorted(unknown_chunks.keys()):
if idx != i:
break
p_title["name"].append(unknown_chunks[idx])
del unknown_chunks[idx]
i += 1
for idx in sorted(unknown_chunks.keys()):
p_title["unknown"].append(unknown_chunks[idx])
return dict(p_title)
def guess_part(fname_part): def guess_part(fname_part):