Rework parsing title
Better guessing of movie name.
This commit is contained in:
parent
368fc7e624
commit
1b3af219d2
102
renamer.py
102
renamer.py
@ -6,7 +6,6 @@ import logging
|
|||||||
import os
|
import os
|
||||||
import os.path
|
import os.path
|
||||||
import re
|
import re
|
||||||
import string
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|
||||||
@ -24,6 +23,7 @@ PATTERNS = (
|
|||||||
("edition", r"((theatrical|director'*s|extended|un)[-.]?cut"
|
("edition", r"((theatrical|director'*s|extended|un)[-.]?cut"
|
||||||
r"|imax[-.]edition"
|
r"|imax[-.]edition"
|
||||||
r"|noir[-.]edition"
|
r"|noir[-.]edition"
|
||||||
|
r"|extended[-.]edition"
|
||||||
r"|theatrical)"),
|
r"|theatrical)"),
|
||||||
("restrictions", r"(unrated)"),
|
("restrictions", r"(unrated)"),
|
||||||
("resolution", r"[0-9]{3,4}[pi]"),
|
("resolution", r"[0-9]{3,4}[pi]"),
|
||||||
@ -88,7 +88,7 @@ def process_file(fpath):
|
|||||||
chunk_order = ["name"] + chunk_order
|
chunk_order = ["name"] + chunk_order
|
||||||
result = []
|
result = []
|
||||||
for chunk_type in chunk_order:
|
for chunk_type in chunk_order:
|
||||||
if not parsed_title[chunk_type]:
|
if not parsed_title.get(chunk_type, []):
|
||||||
continue
|
continue
|
||||||
result.append(".".join(parsed_title[chunk_type]))
|
result.append(".".join(parsed_title[chunk_type]))
|
||||||
result.append(ext)
|
result.append(ext)
|
||||||
@ -104,64 +104,78 @@ def parse_title(title):
|
|||||||
chunks = list(filter(None, re.split(SEPARATORS, title)))
|
chunks = list(filter(None, re.split(SEPARATORS, title)))
|
||||||
p_title = collections.defaultdict(list)
|
p_title = collections.defaultdict(list)
|
||||||
|
|
||||||
|
# remove non-word chunks (like single hyphens)
|
||||||
|
chunks = list(filter(lambda ch: re.search(r"\w+", ch), chunks))
|
||||||
|
|
||||||
# parse each chunk
|
# parse each chunk
|
||||||
is_name = True
|
unknown_chunks = {}
|
||||||
for chunk in chunks:
|
for idx, chunk in enumerate(chunks):
|
||||||
pat_type = guess_part(chunk)
|
pat_type = guess_part(chunk)
|
||||||
# consider chunk as part of the name until meta info is found
|
if pat_type != "unknown":
|
||||||
if is_name:
|
p_title[pat_type].append(chunk)
|
||||||
if pat_type == "unknown":
|
else:
|
||||||
pat_type = "name"
|
unknown_chunks[idx] = chunk
|
||||||
else:
|
|
||||||
is_name = False
|
|
||||||
p_title[pat_type].append(chunk)
|
|
||||||
|
|
||||||
# if name is the only thing we have, then we parsed nothing
|
|
||||||
if is_name:
|
|
||||||
p_title["unknown"] = p_title["name"]
|
|
||||||
del p_title["name"]
|
|
||||||
|
|
||||||
# remove unknown chunks without alphanumerals (like single hyphens)
|
|
||||||
u_chunks = p_title.get("unknown", [])
|
|
||||||
clean_u_chunks = []
|
|
||||||
for u_chunk in u_chunks:
|
|
||||||
acceptable_chars = set(string.digits + string.ascii_lowercase)
|
|
||||||
if set(u_chunk.lower()) & acceptable_chars:
|
|
||||||
clean_u_chunks.append(u_chunk)
|
|
||||||
p_title["unknown"] = clean_u_chunks
|
|
||||||
|
|
||||||
# try to combine unknown chunks in pairs and parse them
|
# try to combine unknown chunks in pairs and parse them
|
||||||
u_chunks = p_title.get("unknown", [])
|
if len(unknown_chunks) > 1:
|
||||||
if len(u_chunks) > 1:
|
prev_idx = -1
|
||||||
i = 0
|
for idx in sorted(unknown_chunks.keys()):
|
||||||
while i < (len(u_chunks) - 1):
|
|
||||||
# create combined chunk
|
|
||||||
cmb_chunk = ".".join(u_chunks[i:i+2])
|
|
||||||
cmb_chunk_type = guess_part(cmb_chunk)
|
|
||||||
|
|
||||||
# go to next pair if nothing
|
# first unknown chunk, skip
|
||||||
if cmb_chunk_type == "unknown":
|
if prev_idx < 0:
|
||||||
i += 1
|
prev_idx = idx
|
||||||
|
continue
|
||||||
|
# previous unknown chunk does not border with current, skip
|
||||||
|
if (prev_idx + 1) != idx:
|
||||||
|
prev_idx = idx
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# if combined chunk matches pattern, add to found type
|
# create combined chunk
|
||||||
# and remove from unknown its parts
|
cmb_chunk = ".".join([unknown_chunks[prev_idx], unknown_chunks[idx]])
|
||||||
|
cmb_chunk_type = guess_part(cmb_chunk)
|
||||||
|
|
||||||
|
# check next pair if nothing
|
||||||
|
if cmb_chunk_type == "unknown":
|
||||||
|
prev_idx = idx
|
||||||
|
continue
|
||||||
|
|
||||||
|
# if combined chunk matches pattern, add it to found type
|
||||||
|
# and remove from unknown chunks its parts
|
||||||
p_title[cmb_chunk_type].append(cmb_chunk)
|
p_title[cmb_chunk_type].append(cmb_chunk)
|
||||||
del u_chunks[i:i+2]
|
del unknown_chunks[prev_idx]
|
||||||
|
del unknown_chunks[idx]
|
||||||
|
prev_idx = -1
|
||||||
|
|
||||||
# try to parse unknown chunks, replacing all hyphens in them with dots
|
# try to parse unknown chunks, replacing all hyphens in them with dots
|
||||||
u_chunks = p_title.get("unknown", [])
|
if unknown_chunks:
|
||||||
if u_chunks:
|
# create string from unknown_chunks with dots instead of hyphens
|
||||||
# create string from u_chunks with dots instead of hyphens
|
u_chunks_str = ".".join(unknown_chunks.values())
|
||||||
uc_title = ".".join(filter(None, re.split(SEPARATORS_HYPHEN, ".".join(u_chunks))))
|
uc_title = ".".join(filter(None, re.split(SEPARATORS_HYPHEN, u_chunks_str)))
|
||||||
# recursion exit condition
|
# recursion exit condition
|
||||||
if uc_title != title:
|
if uc_title != title:
|
||||||
p_uc_title = parse_title(uc_title)
|
p_uc_title = parse_title(uc_title)
|
||||||
# if parsed uc_title has something else than "unknown", update p_title
|
# if parsed uc_title has something other than "unknown", update p_title
|
||||||
if list(p_uc_title.keys()) != ["unknown"]:
|
if list(p_uc_title.keys()) != ["unknown"]:
|
||||||
p_title.update(p_uc_title)
|
p_title.update(p_uc_title)
|
||||||
|
# unknown_chunks should be cleared,
|
||||||
|
# because it was processed in nested function call
|
||||||
|
unknown_chunks = {}
|
||||||
|
|
||||||
return p_title
|
# cut name from unknown chunks
|
||||||
|
# name is the first n consecutive chunks
|
||||||
|
# only if the number of unknown chunks differs from the total number of chunks
|
||||||
|
if len(unknown_chunks) != len(chunks):
|
||||||
|
i = 0
|
||||||
|
for idx in sorted(unknown_chunks.keys()):
|
||||||
|
if idx != i:
|
||||||
|
break
|
||||||
|
p_title["name"].append(unknown_chunks[idx])
|
||||||
|
del unknown_chunks[idx]
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
for idx in sorted(unknown_chunks.keys()):
|
||||||
|
p_title["unknown"].append(unknown_chunks[idx])
|
||||||
|
return dict(p_title)
|
||||||
|
|
||||||
|
|
||||||
def guess_part(fname_part):
|
def guess_part(fname_part):
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user