Add support for episode names

This commit is contained in:
Maks Snegov 2022-01-16 09:11:00 +03:00
parent 58b1606a62
commit 463e5b4c9f
2 changed files with 266 additions and 85 deletions

View File

@ -25,6 +25,7 @@ PATTERNS = (
r"|noir[-.]edition"
r"|black[-.]chrome[-.]edition"
r"|extended[-.]edition"
r"|hq[-.]edition"
r"|theatrical)"),
("restrictions", r"(unrated)"),
("resolution", r"[0-9]{3,4}[pi]"),
@ -37,6 +38,7 @@ PATTERNS = (
("audio", r"%s?(dts(-es)?|ac3|flac|dd5\.1|aac2\.0|dub-line)" % LANGUAGES),
("subtitles", r"%s?sub" % LANGUAGES),
("language", r"(\d{1,2}x)?%s" % LANGUAGES),
("file_extension", r"mkv|avi"),
("unknown", r".*")
)
@ -99,83 +101,130 @@ def process_file(fpath):
_lg.warning("%s -> %s", fname, result)
def _get_parsed_title_dict(chunk_list, chunk_map):
p_title = collections.defaultdict(list)
for idx, chunk in enumerate(chunk_list):
chunk_type = chunk_map[idx]
p_title[chunk_type].append(chunk)
return p_title
def _guess_combined(chunk_values, chunk_map):
""" Try to combine unknown chunks in pairs and parse them """
is_changed = False
p_title = _get_parsed_title_dict(chunk_values, chunk_map)
if len(p_title["unknown"]) < 2:
return is_changed, chunk_values, chunk_map
# i - begin of slice, j - end of slice
i = 0
# process up to second-to-last element
while i < len(chunk_map) - 1:
# we need slice with at least two elements
j = i + 2
# we need only unknown elements
while set(chunk_map[i:j]) == {"unknown"} and j <= len(chunk_map):
# create combined chunk
cmb_chunk = ".".join(chunk_values[i:j])
cmb_chunk_type = guess_part(cmb_chunk)
# add new combined chunk in lists
# first subelement gets new chunk, rest - None
# (will be removed later)
if cmb_chunk_type != "unknown":
is_changed = True
chunk_values[i] = cmb_chunk
chunk_map[i] = cmb_chunk_type
for idx in range(i+1, j):
chunk_values[idx] = None
chunk_map[idx] = None
# to start checking next chunks right after the end of slice
i = idx
break
# try add more elements to combined chunk
else:
j += 1
# start checking next value
i += 1
# clean up from None values
chunk_values = list(filter(None, chunk_values))
chunk_map = list(filter(None, chunk_map))
return is_changed, chunk_values, chunk_map
def parse_title(title):
""" Split media title to components.

The title is split on SEPARATORS, every chunk is classified with
guess_part() and the chunks are returned grouped by type as a plain dict
mapping chunk type to a list of chunks.
"""
# NOTE(review): this body appears to interleave the pre-change and
# post-change lines of a diff whose +/- markers and indentation were lost
# (e.g. both `chunks` and `chunk_values` are built from the same split).
# Confirm against the real file before relying on statement order.
chunks = list(filter(None, re.split(SEPARATORS, title)))
p_title = collections.defaultdict(list)
chunk_values = filter(None, re.split(SEPARATORS, title))
# remove non-word chunks (like single hyphens)
chunks = list(filter(lambda ch: re.search(r"\w+", ch), chunks))
chunk_values = list(filter(lambda ch: re.search(r"\w+", ch), chunk_values))
# parse each chunk
unknown_chunks = {}
for idx, chunk in enumerate(chunks):
pat_type = guess_part(chunk)
if pat_type != "unknown":
p_title[pat_type].append(chunk)
else:
unknown_chunks[idx] = chunk
chunk_map = []
for ch_value in chunk_values:
chunk_map.append(guess_part(ch_value))
# try to combine unknown chunks in pairs and parse them
if len(unknown_chunks) > 1:
prev_idx = -1
for idx in sorted(unknown_chunks.keys()):
_, chunk_values, chunk_map = _guess_combined(chunk_values, chunk_map)
# first unknown chunk, skip
if prev_idx < 0:
prev_idx = idx
# try to parse unknown chunks by splitting them on hyphens
p_title = _get_parsed_title_dict(chunk_values, chunk_map)
is_changed = False
if p_title.get("unknown"):
spl_ch_values = []
spl_ch_map = []
for idx, ch_value in enumerate(chunk_values):
ch_type = chunk_map[idx]
if ch_type == "unknown" and "-" in ch_value:
spl_values = ch_value.split("-")
for spl_val in spl_values:
# skip empty parts (a leading, trailing or doubled hyphen produces them)
if not spl_val:
continue
spl_type = guess_part(spl_val)
if spl_type != "unknown":
is_changed = True
spl_ch_values.append(spl_val)
spl_ch_map.append(spl_type)
else:
spl_ch_values.append(ch_value)
spl_ch_map.append(ch_type)
is_combined, spl_ch_values, spl_ch_map = _guess_combined(spl_ch_values, spl_ch_map)
# adopt the split lists only if splitting or combining recognised something
if is_changed or is_combined:
chunk_values = spl_ch_values
chunk_map = spl_ch_map
# parse name and episode name
# only if there is something except unknown chunks
p_title = _get_parsed_title_dict(chunk_values, chunk_map)
if len(p_title["unknown"]) != len(chunk_values):
idx = 0
# leading unknown chunks become the name
while idx < len(chunk_map) and chunk_map[idx] == "unknown":
chunk_map[idx] = "name"
idx += 1
# if episode number is found, next unknown chunks are episode name
if p_title.get("episode"):
idx = chunk_map.index("episode") + 1
while idx < len(chunk_map) and chunk_map[idx] == "unknown":
chunk_map[idx] = "episode_name"
idx += 1
# at last, strip hyphens from unknown chunks
# only if there is something except unknown chunks
p_title = _get_parsed_title_dict(chunk_values, chunk_map)
if len(p_title["unknown"]) != len(chunk_values):
for idx, chunk_type in enumerate(chunk_map):
if chunk_type != "unknown":
continue
# previous unknown chunk does not border with current, skip
if (prev_idx + 1) != idx:
prev_idx = idx
chunk_value = chunk_values[idx]
if chunk_value[0] != "-" and chunk_value[-1] != "-":
continue
chunk_values[idx] = chunk_value.strip("-")
# create combined chunk
cmb_chunk = ".".join([unknown_chunks[prev_idx], unknown_chunks[idx]])
cmb_chunk_type = guess_part(cmb_chunk)
# check next pair if nothing
if cmb_chunk_type == "unknown":
prev_idx = idx
continue
# if combined chunk matches pattern, add it to found type
# and remove from unknown chunks its parts
p_title[cmb_chunk_type].append(cmb_chunk)
del unknown_chunks[prev_idx]
del unknown_chunks[idx]
prev_idx = -1
# try to parse unknown chunks, replacing all hyphens in them with dots
if unknown_chunks:
# create string from unknown_chunks with dots instead of hyphens
u_chunks_str = ".".join(unknown_chunks.values())
uc_title = ".".join(filter(None, re.split(SEPARATORS_HYPHEN, u_chunks_str)))
# recursion exit condition
if uc_title != title:
p_uc_title = parse_title(uc_title)
# if parsed uc_title has something other than "unknown", update p_title
if list(p_uc_title.keys()) != ["unknown"]:
p_title.update(p_uc_title)
# unknown_chunks should be cleared,
# because it was processed in nested function call
unknown_chunks = {}
# cut name from unknown chunks
# name is the first n consecutive chunks
# only if amount of unknown chunks differs from overall amount of chunks
if len(unknown_chunks) != len(chunks):
i = 0
for idx in sorted(unknown_chunks.keys()):
if idx != i:
break
p_title["name"].append(unknown_chunks[idx])
del unknown_chunks[idx]
i += 1
for idx in sorted(unknown_chunks.keys()):
p_title["unknown"].append(unknown_chunks[idx])
p_title = _get_parsed_title_dict(chunk_values, chunk_map)
return dict(p_title)

View File

@ -2,7 +2,71 @@ import unittest
from renamer import parse_title
class TestParser(unittest.TestCase):
class TestParserParts(unittest.TestCase):
def test_episode_name(self):
    # Unknown chunks right after the episode number form the episode name.
    parsed = parse_title(
        "The.Walking.Dead.S04E06.Live.Bait.1080p.WEB-DL.Rus.Eng.HDCLUB"
    )
    expected = {
        "episode": ["S04E06"],
        "resolution": ["1080p"],
        "quality": ["WEB-DL"],
        "episode_name": ["Live", "Bait"],
        "language": ["Rus", "Eng"],
        "name": ["The", "Walking", "Dead"],
        "unknown": ["HDCLUB"],
    }
    self.assertEqual(["Live", "Bait"], parsed.get("episode_name"))
    self.assertEqual(parsed, expected)
def test_episode_number(self):
    # An SxxEyy chunk is reported under "episode".
    parsed = parse_title("Vikings.S01E01.720p.BluRay.4xRus.Eng.HDCLUB")
    expected = {
        "episode": ["S01E01"],
        "resolution": ["720p"],
        "quality": ["BluRay"],
        "language": ["4xRus", "Eng"],
        "name": ["Vikings"],
        "unknown": ["HDCLUB"],
    }
    self.assertIn("S01E01", parsed.get("episode", []))
    self.assertEqual(parsed, expected)
def test_subtitles(self):
    # A "<language>Sub" chunk is reported under "subtitles".
    parsed = parse_title("Lives.of.Others.Blu-RayRip.720p.RusDTS.GerAC3.EngSub")
    expected = {
        "quality": ["Blu-RayRip"],
        "resolution": ["720p"],
        "audio": ["RusDTS", "GerAC3"],
        "subtitles": ["EngSub"],
        "name": ["Lives", "of", "Others"],
    }
    self.assertIn("EngSub", parsed.get("subtitles", []))
    self.assertEqual(parsed, expected)
def test_file_extension(self):
    # A trailing "mkv" chunk is reported under "file_extension".
    parsed = parse_title("The Guild s04e06 Weird Respawn (by Swich).mkv")
    expected = {
        "name": ["The", "Guild"],
        "episode": ["s04e06"],
        "episode_name": ["Weird", "Respawn", "by", "Swich"],
        "file_extension": ["mkv"],
    }
    self.assertIn("mkv", parsed.get("file_extension", []))
    self.assertEqual(parsed, expected)
class TestCornerCases(unittest.TestCase):
def test_misc_separators(self):
title = "V tumane 2012 1080p BluRay DD5.1 x264-EA"
res = parse_title(title)
@ -22,6 +86,24 @@ class TestParser(unittest.TestCase):
},
)
def test_misc_separators_and_combine(self):
    # "H" and "264" are recombined into a codec; the hyphenated tail is split off.
    parsed = parse_title("The.Dawn.Patrol.1938.1080p.WEB-DL.AAC2.0.H.264-alinto")
    expected = {
        "year": ["1938"],
        "resolution": ["1080p"],
        "quality": ["WEB-DL"],
        "audio": ["AAC2.0"],
        "codec": ["H.264"],
        "name": ["The", "Dawn", "Patrol"],
        "unknown": ["alinto"],
    }
    self.assertIn("H.264", parsed.get("codec", []))
    self.assertIn("alinto", parsed.get("unknown", []))
    self.assertEqual(parsed, expected)
@unittest.expectedFailure
def test_russian_char_in_resolution(self):
title = "Trouble.with.the.Curve.2012.1080р.BluRay.Rus.Eng.HDCLUB"
@ -69,10 +151,10 @@ class TestParser(unittest.TestCase):
},
)
def test_file_extension_goes_to_unknown(self):
def test_file_extension_does_not_go_to_unknown(self):
title = "Monsters,Inc.2001.BDRip.1080p.3xRus.Ukr.Eng.HDCLUB.mkv"
res = parse_title(title)
self.assertIn("mkv", res.get("unknown", []))
self.assertNotIn("mkv", res.get("unknown", []))
self.assertEqual(
res,
{
@ -81,7 +163,8 @@ class TestParser(unittest.TestCase):
"resolution": ["1080p"],
"language": ["3xRus", "Ukr", "Eng"],
"name": ["Monsters", "Inc"],
"unknown": ["HDCLUB", "mkv"],
"unknown": ["HDCLUB"],
"file_extension": ["mkv"],
},
)
@ -102,7 +185,6 @@ class TestParser(unittest.TestCase):
},
)
@unittest.expectedFailure
def test_chunk_from_three_parts(self):
title = "Mad.Max.Road.of.Fury.Black.Chrome.edition.BDRip.1080p"
self.assertEqual(
@ -162,21 +244,6 @@ class TestParser(unittest.TestCase):
},
)
def test_subtitles(self):
    # A "<language>Sub" chunk is reported under "subtitles".
    parsed = parse_title("Lives.of.Others.Blu-RayRip.720p.RusDTS.GerAC3.EngSub")
    expected = {
        "quality": ["Blu-RayRip"],
        "resolution": ["720p"],
        "audio": ["RusDTS", "GerAC3"],
        "subtitles": ["EngSub"],
        "name": ["Lives", "of", "Others"],
    }
    self.assertIn("EngSub", parsed.get("subtitles", []))
    self.assertEqual(parsed, expected)
def test_strange_dot_in_name(self):
title = "WALL·E.2008.1080p.BluRay.3xRus.Ukr.Eng.HDCLUB-Skazhutin"
res = parse_title(title)
@ -192,3 +259,68 @@ class TestParser(unittest.TestCase):
"unknown": ["HDCLUB-Skazhutin"],
},
)
def test_splitted_by_hyphens_chunk_ends_with_empty_part(self):
    # The title contains a chunk with a trailing hyphen ("x264-N-").
    parsed = parse_title(
        "The.IT.Crowd.S01E04.The.Red.Door.HR.DVDRip.HQ.Edition.x264-N-(Rus.Eng)"
    )
    expected = {
        "name": ["The", "IT", "Crowd"],
        "episode": ["S01E04"],
        "episode_name": ["The", "Red", "Door", "HR"],
        "quality": ["DVDRip"],
        "edition": ["HQ.Edition"],
        "codec": ["x264"],
        "unknown": ["N"],
        "language": ["Rus", "Eng"],
    }
    self.assertEqual(parsed, expected)
def test_year_before_episode_number_and_name(self):
    # A year between the series name and the episode number must not break
    # detection of either the name or the episode name.
    parsed = parse_title(
        "The.Big.Bang.Theory.2019.S12E20.The.Decision.Reverberation.1080p.AMZN.WEB-DL.DD5.1.x264-NTb_EniaHD.mkv"
    )
    expected = {
        "name": ["The", "Big", "Bang", "Theory"],
        "year": ["2019"],
        "episode": ["S12E20"],
        "episode_name": ["The", "Decision", "Reverberation"],
        "unknown": ["NTb", "EniaHD"],
        "resolution": ["1080p"],
        "quality": ["AMZN", "WEB-DL"],
        "audio": ["DD5.1"],
        "codec": ["x264"],
        "file_extension": ["mkv"],
    }
    self.assertEqual(parsed, expected)
def test_unknown_stripped_from_hyphens(self):
    # Unknown chunks are reported without leading/trailing hyphens.
    parsed = parse_title(
        "The.Big.Bang.Theory.S04E06.720p.WEB-DL.eng.rus.[Kuraj-Bambey.Ru]-jhonny2.mkv"
    )
    expected = {
        "name": ["The", "Big", "Bang", "Theory"],
        "episode": ["S04E06"],
        "resolution": ["720p"],
        "quality": ["WEB-DL"],
        "language": ["eng", "rus"],
        "unknown": ["Kuraj-Bambey", "Ru", "jhonny2"],
        "file_extension": ["mkv"],
    }
    self.assertIn("jhonny2", parsed.get("unknown", []))
    self.assertEqual(parsed, expected)
def test_episode_name_without_series_name(self):
    # The title starts with the episode number, so there is no "name" key.
    parsed = parse_title("S27E01.Every Man's Dream.mkv")
    expected = {
        "episode": ["S27E01"],
        "episode_name": ["Every", "Man's", "Dream"],
        "file_extension": ["mkv"],
    }
    self.assertEqual(parsed, expected)