diff --git a/renamer.py b/renamer.py
index cf2aa3b..ef4a01a 100755
--- a/renamer.py
+++ b/renamer.py
@@ -25,6 +25,7 @@ PATTERNS = (
                 r"|noir[-.]edition"
                 r"|black[-.]chrome[-.]edition"
                 r"|extended[-.]edition"
+                r"|hq[-.]edition"
                 r"|theatrical)"),
     ("restrictions", r"(unrated)"),
     ("resolution", r"[0-9]{3,4}[pi]"),
@@ -37,6 +38,7 @@ PATTERNS = (
     ("audio", r"%s?(dts(-es)?|ac3|flac|dd5\.1|aac2\.0|dub-line)" % LANGUAGES),
     ("subtitles", r"%s?sub" % LANGUAGES),
     ("language", r"(\d{1,2}x)?%s" % LANGUAGES),
+    ("file_extension", r"(mkv|avi)"),
     ("unknown", r".*")
 )
 
@@ -99,83 +101,131 @@ def process_file(fpath):
     _lg.warning("%s -> %s", fname, result)
 
 
+def _get_parsed_title_dict(chunk_list, chunk_map):
+    """ Group chunk values by their type from chunk_map, keeping their order. """
+    p_title = collections.defaultdict(list)
+    for idx, chunk in enumerate(chunk_list):
+        chunk_type = chunk_map[idx]
+        p_title[chunk_type].append(chunk)
+    return p_title
+
+
+def _guess_combined(chunk_values, chunk_map):
+    """ Try to combine runs of two or more adjacent unknown chunks and parse them """
+    is_changed = False
+    p_title = _get_parsed_title_dict(chunk_values, chunk_map)
+    if len(p_title["unknown"]) < 2:
+        return is_changed, chunk_values, chunk_map
+
+    # i - begin of slice, j - end of slice
+    i = 0
+    # process up to second-to-last element
+    while i < len(chunk_map) - 1:
+        # we need slice with at least two elements
+        j = i + 2
+        # we need only unknown elements
+        while j <= len(chunk_map) and set(chunk_map[i:j]) == {"unknown"}:
+            # create combined chunk
+            cmb_chunk = ".".join(chunk_values[i:j])
+            cmb_chunk_type = guess_part(cmb_chunk)
+
+            # add new combined chunk in lists
+            # first subelement gets new chunk, rest - None
+            # (will be removed later)
+            if cmb_chunk_type != "unknown":
+                is_changed = True
+                chunk_values[i] = cmb_chunk
+                chunk_map[i] = cmb_chunk_type
+                for idx in range(i+1, j):
+                    chunk_values[idx] = None
+                    chunk_map[idx] = None
+                # to start checking next chunks right after the end of slice
+                i = j - 1
+                break
+            # try add more elements to combined chunk
+            else:
+                j += 1
+
+        # start checking next value
+        i += 1
+
+    # clean up from None values
+    chunk_values = list(filter(None, chunk_values))
+    chunk_map = list(filter(None, chunk_map))
+
+    return is_changed, chunk_values, chunk_map
+
+
 def parse_title(title):
     """ Split media title to components. """
-    chunks = list(filter(None, re.split(SEPARATORS, title)))
-    p_title = collections.defaultdict(list)
+    chunk_values = filter(None, re.split(SEPARATORS, title))
 
     # remove non-word chunks (like single hyphens)
-    chunks = list(filter(lambda ch: re.search(r"\w+", ch), chunks))
+    chunk_values = list(filter(lambda ch: re.search(r"\w+", ch), chunk_values))
 
     # parse each chunk
-    unknown_chunks = {}
-    for idx, chunk in enumerate(chunks):
-        pat_type = guess_part(chunk)
-        if pat_type != "unknown":
-            p_title[pat_type].append(chunk)
-        else:
-            unknown_chunks[idx] = chunk
+    chunk_map = []
+    for ch_value in chunk_values:
+        chunk_map.append(guess_part(ch_value))
 
-    # try to combine unknown chunks in pairs and parse them
-    if len(unknown_chunks) > 1:
-        prev_idx = -1
-        for idx in sorted(unknown_chunks.keys()):
+    _, chunk_values, chunk_map = _guess_combined(chunk_values, chunk_map)
 
-            # first unknown chunk, skip
-            if prev_idx < 0:
-                prev_idx = idx
+    # try to parse unknown chunks by splitting them on hyphens
+    p_title = _get_parsed_title_dict(chunk_values, chunk_map)
+    is_changed = False
+    if p_title.get("unknown"):
+        spl_ch_values = []
+        spl_ch_map = []
+        for idx, ch_value in enumerate(chunk_values):
+            ch_type = chunk_map[idx]
+            if ch_type == "unknown" and "-" in ch_value:
+                spl_values = ch_value.split("-")
+                for spl_val in spl_values:
+                    if not spl_val:
+                        continue
+                    spl_type = guess_part(spl_val)
+                    if spl_type != "unknown":
+                        is_changed = True
+                    spl_ch_values.append(spl_val)
+                    spl_ch_map.append(spl_type)
+            else:
+                spl_ch_values.append(ch_value)
+                spl_ch_map.append(ch_type)
+
+        is_combined, spl_ch_values, spl_ch_map = _guess_combined(spl_ch_values, spl_ch_map)
+        if is_changed or is_combined:
+            chunk_values = spl_ch_values
+            chunk_map = spl_ch_map
+
+    # parse name and episode name
+    # only if there is something except unknown chunks
+    p_title = _get_parsed_title_dict(chunk_values, chunk_map)
+    if len(p_title["unknown"]) != len(chunk_values):
+        idx = 0
+        while idx < len(chunk_map) and chunk_map[idx] == "unknown":
+            chunk_map[idx] = "name"
+            idx += 1
+        # if episode number is found, next unknown chunks are episode name
+        if p_title.get("episode"):
+            idx = chunk_map.index("episode") + 1
+            while idx < len(chunk_map) and chunk_map[idx] == "unknown":
+                chunk_map[idx] = "episode_name"
+                idx += 1
+
+    # at last, strip hyphens from unknown chunks
+    # only if there is something except unknown chunks
+    p_title = _get_parsed_title_dict(chunk_values, chunk_map)
+    if len(p_title["unknown"]) != len(chunk_values):
+        for idx, chunk_type in enumerate(chunk_map):
+            if chunk_type != "unknown":
                 continue
 
-            # previous unknown chunk does not border with current, skip
-            if (prev_idx + 1) != idx:
-                prev_idx = idx
+            chunk_value = chunk_values[idx]
+            if chunk_value[0] != "-" and chunk_value[-1] != "-":
                 continue
+            chunk_values[idx] = chunk_value.strip("-")
 
-            # create combined chunk
-            cmb_chunk = ".".join([unknown_chunks[prev_idx], unknown_chunks[idx]])
-            cmb_chunk_type = guess_part(cmb_chunk)
-
-            # check next pair if nothing
-            if cmb_chunk_type == "unknown":
-                prev_idx = idx
-                continue
-
-            # if combined chunk matches pattern, add it to found type
-            # and remove from unknown chunks its parts
-            p_title[cmb_chunk_type].append(cmb_chunk)
-            del unknown_chunks[prev_idx]
-            del unknown_chunks[idx]
-            prev_idx = -1
-
-    # try to parse unknown chunks, replacing all hyphens in them with dots
-    if unknown_chunks:
-        # create string from unknown_chunks with dots instead of hyphens
-        u_chunks_str = ".".join(unknown_chunks.values())
-        uc_title = ".".join(filter(None, re.split(SEPARATORS_HYPHEN, u_chunks_str)))
-        # recursion exit condition
-        if uc_title != title:
-            p_uc_title = parse_title(uc_title)
-            # if parsed uc_title has smth else than "unknown", update p_title
-            if list(p_uc_title.keys()) != ["unknown"]:
-                p_title.update(p_uc_title)
-                # unknown_chunks should be cleared,
-                # because it was processed in nested function call
-                unknown_chunks = {}
-
-    # cut name from unknown chunks
-    # name is the first n consequent chunks
-    # only if amount of unknown chunks differs from overall amount of chunks
-    if len(unknown_chunks) != len(chunks):
-        i = 0
-        for idx in sorted(unknown_chunks.keys()):
-            if idx != i:
-                break
-            p_title["name"].append(unknown_chunks[idx])
-            del unknown_chunks[idx]
-            i += 1
-
-    for idx in sorted(unknown_chunks.keys()):
-        p_title["unknown"].append(unknown_chunks[idx])
+    p_title = _get_parsed_title_dict(chunk_values, chunk_map)
 
     return dict(p_title)
 
diff --git a/tests/test_parsing.py b/tests/test_parsing.py
index 7208eac..fc6e20b 100644
--- a/tests/test_parsing.py
+++ b/tests/test_parsing.py
@@ -2,7 +2,71 @@ import unittest
 from renamer import parse_title
 
 
-class TestParser(unittest.TestCase):
+class TestParserParts(unittest.TestCase):
+    def test_episode_name(self):
+        title = "The.Walking.Dead.S04E06.Live.Bait.1080p.WEB-DL.Rus.Eng.HDCLUB"
+        res = parse_title(title)
+        self.assertEqual(["Live", "Bait"], res.get("episode_name"))
+        self.assertEqual(
+            res,
+            {
+                "episode": ["S04E06"],
+                "resolution": ["1080p"],
+                "quality": ["WEB-DL"],
+                "episode_name": ["Live", "Bait"],
+                "language": ["Rus", "Eng"],
+                "name": ["The", "Walking", "Dead"],
+                "unknown": ["HDCLUB"],
+            },
+        )
+
+    def test_episode_number(self):
+        title = "Vikings.S01E01.720p.BluRay.4xRus.Eng.HDCLUB"
+        res = parse_title(title)
+        self.assertIn("S01E01", res.get("episode", []))
+        self.assertEqual(
+            res,
+            {
+                "episode": ["S01E01"],
+                "resolution": ["720p"],
+                "quality": ["BluRay"],
+                "language": ["4xRus", "Eng"],
+                "name": ["Vikings"],
+                "unknown": ["HDCLUB"],
+            },
+        )
+
+    def test_subtitles(self):
+        title = "Lives.of.Others.Blu-RayRip.720p.RusDTS.GerAC3.EngSub"
+        res = parse_title(title)
+        self.assertIn("EngSub", res.get("subtitles", []))
+        self.assertEqual(
+            res,
+            {
+                "quality": ["Blu-RayRip"],
+                "resolution": ["720p"],
+                "audio": ["RusDTS", "GerAC3"],
+                "subtitles": ["EngSub"],
+                "name": ["Lives", "of", "Others"],
+            },
+        )
+
+    def test_file_extension(self):
+        title = "The Guild s04e06 Weird Respawn (by Swich).mkv"
+        res = parse_title(title)
+        self.assertIn("mkv", res.get("file_extension", []))
+        self.assertEqual(
+            res,
+            {
+                "name": ["The", "Guild"],
+                "episode": ["s04e06"],
+                "episode_name": ["Weird", "Respawn", "by", "Swich"],
+                "file_extension": ["mkv"],
+            },
+        )
+
+
+class TestCornerCases(unittest.TestCase):
     def test_misc_separators(self):
         title = "V tumane 2012 1080p BluRay DD5.1 x264-EA"
         res = parse_title(title)
@@ -22,6 +86,24 @@ class TestParser(unittest.TestCase):
             },
         )
 
+    def test_misc_separators_and_combine(self):
+        title = "The.Dawn.Patrol.1938.1080p.WEB-DL.AAC2.0.H.264-alinto"
+        res = parse_title(title)
+        self.assertIn("H.264", res.get("codec", []))
+        self.assertIn("alinto", res.get("unknown", []))
+        self.assertEqual(
+            res,
+            {
+                "year": ["1938"],
+                "resolution": ["1080p"],
+                "quality": ["WEB-DL"],
+                "audio": ["AAC2.0"],
+                "codec": ["H.264"],
+                "name": ["The", "Dawn", "Patrol"],
+                "unknown": ["alinto"],
+            },
+        )
+
     @unittest.expectedFailure
     def test_russian_char_in_resolution(self):
         title = "Trouble.with.the.Curve.2012.1080р.BluRay.Rus.Eng.HDCLUB"
@@ -69,10 +151,10 @@ class TestParser(unittest.TestCase):
             },
         )
 
-    def test_file_extension_goes_to_unknown(self):
+    def test_file_extension_does_not_go_to_unknown(self):
         title = "Monsters,Inc.2001.BDRip.1080p.3xRus.Ukr.Eng.HDCLUB.mkv"
         res = parse_title(title)
-        self.assertIn("mkv", res.get("unknown", []))
+        self.assertNotIn("mkv", res.get("unknown", []))
         self.assertEqual(
             res,
             {
@@ -81,7 +163,8 @@ class TestParser(unittest.TestCase):
                 "resolution": ["1080p"],
                 "language": ["3xRus", "Ukr", "Eng"],
                 "name": ["Monsters", "Inc"],
-                "unknown": ["HDCLUB", "mkv"],
+                "unknown": ["HDCLUB"],
+                "file_extension": ["mkv"],
             },
         )
 
@@ -102,7 +185,6 @@ class TestParser(unittest.TestCase):
             },
         )
 
-    @unittest.expectedFailure
     def test_chunk_from_three_parts(self):
         title = "Mad.Max.Road.of.Fury.Black.Chrome.edition.BDRip.1080p"
         self.assertEqual(
@@ -162,21 +244,6 @@ class TestParser(unittest.TestCase):
             },
         )
 
-    def test_subtitles(self):
-        title = "Lives.of.Others.Blu-RayRip.720p.RusDTS.GerAC3.EngSub"
-        res = parse_title(title)
-        self.assertIn("EngSub", res.get("subtitles", []))
-        self.assertEqual(
-            res,
-            {
-                "quality": ["Blu-RayRip"],
-                "resolution": ["720p"],
-                "audio": ["RusDTS", "GerAC3"],
-                "subtitles": ["EngSub"],
-                "name": ["Lives", "of", "Others"],
-            },
-        )
-
     def test_strange_dot_in_name(self):
         title = "WALL·E.2008.1080p.BluRay.3xRus.Ukr.Eng.HDCLUB-Skazhutin"
         res = parse_title(title)
@@ -192,3 +259,68 @@ class TestParser(unittest.TestCase):
                 "unknown": ["HDCLUB-Skazhutin"],
             },
         )
+
+    def test_splitted_by_hyphens_chunk_ends_with_empty_part(self):
+        title = "The.IT.Crowd.S01E04.The.Red.Door.HR.DVDRip.HQ.Edition.x264-N-(Rus.Eng)"
+        res = parse_title(title)
+        self.assertEqual(
+            res,
+            {
+                "name": ["The", "IT", "Crowd"],
+                "episode": ["S01E04"],
+                "episode_name": ["The", "Red", "Door", "HR"],
+                "quality": ["DVDRip"],
+                "edition": ["HQ.Edition"],
+                "codec": ["x264"],
+                "unknown": ["N"],
+                "language": ["Rus", "Eng"],
+            },
+        )
+
+    def test_year_before_episode_number_and_name(self):
+        title = "The.Big.Bang.Theory.2019.S12E20.The.Decision.Reverberation.1080p.AMZN.WEB-DL.DD5.1.x264-NTb_EniaHD.mkv"
+        res = parse_title(title)
+        self.assertEqual(
+            res,
+            {
+                "name": ["The", "Big", "Bang", "Theory"],
+                "year": ["2019"],
+                "episode": ["S12E20"],
+                "episode_name": ["The", "Decision", "Reverberation"],
+                "unknown": ["NTb", "EniaHD"],
+                "resolution": ["1080p"],
+                "quality": ["AMZN", "WEB-DL"],
+                "audio": ["DD5.1"],
+                "codec": ["x264"],
+                "file_extension": ["mkv"],
+            },
+        )
+
+    def test_unknown_stripped_from_hyphens(self):
+        title = "The.Big.Bang.Theory.S04E06.720p.WEB-DL.eng.rus.[Kuraj-Bambey.Ru]-jhonny2.mkv"
+        res = parse_title(title)
+        self.assertIn("jhonny2", res.get("unknown", []))
+        self.assertEqual(
+            res,
+            {
+                "name": ["The", "Big", "Bang", "Theory"],
+                "episode": ["S04E06"],
+                "resolution": ["720p"],
+                "quality": ["WEB-DL"],
+                "language": ["eng", "rus"],
+                "unknown": ["Kuraj-Bambey", "Ru", "jhonny2"],
+                "file_extension": ["mkv"],
+            },
+        )
+
+    def test_episode_name_without_series_name(self):
+        title = "S27E01.Every Man's Dream.mkv"
+        res = parse_title(title)
+        self.assertEqual(
+            res,
+            {
+                "episode": ["S27E01"],
+                "episode_name": ["Every", "Man's", "Dream"],
+                "file_extension": ["mkv"],
+            },
+        )