Add support for episode names

2022-01-16 09:11:00 +03:00
parent 58b1606a62
commit 463e5b4c9f
2 changed files with 266 additions and 85 deletions
--- a/renamer.py
+++ b/renamer.py
@@ -25,6 +25,7 @@ PATTERNS = (
                r"|noir[-.]edition"
                r"|black[-.]chrome[-.]edition"
                r"|extended[-.]edition"
+                r"|hq[-.]edition"
                r"|theatrical)"),
    ("restrictions", r"(unrated)"),
    ("resolution", r"[0-9]{3,4}[pi]"),
@@ -37,6 +38,7 @@ PATTERNS = (
    ("audio", r"%s?(dts(-es)?|ac3|flac|dd5\.1|aac2\.0|dub-line)" % LANGUAGES),
    ("subtitles", r"%s?sub" % LANGUAGES),
    ("language", r"(\d{1,2}x)?%s" % LANGUAGES),
+    ("file_extension", r"mkv|avi"),
    ("unknown", r".*")
 )

@@ -99,83 +101,130 @@ def process_file(fpath):
        _lg.warning("%s -> %s", fname, result)


+def _get_parsed_title_dict(chunk_list, chunk_map):
+    p_title = collections.defaultdict(list)
+    for idx, chunk in enumerate(chunk_list):
+        chunk_type = chunk_map[idx]
+        p_title[chunk_type].append(chunk)
+    return p_title
+
+
+def _guess_combined(chunk_values, chunk_map):
+    """ Try to combine runs of adjacent unknown chunks and parse them """
+    is_changed = False
+    p_title = _get_parsed_title_dict(chunk_values, chunk_map)
+    if len(p_title["unknown"]) < 2:
+        return is_changed, chunk_values, chunk_map
+
+    # i - start of slice, j - end of slice (exclusive)
+    i = 0
+    # process up to second-to-last element
+    while i < len(chunk_map) - 1:
+        # we need slice with at least two elements
+        j = i + 2
+        # we need only unknown elements
+        while set(chunk_map[i:j]) == {"unknown"} and j <= len(chunk_map):
+            # create combined chunk
+            cmb_chunk = ".".join(chunk_values[i:j])
+            cmb_chunk_type = guess_part(cmb_chunk)
+
+            # store the combined chunk in both lists:
+            # the first slot of the slice gets the new chunk, the rest get None
+            # (the None placeholders are filtered out after the loop)
+            if cmb_chunk_type != "unknown":
+                is_changed = True
+                chunk_values[i] = cmb_chunk
+                chunk_map[i] = cmb_chunk_type
+                for idx in range(i+1, j):
+                    chunk_values[idx] = None
+                    chunk_map[idx] = None
+                    # to start checking next chunks right after the end of slice
+                    i = idx
+                break
+            # otherwise try adding one more element to the combined chunk
+            else:
+                j += 1
+
+        # start checking next value
+        i += 1
+
+    # clean up from None values
+    chunk_values = list(filter(None, chunk_values))
+    chunk_map = list(filter(None, chunk_map))
+
+    return is_changed, chunk_values, chunk_map
+
+
 def parse_title(title):
    """ Split media title to components. """

-    chunks = list(filter(None, re.split(SEPARATORS, title)))
-    p_title = collections.defaultdict(list)
+    chunk_values = filter(None, re.split(SEPARATORS, title))

    # remove non-word chunks (like single hyphens)
-    chunks = list(filter(lambda ch: re.search(r"\w+", ch), chunks))
+    chunk_values = list(filter(lambda ch: re.search(r"\w+", ch), chunk_values))

    # parse each chunk
-    unknown_chunks = {}
-    for idx, chunk in enumerate(chunks):
-        pat_type = guess_part(chunk)
-        if pat_type != "unknown":
-            p_title[pat_type].append(chunk)
-        else:
-            unknown_chunks[idx] = chunk
+    chunk_map = []
+    for ch_value in chunk_values:
+        chunk_map.append(guess_part(ch_value))

-    # try to combine unknown chunks in pairs and parse them
-    if len(unknown_chunks) > 1:
-        prev_idx = -1
-        for idx in sorted(unknown_chunks.keys()):
+    _, chunk_values, chunk_map = _guess_combined(chunk_values, chunk_map)

-            # first unknown chunk, skip
-            if prev_idx < 0:
-                prev_idx = idx
+    # try to parse unknown chunks by splitting them on hyphens
+    p_title = _get_parsed_title_dict(chunk_values, chunk_map)
+    is_changed = False
+    if p_title.get("unknown"):
+        spl_ch_values = []
+        spl_ch_map = []
+        for idx, ch_value in enumerate(chunk_values):
+            ch_type = chunk_map[idx]
+            if ch_type == "unknown" and "-" in ch_value:
+                spl_values = ch_value.split("-")
+                for spl_val in spl_values:
+                    if not spl_val:
+                        continue
+                    spl_type = guess_part(spl_val)
+                    if spl_type != "unknown":
+                        is_changed = True
+                    spl_ch_values.append(spl_val)
+                    spl_ch_map.append(spl_type)
+            else:
+                spl_ch_values.append(ch_value)
+                spl_ch_map.append(ch_type)
+
+        is_combined, spl_ch_values, spl_ch_map = _guess_combined(spl_ch_values, spl_ch_map)
+        if is_changed or is_combined:
+            chunk_values = spl_ch_values
+            chunk_map = spl_ch_map
+
+    # assign name and episode name
+    # only if at least one chunk was recognized (not everything is unknown)
+    p_title = _get_parsed_title_dict(chunk_values, chunk_map)
+    if len(p_title["unknown"]) != len(chunk_values):
+        idx = 0
+        while idx < len(chunk_map) and chunk_map[idx] == "unknown":
+            chunk_map[idx] = "name"
+            idx += 1
+        # if episode number is found, next unknown chunks are episode name
+        if p_title.get("episode"):
+            idx = chunk_map.index("episode") + 1
+            while idx < len(chunk_map) and chunk_map[idx] == "unknown":
+                chunk_map[idx] = "episode_name"
+                idx += 1
+
+    # finally, strip leading and trailing hyphens from unknown chunks
+    # only if at least one chunk was recognized (not everything is unknown)
+    p_title = _get_parsed_title_dict(chunk_values, chunk_map)
+    if len(p_title["unknown"]) != len(chunk_values):
+        for idx, chunk_type in enumerate(chunk_map):
+            if chunk_type != "unknown":
                continue
-            # previous unknown chunk does not border with current, skip
-            if (prev_idx + 1) != idx:
-                prev_idx = idx
+            chunk_value = chunk_values[idx]
+            if chunk_value[0] != "-" and chunk_value[-1] != "-":
                continue
+            chunk_values[idx] = chunk_value.strip("-")

-            # create combined chunk
-            cmb_chunk = ".".join([unknown_chunks[prev_idx], unknown_chunks[idx]])
-            cmb_chunk_type = guess_part(cmb_chunk)
-
-            # check next pair if nothing
-            if cmb_chunk_type == "unknown":
-                prev_idx = idx
-                continue
-
-            # if combined chunk matches pattern, add it to found type
-            # and remove from unknown chunks its parts
-            p_title[cmb_chunk_type].append(cmb_chunk)
-            del unknown_chunks[prev_idx]
-            del unknown_chunks[idx]
-            prev_idx = -1
-
-    # try to parse unknown chunks, replacing all hyphens in them with dots
-    if unknown_chunks:
-        # create string from unknown_chunks with dots instead of hyphens
-        u_chunks_str = ".".join(unknown_chunks.values())
-        uc_title = ".".join(filter(None, re.split(SEPARATORS_HYPHEN, u_chunks_str)))
-        # recursion exit condition
-        if uc_title != title:
-            p_uc_title = parse_title(uc_title)
-            # if parsed uc_title has smth else than "unknown", update p_title
-            if list(p_uc_title.keys()) != ["unknown"]:
-                p_title.update(p_uc_title)
-                # unknown_chunks should be cleared,
-                # because it was processed in nested function call
-                unknown_chunks = {}
-
-    # cut name from unknown chunks
-    # name is the first n consequent chunks
-    # only if amount of unknown chunks differs from overall amount of chunks
-    if len(unknown_chunks) != len(chunks):
-        i = 0
-        for idx in sorted(unknown_chunks.keys()):
-            if idx != i:
-                break
-            p_title["name"].append(unknown_chunks[idx])
-            del unknown_chunks[idx]
-            i += 1
-
-    for idx in sorted(unknown_chunks.keys()):
-        p_title["unknown"].append(unknown_chunks[idx])
+    p_title = _get_parsed_title_dict(chunk_values, chunk_map)
    return dict(p_title)


--- a/tests/test_parsing.py
+++ b/tests/test_parsing.py
@@ -2,7 +2,71 @@ import unittest
 from renamer import parse_title


-class TestParser(unittest.TestCase):
+class TestParserParts(unittest.TestCase):
+    def test_episode_name(self):
+        title = "The.Walking.Dead.S04E06.Live.Bait.1080p.WEB-DL.Rus.Eng.HDCLUB"
+        res = parse_title(title)
+        self.assertEqual(["Live", "Bait"], res.get("episode_name"))
+        self.assertEqual(
+            res,
+            {
+                "episode": ["S04E06"],
+                "resolution": ["1080p"],
+                "quality": ["WEB-DL"],
+                "episode_name": ["Live", "Bait"],
+                "language": ["Rus", "Eng"],
+                "name": ["The", "Walking", "Dead"],
+                "unknown": ["HDCLUB"],
+            },
+        )
+
+    def test_episode_number(self):
+        title = "Vikings.S01E01.720p.BluRay.4xRus.Eng.HDCLUB"
+        res = parse_title(title)
+        self.assertIn("S01E01", res.get("episode", []))
+        self.assertEqual(
+            res,
+            {
+                "episode": ["S01E01"],
+                "resolution": ["720p"],
+                "quality": ["BluRay"],
+                "language": ["4xRus", "Eng"],
+                "name": ["Vikings"],
+                "unknown": ["HDCLUB"],
+            },
+        )
+
+    def test_subtitles(self):
+        title = "Lives.of.Others.Blu-RayRip.720p.RusDTS.GerAC3.EngSub"
+        res = parse_title(title)
+        self.assertIn("EngSub", res.get("subtitles", []))
+        self.assertEqual(
+            res,
+            {
+                "quality": ["Blu-RayRip"],
+                "resolution": ["720p"],
+                "audio": ["RusDTS", "GerAC3"],
+                "subtitles": ["EngSub"],
+                "name": ["Lives", "of", "Others"],
+            },
+        )
+
+    def test_file_extension(self):
+        title = "The Guild s04e06 Weird Respawn (by Swich).mkv"
+        res = parse_title(title)
+        self.assertIn("mkv", res.get("file_extension", []))
+        self.assertEqual(
+            res,
+            {
+                "name": ["The", "Guild"],
+                "episode": ["s04e06"],
+                "episode_name": ["Weird", "Respawn", "by", "Swich"],
+                "file_extension": ["mkv"],
+            },
+        )
+
+
+class TestCornerCases(unittest.TestCase):
    def test_misc_separators(self):
        title = "V tumane 2012 1080p BluRay DD5.1 x264-EA"
        res = parse_title(title)
@@ -22,6 +86,24 @@ class TestParser(unittest.TestCase):
            },
        )

+    def test_misc_separators_and_combine(self):
+        title = "The.Dawn.Patrol.1938.1080p.WEB-DL.AAC2.0.H.264-alinto"
+        res = parse_title(title)
+        self.assertIn("H.264", res.get("codec", []))
+        self.assertIn("alinto", res.get("unknown", []))
+        self.assertEqual(
+            res,
+            {
+                "year": ["1938"],
+                "resolution": ["1080p"],
+                "quality": ["WEB-DL"],
+                "audio": ["AAC2.0"],
+                "codec": ["H.264"],
+                "name": ["The", "Dawn", "Patrol"],
+                "unknown": ["alinto"],
+            },
+        )
+
    @unittest.expectedFailure
    def test_russian_char_in_resolution(self):
        title = "Trouble.with.the.Curve.2012.1080р.BluRay.Rus.Eng.HDCLUB"
@@ -69,10 +151,10 @@ class TestParser(unittest.TestCase):
            },
        )

-    def test_file_extension_goes_to_unknown(self):
+    def test_file_extension_does_not_go_to_unknown(self):
        title = "Monsters,Inc.2001.BDRip.1080p.3xRus.Ukr.Eng.HDCLUB.mkv"
        res = parse_title(title)
-        self.assertIn("mkv", res.get("unknown", []))
+        self.assertNotIn("mkv", res.get("unknown", []))
        self.assertEqual(
            res,
            {
@@ -81,7 +163,8 @@ class TestParser(unittest.TestCase):
                "resolution": ["1080p"],
                "language": ["3xRus", "Ukr", "Eng"],
                "name": ["Monsters", "Inc"],
-                "unknown": ["HDCLUB", "mkv"],
+                "unknown": ["HDCLUB"],
+                "file_extension": ["mkv"],
            },
        )

@@ -102,7 +185,6 @@ class TestParser(unittest.TestCase):
            },
        )

-    @unittest.expectedFailure
    def test_chunk_from_three_parts(self):
        title = "Mad.Max.Road.of.Fury.Black.Chrome.edition.BDRip.1080p"
        self.assertEqual(
@@ -162,21 +244,6 @@ class TestParser(unittest.TestCase):
            },
        )

-    def test_subtitles(self):
-        title = "Lives.of.Others.Blu-RayRip.720p.RusDTS.GerAC3.EngSub"
-        res = parse_title(title)
-        self.assertIn("EngSub", res.get("subtitles", []))
-        self.assertEqual(
-            res,
-            {
-                "quality": ["Blu-RayRip"],
-                "resolution": ["720p"],
-                "audio": ["RusDTS", "GerAC3"],
-                "subtitles": ["EngSub"],
-                "name": ["Lives", "of", "Others"],
-            },
-        )
-
    def test_strange_dot_in_name(self):
        title = "WALL·E.2008.1080p.BluRay.3xRus.Ukr.Eng.HDCLUB-Skazhutin"
        res = parse_title(title)
@@ -192,3 +259,68 @@ class TestParser(unittest.TestCase):
                "unknown": ["HDCLUB-Skazhutin"],
            },
        )
+
+    def test_splitted_by_hyphens_chunk_ends_with_empty_part(self):
+        title = "The.IT.Crowd.S01E04.The.Red.Door.HR.DVDRip.HQ.Edition.x264-N-(Rus.Eng)"
+        res = parse_title(title)
+        self.assertEqual(
+            res,
+            {
+                "name": ["The", "IT", "Crowd"],
+                "episode": ["S01E04"],
+                "episode_name": ["The", "Red", "Door", "HR"],
+                "quality": ["DVDRip"],
+                "edition": ["HQ.Edition"],
+                "codec": ["x264"],
+                "unknown": ["N"],
+                "language": ["Rus", "Eng"],
+            },
+        )
+
+    def test_year_before_episode_number_and_name(self):
+        title = "The.Big.Bang.Theory.2019.S12E20.The.Decision.Reverberation.1080p.AMZN.WEB-DL.DD5.1.x264-NTb_EniaHD.mkv"
+        res = parse_title(title)
+        self.assertEqual(
+            res,
+            {
+                "name": ["The", "Big", "Bang", "Theory"],
+                "year": ["2019"],
+                "episode": ["S12E20"],
+                "episode_name": ["The", "Decision", "Reverberation"],
+                "unknown": ["NTb", "EniaHD"],
+                "resolution": ["1080p"],
+                "quality": ["AMZN", "WEB-DL"],
+                "audio": ["DD5.1"],
+                "codec": ["x264"],
+                "file_extension": ["mkv"],
+            },
+        )
+
+    def test_unknown_stripped_from_hyphens(self):
+        title = "The.Big.Bang.Theory.S04E06.720p.WEB-DL.eng.rus.[Kuraj-Bambey.Ru]-jhonny2.mkv"
+        res = parse_title(title)
+        self.assertIn("jhonny2", res.get("unknown", []))
+        self.assertEqual(
+            res,
+            {
+                "name": ["The", "Big", "Bang", "Theory"],
+                "episode": ["S04E06"],
+                "resolution": ["720p"],
+                "quality": ["WEB-DL"],
+                "language": ["eng", "rus"],
+                "unknown": ["Kuraj-Bambey", "Ru", "jhonny2"],
+                "file_extension": ["mkv"],
+            },
+        )
+
+    def test_episode_name_without_series_name(self):
+        title = "S27E01.Every Man's Dream.mkv"
+        res = parse_title(title)
+        self.assertEqual(
+            res,
+            {
+                "episode": ["S27E01"],
+                "episode_name": ["Every", "Man's", "Dream"],
+                "file_extension": ["mkv"],
+            },
+        )