Implemented more formats for label parsing during search

2025-02-18 17:52:02 +01:00 · 2025-02-18 17:52:02 +01:00 · f09c3eb928
commit f09c3eb928
parent 1e407d1ad2
1 changed files with 44 additions and 16 deletions
--- a/src/dvd-compare-python/dvd_compare.py
+++ b/src/dvd-compare-python/dvd_compare.py
@@ -3,6 +3,10 @@ from bs4 import BeautifulSoup
 import re


+class UnknownFormatError(Exception):
+    pass
+
+
 class DcSearchResult:
    """Simple container for search results, containing basic details."""

@@ -10,13 +14,15 @@ class DcSearchResult:
        self.label: str
        """Unprocessed label returned by search."""
        self.year: int
-        """Year of media parsed from label. Parsing has not been tested on all cases."""
-        self.title: str
-        """Title of media parsed from label. Parsing has not been tested on all cases."""
+        """Earliest year of media parsed from label. Parsing has not been tested on all cases; some errors might occur."""
+        self.years: str
+        """Entire year value, which might indicate a range, parsed from label. Parsing has not been tested on all cases; some errors might occur."""
+        self.titles: list[str]
+        """List of constructed titles parsed from label. Media can have multiple alternative titles, but most often there will be just one. Parsing has not been tested on all cases; some errors might occur."""
        self.tagline: str
        """Unprocessed tagline returned by search."""
-        self.format: str
-        """Format (DVD, Blu-Ray, etc.) parsed from label. Defaults to DVD if no explicit format is given. Parsing has not been tested on all cases."""
+        self.formats: list[str]
+        """List of formats (TV, short, Blu-Ray, etc.) parsed from label. Might be empty if none are given. Parsing has not been tested on all cases; some errors might occur."""
        self.fid: int
        """FID, used to identify media by dvdcompare.net, parsed from URL. Can be used to retrive further details."""
        self.url: str
@@ -25,7 +31,7 @@ class DcSearchResult:
        """Description, listing available publications to label, as returned by search."""

    def __repr__(self):
-        return f"{self.label}\n{self.title} | {self.format} | {self.year}\n{self.tagline}\n[{self.fid} | {self.url}]\n{self.description}\n"
+        return f"{self.label}\n{self.tagline}\n{self.year} | {' | '.join(self.formats)}\n{' | '.join(self.titles)}\n[{self.fid} | {self.url}]\n{self.description}\n"


 class DvdCompare:
@@ -49,19 +55,38 @@ class DvdCompare:
        result.tagline = item.strong.a.get("title")

        # Parse QoL values
+        result.fid = int(result.url.split("?fid=")[1])
+
        label_matches = re.match(
-            r"^(?P<title>[^()]+)\s(?P<the>\(The\))?\s?(?:\((?P<format>[^()]+)\))?\s?\((?P<year>[0-9]{4})\)$",
-            result.label,
+            r"^(?P<titles>(?:[^()]+\s(?:\((?:The|A|An)\))?(?:(?P<formats>(?:\([^()]+\)\s?)*)\s)?(?:\s?AKA\s?)?)+)\s?\((?P<years>(?P<year>[0-9]{4})(?:[-,][0-9]{4})?)\)$",
+            result.label.replace("\u2013", "-"),
            re.IGNORECASE,
        )

-        result.format = label_matches["format"] if label_matches["format"] else "DVD"
-        result.year = int(label_matches["year"])
-        result.title = label_matches["title"]
-        if label_matches["the"]:
-            result.title = f"The {result.title}"
+        if not label_matches:
+            raise UnknownFormatError(
+                f"Label in unknown format that could not be parsed: {result.label}"
+            )

-        result.fid = int(result.url.split("?fid=")[1])
+        result.formats = []
+        if label_matches["formats"]:
+            result.formats = [
+                m
+                for m in re.findall(r"\((?P<format>[^()]+)\)", label_matches["formats"])
+            ]
+        result.year = int(label_matches["year"])
+        result.years = label_matches["years"]
+        result.titles = []
+        for raw_title in label_matches["titles"].strip().split(" AKA "):
+            title_match = re.match(
+                r"(?P<title>[^()]+[^() ])(?:\s\((?P<prefix>The|A|An)\))?(?:\s\([^()]+\))?",
+                raw_title,
+                re.IGNORECASE,
+            )
+            title = title_match["title"]
+            if title_match["prefix"]:
+                title = f"{title_match['prefix']} {title}"
+            result.titles.append(title)

        return result

@@ -127,7 +152,10 @@ if __name__ == "__main__":
    import asyncio

    async def test():
-        results = await DvdCompare.search_async("Truman", year=1998)
+        results = await DvdCompare.search_async("The")
+        if results:
            print(results[0])
+        else:
+            print("No results found.")

    asyncio.run(test())