From f09c3eb92800c98b32d6e56ce1ec29e63c2e044e Mon Sep 17 00:00:00 2001 From: Max Date: Tue, 18 Feb 2025 17:52:02 +0100 Subject: [PATCH] Implemented more formats for label parsing during search --- src/dvd-compare-python/dvd_compare.py | 60 ++++++++++++++++++++------- 1 file changed, 44 insertions(+), 16 deletions(-) diff --git a/src/dvd-compare-python/dvd_compare.py b/src/dvd-compare-python/dvd_compare.py index 156a51e..696adbd 100644 --- a/src/dvd-compare-python/dvd_compare.py +++ b/src/dvd-compare-python/dvd_compare.py @@ -3,6 +3,10 @@ from bs4 import BeautifulSoup import re +class UnknownFormatError(Exception): + pass + + class DcSearchResult: """Simple container for search results, containing basic details.""" @@ -10,13 +14,15 @@ class DcSearchResult: self.label: str """Unprocessed label returned by search.""" self.year: int - """Year of media parsed from label. Parsing has not been tested on all cases.""" - self.title: str - """Title of media parsed from label. Parsing has not been tested on all cases.""" + """Earliest year of media parsed from label. Parsing has not been tested on all cases, some errors might occur.""" + self.years: str + """Entire year value, might indicate a range, parsed from label. Parsing has not been tested on all cases, some errors might occur.""" + self.titles: list[str] + """List of constructed titles parsed from label. Media can have multiple alternative titles, but will most often be just one. Parsing has not been tested on all cases, some errors might occur.""" self.tagline: str """Unprocessed tagline returned by search.""" - self.format: str - """Format (DVD, Blu-Ray, etc.) parsed from label. Defaults to DVD if no explicit format is given. Parsing has not been tested on all cases.""" + self.formats: list[str] + """List of formats (TV, short, Blu-Ray, etc.) parsed from label. Might be empty if none given. 
Parsing has not been tested on all cases, some errors might occur.""" self.fid: int """FID, used to identify media by dvdcompare.net, parsed from URL. Can be used to retrive further details.""" self.url: str @@ -25,7 +31,7 @@ class DcSearchResult: """Description, listing available publications to label, as returned by search.""" def __repr__(self): - return f"{self.label}\n{self.title} | {self.format} | {self.year}\n{self.tagline}\n[{self.fid} | {self.url}]\n{self.description}\n" + return f"{self.label}\n{self.tagline}\n{self.year} | {' | '.join(self.formats)}\n{' | '.join(self.titles)}\n[{self.fid} | {self.url}]\n{self.description}\n" class DvdCompare: @@ -49,19 +55,38 @@ class DvdCompare: result.tagline = item.strong.a.get("title") # Parse QoL values + result.fid = int(result.url.split("?fid=")[1]) + label_matches = re.match( - r"^(?P[^()]+)\s(?P<the>\(The\))?\s?(?:\((?P<format>[^()]+)\))?\s?\((?P<year>[0-9]{4})\)$", - result.label, + r"^(?P<titles>(?:[^()]+\s(?:\((?:The|A|An)\))?(?:(?P<formats>(?:\([^()]+\)\s?)*)\s)?(?:\s?AKA\s?)?)+)\s?\((?P<years>(?P<year>[0-9]{4})(?:[-,][0-9]{4})?)\)$", + result.label.replace("–", "-"), re.IGNORECASE, ) - result.format = label_matches["format"] if label_matches["format"] else "DVD" - result.year = int(label_matches["year"]) - result.title = label_matches["title"] - if label_matches["the"]: - result.title = f"The {result.title}" + if not label_matches: + raise UnknownFormatError( + f"Label in unknown format that could not be parsed: {result.label}" + ) - result.fid = int(result.url.split("?fid=")[1]) + result.formats = [] + if label_matches["formats"]: + result.formats = [ + m + for m in re.findall(r"\((?P<format>[^()]+)\)", label_matches["formats"]) + ] + result.year = int(label_matches["year"]) + result.years = label_matches["years"] + result.titles = [] + for raw_title in label_matches["titles"].strip().split(" AKA "): + title_match = re.match( + r"(?P<title>[^()]+[^() ])(?:\s\((?P<prefix>The|A|An)\))?(?:\s\([^()]+\))?", 
raw_title, + re.IGNORECASE, + ) + title = title_match["title"] + if title_match["prefix"]: + title = f"{title_match['prefix']} {title}" + result.titles.append(title) return result @@ -127,7 +152,10 @@ if __name__ == "__main__": import asyncio async def test(): - results = await DvdCompare.search_async("Truman", year=1998) - print(results[0]) + results = await DvdCompare.search_async("The") + if results: + print(results[0]) + else: + print("No results found.") asyncio.run(test())