From 022a41723b2f3e46c30dc271c80cc70eae82994f Mon Sep 17 00:00:00 2001 From: Maximilian Giller Date: Wed, 19 Feb 2025 06:15:54 +0100 Subject: [PATCH] Refactored parsing of search results and covering all known cases for know --- src/dvd-compare-python/dvd_compare.py | 63 ++++++++++++++++----------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/src/dvd-compare-python/dvd_compare.py b/src/dvd-compare-python/dvd_compare.py index fd28100..e811410 100644 --- a/src/dvd-compare-python/dvd_compare.py +++ b/src/dvd-compare-python/dvd_compare.py @@ -57,37 +57,47 @@ class DvdCompare: # Parse QoL values result.fid = int(result.url.split("?fid=")[1]) - label_matches = re.match( - r"^(?P<titles>(?:[^()]+\s(?:\((?:The|A|An)\))?(?:(?P<formats>(?:\([^()]+\)\s?)*)\s)?(?:\s?AKA\s?)?)+)\s?\((?P<years>(?P<year>[0-9]{4})(?:[-,][0-9]{4})?)\)$", - result.label.replace("–", "-"), - re.IGNORECASE, - ) + result.formats = [] + result.titles = [] + for part in ( + result.label.replace("–", "-").replace("\x96–", "-").split(" AKA ") + ): # Iterate over alternative titles and collect information + part_matches = re.match( + r"^(?P<title>[^()\n]*[^()\s])(?:\s\((?P<prefix>The|A|An)\)(?P<suffix>[^()\n]+)?)?(?P<formats> ?(?:\([^()]*[a-z]+[^()]*\)\s?)+)?(?P<years>(?: ?\((?P<range>(?P<year>[0-9]{4})(?:[-,/][0-9]{1,2}(?:[0-9]{2})?)?)\)))?", + part, + re.IGNORECASE, + ) - if not label_matches: + if part_matches is None: + raise UnknownFormatError( + f"Label part in unknown format that could not be parsed: {part}" + ) + + # Title + title = f"{part_matches['title']}{part_matches['suffix']}".strip() + if part_matches["prefix"]: + title = f"{part_matches['prefix']} {title}" + result.titles.append(title) + + # Formats + if part_matches["formats"]: + result.formats = [ + m + for m in re.findall( + r"\((?P<format>[^()]+)\)", part_matches["formats"] + ) + ] + + # Years + if part_matches["years"]: + result.year = int(part_matches["year"]) + result.years = part_matches["years"] + + if len(result.titles) <= 0: raise 
UnknownFormatError( f"Label in unknown format that could not be parsed: {result.label}" ) - result.formats = [] - if label_matches["formats"]: - result.formats = [ - m - for m in re.findall(r"\((?P<format>[^()]+)\)", label_matches["formats"]) - ] - result.year = int(label_matches["year"]) - result.years = label_matches["years"] - result.titles = [] - for raw_title in label_matches["titles"].strip().split(" AKA "): - title_match = re.match( - r"(?P<title>[^()]+[^() ])(?:\s\((?P<prefix>The|A|An)\))?(?:\s\([^()]+\))?", - raw_title, - re.IGNORECASE, - ) - title = title_match["title"] - if title_match["prefix"]: - title = f"{title_match['prefix']} {title}" - result.titles.append(title) - return result async def search_async( @@ -157,6 +167,7 @@ if __name__ == "__main__": results = await DvdCompare.search_async("The") if results: print(results[0]) + print(f"And {len(results)} additional results") else: print("No results found.")