Implemented more formats for label parsing during search

This commit is contained in:
Maximilian Giller 2025-02-18 17:52:02 +01:00
parent 1e407d1ad2
commit f09c3eb928

View file

@ -3,6 +3,10 @@ from bs4 import BeautifulSoup
import re import re
class UnknownFormatError(Exception):
pass
class DcSearchResult: class DcSearchResult:
"""Simple container for search results, containing basic details.""" """Simple container for search results, containing basic details."""
@ -10,13 +14,15 @@ class DcSearchResult:
self.label: str self.label: str
"""Unprocessed label returned by search.""" """Unprocessed label returned by search."""
self.year: int self.year: int
"""Year of media parsed from label. Parsing has not been tested on all cases.""" """Earliest year of media parsed from label. Parsing has not been tested on all cases, some errors might occur."""
self.title: str self.years: str
"""Title of media parsed from label. Parsing has not been tested on all cases.""" """Entire year value, might indicate a range, parsed from label. Parsing has not been tested on all cases, some errors might occur."""
self.titles: list[str]
"""List of constructed titles parsed from label. Media can have multiple alternative titles, but will most often be just one. Parsing has not been tested on all cases, some errors might occur."""
self.tagline: str self.tagline: str
"""Unprocessed tagline returned by search.""" """Unprocessed tagline returned by search."""
self.format: str self.formats: list[str]
"""Format (DVD, Blu-Ray, etc.) parsed from label. Defaults to DVD if no explicit format is given. Parsing has not been tested on all cases.""" """List of formats (TV, short, Blu-Ray, etc.) parsed from label. Might be empty if none given. Parsing has not been tested on all cases, some errors might occur."""
self.fid: int self.fid: int
"""FID, used to identify media by dvdcompare.net, parsed from URL. Can be used to retrieve further details.""" """FID, used to identify media by dvdcompare.net, parsed from URL. Can be used to retrieve further details."""
self.url: str self.url: str
@ -25,7 +31,7 @@ class DcSearchResult:
"""Description, listing available publications to label, as returned by search.""" """Description, listing available publications to label, as returned by search."""
def __repr__(self): def __repr__(self):
return f"{self.label}\n{self.title} | {self.format} | {self.year}\n{self.tagline}\n[{self.fid} | {self.url}]\n{self.description}\n" return f"{self.label}\n{self.tagline}\n{self.year} | {' | '.join(self.formats)}\n{' | '.join(self.titles)}\n[{self.fid} | {self.url}]\n{self.description}\n"
class DvdCompare: class DvdCompare:
@ -49,19 +55,38 @@ class DvdCompare:
result.tagline = item.strong.a.get("title") result.tagline = item.strong.a.get("title")
# Parse QoL values # Parse QoL values
result.fid = int(result.url.split("?fid=")[1])
label_matches = re.match( label_matches = re.match(
r"^(?P<title>[^()]+)\s(?P<the>\(The\))?\s?(?:\((?P<format>[^()]+)\))?\s?\((?P<year>[0-9]{4})\)$", r"^(?P<titles>(?:[^()]+\s(?:\((?:The|A|An)\))?(?:(?P<formats>(?:\([^()]+\)\s?)*)\s)?(?:\s?AKA\s?)?)+)\s?\((?P<years>(?P<year>[0-9]{4})(?:[-,][0-9]{4})?)\)$",
result.label, result.label.replace("–", "-"),
re.IGNORECASE, re.IGNORECASE,
) )
result.format = label_matches["format"] if label_matches["format"] else "DVD" if not label_matches:
result.year = int(label_matches["year"]) raise UnknownFormatError(
result.title = label_matches["title"] f"Label in unknown format that could not be parsed: {result.label}"
if label_matches["the"]: )
result.title = f"The {result.title}"
result.fid = int(result.url.split("?fid=")[1]) result.formats = []
if label_matches["formats"]:
result.formats = [
m
for m in re.findall(r"\((?P<format>[^()]+)\)", label_matches["formats"])
]
result.year = int(label_matches["year"])
result.years = label_matches["years"]
result.titles = []
for raw_title in label_matches["titles"].strip().split(" AKA "):
title_match = re.match(
r"(?P<title>[^()]+[^() ])(?:\s\((?P<prefix>The|A|An)\))?(?:\s\([^()]+\))?",
raw_title,
re.IGNORECASE,
)
title = title_match["title"]
if title_match["prefix"]:
title = f"{title_match['prefix']} {title}"
result.titles.append(title)
return result return result
@ -127,7 +152,10 @@ if __name__ == "__main__":
import asyncio import asyncio
async def test(): async def test():
results = await DvdCompare.search_async("Truman", year=1998) results = await DvdCompare.search_async("The")
print(results[0]) if results:
print(results[0])
else:
print("No results found.")
asyncio.run(test()) asyncio.run(test())