Implemented more formats for label parsing during search

This commit is contained in:
Maximilian Giller 2025-02-18 17:52:02 +01:00
parent 1e407d1ad2
commit f09c3eb928

View file

@ -3,6 +3,10 @@ from bs4 import BeautifulSoup
import re
class UnknownFormatError(Exception):
    """Raised when a search result label is in a format that cannot be parsed."""
class DcSearchResult:
"""Simple container for search results, containing basic details."""
@ -10,13 +14,15 @@ class DcSearchResult:
self.label: str
"""Unprocessed label returned by search."""
self.year: int
"""Year of media parsed from label. Parsing has not been tested on all cases."""
self.title: str
"""Title of media parsed from label. Parsing has not been tested on all cases."""
"""Earliest year of media parsed from label. Parsing has not been tested on all cases, some errors might occur."""
self.years: str
"""Entire year value, might indicate a range, parsed from label. Parsing has not been tested on all cases, some errors might occur."""
self.titles: list[str]
"""List of constructed titles parsed from label. Media can have multiple alternative titles, but will most often be just one. Parsing has not been tested on all cases, some errors might occur."""
self.tagline: str
"""Unprocessed tagline returned by search."""
self.format: str
"""Format (DVD, Blu-Ray, etc.) parsed from label. Defaults to DVD if no explicit format is given. Parsing has not been tested on all cases."""
self.formats: list[str]
"""List of formats (TV, short, Blu-Ray, etc.) parsed from label. Might be empty if none given. Parsing has not been tested on all cases, some errors might occur."""
self.fid: int
"""FID, used to identify media by dvdcompare.net, parsed from URL. Can be used to retrieve further details."""
self.url: str
@ -25,7 +31,7 @@ class DcSearchResult:
"""Description, listing available publications to label, as returned by search."""
def __repr__(self):
return f"{self.label}\n{self.title} | {self.format} | {self.year}\n{self.tagline}\n[{self.fid} | {self.url}]\n{self.description}\n"
return f"{self.label}\n{self.tagline}\n{self.year} | {' | '.join(self.formats)}\n{' | '.join(self.titles)}\n[{self.fid} | {self.url}]\n{self.description}\n"
class DvdCompare:
@ -49,19 +55,38 @@ class DvdCompare:
result.tagline = item.strong.a.get("title")
# Parse QoL values
result.fid = int(result.url.split("?fid=")[1])
label_matches = re.match(
r"^(?P<title>[^()]+)\s(?P<the>\(The\))?\s?(?:\((?P<format>[^()]+)\))?\s?\((?P<year>[0-9]{4})\)$",
result.label,
r"^(?P<titles>(?:[^()]+\s(?:\((?:The|A|An)\))?(?:(?P<formats>(?:\([^()]+\)\s?)*)\s)?(?:\s?AKA\s?)?)+)\s?\((?P<years>(?P<year>[0-9]{4})(?:[-,][0-9]{4})?)\)$",
result.label.replace("–", "-"),
re.IGNORECASE,
)
result.format = label_matches["format"] if label_matches["format"] else "DVD"
result.year = int(label_matches["year"])
result.title = label_matches["title"]
if label_matches["the"]:
result.title = f"The {result.title}"
if not label_matches:
raise UnknownFormatError(
f"Label in unknown format that could not be parsed: {result.label}"
)
result.fid = int(result.url.split("?fid=")[1])
result.formats = []
if label_matches["formats"]:
result.formats = [
m
for m in re.findall(r"\((?P<format>[^()]+)\)", label_matches["formats"])
]
result.year = int(label_matches["year"])
result.years = label_matches["years"]
result.titles = []
for raw_title in label_matches["titles"].strip().split(" AKA "):
title_match = re.match(
r"(?P<title>[^()]+[^() ])(?:\s\((?P<prefix>The|A|An)\))?(?:\s\([^()]+\))?",
raw_title,
re.IGNORECASE,
)
title = title_match["title"]
if title_match["prefix"]:
title = f"{title_match['prefix']} {title}"
result.titles.append(title)
return result
@ -127,7 +152,10 @@ if __name__ == "__main__":
import asyncio
async def test():
results = await DvdCompare.search_async("Truman", year=1998)
results = await DvdCompare.search_async("The")
if results:
print(results[0])
else:
print("No results found.")
asyncio.run(test())