Refactored parsing of search results, covering all known cases for now

This commit is contained in:
Maximilian Giller 2025-02-19 06:15:54 +01:00
parent 2eed3e1cdf
commit 022a41723b

View file

@ -57,37 +57,47 @@ class DvdCompare:
# Parse QoL values
result.fid = int(result.url.split("?fid=")[1])
label_matches = re.match(
r"^(?P<titles>(?:[^()]+\s(?:\((?:The|A|An)\))?(?:(?P<formats>(?:\([^()]+\)\s?)*)\s)?(?:\s?AKA\s?)?)+)\s?\((?P<years>(?P<year>[0-9]{4})(?:[-,][0-9]{4})?)\)$",
result.label.replace("–", "-"),
result.formats = []
result.titles = []
for part in (
result.label.replace("–", "-").replace("\x96–", "-").split(" AKA ")
): # Iterate over alternative titles and collect information
part_matches = re.match(
r"^(?P<title>[^()\n]*[^()\s])(?:\s\((?P<prefix>The|A|An)\)(?P<suffix>[^()\n]+)?)?(?P<formats> ?(?:\([^()]*[a-z]+[^()]*\)\s?)+)?(?P<years>(?: ?\((?P<range>(?P<year>[0-9]{4})(?:[-,/][0-9]{1,2}(?:[0-9]{2})?)?)\)))?",
part,
re.IGNORECASE,
)
if not label_matches:
if part_matches is None:
raise UnknownFormatError(
f"Label part in unknown format that could not be parsed: {part}"
)
# Title
title = f"{part_matches['title']}{part_matches['suffix']}".strip()
if part_matches["prefix"]:
title = f"{part_matches['prefix']} {title}"
result.titles.append(title)
# Formats
if part_matches["formats"]:
result.formats = [
m
for m in re.findall(
r"\((?P<format>[^()]+)\)", part_matches["formats"]
)
]
# Years
if part_matches["years"]:
result.year = int(part_matches["year"])
result.years = part_matches["years"]
if len(result.titles) <= 0:
raise UnknownFormatError(
f"Label in unknown format that could not be parsed: {result.label}"
)
result.formats = []
if label_matches["formats"]:
result.formats = [
m
for m in re.findall(r"\((?P<format>[^()]+)\)", label_matches["formats"])
]
result.year = int(label_matches["year"])
result.years = label_matches["years"]
result.titles = []
for raw_title in label_matches["titles"].strip().split(" AKA "):
title_match = re.match(
r"(?P<title>[^()]+[^() ])(?:\s\((?P<prefix>The|A|An)\))?(?:\s\([^()]+\))?",
raw_title,
re.IGNORECASE,
)
title = title_match["title"]
if title_match["prefix"]:
title = f"{title_match['prefix']} {title}"
result.titles.append(title)
return result
async def search_async(
@ -157,6 +167,7 @@ if __name__ == "__main__":
results = await DvdCompare.search_async("The")
if results:
print(results[0])
print(f"And {len(results)} additional results")
else:
print("No results found.")