Refactored parsing of search results and covering all known cases for now
This commit is contained in:
parent
2eed3e1cdf
commit
022a41723b
1 changed file with 37 additions and 26 deletions
|
@@ -57,37 +57,47 @@ class DvdCompare:
|
||||||
# Parse QoL values
|
# Parse QoL values
|
||||||
result.fid = int(result.url.split("?fid=")[1])
|
result.fid = int(result.url.split("?fid=")[1])
|
||||||
|
|
||||||
label_matches = re.match(
|
result.formats = []
|
||||||
r"^(?P<titles>(?:[^()]+\s(?:\((?:The|A|An)\))?(?:(?P<formats>(?:\([^()]+\)\s?)*)\s)?(?:\s?AKA\s?)?)+)\s?\((?P<years>(?P<year>[0-9]{4})(?:[-,][0-9]{4})?)\)$",
|
result.titles = []
|
||||||
result.label.replace("", "-"),
|
for part in (
|
||||||
re.IGNORECASE,
|
result.label.replace("", "-").replace("\x96", "-").split(" AKA ")
|
||||||
)
|
): # Iterate over alternative titles and collect information
|
||||||
|
part_matches = re.match(
|
||||||
|
r"^(?P<title>[^()\n]*[^()\s])(?:\s\((?P<prefix>The|A|An)\)(?P<suffix>[^()\n]+)?)?(?P<formats> ?(?:\([^()]*[a-z]+[^()]*\)\s?)+)?(?P<years>(?: ?\((?P<range>(?P<year>[0-9]{4})(?:[-,/][0-9]{1,2}(?:[0-9]{2})?)?)\)))?",
|
||||||
|
part,
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
|
||||||
if not label_matches:
|
if part_matches is None:
|
||||||
|
raise UnknownFormatError(
|
||||||
|
f"Label part in unknown format that could not be parsed: {part}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Title
|
||||||
|
title = f"{part_matches['title']}{part_matches['suffix']}".strip()
|
||||||
|
if part_matches["prefix"]:
|
||||||
|
title = f"{part_matches['prefix']} {title}"
|
||||||
|
result.titles.append(title)
|
||||||
|
|
||||||
|
# Formats
|
||||||
|
if part_matches["formats"]:
|
||||||
|
result.formats = [
|
||||||
|
m
|
||||||
|
for m in re.findall(
|
||||||
|
r"\((?P<format>[^()]+)\)", part_matches["formats"]
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
# Years
|
||||||
|
if part_matches["years"]:
|
||||||
|
result.year = int(part_matches["year"])
|
||||||
|
result.years = part_matches["years"]
|
||||||
|
|
||||||
|
if len(result.titles) <= 0:
|
||||||
raise UnknownFormatError(
|
raise UnknownFormatError(
|
||||||
f"Label in unknown format that could not be parsed: {result.label}"
|
f"Label in unknown format that could not be parsed: {result.label}"
|
||||||
)
|
)
|
||||||
|
|
||||||
result.formats = []
|
|
||||||
if label_matches["formats"]:
|
|
||||||
result.formats = [
|
|
||||||
m
|
|
||||||
for m in re.findall(r"\((?P<format>[^()]+)\)", label_matches["formats"])
|
|
||||||
]
|
|
||||||
result.year = int(label_matches["year"])
|
|
||||||
result.years = label_matches["years"]
|
|
||||||
result.titles = []
|
|
||||||
for raw_title in label_matches["titles"].strip().split(" AKA "):
|
|
||||||
title_match = re.match(
|
|
||||||
r"(?P<title>[^()]+[^() ])(?:\s\((?P<prefix>The|A|An)\))?(?:\s\([^()]+\))?",
|
|
||||||
raw_title,
|
|
||||||
re.IGNORECASE,
|
|
||||||
)
|
|
||||||
title = title_match["title"]
|
|
||||||
if title_match["prefix"]:
|
|
||||||
title = f"{title_match['prefix']} {title}"
|
|
||||||
result.titles.append(title)
|
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
async def search_async(
|
async def search_async(
|
||||||
|
@@ -157,6 +167,7 @@ if __name__ == "__main__":
|
||||||
results = await DvdCompare.search_async("The")
|
results = await DvdCompare.search_async("The")
|
||||||
if results:
|
if results:
|
||||||
print(results[0])
|
print(results[0])
|
||||||
|
print(f"And {len(results)} additional results")
|
||||||
else:
|
else:
|
||||||
print("No results found.")
|
print("No results found.")
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue