ray-the-ripper/src/file_classifier.py

116 lines
3.5 KiB
Python

import logging
import imdbinfo as imdb
from models import PathInfo, FileInfo, PathCategory, FileCategory
from pathlib import Path
import os
import re
import math
import tmdb
def classify_show(info: PathInfo) -> PathInfo:
    """Walk a show directory and report which ``.mkv`` files look like episodes.

    A file is treated as an episode when its runtime in minutes, rounded
    either down or up, equals one of the episode runtimes IMDB lists for the
    show. Every ``.mkv`` file with a known duration is printed, matching
    files are tagged ``EPISODE``, and the number of matches is logged.

    Args:
        info: Path information for the show. ``info.imdb.imdb_id`` selects
            the IMDB episode list and ``info.path`` is the directory walked.

    Returns:
        The same ``info`` object that was passed in.
    """
    # Distinct episode runtimes reported by IMDB, each divided by 60 and
    # truncated. NOTE(review): presumably ep.duration is in seconds, making
    # these whole minutes -- confirm against imdbinfo.
    known_minutes: set[int] = {
        int(episode.duration / 60)
        for episode in imdb.get_all_episodes(info.imdb.imdb_id)
        if episode.duration
    }
    logging.debug(known_minutes)

    # Go over all files below the show directory
    matched = 0
    for folder, _subdirs, names in os.walk(info.path):
        for filename in names:
            file = FileInfo(Path(folder, filename), info)
            if file.path.suffix != ".mkv":
                continue
            seconds = file.duration_in_seconds
            if seconds is None:
                continue

            minutes = seconds / 60
            # Accept either rounding direction so a file that is a few
            # seconds shorter or longer than the listed runtime still matches.
            candidates = (math.floor(minutes), math.ceil(minutes))
            if known_minutes.isdisjoint(candidates):
                print(f"{filename} {minutes}")
            else:
                print(f"{filename} {minutes} EPISODE")
                matched += 1

    logging.info(f"Identified [{matched}] episodes.")
    return info
def classify_movie(info: PathInfo) -> PathInfo:
    """Placeholder for movie classification.

    Logs an error stating that movie classification is not implemented.

    Args:
        info: Path information for the movie directory.

    Returns:
        The same ``info`` object that was passed in.
    """
    message = "Movie classification not yet implemented."
    logging.error(message)
    return info
def classify_files(path: str) -> PathInfo | None:
    """Identify the title stored in a media directory and classify its files.

    The final component of ``path`` must have the form ``Title (YYYY)``. That
    name is searched on IMDB and the first result is accepted only when its
    title (ignoring ``:`` and ``-``) and its year both equal the values parsed
    from the directory name. The accepted entry is wrapped in a ``PathInfo``
    whose ``category`` is set from the IMDB ``kind`` and which is then passed
    to ``classify_show`` or ``classify_movie``.

    Args:
        path: Directory whose name has the form ``Title (YYYY)``.

    Returns:
        The populated ``PathInfo``, or ``None`` when the directory name cannot
        be parsed, the IMDB search yields no titles, or the first IMDB result
        does not match the parsed title and year.
    """
    p = Path(path)

    # Extract title and year
    match = re.match(r"^(?P<title>.+) \((?P<year>[0-9]{4})\)$", p.name)
    if not match:
        logging.error(
            f"Could not extract title and year from directory name [{p.name}]."
        )
        return None
    title = match.group("title")
    year = int(match.group("year"))
    logging.info(f"Information extracted. Year: [{year}] Title: [{title}]")

    # Fetch from IMDB
    results = imdb.search_title(p.name)
    # A result object with an empty title list must be treated like "no
    # result"; indexing titles[0] on it would raise IndexError.
    if results is None or not results.titles:
        logging.error(f"No IMDB results found for query [{p.name}].")
        return None
    imdb_entry = results.titles[0]

    # Colons and hyphens are dropped on both sides before comparing titles.
    # NOTE(review): presumably because directory names cannot always carry
    # them -- confirm.
    ignored = str.maketrans("", "", ":-")
    if imdb_entry.title.translate(ignored) != title.translate(ignored):
        logging.error(
            f"IMDB result title does not match. Expected: [{title}] Actual: [{imdb_entry.title}]"
        )
        return None
    if imdb_entry.year != year:
        logging.error(
            f"IMDB result year does not match. Expected: [{year}] Actual: [{imdb_entry.year}]"
        )
        return None
    logging.info(f"Found matching IMDB entry with id [{imdb_entry.imdb_id}].")

    # NOTE(review): search_show runs before the category is known, so it is
    # also called for movies and unknown kinds -- confirm that is intended.
    # NOTE(review): this passes imdb_entry.id while the rest of the function
    # uses imdb_entry.imdb_id -- confirm which one tmdb.search_show expects.
    tmdb_id = tmdb.search_show(imdb_entry.id, imdb_entry.title, year)
    info = PathInfo(
        path=p,
        title=imdb_entry.title,
        year=year,
        imdb_id=imdb_entry.imdb_id,
        imdb=imdb_entry,
        tmdb_id=tmdb_id,
    )

    # Identify category
    if imdb_entry.kind == "tvSeries":
        info.category = PathCategory.SHOW
        logging.info("Path identified as containing SHOW.")
        info = classify_show(info)
    elif imdb_entry.kind == "movie":
        info.category = PathCategory.MOVIE
        logging.info("Path identified as containing MOVIE.")
        info = classify_movie(info)
    else:
        info.category = PathCategory.UNKNOWN
        logging.error(
            f"IMDB entry has unknown qualifier for content [{imdb_entry.kind}]."
        )
    return info
if __name__ == "__main__":
    import sys

    # Manual test run: classify the directory given as the first command-line
    # argument, falling back to the developer's sample directory when no
    # argument is supplied.
    target = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "/home/max/Media Library/testing/The Mentalist (2008)"
    )
    results = classify_files(target)
    print(results)
    print("Done.")