Ever wondered how to get personalized movie recommendations based on your mood? In this project, I built CineMood, a mood-based trending movie recommendation web app, from scratch using Python, Hugging Face, XGBoost, and the TMDb API. The app analyzes trending movies, classifies them by mood, and delivers real-time recommendations.

In this post, I'll walk you through the entire process of an end-to-end machine learning project, from data collection and model training to deploying the app on Hugging Face Spaces.

CineMood recommends movies based on six emotions: joy, sadness, love, anger, fear, and surprise.

The web app:

- fetches this week's trending movies from the TMDb API,
- classifies each movie's overview into one of the six moods with a trained XGBoost model, and
- returns three fresh recommendations that match the mood you select.

Key Features:

- Mood picker with emoji labels for the six emotions
- Trending movie list cached and refreshed once per week
- Posters, overviews, and release dates shown for every recommendation
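The six moods come straight from the label set of the `bhadresh-savani/bert-base-uncased-emotion` model used later in the dataset script. As a quick illustration of what that classifier returns (a minimal sketch, assuming `transformers` is installed and the model downloads on first run):

```python
# Peek at the emotion classifier's output format.
from transformers import pipeline

classifier = pipeline("text-classification", model="bhadresh-savani/bert-base-uncased-emotion")
print(classifier("A heartwarming story about an unlikely friendship that changes two lives."))
# Example output: [{'label': 'joy', 'score': 0.99}] -- the label is one of the six moods
```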
Create a directory structure:
```
cinemood_project/
├── app.py                           # Streamlit web app
├── data/
│   └── movie_mood_dataset.csv       # Generated dataset
├── models/
│   ├── tfidf_vectorizer.pkl         # TF-IDF vectorizer for movie overviews
│   └── xgb_mood_classifier.pkl      # Trained XGBoost model
├── generate_movie_mood_dataset.py   # Script to generate the dataset
├── train_model.py                   # Model training script
├── requirements.txt                 # List of dependencies
└── README.md                        # Project documentation
```
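If you prefer to scaffold the skeleton from Python rather than by hand, a few (purely optional) lines will do; the file contents get filled in as we go:

```python
# Optional: create the project skeleton programmatically.
from pathlib import Path

for d in ["cinemood_project/data", "cinemood_project/models"]:
    Path(d).mkdir(parents=True, exist_ok=True)  # create nested folders

for f in ["app.py", "generate_movie_mood_dataset.py", "train_model.py",
          "requirements.txt", "README.md"]:
    Path("cinemood_project", f).touch()  # create empty placeholder files
```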
Create a file `generate_movie_mood_dataset.py` as below to generate the dataset `movie_mood_dataset.csv`:
```python
import os
import requests
import pandas as pd
from dotenv import load_dotenv
from transformers import pipeline
from tqdm import tqdm

# Load API keys from .env file
load_dotenv()
TMDB_API_KEY = os.getenv("TMDB_API_KEY")

# Hugging Face emotion classification model
classifier = pipeline("text-classification", model="bhadresh-savani/bert-base-uncased-emotion")

# TMDb API endpoints
TMDB_ENDPOINTS = [
    "https://api.themoviedb.org/3/trending/movie/week",
    "https://api.themoviedb.org/3/movie/top_rated",
    "https://api.themoviedb.org/3/movie/popular",
]

# Define target samples per mood
TARGET_SAMPLES_PER_MOOD = 200

# Dictionary to store movies per mood
movie_moods = {
    "joy": [], "sadness": [], "love": [], "anger": [], "fear": [], "surprise": []
}

# Set to track unique movie titles
unique_movie_titles = set()


def get_movies_from_tmdb(endpoint, page=1):
    """Fetch movies from the TMDb API for the given endpoint and page number."""
    try:
        response = requests.get(endpoint, params={"api_key": TMDB_API_KEY, "page": page}, timeout=10)
        response.raise_for_status()
        return response.json().get("results", [])
    except requests.exceptions.RequestException as e:
        print(f"❌ Error fetching movies from {endpoint}: {e}")
        return []


def classify_mood(movie_overview):
    """Classify a movie's mood using the Hugging Face emotion classifier."""
    if not movie_overview or len(movie_overview) < 10:
        return None
    try:
        # Truncate long overviews so they fit the model's maximum input length
        result = classifier(movie_overview, truncation=True)
        mood = result[0]["label"]
        return mood if mood in movie_moods else None
    except Exception as e:
        print(f"❌ Error during mood classification: {e}")
        return None


def collect_movie_data():
    """Fetch movies, classify moods, and collect up to 200 samples per mood."""
    for endpoint in TMDB_ENDPOINTS:
        print(f"🎥 Fetching movies from {endpoint}...")
        page = 1
        while not all(len(movies) >= TARGET_SAMPLES_PER_MOOD for movies in movie_moods.values()):
            movies = get_movies_from_tmdb(endpoint, page)
            if not movies:
                break
            for movie in tqdm(movies, desc=f"Processing page {page}"):
                title, overview = movie.get("title"), movie.get("overview")
                if not title or not overview or title in unique_movie_titles:
                    continue
                mood = classify_mood(overview)
                if mood and len(movie_moods[mood]) < TARGET_SAMPLES_PER_MOOD:
                    movie_moods[mood].append({"Movie_Title": title, "Overview": overview, "Mood": mood})
                    unique_movie_titles.add(title)
            page += 1

        # Stop when each mood reaches its target
        if all(len(movies) >= TARGET_SAMPLES_PER_MOOD for movies in movie_moods.values()):
            break


def save_dataset():
    """Save the collected movie data into a CSV file."""
    all_movies = []
    for mood, movies in movie_moods.items():
        all_movies.extend(movies)
    df = pd.DataFrame(all_movies)
    os.makedirs("data", exist_ok=True)  # Make sure the output directory exists
    df.to_csv("data/movie_mood_dataset.csv", index=False)
    print("✅ Movie mood dataset saved as movie_mood_dataset.csv")


if __name__ == "__main__":
    print("🚀 Collecting movies and ensuring 200 per mood...")
    collect_movie_data()
    save_dataset()
    print("🎬 Dataset generation complete!")
```
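Before moving on to training, it's worth a quick sanity check on the generated CSV. A minimal sketch like this (assuming the script above completed) confirms the six moods are roughly balanced:

```python
# Sanity-check the generated dataset.
import pandas as pd

df = pd.read_csv("data/movie_mood_dataset.csv")
print(df.shape)                   # up to 1200 rows (200 per mood)
print(df["Mood"].value_counts())  # per-mood counts should be close to the 200 target
print(df.head(3))                 # columns: Movie_Title, Overview, Mood
```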
Create a file `train_model.py` to generate the trained models `tfidf_vectorizer.pkl` and `xgb_mood_classifier.pkl`:
```python
import os

import joblib
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_sample_weight

print("🚀 Starting XGBoost model training with hyperparameter tuning...")

# Load dataset
df = pd.read_csv("data/movie_mood_dataset.csv")
print(f"📊 Dataset loaded successfully. Total samples: {df.shape[0]}")

# Encode moods into numerical labels (alphabetical: anger, fear, joy, love, sadness, surprise)
label_encoder = LabelEncoder()
df["Mood_Label"] = label_encoder.fit_transform(df["Mood"])
print(f"🔢 Mood labels encoded. Unique moods: {len(label_encoder.classes_)}")

# TF-IDF vectorization
vectorizer = TfidfVectorizer(
    max_features=2000,  # Limit features to avoid overfitting
    stop_words="english",
)
X = vectorizer.fit_transform(df["Overview"])
y = df["Mood_Label"]
print(f"📝 TF-IDF vectorization complete. Vocabulary size: {len(vectorizer.get_feature_names_out())}")

# Split dataset into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"📊 Data split into training ({X_train.shape[0]}) and test ({X_test.shape[0]}) sets.")

# Compute per-sample weights to handle class imbalance
sample_weights = compute_sample_weight("balanced", y_train)

# Define the base XGBoost classifier
base_xgb = xgb.XGBClassifier(
    objective="multi:softmax",
    num_class=len(label_encoder.classes_),
    random_state=42,
)

# Define the hyperparameter grid for tuning
param_grid = {
    "max_depth": [6, 8],
    "learning_rate": [0.01, 0.05],
    "n_estimators": [100, 500],
    "subsample": [0.8],
    "colsample_bytree": [0.8],
}

# Perform GridSearchCV for hyperparameter tuning
print("🔍 Performing hyperparameter tuning with GridSearchCV...")
grid_search = GridSearchCV(
    estimator=base_xgb,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train, sample_weight=sample_weights)

# Best model after tuning
best_xgb_model = grid_search.best_estimator_
print(f"🏆 Best hyperparameters found: {grid_search.best_params_}")

# Retrain the best model on the full training set, keeping the sample weights
print("⏳ Training best model with optimized parameters...")
best_xgb_model.fit(X_train, y_train, sample_weight=sample_weights, verbose=True)
print("✅ Model training complete.")

# Save model and vectorizer
os.makedirs("models", exist_ok=True)  # Make sure the output directory exists
joblib.dump(best_xgb_model, "models/xgb_mood_classifier.pkl")
joblib.dump(vectorizer, "models/tfidf_vectorizer.pkl")
print("💾 Model and vectorizer saved successfully.")

# Predictions on the test set
print("🔮 Generating predictions on test set...")
y_pred = best_xgb_model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"🎯 XGBoost Model Accuracy: {accuracy:.2%}")

# Classification report
print("📋 Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

print("🎉 Training pipeline completed successfully!")
```
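With the artifacts saved, a quick smoke test confirms they load and predict as expected. This is a minimal sketch; the `moods` list assumes the alphabetical label order that `LabelEncoder` produces for the six moods (the same assumption `app.py` makes below), and the sample overview is made up for illustration:

```python
# Smoke-test the saved vectorizer and classifier.
import joblib

model = joblib.load("models/xgb_mood_classifier.pkl")
vectorizer = joblib.load("models/tfidf_vectorizer.pkl")
moods = ["anger", "fear", "joy", "love", "sadness", "surprise"]  # LabelEncoder's alphabetical order

sample_overview = "Two strangers meet in Rome and slowly fall for each other over one summer."
X = vectorizer.transform([sample_overview])
print(moods[model.predict(X)[0]])  # e.g. "love" -- actual output depends on your trained model
```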
Finally, create the file `app.py`:
```python
import os
from datetime import datetime, timedelta

import joblib
import requests
import streamlit as st
from dotenv import load_dotenv

# Load environment variables (API keys)
load_dotenv()
TMDB_API_KEY = os.getenv("TMDB_API_KEY")

# Check if the key is available
if not TMDB_API_KEY:
    raise ValueError("🚨 TMDB_API_KEY not found! Please set it in Hugging Face Secrets.")

# Load trained model and vectorizer
model = joblib.load("models/xgb_mood_classifier.pkl")
vectorizer = joblib.load("models/tfidf_vectorizer.pkl")

# Define mood labels and corresponding emotion icons in the desired display order
mood_mapping = {
    "love": ("love", "❤️"),
    "joy": ("joy", "😄"),
    "surprise": ("surprise", "😲"),
    "sadness": ("sadness", "😢"),
    "fear": ("fear", "😨"),
    "anger": ("anger", "😡"),
}

# Map the Hugging Face label set to the app's custom mood names
huggingface_to_custom = {
    "anger": "anger",
    "fear": "fear",
    "joy": "joy",
    "love": "love",
    "sadness": "sadness",
    "surprise": "surprise",
}

# TMDb API endpoint and image base URL
WEEK_ENDPOINT = "https://api.themoviedb.org/3/trending/movie/week"
TMDB_IMAGE_URL = "https://image.tmdb.org/t/p/w500"

# Get the first day of the current week (Monday)
first_day_of_current_week = datetime.now() - timedelta(days=datetime.now().weekday())
current_week = datetime.now().isocalendar()[1]  # ISO week number


# Cache movie fetching for one week; passing the ISO week number busts the cache weekly
@st.cache_data(ttl=60 * 60 * 24 * 7)
def fetch_trending_movies(week=current_week):
    """Fetch trending movies from TMDb and classify them once per week."""
    movies_cache = []
    page = 1
    while len(movies_cache) < 150:  # Fetch enough movies to cover all moods
        try:
            response = requests.get(WEEK_ENDPOINT, params={"api_key": TMDB_API_KEY, "page": page}, timeout=10)
            response.raise_for_status()
            results = response.json().get("results", [])
            for movie in results:
                title = movie.get("title")
                overview = movie.get("overview")
                poster = TMDB_IMAGE_URL + movie["poster_path"] if movie.get("poster_path") else None
                release_date = movie.get("release_date")
                if title and overview and release_date:
                    release_date_obj = datetime.strptime(release_date, "%Y-%m-%d")
                    if release_date_obj < first_day_of_current_week:  # Only movies released before this week
                        hf_mood = classify_mood(overview)
                        custom_mood = huggingface_to_custom.get(hf_mood, "unknown")
                        movies_cache.append({
                            "title": title,
                            "overview": overview,
                            "poster": poster,
                            "mood": custom_mood,
                            "release_date": release_date,
                        })
            page += 1
            if not results:
                break
        except Exception as e:
            st.error(f"Failed to fetch trending movies (Page {page}): {e}")
            break

    # Sort by release date (newest first)
    movies_cache.sort(key=lambda x: x["release_date"], reverse=True)
    return movies_cache


def classify_mood(movie_overview):
    """Predict a movie's mood with the XGBoost model and map it to the custom label set."""
    X = vectorizer.transform([movie_overview])
    mood_label = model.predict(X)[0]
    # The index order matches LabelEncoder's alphabetical encoding used at training time
    hf_mood = ["anger", "fear", "joy", "love", "sadness", "surprise"][mood_label]
    return hf_mood


def fetch_recommendations(user_mood):
    """Return three recommendations for the given mood, fetching extra pages if needed."""
    mood_movies = []
    page = 1
    while len(mood_movies) < 3:
        trending_movies = fetch_trending_movies(current_week)

        # Filter cached movies by the user's mood
        for movie in trending_movies:
            if movie["mood"] == user_mood and movie["title"] not in [m["title"] for m in mood_movies]:
                mood_movies.append(movie)
            if len(mood_movies) >= 3:
                break

        # If still fewer than 3, fetch more pages directly
        if len(mood_movies) < 3:
            try:
                response = requests.get(WEEK_ENDPOINT, params={"api_key": TMDB_API_KEY, "page": page}, timeout=10)
                response.raise_for_status()
                results = response.json().get("results", [])
                for movie in results:
                    title = movie.get("title")
                    overview = movie.get("overview")
                    poster = TMDB_IMAGE_URL + movie["poster_path"] if movie.get("poster_path") else None
                    release_date = movie.get("release_date")
                    if title and overview and release_date:
                        release_date_obj = datetime.strptime(release_date, "%Y-%m-%d")
                        if release_date_obj < first_day_of_current_week:
                            hf_mood = classify_mood(overview)
                            custom_mood = huggingface_to_custom.get(hf_mood, "unknown")
                            if custom_mood == user_mood and title not in [m["title"] for m in mood_movies]:
                                mood_movies.append({
                                    "title": title,
                                    "overview": overview,
                                    "poster": poster,
                                    "mood": custom_mood,
                                    "release_date": release_date,
                                })
                page += 1
                if not results:
                    break
            except Exception as e:
                st.error(f"Failed to fetch additional trending movies: {e}")
                break
    return mood_movies[:3]


# Streamlit UI
st.title("🎬 CineMood: Get Your Mood-Based Trending Movies! ⚡")

# User selects their mood
user_mood, mood_icon = st.selectbox(
    "Select your mood:",
    [(mood, emoji) for mood, (_, emoji) in mood_mapping.items()],
    format_func=lambda x: f"{x[1]} {x[0]}",
)

# Fetch recommendations based on the user's mood
recommended_movies = fetch_recommendations(user_mood)

# Display recommendations
st.subheader(f"{mood_icon} Recommended Trending Movies for Your Mood: {user_mood.capitalize()}")

if recommended_movies:
    for movie in recommended_movies:
        st.markdown(f"### 🎬 {movie['title']} ({movie['release_date']})")
        st.write(f"📝 {movie['overview']}")
        if movie["poster"]:
            st.image(movie["poster"], width=200)
        st.write("---")
else:
    st.write("❌ No matching movies found. Try again later!")

# Footer section
st.markdown("**Made by [Thanh Tung Vu](https://thanhtungvudata.github.io/)**")
```
Deployment steps:

1. Test the app locally from the project root with `streamlit run app.py`.
2. Create a new Space on Hugging Face and select Streamlit as the SDK.
3. Upload `app.py`, the `models/` folder, and `requirements.txt` to the Space.
4. Add your `TMDB_API_KEY` as a secret in the Space settings (the app reads it via `os.getenv`).
5. The Space installs the dependencies from `requirements.txt` and launches the app automatically.

The file `requirements.txt`:
```
requests
pandas
numpy
scikit-learn
xgboost
joblib
transformers
huggingface_hub
fastapi
uvicorn
tmdbv3api
python-dotenv
tqdm
streamlit
```

(Note that `fastapi`, `uvicorn`, and `tmdbv3api` aren't imported by any of the scripts above, so you can trim them for a leaner build.)
The final web app delivers mood-based movie recommendations in about a second, with fresh content every week. You can try it here: 👉 CineMood Live App on Hugging Face Spaces.
CineMood showcases how machine learning, APIs, and web tools can create an engaging and user-friendly app. From data collection to deployment, it demonstrates the power of end-to-end ML pipelines.
Looking ahead, the next steps will be:

The code for this project is available here.
For further inquiries or collaboration, please contact me at my email.