-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathletterboxd_analyzer.py
More file actions
189 lines (161 loc) · 6.99 KB
/
letterboxd_analyzer.py
File metadata and controls
189 lines (161 loc) · 6.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
"""
Title: Letterboxd Watch Time Analyzer
Description: This script allows you to scrape a Letterboxd user's film history and calculate total watch time
Author: Benji377
Created: 30.04.2025
Last Updated: 30.04.2025
Usage:
python letterboxd_analyzer.py <username> [--csv] [--pdf]
<username> : The Letterboxd username of the user you want to analyze.
--csv : Export the results to a CSV file.
--pdf : Generate a PDF report with graphs.
Dependencies:
- requests
- matplotlib
- pandas
- beautifulsoup4
- reportlab
(Install with: pip install requests matplotlib pandas beautifulsoup4 reportlab)
Notes:
- This script is standalone and does not require other files from the repo.
- Adjust any parameters or constants at the top of the script as needed.
"""
import requests
import argparse
import matplotlib.pyplot as plt
import pandas as pd
import random
import os
from multiprocessing import Pool, cpu_count
from time import sleep
from bs4 import BeautifulSoup
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_CENTER
# This script allows you to scrape a Letterboxd user's film history and calculate
# total watch time, generate CSV reports, and create PDF reports with graphs.
# The script works by:
# 1. Fetching a user's film slugs (unique identifiers for each film).
# 2. Retrieving detailed information for each film via JSON API endpoints.
# 3. Calculating the total time watched and generating summary reports.
# 4. Optionally exporting the data to a CSV file or generating a PDF report
#    with graphs showing the user's watch time and films watched per year.

# Generic desktop User-Agent sent with every request so the site does not
# reject the default python-requests identifier.
HEADERS = {"User-Agent": "Mozilla/5.0"}
# Scrape movie slugs from a Letterboxd user's film list pages
def get_film_slugs(user):
    """Collect every film slug from a user's paginated /films/ pages.

    Walks page 1, 2, 3, ... until a page contains no film posters.

    Args:
        user: Letterboxd username whose film list should be scraped.

    Returns:
        List of slug strings (the part of each poster's details endpoint
        after '/film/').
    """
    slugs = []
    page = 1
    while True:
        url = f"https://letterboxd.com/{user}/films/page/{page}/"
        # Fix: add a timeout so a stalled connection cannot hang the scrape
        # forever (consistent with the timeout used in fetch_film_data).
        response = requests.get(url, headers=HEADERS, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        posters = soup.select('ul.poster-list li div[data-details-endpoint]')
        if not posters:
            # An empty page means we have walked past the last page.
            break
        for div in posters:
            endpoint = div.get('data-details-endpoint')
            # Fix: guard the split so an unexpected endpoint format cannot
            # raise IndexError and abort the whole scrape.
            if endpoint and '/film/' in endpoint:
                slugs.append(endpoint.split('/film/')[1])
        page += 1
    return slugs
# Fetch runtime and release year from JSON endpoint
def fetch_film_data(slug, retries=3):
    """Fetch title, runtime, and release year for a single film slug.

    Retries up to `retries` times with a short random backoff between
    attempts; returns None when every attempt fails.

    Args:
        slug: film identifier as returned by get_film_slugs().
        retries: maximum number of fetch attempts.

    Returns:
        Dict with 'title', 'minutes', 'year' keys, or None on failure.
    """
    json_url = f"https://letterboxd.com/film/{slug}"
    for attempt in range(1, retries + 1):
        try:
            payload = requests.get(json_url, headers=HEADERS, timeout=10).json()
            # Missing runtime/year fields default to 0 so later math is safe.
            return {
                'title': payload.get('name', 'Unknown'),
                'minutes': int(payload.get('runTime') or 0),
                'year': int(payload.get('releaseYear') or 0),
            }
        except Exception as e:
            print(f"Error fetching data for {json_url}: {e}, retrying ({attempt}/{retries})...")
            sleep(random.randint(1, 3))  # Random backoff
    return None  # After retries, return None if failed
# Generate CSV export
def export_to_csv(film_data, filename="letterboxd_watch_log.csv"):
    """Write the fetched film data to a CSV file.

    Entries that are None or have a zero/unknown runtime are dropped
    before export.

    Args:
        film_data: iterable of dicts with 'title', 'minutes', 'year' keys;
            None entries (failed fetches) are tolerated.
        filename: destination CSV path.
    """
    film_data = [f for f in film_data if f and f['minutes'] > 0]
    df = pd.DataFrame(film_data)
    df.to_csv(filename, index=False)
    # Fix: the message previously printed a literal "(unknown)" placeholder
    # instead of the actual output path.
    print(f"📁 CSV exported to {filename}")
# Plot and save charts
def generate_graphs(film_data):
    """Render two bar charts as PNGs in the working directory:
    watch_time_per_year.png and films_per_year.png.

    Args:
        film_data: iterable of dicts with 'title', 'minutes', 'year' keys;
            None entries and zero-runtime films are skipped.
    """
    valid = [entry for entry in film_data if entry and entry['minutes'] > 0]
    df = pd.DataFrame(valid)
    df = df[df['year'] > 0]
    df = df.dropna(subset=['minutes', 'year'])  # Remove rows with missing data

    def _save_bar(series, title, ylabel, color, path):
        # Shared plotting plumbing for both charts.
        series.plot(kind='bar', title=title, figsize=(8, 3), color=color)
        plt.ylabel(ylabel)
        plt.tight_layout()
        plt.savefig(path)
        plt.close()

    # Runtime per year
    _save_bar(df.groupby('year')['minutes'].sum(),
              'Watch Time per Year', "Minutes", 'skyblue',
              "watch_time_per_year.png")
    # Films per year
    _save_bar(df.groupby('year')['title'].count(),
              'Films Watched per Year', "Count", 'salmon',
              "films_per_year.png")
# Generate a simple PDF report with embedded charts
def generate_pdf_report(total_minutes):
    """Build letterboxd_report.pdf summarizing total watch time.

    Expects generate_graphs() to have already written
    watch_time_per_year.png and films_per_year.png; both temporary
    images are deleted after the PDF is built.

    Args:
        total_minutes: total watch time in minutes.
    """
    hours = total_minutes / 60
    styles = getSampleStyleSheet()
    header_style = ParagraphStyle(
        name='HeaderStyle',
        parent=styles['Heading2'],
        alignment=TA_CENTER,
        spaceAfter=12
    )
    story = []
    story.append(Paragraph("<b>Letterboxd Watch Report</b>", header_style))
    story.append(Paragraph("🎬 Your personal film-watching stats summary", styles['Normal']))
    story.append(Spacer(1, 12))
    story.append(Paragraph("<b>▶️ Total Time Watched:</b>", styles['Heading3']))
    story.append(Paragraph(f"{total_minutes} minutes ({hours:.2f} hours)", styles['Normal']))
    story.append(Spacer(1, 12))
    story.append(Paragraph("<b>📊 Watch Time Per Year</b>", styles['Heading3']))
    # 72 points per inch: 6" wide, 2.5" tall.
    story.append(Image("watch_time_per_year.png", width=6*72, height=2.5*72))
    story.append(Spacer(1, 6))
    story.append(Paragraph("<b>🎞️ Films Watched Per Year</b>", styles['Heading3']))
    story.append(Image("films_per_year.png", width=6*72, height=2.5*72))
    story.append(Spacer(1, 6))
    SimpleDocTemplate("letterboxd_report.pdf", pagesize=letter).build(story)
    print("📄 PDF report saved as letterboxd_report.pdf ✅")
    # Clean up temp graphs
    for img in ("watch_time_per_year.png", "films_per_year.png"):
        try:
            os.remove(img)
        except Exception as e:
            print(f"Warning: Couldn't delete {img}: {e}")
# Command-line entry point
def main():
    """Parse CLI arguments, scrape the user's films, and emit reports."""
    parser = argparse.ArgumentParser(
        description="Letterboxd Watch Time Analyzer. This script scrapes Letterboxd data, calculates total watch time, generates CSV, and creates PDF reports with graphs."
    )
    parser.add_argument("username", help="Your Letterboxd username")
    parser.add_argument("--csv", action="store_true", help="Export results to CSV")
    parser.add_argument("--pdf", action="store_true", help="Generate a PDF report")
    opts = parser.parse_args()

    print(f"🔍 Scanning films for user: {opts.username}")
    film_slugs = get_film_slugs(opts.username)
    print(f"🎞️ Found {len(film_slugs)} films")

    # Fetch film details in parallel; drop entries whose fetch failed
    # (fetch_film_data returns None after exhausting its retries).
    with Pool(cpu_count()) as workers:
        details = [d for d in workers.map(fetch_film_data, film_slugs) if d]

    minutes_total = sum(entry['minutes'] for entry in details)
    print(f"⏱️ Total watch time: {minutes_total} minutes ({minutes_total / 60:.2f} hours)")

    if opts.csv:
        export_to_csv(details)
    if opts.pdf:
        generate_graphs(details)
        generate_pdf_report(minutes_total)


if __name__ == "__main__":
    main()