diff --git a/Backend/AnalysisWorker.py b/Backend/AnalysisWorker.py
new file mode 100644
index 0000000..11ab443
--- /dev/null
+++ b/Backend/AnalysisWorker.py
@@ -0,0 +1,114 @@
+# Backend/AnalysisWorker.py
+from PySide6.QtCore import QObject, Signal
+from PySide6.QtGui import QImage
+import time
+
+from Analysis.SentimentAnalysis import run_sentiment_summary
+from Analysis.WordCloud import WordCloudAnalyzer
+
+class AnalysisWorker(QObject):
+ """
+    Threaded worker that runs analysis (sentiment summary + word cloud) on a list of sentences.
+    Emits progress updates for the splash screen and delivers the resulting QImages via signals.
+ """
+
+ progress_updated = Signal(str)
+ progress_percentage = Signal(int)
+ finished = Signal()
+ sentiment_ready = Signal(QImage)
+ wordcloud_ready = Signal(QImage)
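+
+    # Typical wiring (as done in UI/CommentPage.py and UI/TranscriptPage.py):
+    #   thread = QThread(); worker = AnalysisWorker(sentences)
+    #   worker.moveToThread(thread); thread.started.connect(worker.run)
+    #   worker.finished.connect(thread.quit); worker.finished.connect(worker.deleteLater)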
+
+ def __init__(self, sentences: list[str], sentiment_size: tuple = (1600, 520),
+ wordcloud_size: tuple = (2800, 1680), max_words: int = 200):
+ super().__init__()
+ self.sentences = sentences or []
+ self.sent_w, self.sent_h = sentiment_size
+ self.wc_w, self.wc_h = wordcloud_size
+ self.max_words = max_words
+ self._cancelled = False
+
+ def cancel(self):
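+        # Sets a flag that run() polls between progress batches; any batch already in flight completes.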
+ self._cancelled = True
+
+ def run(self) -> None:
+ try:
+ total_stages = 4
+ stage = 0
+
+            # Stage 1: prepare sentences
+            stage += 1
+            self.progress_updated.emit("Preparing sentences for analysis...")
+            self.progress_percentage.emit(2)  # small initial tick so the bar visibly starts
+
+ sentences = self.sentences
+ n = len(sentences)
+ if self._cancelled:
+ self.progress_updated.emit("Analysis cancelled.")
+ self.finished.emit()
+ return
+
+ # Stage 2: Sentiment (iterate sentences — dynamic progress)
+ stage += 1
+ self.progress_updated.emit("Running sentiment analysis...")
+ # We'll update percent dynamically across this stage (weight: 45%)
+ sentiment_stage_weight = 45
+            base = 5  # stage 1 ends at ~5%; sentiment fills 5-50% so progress stays monotonic
+ if n == 0:
+ self.progress_percentage.emit(base + 1)
+ else:
+ # process in micro-batches to allow progress updates
+ batch = max(1, n // 20)
+ processed = 0
+                # run_sentiment_summary is not incremental, so true per-sentence progress
+                # would require calling VADER directly. To keep this worker decoupled from
+                # the analysis internals, we emit synthetic progress ticks here and compute
+                # the summary once after the loop.
+ for i in range(0, n, batch):
+ if self._cancelled:
+ self.progress_updated.emit("Analysis cancelled.")
+ self.finished.emit()
+ return
+                    # brief pause so the UI has time to process queued progress signals
+ time.sleep(0.01)
+ processed += min(batch, n - i)
+ frac = processed / n
+ pct = base + int(frac * sentiment_stage_weight)
+ self.progress_percentage.emit(min(pct, 99))
+
+ # Now compute final sentiment image
+ sentiment_img = run_sentiment_summary(sentences, width=self.sent_w, height=self.sent_h)
+ self.sentiment_ready.emit(sentiment_img)
+
+ # Stage 3: Wordcloud (weight: 45%)
+ stage += 1
+ self.progress_updated.emit("Generating word cloud...")
+ wc_base = int(((stage-1)/total_stages) * 100)
+            # generate_wordcloud is blocking, so emit a few synthetic progress ticks before starting it
+ for tick in range(3):
+ if self._cancelled:
+ self.progress_updated.emit("Analysis cancelled.")
+ self.finished.emit()
+ return
+ time.sleep(0.05)
+ self.progress_percentage.emit(wc_base + int((tick+1) * (40/3)))
+
+ wc_img = WordCloudAnalyzer(max_words=self.max_words).generate_wordcloud(sentences, width=self.wc_w, height=self.wc_h)
+ self.wordcloud_ready.emit(wc_img)
+ self.progress_percentage.emit(95)
+
+ # Stage 4: Finalizing
+ stage += 1
+ self.progress_updated.emit("Finalizing results...")
+ time.sleep(0.05)
+ self.progress_percentage.emit(100)
+ self.progress_updated.emit("Analysis complete.")
+ self.finished.emit()
+
+ except Exception as e:
+ # best-effort error reporting
+ try:
+ self.progress_updated.emit(f"Analysis error: {str(e)}")
+ except Exception:
+ pass
+ self.finished.emit()
diff --git a/Backend/ScrapeComments.py b/Backend/ScrapeComments.py
index 5d8f5db..62c5606 100644
--- a/Backend/ScrapeComments.py
+++ b/Backend/ScrapeComments.py
@@ -31,30 +31,52 @@ def __init__(self, video_details: Dict[str, List[str]]) -> None:
def run(self) -> None:
"""
Executes the comment fetching process.
+ Shows video title / channel name instead of raw IDs when available.
"""
try:
total_videos = sum(len(v_list) for v_list in self.video_details.values())
processed_count = 0
-
+
self.progress_updated.emit("Starting comment scrape...")
self.progress_percentage.emit(0)
+            # helpers to resolve human-readable names from the local DB
+ def _get_title(vid, ch):
+ try:
+ rows = self.fetcher.db.fetch("VIDEO", where="video_id=?", params=(vid,))
+ if rows:
+ return rows[0].get("title") or vid
+ except Exception:
+ pass
+ return vid
+
+ def _get_channel_name(ch):
+ try:
+ rows = self.fetcher.db.fetch("CHANNEL", where="channel_id=?", params=(ch,))
+ if rows:
+ return rows[0].get("channel_name") or str(ch)
+ except Exception:
+ pass
+ return str(ch)
+
for channel_id, video_id_list in self.video_details.items():
+ channel_name = _get_channel_name(channel_id)
for video_id in video_id_list:
- self.progress_updated.emit(f"Fetching comments for {video_id}...")
-
+ video_title = _get_title(video_id, channel_id)
+ self.progress_updated.emit(f"Fetching comments for: \"{video_title}\" (channel: {channel_name})")
+
# Perform fetch
result = self.fetcher._fetch(video_id, channel_id)
-
+
processed_count += 1
percentage = int((processed_count / total_videos) * 100)
self.progress_percentage.emit(percentage)
-
+
if result.get("filepath"):
count = result.get("comment_count", 0)
- self.progress_updated.emit(f"Saved {count} comments for {video_id}")
+ self.progress_updated.emit(f"Saved {count} comments for \"{video_title}\"")
else:
- self.progress_updated.emit(f"Skipped: {video_id} ({result.get('remarks')})")
+ self.progress_updated.emit(f"Skipped: \"{video_title}\" ({result.get('remarks')})")
self.progress_updated.emit("Comment scraping completed!")
self.progress_percentage.emit(100)
@@ -66,6 +88,7 @@ def run(self) -> None:
self.finished.emit()
+
class CommentFetcher:
"""
A class to fetch YouTube video comments with threads using yt-dlp.
diff --git a/Backend/ScrapeTranscription.py b/Backend/ScrapeTranscription.py
index 80277f6..03d7872 100644
--- a/Backend/ScrapeTranscription.py
+++ b/Backend/ScrapeTranscription.py
@@ -32,31 +32,49 @@ def __init__(self, video_details: dict[str, list], languages: list = ["en"]) ->
def run(self) -> None:
"""
Executes the transcript fetching process.
+ Shows human-friendly names (video title) in progress messages when available.
"""
try:
total_videos = sum(len(v_list) for v_list in self.video_details.values())
processed_count = 0
-
+
self.progress_updated.emit("Starting transcript scrape...")
self.progress_percentage.emit(0)
language_option = ["en"]
+ # helper to get title from DB
+ def _get_title(vid, ch):
+ try:
+ rows = self.fetcher.db.fetch("VIDEO", where="video_id=?", params=(vid,))
+ if rows:
+ return rows[0].get("title") or vid
+ except Exception:
+ pass
+ return vid
+
for channel_id, video_id_list in self.video_details.items():
+                # resolve the channel name for nicer progress messages
+ try:
+ ch_rows = self.fetcher.db.fetch("CHANNEL", where="channel_id=?", params=(channel_id,))
+ channel_name = ch_rows[0].get("channel_name") if ch_rows else str(channel_id)
+ except Exception:
+ channel_name = str(channel_id)
+
for video_id in video_id_list:
- self.progress_updated.emit(f"Fetching transcript for {video_id}...")
-
+ video_title = _get_title(video_id, channel_id)
+                self.progress_updated.emit(f"Fetching transcript for: \"{video_title}\" (channel: {channel_name})")
# Perform fetch
result = self.fetcher._fetch(video_id, channel_id, language_option)
-
+
processed_count += 1
percentage = int((processed_count / total_videos) * 100)
self.progress_percentage.emit(percentage)
-
+
if result.get("filepath"):
- self.progress_updated.emit(f"Saved: {video_id}")
+ self.progress_updated.emit(f"Saved: \"{video_title}\"")
else:
- self.progress_updated.emit(f"Skipped: {video_id} ({result.get('remarks')})")
+ self.progress_updated.emit(f"Skipped: \"{video_title}\" ({result.get('remarks')})")
self.progress_updated.emit("Transcript scraping completed!")
self.progress_percentage.emit(100)
diff --git a/Backend/ScrapeVideo.py b/Backend/ScrapeVideo.py
index 475e5e8..3b15c33 100644
--- a/Backend/ScrapeVideo.py
+++ b/Backend/ScrapeVideo.py
@@ -1,3 +1,4 @@
+# Backend/ScrapeVideo.py
import os
import scrapetube
import yt_dlp
@@ -6,6 +7,7 @@
import asyncio
import aiohttp
from typing import List, Dict, Optional, Callable
+
from PySide6.QtCore import QObject, Signal, Slot, QMetaObject, Qt, Q_ARG
from Data.DatabaseManager import DatabaseManager
@@ -13,34 +15,38 @@
from utils.Logger import logger
-def parse_duration(duration: str) -> int:
+def parse_duration(duration: Optional[str]) -> int:
"""
- Converts a duration string from YouTube (e.g. "10:20" or "1:10:20") to an approximate number of seconds.
- Returns 0 if parsing fails.
+ Converts a duration string from YouTube (e.g. "10:20" or "1:10:20") to seconds.
+ Returns 0 if parsing fails or duration is None.
"""
+ if not duration:
+ return 0
+
+ parts = duration.split(":")
try:
- minutes, seconds = map(int, duration.split(":"))
+ parts = [int(p) for p in parts]
+ except Exception:
+ return 0
+
+ if len(parts) == 2:
+ minutes, seconds = parts
return minutes * 60 + seconds
-
- except ValueError:
+ elif len(parts) == 3:
+ hours, minutes, seconds = parts
+ return hours * 3600 + minutes * 60 + seconds
+ else:
+ # fallback: try single number as seconds
try:
- hours, minutes, seconds = map(int, duration.split(":"))
- return hours * 3600 + minutes * 60 + seconds
+ return int(parts[0])
except Exception:
return 0
-
- except Exception:
- return 0
-
-def parse_time_since_published(text: str) -> int:
- """
- Converts '3 weeks ago' or '2 days ago' to an approximate Unix timestamp.
- Parameters:
- text (str): The text to parse.
- Returns:
- int: The parsed timestamp or the current timestamp if parsing fails.
+def parse_time_since_published(text: Optional[str]) -> int:
+ """
+ Converts '3 weeks ago' or '2 days ago' to an approximate Unix timestamp.
+ If text is None or unparsable, returns current timestamp.
"""
now: datetime = datetime.now(timezone.utc)
if not text:
@@ -80,17 +86,8 @@ def parse_time_since_published(text: str) -> int:
async def download_img_async(url: str, save_path: str, session: aiohttp.ClientSession, semaphore: asyncio.Semaphore) -> bool:
"""
Download thumbnail image asynchronously.
-
- Parameters:
- url (str): The URL of the image to download.
- save_path (str): The path where the image should be saved.
- session (aiohttp.ClientSession): The aiohttp session to use for the request.
- semaphore (asyncio.Semaphore): The semaphore to use for limiting concurrent requests.
-
- Returns:
- bool: True if the image was downloaded successfully, False otherwise.
"""
- async with semaphore: # Use existing semaphore
+ async with semaphore:
try:
url = str(url)
save_path = str(save_path)
@@ -101,11 +98,11 @@ async def download_img_async(url: str, save_path: str, session: aiohttp.ClientSe
async with session.get(url, timeout=aiohttp.ClientTimeout(total=15)) as response:
response.raise_for_status()
-
+ # Ensure parent dir exists
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, "wb") as f:
async for chunk in response.content.iter_chunked(8192):
f.write(chunk)
-
return True
except Exception:
@@ -117,20 +114,12 @@ async def download_img_async(url: str, save_path: str, session: aiohttp.ClientSe
async def fetch_shorts_metadata_async(video_id: str, session: aiohttp.ClientSession, semaphore: asyncio.Semaphore) -> dict:
"""
Fetch complete metadata for a short video using yt-dlp asynchronously.
-
- Parameters:
- video_id (str): The YouTube video ID.
- session (aiohttp.ClientSession): The aiohttp session to use for the request.
- semaphore (asyncio.Semaphore): The semaphore to use for limiting concurrent requests.
-
- Returns:
- dict: A dictionary containing the fetched metadata.
+ Uses run_in_executor so it doesn't block the event loop.
"""
async with semaphore:
try:
- # Run yt-dlp in executor to avoid blocking
loop = asyncio.get_event_loop()
-
+
ydl_opts = {
'quiet': True,
'no_warnings': True,
@@ -138,107 +127,96 @@ async def fetch_shorts_metadata_async(video_id: str, session: aiohttp.ClientSess
'socket_timeout': 10,
'no_check_certificate': True,
}
-
+
def extract_info():
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+ # Use the shorts URL to get short-specific extractor behavior
return ydl.extract_info(f"https://www.youtube.com/shorts/{video_id}", download=False)
-
+
info = await loop.run_in_executor(None, extract_info)
-
+
return {
'video_id': str(video_id),
- 'duration': int(info.get('duration', 0)),
- 'upload_date': info.get('upload_date'),
- 'description': str(info.get('description', '')),
- 'view_count': int(info.get('view_count', 0)),
- 'title': str(info.get('title', 'Untitled')),
+ 'duration': int(info.get('duration', 0)) if info.get('duration') is not None else 0,
+ 'upload_date': info.get('upload_date'), # YYYYMMDD or None
+ 'description': str(info.get('description', '') or ''),
+ 'view_count': int(info.get('view_count', 0) or 0),
+ 'title': str(info.get('title', '') or 'Untitled'),
}
- except Exception as e:
+ except Exception:
logger.error(f"Failed to fetch metadata for short video: {video_id}")
logger.exception("Short metadata fetch error:")
return {'video_id': str(video_id), 'error': True}
async def fetch_shorts_batch_async(
- video_ids: List[str],
- progress_callback: Optional[Callable[[int, int], None]] = None,
+ video_ids: List[str],
+ progress_callback: Optional[Callable[[int, int], None]] = None,
max_concurrent: int = 100
) -> Dict[str, Dict]:
"""
- Fetch metadata for multiple shorts in parallel using asyncio.
-
- Parameters:
- video_ids (List[str]): List of YouTube video IDs to fetch metadata.
- progress_callback (Optional[Callable[[int, int], None]]): Callback to update progress in main thread.
- max_concurrent (int): Maximum number of concurrent requests.
-
- Returns:
- Dict[str, Dict]: Dictionary containing the fetched metadata, with video_id as key.
+ Fetch metadata for multiple shorts concurrently.
"""
- results = {}
+ results: Dict[str, Dict] = {}
total = len(video_ids)
completed = 0
-
- # Semaphore to limit concurrent requests
+
semaphore = asyncio.Semaphore(max_concurrent)
-
- # Create aiohttp session
timeout = aiohttp.ClientTimeout(total=30)
+
async with aiohttp.ClientSession(timeout=timeout) as session:
-
+
async def fetch_with_progress(video_id: str):
nonlocal completed
result = await fetch_shorts_metadata_async(str(video_id), session, semaphore)
completed += 1
-
+
if progress_callback:
- # Execute callback in main thread using Qt's signal mechanism
- QMetaObject.invokeMethod(
- progress_callback,
- "update_from_async",
- Qt.QueuedConnection,
- Q_ARG(int, completed),
- Q_ARG(int, total)
- )
-
+ try:
+ QMetaObject.invokeMethod(
+ progress_callback,
+ "update_from_async",
+ Qt.QueuedConnection,
+ Q_ARG(int, completed),
+ Q_ARG(int, total)
+ )
+ except Exception:
+                    # fallback: call the slot directly (not expected when a Qt event loop is running)
+ try:
+ progress_callback.update_from_async(completed, total)
+ except Exception:
+ pass
+
return result
-
- # Create all tasks
- tasks = [fetch_with_progress(str(vid)) for vid in video_ids]
-
- # Execute all tasks concurrently
+
+ tasks = [fetch_with_progress(vid) for vid in video_ids]
all_results = await asyncio.gather(*tasks, return_exceptions=True)
-
- # Process results
- for result in all_results:
- if isinstance(result, dict) and 'video_id' in result:
- results[result['video_id']] = result
-
+
+ for r in all_results:
+ if isinstance(r, dict) and 'video_id' in r:
+ results[r['video_id']] = r
+
return results
def run_async_shorts_fetch(video_ids: list, progress_callback=None, max_concurrent: int = 100) -> dict:
"""
- Wrapper to run async shorts fetching in a new event loop.
+    Run fetch_shorts_batch_async from a synchronous context, creating an event loop if needed.
"""
try:
- # Try to get existing loop
loop = asyncio.get_event_loop()
if loop.is_running():
- # If loop is running, create a new one
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
except RuntimeError:
- # No loop exists, create one
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
-
+
try:
return loop.run_until_complete(
fetch_shorts_batch_async(video_ids, progress_callback, max_concurrent)
)
finally:
- # Don't close the loop if it was running before
pass
@@ -252,8 +230,9 @@ def __init__(self, channel_id: str, channel_url: str, scrape_shorts: bool):
self.db: DatabaseManager = app_state.db
self.channel_id = channel_id
self.channel_url = channel_url
- self.scrape_shorts = scrape_shorts
+ self.scrape_shorts = bool(scrape_shorts)
+ # types that scrapetube accepts for content_type parameter
self.types = {
"videos": "videos",
"shorts": "shorts",
@@ -268,35 +247,51 @@ def __init__(self, channel_id: str, channel_url: str, scrape_shorts: bool):
@Slot()
def run(self):
"""
- SAFE ENTRY POINT FOR QTHREAD
+        Entry point invoked by a QThread. Runs the scraping coroutine via asyncio.run()
+        and guarantees the finished signal in this method's finally block.
"""
try:
asyncio.run(self._fetch_video_urls_async())
except Exception:
- logger.exception("VideoWorker crashed:")
+ logger.exception("VideoWorker crashed in run():")
finally:
- # ✅ GUARANTEED EXIT PATH
- self.finished.emit()
+            # always emit finished so the UI can tear down the splash, even after a crash
+ try:
+ self.finished.emit()
+ except Exception:
+ pass
@Slot(int, int)
def update_from_async(self, completed: int, total: int):
+ """
+        Slot used by the shorts metadata fetcher to push progress back to the GUI.
+ """
msg = f"[Shorts] Fetching metadata: {completed}/{total}"
self.progress_updated.emit(msg)
- pct = int((self.current_type_counter - 1) * 33 + (completed / total) * 20)
+ try:
+ pct = int((self.current_type_counter - 1) * 33 + (completed / total) * 20)
+ except Exception:
+ pct = 0
self.progress_percentage.emit(min(pct, 95))
- # ✅ INTERRUPTION SAFE CHECK
def _should_stop(self):
+        # Uses the QThread interruption mechanism to check for user cancellation.
from PySide6.QtCore import QThread
- return QThread.currentThread().isInterruptionRequested()
+ try:
+ return QThread.currentThread().isInterruptionRequested()
+ except Exception:
+ return False
async def _fetch_video_urls_async(self):
+ """
+ Main coroutine that scrapes channel pages via scrapetube, optionally
+ enriches shorts via yt-dlp, downloads thumbnails, and inserts into DB.
+ """
try:
self.progress_updated.emit("Starting scrapetube scraping...")
self.progress_percentage.emit(0)
total_processed = 0
-
channel_thumb_dir = os.path.join(self.db.thumbnail_dir, str(self.channel_id))
os.makedirs(channel_thumb_dir, exist_ok=True)
@@ -305,8 +300,7 @@ async def _fetch_video_urls_async(self):
thumbnail_semaphore = asyncio.Semaphore(20)
for i, (vtype, ctype) in enumerate(self.types.items(), start=1):
-
- # ✅ USER CANCEL SUPPORT
+ # cancellation check
if self._should_stop():
self.progress_updated.emit("Scraping cancelled by user")
return
@@ -315,31 +309,32 @@ async def _fetch_video_urls_async(self):
self.progress_updated.emit(f"Fetching {vtype.capitalize()}...")
self.progress_percentage.emit(int((i - 1) * 33))
- videos = list(scrapetube.get_channel(
- channel_url=self.channel_url,
- content_type=ctype
- ))
+ # scrapetube yields video dicts for the channel and content type
+ try:
+ videos = list(scrapetube.get_channel(channel_url=self.channel_url, content_type=ctype))
+ except Exception:
+ logger.exception("scrapetube.get_channel failed:")
+ videos = []
if not videos:
+ self.progress_updated.emit(f"No {vtype} found.")
continue
self.progress_updated.emit(f"Fetched {len(videos)} {vtype}")
- # === SHORTS METADATA ===
+ # If shorts, prefetch extended metadata via yt-dlp (more reliable)
shorts_metadata = {}
if vtype == "shorts":
video_ids = [v.get("videoId") for v in videos if v.get("videoId")]
- shorts_metadata = await fetch_shorts_batch_async(
- video_ids,
- progress_callback=self,
- max_concurrent=30
- )
+ if video_ids:
+ self.progress_updated.emit(f"[Shorts] Fetching metadata for {len(video_ids)} shorts (async)...")
+ shorts_metadata = await fetch_shorts_batch_async(video_ids, progress_callback=self, max_concurrent=30)
+ self.progress_updated.emit(f"[Shorts] Metadata fetched ({len(shorts_metadata)}).")
thumbnail_tasks = []
videos_to_insert = []
for idx, video in enumerate(videos):
-
if self._should_stop():
self.progress_updated.emit("Scraping cancelled by user")
return
@@ -348,62 +343,185 @@ async def _fetch_video_urls_async(self):
if not video_id:
continue
+ # Default fields
title = (
video.get("title", {})
.get("runs", [{}])[0]
.get("text", "Untitled")
)
+ description = ""
+ duration_text = None
+ duration_in_seconds = 0
+ time_since_published = None
+ upload_timestamp = int(datetime.now(timezone.utc).timestamp())
+ views = 0
+
+ # Thumbnail from scrapetube if available
thumbnails = video.get("thumbnail", {}).get("thumbnails", [])
thumbnail_url = thumbnails[-1].get("url") if thumbnails else None
-
thumb_path = os.path.join(channel_thumb_dir, f"{video_id}.png")
- if thumbnail_url and not os.path.exists(thumb_path):
- thumbnail_tasks.append(
- download_img_async(
- thumbnail_url,
- thumb_path,
- session,
- thumbnail_semaphore
+ # SHORTS: enrich from yt-dlp results when available
+ if vtype == "shorts":
+ meta = shorts_metadata.get(video_id, {})
+ if meta and not meta.get("error"):
+ title = meta.get("title", title)
+ description = meta.get("description", "")
+ duration_in_seconds = int(meta.get("duration", 0) or 0)
+ if duration_in_seconds:
+ # format duration text as M:SS or H:MM:SS
+ h, rem = divmod(duration_in_seconds, 3600)
+ m, s = divmod(rem, 60)
+ duration_text = (f"{h}:{m:02d}:{s:02d}" if h else f"{m}:{s:02d}")
+ else:
+ duration_text = None
+
+ views = int(meta.get("view_count", 0) or 0)
+
+ upload_date_str = meta.get("upload_date") # YYYYMMDD
+ if upload_date_str:
+ try:
+ dt = datetime.strptime(upload_date_str, "%Y%m%d").replace(tzinfo=timezone.utc)
+ upload_timestamp = int(dt.timestamp())
+ days_ago = (datetime.now(timezone.utc) - dt).days
+ if days_ago == 0:
+ time_since_published = "Today"
+ elif days_ago == 1:
+ time_since_published = "1 day ago"
+ elif days_ago < 7:
+ time_since_published = f"{days_ago} days ago"
+ elif days_ago < 30:
+ weeks = days_ago // 7
+ time_since_published = f"{weeks} week{'s' if weeks > 1 else ''} ago"
+ elif days_ago < 365:
+ months = days_ago // 30
+ time_since_published = f"{months} month{'s' if months > 1 else ''} ago"
+ else:
+ years = days_ago // 365
+ time_since_published = f"{years} year{'s' if years > 1 else ''} ago"
+ except Exception:
+ upload_timestamp = int(datetime.now(timezone.utc).timestamp())
+ time_since_published = None
+ else:
+ # fallback to scrapetube partial info if yt-dlp failed
+ title = (
+ video.get("title", {})
+ .get("runs", [{}])[0]
+ .get("text", "Untitled")
)
+ description = ""
+ duration_text = None
+ duration_in_seconds = 0
+ views = 0
+ upload_timestamp = int(datetime.now(timezone.utc).timestamp())
+ time_since_published = None
+
+ else:
+ # NON-SHORTS: parse fields from scrapetube payload (same logic as old module)
+ # Title already pulled above
+ description = (
+ video.get("descriptionSnippet", {})
+ .get("runs", [{}])[0]
+ .get("text", "")
+ )
+
+ duration_text = (
+ video.get("lengthText", {}).get("simpleText")
+ or video.get("lengthText", {}).get("runs", [{}])[0].get("text")
+ or None
+ )
+ duration_in_seconds = parse_duration(duration_text) if duration_text else 0
+
+ time_since_published = (
+ video.get("publishedTimeText", {}).get("simpleText")
+ or video.get("publishedTimeText", {}).get("runs", [{}])[0].get("text")
+ or None
)
+ upload_timestamp = parse_time_since_published(time_since_published)
- videos_to_insert.append({
+ # Parse view count text (may be like "1,234,567 views" or "1.2M views")
+ view_text = (
+ video.get("viewCountText", {}).get("simpleText")
+ or video.get("viewCountText", {}).get("runs", [{}])[0].get("text", "")
+ )
+ views = 0
+ if view_text:
+ try:
+ # Normalize common formats:
+ # - "1,234 views"
+ # - "1.2M views"
+ # - "1.2K views"
+ # Remove trailing "views" and whitespace
+ vt = view_text.replace("views", "").strip().lower()
+ # Handle suffixes
+ if vt.endswith("k"):
+ views = int(float(vt[:-1].replace(",", "")) * 1_000)
+ elif vt.endswith("m"):
+ views = int(float(vt[:-1].replace(",", "")) * 1_000_000)
+ elif vt.endswith("b"):
+ views = int(float(vt[:-1].replace(",", "")) * 1_000_000_000)
+ else:
+ views = int(vt.replace(",", "").replace(".", ""))
+ except Exception:
+ # best-effort fallback to remove non-digits
+ digits = re.sub(r"[^\d]", "", view_text)
+ try:
+ views = int(digits) if digits else 0
+ except Exception:
+ views = 0
+
+ # Schedule thumbnail download if needed
+ if thumbnail_url and not os.path.exists(thumb_path):
+ thumbnail_tasks.append(download_img_async(thumbnail_url, thumb_path, session, thumbnail_semaphore))
+
+                    # Prepare the DB record matching the VIDEO table schema
+ video_record = {
"video_id": video_id,
"channel_id": self.channel_id,
"video_type": vtype,
"video_url": f"https://www.youtube.com/watch?v={video_id}",
"title": title,
- "desc": "",
- "duration": None,
- "duration_in_seconds": 0,
+ "desc": description,
+ "duration": duration_text,
+ "duration_in_seconds": int(duration_in_seconds or 0),
"thumbnail_path": thumb_path,
- "view_count": 0,
- "time_since_published": None,
- "upload_timestamp": int(datetime.now(timezone.utc).timestamp())
- })
-
- if (idx + 1) % 10 == 0:
- self.progress_updated.emit(
- f"[{vtype.capitalize()}] {idx+1}/{len(videos)}"
- )
+ "view_count": int(views or 0),
+ "time_since_published": time_since_published,
+ "upload_timestamp": int(upload_timestamp or int(datetime.now(timezone.utc).timestamp()))
+ }
+
+ videos_to_insert.append(video_record)
- # === DOWNLOAD THUMBNAILS ===
+ # progress update per chunk
+ if (idx + 1) % 10 == 0 or idx == len(videos) - 1:
+ self.progress_updated.emit(f"[{vtype.capitalize()}] Processing: {idx+1}/{len(videos)}")
+
+                # Wait for all thumbnail downloads to complete
if thumbnail_tasks:
- self.progress_updated.emit(f"[{vtype.capitalize()}] Downloading thumbnails...")
+ self.progress_updated.emit(f"[{vtype.capitalize()}] Downloading {len(thumbnail_tasks)} thumbnails...")
await asyncio.gather(*thumbnail_tasks, return_exceptions=True)
+ self.progress_updated.emit(f"[{vtype.capitalize()}] ✓ All thumbnails downloaded")
- # === DATABASE SAVE ===
+ # Insert into DB (one by one to allow DB layer to handle duplicates/constraints)
+ self.progress_updated.emit(f"[{vtype.capitalize()}] Saving {len(videos_to_insert)} videos to database...")
for video_data in videos_to_insert:
- self.db.insert("VIDEO", video_data)
+ try:
+                        # An upsert may be preferable here; for now insert() is called and
+                        # DatabaseManager is left to handle uniqueness/constraint violations.
+ self.db.insert("VIDEO", video_data)
+ except Exception:
+                        logger.exception(f"DB insert failed for video_id={video_data.get('video_id')}")
total_processed += len(videos_to_insert)
-
+ self.progress_updated.emit(f"[{vtype.capitalize()}] ✓ Saved {len(videos_to_insert)} videos")
self.progress_percentage.emit(min(i * 33, 95))
self.progress_updated.emit(f"Completed scraping! Total {total_processed} videos saved.")
self.progress_percentage.emit(100)
except Exception:
- logger.exception("Async scrape failure")
\ No newline at end of file
+ logger.exception("Async scrape failure")
+ self.progress_updated.emit("Scraping failed — check logs.")
+ self.progress_percentage.emit(0)
+            # The error is logged and surfaced to the UI; run()'s finally block still emits finished.
diff --git a/README.md b/README.md
index aa4a753..4e08e8e 100644
--- a/README.md
+++ b/README.md
@@ -75,7 +75,7 @@ The application utilizes a local architecture where data is scraped from YouTube
## 🔥 Features
-- 🆓 **No Credentials Needed**: Use the application immediately—no registration, login, or API key is required.
+- 🔓 **No Credentials Needed**: Use the application immediately—no registration, login, or API key is required.
- 🎯 **Channel Scraping**: Fetch the list of videos from any specific YouTube channel.
- 📄 **Transcription Retrieval**: Retrieve and display video transcriptions (if available).
- 💬 **Comment Analysis**: Fetch and display user comments for specific videos.
@@ -564,10 +564,10 @@ To generate the installer locally, you must have Inno Setup installed and compil
## 🌻 Roadmap
- [x] **Export analysis**: Export and save analysis result image to a file.
-- [ ] **Docker Version**: A Dockerized version of the application is planned.
-- [ ] **Proxy Settings**: Ability to configure network proxy settings.
- [ ] **Theming**: Light/Dark theme support.
- [ ] **In-App Help**: Built-in documentation and help guide.
+- [ ] **Proxy Settings**: Ability to configure network proxy settings.
+- [ ] **Docker Version**: A Dockerized version of the application is planned.
---
@@ -630,4 +630,4 @@ StaTube is protected under the [MIT License](https://choosealicense.com/licenses
-[back-to-top]: https://img.shields.io/badge/_BACK_TO_TOP_-151515?style=flat-square
\ No newline at end of file
+[back-to-top]: https://img.shields.io/badge/_BACK_TO_TOP_-151515?style=flat-square
diff --git a/UI/CommentPage.py b/UI/CommentPage.py
index 61282cd..24c17c6 100644
--- a/UI/CommentPage.py
+++ b/UI/CommentPage.py
@@ -1,4 +1,4 @@
-from PySide6.QtCore import Signal, QTimer
+from PySide6.QtCore import Signal, QTimer, QThread
from PySide6.QtWidgets import (
QWidget, QLabel, QVBoxLayout, QScrollArea, QSizePolicy
)
@@ -8,8 +8,8 @@
import os
from Backend.ScrapeComments import CommentFetcher
-from Analysis.SentimentAnalysis import run_sentiment_summary
-from Analysis.WordCloud import WordCloudAnalyzer
+from Backend.AnalysisWorker import AnalysisWorker
+from UI.SplashScreen import SplashScreen
from utils.AppState import app_state
from utils.Logger import logger
@@ -122,35 +122,78 @@ def _generate_and_display_images(self):
self.scroll_layout.addWidget(QLabel("No comments found."))
return
- # Fixed HD sizes (Option A)
+ # Sizes
sent_w = 1600
sent_h = int(sent_w * 0.33)
wc_w = 2800
wc_h = int(wc_w * 0.6)
- logger.info(f"CommentPage: Generating sentiment {sent_w}x{sent_h}, wordcloud {wc_w}x{wc_h}")
-
- try:
- sentiment_img = run_sentiment_summary(self.comments, width=sent_w, height=sent_h)
- wc_img = WordCloudAnalyzer(max_words=100).generate_wordcloud(self.comments, width=wc_w, height=wc_h)
- except Exception:
- logger.exception("CommentPage: Error generating images")
- self.scroll_layout.addWidget(QLabel("Failed to generate analysis images."))
- return
-
- self.sentiment_image = sentiment_img
- self.wordcloud_image = wc_img
+ logger.info(f"CommentPage: Queuing analysis sentiment {sent_w}x{sent_h}, wordcloud {wc_w}x{wc_h}")
+
+ self.analysis_thread = QThread()
+ self.analysis_worker = AnalysisWorker(self.comments, sentiment_size=(sent_w, sent_h), wordcloud_size=(wc_w, wc_h), max_words=100)
+ self.analysis_worker.moveToThread(self.analysis_thread)
+
+ # Create splash
+        parent_win = self.window()
+ self.splash = SplashScreen(parent=parent_win)
+ self.splash.set_title("Analyzing comments...")
+ self.splash.update_status("Preparing analysis...")
+ self.splash.set_progress(0)
+ self.splash.enable_runtime_mode(parent_window=parent_win, cancel_callback=self._cancel_analysis)
+ self.splash.show_with_animation()
+
+ # Wire signals
+ self.analysis_thread.started.connect(self.analysis_worker.run)
+ self.analysis_worker.progress_updated.connect(lambda m: (self.splash.update_status(m) if self.splash else None))
+ self.analysis_worker.progress_percentage.connect(lambda p: (self.splash.set_progress(p) if self.splash else None))
+ self.analysis_worker.sentiment_ready.connect(self._on_sentiment_ready)
+ self.analysis_worker.wordcloud_ready.connect(self._on_wordcloud_ready)
+ self.analysis_worker.finished.connect(self.analysis_thread.quit)
+ self.analysis_worker.finished.connect(self.analysis_worker.deleteLater)
+ self.analysis_thread.finished.connect(self.analysis_thread.deleteLater)
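+        # Standard QThread teardown: the worker's finished signal quits the thread and
+        # schedules the worker for deletion; the thread deletes itself after it stops.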
+ # When thread fully finishes, fade splash
+ self.analysis_thread.finished.connect(lambda: (self.splash.fade_and_close(300) if self.splash else None))
+
+ self.analysis_thread.start()
+
+    # Cancel handler wired to the splash screen's cancel button
+ def _cancel_analysis(self):
+ if hasattr(self, "analysis_worker") and self.analysis_worker:
+ try:
+ self.analysis_worker.cancel()
+ except Exception:
+ pass
+ # also attempt to stop thread gracefully
+ if hasattr(self, "analysis_thread") and self.analysis_thread.isRunning():
+ try:
+ self.analysis_thread.requestInterruption()
+ self.analysis_thread.quit()
+ self.analysis_thread.wait(200)
+ except Exception:
+ pass
+        # ensure the UI reflects the cancellation
+ for i in reversed(range(self.scroll_layout.count())):
+ w = self.scroll_layout.itemAt(i).widget()
+ if w:
+ w.deleteLater()
+ self.scroll_layout.addWidget(QLabel("Analysis cancelled."))
+    # Slots receiving the generated images from the worker
+ def _on_sentiment_ready(self, qimage):
+ self.sentiment_image = qimage
+        # display the result immediately under a section title
channel_name = next(iter(app_state.video_list.keys()), "unknown")
-
- self.scroll_layout.addWidget(QLabel("Sentimental Analysis"))
- sent_widget = DownloadableImage(sentiment_img, default_name=f"comment_sentiment_{channel_name}.png")
+ self.scroll_layout.addWidget(QLabel("Sentiment Analysis"))
+ sent_widget = DownloadableImage(qimage, default_name=f"comment_sentiment_{channel_name}.png")
sent_widget.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed)
self.scroll_layout.addWidget(sent_widget)
+ def _on_wordcloud_ready(self, qimage):
+ self.wordcloud_image = qimage
self.scroll_layout.addWidget(QLabel("Word Cloud"))
- wc_widget = DownloadableImage(wc_img, default_name=f"comment_wordcloud_{channel_name}.png")
+ channel_name = next(iter(app_state.video_list.keys()), "unknown")
+ wc_widget = DownloadableImage(qimage, default_name=f"comment_wordcloud_{channel_name}.png")
wc_widget.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed)
self.scroll_layout.addWidget(wc_widget)
-
self.scroll_layout.addStretch(1)
diff --git a/UI/TranscriptPage.py b/UI/TranscriptPage.py
index db70252..d2ff959 100644
--- a/UI/TranscriptPage.py
+++ b/UI/TranscriptPage.py
@@ -3,14 +3,14 @@
import os
from typing import Optional, List
-from PySide6.QtCore import Signal, QTimer
+from PySide6.QtCore import Signal, QTimer, QThread
from PySide6.QtWidgets import (
QWidget, QLabel, QVBoxLayout, QScrollArea, QSizePolicy
)
from Backend.ScrapeTranscription import TranscriptFetcher
-from Analysis.SentimentAnalysis import run_sentiment_summary
-from Analysis.WordCloud import WordCloudAnalyzer
+from Backend.AnalysisWorker import AnalysisWorker
+from UI.SplashScreen import SplashScreen
from utils.AppState import app_state
from utils.Logger import logger
from widgets.DownloadableImage import DownloadableImage
@@ -94,7 +94,6 @@ def scrape_transcript(self):
self._generate_and_display_images()
def _generate_and_display_images(self):
- # clear previous
for i in reversed(range(self.scroll_layout.count())):
w = self.scroll_layout.itemAt(i).widget()
if w:
@@ -104,39 +103,71 @@ def _generate_and_display_images(self):
self.scroll_layout.addWidget(QLabel("No transcript found."))
return
- # Fixed HD sizes (Option A)
+ # Sizes
sent_w = 1600
sent_h = int(sent_w * 0.33)
wc_w = 2800
wc_h = int(wc_w * 0.6)
- logger.info(f"TranscriptPage: Generating sentiment {sent_w}x{sent_h}, wordcloud {wc_w}x{wc_h}")
+ logger.info(f"TranscriptPage: Queuing analysis sentiment {sent_w}x{sent_h}, wordcloud {wc_w}x{wc_h}")
+
+ self.analysis_thread = QThread()
+ self.analysis_worker = AnalysisWorker(self.transcript_sentences, sentiment_size=(sent_w, sent_h), wordcloud_size=(wc_w, wc_h), max_words=120)
+ self.analysis_worker.moveToThread(self.analysis_thread)
+
+        parent_win = self.window()
+ self.splash = SplashScreen(parent=parent_win)
+ self.splash.set_title("Analyzing transcripts...")
+ self.splash.update_status("Preparing analysis...")
+ self.splash.set_progress(0)
+ self.splash.enable_runtime_mode(parent_window=parent_win, cancel_callback=self._cancel_analysis)
+ self.splash.show_with_animation()
+
+ self.analysis_thread.started.connect(self.analysis_worker.run)
+ self.analysis_worker.progress_updated.connect(lambda m: (self.splash.update_status(m) if self.splash else None))
+ self.analysis_worker.progress_percentage.connect(lambda p: (self.splash.set_progress(p) if self.splash else None))
+ self.analysis_worker.sentiment_ready.connect(self._on_sentiment_ready)
+ self.analysis_worker.wordcloud_ready.connect(self._on_wordcloud_ready)
+ self.analysis_worker.finished.connect(self.analysis_thread.quit)
+ self.analysis_worker.finished.connect(self.analysis_worker.deleteLater)
+ self.analysis_thread.finished.connect(self.analysis_thread.deleteLater)
+ self.analysis_thread.finished.connect(lambda: (self.splash.fade_and_close(300) if self.splash else None))
+
+ self.analysis_thread.start()
+
+ def _cancel_analysis(self):
+ if hasattr(self, "analysis_worker") and self.analysis_worker:
+ try:
+ self.analysis_worker.cancel()
+ except Exception:
+ pass
+ if hasattr(self, "analysis_thread") and self.analysis_thread.isRunning():
+ try:
+ self.analysis_thread.requestInterruption()
+ self.analysis_thread.quit()
+ self.analysis_thread.wait(200)
+ except Exception:
+ pass
- try:
- sentiment_img = run_sentiment_summary(self.transcript_sentences, width=sent_w, height=sent_h)
- wc_img = WordCloudAnalyzer(max_words=120).generate_wordcloud(self.transcript_sentences, width=wc_w, height=wc_h)
- except Exception:
- logger.exception("TranscriptPage: Error generating images")
- self.scroll_layout.addWidget(QLabel("Failed to generate analysis images."))
- return
-
- self.sentiment_image = sentiment_img
- self.wordcloud_image = wc_img
+ for i in reversed(range(self.scroll_layout.count())):
+ w = self.scroll_layout.itemAt(i).widget()
+ if w:
+ w.deleteLater()
+ self.scroll_layout.addWidget(QLabel("Analysis cancelled."))
+ def _on_sentiment_ready(self, qimage):
+ self.sentiment_image = qimage
channel_name = next(iter(app_state.video_list.keys()), "unknown")
-
- # Title label
self.scroll_layout.addWidget(QLabel("Sentiment Analysis"))
-
- # DownloadableImage displays at natural size and provides download overlay
- sent_widget = DownloadableImage(sentiment_img, default_name=f"transcript_sentiment_{channel_name}.png")
+ sent_widget = DownloadableImage(qimage, default_name=f"transcript_sentiment_{channel_name}.png")
sent_widget.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed)
self.scroll_layout.addWidget(sent_widget)
+ def _on_wordcloud_ready(self, qimage):
+ self.wordcloud_image = qimage
self.scroll_layout.addWidget(QLabel("Word Cloud"))
- wc_widget = DownloadableImage(wc_img, default_name=f"transcript_wordcloud_{channel_name}.png")
+ channel_name = next(iter(app_state.video_list.keys()), "unknown")
+ wc_widget = DownloadableImage(qimage, default_name=f"transcript_wordcloud_{channel_name}.png")
wc_widget.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed)
self.scroll_layout.addWidget(wc_widget)
-
- # Spacer
self.scroll_layout.addStretch(1)
diff --git a/UI/VideoPage.py b/UI/VideoPage.py
index fb8be99..a7ef232 100644
--- a/UI/VideoPage.py
+++ b/UI/VideoPage.py
@@ -525,6 +525,17 @@ def on_grid_clicked(self, checked: bool) -> None:
else:
self.grid_btn.setChecked(True)
+ def _complete_splash(self):
+ """
+        Ensures the splash and overlays are always removed safely.
+        Called only after the QThread has fully exited.
+ """
+ self._clear_overlays()
+
+ if self.splash:
+ self.splash.fade_and_close(300)
+ self.splash = None
+
# --- Scraping ---
def scrape_videos(self, scrape_shorts: bool) -> None:
"""
@@ -573,14 +584,14 @@ def show_splash_screen(self, parent: Optional[QWidget] = None, gif_path: str = "
self.splash.close()
self.splash = None
- # ✅ IMPORTANT FIX: parent MUST be None
+ # IMPORTANT FIX: parent MUST be None
self.splash = SplashScreen(parent=None, gif_path=gif_path)
self.splash.set_title(title)
self.splash.update_status("Starting...")
self.splash.set_progress(0)
- # ✅ Overlay still binds to mainwindow correctly
+ # Overlay still binds to mainwindow correctly
self.splash.enable_runtime_mode(
parent_window=self.mainwindow,
cancel_callback=self.cancel_scraping
@@ -622,7 +633,7 @@ def cancel_scraping(self):
self.comment_thread.quit()
self.comment_thread.wait(500)
- # ✅ Force-remove overlays
+ # Force-remove overlays
self._clear_overlays()
# Fade & cleanup splash safely
@@ -653,12 +664,6 @@ def update_splash_progress(self, message: str) -> None:
self.splash.update_status(message)
def update_splash_percentage(self, percentage: int) -> None:
- """
- Updates the progress bar of the SplashScreen dialog.
-
- Args:
- percentage (int): The progress percentage (0-100) to display.
- """
if self.splash:
self.splash.set_progress(percentage)
@@ -683,26 +688,24 @@ def on_worker_finished(self) -> None:
def on_transcript_worker_finished(self) -> None:
"""
- Called when the TranscriptWorker thread has finished scraping transcripts.
- Closes the SplashScreen dialog.
+        Worker has emitted 'finished', but the QThread may still be running.
+        Wait for the thread to fully exit before closing the splash.
"""
- self._clear_overlays()
+ if self.transcript_thread:
+ self.transcript_thread.finished.connect(self._complete_splash)
- if self.splash is not None:
- self.splash.fade_and_close(400)
- self.splash = None
+ # Notify rest of app that transcript scraping is done
self.video_page_scrape_transcript_signal.emit()
def on_comment_worker_finished(self) -> None:
"""
- Called when the CommentWorker thread has finished scraping comments.
- Closes the SplashScreen dialog.
+        Worker has emitted 'finished', but the QThread may still be running.
+        Wait for the thread to fully exit before removing the splash and overlay.
"""
- self._clear_overlays()
+ if self.comment_thread:
+ self.comment_thread.finished.connect(self._complete_splash)
- if self.splash is not None:
- self.splash.fade_and_close(400)
- self.splash = None
+ # Notify rest of app that comment scraping is done
self.video_page_scrape_comments_signal.emit()
# --- Loading videos ---