diff --git a/Backend/AnalysisWorker.py b/Backend/AnalysisWorker.py new file mode 100644 index 0000000..11ab443 --- /dev/null +++ b/Backend/AnalysisWorker.py @@ -0,0 +1,114 @@ +# Backend/AnalysisWorker.py +from PySide6.QtCore import QObject, Signal +from PySide6.QtGui import QImage +import time + +from Analysis.SentimentAnalysis import run_sentiment_summary +from Analysis.WordCloud import WordCloudAnalyzer + +class AnalysisWorker(QObject): + """ + Threaded worker to run analysis (sentiment summary + wordcloud) on a list of sentences. + Emits progress updates for the splash screen and delivers QImage results via signals. + """ + + progress_updated = Signal(str) + progress_percentage = Signal(int) + finished = Signal() + sentiment_ready = Signal(QImage) + wordcloud_ready = Signal(QImage) + + def __init__(self, sentences: list[str], sentiment_size: tuple = (1600, 520), + wordcloud_size: tuple = (2800, 1680), max_words: int = 200): + super().__init__() + self.sentences = sentences or [] + self.sent_w, self.sent_h = sentiment_size + self.wc_w, self.wc_h = wordcloud_size + self.max_words = max_words + self._cancelled = False + + def cancel(self): + self._cancelled = True + + def run(self) -> None: + try: + total_stages = 4 + stage = 0 + + # Stage 1: prepare input and count sentences + stage += 1 + self.progress_updated.emit("Preparing sentences for analysis...") + self.progress_percentage.emit(int((stage/total_stages)*100 * 0.02)) # emit a small initial percentage + + sentences = self.sentences + n = len(sentences) + if self._cancelled: + self.progress_updated.emit("Analysis cancelled.") + self.finished.emit() + return + + # Stage 2: Sentiment (iterate sentences — dynamic progress) + stage += 1 + self.progress_updated.emit("Running sentiment analysis...") + # We'll update percent dynamically across this stage (weight: 45%) + sentiment_stage_weight = 45 + base = int(((stage-1)/total_stages) * 100) + if n == 0: + self.progress_percentage.emit(base + 1) + else: + # process in micro-batches to allow progress updates + batch = max(1, n // 20) + processed = 0 + # run_sentiment_summary expects the full sentence list and is not incremental; + # per-sentence progress would require calling VADER directly. + # To keep this worker decoupled from analysis internals, call it once and report approximate progress for this stage.
+ # Show incremental progress while computing + for i in range(0, n, batch): + if self._cancelled: + self.progress_updated.emit("Analysis cancelled.") + self.finished.emit() + return + # small sleep to let UI update if heavy + time.sleep(0.01) + processed += min(batch, n - i) + frac = processed / n + pct = base + int(frac * sentiment_stage_weight) + self.progress_percentage.emit(min(pct, 99)) + + # Now compute final sentiment image + sentiment_img = run_sentiment_summary(sentences, width=self.sent_w, height=self.sent_h) + self.sentiment_ready.emit(sentiment_img) + + # Stage 3: Wordcloud (weight: 45%) + stage += 1 + self.progress_updated.emit("Generating word cloud...") + wc_base = int(((stage-1)/total_stages) * 100) + # Quick progress ticks while generating + # generate_wordcloud is blocking; show small animation ticks before/after + for tick in range(3): + if self._cancelled: + self.progress_updated.emit("Analysis cancelled.") + self.finished.emit() + return + time.sleep(0.05) + self.progress_percentage.emit(wc_base + int((tick+1) * (40/3))) + + wc_img = WordCloudAnalyzer(max_words=self.max_words).generate_wordcloud(sentences, width=self.wc_w, height=self.wc_h) + self.wordcloud_ready.emit(wc_img) + self.progress_percentage.emit(95) + + # Stage 4: Finalizing + stage += 1 + self.progress_updated.emit("Finalizing results...") + time.sleep(0.05) + self.progress_percentage.emit(100) + self.progress_updated.emit("Analysis complete.") + self.finished.emit() + + except Exception as e: + # best-effort error reporting + try: + self.progress_updated.emit(f"Analysis error: {str(e)}") + except Exception: + pass + self.finished.emit() diff --git a/Backend/ScrapeComments.py b/Backend/ScrapeComments.py index 5d8f5db..62c5606 100644 --- a/Backend/ScrapeComments.py +++ b/Backend/ScrapeComments.py @@ -31,30 +31,52 @@ def __init__(self, video_details: Dict[str, List[str]]) -> None: def run(self) -> None: """ Executes the comment fetching process. + Shows video title / channel name instead of raw IDs when available. 
""" try: total_videos = sum(len(v_list) for v_list in self.video_details.values()) processed_count = 0 - + self.progress_updated.emit("Starting comment scrape...") self.progress_percentage.emit(0) + # helper to get title from DB + def _get_title(vid, ch): + try: + rows = self.fetcher.db.fetch("VIDEO", where="video_id=?", params=(vid,)) + if rows: + return rows[0].get("title") or vid + except Exception: + pass + return vid + + def _get_channel_name(ch): + try: + rows = self.fetcher.db.fetch("CHANNEL", where="channel_id=?", params=(ch,)) + if rows: + return rows[0].get("channel_name") or str(ch) + except Exception: + pass + return str(ch) + for channel_id, video_id_list in self.video_details.items(): + channel_name = _get_channel_name(channel_id) for video_id in video_id_list: - self.progress_updated.emit(f"Fetching comments for {video_id}...") - + video_title = _get_title(video_id, channel_id) + self.progress_updated.emit(f"Fetching comments for: \"{video_title}\" (channel: {channel_name})") + # Perform fetch result = self.fetcher._fetch(video_id, channel_id) - + processed_count += 1 percentage = int((processed_count / total_videos) * 100) self.progress_percentage.emit(percentage) - + if result.get("filepath"): count = result.get("comment_count", 0) - self.progress_updated.emit(f"Saved {count} comments for {video_id}") + self.progress_updated.emit(f"Saved {count} comments for \"{video_title}\"") else: - self.progress_updated.emit(f"Skipped: {video_id} ({result.get('remarks')})") + self.progress_updated.emit(f"Skipped: \"{video_title}\" ({result.get('remarks')})") self.progress_updated.emit("Comment scraping completed!") self.progress_percentage.emit(100) @@ -66,6 +88,7 @@ def run(self) -> None: self.finished.emit() + class CommentFetcher: """ A class to fetch YouTube video comments with threads using yt-dlp. diff --git a/Backend/ScrapeTranscription.py b/Backend/ScrapeTranscription.py index 80277f6..03d7872 100644 --- a/Backend/ScrapeTranscription.py +++ b/Backend/ScrapeTranscription.py @@ -32,31 +32,49 @@ def __init__(self, video_details: dict[str, list], languages: list = ["en"]) -> def run(self) -> None: """ Executes the transcript fetching process. + Shows human-friendly names (video title) in progress messages when available. 
""" try: total_videos = sum(len(v_list) for v_list in self.video_details.values()) processed_count = 0 - + self.progress_updated.emit("Starting transcript scrape...") self.progress_percentage.emit(0) language_option = ["en"] + # helper to get title from DB + def _get_title(vid, ch): + try: + rows = self.fetcher.db.fetch("VIDEO", where="video_id=?", params=(vid,)) + if rows: + return rows[0].get("title") or vid + except Exception: + pass + return vid + for channel_id, video_id_list in self.video_details.items(): + # try get channel name + try: + ch_rows = self.fetcher.db.fetch("CHANNEL", where="channel_id=?", params=(channel_id,)) + channel_name = ch_rows[0].get("channel_name") if ch_rows else str(channel_id) + except Exception: + channel_name = str(channel_id) + for video_id in video_id_list: - self.progress_updated.emit(f"Fetching transcript for {video_id}...") - + video_title = _get_title(video_id, channel_id) + self.progress_updated.emit(f"Fetching transcript for: \"{video_title}\"") # Perform fetch result = self.fetcher._fetch(video_id, channel_id, language_option) - + processed_count += 1 percentage = int((processed_count / total_videos) * 100) self.progress_percentage.emit(percentage) - + if result.get("filepath"): - self.progress_updated.emit(f"Saved: {video_id}") + self.progress_updated.emit(f"Saved: \"{video_title}\"") else: - self.progress_updated.emit(f"Skipped: {video_id} ({result.get('remarks')})") + self.progress_updated.emit(f"Skipped: \"{video_title}\" ({result.get('remarks')})") self.progress_updated.emit("Transcript scraping completed!") self.progress_percentage.emit(100) diff --git a/Backend/ScrapeVideo.py b/Backend/ScrapeVideo.py index 475e5e8..3b15c33 100644 --- a/Backend/ScrapeVideo.py +++ b/Backend/ScrapeVideo.py @@ -1,3 +1,4 @@ +# video_worker.py import os import scrapetube import yt_dlp @@ -6,6 +7,7 @@ import asyncio import aiohttp from typing import List, Dict, Optional, Callable + from PySide6.QtCore import QObject, Signal, Slot, QMetaObject, Qt, Q_ARG from Data.DatabaseManager import DatabaseManager @@ -13,34 +15,38 @@ from utils.Logger import logger -def parse_duration(duration: str) -> int: +def parse_duration(duration: Optional[str]) -> int: """ - Converts a duration string from YouTube (e.g. "10:20" or "1:10:20") to an approximate number of seconds. - Returns 0 if parsing fails. + Converts a duration string from YouTube (e.g. "10:20" or "1:10:20") to seconds. + Returns 0 if parsing fails or duration is None. """ + if not duration: + return 0 + + parts = duration.split(":") try: - minutes, seconds = map(int, duration.split(":")) + parts = [int(p) for p in parts] + except Exception: + return 0 + + if len(parts) == 2: + minutes, seconds = parts return minutes * 60 + seconds - - except ValueError: + elif len(parts) == 3: + hours, minutes, seconds = parts + return hours * 3600 + minutes * 60 + seconds + else: + # fallback: try single number as seconds try: - hours, minutes, seconds = map(int, duration.split(":")) - return hours * 3600 + minutes * 60 + seconds + return int(parts[0]) except Exception: return 0 - - except Exception: - return 0 - -def parse_time_since_published(text: str) -> int: - """ - Converts '3 weeks ago' or '2 days ago' to an approximate Unix timestamp. - Parameters: - text (str): The text to parse. - Returns: - int: The parsed timestamp or the current timestamp if parsing fails. +def parse_time_since_published(text: Optional[str]) -> int: + """ + Converts '3 weeks ago' or '2 days ago' to an approximate Unix timestamp. 
+ If text is None or unparsable, returns current timestamp. """ now: datetime = datetime.now(timezone.utc) if not text: @@ -80,17 +86,8 @@ def parse_time_since_published(text: str) -> int: async def download_img_async(url: str, save_path: str, session: aiohttp.ClientSession, semaphore: asyncio.Semaphore) -> bool: """ Download thumbnail image asynchronously. - - Parameters: - url (str): The URL of the image to download. - save_path (str): The path where the image should be saved. - session (aiohttp.ClientSession): The aiohttp session to use for the request. - semaphore (asyncio.Semaphore): The semaphore to use for limiting concurrent requests. - - Returns: - bool: True if the image was downloaded successfully, False otherwise. """ - async with semaphore: # Use existing semaphore + async with semaphore: try: url = str(url) save_path = str(save_path) @@ -101,11 +98,11 @@ async def download_img_async(url: str, save_path: str, session: aiohttp.ClientSe async with session.get(url, timeout=aiohttp.ClientTimeout(total=15)) as response: response.raise_for_status() - + # Ensure parent dir exists + os.makedirs(os.path.dirname(save_path), exist_ok=True) with open(save_path, "wb") as f: async for chunk in response.content.iter_chunked(8192): f.write(chunk) - return True except Exception: @@ -117,20 +114,12 @@ async def download_img_async(url: str, save_path: str, session: aiohttp.ClientSe async def fetch_shorts_metadata_async(video_id: str, session: aiohttp.ClientSession, semaphore: asyncio.Semaphore) -> dict: """ Fetch complete metadata for a short video using yt-dlp asynchronously. - - Parameters: - video_id (str): The YouTube video ID. - session (aiohttp.ClientSession): The aiohttp session to use for the request. - semaphore (asyncio.Semaphore): The semaphore to use for limiting concurrent requests. - - Returns: - dict: A dictionary containing the fetched metadata. + Uses run_in_executor so it doesn't block the event loop. 
""" async with semaphore: try: - # Run yt-dlp in executor to avoid blocking loop = asyncio.get_event_loop() - + ydl_opts = { 'quiet': True, 'no_warnings': True, @@ -138,107 +127,96 @@ async def fetch_shorts_metadata_async(video_id: str, session: aiohttp.ClientSess 'socket_timeout': 10, 'no_check_certificate': True, } - + def extract_info(): with yt_dlp.YoutubeDL(ydl_opts) as ydl: + # Use the shorts URL to get short-specific extractor behavior return ydl.extract_info(f"https://www.youtube.com/shorts/{video_id}", download=False) - + info = await loop.run_in_executor(None, extract_info) - + return { 'video_id': str(video_id), - 'duration': int(info.get('duration', 0)), - 'upload_date': info.get('upload_date'), - 'description': str(info.get('description', '')), - 'view_count': int(info.get('view_count', 0)), - 'title': str(info.get('title', 'Untitled')), + 'duration': int(info.get('duration', 0)) if info.get('duration') is not None else 0, + 'upload_date': info.get('upload_date'), # YYYYMMDD or None + 'description': str(info.get('description', '') or ''), + 'view_count': int(info.get('view_count', 0) or 0), + 'title': str(info.get('title', '') or 'Untitled'), } - except Exception as e: + except Exception: logger.error(f"Failed to fetch metadata for short video: {video_id}") logger.exception("Short metadata fetch error:") return {'video_id': str(video_id), 'error': True} async def fetch_shorts_batch_async( - video_ids: List[str], - progress_callback: Optional[Callable[[int, int], None]] = None, + video_ids: List[str], + progress_callback: Optional[Callable[[int, int], None]] = None, max_concurrent: int = 100 ) -> Dict[str, Dict]: """ - Fetch metadata for multiple shorts in parallel using asyncio. - - Parameters: - video_ids (List[str]): List of YouTube video IDs to fetch metadata. - progress_callback (Optional[Callable[[int, int], None]]): Callback to update progress in main thread. - max_concurrent (int): Maximum number of concurrent requests. - - Returns: - Dict[str, Dict]: Dictionary containing the fetched metadata, with video_id as key. + Fetch metadata for multiple shorts concurrently. 
""" - results = {} + results: Dict[str, Dict] = {} total = len(video_ids) completed = 0 - - # Semaphore to limit concurrent requests + semaphore = asyncio.Semaphore(max_concurrent) - - # Create aiohttp session timeout = aiohttp.ClientTimeout(total=30) + async with aiohttp.ClientSession(timeout=timeout) as session: - + async def fetch_with_progress(video_id: str): nonlocal completed result = await fetch_shorts_metadata_async(str(video_id), session, semaphore) completed += 1 - + if progress_callback: - # Execute callback in main thread using Qt's signal mechanism - QMetaObject.invokeMethod( - progress_callback, - "update_from_async", - Qt.QueuedConnection, - Q_ARG(int, completed), - Q_ARG(int, total) - ) - + try: + QMetaObject.invokeMethod( + progress_callback, + "update_from_async", + Qt.QueuedConnection, + Q_ARG(int, completed), + Q_ARG(int, total) + ) + except Exception: + # fallback: call directly (shouldn't happen in Qt main thread) + try: + progress_callback.update_from_async(completed, total) + except Exception: + pass + return result - - # Create all tasks - tasks = [fetch_with_progress(str(vid)) for vid in video_ids] - - # Execute all tasks concurrently + + tasks = [fetch_with_progress(vid) for vid in video_ids] all_results = await asyncio.gather(*tasks, return_exceptions=True) - - # Process results - for result in all_results: - if isinstance(result, dict) and 'video_id' in result: - results[result['video_id']] = result - + + for r in all_results: + if isinstance(r, dict) and 'video_id' in r: + results[r['video_id']] = r + return results def run_async_shorts_fetch(video_ids: list, progress_callback=None, max_concurrent: int = 100) -> dict: """ - Wrapper to run async shorts fetching in a new event loop. + Helper to run the fetch_shorts_batch_async from synchronous context if needed. """ try: - # Try to get existing loop loop = asyncio.get_event_loop() if loop.is_running(): - # If loop is running, create a new one loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) except RuntimeError: - # No loop exists, create one loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) - + try: return loop.run_until_complete( fetch_shorts_batch_async(video_ids, progress_callback, max_concurrent) ) finally: - # Don't close the loop if it was running before pass @@ -252,8 +230,9 @@ def __init__(self, channel_id: str, channel_url: str, scrape_shorts: bool): self.db: DatabaseManager = app_state.db self.channel_id = channel_id self.channel_url = channel_url - self.scrape_shorts = scrape_shorts + self.scrape_shorts = bool(scrape_shorts) + # types that scrapetube accepts for content_type parameter self.types = { "videos": "videos", "shorts": "shorts", @@ -268,35 +247,51 @@ def __init__(self, channel_id: str, channel_url: str, scrape_shorts: bool): @Slot() def run(self): """ - SAFE ENTRY POINT FOR QTHREAD + Entry point callable by a QThread. Uses asyncio.run for the coroutine root. + Guarantees finished signal in finally block of _fetch_video_urls_async. """ try: asyncio.run(self._fetch_video_urls_async()) except Exception: - logger.exception("VideoWorker crashed:") + logger.exception("VideoWorker crashed in run():") finally: - # ✅ GUARANTEED EXIT PATH - self.finished.emit() + # ensure finished if not already emitted + try: + self.finished.emit() + except Exception: + pass @Slot(int, int) def update_from_async(self, completed: int, total: int): + """ + Slot used by the shorts metadata fetcher to push progress back to GUI. 
+ """ msg = f"[Shorts] Fetching metadata: {completed}/{total}" self.progress_updated.emit(msg) - pct = int((self.current_type_counter - 1) * 33 + (completed / total) * 20) + try: + pct = int((self.current_type_counter - 1) * 33 + (completed / total) * 20) + except Exception: + pct = 0 self.progress_percentage.emit(min(pct, 95)) - # ✅ INTERRUPTION SAFE CHECK def _should_stop(self): + # This uses QThread interruption mechanism to check for cancellation. from PySide6.QtCore import QThread - return QThread.currentThread().isInterruptionRequested() + try: + return QThread.currentThread().isInterruptionRequested() + except Exception: + return False async def _fetch_video_urls_async(self): + """ + Main coroutine that scrapes channel pages via scrapetube, optionally + enriches shorts via yt-dlp, downloads thumbnails, and inserts into DB. + """ try: self.progress_updated.emit("Starting scrapetube scraping...") self.progress_percentage.emit(0) total_processed = 0 - channel_thumb_dir = os.path.join(self.db.thumbnail_dir, str(self.channel_id)) os.makedirs(channel_thumb_dir, exist_ok=True) @@ -305,8 +300,7 @@ async def _fetch_video_urls_async(self): thumbnail_semaphore = asyncio.Semaphore(20) for i, (vtype, ctype) in enumerate(self.types.items(), start=1): - - # ✅ USER CANCEL SUPPORT + # cancellation check if self._should_stop(): self.progress_updated.emit("Scraping cancelled by user") return @@ -315,31 +309,32 @@ async def _fetch_video_urls_async(self): self.progress_updated.emit(f"Fetching {vtype.capitalize()}...") self.progress_percentage.emit(int((i - 1) * 33)) - videos = list(scrapetube.get_channel( - channel_url=self.channel_url, - content_type=ctype - )) + # scrapetube yields video dicts for the channel and content type + try: + videos = list(scrapetube.get_channel(channel_url=self.channel_url, content_type=ctype)) + except Exception: + logger.exception("scrapetube.get_channel failed:") + videos = [] if not videos: + self.progress_updated.emit(f"No {vtype} found.") continue self.progress_updated.emit(f"Fetched {len(videos)} {vtype}") - # === SHORTS METADATA === + # If shorts, prefetch extended metadata via yt-dlp (more reliable) shorts_metadata = {} if vtype == "shorts": video_ids = [v.get("videoId") for v in videos if v.get("videoId")] - shorts_metadata = await fetch_shorts_batch_async( - video_ids, - progress_callback=self, - max_concurrent=30 - ) + if video_ids: + self.progress_updated.emit(f"[Shorts] Fetching metadata for {len(video_ids)} shorts (async)...") + shorts_metadata = await fetch_shorts_batch_async(video_ids, progress_callback=self, max_concurrent=30) + self.progress_updated.emit(f"[Shorts] Metadata fetched ({len(shorts_metadata)}).") thumbnail_tasks = [] videos_to_insert = [] for idx, video in enumerate(videos): - if self._should_stop(): self.progress_updated.emit("Scraping cancelled by user") return @@ -348,62 +343,185 @@ async def _fetch_video_urls_async(self): if not video_id: continue + # Default fields title = ( video.get("title", {}) .get("runs", [{}])[0] .get("text", "Untitled") ) + description = "" + duration_text = None + duration_in_seconds = 0 + time_since_published = None + upload_timestamp = int(datetime.now(timezone.utc).timestamp()) + views = 0 + + # Thumbnail from scrapetube if available thumbnails = video.get("thumbnail", {}).get("thumbnails", []) thumbnail_url = thumbnails[-1].get("url") if thumbnails else None - thumb_path = os.path.join(channel_thumb_dir, f"{video_id}.png") - if thumbnail_url and not os.path.exists(thumb_path): - thumbnail_tasks.append( - 
download_img_async( - thumbnail_url, - thumb_path, - session, - thumbnail_semaphore + # SHORTS: enrich from yt-dlp results when available + if vtype == "shorts": + meta = shorts_metadata.get(video_id, {}) + if meta and not meta.get("error"): + title = meta.get("title", title) + description = meta.get("description", "") + duration_in_seconds = int(meta.get("duration", 0) or 0) + if duration_in_seconds: + # format duration text as M:SS or H:MM:SS + h, rem = divmod(duration_in_seconds, 3600) + m, s = divmod(rem, 60) + duration_text = (f"{h}:{m:02d}:{s:02d}" if h else f"{m}:{s:02d}") + else: + duration_text = None + + views = int(meta.get("view_count", 0) or 0) + + upload_date_str = meta.get("upload_date") # YYYYMMDD + if upload_date_str: + try: + dt = datetime.strptime(upload_date_str, "%Y%m%d").replace(tzinfo=timezone.utc) + upload_timestamp = int(dt.timestamp()) + days_ago = (datetime.now(timezone.utc) - dt).days + if days_ago == 0: + time_since_published = "Today" + elif days_ago == 1: + time_since_published = "1 day ago" + elif days_ago < 7: + time_since_published = f"{days_ago} days ago" + elif days_ago < 30: + weeks = days_ago // 7 + time_since_published = f"{weeks} week{'s' if weeks > 1 else ''} ago" + elif days_ago < 365: + months = days_ago // 30 + time_since_published = f"{months} month{'s' if months > 1 else ''} ago" + else: + years = days_ago // 365 + time_since_published = f"{years} year{'s' if years > 1 else ''} ago" + except Exception: + upload_timestamp = int(datetime.now(timezone.utc).timestamp()) + time_since_published = None + else: + # fallback to scrapetube partial info if yt-dlp failed + title = ( + video.get("title", {}) + .get("runs", [{}])[0] + .get("text", "Untitled") ) + description = "" + duration_text = None + duration_in_seconds = 0 + views = 0 + upload_timestamp = int(datetime.now(timezone.utc).timestamp()) + time_since_published = None + + else: + # NON-SHORTS: parse fields from scrapetube payload (same logic as old module) + # Title already pulled above + description = ( + video.get("descriptionSnippet", {}) + .get("runs", [{}])[0] + .get("text", "") + ) + + duration_text = ( + video.get("lengthText", {}).get("simpleText") + or video.get("lengthText", {}).get("runs", [{}])[0].get("text") + or None + ) + duration_in_seconds = parse_duration(duration_text) if duration_text else 0 + + time_since_published = ( + video.get("publishedTimeText", {}).get("simpleText") + or video.get("publishedTimeText", {}).get("runs", [{}])[0].get("text") + or None ) + upload_timestamp = parse_time_since_published(time_since_published) - videos_to_insert.append({ + # Parse view count text (may be like "1,234,567 views" or "1.2M views") + view_text = ( + video.get("viewCountText", {}).get("simpleText") + or video.get("viewCountText", {}).get("runs", [{}])[0].get("text", "") + ) + views = 0 + if view_text: + try: + # Normalize common formats: + # - "1,234 views" + # - "1.2M views" + # - "1.2K views" + # Remove trailing "views" and whitespace + vt = view_text.replace("views", "").strip().lower() + # Handle suffixes + if vt.endswith("k"): + views = int(float(vt[:-1].replace(",", "")) * 1_000) + elif vt.endswith("m"): + views = int(float(vt[:-1].replace(",", "")) * 1_000_000) + elif vt.endswith("b"): + views = int(float(vt[:-1].replace(",", "")) * 1_000_000_000) + else: + views = int(vt.replace(",", "").replace(".", "")) + except Exception: + # best-effort fallback to remove non-digits + digits = re.sub(r"[^\d]", "", view_text) + try: + views = int(digits) if digits else 0 + except 
Exception: + views = 0 + + # Schedule thumbnail download if needed + if thumbnail_url and not os.path.exists(thumb_path): + thumbnail_tasks.append(download_img_async(thumbnail_url, thumb_path, session, thumbnail_semaphore)) + + # Prepare DB record per your (new) schema + video_record = { "video_id": video_id, "channel_id": self.channel_id, "video_type": vtype, "video_url": f"https://www.youtube.com/watch?v={video_id}", "title": title, - "desc": "", - "duration": None, - "duration_in_seconds": 0, + "desc": description, + "duration": duration_text, + "duration_in_seconds": int(duration_in_seconds or 0), "thumbnail_path": thumb_path, - "view_count": 0, - "time_since_published": None, - "upload_timestamp": int(datetime.now(timezone.utc).timestamp()) - }) - - if (idx + 1) % 10 == 0: - self.progress_updated.emit( - f"[{vtype.capitalize()}] {idx+1}/{len(videos)}" - ) + "view_count": int(views or 0), + "time_since_published": time_since_published, + "upload_timestamp": int(upload_timestamp or int(datetime.now(timezone.utc).timestamp())) + } + + videos_to_insert.append(video_record) - # === DOWNLOAD THUMBNAILS === + # progress update per chunk + if (idx + 1) % 10 == 0 or idx == len(videos) - 1: + self.progress_updated.emit(f"[{vtype.capitalize()}] Processing: {idx+1}/{len(videos)}") + + # Wait thumbnails if thumbnail_tasks: - self.progress_updated.emit(f"[{vtype.capitalize()}] Downloading thumbnails...") + self.progress_updated.emit(f"[{vtype.capitalize()}] Downloading {len(thumbnail_tasks)} thumbnails...") await asyncio.gather(*thumbnail_tasks, return_exceptions=True) + self.progress_updated.emit(f"[{vtype.capitalize()}] ✓ All thumbnails downloaded") - # === DATABASE SAVE === + # Insert into DB (one by one to allow DB layer to handle duplicates/constraints) + self.progress_updated.emit(f"[{vtype.capitalize()}] Saving {len(videos_to_insert)} videos to database...") for video_data in videos_to_insert: - self.db.insert("VIDEO", video_data) + try: + # Depending on your DB manager, you may prefer upsert. + # Here we call insert() and let DatabaseManager handle uniqueness/constraints. + self.db.insert("VIDEO", video_data) + except Exception: + logger.exception("DB insert failed for video_id=%s", video_data.get("video_id")) total_processed += len(videos_to_insert) - + self.progress_updated.emit(f"[{vtype.capitalize()}] ✓ Saved {len(videos_to_insert)} videos") self.progress_percentage.emit(min(i * 33, 95)) self.progress_updated.emit(f"Completed scraping! Total {total_processed} videos saved.") self.progress_percentage.emit(100) except Exception: - logger.exception("Async scrape failure") \ No newline at end of file + logger.exception("Async scrape failure") + self.progress_updated.emit("Scraping failed — check logs.") + self.progress_percentage.emit(0) + # Do not swallow the exception silently — finalizer will emit finished diff --git a/README.md b/README.md index aa4a753..4e08e8e 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ The application utilizes a local architecture where data is scraped from YouTube ## 🔥 Features -- 🆓 **No Credentials Needed**: Use the application immediately—no registration, login, or API key is required. +- 🔓 **No Credentials Needed**: Use the application immediately—no registration, login, or API key is required. - 🎯 **Channel Scraping**: Fetch the list of videos from any specific YouTube channel. - 📄 **Transcription Retrieval**: Retrieve and display video transcriptions (if available). - 💬 **Comment Analysis**: Fetch and display user comments for specific videos. 
@@ -564,10 +564,10 @@ To generate the installer locally, you must have Inno Setup installed and compil ## 🌻 Roadmap - [x] **Export analysis**: Export and save analysis result image to a file. -- [ ] **Docker Version**: A Dockerized version of the application is planned. -- [ ] **Proxy Settings**: Ability to configure network proxy settings. - [ ] **Theming**: Light/Dark theme support. - [ ] **In-App Help**: Built-in documentation and help guide. +- [ ] **Proxy Settings**: Ability to configure network proxy settings. +- [ ] **Docker Version**: A Dockerized version of the application is planned. --- @@ -630,4 +630,4 @@ StaTube is protected under the [MIT License](https://choosealicense.com/licenses -[back-to-top]: https://img.shields.io/badge/_BACK_TO_TOP_-151515?style=flat-square \ No newline at end of file +[back-to-top]: https://img.shields.io/badge/_BACK_TO_TOP_-151515?style=flat-square diff --git a/UI/CommentPage.py b/UI/CommentPage.py index 61282cd..24c17c6 100644 --- a/UI/CommentPage.py +++ b/UI/CommentPage.py @@ -1,4 +1,4 @@ -from PySide6.QtCore import Signal, QTimer +from PySide6.QtCore import Signal, QTimer, QThread from PySide6.QtWidgets import ( QWidget, QLabel, QVBoxLayout, QScrollArea, QSizePolicy ) @@ -8,8 +8,8 @@ import os from Backend.ScrapeComments import CommentFetcher -from Analysis.SentimentAnalysis import run_sentiment_summary -from Analysis.WordCloud import WordCloudAnalyzer +from Backend.AnalysisWorker import AnalysisWorker +from UI.SplashScreen import SplashScreen from utils.AppState import app_state from utils.Logger import logger @@ -122,35 +122,78 @@ def _generate_and_display_images(self): self.scroll_layout.addWidget(QLabel("No comments found.")) return - # Fixed HD sizes (Option A) + # Sizes sent_w = 1600 sent_h = int(sent_w * 0.33) wc_w = 2800 wc_h = int(wc_w * 0.6) - logger.info(f"CommentPage: Generating sentiment {sent_w}x{sent_h}, wordcloud {wc_w}x{wc_h}") - - try: - sentiment_img = run_sentiment_summary(self.comments, width=sent_w, height=sent_h) - wc_img = WordCloudAnalyzer(max_words=100).generate_wordcloud(self.comments, width=wc_w, height=wc_h) - except Exception: - logger.exception("CommentPage: Error generating images") - self.scroll_layout.addWidget(QLabel("Failed to generate analysis images.")) - return - - self.sentiment_image = sentiment_img - self.wordcloud_image = wc_img + logger.info(f"CommentPage: Queuing analysis sentiment {sent_w}x{sent_h}, wordcloud {wc_w}x{wc_h}") + + self.analysis_thread = QThread() + self.analysis_worker = AnalysisWorker(self.comments, sentiment_size=(sent_w, sent_h), wordcloud_size=(wc_w, wc_h), max_words=100) + self.analysis_worker.moveToThread(self.analysis_thread) + + # Create splash + parent_win = self.window() if hasattr(self, "window") else None + self.splash = SplashScreen(parent=parent_win) + self.splash.set_title("Analyzing comments...") + self.splash.update_status("Preparing analysis...") + self.splash.set_progress(0) + self.splash.enable_runtime_mode(parent_window=parent_win, cancel_callback=self._cancel_analysis) + self.splash.show_with_animation() + + # Wire signals + self.analysis_thread.started.connect(self.analysis_worker.run) + self.analysis_worker.progress_updated.connect(lambda m: (self.splash.update_status(m) if self.splash else None)) + self.analysis_worker.progress_percentage.connect(lambda p: (self.splash.set_progress(p) if self.splash else None)) + self.analysis_worker.sentiment_ready.connect(self._on_sentiment_ready) + self.analysis_worker.wordcloud_ready.connect(self._on_wordcloud_ready) + 
self.analysis_worker.finished.connect(self.analysis_thread.quit) + self.analysis_worker.finished.connect(self.analysis_worker.deleteLater) + self.analysis_thread.finished.connect(self.analysis_thread.deleteLater) + # When thread fully finishes, fade splash + self.analysis_thread.finished.connect(lambda: (self.splash.fade_and_close(300) if self.splash else None)) + + self.analysis_thread.start() + + # helper cancel method + def _cancel_analysis(self): + if hasattr(self, "analysis_worker") and self.analysis_worker: + try: + self.analysis_worker.cancel() + except Exception: + pass + # also attempt to stop thread gracefully + if hasattr(self, "analysis_thread") and self.analysis_thread.isRunning(): + try: + self.analysis_thread.requestInterruption() + self.analysis_thread.quit() + self.analysis_thread.wait(200) + except Exception: + pass + # ensure UI shows canceled + for i in reversed(range(self.scroll_layout.count())): + w = self.scroll_layout.itemAt(i).widget() + if w: + w.deleteLater() + self.scroll_layout.addWidget(QLabel("Analysis cancelled.")) + # slots to receive images + def _on_sentiment_ready(self, qimage): + self.sentiment_image = qimage + # show immediately (title) channel_name = next(iter(app_state.video_list.keys()), "unknown") - - self.scroll_layout.addWidget(QLabel("Sentimental Analysis")) - sent_widget = DownloadableImage(sentiment_img, default_name=f"comment_sentiment_{channel_name}.png") + self.scroll_layout.addWidget(QLabel("Sentiment Analysis")) + sent_widget = DownloadableImage(qimage, default_name=f"comment_sentiment_{channel_name}.png") sent_widget.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed) self.scroll_layout.addWidget(sent_widget) + def _on_wordcloud_ready(self, qimage): + self.wordcloud_image = qimage self.scroll_layout.addWidget(QLabel("Word Cloud")) - wc_widget = DownloadableImage(wc_img, default_name=f"comment_wordcloud_{channel_name}.png") + channel_name = next(iter(app_state.video_list.keys()), "unknown") + wc_widget = DownloadableImage(qimage, default_name=f"comment_wordcloud_{channel_name}.png") wc_widget.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed) self.scroll_layout.addWidget(wc_widget) - self.scroll_layout.addStretch(1) diff --git a/UI/TranscriptPage.py b/UI/TranscriptPage.py index db70252..d2ff959 100644 --- a/UI/TranscriptPage.py +++ b/UI/TranscriptPage.py @@ -3,14 +3,14 @@ import os from typing import Optional, List -from PySide6.QtCore import Signal, QTimer +from PySide6.QtCore import Signal, QTimer, QThread from PySide6.QtWidgets import ( QWidget, QLabel, QVBoxLayout, QScrollArea, QSizePolicy ) from Backend.ScrapeTranscription import TranscriptFetcher -from Analysis.SentimentAnalysis import run_sentiment_summary -from Analysis.WordCloud import WordCloudAnalyzer +from Backend.AnalysisWorker import AnalysisWorker +from UI.SplashScreen import SplashScreen from utils.AppState import app_state from utils.Logger import logger from widgets.DownloadableImage import DownloadableImage @@ -94,7 +94,6 @@ def scrape_transcript(self): self._generate_and_display_images() def _generate_and_display_images(self): - # clear previous for i in reversed(range(self.scroll_layout.count())): w = self.scroll_layout.itemAt(i).widget() if w: @@ -104,39 +103,71 @@ def _generate_and_display_images(self): self.scroll_layout.addWidget(QLabel("No transcript found.")) return - # Fixed HD sizes (Option A) + # Sizes sent_w = 1600 sent_h = int(sent_w * 0.33) wc_w = 2800 wc_h = int(wc_w * 0.6) - logger.info(f"TranscriptPage: Generating sentiment {sent_w}x{sent_h}, wordcloud 
{wc_w}x{wc_h}") + logger.info(f"TranscriptPage: Queuing analysis sentiment {sent_w}x{sent_h}, wordcloud {wc_w}x{wc_h}") + + self.analysis_thread = QThread() + self.analysis_worker = AnalysisWorker(self.transcript_sentences, sentiment_size=(sent_w, sent_h), wordcloud_size=(wc_w, wc_h), max_words=120) + self.analysis_worker.moveToThread(self.analysis_thread) + + parent_win = self.window() if hasattr(self, "window") else None + self.splash = SplashScreen(parent=parent_win) + self.splash.set_title("Analyzing transcripts...") + self.splash.update_status("Preparing analysis...") + self.splash.set_progress(0) + self.splash.enable_runtime_mode(parent_window=parent_win, cancel_callback=self._cancel_analysis) + self.splash.show_with_animation() + + self.analysis_thread.started.connect(self.analysis_worker.run) + self.analysis_worker.progress_updated.connect(lambda m: (self.splash.update_status(m) if self.splash else None)) + self.analysis_worker.progress_percentage.connect(lambda p: (self.splash.set_progress(p) if self.splash else None)) + self.analysis_worker.sentiment_ready.connect(self._on_sentiment_ready) + self.analysis_worker.wordcloud_ready.connect(self._on_wordcloud_ready) + self.analysis_worker.finished.connect(self.analysis_thread.quit) + self.analysis_worker.finished.connect(self.analysis_worker.deleteLater) + self.analysis_thread.finished.connect(self.analysis_thread.deleteLater) + self.analysis_thread.finished.connect(lambda: (self.splash.fade_and_close(300) if self.splash else None)) + + self.analysis_thread.start() + + def _cancel_analysis(self): + if hasattr(self, "analysis_worker") and self.analysis_worker: + try: + self.analysis_worker.cancel() + except Exception: + pass + if hasattr(self, "analysis_thread") and self.analysis_thread.isRunning(): + try: + self.analysis_thread.requestInterruption() + self.analysis_thread.quit() + self.analysis_thread.wait(200) + except Exception: + pass - try: - sentiment_img = run_sentiment_summary(self.transcript_sentences, width=sent_w, height=sent_h) - wc_img = WordCloudAnalyzer(max_words=120).generate_wordcloud(self.transcript_sentences, width=wc_w, height=wc_h) - except Exception: - logger.exception("TranscriptPage: Error generating images") - self.scroll_layout.addWidget(QLabel("Failed to generate analysis images.")) - return - - self.sentiment_image = sentiment_img - self.wordcloud_image = wc_img + for i in reversed(range(self.scroll_layout.count())): + w = self.scroll_layout.itemAt(i).widget() + if w: + w.deleteLater() + self.scroll_layout.addWidget(QLabel("Analysis cancelled.")) + def _on_sentiment_ready(self, qimage): + self.sentiment_image = qimage channel_name = next(iter(app_state.video_list.keys()), "unknown") - - # Title label self.scroll_layout.addWidget(QLabel("Sentiment Analysis")) - - # DownloadableImage displays at natural size and provides download overlay - sent_widget = DownloadableImage(sentiment_img, default_name=f"transcript_sentiment_{channel_name}.png") + sent_widget = DownloadableImage(qimage, default_name=f"transcript_sentiment_{channel_name}.png") sent_widget.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed) self.scroll_layout.addWidget(sent_widget) + def _on_wordcloud_ready(self, qimage): + self.wordcloud_image = qimage self.scroll_layout.addWidget(QLabel("Word Cloud")) - wc_widget = DownloadableImage(wc_img, default_name=f"transcript_wordcloud_{channel_name}.png") + channel_name = next(iter(app_state.video_list.keys()), "unknown") + wc_widget = DownloadableImage(qimage, 
default_name=f"transcript_wordcloud_{channel_name}.png") wc_widget.setSizePolicy(QSizePolicy.Fixed, QSizePolicy.Fixed) self.scroll_layout.addWidget(wc_widget) - - # Spacer self.scroll_layout.addStretch(1) diff --git a/UI/VideoPage.py b/UI/VideoPage.py index fb8be99..a7ef232 100644 --- a/UI/VideoPage.py +++ b/UI/VideoPage.py @@ -525,6 +525,17 @@ def on_grid_clicked(self, checked: bool) -> None: else: self.grid_btn.setChecked(True) + def _complete_splash(self): + """ + Ensures splash + overlays are ALWAYS removed safely. + Called ONLY after QThread has fully exited. + """ + self._clear_overlays() + + if self.splash: + self.splash.fade_and_close(300) + self.splash = None + # --- Scraping --- def scrape_videos(self, scrape_shorts: bool) -> None: """ @@ -573,14 +584,14 @@ def show_splash_screen(self, parent: Optional[QWidget] = None, gif_path: str = " self.splash.close() self.splash = None - # ✅ IMPORTANT FIX: parent MUST be None + # IMPORTANT FIX: parent MUST be None self.splash = SplashScreen(parent=None, gif_path=gif_path) self.splash.set_title(title) self.splash.update_status("Starting...") self.splash.set_progress(0) - # ✅ Overlay still binds to mainwindow correctly + # Overlay still binds to mainwindow correctly self.splash.enable_runtime_mode( parent_window=self.mainwindow, cancel_callback=self.cancel_scraping @@ -622,7 +633,7 @@ def cancel_scraping(self): self.comment_thread.quit() self.comment_thread.wait(500) - # ✅ Force-remove overlays + # Force-remove overlays self._clear_overlays() # Fade & cleanup splash safely @@ -653,12 +664,6 @@ def update_splash_progress(self, message: str) -> None: self.splash.update_status(message) def update_splash_percentage(self, percentage: int) -> None: - """ - Updates the progress bar of the SplashScreen dialog. - - Args: - percentage (int): The progress percentage (0-100) to display. - """ if self.splash: self.splash.set_progress(percentage) @@ -683,26 +688,24 @@ def on_worker_finished(self) -> None: def on_transcript_worker_finished(self) -> None: """ - Called when the TranscriptWorker thread has finished scraping transcripts. - Closes the SplashScreen dialog. + Worker has emitted 'finished', but QThread may still be running. + We wait for the thread to fully exit before closing splash. """ - self._clear_overlays() + if self.transcript_thread: + self.transcript_thread.finished.connect(self._complete_splash) - if self.splash is not None: - self.splash.fade_and_close(400) - self.splash = None + # Notify rest of app that transcript scraping is done self.video_page_scrape_transcript_signal.emit() def on_comment_worker_finished(self) -> None: """ - Called when the CommentWorker thread has finished scraping comments. - Closes the SplashScreen dialog. + Worker has finished. Wait for thread to close fully + before removing the splash + overlay. """ - self._clear_overlays() + if self.comment_thread: + self.comment_thread.finished.connect(self._complete_splash) - if self.splash is not None: - self.splash.fade_and_close(400) - self.splash = None + # Notify rest of app that comment scraping is done self.video_page_scrape_comments_signal.emit() # --- Loading videos ---