diff --git a/src/mcp/local-tools.js b/src/mcp/local-tools.js index 7ad47e4..08a790a 100644 --- a/src/mcp/local-tools.js +++ b/src/mcp/local-tools.js @@ -20,6 +20,9 @@ import { scrapeTweets, searchTweets, scrapeThread, + scrapePost, + scrapeLikedTweets, + discoverLikes, scrapeLikes, scrapeMedia, scrapeListMembers, @@ -61,9 +64,25 @@ async function ensureBrowser() { return { browser, page }; } +/** + * Create a new tab in the shared browser for isolated work. + * Shares cookies/auth with all other tabs. Caller must close the tab when done. + */ +async function newTab(timeout = 60000) { + const { browser: br } = await ensureBrowser(); + const tab = await createPage(br); + tab.setDefaultTimeout(timeout); + return tab; +} + /** * Close browser (called by server.js on SIGINT/SIGTERM) */ +export async function getPage() { + const { page } = await ensureBrowser(); + return page; +} + export async function closeBrowser() { if (browser) { try { @@ -198,8 +217,15 @@ export async function x_search_tweets({ query, limit = 50 }) { // ============================================================================ export async function x_get_thread({ url }) { - const { page: pg } = await ensureBrowser(); - return scrapeThread(pg, url); + const tab = await newTab(); + try { return await scrapeThread(tab, url); } + finally { await tab.close().catch(() => {}); } +} + +export async function x_read_post({ url }) { + const tab = await newTab(); + try { return await scrapePost(tab, url); } + finally { await tab.close().catch(() => {}); } } export async function x_best_time_to_post({ username, limit = 100 }) { @@ -630,6 +656,18 @@ export async function x_get_bookmarks({ limit = 100 }) { return scrapeBookmarks(pg, { limit }); } +export async function x_get_likes({ username, limit = 50, from, to }) { + const tab = await newTab(); + try { return await scrapeLikedTweets(tab, username, { limit, from, to }); } + finally { await tab.close().catch(() => {}); } +} + +export async function 
x_discover_likes({ username, limit = 50, from, to }) { + const tab = await newTab(); + try { return await discoverLikes(tab, username, { limit, from, to }); } + finally { await tab.close().catch(() => {}); } +} + export async function x_clear_bookmarks() { const { page: pg } = await ensureBrowser(); await pg.goto('https://x.com/i/bookmarks', { waitUntil: 'networkidle2' }); @@ -1336,6 +1374,8 @@ export async function x_client_get_trends() { // ============================================================================ export const toolMap = { + // Internal helper used by xeepy tools + getPage, // Auth x_login, // Scraping (delegated to scrapers/index.js — single source of truth) @@ -1346,6 +1386,7 @@ export const toolMap = { x_get_tweets, x_search_tweets, x_get_thread, + x_read_post, x_best_time_to_post, // Core actions x_follow, @@ -1369,6 +1410,8 @@ export const toolMap = { x_reply, x_bookmark, x_get_bookmarks, + x_get_likes, + x_discover_likes, x_clear_bookmarks, x_auto_like, // Discovery diff --git a/src/mcp/server.js b/src/mcp/server.js index 1f46777..6e1b135 100755 --- a/src/mcp/server.js +++ b/src/mcp/server.js @@ -772,6 +772,17 @@ const TOOLS = [ required: ['title', 'body'], }, }, + { + name: 'x_read_article', + description: 'Read the full content of an X article given a tweet URL or article URL.', + inputSchema: { + type: 'object', + properties: { + url: { type: 'string', description: 'Tweet URL (x.com/user/status/ID) or article URL (x.com/user/article/ID)' }, + }, + required: ['url'], + }, + }, // ====== Creator ====== { name: 'x_creator_analytics', @@ -1129,6 +1140,17 @@ const TOOLS = [ required: ['url'], }, }, + { + name: 'x_read_post', + description: 'Read a tweet/post with full rich data. Returns thread if the post is part of one (author self-replies only). Recursively resolves quoted tweets — if a quoted tweet is itself a thread or contains its own quote tweet, those are fetched too. 
Each tweet includes: text, media (images + video URLs), article, card (link preview), and engagement stats.', + inputSchema: { + type: 'object', + properties: { + url: { type: 'string', description: 'URL of the tweet/post' }, + }, + required: ['url'], + }, + }, // ====== Posting Analytics ====== { name: 'x_best_time_to_post', @@ -1903,12 +1925,29 @@ const TOOLS = [ }, { name: 'x_get_likes', - description: 'Scrape tweets that a user has liked. Shows what content a user engages with.', + description: 'Scrape tweets that a user has liked. Shows what content a user engages with. Supports timestamp filtering — likes are reverse chronological, so scrolling stops early when it passes the "from" date.', + inputSchema: { + type: 'object', + properties: { + username: { type: 'string', description: 'Username (without @)' }, + limit: { type: 'number', description: 'Maximum liked tweets to return (default: 50)' }, + from: { type: 'string', description: 'Only include likes from this date onward (e.g. "2026-03-01"). Stops scrolling when older tweets are reached.' }, + to: { type: 'string', description: 'Only include likes up to this date (e.g. "2026-03-31"). Skips newer tweets but keeps scrolling.' }, + }, + required: ['username'], + }, + }, + + { + name: 'x_discover_likes', + description: 'Fetch liked tweets and deep-read each one with human-like pacing. Produces two JSONL files: a likes index (summary per tweet) and deep reads (full thread/quote tweet data per tweet via scrapePost). Timing mimics a human browsing their likes tab — scrolling, pausing, tapping into posts, reading, going back. 
Long-running — check the JSONL files on disk for progress.', inputSchema: { type: 'object', properties: { username: { type: 'string', description: 'Username (without @)' }, limit: { type: 'number', description: 'Maximum liked tweets (default: 50)' }, + from: { type: 'string', description: 'Only include likes from this date onward' }, + to: { type: 'string', description: 'Only include likes up to this date' }, }, required: ['username'], }, @@ -2275,7 +2314,7 @@ async function executeTool(name, args) { const xeepyTools = [ 'x_get_replies', 'x_get_hashtag', 'x_get_likers', 'x_get_retweeters', 'x_get_media', 'x_get_recommendations', 'x_get_mentions', 'x_get_quote_tweets', - 'x_get_likes', 'x_auto_follow', 'x_follow_engagers', 'x_unfollow_all', + 'x_read_article', 'x_auto_follow', 'x_follow_engagers', 'x_unfollow_all', 'x_smart_unfollow', 'x_quote_tweet', 'x_auto_comment', 'x_auto_retweet', 'x_detect_bots', 'x_find_influencers', 'x_smart_target', 'x_crypto_analyze', 'x_grok_analyze_image', 'x_audience_insights', 'x_engagement_report', @@ -2475,20 +2514,84 @@ async function executeXeepyTool(name, args) { return { quotes, count: quotes.length }; } - case 'x_get_likes': { - const page = await localTools.getPage(); - await page.goto(`https://x.com/${args.username}/likes`, { waitUntil: 'networkidle2', timeout: 30000 }); - await new Promise(r => setTimeout(r, 3000)); - const likedTweets = await page.evaluate((limit) => { - const articles = document.querySelectorAll('article[data-testid="tweet"]'); - return Array.from(articles).slice(0, limit).map(el => { - const textEl = el.querySelector('[data-testid="tweetText"]'); - const userEl = el.querySelector('[data-testid="User-Name"]'); - const timeEl = el.querySelector('time'); - return { text: textEl?.textContent || '', author: userEl?.textContent || '', timestamp: timeEl?.getAttribute('datetime') || '' }; + case 'x_read_article': { + const { getPage } = await import('./local-tools.js'); + const page = await getPage(); + let url = 
args.url; + // Convert tweet URL to article URL if needed — navigate to tweet first to discover article URL + if (url.includes('/status/') && !url.includes('/article/')) { + await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 }); + await new Promise(r => setTimeout(r, 4000)); + // Try finding an article link directly on the page + let articleUrl = await page.evaluate(() => { + const links = [...document.querySelectorAll('a[href*="/article/"]')]; + const articleLink = links.find(a => a.href.match(/\/article\/\d+$/)); + return articleLink?.href || ''; }); - }, args.limit || 50); - return { likedTweets, count: likedTweets.length, username: args.username }; + // Fallback: click the article-cover-image to navigate to the real article + // (handles quote tweets where the article belongs to the quoted author) + if (!articleUrl) { + const cover = await page.$('[data-testid="article-cover-image"]'); + if (cover) { + await cover.click(); + await new Promise(r => setTimeout(r, 5000)); + // Check if we navigated to an article or a tweet with an article + articleUrl = await page.evaluate(() => { + // Check for direct article links + const links = [...document.querySelectorAll('a[href*="/article/"]')]; + const articleLink = links.find(a => a.href.match(/\/article\/\d+$/)); + return articleLink?.href || ''; + }); + // If we landed on a tweet page with twitterArticleReadView, use current URL + if (!articleUrl) { + const hasReadView = await page.evaluate(() => !!document.querySelector('[data-testid="twitterArticleReadView"]')); + if (hasReadView) articleUrl = page.url(); + } + } + } + if (!articleUrl) return { content: [{ type: 'text', text: JSON.stringify({ error: 'No article found on this tweet' }) }] }; + url = articleUrl; + } + await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 }); + await new Promise(r => setTimeout(r, 4000)); + // Scroll through to load lazy content + for (let i = 0; i < 25; i++) { + await page.evaluate(() => window.scrollBy(0, 
800)); + await new Promise(r => setTimeout(r, 500)); + } + const article = await page.evaluate(() => { + const title = document.querySelector('[data-testid="twitter-article-title"]')?.textContent?.trim() || ''; + const readView = document.querySelector('[data-testid="twitterArticleReadView"]'); + if (!readView) return { error: 'Article content not found' }; + // Get author from User-Name + const userNameEl = document.querySelector('[data-testid="User-Name"]'); + const authorName = userNameEl?.querySelector('span')?.textContent?.trim() || ''; + const authorHandle = userNameEl?.querySelector('a[href^="/"]')?.getAttribute('href')?.replace('/', '') || ''; + // Get clean article text — innerText includes header/footer noise + const fullText = readView.innerText; + // Strip header: title, author, @handle, timestamp, engagement numbers + // The header pattern is: title\nauthor\n@handle\n·\ntimestamp\nengagement... + const lines = fullText.split('\n'); + let startIdx = 0; + // Skip past the header — find first line that's actual content (long paragraph) + for (let i = 0; i < Math.min(lines.length, 15); i++) { + if (lines[i].length > 100) { startIdx = i; break; } + } + // Strip footer: author name, @handle, "Following", bio at the end + let endIdx = lines.length; + for (let i = lines.length - 1; i > Math.max(0, lines.length - 10); i--) { + if (lines[i] === authorName || lines[i] === '@' + authorHandle || lines[i] === 'Following') { + endIdx = Math.min(endIdx, i); + } + } + const cleanText = lines.slice(startIdx, endIdx).join('\n').trim(); + // Filter images — exclude profile pics (small thumbnails) + const images = [...readView.querySelectorAll('img')] + .map(i => i.src) + .filter(s => s.includes('twimg') && !s.includes('_normal.') && !s.includes('_bigger.') && !s.includes('profile_images')); + return { title, author: authorName, handle: authorHandle, text: cleanText, images, url: location.href }; + }); + return { content: [{ type: 'text', text: JSON.stringify(article, 
null, 2) }] }; } // ── Follow Automation ── diff --git a/src/scrapers/index.js b/src/scrapers/index.js index ff6646f..3f5ce9d 100644 --- a/src/scrapers/index.js +++ b/src/scrapers/index.js @@ -79,6 +79,9 @@ export const { scrapeTweets, searchTweets, scrapeThread, + scrapePost, + scrapeLikedTweets, + discoverLikes, scrapeLikes, scrapeHashtag, scrapeMedia, @@ -308,6 +311,9 @@ export default { scrapeTweets, searchTweets, scrapeThread, + scrapePost, + scrapeLikedTweets, + discoverLikes, scrapeLikes, scrapeHashtag, scrapeMedia, diff --git a/src/scrapers/twitter/index.js b/src/scrapers/twitter/index.js index ded588d..bc0e22f 100644 --- a/src/scrapers/twitter/index.js +++ b/src/scrapers/twitter/index.js @@ -27,7 +27,27 @@ puppeteer.use(StealthPlugin()); // ============================================================================ const sleep = (ms) => new Promise((r) => setTimeout(r, ms)); -const randomDelay = (min = 1000, max = 3000) => sleep(min + Math.random() * (max - min)); + +/** Human-like delay using log-normal distribution with occasional distraction spikes. */ +const randomDelay = (min = 2000, max = 7000) => { + const u1 = Math.random(); + const u2 = Math.random(); + const z = Math.sqrt(-2 * Math.log(u1 || 1e-10)) * Math.cos(2 * Math.PI * u2); + const median = min + (max - min) * 0.4; + const spread = (max - min) * 0.25; + const base = median + z * spread; + const distraction = Math.random() < 0.08 ? 8000 + Math.random() * 12000 : 0; + const delay = Math.max(min, Math.min(base, max)) + distraction; + return sleep(delay); +}; + +/** Throw if the page redirected to login (expired/invalid cookie). */ +function checkAuth(page) { + const url = page.url(); + if (url.includes('/login') || url.includes('/i/flow/login')) { + throw new Error('Authentication failed — cookie may be expired.\n\nRun: xactions login'); + } +} /** * Create a browser instance with stealth settings. 
@@ -438,55 +458,646 @@ export async function searchTweets(page, query, options = {}) { // ============================================================================ // Thread Scraper // ============================================================================ +// TweetDetail GraphQL helpers (shared by scrapeThread and scrapePost) +// ============================================================================ /** - * Scrape a full tweet thread + * Fetch TweetDetail GraphQL API from the page context using session cookies. + * The page must already be on x.com (for cookies to be available). + * Includes a human-like delay before each call. */ -export async function scrapeThread(page, tweetUrl) { - await page.goto(tweetUrl, { waitUntil: 'networkidle2' }); - await randomDelay(); +async function fetchTweetDetail(page, tweetId, retries = 2) { + await randomDelay(2000, 5000); + + for (let attempt = 0; attempt <= retries; attempt++) { + const result = await page.evaluate(async (id) => { + const ct0 = document.cookie.match(/ct0=([^;]+)/)?.[1]; + if (!ct0) return { error: 'no_ct0' }; + const variables = JSON.stringify({ + focalTweetId: id, with_rux_injections: false, rankingMode: 'Relevance', + includePromotedContent: false, withCommunity: true, + withQuickPromoteEligibilityTweetFields: true, withBirdwatchNotes: true, withVoice: true, + }); + const features = JSON.stringify({ + rweb_video_screen_enabled: false, responsive_web_graphql_timeline_navigation_enabled: true, + responsive_web_graphql_skip_user_profile_image_extensions_enabled: false, + creator_subscriptions_tweet_preview_api_enabled: true, + longform_notetweets_consumption_enabled: true, + responsive_web_twitter_article_tweet_consumption_enabled: true, + responsive_web_edit_tweet_api_enabled: true, + graphql_is_translatable_rweb_tweet_is_translatable_enabled: true, + view_counts_everywhere_api_enabled: true, + freedom_of_speech_not_reach_fetch_enabled: true, + 
tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled: true, + longform_notetweets_rich_text_read_enabled: true, + }); + const url = `https://x.com/i/api/graphql/t66713qxyDI9pc4Jyb6wxQ/TweetDetail?variables=${encodeURIComponent(variables)}&features=${encodeURIComponent(features)}`; + try { + const resp = await fetch(url, { + headers: { + 'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA', + 'x-csrf-token': ct0, 'x-twitter-active-user': 'yes', 'x-twitter-auth-type': 'OAuth2Session', + }, + credentials: 'include', + }); + if (resp.status === 429) return { error: 'rate_limited', status: 429 }; + if (!resp.ok) return { error: 'http_error', status: resp.status }; + return await resp.json(); + } catch (e) { return { error: 'fetch_failed', message: e.message }; } + }, tweetId); + + // Success — has data, not an error object + if (result && !result.error) return result; + + // No ct0 — page might need a refresh to get fresh cookies + if (result?.error === 'no_ct0') { + await page.reload({ waitUntil: 'networkidle2', timeout: 30000 }); + await randomDelay(3000, 5000); + continue; + } - for (let i = 0; i < 5; i++) { - await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); - await randomDelay(1000, 2000); + // Rate limited — back off and retry + if (result?.error === 'rate_limited') { + await randomDelay(10000 + attempt * 15000, 20000 + attempt * 20000); + continue; + } + + // Other errors — retry with shorter backoff + if (attempt < retries) { + await randomDelay(5000, 10000); + } } - const thread = await page.evaluate(() => { - const articles = document.querySelectorAll('article[data-testid="tweet"]'); - const mainTweetId = window.location.pathname.match(/status\/(\d+)/)?.[1]; - - const mainArticle = Array.from(articles).find(a => - a.querySelector(`a[href*="/status/${mainTweetId}"]`) - ); - const mainAuthor = 
mainArticle?.querySelector('[data-testid="User-Name"] a')?.href?.split('/')[3]; + return null; +} - return Array.from(articles) - .map((article) => { - const textEl = article.querySelector('[data-testid="tweetText"]'); - const authorLink = article.querySelector('[data-testid="User-Name"] a[href^="/"]'); - const timeEl = article.querySelector('time'); - const linkEl = article.querySelector('a[href*="/status/"]'); - - const author = authorLink?.href?.split('/')[3]; - - return { - id: linkEl?.href?.match(/status\/(\d+)/)?.[1] || null, - text: textEl?.textContent || null, - author, - timestamp: timeEl?.getAttribute('datetime') || null, - url: linkEl?.href || null, - isMainAuthor: author === mainAuthor, - platform: 'twitter', - }; - }) - .filter(t => t.id && t.isMainAuthor); +/** Extract timeline entries from a TweetDetail GraphQL response. */ +function extractEntries(graphqlData) { + const instructions = graphqlData?.data?.threaded_conversation_with_injections_v2?.instructions || []; + const entries = []; + for (const inst of instructions) { + if (inst.entries) entries.push(...inst.entries); + } + return entries; +} + +/** Unwrap TweetWithVisibilityResults wrapper. */ +function unwrapResult(result) { + if (result?.__typename === 'TweetWithVisibilityResults') return result.tweet; + return result; +} + +/** Get screen_name from a user result (handles both new core and legacy paths). */ +function getScreenName(result) { + const user = result?.core?.user_results?.result; + return user?.core?.screen_name || user?.legacy?.screen_name || ''; +} + +/** + * Parse rich data from a single tweet GraphQL result. + * Does NOT recurse into quoted tweets — returns quotedTweetId for the caller to handle. 
+ */ +function parseTweetResult(result) { + result = unwrapResult(result); + if (!result?.legacy) return null; + + const legacy = result.legacy; + const author = getScreenName(result); + const text = result.note_tweet?.note_tweet_results?.result?.text || legacy.full_text || ''; + + // Media: images and videos + const media = (legacy.extended_entities?.media || []).map(m => { + const item = { type: m.type, url: m.media_url_https }; + if (m.type === 'video' || m.type === 'animated_gif') { + const best = m.video_info?.variants + ?.filter(v => v.content_type === 'video/mp4') + .sort((a, b) => (b.bitrate || 0) - (a.bitrate || 0))[0]; + if (best) item.videoUrl = best.url; + } + return item; }); - return thread; + // Article (X Articles — long-form posts) + let article = null; + if (result.article?.article_results?.result) { + const a = result.article.article_results.result; + article = { + id: a.rest_id || null, + title: a.title || null, + coverImage: a.cover_media?.media_info?.original_img_url || null, + url: `https://x.com/${author}/article/${result.rest_id}`, + }; + } + + // Card (link previews — external URLs) + let card = null; + if (result.card?.legacy?.binding_values) { + const vals = {}; + for (const v of result.card.legacy.binding_values) { + vals[v.key] = v.value?.string_value || v.value?.scribe_value?.value || v.value?.image_value?.url || ''; + } + if (vals.title || vals.card_url) { + card = { title: vals.title || '', description: vals.description || '', url: vals.card_url || '', image: vals.thumbnail_image_original || '' }; + } + } + + // URLs: external links in tweet text (from both legacy and note_tweet entities) + const rawUrls = [ + ...(legacy.entities?.urls || []), + ...(result.note_tweet?.note_tweet_results?.result?.entity_set?.urls || []), + ]; + const urls = rawUrls + .map(u => ({ url: u.expanded_url || u.url || '', display: u.display_url || '' })) + .filter(u => u.url && !u.url.includes('x.com/') && !u.url.includes('twitter.com/')); + + // Quoted 
tweet ID (for recursive fetching — not parsed from this response) + const quotedTweetId = result.quoted_status_result?.result?.rest_id || legacy.quoted_status_id_str || null; + + return { + id: result.rest_id, + author, + text, + timestamp: legacy.created_at ? new Date(legacy.created_at).toISOString() : null, + url: `https://x.com/${author}/status/${result.rest_id}`, + media, + article, + card, + urls: urls.length > 0 ? urls : undefined, + quotedTweetId, + inReplyTo: legacy.in_reply_to_status_id_str || null, + replies: legacy.reply_count || 0, + retweets: legacy.retweet_count || 0, + likes: legacy.favorite_count || 0, + views: result.views?.count || '0', + platform: 'twitter', + }; +} + +/** + * From a list of entries, collect all tweets by a given author and filter + * to the self-reply thread chain (root tweet + author replying to themselves). + */ +function parseThreadFromEntries(entries, mainAuthor, mainTweetId) { + const candidates = new Map(); + + for (const entry of entries) { + const result = unwrapResult(entry.content?.itemContent?.tweet_results?.result); + if (result && getScreenName(result).toLowerCase() === mainAuthor.toLowerCase()) { + const parsed = parseTweetResult(result); + if (parsed) candidates.set(parsed.id, parsed); + } + for (const item of (entry.content?.items || [])) { + const r = unwrapResult(item.item?.itemContent?.tweet_results?.result); + if (r && getScreenName(r).toLowerCase() === mainAuthor.toLowerCase()) { + const parsed = parseTweetResult(r); + if (parsed) candidates.set(parsed.id, parsed); + } + } + } + + const threadIds = new Set(candidates.keys()); + return Array.from(candidates.values()) + .filter(t => t.id === mainTweetId || (t.inReplyTo && threadIds.has(t.inReplyTo))) + .sort((a, b) => { + const ta = t => t.timestamp ? 
new Date(t.timestamp).getTime() : 0; + return ta(a) - ta(b); + }); +} + +// ============================================================================ +// Thread Scraper +// ============================================================================ + +/** + * Scrape a full tweet thread (author's self-reply chain). + */ +export async function scrapeThread(page, tweetUrl) { + const mainTweetId = new URL(tweetUrl).pathname.match(/status\/(\d+)/)?.[1] || null; + const mainAuthor = new URL(tweetUrl).pathname.split('/').filter(Boolean)[0] || null; + if (!mainTweetId || !mainAuthor) return []; + + await page.goto(tweetUrl, { waitUntil: 'networkidle2', timeout: 30000 }); + checkAuth(page); + await randomDelay(2000, 3000); + + const graphqlData = await fetchTweetDetail(page, mainTweetId); + if (!graphqlData) return []; + + const entries = extractEntries(graphqlData); + const thread = parseThreadFromEntries(entries, mainAuthor, mainTweetId); + + // Strip internal fields for backward compatibility + return thread.map(({ inReplyTo, quotedTweetId, media, article, card, ...rest }) => rest); +} + +// ============================================================================ +// Post Scraper (rich data + recursive quoted tweets) +// ============================================================================ + +/** + * Scrape a single post or thread with full rich data. + * + * Returns the thread (1 tweet if single post, N if thread) with rich data + * per tweet: text, media, article, card, engagement, and recursively + * resolved quoted posts (which may themselves be threads). 
+ * + * @param {import('puppeteer').Page} page + * @param {string} tweetUrl + * @param {number} [maxDepth=5] - Max recursion depth for nested quote tweets + */ +export async function scrapePost(page, tweetUrl, maxDepth = 5) { + const mainTweetId = new URL(tweetUrl).pathname.match(/status\/(\d+)/)?.[1] || null; + const mainAuthor = new URL(tweetUrl).pathname.split('/').filter(Boolean)[0] || null; + if (!mainTweetId || !mainAuthor) throw new Error('Invalid tweet URL'); + + // Ensure we're on x.com for cookie access + if (!page.url().includes('x.com')) { + await page.goto('https://x.com', { waitUntil: 'networkidle2', timeout: 30000 }); + checkAuth(page); + await randomDelay(2000, 3000); + } + + return _scrapePostRecursive(page, mainTweetId, mainAuthor, maxDepth, 0); +} + +async function _scrapePostRecursive(page, tweetId, author, maxDepth, depth) { + const graphqlData = await fetchTweetDetail(page, tweetId); + if (!graphqlData) return { thread: [], error: 'fetchTweetDetail returned null after retries' }; + + // Check if the API returned an error object (shouldn't happen after retries, but just in case) + if (graphqlData.error) return { thread: [], error: graphqlData.error }; + + const entries = extractEntries(graphqlData); + const thread = parseThreadFromEntries(entries, author, tweetId); + if (thread.length === 0) return { thread: [], error: `no tweets found for @${author} in TweetDetail response (${entries.length} entries)` }; + + // For each thread tweet, resolve its quoted post recursively + for (const tweet of thread) { + if (tweet.quotedTweetId && depth < maxDepth) { + // Fetch the quoted tweet as a focal tweet to get its full data (thread + its own QTs) + const qtData = await fetchTweetDetail(page, tweet.quotedTweetId); + if (qtData) { + const qtEntries = extractEntries(qtData); + // Find the focal tweet result to get its author + const focalEntry = qtEntries.find(e => + e.entryId?.includes(tweet.quotedTweetId)); + const focalResult = unwrapResult( + 
focalEntry?.content?.itemContent?.tweet_results?.result); + const qtAuthor = focalResult ? getScreenName(focalResult) : ''; + + if (qtAuthor) { + tweet.quotedPost = await _scrapePostRecursive( + page, tweet.quotedTweetId, qtAuthor, maxDepth, depth + 1); + } + } + } + // Clean up internal field + delete tweet.quotedTweetId; + delete tweet.inReplyTo; + } + + return { thread }; +} + +// ============================================================================ +// Liked Tweets Scraper (a user's liked tweets page) +// ============================================================================ + +/** + * Scrape a user's liked tweets via the Likes GraphQL API. + * + * Uses cursor-based pagination — no DOM scraping or scroll limits. + * Writes results incrementally to a JSONL file so progress survives + * crashes and memory stays bounded for large pulls. + * + * Returns { file, count, username, dateRange } — the caller reads the + * file for the full data. + * + * @param {import('puppeteer').Page} page + * @param {string} username + * @param {object} options + * @param {number} [options.limit=50] - Max tweets to return + * @param {string} [options.from] - Only include likes from this date onward (stops when older) + * @param {string} [options.to] - Only include likes up to this date (skips newer) + */ +export async function scrapeLikedTweets(page, username, options = {}) { + const { limit = 50, from, to } = options; + + if (!username) throw new Error('Username is required for scrapeLikedTweets'); + + const fromDate = from ? new Date(from) : null; + const toDate = to ? 
new Date(to) : null; + if (fromDate && isNaN(fromDate.getTime())) throw new Error(`Invalid "from" date: ${from}`); + if (toDate && isNaN(toDate.getTime())) throw new Error(`Invalid "to" date: ${to}`); + + // Ensure we're on x.com for cookie access + if (!page.url().includes('x.com')) { + await page.goto('https://x.com', { waitUntil: 'networkidle2', timeout: 30000 }); + checkAuth(page); + await randomDelay(2000, 3000); + } + + // Resolve numeric userId from username + const userId = await page.evaluate(async (screenName) => { + const ct0 = document.cookie.match(/ct0=([^;]+)/)?.[1]; + if (!ct0) return null; + const variables = JSON.stringify({ screen_name: screenName, withSafetyModeUserFields: true }); + const features = JSON.stringify({ hidden_profile_subscriptions_enabled: true, responsive_web_graphql_skip_user_profile_image_extensions_enabled: false, responsive_web_graphql_timeline_navigation_enabled: true }); + const url = `https://x.com/i/api/graphql/IGgvgiOx4QZndDHuD3x9TQ/UserByScreenName?variables=${encodeURIComponent(variables)}&features=${encodeURIComponent(features)}`; + try { + const resp = await fetch(url, { + headers: { 'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA', 'x-csrf-token': ct0, 'x-twitter-active-user': 'yes', 'x-twitter-auth-type': 'OAuth2Session' }, + credentials: 'include', + }); + const data = await resp.json(); + return data?.data?.user?.result?.rest_id || null; + } catch { return null; } + }, username); + + if (!userId) throw new Error(`Could not resolve userId for @${username}`); + + // Set up JSONL output file + const exportDir = `${process.env.HOME || '/tmp'}/.xactions/exports`; + await fs.mkdir(exportDir, { recursive: true }); + const ts = new Date().toISOString().replace(/[:.]/g, '-'); + const filePath = `${exportDir}/likes-${username}-${ts}.jsonl`; + + let count = 0; + let cursor = null; + let firstTimestamp = null; + let lastTimestamp = null; + let 
emptyPages = 0; + let passedFromDate = false; + + while (count < limit && emptyPages < 3 && !passedFromDate) { + await randomDelay(2000, 5000); + + const pageData = await page.evaluate(async ({ userId, cursor, pageSize }) => { + const ct0 = document.cookie.match(/ct0=([^;]+)/)?.[1]; + if (!ct0) return null; + const variables = { userId, count: pageSize, includePromotedContent: false, withClientEventToken: false, withBirdwatchNotes: false, withVoice: true }; + if (cursor) variables.cursor = cursor; + const features = { + rweb_video_screen_enabled: false, responsive_web_graphql_timeline_navigation_enabled: true, + responsive_web_graphql_skip_user_profile_image_extensions_enabled: false, + creator_subscriptions_tweet_preview_api_enabled: true, + longform_notetweets_consumption_enabled: true, + responsive_web_twitter_article_tweet_consumption_enabled: true, + responsive_web_edit_tweet_api_enabled: true, + graphql_is_translatable_rweb_tweet_is_translatable_enabled: true, + view_counts_everywhere_api_enabled: true, + freedom_of_speech_not_reach_fetch_enabled: true, + tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled: true, + longform_notetweets_rich_text_read_enabled: true, + }; + const url = `https://x.com/i/api/graphql/KPuet6dGbC8LB2sOLx7tZQ/Likes?variables=${encodeURIComponent(JSON.stringify(variables))}&features=${encodeURIComponent(JSON.stringify(features))}`; + try { + const resp = await fetch(url, { + headers: { 'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA', 'x-csrf-token': ct0, 'x-twitter-active-user': 'yes', 'x-twitter-auth-type': 'OAuth2Session' }, + credentials: 'include', + }); + return await resp.json(); + } catch { return null; } + }, { userId, cursor, pageSize: Math.min(20, limit - count) }); + + if (!pageData) { emptyPages++; continue; } + + // Extract entries from timeline (path: data.user.result.timeline.timeline.instructions) + const instructions 
= pageData?.data?.user?.result?.timeline?.timeline?.instructions || []; + const entries = []; + for (const inst of instructions) { + if (inst.entries) entries.push(...inst.entries); + } + + // Find cursor for next page + const cursorEntry = entries.find(e => e.entryId?.startsWith('cursor-bottom')); + cursor = cursorEntry?.content?.value || null; + + // Parse tweets + const batch = []; + for (const entry of entries) { + const result = entry.content?.itemContent?.tweet_results?.result; + if (!result) continue; + const parsed = parseTweetResult(result); + if (!parsed) continue; + + const tweetDate = parsed.timestamp ? new Date(parsed.timestamp) : null; + if (tweetDate) { + if (fromDate && tweetDate < fromDate) { passedFromDate = true; break; } + if (toDate && tweetDate > toDate) continue; + } + + // Clean internal fields + delete parsed.quotedTweetId; + delete parsed.inReplyTo; + + if (!firstTimestamp && parsed.timestamp) firstTimestamp = parsed.timestamp; + if (parsed.timestamp) lastTimestamp = parsed.timestamp; + batch.push(parsed); + count++; + if (count >= limit) break; + } + + if (batch.length > 0) { + const lines = batch.map(t => JSON.stringify(t)).join('\n') + '\n'; + await fs.appendFile(filePath, lines); + emptyPages = 0; + } else { + emptyPages++; + } + + if (!cursor) break; + } + + return { file: filePath, count, username, dateRange: { from: firstTimestamp, to: lastTimestamp } }; +} + +// ============================================================================ +// Discover Likes (interleaved fetch + deep read with human-like pacing) +// ============================================================================ + +/** + * Fetch liked tweets and deep-read each one, interleaved with human-like + * timing. 
Produces two JSONL files:
 * - likes index (summary per tweet from the Likes API)
 * - deep reads (full scrapePost output per tweet)
 *
 * The pacing mimics a human browsing their likes: scroll through a page,
 * pause, tap into a post, read it, go back, scroll more.
 *
 * @param {import('puppeteer').Page} page - Authenticated x.com page (shares cookies with the session)
 * @param {string} username - Screen name whose Likes timeline is read
 * @param {object} options
 * @param {number} [options.limit=50] - Max tweets
 * @param {string} [options.from] - Only likes from this date onward
 * @param {string} [options.to] - Only likes up to this date
 * @returns {Promise<{likesFile: string, deepReadsFile: string, likesCount: number, deepReadsCount: number, username: string, dateRange: {from: ?string, to: ?string}}>}
 *   Paths of the two JSONL files written, counts, and the timestamp range of the tweets seen.
 * @throws {Error} If username is missing, a date option does not parse, or the userId cannot be resolved.
 */
export async function discoverLikes(page, username, options = {}) {
  const { limit = 50, from, to } = options;

  if (!username) throw new Error('Username is required');

  // Validate the optional date window up front so a typo fails fast,
  // before any network traffic or file creation.
  const fromDate = from ? new Date(from) : null;
  const toDate = to ? new Date(to) : null;
  if (fromDate && isNaN(fromDate.getTime())) throw new Error(`Invalid "from" date: ${from}`);
  if (toDate && isNaN(toDate.getTime())) throw new Error(`Invalid "to" date: ${to}`);

  // Ensure we're on x.com — the in-page fetch() calls below rely on the
  // page's ct0 cookie and same-origin credentials.
  if (!page.url().includes('x.com')) {
    await page.goto('https://x.com', { waitUntil: 'networkidle2', timeout: 30000 });
    checkAuth(page);
    await randomDelay(2000, 3000);
  }

  // Resolve userId (rest_id) from the screen name via the UserByScreenName
  // GraphQL endpoint, executed inside the page so cookies/CSRF apply.
  // NOTE(review): the Bearer value is the static public web-client token and
  // the query ID (IGgvgiOx4QZndDHuD3x9TQ) is version-pinned — verify both are
  // still current if this starts returning nulls.
  const userId = await page.evaluate(async (screenName) => {
    const ct0 = document.cookie.match(/ct0=([^;]+)/)?.[1];
    if (!ct0) return null; // not logged in — no CSRF token available
    const variables = JSON.stringify({ screen_name: screenName, withSafetyModeUserFields: true });
    const features = JSON.stringify({ hidden_profile_subscriptions_enabled: true, responsive_web_graphql_skip_user_profile_image_extensions_enabled: false, responsive_web_graphql_timeline_navigation_enabled: true });
    const url = `https://x.com/i/api/graphql/IGgvgiOx4QZndDHuD3x9TQ/UserByScreenName?variables=${encodeURIComponent(variables)}&features=${encodeURIComponent(features)}`;
    try {
      const resp = await fetch(url, {
        headers: { 'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA', 'x-csrf-token': ct0, 'x-twitter-active-user': 'yes', 'x-twitter-auth-type': 'OAuth2Session' },
        credentials: 'include',
      });
      const data = await resp.json();
      return data?.data?.user?.result?.rest_id || null;
    } catch { return null; }
  }, username);

  if (!userId) throw new Error(`Could not resolve userId for @${username}`);

  // Set up output files — two parallel JSONL streams under ~/.xactions/exports,
  // named with a filesystem-safe ISO timestamp (':' and '.' replaced).
  const exportDir = `${process.env.HOME || '/tmp'}/.xactions/exports`;
  await fs.mkdir(exportDir, { recursive: true });
  const ts = new Date().toISOString().replace(/[:.]/g, '-');
  const likesFile = `${exportDir}/likes-${username}-${ts}.jsonl`;
  const deepFile = `${exportDir}/likes-${username}-${ts}-deep.jsonl`;

  let count = 0;            // tweets written to the likes index
  let deepCount = 0;        // tweets successfully deep-read
  let cursor = null;        // pagination cursor from the previous page
  let firstTimestamp = null; // timestamp of the first tweet kept (newest seen)
  let lastTimestamp = null;  // timestamp of the last tweet kept
  let emptyPages = 0;       // consecutive pages yielding no kept tweets; 3 aborts
  let passedFromDate = false; // set once a tweet older than `from` is seen

  while (count < limit && emptyPages < 3 && !passedFromDate) {
    // Pause between pages — like scrolling through the feed
    await randomDelay(3000, 8000);

    // Fetch a page of likes via the Likes GraphQL timeline endpoint,
    // again executed in-page for cookie/CSRF access. Page size is capped
    // at 20 and shrunk near the limit so we never over-fetch.
    const pageData = await page.evaluate(async ({ userId, cursor, pageSize }) => {
      const ct0 = document.cookie.match(/ct0=([^;]+)/)?.[1];
      if (!ct0) return null;
      const variables = { userId, count: pageSize, includePromotedContent: false, withClientEventToken: false, withBirdwatchNotes: false, withVoice: true };
      if (cursor) variables.cursor = cursor;
      // Feature flags required by this query version; mirrors the web client.
      const features = {
        rweb_video_screen_enabled: false, responsive_web_graphql_timeline_navigation_enabled: true,
        responsive_web_graphql_skip_user_profile_image_extensions_enabled: false,
        creator_subscriptions_tweet_preview_api_enabled: true,
        longform_notetweets_consumption_enabled: true,
        responsive_web_twitter_article_tweet_consumption_enabled: true,
        responsive_web_edit_tweet_api_enabled: true,
        graphql_is_translatable_rweb_tweet_is_translatable_enabled: true,
        view_counts_everywhere_api_enabled: true,
        freedom_of_speech_not_reach_fetch_enabled: true,
        tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled: true,
        longform_notetweets_rich_text_read_enabled: true,
      };
      const url = `https://x.com/i/api/graphql/KPuet6dGbC8LB2sOLx7tZQ/Likes?variables=${encodeURIComponent(JSON.stringify(variables))}&features=${encodeURIComponent(JSON.stringify(features))}`;
      try {
        const resp = await fetch(url, {
          headers: { 'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA', 'x-csrf-token': ct0, 'x-twitter-active-user': 'yes', 'x-twitter-auth-type': 'OAuth2Session' },
          credentials: 'include',
        });
        return await resp.json();
      } catch { return null; }
    }, { userId, cursor, pageSize: Math.min(20, limit - count) });

    // Failed fetch counts as an empty page; the same cursor is retried
    // (up to the emptyPages < 3 cap) rather than aborting immediately.
    if (!pageData) { emptyPages++; continue; }

    // Flatten timeline instructions into a single entries list
    // (path: data.user.result.timeline.timeline.instructions).
    const instructions = pageData?.data?.user?.result?.timeline?.timeline?.instructions || [];
    const entries = [];
    for (const inst of instructions) {
      if (inst.entries) entries.push(...inst.entries);
    }

    // Grab next-page cursor before filtering; null means last page.
    const cursorEntry = entries.find(e => e.entryId?.startsWith('cursor-bottom'));
    cursor = cursorEntry?.content?.value || null;

    // Process each tweet in the page
    const batch = [];
    for (const entry of entries) {
      const result = entry.content?.itemContent?.tweet_results?.result;
      if (!result) continue; // cursor/module entries carry no tweet
      const parsed = parseTweetResult(result);
      if (!parsed) continue;

      // Date-window filter on the tweet's own creation timestamp.
      // NOTE(review): the `from` early stop assumes the Likes timeline is
      // ordered by tweet timestamp; it is actually ordered by like time, so
      // a recently-liked old tweet could end pagination early — confirm
      // whether that trade-off is intended.
      const tweetDate = parsed.timestamp ? new Date(parsed.timestamp) : null;
      if (tweetDate) {
        if (fromDate && tweetDate < fromDate) { passedFromDate = true; break; }
        if (toDate && tweetDate > toDate) continue;
      }

      // Strip internal linkage fields before serializing the index record.
      delete parsed.quotedTweetId;
      delete parsed.inReplyTo;

      if (!firstTimestamp && parsed.timestamp) firstTimestamp = parsed.timestamp;
      if (parsed.timestamp) lastTimestamp = parsed.timestamp;
      batch.push(parsed);
      count++;
      if (count >= limit) break;
    }

    // Write likes batch — appended incrementally so a crash mid-run still
    // leaves all pages processed so far on disk.
    if (batch.length > 0) {
      const lines = batch.map(t => JSON.stringify(t)).join('\n') + '\n';
      await fs.appendFile(likesFile, lines);
      emptyPages = 0;
    } else {
      emptyPages++;
    }

    // Deep read each tweet — interleaved with human-like pauses
    for (const tweet of batch) {
      // Pause before tapping in — decision time
      await randomDelay(2000, 5000);

      try {
        const tweetUrl = tweet.url;
        if (!tweetUrl) continue;
        // Derive author handle and status ID from the tweet's URL path.
        const author = new URL(tweetUrl).pathname.split('/').filter(Boolean)[0];
        const tweetId = new URL(tweetUrl).pathname.match(/status\/(\d+)/)?.[1];
        if (!tweetId || !author) continue;

        // Recursive deep read (max depth 5). Only persisted when it yields
        // at least one thread item.
        const deepData = await _scrapePostRecursive(page, tweetId, author, 5, 0);
        if (deepData.thread.length > 0) {
          await fs.appendFile(deepFile, JSON.stringify(deepData) + '\n');
          deepCount++;
        }
      } catch {} // deliberate best-effort: one failed deep read never aborts the run

      // Pause after reading — absorbing the content
      await randomDelay(5000, 15000);
    }

    if (!cursor) break; // no bottom cursor — reached the end of the timeline
  }

  return {
    likesFile,
    deepReadsFile: deepFile,
    likesCount: count,
    deepReadsCount: deepCount,
    username,
    dateRange: { from: firstTimestamp, to: lastTimestamp },
  };
}

// ============================================================================
// Likes Scraper (who liked a specific tweet)
// ============================================================================

/**

export default {
  scrapeTweets,
  searchTweets,
  scrapeThread,
  scrapePost,
  scrapeLikedTweets,
  discoverLikes,
  scrapeLikes,
  scrapeHashtag,
  scrapeMedia,