-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparser.js
More file actions
127 lines (111 loc) · 3.61 KB
/
parser.js
File metadata and controls
127 lines (111 loc) · 3.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
/**
 * Parser for standard web articles (not social media posts).
 *
 * Queries the document for elements commonly used to hold article text and
 * collects the trimmed innerText of each one that is longer than 50
 * characters, has not been collected already, and has a non-zero rendered
 * height and width.
 *
 * NOTE(review): the selector list matches both containers (article, section,
 * main, ...) and their typical children (p, li, ...), while de-duplication is
 * by exact text only, so a container's block can overlap with the blocks of
 * the elements nested inside it.
 *
 * @returns {string[]} Unique text blocks in document order.
 */
function parseArticleText() {
  console.log("Running article parser...");

  const selectorList = [
    'article',
    'section',
    'main',
    'div[id*="content"]',
    'div[class*="content"]',
    'div[class*="article"]',
    'p',
    'h1, h2, h3, h4',
    'ul',
    'ol',
    'li'
  ].join(',');

  // A Set keeps insertion order, so it serves as both the duplicate filter
  // and the ordered collection of accepted blocks.
  const uniqueBlocks = new Set();

  for (const node of document.querySelectorAll(selectorList)) {
    const trimmed = node.innerText.trim();

    // Too short to be meaningful article text.
    if (trimmed.length <= 50) {
      continue;
    }
    // Exact same text was already collected.
    if (uniqueBlocks.has(trimmed)) {
      continue;
    }
    // Element has no rendered box.
    if (!(node.offsetHeight > 0)) {
      continue;
    }
    if (!(node.offsetWidth > 0)) {
      continue;
    }

    uniqueBlocks.add(trimmed);
  }

  console.log(`Article parser found ${uniqueBlocks.size} text blocks`);
  return [...uniqueBlocks];
}
/**
 * Social media post parser.
 *
 * Runs the Facebook, Twitter/X and Reddit post selectors against the current
 * document (in that order) and collects the trimmed innerText of every match.
 * Elements whose text is empty after trimming are skipped, and a text that
 * has already been collected during this call is not added again, so the
 * result holds only non-empty, unique post texts.
 *
 * @returns {string[]} Post texts in selector order, then document order.
 */
function parseSocialMediaText() {
  console.log("Running social media parser...");

  const postSelectors = [
    // Facebook posts
    'div[data-ad-preview="message"]',
    // Twitter/X posts
    'article div[lang]',
    // Reddit posts
    'h3, .md'
  ];

  // Per-call duplicate filter, mirroring the other parsers in this file.
  const seen = new Set();
  const allText = [];

  postSelectors.forEach(selector => {
    document.querySelectorAll(selector).forEach(post => {
      const text = post.innerText.trim();
      // Empty matches carry nothing to analyze and would inflate the count.
      if (text.length === 0 || seen.has(text)) {
        return;
      }
      seen.add(text);
      allText.push(text);
    });
  });

  console.log(`Social media parser found ${allText.length} posts`);
  return allText;
}
/**
 * Twitter parser.
 *
 * `seenTexts` lives at module level, so it persists between calls: a tweet
 * text returned by one call is not returned again by a later call.
 */
const seenTexts = new Set();

/**
 * Collects tweet text from every <article> element in the document. The text
 * of a tweet is the trimmed innerText of each `div[lang]` inside the article,
 * joined with newlines. Articles that produce an empty string, or a string
 * already recorded in `seenTexts`, are skipped.
 *
 * @returns {string[]} Tweet texts not returned by any earlier call.
 */
function parseTwitterPosts() {
  console.log("Running Twitter parser...");

  const freshTweets = [];

  for (const tweetContainer of document.querySelectorAll('article')) {
    // A tweet's text may be split across several elements carrying a lang
    // attribute; gather them in document order.
    const pieces = [];
    for (const part of tweetContainer.querySelectorAll('div[lang]')) {
      pieces.push(part.innerText.trim());
    }
    const tweetText = pieces.join('\n');

    if (tweetText.length === 0 || seenTexts.has(tweetText)) {
      continue;
    }

    seenTexts.add(tweetText);
    freshTweets.push(tweetText);
  }

  console.log(`Twitter parser found ${freshTweets.length} tweets`);
  return freshTweets;
}
/**
 * Determines which parser to use based on the current site.
 *
 * The hostname is compared against each known domain as a whole: it must
 * either equal the domain or end with "." + domain (a subdomain). A plain
 * substring test is not used because unrelated hosts such as "netflix.com"
 * or "dropbox.com" contain the text "x.com".
 *
 * @returns {Function} parseTwitterPosts for twitter.com / x.com,
 *   parseSocialMediaText for reddit.com / facebook.com, and
 *   parseArticleText for every other host.
 */
function determineParser() {
  const hostname = window.location.hostname.toLowerCase();
  console.log("Determining parser for:", hostname);

  // True when the current host is one of the given domains or a subdomain.
  const isOnSite = (...domains) =>
    domains.some(
      domain => hostname === domain || hostname.endsWith(`.${domain}`)
    );

  if (isOnSite('twitter.com', 'x.com')) {
    console.log("Using Twitter parser");
    return parseTwitterPosts;
  } else if (isOnSite('reddit.com', 'facebook.com')) {
    console.log("Using social media parser");
    return parseSocialMediaText;
  } else {
    console.log("Using article parser");
    return parseArticleText;
  }
}
/**
 * Attaches the parser chosen by determineParser() to window.parseVisibleText.
 * If choosing or attaching it throws, the error is logged and
 * parseArticleText is attached instead.
 */
function initializeParser() {
  try {
    const selectedParser = determineParser();
    window.parseVisibleText = selectedParser;
    console.log("Parser initialized successfully");
  } catch (err) {
    console.error("Error initializing parser:", err);
    // Something went wrong while selecting a parser; use the article parser.
    window.parseVisibleText = parseArticleText;
  }
}
// Initialize the parser as soon as this script is evaluated, so that
// window.parseVisibleText is assigned without any further call.
initializeParser();