Skip to content

Commit 48eb1a1

Browse files
authored
Feat(extractor): Supporting Claude.ai (#210)
* chore: allow claude.ai domain * feat: claude.ai raw export * feat: better content extraction * feat: better content targeting * feat: adding Turndown formatting rules * feat: select chat title * feat: improving code * feat: extract ClaudeChat artifacts content note: needs to post-process as Turndown does not support async click() * fix: artifacts correctly formatted * fix: update rules and timing * fix: make sure artifact content is extracted * fix: optimizing * feat: artifact name and separation in export * fix: preserveLineBreaksInPre * refactor&fix: separating rules for PRE vs CODE without PRE * fix: artifact title formatting * fix: avoid selecting code copy button as it causes claude clipboard focus error breaking execution * fix: preserveLineBreaksInCode target only CODE tags * fix: keep newlines in artifacts code blocks * fix: [wrong spacing3] * fix: refactor + spacing attempt * docs: cleaning note: Turndown is automatically deleting spaces in spans, need to find a workaround * chore: attempt, not working * feat&fix: encapsulate div>code inside a PRE node to export artifact code correctly * chore: cleaning code * fix: adding artifact name as an id to put artifact content at the right place. In case of bug, artifacts are not wrongly mixed into the export * fix: only target artifacts preview buttons for replacement * fix: preserveLineBreaksInCode for user Questions auto code blocks without PRE
1 parent 3583424 commit 48eb1a1

11 files changed

+189
-15
lines changed

src/manifest.json

+4-2
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@
2121
"*://*.phind.com/*",
2222
"*://*.perplexity.ai/*",
2323
"*://google.com/search*",
24-
"*://chatgpt.com/*"
24+
"*://chatgpt.com/*",
25+
"*://*.claude.ai/*"
2526
],
2627
"action": {
2728
"default_icon": "files/icons/icon_disabled-500.png"
@@ -35,7 +36,8 @@
3536
"*://*.phind.com/*",
3637
"*://*.perplexity.ai/*",
3738
"*://*.google.com/search*",
38-
"*://chatgpt.com/*"
39+
"*://chatgpt.com/*",
40+
"*://*.claude.ai/*"
3941
],
4042
"js": ["tab.js"]
4143
}],

src/services/checker/allowedDomains.json

+4-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
"Phind": "www.phind.com",
44
"Perplexity": "www.perplexity.ai",
55
"MaxAIGoogle": "www.google.com",
6-
"ChatGPT": "chatgpt.com"
6+
"ChatGPT": "chatgpt.com",
7+
"Claude": "claude.ai"
78
},
89
"EXPORT_DOMAINS": {
910
"PhindSearch": "www.phind.com/search",
@@ -13,6 +14,7 @@
1314
"MaxAIGoogle": "www.google.com/search",
1415
"ChatGPT": "chatgpt.com/c",
1516
"ChatGPTShare": "chatgpt.com/share",
16-
"ChatGPTBots": "chatgpt.com/g"
17+
"ChatGPTBots": "chatgpt.com/g",
18+
"ClaudeChat": "claude.ai/chat"
1719
}
1820
}

src/services/checker/domainChecker.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
* @param hostAndPath url of the page
55
* @returns {null|{name: string, url: *}}
66
*/
7-
export function domainChecker(allowedDomains: { [x: string]: any; Phind?: string; Perplexity?: string; MaxAIGoogle?: string; ChatGPT?: string; }, hostAndPath: string): null | { name: string; url: any; } {
7+
export function domainChecker(allowedDomains: { [x: string]: any; }, hostAndPath: string): null | { name: string; url: any; } {
88
for (let domainName in allowedDomains) {
99
const url = allowedDomains[domainName];
1010
if (hostAndPath?.startsWith(url)) {

src/services/export/extractor/defineAction.ts

+43-1
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,51 @@
11
import {clickElements} from "../../interact/cs/interact";
2+
import {sleep} from "../../../utils/jsShorteners";
23

3-
export function defineAction(action: { type: any; selector: string | undefined; }) {
4+
5+
//TODO: currently for Claude artifacts, needs to be genericized
6+
/**
 * Opens every Claude artifact referenced in the conversation, captures its
 * content and substitutes it for the matching placeholder in `markdown`.
 *
 * For each "Preview contents" button found inside a Claude message, this:
 *  1. clicks the button, then waits;
 *  2. clicks the 'Code' toggle when it exists and is off, then waits;
 *  3. polls the side pane (a bounded number of times) for the artifact content;
 *  4. wraps code artifacts in a `<pre>` before formatting them;
 *  5. replaces `{{@CAPTURE_ARTIFACT_CONTENT:<name>}}` in `markdown` with the
 *     formatted artifact, framed by `---` separators and a bold title.
 * Finally the pane's close button is clicked.
 *
 * @param markdown Markdown produced by the extraction, containing the placeholders.
 * @param format Converts an HTML string to markdown.
 * @returns The markdown with placeholders replaced, or `undefined` when
 *          `markdown` was `undefined`.
 */
async function clickActClose(markdown: string | undefined, format: ((html: string) => string) | undefined) {
    // selector duplication with processMessage in ClaudeChat.js and contentSelector in ClaudeChat.json
    const pane = document.querySelector("div.fixed.flex");
    const previewButtons = document.querySelectorAll<HTMLElement>('.font-claude-message button[aria-label="Preview contents"]');
    const maxPolls = 10;
    const stepDelayMs = 100;

    for (const artifactBtn of previewButtons) {
        const artifactName = artifactBtn.querySelector(".break-words")?.textContent;
        artifactBtn.click();
        await sleep(stepDelayMs);

        // Click on 'Code' button if it exists and is off. click() returns undefined,
        // so the wait has to be a separate statement rather than chained with `&&`.
        const codeToggle = document.querySelector<HTMLElement>("[data-testid=\"undefined-code\"][data-state=\"off\"]");
        if (codeToggle) {
            codeToggle.click();
            await sleep(stepDelayMs);
        }

        // Bounded polling: the counter must advance, otherwise a missing pane or
        // artifact would keep this loop (and the whole export) spinning forever.
        let artifactContent: Element | null | undefined;
        for (let attempt = 0; !artifactContent && attempt < maxPolls; attempt++) {
            artifactContent = pane?.querySelector(".code-block__code, .font-claude-message");
            await sleep(stepDelayMs);
        }

        // Code artifacts are encapsulated in a PRE node so that the PRE rule formats them.
        let artifactHtml = artifactContent?.outerHTML ?? "";
        if (pane?.querySelector(".code-block__code")) {
            const preWrapper = document.createElement("pre");
            preWrapper.innerHTML = artifactHtml;
            artifactHtml = preWrapper.outerHTML;
        }

        const artifactBlock = `---\n**${artifactName ?? "Artifact"}:**\n` + (format?.(artifactHtml) ?? "") + "\n---";
        // A replacer function is used so that `$&`, `$1`, `$$`… occurring in the
        // artifact's code are inserted literally instead of being interpreted
        // as replacement patterns.
        markdown = markdown?.replace(`{{@CAPTURE_ARTIFACT_CONTENT:${artifactName}}}`, () => artifactBlock);
    }

    pane?.querySelector<HTMLElement>(".justify-end > button")?.click(); // close artifact

    return markdown;
}
39+
40+
export async function defineAction(action: {
41+
type: any;
42+
selector?: string
43+
}, markdown?: string, format?: (html: string) => string): Promise<string | void | null> {
444
switch (action.type) {
545
case "click":
646
return clickElements(action.selector);
47+
case "click_act_close":
48+
return await clickActClose(markdown, format);
749
// case "scroll":
850
// return scrollElements;
951
// case "type":
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
import {capitalizeFirst} from "../../../format/formatText";
2+
3+
/**
 * Converts one Claude conversation message into a markdown section.
 *
 * The section title is "## Claude" when the message container holds a
 * `[data-is-streaming]` element, and "## User" otherwise. The body is the
 * formatted inner HTML of the user message element or of the streaming
 * element's direct `div` child; an empty string is formatted when neither is found.
 *
 * @param content Container element of a single message.
 * @param format Converts an HTML string to markdown.
 * @param metadata Extraction metadata (not used by this extractor).
 * @returns The markdown for the message: title line, formatted body, trailing newline.
 */
export async function processMessage(
    content: ParentNode | null | undefined,
    format: (html: string) => string,
    metadata?: unknown,
): Promise<string> {
    // Both lookups are guarded the same way, so a missing container yields an
    // empty "User" section instead of a TypeError.
    const messageElement = content?.querySelector("[data-testid=\"user-message\"], [data-is-streaming] > div");
    const isClaudeMessage = Boolean(content?.querySelector("[data-is-streaming]"));

    const entityName = isClaudeMessage ? "Claude" : "User";
    const msgTitle = "## " + capitalizeFirst(entityName) + "\n";

    return msgTitle + format(messageElement?.innerHTML ?? "") + "\n";
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
{
2+
"domainName": "Claude Chat",
3+
"pageTitle": {
4+
"selector": "[data-testid=\"chat-menu-trigger\"]"
5+
},
6+
"contentSelector": "[data-test-render-count]",
7+
"turndown": {
8+
"rules": {
9+
"preserveLineBreaksInPre": {
10+
"filter": "filter_PreserveLineBreaksInPre_Claude",
11+
"replacement": "replacement_preserveLineBreaksInPre_Claude"
12+
},
13+
"preserveLineBreaksInCode_userQuestion": {
14+
"filter": "filter_PreserveLineBreaksInCode_Claude",
15+
"replacement": "replacement_preserveLineBreaksInCode_Claude"
16+
},
17+
"formatTables": {
18+
"filter": "filter_formatTables",
19+
"replacement": "replacement_formatTables"
20+
},
21+
"formatKatex": {
22+
"filter": "filter_formatKatex",
23+
"replacement": "replacement_formatKatex"
24+
},
25+
"captureArtifactContent": {
26+
"filter": "filter_captureArtifactContent_Claude",
27+
"replacement": "replacement_captureArtifactContent_Claude"
28+
}
29+
}
30+
},
31+
"actions": {
32+
"afterExtraction": {
33+
"type": "click_act_close"
34+
}
35+
}
36+
}

src/services/export/extractor/extractPage.ts

+4
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ export async function extractPage(domain: { name: any; url?: any; }) {
3131
module = require("./domains/ChatGPT");
3232
json = require("./domains/ChatGPT.json");
3333
break;
34+
case "ClaudeChat":
35+
module = require("./domains/ClaudeChat");
36+
json = require("./domains/ClaudeChat.json");
37+
break;
3438
default:
3539
module = require("./domains/ArbitraryPage");
3640
json = require("./domains/ArbitraryPage.json");

src/services/export/extractor/extractPageContent.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ export async function extractPageContent(format: (html: string) => string, metad
2626
markdown += await extractSections(messages, metadata, format, processMessage);
2727

2828
if (metadata?.actions?.afterExtraction)
29-
await safeExecute(defineAction(metadata.actions.afterExtraction));
29+
await safeExecute(async () => markdown = await defineAction(metadata.actions.afterExtraction, markdown, format) ?? markdown);
3030

3131
return markdown;
3232
}

src/services/export/extractor/extractPageMetadata.ts

+1-4
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,9 @@ export function extractPageMetadata(metadataBase: {
1919
extractor: any;
2020
} {
2121
return {
22+
...metadataBase,
2223
domainName: metadataBase.domainName ?? window.location.hostname,
2324
pageTitle: getPageTitle(metadataBase.pageTitle?.selector, metadataBase.pageTitle?.treatment),
24-
contentSelector: metadataBase.contentSelector,
25-
actions: metadataBase.actions,
26-
sourcesExtraction: metadataBase.sourcesExtraction,
27-
extractor: metadataBase.extractor,
2825
};
2926
}
3027

src/services/export/extractor/rules/rules.ts

+72
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ export function getBlankReplacement_PerplexityPages(content: any, node: { nodeNa
2020
}
2121
}
2222

23+
2324
/*
2425
--- Format tables ---
2526
*/
@@ -82,6 +83,10 @@ export function filter_preserveLineBreaksInPre_Phind(node: { nodeName: string; q
8283
return node.nodeName === 'PRE' && node.querySelector('div');
8384
}
8485

86+
/**
 * Turndown filter selecting Claude code blocks: PRE nodes that contain a DIV.
 *
 * @param node Candidate node.
 * @returns `false` for non-PRE nodes; otherwise the result of looking up a
 *          `div` inside the node (truthy when one exists).
 */
export function filter_PreserveLineBreaksInPre_Claude(node: { nodeName: string; querySelector: (arg0: string) => any; }) {
    if (node.nodeName !== 'PRE') {
        return false;
    }
    return node.querySelector('div');
}
89+
8590
export function replacement_PreserveLineBreaksInPre_Perplexity(content: any, node: { querySelector: (arg0: string) => any; }) {
8691
const codeBlock = node.querySelector('code');
8792
const codeContent = codeBlock.textContent.trim();
@@ -103,6 +108,59 @@ export function replacement_preserveLineBreaksInPre_ChatGPT(content: any, node:
103108
return ('\n```' + codeLang + '\n' + codeContent + '\n```');
104109
}
105110

111+
/**
 * Turndown replacement rendering a Claude PRE block as a fenced code block.
 *
 * The code text comes from the `<code>` descendant, and the language is the
 * second `-`-separated part of its class name (e.g. `language-ts` gives `ts`).
 * When the PRE holds no `<code>` element, the PRE's own text is used and the
 * fence has no language, instead of throwing on the missing element.
 *
 * @param content Markdown already produced for the node's children (unused).
 * @param node The PRE node being replaced.
 * @returns The fenced code block, preceded by a newline.
 */
export function replacement_preserveLineBreaksInPre_Claude(content: any, node: any) {
    const codeBlock = node.querySelector('code');
    // Fall back to the PRE itself so a missing <code> never yields "undefined".
    const codeContent = (codeBlock ?? node).textContent?.trim() ?? '';
    const codeLang = codeBlock?.className?.split("-")[1] ?? '';
    return ('\n```' + codeLang + '\n' + codeContent + '\n```');
}
117+
118+
//---
119+
/**
 * Turndown filter selecting CODE nodes that sit directly inside a
 * `.code-block__code` wrapper and have no PRE element in their surroundings.
 *
 * The surroundings are the subtree of the wrapper's 4th ancestor (the node's
 * 5th ancestor). When the ancestor chain is shorter than that, the highest
 * available ancestor is searched instead of throwing on a missing parent.
 *
 * @param node Candidate node.
 * @returns `true` when the node is such a PRE-less code block, `false` otherwise.
 */
export function filter_PreserveLineBreaksInCode_Claude(node: {
    parentNode: any;
    nodeName: string; querySelector: (arg0: string) => any; }) {
    if (node.nodeName !== 'CODE') {
        return false;
    }

    const wrapper = node.parentNode;
    // Detached nodes and parents without a classList (e.g. a document) do not qualify.
    if (!wrapper?.classList?.contains('code-block__code')) {
        return false;
    }

    // Climb up to 4 levels above the wrapper, stopping early at the top of the tree.
    let searchScope = wrapper;
    for (let level = 0; level < 4 && searchScope.parentNode; level++) {
        searchScope = searchScope.parentNode;
    }

    return !searchScope.querySelector?.('pre');
}
125+
126+
/**
 * Turndown replacement rendering a PRE-less Claude CODE node as a fenced code block.
 *
 * Each element child of the CODE node is treated as one line of code. Working
 * on a deep clone (the original node is left untouched), this:
 *  - rewrites the indentation held by the first nested element of each line,
 *    turning every run of 4 spaces into a tab; leftover spaces and the rest of
 *    the text (including its trailing whitespace) are kept as they are;
 *  - inserts a newline after every line except the last;
 *  - reads the resulting text, trimmed, as the code content.
 * The language is the second `-`-separated part of the node's class name.
 *
 * @param content Markdown already produced for the node's children (unused).
 * @param node The CODE node being replaced.
 * @returns The fenced code block, preceded by a newline.
 */
export function replacement_preserveLineBreaksInCode_Claude(content: any, node: any) {
    const clonedNode = node.cloneNode(true);
    // Create the separators in the clone's own document; the global document is only a fallback.
    const ownerDoc: Document = clonedNode.ownerDocument ?? document;
    const lineSpans = Array.from(clonedNode.children) as HTMLElement[];

    lineSpans.forEach((lineSpan, index) => {
        const firstToken = lineSpan.children[0] as HTMLElement | undefined;

        if (firstToken) {
            const text = firstToken.textContent || '';
            // Leading whitespace of the line
            const indent = text.match(/^\s*/)?.[0] ?? '';

            // Converts spaces to tabs (4 spaces = 1 tab). Indentation that does not fill
            // a whole tab is preserved, and only the indentation is rewritten, so a
            // token such as "    x = " keeps the space separating it from the next one.
            if (indent.length > 0) {
                firstToken.textContent = indent.replace(/ {4}/g, '\t') + text.slice(indent.length);
            }
        }

        // Adds a line break after each line except the last
        if (index < lineSpans.length - 1) {
            const lineBreak = ownerDoc.createElement('span');
            lineBreak.textContent = '\n';
            lineSpan.parentNode?.insertBefore(lineBreak, lineSpan.nextSibling);
        }
    });

    const codeContent = clonedNode.textContent?.trim() || '';
    const codeLang = node?.className?.split("-")[1] || '';

    return `\n\`\`\`${codeLang}\n${codeContent}\n\`\`\``;
}
106164

107165

108166
/*
@@ -179,6 +237,20 @@ export function replacement_formatKatex(content: any, node: { querySelector: (ar
179237
return '$' + mathml + '$';
180238
}
181239

240+
/*
241+
--- Claude rules ---
242+
*/
243+
/**
 * Turndown filter selecting Claude artifact preview buttons,
 * i.e. `button[aria-label="Preview contents"]`.
 *
 * @param node Candidate element.
 * @returns `true` only for BUTTON elements whose `aria-label` is "Preview contents".
 */
export function filter_captureArtifactContent_Claude(node: Element) {
    if (node.nodeName !== 'BUTTON') {
        return false;
    }
    const ariaLabel = node.getAttribute('aria-label');
    return ariaLabel === 'Preview contents';
}
247+
248+
/**
 * Turndown replacement swapping an artifact preview button for a placeholder
 * of the form `{{@CAPTURE_ARTIFACT_CONTENT:<name>}}`, where `<name>` is the text
 * of the button's `.break-words` descendant.
 *
 * When no such descendant exists, the name is rendered as the literal "undefined".
 *
 * @param content Markdown already produced for the node's children (unused).
 * @param node The button element being replaced.
 * @returns The placeholder string.
 */
export function replacement_captureArtifactContent_Claude(content: any, node: Element) {
    const nameHolder = node.querySelector(".break-words");
    const artifactName = nameHolder?.textContent;
    return "{{@CAPTURE_ARTIFACT_CONTENT:" + artifactName + "}}";
}
251+
252+
253+
182254
/*
183255
--- Phind rules ---
184256
*/

src/services/interact/cs/interact.ts

+10-4
Original file line numberDiff line numberDiff line change
@@ -32,19 +32,25 @@ export async function clickElements(cssSelector = '.fe-chevron-down') {
3232
* @param content {HTMLElement} The content of the page
3333
* @returns {Promise<Element | null | undefined>} The last element clicked on
3434
*/
35-
export async function selectAndClick(actionsList: Array<{ selector: string; scope: string; wait: number | undefined; }>, content: HTMLElement): Promise<Element | null> {
35+
export async function selectAndClick(actionsList: Array<{ selector?: string; scope?: string; wait?: number; }>, content: HTMLElement): Promise<Element | Document | null> {
3636
let element = null;
3737
for (const query of actionsList) {
3838
switch (query.scope) {
3939
case 'content':
40-
element = content.querySelector(query.selector);
40+
element = query.selector
41+
? content.querySelector(query.selector)
42+
: content;
4143
break;
4244
case 'document':
43-
element = document.querySelector(query.selector);
45+
element = query.selector
46+
? document.querySelector(query.selector)
47+
: document;
4448
break;
4549
default:
4650
console.warn("Unknown scope: " + query.scope + ". Defaulting to content for query: " + query.selector + ".");
47-
return content.querySelector(query.selector);
51+
return query.selector
52+
? content.querySelector(query.selector)
53+
: content;
4854
}
4955

5056
//TODO: define custom delay for each domain separately in JSON configuration

0 commit comments

Comments
 (0)