Skip to content

Commit c6b513a

Browse files
committed
feat: add md mode in searchscraper
1 parent e9b6ca7 commit c6b513a

File tree

10 files changed

+1056
-157
lines changed

10 files changed

+1056
-157
lines changed
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
/**
2+
* Basic SearchScraper Markdown Example
3+
*
4+
* This example demonstrates the simplest way to use the SearchScraper API
5+
* in markdown mode to search and scrape web pages, returning raw markdown content
6+
* instead of AI-extracted data.
7+
*
8+
* Features demonstrated:
9+
* - Basic search and scrape with markdown output
10+
* - Simple error handling
11+
* - Minimal code approach
12+
* - Cost-effective: Only 2 credits per page (vs 10 credits for AI extraction)
13+
*/
14+
15+
import { searchScraper } from 'scrapegraph-js';
16+
import 'dotenv/config';
17+
18+
// ScrapeGraph API key, read from the SGAI_APIKEY environment variable.
// NOTE(review): the `dotenv/config` import above presumably loads it from a local .env file — confirm.
const apiKey = process.env.SGAI_APIKEY;
19+
20+
/**
 * Run the basic SearchScraper markdown example.
 *
 * Sends a single `searchScraper` request with `extractionMode: false`
 * (markdown mode) and prints either a hint for fetching the result later
 * (when the response carries a request ID but no status) or a preview of the
 * returned markdown together with the reference URLs.
 *
 * @returns {Promise<boolean>} `true` when the request call resolved,
 *   `false` when it threw (the error message is logged).
 */
async function basicSearchScraperMarkdownExample() {
  console.log('🔍 Basic SearchScraper Markdown Example');
  console.log('='.repeat(50));

  // Configuration
  const userPrompt = 'Latest developments in artificial intelligence';
  const numResults = 3;
  const previewLength = 500;

  console.log(`📝 Query: ${userPrompt}`);
  console.log(`📊 Results: ${numResults} websites`);
  console.log('🔧 Mode: Markdown conversion');
  console.log('💰 Cost: 2 credits per page (vs 10 for AI extraction)');

  try {
    // Send a searchscraper request in markdown mode
    const response = await searchScraper(
      apiKey,
      userPrompt,
      numResults,
      null, // schema
      null, // userAgent
      {
        extractionMode: false, // false = markdown mode, true = AI extraction mode
      }
    );

    console.log('\n✅ SearchScraper markdown completed successfully!');
    console.log(`📄 Request ID: ${response.request_id || 'N/A'}`);

    // A request ID without a status means the result must be fetched later.
    if (response.request_id && !response.status) {
      console.log('📝 This is an async request. Use getSearchScraperRequest() to retrieve results.');
      console.log(`🔍 Use: getSearchScraperRequest('${response.request_id}')`);
      return true;
    }

    // Sync response: show a preview of the markdown (first 500 chars).
    const markdownContent = response.markdown_content;
    if (markdownContent) {
      console.log('\n📝 Markdown Content Preview:');
      console.log(
        markdownContent.length > previewLength
          ? `${markdownContent.substring(0, previewLength)}...`
          : markdownContent
      );
    } else {
      console.log('⚠️ No markdown content returned');
    }

    // Treat an empty list like a missing one so no empty "Reference URLs:"
    // header is printed (same check as the advanced example).
    const referenceUrls = response.reference_urls;
    if (referenceUrls && referenceUrls.length > 0) {
      console.log(`\n🔗 References: ${referenceUrls.length}`);
      console.log('\n🔗 Reference URLs:');
      referenceUrls.forEach((url, index) => {
        console.log(`   ${index + 1}. ${url}`);
      });
    } else {
      console.log('⚠️ No reference URLs returned');
    }

    return true;
  } catch (error) {
    console.error(`❌ Error: ${error.message}`);
    return false;
  }
}
84+
85+
// Run the example
86+
// Execute the example and translate its outcome into the process exit code:
// 0 when it reports success, 1 when it reports failure or throws.
let exitCode = 1;
try {
  const succeeded = await basicSearchScraperMarkdownExample();
  exitCode = succeeded ? 0 : 1;
} catch (err) {
  console.error('❌ Unexpected error:', err.message);
}
process.exit(exitCode);
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
/**
2+
* Advanced SearchScraper Markdown Example with Async Polling
3+
*
4+
* This example demonstrates using the SearchScraper API in markdown mode
5+
* with async request handling and result polling.
6+
*
7+
* Features demonstrated:
8+
* - Async search and scrape with markdown output
9+
* - Polling for async results with timeout handling
10+
* - Error handling with async operations
11+
* - Cost-effective: Only 2 credits per page (vs 10 credits for AI extraction)
12+
*/
13+
14+
import { searchScraper, getSearchScraperRequest } from 'scrapegraph-js';
15+
import 'dotenv/config';
16+
17+
// ScrapeGraph API key, read from the SGAI_APIKEY environment variable.
// NOTE(review): the `dotenv/config` import above presumably loads it from a local .env file — confirm.
const apiKey = process.env.SGAI_APIKEY;
18+
19+
/**
20+
* Poll for completion of an async SearchScraper request.
21+
*
22+
* @param {string} requestId - The request ID to poll for
23+
* @param {number} maxWaitTime - Maximum time to wait in seconds
24+
* @returns {Promise<Object|null>} The completed response or null if timeout
25+
*/
26+
async function waitForCompletion(requestId, maxWaitTime = 60) {
27+
const startTime = Date.now();
28+
29+
while (Date.now() - startTime < maxWaitTime * 1000) {
30+
try {
31+
const result = await getSearchScraperRequest(apiKey, requestId);
32+
33+
if (result.status === 'completed') {
34+
return result;
35+
} else if (result.status === 'failed') {
36+
console.error(`❌ Request failed: ${result.error || 'Unknown error'}`);
37+
return null;
38+
} else {
39+
console.log(`⏳ Status: ${result.status || 'processing'}... waiting 5 seconds`);
40+
await new Promise(resolve => setTimeout(resolve, 5000));
41+
}
42+
43+
} catch (error) {
44+
console.warn(`⚠️ Error polling for results: ${error.message}`);
45+
await new Promise(resolve => setTimeout(resolve, 5000));
46+
}
47+
}
48+
49+
console.log('⏰ Timeout waiting for completion');
50+
return null;
51+
}
52+
53+
/**
 * Run the advanced SearchScraper markdown example.
 *
 * Submits a `searchScraper` request with `extractionMode: false` (markdown
 * mode). When the response carries a request ID and is not yet in a terminal
 * state ('completed' or 'failed'), it polls via `waitForCompletion` and merges
 * the polled result over the initial response. It then prints a preview of the
 * markdown and the reference URLs.
 *
 * @returns {Promise<boolean>} `true` when a completed result was displayed,
 *   `false` when polling failed, the request did not complete, or an error
 *   was thrown (each case is logged).
 */
async function advancedSearchScraperMarkdownExample() {
  console.log('🔍 Advanced SearchScraper Markdown Example with Async Polling');
  console.log('='.repeat(60));

  // Configuration
  const userPrompt = 'Latest developments in artificial intelligence';
  const numResults = 3;
  const previewLength = 500;

  console.log(`📝 Query: ${userPrompt}`);
  console.log(`📊 Results: ${numResults} websites`);
  console.log('🔧 Mode: Markdown conversion');
  console.log('💰 Cost: 2 credits per page (vs 10 for AI extraction)');

  try {
    // Send a searchscraper request in markdown mode
    const response = await searchScraper(
      apiKey,
      userPrompt,
      numResults,
      null, // schema
      null, // userAgent
      {
        extractionMode: false, // false = markdown mode, true = AI extraction mode
      }
    );

    console.log('\n✅ SearchScraper request submitted successfully!');
    console.log(`📄 Request ID: ${response.request_id || 'N/A'}`);

    // Poll whenever the request is not already finished. A missing status and
    // an in-progress status both need polling; a terminal one does not.
    let finalResponse = response;
    const isTerminal = response.status === 'completed' || response.status === 'failed';
    if (response.request_id && !isTerminal) {
      console.log('⏳ Waiting for async processing to complete...');

      const polledResult = await waitForCompletion(response.request_id);
      if (!polledResult) {
        console.error('❌ Failed to get completed results');
        return false;
      }

      // Merge into a new object instead of mutating the SDK's response.
      finalResponse = { ...response, ...polledResult };
    }

    if (finalResponse.status !== 'completed') {
      console.error(`❌ Request not completed. Status: ${finalResponse.status || 'unknown'}`);
      return false;
    }

    console.log('\n🎉 SearchScraper markdown completed successfully!');

    // Display markdown content (first 500 chars)
    const markdownContent = finalResponse.markdown_content;
    if (markdownContent) {
      console.log('\n📝 Markdown Content Preview:');
      console.log(
        markdownContent.length > previewLength
          ? `${markdownContent.substring(0, previewLength)}...`
          : markdownContent
      );
    } else {
      console.log('⚠️ No markdown content returned');
    }

    // Display reference URLs
    const referenceUrls = finalResponse.reference_urls;
    if (referenceUrls && referenceUrls.length > 0) {
      console.log(`\n🔗 References: ${referenceUrls.length}`);
      console.log('\n🔗 Reference URLs:');
      referenceUrls.forEach((url, index) => {
        console.log(`   ${index + 1}. ${url}`);
      });
    } else {
      console.log('⚠️ No reference URLs returned');
    }

    return true;
  } catch (error) {
    console.error(`❌ Error: ${error.message}`);
    return false;
  }
}
136+
137+
// Run the example
138+
// Execute the example and translate its outcome into the process exit code:
// 0 when it reports success, 1 when it reports failure or throws.
let exitCode = 1;
try {
  const succeeded = await advancedSearchScraperMarkdownExample();
  exitCode = succeeded ? 0 : 1;
} catch (err) {
  console.error('❌ Unexpected error:', err.message);
}
process.exit(exitCode);

scrapegraph-js/src/searchScraper.js

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,13 @@ import { getMockResponse } from './utils/mockResponse.js';
1818
* @param {Object} options - Optional configuration options
1919
* @param {boolean} options.mock - Override mock mode for this request
2020
* @param {boolean} options.renderHeavyJs - Whether to render heavy JavaScript on the page
21+
* @param {boolean} [options.extractionMode=true] - Whether to use AI extraction (true) or markdown conversion (false).
22+
* AI extraction costs 10 credits per page, markdown conversion costs 2 credits per page.
2123
* @returns {Promise<string>} Extracted data in JSON format matching the provided schema
2224
* @throws - Will throw an error in case of an HTTP failure.
2325
*/
2426
export async function searchScraper(apiKey, prompt, numResults = 3, schema = null, userAgent = null, options = {}) {
25-
const { mock = null, renderHeavyJs = false } = options;
27+
const { mock = null, renderHeavyJs = false, extractionMode = true } = options;
2628

2729
// Check if mock mode is enabled
2830
const useMock = mock !== null ? mock : isMockEnabled();
@@ -51,6 +53,7 @@ export async function searchScraper(apiKey, prompt, numResults = 3, schema = nul
5153
user_prompt: prompt,
5254
num_results: numResults,
5355
render_heavy_js: renderHeavyJs,
56+
extraction_mode: extractionMode,
5457
};
5558

5659
if (schema) {

0 commit comments

Comments
 (0)