-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcrawler-storage.js
126 lines (93 loc) · 3.62 KB
/
crawler-storage.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
const puppeteer = require("puppeteer");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const csvParse = require("csv-parse");
const fs = require("fs");
// Read the API key from config.json once, when this module is loaded.
// readFileSync/JSON.parse throw here if the file is missing or is not valid JSON.
// NOTE(review): nothing in the code visible in this file reads API_KEY — confirm
// whether it is still needed before relying on config.json being present.
const API_KEY = JSON.parse(fs.readFileSync("config.json")).api_key;
/**
 * Write an array of flat record objects to a CSV file.
 *
 * Column headers are derived from the keys of the first record. The writer is
 * created with `append` enabled when the output file already exists, so
 * repeated calls accumulate rows in the same file.
 *
 * @param {Object[]} data - Non-empty array of records to write.
 * @param {string} outputFile - Path of the CSV file to create or append to.
 * @returns {Promise<void>} Resolves once the records have been written.
 * @throws {Error} "No data to write!" when `data` is missing or empty.
 * @throws {Error} "Failed to write to csv" when the writer fails; the
 *   underlying error is available on the thrown error's `cause` property.
 */
async function writeToCsv(data, outputFile) {
  if (!data || data.length === 0) {
    throw new Error("No data to write!");
  }

  const fileExists = fs.existsSync(outputFile);
  const headers = Object.keys(data[0]).map((key) => ({ id: key, title: key }));
  const csvWriter = createCsvWriter({
    path: outputFile,
    header: headers,
    append: fileExists,
  });

  try {
    await csvWriter.writeRecords(data);
  } catch (e) {
    // Keep the original message for callers, but do not lose the root cause.
    // Assigned as a property (rather than via the constructor option) so it
    // also works on Node versions that predate `new Error(msg, { cause })`.
    const wrapped = new Error("Failed to write to csv");
    wrapped.cause = e;
    throw wrapped;
  }
}
/**
 * Build the list of values start, start + 1, ... that are strictly below `end`.
 *
 * @param {number} start - First value to include.
 * @param {number} end - Exclusive upper bound.
 * @returns {number[]} The generated values; empty when start is not below end.
 */
function range(start, end) {
  const values = [];
  let current = start;
  while (current < end) {
    values.push(current);
    current++;
  }
  return values;
}
/**
 * Convert one business-unit record from the search payload into the flat row
 * that is stored in the CSV.
 *
 * @param {Object} business - One entry of `props.pageProps.businessUnits`.
 * @returns {Object|null} The CSV row, or null when the record has no website
 *   string (without it no review URL can be derived).
 */
function buildBusinessInfo(business) {
  const website = business.contact ? business.contact.website : undefined;
  if (typeof website !== "string" || website === "") {
    return null;
  }

  let category = "n/a";
  if (Array.isArray(business.categories) && business.categories.length > 0) {
    category = business.categories[0].categoryId;
  }

  let country = "n/a";
  if (business.location && "country" in business.location) {
    country = business.location.country;
  }

  // Drop the scheme ("https://") when there is one; otherwise use the value as-is
  // instead of producing ".../review/undefined".
  const websiteParts = website.split("://");
  const trustpilotFormatted = websiteParts.length > 1 ? websiteParts[1] : websiteParts[0];

  return {
    // Strip every space and apostrophe, not just the first of each.
    name: business.displayName.toLowerCase().replace(/[ ']/g, ""),
    stars: business.stars,
    rating: business.trustScore,
    num_reviews: business.numberOfReviews,
    website: website,
    trustpilot_url: `https://www.trustpilot.com/review/${trustpilotFormatted}`,
    location: country,
    category: category,
  };
}

/**
 * Scrape one page of Trustpilot search results and store the businesses in
 * `<keyword>.csv` (first space of the keyword replaced by "-").
 *
 * The page's embedded `__NEXT_DATA__` JSON is parsed and every business unit is
 * turned into a row. All rows of a page are collected first and written in a
 * single call, so a failure part-way through a page cannot leave rows behind
 * that a retry would then write a second time.
 *
 * Failures are logged and retried; the function does not throw when all
 * attempts fail, it only logs that the page was given up on.
 *
 * @param {Object} browser - Puppeteer browser (anything exposing `newPage()`).
 * @param {string} keyword - Search term.
 * @param {number} pageNumber - Zero-based page index (the site is queried with pageNumber + 1).
 * @param {string} [location="us"] - Accepted for interface compatibility; not used by this function.
 * @param {number} [retries=3] - Extra attempts after the first one fails.
 * @returns {Promise<void>}
 */
async function scrapeSearchResults(browser, keyword, pageNumber, location = "us", retries = 3) {
  // URLSearchParams encodes every space as "+" and escapes characters such as
  // "&" that would otherwise corrupt the query string.
  const searchUrl = new URL("https://www.trustpilot.com/search");
  searchUrl.searchParams.set("query", keyword);
  searchUrl.searchParams.set("page", String(pageNumber + 1));
  const url = searchUrl.toString();

  // NOTE(review): only the first space is replaced here. Left unchanged on
  // purpose because other code may derive the same filename — confirm before
  // switching to a global replace.
  const outputFile = `${keyword.replace(" ", "-")}.csv`;

  let tries = 0;
  let success = false;

  while (tries <= retries && !success) {
    const page = await browser.newPage();
    try {
      await page.goto(url);
      console.log(`Successfully fetched: ${url}`);

      const script = await page.$("script[id='__NEXT_DATA__']");
      const innerHTML = await page.evaluate((element) => element.innerHTML, script);
      const jsonData = JSON.parse(innerHTML);
      const businessUnits = jsonData.props.pageProps.businessUnits;

      const rows = [];
      for (const business of businessUnits) {
        const businessInfo = buildBusinessInfo(business);
        if (businessInfo === null) {
          // One unusable record should not fail the whole page.
          console.log(`Skipping business without a website: ${business.displayName}`);
          continue;
        }
        rows.push(businessInfo);
      }

      // writeToCsv rejects empty input, and an empty page is not an error.
      if (rows.length > 0) {
        await writeToCsv(rows, outputFile);
      }
      success = true;
    } catch (err) {
      console.log(`Error: ${err}, tries left ${retries - tries}`);
      tries++;
    } finally {
      await page.close();
    }
  }

  if (!success) {
    console.log(`Giving up on page ${pageNumber + 1} for "${keyword}" after ${tries} attempts`);
  }
}
/**
 * Launch a browser and scrape the first `pages` result pages for a keyword,
 * one page after another.
 *
 * The browser is closed in a `finally` block so it is not left running when a
 * page scrape rejects.
 *
 * @param {string} keyword - Search term passed through to scrapeSearchResults.
 * @param {number} pages - Number of result pages to scrape (indices 0 .. pages - 1).
 * @param {string} location - Passed through to scrapeSearchResults.
 * @param {number} retries - Passed through to scrapeSearchResults.
 * @returns {Promise<void>}
 */
async function startScrape(keyword, pages, location, retries) {
  const pageList = range(0, pages);
  const browser = await puppeteer.launch();
  try {
    for (const page of pageList) {
      await scrapeSearchResults(browser, keyword, page, location, retries);
    }
  } finally {
    await browser.close();
  }
}
/**
 * Script entry point: scrape the configured number of result pages for every
 * configured keyword, one keyword at a time.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const keywords = ["online bank"];
  const pages = 1;
  const location = "us";
  const retries = 3;

  for (const keyword of keywords) {
    // startScrape takes (keyword, pages, location, retries). Passing a
    // concurrency limit as an extra fourth argument made it land in the
    // `retries` slot and silently dropped the real retry count.
    await startScrape(keyword, pages, location, retries);
  }
}

// Do not leave the promise floating: report the failure and signal it through
// the process exit code.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});