forked from github/lightcrawler
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Drew McMillan
committed
Apr 13, 2018
0 parents
commit 696b473
Showing
6 changed files
with
319 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
node_modules |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
# light-mc-crawler | ||
Forked from https://github.com/github/lightcrawler | ||
|
||
Crawl a website and run every crawled page through Google Lighthouse to find mixed content and plain-http links | ||
|
||
```bash | ||
npm install -s https://github.com/drewmcmillan/light-mc-crawler | ||
|
||
light-mc-crawler --url https://www.example.com --config light-mc-crawler-config.json | ||
``` | ||
|
||
where `light-mc-crawler-config.json` looks something like this: | ||
```json | ||
{ | ||
"maxDepth": 2, | ||
"maxChromeInstances": 5, | ||
"limit": "/music/", | ||
"httpsOnly": true, | ||
"showHttpLinksDuring": false, | ||
"showHttpLinksAfter": true | ||
} | ||
``` | ||
|
||
## Arguments | ||
|
||
### limit | ||
Limits the crawling to urls containing a certain substring | ||
|
||
### httpsOnly | ||
Rewrites `http://` page URLs to `https://` before they are audited by Lighthouse | ||
|
||
### showHttpLinksDuring | ||
Logs any http links found during the crawling | ||
|
||
### showHttpLinksAfter | ||
Logs every http link found, grouped by the page it was found on, once crawling and all audits have finished | ||
|
||
### Example output | ||
``` | ||
Http link(s) on https://www.example.com/music | ||
http://www.example.com/music/resources/idt-sh/dancing | ||
http://www.example.com/music/resources/idt-sh/clubs | ||
http://www.example.com/music/musictime | ||
http://www.example.com/music/musictime | ||
http://www.example.com/music/10628994 | ||
http://www.example.com/music/help-41670342 | ||
Mixed Content | ||
https://www.example.com/music ✗ is-on-https - Does not use HTTPS | ||
http://www.petmd.com/sites/default/files/petmd-cat-happy-10.jpg | ||
http://www.argospetinsurance.co.uk/assets/uploads/2017/12/cat-pet-animal-domestic-104827.jpeg | ||
https://www.example.com/music ✗ mixed-content - Some insecure resources can be upgraded to HTTPS | ||
``` | ||
|
||
Enjoy! | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#!/usr/bin/env node

// Command-line entry point: collects the --url and --config options and
// hands the parsed result to the crawler exported by this package.
const yargs = require('yargs')
const lightcrawler = require('.')

// -c/--config and -u/--url are both required; -h/--help prints the usage text.
const parser = yargs
  .demandOption(['c', 'u'])
  .alias('u', 'url')
  .describe('url', 'URL to crawl')
  .alias('h', 'help')
  .help('h')
  .alias('c', 'config')
  .describe('config', 'Options for lighthouse')

const options = parser.argv

lightcrawler(options)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
{ | ||
"maxDepth": 1, | ||
"maxChromeInstances": 5, | ||
"limit": "/music/", | ||
"httpsOnly": true, | ||
"showHttpLinksDuring": false, | ||
"showHttpLinksAfter": true | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,219 @@ | ||
const cheerio = require('cheerio') | ||
const ChildProcess = require('child_process') | ||
const Crawler = require('simplecrawler') | ||
const path = require('path') | ||
const queue = require('async/queue') | ||
const fs = require('fs') | ||
const colors = require('colors') | ||
|
||
// Module-wide, mutable bookkeeping shared by the crawler setup, the Lighthouse
// runner and the summary printer.
const stats = {
  // Number of Lighthouse runs that have been started.
  pageCount: 0,
  // Violation totals, keyed by Lighthouse category name.
  violationCounts: {},
  // Http links discovered while crawling, keyed by the URL of the page they were found on.
  foundHttpLinks: {},
  // Number of audits that scored 100.
  passedAuditsCount: 0,
  // Date at which the exported crawl function was invoked (null until then).
  startTime: null,
  // Per audited URL: {startTime, endTime} of its Lighthouse run.
  auditTimesByPageUrl: {}
}
|
||
module.exports = (options) => { | ||
stats.startTime = new Date() | ||
|
||
const configPath = path.resolve(options.config) | ||
const config = JSON.parse(fs.readFileSync(configPath)) | ||
|
||
const crawler = new Crawler(options.url) | ||
crawler.respectRobotsTxt = false | ||
crawler.parseHTMLComments = false | ||
crawler.parseScriptTags = false | ||
crawler.maxDepth = config.maxDepth || 1 | ||
|
||
|
||
crawler.discoverResources = (buffer, item) => { | ||
const page = cheerio.load(buffer.toString('utf8')) | ||
var links = page('a[href]').map(function () { | ||
return page(this).attr('href') | ||
}).get() | ||
|
||
if(config.limit){ | ||
links = links.filter(function(s){ | ||
return ~s.indexOf(config.limit); | ||
}); | ||
} | ||
|
||
if(config.showHttpLinksDuring || config.showHttpLinksAfter){ | ||
links.forEach(function(link) { | ||
if(link.indexOf('http://') !== -1){ | ||
if(!stats.foundHttpLinks[item.url]){ | ||
stats.foundHttpLinks[item.url] = []; | ||
} | ||
|
||
stats.foundHttpLinks[item.url].push(link) | ||
} | ||
}); | ||
|
||
if(config.showHttpLinksDuring && stats.foundHttpLinks[item.url]){ | ||
console.log(); | ||
console.log('Http link(s) on '.bold.underline + item.url.bold.underline); | ||
stats.foundHttpLinks[item.url].forEach(function(link) { | ||
console.log(' ' + link); | ||
}); | ||
} | ||
} | ||
|
||
return links | ||
} | ||
|
||
let totalErrorCount = 0 | ||
|
||
const lighthouseQueue = queue((url, callback) => { | ||
runLighthouse(url, config, (errorCount) => { | ||
totalErrorCount += errorCount | ||
callback() | ||
}) | ||
}, config.maxChromeInstances || 5) | ||
|
||
crawler.on('fetchcomplete', (queueItem, responseBuffer, response) => { | ||
lighthouseQueue.push(queueItem.url) | ||
}) | ||
|
||
crawler.once('complete', () => { | ||
lighthouseQueue.drain = () => { | ||
printStats(config) | ||
if (totalErrorCount > 0) { | ||
process.exit(1) | ||
} | ||
} | ||
}) | ||
|
||
crawler.start() | ||
} | ||
|
||
/**
 * Prints the details attached to a failed audit and adds them to the
 * violation count of the audit's category.
 *
 * @param {string} categoryName - Key into `stats.violationCounts` (must already be initialised).
 * @param {Object} extendedInfo - The audit's `result.extendedInfo` object.
 */
function recordViolationDetails (categoryName, extendedInfo) {
  const value = extendedInfo.value

  if (Array.isArray(value)) {
    stats.violationCounts[categoryName] += value.length
    value.forEach((result) => {
      if (result && result.url) {
        console.log(` ${result.url}`)
      }
    })
    return
  }

  // `value` may be absent altogether, so check it before reading `nodes`.
  if (value && Array.isArray(value.nodes)) {
    stats.violationCounts[categoryName] += value.nodes.length
    // Group the offending nodes by their (trimmed) failure message.
    const messagesToNodes = new Map()
    value.nodes.forEach((result) => {
      const message = (result.failureSummary || '')
        .replace(/^Fix any of the following:/g, '')
        .trim()
      if (messagesToNodes.has(message)) {
        messagesToNodes.get(message).push(result.html)
      } else {
        messagesToNodes.set(message, [result.html])
      }
    })
    messagesToNodes.forEach((nodes, message) => {
      console.log(` ${message}`)
      nodes.forEach((node) => {
        console.log(` ${node}`.gray)
      })
    })
    return
  }

  stats.violationCounts[categoryName]++
}

/**
 * Audits a single page with the Lighthouse CLI (mixed-content configuration)
 * in a child process, prints every failed audit and updates `stats`.
 *
 * @param {string} url - Page to audit.
 * @param {Object} config - Crawler configuration; reads `httpsOnly` and `chromeFlags`.
 * @param {function(number)} callback - Invoked exactly once with the number of
 *   failed audits, or with 1 when Lighthouse could not be run or its report
 *   could not be used.
 */
function runLighthouse (url, config, callback) {
  // Rewrite the scheme only; an unanchored replace could hit a later
  // "http://" inside the URL (a query parameter, for instance).
  const pageUrl = config.httpsOnly ? url.replace(/^http:\/\//i, 'https://') : url

  stats.pageCount++
  const mixedContent = require.resolve('lighthouse/lighthouse-core/config/mixed-content.js')
  const chromeFlags = config.chromeFlags || '--headless --disable-gpu'
  const lighthousePath = require.resolve('lighthouse/lighthouse-cli/index.js')
  const args = [
    lighthousePath,
    pageUrl,
    '--output=json',
    '--output-path=stdout',
    '--disable-device-emulation',
    '--disable-cpu-throttling',
    '--disable-network-throttling',
    '--chrome-flags=' + chromeFlags,
    `--config-path=${mixedContent}`
  ]

  stats.auditTimesByPageUrl[pageUrl] = {startTime: new Date()}

  // A failed spawn can be followed by a 'close' event; make sure the caller
  // is notified only once.
  let finished = false
  const finish = (errorCount) => {
    if (finished) {
      return
    }
    finished = true
    stats.auditTimesByPageUrl[pageUrl].endTime = new Date()
    callback(errorCount)
  }

  // Run the CLI script through the current Node binary instead of executing
  // the .js file directly, which depends on its shebang and executable bit.
  const lighthouse = ChildProcess.spawn(process.execPath, args)

  let output = ''
  // Decode as UTF-8 so multi-byte characters split across chunks stay intact.
  lighthouse.stdout.setEncoding('utf8')
  lighthouse.stdout.on('data', (data) => {
    output += data
  })

  // Without an 'error' listener a failed spawn would crash the whole crawl.
  lighthouse.once('error', (spawnError) => {
    console.error(`Running Lighthouse for ${pageUrl} failed: ${spawnError.message}`)
    finish(1)
  })

  lighthouse.once('close', () => {
    if (finished) {
      return
    }

    let report
    try {
      report = JSON.parse(output)
    } catch (parseError) {
      console.error(`Parsing JSON report output failed: ${output}`)
      finish(1)
      return
    }

    if (!report || !Array.isArray(report.reportCategories)) {
      console.error(`Lighthouse report for ${pageUrl} contains no report categories`)
      finish(1)
      return
    }

    let errorCount = 0
    report.reportCategories.forEach((category) => {
      let displayedCategory = false
      category.audits.forEach((audit) => {
        if (audit.score === 100) {
          stats.passedAuditsCount++
          return
        }

        // Print the category heading once, before its first failed audit.
        if (!displayedCategory) {
          console.log()
          console.log(category.name.bold.underline)
          displayedCategory = true
        }
        errorCount++
        console.log(pageUrl.replace(/\/$/, ''), '\u2717'.red, audit.id.bold, '-', audit.result.description.italic)

        if (stats.violationCounts[category.name] === undefined) {
          stats.violationCounts[category.name] = 0
        }

        if (audit.result.extendedInfo) {
          recordViolationDetails(category.name, audit.result.extendedInfo)
        }
      })
    })

    finish(errorCount)
  })
}
|
||
/**
 * Prints the end-of-run report: optionally the http links collected per page,
 * followed by the Lighthouse summary (pages scanned, timings, passed audits
 * and violations per category). Reads the module-level `stats` object.
 *
 * @param {Object} config - Crawler configuration; reads `showHttpLinksAfter`.
 */
function printStats (config) {
  console.log()
  console.log()
  if (config.showHttpLinksAfter) {
    Object.keys(stats.foundHttpLinks).forEach((pageUrl) => {
      console.log('Http link(s) on '.bold.underline + pageUrl.bold.underline)
      stats.foundHttpLinks[pageUrl].forEach((link) => {
        console.log(' ' + link)
      })
    })
  }
  console.log()
  console.log()
  console.log('Lighthouse Summary'.bold.underline)
  console.log(` Total Pages Scanned: ${stats.pageCount}`)
  console.log(` Total Auditing Time: ${new Date() - stats.startTime} ms`)

  // Only finished audits contribute; an entry without an end time would turn
  // the whole sum into NaN.
  const totalTime = Object.keys(stats.auditTimesByPageUrl).reduce((sum, url) => {
    const {endTime, startTime} = stats.auditTimesByPageUrl[url]
    return endTime ? sum + (endTime - startTime) : sum
  }, 0)
  // Avoid 0/0 (NaN) when no page was audited at all.
  const averageTime = stats.pageCount > 0 ? Math.round(totalTime / stats.pageCount) : 0
  console.log(` Average Page Audit Time: ${averageTime} ms`)
  console.log(` Total Audits Passed: ${stats.passedAuditsCount}`, '\u2713'.green)

  const categories = Object.keys(stats.violationCounts)
  if (categories.length === 0) {
    console.log(` Total Violations: None! \\o/ 🎉`)
  } else {
    console.log(` Total Violations:`)
    categories.forEach((category) => {
      console.log(` ${category}: ${stats.violationCounts[category]}`, '\u2717'.red)
    })
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
{ | ||
"name": "light-mc-crawler", | ||
"version": "1.2.0", | ||
"description": "", | ||
"main": "index.js", | ||
"scripts": { | ||
"example": "node ./cli.js --url https://www.example.com --config example.json" | ||
}, | ||
"bin": { | ||
"light-mc-crawler": "./cli.js" | ||
}, | ||
"keywords": [], | ||
"author": "", | ||
"license": "ISC", | ||
"dependencies": { | ||
"async": "^2.4.1", | ||
"cheerio": "^1.0.0-rc.1", | ||
"colors": "^1.1.2", | ||
"lighthouse": "^2.9.4", | ||
"simplecrawler": "^1.1.3", | ||
"yargs": "^8.0.2" | ||
} | ||
} |