Commit: Initial commit

Drew McMillan committed Apr 13, 2018
0 parents commit 696b473
Showing 6 changed files with 319 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
node_modules
56 changes: 56 additions & 0 deletions README.md
@@ -0,0 +1,56 @@
# light-mc-crawler
Forked from https://github.com/github/lightcrawler

Crawls a website and runs each page through Google Lighthouse, flagging mixed content

```bash
npm install --save https://github.com/drewmcmillan/light-mc-crawler

light-mc-crawler --url https://www.example.com --config light-mc-crawler-config.json
```

where `light-mc-crawler-config.json` looks something like this:
```json
{
  "maxDepth": 2,
  "maxChromeInstances": 5,
  "limit": "/music/",
  "httpsOnly": true,
  "showHttpLinksDuring": false,
  "showHttpLinksAfter": true
}
```

## Arguments

### maxDepth
The maximum crawl depth (defaults to 1)

### maxChromeInstances
The maximum number of concurrent Chrome/Lighthouse processes (defaults to 5)

### limit
Limits the crawl to URLs containing the given substring

### httpsOnly
Rewrites any `http://` URLs to `https://` before auditing

### showHttpLinksDuring
Logs any `http://` links as they are found during the crawl

### showHttpLinksAfter
Logs all `http://` links found once the crawl has finished

### Example output
```
Http link on https://www.example.com/music
http://www.example.com/music/resources/idt-sh/dancing
http://www.example.com/music/resources/idt-sh/clubs
http://www.example.com/music/musictime
http://www.example.com/music/musictime
http://www.example.com/music/10628994
http://www.example.com/music/help-41670342
Mixed Content
https://www.example.com/music ✗ is-on-https - Does not use HTTPS
http://www.petmd.com/sites/default/files/petmd-cat-happy-10.jpg
http://www.argospetinsurance.co.uk/assets/uploads/2017/12/cat-pet-animal-domestic-104827.jpeg
https://www.example.com/music ✗ mixed-content - Some insecure resources can be upgraded to HTTPS
```

Enjoy!

12 changes: 12 additions & 0 deletions cli.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env node

const yargs = require('yargs')
const lightcrawler = require('.')

const options = yargs.demandOption(['c', 'u'])
  .alias('u', 'url').describe('url', 'URL to crawl')
  .alias('c', 'config').describe('config', 'Path to the crawler config JSON')
  .alias('h', 'help').help('h')
  .argv

lightcrawler(options)
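
A typical invocation using the short aliases defined above (a sketch, assuming the package has been installed so the binary is on your PATH; the URL and config path are placeholders):

```bash
light-mc-crawler -u https://www.example.com -c example.json
```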
8 changes: 8 additions & 0 deletions example.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
  "maxDepth": 1,
  "maxChromeInstances": 5,
  "limit": "/music/",
  "httpsOnly": true,
  "showHttpLinksDuring": false,
  "showHttpLinksAfter": true
}
219 changes: 219 additions & 0 deletions index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
const cheerio = require('cheerio')
const ChildProcess = require('child_process')
const Crawler = require('simplecrawler')
const path = require('path')
const queue = require('async/queue')
const fs = require('fs')
const colors = require('colors')

const stats = {
  pageCount: 0,
  violationCounts: {},
  foundHttpLinks: {},
  passedAuditsCount: 0,
  startTime: null,
  auditTimesByPageUrl: {}
}

module.exports = (options) => {
  stats.startTime = new Date()

  const configPath = path.resolve(options.config)
  const config = JSON.parse(fs.readFileSync(configPath))

  const crawler = new Crawler(options.url)
  crawler.respectRobotsTxt = false
  crawler.parseHTMLComments = false
  crawler.parseScriptTags = false
  crawler.maxDepth = config.maxDepth || 1

  // Override simplecrawler's link discovery: collect anchor hrefs,
  // optionally keeping only URLs that contain the configured substring.
  crawler.discoverResources = (buffer, item) => {
    const page = cheerio.load(buffer.toString('utf8'))
    var links = page('a[href]').map(function () {
      return page(this).attr('href')
    }).get()

    if (config.limit) {
      links = links.filter(function (s) {
        return ~s.indexOf(config.limit)
      })
    }

    // Record (and optionally log immediately) any plain-http links per page.
    if (config.showHttpLinksDuring || config.showHttpLinksAfter) {
      links.forEach(function (link) {
        if (link.indexOf('http://') !== -1) {
          if (!stats.foundHttpLinks[item.url]) {
            stats.foundHttpLinks[item.url] = []
          }

          stats.foundHttpLinks[item.url].push(link)
        }
      })

      if (config.showHttpLinksDuring && stats.foundHttpLinks[item.url]) {
        console.log()
        console.log('Http link(s) on '.bold.underline + item.url.bold.underline)
        stats.foundHttpLinks[item.url].forEach(function (link) {
          console.log('    ' + link)
        })
      }
    }

    return links
  }

  let totalErrorCount = 0

  // Audit pages concurrently, at most maxChromeInstances Chromes at a time.
  const lighthouseQueue = queue((url, callback) => {
    runLighthouse(url, config, (errorCount) => {
      totalErrorCount += errorCount
      callback()
    })
  }, config.maxChromeInstances || 5)

  crawler.on('fetchcomplete', (queueItem, responseBuffer, response) => {
    lighthouseQueue.push(queueItem.url)
  })

  // Once the crawl is done, print the summary when the audit queue empties.
  crawler.once('complete', () => {
    lighthouseQueue.drain = () => {
      printStats(config)
      if (totalErrorCount > 0) {
        process.exit(1)
      }
    }
  })

  crawler.start()
}

function runLighthouse (url, config, callback) {
  if (config.httpsOnly) {
    url = url.replace('http://', 'https://')
  }

  stats.pageCount++

  // Run the Lighthouse CLI against the page, using the bundled
  // mixed-content audit config and emitting the report as JSON on stdout.
  var mixedContent = require.resolve('lighthouse/lighthouse-core/config/mixed-content.js')
  var chromeFlags = config.chromeFlags || '--headless --disable-gpu'
  const args = [
    url,
    '--output=json',
    '--output-path=stdout',
    '--disable-device-emulation',
    '--disable-cpu-throttling',
    '--disable-network-throttling',
    '--chrome-flags=' + chromeFlags,
    `--config-path=${mixedContent}`
  ]

  const lighthousePath = require.resolve('lighthouse/lighthouse-cli/index.js')
  const lighthouse = ChildProcess.spawn(lighthousePath, args)

  let output = ''
  lighthouse.stdout.on('data', (data) => {
    output += data
  })

  stats.auditTimesByPageUrl[url] = {startTime: new Date()}
  lighthouse.once('close', () => {
    stats.auditTimesByPageUrl[url].endTime = new Date()
    let errorCount = 0

    let report
    try {
      report = JSON.parse(output)
    } catch (parseError) {
      console.error(`Parsing JSON report output failed: ${output}`)
      callback(1)
      return
    }

    report.reportCategories.forEach((category) => {
      let displayedCategory = false
      category.audits.forEach((audit) => {
        if (audit.score === 100) {
          stats.passedAuditsCount++
        } else {
          if (!displayedCategory) {
            console.log()
            console.log(category.name.bold.underline)
            displayedCategory = true
          }
          errorCount++
          console.log(url.replace(/\/$/, ''), '\u2717'.red, audit.id.bold, '-', audit.result.description.italic)

          if (stats.violationCounts[category.name] === undefined) {
            stats.violationCounts[category.name] = 0
          }

          if (audit.result.extendedInfo) {
            const {value} = audit.result.extendedInfo
            if (Array.isArray(value)) {
              // A flat list of failing resources, e.g. insecure URLs.
              stats.violationCounts[category.name] += value.length
              value.forEach((result) => {
                if (result.url) {
                  console.log(`    ${result.url}`)
                }
              })
            } else if (Array.isArray(value.nodes)) {
              // Axe-style results: group failing DOM nodes by failure message.
              stats.violationCounts[category.name] += value.nodes.length
              const messagesToNodes = {}
              value.nodes.forEach((result) => {
                let message = result.failureSummary
                message = message.replace(/^Fix any of the following:/g, '').trim()
                if (messagesToNodes[message]) {
                  messagesToNodes[message].push(result.html)
                } else {
                  messagesToNodes[message] = [result.html]
                }
              })
              Object.keys(messagesToNodes).forEach((message) => {
                console.log(`    ${message}`)
                messagesToNodes[message].forEach(node => {
                  console.log(`      ${node}`.gray)
                })
              })
            } else {
              stats.violationCounts[category.name]++
            }
          }
        }
      })
    })

    callback(errorCount)
  })
}

function printStats (config) {
  console.log()
  console.log()
  if (config.showHttpLinksAfter) {
    for (var index in stats.foundHttpLinks) {
      console.log('Http link(s) on '.bold.underline + index.bold.underline)
      stats.foundHttpLinks[index].forEach(function (link) {
        console.log('    ' + link)
      })
    }
  }
  console.log()
  console.log()
  console.log('Lighthouse Summary'.bold.underline)
  console.log(`  Total Pages Scanned: ${stats.pageCount}`)
  console.log(`  Total Auditing Time: ${new Date() - stats.startTime} ms`)
  const totalTime = Object.keys(stats.auditTimesByPageUrl).reduce((sum, url) => {
    const {endTime, startTime} = stats.auditTimesByPageUrl[url]
    return (endTime - startTime) + sum
  }, 0)
  console.log(`  Average Page Audit Time: ${Math.round(totalTime / stats.pageCount)} ms`)
  console.log(`  Total Audits Passed: ${stats.passedAuditsCount}`, '\u2713'.green)
  if (Object.keys(stats.violationCounts).length === 0) {
    console.log(`  Total Violations: None! \\o/ 🎉`)
  } else {
    console.log(`  Total Violations:`)
    Object.keys(stats.violationCounts).forEach(category => {
      console.log(`    ${category}: ${stats.violationCounts[category]}`, '\u2717'.red)
    })
  }
}
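
Because index.js exports a single function, the crawler can also be driven programmatically rather than via cli.js. A minimal sketch (the require name assumes the package is installed as `light-mc-crawler`; note that `config` must be a path to a JSON file on disk, since the module reads it with `fs.readFileSync`):

```js
// Hypothetical programmatic usage of the exported function.
const lightMcCrawler = require('light-mc-crawler')

lightMcCrawler({
  url: 'https://www.example.com', // where the crawl starts
  config: './example.json'        // resolved relative to the current working directory
})
```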
23 changes: 23 additions & 0 deletions package.json
@@ -0,0 +1,23 @@
{
  "name": "light-mc-crawler",
  "version": "1.2.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "example": "node ./cli.js --url https://www.example.com --config example.json"
  },
  "bin": {
    "light-mc-crawler": "./cli.js"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "async": "^2.4.1",
    "cheerio": "^1.0.0-rc.1",
    "colors": "^1.1.2",
    "lighthouse": "^2.9.4",
    "simplecrawler": "^1.1.3",
    "yargs": "^8.0.2"
  }
}
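
The `example` script above runs cli.js against the bundled example.json, so a quick smoke test from a checkout of the repository (the target domain is a placeholder) is:

```bash
npm install
npm run example
```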
