This repository was archived by the owner on Mar 25, 2022. It is now read-only.

Commit 8c7a7a7

90% done

10 files changed, +585 -0 lines changed

.gitignore

+1
@@ -0,0 +1 @@
node_modules

README.md

+11
@@ -0,0 +1,11 @@
# Architecture Philosophy

This bot employs a 'Divide and Conquer' strategy.

**What it doesn't do:** grab all the document numbers first and only then scrape them.

**What it does:** grab a batch of document numbers (say, 100 from 5 index pages), scrape that batch, then continue where it left off.

---
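As a sketch of that strategy (all names below are illustrative, not the bot's real API; the actual work is split across index.js, indexer.js and scraper.js):

var PAGES_PER_BATCH = 5 // roughly 100 document numbers per batch

function indexBatch(firstPage) {
    // stand-in for indexer.js: collect doc numbers from a few index pages
    var docs = []
    for (var p = firstPage; p < firstPage + PAGES_PER_BATCH; p++) {
        docs.push('doc-from-page-' + p)
    }
    return docs
}

function scrapeBatch(docs) {
    // stand-in for scraper.js: fetch and store each document
    docs.forEach(function(doc) { console.log('scraping', doc) })
}

var page = 1
while (page <= 10) {
    scrapeBatch(indexBatch(page)) // each batch is fully scraped before
    page += PAGES_PER_BATCH       // the next one is indexed, so progress
}                                 // can be checkpointed between batches

Checkpointing between batches is what backlog.json and logger.js provide.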

backlog.json

+1
@@ -0,0 +1 @@
{"first":{"doc":"SCE0000144093728","page":-1},"last":{"doc":"DCR0000001000809","page":-1}}

index.js

+101
@@ -0,0 +1,101 @@
var table_name = 'documents'

var fs = require('fs')

var store = require('./store')
var scraper = require('./scraper')
var indexer = require('./indexer')

var log = require('./logger')


// Load backlog into the store

log.load()


// Set up the data dump file, named after the current timestamp

var iso_date = new Date().toISOString()
store.data_filename = 'data' + iso_date + '.sql'

// Column list mirrors the fields of the doc object built in scraper.js
var sql_insert_query = 'INSERT INTO ' + table_name + ' (doc_number, type, info, energy_class, address, location, town, country, issue_date, expiry_date, expert_name, expert_number, reg_name, reg_number) VALUES '

try {
    fs.writeFileSync(store.data_filename, sql_insert_query)
}
catch (e) {
    console.log(e)
    process.exit(1)
}


// Write to the dump file when the scraper queue gets empty,
// then restart the process

scraper.on('drain', function() {
    fs.appendFile('message.txt', JSON.stringify(store.documents, null, '\t'), function(err) {
        if (err) throw err
        console.log('Data File: Write Complete')
    })

    console.log(store.backlog.last.doc)
})

indexer.on('drain', function() {

})

var constants = {
    index: {
        uri: 'http://www.adene.pt/sce/micro/certificados-energeticos?',
        querystring: 'tipo_cert=Todos&tipo_ed=Todos&morada=&concelho=all&distrito=all&freguesia=all&conservatoria=&conservatoria_nr=&artigo=&fracao=&numero=&op=Pesquisar&form_build_id=form-qpN7d8_HPQqSQJGhFxB024FLI8tBZLX_naofWt_Mwlo&form_id=certificados_webservice_form'
    }
}

var lib = {

    get_index_uri: function(page_number) {
        return constants.index.uri + 'page=' + page_number + '&' + constants.index.querystring
    },

    start_indexer: function(page_number, offset) {
        // Queue the index pages [page_number, offset)
        var page_list = []

        for (var i = page_number; i < offset; i++) {
            page_list.push(lib.get_index_uri(i))
        }

        indexer.queue(page_list)
    }
}


/** Queue Pages **/

// Still missing its page range; with no arguments the loop above
// never runs and nothing is queued
lib.start_indexer()


/** Queue Certificates **/

// Pass uri_list
scraper.queue(['http://www.adene.pt/sce/certificados/SCE0000144093728', 'http://www.adene.pt/sce/certificados/DCR0000001000809'])


/** On Exit **/

process.on('exit', function(code) {

    console.log('\n## EXITING - Code ' + code + ' ##')

    // Save the backlog from the store
    log.save()
})
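For reference, a hedged usage sketch of the pagination helpers above; the page range is illustrative (the bot currently calls start_indexer with no arguments):

// Queue index pages 1 through 4 (range is illustrative)
lib.start_indexer(1, 5)

// lib.get_index_uri(2) then returns
// 'http://www.adene.pt/sce/micro/certificados-energeticos?page=2&tipo_cert=Todos&...'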

indexer.js

+21
@@ -0,0 +1,21 @@
/** INDEXER **/
var store = require('./store')

var Crawler = require('crawler')

var indexer = new Crawler({
    maxConnections: 1,

    callback: function(err, res, done) {

        if (err) {
            console.log(err)
            return done()
        }

        var $ = res.$

        // Incomplete: the document URIs found on this index page
        // should be collected into store.documents.uri_list here

        done()
    }
})

exports = module.exports = indexer
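The callback above is a stub. A hedged sketch of the missing step, assuming the index pages link to certificates under /sce/certificados/ (that selector is an assumption, not verified against the site):

// Hypothetical body for the callback above (selector is assumed)
$('a[href*="/sce/certificados/"]').each(function() {
    store.documents.uri_list.push($(this).attr('href'))
})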

logger.js

+65
@@ -0,0 +1,65 @@
var fs = require('fs')
var store = require('./store')

var log = module.exports = {

    filename: 'backlog.json',

    struct: {
        first: {
            doc: null,
            page: null
        },
        last: {
            doc: null,
            page: null
        }
    },

    create: function() {

        // 'wx' fails when the file exists, so an existing backlog
        // is never overwritten
        fs.open(this.filename, 'wx', function(err, fd) {
            if (err) {
                if (err.code === 'EEXIST') {
                    console.log('[Logger] Log file already exists')
                    return
                } else throw err
            } else {
                fs.writeFile(fd, JSON.stringify(log.struct), function(err) {
                    if (err)
                        console.log('[Log File] Write Failed')
                })
            }
        })
    },

    save: function() {
        try {
            fs.writeFileSync('./' + this.filename, JSON.stringify(store.backlog))
        } catch (e) {
            console.log('[Log File] Dump failed')
        }
    },

    load: function() {
        try {
            var data = fs.readFileSync('./' + this.filename)
            store.backlog = JSON.parse(data.toString())
        }
        catch (err) {
            throw err
        }
    }
}

log.create()

message.txt

+232
Large diff not rendered.

package.json

+11
@@ -0,0 +1,11 @@
{
  "name": "adene-bot",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "",
  "license": "ISC"
}

scraper.js

+115
@@ -0,0 +1,115 @@
var Crawler = require('crawler')
var store = require('./store')
var fs = require('fs')

var scraper = new Crawler({
    maxConnections: 10,

    callback: function(err, res, done) {

        if (err) {
            console.log(err)
            return done()
        }

        var $ = res.$

        // Registry Object
        var ident_string = $('.perito-item-ident').text().trim()
        var ident = registry_data(ident_string)

        // Energy Class, taken from the certificate image filename
        var img_str = $('#cert-image').attr('src').trim()
        var img_name = img_str.substring(img_str.lastIndexOf('/') + 1, img_str.length)
        var energy_class = img_name.split('.')[0]

        // Verify dates: anything shorter than dd-mm-yyyy is treated as missing
        var issue_date = $('.perito-item-data-emissao *')[0].next.data.trim()
        if (issue_date.length < 10)
            issue_date = 'NULL'

        var expiry_date = $('.perito-item-validade *')[0].next.data.trim()
        if (expiry_date.length < 10)
            expiry_date = 'NULL'


        // Compile final data
        var doc = {
            doc_number: $('#doc_certificado strong').text(),
            type: $('.perito-item:nth-of-type(3)').clone().children().remove().end().text().trim(),
            info: $('#texto_certificado > span').text().trim(),
            energy_class: energy_class,
            address: $('.perito-item:nth-of-type(4)').clone().children().remove().end().text().trim(),
            location: $('.perito-item-localidade *')[0].next.data.trim(),
            town: $('.perito-item-freguesia *')[0].next.data.trim(),
            country: $('.perito-item-concelho *')[0].next.data.trim(),
            issue_date: issue_date,
            expiry_date: expiry_date,
            expert_name: $('.perito-item-nome *')[0].next.data.trim(),
            expert_number: $('.perito-item-cod *')[0].next.data.trim(),
            reg_name: ident.name,
            reg_number: ident.number
        }

        // Add to the SQL insert statement
        // (note: values are not escaped, so a stray quote in the
        // scraped data will break the generated SQL)
        var sql_valstring = '('
        for (var key in doc) {
            sql_valstring += "'" + doc[key] + "',"
        }
        sql_valstring = sql_valstring.slice(0, -1)
        sql_valstring += '),'

        try {
            fs.appendFileSync('./' + store.data_filename, sql_valstring)
        }
        catch (err) {
            console.log(err)
            return done()
        }

        // Save doc to the in-memory dump
        store.documents.data.push(doc)

        // Mark last saved doc
        store.backlog.last.doc = doc.doc_number

        // Mark first doc
        if (!store.flag_first_read) {
            store.flag_first_read = true
            store.backlog.first.doc = doc.doc_number
        }

        done()
    }
})

/** Strip out info from the registry information string **/

function registry_data(string) {

    // Collapse runs of whitespace so the patterns match across line breaks
    string = string.replace(/\s\s+/g, ' ')

    var reg_name_rx = /Imóvel descrito na Conservatória do (.*?) sob o nº/gi
    var reg_no_rx = /sob o nº (.*)/gi

    var reg_name, reg_no
    try {
        reg_name = reg_name_rx.exec(string)[1]
        reg_no = reg_no_rx.exec(string)[1]
    }
    catch (e) {
        // Fall back to placeholders when the string doesn't match
        reg_name = '-'
        reg_no = '-'
    }

    return {
        name: reg_name,
        number: reg_no
    }
}

exports = module.exports = scraper
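A usage sketch of registry_data; the input string is invented, but shaped like the text the two regexes expect:

registry_data('Imóvel descrito na Conservatória do Registo Predial de Lisboa sob o nº 1234')
// -> { name: 'Registo Predial de Lisboa', number: '1234' }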

store.js

+27
@@ -0,0 +1,27 @@
module.exports = {

    flag_first_read: false,
    data_filename: '',

    documents: {

        uri_list: [],
        data: []

    },

    backlog: {

        first: {
            doc: 0,
            page: -1
        },

        last: {
            doc: 0,
            page: -1
        }

    }
}
