-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_factordb.pl
125 lines (96 loc) · 3.49 KB
/
scrape_factordb.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/perl
# Daniel "Trizen" Șuteu
# Date: 09 December 2019
# https://github.com/trizen
# Extract factors from factordb.com for a given number, by scraping the website.
# Do NOT use this script! Use "get_factordb.pl" instead, which uses FactorDB's API.
use 5.020;
use warnings;
use experimental qw(signatures);
use CHI;
use WWW::Mechanize::Cached;
use URI::Escape qw(uri_escape);
use File::Basename qw(dirname);
use File::Spec::Functions qw(rel2abs catdir);
use constant {
    USE_TOR_PROXY => 1,    # true to use the Tor proxy to connect to factorDB (127.0.0.1:9050)
};

# Response cache, kept in a "cache" directory next to this script.
my $cache = CHI->new(
    driver   => 'BerkeleyDB',
    root_dir => catdir(dirname(rel2abs($0)), 'cache'),
);

# HTTP client used for every request in this script; it is handed $cache as
# its response cache.
my $mech = WWW::Mechanize::Cached->new(
    autocheck     => 1,
    show_progress => 0,
    stack_depth   => 10,
    cache         => $cache,
    agent         => "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0",
);

# Advertise the content encodings that can be decoded locally.
{
    # Load the module explicitly instead of relying on another module
    # having already pulled it in.
    require HTTP::Message;

    # Called in scalar context, so the result is a single header value.
    my $accepted_encodings = HTTP::Message::decodable();
    $mech->default_header('Accept-Encoding' => $accepted_encodings);
}

# Attach a connection cache to the client.
{
    require LWP::ConnCache;

    # Named $conn_cache so that it does not shadow the response cache ($cache).
    my $conn_cache = LWP::ConnCache->new;
    $conn_cache->total_capacity(undef);    # no limit
    $mech->conn_cache($conn_cache);
}

# Route both HTTP and HTTPS traffic through the local SOCKS proxy when enabled.
# NOTE(review): the "socks://" scheme presumably needs LWP::Protocol::socks
# to be installed -- confirm.
if (USE_TOR_PROXY) {
    $mech->proxy(['http', 'https'], "socks://127.0.0.1:9050");
}
# Fetch the FactorDB "showid" page for the given ID and return the content of
# its "Number" table cell, with all markup and whitespace removed.
#
# When the cell cannot be found, the last request is invalidated in the cache
# and the sub dies.
sub extract_from_id ($id) {

    my $page = $mech->get("http://factordb.com/index.php?showid=$id");

    # In list context the match yields its capture, or an empty list on failure.
    my ($number) = $page->decoded_content =~ m{<td align="center">Number</td>\s*<td align="center">(.*?)</td>}s;

    if (not defined $number) {
        $mech->invalidate_last_request;
        die "Failed to extract number from ID = $id\n";
    }

    # Strip HTML tags, then squeeze out every run of whitespace.
    $number =~ s/<(.*?)>//g;

    return join('', split(' ', $number));
}
# Main program: turn the command-line argument into a FactorDB URL, fetch the
# page, and print the factors found on it, one per line.

# Defined-or (rather than ||) so that the literal argument "0" is not
# mistaken for a missing argument.
my $expr = $ARGV[0] // '';
$expr = join('', split(' ', $expr));    # remove any whitespace

# Reject a missing, empty or whitespace-only argument.
length($expr)
  or die "usage: perl $0 [NUMBER | EXPR | URL]\n";

# A FactorDB URL (http or https, with or without "www.") is fetched as given;
# anything else is sent as a query expression.
my $main_url =
  ($expr =~ m{^https?://(?:www\.)?factordb\.com/})
  ? $expr
  : "http://factordb.com/index.php?query=" . uri_escape($expr);

my $resp    = $mech->get($main_url);
my $content = $resp->decoded_content;

# Capture from the first factor link after " = " up to the end of its table cell.
if ($content =~ m{ = \(?(<a href=".*?</td>)}) {

    my $factor_data = $1;

    # Invalidate request if `n` is not fully factorized.
    if (   $content =~ m{<td>(?:CF|C|U)\s*(?:<font color="#FF0000">\*</font>)?</td>}
        or $factor_data =~ m{<font color="#002099">}) {
        $mech->invalidate_last_request;
    }

    my @factors;

    # Each factor is a link carrying its ID, a font colour and the displayed
    # value, optionally followed by "^exponent".
    while ($factor_data =~ m{<a href="index\.php\?id=(\d+)"><font color="#(\d+)">([\d.^]+)</font></a>}g) {

        my ($id, $color, $n) = ($1, $2, $3);

        # Colour 000000 is treated as marking a prime factor.
        my $is_prime = ($color eq '000000');

        # Split a trailing "^k" off the displayed value.
        my $pow = 1;
        if ($n =~ s/\^(\d+)\z//) {
            $pow = $1;
        }

        if ($n =~ /\./) {
            # The displayed value contains dots (presumably an abbreviated
            # number), so fetch the full value by ID.
            push @factors, (extract_from_id($id)) x $pow;
        }
        elsif ($is_prime) {
            push @factors, ($n) x $pow;
        }
        else {
            # Not marked as prime: factor it locally.
            require Math::Prime::Util;
            my @f = Math::Prime::Util::factor($n);
            push @factors, (@f) x $pow;
        }
    }

    # A ")^k" at the end of the cell applies the exponent to the whole
    # factor list, so repeat the list k times.
    if ($content =~ m{\)\^(\d+)</td>}) {
        @factors = (@factors) x $1;
    }

    say for @factors;
}
else {
    # No factorization found: do not keep this response in the cache.
    $mech->invalidate_last_request;
    die "Failed: $main_url\n";
}