-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler.php
110 lines (94 loc) · 3.1 KB
/
crawler.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
<?php
// Functions
/**
 * Performs an HTTP GET request with cURL and returns the response body.
 *
 * Redirects are followed and the request is sent with a Googlebot user-agent string.
 *
 * @param string $link URL to request.
 * @return string|null Response body, or null when the cURL handle cannot be
 *                     created or the transfer fails.
 */
function sendGet(string $link) : ?string
{
    $ch = curl_init();
    if ($ch === false) {
        return null;
    }
    curl_setopt($ch, CURLOPT_URL, $link);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)");
    $response = curl_exec($ch);
    curl_close($ch);

    // curl_exec() yields false on failure. Returning it directly would let the
    // ?string return type coerce it to "" (no strict_types here), so callers
    // checking for null would never see the failure.
    return is_string($response) ? $response : null;
}

/**
 * Downloads one category listing page and extracts the news IDs it links to,
 * together with the link to the next listing page, if any.
 *
 * @param string $category_link URL of the listing page.
 * @return array|null ["ids" => string[], "next_page" => string|null], or null
 *                    when the page could not be fetched or the ID scan failed.
 */
function checkCategory(string $category_link) : ?array
{
    $category_data = sendGet($category_link);
    if ($category_data === null) {
        return null;
    }

    // Detect next page: the "next" anchor carries a site-relative ajax path,
    // which is turned into an absolute URL.
    $next_page = null;
    $regex = '/<a href=\"(\/fa\/section\/ajax\/([0-9]+)\/([0-9]+))\" class=\"next\">/i';
    if (preg_match($regex, $category_data, $match) === 1) {
        $next_page = "https://www.yjc.ir" . $match[1];
    }

    // Detect IDs: every link of the form /fa/news/<digits>.
    $regex = '/href=\"\/fa\/news\/([0-9]+)/i';
    preg_match_all($regex, $category_data, $matches);
    if (!$matches || !isset($matches[1])) {
        return null;
    }

    return [
        "ids" => $matches[1],
        "next_page" => $next_page,
    ];
}

/**
 * Collects the news IDs of a category by walking its listing pages through
 * their "next" links, printing progress counters along the way.
 *
 * Pagination stops when a page has no "next" link, when a page cannot be
 * fetched, or when a "next" link points to a page that was already visited.
 *
 * @param string $category_link URL of the first listing page of the category.
 * @return array List of unique numeric ID strings, re-indexed from 0; empty
 *               when the first page cannot be fetched.
 */
function parseCategory(string $category_link): array
{
    $category_page_ids = checkCategory($category_link);
    if ($category_page_ids === null) {
        return [];
    }

    $ids = (array) $category_page_ids["ids"];
    print count($ids) . "\n";

    // Remember requested pages so a self-referencing "next" link cannot loop forever.
    $visited = [$category_link => true];
    $next_page = $category_page_ids["next_page"];

    while ($next_page !== null && !isset($visited[$next_page])) {
        $visited[$next_page] = true;
        print "next page: " . $next_page . ": ";

        $category_page_ids = checkCategory($next_page);
        if ($category_page_ids === null) {
            // A failed fetch must not be indexed like an array (count(null)
            // throws on PHP 8); keep what was collected so far.
            print "fetch failed\n";
            break;
        }

        $next_ids = (array) $category_page_ids["ids"];
        print count($next_ids) . " <-> ";
        foreach ($next_ids as $next_id) {
            $ids[] = $next_id;
        }
        print count($ids) . "\n";

        $next_page = $category_page_ids["next_page"];
    }

    // Filter and remove duplicates, then re-index so the result is a plain list.
    return array_values(array_filter(array_unique($ids), 'is_numeric'));
}
/**
 * Derives a short name from a category URL by removing every occurrence of
 * the "https://www.yjc.ir/fa/" prefix from it.
 *
 * @param string $link Category URL.
 * @return string The link with the site prefix removed; unchanged when the
 *                prefix does not occur.
 */
function link2name(string $link) : string
{
    $site_prefix = "https://www.yjc.ir/fa/";
    $name = str_replace($site_prefix, "", $link);

    return $name;
}
// Category landing pages to crawl.
$categories = [
    "https://www.yjc.ir/fa/political",
    "https://www.yjc.ir/fa/world",
    "https://www.yjc.ir/fa/sports",
    "https://www.yjc.ir/fa/social",
    "https://www.yjc.ir/fa/comercial",
    "https://www.yjc.ir/fa/art",
    "https://www.yjc.ir/fa/science",
    "https://www.yjc.ir/fa/multimedia",
    "https://www.yjc.ir/fa/photo",
    "https://www.yjc.ir/fa/states",
    "https://www.yjc.ir/fa/freereporter",
    "https://www.yjc.ir/fa/netsearching",
];

// Main: crawl every category and store its news IDs as a JSON list in
// "<category name>.json", relative to the current working directory.
foreach ($categories as $category_link) {
    print "$category_link\n";
    $ids = parseCategory($category_link);

    $output_file = link2name($category_link) . ".json";
    $json = json_encode($ids);
    if ($json === false) {
        // Report instead of writing a broken file.
        print "could not encode IDs for $category_link: " . json_last_error_msg() . "\n";
        continue;
    }

    // file_put_contents() returns false on failure; surface it so a missing
    // output file does not go unnoticed.
    if (file_put_contents($output_file, $json) === false) {
        print "could not write $output_file\n";
    }
}
// foreach ($categories as $category_link) {
// $ids = json_decode(file_get_contents(link2name($category_link) . ".json"), true);
// foreach ($ids as $id) {
// print "$id\n";
// }
// }