File tree Expand file tree Collapse file tree 3 files changed +31
-5
lines changed
webmagic-core/src/main/java/us/codecraft/webmagic
webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples Expand file tree Collapse file tree 3 files changed +31
-5
lines changed Original file line number Diff line number Diff line change @@ -28,13 +28,15 @@ Release Notes
2828
2929 }
3030
31+ 增加一个Spider.test(url)方法,用于开发爬虫时进行调试。
32+
3133增加基于redis的分布式支持。
3234
3335增加XPath2.0语法支持(webmagic-saxon模块)。
3436
3537增加基于Selenium的浏览器渲染支持,用于抓取动态加载内容(webmagic-selenium模块)。
3638
37- 修复一些已有bug 。
39+ 修复了不支持https的bug。
3840
3941补充了文档:[webmagic-0.2.0用户手册](http://code4craft.github.io/webmagic/)。
4042
Original file line number Diff line number Diff line change @@ -220,9 +220,17 @@ private void destroyEach(Object object) {
220220 }
221221 }
222222
223- public void test (String url ){
223+ /**
224+ * 用某些特定URL进行爬虫测试
225+ * @param urls 要抓取的url
226+ */
227+ public void test (String ... urls ){
224228 checkComponent ();
225- processRequest (new Request (url ));
229+ if (urls .length >0 ){
230+ for (String url : urls ) {
231+ processRequest (new Request (url ));
232+ }
233+ }
226234 }
227235
228236 private void processRequest (Request request ) {
Original file line number Diff line number Diff line change 88import us .codecraft .webmagic .model .annotation .HelpUrl ;
99import us .codecraft .webmagic .model .annotation .TargetUrl ;
1010import us .codecraft .webmagic .pipeline .JsonFilePageModelPipeline ;
11+ import us .codecraft .webmagic .scheduler .FileCacheQueueScheduler ;
1112
1213import java .util .List ;
1314
@@ -32,12 +33,19 @@ public class GithubRepo implements HasKey {
3233 @ ExtractBy (value = "//div[@class='repository-lang-stats']//li//span[@class='lang']" ,multi = true )
3334 private List <String > language ;
3435
36+ @ ExtractBy ("//a[@class='social-count js-social-count']/text()" )
37+ private String star ;
38+
39+ @ ExtractBy ("//a[@class='social-count js-social-count']/text()" )
40+ private String fork ;
41+
3542 @ ExtractByUrl
3643 private String url ;
3744
3845 public static void main (String [] args ) {
39- OOSpider .create (Site .me ().addStartUrl ("https://github.com/explore" ).setSleepTime (0 ),
40- new JsonFilePageModelPipeline (), GithubRepo .class ).thread (15 ).run ();
46+ OOSpider .create (Site .me ().addStartUrl ("https://github.com/explore" ).setSleepTime (0 ).setRetryTimes (3 ),
47+ new JsonFilePageModelPipeline (), GithubRepo .class )
48+ .scheduler (new FileCacheQueueScheduler ("/data/webmagic/cache/" )).thread (15 ).run ();
4149 }
4250
4351 @ Override
@@ -64,4 +72,12 @@ public List<String> getLanguage() {
6472 public String getUrl () {
6573 return url ;
6674 }
75+
76+ public String getStar () {
77+ return star ;
78+ }
79+
80+ public String getFork () {
81+ return fork ;
82+ }
6783}
You can’t perform that action at this time.
0 commit comments