Java Web Crawler

Add the following JAR file to your CLASSPATH:

From Google Drive
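
If you want to confirm that the JAR is actually visible to the JVM, a quick diagnostic is to load the crawler's entry class reflectively. This sketch is only a sanity check, not part of the library:

public class ClasspathCheck {

    public static void main(String[] args) {
        try {
            // Resolves only if the crawler JAR is on the CLASSPATH.
            Class.forName("com.nsl.web.crawling.MainCrawler");
            System.out.println("Crawler JAR found on the CLASSPATH.");
        } catch (ClassNotFoundException e) {
            System.out.println("Crawler JAR is missing from the CLASSPATH.");
        }
    }
}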

How to use

TestCrawler.java

package nsl.test;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.nsl.web.crawling.MainCrawler;

public class TestCrawler extends MainCrawler {

    public TestCrawler(String entryUrl) {
        super(entryUrl);
    }

    public TestCrawler(String entryURL, int threadPoolSize) {
        super(entryURL, threadPoolSize);
    }

    // Placeholders: the URL of the page that lists further crawl targets, and
    // the site domain used to turn relative links into absolute URLs.
    private String urlPatternToGetNextTarget = "...";
    private String domain = "...";

    @Override
    public List<String> findNextTargets(String html, String thisPageUrl) {
        // Implement how to find targets to process next
        System.out.println(thisPageUrl);
        if (!thisPageUrl.equals(urlPatternToGetNextTarget)) {
            System.out.println("This is not the place to find next targets");
            return null;
        }
        System.out.println("Start to find targets to crawl next.");
        List<String> nextTargets = new LinkedList<>();
        Document fullDocument = Jsoup.parse(html);
        Element postListBody = fullDocument.getElementById("postListBody");
        Elements items = postListBody.getElementsByClass("item");
        for (Element item : items) {
            Element linkElem = item.getElementsByClass("link pcol2").get(0);
            String link = linkElem.attr("href");
            nextTargets.add(domain + link);
        }
        System.out.println(nextTargets);
        return nextTargets;
    }

    // Thread-safe counter used to give each saved page a unique file name.
    AtomicLong al = new AtomicLong(0L);

    @Override
    public void processPage(String html, String thisPageUrl) {
        // Implement how to process each web page.
        // You may make use of Jsoup to parse the HTML.
        long id = al.incrementAndGet();
        // try-with-resources closes the writer even if write() throws
        try (BufferedWriter bw = new BufferedWriter(
                new FileWriter(new File("[url to store file]/" + id + ".html")))) {
            bw.write(html);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
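
The selectors in findNextTargets above are specific to one site's markup. If you just want every in-domain link on a page, a more generic override can let Jsoup resolve relative hrefs against the page URL. The following is a minimal sketch, not part of the library; the allowedPrefix filter is an assumption you would adapt to your target site.

    @Override
    public List<String> findNextTargets(String html, String thisPageUrl) {
        // Parse with the page URL as the base URI so relative hrefs resolve to absolute URLs.
        Document doc = Jsoup.parse(html, thisPageUrl);
        List<String> nextTargets = new LinkedList<>();
        String allowedPrefix = "https://example.com/"; // assumption: keep the crawl on one site
        for (Element anchor : doc.select("a[href]")) {
            String absolute = anchor.absUrl("href"); // empty string if the href cannot be resolved
            if (absolute.startsWith(allowedPrefix)) {
                nextTargets.add(absolute);
            }
        }
        return nextTargets;
    }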

Main.java

import java.util.concurrent.ExecutionException;

public class Main {

    public static void main(String[] args) {
        TestCrawler crawler = new TestCrawler("https://www.google.com/");
        // crawler.setCookie(...);
        // crawler.setRequestProperty(...);
        try {
            crawler.run();
        } catch (InterruptedException | ExecutionException e) {
            // Process exceptions
        }
    }
}
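
The commented-out setCookie and setRequestProperty calls mark where request headers are attached before run() starts. Their exact signatures are defined by MainCrawler (see the documentation below); the sketch assumes simple (name, value) pairs, which you should verify before relying on it:

import java.util.concurrent.ExecutionException;

public class ConfiguredMain {

    public static void main(String[] args) {
        // Two-argument constructor from TestCrawler above: entry URL plus thread pool size.
        TestCrawler crawler = new TestCrawler("https://www.google.com/", 8);
        // Assumed (name, value) signatures; confirm against the Javadoc in docs.
        crawler.setRequestProperty("User-Agent", "Mozilla/5.0 (compatible; TestCrawler/1.0)");
        crawler.setCookie("sessionid", "abc123"); // hypothetical cookie name and value
        try {
            crawler.run();
        } catch (InterruptedException | ExecutionException e) {
            e.printStackTrace();
        }
    }
}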

Documentation

See index.html in the docs directory.
