DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world

Snippets has posted 5883 posts at DZone. View Full User Profile


  • submit to reddit
        The core classes from my webcrawler implementation.

Again, this is not fully working code, as a number of dependencies are missing. It is intended for demonstration purposes only.

public class WebCrawler<T extends WebPage> implements Iterable<T>
    private final HashSet<T> visitedPages = new HashSet<T>();
    private final LinkedList<Object> workQueue = new LinkedList<Object>();
    private final PageProcessor<T> processor;
    // Map of URLs to pages.
    private final Map<String, WebPage> pages = new HashMap<String, WebPage>();     
    private final Predicate<Object> unvisited = new Predicate<Object>() { 
        public boolean satisfies(Object page){
            return !WebCrawler.this.visitedPages.contains(page);}};    

    public WebCrawler(PageProcessor<T> processor, String... urls){
        this.processor = processor;
        for (String url : urls){
    /* Iterator which iterates over all WebPages that haven't yet been visited.
     * It is thoroughly lazy and a web page will never be visited until it turns
     * up in this iterator.*/
    public final Iterator<T> pageIterator = (Iterator<T>)
        new FlatteningIterator(
            new ListeningIterator<Object>(
                    new FilterIterator<Object>(
                        new PoppingIterator<Object>(this.workQueue))){
                @Override public void onNext(Object next){
                    if (next instanceof WebPage){
                            new FilterIterator(unvisited, WebCrawler.this.processor.linkedPages((WebPage)next)));}}});
    public Iterator<T> iterator(){ return, this.pageIterator); }                        

 * Abstract class representing a mechanism for processing urls into pages. Contains 
 * utility methods and a cacheing strategy.
 * @author david
public abstract class PageProcessor<T extends WebPage> implements Transformer<String, T>
    private final PageCache<T> cache;
    private final IteratorTransformer<String, T> iteratorTransformer = new IteratorTransformer<String, T>(this);
    private Predicate<String> domain;
    public PageProcessor(Predicate<String> domain, PageCache<T> cache){
        this.domain = domain;
        this.cache = cache;}
    public PageProcessor(String domainPrefix, PageCache<T> cache){
        this(StringUtils.startsWith(domainPrefix), cache);}
     * Take the Url and return a WebPage corresponding to it.
    protected abstract T process(String url);
    public T transform(String url){ return; }
     * If the page has previously been processed, retrieve it from the internal cache.
     * Else process it and put it in the eternal cache.
    public T page(String url){
        T page = cache.getCachedPage(url);
        if (page == null){
                page = this.process(url);
        return page;}
     * Returns an iterator over all pages linked to by this page.
    public Iterator<T> linkedPages(WebPage page){
        return iteratorTransformer.transform(new FilterIterator(domain, page.getLinkUrls()));}

 * A very simple PageProcessor<WebPage> implementation based on the HTMLParser library
 * which uses a MapBackedPageCache.
 * @author david
public class HtmlParserPageProcessor extends PageProcessor<WebPage>
    private static NodeFilter ALLOWED_TAGS = new NodeFilter(){
        public boolean accept(Node node){ 
            return (node instanceof LinkTag) || (node instanceof TitleTag);}};
    public HtmlParserPageProcessor(Predicate<String> domain){
        super(domain,  new MapBackedPageCache<WebPage>());}
    public HtmlParserPageProcessor(String domain){
        super(domain,  new MapBackedPageCache<WebPage>());}
     * Fetches the resource represented by the URL, parses the HTML and extracts
     * the title element and all the links and uses them to build a WebPage object.
    public WebPage process(String url){
            Parser parser = new Parser(url);
            NodeIterator iterator = parser.parse(ALLOWED_TAGS).elements();
            String title = "";
            List<String> links = new ArrayList<String>();
            while (iterator.hasMoreNodes()){
                Node node = iterator.nextNode();
                if (node instanceof TitleTag) title = ((TitleTag)node).getTitle();
                else if (node instanceof LinkTag) links.add(((LinkTag)node).extractLink());}
            return new WebPage(url, title, links);}               
        catch (Exception e){ throw new RuntimeException(e); }}