DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world.
Webcrawler
The core classes from my webcrawler implementation.
Again, this is not really working code, as a number of dependencies are missing. It is intended for demonstration purposes only.
/**
 * Crawls web pages starting from a set of seed URLs and exposes them as an
 * {@link Iterable}.
 *
 * Pages and their outgoing links are obtained from the supplied PageProcessor.
 * The crawl itself is driven by {@link #pageIterator}: work is only done as
 * that iterator is advanced.
 *
 * @param <T> the page type produced by the PageProcessor
 */
public class WebCrawler<T extends WebPage> implements Iterable<T>
{
// Pages that have already been seen by onNext() of the page iterator below.
private final HashSet<T> visitedPages = new HashSet<T>();
// Pending work. Holds two kinds of entries, hence the Object element type:
// pages (added by the constructor) and iterators over linked pages (added by onNext()).
private final LinkedList<Object> workQueue = new LinkedList<Object>();
// Source of pages for seed URLs and of the linked pages of a visited page.
private final PageProcessor<T> processor;
// Map of URLs to pages.
// NOTE(review): never read or written anywhere in this class - looks like dead code; confirm before removing.
private final Map<String, WebPage> pages = new HashMap<String, WebPage>();
// Accepts every object that is not contained in visitedPages. Only WebPage
// instances are ever added to visitedPages, so the iterator entries in the
// work queue are not rejected by this check.
private final Predicate<Object> unvisited = new Predicate<Object>() {
public boolean satisfies(Object page){
return !WebCrawler.this.visitedPages.contains(page);}};
/**
 * Creates a crawler seeded with the given URLs.
 *
 * Each seed URL is passed to processor.page(url) here, in the constructor,
 * and the resulting page is appended to the work queue.
 *
 * @param processor turns URLs into pages and supplies the pages linked from a page
 * @param urls the seed URLs to start crawling from
 */
public WebCrawler(PageProcessor<T> processor, String... urls){
this.processor = processor;
for (String url : urls){
this.workQueue.add(processor.page(url));}}
/* Iterator which iterates over all WebPages that haven't yet been visited.
 * According to the original author it is thoroughly lazy: a page is only
 * recorded as visited once it turns up in this iterator.
 *
 * Layers, innermost first:
 *   PoppingIterator   - reads entries from workQueue (presumably removing them as it goes; confirm).
 *   FilterIterator    - drops entries rejected by the unvisited predicate.
 *   ListeningIterator - calls onNext() for each entry that gets through (see below).
 *   FlatteningIterator- presumably expands the queued iterators into their pages; confirm.
 *
 * NOTE(review): onNext() is attached underneath the FlatteningIterator, i.e. to
 * the entries of the work queue. Whether pages coming out of the queued
 * linked-page iterators also reach onNext() (and so get marked visited and have
 * their own links queued) depends on the helper classes - confirm.
 *
 * This field is initialised before the constructor body runs, so it relies on
 * workQueue and unvisited being declared above it. */
public final Iterator<T> pageIterator = (Iterator<T>)
new FlatteningIterator(
new ListeningIterator<Object>(
new FilterIterator<Object>(
unvisited,
new PoppingIterator<Object>(this.workQueue))){
// For a WebPage entry: record it as visited, then queue an iterator over its
// linked pages, filtered with the same unvisited predicate. Entries that are
// not WebPages (the queued iterators) are ignored here.
@Override public void onNext(Object next){
if (next instanceof WebPage){
WebCrawler.this.visitedPages.add((T)next);
WebCrawler.this.workQueue.add(
new FilterIterator(unvisited, WebCrawler.this.processor.linkedPages((WebPage)next)));}}});
/**
 * Returns the result of IteratorUtils.link applied to an iterator over the
 * already visited pages and {@link #pageIterator} - presumably the visited
 * pages first, followed by the rest of the crawl; confirm against IteratorUtils.
 *
 * Note that pageIterator is a single shared instance, so every iterator
 * returned from here advances the same underlying crawl.
 */
public Iterator<T> iterator(){ return IteratorUtils.link(this.visitedPages.iterator(), this.pageIterator); }
}
/**
 * Base class for strategies that turn a URL into a page object.
 *
 * Subclasses provide the actual conversion through {@link #process(String)};
 * this class wraps it with a cache lookup and offers a helper for walking the
 * links of a page.
 *
 * @param <T> the page type produced by this processor
 * @author david
 */
public abstract class PageProcessor<T extends WebPage> implements Transformer<String, T>
{
    /** Stores pages that have already been processed. */
    private final PageCache<T> cache;

    /** Iterator-level transformer constructed around this processor. */
    private final IteratorTransformer<String, T> iteratorTransformer = new IteratorTransformer<String, T>(this);

    /** Decides which link URLs are followed by {@link #linkedPages(WebPage)}. */
    private Predicate<String> domain;

    /**
     * @param domain predicate applied to link URLs; only accepted URLs are followed
     * @param cache  cache consulted and filled by {@link #page(String)}
     */
    public PageProcessor(Predicate<String> domain, PageCache<T> cache)
    {
        this.cache = cache;
        this.domain = domain;
    }

    /**
     * Convenience constructor that builds the domain predicate from a prefix
     * via {@code StringUtils.startsWith(domainPrefix)}.
     *
     * @param domainPrefix prefix handed to {@code StringUtils.startsWith}
     * @param cache        cache consulted and filled by {@link #page(String)}
     */
    public PageProcessor(String domainPrefix, PageCache<T> cache)
    {
        this(StringUtils.startsWith(domainPrefix), cache);
    }

    /**
     * Takes the URL and returns the page corresponding to it.
     *
     * @param url the URL to process
     * @return the page built for that URL
     */
    protected abstract T process(String url);

    /**
     * Transformer entry point; simply delegates to {@link #page(String)}.
     *
     * @param url the URL to look up or process
     * @return the page for that URL
     */
    public T transform(String url)
    {
        return page(url);
    }

    /**
     * Returns the page for the given URL, using the cache where possible.
     *
     * If the cache already holds a page for the URL, that page is returned.
     * Otherwise the URL is processed, the result is stored in the cache and
     * then returned.
     *
     * @param url the URL to look up or process
     * @return the cached or freshly processed page
     */
    public T page(String url)
    {
        T cached = this.cache.getCachedPage(url);
        if (cached != null)
        {
            return cached;
        }

        T processed = process(url);
        this.cache.cachePage(processed);
        return processed;
    }

    /**
     * Returns an iterator over the pages linked to by the given page.
     *
     * The page's link URLs are first filtered with the domain predicate and the
     * remaining URLs are then handed to the iterator transformer built around
     * this processor.
     *
     * @param page the page whose links should be followed
     * @return an iterator over the linked pages
     */
    public Iterator<T> linkedPages(WebPage page)
    {
        FilterIterator acceptedUrls = new FilterIterator(domain, page.getLinkUrls());
        return iteratorTransformer.transform(acceptedUrls);
    }
}
/**
 * A very simple {@code PageProcessor<WebPage>} implementation based on the
 * HTMLParser library which uses a {@code MapBackedPageCache}.
 *
 * @author david
 */
public class HtmlParserPageProcessor extends PageProcessor<WebPage>
{
    /**
     * Accepts only the node types that {@link #process(String)} reads: link
     * tags and title tags. Declared final because it is a shared constant that
     * must never be reassigned.
     */
    private static final NodeFilter ALLOWED_TAGS = new NodeFilter() {
        public boolean accept(Node node) {
            return (node instanceof LinkTag) || (node instanceof TitleTag);
        }
    };

    /**
     * @param domain predicate passed to the superclass together with a new
     *               {@code MapBackedPageCache}
     */
    public HtmlParserPageProcessor(Predicate<String> domain) {
        super(domain, new MapBackedPageCache<WebPage>());
    }

    /**
     * @param domain string passed to the superclass (which treats it as a
     *               domain prefix) together with a new {@code MapBackedPageCache}
     */
    public HtmlParserPageProcessor(String domain) {
        super(domain, new MapBackedPageCache<WebPage>());
    }

    /**
     * Parses the resource at the given URL, extracts the title and all links,
     * and uses them to build a WebPage object.
     *
     * If several title tags are present the last one wins; if none is present
     * the title is the empty string.
     *
     * @param url the URL of the page to parse
     * @return a WebPage built from the URL, its title and its links
     * @throws RuntimeException if anything goes wrong while parsing; the
     *         original exception is attached as the cause and the message names
     *         the URL that failed
     */
    public WebPage process(String url) {
        try {
            Parser parser = new Parser(url);
            NodeIterator iterator = parser.parse(ALLOWED_TAGS).elements();
            String title = "";
            List<String> links = new ArrayList<String>();
            while (iterator.hasMoreNodes()) {
                Node node = iterator.nextNode();
                if (node instanceof TitleTag) {
                    title = ((TitleTag) node).getTitle();
                } else if (node instanceof LinkTag) {
                    links.add(((LinkTag) node).extractLink());
                }
            }
            return new WebPage(url, title, links);
        } catch (Exception e) {
            // Same contract as before (every failure surfaces as a RuntimeException
            // wrapping the cause), but the message now says which URL failed.
            throw new RuntimeException("Failed to process page at " + url, e);
        }
    }
}




