Search in sources :

Example 1 with HtmlParseData

use of edu.uci.ics.crawler4j.parser.HtmlParseData in project crawler4j by yasserg.

the class BasicCrawler method visit.

/**
 * Invoked once a page has been fetched and is ready to be processed.
 * Logs the identifying details of the page's URL, then the sizes of the
 * parsed content when the parse data is HTML, and finally every fetch
 * response header that is available.
 *
 * @param page the fetched page to report on
 */
@Override
public void visit(Page page) {
    WebURL webUrl = page.getWebURL();
    int docid = webUrl.getDocid();
    String url = webUrl.getURL();
    String domain = webUrl.getDomain();
    String path = webUrl.getPath();
    String subDomain = webUrl.getSubDomain();
    String parentUrl = webUrl.getParentUrl();
    String anchor = webUrl.getAnchor();

    logger.debug("Docid: {}", docid);
    logger.info("URL: {}", url);
    logger.debug("Domain: '{}'", domain);
    logger.debug("Sub-domain: '{}'", subDomain);
    logger.debug("Path: '{}'", path);
    logger.debug("Parent page: {}", parentUrl);
    logger.debug("Anchor text: {}", anchor);

    // Content statistics are only reported for HTML parse data.
    Object parsed = page.getParseData();
    if (parsed instanceof HtmlParseData) {
        HtmlParseData htmlData = (HtmlParseData) parsed;
        Set<WebURL> outgoing = htmlData.getOutgoingUrls();
        logger.debug("Text length: {}", htmlData.getText().length());
        logger.debug("Html length: {}", htmlData.getHtml().length());
        logger.debug("Number of outgoing links: {}", outgoing.size());
    }

    // The header array may be absent; skip the section entirely in that case.
    Header[] headers = page.getFetchResponseHeaders();
    if (headers != null) {
        logger.debug("Response headers:");
        for (int i = 0; i < headers.length; i++) {
            Header current = headers[i];
            logger.debug("\t{}: {}", current.getName(), current.getValue());
        }
    }

    logger.debug("=============");
}
Also used : Header(org.apache.http.Header) WebURL(edu.uci.ics.crawler4j.url.WebURL) HtmlParseData(edu.uci.ics.crawler4j.parser.HtmlParseData)

Example 2 with HtmlParseData

use of edu.uci.ics.crawler4j.parser.HtmlParseData in project crawler4j by yasserg.

the class Downloader method processUrl.

/**
 * Downloads the given URL and logs a short summary of the result: the title
 * and the text/HTML lengths when the parse data is HTML, or a warning when
 * the page could not be fetched or its content could not be parsed.
 *
 * @param url the address to download and summarize
 */
public void processUrl(String url) {
    logger.debug("Processing: {}", url);
    Page fetched = download(url);
    if (fetched == null) {
        logger.warn("Couldn't fetch the content of the page.");
    } else {
        ParseData parsed = fetched.getParseData();
        if (parsed == null) {
            logger.warn("Couldn't parse the content of the page.");
        } else if (parsed instanceof HtmlParseData) {
            HtmlParseData htmlData = (HtmlParseData) parsed;
            String title = htmlData.getTitle();
            logger.debug("Title: {}", title);
            int textLength = htmlData.getText().length();
            logger.debug("Text length: {}", textLength);
            int htmlLength = htmlData.getHtml().length();
            logger.debug("Html length: {}", htmlLength);
        }
    }
    // The separator is written on every path, successful or not.
    logger.debug("==============");
}
Also used : ParseData(edu.uci.ics.crawler4j.parser.ParseData) HtmlParseData(edu.uci.ics.crawler4j.parser.HtmlParseData) Page(edu.uci.ics.crawler4j.crawler.Page) HtmlParseData(edu.uci.ics.crawler4j.parser.HtmlParseData)

Example 3 with HtmlParseData

use of edu.uci.ics.crawler4j.parser.HtmlParseData in project crawler4j by yasserg.

the class LocalDataCollectorCrawler method visit.

/**
 * Records statistics for a visited page: counts it as processed and, when
 * its parse data is HTML, adds its outgoing link count and the UTF-8 byte
 * size of its text to the running totals. The collected statistics are
 * dumped each time the processed-page total is a multiple of 50.
 *
 * @param page the fetched page to account for
 */
@Override
public void visit(Page page) {
    logger.info("Visited: {}", page.getWebURL().getURL());
    myCrawlStat.incProcessedPages();
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData parseData = (HtmlParseData) page.getParseData();
        Set<WebURL> links = parseData.getOutgoingUrls();
        myCrawlStat.incTotalLinks(links.size());
        // getBytes(Charset) declares no checked exception, so the text size is
        // always recorded and no exception has to be silently swallowed.
        byte[] utf8Text = parseData.getText().getBytes(java.nio.charset.StandardCharsets.UTF_8);
        myCrawlStat.incTotalTextSize(utf8Text.length);
    }
    // We dump this crawler statistics after processing every 50 pages
    if ((myCrawlStat.getTotalProcessedPages() % 50) == 0) {
        dumpMyData();
    }
}
Also used : WebURL(edu.uci.ics.crawler4j.url.WebURL) UnsupportedEncodingException(java.io.UnsupportedEncodingException) HtmlParseData(edu.uci.ics.crawler4j.parser.HtmlParseData)

Example 4 with HtmlParseData

use of edu.uci.ics.crawler4j.parser.HtmlParseData in project crawler4j by yasserg.

the class BasicCrawler method visit.

/**
 * Logs the document id, URL and parent document id of a fetched page and,
 * when its parse data is HTML, the text length, HTML length and number of
 * outgoing links.
 *
 * @param page the fetched page to report on
 */
@Override
public void visit(Page page) {
    WebURL webUrl = page.getWebURL();
    int docid = webUrl.getDocid();
    String url = webUrl.getURL();
    int parentDocid = webUrl.getParentDocid();

    logger.debug("Docid: {}", docid);
    logger.info("URL: {}", url);
    logger.debug("Docid of parent page: {}", parentDocid);

    // Content statistics are only reported for HTML parse data.
    Object parsed = page.getParseData();
    if (parsed instanceof HtmlParseData) {
        HtmlParseData htmlData = (HtmlParseData) parsed;
        Set<WebURL> outgoing = htmlData.getOutgoingUrls();
        logger.debug("Text length: {}", htmlData.getText().length());
        logger.debug("Html length: {}", htmlData.getHtml().length());
        logger.debug("Number of outgoing links: {}", outgoing.size());
    }

    logger.debug("=============");
}
Also used : WebURL(edu.uci.ics.crawler4j.url.WebURL) HtmlParseData(edu.uci.ics.crawler4j.parser.HtmlParseData)

Example 5 with HtmlParseData

use of edu.uci.ics.crawler4j.parser.HtmlParseData in project crawler4j by yasserg.

the class BasicCrawler method visit.

/**
 * Reports on a fetched page: its document id, URL and the document id of
 * its parent are logged first, followed by the sizes of the text, the HTML
 * and the outgoing link set when the parse data is HTML.
 *
 * @param page the fetched page to report on
 */
@Override
public void visit(Page page) {
    WebURL pageUrl = page.getWebURL();
    int docid = pageUrl.getDocid();
    String url = pageUrl.getURL();
    int parentDocid = pageUrl.getParentDocid();

    logger.debug("Docid: {}", docid);
    logger.info("URL: {}", url);
    logger.debug("Docid of parent page: {}", parentDocid);

    Object parseResult = page.getParseData();
    boolean isHtml = parseResult instanceof HtmlParseData;
    if (isHtml) {
        HtmlParseData html = (HtmlParseData) parseResult;
        Set<WebURL> outgoingLinks = html.getOutgoingUrls();
        int textLength = html.getText().length();
        logger.debug("Text length: {}", textLength);
        int htmlLength = html.getHtml().length();
        logger.debug("Html length: {}", htmlLength);
        logger.debug("Number of outgoing links: {}", outgoingLinks.size());
    }

    logger.debug("=============");
}
Also used : WebURL(edu.uci.ics.crawler4j.url.WebURL) HtmlParseData(edu.uci.ics.crawler4j.parser.HtmlParseData)

Aggregations

HtmlParseData (edu.uci.ics.crawler4j.parser.HtmlParseData): 7, WebURL (edu.uci.ics.crawler4j.url.WebURL): 6, Page (edu.uci.ics.crawler4j.crawler.Page): 1, ParseData (edu.uci.ics.crawler4j.parser.ParseData): 1, UnsupportedEncodingException (java.io.UnsupportedEncodingException): 1, Header (org.apache.http.Header): 1