Use of edu.uci.ics.crawler4j.parser.HtmlParseData in project crawler4j by yasserg.
From the class BasicCrawler, method visit:
/**
* This function is called when a page is fetched and ready to be processed
* by your program.
*/
@Override
public void visit(Page page) {
    int docid = page.getWebURL().getDocid();
    String url = page.getWebURL().getURL();
    String domain = page.getWebURL().getDomain();
    String path = page.getWebURL().getPath();
    String subDomain = page.getWebURL().getSubDomain();
    String parentUrl = page.getWebURL().getParentUrl();
    String anchor = page.getWebURL().getAnchor();
    logger.debug("Docid: {}", docid);
    logger.info("URL: {}", url);
    logger.debug("Domain: '{}'", domain);
    logger.debug("Sub-domain: '{}'", subDomain);
    logger.debug("Path: '{}'", path);
    logger.debug("Parent page: {}", parentUrl);
    logger.debug("Anchor text: {}", anchor);
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        Set<WebURL> links = htmlParseData.getOutgoingUrls();
        logger.debug("Text length: {}", text.length());
        logger.debug("Html length: {}", html.length());
        logger.debug("Number of outgoing links: {}", links.size());
    }
    Header[] responseHeaders = page.getFetchResponseHeaders();
    if (responseHeaders != null) {
        logger.debug("Response headers:");
        for (Header header : responseHeaders) {
            logger.debug("\t{}: {}", header.getName(), header.getValue());
        }
    }
    logger.debug("=============");
}
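A visit implementation like this only runs once a CrawlController has been configured and started. Below is a minimal launch sketch following the standard setup pattern from the crawler4j README; the storage folder, seed URL, thread count, and the class name BasicCrawlerLauncher are placeholder assumptions, not part of the example above.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class BasicCrawlerLauncher {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawler4j-storage"); // placeholder path
        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
        controller.addSeed("https://www.ics.uci.edu/"); // example seed
        controller.start(BasicCrawler.class, 7); // blocks until the crawl finishes
    }
}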
Use of edu.uci.ics.crawler4j.parser.HtmlParseData in project crawler4j by yasserg.
From the class Downloader, method processUrl:
public void processUrl(String url) {
    logger.debug("Processing: {}", url);
    Page page = download(url);
    if (page != null) {
        ParseData parseData = page.getParseData();
        if (parseData != null) {
            if (parseData instanceof HtmlParseData) {
                HtmlParseData htmlParseData = (HtmlParseData) parseData;
                logger.debug("Title: {}", htmlParseData.getTitle());
                logger.debug("Text length: {}", htmlParseData.getText().length());
                logger.debug("Html length: {}", htmlParseData.getHtml().length());
            }
        } else {
            logger.warn("Couldn't parse the content of the page.");
        }
    } else {
        logger.warn("Couldn't fetch the content of the page.");
    }
    logger.debug("==============");
}
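The download helper called above belongs to the Downloader example and is not shown on this page. For context, here is a plausible sketch built on crawler4j's PageFetcher and Parser, assumed to be held in pageFetcher and parser fields; the exact signatures of fetchContent and parse differ between crawler4j versions, so treat this as an assumption rather than the project's actual code.

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.url.WebURL;
import org.apache.http.HttpStatus;

private Page download(String url) {
    WebURL curURL = new WebURL();
    curURL.setURL(url);
    PageFetchResult fetchResult = null;
    try {
        fetchResult = pageFetcher.fetchPage(curURL);
        if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
            Page page = new Page(curURL);
            fetchResult.fetchContent(page);      // signature varies by version
            parser.parse(page, curURL.getURL()); // fills in the Page's ParseData
            return page;
        }
    } catch (Exception e) {
        logger.error("Error downloading {}", url, e);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed(); // release the connection
        }
    }
    return null;
}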
Use of edu.uci.ics.crawler4j.parser.HtmlParseData in project crawler4j by yasserg.
From the class LocalDataCollectorCrawler, method visit:
@Override
public void visit(Page page) {
    logger.info("Visited: {}", page.getWebURL().getURL());
    myCrawlStat.incProcessedPages();
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData parseData = (HtmlParseData) page.getParseData();
        Set<WebURL> links = parseData.getOutgoingUrls();
        myCrawlStat.incTotalLinks(links.size());
        try {
            myCrawlStat.incTotalTextSize(parseData.getText().getBytes("UTF-8").length);
        } catch (UnsupportedEncodingException ignored) {
            // UTF-8 is always supported, so this cannot happen.
        }
    }
    // Dump this crawler's statistics after every 50 processed pages.
    if ((myCrawlStat.getTotalProcessedPages() % 50) == 0) {
        dumpMyData();
    }
}
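The myCrawlStat object is the example's per-crawler statistics holder, which is not shown on this page. A minimal sketch consistent with the calls above follows; the field names are assumptions.

// Plain counters, one instance per crawler thread; in the example each
// crawler exposes its instance to the controller for aggregation.
public class CrawlStat {
    private long totalProcessedPages;
    private long totalLinks;
    private long totalTextSize;

    public void incProcessedPages() { totalProcessedPages++; }
    public void incTotalLinks(int count) { totalLinks += count; }
    public void incTotalTextSize(int size) { totalTextSize += size; }

    public long getTotalProcessedPages() { return totalProcessedPages; }
    public long getTotalLinks() { return totalLinks; }
    public long getTotalTextSize() { return totalTextSize; }
}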
Use of edu.uci.ics.crawler4j.parser.HtmlParseData in project crawler4j by yasserg.
From the class BasicCrawler, method visit:
@Override
public void visit(Page page) {
    int docid = page.getWebURL().getDocid();
    String url = page.getWebURL().getURL();
    int parentDocid = page.getWebURL().getParentDocid();
    logger.debug("Docid: {}", docid);
    logger.info("URL: {}", url);
    logger.debug("Docid of parent page: {}", parentDocid);
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        Set<WebURL> links = htmlParseData.getOutgoingUrls();
        logger.debug("Text length: {}", text.length());
        logger.debug("Html length: {}", html.length());
        logger.debug("Number of outgoing links: {}", links.size());
    }
    logger.debug("=============");
}
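In a WebCrawler subclass, visit is normally paired with a shouldVisit override that prunes the frontier before pages are fetched. A typical filter, following the pattern from the crawler4j README (in crawler4j 4.x the signature takes the referring Page; the domain restriction below is only an example):

import java.util.regex.Pattern;

private static final Pattern FILTERS =
        Pattern.compile(".*(\\.(css|js|gif|jpe?g|png|mp3|mp4|zip|gz))$");

@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
    String href = url.getURL().toLowerCase();
    // Skip static resources and stay within the seed domain.
    return !FILTERS.matcher(href).matches()
            && href.startsWith("https://www.ics.uci.edu/");
}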