Use of edu.uci.ics.crawler4j.parser.ParseData in project crawler4j by yasserg.
Class WebCrawler, method processPage:
private void processPage(WebURL curURL) {
    PageFetchResult fetchResult = null;
    try {
        if (curURL == null) {
            return;
        }
        fetchResult = pageFetcher.fetchPage(curURL);
        int statusCode = fetchResult.getStatusCode();
        // Finds the status reason for all known statuses
        handlePageStatusCode(curURL, statusCode,
                EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH));
        Page page = new Page(curURL);
        page.setFetchResponseHeaders(fetchResult.getResponseHeaders());
        page.setStatusCode(statusCode);
        if (statusCode < 200 || statusCode > 299) {
            // Not 2xx: 2xx status codes indicate success
            if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY ||
                    statusCode == HttpStatus.SC_MOVED_TEMPORARILY ||
                    statusCode == HttpStatus.SC_MULTIPLE_CHOICES ||
                    statusCode == HttpStatus.SC_SEE_OTHER ||
                    statusCode == HttpStatus.SC_TEMPORARY_REDIRECT ||
                    statusCode == 308) { // 308 Permanent Redirect
                // 3xx redirect; TODO: follow https://issues.apache.org/jira/browse/HTTPCORE-389
                page.setRedirect(true);
                String movedToUrl = fetchResult.getMovedToUrl();
                if (movedToUrl == null) {
                    logger.warn("Unexpected error, URL: {} is redirected to NOTHING", curURL);
                    return;
                }
                page.setRedirectedToUrl(movedToUrl);
                onRedirectedStatusCode(page);
                if (myController.getConfig().isFollowRedirects()) {
                    int newDocId = docIdServer.getDocId(movedToUrl);
                    if (newDocId > 0) {
                        logger.debug("Redirect page: {} is already seen", curURL);
                        return;
                    }
                    // Schedule the redirect target as a new URL with the same parent and depth.
                    WebURL webURL = new WebURL();
                    webURL.setURL(movedToUrl);
                    webURL.setParentDocid(curURL.getParentDocid());
                    webURL.setParentUrl(curURL.getParentUrl());
                    webURL.setDepth(curURL.getDepth());
                    webURL.setDocid(-1);
                    webURL.setAnchor(curURL.getAnchor());
                    if (shouldVisit(page, webURL)) {
                        if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
                            webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                            frontier.schedule(webURL);
                        } else {
                            logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy",
                                    webURL.getURL());
                        }
                    } else {
                        logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL());
                    }
                }
            } else {
                // Any other HTTP status code (neither 2xx nor a handled 3xx redirect)
                // Finds the status reason for all known statuses
                String description = EnglishReasonPhraseCatalog.INSTANCE.getReason(fetchResult.getStatusCode(),
                        Locale.ENGLISH);
                String contentType = fetchResult.getEntity() == null ? "" :
                        fetchResult.getEntity().getContentType() == null ? "" :
                        fetchResult.getEntity().getContentType().getValue();
                onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType, description);
            }
        } else {
            // Status code is 2xx (success)
            if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
                if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
                    logger.debug("Redirect page: {} has already been seen", curURL);
                    return;
                }
                curURL.setURL(fetchResult.getFetchedUrl());
                curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
            }
            if (!fetchResult.fetchContent(page, myController.getConfig().getMaxDownloadSize())) {
                throw new ContentFetchException();
            }
            if (page.isTruncated()) {
                logger.warn("Warning: unknown page size exceeded max-download-size, truncated to: ({}), at URL: {}",
                        myController.getConfig().getMaxDownloadSize(), curURL.getURL());
            }
            parser.parse(page, curURL.getURL());
            if (shouldFollowLinksIn(page.getWebURL())) {
                ParseData parseData = page.getParseData();
                List<WebURL> toSchedule = new ArrayList<>();
                int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
                for (WebURL webURL : parseData.getOutgoingUrls()) {
                    webURL.setParentDocid(curURL.getDocid());
                    webURL.setParentUrl(curURL.getURL());
                    int newdocid = docIdServer.getDocId(webURL.getURL());
                    if (newdocid > 0) {
                        // This is not the first time this URL has been seen, so set the depth to a negative number.
                        webURL.setDepth((short) -1);
                        webURL.setDocid(newdocid);
                    } else {
                        webURL.setDocid(-1);
                        webURL.setDepth((short) (curURL.getDepth() + 1));
                        if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
                            if (shouldVisit(page, webURL)) {
                                if (robotstxtServer.allows(webURL)) {
                                    webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                                    toSchedule.add(webURL);
                                } else {
                                    logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy",
                                            webURL.getURL());
                                }
                            } else {
                                logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL());
                            }
                        }
                    }
                }
                frontier.scheduleAll(toSchedule);
            } else {
                logger.debug("Not looking for links in page {}, as per your \"shouldFollowLinksInPage\" policy",
                        page.getWebURL().getURL());
            }
            visit(page);
        }
    } catch (PageBiggerThanMaxSizeException e) {
        onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize());
    } catch (ParseException pe) {
        onParseError(curURL);
    } catch (ContentFetchException cfe) {
        onContentFetchError(curURL);
    } catch (NotAllowedContentException nace) {
        logger.debug("Skipping: {} as it contains binary content which you configured not to crawl", curURL.getURL());
    } catch (Exception e) {
        onUnhandledException(curURL, e);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
}
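
For orientation, here is a minimal sketch (not taken from the crawler4j sources) of a WebCrawler subclass that consumes the ParseData populated by processPage above. The class name MyCrawler and the binary-extension filter are illustrative assumptions, not part of the library.

// Illustrative sketch only: MyCrawler and the URL filter are assumptions, not crawler4j code.
import java.util.regex.Pattern;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.ParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    // Hypothetical filter: skip common binary resources.
    private static final Pattern BINARY = Pattern.compile(".*\\.(gif|jpg|png|pdf|zip)$");

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        return !BINARY.matcher(url.getURL().toLowerCase()).matches();
    }

    @Override
    public void visit(Page page) {
        // By the time visit(page) is called from processPage, the page has been
        // fetched and parsed, so getParseData() is populated.
        ParseData parseData = page.getParseData();
        if (parseData instanceof HtmlParseData) {
            HtmlParseData html = (HtmlParseData) parseData;
            logger.info("Visited: {} ({} outgoing links)",
                    page.getWebURL().getURL(), html.getOutgoingUrls().size());
        }
    }
}

Such a subclass is typically registered with a CrawlController (for example via controller.start(MyCrawler.class, numberOfCrawlers)), which drives the processPage loop shown above.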
Use of edu.uci.ics.crawler4j.parser.ParseData in project crawler4j by yasserg.
Class Downloader, method processUrl:
public void processUrl(String url) {
    logger.debug("Processing: {}", url);
    Page page = download(url);
    if (page != null) {
        ParseData parseData = page.getParseData();
        if (parseData != null) {
            if (parseData instanceof HtmlParseData) {
                HtmlParseData htmlParseData = (HtmlParseData) parseData;
                logger.debug("Title: {}", htmlParseData.getTitle());
                logger.debug("Text length: {}", htmlParseData.getText().length());
                logger.debug("Html length: {}", htmlParseData.getHtml().length());
            }
        } else {
            logger.warn("Couldn't parse the content of the page.");
        }
    } else {
        logger.warn("Couldn't fetch the content of the page.");
    }
    logger.debug("==============");
}
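
The download(url) helper is not part of this excerpt. The following is only a plausible sketch assembled from the calls that appear in processPage above (pageFetcher.fetchPage, fetchContent, parser.parse, discardContentIfNotConsumed); the fields pageFetcher, parser, config, and logger are assumptions about how the actual Downloader class is wired.

// Sketch only: the real Downloader may differ; pageFetcher, parser, and config are assumed fields.
private Page download(String url) {
    WebURL curURL = new WebURL();
    curURL.setURL(url);
    PageFetchResult fetchResult = null;
    try {
        fetchResult = pageFetcher.fetchPage(curURL);
        if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
            Page page = new Page(curURL);
            if (fetchResult.fetchContent(page, config.getMaxDownloadSize())) {
                parser.parse(page, curURL.getURL());
                return page;
            }
        }
    } catch (Exception e) {
        logger.error("Error while downloading: {}", url, e);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
    return null;
}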