Use of edu.uci.ics.crawler4j.crawler.exceptions.ParseException in project crawler4j by yasserg.
The class Parser, method parse:
public void parse(Page page, String contextURL) throws NotAllowedContentException, ParseException {
    if (Util.hasBinaryContent(page.getContentType())) {
        // BINARY
        BinaryParseData parseData = new BinaryParseData();
        if (config.isIncludeBinaryContentInCrawling()) {
            if (config.isProcessBinaryContentInCrawling()) {
                parseData.setBinaryContent(page.getContentData());
            } else {
                parseData.setHtml("<html></html>");
            }
            page.setParseData(parseData);
            if (parseData.getHtml() == null) {
                throw new ParseException();
            }
            parseData.setOutgoingUrls(Net.extractUrls(parseData.getHtml()));
        } else {
            throw new NotAllowedContentException();
        }
    } else if (Util.hasPlainTextContent(page.getContentType())) {
        // plain text
        try {
            TextParseData parseData = new TextParseData();
            if (page.getContentCharset() == null) {
                parseData.setTextContent(new String(page.getContentData()));
            } else {
                parseData.setTextContent(new String(page.getContentData(), page.getContentCharset()));
            }
            parseData.setOutgoingUrls(Net.extractUrls(parseData.getTextContent()));
            page.setParseData(parseData);
        } catch (Exception e) {
            logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
            throw new ParseException();
        }
    } else {
        // HTML
        Metadata metadata = new Metadata();
        HtmlContentHandler contentHandler = new HtmlContentHandler();
        try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) {
            htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
        } catch (Exception e) {
            logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
            throw new ParseException();
        }
        if (page.getContentCharset() == null) {
            page.setContentCharset(metadata.get("Content-Encoding"));
        }
        HtmlParseData parseData = new HtmlParseData();
        parseData.setText(contentHandler.getBodyText().trim());
        parseData.setTitle(metadata.get(DublinCore.TITLE));
        parseData.setMetaTags(contentHandler.getMetaTags());
        // Please note that identifying language takes less than 10 milliseconds
        LanguageIdentifier languageIdentifier = new LanguageIdentifier(parseData.getText());
        page.setLanguage(languageIdentifier.getLanguage());
        Set<WebURL> outgoingUrls = new HashSet<>();
        String baseURL = contentHandler.getBaseUrl();
        if (baseURL != null) {
            contextURL = baseURL;
        }
        int urlCount = 0;
        for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {
            String href = urlAnchorPair.getHref();
            if ((href == null) || href.trim().isEmpty()) {
                continue;
            }
            String hrefLoweredCase = href.trim().toLowerCase();
            if (!hrefLoweredCase.contains("javascript:") && !hrefLoweredCase.contains("mailto:") &&
                !hrefLoweredCase.contains("@")) {
                String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
                if (url != null) {
                    WebURL webURL = new WebURL();
                    webURL.setURL(url);
                    webURL.setTag(urlAnchorPair.getTag());
                    webURL.setAnchor(urlAnchorPair.getAnchor());
                    outgoingUrls.add(webURL);
                    urlCount++;
                    if (urlCount > config.getMaxOutgoingLinksToFollow()) {
                        break;
                    }
                }
            }
        }
        parseData.setOutgoingUrls(outgoingUrls);
        try {
            if (page.getContentCharset() == null) {
                parseData.setHtml(new String(page.getContentData()));
            } else {
                parseData.setHtml(new String(page.getContentData(), page.getContentCharset()));
            }
            page.setParseData(parseData);
        } catch (UnsupportedEncodingException e) {
            logger.error("error parsing the html: " + page.getWebURL().getURL(), e);
            throw new ParseException();
        }
    }
}
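For context, a minimal calling sketch (not from the crawler4j sources; the parseFetchedPage helper and its arguments are hypothetical) showing how a caller might invoke Parser.parse and handle the two exceptions it declares:

void parseFetchedPage(Parser parser, Page page) {
    try {
        // contextURL is the page's own URL, as WebCrawler.processPage does below
        parser.parse(page, page.getWebURL().getURL());
    } catch (NotAllowedContentException e) {
        // binary content that the configuration says not to include in the crawl
    } catch (ParseException e) {
        // content could not be parsed; no usable ParseData was attached to the page
    }
}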
Use of edu.uci.ics.crawler4j.crawler.exceptions.ParseException in project crawler4j by yasserg.
The class WebCrawler, method processPage:
private void processPage(WebURL curURL) {
    PageFetchResult fetchResult = null;
    try {
        if (curURL == null) {
            return;
        }
        fetchResult = pageFetcher.fetchPage(curURL);
        int statusCode = fetchResult.getStatusCode();
        // Finds the status reason for all known statuses
        handlePageStatusCode(curURL, statusCode,
            EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH));
        Page page = new Page(curURL);
        page.setFetchResponseHeaders(fetchResult.getResponseHeaders());
        page.setStatusCode(statusCode);
        if (statusCode < 200 || statusCode > 299) {
            // Not 2XX: 2XX status codes indicate success
            if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY ||
                statusCode == HttpStatus.SC_MOVED_TEMPORARILY ||
                statusCode == HttpStatus.SC_MULTIPLE_CHOICES ||
                statusCode == HttpStatus.SC_SEE_OTHER ||
                statusCode == HttpStatus.SC_TEMPORARY_REDIRECT ||
                statusCode == 308) {
                // is 3xx todo
                // follow https://issues.apache.org/jira/browse/HTTPCORE-389
                page.setRedirect(true);
                String movedToUrl = fetchResult.getMovedToUrl();
                if (movedToUrl == null) {
                    logger.warn("Unexpected error, URL: {} is redirected to NOTHING", curURL);
                    return;
                }
                page.setRedirectedToUrl(movedToUrl);
                onRedirectedStatusCode(page);
                if (myController.getConfig().isFollowRedirects()) {
                    int newDocId = docIdServer.getDocId(movedToUrl);
                    if (newDocId > 0) {
                        logger.debug("Redirect page: {} is already seen", curURL);
                        return;
                    }
                    WebURL webURL = new WebURL();
                    webURL.setURL(movedToUrl);
                    webURL.setParentDocid(curURL.getParentDocid());
                    webURL.setParentUrl(curURL.getParentUrl());
                    webURL.setDepth(curURL.getDepth());
                    webURL.setDocid(-1);
                    webURL.setAnchor(curURL.getAnchor());
                    if (shouldVisit(page, webURL)) {
                        if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
                            webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                            frontier.schedule(webURL);
                        } else {
                            logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy", webURL.getURL());
                        }
                    } else {
                        logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL());
                    }
                }
            } else {
                // All other http codes other than 3xx & 200
                // Finds the status reason for all known statuses
                String description = EnglishReasonPhraseCatalog.INSTANCE.getReason(fetchResult.getStatusCode(), Locale.ENGLISH);
                String contentType = fetchResult.getEntity() == null ? "" :
                    fetchResult.getEntity().getContentType() == null ? "" :
                    fetchResult.getEntity().getContentType().getValue();
                onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType, description);
            }
        } else {
            // if status code is 200
            if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
                if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
                    logger.debug("Redirect page: {} has already been seen", curURL);
                    return;
                }
                curURL.setURL(fetchResult.getFetchedUrl());
                curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
            }
            if (!fetchResult.fetchContent(page, myController.getConfig().getMaxDownloadSize())) {
                throw new ContentFetchException();
            }
            if (page.isTruncated()) {
                logger.warn("Warning: unknown page size exceeded max-download-size, truncated to: " +
                    "({}), at URL: {}", myController.getConfig().getMaxDownloadSize(), curURL.getURL());
            }
            parser.parse(page, curURL.getURL());
            if (shouldFollowLinksIn(page.getWebURL())) {
                ParseData parseData = page.getParseData();
                List<WebURL> toSchedule = new ArrayList<>();
                int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
                for (WebURL webURL : parseData.getOutgoingUrls()) {
                    webURL.setParentDocid(curURL.getDocid());
                    webURL.setParentUrl(curURL.getURL());
                    int newdocid = docIdServer.getDocId(webURL.getURL());
                    if (newdocid > 0) {
                        // This is not the first time that this Url is visited. So, we set the
                        // depth to a negative number.
                        webURL.setDepth((short) -1);
                        webURL.setDocid(newdocid);
                    } else {
                        webURL.setDocid(-1);
                        webURL.setDepth((short) (curURL.getDepth() + 1));
                        if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
                            if (shouldVisit(page, webURL)) {
                                if (robotstxtServer.allows(webURL)) {
                                    webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                                    toSchedule.add(webURL);
                                } else {
                                    logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy", webURL.getURL());
                                }
                            } else {
                                logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL());
                            }
                        }
                    }
                }
                frontier.scheduleAll(toSchedule);
            } else {
                logger.debug("Not looking for links in page {}, as per your \"shouldFollowLinksInPage\" policy", page.getWebURL().getURL());
            }
            visit(page);
        }
    } catch (PageBiggerThanMaxSizeException e) {
        onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize());
    } catch (ParseException pe) {
        onParseError(curURL);
    } catch (ContentFetchException cfe) {
        onContentFetchError(curURL);
    } catch (NotAllowedContentException nace) {
        logger.debug("Skipping: {} as it contains binary content which you configured not to crawl", curURL.getURL());
    } catch (Exception e) {
        onUnhandledException(curURL, e);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
}
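As a hedged illustration (not part of the crawler4j sources), a WebCrawler subclass can react to the ParseException path above by overriding onParseError, whose WebURL parameter matches the onParseError(curURL) call in processPage; the class name MyCrawler, the error counter, and the assumption that these callbacks are protected and overridable are illustrative only:

public class MyCrawler extends WebCrawler {

    private int parseErrors = 0;

    @Override
    protected void onParseError(WebURL webUrl) {
        // Invoked from processPage when parser.parse(...) throws ParseException
        parseErrors++;
    }

    @Override
    protected void onContentFetchError(WebURL webUrl) {
        // Invoked from processPage when fetchResult.fetchContent(...) fails
    }
}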