Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class SegmentHandler, method handle().
@Override
public void handle(Request req, HttpServletResponse res, String target, int dispatch) throws IOException, ServletException {
try {
String uri = req.getUri().toString();
LOG.info("URI: " + uri);
addMyHeader(res, "URI", uri);
Text url = new Text(uri);
CrawlDatum cd = seg.getCrawlDatum(url);
if (cd != null) {
addMyHeader(res, "Res", "found");
LOG.info("-got " + cd.toString());
ProtocolStatus ps = (ProtocolStatus) cd.getMetaData().get(Nutch.WRITABLE_PROTO_STATUS_KEY);
if (ps != null) {
Integer trCode = protoCodes.get(ps.getCode());
if (trCode != null) {
res.setStatus(trCode.intValue());
} else {
res.setStatus(HttpServletResponse.SC_OK);
}
addMyHeader(res, "ProtocolStatus", ps.toString());
} else {
res.setStatus(HttpServletResponse.SC_OK);
}
Content c = seg.getContent(url);
if (c == null) {
// missing content
req.setHandled(true);
res.addHeader("X-Handled-By", getClass().getSimpleName());
return;
}
byte[] data = c.getContent();
LOG.debug("-data len=" + data.length);
Metadata meta = c.getMetadata();
String[] names = meta.names();
LOG.debug("- " + names.length + " meta");
for (int i = 0; i < names.length; i++) {
boolean my = true;
char ch = names[i].charAt(0);
if (Character.isLetter(ch) && Character.isUpperCase(ch)) {
// pretty good chance it's a standard header
my = false;
}
String[] values = meta.getValues(names[i]);
for (int k = 0; k < values.length; k++) {
if (my) {
addMyHeader(res, names[i], values[k]);
} else {
res.addHeader(names[i], values[k]);
}
}
}
req.setHandled(true);
res.addHeader("X-Handled-By", getClass().getSimpleName());
res.setContentType(meta.get(Metadata.CONTENT_TYPE));
res.setContentLength(data.length);
OutputStream os = res.getOutputStream();
os.write(data, 0, data.length);
res.flushBuffer();
} else {
addMyHeader(res, "Res", "not found");
LOG.info(" -not found " + url);
}
} catch (Exception e) {
LOG.warn(StringUtils.stringifyException(e));
addMyHeader(res, "Res", "Exception: " + StringUtils.stringifyException(e));
}
}
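The handler above looks up a CrawlDatum by URL and reads typed values back out of its metadata map. That map is a plain Hadoop MapWritable keyed by Text, so the same pattern works outside the servlet context. A minimal, self-contained sketch, assuming nothing beyond the CrawlDatum API shown here (the class name and the "myKey" entry are purely illustrative):
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
public class CrawlDatumMetaDemo {
public static void main(String[] args) {
// 30-day fetch interval, given in seconds
CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 30 * 24 * 60 * 60);
MapWritable meta = datum.getMetaData();
meta.put(new Text("myKey"), new Text("myValue"));
// values come back as Writable and are cast to the expected type,
// just as SegmentHandler casts to ProtocolStatus above
Text stored = (Text) meta.get(new Text("myKey"));
System.out.println(stored);
}
}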
Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class FetcherThread, method output().
private ParseStatus output(Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status, int outlinkDepth) throws InterruptedException {
datum.setStatus(status);
datum.setFetchTime(System.currentTimeMillis());
if (pstatus != null)
datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
ParseResult parseResult = null;
if (content != null) {
Metadata metadata = content.getMetadata();
// store the guessed content type in the crawldatum
if (content.getContentType() != null)
datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));
// add segment to metadata
metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
// add score to content metadata so that ParseSegment can pick it up.
try {
scfilters.passScoreBeforeParsing(key, datum, content);
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn(getName() + " " + Thread.currentThread().getId() + " Couldn't pass score, url " + key + " (" + e + ")");
}
}
/*
* Note: Fetcher will only follow meta-redirects coming from the
* original URL.
*/
if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
if (!skipTruncated || !ParseSegment.isTruncated(content)) {
try {
parseResult = this.parseUtil.parse(content);
} catch (Exception e) {
LOG.warn(getName() + " " + Thread.currentThread().getId() + " Error parsing: " + key + ": " + StringUtils.stringifyException(e));
}
}
if (parseResult == null) {
byte[] signature = SignatureFactory.getSignature(conf).calculate(content, new ParseStatus().getEmptyParse(conf));
datum.setSignature(signature);
}
}
/*
* Store the status code in the content metadata so we can read this value
* during parsing (as a separate job) and decide whether to parse or not.
*/
content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
}
try {
context.write(key, new NutchWritable(datum));
if (content != null && storingContent)
context.write(key, new NutchWritable(content));
if (parseResult != null) {
for (Entry<Text, Parse> entry : parseResult) {
Text url = entry.getKey();
Parse parse = entry.getValue();
ParseStatus parseStatus = parse.getData().getStatus();
ParseData parseData = parse.getData();
if (!parseStatus.isSuccess()) {
LOG.warn(getName() + " " + Thread.currentThread().getId() + " Error parsing: " + key + ": " + parseStatus);
parse = parseStatus.getEmptyParse(conf);
}
// Calculate page signature. For non-parsing fetchers this will
// be done in ParseSegment
byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse);
// Ensure segment name and score are in parseData metadata
parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
// Pass fetch time to content meta
parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
if (url.equals(key))
datum.setSignature(signature);
try {
scfilters.passScoreAfterParsing(url, content, parse);
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn(getName() + " " + Thread.currentThread().getId() + " Couldn't pass score, url " + key + " (" + e + ")");
}
}
String origin = null;
// collect outlinks for subsequent db update
Outlink[] links = parseData.getOutlinks();
int outlinksToStore = Math.min(maxOutlinks, links.length);
if (ignoreExternalLinks || ignoreInternalLinks) {
URL originURL = new URL(url.toString());
// based on domain?
if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
origin = URLUtil.getDomainName(originURL).toLowerCase();
} else // use host
{
origin = originURL.getHost().toLowerCase();
}
}
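// For example (illustrative URL, not taken from the fetch): for https://www.example.com/page
// the "bydomain" mode sets origin to "example.com", while the host-based mode sets it to
// "www.example.com".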
// used by fetchNode
if (fetchNode != null) {
fetchNode.setOutlinks(links);
fetchNode.setTitle(parseData.getTitle());
FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode);
}
int validCount = 0;
// Process all outlinks, normalize, filter and deduplicate
List<Outlink> outlinkList = new ArrayList<>(outlinksToStore);
HashSet<String> outlinks = new HashSet<>(outlinksToStore);
for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
String toUrl = links[i].getToUrl();
toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, urlFiltersForOutlinks, urlExemptionFilters, normalizersForOutlinks);
if (toUrl == null) {
continue;
}
validCount++;
links[i].setUrl(toUrl);
outlinkList.add(links[i]);
outlinks.add(toUrl);
}
// Publish fetch report event
if (activatePublisher) {
FetcherThreadEvent reportEvent = new FetcherThreadEvent(PublishEventType.REPORT, url.toString());
reportEvent.addOutlinksToEventData(outlinkList);
reportEvent.addEventData(Nutch.FETCH_EVENT_TITLE, parseData.getTitle());
reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTTYPE, parseData.getContentMeta().get("content-type"));
reportEvent.addEventData(Nutch.FETCH_EVENT_SCORE, datum.getScore());
reportEvent.addEventData(Nutch.FETCH_EVENT_FETCHTIME, datum.getFetchTime());
reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTLANG, parseData.getContentMeta().get("content-language"));
publisher.publish(reportEvent, conf);
}
// Only process depth N outlinks
if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
FetchItem ft = FetchItem.create(url, null, queueMode);
FetchItemQueue queue = ((FetchItemQueues) fetchQueues).getFetchItemQueue(ft.queueID);
queue.alreadyFetched.add(url.toString().hashCode());
context.getCounter("FetcherOutlinks", "outlinks_detected").increment(outlinks.size());
// Counter to limit num outlinks to follow per page
int outlinkCounter = 0;
// Calculate variable number of outlinks by depth using the
// divisor (outlinks = Math.floor(divisor / depth * num.links))
int maxOutlinksByDepth = (int) Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);
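// Worked example with hypothetical settings divisor=2 and num.links=4 (note the
// integer division): depth 0 -> 2/1*4 = 8, depth 1 -> 2/2*4 = 4, depth 2 -> 2/3*4 = 0.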
String followUrl;
// Walk over the outlinks and add as new FetchItem to the queues
Iterator<String> iter = outlinks.iterator();
while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
followUrl = iter.next();
// Check whether we'll follow external outlinks
if (outlinksIgnoreExternal) {
if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
continue;
}
}
// Already followed?
int urlHashCode = followUrl.hashCode();
if (queue.alreadyFetched.contains(urlHashCode)) {
continue;
}
queue.alreadyFetched.add(urlHashCode);
// Create new FetchItem with depth incremented
FetchItem fit = FetchItem.create(new Text(followUrl), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1);
context.getCounter("FetcherOutlinks", "outlinks_following").increment(1);
((FetchItemQueues) fetchQueues).addFetchItem(fit);
outlinkCounter++;
}
}
// Overwrite the outlinks in ParseData with the normalized and
// filtered set
parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()]));
context.write(url, new NutchWritable(new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical())));
}
}
} catch (IOException e) {
if (LOG.isErrorEnabled()) {
LOG.error("fetcher caught:" + e.toString());
}
}
// return parse status if it exists
if (parseResult != null && !parseResult.isEmpty()) {
Parse p = parseResult.get(content.getUrl());
if (p != null) {
context.getCounter("ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()]).increment(1);
return p.getData().getStatus();
}
}
return null;
}
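The returned ParseStatus lets the caller distinguish a successful parse from a failed or skipped one. A hypothetical follow-up in calling code, assuming url, datum, content and pstatus are already in scope (this is a sketch, not the actual FetcherThread call site):
ParseStatus parseStatus = output(url, datum, content, pstatus,
CrawlDatum.STATUS_FETCH_SUCCESS, 0);
if (parseStatus != null && !parseStatus.isSuccess()) {
LOG.warn("Parse of " + url + " did not succeed: " + parseStatus);
}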
Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class FetcherThread, method handleRedirect().
private Text handleRedirect(Text url, CrawlDatum datum, String urlString, String newUrl, boolean temp, String redirType) throws MalformedURLException, URLFilterException, InterruptedException {
newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
newUrl = urlFilters.filter(newUrl);
if (newUrl == null || newUrl.equals(urlString)) {
LOG.debug(" - {} redirect skipped: {}", redirType, (newUrl != null ? "to same url" : "filtered"));
return null;
}
if (ignoreAlsoRedirects && (ignoreExternalLinks || ignoreInternalLinks)) {
try {
URL origUrl = new URL(urlString);
URL redirUrl = new URL(newUrl);
if (ignoreExternalLinks) {
String origHostOrDomain, newHostOrDomain;
if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
origHostOrDomain = URLUtil.getDomainName(origUrl).toLowerCase();
newHostOrDomain = URLUtil.getDomainName(redirUrl).toLowerCase();
} else {
// byHost
origHostOrDomain = origUrl.getHost().toLowerCase();
newHostOrDomain = redirUrl.getHost().toLowerCase();
}
if (!origHostOrDomain.equals(newHostOrDomain)) {
LOG.debug(" - ignoring redirect {} from {} to {} because external links are ignored", redirType, urlString, newUrl);
return null;
}
}
if (ignoreInternalLinks) {
String origHost = origUrl.getHost().toLowerCase();
String newHost = redirUrl.getHost().toLowerCase();
if (origHost.equals(newHost)) {
LOG.debug(" - ignoring redirect {} from {} to {} because internal links are ignored", redirType, urlString, newUrl);
return null;
}
}
} catch (MalformedURLException e) {
return null;
}
}
reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
url = new Text(newUrl);
if (maxRedirect > 0) {
redirecting = true;
redirectCount++;
LOG.debug(" - {} redirect to {} (fetching now)", redirType, url);
return url;
} else {
CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED, datum.getFetchInterval(), datum.getScore());
// transfer existing metadata
newDatum.getMetaData().putAll(datum.getMetaData());
try {
scfilters.initialScore(url, newDatum);
} catch (ScoringFilterException e) {
LOG.warn(StringUtils.stringifyException(e));
}
if (reprUrl != null) {
newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
}
output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
LOG.debug(" - {} redirect to {} (fetching later)", redirType, url);
return null;
}
}
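A non-null return value tells the caller to fetch the redirect target in the current round, while null means the redirect was skipped or recorded as a STATUS_LINKED datum for a later round. A hypothetical caller, with url, datum, urlString, newUrl, temp, queueMode and fetchQueues assumed to be in scope and "protocol" used only as an illustrative redirect-type label (this is not the actual Fetcher call site):
Text redirUrl = handleRedirect(url, datum, urlString, newUrl, temp, "protocol");
if (redirUrl != null) {
// follow the redirect immediately by queueing a fresh FetchItem for it
FetchItem redirItem = FetchItem.create(redirUrl, new CrawlDatum(), queueMode);
((FetchItemQueues) fetchQueues).addFetchItem(redirItem);
}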
Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class UpdateHostDbMapper, method map().
/**
* Mapper ingesting records from the HostDB, CrawlDB and plaintext host
* scores file. Statistics and scores are passed on.
*
* @param key record key: a URL (CrawlDB) or a hostname (HostDB / scores file)
* @param value a {@link CrawlDatum}, a {@link HostDatum} or a score as {@link Text}
* @param context the mapper context used to emit {@link NutchWritable} records
*/
public void map(Text key, Writable value, Context context) throws IOException, InterruptedException {
// Get the key!
String keyStr = key.toString();
// Check if we process records from the CrawlDB
if (key instanceof Text && value instanceof CrawlDatum) {
// Get the normalized and filtered host of this URL
buffer = filterNormalize(URLUtil.getHost(keyStr));
// Filtered out?
if (buffer == null) {
context.getCounter("UpdateHostDb", "filtered_records").increment(1);
LOG.info("UpdateHostDb: " + URLUtil.getHost(keyStr) + " crawldatum has been filtered");
return;
}
// Set the host of this URL
host.set(buffer);
crawlDatum = (CrawlDatum) value;
hostDatum = new HostDatum();
// Do not resolve homepages when the root URL is unfetched
if (crawlDatum.getStatus() != CrawlDatum.STATUS_DB_UNFETCHED) {
// Get the protocol
String protocol = URLUtil.getProtocol(keyStr);
// Get the proposed homepage URL
String homepage = protocol + "://" + buffer + "/";
// Check whether the current key equals the proposed homepage URL
if (keyStr.equals(homepage)) {
// Check if this is a redirect to the real home page
if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM || crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
// Obtain the repr url for this redirect via protocolstatus from the metadata
ProtocolStatus z = (ProtocolStatus) crawlDatum.getMetaData().get(Nutch.WRITABLE_PROTO_STATUS_KEY);
// Get the protocol status' arguments
args = z.getArgs();
// ..and the possible redirect URL
reprUrl = args[0];
// Is there a redirect target?
if (reprUrl != null) {
LOG.info("UpdateHostDb: homepage: " + keyStr + " redirects to: " + args[0]);
context.write(host, new NutchWritable(hostDatum));
hostDatum.setHomepageUrl(reprUrl);
} else {
LOG.info("UpdateHostDb: homepage: " + keyStr + " redirects to: " + args[0] + " but has been filtered out");
}
} else {
hostDatum.setHomepageUrl(homepage);
context.write(host, new NutchWritable(hostDatum));
LOG.info("UpdateHostDb: homepage: " + homepage);
}
}
}
// Always emit crawl datum
context.write(host, new NutchWritable(crawlDatum));
}
// Check if we got a record from the hostdb
if (key instanceof Text && value instanceof HostDatum) {
buffer = filterNormalize(keyStr);
// Filtered out?
if (buffer == null) {
context.getCounter("UpdateHostDb", "filtered_records").increment(1);
LOG.info("UpdateHostDb: " + key.toString() + " hostdatum has been filtered");
return;
}
// Get a HostDatum
hostDatum = (HostDatum) value;
key.set(buffer);
// reset statistics: they are re-aggregated from the CrawlDB records in this job anyway
if (readingCrawlDb) {
hostDatum.resetStatistics();
}
context.write(key, new NutchWritable(hostDatum));
}
// Check if we got a record with host scores
if (key instanceof Text && value instanceof Text) {
buffer = filterNormalize(keyStr);
// Filtered out?
if (buffer == null) {
context.getCounter("UpdateHostDb", "filtered_records").increment(1);
LOG.info("UpdateHostDb: " + key.toString() + " score has been filtered");
return;
}
key.set(buffer);
context.write(key, new NutchWritable(new FloatWritable(Float.parseFloat(value.toString()))));
}
}
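Because this single map() method accepts CrawlDatum, HostDatum and plain-text score records, the driver wires several inputs to the same mapper class. A rough sketch of such a job setup, with conf, crawlDb, hostDb and topHostsFile assumed to be in scope (illustrative only, not the actual UpdateHostDb driver code):
Job job = Job.getInstance(conf, "UpdateHostDb");
job.setJarByClass(UpdateHostDb.class);
// existing HostDB and the CrawlDB both feed UpdateHostDbMapper
MultipleInputs.addInputPath(job, new Path(hostDb, "current"),
SequenceFileInputFormat.class, UpdateHostDbMapper.class);
MultipleInputs.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME),
SequenceFileInputFormat.class, UpdateHostDbMapper.class);
// optional plain-text file with "host<TAB>score" lines
MultipleInputs.addInputPath(job, topHostsFile,
KeyValueTextInputFormat.class, UpdateHostDbMapper.class);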
Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class ParserChecker, method run().
public int run(String[] args) throws Exception {
boolean dumpText = false;
boolean force = false;
String contentType = null;
String url = null;
String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] [-md key=value] url";
if (args.length == 0) {
LOG.error(usage);
return (-1);
}
// used to simulate the metadata propagated from injection
HashMap<String, String> metadata = new HashMap<>();
for (int i = 0; i < args.length; i++) {
if (args[i].equals("-forceAs")) {
force = true;
contentType = args[++i];
} else if (args[i].equals("-dumpText")) {
dumpText = true;
} else if (args[i].equals("-md")) {
String k = null, v = null;
String nextOne = args[++i];
int firstEquals = nextOne.indexOf("=");
if (firstEquals != -1) {
k = nextOne.substring(0, firstEquals);
v = nextOne.substring(firstEquals + 1);
} else
k = nextOne;
metadata.put(k, v);
} else if (i != args.length - 1) {
LOG.error(usage);
System.exit(-1);
} else {
url = URLUtil.toASCII(args[i]);
}
}
if (LOG.isInfoEnabled()) {
LOG.info("fetching: " + url);
}
CrawlDatum cd = new CrawlDatum();
Iterator<String> iter = metadata.keySet().iterator();
while (iter.hasNext()) {
String key = iter.next();
String value = metadata.get(key);
if (value == null)
value = "";
cd.getMetaData().put(new Text(key), new Text(value));
}
ProtocolFactory factory = new ProtocolFactory(conf);
Protocol protocol = factory.getProtocol(url);
Text turl = new Text(url);
ProtocolOutput output = protocol.getProtocolOutput(turl, cd);
// if the configuration permits, handle redirects until we either run
// out of allowed redirects or we stop getting redirect statuses.
int maxRedirects = conf.getInt("http.redirect.max", 0);
int numRedirects = 0;
while (output.getStatus().isRedirect() && numRedirects < maxRedirects) {
String newURL = URLUtil.toASCII(output.getStatus().getArgs()[0]);
LOG.info("Handling redirect to " + newURL);
protocol = factory.getProtocol(newURL);
turl = new Text(newURL);
output = protocol.getProtocolOutput(turl, cd);
numRedirects++;
}
if (!output.getStatus().isSuccess()) {
System.err.println("Fetch failed with protocol status: " + output.getStatus());
if (output.getStatus().isRedirect()) {
System.err.println("Redirect(s) not handled due to configuration.");
System.err.println("Max Redirects to handle per config: " + maxRedirects);
System.err.println("Number of Redirects handled: " + numRedirects);
}
return (-1);
}
Content content = output.getContent();
if (content == null) {
LOG.error("No content for " + url);
return (-1);
}
if (force) {
content.setContentType(contentType);
} else {
contentType = content.getContentType();
}
if (contentType == null) {
LOG.error("Failed to determine content type!");
return (-1);
}
if (ParseSegment.isTruncated(content)) {
LOG.warn("Content is truncated, parse may fail!");
}
ScoringFilters scfilters = new ScoringFilters(conf);
// call the scoring filters
try {
scfilters.passScoreBeforeParsing(turl, cd, content);
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e + ")");
LOG.warn(StringUtils.stringifyException(e));
}
}
ParseResult parseResult = new ParseUtil(conf).parse(content);
if (parseResult == null) {
LOG.error("Parsing content failed!");
return (-1);
}
// calculate the signature
byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parseResult.get(new Text(url)));
if (LOG.isInfoEnabled()) {
LOG.info("parsing: " + url);
LOG.info("contentType: " + contentType);
LOG.info("signature: " + StringUtil.toHexString(signature));
}
Parse parse = parseResult.get(turl);
if (parse == null) {
LOG.error("Failed to get parse from parse result");
LOG.error("Available parses in parse result (by URL key):");
for (Map.Entry<Text, Parse> entry : parseResult) {
LOG.error(" " + entry.getKey());
}
LOG.error("Parse result does not contain a parse for URL to be checked:");
LOG.error(" " + turl);
return -1;
}
// call the scoring filters
try {
scfilters.passScoreAfterParsing(turl, content, parse);
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e + ")");
LOG.warn(StringUtils.stringifyException(e));
}
}
for (Map.Entry<Text, Parse> entry : parseResult) {
parse = entry.getValue();
LOG.info("---------\nUrl\n---------------\n");
System.out.print(entry.getKey());
LOG.info("\n---------\nParseData\n---------\n");
System.out.print(parse.getData().toString());
if (dumpText) {
LOG.info("---------\nParseText\n---------\n");
System.out.print(parse.getText());
}
}
return 0;
}
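In the Nutch distribution this checker is normally reached through the wrapper script, e.g. bin/nutch parsechecker -dumpText <url>. It can also be driven programmatically through Hadoop's ToolRunner; a small hypothetical driver, assuming ParserChecker is run as a Hadoop Tool the way its own main() runs it (the class name and URL are illustrative):
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.parse.ParserChecker;
import org.apache.nutch.util.NutchConfiguration;
public class ParserCheckerDemo {
public static void main(String[] args) throws Exception {
// equivalent to: bin/nutch parsechecker -dumpText https://nutch.apache.org/
int res = ToolRunner.run(NutchConfiguration.create(), new ParserChecker(),
new String[] { "-dumpText", "https://nutch.apache.org/" });
System.exit(res);
}
}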