Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.
Class TestParseData, method testParseData:
/**
 * Verifies that a {@link ParseData} instance survives a Writable
 * serialization/deserialization round trip with all of its parts
 * (status, title, outlinks and metadata) intact.
 */
@Test
public void testParseData() throws Exception {
    String pageTitle = "The Foo Page";
    Outlink[] links = {
        new Outlink("http://foo.com/", "Foo"),
        new Outlink("http://bar.com/", "Bar")
    };
    Metadata meta = new Metadata();
    meta.add("Language", "en/us");
    meta.add("Charset", "UTF-8");
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, pageTitle, links, meta);
    WritableTestUtils.testWritable(parseData, null);
}
Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.
Class SegmentHandler, method handle:
/**
 * Serves a previously crawled page from the segment: looks up the CrawlDatum
 * and Content for the request URI, maps the stored protocol status to an HTTP
 * status code, copies stored metadata into response headers, and writes the
 * raw content bytes back to the client.
 *
 * Diagnostic values ("URI", "Res", "ProtocolStatus") are emitted through
 * addMyHeader; stored metadata entries whose name starts with an upper-case
 * letter are assumed to be standard HTTP headers and are forwarded as-is.
 *
 * @param req the incoming Jetty request; marked handled on success
 * @param res the servlet response to populate
 * @param target the request target (unused)
 * @param dispatch the dispatch mode (unused)
 * @throws IOException on response output errors (other exceptions are
 *         caught, logged, and reported via a "Res" diagnostic header)
 */
@Override
public void handle(Request req, HttpServletResponse res, String target, int dispatch) throws IOException, ServletException {
    try {
        // getUri() result is converted to String once; reused below.
        String uri = req.getUri().toString();
        LOG.info("URI: " + uri);
        addMyHeader(res, "URI", uri);
        Text url = new Text(uri);
        CrawlDatum cd = seg.getCrawlDatum(url);
        if (cd != null) {
            addMyHeader(res, "Res", "found");
            LOG.info("-got " + cd.toString());
            ProtocolStatus ps = (ProtocolStatus) cd.getMetaData().get(Nutch.WRITABLE_PROTO_STATUS_KEY);
            if (ps != null) {
                // Translate the stored protocol status into an HTTP status
                // code, defaulting to 200 OK when no mapping exists.
                Integer trCode = protoCodes.get(ps.getCode());
                if (trCode != null) {
                    res.setStatus(trCode.intValue());
                } else {
                    res.setStatus(HttpServletResponse.SC_OK);
                }
                addMyHeader(res, "ProtocolStatus", ps.toString());
            } else {
                res.setStatus(HttpServletResponse.SC_OK);
            }
            Content c = seg.getContent(url);
            if (c == null) {
                // missing content: the datum exists but no stored bytes;
                // finish the request without a body
                req.setHandled(true);
                res.addHeader("X-Handled-By", getClass().getSimpleName());
                return;
            }
            byte[] data = c.getContent();
            LOG.debug("-data len=" + data.length);
            Metadata meta = c.getMetadata();
            String[] names = meta.names();
            LOG.debug("- " + names.length + " meta");
            for (int i = 0; i < names.length; i++) {
                boolean my = true;
                char ch = names[i].charAt(0);
                if (Character.isLetter(ch) && Character.isUpperCase(ch)) {
                    // pretty good chance it's a standard header
                    my = false;
                }
                String[] values = meta.getValues(names[i]);
                for (int k = 0; k < values.length; k++) {
                    if (my) {
                        addMyHeader(res, names[i], values[k]);
                    } else {
                        res.addHeader(names[i], values[k]);
                    }
                }
            }
            req.setHandled(true);
            res.addHeader("X-Handled-By", getClass().getSimpleName());
            res.setContentType(meta.get(Metadata.CONTENT_TYPE));
            res.setContentLength(data.length);
            OutputStream os = res.getOutputStream();
            os.write(data, 0, data.length);
            res.flushBuffer();
        } else {
            addMyHeader(res, "Res", "not found");
            LOG.info(" -not found " + url);
        }
    } catch (Exception e) {
        // Log once through the configured logger; the previous
        // e.printStackTrace() duplicated this on stderr and is removed.
        LOG.warn(StringUtils.stringifyException(e));
        addMyHeader(res, "Res", "Exception: " + StringUtils.stringifyException(e));
    }
}
Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.
Class FetcherThread, method output:
/**
 * Records the outcome of fetching one URL: updates the CrawlDatum status and
 * fetch time, optionally parses the content inline, normalizes/filters the
 * outlinks, enqueues depth-limited follow-up links, optionally publishes a
 * fetch report event, and writes datum/content/parse records to the
 * MapReduce context.
 *
 * @param key the URL that was fetched
 * @param datum crawl state for the URL; mutated in place (status, fetch
 *        time, metadata, signature)
 * @param content fetched content, or null when nothing was fetched
 * @param pstatus protocol status of the fetch, stored into datum metadata
 * @param status new CrawlDatum fetch status code to record
 * @param outlinkDepth current outlink-follow depth of this URL
 * @return the ParseStatus of the parse for the original URL, or null when no
 *         parse result is available for it
 * @throws InterruptedException if writing to the context is interrupted
 */
private ParseStatus output(Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status, int outlinkDepth) throws InterruptedException {
// Record the new fetch status and timestamp; keep the protocol status so
// later jobs can inspect it.
datum.setStatus(status);
datum.setFetchTime(System.currentTimeMillis());
if (pstatus != null)
datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
ParseResult parseResult = null;
if (content != null) {
Metadata metadata = content.getMetadata();
// store the guessed content type in the crawldatum
if (content.getContentType() != null)
datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));
// add segment to metadata
metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
// add score to content metadata so that ParseSegment can pick it up.
try {
scfilters.passScoreBeforeParsing(key, datum, content);
} catch (Exception e) {
// Scoring failures are non-fatal: log and continue with unscored content.
if (LOG.isWarnEnabled()) {
LOG.warn(getName() + " " + Thread.currentThread().getId() + " Couldn't pass score, url " + key + " (" + e + ")");
}
}
/*
* Note: Fetcher will only follow meta-redirects coming from the
* original URL.
*/
// Parse inline only when configured to and the fetch actually succeeded.
if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
// Optionally skip parsing of content that was truncated in transit.
if (!skipTruncated || (skipTruncated && !ParseSegment.isTruncated(content))) {
try {
parseResult = this.parseUtil.parse(content);
} catch (Exception e) {
LOG.warn(getName() + " " + Thread.currentThread().getId() + " Error parsing: " + key + ": " + StringUtils.stringifyException(e));
}
}
// No parse result: still compute a signature from an empty parse so
// deduplication has something to work with.
if (parseResult == null) {
byte[] signature = SignatureFactory.getSignature(conf).calculate(content, new ParseStatus().getEmptyParse(conf));
datum.setSignature(signature);
}
}
/*
* Store status code in content So we can read this value during parsing
* (as a separate job) and decide to parse or not.
*/
content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
}
try {
// Always emit the updated datum; emit raw content only when storing it.
context.write(key, new NutchWritable(datum));
if (content != null && storingContent)
context.write(key, new NutchWritable(content));
if (parseResult != null) {
// A ParseResult may carry several parses keyed by URL (e.g. sub-documents).
for (Entry<Text, Parse> entry : parseResult) {
Text url = entry.getKey();
Parse parse = entry.getValue();
ParseStatus parseStatus = parse.getData().getStatus();
ParseData parseData = parse.getData();
if (!parseStatus.isSuccess()) {
LOG.warn(getName() + " " + Thread.currentThread().getId() + " Error parsing: " + key + ": " + parseStatus);
// Fall back to an empty parse so the emitted record stays well-formed.
parse = parseStatus.getEmptyParse(conf);
}
// Calculate page signature. For non-parsing fetchers this will
// be done in ParseSegment
byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse);
// Ensure segment name and score are in parseData metadata
parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
// Pass fetch time to content meta
parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
// Only the parse entry for the original URL updates the datum signature.
if (url.equals(key))
datum.setSignature(signature);
try {
scfilters.passScoreAfterParsing(url, content, parse);
} catch (Exception e) {
// Non-fatal, same policy as passScoreBeforeParsing above.
if (LOG.isWarnEnabled()) {
LOG.warn(getName() + " " + Thread.currentThread().getId() + " Couldn't pass score, url " + key + " (" + e + ")");
}
}
String origin = null;
// collect outlinks for subsequent db update
Outlink[] links = parseData.getOutlinks();
int outlinksToStore = Math.min(maxOutlinks, links.length);
// Determine the origin (domain or host, lower-cased) used to classify
// links as internal/external when either class is to be ignored.
if (ignoreExternalLinks || ignoreInternalLinks) {
URL originURL = new URL(url.toString());
// based on domain?
if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
origin = URLUtil.getDomainName(originURL).toLowerCase();
} else // use host
{
origin = originURL.getHost().toLowerCase();
}
}
// used by fetchNode
if (fetchNode != null) {
fetchNode.setOutlinks(links);
fetchNode.setTitle(parseData.getTitle());
FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode);
}
int validCount = 0;
// Process all outlinks, normalize, filter and deduplicate
List<Outlink> outlinkList = new ArrayList<>(outlinksToStore);
HashSet<String> outlinks = new HashSet<>(outlinksToStore);
for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
String toUrl = links[i].getToUrl();
toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, urlFiltersForOutlinks, urlExemptionFilters, normalizersForOutlinks);
// A null return means the link was filtered out or failed normalization.
if (toUrl == null) {
continue;
}
validCount++;
links[i].setUrl(toUrl);
outlinkList.add(links[i]);
outlinks.add(toUrl);
}
// Publish fetch report event
if (activatePublisher) {
FetcherThreadEvent reportEvent = new FetcherThreadEvent(PublishEventType.REPORT, url.toString());
reportEvent.addOutlinksToEventData(outlinkList);
reportEvent.addEventData(Nutch.FETCH_EVENT_TITLE, parseData.getTitle());
reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTTYPE, parseData.getContentMeta().get("content-type"));
reportEvent.addEventData(Nutch.FETCH_EVENT_SCORE, datum.getScore());
reportEvent.addEventData(Nutch.FETCH_EVENT_FETCHTIME, datum.getFetchTime());
reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTLANG, parseData.getContentMeta().get("content-language"));
publisher.publish(reportEvent, conf);
}
// Only process depth N outlinks
if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
FetchItem ft = FetchItem.create(url, null, queueMode);
FetchItemQueue queue = ((FetchItemQueues) fetchQueues).getFetchItemQueue(ft.queueID);
// NOTE(review): dedup is by String.hashCode(), so distinct URLs with
// colliding hash codes would be wrongly skipped — presumably accepted
// as a memory/accuracy trade-off; confirm.
queue.alreadyFetched.add(url.toString().hashCode());
context.getCounter("FetcherOutlinks", "outlinks_detected").increment(outlinks.size());
// Counter to limit num outlinks to follow per page
int outlinkCounter = 0;
// Calculate variable number of outlinks by depth using the
// divisor (outlinks = Math.floor(divisor / depth * num.links))
// NOTE(review): if outlinksDepthDivisor and outlinkDepth are ints this
// divides with integer truncation, and maxOutlinksByDepth appears unused
// below (the loop bounds on maxOutlinkDepthNumLinks) — confirm intent.
int maxOutlinksByDepth = (int) Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);
String followUrl;
// Walk over the outlinks and add as new FetchItem to the queues
Iterator<String> iter = outlinks.iterator();
while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
followUrl = iter.next();
// Check whether we'll follow external outlinks
if (outlinksIgnoreExternal) {
if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
continue;
}
}
// Already followed?
int urlHashCode = followUrl.hashCode();
if (queue.alreadyFetched.contains(urlHashCode)) {
continue;
}
queue.alreadyFetched.add(urlHashCode);
// Create new FetchItem with depth incremented
FetchItem fit = FetchItem.create(new Text(followUrl), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1);
context.getCounter("FetcherOutlinks", "outlinks_following").increment(1);
((FetchItemQueues) fetchQueues).addFetchItem(fit);
outlinkCounter++;
}
}
// Overwrite the outlinks in ParseData with the normalized and
// filtered set
parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()]));
context.write(url, new NutchWritable(new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical())));
}
}
} catch (IOException e) {
// NOTE(review): write failures are logged and swallowed; the method then
// falls through and may still report a parse status as if output succeeded.
if (LOG.isErrorEnabled()) {
LOG.error("fetcher caught:" + e.toString());
}
}
// return parse status if it exits
if (parseResult != null && !parseResult.isEmpty()) {
Parse p = parseResult.get(content.getUrl());
if (p != null) {
context.getCounter("ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()]).increment(1);
return p.getData().getStatus();
}
}
return null;
}
Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.
Class ParseSegment, method isTruncated:
/**
 * Checks whether the fetched content is shorter than the length announced in
 * the Content-Length response header, i.e. whether the page was truncated.
 *
 * @param content the fetched content to inspect
 * @return <code>true</code> if the page is truncated; <code>false</code>
 *         when it is not, or when truncation could not be determined
 *         (missing bytes/metadata, absent or malformed Content-Length).
 */
public static boolean isTruncated(Content content) {
    byte[] bytes = content.getContent();
    if (bytes == null)
        return false;
    Metadata meta = content.getMetadata();
    if (meta == null)
        return false;
    // Declared length from the protocol response, if any.
    String declared = meta.get(Response.CONTENT_LENGTH);
    if (declared != null)
        declared = declared.trim();
    if (StringUtil.isEmpty(declared)) {
        // No usable Content-Length header: cannot decide, assume not truncated.
        return false;
    }
    String url = content.getUrl();
    int expectedSize;
    try {
        expectedSize = Integer.parseInt(declared);
    } catch (NumberFormatException e) {
        LOG.warn("Wrong contentlength format for " + url, e);
        return false;
    }
    int actualSize = bytes.length;
    if (expectedSize > actualSize) {
        // Fewer bytes arrived than the header promised.
        LOG.info(url + " skipped. Content of size " + expectedSize + " was truncated to " + actualSize);
        return true;
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + expectedSize);
    }
    return false;
}
Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.
Class TestHTMLLanguageParser, method getContent:
/**
 * Builds a {@link Content} fixture around the given HTML text, tagged with a
 * text/html content type in both the metadata and the Content itself.
 */
private Content getContent(String text) {
    Metadata metadata = new Metadata();
    metadata.add("Content-Type", "text/html");
    byte[] bytes = text.getBytes();
    return new Content(URL, BASE, bytes, "text/html", metadata, NutchConfiguration.create());
}
Aggregations