use of org.apache.nutch.parse.ParseResult in project nutch by apache.
the class FeedParser method main.
/**
* Runs a command line version of this {@link Parser}.
*
* @param args
* A single argument (expected at arg[0]) representing a path on the
* local filesystem that points to a feed file.
*
* @throws Exception
* If any error occurs.
*/
public static void main(String[] args) throws Exception {
if (args.length != 1) {
System.err.println("Usage: FeedParser <feed>");
System.exit(1);
}
String name = args[0];
String url = "file:" + name;
Configuration conf = NutchConfiguration.create();
FeedParser parser = new FeedParser();
parser.setConf(conf);
File file = new File(name);
byte[] bytes = new byte[(int) file.length()];
DataInputStream in = new DataInputStream(new FileInputStream(file));
in.readFully(bytes);
in.close();
ParseResult parseResult = parser.getParse(new Content(url, url, bytes, "application/rss+xml", new Metadata(), conf));
for (Entry<Text, Parse> entry : parseResult) {
System.out.println("key: " + entry.getKey());
Parse parse = entry.getValue();
System.out.println("data: " + parse.getData());
System.out.println("text: " + parse.getText() + "\n");
}
}
use of org.apache.nutch.parse.ParseResult in project nutch by apache.
the class FeedParser method getParse.
/**
* Parses the given feed and extracts out and parsers all linked items within
* the feed, using the underlying ROME feed parsing library.
*
* @param content
* A {@link Content} object representing the feed that is being
* parsed by this {@link Parser}.
*
* @return A {@link ParseResult} containing all {@link Parse}d feeds that were
* present in the feed file that this {@link Parser} dealt with.
*/
@Override
public ParseResult getParse(Content content) {
SyndFeed feed = null;
ParseResult parseResult = new ParseResult(content.getUrl());
EncodingDetector detector = new EncodingDetector(conf);
detector.autoDetectClues(content, true);
String encoding = detector.guessEncoding(content, defaultEncoding);
try {
InputSource input = new InputSource(new ByteArrayInputStream(content.getContent()));
input.setEncoding(encoding);
SyndFeedInput feedInput = new SyndFeedInput();
feed = feedInput.build(input);
} catch (Exception e) {
// return empty parse
LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: " + StringUtils.stringifyException(e));
return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
}
String feedLink = feed.getLink();
try {
feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
if (feedLink != null)
feedLink = filters.filter(feedLink);
} catch (Exception e) {
feedLink = null;
}
List<?> entries = feed.getEntries();
for (Object entry : entries) {
addToMap(parseResult, feed, feedLink, (SyndEntry) entry, content);
}
String feedDesc = stripTags(feed.getDescriptionEx());
String feedTitle = stripTags(feed.getTitleEx());
parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0], content.getMetadata()));
return parseResult;
}
use of org.apache.nutch.parse.ParseResult in project nutch by apache.
the class FetcherThread method output.
private ParseStatus output(Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status, int outlinkDepth) throws InterruptedException {
datum.setStatus(status);
datum.setFetchTime(System.currentTimeMillis());
if (pstatus != null)
datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
ParseResult parseResult = null;
if (content != null) {
Metadata metadata = content.getMetadata();
// store the guessed content type in the crawldatum
if (content.getContentType() != null)
datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));
// add segment to metadata
metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
// add score to content metadata so that ParseSegment can pick it up.
try {
scfilters.passScoreBeforeParsing(key, datum, content);
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn("{} {} Couldn't pass score, url {} ({})", getName(), Thread.currentThread().getId(), key, e);
}
}
if (status == CrawlDatum.STATUS_FETCH_SUCCESS) {
if (parsing && !(skipTruncated && ParseSegment.isTruncated(content))) {
try {
parseResult = this.parseUtil.parse(content);
} catch (Exception e) {
LOG.warn("{} {} Error parsing: {}: {}", getName(), Thread.currentThread().getId(), key, StringUtils.stringifyException(e));
}
}
if (parseResult == null && (parsing || signatureWithoutParsing)) {
byte[] signature = SignatureFactory.getSignature(conf).calculate(content, new ParseStatus().getEmptyParse(conf));
datum.setSignature(signature);
}
}
/*
* Store status code in content So we can read this value during parsing
* (as a separate job) and decide to parse or not.
*/
content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
}
try {
context.write(key, new NutchWritable(datum));
if (content != null && storingContent)
context.write(key, new NutchWritable(content));
if (parseResult != null) {
for (Entry<Text, Parse> entry : parseResult) {
Text url = entry.getKey();
Parse parse = entry.getValue();
ParseStatus parseStatus = parse.getData().getStatus();
ParseData parseData = parse.getData();
if (!parseStatus.isSuccess()) {
LOG.warn("{} {} Error parsing: {}: {}", getName(), Thread.currentThread().getId(), key, parseStatus);
parse = parseStatus.getEmptyParse(conf);
}
// Calculate page signature. For non-parsing fetchers this will
// be done in ParseSegment
byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse);
// Ensure segment name and score are in parseData metadata
parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
// Pass fetch time to content meta
parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
if (url.equals(key))
datum.setSignature(signature);
try {
scfilters.passScoreAfterParsing(url, content, parse);
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn("{} {} Couldn't pass score, url {} ({})", getName(), Thread.currentThread().getId(), key, e);
}
}
String origin = null;
// collect outlinks for subsequent db update
Outlink[] links = parseData.getOutlinks();
int outlinksToStore = Math.min(maxOutlinks, links.length);
if (ignoreExternalLinks || ignoreInternalLinks) {
URL originURL = new URL(url.toString());
// based on domain?
if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
origin = URLUtil.getDomainName(originURL).toLowerCase();
} else // use host
{
origin = originURL.getHost().toLowerCase();
}
}
// used by fetchNode
if (fetchNode != null) {
fetchNode.setOutlinks(links);
fetchNode.setTitle(parseData.getTitle());
FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode);
}
int validCount = 0;
// Process all outlinks, normalize, filter and deduplicate
List<Outlink> outlinkList = new ArrayList<>(outlinksToStore);
HashSet<String> outlinks = new HashSet<>(outlinksToStore);
for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
String toUrl = links[i].getToUrl();
if (toUrl.length() > maxOutlinkLength) {
continue;
}
toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, urlFiltersForOutlinks, urlExemptionFilters, normalizersForOutlinks);
if (toUrl == null) {
continue;
}
validCount++;
links[i].setUrl(toUrl);
outlinkList.add(links[i]);
outlinks.add(toUrl);
}
// Publish fetch report event
if (activatePublisher) {
FetcherThreadEvent reportEvent = new FetcherThreadEvent(PublishEventType.REPORT, url.toString());
reportEvent.addOutlinksToEventData(outlinkList);
reportEvent.addEventData(Nutch.FETCH_EVENT_TITLE, parseData.getTitle());
reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTTYPE, parseData.getContentMeta().get("content-type"));
reportEvent.addEventData(Nutch.FETCH_EVENT_SCORE, datum.getScore());
reportEvent.addEventData(Nutch.FETCH_EVENT_FETCHTIME, datum.getFetchTime());
reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTLANG, parseData.getContentMeta().get("content-language"));
publisher.publish(reportEvent, conf);
}
// Only process depth N outlinks
if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth && !fetchQueues.timelimitExceeded()) {
FetchItem ft = FetchItem.create(url, null, queueMode);
FetchItemQueue queue = fetchQueues.getFetchItemQueue(ft.queueID);
queue.alreadyFetched.add(url.toString().hashCode());
context.getCounter("FetcherOutlinks", "outlinks_detected").increment(outlinks.size());
// Counter to limit num outlinks to follow per page
int outlinkCounter = 0;
String followUrl;
// Walk over the outlinks and add as new FetchItem to the queues
Iterator<String> iter = outlinks.iterator();
while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
followUrl = iter.next();
// Check whether we'll follow external outlinks
if (outlinksIgnoreExternal) {
if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
continue;
}
}
// Already followed?
int urlHashCode = followUrl.hashCode();
if (queue.alreadyFetched.contains(urlHashCode)) {
continue;
}
queue.alreadyFetched.add(urlHashCode);
// Create new FetchItem with depth incremented
FetchItem fit = FetchItem.create(new Text(followUrl), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1);
context.getCounter("FetcherOutlinks", "outlinks_following").increment(1);
fetchQueues.addFetchItem(fit);
outlinkCounter++;
}
}
// Overwrite the outlinks in ParseData with the normalized and
// filtered set
parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()]));
context.write(url, new NutchWritable(new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical())));
}
}
} catch (IOException e) {
if (LOG.isErrorEnabled()) {
LOG.error("fetcher caught:", e);
}
}
// multiple parses) which allows Fetcher to follow meta-redirects
if (parseResult != null && !parseResult.isEmpty()) {
Parse p = parseResult.get(content.getUrl());
if (p != null) {
context.getCounter("ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()]).increment(1);
return p.getData().getStatus();
}
}
return null;
}
use of org.apache.nutch.parse.ParseResult in project nutch by apache.
the class HtmlParser method getParse.
@Override
public ParseResult getParse(Content content) {
HTMLMetaTags metaTags = new HTMLMetaTags();
URL base;
try {
base = new URL(content.getBaseUrl());
} catch (MalformedURLException e) {
return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
}
String text = "";
String title = "";
Outlink[] outlinks = new Outlink[0];
Metadata metadata = new Metadata();
// parse the content
DocumentFragment root;
try {
byte[] contentInOctets = content.getContent();
InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
EncodingDetector detector = new EncodingDetector(conf);
detector.autoDetectClues(content, true);
detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
String encoding = detector.guessEncoding(content, defaultCharEncoding);
metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
input.setEncoding(encoding);
if (LOG.isTraceEnabled()) {
LOG.trace("Parsing...");
}
root = parse(input);
} catch (IOException e) {
return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
} catch (DOMException e) {
return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
} catch (SAXException e) {
return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
} catch (Exception e) {
LOG.error("Error: ", e);
return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
}
// get meta directives
HTMLMetaProcessor.getMetaTags(metaTags, root, base);
// populate Nutch metadata with HTML meta directives
metadata.addAll(metaTags.getGeneralTags());
if (LOG.isTraceEnabled()) {
LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
}
// check meta directives
if (!metaTags.getNoIndex()) {
// okay to index
StringBuffer sb = new StringBuffer();
if (LOG.isTraceEnabled()) {
LOG.trace("Getting text...");
}
// extract text
utils.getText(sb, root);
text = sb.toString();
sb.setLength(0);
if (LOG.isTraceEnabled()) {
LOG.trace("Getting title...");
}
// extract title
utils.getTitle(sb, root);
title = sb.toString().trim();
}
if (!metaTags.getNoFollow()) {
// okay to follow links
// extract outlinks
ArrayList<Outlink> l = new ArrayList<Outlink>();
URL baseTag = base;
String baseTagHref = utils.getBase(root);
if (baseTagHref != null) {
try {
baseTag = new URL(base, baseTagHref);
} catch (MalformedURLException e) {
baseTag = base;
}
}
if (LOG.isTraceEnabled()) {
LOG.trace("Getting links...");
}
utils.getOutlinks(baseTag, l, root);
outlinks = l.toArray(new Outlink[l.size()]);
if (LOG.isTraceEnabled()) {
LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
}
}
ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
if (metaTags.getRefresh()) {
status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
status.setArgs(new String[] { metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime()) });
}
ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), metadata);
ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
// run filters on parse
ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);
if (metaTags.getNoCache()) {
// not okay to cache
for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
}
return filteredParse;
}
use of org.apache.nutch.parse.ParseResult in project nutch by apache.
the class SmallStack method main.
/**
* @param args arguments are: 0. Name of input SWF file.
* @throws IOException if there is a fatal error processing the input
* file
*/
public static void main(String[] args) throws IOException {
FileInputStream in = new FileInputStream(args[0]);
byte[] buf = new byte[in.available()];
in.read(buf);
in.close();
SWFParser parser = new SWFParser();
ParseResult parseResult = parser.getParse(new Content("file:" + args[0], "file:" + args[0], buf, "application/x-shockwave-flash", new Metadata(), NutchConfiguration.create()));
Parse p = parseResult.get("file:" + args[0]);
System.out.println("Parse Text:");
System.out.println(p.getText());
System.out.println("Parse Data:");
System.out.println(p.getData());
}
Aggregations