Use of org.apache.nutch.parse.ParseText in project nutch by apache.
The class FeedParser, method getParse.
/**
 * Parses the given feed, extracting and parsing all linked items within the
 * feed using the underlying ROME feed-parsing library.
 *
 * @param content
 *          a {@link Content} object representing the feed that is being
 *          parsed by this {@link Parser}
 * @return a {@link ParseResult} containing all {@link Parse}d items that were
 *         present in the feed file handled by this {@link Parser}
 */
public ParseResult getParse(Content content) {
  SyndFeed feed = null;
  ParseResult parseResult = new ParseResult(content.getUrl());
  EncodingDetector detector = new EncodingDetector(conf);
  detector.autoDetectClues(content, true);
  String encoding = detector.guessEncoding(content, defaultEncoding);
  try {
    InputSource input = new InputSource(
        new ByteArrayInputStream(content.getContent()));
    input.setEncoding(encoding);
    SyndFeedInput feedInput = new SyndFeedInput();
    feed = feedInput.build(input);
  } catch (Exception e) {
    // return empty parse
    LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: "
        + StringUtils.stringifyException(e));
    return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
  }
  String feedLink = feed.getLink();
  try {
    feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
    if (feedLink != null)
      feedLink = filters.filter(feedLink);
  } catch (Exception e) {
    feedLink = null;
  }
  List<?> entries = feed.getEntries();
  for (Object entry : entries) {
    addToMap(parseResult, feed, feedLink, (SyndEntry) entry, content);
  }
  String feedDesc = stripTags(feed.getDescriptionEx());
  String feedTitle = stripTags(feed.getTitleEx());
  parseResult.put(content.getUrl(), new ParseText(feedDesc),
      new ParseData(new ParseStatus(ParseStatus.SUCCESS), feedTitle,
          new Outlink[0], content.getMetadata()));
  return parseResult;
}
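For context, a minimal call site for getParse could look like the sketch below. The feed URL, the method name, and the parser setup are illustrative assumptions, not part of the Nutch sources; the loop relies on ParseResult being iterable over its URL-to-Parse entries.

// Hypothetical usage sketch (assumed names and URL, illustration only):
// feed a fetched RSS document to FeedParser and list the parsed entries.
public void parseFeedExample(byte[] rawFeedBytes) {
  Configuration conf = NutchConfiguration.create();
  FeedParser feedParser = new FeedParser();
  feedParser.setConf(conf);
  String url = "http://example.com/feed.xml"; // assumed feed location
  Content feedContent = new Content(url, url, rawFeedBytes,
      "application/rss+xml", new Metadata(), conf);
  ParseResult result = feedParser.getParse(feedContent);
  // one entry per feed item, plus one for the feed URL itself
  for (Map.Entry<Text, Parse> e : result) {
    System.out.println(e.getKey() + " -> " + e.getValue().getData().getTitle());
  }
}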
Use of org.apache.nutch.parse.ParseText in project nutch by apache.
The class FeedParser, method addToMap.
private void addToMap(ParseResult parseResult, SyndFeed feed, String feedLink,
    SyndEntry entry, Content content) {
  String link = entry.getLink(), text = null, title = null;
  Metadata parseMeta = new Metadata(), contentMeta = content.getMetadata();
  Parse parse = null;
  SyndContent description = entry.getDescription();
  try {
    link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);
    if (link != null)
      link = filters.filter(link);
  } catch (Exception e) {
    e.printStackTrace();
    return;
  }
  if (link == null)
    return;
  title = stripTags(entry.getTitleEx());
  if (feedLink != null)
    parseMeta.set("feed", feedLink);
  addFields(parseMeta, contentMeta, feed, entry);
  // some item descriptions contain markup text in them,
  // so we temporarily set their content-type to parse them
  // with another plugin
  String contentType = contentMeta.get(Response.CONTENT_TYPE);
  if (description != null)
    text = description.getValue();
  if (text == null) {
    List<?> contents = entry.getContents();
    StringBuilder buf = new StringBuilder();
    for (Object syndContent : contents) {
      buf.append(((SyndContent) syndContent).getValue());
    }
    text = buf.toString();
  }
  try {
    Parser parser = parserFactory.getParsers(contentType, link)[0];
    parse = parser.getParse(
        new Content(link, link, text.getBytes(), contentType, contentMeta, conf))
        .get(link);
  } catch (ParserNotFound e) {
    /* ignore */
  }
  if (parse != null) {
    ParseData data = parse.getData();
    data.getContentMeta().remove(Response.CONTENT_TYPE);
    mergeMetadata(data.getParseMeta(), parseMeta);
    parseResult.put(link, new ParseText(parse.getText()),
        new ParseData(ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(),
            data.getContentMeta(), data.getParseMeta()));
  } else {
    contentMeta.remove(Response.CONTENT_TYPE);
    parseResult.put(link, new ParseText(text),
        new ParseData(ParseStatus.STATUS_FAILURE, title, new Outlink[0],
            contentMeta, parseMeta));
  }
}
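Both methods above call a stripTags helper that this listing omits. A plausible sketch, assuming it takes a ROME SyndContent and returns its value with markup removed, could look like this; it is not the verbatim Nutch implementation.

// Assumed sketch of the omitted stripTags helper: drop embedded markup from
// a ROME SyndContent value and collapse whitespace.
private static String stripTags(SyndContent c) {
  if (c == null || c.getValue() == null)
    return "";
  return c.getValue().replaceAll("<[^>]*>", " ").replaceAll("\\s+", " ").trim();
}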
Use of org.apache.nutch.parse.ParseText in project nutch by apache.
The class JSParseFilter, method filter.
public ParseResult filter(Content content, ParseResult parseResult,
    HTMLMetaTags metaTags, DocumentFragment doc) {
  Parse parse = parseResult.get(content.getUrl());
  String url = content.getBaseUrl();
  ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
  walk(doc, parse, metaTags, url, outlinks);
  if (outlinks.size() > 0) {
    Outlink[] old = parse.getData().getOutlinks();
    String title = parse.getData().getTitle();
    List<Outlink> list = Arrays.asList(old);
    outlinks.addAll(list);
    ParseStatus status = parse.getData().getStatus();
    String text = parse.getText();
    Outlink[] newlinks = outlinks.toArray(new Outlink[outlinks.size()]);
    ParseData parseData = new ParseData(status, title, newlinks,
        parse.getData().getContentMeta(), parse.getData().getParseMeta());
    // replace original parse obj with new one
    parseResult.put(content.getUrl(), new ParseText(text), parseData);
  }
  return parseResult;
}
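The important detail in this filter is that ParseResult.put under an existing URL replaces the stored Parse. A hypothetical minimal HtmlParseFilter using the same replace-in-place pattern is sketched below; the class name and marker text are made up for illustration.

// Hypothetical filter, illustration only: append a marker to the parse text
// and re-put it under the same URL, replacing the original Parse entry.
public class MarkerParseFilter implements HtmlParseFilter {

  private Configuration conf;

  @Override
  public ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment doc) {
    Parse parse = parseResult.get(content.getUrl());
    if (parse == null)
      return parseResult; // nothing to decorate
    String text = parse.getText() + "\n[seen-by-marker-filter]";
    // put() with the same key replaces the existing entry
    parseResult.put(content.getUrl(), new ParseText(text), parse.getData());
    return parseResult;
  }

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  @Override
  public Configuration getConf() {
    return conf;
  }
}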
Use of org.apache.nutch.parse.ParseText in project nutch by apache.
The class TestIndexerMapReduce, method runIndexer.
/**
 * Run {@link IndexerMapReduce#reduce(...)} to get an "indexed"
 * {@link NutchDocument} by passing objects from segment and CrawlDb to the
 * indexer.
 *
 * @param dbDatum
 *          crawl datum from CrawlDb
 * @param fetchDatum
 *          crawl datum (fetch status) from segment
 * @param parseText
 *          plain text from parsed document
 * @param parseData
 *          parse data
 * @param content
 *          (optional, only if indexing binary content) protocol content
 * @return "indexed" document
 */
public NutchDocument runIndexer(CrawlDatum dbDatum, CrawlDatum fetchDatum,
    ParseText parseText, ParseData parseData, Content content) {
  List<NutchWritable> values = new ArrayList<NutchWritable>();
  values.add(new NutchWritable(dbDatum));
  values.add(new NutchWritable(fetchDatum));
  values.add(new NutchWritable(parseText));
  values.add(new NutchWritable(parseData));
  values.add(new NutchWritable(content));
  reduceDriver = ReduceDriver.newReduceDriver(reducer);
  reduceDriver.getConfiguration().addResource(configuration);
  reduceDriver.withInput(testUrlText, values);
  List<Pair<Text, NutchIndexAction>> reduceResult;
  NutchDocument doc = null;
  try {
    reduceResult = reduceDriver.run();
    for (Pair<Text, NutchIndexAction> p : reduceResult) {
      if (p.getSecond().action != NutchIndexAction.DELETE) {
        doc = p.getSecond().doc;
      }
    }
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
  }
  return doc;
}
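A call site for this helper, from inside the same test class, might look like the following sketch; the statuses, fetch interval, URL, and sample texts are illustrative assumptions.

// Hypothetical invocation of runIndexer with minimal fixture objects.
CrawlDatum dbDatum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 86400);
CrawlDatum fetchDatum = new CrawlDatum(CrawlDatum.STATUS_FETCH_SUCCESS, 86400);
ParseText parseText = new ParseText("sample plain text");
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "Sample title",
    new Outlink[0], new Metadata());
String url = "http://nutch.apache.org/"; // assumed test URL
Content content = new Content(url, url, "<html>raw</html>".getBytes(),
    "text/html", new Metadata(), configuration);
NutchDocument doc = runIndexer(dbDatum, fetchDatum, parseText, parseData, content);
// doc stays null only if every reduce output was a DELETE action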
Use of org.apache.nutch.parse.ParseText in project nutch by apache.
The class TestSegmentMerger, method setUp.
@Before
public void setUp() throws Exception {
  conf = NutchConfiguration.create();
  fs = FileSystem.get(conf);
  testDir = new Path(conf.get("hadoop.tmp.dir"),
      "merge-" + System.currentTimeMillis());
  seg1 = new Path(testDir, "seg1");
  seg2 = new Path(testDir, "seg2");
  out = new Path(testDir, "out");
  // create large parse-text segments
  System.err.println("Creating large segment 1...");
  DecimalFormat df = new DecimalFormat("0000000");
  Text k = new Text();
  Path ptPath = new Path(new Path(seg1, ParseText.DIR_NAME), "part-00000");
  Option kOpt = MapFile.Writer.keyClass(Text.class);
  org.apache.hadoop.io.SequenceFile.Writer.Option vOpt =
      SequenceFile.Writer.valueClass(ParseText.class);
  MapFile.Writer w = new MapFile.Writer(conf, ptPath, kOpt, vOpt);
  long curSize = 0;
  countSeg1 = 0;
  FileStatus fileStatus = fs.getFileStatus(ptPath);
  long blkSize = fileStatus.getBlockSize();
  while (curSize < blkSize * 2) {
    k.set("seg1-" + df.format(countSeg1));
    w.append(k, new ParseText("seg1 text " + countSeg1));
    countSeg1++;
    // roughly ...
    curSize += 40;
  }
  w.close();
  System.err.println(" - done: " + countSeg1 + " records.");
  System.err.println("Creating large segment 2...");
  ptPath = new Path(new Path(seg2, ParseText.DIR_NAME), "part-00000");
  Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
  org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt =
      SequenceFile.Writer.valueClass(ParseText.class);
  w = new MapFile.Writer(conf, ptPath, wKeyOpt, wValueOpt);
  curSize = 0;
  countSeg2 = 0;
  while (curSize < blkSize * 2) {
    k.set("seg2-" + df.format(countSeg2));
    w.append(k, new ParseText("seg2 text " + countSeg2));
    countSeg2++;
    // roughly ...
    curSize += 40;
  }
  w.close();
  System.err.println(" - done: " + countSeg2 + " records.");
}
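To sanity-check what setUp wrote, the parse texts can be read back with a MapFile.Reader; this sketch is not part of the test, it just shows the matching read path for the records written above.

// Verification sketch (assumed, not in the test): iterate the parse texts
// written to segment 1 and print each key/value pair.
Path readPath = new Path(new Path(seg1, ParseText.DIR_NAME), "part-00000");
MapFile.Reader reader = new MapFile.Reader(readPath, conf);
Text key = new Text();
ParseText value = new ParseText();
while (reader.next(key, value)) {
  System.out.println(key + " -> " + value.getText());
}
reader.close();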