Example 1 with ParseText

use of org.apache.nutch.parse.ParseText in project nutch by apache.

the class FeedParser method getParse.

/**
 * Parses the given feed and extracts and parses all linked items within
 * the feed, using the underlying ROME feed parsing library.
 *
 * @param content
 *          A {@link Content} object representing the feed that is being
 *          parsed by this {@link Parser}.
 *
 * @return A {@link ParseResult} containing all {@link Parse}d items that were
 *         present in the feed handled by this {@link Parser}.
 */
public ParseResult getParse(Content content) {
    SyndFeed feed = null;
    ParseResult parseResult = new ParseResult(content.getUrl());
    EncodingDetector detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    String encoding = detector.guessEncoding(content, defaultEncoding);
    try {
        InputSource input = new InputSource(new ByteArrayInputStream(content.getContent()));
        input.setEncoding(encoding);
        SyndFeedInput feedInput = new SyndFeedInput();
        feed = feedInput.build(input);
    } catch (Exception e) {
        // return empty parse
        LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: " + StringUtils.stringifyException(e));
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }
    String feedLink = feed.getLink();
    try {
        feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
        if (feedLink != null)
            feedLink = filters.filter(feedLink);
    } catch (Exception e) {
        feedLink = null;
    }
    List<?> entries = feed.getEntries();
    for (Object entry : entries) {
        addToMap(parseResult, feed, feedLink, (SyndEntry) entry, content);
    }
    String feedDesc = stripTags(feed.getDescriptionEx());
    String feedTitle = stripTags(feed.getTitleEx());
    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0], content.getMetadata()));
    return parseResult;
}
Also used : Outlink(org.apache.nutch.parse.Outlink) InputSource(org.xml.sax.InputSource) ParseResult(org.apache.nutch.parse.ParseResult) ParseText(org.apache.nutch.parse.ParseText) SyndFeed(com.rometools.rome.feed.synd.SyndFeed) ParseStatus(org.apache.nutch.parse.ParseStatus) EncodingDetector(org.apache.nutch.util.EncodingDetector) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) SyndFeedInput(com.rometools.rome.io.SyndFeedInput)
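
For orientation, a minimal caller-side sketch (not taken from the Nutch sources) of how the ParseResult returned by getParse above could be read back; the feedParser and content variables are assumptions standing for a configured FeedParser and the fetched feed Content.

// Hypothetical caller sketch: feedParser and content are assumptions, not shown above.
ParseResult result = feedParser.getParse(content);
Parse feedParse = result.get(content.getUrl());        // the feed-level entry added at the end of getParse
if (feedParse != null) {
    String feedDescription = feedParse.getText();      // ParseText built from the feed description
    String feedTitle = feedParse.getData().getTitle(); // ParseData carries the feed title
}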

Example 2 with ParseText

use of org.apache.nutch.parse.ParseText in project nutch by apache.

the class FeedParser method addToMap.

private void addToMap(ParseResult parseResult, SyndFeed feed, String feedLink, SyndEntry entry, Content content) {
    String link = entry.getLink(), text = null, title = null;
    Metadata parseMeta = new Metadata(), contentMeta = content.getMetadata();
    Parse parse = null;
    SyndContent description = entry.getDescription();
    try {
        link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);
        if (link != null)
            link = filters.filter(link);
    } catch (Exception e) {
        e.printStackTrace();
        return;
    }
    if (link == null)
        return;
    title = stripTags(entry.getTitleEx());
    if (feedLink != null)
        parseMeta.set("feed", feedLink);
    addFields(parseMeta, contentMeta, feed, entry);
    // some item descriptions contain markup text in them,
    // so we temporarily set their content-type to parse them
    // with another plugin
    String contentType = contentMeta.get(Response.CONTENT_TYPE);
    if (description != null)
        text = description.getValue();
    if (text == null) {
        List<?> contents = entry.getContents();
        StringBuilder buf = new StringBuilder();
        for (Object syndContent : contents) {
            buf.append(((SyndContent) syndContent).getValue());
        }
        text = buf.toString();
    }
    try {
        Parser parser = parserFactory.getParsers(contentType, link)[0];
        parse = parser.getParse(new Content(link, link, text.getBytes(), contentType, contentMeta, conf)).get(link);
    } catch (ParserNotFound e) {
        /* ignore */
    }
    if (parse != null) {
        ParseData data = parse.getData();
        data.getContentMeta().remove(Response.CONTENT_TYPE);
        mergeMetadata(data.getParseMeta(), parseMeta);
        parseResult.put(link, new ParseText(parse.getText()), new ParseData(ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(), data.getContentMeta(), data.getParseMeta()));
    } else {
        contentMeta.remove(Response.CONTENT_TYPE);
        parseResult.put(link, new ParseText(text), new ParseData(ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta, parseMeta));
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParserNotFound(org.apache.nutch.parse.ParserNotFound) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) Parser(org.apache.nutch.parse.Parser) ParseText(org.apache.nutch.parse.ParseText) SyndContent(com.rometools.rome.feed.synd.SyndContent) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content)
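
The stripTags helper called above is not shown on this page. Purely as a hypothetical illustration of what such a helper typically does, a minimal stand-in could look like the following; the real implementation may use a proper HTML parser rather than a regular expression.

// Hypothetical stand-in for the stripTags helper (not the actual Nutch implementation).
private static String stripTagsSketch(SyndContent content) {
    if (content == null || content.getValue() == null) {
        return "";
    }
    // crude markup removal, adequate for simple feed descriptions and titles
    return content.getValue().replaceAll("<[^>]*>", " ").replaceAll("\\s+", " ").trim();
}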

Example 3 with ParseText

use of org.apache.nutch.parse.ParseText in project nutch by apache.

the class JSParseFilter method filter.

public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
    Parse parse = parseResult.get(content.getUrl());
    String url = content.getBaseUrl();
    ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
    walk(doc, parse, metaTags, url, outlinks);
    if (outlinks.size() > 0) {
        Outlink[] old = parse.getData().getOutlinks();
        String title = parse.getData().getTitle();
        List<Outlink> list = Arrays.asList(old);
        outlinks.addAll(list);
        ParseStatus status = parse.getData().getStatus();
        String text = parse.getText();
        Outlink[] newlinks = (Outlink[]) outlinks.toArray(new Outlink[outlinks.size()]);
        ParseData parseData = new ParseData(status, title, newlinks, parse.getData().getContentMeta(), parse.getData().getParseMeta());
        // replace original parse obj with new one
        parseResult.put(content.getUrl(), new ParseText(text), parseData);
    }
    return parseResult;
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) ParseData(org.apache.nutch.parse.ParseData) Parse(org.apache.nutch.parse.Parse) ArrayList(java.util.ArrayList) ParseText(org.apache.nutch.parse.ParseText)
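
The merge performed above can be sketched in isolation; the example URL and anchor below are hypothetical, and Outlink(String, String) may throw MalformedURLException, hence the throws clause.

// Hedged sketch of combining newly found links with the outlinks already in ParseData.
static Outlink[] mergeOutlinks(Parse parse) throws MalformedURLException {
    List<Outlink> combined = new ArrayList<Outlink>();
    combined.add(new Outlink("http://example.com/from-js", "anchor")); // hypothetical link found in script code
    combined.addAll(Arrays.asList(parse.getData().getOutlinks()));     // keep the existing outlinks
    return combined.toArray(new Outlink[combined.size()]);
}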

Example 4 with ParseText

use of org.apache.nutch.parse.ParseText in project nutch by apache.

the class TestIndexerMapReduce method runIndexer.

/**
 * Run {@link IndexerMapReduce#reduce(...)} to get an "indexed"
 * {@link NutchDocument} by passing objects from the segment and CrawlDb to the
 * indexer.
 *
 * @param dbDatum
 *          crawl datum from CrawlDb
 * @param fetchDatum
 *          crawl datum (fetch status) from segment
 * @param parseText
 *          plain text from parsed document
 * @param parseData
 *          parse data
 * @param content
 *          protocol content (optional, only needed when binary content is indexed)
 * @return the "indexed" document
 */
public NutchDocument runIndexer(CrawlDatum dbDatum, CrawlDatum fetchDatum, ParseText parseText, ParseData parseData, Content content) {
    List<NutchWritable> values = new ArrayList<NutchWritable>();
    values.add(new NutchWritable(dbDatum));
    values.add(new NutchWritable(fetchDatum));
    values.add(new NutchWritable(parseText));
    values.add(new NutchWritable(parseData));
    values.add(new NutchWritable(content));
    reduceDriver = ReduceDriver.newReduceDriver(reducer);
    reduceDriver.getConfiguration().addResource(configuration);
    reduceDriver.withInput(testUrlText, values);
    List<Pair<Text, NutchIndexAction>> reduceResult;
    NutchDocument doc = null;
    try {
        reduceResult = reduceDriver.run();
        for (Pair<Text, NutchIndexAction> p : reduceResult) {
            if (p.getSecond().action != NutchIndexAction.DELETE) {
                doc = p.getSecond().doc;
            }
        }
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
    }
    return doc;
}
Also used : ArrayList(java.util.ArrayList) NutchWritable(org.apache.nutch.crawl.NutchWritable) Text(org.apache.hadoop.io.Text) ParseText(org.apache.nutch.parse.ParseText) IOException(java.io.IOException) Pair(org.apache.hadoop.mrunit.types.Pair)
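
A hedged sketch of how a test might call runIndexer with minimal inputs; the URL, title, and text are placeholders, and configuration is assumed to be the fixture's Hadoop Configuration field referenced above.

// Hedged sketch, not the actual test case: minimal inputs for runIndexer.
String url = "http://example.com/";
CrawlDatum dbDatum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 1);
CrawlDatum fetchDatum = new CrawlDatum(CrawlDatum.STATUS_FETCH_SUCCESS, 1);
ParseText parseText = new ParseText("plain text of the parsed page");
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "Page title",
        new Outlink[0], new Metadata());
Content content = new Content(url, url, "<html/>".getBytes(), "text/html",
        new Metadata(), configuration);
NutchDocument doc = runIndexer(dbDatum, fetchDatum, parseText, parseData, content);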

Example 5 with ParseText

use of org.apache.nutch.parse.ParseText in project nutch by apache.

the class TestSegmentMerger method setUp.

@Before
public void setUp() throws Exception {
    conf = NutchConfiguration.create();
    fs = FileSystem.get(conf);
    testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + System.currentTimeMillis());
    seg1 = new Path(testDir, "seg1");
    seg2 = new Path(testDir, "seg2");
    out = new Path(testDir, "out");
    // create large parse-text segments
    System.err.println("Creating large segment 1...");
    DecimalFormat df = new DecimalFormat("0000000");
    Text k = new Text();
    Path ptPath = new Path(new Path(seg1, ParseText.DIR_NAME), "part-00000");
    Option kOpt = MapFile.Writer.keyClass(Text.class);
    org.apache.hadoop.io.SequenceFile.Writer.Option vOpt = SequenceFile.Writer.valueClass(ParseText.class);
    MapFile.Writer w = new MapFile.Writer(conf, ptPath, kOpt, vOpt);
    long curSize = 0;
    countSeg1 = 0;
    FileStatus fileStatus = fs.getFileStatus(ptPath);
    long blkSize = fileStatus.getBlockSize();
    while (curSize < blkSize * 2) {
        k.set("seg1-" + df.format(countSeg1));
        w.append(k, new ParseText("seg1 text " + countSeg1));
        countSeg1++;
        // roughly 40 bytes per appended record
        curSize += 40;
    }
    w.close();
    System.err.println(" - done: " + countSeg1 + " records.");
    System.err.println("Creating large segment 2...");
    ptPath = new Path(new Path(seg2, ParseText.DIR_NAME), "part-00000");
    Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
    org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(ParseText.class);
    w = new MapFile.Writer(conf, ptPath, wKeyOpt, wValueOpt);
    curSize = 0;
    countSeg2 = 0;
    while (curSize < blkSize * 2) {
        k.set("seg2-" + df.format(countSeg2));
        w.append(k, new ParseText("seg2 text " + countSeg2));
        countSeg2++;
        // roughly 40 bytes per appended record
        curSize += 40;
    }
    w.close();
    System.err.println(" - done: " + countSeg2 + " records.");
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) DecimalFormat(java.text.DecimalFormat) MapFile(org.apache.hadoop.io.MapFile) Text(org.apache.hadoop.io.Text) ParseText(org.apache.nutch.parse.ParseText) Option(org.apache.hadoop.io.MapFile.Writer.Option) Before(org.junit.Before)
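
To check what setUp wrote, a hedged sketch of reading one of the part-00000 MapFiles back with the standard Hadoop MapFile reader; IOException handling is omitted, and ptPath here refers to the last path assigned above (segment 2).

// Hedged sketch: reading the ParseText records back (IOException handling omitted).
MapFile.Reader reader = new MapFile.Reader(ptPath, conf);
Text key = new Text();
ParseText value = new ParseText();
while (reader.next(key, value)) {
    System.err.println(key + " -> " + value.getText());
}
reader.close();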

Aggregations

ParseText (org.apache.nutch.parse.ParseText) 8
Text (org.apache.hadoop.io.Text) 4
Outlink (org.apache.nutch.parse.Outlink) 4
Parse (org.apache.nutch.parse.Parse) 4
ParseData (org.apache.nutch.parse.ParseData) 4
ParseStatus (org.apache.nutch.parse.ParseStatus) 4
ArrayList (java.util.ArrayList) 3
IOException (java.io.IOException) 2
MalformedURLException (java.net.MalformedURLException) 2
URL (java.net.URL) 2
FileStatus (org.apache.hadoop.fs.FileStatus) 2
Path (org.apache.hadoop.fs.Path) 2
MapFile (org.apache.hadoop.io.MapFile) 2
NutchWritable (org.apache.nutch.crawl.NutchWritable) 2
Metadata (org.apache.nutch.metadata.Metadata) 2
ParseResult (org.apache.nutch.parse.ParseResult) 2
SyndContent (com.rometools.rome.feed.synd.SyndContent) 1
SyndFeed (com.rometools.rome.feed.synd.SyndFeed) 1
SyndFeedInput (com.rometools.rome.io.SyndFeedInput) 1
ByteArrayInputStream (java.io.ByteArrayInputStream) 1