Search in sources :

Example 6 with ParseResult

use of org.apache.nutch.parse.ParseResult in project nutch by apache.

the class TestRegexParseFilter method testPositiveFilter.

/**
 * Runs a {@code RegexParseFilter} over a small HTML page and asserts that the
 * parse metadata keys "first" and "second" both hold "true" afterwards.
 *
 * NOTE(review): this listing looks truncated by extraction -- the method's
 * closing brace is not present. Confirm against the upstream project source.
 *
 * @throws Exception declared by the test; the visible lines call
 *           {@code getBytes("UTF-8")} and the filter, either of which may throw
 */
public void testPositiveFilter() throws Exception {
    Configuration conf = NutchConfiguration.create();
    // Register the sample rule file under the "parsefilter.regex.file" property.
    String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
    conf.set("parsefilter.regex.file", file);
    // NOTE(review): in the visible lines conf is passed only to Content, never to
    // the filter itself; a configuration call on the filter may have been lost
    // in extraction -- verify.
    RegexParseFilter filter = new RegexParseFilter();
    // NOTE(review): the empty URL may be an extraction artifact -- confirm.
    String url = "";
    String html = "<body><html><h1>nutch</h1><p>this is the extracted text blablabla</p></body></html>";
    // Raw page bytes (UTF-8) typed as text/html; url is passed for both of the
    // first two constructor arguments.
    Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf);
    // Hand-built parse carrying the extracted text and an empty ParseData.
    Parse parse = new ParseImpl("nutch this is the extracted text blablabla", new ParseData());
    ParseResult result = ParseResult.createParseResult(url, parse);
    // The last two filter arguments are passed as null.
    result = filter.filter(content, result, null, null);
    // The metadata is read from the original parse object, not from the
    // ParseResult returned by the filter.
    Metadata meta = parse.getData().getParseMeta();
    assertEquals("true", meta.get("first"));
    assertEquals("true", meta.get("second"));
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseResult(org.apache.nutch.parse.ParseResult) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl)

Example 7 with ParseResult

use of org.apache.nutch.parse.ParseResult in project nutch by apache.

the class TestRegexParseFilter method testNegativeFilter.

/**
 * Runs a {@code RegexParseFilter} over a small HTML page and asserts that the
 * parse metadata keys "first" and "second" both hold "false" afterwards.
 *
 * NOTE(review): this listing looks truncated by extraction -- the method's
 * closing brace is not present. Confirm against the upstream project source.
 *
 * @throws Exception declared by the test; the visible lines call
 *           {@code getBytes("UTF-8")} and the filter, either of which may throw
 */
public void testNegativeFilter() throws Exception {
    Configuration conf = NutchConfiguration.create();
    // Register the sample rule file under the "parsefilter.regex.file" property.
    String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
    conf.set("parsefilter.regex.file", file);
    // NOTE(review): in the visible lines conf is passed only to Content, never to
    // the filter itself; a configuration call on the filter may have been lost
    // in extraction -- verify.
    RegexParseFilter filter = new RegexParseFilter();
    // NOTE(review): the empty URL may be an extraction artifact -- confirm.
    String url = "";
    // Compared with the positive test, this page uses an <h2> heading and a
    // different paragraph text.
    String html = "<body><html><h2>nutch</h2><p>this is the extracted text no bla</p></body></html>";
    // Raw page bytes (UTF-8) typed as text/html; url is passed for both of the
    // first two constructor arguments.
    Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf);
    // Hand-built parse carrying the extracted text and an empty ParseData.
    Parse parse = new ParseImpl("nutch this is the extracted text bla", new ParseData());
    ParseResult result = ParseResult.createParseResult(url, parse);
    // The last two filter arguments are passed as null.
    result = filter.filter(content, result, null, null);
    // The metadata is read from the original parse object, not from the
    // ParseResult returned by the filter.
    Metadata meta = parse.getData().getParseMeta();
    assertEquals("false", meta.get("first"));
    assertEquals("false", meta.get("second"));
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseResult(org.apache.nutch.parse.ParseResult) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl)

Example 8 with ParseResult

use of org.apache.nutch.parse.ParseResult in project nutch by apache.

the class TestFeedParser method testParseFetchChannel.

/**
 * Calls the {@link FeedParser} on a sample RSS file and checks that there are
 * 3 {@link ParseResult} entries including the below 2 links:
 * <ul>
 * <li></li>
 * <li></li>
 * </ul>
 *
 * @throws ProtocolNotFound
 *           If the {@link Protocol}Layer cannot be loaded (required to fetch
 *           the {@link Content} for the RSS file).
 * @throws ParseException
 *           If the {@link Parser}Layer cannot be loaded.
 */
// For every entry in sampleFiles: fetches the file through the protocol layer,
// parses it with the "feed" parser extension, asserts that exactly 3 parse
// entries come back, and checks their keys against three expected links.
// NOTE(review): this listing is damaged by extraction -- closing braces are
// missing, and the spots flagged below have lost tokens. Confirm against the
// upstream project source before reusing.
public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    ParseResult parseResult;
    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
        // Build a file: URL for the sample and normalise backslashes to
        // forward slashes.
        urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
        urlString = urlString.replace('\\', '/');
        // Fetch the raw content with a fresh CrawlDatum.
        protocol = new ProtocolFactory(conf).getProtocol(urlString);
        content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
        // Force the parser registered under the extension id "feed".
        parseResult = new ParseUtil(conf).parseByExtensionId("feed", content);
        Assert.assertEquals(3, parseResult.size());
        boolean hasLink1 = false, hasLink2 = false, hasLink3 = false;
        for (Iterator<Map.Entry<Text, Parse>> j = parseResult.iterator(); j.hasNext(); ) {
            // NOTE(review): the initializer is missing here; presumably the
            // iterator's next element -- confirm.
            Map.Entry<Text, Parse> entry =;
            // NOTE(review): both expected-link literals below are empty,
            // presumably stripped URLs. As written the second branch can never
            // be reached -- confirm the original values.
            if (entry.getKey().toString().equals("")) {
                hasLink1 = true;
            } else if (entry.getKey().toString().equals("")) {
                hasLink2 = true;
            } else if (entry.getKey().toString().equals(urlString)) {
                // The third expected key is the feed's own URL.
                hasLink3 = true;
        if (!hasLink1 || !hasLink2 || !hasLink3) {
            // NOTE(review): the call that takes this message is missing;
            // presumably a test-failure call -- confirm.
  "Outlinks read from sample rss file are not correct!");
Also used : ParseResult(org.apache.nutch.parse.ParseResult) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) Content(org.apache.nutch.protocol.Content) Protocol(org.apache.nutch.protocol.Protocol) Map(java.util.Map) Test(org.junit.Test)

Example 9 with ParseResult

use of org.apache.nutch.parse.ParseResult in project nutch by apache.

the class IndexingFiltersChecker method process.

// Fetches the given URL, parses the content, runs the scoring and indexing
// filters, and appends every field of the resulting document to output.
// Returns 0 when the run completes (including the "no content" and "document
// discarded" cases) and -1 on a robots.txt denial, a failed fetch, an unknown
// content type, or a missing parse for the URL.
// NOTE(review): this listing is damaged by extraction -- most closing braces
// are missing and several statements have lost their leading tokens (flagged
// below). Confirm against the upstream project source before reusing.
protected int process(String url, StringBuilder output) throws Exception {
    if (normalizers != null) {
        url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
    // NOTE(review): the call prefix before this message is missing; presumably
    // a log statement -- confirm.
    }"fetching: " + url);
    CrawlDatum datum = new CrawlDatum();
    // Copy the metadata entries into the datum, replacing null values with "".
    Iterator<String> iter = metadata.keySet().iterator();
    while (iter.hasNext()) {
        // NOTE(review): the initializer is missing; presumably the iterator's
        // next element -- confirm.
        String key =;
        String value = metadata.get(key);
        if (value == null)
            value = "";
        datum.getMetaData().put(new Text(key), new Text(value));
    // Redirect limit comes from http.redirect.max (default 3); when redirects
    // are to be followed and the configured value is 0, 3 is used instead.
    int maxRedirects = getConf().getInt("http.redirect.max", 3);
    if (followRedirects) {
        if (maxRedirects == 0) {
            // NOTE(review): call prefix missing; presumably a log statement.
  "Following max. 3 redirects (ignored http.redirect.max == 0)");
            maxRedirects = 3;
        } else {
            // NOTE(review): call prefix missing; presumably a log statement.
  "Following max. {} redirects", maxRedirects);
    ProtocolOutput protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
    Text turl = new Text(url);
    // Following redirects and not reached maxRedirects?
    // NOTE(review): numRedirects is never incremented and turl is never updated
    // in the visible lines; those statements may have been lost in extraction.
    int numRedirects = 0;
    while (protocolOutput != null && !protocolOutput.getStatus().isSuccess() && followRedirects && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) {
        // The first status argument is used as the redirect target.
        String[] stuff = protocolOutput.getStatus().getArgs();
        // NOTE(review): call prefix missing after the assignment; presumably a
        // log statement.
        url = stuff[0];"Follow redirect to {}", url);
        if (normalizers != null) {
            url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
        // try again
        protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
    // With robots.txt checking enabled, a null protocol output is reported as
    // a robots.txt denial.
    if (checkRobotsTxt && protocolOutput == null) {
        System.err.println("Fetch disallowed by robots.txt");
        return -1;
    if (!protocolOutput.getStatus().isSuccess()) {
        System.err.println("Fetch failed with protocol status: " + protocolOutput.getStatus());
        if (protocolOutput.getStatus().isRedirect()) {
            System.err.println("Redirect(s) not handled due to configuration.");
            System.err.println("Max Redirects to handle per config: " + maxRedirects);
            System.err.println("Number of Redirects handled: " + numRedirects);
        return -1;
    Content content = protocolOutput.getContent();
    if (content == null) {
        // Missing content is reported in output and returns 0, not -1.
        output.append("No content for " + url + "\n");
        return 0;
    String contentType = content.getContentType();
    if (contentType == null) {
        LOG.error("Failed to determine content type!");
        return -1;
    // store the guessed content type in the crawldatum
    datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(contentType));
    if (ParseSegment.isTruncated(content)) {
        LOG.warn("Content is truncated, parse may fail!");
    ScoringFilters scfilters = new ScoringFilters(getConf());
    // call the scoring filters
    try {
        scfilters.passScoreBeforeParsing(turl, datum, content);
    } catch (Exception e) {
        // A scoring failure is only logged; processing continues.
        LOG.warn("Couldn't pass score, url {} ({})", url, e);
    // NOTE(review): two call prefixes are missing on the next line; presumably
    // log statements.
    }"parsing: {}", url);"contentType: {}", contentType);
    ParseResult parseResult = new ParseUtil(getConf()).parse(content);
    NutchDocument doc = new NutchDocument();
    doc.add("id", url);
    Text urlText = new Text(url);
    // No inlinks are supplied to the indexing filters.
    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    if (parse == null) {
        // List the keys that are present to help diagnose the lookup miss.
        LOG.error("Failed to get parse from parse result");
        LOG.error("Available parses in parse result (by URL key):");
        for (Map.Entry<Text, Parse> entry : parseResult) {
            LOG.error("  " + entry.getKey());
        LOG.error("Parse result does not contain a parse for URL to be checked:");
        LOG.error("  " + urlText);
        return -1;
    // Compute the signature, store it hex-encoded in the content metadata, and
    // add the stored value to the document as "digest".
    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
    parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
    String digest = parse.getData().getContentMeta().get(Nutch.SIGNATURE_KEY);
    doc.add("digest", digest);
    // call the scoring filters
    try {
        scfilters.passScoreAfterParsing(turl, content, parseResult.get(turl));
    } catch (Exception e) {
        LOG.warn("Couldn't pass score, url {} ({})", turl, e);
    IndexingFilters indexers = new IndexingFilters(getConf());
    try {
        doc = indexers.filter(doc, parse, urlText, datum, inlinks);
    // NOTE(review): the catch body is empty in the visible lines; its contents
    // may have been lost in extraction.
    } catch (IndexingException e) {
    if (doc == null) {
        output.append("Document discarded by indexing filter\n");
        return 0;
    // Write one "name :\tvalue" line per field value.
    for (String fname : doc.getFieldNames()) {
        List<Object> values = doc.getField(fname).getValues();
        if (values != null) {
            for (Object value : values) {
                String str = value.toString();
                // Values are cut to at most 100 characters unless dumpText is set.
                int minText = dumpText ? str.length() : Math.min(100, str.length());
                output.append(fname + " :\t" + str.substring(0, minText) + "\n");
    // For readability if keepClientCnxOpen
    if (doIndex) {
        // NOTE(review): the statement after IndexWriters.get(...) is garbled
        // (only its trailing argument survives) -- confirm the original calls.
        IndexWriters writers = IndexWriters.get(getConf());, "IndexingFilterChecker");
    return 0;
Also used : ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) ParseResult(org.apache.nutch.parse.ParseResult) ParseUtil(org.apache.nutch.parse.ParseUtil) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Content(org.apache.nutch.protocol.Content) ScoringFilters(org.apache.nutch.scoring.ScoringFilters) HashMap(java.util.HashMap) Map(java.util.Map)

Example 10 with ParseResult

use of org.apache.nutch.parse.ParseResult in project nutch by apache.

the class TikaParser method getParse.

// Parses the given content with the Tika parser chosen by its MIME type,
// building a DOM under root, then derives text, title, outlinks and metadata
// from it and returns the ParseResult after running the HTML parse filters.
// An empty ParseResult carrying a failure status is returned when the base URL
// is malformed, no Tika parser matches the MIME type, or parsing throws.
// NOTE(review): this listing is damaged by extraction -- most closing braces
// are missing and a few statements have lost tokens (flagged below). Confirm
// against the upstream project source before reusing.
ParseResult getParse(Content content, HTMLDocumentImpl doc, DocumentFragment root) {
    String mimeType = content.getContentType();
    URL base;
    try {
        base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    // get the right parser using the mime type as a clue
    CompositeParser compositeParser = (CompositeParser) tikaConfig.getParser();
    Parser parser = compositeParser.getParsers().get(MediaType.parse(mimeType));
    if (parser == null) {
        String message = "Can't retrieve Tika parser for mime-type " + mimeType;
        return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult(content.getUrl(), getConf());
    LOG.debug("Using Tika parser {} for mime-type {}.", parser.getClass().getName(), mimeType);
    byte[] raw = content.getContent();
    Metadata tikamd = new Metadata();
    ContentHandler domHandler;
    // Check whether to use Tika's BoilerplateContentHandler
    if (useBoilerpipe && boilerpipeMimeTypes.contains(mimeType)) {
        // Wrap the DOM builder in a boilerpipe handler using the configured
        // extractor.
        BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler) new DOMBuilder(doc, root), BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
        domHandler = (ContentHandler) bpHandler;
    } else {
        DOMBuilder domBuilder = new DOMBuilder(doc, root);
        domHandler = (ContentHandler) domBuilder;
    // A second handler collects links; the tee below sends the parser's output
    // to both handlers.
    LinkContentHandler linkContentHandler = new LinkContentHandler();
    ParseContext context = new ParseContext();
    if (parseEmbedded) {
        // Register an auto-detecting parser in the context when parseEmbedded
        // is set.
        context.set(Parser.class, new AutoDetectParser(tikaConfig));
    TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler);
    if (HTMLMapper != null)
        context.set(HtmlMapper.class, HTMLMapper);
    tikamd.set(Metadata.CONTENT_TYPE, mimeType);
    try {
        parser.parse(new ByteArrayInputStream(raw), (ContentHandler) teeContentHandler, tikamd, context);
    } catch (Exception e) {
        // Any parser failure is logged and turned into an empty FAILED result.
        LOG.error("Error parsing " + content.getUrl(), e);
        return new ParseStatus(ParseStatus.FAILED, e.getMessage()).getEmptyParseResult(content.getUrl(), getConf());
    // Defaults used when the meta directives forbid indexing or following.
    HTMLMetaTags metaTags = new HTMLMetaTags();
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();
    // we have converted the sax events generated by Tika into a DOM object
    // so we can now use the usual HTML resources from Nutch
    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
        LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    // check meta directives
    if (!metaTags.getNoIndex()) {
        // okay to index
        StringBuffer sb = new StringBuffer();
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting text...");
        // extract text
        utils.getText(sb, root);
        text = sb.toString();
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting title...");
        // extract title
        // NOTE(review): sb is not cleared between getText and getTitle in the
        // visible lines; if getTitle appends to sb, the title would also hold
        // the text. A reset may have been lost in extraction -- confirm.
        utils.getTitle(sb, root);
        title = sb.toString().trim();
    if (!metaTags.getNoFollow()) {
        // okay to follow links
        // extract outlinks
        ArrayList<Outlink> l = new ArrayList<Outlink>();
        // Use the "Content-Location" value from the Tika metadata, resolved
        // against base, as the link base when present; otherwise keep base.
        URL baseTag = base;
        String baseTagHref = tikamd.get("Content-Location");
        if (baseTagHref != null) {
            try {
                baseTag = new URL(base, baseTagHref);
            } catch (MalformedURLException e) {
                // An invalid value is traced and base stays in effect.
                LOG.trace("Invalid <base href=\"{}\">", baseTagHref);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting links (base URL = {}) ...", baseTag);
        // pre-1233 outlink extraction
        // utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
        // Get outlinks from Tika
        List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks();
        utils.getOutlinks(baseTag, l, tikaExtractedOutlinks);
        outlinks = l.toArray(new Outlink[l.size()]);
        if (LOG.isTraceEnabled()) {
            LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
    // populate Nutch metadata with Tika metadata
    String[] TikaMDNames = tikamd.names();
    for (String tikaMDName : TikaMDNames) {
        // NOTE(review): this if has no body in the visible lines; presumably
        // the title property was skipped here -- confirm.
        if (tikaMDName.equalsIgnoreCase(TikaCoreProperties.TITLE.toString()))
        String[] values = tikamd.getValues(tikaMDName);
        for (String v : values) {
            nutchMetadata.add(tikaMDName, v);
            // When the name matches the robots key ignoring case and no value
            // is stored under that key yet, the value is added under it too.
            if (tikaMDName.equalsIgnoreCase(Nutch.ROBOTS_METATAG) && nutchMetadata.get(Nutch.ROBOTS_METATAG) == null) {
                // NUTCH-2720 force lowercase robots directive
                nutchMetadata.add(Nutch.ROBOTS_METATAG, v);
    // Fall back to extracting outlinks from the plain text when none were
    // collected above.
    if (outlinks.length == 0) {
        outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
        // Carry the meta-refresh target and delay as status arguments.
        status.setArgs(new String[] { metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime()) });
    ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);
    if (metaTags.getNoCache()) {
        // not okay to cache
        // NOTE(review): the first type argument of Map.Entry is missing;
        // presumably a stripped qualified name -- confirm.
        for (Map.Entry<, Parse> entry : filteredParse) entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
    return filteredParse;
Also used : MalformedURLException(java.net.MalformedURLException) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.tika.metadata.Metadata) ArrayList(java.util.ArrayList) URL(java.net.URL) BoilerpipeContentHandler(org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) ParseStatus(org.apache.nutch.parse.ParseStatus) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) HTMLMetaTags(org.apache.nutch.parse.HTMLMetaTags) Outlink(org.apache.nutch.parse.Outlink) ParseResult(org.apache.nutch.parse.ParseResult) CompositeParser(org.apache.tika.parser.CompositeParser) MalformedURLException(java.net.MalformedURLException) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) HtmlMapper(org.apache.tika.parser.html.HtmlMapper) ParseContext(org.apache.tika.parser.ParseContext) ParseImpl(org.apache.nutch.parse.ParseImpl) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) Map(java.util.Map) BoilerpipeContentHandler(org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler) Link(org.apache.tika.sax.Link)


ParseResult (org.apache.nutch.parse.ParseResult)11 Parse (org.apache.nutch.parse.Parse)10 Metadata (org.apache.nutch.metadata.Metadata)7 Content (org.apache.nutch.protocol.Content)7 ParseData (org.apache.nutch.parse.ParseData)6 Configuration (org.apache.hadoop.conf.Configuration)5 ParseImpl (org.apache.nutch.parse.ParseImpl)5 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)5 Map (java.util.Map)4 Text (org.apache.hadoop.io.Text) Outlink (org.apache.nutch.parse.Outlink)4 ParseStatus (org.apache.nutch.parse.ParseStatus)4 ByteArrayInputStream (java.io.ByteArrayInputStream) FileInputStream (java.io.FileInputStream) MalformedURLException (java.net.MalformedURLException) URL (java.net.URL) ArrayList (java.util.ArrayList)3 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)3 ParseText (org.apache.nutch.parse.ParseText)3 File (java.io.File)