Search in sources :

Example 6 with ParseStatus

use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

the class TestMoreIndexingFilter method testContentDispositionTitle.

/**
 * Verifies how {@code MoreIndexingFilter} derives the "title" field from a
 * Content-Disposition header: the file name is used when the document has no
 * title yet, and an existing title is left as the only value (NUTCH-1140).
 */
@Test
public void testContentDispositionTitle() throws IndexingException {
    // Parse whose metadata carries nothing but a Content-Disposition header.
    Metadata parseMeta = new Metadata();
    parseMeta.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");
    ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], parseMeta);
    ParseImpl parse = new ParseImpl("text", parseData);

    MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
    indexingFilter.setConf(NutchConfiguration.create());
    Text pageUrl = new Text("http://www.example.com/");

    // Document without a title: the file name from the header becomes the title.
    NutchDocument untitledDoc = indexingFilter.filter(new NutchDocument(), parse, pageUrl, new CrawlDatum(), new Inlinks());
    Assert.assertEquals("content-disposition not detected", "filename.ext", untitledDoc.getFieldValue("title"));

    /* NUTCH-1140: do not add second title to avoid a multi-valued title field */
    NutchDocument titledDoc = new NutchDocument();
    titledDoc.add("title", "title");
    titledDoc = indexingFilter.filter(titledDoc, parse, pageUrl, new CrawlDatum(), new Inlinks());
    Assert.assertEquals("do not add second title by content-disposition", "title", titledDoc.getFieldValue("title"));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseData(org.apache.nutch.parse.ParseData) NutchDocument(org.apache.nutch.indexer.NutchDocument) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)

Example 7 with ParseStatus

use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

the class MimeTypeIndexingFilterTest method setUp.

/**
 * Populates {@code parses} with one {@code ParseImpl} per entry of
 * {@code MIME_TYPES}; each parse carries that entry as the value of its
 * Content-Type metadata.
 */
@Before
public void setUp() throws Exception {
    int index = 0;
    while (index < MIME_TYPES.length) {
        Metadata contentMeta = new Metadata();
        contentMeta.add(Response.CONTENT_TYPE, MIME_TYPES[index]);
        ParseData data = new ParseData(new ParseStatus(), "title", new Outlink[0], contentMeta);
        parses[index] = new ParseImpl("text", data);
        index++;
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) Before(org.junit.Before)

Example 8 with ParseStatus

use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

the class ExtParser method getParse.

/**
 * Parses {@code content} by feeding its raw bytes to the external command
 * registered for its content type in {@code TYPE_PARAMS_MAP} and using the
 * command's standard output as the parse text.
 *
 * <p>An empty parse result with a failure status is returned when no command
 * is registered for the content type, when the Content-Length header
 * disagrees with the number of bytes present, when the command exits with a
 * non-zero value, or when any exception is thrown while running it.
 *
 * @param content the fetched content to parse
 * @return the parse result keyed by the content URL, or an empty parse result
 *         describing the failure
 */
public ParseResult getParse(Content content) {
    final String contentType = content.getContentType();
    final String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
    if (params == null) {
        ParseStatus noCommand = new ParseStatus(ParseStatus.FAILED, "No external command defined for contentType: " + contentType);
        return noCommand.getEmptyParseResult(content.getUrl(), getConf());
    }

    // Parameter layout as read here: [0] command, [1] timeout, [2] output encoding.
    final String command = params[0];
    final int timeout = Integer.parseInt(params[1]);
    final String encoding = params[2];
    if (LOG.isTraceEnabled()) {
        LOG.trace("Use " + command + " with timeout=" + timeout + "secs");
    }

    String text = null;
    try {
        final byte[] raw = content.getContent();

        // Refuse to parse when the declared length does not match the bytes we hold.
        final String declaredLength = content.getMetadata().get(Response.CONTENT_LENGTH);
        if (declaredLength != null && raw.length != Integer.parseInt(declaredLength)) {
            ParseStatus truncated = new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length + " bytes. Parser can't handle incomplete " + contentType + " file.");
            return truncated.getEmptyParseResult(content.getUrl(), getConf());
        }

        final ByteArrayOutputStream stdout = new ByteArrayOutputStream(BUFFER_SIZE);
        final ByteArrayOutputStream stderr = new ByteArrayOutputStream(BUFFER_SIZE / 4);

        // The content type is appended to the command line as its argument.
        final CommandRunner runner = new CommandRunner();
        runner.setCommand(command + " " + contentType);
        runner.setInputStream(new ByteArrayInputStream(raw));
        runner.setStdOutputStream(stdout);
        runner.setStdErrorStream(stderr);
        runner.setTimeout(timeout);
        runner.evaluate();

        if (runner.getExitValue() != 0) {
            ParseStatus commandFailed = new ParseStatus(ParseStatus.FAILED, "External command " + command + " failed with error: " + stderr.toString());
            return commandFailed.getEmptyParseResult(content.getUrl(), getConf());
        }
        text = stdout.toString(encoding);
    } catch (Exception e) {
        // Anything thrown above (including number-format and encoding errors)
        // is reported as a failed parse rather than propagated.
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    if (text == null) {
        text = "";
    }
    // No title is extracted from the command output; the parse gets an empty one.
    final String title = "";

    // Outlinks are extracted from the plain text produced by the command.
    final Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, content.getMetadata());
    final ParseImpl parse = new ParseImpl(text, parseData);
    return ParseResult.createParseResult(content.getUrl(), parse);
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ByteArrayOutputStream(java.io.ByteArrayOutputStream) ParseStatus(org.apache.nutch.parse.ParseStatus) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CommandRunner(org.apache.nutch.util.CommandRunner)

Example 9 with ParseStatus

use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

the class HtmlParser method getParse.

/**
 * Parses HTML {@code content} into a {@link ParseResult}.
 *
 * <p>The document is decoded with a guessed character encoding, parsed into a
 * DOM fragment, and then inspected for meta directives. Text and title are
 * extracted unless the noindex directive is set; outlinks are collected
 * unless the nofollow directive is set. A meta refresh is recorded in the
 * parse status as a redirect. The resulting parse is passed through
 * {@code htmlParseFilters}, and when the nocache directive is set every
 * filtered parse is tagged with the caching policy.
 *
 * <p>A malformed base URL or any exception raised while parsing yields an
 * empty parse result carrying the exception.
 *
 * @param content the fetched content to parse
 * @return the filtered parse result, or an empty parse result on failure
 */
public ParseResult getParse(Content content) {
    final HTMLMetaTags htmlMeta = new HTMLMetaTags();

    URL baseUrl;
    try {
        baseUrl = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
        ParseStatus badBase = new ParseStatus(e);
        return badBase.getEmptyParseResult(content.getUrl(), getConf());
    }

    // Defaults used when the corresponding extraction step is skipped.
    String pageText = "";
    String pageTitle = "";
    Outlink[] links = new Outlink[0];
    final Metadata parseMeta = new Metadata();

    // parse the content
    DocumentFragment dom;
    try {
        final byte[] rawBytes = content.getContent();
        final InputSource source = new InputSource(new ByteArrayInputStream(rawBytes));

        // Guess the encoding from detected clues plus the value sniffed from the bytes.
        final EncodingDetector encodingDetector = new EncodingDetector(conf);
        encodingDetector.autoDetectClues(content, true);
        encodingDetector.addClue(sniffCharacterEncoding(rawBytes), "sniffed");
        final String charset = encodingDetector.guessEncoding(content, defaultCharEncoding);

        parseMeta.set(Metadata.ORIGINAL_CHAR_ENCODING, charset);
        parseMeta.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, charset);
        source.setEncoding(charset);

        if (LOG.isTraceEnabled()) {
            LOG.trace("Parsing...");
        }
        dom = parse(source);
    } catch (IOException e) {
        ParseStatus ioFailure = new ParseStatus(e);
        return ioFailure.getEmptyParseResult(content.getUrl(), getConf());
    } catch (DOMException e) {
        ParseStatus domFailure = new ParseStatus(e);
        return domFailure.getEmptyParseResult(content.getUrl(), getConf());
    } catch (SAXException e) {
        ParseStatus saxFailure = new ParseStatus(e);
        return saxFailure.getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) {
        // Only this catch-all branch logs the exception.
        LOG.error("Error: ", e);
        ParseStatus otherFailure = new ParseStatus(e);
        return otherFailure.getEmptyParseResult(content.getUrl(), getConf());
    }

    // get meta directives
    HTMLMetaProcessor.getMetaTags(htmlMeta, dom, baseUrl);
    // populate Nutch metadata with HTML meta directives
    parseMeta.addAll(htmlMeta.getGeneralTags());
    if (LOG.isTraceEnabled()) {
        LOG.trace("Meta tags for " + baseUrl + ": " + htmlMeta.toString());
    }

    // check meta directives: okay to index?
    if (!htmlMeta.getNoIndex()) {
        // One buffer is reused for the text and then for the title.
        final StringBuffer buffer = new StringBuffer();

        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting text...");
        }
        utils.getText(buffer, dom);
        pageText = buffer.toString();
        buffer.setLength(0);

        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting title...");
        }
        utils.getTitle(buffer, dom);
        pageTitle = buffer.toString().trim();
    }

    // okay to follow links?
    if (!htmlMeta.getNoFollow()) {
        final ArrayList<Outlink> collected = new ArrayList<Outlink>();

        // Resolve links against the base href when one is given and valid,
        // otherwise against the content's base URL.
        URL linkBase = baseUrl;
        final String baseHref = utils.getBase(dom);
        if (baseHref != null) {
            try {
                linkBase = new URL(baseUrl, baseHref);
            } catch (MalformedURLException e) {
                linkBase = baseUrl;
            }
        }

        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting links...");
        }
        utils.getOutlinks(linkBase, collected, dom);
        links = collected.toArray(new Outlink[collected.size()]);
        if (LOG.isTraceEnabled()) {
            LOG.trace("found " + links.length + " outlinks in " + content.getUrl());
        }
    }

    // A meta refresh turns the success status into a redirect with target and delay.
    final ParseStatus parseStatus = new ParseStatus(ParseStatus.SUCCESS);
    if (htmlMeta.getRefresh()) {
        parseStatus.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
        final String refreshTarget = htmlMeta.getRefreshHref().toString();
        final String refreshDelay = Integer.toString(htmlMeta.getRefreshTime());
        parseStatus.setArgs(new String[] { refreshTarget, refreshDelay });
    }

    final ParseData data = new ParseData(parseStatus, pageTitle, links, content.getMetadata(), parseMeta);
    final ParseResult unfiltered = ParseResult.createParseResult(content.getUrl(), new ParseImpl(pageText, data));

    // run filters on parse
    final ParseResult filtered = this.htmlParseFilters.filter(content, unfiltered, htmlMeta, dom);

    // not okay to cache: mark every resulting parse with the caching policy
    if (htmlMeta.getNoCache()) {
        for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filtered) {
            final Parse parse = entry.getValue();
            parse.getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
        }
    }
    return filtered;
}
Also used : MalformedURLException(java.net.MalformedURLException) InputSource(org.xml.sax.InputSource) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) ArrayList(java.util.ArrayList) URL(java.net.URL) SAXException(org.xml.sax.SAXException) ParseStatus(org.apache.nutch.parse.ParseStatus) DOMException(org.w3c.dom.DOMException) EncodingDetector(org.apache.nutch.util.EncodingDetector) DocumentFragment(org.w3c.dom.DocumentFragment) HTMLMetaTags(org.apache.nutch.parse.HTMLMetaTags) Outlink(org.apache.nutch.parse.Outlink) ParseResult(org.apache.nutch.parse.ParseResult) IOException(java.io.IOException) DOMException(org.w3c.dom.DOMException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) SAXException(org.xml.sax.SAXException) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) Map(java.util.Map)

Example 10 with ParseStatus

use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

the class JSParseFilter method getParse.

/**
 * Parses JavaScript content: the whole script becomes the parse text, the
 * links found by {@code getJSLinks} become the outlinks, and the first line
 * of the script (capped at {@code MAX_TITLE_LEN} characters) becomes the
 * title.
 *
 * <p>A missing or blank content type is accepted. Any other content type must
 * start with {@code application/x-javascript} (compared case-insensitively);
 * otherwise an empty parse result with a failed / invalid-format status is
 * returned.
 *
 * @param c the fetched content to parse
 * @return the parse result keyed by the content URL, or an empty parse result
 *         when the content type is not JavaScript
 */
public ParseResult getParse(Content c) {
    String type = c.getContentType();
    // Locale.ROOT keeps the MIME comparison independent of the default locale
    // (e.g. the Turkish dotless-i lower-casing of "APPLICATION").
    if (type != null && !type.trim().equals("") && !type.toLowerCase(java.util.Locale.ROOT).startsWith("application/x-javascript")) {
        // FAILED is the major code and FAILED_INVALID_FORMAT the minor code,
        // matching the (major, minor, message) form used for FAILED_TRUNCATED.
        return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT, "Content not JavaScript: '" + type + "'").getEmptyParseResult(c.getUrl(), getConf());
    }
    // NOTE(review): decodes the bytes with the platform default charset — confirm
    // whether the content's declared encoding should be used instead.
    String script = new String(c.getContent());
    Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
    if (outlinks == null) {
        outlinks = new Outlink[0];
    }
    // Title? use the first line of the script (or the whole script when it has
    // no newline), truncated to MAX_TITLE_LEN characters.
    int firstLineEnd = script.indexOf('\n');
    if (firstLineEnd == -1) {
        firstLineEnd = script.length();
    }
    String title = script.substring(0, Math.min(firstLineEnd, MAX_TITLE_LEN));
    ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, c.getMetadata());
    return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl)

Aggregations

ParseStatus (org.apache.nutch.parse.ParseStatus): 25; ParseData (org.apache.nutch.parse.ParseData): 23; ParseImpl (org.apache.nutch.parse.ParseImpl): 21; Outlink (org.apache.nutch.parse.Outlink): 16; Text (org.apache.hadoop.io.Text): 15; CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 14; Inlinks (org.apache.nutch.crawl.Inlinks): 13; Test (org.junit.Test): 11; NutchDocument (org.apache.nutch.indexer.NutchDocument): 10; Metadata (org.apache.nutch.metadata.Metadata): 9; URL (java.net.URL): 8; IOException (java.io.IOException): 6; ByteArrayInputStream (java.io.ByteArrayInputStream): 5; MalformedURLException (java.net.MalformedURLException): 5; ArrayList (java.util.ArrayList): 5; Configuration (org.apache.hadoop.conf.Configuration): 5; Parse (org.apache.nutch.parse.Parse): 5; ParseText (org.apache.nutch.parse.ParseText): 5; NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 5; Inlink (org.apache.nutch.crawl.Inlink): 4