Example 16 with ParseStatus

Use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

Class TestLinksIndexingFilter, method testNoFilterInlinks:

@Test
public void testNoFilterInlinks() throws Exception {
    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false");
    filter.setConf(conf);
    Inlinks inlinks = new Inlinks();
    inlinks.add(new Inlink("http://www.test.com", "test"));
    inlinks.add(new Inlink("http://www.example.com", "example"));
    NutchDocument doc = filter.filter(new NutchDocument(),
        new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)),
        new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
    Assert.assertEquals("All inlinks must be indexed even those from the same host",
        inlinks.size(), doc.getField("inlinks").getValues().size());
}
Also used: ParseStatus (org.apache.nutch.parse.ParseStatus), NutchDocument (org.apache.nutch.indexer.NutchDocument), ParseData (org.apache.nutch.parse.ParseData), ParseImpl (org.apache.nutch.parse.ParseImpl), CrawlDatum (org.apache.nutch.crawl.CrawlDatum), Text (org.apache.hadoop.io.Text), Inlinks (org.apache.nutch.crawl.Inlinks), Inlink (org.apache.nutch.crawl.Inlink), Test (org.junit.Test)
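
The conf, filter, and metadata fields used in this test come from the test class fixture, which is not part of this snippet. A minimal sketch of how such a fixture could be initialized follows; the class name and setup details are assumptions for illustration, not the exact code of TestLinksIndexingFilter, and the LinksIndexingFilter import assumes the index-links plugin package.

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.indexer.links.LinksIndexingFilter;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Before;

public class LinksIndexingFilterFixtureSketch {

    private Configuration conf;
    private LinksIndexingFilter filter;
    private Metadata metadata;

    @Before
    public void setUp() {
        // Default Nutch configuration; individual tests override link-related properties.
        conf = NutchConfiguration.create();
        // Filter under test.
        filter = new LinksIndexingFilter();
        // Empty parse metadata is enough for these link-indexing tests.
        metadata = new Metadata();
    }
}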

Example 17 with ParseStatus

Use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

Class TestLinksIndexingFilter, method testNoFilterOutlinks:

@Test
public void testNoFilterOutlinks() throws Exception {
    filter.setConf(conf);
    Outlink[] outlinks = generateOutlinks();
    NutchDocument doc = filter.filter(new NutchDocument(),
        new ParseImpl("text", new ParseData(new ParseStatus(), "title", outlinks, metadata)),
        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
    Assert.assertEquals("All outlinks must be indexed even those from the same host",
        outlinks.length, doc.getField("outlinks").getValues().size());
}
Also used: Outlink (org.apache.nutch.parse.Outlink), ParseStatus (org.apache.nutch.parse.ParseStatus), NutchDocument (org.apache.nutch.indexer.NutchDocument), ParseData (org.apache.nutch.parse.ParseData), ParseImpl (org.apache.nutch.parse.ParseImpl), CrawlDatum (org.apache.nutch.crawl.CrawlDatum), Text (org.apache.hadoop.io.Text), Inlinks (org.apache.nutch.crawl.Inlinks), Test (org.junit.Test)
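
generateOutlinks() is a helper defined in the same test class and not shown in this listing. A plausible sketch of such a helper is below; the URLs, anchor texts, and array size are assumptions chosen only to illustrate the shape of the data the tests expect, with Outlink and MalformedURLException being the org.apache.nutch.parse and java.net classes already referenced above.

// Hypothetical sketch of the generateOutlinks() helper: it builds a small Outlink
// array, optionally forcing every outlink onto the same host.
private Outlink[] generateOutlinks() throws MalformedURLException {
    return generateOutlinks(false);
}

private Outlink[] generateOutlinks(boolean sameHost) throws MalformedURLException {
    Outlink[] outlinks = new Outlink[2];
    outlinks[0] = new Outlink("http://www.test.com/", "test");
    outlinks[1] = sameHost
        ? new Outlink("http://www.test.com/other-page", "test")
        : new Outlink("http://www.example.com/", "example");
    return outlinks;
}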

Example 18 with ParseStatus

Use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

Class TestLinksIndexingFilter, method testIndexOnlyHostPart:

@Test
public void testIndexOnlyHostPart() throws Exception {
    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
    filter.setConf(conf);
    Outlink[] outlinks = generateOutlinks(true);
    Inlinks inlinks = new Inlinks();
    inlinks.add(new Inlink("http://www.test.com/one-awesome-page", "test"));
    inlinks.add(new Inlink("http://www.test.com/other-awesome-page", "test"));
    inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example", "example"));
    NutchDocument doc = filter.filter(new NutchDocument(),
        new ParseImpl("text", new ParseData(new ParseStatus(), "title", outlinks, metadata)),
        new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
    NutchField docOutlinks = doc.getField("outlinks");
    Assert.assertEquals("Only the host portion of the outlink URL must be indexed", new URL("http://www.test.com").getHost(), docOutlinks.getValues().get(0));
    Assert.assertEquals("The inlinks coming from the same host must count only once", 1, doc.getField("inlinks").getValues().size());
    Assert.assertEquals("Only the host portion of the inlinks URL must be indexed", new URL("http://www.test.com").getHost(), doc.getFieldValue("inlinks"));
}
Also used: Outlink (org.apache.nutch.parse.Outlink), ParseStatus (org.apache.nutch.parse.ParseStatus), NutchField (org.apache.nutch.indexer.NutchField), NutchDocument (org.apache.nutch.indexer.NutchDocument), ParseData (org.apache.nutch.parse.ParseData), ParseImpl (org.apache.nutch.parse.ParseImpl), CrawlDatum (org.apache.nutch.crawl.CrawlDatum), Text (org.apache.hadoop.io.Text), Inlinks (org.apache.nutch.crawl.Inlinks), Inlink (org.apache.nutch.crawl.Inlink), URL (java.net.URL), Test (org.junit.Test)
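
The assertions above rely on two behaviors: with LINKS_ONLY_HOSTS enabled each link is reduced to its host before indexing, and with LINKS_INLINKS_HOST enabled inlinks from the document's own host are dropped. The standalone sketch below illustrates that combination using java.net.URL; it is an illustration of the expected behavior, not the filter's actual implementation.

import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedHashSet;
import java.util.Set;

public class HostOnlyInlinksSketch {
    public static void main(String[] args) throws MalformedURLException {
        String docUrl = "http://www.example.com/";
        String[] inlinks = {
            "http://www.test.com/one-awesome-page",
            "http://www.test.com/other-awesome-page",
            "http://www.example.com/my-first-awesome-example"
        };
        String docHost = new URL(docUrl).getHost();
        Set<String> indexed = new LinkedHashSet<>();
        for (String link : inlinks) {
            String host = new URL(link).getHost();
            // Drop inlinks coming from the document's own host, then keep only the
            // host portion; the set collapses the two www.test.com inlinks into one.
            if (!host.equals(docHost)) {
                indexed.add(host);
            }
        }
        System.out.println(indexed);   // [www.test.com] -> exactly one indexed inlink value
    }
}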

Example 19 with ParseStatus

Use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

Class MimeTypeIndexingFilter, method main:

/**
 * Main method for invoking this tool.
 *
 * @param args command-line arguments; a rules file must be given via -rules
 * @throws IOException if reading MIME types from standard input fails
 * @throws IndexingException if the indexing filter fails while checking a document
 */
public static void main(String[] args) throws IOException, IndexingException {
    Option helpOpt = new Option("h", "help", false, "show this help message");
    Option rulesOpt = OptionBuilder.withArgName("file").hasArg()
        .withDescription("Rules file to be used in the tests relative to the conf directory")
        .isRequired().create("rules");
    Options options = new Options();
    options.addOption(helpOpt).addOption(rulesOpt);
    CommandLineParser parser = new GnuParser();
    HelpFormatter formatter = new HelpFormatter();
    String rulesFile;
    try {
        CommandLine line = parser.parse(options, args);
        if (line.hasOption("help") || !line.hasOption("rules")) {
            formatter.printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter", options, true);
            return;
        }
        rulesFile = line.getOptionValue("rules");
    } catch (UnrecognizedOptionException e) {
        formatter.printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter", options, true);
        return;
    } catch (Exception e) {
        LOG.error(StringUtils.stringifyException(e));
        e.printStackTrace();
        return;
    }
    MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
    Configuration conf = NutchConfiguration.create();
    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile);
    filter.setConf(conf);
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    String line;
    while ((line = in.readLine()) != null && !line.isEmpty()) {
        Metadata metadata = new Metadata();
        metadata.set(Response.CONTENT_TYPE, line);
        ParseImpl parse = new ParseImpl("text",
            new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
        NutchDocument doc = filter.filter(new NutchDocument(), parse,
            new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
        // A non-null document means the MIME type passed the filter ('+');
        // null means it was rejected ('-').
        if (doc != null) {
            System.out.print("+ ");
            System.out.println(line);
        } else {
            System.out.print("- ");
            System.out.println(line);
        }
    }
}
Also used: Outlink (org.apache.nutch.parse.Outlink), Options (org.apache.commons.cli.Options), Configuration (org.apache.hadoop.conf.Configuration), NutchConfiguration (org.apache.nutch.util.NutchConfiguration), InputStreamReader (java.io.InputStreamReader), NutchDocument (org.apache.nutch.indexer.NutchDocument), GnuParser (org.apache.commons.cli.GnuParser), Metadata (org.apache.nutch.metadata.Metadata), CrawlDatum (org.apache.nutch.crawl.CrawlDatum), Text (org.apache.hadoop.io.Text), Inlinks (org.apache.nutch.crawl.Inlinks), UnrecognizedOptionException (org.apache.commons.cli.UnrecognizedOptionException), IOException (java.io.IOException), IndexingException (org.apache.nutch.indexer.IndexingException), HelpFormatter (org.apache.commons.cli.HelpFormatter), ParseStatus (org.apache.nutch.parse.ParseStatus), CommandLine (org.apache.commons.cli.CommandLine), ParseData (org.apache.nutch.parse.ParseData), BufferedReader (java.io.BufferedReader), ParseImpl (org.apache.nutch.parse.ParseImpl), Option (org.apache.commons.cli.Option), CommandLineParser (org.apache.commons.cli.CommandLineParser)
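
The tool reads MIME types from standard input, one per line, and echoes each line prefixed with '+' if a document with that content type would be kept or '-' if it would be filtered out. The same check can be made programmatically; the sketch below reuses only calls that appear in the main method above, the rules file name is a placeholder, and the "Content-Type" literal stands in for the Response.CONTENT_TYPE constant used in the original code.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.filter.MimeTypeIndexingFilter;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.util.NutchConfiguration;

public class MimeTypeCheckSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        // Placeholder rules file, resolved relative to the conf directory.
        conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "mimetype-rules.txt");
        MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
        filter.setConf(conf);

        Metadata metadata = new Metadata();
        // Same header the tool sets via Response.CONTENT_TYPE.
        metadata.set("Content-Type", "application/pdf");
        ParseImpl parse = new ParseImpl("text",
            new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
        NutchDocument doc = filter.filter(new NutchDocument(), parse,
            new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
        System.out.println(doc != null ? "kept" : "filtered out");
    }
}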

Example 20 with ParseStatus

Use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

Class SmallStack (SWF parser plugin), method getParse:

@Override
public ParseResult getParse(Content content) {
    String text = null;
    Vector<Outlink> outlinks = new Vector<>();
    try {
        byte[] raw = content.getContent();
        String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
        if (contentLength != null && raw.length != Integer.parseInt(contentLength)) {
            return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                "Content truncated at " + raw.length + " bytes. Parser can't handle incomplete files.")
                    .getEmptyParseResult(content.getUrl(), getConf());
        }
        ExtractText extractor = new ExtractText();
        // TagParser implements SWFTags and drives a SWFTagTypes interface
        TagParser parser = new TagParser(extractor);
        // use this instead to debug the file
        // TagParser parser = new TagParser( new SWFTagDumper(true, true) );
        // SWFReader reads an input file and drives a SWFTags interface
        SWFReader reader = new SWFReader(parser, new InStream(raw));
        // read the input SWF file and pass it through the interface pipeline
        reader.readFile();
        text = extractor.getText();
        String atext = extractor.getActionText();
        if (atext != null && atext.length() > 0)
            text += "\n--------\n" + atext;
        // harvest potential outlinks
        String[] links = extractor.getUrls();
        for (int i = 0; i < links.length; i++) {
            Outlink out = new Outlink(links[i], "");
            outlinks.add(out);
        }
        Outlink[] olinks = OutlinkExtractor.getOutlinks(text, conf);
        if (olinks != null) {
            for (int i = 0; i < olinks.length; i++) {
                outlinks.add(olinks[i]);
            }
        }
    } catch (Exception e) {
        // run time exception
        LOG.error("Error, runtime exception: ", e);
        return new ParseStatus(ParseStatus.FAILED,
            "Can't be handled as SWF document. " + e)
                .getEmptyParseResult(content.getUrl(), getConf());
    }
    if (text == null)
        text = "";
    Outlink[] links = (Outlink[]) outlinks.toArray(new Outlink[outlinks.size()]);
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links, content.getMetadata());
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
}
Also used: Outlink (org.apache.nutch.parse.Outlink), TagParser (com.anotherbigidea.flash.readers.TagParser), IOException (java.io.IOException), ParseStatus (org.apache.nutch.parse.ParseStatus), SWFReader (com.anotherbigidea.flash.readers.SWFReader), ParseData (org.apache.nutch.parse.ParseData), InStream (com.anotherbigidea.io.InStream), ParseImpl (org.apache.nutch.parse.ParseImpl), Vector (java.util.Vector)
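
A caller typically unwraps the ParseResult and inspects the ParseStatus before using the extracted text and outlinks. The sketch below shows one way to do that; it assumes the usual Nutch 1.x accessors (ParseResult.get, Parse.getData, ParseData.getStatus, ParseStatus.isSuccess and getMessage) and takes the Parser instance and fetched Content as arguments rather than constructing them here.

import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;

public class ParseResultSketch {
    // 'parser' would be the SWF parser shown above; 'content' the fetched SWF document.
    static void report(Parser parser, Content content) {
        ParseResult result = parser.getParse(content);
        Parse parse = result.get(content.getUrl());
        ParseStatus status = parse.getData().getStatus();
        if (status.isSuccess()) {
            System.out.println("text length: " + parse.getText().length());
            System.out.println("outlinks: " + parse.getData().getOutlinks().length);
        } else {
            System.out.println("parse failed: " + status.getMessage());
        }
    }
}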

Aggregations

ParseStatus (org.apache.nutch.parse.ParseStatus): 25
ParseData (org.apache.nutch.parse.ParseData): 23
ParseImpl (org.apache.nutch.parse.ParseImpl): 21
Outlink (org.apache.nutch.parse.Outlink): 16
Text (org.apache.hadoop.io.Text): 15
CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 14
Inlinks (org.apache.nutch.crawl.Inlinks): 13
Test (org.junit.Test): 11
NutchDocument (org.apache.nutch.indexer.NutchDocument): 10
Metadata (org.apache.nutch.metadata.Metadata): 9
URL (java.net.URL): 8
IOException (java.io.IOException): 6
ByteArrayInputStream (java.io.ByteArrayInputStream): 5
MalformedURLException (java.net.MalformedURLException): 5
ArrayList (java.util.ArrayList): 5
Configuration (org.apache.hadoop.conf.Configuration): 5
Parse (org.apache.nutch.parse.Parse): 5
ParseText (org.apache.nutch.parse.ParseText): 5
NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 5
Inlink (org.apache.nutch.crawl.Inlink): 4