
Example 21 with Metadata

Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.

The class ZipTextExtractor, method extractText:

public String extractText(InputStream input, String url, List<Outlink> outLinksList) throws IOException {
    String resultText = "";
    ZipInputStream zin = new ZipInputStream(input);
    ZipEntry entry;
    while ((entry = zin.getNextEntry()) != null) {
        if (!entry.isDirectory()) {
            // Read the entry's uncompressed bytes into a buffer sized from the
            // entry header; the zip stream is read one byte at a time.
            int size = (int) entry.getSize();
            byte[] b = new byte[size];
            for (int x = 0; x < size; x++) {
                int err = zin.read();
                if (err != -1) {
                    b[x] = (byte) err;
                }
            }
            String newurl = url + "/";
            String fname = entry.getName();
            newurl += fname;
            URL aURL = new URL(newurl);
            String base = aURL.toString();
            int i = fname.lastIndexOf('.');
            if (i != -1) {
                // Trying to resolve the Mime-Type
                Tika tika = new Tika();
                String contentType = tika.detect(fname);
                try {
                    // Wrap the entry bytes in a Content object and run it through
                    // Nutch's ParseUtil to obtain the parsed text and outlinks.
                    Metadata metadata = new Metadata();
                    metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize()));
                    metadata.set(Response.CONTENT_TYPE, contentType);
                    Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
                    Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
                    ParseData theParseData = parse.getData();
                    Outlink[] theOutlinks = theParseData.getOutlinks();
                    for (int count = 0; count < theOutlinks.length; count++) {
                        outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor()));
                    }
                    resultText += entry.getName() + " " + parse.getText() + " ";
                } catch (ParseException e) {
                    if (LOG.isInfoEnabled()) {
                        LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
                    }
                }
            }
        }
    }
    return resultText;
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseUtil(org.apache.nutch.parse.ParseUtil) Parse(org.apache.nutch.parse.Parse) ZipEntry(java.util.zip.ZipEntry) Metadata(org.apache.nutch.metadata.Metadata) Tika(org.apache.tika.Tika) URL(java.net.URL) ZipInputStream(java.util.zip.ZipInputStream) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) ParseException(org.apache.nutch.parse.ParseException)
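
For orientation, here is a minimal, hypothetical driver showing how extractText might be called from user code. The class name ZipTextExtractorDemo, the sample zip file name, the example URL, and the assumption that ZipTextExtractor is constructed from a Nutch Configuration are all illustrative and not taken from the snippet above.

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
// Assumed package of the parse-zip plugin's extractor.
import org.apache.nutch.parse.zip.ZipTextExtractor;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.util.NutchConfiguration;

public class ZipTextExtractorDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        // Assumption: the extractor takes a Configuration in its constructor.
        ZipTextExtractor extractor = new ZipTextExtractor(conf);
        List<Outlink> outlinks = new ArrayList<>();
        try (InputStream in = new FileInputStream("sample.zip")) {
            String text = extractor.extractText(in, "http://example.com/sample.zip", outlinks);
            System.out.println("Extracted text: " + text);
            System.out.println("Outlinks collected: " + outlinks.size());
        }
    }
}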

Example 22 with Metadata

Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.

The class TestRegexParseFilter, method testPositiveFilter:

public void testPositiveFilter() throws Exception {
    Configuration conf = NutchConfiguration.create();
    String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
    RegexParseFilter filter = new RegexParseFilter(file);
    filter.setConf(conf);
    String url = "http://nutch.apache.org/";
    String html = "<body><html><h1>nutch</h1><p>this is the extracted text blablabla</p></body></html>";
    Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf);
    Parse parse = new ParseImpl("nutch this is the extracted text blablabla", new ParseData());
    ParseResult result = ParseResult.createParseResult(url, parse);
    result = filter.filter(content, result, null, null);
    Metadata meta = parse.getData().getParseMeta();
    assertEquals("true", meta.get("first"));
    assertEquals("true", meta.get("second"));
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseResult(org.apache.nutch.parse.ParseResult) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl)
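
The assertions above read values that RegexParseFilter stored in the parse metadata. The sketch below shows the general pattern under stated assumptions: match a regular expression against the parse text and record the outcome in the ParseData's parse Metadata. The class name, rule name, and pattern are placeholders, not the plugin's actual internals, which are driven by regex-parsefilter.txt.

import java.util.regex.Pattern;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;

public class RegexMetadataSketch {
    // Placeholder rule name and pattern; the real plugin reads its rules
    // from the configured regex-parsefilter.txt file.
    private static final String RULE_NAME = "first";
    private static final Pattern PATTERN = Pattern.compile("blablabla");

    public static void apply(Parse parse) {
        Metadata parseMeta = parse.getData().getParseMeta();
        boolean matched = PATTERN.matcher(parse.getText()).find();
        // The test above asserts on exactly this kind of "true"/"false" entry.
        parseMeta.set(RULE_NAME, Boolean.toString(matched));
    }
}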

Example 23 with Metadata

Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.

The class TestRegexParseFilter, method testNegativeFilter:

public void testNegativeFilter() throws Exception {
    Configuration conf = NutchConfiguration.create();
    String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
    RegexParseFilter filter = new RegexParseFilter(file);
    filter.setConf(conf);
    String url = "http://nutch.apache.org/";
    String html = "<body><html><h2>nutch</h2><p>this is the extracted text no bla</p></body></html>";
    Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf);
    Parse parse = new ParseImpl("nutch this is the extracted text bla", new ParseData());
    ParseResult result = ParseResult.createParseResult(url, parse);
    result = filter.filter(content, result, null, null);
    Metadata meta = parse.getData().getParseMeta();
    assertEquals("false", meta.get("first"));
    assertEquals("false", meta.get("second"));
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseResult(org.apache.nutch.parse.ParseResult) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl)

Example 24 with Metadata

Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.

The class TestFetcher, method testFetch:

@Test
public void testFetch() throws IOException, ClassNotFoundException, InterruptedException {
    // generate seedlist
    ArrayList<String> urls = new ArrayList<String>();
    addUrl(urls, "index.html");
    addUrl(urls, "pagea.html");
    addUrl(urls, "pageb.html");
    addUrl(urls, "dup_of_pagea.html");
    addUrl(urls, "nested_spider_trap.html");
    addUrl(urls, "exception.html");
    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
    // inject
    Injector injector = new Injector(conf);
    injector.inject(crawldbPath, urlPath);
    // generate
    Generator g = new Generator(conf);
    Path[] generatedSegment = g.generate(crawldbPath, segmentsPath, 1, Long.MAX_VALUE, Long.MAX_VALUE, false, false);
    long time = System.currentTimeMillis();
    // fetch
    Fetcher fetcher = new Fetcher(conf);
    // Set fetcher.parse to true
    conf.setBoolean("fetcher.parse", true);
    fetcher.fetch(generatedSegment[0], 1);
    time = System.currentTimeMillis() - time;
    // verify politeness, time taken should be more than (num_of_pages +1)*delay
    int minimumTime = (int) ((urls.size() + 1) * 1000 * conf.getFloat("fetcher.server.delay", 5));
    Assert.assertTrue(time > minimumTime);
    // verify content
    Path content = new Path(new Path(generatedSegment[0], Content.DIR_NAME), "part-r-00000/data");
    @SuppressWarnings("resource") SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(content));
    ArrayList<String> handledurls = new ArrayList<String>();
    READ_CONTENT: do {
        Text key = new Text();
        Content value = new Content();
        if (!reader.next(key, value))
            break READ_CONTENT;
        String contentString = new String(value.getContent());
        if (contentString.indexOf("Nutch fetcher test page") != -1) {
            handledurls.add(key.toString());
        }
    } while (true);
    reader.close();
    Collections.sort(urls);
    Collections.sort(handledurls);
    // verify that enough pages were handled
    Assert.assertEquals(urls.size(), handledurls.size());
    // verify that correct pages were handled
    Assert.assertTrue(handledurls.containsAll(urls));
    Assert.assertTrue(urls.containsAll(handledurls));
    handledurls.clear();
    // verify parse data
    Path parseData = new Path(new Path(generatedSegment[0], ParseData.DIR_NAME), "part-r-00000/data");
    reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(parseData));
    READ_PARSE_DATA: do {
        Text key = new Text();
        ParseData value = new ParseData();
        if (!reader.next(key, value))
            break READ_PARSE_DATA;
        // make sure each record contains the "nutch.segment.name" and
        // "nutch.content.digest" keys in its content metadata
        Metadata contentMeta = value.getContentMeta();
        if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
            handledurls.add(key.toString());
        }
    } while (true);
    reader.close();
    Collections.sort(handledurls);
    Assert.assertEquals(urls.size(), handledurls.size());
    Assert.assertTrue(handledurls.containsAll(urls));
    Assert.assertTrue(urls.containsAll(handledurls));
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) Metadata(org.apache.nutch.metadata.Metadata) Text(org.apache.hadoop.io.Text) SequenceFile(org.apache.hadoop.io.SequenceFile) ParseData(org.apache.nutch.parse.ParseData) Injector(org.apache.nutch.crawl.Injector) Content(org.apache.nutch.protocol.Content) Generator(org.apache.nutch.crawl.Generator) Test(org.junit.Test)
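
The last loop counts a record only when its content metadata carries both the segment name and the content digest. Below is a standalone sketch of that check using the same Metadata API and the Nutch constants referenced in the test; the constants interface is assumed to live in org.apache.nutch.metadata, and the literal values are placeholders.

import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;

public class ContentMetaCheckSketch {
    public static void main(String[] args) {
        Metadata contentMeta = new Metadata();
        // In a real segment these values are written during fetching/parsing;
        // the literals here are placeholders.
        contentMeta.set(Nutch.SEGMENT_NAME_KEY, "20240101000000");
        contentMeta.set(Nutch.SIGNATURE_KEY, "0123456789abcdef");
        boolean handled = contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null
                && contentMeta.get(Nutch.SIGNATURE_KEY) != null;
        System.out.println("Counted as handled: " + handled);
    }
}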

Example 25 with Metadata

Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.

The class TestIndexingFilters, method testNonExistingIndexingFilter:

/**
 * Test behaviour when defined filter does not exist.
 *
 * @throws IndexingException
 */
@Test
public void testNonExistingIndexingFilter() throws IndexingException {
    Configuration conf = NutchConfiguration.create();
    conf.addResource("nutch-default.xml");
    conf.addResource("crawl-tests.xml");
    String class1 = "NonExistingFilter";
    String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
    IndexingFilters filters = new IndexingFilters(conf);
    filters.filter(new NutchDocument(), new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
}
Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)
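
For comparison, here is a sketch that runs only the filter that does exist, BasicIndexingFilter, over the same kind of synthetic document. It mirrors the test's setup; the class name SingleFilterSketch and the sample text, title, and URL are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingFilters;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.util.NutchConfiguration;

public class SingleFilterSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        // Register only the filter known to be on the classpath.
        conf.set(IndexingFilters.INDEXINGFILTER_ORDER,
                "org.apache.nutch.indexer.basic.BasicIndexingFilter");
        IndexingFilters filters = new IndexingFilters(conf);
        // An empty Metadata stands in for content metadata, as in the test above.
        NutchDocument doc = filters.filter(new NutchDocument(),
                new ParseImpl("some text",
                        new ParseData(new ParseStatus(), "a title", new Outlink[0], new Metadata())),
                new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
        System.out.println(doc);
    }
}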

Aggregations

Metadata (org.apache.nutch.metadata.Metadata): 42
Configuration (org.apache.hadoop.conf.Configuration): 20
NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 20
ParseData (org.apache.nutch.parse.ParseData): 19
Content (org.apache.nutch.protocol.Content): 18
Test (org.junit.Test): 17
Text (org.apache.hadoop.io.Text): 16
Parse (org.apache.nutch.parse.Parse): 16
ParseImpl (org.apache.nutch.parse.ParseImpl): 15
CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 14
Inlinks (org.apache.nutch.crawl.Inlinks): 11
Outlink (org.apache.nutch.parse.Outlink): 10
ParseStatus (org.apache.nutch.parse.ParseStatus): 9
NutchDocument (org.apache.nutch.indexer.NutchDocument): 7
ParseResult (org.apache.nutch.parse.ParseResult): 7
FileInputStream (java.io.FileInputStream): 5
IOException (java.io.IOException): 5
File (java.io.File): 4
ArrayList (java.util.ArrayList): 4
ParseUtil (org.apache.nutch.parse.ParseUtil): 4