Search in sources :

Example 1 with ParseUtil

use of org.apache.nutch.parse.ParseUtil in project nutch by apache.

the class TestAny23ParseFilter method extract.

/**
 * Fetches the document at {@code urlString}, forces the given content type on
 * it, parses it, and returns the values stored in the parse metadata under
 * {@link Any23ParseFilter#ANY23_TRIPLES}.
 *
 * <p>Any exception raised while fetching or parsing is printed and turned into
 * a test failure via {@code Assert.fail}.
 *
 * @param urlString URL of the document; also printed to standard output
 * @param file not read by this method
 * @param contentType content type set on the fetched content before parsing
 * @return the metadata values for the triples key, or {@code null} if the
 *         failure handler returns normally
 */
public String[] extract(String urlString, File file, String contentType) {
    String[] triples = null;
    try {
        System.out.println(urlString);
        ProtocolFactory protocolFactory = new ProtocolFactory(conf);
        Protocol fetcher = protocolFactory.getProtocol(urlString);
        Content fetched = fetcher.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
        // Override whatever type the protocol layer reported.
        fetched.setContentType(contentType);
        ParseUtil parser = new ParseUtil(conf);
        Parse parsed = parser.parse(fetched).get(fetched.getUrl());
        triples = parsed.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES);
    } catch (Exception failure) {
        failure.printStackTrace();
        Assert.fail(failure.toString());
    }
    return triples;
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol) ParseException(org.apache.nutch.parse.ParseException) IOException(java.io.IOException)

Example 2 with ParseUtil

use of org.apache.nutch.parse.ParseUtil in project nutch by apache.

the class TestCCParseFilter method pageTest.

/**
 * Parses {@code file} as {@code text/html} under the given URL and asserts
 * that the parse metadata carries the expected Creative Commons values.
 *
 * @param file HTML file whose bytes are parsed
 * @param url URL used both as the content URL and as its base
 * @param license expected value of the {@code License-Url} metadata entry
 * @param location expected value of the {@code License-Location} metadata entry
 * @param type expected value of the {@code Work-Type} metadata entry
 * @throws Exception if reading the file or parsing the content fails
 */
public void pageTest(File file, String url, String license, String location, String type) throws Exception {
    String contentType = "text/html";
    ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
    // try-with-resources guarantees the file handle is released even when a
    // read fails part-way through.
    try (InputStream in = new FileInputStream(file)) {
        byte[] buffer = new byte[1024];
        int count;
        while ((count = in.read(buffer)) != -1) {
            out.write(buffer, 0, count);
        }
    }
    byte[] bytes = out.toByteArray();
    Configuration conf = NutchConfiguration.create();
    Content content = new Content(url, url, bytes, contentType, new Metadata(), conf);
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
    Metadata metadata = parse.getData().getParseMeta();
    Assert.assertEquals(license, metadata.get("License-Url"));
    Assert.assertEquals(location, metadata.get("License-Location"));
    Assert.assertEquals(type, metadata.get("Work-Type"));
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata)

Example 3 with ParseUtil

use of org.apache.nutch.parse.ParseUtil in project nutch by apache.

the class TestExtParser method testIt.

/**
 * Runs the {@code parse-ext} plugin against the shared test content ten times,
 * alternating between the 'cat' and 'md5sum' example content types, and checks
 * the parsed text each time. Returns without testing on any OS whose
 * {@code os.name} is not "linux".
 *
 * @throws ParseException if the external parser fails
 */
@Test
public void testIt() throws ParseException {
    // The external commands are only exercised on Linux.
    final String osName = System.getProperty("os.name");
    if (!osName.equalsIgnoreCase("linux")) {
        System.err.println("Current OS is " + osName + ".");
        System.err.println("No test is run on OS other than linux.");
        return;
    }
    Configuration conf = NutchConfiguration.create();
    // Ten rounds, two external command invocations per round.
    for (int round = 0; round < 10; round++) {
        // External parser backed by 'cat': output must equal the input text.
        content.setContentType("application/vnd.nutch.example.cat");
        ParseUtil catParser = new ParseUtil(conf);
        parse = catParser.parseByExtensionId("parse-ext", content).get(content.getUrl());
        Assert.assertEquals(expectedText, parse.getText());

        // External parser backed by 'md5sum': output must begin with the digest.
        content.setContentType("application/vnd.nutch.example.md5sum");
        ParseUtil md5Parser = new ParseUtil(conf);
        parse = md5Parser.parseByExtensionId("parse-ext", content).get(content.getUrl());
        Assert.assertTrue(parse.getText().startsWith(expectedMD5sum));
    }
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Test(org.junit.Test)

Example 4 with ParseUtil

use of org.apache.nutch.parse.ParseUtil in project nutch by apache.

the class TestRTFParser method testIt.

/**
 * Fetches the sample RTF file through the protocol layer, parses it with the
 * {@code parse-tika} plugin, and checks the extracted text, title and
 * Dublin Core subject. Currently disabled via {@code @Ignore}.
 *
 * @throws ProtocolException if the protocol lookup or fetch fails
 * @throws ParseException if parsing fails
 */
@Ignore("There seems to be an issue with line 71 e.g. text.trim()")
@Test
public void testIt() throws ProtocolException, ParseException {
    Configuration conf = NutchConfiguration.create();
    String urlString = "file:" + sampleDir + fileSeparator + rtfFile;

    ProtocolFactory protocolFactory = new ProtocolFactory(conf);
    Protocol protocol = protocolFactory.getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

    ParseUtil parser = new ParseUtil(conf);
    Parse parse = parser.parseByExtensionId("parse-tika", content).get(content.getUrl());

    // Body text, ignoring surrounding whitespace.
    String bodyText = parse.getText();
    Assert.assertEquals("The quick brown fox jumps over the lazy dog", bodyText.trim());

    // Document-level metadata.
    String docTitle = parse.getData().getTitle();
    Metadata parseMeta = parse.getData().getParseMeta();
    Assert.assertEquals("test rft document", docTitle);
    Assert.assertEquals("tests", parseMeta.get(DublinCore.SUBJECT));
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 5 with ParseUtil

use of org.apache.nutch.parse.ParseUtil in project nutch by apache.

the class ZipTextExtractor method extractText.

/**
 * Walks the entries of a zip stream, parses every non-directory entry whose
 * name contains a '.', and concatenates "name text " for each parsed entry.
 * Outlinks found in the parsed entries are appended to {@code outLinksList}.
 *
 * <p>Entries that fail with a {@code ParseException} are logged at info level
 * and skipped. The stream is not closed here.
 *
 * @param input stream positioned at the start of zip data
 * @param url URL of the archive; each entry is addressed as {@code url + "/" + entryName}
 * @param outLinksList receives a copy of every outlink found in parsed entries
 * @return the concatenated entry names and texts; empty if nothing was parsed
 * @throws IOException if the zip data cannot be read or an entry URL is malformed
 */
public String extractText(InputStream input, String url, List<Outlink> outLinksList) throws IOException {
    StringBuilder resultText = new StringBuilder();
    ZipInputStream zin = new ZipInputStream(input);
    // One detector for the whole archive instead of one per entry.
    Tika tika = new Tika();
    ZipEntry entry;
    while ((entry = zin.getNextEntry()) != null) {
        if (entry.isDirectory()) {
            continue;
        }
        byte[] b = readEntryBytes(zin);
        String fname = entry.getName();
        String newurl = url + "/" + fname;
        String base = new URL(newurl).toString();
        if (fname.lastIndexOf('.') == -1) {
            // No extension to detect a MIME type from; skip the entry.
            continue;
        }
        // Trying to resolve the Mime-Type
        String contentType = tika.detect(fname);
        try {
            Metadata metadata = new Metadata();
            // Use the number of bytes actually read: ZipEntry.getSize() may be
            // -1 when the size is not known up front.
            metadata.set(Response.CONTENT_LENGTH, Long.toString(b.length));
            metadata.set(Response.CONTENT_TYPE, contentType);
            Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
            Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
            ParseData theParseData = parse.getData();
            for (Outlink link : theParseData.getOutlinks()) {
                outLinksList.add(new Outlink(link.getToUrl(), link.getAnchor()));
            }
            resultText.append(fname).append(" ").append(parse.getText()).append(" ");
        } catch (ParseException e) {
            if (LOG.isInfoEnabled()) {
                LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
            }
        }
    }
    return resultText.toString();
}

/**
 * Reads the remaining bytes of the current zip entry. It does not trust the
 * declared entry size, which may be unknown (-1) or larger than an int.
 */
private static byte[] readEntryBytes(ZipInputStream zin) throws IOException {
    java.io.ByteArrayOutputStream sink = new java.io.ByteArrayOutputStream(8192);
    byte[] chunk = new byte[8192];
    int n;
    while ((n = zin.read(chunk, 0, chunk.length)) != -1) {
        sink.write(chunk, 0, n);
    }
    return sink.toByteArray();
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseUtil(org.apache.nutch.parse.ParseUtil) Parse(org.apache.nutch.parse.Parse) ZipEntry(java.util.zip.ZipEntry) Metadata(org.apache.nutch.metadata.Metadata) Tika(org.apache.tika.Tika) URL(java.net.URL) ZipInputStream(java.util.zip.ZipInputStream) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) ParseException(org.apache.nutch.parse.ParseException)

Aggregations

ParseUtil (org.apache.nutch.parse.ParseUtil)17 Parse (org.apache.nutch.parse.Parse)16 Content (org.apache.nutch.protocol.Content)15 Text (org.apache.hadoop.io.Text)13 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)13 Protocol (org.apache.nutch.protocol.Protocol)11 ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory)11 Configuration (org.apache.hadoop.conf.Configuration)10 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)10 Test (org.junit.Test)10 Metadata (org.apache.nutch.metadata.Metadata)4 Map (java.util.Map)2 Inlinks (org.apache.nutch.crawl.Inlinks)2 Outlink (org.apache.nutch.parse.Outlink)2 ParseData (org.apache.nutch.parse.ParseData)2 ParseException (org.apache.nutch.parse.ParseException)2 ParseResult (org.apache.nutch.parse.ParseResult)2 IOException (java.io.IOException)1 URL (java.net.URL)1 HashMap (java.util.HashMap)1