
Example 1 with Metadata

Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.

From class EncodingDetector, method main().

public static void main(String[] args) throws IOException {
    if (args.length != 1) {
        System.err.println("Usage: EncodingDetector <file>");
        System.exit(1);
    }
    Configuration conf = NutchConfiguration.create();
    EncodingDetector detector = new EncodingDetector(conf);
    // do everything as bytes; don't want any conversion
    BufferedInputStream istr = new BufferedInputStream(new FileInputStream(args[0]));
    ByteArrayOutputStream ostr = new ByteArrayOutputStream();
    byte[] bytes = new byte[1000];
    int len;
    // read() may return fewer bytes than requested before EOF, so loop until it returns -1
    while ((len = istr.read(bytes)) != -1) {
        ostr.write(bytes, 0, len);
    }
    istr.close();
    byte[] data = ostr.toByteArray();
    // make a fake Content
    Content content = new Content("", "", data, "text/html", new Metadata(), conf);
    detector.autoDetectClues(content, true);
    String encoding = detector.guessEncoding(content, conf.get("parser.character.encoding.default"));
    System.out.println("Guessed encoding: " + encoding);
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) BufferedInputStream(java.io.BufferedInputStream) Content(org.apache.nutch.protocol.Content) Metadata(org.apache.nutch.metadata.Metadata) ByteArrayOutputStream(java.io.ByteArrayOutputStream) FileInputStream(java.io.FileInputStream)
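
For readers new to the class: Metadata is Nutch's multi-valued string map used for both protocol and parse metadata. The following standalone sketch is not taken from the Nutch sources (the header names, URL and class name are illustrative); it only uses the Metadata and Content constructors already shown above.

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;

public class MetadataUsageSketch {

    public static void main(String[] args) {
        Configuration conf = NutchConfiguration.create();
        // Metadata is multi-valued: add() appends a value, get() returns the first one,
        // getValues() returns all of them.
        Metadata meta = new Metadata();
        meta.add("Content-Type", "text/html");
        meta.add("X-Example", "first");
        meta.add("X-Example", "second");
        byte[] data = "<html><body>hello</body></html>".getBytes(StandardCharsets.UTF_8);
        // Same Content constructor as in the example above:
        // url, base, content bytes, content type, metadata, configuration.
        Content content = new Content("http://example.org/", "http://example.org/",
                data, "text/html", meta, conf);
        System.out.println(content.getMetadata().get("X-Example"));              // first
        System.out.println(content.getMetadata().getValues("X-Example").length); // 2
    }
}

Note that add() appends while set() replaces all values for a name, a distinction that matters when the same header occurs more than once.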

Example 2 with Metadata

Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.

From class Any23ParseFilter, method filter().

/**
 * @see org.apache.nutch.parse.HtmlParseFilter#filter(Content, ParseResult, HTMLMetaTags, DocumentFragment)
 */
@Override
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
    String[] extractorNames = conf.getStrings(ANY_23_EXTRACTORS_CONF, "html-head-meta");
    String[] supportedContentTypes = conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html", "application/xhtml+xml");
    String contentType = content.getContentType();
    if (supportedContentTypes != null && !Arrays.asList(supportedContentTypes).contains(contentType)) {
        LOG.debug("Ignoring document at {} because it has an unsupported Content-Type {}", content.getUrl(), contentType);
        return parseResult;
    }
    Any23Parser parser;
    try {
        String htmlContent = new String(content.getContent(), Charset.forName("UTF-8"));
        parser = new Any23Parser(content.getUrl(), htmlContent, contentType, extractorNames);
    } catch (TripleHandlerException e) {
        throw new RuntimeException("Error running Any23 parser: " + e.getMessage(), e);
    }
    Set<String> triples = parser.getTriples();
    Parse parse = parseResult.get(content.getUrl());
    Metadata metadata = parse.getData().getParseMeta();
    for (String triple : triples) {
        metadata.add(ANY23_TRIPLES, triple);
    }
    return parseResult;
}
Also used : TripleHandlerException(org.apache.any23.writer.TripleHandlerException) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata)
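
The filter above stores one parse-metadata entry per triple under the ANY23_TRIPLES key. Because Metadata is multi-valued, a downstream consumer (such as the indexing filter tested in Example 3) reads them back with getValues(). A minimal helper sketch, using only the Parse/ParseData/Metadata API already shown in these examples; the class and method names are illustrative.

import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;

class ParseMetaReaderSketch {

    // Returns every value stored under the given parse-metadata key,
    // e.g. the N-Triples statements added by Any23ParseFilter above.
    static String[] readAll(Parse parse, String key) {
        Metadata parseMeta = parse.getData().getParseMeta();
        // getValues() returns all entries added for the key, in insertion order
        return parseMeta.getValues(key);
    }
}

Called as readAll(parse, Any23ParseFilter.ANY23_TRIPLES), this yields the same strings that Example 3 feeds into Any23IndexingFilter.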

Example 3 with Metadata

Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.

From class TestAny23IndexingFilter, method testAny23TriplesFields().

@Test
public void testAny23TriplesFields() throws Exception {
    Configuration conf = NutchConfiguration.create();
    Any23IndexingFilter filter = new Any23IndexingFilter();
    filter.setConf(conf);
    Assert.assertNotNull(filter);
    NutchDocument doc = new NutchDocument();
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page", new Outlink[] {}, new Metadata());
    ParseImpl parse = new ParseImpl("test page", parseData);
    String[] triples = new String[] { "<http://dbpedia.org/resource/Z\u00FCrich> <http://www.w3.org/2002/07/owl#sameAs> <http://rdf.freebase.com/ns/m.08966> .", "<http://dbpedia.org/resource/Z\u00FCrich> <http://dbpedia.org/property/yearHumidity> \"77\" .", "<http://dbpedia.org/resource/Z\u00FCrich> <http://www.w3.org/2000/01/rdf-schema#label> \"Zurique\"@pt ." };
    for (String triple : triples) {
        parse.getData().getParseMeta().add(Any23ParseFilter.ANY23_TRIPLES, triple);
    }
    try {
        doc = filter.filter(doc, parse, new Text("http://nutch.apache.org/"), new CrawlDatum(), new Inlinks());
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
    List<Object> docTriples = doc.getField(Any23IndexingFilter.STRUCTURED_DATA).getValues();
    Assert.assertEquals(triples.length, docTriples.size());
    Object triple = docTriples.get(0);
    Assert.assertTrue(triple instanceof Map<?, ?>);
    @SuppressWarnings("unchecked") Map<String, String> structuredData = (Map<String, String>) triple;
    Assert.assertEquals("<http://dbpedia.org/resource/Z\u00FCrich>", structuredData.get("node"));
    Assert.assertEquals("<http://www.w3.org/2002/07/owl#sameAs>", structuredData.get("key"));
    Assert.assertEquals("sameAs", structuredData.get("short_key"));
    Assert.assertEquals("<http://rdf.freebase.com/ns/m.08966>", structuredData.get("value"));
    triple = docTriples.get(1);
    Assert.assertTrue(triple instanceof Map<?, ?>);
    structuredData = (Map<String, String>) triple;
    Assert.assertEquals("<http://dbpedia.org/resource/Z\u00FCrich>", structuredData.get("node"));
    Assert.assertEquals("<http://dbpedia.org/property/yearHumidity>", structuredData.get("key"));
    Assert.assertEquals("yearHumidity", structuredData.get("short_key"));
    Assert.assertEquals("\"77\"", structuredData.get("value"));
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) NutchDocument(org.apache.nutch.indexer.NutchDocument) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) Map(java.util.Map) Test(org.junit.Test)
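
The assertions above show the shape Any23IndexingFilter gives each triple: a Map with the keys node, key, short_key and value, where short_key is the predicate's local name. Purely as an illustration of that shape (this is not the filter's actual implementation), the sketch below splits one N-Triples line into such a map.

import java.util.HashMap;
import java.util.Map;

class TripleToMapSketch {

    // Illustration only: naively splits "<subject> <predicate> <object> ." into the
    // node/key/short_key/value shape asserted by the test above. A real implementation
    // would use an RDF/N-Triples parser rather than string splitting.
    static Map<String, String> toMap(String triple) {
        String[] parts = triple.trim().split("\\s+", 3);
        String node = parts[0];
        String key = parts[1];
        String value = parts[2].replaceAll("\\s*\\.\\s*$", "");
        // short_key is the predicate's local name after the last '#' or '/'
        String shortKey = key.replaceAll("^.*[#/]", "").replace(">", "");
        Map<String, String> m = new HashMap<>();
        m.put("node", node);
        m.put("key", key);
        m.put("short_key", shortKey);
        m.put("value", value);
        return m;
    }
}

For the first triple in the test, this produces exactly the node, key, short_key and value strings the assertions expect.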

Example 4 with Metadata

Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.

From class CCIndexingFilter, method filter().

public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    Metadata metadata = parse.getData().getParseMeta();
    // index the license
    String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL);
    if (licenseUrl != null) {
        LOG.info("CC: indexing {} for: {}", licenseUrl, url);
        // add the entire license as cc:license=xxx
        addFeature(doc, "license=" + licenseUrl);
        // index license attributes extracted from the license URL
        addUrlFeatures(doc, licenseUrl);
    }
    // index the license location as cc:meta=xxx
    String licenseLocation = metadata.get(CreativeCommons.LICENSE_LOCATION);
    if (licenseLocation != null) {
        addFeature(doc, "meta=" + licenseLocation);
    }
    // index the work type cc:type=xxx
    String workType = metadata.get(CreativeCommons.WORK_TYPE);
    if (workType != null) {
        addFeature(doc, workType);
    }
    return doc;
}
Also used : Metadata(org.apache.nutch.metadata.Metadata)
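
addFeature and addUrlFeatures are helpers of CCIndexingFilter that the snippet does not show. As a plausible minimal sketch of addFeature only, it appends the feature string to a multi-valued document field; the field name "cc" is an assumption, not confirmed by the snippet above.

import org.apache.nutch.indexer.NutchDocument;

class CCFeatureSketch {

    // Assumption: all Creative Commons features go into one multi-valued field;
    // the field name "cc" is illustrative, not taken from the snippet above.
    private static final String FIELD = "cc";

    static void addFeature(NutchDocument doc, String feature) {
        // NutchDocument.add() appends a value, so repeated calls build a multi-valued field
        doc.add(FIELD, feature);
    }
}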

Example 5 with Metadata

Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.

From class TestCCParseFilter, method pageTest().

public void pageTest(File file, String url, String license, String location, String type) throws Exception {
    String contentType = "text/html";
    ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
    byte[] buffer = new byte[1024];
    int i;
    // try-with-resources closes the input stream even if read() throws
    try (InputStream in = new FileInputStream(file)) {
        while ((i = in.read(buffer)) != -1) {
            out.write(buffer, 0, i);
        }
    }
    byte[] bytes = out.toByteArray();
    Configuration conf = NutchConfiguration.create();
    Content content = new Content(url, url, bytes, contentType, new Metadata(), conf);
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
    Metadata metadata = parse.getData().getParseMeta();
    Assert.assertEquals(license, metadata.get("License-Url"));
    Assert.assertEquals(location, metadata.get("License-Location"));
    Assert.assertEquals(type, metadata.get("Work-Type"));
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata)
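
pageTest is a parameterised helper rather than a test itself: it parses the given file and checks the three Creative Commons parse-metadata keys. A hypothetical caller, assumed to live inside TestCCParseFilter, might look like the following; the file name, URL and expected values are illustrative and not taken from the actual test suite.

@Test
public void testCcSamplePage() throws Exception {
    // Hypothetical sample page and expected values, for illustration only.
    pageTest(new File("src/test/resources/cc-sample.html"),
             "http://example.org/cc-sample.html",
             "http://creativecommons.org/licenses/by-nc-sa/1.0", // expected License-Url
             "a",                                                // expected License-Location
             "text");                                            // expected Work-Type
}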

Aggregations

Metadata (org.apache.nutch.metadata.Metadata): 42 usages
Configuration (org.apache.hadoop.conf.Configuration): 20 usages
NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 20 usages
ParseData (org.apache.nutch.parse.ParseData): 19 usages
Content (org.apache.nutch.protocol.Content): 18 usages
Test (org.junit.Test): 17 usages
Text (org.apache.hadoop.io.Text): 16 usages
Parse (org.apache.nutch.parse.Parse): 16 usages
ParseImpl (org.apache.nutch.parse.ParseImpl): 15 usages
CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 14 usages
Inlinks (org.apache.nutch.crawl.Inlinks): 11 usages
Outlink (org.apache.nutch.parse.Outlink): 10 usages
ParseStatus (org.apache.nutch.parse.ParseStatus): 9 usages
NutchDocument (org.apache.nutch.indexer.NutchDocument): 7 usages
ParseResult (org.apache.nutch.parse.ParseResult): 7 usages
FileInputStream (java.io.FileInputStream): 5 usages
IOException (java.io.IOException): 5 usages
File (java.io.File): 4 usages
ArrayList (java.util.ArrayList): 4 usages
ParseUtil (org.apache.nutch.parse.ParseUtil): 4 usages