use of org.apache.nutch.metadata.Metadata in project nutch by apache.
the class EncodingDetector method main.
/**
 * Command-line entry point: reads the file named by the single argument as
 * raw bytes, wraps those bytes in a placeholder {@code Content} declared as
 * {@code text/html}, and prints the encoding guessed by an
 * {@code EncodingDetector}.
 *
 * <p>If the argument count is not exactly one, a usage line is written to
 * standard error and the JVM exits with status 1.
 *
 * @param args exactly one element: the path of the file to inspect
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
  if (args.length != 1) {
    System.err.println("Usage: EncodingDetector <file>");
    System.exit(1);
  }
  Configuration conf = NutchConfiguration.create();
  // Reuse the configuration created above rather than loading a second one.
  EncodingDetector detector = new EncodingDetector(conf);

  // do everything as bytes; don't want any conversion
  ByteArrayOutputStream ostr = new ByteArrayOutputStream();
  try (BufferedInputStream istr = new BufferedInputStream(new FileInputStream(args[0]))) {
    byte[] bytes = new byte[1000];
    int len;
    // read() may legitimately return fewer bytes than requested before the
    // end of the stream; only -1 marks end-of-file.
    while ((len = istr.read(bytes)) != -1) {
      ostr.write(bytes, 0, len);
    }
  }
  byte[] data = ostr.toByteArray();

  // make a fake Content
  Content content = new Content("", "", data, "text/html", new Metadata(), conf);
  detector.autoDetectClues(content, true);
  // The configured "parser.character.encoding.default" value is handed to
  // guessEncoding as its second argument (presumably the fallback encoding).
  String encoding = detector.guessEncoding(content, conf.get("parser.character.encoding.default"));
  System.out.println("Guessed encoding: " + encoding);
}
use of org.apache.nutch.metadata.Metadata in project nutch by apache.
the class Any23ParseFilter method filter.
/**
 * Runs an {@code Any23Parser} over the document and records every triple it
 * produces in the parse metadata under the {@code ANY23_TRIPLES} key.
 *
 * <p>The extractor names and the supported content types are read from the
 * configuration. A document whose content type is not among the supported
 * ones is skipped and {@code parseResult} is returned untouched. The raw
 * content bytes are decoded as UTF-8 before being handed to the parser.
 *
 * @param content the fetched content; supplies URL, content type and bytes
 * @param parseResult the parse result whose entry for the content URL
 *          receives the extracted triples
 * @param metaTags not used by this filter
 * @param doc not used by this filter
 * @return the same {@code parseResult} instance that was passed in
 * @throws RuntimeException if creating the parser raises a
 *           {@code TripleHandlerException}; the original exception is
 *           attached as the cause
 * @see org.apache.nutch.parse.HtmlParseFilter#filter(Content, ParseResult, HTMLMetaTags, DocumentFragment)
 */
@Override
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
  String[] extractorNames = conf.getStrings(ANY_23_EXTRACTORS_CONF, "html-head-meta");
  String[] supportedContentTypes = conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html", "application/xhtml+xml");
  String contentType = content.getContentType();
  if (supportedContentTypes != null && !Arrays.asList(supportedContentTypes).contains(contentType)) {
    LOG.debug("Ignoring document at {} because it has an unsupported Content-Type {}", content.getUrl(), contentType);
    return parseResult;
  }

  Any23Parser parser;
  try {
    String htmlContent = new String(content.getContent(), Charset.forName("UTF-8"));
    parser = new Any23Parser(content.getUrl(), htmlContent, contentType, extractorNames);
  } catch (TripleHandlerException e) {
    // Keep the original exception as the cause so its stack trace is not lost.
    throw new RuntimeException("Error running Any23 parser: " + e.getMessage(), e);
  }

  Set<String> triples = parser.getTriples();
  Parse parse = parseResult.get(content.getUrl());
  Metadata metadata = parse.getData().getParseMeta();
  for (String triple : triples) {
    metadata.add(ANY23_TRIPLES, triple);
  }
  return parseResult;
}
use of org.apache.nutch.metadata.Metadata in project nutch by apache.
the class TestAny23IndexingFilter method testAny23TriplesFields.
/**
 * Feeds three triples into the parse metadata under
 * {@code Any23ParseFilter.ANY23_TRIPLES}, runs {@code Any23IndexingFilter}
 * over them, and checks that the {@code STRUCTURED_DATA} field of the
 * resulting document holds one map per triple, verifying the
 * {@code node}/{@code key}/{@code short_key}/{@code value} entries of the
 * first two.
 *
 * @throws Exception if the indexing filter fails; the exception propagates so
 *           the test report keeps the full stack trace
 */
@Test
public void testAny23TriplesFields() throws Exception {
  Configuration conf = NutchConfiguration.create();
  Any23IndexingFilter filter = new Any23IndexingFilter();
  filter.setConf(conf);

  ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page", new Outlink[] {}, new Metadata());
  ParseImpl parse = new ParseImpl("test page", parseData);
  String[] triples = new String[] { "<http://dbpedia.org/resource/Z\u00FCrich> <http://www.w3.org/2002/07/owl#sameAs> <http://rdf.freebase.com/ns/m.08966> .", "<http://dbpedia.org/resource/Z\u00FCrich> <http://dbpedia.org/property/yearHumidity> \"77\" .", "<http://dbpedia.org/resource/Z\u00FCrich> <http://www.w3.org/2000/01/rdf-schema#label> \"Zurique\"@pt ." };
  for (String triple : triples) {
    parse.getData().getParseMeta().add(Any23ParseFilter.ANY23_TRIPLES, triple);
  }

  NutchDocument doc = filter.filter(new NutchDocument(), parse, new Text("http://nutch.apache.org/"), new CrawlDatum(), new Inlinks());

  // Assert.assertEquals takes (expected, actual); keeping that order makes
  // failure messages read correctly.
  List<Object> docTriples = doc.getField(Any23IndexingFilter.STRUCTURED_DATA).getValues();
  Assert.assertEquals(triples.length, docTriples.size());

  Object firstTriple = docTriples.get(0);
  Assert.assertTrue(firstTriple instanceof Map<?, ?>);
  @SuppressWarnings("unchecked")
  Map<String, String> first = (Map<String, String>) firstTriple;
  Assert.assertEquals("<http://dbpedia.org/resource/Z\u00FCrich>", first.get("node"));
  Assert.assertEquals("<http://www.w3.org/2002/07/owl#sameAs>", first.get("key"));
  Assert.assertEquals("sameAs", first.get("short_key"));
  Assert.assertEquals("<http://rdf.freebase.com/ns/m.08966>", first.get("value"));

  Object secondTriple = docTriples.get(1);
  Assert.assertTrue(secondTriple instanceof Map<?, ?>);
  @SuppressWarnings("unchecked")
  Map<String, String> second = (Map<String, String>) secondTriple;
  Assert.assertEquals("<http://dbpedia.org/resource/Z\u00FCrich>", second.get("node"));
  Assert.assertEquals("<http://dbpedia.org/property/yearHumidity>", second.get("key"));
  Assert.assertEquals("yearHumidity", second.get("short_key"));
  Assert.assertEquals("\"77\"", second.get("value"));
}
use of org.apache.nutch.metadata.Metadata in project nutch by apache.
the class CCIndexingFilter method filter.
/**
 * Copies Creative Commons information from the parse metadata into the
 * document being indexed.
 *
 * <p>Three metadata entries are consulted, in this order, and each one that
 * is present is passed on through {@code addFeature}: the license URL (as
 * {@code license=<url>}, followed by {@code addUrlFeatures} on the same
 * URL), the license location (as {@code meta=<location>}), and the work
 * type (passed as-is).
 *
 * @param doc the document to enrich
 * @param parse the parse whose metadata is read
 * @param url the page URL; used only in the log message
 * @param datum not used by this method
 * @param inlinks not used by this method
 * @return the same {@code doc} instance that was passed in
 * @throws IndexingException declared by the signature; not thrown directly
 *           in this body
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  final Metadata parseMeta = parse.getData().getParseMeta();

  // License URL: logged, indexed whole, then broken down by addUrlFeatures.
  final String license = parseMeta.get(CreativeCommons.LICENSE_URL);
  if (license != null) {
    if (LOG.isInfoEnabled()) {
      LOG.info("CC: indexing " + license + " for: " + url.toString());
    }
    addFeature(doc, "license=" + license);
    addUrlFeatures(doc, license);
  }

  // Where the license was found, indexed as meta=<location>.
  final String location = parseMeta.get(CreativeCommons.LICENSE_LOCATION);
  if (location != null) {
    addFeature(doc, "meta=" + location);
  }

  // Work type, indexed verbatim.
  // NOTE(review): no "type=" prefix is added here, unlike the other two
  // features -- confirm this is intended.
  final String kind = parseMeta.get(CreativeCommons.WORK_TYPE);
  if (kind != null) {
    addFeature(doc, kind);
  }

  return doc;
}
use of org.apache.nutch.metadata.Metadata in project nutch by apache.
the class TestCCParseFilter method pageTest.
/**
 * Parses the given file as {@code text/html} content and asserts that the
 * resulting parse metadata carries the expected Creative Commons values.
 *
 * @param file the HTML file to load and parse
 * @param url the URL assigned to the content (used as both URL and base)
 * @param license expected value of the {@code License-Url} metadata entry
 * @param location expected value of the {@code License-Location} entry
 * @param type expected value of the {@code Work-Type} entry
 * @throws Exception if the file cannot be read or parsing fails
 */
public void pageTest(File file, String url, String license, String location, String type) throws Exception {
  String contentType = "text/html";

  // Read the whole file into memory; try-with-resources guarantees the
  // stream is closed even when a read fails.
  ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
  try (InputStream in = new FileInputStream(file)) {
    byte[] buffer = new byte[1024];
    int i;
    while ((i = in.read(buffer)) != -1) {
      out.write(buffer, 0, i);
    }
  }
  byte[] bytes = out.toByteArray();

  Configuration conf = NutchConfiguration.create();
  Content content = new Content(url, url, bytes, contentType, new Metadata(), conf);
  Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
  Metadata metadata = parse.getData().getParseMeta();
  Assert.assertEquals(license, metadata.get("License-Url"));
  Assert.assertEquals(location, metadata.get("License-Location"));
  Assert.assertEquals(type, metadata.get("Work-Type"));
}
Aggregations