Use of org.apache.tika.parser.AutoDetectParser in project stanbol by apache.
Class TikaEngine, method activate.
/**
 * Activates this engine. After delegating to the superclass it obtains the
 * default Tika configuration, derives the detector and an
 * {@link AutoDetectParser} from it, reads the boolean options from the
 * component properties and registers every ontology mapping group whose
 * option evaluates to {@code true}.
 *
 * @param ctx the component context whose properties supply the options
 * @throws ConfigurationException declared by the signature; this body does
 *         not throw it directly
 */
@Override
protected void activate(ComponentContext ctx) throws ConfigurationException {
    super.activate(ctx);

    // Tika set-up: detector and parser are both built from the same config.
    this.config = TikaConfig.getDefaultConfig();
    this.detector = this.config.getDetector();
    this.parser = new AutoDetectParser(this.config);

    this.skipLinebreaks = getBoolean(
            ctx.getProperties(), SKIP_LINEBREAKS_WITHIN_CONTENT, DEFAULT_SKIP_LINEBREAKS);

    // Start from empty mappings and add one group per enabled option.
    // The registration order below is the same as in the original code.
    this.ontologyMappings = new OntologyMappings();
    if (getBoolean(ctx.getProperties(), MAPPING_MEDIA_RESOURCE,
            DEFAULT_MAPPING_MEDIA_RESOURCE_STATE)) {
        addMediaResourceOntologyMappings(this.ontologyMappings);
    }
    if (getBoolean(ctx.getProperties(), MAPPING_DUBLIN_CORE_TERMS,
            DEFAULT_MAPPING_DUBLIN_CORE_TERMS_STATE)) {
        addDcMappings(this.ontologyMappings);
    }
    if (getBoolean(ctx.getProperties(), MAPPING_NEPOMUK_MESSAGE,
            DEFAULT_MAPPING_NEPOMUK_MESSAGE_STATE)) {
        addNepomukMessageMappings(this.ontologyMappings);
    }
    if (getBoolean(ctx.getProperties(), MAPPING_NEPOMUK_EXIF,
            DEFAULT_MAPPING_NEPOMUK_EXIF_STATE)) {
        addNepomukExifMappings(this.ontologyMappings);
    }
    if (getBoolean(ctx.getProperties(), MAPPING_SKOS, DEFAULT_MAPPING_SKOS_STATE)) {
        addSkosMappings(this.ontologyMappings);
    }
    if (getBoolean(ctx.getProperties(), MAPPING_RDFS, DEFAULT_MAPPING_RDFS_STATE)) {
        addRdfsMappings(this.ontologyMappings);
    }
    if (getBoolean(ctx.getProperties(), MAPPING_GEO, DEFAULT_MAPPING_GEO_STATE)) {
        addGeoMappings(this.ontologyMappings);
    }

    this.includeUnmappedProperties = getBoolean(
            ctx.getProperties(), UNMAPPED_PROPERTIES, DEFAULT_UNMAPPED_PROPERTIES_STATE);
}
Use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
Class TIAParsingExample, method testHtmlMapper.
/**
 * Parses an empty in-memory stream with an {@link AutoDetectParser} while an
 * {@link IdentityHtmlMapper} is registered in the parse context under
 * {@code HtmlMapper.class}.
 *
 * @throws Exception whatever the parser propagates
 */
public static void testHtmlMapper() throws Exception {
    // Register the mapper before the parse call reads the context.
    ParseContext parseContext = new ParseContext();
    parseContext.set(HtmlMapper.class, new IdentityHtmlMapper());

    Parser autoDetect = new AutoDetectParser();
    autoDetect.parse(
            new ByteArrayInputStream(new byte[0]),
            new DefaultHandler(),
            new Metadata(),
            parseContext);
}
Use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
Class DumpTikaConfigExampleTest, method testDump.
/**
 * For every combination of the two charsets and all serializer modes, writes
 * the default Tika configuration to {@code configFile}, reloads it and checks
 * that the reloaded parser and detector are composites containing more than
 * 130 parsers and more than 3 detectors, and that an
 * {@link AutoDetectParser} can be built from the reloaded configuration.
 *
 * @throws Exception if writing, reloading or any assertion fails
 */
@Test
public void testDump() throws Exception {
    // NOTE(review): ex is never used below; the instantiation is kept so the
    // constructor call made by the original test is preserved.
    DumpTikaConfigExample ex = new DumpTikaConfigExample();
    for (Charset charset : new Charset[] { UTF_8, UTF_16LE }) {
        for (TikaConfigSerializer.Mode mode : TikaConfigSerializer.Mode.values()) {
            // try-with-resources closes (and thereby flushes) the writer even
            // when serialize throws, so the file handle is never leaked.
            try (Writer writer = new OutputStreamWriter(new FileOutputStream(configFile), charset)) {
                TikaConfigSerializer.serialize(TikaConfig.getDefaultConfig(), mode, writer, charset);
            }

            // The file is read back only after the writer has been closed.
            TikaConfig c = new TikaConfig(configFile);
            assertTrue(c.getParser().toString(), c.getParser() instanceof CompositeParser);
            assertTrue(c.getDetector().toString(), c.getDetector() instanceof CompositeDetector);
            CompositeParser p = (CompositeParser) c.getParser();
            assertTrue("enough parsers?", p.getParsers().size() > 130);
            CompositeDetector d = (CompositeDetector) c.getDetector();
            assertTrue("enough detectors?", d.getDetectors().size() > 3);
            //just try to load it into autodetect to make sure no errors are thrown
            Parser auto = new AutoDetectParser(c);
            assertNotNull(auto);
        }
    }
}
Use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
Class TIAParsingExample, method parseURLStream.
/**
 * Opens the given URL, wraps its content in a {@link GZIPInputStream} and
 * parses the decompressed data with an {@link AutoDetectParser}.
 *
 * @param address the URL to open, as a string
 * @throws Exception if the URL cannot be opened, the content is not valid
 *         GZIP data, or the parser fails
 */
public static void parseURLStream(String address) throws Exception {
    Parser parser = new AutoDetectParser();
    ContentHandler handler = new DefaultHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    // The raw URL stream is its own resource: if the GZIPInputStream
    // constructor throws while reading the header, the connection stream
    // is still closed instead of being leaked.
    try (InputStream raw = new URL(address).openStream();
            InputStream stream = new GZIPInputStream(raw)) {
        parser.parse(stream, handler, metadata, context);
    }
}
Use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
Class TIAParsingExample, method useAutoDetectParser.
/**
 * Minimal {@link AutoDetectParser} usage: parses an empty in-memory stream
 * with a fresh handler, metadata object and parse context.
 *
 * @throws Exception whatever the parser propagates
 */
public static void useAutoDetectParser() throws Exception {
    // Arguments are created inline in the same order the original declared them.
    new AutoDetectParser().parse(
            new ByteArrayInputStream(new byte[0]),
            new DefaultHandler(),
            new Metadata(),
            new ParseContext());
}
Aggregations