Search in sources :

Example 56 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project stanbol by apache.

the class TikaEngine method activate.

/**
 * Activates the engine.
 *
 * <p>After delegating to the superclass, this loads the default Tika
 * configuration, derives the detector and an {@link AutoDetectParser} from it,
 * and then reads the engine options from the component properties. Each
 * ontology mapping group is added to a fresh {@link OntologyMappings} instance
 * only when its corresponding boolean property (or that property's default)
 * evaluates to {@code true}.
 *
 * @param ctx the component context whose properties supply the configuration
 * @throws ConfigurationException declared by this method; presumably raised by
 *         {@code super.activate(ctx)} or while reading properties - confirm
 */
@Override
protected void activate(ComponentContext ctx) throws ConfigurationException {
    super.activate(ctx);

    // Tika set-up: detector and parser are both built from the same default configuration.
    config = TikaConfig.getDefaultConfig();
    this.detector = config.getDetector();
    this.parser = new AutoDetectParser(config);

    this.skipLinebreaks = getBoolean(ctx.getProperties(), SKIP_LINEBREAKS_WITHIN_CONTENT, DEFAULT_SKIP_LINEBREAKS);

    // Ontology mappings: every group is opt-in/opt-out through its own property.
    // Each flag is read immediately before it is used so the evaluation order
    // stays: read flag, register group, read next flag, ...
    this.ontologyMappings = new OntologyMappings();

    final boolean mapMediaResource = getBoolean(ctx.getProperties(), MAPPING_MEDIA_RESOURCE, DEFAULT_MAPPING_MEDIA_RESOURCE_STATE);
    if (mapMediaResource) {
        addMediaResourceOntologyMappings(ontologyMappings);
    }

    final boolean mapDublinCore = getBoolean(ctx.getProperties(), MAPPING_DUBLIN_CORE_TERMS, DEFAULT_MAPPING_DUBLIN_CORE_TERMS_STATE);
    if (mapDublinCore) {
        addDcMappings(ontologyMappings);
    }

    final boolean mapNepomukMessage = getBoolean(ctx.getProperties(), MAPPING_NEPOMUK_MESSAGE, DEFAULT_MAPPING_NEPOMUK_MESSAGE_STATE);
    if (mapNepomukMessage) {
        addNepomukMessageMappings(ontologyMappings);
    }

    final boolean mapNepomukExif = getBoolean(ctx.getProperties(), MAPPING_NEPOMUK_EXIF, DEFAULT_MAPPING_NEPOMUK_EXIF_STATE);
    if (mapNepomukExif) {
        addNepomukExifMappings(ontologyMappings);
    }

    final boolean mapSkos = getBoolean(ctx.getProperties(), MAPPING_SKOS, DEFAULT_MAPPING_SKOS_STATE);
    if (mapSkos) {
        addSkosMappings(ontologyMappings);
    }

    final boolean mapRdfs = getBoolean(ctx.getProperties(), MAPPING_RDFS, DEFAULT_MAPPING_RDFS_STATE);
    if (mapRdfs) {
        addRdfsMappings(ontologyMappings);
    }

    final boolean mapGeo = getBoolean(ctx.getProperties(), MAPPING_GEO, DEFAULT_MAPPING_GEO_STATE);
    if (mapGeo) {
        addGeoMappings(ontologyMappings);
    }

    includeUnmappedProperties = getBoolean(ctx.getProperties(), UNMAPPED_PROPERTIES, DEFAULT_UNMAPPED_PROPERTIES_STATE);
}
Also used : AutoDetectParser(org.apache.tika.parser.AutoDetectParser) OntologyMappings.addMediaResourceOntologyMappings(org.apache.stanbol.enhancer.engines.tika.metadata.OntologyMappings.addMediaResourceOntologyMappings) OntologyMappings(org.apache.stanbol.enhancer.engines.tika.metadata.OntologyMappings)

Example 57 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class TIAParsingExample method testHtmlMapper.

/**
 * Parses an empty in-memory document with an {@link AutoDetectParser} while an
 * {@link IdentityHtmlMapper} is registered in the parse context under the
 * {@link HtmlMapper} key.
 *
 * <p>The parsed events go to a {@link DefaultHandler}; the method returns
 * nothing.
 *
 * @throws Exception if parsing fails or the stream cannot be closed
 */
public static void testHtmlMapper() throws Exception {
    ContentHandler handler = new DefaultHandler();
    Metadata metadata = new Metadata();
    Parser parser = new AutoDetectParser();

    // Register the mapper before parsing so the parser can look it up from the context.
    ParseContext context = new ParseContext();
    context.set(HtmlMapper.class, new IdentityHtmlMapper());

    // Scope the stream to the parse call so it is closed even if parsing throws.
    try (InputStream stream = new ByteArrayInputStream(new byte[0])) {
        parser.parse(stream, handler, metadata, context);
    }
}
Also used : IdentityHtmlMapper(org.apache.tika.parser.html.IdentityHtmlMapper) ByteArrayInputStream(java.io.ByteArrayInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Parser(org.apache.tika.parser.Parser) XMLParser(org.apache.tika.parser.xml.XMLParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) TXTParser(org.apache.tika.parser.txt.TXTParser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser)

Example 58 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class DumpTikaConfigExampleTest method testDump.

/**
 * Serializes the default Tika configuration to {@code configFile} for every
 * combination of charset (UTF-8, UTF-16LE) and serializer mode, reloads the
 * written file as a {@link TikaConfig}, and checks that the reloaded
 * configuration exposes a composite parser and composite detector with the
 * expected minimum number of entries.
 *
 * @throws Exception if serialization, file I/O, or reloading the configuration fails
 */
@Test
public void testDump() throws Exception {
    // NOTE(review): 'ex' is not used anywhere in this method - confirm whether
    // the instantiation is still needed before removing it.
    DumpTikaConfigExample ex = new DumpTikaConfigExample();
    for (Charset charset : new Charset[] { UTF_8, UTF_16LE }) {
        for (TikaConfigSerializer.Mode mode : TikaConfigSerializer.Mode.values()) {
            // try-with-resources flushes and closes the writer (and the file
            // stream beneath it) even when serialization throws, and guarantees
            // the file is fully written before it is read back below.
            try (Writer writer = new OutputStreamWriter(new FileOutputStream(configFile), charset)) {
                TikaConfigSerializer.serialize(TikaConfig.getDefaultConfig(), mode, writer, charset);
            }
            TikaConfig c = new TikaConfig(configFile);
            assertTrue(c.getParser().toString(), c.getParser() instanceof CompositeParser);
            assertTrue(c.getDetector().toString(), c.getDetector() instanceof CompositeDetector);
            CompositeParser p = (CompositeParser) c.getParser();
            assertTrue("enough parsers?", p.getParsers().size() > 130);
            CompositeDetector d = (CompositeDetector) c.getDetector();
            assertTrue("enough detectors?", d.getDetectors().size() > 3);
            //just try to load it into autodetect to make sure no errors are thrown
            Parser auto = new AutoDetectParser(c);
            assertNotNull(auto);
        }
    }
}
Also used : CompositeDetector(org.apache.tika.detect.CompositeDetector) TikaConfig(org.apache.tika.config.TikaConfig) CompositeParser(org.apache.tika.parser.CompositeParser) FileOutputStream(java.io.FileOutputStream) Charset(java.nio.charset.Charset) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) OutputStreamWriter(java.io.OutputStreamWriter) TikaConfigSerializer(org.apache.tika.config.TikaConfigSerializer) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 59 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class TIAParsingExample method parseURLStream.

/**
 * Opens the resource at the given URL, decompresses it as gzip, and parses the
 * decompressed content with an {@link AutoDetectParser}.
 *
 * <p>The parsed events go to a {@link DefaultHandler}; the method returns
 * nothing.
 *
 * @param address the URL of a gzip-compressed resource
 * @throws Exception if the URL is malformed, the resource cannot be opened or
 *         is not valid gzip data, or parsing fails
 */
public static void parseURLStream(String address) throws Exception {
    Parser parser = new AutoDetectParser();
    ContentHandler handler = new DefaultHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    // The raw URL stream is declared as its own resource: the GZIPInputStream
    // constructor reads the gzip header and may throw, and in that case the
    // already-opened connection stream must still be closed.
    try (InputStream raw = new URL(address).openStream();
            InputStream stream = new GZIPInputStream(raw)) {
        parser.parse(stream, handler, metadata, context);
    }
}
Also used : GZIPInputStream(java.util.zip.GZIPInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) URL(java.net.URL) Parser(org.apache.tika.parser.Parser) XMLParser(org.apache.tika.parser.xml.XMLParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) TXTParser(org.apache.tika.parser.txt.TXTParser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Example 60 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class TIAParsingExample method useAutoDetectParser.

/**
 * Parses an empty in-memory document with an {@link AutoDetectParser} using a
 * fresh {@link Metadata} and {@link ParseContext}.
 *
 * <p>The parsed events go to a {@link DefaultHandler}; the method returns
 * nothing.
 *
 * @throws Exception if parsing fails or the stream cannot be closed
 */
public static void useAutoDetectParser() throws Exception {
    ContentHandler handler = new DefaultHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    Parser parser = new AutoDetectParser();
    // Scope the stream to the parse call so it is closed even if parsing throws.
    try (InputStream stream = new ByteArrayInputStream(new byte[0])) {
        parser.parse(stream, handler, metadata, context);
    }
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Parser(org.apache.tika.parser.Parser) XMLParser(org.apache.tika.parser.xml.XMLParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) TXTParser(org.apache.tika.parser.txt.TXTParser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser)

Aggregations

AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 167; Metadata (org.apache.tika.metadata.Metadata): 139; Test (org.junit.Test): 122; InputStream (java.io.InputStream): 117; Parser (org.apache.tika.parser.Parser): 112; ParseContext (org.apache.tika.parser.ParseContext): 104; BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 97; ContentHandler (org.xml.sax.ContentHandler): 91; TikaTest (org.apache.tika.TikaTest): 82; TikaInputStream (org.apache.tika.io.TikaInputStream): 63; ByteArrayInputStream (java.io.ByteArrayInputStream): 34; CompositeParser (org.apache.tika.parser.CompositeParser): 28; TikaConfig (org.apache.tika.config.TikaConfig): 18; ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 17; WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 17; TesseractOCRParser (org.apache.tika.parser.ocr.TesseractOCRParser): 15; TikaException (org.apache.tika.exception.TikaException): 13; EmptyParser (org.apache.tika.parser.EmptyParser): 13; XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 13; DefaultHandler (org.xml.sax.helpers.DefaultHandler): 12