Search in sources :

Example 11 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class FontParsersTest method testTTFParsing.

/**
 * Parses the sample TrueType font through {@link AutoDetectParser} and checks
 * the metadata values reported for it, as well as the (empty) text content.
 */
@Test
public void testTTFParsing() throws Exception {
    final Metadata fontMetadata = new Metadata();
    final ParseContext parseContext = new ParseContext();
    final ContentHandler bodyHandler = new BodyContentHandler();
    // No font parser is named explicitly; detection is expected to select it.
    final Parser autoDetect = new AutoDetectParser();

    try (TikaInputStream fontStream = TikaInputStream.get(
            FontParsersTest.class.getResource("/test-documents/testTrueType3.ttf"))) {
        autoDetect.parse(fontStream, bodyHandler, fontMetadata, parseContext);
    }

    // Detected type, title and dates
    assertEquals("application/x-font-ttf", fontMetadata.get(Metadata.CONTENT_TYPE));
    assertEquals("Open Sans Bold", fontMetadata.get(TikaCoreProperties.TITLE));
    assertEquals("2010-12-30T11:04:00Z", fontMetadata.get(Metadata.CREATION_DATE));
    assertEquals("2010-12-30T11:04:00Z", fontMetadata.get(TikaCoreProperties.CREATED));
    assertEquals("2011-05-05T12:37:53Z", fontMetadata.get(TikaCoreProperties.MODIFIED));

    // Font naming properties
    assertEquals("Open Sans Bold", fontMetadata.get(MET_FONT_NAME));
    assertEquals("Open Sans", fontMetadata.get(MET_FONT_FAMILY_NAME));
    assertEquals("Bold", fontMetadata.get(MET_FONT_SUB_FAMILY_NAME));
    assertEquals("OpenSans-Bold", fontMetadata.get(MET_PS_NAME));

    // Only the first nine characters of these two values are compared
    assertEquals("Digitized", fontMetadata.get("Copyright").substring(0, 9));
    assertEquals("Open Sans", fontMetadata.get("Trademark").substring(0, 9));

    // These properties are expected to be absent for this font
    assertEquals(null, fontMetadata.get(MET_FONT_FULL_NAME));
    assertEquals(null, fontMetadata.get(MET_FONT_WEIGHT));
    assertEquals(null, fontMetadata.get(MET_FONT_VERSION));

    // No text content is expected from the font at present
    String extractedText = bodyHandler.toString();
    assertEquals("", extractedText);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TikaInputStream(org.apache.tika.io.TikaInputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) AdobeFontMetricParser(org.apache.tika.parser.font.AdobeFontMetricParser) Test(org.junit.Test)

Example 12 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class DBFParserTest method testVariants.

/**
 * Loads the sample DBF file into memory, then for every
 * {@code DBFReader.Version} overwrites the first byte with that version's id
 * and checks that the reported content type equals the version's full mime
 * string.
 */
@Test
public void testVariants() throws Exception {
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    try (InputStream dbfResource = getResourceAsStream("/test-documents/testDBF.dbf")) {
        IOUtils.copy(dbfResource, buffer);
    }
    byte[] dbfBytes = buffer.toByteArray();

    for (DBFReader.Version candidate : DBFReader.Version.values()) {
        // The id is narrowed to a byte; per the original note this only works
        // because of the range of possible id values.
        dbfBytes[0] = (byte) candidate.getId();

        TikaInputStream patchedInput = TikaInputStream.get(dbfBytes);
        XMLResult parsed = getXML(patchedInput, new AutoDetectParser(), new Metadata());

        String reportedType = parsed.metadata.get(Metadata.CONTENT_TYPE);
        assertEquals(candidate.getFullMimeString(), reportedType);
    }
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ByteArrayOutputStream(java.io.ByteArrayOutputStream) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 13 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class TikaResource method createParser.

/**
 * Creates the parser used by this resource.
 *
 * <p>An {@link AutoDetectParser} is built from {@code tikaConfig}, its parser
 * map gets an {@link HtmlParser} registered for
 * {@code MediaType.APPLICATION_XML}, and its fallback is replaced by a parser
 * that always throws a {@link WebApplicationException} with status
 * {@code UNSUPPORTED_MEDIA_TYPE}.
 *
 * @return the configured parser, wrapped in a {@link DigestingParser} when
 *         {@code digester} is non-null
 */
@SuppressWarnings("serial")
public static Parser createParser() {
    final AutoDetectParser autoDetect = new AutoDetectParser(tikaConfig);

    // Route application/xml through the HTML parser.
    Map<MediaType, Parser> parsersByType = autoDetect.getParsers();
    parsersByType.put(MediaType.APPLICATION_XML, new HtmlParser());
    autoDetect.setParsers(parsersByType);

    autoDetect.setFallback(new Parser() {

        @Override
        public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
            // Advertise the same types as the enclosing auto-detect parser.
            return autoDetect.getSupportedTypes(parseContext);
        }

        @Override
        public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) {
            throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE);
        }
    });

    if (digester == null) {
        return autoDetect;
    }
    return new DigestingParser(autoDetect, digester);
}
Also used : HtmlParser(org.apache.tika.parser.html.HtmlParser) Set(java.util.Set) WebApplicationException(javax.ws.rs.WebApplicationException) InputStream(java.io.InputStream) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MediaType(org.apache.tika.mime.MediaType) DigestingParser(org.apache.tika.parser.DigestingParser) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) ExpandedTitleContentHandler(org.apache.tika.sax.ExpandedTitleContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) RichTextContentHandler(org.apache.tika.sax.RichTextContentHandler) Parser(org.apache.tika.parser.Parser) HtmlParser(org.apache.tika.parser.html.HtmlParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DigestingParser(org.apache.tika.parser.DigestingParser)

Example 14 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project stanbol by apache.

the class TikaEngine method activate.

/**
 * Activates the engine: after delegating to the superclass, sets up the
 * default Tika configuration, its detector and an {@link AutoDetectParser},
 * then reads the boolean component properties that control line-break
 * skipping, which ontology mappings are registered, and whether unmapped
 * properties are included.
 *
 * @param ctx the component context whose properties are read
 * @throws ConfigurationException declared by the overridden method
 */
@Override
protected void activate(ComponentContext ctx) throws ConfigurationException {
    super.activate(ctx);

    // Tika set-up, all derived from the default configuration
    config = TikaConfig.getDefaultConfig();
    this.detector = config.getDetector();
    this.parser = new AutoDetectParser(config);

    this.skipLinebreaks = getBoolean(ctx.getProperties(), SKIP_LINEBREAKS_WITHIN_CONTENT, DEFAULT_SKIP_LINEBREAKS);

    // Each mapping group below is added only when its property evaluates to true
    final OntologyMappings mappings = new OntologyMappings();
    this.ontologyMappings = mappings;
    if (getBoolean(ctx.getProperties(), MAPPING_MEDIA_RESOURCE, DEFAULT_MAPPING_MEDIA_RESOURCE_STATE)) {
        addMediaResourceOntologyMappings(mappings);
    }
    if (getBoolean(ctx.getProperties(), MAPPING_DUBLIN_CORE_TERMS, DEFAULT_MAPPING_DUBLIN_CORE_TERMS_STATE)) {
        addDcMappings(mappings);
    }
    if (getBoolean(ctx.getProperties(), MAPPING_NEPOMUK_MESSAGE, DEFAULT_MAPPING_NEPOMUK_MESSAGE_STATE)) {
        addNepomukMessageMappings(mappings);
    }
    if (getBoolean(ctx.getProperties(), MAPPING_NEPOMUK_EXIF, DEFAULT_MAPPING_NEPOMUK_EXIF_STATE)) {
        addNepomukExifMappings(mappings);
    }
    if (getBoolean(ctx.getProperties(), MAPPING_SKOS, DEFAULT_MAPPING_SKOS_STATE)) {
        addSkosMappings(mappings);
    }
    if (getBoolean(ctx.getProperties(), MAPPING_RDFS, DEFAULT_MAPPING_RDFS_STATE)) {
        addRdfsMappings(mappings);
    }
    if (getBoolean(ctx.getProperties(), MAPPING_GEO, DEFAULT_MAPPING_GEO_STATE)) {
        addGeoMappings(mappings);
    }

    includeUnmappedProperties = getBoolean(ctx.getProperties(), UNMAPPED_PROPERTIES, DEFAULT_UNMAPPED_PROPERTIES_STATE);
}
Also used : AutoDetectParser(org.apache.tika.parser.AutoDetectParser) OntologyMappings.addMediaResourceOntologyMappings(org.apache.stanbol.enhancer.engines.tika.metadata.OntologyMappings.addMediaResourceOntologyMappings) OntologyMappings(org.apache.stanbol.enhancer.engines.tika.metadata.OntologyMappings)

Example 15 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class TIAParsingExample method testHtmlMapper.

/**
 * Shows how to supply a custom {@link HtmlMapper}: an
 * {@link IdentityHtmlMapper} is registered in the {@link ParseContext} and an
 * empty in-memory stream is then parsed with an {@link AutoDetectParser}.
 *
 * @throws Exception if parsing fails
 */
public static void testHtmlMapper() throws Exception {
    byte[] noBytes = new byte[0];
    InputStream emptyInput = new ByteArrayInputStream(noBytes);
    ContentHandler sink = new DefaultHandler();
    Metadata docMetadata = new Metadata();
    Parser autoDetect = new AutoDetectParser();

    // The mapper is handed to the parser through the parse context.
    ParseContext mapperContext = new ParseContext();
    mapperContext.set(HtmlMapper.class, new IdentityHtmlMapper());

    autoDetect.parse(emptyInput, sink, docMetadata, mapperContext);
}
Also used : IdentityHtmlMapper(org.apache.tika.parser.html.IdentityHtmlMapper) ByteArrayInputStream(java.io.ByteArrayInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Parser(org.apache.tika.parser.Parser) XMLParser(org.apache.tika.parser.xml.XMLParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) TXTParser(org.apache.tika.parser.txt.TXTParser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser)

Aggregations

AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 164
Metadata (org.apache.tika.metadata.Metadata): 136
Test (org.junit.Test): 122
InputStream (java.io.InputStream): 117
Parser (org.apache.tika.parser.Parser): 111
ParseContext (org.apache.tika.parser.ParseContext): 103
BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 96
ContentHandler (org.xml.sax.ContentHandler): 91
TikaTest (org.apache.tika.TikaTest): 82
TikaInputStream (org.apache.tika.io.TikaInputStream): 63
ByteArrayInputStream (java.io.ByteArrayInputStream): 34
CompositeParser (org.apache.tika.parser.CompositeParser): 28
TikaConfig (org.apache.tika.config.TikaConfig): 18
ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 17
WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 17
TesseractOCRParser (org.apache.tika.parser.ocr.TesseractOCRParser): 15
EmptyParser (org.apache.tika.parser.EmptyParser): 13
XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 13
DefaultHandler (org.xml.sax.helpers.DefaultHandler): 12
TikaException (org.apache.tika.exception.TikaException): 11