Search in sources :

Example 56 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class FontParsersTest method testTTFParsing.

/**
 * Feeds the TrueType test font to {@link AutoDetectParser} without naming a
 * font parser explicitly, then verifies the metadata that was populated and
 * that no body text was emitted.
 */
@Test
public void testTTFParsing() throws Exception {
    // No font-specific parser is chosen here; detection is left to AutoDetectParser.
    final Parser autoDetecting = new AutoDetectParser();
    final ContentHandler bodyHandler = new BodyContentHandler();
    final Metadata fontMetadata = new Metadata();
    try (TikaInputStream fontStream = TikaInputStream.get(
            FontParsersTest.class.getResource("/test-documents/testTrueType3.ttf"))) {
        autoDetecting.parse(fontStream, bodyHandler, fontMetadata, new ParseContext());
    }

    // Type, title and dates
    assertEquals("application/x-font-ttf", fontMetadata.get(Metadata.CONTENT_TYPE));
    assertEquals("Open Sans Bold", fontMetadata.get(TikaCoreProperties.TITLE));
    assertEquals("2010-12-30T11:04:00Z", fontMetadata.get(Metadata.CREATION_DATE));
    assertEquals("2010-12-30T11:04:00Z", fontMetadata.get(TikaCoreProperties.CREATED));
    assertEquals("2011-05-05T12:37:53Z", fontMetadata.get(TikaCoreProperties.MODIFIED));

    // Font naming entries
    assertEquals("Open Sans Bold", fontMetadata.get(MET_FONT_NAME));
    assertEquals("Open Sans", fontMetadata.get(MET_FONT_FAMILY_NAME));
    assertEquals("Bold", fontMetadata.get(MET_FONT_SUB_FAMILY_NAME));
    assertEquals("OpenSans-Bold", fontMetadata.get(MET_PS_NAME));

    // Only the first nine characters of these two values are compared.
    final String copyrightPrefix = fontMetadata.get("Copyright").substring(0, 9);
    assertEquals("Digitized", copyrightPrefix);
    final String trademarkPrefix = fontMetadata.get("Trademark").substring(0, 9);
    assertEquals("Open Sans", trademarkPrefix);

    // These entries are expected to be absent
    assertEquals(null, fontMetadata.get(MET_FONT_FULL_NAME));
    assertEquals(null, fontMetadata.get(MET_FONT_WEIGHT));
    assertEquals(null, fontMetadata.get(MET_FONT_VERSION));

    // The handler is expected to have received no text at all
    assertEquals("", bodyHandler.toString());
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TikaInputStream(org.apache.tika.io.TikaInputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) AdobeFontMetricParser(org.apache.tika.parser.font.AdobeFontMetricParser) Test(org.junit.Test)

Example 57 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class HDFParserTest method testHDF4.

/**
 * Runs {@link HDFParser} over the bundled HDF4 sample and checks a few of the
 * metadata values it yields. The test returns without asserting anything when
 * the {@code java.version} system property starts with {@code "1.5"}.
 */
@Test
public void testHDF4() throws Exception {
    final String javaVersion = System.getProperty("java.version");
    if (javaVersion.startsWith("1.5")) {
        return;
    }

    final Parser hdfParser = new HDFParser();
    final ContentHandler bodyHandler = new BodyContentHandler();
    final Metadata hdfMetadata = new Metadata();

    // The sample is a publicly available HDF4 file from the HDF4 examples:
    // http://www.hdfgroup.org/training/hdf4_chunking/Chunkit/bin/input54kmdata.hdf
    try (InputStream hdfStream = HDFParser.class.getResourceAsStream("/test-documents/test.hdf")) {
        final ParseContext parseContext = new ParseContext();
        hdfParser.parse(hdfStream, bodyHandler, hdfMetadata, parseContext);
    }

    assertNotNull(hdfMetadata);
    assertEquals("Direct read of HDF4 file through CDM library", hdfMetadata.get("_History"));
    assertEquals("Ascending", hdfMetadata.get("Pass"));
    assertEquals("Hierarchical Data Format, version 4", hdfMetadata.get("File-Type-Description"));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) HDFParser(org.apache.tika.parser.hdf.HDFParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) HDFParser(org.apache.tika.parser.hdf.HDFParser) Test(org.junit.Test)

Example 58 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class GribParserTest method testParseGlobalMetadata.

/**
 * Parses the GRIB2 sample with {@link GribParser} and checks that the text
 * captured by the body handler contains the {@code dimensions:} and
 * {@code variables:} markers.
 */
@Test
public void testParseGlobalMetadata() throws Exception {
    final Parser gribParser = new GribParser();
    final Metadata gribMetadata = new Metadata();
    final ContentHandler bodyHandler = new BodyContentHandler();

    try (InputStream gribStream = GribParser.class.getResourceAsStream(
            "/test-documents/gdas1.forecmwf.2014062612.grib2")) {
        final ParseContext parseContext = new ParseContext();
        gribParser.parse(gribStream, bodyHandler, gribMetadata, parseContext);
    }

    assertNotNull(gribMetadata);

    final String extractedText = bodyHandler.toString();
    assertTrue(extractedText.contains("dimensions:"));
    assertTrue(extractedText.contains("variables:"));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) Test(org.junit.Test)

Example 59 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class DWGParserTest method testParser.

/**
 * Parses the given DWG stream with {@link DWGParser} and verifies the expected
 * metadata values and extracted text. The stream is always closed before this
 * method returns.
 *
 * @param input DWG document to parse; closed by this method
 * @throws Exception if parsing fails or the stream cannot be closed
 */
@SuppressWarnings("deprecation")
private void testParser(InputStream input) throws Exception {
    // try-with-resources closes the stream just as the previous finally block did,
    // but an exception thrown by close() is now attached as suppressed instead of
    // replacing the assertion or parse failure that actually broke the test.
    try (InputStream stream = input) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        // Three-argument parse overload; the deprecation warning it triggers is
        // suppressed on this method.
        new DWGParser().parse(stream, handler, metadata);

        assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(TikaCoreProperties.DESCRIPTION));
        assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
        assertEquals("Nevin Nollop", metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Pangram, fox, dog", metadata.get(TikaCoreProperties.KEYWORDS));
        // Only the first eleven characters of the comments are compared.
        assertEquals("Lorem ipsum", metadata.get(TikaCoreProperties.COMMENTS).substring(0, 11));
        assertEquals("http://www.alfresco.com", metadata.get(TikaCoreProperties.RELATION));

        // Check some of the old style metadata too
        assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(Metadata.TITLE));
        assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));

        // The same strings must also appear in the extracted body text.
        String content = handler.toString();
        assertContains("The quick brown fox jumps over the lazy dog", content);
        assertContains("Gym class", content);
        assertContains("www.alfresco.com", content);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler)

Example 60 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class DWGParserTest method testDWG2010CustomPropertiesParser.

/**
 * Exercises {@link DWGParser} on the DWG 2010 sample that carries custom
 * properties: first through the shared {@code testParser} checks, then again
 * to verify the two custom property values.
 */
@Test
public void testDWG2010CustomPropertiesParser() throws Exception {
    final String resourcePath = "/test-documents/testDWG2010_custom_props.dwg";

    // Standard parsing must work for this document
    InputStream standardInput = DWGParserTest.class.getResourceAsStream(resourcePath);
    testParser(standardInput);

    // Custom properties with alternate padding must be picked up as well
    try (InputStream customPropsInput = DWGParserTest.class.getResourceAsStream(resourcePath)) {
        final Metadata customMetadata = new Metadata();
        final ContentHandler bodyHandler = new BodyContentHandler();
        final DWGParser dwgParser = new DWGParser();
        dwgParser.parse(customPropsInput, bodyHandler, customMetadata, null);

        assertEquals("valueforcustomprop1", customMetadata.get("customprop1"));
        assertEquals("valueforcustomprop2", customMetadata.get("customprop2"));
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test)

Aggregations

ContentHandler (org.xml.sax.ContentHandler)354 Metadata (org.apache.tika.metadata.Metadata)229 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)229 InputStream (java.io.InputStream)210 Test (org.junit.Test)208 ParseContext (org.apache.tika.parser.ParseContext)164 Parser (org.apache.tika.parser.Parser)106 TikaTest (org.apache.tika.TikaTest)103 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)102 TikaInputStream (org.apache.tika.io.TikaInputStream)75 ByteArrayInputStream (java.io.ByteArrayInputStream)64 SAXException (org.xml.sax.SAXException)40 IOException (java.io.IOException)34 TeeContentHandler (org.apache.tika.sax.TeeContentHandler)28 TikaException (org.apache.tika.exception.TikaException)24 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)24 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)24 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)21 AttributesImpl (org.xml.sax.helpers.AttributesImpl)21 InputSource (org.xml.sax.InputSource)20