Search in sources:

Example 16 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class GribParserTest method testParseGlobalMetadata.

/**
 * Parses the sample GRIB2 file with {@link GribParser} and checks that the
 * extracted body text contains the strings "dimensions:" and "variables:".
 *
 * @throws Exception if the test resource cannot be read or parsing fails
 */
@Test
public void testParseGlobalMetadata() throws Exception {
    Parser parser = new GribParser();
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    try (InputStream stream = GribParser.class.getResourceAsStream("/test-documents/gdas1.forecmwf.2014062612.grib2")) {
        // getResourceAsStream returns null (it does not throw) when the resource
        // is absent; fail here with a clear message rather than inside the parser.
        assertNotNull("Test resource /test-documents/gdas1.forecmwf.2014062612.grib2 not found", stream);
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    assertNotNull(metadata);
    String content = handler.toString();
    assertTrue(content.contains("dimensions:"));
    assertTrue(content.contains("variables:"));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) Test(org.junit.Test)

Example 17 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class DWGParserTest method testParser.

/**
 * Parses the given stream with {@link DWGParser} and verifies the content
 * type, the expected metadata values and selected phrases in the extracted
 * body text. The stream is closed before this method returns, whether or
 * not the checks succeed.
 *
 * @param input the DWG document to parse; closed by this method
 * @throws Exception if parsing or closing the stream fails
 */
@SuppressWarnings("deprecation")
private void testParser(InputStream input) throws Exception {
    // try-with-resources instead of try/finally: a failure while closing is
    // recorded as a suppressed exception rather than replacing the original
    // parse or assertion failure.
    try (InputStream in = input) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        new DWGParser().parse(in, handler, metadata);
        assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(TikaCoreProperties.DESCRIPTION));
        assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
        assertEquals("Nevin Nollop", metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Pangram, fox, dog", metadata.get(TikaCoreProperties.KEYWORDS));
        // Only the first 11 characters of the comments value are compared
        assertEquals("Lorem ipsum", metadata.get(TikaCoreProperties.COMMENTS).substring(0, 11));
        assertEquals("http://www.alfresco.com", metadata.get(TikaCoreProperties.RELATION));
        // Check some of the old style metadata too
        assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(Metadata.TITLE));
        assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
        String content = handler.toString();
        assertContains("The quick brown fox jumps over the lazy dog", content);
        assertContains("Gym class", content);
        assertContains("www.alfresco.com", content);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler)

Example 18 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class DWGParserTest method testDWG2010CustomPropertiesParser.

/**
 * Runs the shared DWG checks against the DWG 2010 custom-properties sample,
 * then parses the same resource again and verifies two custom property values.
 *
 * @throws Exception if the resource cannot be read or parsing fails
 */
@Test
public void testDWG2010CustomPropertiesParser() throws Exception {
    final String resource = "/test-documents/testDWG2010_custom_props.dwg";

    // Check that standard parsing works (testParser closes the stream it receives)
    InputStream standardInput = DWGParserTest.class.getResourceAsStream(resource);
    testParser(standardInput);

    // Check that custom properties with alternate padding work
    try (InputStream customInput = DWGParserTest.class.getResourceAsStream(resource)) {
        ContentHandler sink = new BodyContentHandler();
        Metadata props = new Metadata();
        new DWGParser().parse(customInput, sink, props, null);
        assertEquals("valueforcustomprop1", props.get("customprop1"));
        assertEquals("valueforcustomprop2", props.get("customprop2"));
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test)

Example 19 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class HtmlParserTest method testDetectOfCharset.

/**
 * Test case for TIKA-334: feeds UTF-8 encoded HTML whose title is a single
 * non-ASCII character, with no charset declared in the markup, and expects
 * that character back unchanged as the title metadata.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-334">TIKA-334</a>
 */
@Test
public void testDetectOfCharset() throws Exception {
    String html = "<html><head><title>Ž</title></head><body></body></html>";
    byte[] utf8Bytes = html.getBytes(UTF_8);
    ByteArrayInputStream source = new ByteArrayInputStream(utf8Bytes);

    Metadata parsed = new Metadata();
    new HtmlParser().parse(source, new BodyContentHandler(), parsed, new ParseContext());

    String title = parsed.get(TikaCoreProperties.TITLE);
    assertEquals("Ž", title);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 20 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class HtmlParserTest method testParseEmpty.

/**
 * Parses a zero-length input with {@link HtmlParser} and expects the body
 * handler to have collected an empty string.
 *
 * @throws Exception if parsing fails
 */
@Test
public void testParseEmpty() throws Exception {
    byte[] noBytes = new byte[0];
    ContentHandler body = new BodyContentHandler();
    HtmlParser htmlParser = new HtmlParser();
    htmlParser.parse(new ByteArrayInputStream(noBytes), body, new Metadata(), new ParseContext());
    String extracted = body.toString();
    assertEquals("", extracted);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

BodyContentHandler (org.apache.tika.sax.BodyContentHandler)251 Metadata (org.apache.tika.metadata.Metadata)242 Test (org.junit.Test)213 ContentHandler (org.xml.sax.ContentHandler)202 InputStream (java.io.InputStream)189 ParseContext (org.apache.tika.parser.ParseContext)170 TikaTest (org.apache.tika.TikaTest)117 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)87 Parser (org.apache.tika.parser.Parser)81 ByteArrayInputStream (java.io.ByteArrayInputStream)65 TikaInputStream (org.apache.tika.io.TikaInputStream)65 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)24 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)24 TikaException (org.apache.tika.exception.TikaException)23 IOException (java.io.IOException)17 OfficeParser (org.apache.tika.parser.microsoft.OfficeParser)15 EmptyParser (org.apache.tika.parser.EmptyParser)14 SAXException (org.xml.sax.SAXException)13 MediaType (org.apache.tika.mime.MediaType)10 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)10