Search in sources :

Example 31 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class TXTParserTest method testUseIncomingCharsetAsHint.

/**
 * Regression test for TIKA-335: a charset already present in the incoming
 * metadata's content type is reflected in the content type reported after parsing.
 * <p>
 * The same bytes are parsed twice with the same {@link Metadata} instance:
 * first without any charset set by the test, then after the content type has
 * been set to declare ISO-8859-15.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
 */
@Test
public void testUseIncomingCharsetAsHint() throws Exception {
    // The sample contains U+00E1 (latin small letter a with acute); once encoded,
    // the bytes could be ISO 8859-1 or ISO 8859-15 or ...
    final String sample = "the name is ándre";
    final byte[] encoded = sample.getBytes(ISO_8859_1);
    final Metadata metadata = new Metadata();

    // First pass: the test sets no charset on the metadata beforehand.
    parser.parse(new ByteArrayInputStream(encoded), new BodyContentHandler(), metadata, new ParseContext());
    assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
    // CONTENT_ENCODING is deprecated but is still expected to be populated.
    assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));

    // Second pass: same bytes, but the metadata now declares ISO-8859-15 up front.
    metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=ISO-8859-15");
    parser.parse(new ByteArrayInputStream(encoded), new BodyContentHandler(), metadata, new ParseContext());
    assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
    // CONTENT_ENCODING is deprecated but is still expected to be populated.
    assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 32 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class TXTParserTest method assertExtractText.

/**
 * Parses {@code input} with this test's {@code parser} and asserts that the
 * text collected by the body handler equals {@code expected}.
 *
 * @param msg      assertion message reported on mismatch
 * @param expected the text the handler is expected to contain after parsing
 * @param input    raw document bytes handed to the parser
 * @throws Exception if parsing fails
 */
private void assertExtractText(String msg, String expected, byte[] input) throws Exception {
    ContentHandler handler = new BodyContentHandler() {

        // @Override makes the compiler verify this really replaces the inherited
        // callback; without it a signature drift would silently stop suppressing
        // the whitespace.
        @Override
        public void ignorableWhitespace(char[] ch, int off, int len) {
            // Ignore the whitespace added by XHTMLContentHandler
        }
    };
    Metadata metadata = new Metadata();
    parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
    assertEquals(msg, expected, handler.toString());
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) WriteOutContentHandler(org.apache.tika.sax.WriteOutContentHandler) ContentHandler(org.xml.sax.ContentHandler)

Example 33 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class SolidworksParserTest method testPart2014SP0Parser.

/**
 * Parses the SolidWorks 2014 SP0 part sample with {@link OfficeParser} and
 * checks the reported content type and metadata properties.
 */
@Test
public void testPart2014SP0Parser() throws Exception {
    final String resource = "/test-documents/testsolidworksPart2014SP0.SLDPRT";
    try (InputStream stream = SolidworksParserTest.class.getResourceAsStream(resource)) {
        final ContentHandler body = new BodyContentHandler();
        final Metadata meta = new Metadata();
        final OfficeParser officeParser = new OfficeParser();
        officeParser.parse(stream, body, meta, new ParseContext());

        // Content type
        assertEquals("application/sldworks", meta.get(Metadata.CONTENT_TYPE));

        // Properties: dates and modifier are populated, title and keywords are
        // empty strings, and the remaining properties are absent (null).
        assertEquals("2012-04-18T10:27:29Z", meta.get(TikaCoreProperties.CREATED));
        assertEquals(null, meta.get(TikaCoreProperties.CONTRIBUTOR));
        assertEquals("2013-11-28T12:38:28Z", meta.get(Metadata.MODIFIED));
        assertEquals("solidworks-dcom_dev", meta.get(TikaCoreProperties.MODIFIER));
        assertEquals(null, meta.get(TikaCoreProperties.RELATION));
        assertEquals(null, meta.get(TikaCoreProperties.RIGHTS));
        assertEquals(null, meta.get(TikaCoreProperties.SOURCE));
        assertEquals("", meta.get(TikaCoreProperties.TITLE));
        assertEquals("", meta.get(TikaCoreProperties.KEYWORDS));
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 34 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class SolidworksParserTest method testPart2013SP2Parser.

/**
 * Parses the SolidWorks 2013 SP2 part sample with {@link OfficeParser} and
 * checks the reported content type and metadata properties.
 */
@Test
public void testPart2013SP2Parser() throws Exception {
    final String resource = "/test-documents/testsolidworksPart2013SP2.SLDPRT";
    try (InputStream stream = SolidworksParserTest.class.getResourceAsStream(resource)) {
        final ContentHandler body = new BodyContentHandler();
        final Metadata meta = new Metadata();
        final OfficeParser officeParser = new OfficeParser();
        officeParser.parse(stream, body, meta, new ParseContext());

        // Content type
        assertEquals("application/sldworks", meta.get(Metadata.CONTENT_TYPE));

        // Properties: dates and modifier are populated, title and keywords are
        // empty strings, and the remaining properties are absent (null).
        assertEquals("2012-04-18T10:27:29Z", meta.get(TikaCoreProperties.CREATED));
        assertEquals(null, meta.get(TikaCoreProperties.CONTRIBUTOR));
        assertEquals("2013-09-06T08:12:12Z", meta.get(Metadata.MODIFIED));
        assertEquals("solidworks-dcom_dev", meta.get(TikaCoreProperties.MODIFIER));
        assertEquals(null, meta.get(TikaCoreProperties.RELATION));
        assertEquals(null, meta.get(TikaCoreProperties.RIGHTS));
        assertEquals(null, meta.get(TikaCoreProperties.SOURCE));
        assertEquals("", meta.get(TikaCoreProperties.TITLE));
        assertEquals("", meta.get(TikaCoreProperties.KEYWORDS));
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 35 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class SolidworksParserTest method testDrawing2013SP2Parser.

/**
 * Parses the SolidWorks 2013 SP2 drawing sample with {@link OfficeParser} and
 * checks the reported content type and metadata properties.
 */
@Test
public void testDrawing2013SP2Parser() throws Exception {
    final String resource = "/test-documents/testsolidworksDrawing2013SP2.SLDDRW";
    try (InputStream stream = SolidworksParserTest.class.getResourceAsStream(resource)) {
        final ContentHandler body = new BodyContentHandler();
        final Metadata meta = new Metadata();
        final OfficeParser officeParser = new OfficeParser();
        officeParser.parse(stream, body, meta, new ParseContext());

        // Content type
        assertEquals("application/sldworks", meta.get(Metadata.CONTENT_TYPE));

        // Properties: dates and modifier are populated, title and keywords are
        // empty strings, and the remaining properties are absent (null).
        assertEquals("2012-07-03T12:05:29Z", meta.get(TikaCoreProperties.CREATED));
        assertEquals(null, meta.get(TikaCoreProperties.CONTRIBUTOR));
        assertEquals("2013-09-06T08:06:57Z", meta.get(Metadata.MODIFIED));
        assertEquals("solidworks-dcom_dev", meta.get(TikaCoreProperties.MODIFIER));
        assertEquals(null, meta.get(TikaCoreProperties.RELATION));
        assertEquals(null, meta.get(TikaCoreProperties.RIGHTS));
        assertEquals(null, meta.get(TikaCoreProperties.SOURCE));
        assertEquals("", meta.get(TikaCoreProperties.TITLE));
        assertEquals("", meta.get(TikaCoreProperties.KEYWORDS));
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

BodyContentHandler (org.apache.tika.sax.BodyContentHandler)251 Metadata (org.apache.tika.metadata.Metadata)242 Test (org.junit.Test)213 ContentHandler (org.xml.sax.ContentHandler)202 InputStream (java.io.InputStream)189 ParseContext (org.apache.tika.parser.ParseContext)170 TikaTest (org.apache.tika.TikaTest)117 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)87 Parser (org.apache.tika.parser.Parser)81 ByteArrayInputStream (java.io.ByteArrayInputStream)65 TikaInputStream (org.apache.tika.io.TikaInputStream)65 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)24 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)24 TikaException (org.apache.tika.exception.TikaException)23 IOException (java.io.IOException)17 OfficeParser (org.apache.tika.parser.microsoft.OfficeParser)15 EmptyParser (org.apache.tika.parser.EmptyParser)14 SAXException (org.xml.sax.SAXException)13 MediaType (org.apache.tika.mime.MediaType)10 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)10