Search in sources:

Example 91 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class SolidworksParserTest method testDrawing2014SP0Parser.

/**
 * Parses a SolidWorks drawing saved by version 2014 SP0 with {@link OfficeParser}
 * and checks the content type and document properties stored in the metadata.
 */
@Test
public void testDrawing2014SP0Parser() throws Exception {
    final String resource = "/test-documents/testsolidworksDrawing2014SP0.SLDDRW";
    try (InputStream stream = SolidworksParserTest.class.getResourceAsStream(resource)) {
        Metadata meta = new Metadata();
        ContentHandler body = new BodyContentHandler();
        OfficeParser parser = new OfficeParser();
        parser.parse(stream, body, meta, new ParseContext());

        // Media type reported by the parser
        assertEquals("application/sldworks", meta.get(Metadata.CONTENT_TYPE));

        // Document properties: creation/modification details are set, several
        // Dublin Core style fields are absent (null), title and keywords are
        // present but empty.
        assertEquals("2012-07-03T12:05:29Z", meta.get(TikaCoreProperties.CREATED));
        assertEquals(null, meta.get(TikaCoreProperties.CONTRIBUTOR));
        assertEquals("2013-11-28T12:41:49Z", meta.get(Metadata.MODIFIED));
        assertEquals("solidworks-dcom_dev", meta.get(TikaCoreProperties.MODIFIER));
        assertEquals(null, meta.get(TikaCoreProperties.RELATION));
        assertEquals(null, meta.get(TikaCoreProperties.RIGHTS));
        assertEquals(null, meta.get(TikaCoreProperties.SOURCE));
        assertEquals("", meta.get(TikaCoreProperties.TITLE));
        assertEquals("", meta.get(TikaCoreProperties.KEYWORDS));
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 92 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class SolidworksParserTest method testAssembly2013SP2Parser.

/**
 * Parses a SolidWorks assembly saved by version 2013 SP2 with {@link OfficeParser}
 * and checks the content type and document properties stored in the metadata.
 */
@Test
public void testAssembly2013SP2Parser() throws Exception {
    final String resource = "/test-documents/testsolidworksAssembly2013SP2.SLDASM";
    try (InputStream stream = SolidworksParserTest.class.getResourceAsStream(resource)) {
        Metadata meta = new Metadata();
        ContentHandler body = new BodyContentHandler();
        OfficeParser parser = new OfficeParser();
        parser.parse(stream, body, meta, new ParseContext());

        // Media type reported by the parser
        assertEquals("application/sldworks", meta.get(Metadata.CONTENT_TYPE));

        // Document properties: creation/modification details are set, several
        // Dublin Core style fields are absent (null), title and keywords are
        // present but empty.
        assertEquals("2012-04-25T09:51:38Z", meta.get(TikaCoreProperties.CREATED));
        assertEquals(null, meta.get(TikaCoreProperties.CONTRIBUTOR));
        assertEquals("2013-09-06T08:11:08Z", meta.get(Metadata.MODIFIED));
        assertEquals("solidworks-dcom_dev", meta.get(TikaCoreProperties.MODIFIER));
        assertEquals(null, meta.get(TikaCoreProperties.RELATION));
        assertEquals(null, meta.get(TikaCoreProperties.RIGHTS));
        assertEquals(null, meta.get(TikaCoreProperties.SOURCE));
        assertEquals("", meta.get(TikaCoreProperties.TITLE));
        assertEquals("", meta.get(TikaCoreProperties.KEYWORDS));
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 93 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class Latin1StringsParserTest method testParse.

/**
 * Feeds {@link Latin1StringsParser} a buffer containing the same accented
 * sentence encoded as ISO-8859-1, UTF-8 and UTF-16, interleaved with runs of
 * zero bytes, a few control/garbage bytes and a two-character string, and
 * expects the extracted text to be exactly the sentence three times, each
 * occurrence followed by a newline.
 */
@Test
public void testParse() throws Exception {
    String sentence = "These are Latin1 accented scripts: Â Ã É Ü â ã é ü";
    String tiny = "ab";

    // Individual pieces of the input buffer
    byte[] latin1 = sentence.getBytes(ISO_8859_1);
    byte[] utf8 = sentence.getBytes(UTF_8);
    byte[] utf16 = sentence.getBytes(UTF_16);
    byte[] tinyBytes = tiny.getBytes(ISO_8859_1);
    byte[] padding = new byte[10];
    byte[] garbage = { 0x00, 0x01, 0x02, 0x03, 0x1E, 0x1F, (byte) 0xFF };

    // Assemble the buffer; the order of the segments matters for the test
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    buffer.write(latin1);
    buffer.write(padding);
    buffer.write(utf8);
    buffer.write(garbage);
    buffer.write(utf16);
    buffer.write(padding);
    buffer.write(tinyBytes);

    ContentHandler handler = new BodyContentHandler();
    Parser parser = new Latin1StringsParser();
    try (InputStream in = new ByteArrayInputStream(buffer.toByteArray())) {
        parser.parse(in, handler, new Metadata(), new ParseContext());
    }

    // The output must contain only the sentence, three times, one per line
    String line = sentence + "\n";
    String expected = line + line + line;
    String result = handler.toString();
    assertTrue(result.equals(expected));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) ByteArrayOutputStream(java.io.ByteArrayOutputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) Test(org.junit.Test)

Example 94 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class DcXMLParserTest method testXMLParserAsciiChars.

/**
 * Parses {@code /test-documents/testXML.xml} with {@link DcXMLParser} and
 * checks the Dublin Core metadata it produces as well as the extracted body
 * text.
 */
@Test
public void testXMLParserAsciiChars() throws Exception {
    try (InputStream input = DcXMLParserTest.class.getResourceAsStream("/test-documents/testXML.xml")) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        new DcXMLParser().parse(input, handler, metadata);
        assertEquals("application/xml", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("Tika test document", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Rida Benjelloun", metadata.get(TikaCoreProperties.CREATOR));

        // The file contains 5 dc:subject tags, which come through as
        //  a multi-valued Tika Metadata entry in file order
        assertTrue(metadata.isMultiValued(TikaCoreProperties.KEYWORDS));
        String[] keywords = metadata.getValues(TikaCoreProperties.KEYWORDS);
        assertEquals(5, keywords.length);
        assertEquals("Java", keywords[0]);
        assertEquals("XML", keywords[1]);
        assertEquals("XSLT", keywords[2]);
        assertEquals("JDOM", keywords[3]);
        assertEquals("Indexation", keywords[4]);

        // The same five values must also be readable through Metadata.SUBJECT
        assertTrue(metadata.isMultiValued(Metadata.SUBJECT));
        String[] subjects = metadata.getValues(Metadata.SUBJECT);
        assertEquals(5, subjects.length);
        assertEquals("Java", subjects[0]);
        assertEquals("XML", subjects[1]);
        assertEquals("XSLT", subjects[2]);
        assertEquals("JDOM", subjects[3]);
        assertEquals("Indexation", subjects[4]);

        assertEquals("Framework d\'indexation des documents XML, HTML, PDF etc..", metadata.get(TikaCoreProperties.DESCRIPTION));
        assertEquals("http://www.apache.org", metadata.get(TikaCoreProperties.IDENTIFIER));
        assertEquals("test", metadata.get(TikaCoreProperties.TYPE));
        assertEquals("application/msword", metadata.get(TikaCoreProperties.FORMAT));
        assertEquals("Fr", metadata.get(TikaCoreProperties.LANGUAGE));

        // Guard against a missing value so the test reports an assertion
        // failure rather than a NullPointerException
        String rights = metadata.get(TikaCoreProperties.RIGHTS);
        assertTrue(rights != null && rights.contains("testing chars"));

        String content = handler.toString();
        assertContains("Tika test document", content);
        assertEquals("2000-12-01T00:00:00.000Z", metadata.get(TikaCoreProperties.CREATED));
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 95 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class EmptyAndDuplicateElementsXMLParserTest method testEmptiesAndRepeats.

/**
 * Parses {@code /test-documents/testXML3.xml} with the custom test parser and
 * checks that four first-name and four last-name values are recorded in
 * order, including an empty last name and a repeated one.
 */
@Test
public void testEmptiesAndRepeats() throws Exception {
    final String resource = "/test-documents/testXML3.xml";
    try (InputStream xml = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream(resource)) {
        Metadata meta = new Metadata();
        ContentHandler body = new BodyContentHandler();
        AllowEmptiesAndDuplicatesCustomXMLTestParser parser = new AllowEmptiesAndDuplicatesCustomXMLTestParser();
        parser.parse(xml, body, meta, new ParseContext());

        String[] firstNames = meta.getValues(FIRST_NAME);
        String[] lastNames = meta.getValues(LAST_NAME);

        // Both properties must carry one value per person
        assertEquals(4, firstNames.length);
        assertEquals(4, lastNames.length);

        assertEquals("John", firstNames[0]);
        assertEquals("Smith", lastNames[0]);
        assertEquals("Jane", firstNames[1]);
        assertEquals("Doe", lastNames[1]);
        // Third person has an empty last name, which must be kept as ""
        assertEquals("Bob", firstNames[2]);
        assertEquals("", lastNames[2]);
        // Fourth person repeats the last name of the first one
        assertEquals("Kate", firstNames[3]);
        assertEquals("Smith", lastNames[3]);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

BodyContentHandler (org.apache.tika.sax.BodyContentHandler)261 Metadata (org.apache.tika.metadata.Metadata)252 Test (org.junit.Test)213 ContentHandler (org.xml.sax.ContentHandler)206 InputStream (java.io.InputStream)194 ParseContext (org.apache.tika.parser.ParseContext)176 TikaTest (org.apache.tika.TikaTest)117 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)92 Parser (org.apache.tika.parser.Parser)84 ByteArrayInputStream (java.io.ByteArrayInputStream)66 TikaInputStream (org.apache.tika.io.TikaInputStream)66 TikaException (org.apache.tika.exception.TikaException)25 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)24 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)24 IOException (java.io.IOException)23 EmptyParser (org.apache.tika.parser.EmptyParser)15 OfficeParser (org.apache.tika.parser.microsoft.OfficeParser)15 SAXException (org.xml.sax.SAXException)15 MediaType (org.apache.tika.mime.MediaType)11 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)10