Search in sources :

Example 46 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class WordParserTest method testNoFormat.

/**
 * TIKA-1044: handles documents in which parts of the text have no
 * formatting or styles applied to them.
 *
 * @throws Exception if the resource cannot be read or parsing fails
 */
@Test
public void testNoFormat() throws Exception {
    final String resource = "/test-documents/testWORD_no_format.doc";
    ContentHandler bodyText = new BodyContentHandler();
    Metadata meta = new Metadata();
    try (InputStream doc = WordParserTest.class.getResourceAsStream(resource)) {
        OfficeParser parser = new OfficeParser();
        ParseContext parseContext = new ParseContext();
        parser.parse(doc, bodyText, meta, parseContext);
    }
    // The extracted text is expected to include this phrase.
    assertContains("Will generate an exception", bodyText.toString());
}
Also used: BodyContentHandler (org.apache.tika.sax.BodyContentHandler), InputStream (java.io.InputStream), Metadata (org.apache.tika.metadata.Metadata), ParseContext (org.apache.tika.parser.ParseContext), ContentHandler (org.xml.sax.ContentHandler), Test (org.junit.Test), TikaTest (org.apache.tika.TikaTest)

Example 47 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class WordParserTest method testWordParser.

/**
 * Parses the sample Word document with {@code OfficeParser} and checks the
 * content type, title, creator and author metadata as well as the body text.
 *
 * @throws Exception if the resource cannot be read or parsing fails
 */
@Test
public void testWordParser() throws Exception {
    final String resource = "/test-documents/testWORD.doc";
    try (InputStream doc = WordParserTest.class.getResourceAsStream(resource)) {
        ContentHandler bodyText = new BodyContentHandler();
        Metadata meta = new Metadata();
        OfficeParser parser = new OfficeParser();
        ParseContext parseContext = new ParseContext();
        parser.parse(doc, bodyText, meta, parseContext);

        // Metadata populated during the parse.
        String contentType = meta.get(Metadata.CONTENT_TYPE);
        assertEquals("application/msword", contentType);
        String title = meta.get(TikaCoreProperties.TITLE);
        assertEquals("Sample Word Document", title);
        String creator = meta.get(TikaCoreProperties.CREATOR);
        assertEquals("Keith Bennett", creator);
        String author = meta.get(Metadata.AUTHOR);
        assertEquals("Keith Bennett", author);

        // Text collected by the body handler.
        String extracted = bodyText.toString();
        assertContains("Sample Word Document", extracted);
    }
}
Also used: BodyContentHandler (org.apache.tika.sax.BodyContentHandler), InputStream (java.io.InputStream), Metadata (org.apache.tika.metadata.Metadata), ParseContext (org.apache.tika.parser.ParseContext), ContentHandler (org.xml.sax.ContentHandler), Test (org.junit.Test), TikaTest (org.apache.tika.TikaTest)

Example 48 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class ExcelParserTest method testExcelParser.

/**
 * Parses the sample Excel workbook with {@code OfficeParser} using the US
 * locale and checks its metadata (including the legacy keys) and the
 * extracted cell text.
 *
 * @throws Exception if the resource cannot be read or parsing fails
 */
@Test
// Deprecation warnings are suppressed because the legacy Tika-1.0 style
// metadata keys are checked here as well.
@SuppressWarnings("deprecation")
public void testExcelParser() throws Exception {
    final String resource = "/test-documents/testEXCEL.xls";
    try (InputStream workbook = ExcelParserTest.class.getResourceAsStream(resource)) {
        Metadata meta = new Metadata();
        ContentHandler bodyText = new BodyContentHandler();
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);
        OfficeParser parser = new OfficeParser();
        parser.parse(workbook, bodyText, meta, parseContext);

        // Document identification.
        String contentType = meta.get(Metadata.CONTENT_TYPE);
        assertEquals("application/vnd.ms-excel", contentType);
        String title = meta.get(TikaCoreProperties.TITLE);
        assertEquals("Simple Excel document", title);
        String creator = meta.get(TikaCoreProperties.CREATOR);
        assertEquals("Keith Bennett", creator);
        String author = meta.get(Metadata.AUTHOR);
        assertEquals("Keith Bennett", author);

        // Creation time: Mon Oct 01 17:13:56 BST 2007
        final String expectedCreated = "2007-10-01T16:13:56Z";
        assertEquals(expectedCreated, meta.get(TikaCoreProperties.CREATED));
        assertEquals(expectedCreated, meta.get(Metadata.CREATION_DATE));

        // Modification time: Mon Oct 01 17:31:43 BST 2007
        final String expectedModified = "2007-10-01T16:31:43Z";
        assertEquals(expectedModified, meta.get(TikaCoreProperties.MODIFIED));
        assertEquals(expectedModified, meta.get(Metadata.DATE));

        String extracted = bodyText.toString();
        assertContains("Sample Excel Worksheet", extracted);
        assertContains("Numbers and their Squares", extracted);
        assertContains("\t\tNumber\tSquare", extracted);

        // Numbers must appear without a trailing ".0".
        assertContains("9", extracted);
        assertNotContained("9.0", extracted);
        assertContains("196", extracted);
        assertNotContained("196.0", extracted);
    }
}
Also used: BodyContentHandler (org.apache.tika.sax.BodyContentHandler), InputStream (java.io.InputStream), Metadata (org.apache.tika.metadata.Metadata), ParseContext (org.apache.tika.parser.ParseContext), ContentHandler (org.xml.sax.ContentHandler), Test (org.junit.Test), TikaTest (org.apache.tika.TikaTest)

Example 49 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class ExcelParserTest method testHeaderAndFooterExtraction.

/**
 * Parses a workbook with headers and footers using the UK locale and checks
 * that the metadata, the cell text and the header/footer text are present
 * in the extracted content.
 *
 * @throws Exception if the resource cannot be read or parsing fails
 */
@Test
public void testHeaderAndFooterExtraction() throws Exception {
    final String resource = "/test-documents/testEXCEL_headers_footers.xls";
    try (InputStream workbook = ExcelParserTest.class.getResourceAsStream(resource)) {
        Metadata meta = new Metadata();
        ContentHandler bodyText = new BodyContentHandler();
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.UK);
        OfficeParser parser = new OfficeParser();
        parser.parse(workbook, bodyText, meta, parseContext);

        String contentType = meta.get(Metadata.CONTENT_TYPE);
        assertEquals("application/vnd.ms-excel", contentType);
        String title = meta.get(TikaCoreProperties.TITLE);
        assertEquals("Internal spreadsheet", title);
        String creator = meta.get(TikaCoreProperties.CREATOR);
        assertEquals("Aeham Abushwashi", creator);
        String author = meta.get(Metadata.AUTHOR);
        assertEquals("Aeham Abushwashi", author);

        String extracted = bodyText.toString();
        // Checked in this order so the first missing fragment is the one reported.
        String[] expectedFragments = {
            "John Smith1",
            "John Smith50",
            "1 Corporate HQ",
            "Header - Corporate Spreadsheet",
            "Header - For Internal Use Only",
            "Header - Author: John Smith",
            "Footer - Corporate Spreadsheet",
            "Footer - For Internal Use Only",
            "Footer - Author: John Smith"
        };
        for (String fragment : expectedFragments) {
            assertContains(fragment, extracted);
        }
    }
}
Also used: BodyContentHandler (org.apache.tika.sax.BodyContentHandler), InputStream (java.io.InputStream), Metadata (org.apache.tika.metadata.Metadata), ParseContext (org.apache.tika.parser.ParseContext), ContentHandler (org.xml.sax.ContentHandler), Test (org.junit.Test), TikaTest (org.apache.tika.TikaTest)

Example 50 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class ExcelParserTest method testWorksSpreadsheet70.

/**
 * Parses a Works Spreadsheet 7.0 ({@code .xlr}) file with
 * {@code OfficeParser} using the US locale and checks that the extracted
 * text mentions "Microsoft Works".
 *
 * @throws Exception if the resource cannot be read or parsing fails
 */
@Test
public void testWorksSpreadsheet70() throws Exception {
    final String resource = "/test-documents/testWORKSSpreadsheet7.0.xlr";
    try (InputStream spreadsheet = ExcelParserTest.class.getResourceAsStream(resource)) {
        Metadata meta = new Metadata();
        // NOTE(review): -1 presumably disables the handler's write limit;
        // confirm against the BodyContentHandler documentation.
        ContentHandler bodyText = new BodyContentHandler(-1);
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);
        OfficeParser parser = new OfficeParser();
        parser.parse(spreadsheet, bodyText, meta, parseContext);

        assertContains("Microsoft Works", bodyText.toString());
    }
}
Also used: BodyContentHandler (org.apache.tika.sax.BodyContentHandler), InputStream (java.io.InputStream), Metadata (org.apache.tika.metadata.Metadata), ParseContext (org.apache.tika.parser.ParseContext), ContentHandler (org.xml.sax.ContentHandler), Test (org.junit.Test), TikaTest (org.apache.tika.TikaTest)

Aggregations

BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 261, Metadata (org.apache.tika.metadata.Metadata): 252, Test (org.junit.Test): 213, ContentHandler (org.xml.sax.ContentHandler): 206, InputStream (java.io.InputStream): 194, ParseContext (org.apache.tika.parser.ParseContext): 176, TikaTest (org.apache.tika.TikaTest): 117, AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 92, Parser (org.apache.tika.parser.Parser): 84, ByteArrayInputStream (java.io.ByteArrayInputStream): 66, TikaInputStream (org.apache.tika.io.TikaInputStream): 66, TikaException (org.apache.tika.exception.TikaException): 25, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 24, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 24, IOException (java.io.IOException): 23, EmptyParser (org.apache.tika.parser.EmptyParser): 15, OfficeParser (org.apache.tika.parser.microsoft.OfficeParser): 15, SAXException (org.xml.sax.SAXException): 15, MediaType (org.apache.tika.mime.MediaType): 11, XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 10