use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class WordParserTest method testNoFormat.
/**
 * TIKA-1044 - parsing must cope with documents in which parts of the
 * text have no formatting or styles applied to them.
 */
@Test
public void testNoFormat() throws Exception {
    Metadata meta = new Metadata();
    ContentHandler bodyHandler = new BodyContentHandler();
    ParseContext parseContext = new ParseContext();

    try (InputStream doc = WordParserTest.class.getResourceAsStream("/test-documents/testWORD_no_format.doc")) {
        OfficeParser parser = new OfficeParser();
        parser.parse(doc, bodyHandler, meta, parseContext);
    }

    // The marker sentence has to be present in the text collected by the handler.
    assertContains("Will generate an exception", bodyHandler.toString());
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class WordParserTest method testWordParser.
@Test
public void testWordParser() throws Exception {
    // Expected values that are checked more than once below.
    final String expectedTitle = "Sample Word Document";
    final String expectedAuthor = "Keith Bennett";

    Metadata meta = new Metadata();
    ContentHandler bodyHandler = new BodyContentHandler();

    try (InputStream doc = WordParserTest.class.getResourceAsStream("/test-documents/testWORD.doc")) {
        OfficeParser parser = new OfficeParser();
        parser.parse(doc, bodyHandler, meta, new ParseContext());

        // Metadata populated by the parser.
        assertEquals("application/msword", meta.get(Metadata.CONTENT_TYPE));
        assertEquals(expectedTitle, meta.get(TikaCoreProperties.TITLE));
        assertEquals(expectedAuthor, meta.get(TikaCoreProperties.CREATOR));
        assertEquals(expectedAuthor, meta.get(Metadata.AUTHOR));

        // The same title string must also occur in the extracted text.
        String text = bodyHandler.toString();
        assertContains(expectedTitle, text);
    }
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class ExcelParserTest method testExcelParser.
@Test
// Legacy Tika-1.0 style metadata keys are checked as well, hence the suppression
@SuppressWarnings("deprecation")
public void testExcelParser() throws Exception {
    final String expectedAuthor = "Keith Bennett";
    // Mon Oct 01 17:13:56 BST 2007
    final String expectedCreated = "2007-10-01T16:13:56Z";
    // Mon Oct 01 17:31:43 BST 2007
    final String expectedModified = "2007-10-01T16:31:43Z";

    try (InputStream xls = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xls")) {
        Metadata meta = new Metadata();
        ContentHandler bodyHandler = new BodyContentHandler();

        // The parse runs with an explicit US locale in its context.
        ParseContext usContext = new ParseContext();
        usContext.set(Locale.class, Locale.US);

        OfficeParser parser = new OfficeParser();
        parser.parse(xls, bodyHandler, meta, usContext);

        // Type, title and author (current key plus legacy key).
        assertEquals("application/vnd.ms-excel", meta.get(Metadata.CONTENT_TYPE));
        assertEquals("Simple Excel document", meta.get(TikaCoreProperties.TITLE));
        assertEquals(expectedAuthor, meta.get(TikaCoreProperties.CREATOR));
        assertEquals(expectedAuthor, meta.get(Metadata.AUTHOR));

        // Creation and modification timestamps (current key plus legacy key).
        assertEquals(expectedCreated, meta.get(TikaCoreProperties.CREATED));
        assertEquals(expectedCreated, meta.get(Metadata.CREATION_DATE));
        assertEquals(expectedModified, meta.get(TikaCoreProperties.MODIFIED));
        assertEquals(expectedModified, meta.get(Metadata.DATE));

        String text = bodyHandler.toString();
        assertContains("Sample Excel Worksheet", text);
        assertContains("Numbers and their Squares", text);
        assertContains("\t\tNumber\tSquare", text);

        // Whole numbers must be present, but not in a form with a trailing ".0".
        assertContains("9", text);
        assertNotContained("9.0", text);
        assertContains("196", text);
        assertNotContained("196.0", text);
    }
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class ExcelParserTest method testHeaderAndFooterExtraction.
@Test
public void testHeaderAndFooterExtraction() throws Exception {
    final String expectedAuthor = "Aeham Abushwashi";

    // Text fragments that must all occur in the extracted content, checked in this order.
    final String[] expectedFragments = {
        "John Smith1",
        "John Smith50",
        "1 Corporate HQ",
        "Header - Corporate Spreadsheet",
        "Header - For Internal Use Only",
        "Header - Author: John Smith",
        "Footer - Corporate Spreadsheet",
        "Footer - For Internal Use Only",
        "Footer - Author: John Smith",
    };

    try (InputStream xls = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_headers_footers.xls")) {
        Metadata meta = new Metadata();
        ContentHandler bodyHandler = new BodyContentHandler();

        // The parse runs with an explicit UK locale in its context.
        ParseContext ukContext = new ParseContext();
        ukContext.set(Locale.class, Locale.UK);

        OfficeParser parser = new OfficeParser();
        parser.parse(xls, bodyHandler, meta, ukContext);

        assertEquals("application/vnd.ms-excel", meta.get(Metadata.CONTENT_TYPE));
        assertEquals("Internal spreadsheet", meta.get(TikaCoreProperties.TITLE));
        assertEquals(expectedAuthor, meta.get(TikaCoreProperties.CREATOR));
        assertEquals(expectedAuthor, meta.get(Metadata.AUTHOR));

        String text = bodyHandler.toString();
        for (String fragment : expectedFragments) {
            assertContains(fragment, text);
        }
    }
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class ExcelParserTest method testWorksSpreadsheet70.
@Test
public void testWorksSpreadsheet70() throws Exception {
    try (InputStream xlr = ExcelParserTest.class.getResourceAsStream("/test-documents/testWORKSSpreadsheet7.0.xlr")) {
        // The parse runs with an explicit US locale in its context.
        ParseContext usContext = new ParseContext();
        usContext.set(Locale.class, Locale.US);

        // NOTE(review): -1 presumably lifts the handler's write limit - confirm against BodyContentHandler.
        ContentHandler bodyHandler = new BodyContentHandler(-1);
        Metadata meta = new Metadata();

        OfficeParser parser = new OfficeParser();
        parser.parse(xlr, bodyHandler, meta, usContext);

        // The extracted text has to mention the producing application.
        assertContains("Microsoft Works", bodyHandler.toString());
    }
}
Aggregations