Example use of org.xml.sax.ContentHandler in the Apache Tika project.
From the class TNEFParserTest, method testMetadata.
/**
 * Parses the TNEF test file and verifies that the expected message text is
 * reported under both {@code TikaCoreProperties.TITLE} and
 * {@code Metadata.SUBJECT}.
 *
 * @throws Exception if the test file cannot be opened or parsing fails
 */
@Test
public void testMetadata() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    TNEFParser tnef = new TNEFParser();
    // try-with-resources: the stream is closed even when parse() throws,
    // consistent with how the other parser tests manage their inputs.
    try (TikaInputStream stream = getTestFile(file)) {
        tnef.parse(stream, handler, metadata, new ParseContext());
    }
    assertEquals("This is a test message", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("This is a test message", metadata.get(Metadata.SUBJECT));
}
Example use of org.xml.sax.ContentHandler in the Apache Tika project.
From the class WordParserTest, method testCustomProperties.
/**
 * Checks that custom OLE2 (HPSF) properties of a Word document are
 * extracted, together with the standard document properties.
 */
@Test
public void testCustomProperties() throws Exception {
    Metadata meta = new Metadata();
    ContentHandler body = new BodyContentHandler(-1);
    ParseContext parseContext = new ParseContext();
    parseContext.set(Locale.class, Locale.US);

    try (InputStream doc = WordParserTest.class.getResourceAsStream("/test-documents/testWORD_custom_props.doc")) {
        new OfficeParser().parse(doc, body, meta, parseContext);
    }

    assertEquals("application/msword", meta.get(Metadata.CONTENT_TYPE));

    // Creator and last modifier
    assertEquals("EJ04325S", meta.get(TikaCoreProperties.CREATOR));
    assertEquals("Etienne Jouvin", meta.get(TikaCoreProperties.MODIFIER));
    assertEquals("Etienne Jouvin", meta.get(Metadata.LAST_AUTHOR));

    // Modification and creation timestamps
    assertEquals("2012-01-03T22:14:00Z", meta.get(TikaCoreProperties.MODIFIED));
    assertEquals("2012-01-03T22:14:00Z", meta.get(Metadata.DATE));
    assertEquals("2010-10-05T09:03:00Z", meta.get(TikaCoreProperties.CREATED));
    assertEquals("2010-10-05T09:03:00Z", meta.get(Metadata.CREATION_DATE));

    // Application details and document statistics
    assertEquals("Microsoft Office Word", meta.get(OfficeOpenXMLExtended.APPLICATION));
    assertEquals("1", meta.get(Office.PAGE_COUNT));
    assertEquals("2", meta.get(Office.WORD_COUNT));

    // Descriptive properties
    assertEquals("My Title", meta.get(TikaCoreProperties.TITLE));
    assertEquals("My Keyword", meta.get(TikaCoreProperties.KEYWORDS));
    assertEquals("Normal.dotm", meta.get(OfficeOpenXMLExtended.TEMPLATE));
    assertEquals("My Comments", meta.get(TikaCoreProperties.COMMENTS));
    // TODO: Move to OO subject in Tika 2.0
    assertEquals("My subject", meta.get(Metadata.SUBJECT));
    assertEquals("My subject", meta.get(OfficeOpenXMLCore.SUBJECT));
    assertEquals("EDF-DIT", meta.get(OfficeOpenXMLExtended.COMPANY));

    // Custom properties, looked up by their "custom:"-prefixed keys
    assertEquals("MyStringValue", meta.get("custom:MyCustomString"));
    assertEquals("2010-12-30T23:00:00Z", meta.get("custom:MyCustomDate"));
}
Example use of org.xml.sax.ContentHandler in the Apache Tika project.
From the class WordParserTest, method testNoFormat.
/**
 * TIKA-1044 - documents in which parts of the text have no formatting
 * or styles applied must still be handled and yield their text.
 */
@Test
public void testNoFormat() throws Exception {
    Metadata meta = new Metadata();
    ContentHandler body = new BodyContentHandler();

    try (InputStream doc = WordParserTest.class.getResourceAsStream("/test-documents/testWORD_no_format.doc")) {
        new OfficeParser().parse(doc, body, meta, new ParseContext());
    }

    // The expected phrase is part of the test document's text.
    assertContains("Will generate an exception", body.toString());
}
Example use of org.xml.sax.ContentHandler in the Apache Tika project.
From the class WordParserTest, method testWordParser.
/**
 * Parses the sample Word document and checks its content type, title,
 * author metadata and extracted body text.
 */
@Test
public void testWordParser() throws Exception {
    Metadata meta = new Metadata();
    ContentHandler body = new BodyContentHandler();

    try (InputStream doc = WordParserTest.class.getResourceAsStream("/test-documents/testWORD.doc")) {
        new OfficeParser().parse(doc, body, meta, new ParseContext());

        assertEquals("application/msword", meta.get(Metadata.CONTENT_TYPE));
        assertEquals("Sample Word Document", meta.get(TikaCoreProperties.TITLE));
        assertEquals("Keith Bennett", meta.get(TikaCoreProperties.CREATOR));
        assertEquals("Keith Bennett", meta.get(Metadata.AUTHOR));

        String text = body.toString();
        assertContains("Sample Word Document", text);
    }
}
Example use of org.xml.sax.ContentHandler in the Apache Tika project.
From the class ExcelParserTest, method testExcelParser.
@Test
// Checks legacy Tika-1.0 style metadata keys
@SuppressWarnings("deprecation")
public void testExcelParser() throws Exception {
    Metadata meta = new Metadata();
    ContentHandler body = new BodyContentHandler();
    ParseContext parseContext = new ParseContext();
    parseContext.set(Locale.class, Locale.US);

    try (InputStream workbook = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xls")) {
        new OfficeParser().parse(workbook, body, meta, parseContext);

        // Content type, title and author
        assertEquals("application/vnd.ms-excel", meta.get(Metadata.CONTENT_TYPE));
        assertEquals("Simple Excel document", meta.get(TikaCoreProperties.TITLE));
        assertEquals("Keith Bennett", meta.get(TikaCoreProperties.CREATOR));
        assertEquals("Keith Bennett", meta.get(Metadata.AUTHOR));

        // Created: Mon Oct 01 17:13:56 BST 2007
        assertEquals("2007-10-01T16:13:56Z", meta.get(TikaCoreProperties.CREATED));
        assertEquals("2007-10-01T16:13:56Z", meta.get(Metadata.CREATION_DATE));

        // Modified: Mon Oct 01 17:31:43 BST 2007
        assertEquals("2007-10-01T16:31:43Z", meta.get(TikaCoreProperties.MODIFIED));
        assertEquals("2007-10-01T16:31:43Z", meta.get(Metadata.DATE));

        String text = body.toString();

        // Headings and the tab-separated column header row
        assertContains("Sample Excel Worksheet", text);
        assertContains("Numbers and their Squares", text);
        assertContains("\t\tNumber\tSquare", text);

        // Whole numbers must appear without a trailing ".0"
        assertContains("9", text);
        assertNotContained("9.0", text);
        assertContains("196", text);
        assertNotContained("196.0", text);
    }
}
Aggregations