use of org.apache.tika.parser.ParseContext in project tika by apache.
the class OOXMLParserTest method testProtectedExcelFile.
/**
 * Parses a password protected Excel workbook through the auto-detecting
 * parser and verifies the reported content type, the protected flag and
 * that the extracted text contains "Office".
 * See TIKA-437.
 */
@Test
public void testProtectedExcelFile() throws Exception {
    Parser autoDetect = new AutoDetectParser();
    Metadata meta = new Metadata();
    ContentHandler body = new BodyContentHandler();
    try (InputStream xlsx = getTestDocument("protectedFile.xlsx")) {
        autoDetect.parse(xlsx, body, meta, new ParseContext());

        String mimeType = meta.get(Metadata.CONTENT_TYPE);
        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", mimeType);

        String protectedFlag = meta.get(TikaMetadataKeys.PROTECTED);
        assertEquals("true", protectedFlag);

        assertContains("Office", body.toString());
    }
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class TNEFParserTest method testMetadata.
/**
 * Parses the TNEF test file and checks that the message subject is
 * exposed both as the title and as the subject metadata entry.
 */
@Test
public void testMetadata() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    TNEFParser tnef = new TNEFParser();
    // Close the input stream even if parsing throws, matching the
    // try-with-resources handling used by the other parser tests.
    try (TikaInputStream stream = getTestFile(file)) {
        tnef.parse(stream, handler, metadata, new ParseContext());
    }
    assertEquals("This is a test message", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("This is a test message", metadata.get(Metadata.SUBJECT));
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class WordParserTest method testCustomProperties.
/**
 * Ensures that custom OLE2 (HPSF) properties are extracted,
 * together with the standard metadata of the Word test document.
 */
@Test
public void testCustomProperties() throws Exception {
    Metadata props = new Metadata();
    String resource = "/test-documents/testWORD_custom_props.doc";
    try (InputStream doc = WordParserTest.class.getResourceAsStream(resource)) {
        ParseContext usContext = new ParseContext();
        usContext.set(Locale.class, Locale.US);
        new OfficeParser().parse(doc, new BodyContentHandler(-1), props, usContext);
    }

    // Expected values that are checked under more than one metadata key.
    String lastEditor = "Etienne Jouvin";
    String modifiedAt = "2012-01-03T22:14:00Z";
    String createdAt = "2010-10-05T09:03:00Z";
    String subject = "My subject";

    assertEquals("application/msword", props.get(Metadata.CONTENT_TYPE));
    assertEquals("EJ04325S", props.get(TikaCoreProperties.CREATOR));
    assertEquals(lastEditor, props.get(TikaCoreProperties.MODIFIER));
    assertEquals(lastEditor, props.get(Metadata.LAST_AUTHOR));
    assertEquals(modifiedAt, props.get(TikaCoreProperties.MODIFIED));
    assertEquals(modifiedAt, props.get(Metadata.DATE));
    assertEquals(createdAt, props.get(TikaCoreProperties.CREATED));
    assertEquals(createdAt, props.get(Metadata.CREATION_DATE));
    assertEquals("Microsoft Office Word", props.get(OfficeOpenXMLExtended.APPLICATION));
    assertEquals("1", props.get(Office.PAGE_COUNT));
    assertEquals("2", props.get(Office.WORD_COUNT));
    assertEquals("My Title", props.get(TikaCoreProperties.TITLE));
    assertEquals("My Keyword", props.get(TikaCoreProperties.KEYWORDS));
    assertEquals("Normal.dotm", props.get(OfficeOpenXMLExtended.TEMPLATE));
    assertEquals("My Comments", props.get(TikaCoreProperties.COMMENTS));
    // TODO: Move to OO subject in Tika 2.0
    assertEquals(subject, props.get(Metadata.SUBJECT));
    assertEquals(subject, props.get(OfficeOpenXMLCore.SUBJECT));
    assertEquals("EDF-DIT", props.get(OfficeOpenXMLExtended.COMPANY));

    // Custom (user-defined) properties are looked up by their "custom:" keys.
    assertEquals("MyStringValue", props.get("custom:MyCustomString"));
    assertEquals("2010-12-30T23:00:00Z", props.get("custom:MyCustomDate"));
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class WordParserTest method testNoFormat.
/**
 * TIKA-1044 - Handle documents where parts of the
 * text have no formatting or styles applied to them.
 * Parses such a document and checks the extracted text.
 */
@Test
public void testNoFormat() throws Exception {
    ContentHandler text = new BodyContentHandler();
    String resource = "/test-documents/testWORD_no_format.doc";
    try (InputStream doc = WordParserTest.class.getResourceAsStream(resource)) {
        OfficeParser officeParser = new OfficeParser();
        officeParser.parse(doc, text, new Metadata(), new ParseContext());
    }
    assertContains("Will generate an exception", text.toString());
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class WordParserTest method testWordParser.
/**
 * Parses the sample Word document and checks its content type,
 * title, creator/author metadata and extracted body text.
 */
@Test
public void testWordParser() throws Exception {
    String resource = "/test-documents/testWORD.doc";
    try (InputStream doc = WordParserTest.class.getResourceAsStream(resource)) {
        ContentHandler body = new BodyContentHandler();
        Metadata meta = new Metadata();
        OfficeParser officeParser = new OfficeParser();
        officeParser.parse(doc, body, meta, new ParseContext());

        String expectedTitle = "Sample Word Document";
        String expectedAuthor = "Keith Bennett";

        assertEquals("application/msword", meta.get(Metadata.CONTENT_TYPE));
        assertEquals(expectedTitle, meta.get(TikaCoreProperties.TITLE));
        assertEquals(expectedAuthor, meta.get(TikaCoreProperties.CREATOR));
        assertEquals(expectedAuthor, meta.get(Metadata.AUTHOR));
        assertContains(expectedTitle, body.toString());
    }
}
Aggregations