Use of org.apache.tika.parser.microsoft.OfficeParser in the Apache Tika project.
Class OOXMLParserTest, method testExcelXLSB.
/**
 * Verifies handling of the sample .xlsb workbook: the detector must report the
 * binary macro-enabled Excel media type, OfficeParser must not list that type as
 * supported while OOXMLParser must, and AutoDetectParser must extract the
 * expected text from the file.
 */
@Test
public void testExcelXLSB() throws Exception {
    final String xlsbResource = "/test-documents/testEXCEL.xlsb";

    Detector typeDetector = new DefaultDetector();
    AutoDetectParser autoParser = new AutoDetectParser();

    // The same metadata instance (carrying the resource name) is used for
    // both detection and parsing below.
    Metadata metadata = new Metadata();
    metadata.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");

    // Step 1: detection must yield the XLSB media type
    MediaType detectedType;
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(xlsbResource)) {
        detectedType = typeDetector.detect(stream, metadata);
        assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", detectedType.toString());
    }

    // Step 2: the OLE2 OfficeParser does not claim this type...
    boolean claimedByOfficeParser =
            new OfficeParser().getSupportedTypes(new ParseContext()).contains(detectedType);
    assertEquals(false, claimedByOfficeParser);

    // ...whereas the OOXMLParser lists it among its supported types
    boolean claimedByOoxmlParser =
            new OOXMLParser().getSupportedTypes(new ParseContext()).contains(detectedType);
    assertTrue(claimedByOoxmlParser);

    // Step 3: a full parse through AutoDetectParser succeeds and yields the text
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(xlsbResource)) {
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);
        ContentHandler bodyHandler = new BodyContentHandler(-1);
        autoParser.parse(stream, bodyHandler, metadata, parseContext);
        assertContains("This is an example spreadsheet", bodyHandler.toString());
    }
}
Use of org.apache.tika.parser.microsoft.OfficeParser in the Apache Tika project.
Class SolidworksParserTest, method testAssembly2014SP0Parser.
/**
 * Test the parsing of a SolidWorks assembly in version 2014SP0.
 *
 * Parses the sample assembly with OfficeParser and checks the content type and
 * the document properties written into the metadata.
 */
@Test
public void testAssembly2014SP0Parser() throws Exception {
    // try-with-resources closes the stream on every exit path. Unlike an explicit
    // input.close() in a finally block, it skips a null resource, so a missing
    // test document surfaces as the real failure instead of an NPE from close().
    try (InputStream input = SolidworksParserTest.class.getResourceAsStream("/test-documents/testsolidworksAssembly2014SP0.SLDASM")) {
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        new OfficeParser().parse(input, handler, metadata, new ParseContext());

        // Check content type
        assertEquals("application/sldworks", metadata.get(Metadata.CONTENT_TYPE));

        // Check properties
        assertEquals("2012-04-25T09:51:38Z", metadata.get(TikaCoreProperties.CREATED));
        assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR));
        assertEquals("2013-11-28T12:41:49Z", metadata.get(Metadata.MODIFIED));
        assertEquals("solidworks-dcom_dev", metadata.get(TikaCoreProperties.MODIFIER));
        assertEquals(null, metadata.get(TikaCoreProperties.RELATION));
        assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS));
        assertEquals(null, metadata.get(TikaCoreProperties.SOURCE));
        // Title and keywords are expected to be present but empty
        assertEquals("", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS));
    }
}
Use of org.apache.tika.parser.microsoft.OfficeParser in the Apache Tika project.
Class TikaToXMP, method initialize.
/**
 * Registers a converter class for the media types reported by each of the
 * supported parsers (MS Office binary, RTF, MS Open XML, OpenDocument).
 */
private static void initialize() {
    // A default, empty context is enough to query the supported types
    final ParseContext context = new ParseContext();

    // MS Office Binary File Format
    OfficeParser officeParser = new OfficeParser();
    addConverter(officeParser.getSupportedTypes(context), MSOfficeBinaryConverter.class);

    // Rich Text Format
    RTFParser rtfParser = new RTFParser();
    addConverter(rtfParser.getSupportedTypes(context), RTFConverter.class);

    // MS Open XML Format
    OOXMLParser ooxmlParser = new OOXMLParser();
    addConverter(ooxmlParser.getSupportedTypes(context), MSOfficeXMLConverter.class);

    // Open document format
    OpenDocumentParser openDocumentParser = new OpenDocumentParser();
    addConverter(openDocumentParser.getSupportedTypes(context), OpenDocumentConverter.class);
}
Aggregations