use of org.apache.tika.parser.microsoft.ooxml.OOXMLParser in project tika by apache.
the class TikaParsersTest method testGetHTML.
/**
 * Requests the parser listing as HTML, once for each value of the
 * details flag passed to getPath, and checks the returned markup.
 */
@Test
public void testGetHTML() throws Exception {
    // Markup fragments expected in the listing whatever the details flag is
    final String[] expectedFragments = {
        "<h2>DefaultParser</h2>",
        "Composite",
        "<h3>OpusParser",
        "<h3>PackageParser",
        "<h3>OOXMLParser"
    };
    // Parsers whose fully qualified class names must be listed
    final Class<?>[] expectedParsers = { OpusParser.class, PackageParser.class, OOXMLParser.class };
    // Mime type list items, present only in the detailed listing
    final String[] mimeTypeItems = { "<li>text/plain", "<li>application/pdf", "<li>audio/ogg" };
    // Bare mime types, which must be absent from the non-detailed listing
    final String[] mimeTypeNames = { "text/plain", "application/pdf", "audio/ogg" };

    final boolean[] detailModes = { false, true };
    for (int i = 0; i < detailModes.length; i++) {
        boolean details = detailModes[i];

        String url = endPoint + getPath(details);
        Response response = WebClient.create(url).type("text/html").accept("text/html").get();
        InputStream entity = (InputStream) response.getEntity();
        String html = getStringFromInputStream(entity);

        for (String fragment : expectedFragments) {
            assertContains(fragment, html);
        }
        for (Class<?> parserClass : expectedParsers) {
            assertContains(parserClass.getName(), html);
        }

        if (!details) {
            // Shouldn't list any mime types
            for (String mimeType : mimeTypeNames) {
                assertNotFound(mimeType, html);
            }
        } else {
            // Should have the mimetypes they handle
            for (String item : mimeTypeItems) {
                assertContains(item, html);
            }
        }
    }
}
use of org.apache.tika.parser.microsoft.ooxml.OOXMLParser in project tika by apache.
the class TikaToXMP method initialize.
/**
 * Registers a converter class for the media types reported by each of the
 * supported parsers.
 */
private static void initialize() {
    // The parsers are only asked for their supported types, so an empty context is enough
    final ParseContext emptyContext = new ParseContext();

    // MS Office Binary File Format
    addConverter(
            new OfficeParser().getSupportedTypes(emptyContext),
            MSOfficeBinaryConverter.class);

    // Rich Text Format
    addConverter(
            new RTFParser().getSupportedTypes(emptyContext),
            RTFConverter.class);

    // MS Open XML Format
    addConverter(
            new OOXMLParser().getSupportedTypes(emptyContext),
            MSOfficeXMLConverter.class);

    // Open document format
    addConverter(
            new OpenDocumentParser().getSupportedTypes(emptyContext),
            OpenDocumentConverter.class);
}
use of org.apache.tika.parser.microsoft.ooxml.OOXMLParser in project tika by apache.
the class ExcelParserTest method testExcel95.
/**
 * Excel 5 and 95 are older formats, and only get basic support.
 * Checks type detection, which parser claims the detected type, and the
 * text and metadata extracted from one sample file of each format.
 */
@Test
public void testExcel95() throws Exception {
    final String excel5Resource = "/test-documents/testEXCEL_5.xls";
    final String excel95Resource = "/test-documents/testEXCEL_95.xls";
    final String expectedType = "application/vnd.ms-excel";

    Detector detector = new DefaultDetector();
    AutoDetectParser parser = new AutoDetectParser();
    MediaType detected;

    // Detection of the Excel 5 file
    Metadata excel5Hints = new Metadata();
    excel5Hints.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(excel5Resource)) {
        detected = detector.detect(stream, excel5Hints);
        assertEquals(expectedType, detected.toString());
    }

    // Detection of the Excel 95 file
    Metadata excel95Hints = new Metadata();
    excel95Hints.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(excel95Resource)) {
        detected = detector.detect(stream, excel95Hints);
        assertEquals(expectedType, detected.toString());
    }

    // The detected type is claimed by OfficeParser ...
    boolean handledByOfficeParser =
            new OfficeParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(true, handledByOfficeParser);
    // ... and not by OOXMLParser
    boolean handledByOoxmlParser =
            new OOXMLParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(false, handledByOoxmlParser);

    // Full parse of the Excel 5 file
    Metadata excel5Metadata = new Metadata();
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(excel5Resource)) {
        ContentHandler bodyHandler = new BodyContentHandler(-1);
        ParseContext usContext = new ParseContext();
        usContext.set(Locale.class, Locale.US);
        parser.parse(stream, bodyHandler, excel5Metadata, usContext);
        String text = bodyHandler.toString();

        // Sheet names, then text, then numbers
        final String[] expectedSnippets = {
            "Feuil1", "Feuil3",
            "Sample Excel", "Number",
            "15", "225"
        };
        for (String snippet : expectedSnippets) {
            assertContains(snippet, text);
        }

        // Metadata was also fetched
        assertEquals("Simple Excel document", excel5Metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Keith Bennett", excel5Metadata.get(TikaCoreProperties.CREATOR));
    }

    // Full parse of the Excel 95 file
    Metadata excel95Metadata = new Metadata();
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(excel95Resource)) {
        ContentHandler bodyHandler = new BodyContentHandler(-1);
        ParseContext usContext = new ParseContext();
        usContext.set(Locale.class, Locale.US);
        parser.parse(stream, bodyHandler, excel95Metadata, usContext);
        String text = bodyHandler.toString();

        // Only the sheet name is checked: very boring file, no actual text or numbers!
        assertContains("Foglio1", text);

        // Metadata was also fetched
        assertEquals(null, excel95Metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Marco Quaranta", excel95Metadata.get(Office.LAST_AUTHOR));
    }
}
use of org.apache.tika.parser.microsoft.ooxml.OOXMLParser in project tika by apache.
the class OfficeParser method parse.
/**
 * Extracts metadata and content from an OLE2 directory tree.
 * <p>
 * The summary streams are parsed first, then the document type is detected
 * and the matching extractor writes the content to the XHTML handler.
 * Encrypted documents are decrypted and handed to {@link OOXMLParser}.
 * Types without a dedicated extractor only contribute the summary metadata.
 *
 * @param root root directory node of the OLE2 document
 * @param context parse context; consulted here for an optional {@link Locale}
 *                and an optional {@link PasswordProvider}
 * @param metadata metadata object that is filled in during parsing
 * @param xhtml handler that receives the extracted content
 * @throws IOException if reading the document or closing the decrypted stream fails
 * @throws SAXException if the content handler reports an error
 * @throws TikaException if the document cannot be parsed; an
 *         EncryptedDocumentException is raised when password verification
 *         fails or decryption reports a security error
 */
protected void parse(DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    // Parse summary entries first, to make metadata available early
    new SummaryExtractor(metadata).parseSummaries(root);

    // Parse remaining document entries
    POIFSDocumentType type = POIFSDocumentType.detectType(root);
    if (type != POIFSDocumentType.UNKNOWN) {
        setType(metadata, type.getType());
    }

    switch (type) {
        case SOLIDWORKS_PART:
        case SOLIDWORKS_ASSEMBLY:
        case SOLIDWORKS_DRAWING:
            // No content extraction for these types, only the metadata gathered above
            break;
        case PUBLISHER:
            PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(root);
            xhtml.element("p", publisherTextExtractor.getText());
            break;
        case WORDDOCUMENT:
            new WordExtractor(context, metadata).parse(root, xhtml);
            break;
        case POWERPOINT:
            new HSLFExtractor(context, metadata).parse(root, xhtml);
            break;
        case WORKBOOK:
        case XLR:
            // Use the caller supplied locale if there is one, else the JVM default
            Locale locale = context.get(Locale.class, Locale.getDefault());
            new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
            break;
        case PROJECT:
            // We currently can't do anything beyond the metadata
            break;
        case VISIO:
            VisioTextExtractor visioTextExtractor = new VisioTextExtractor(root);
            for (String text : visioTextExtractor.getAllText()) {
                xhtml.element("p", text);
            }
            break;
        case OUTLOOK:
            OutlookExtractor extractor = new OutlookExtractor(root, context);
            extractor.parse(xhtml, metadata);
            break;
        case ENCRYPTED:
            EncryptionInfo info = new EncryptionInfo(root);
            Decryptor d = Decryptor.getInstance(info);
            try {
                // By default, use the default Office Password
                String password = Decryptor.DEFAULT_PASSWORD;

                // If they supplied a Password Provider, ask that for the password,
                // and use the provider given one if available (stick with default if not)
                PasswordProvider passwordProvider = context.get(PasswordProvider.class);
                if (passwordProvider != null) {
                    String suppliedPassword = passwordProvider.getPassword(metadata);
                    if (suppliedPassword != null) {
                        password = suppliedPassword;
                    }
                }

                // Check if we've the right password or not
                if (!d.verifyPassword(password)) {
                    throw new EncryptedDocumentException();
                }

                // Decrypt the OLE2 stream, and delegate the resulting OOXML
                // file to the regular OOXML parser for normal handling
                OOXMLParser parser = new OOXMLParser();
                // This method opens the decrypted stream, so it is also the one that
                // must close it; try-with-resources does so even when parsing fails.
                // The type is fully qualified so that no extra import is required.
                try (java.io.InputStream decrypted = d.getDataStream(root)) {
                    parser.parse(decrypted, new EmbeddedContentHandler(new BodyContentHandler(xhtml)), metadata, context);
                }
            } catch (GeneralSecurityException ex) {
                throw new EncryptedDocumentException(ex);
            }
            // Explicit break: do not rely on falling through into the default case
            break;
        default:
            // For unsupported / unhandled types, just the metadata
            // is extracted, which happened above
            break;
    }
}
Aggregations