Use of org.apache.poi.poifs.filesystem.NPOIFSFileSystem in the Apache POI project.
Class TestExtractor, method testDifferentPOIFS.
/**
 * Tests that we can work with both {@link OPOIFSFileSystem}
 * and {@link NPOIFSFileSystem}.
 * <p>
 * The same PowerPoint file is loaded once through each filesystem
 * implementation, and text is extracted from both roots, first directly
 * from the directory node and then through an {@link HSLFSlideShowImpl}.
 *
 * @throws IOException if the test file cannot be opened or read
 */
@SuppressWarnings("resource")
@Test
public void testDifferentPOIFS() throws IOException {
    File pptFile = slTests.getFile("basic_test_ppt_file.ppt");

    // Open the stream-backed filesystem; the stream is released even when
    // the constructor throws.
    OPOIFSFileSystem opoifs;
    InputStream is1 = new FileInputStream(pptFile);
    try {
        opoifs = new OPOIFSFileSystem(is1);
    } finally {
        is1.close();
    }

    // Open the file-backed filesystem; it holds the file open until closed.
    NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile);
    try {
        DirectoryNode[] files = { opoifs.getRoot(), npoifs.getRoot() };

        // Open directly
        // NOTE(review): these extractors are deliberately left unclosed, as in
        // the original; closing them may close the shared filesystems that the
        // second loop still needs - confirm before changing.
        for (DirectoryNode dir : files) {
            PowerPointExtractor extractor = new PowerPointExtractor(dir);
            assertEquals(expectText, extractor.getText());
        }

        // Open via a HSLFSlideShow
        for (DirectoryNode dir : files) {
            HSLFSlideShowImpl slideshow = new HSLFSlideShowImpl(dir);
            PowerPointExtractor extractor = new PowerPointExtractor(slideshow);
            assertEquals(expectText, extractor.getText());
            extractor.close();
            slideshow.close();
        }
    } finally {
        // Release the file handle even when an assertion above fails
        npoifs.close();
    }
}
Use of org.apache.poi.poifs.filesystem.NPOIFSFileSystem in the Apache Tika project.
Class TestContainerAwareDetector, method testOpenContainer.
/**
 * Checks that running detection on a PowerPoint test document leaves an
 * opened {@link NPOIFSFileSystem} attached to the stream.
 */
@Test
public void testOpenContainer() throws Exception {
    try (TikaInputStream tis = TikaInputStream.get(TestContainerAwareDetector.class.getResource("/test-documents/testPPT.ppt"))) {
        // Nothing has been attached before detection runs
        assertNull(tis.getOpenContainer());

        MediaType expectedType = MediaType.parse("application/vnd.ms-powerpoint");
        MediaType detectedType = detector.detect(tis, new Metadata());
        assertEquals(expectedType, detectedType);

        // After detection the stream carries the opened container
        Object container = tis.getOpenContainer();
        assertTrue(container instanceof NPOIFSFileSystem);
    }
}
Use of org.apache.poi.poifs.filesystem.NPOIFSFileSystem in the Apache Tika project.
Class OfficeParser, method parse.
/**
 * Extracts properties and text from an MS Document input stream.
 * <p>
 * The POIFS root is taken from, in order of preference: a container already
 * attached to a {@link TikaInputStream}, a filesystem newly opened for that
 * {@link TikaInputStream} (and then attached to it), or a filesystem opened
 * over the plain stream. Only the last of these is closed by this method.
 *
 * @param stream   the document stream; it is wrapped in a close shield
 *                 whenever a filesystem is built from it
 * @param handler  receives the XHTML SAX events
 * @param metadata passed to the XHTML handler and to the directory parse
 * @param context  parse context; read for the {@link OfficeParserConfig}
 * @throws IOException   declared by the stream and filesystem calls used here
 * @throws SAXException  declared by the SAX calls used here
 * @throws TikaException declared by the parsing calls used here
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    configure(context);
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    TikaInputStream tis = TikaInputStream.cast(stream);
    // Set only for a filesystem that nothing else will close
    NPOIFSFileSystem ownedFs = null;
    try {
        final DirectoryNode root;
        if (tis != null) {
            Object opened = tis.getOpenContainer();
            if (opened instanceof NPOIFSFileSystem) {
                root = ((NPOIFSFileSystem) opened).getRoot();
            } else if (opened instanceof DirectoryNode) {
                root = (DirectoryNode) opened;
            } else {
                NPOIFSFileSystem attachedFs = tis.hasFile()
                        ? new NPOIFSFileSystem(tis.getFile(), true)
                        : new NPOIFSFileSystem(new CloseShieldInputStream(tis));
                // tis will close the fs, so it is not tracked in ownedFs
                tis.setOpenContainer(attachedFs);
                root = attachedFs.getRoot();
            }
        } else {
            ownedFs = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
            root = ownedFs.getRoot();
        }

        parse(root, context, metadata, xhtml);

        // Macros are extracted only when the configuration asks for them
        OfficeParserConfig config = context.get(OfficeParserConfig.class);
        if (config.getExtractMacros()) {
            extractMacros(root.getNFileSystem(), xhtml, EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
        }
    } finally {
        IOUtils.closeQuietly(ownedFs);
    }
    xhtml.endDocument();
}
Use of org.apache.poi.poifs.filesystem.NPOIFSFileSystem in the Apache Tika project.
Class POIFSContainerDetector, method getTopLevelNames.
/**
 * Lists the top-level entry names of the POIFS container behind the stream.
 * <p>
 * The opened filesystem is attached to the stream as its open container so
 * that later parsing can reuse it.
 *
 * @param stream the stream to inspect
 * @return the top-level names, or an empty set when POI fails with an
 *         {@link IOException} or a {@link RuntimeException}
 * @throws IOException if obtaining the backing file from the stream fails
 */
private static Set<String> getTopLevelNames(TikaInputStream stream) throws IOException {
    // Work from a (possibly temporary) file so that the stream's current
    // position is left untouched
    File backingFile = stream.getFile();
    try {
        NPOIFSFileSystem poifs = new NPOIFSFileSystem(backingFile, true);
        // Keep the already opened POI file system around for a possible
        // later parsing step
        stream.setOpenContainer(poifs);
        return getTopLevelNames(poifs.getRoot());
    } catch (IOException | RuntimeException e) {
        // A parse error or another problem inside POI: the file type
        // cannot be determined
        return Collections.emptySet();
    }
}
Use of org.apache.poi.poifs.filesystem.NPOIFSFileSystem in the Apache Tika project.
Class HSLFExtractor, method handleSlideEmbeddedResources.
/**
 * Emits a placeholder element for, and then processes, every OLE object
 * embedded in the given slide.
 * <p>
 * Failures while reading the shapes, the object data or the embedded stream
 * are recorded against {@code parentMetadata} and the affected item is
 * skipped rather than aborting the whole slide.
 *
 * @param slide the slide whose shapes are scanned
 * @param xhtml receives the placeholder elements and the embedded content
 * @throws TikaException declared by the embedded-document handlers
 * @throws SAXException  declared by the SAX calls used here
 * @throws IOException   declared by the stream handling calls used here
 */
private void handleSlideEmbeddedResources(HSLFSlide slide, XHTMLContentHandler xhtml) throws TikaException, SAXException, IOException {
    List<HSLFShape> slideShapes;
    try {
        slideShapes = slide.getShapes();
    } catch (NullPointerException npe) {
        // Sometimes HSLF hits problems
        // Please open POI bugs for any you come across!
        EmbeddedDocumentUtil.recordEmbeddedStreamException(npe, parentMetadata);
        return;
    }

    for (HSLFShape candidate : slideShapes) {
        if (!(candidate instanceof OLEShape)) {
            continue;
        }
        OLEShape ole = (OLEShape) candidate;

        HSLFObjectData objectData;
        try {
            objectData = ole.getObjectData();
        } catch (NullPointerException npe) {
            /* getObjectData throws NPE some times. */
            EmbeddedDocumentUtil.recordEmbeddedStreamException(npe, parentMetadata);
            continue;
        }
        if (objectData == null) {
            continue;
        }

        String objID = Integer.toString(ole.getObjectID());

        // Embedded Object: add a <div class="embedded" id="X"/> so the
        // consumer can see where in the main text each embedded document
        // occurred
        AttributesImpl divAttrs = new AttributesImpl();
        divAttrs.addAttribute("", "class", "class", "CDATA", "embedded");
        divAttrs.addAttribute("", "id", "id", "CDATA", objID);
        xhtml.startElement("div", divAttrs);
        xhtml.endElement("div");

        InputStream rawData;
        try {
            rawData = objectData.getData();
        } catch (Exception e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
            continue;
        }

        try (TikaInputStream tis = TikaInputStream.get(rawData)) {
            // The "Excel.Chart.8" ProgID is mapped straight to the Excel
            // type; anything else goes through the configured detector
            String mediaType;
            if ("Excel.Chart.8".equals(ole.getProgID())) {
                mediaType = "application/vnd.ms-excel";
            } else {
                mediaType = getTikaConfig().getDetector().detect(tis, new Metadata()).toString();
            }

            if (!mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")) {
                handleEmbeddedResource(tis, objID, objID, mediaType, xhtml, false);
            } else {
                // The close shield keeps tis open when the filesystem built
                // on top of it is closed
                try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(tis))) {
                    handleEmbeddedOfficeDoc(npoifs.getRoot(), objID, xhtml);
                }
            }
        } catch (IOException e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
        }
    }
}
Aggregations