use of org.apache.poi.poifs.filesystem.Entry in project tika by apache.
the class POIFSContainerDetector method processCompObjFormatType.
/**
 * Is this one of the kinds of formats which uses CompObj to
 * store all of their data, eg Star Draw, Star Impress or
 * (older) Works?
 * If not, it's likely an embedded resource.
 *
 * @param root the directory whose "CompObj" entry is inspected
 * @return the media type matching the application name found in the
 *         CompObj bytes, or {@code OLE} when nothing matches, the entry
 *         is not a document, or reading it fails
 */
private static MediaType processCompObjFormatType(DirectoryEntry root) {
    try {
        Entry e = root.getEntry("CompObj");
        if (e != null && e.isDocumentEntry()) {
            DocumentNode dn = (DocumentNode) e;
            byte[] bytes;
            // Close the stream as soon as its content has been copied.
            try (DocumentInputStream stream = new DocumentInputStream(dn)) {
                bytes = IOUtils.toByteArray(stream);
            }
            /*
             * This array contains a string with a normal ASCII name of the
             * application used to create this file. We want to search for that
             * name.
             */
            if (arrayContains(bytes, MS_GRAPH_CHART_BYTES)) {
                return MS_GRAPH_CHART;
            } else if (arrayContains(bytes, STAR_DRAW)) {
                return SDA;
            } else if (arrayContains(bytes, STAR_IMPRESS)) {
                return SDD;
            } else if (arrayContains(bytes, WORKS_QUILL96)) {
                return WPS;
            }
        }
    } catch (Exception ignored) {
        /*
         * "root.getEntry" can throw FileNotFoundException. The code inside
         * "if" can throw IOExceptions. Theoretically. Practically no
         * exceptions will likely ever appear.
         *
         * Deliberately ignored: if any occur, we just assume that we can't
         * distinguish between the formats and fall through to the safe
         * generic OLE result below.
         */
    }
    return OLE;
}
use of org.apache.poi.poifs.filesystem.Entry in project OpenOLAT by OpenOLAT.
the class WordDocument method readContent.
/**
 * Reads the textual content of the given leaf as a POIFS based Word document.
 * Only the top-level document entry named "WordDocument" is handed to
 * {@code collectWordDocument}; directory entries and other documents are skipped.
 *
 * @param leaf the file to read
 * @return the collected text wrapped in a {@link FileContent}
 * @throws IOException declared by the signature
 * @throws DocumentException if anything goes wrong while reading; the message
 *         of the underlying exception is carried over
 */
@Override
protected FileContent readContent(VFSLeaf leaf) throws IOException, DocumentException {
    LimitedContentWriter sb = new LimitedContentWriter((int) leaf.getSize(), FileDocumentFactory.getMaxFileSize());
    try (InputStream bis = new BufferedInputStream(leaf.getInputStream())) {
        POIFSFileSystem filesystem = new POIFSFileSystem(bis);
        Iterator<?> entries = filesystem.getRoot().getEntries();
        while (entries.hasNext()) {
            Entry entry = (Entry) entries.next();
            // Directory entries are skipped; only the "WordDocument" stream is collected.
            if (entry instanceof DocumentEntry && "WordDocument".equals(entry.getName())) {
                collectWordDocument(leaf, filesystem, sb);
            }
        }
        return new FileContent(sb.toString());
    } catch (Exception e) {
        // Pass the exception to the logger so the stack trace is not lost:
        // the rethrown DocumentException only carries the message.
        log.warn("could not read in word document: " + leaf + " please check, that this is not an docx/rtf/html file!", e);
        throw new DocumentException(e.getMessage());
    }
}
use of org.apache.poi.poifs.filesystem.Entry in project openolat by klemens.
the class WordDocument method readContent.
/**
 * Reads the textual content of the given leaf as a POIFS based Word document.
 * Only the top-level document entry named "WordDocument" is handed to
 * {@code collectWordDocument}; directory entries and other documents are skipped.
 *
 * @param leaf the file to read
 * @return the collected text wrapped in a {@link FileContent}
 * @throws IOException declared by the signature
 * @throws DocumentException if anything goes wrong while reading; the message
 *         of the underlying exception is carried over
 */
@Override
protected FileContent readContent(VFSLeaf leaf) throws IOException, DocumentException {
    LimitedContentWriter sb = new LimitedContentWriter((int) leaf.getSize(), FileDocumentFactory.getMaxFileSize());
    try (InputStream bis = new BufferedInputStream(leaf.getInputStream())) {
        POIFSFileSystem filesystem = new POIFSFileSystem(bis);
        Iterator<?> entries = filesystem.getRoot().getEntries();
        while (entries.hasNext()) {
            Entry entry = (Entry) entries.next();
            // Directory entries are skipped; only the "WordDocument" stream is collected.
            if (entry instanceof DocumentEntry && "WordDocument".equals(entry.getName())) {
                collectWordDocument(leaf, filesystem, sb);
            }
        }
        return new FileContent(sb.toString());
    } catch (Exception e) {
        // Pass the exception to the logger so the stack trace is not lost:
        // the rethrown DocumentException only carries the message.
        log.warn("could not read in word document: " + leaf + " please check, that this is not an docx/rtf/html file!", e);
        throw new DocumentException(e.getMessage());
    }
}
use of org.apache.poi.poifs.filesystem.Entry in project poi by apache.
the class OLE2ExtractorFactory method getEmbededDocsTextExtractors.
/**
 * Builds one text extractor per document embedded in the file that the
 * given extractor was opened on.
 *
 * @param ext the extractor whose underlying file is searched for embedded documents
 * @return one open {@link POITextExtractor} per embedded document that could
 *         be handled, or an empty array when nothing embedded was found
 * @throws IOException declared by the signature
 * @throws IllegalStateException if the extractor has no root directory
 * @throws IllegalArgumentException if the Scratchpad lookup of embedded resources fails
 */
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
    // Embedded documents stored as POIFS directories
    List<Entry> embeddedDirs = new ArrayList<Entry>();
    // Embedded documents available only as raw streams
    List<InputStream> looseStreams = new ArrayList<InputStream>();

    DirectoryEntry root = ext.getRoot();
    if (root == null) {
        throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
    }

    if (!(ext instanceof ExcelExtractor)) {
        // Anything but Excel: delegate discovery to Scratchpad, or fail trying
        Class<?> cls = getScratchpadClass();
        try {
            Method m = cls.getDeclaredMethod("identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
            m.invoke(null, ext, embeddedDirs, looseStreams);
        } catch (Exception e) {
            throw new IllegalArgumentException("Error checking for Scratchpad embedded resources", e);
        }
    } else {
        // Excel keeps its embedded documents in MBD... entries under the root
        for (Iterator<Entry> it = root.getEntries(); it.hasNext(); ) {
            Entry candidate = it.next();
            if (candidate.getName().startsWith("MBD")) {
                embeddedDirs.add(candidate);
            }
        }
    }

    // Nothing embedded at all
    if (embeddedDirs.isEmpty() && looseStreams.isEmpty()) {
        return new POITextExtractor[0];
    }

    List<POITextExtractor> extractors = new ArrayList<POITextExtractor>();
    for (Entry dir : embeddedDirs) {
        extractors.add(createExtractor((DirectoryNode) dir));
    }
    for (InputStream stream : looseStreams) {
        try {
            extractors.add(createExtractor(stream));
        } catch (Exception xe) {
            // Ignore after logging: either the stream holds a format that is
            // not supported yet (IllegalArgumentException) or it is invalid.
            LOGGER.log(POILogger.WARN, xe);
        }
    }
    return extractors.toArray(new POITextExtractor[extractors.size()]);
}
use of org.apache.poi.poifs.filesystem.Entry in project poi by apache.
the class PropertySet method write.
/**
 * Stores this property set as a document inside a POI filesystem directory,
 * replacing any entry that already uses the same name.
 *
 * @param dir the POI filesystem directory that receives the document
 * @param name the name of the document; an existing entry with this name
 *        is deleted before the new document is created
 *
 * @throws WritingNotSupportedException if the filesystem doesn't support writing
 * @throws IOException if the old entry can't be deleted or the new entry be written
 */
public void write(final DirectoryEntry dir, final String name) throws WritingNotSupportedException, IOException {
    // Clear the way first: a previous entry with this name must go.
    final boolean nameTaken = dir.hasEntry(name);
    if (nameTaken) {
        dir.getEntry(name).delete();
    }

    // Now write the property set out under the requested name.
    dir.createDocument(name, toInputStream());
}
Aggregations