use of org.apache.poi.hwpf.usermodel.Range in project poi by apache.
the class DataExtraction method main.
/**
 * Extracts embedded content from a PowerPoint presentation and saves it in the
 * current working directory.
 * <p>
 * The following is extracted from the file named by {@code args[0]}:
 * <ul>
 * <li>every embedded sound, written to {@code <soundName><soundType>}</li>
 * <li>embedded OLE objects: "Worksheet" objects are parsed as a workbook,
 * "Document" objects have their paragraphs printed to standard output and are
 * saved as {@code .doc}, anything else is dumped as raw bytes to a
 * {@code .dat} file</li>
 * <li>every picture shape, written to {@code pict-<index><extension>}</li>
 * </ul>
 * Every stream and document opened here is closed in a {@code finally} block so
 * nothing leaks when an extraction step fails.
 *
 * @param args command line arguments; {@code args[0]} is the presentation to read.
 *             With no arguments the usage text is shown and nothing else happens.
 * @throws Exception if the presentation cannot be read or an output file cannot be written
 */
public static void main(String[] args) throws Exception {
    if (args.length == 0) {
        usage();
        return;
    }

    HSLFSlideShow ppt;
    FileInputStream is = new FileInputStream(args[0]);
    try {
        ppt = new HSLFSlideShow(is);
    } finally {
        // the input stream is not used again once the slide show has been constructed
        is.close();
    }

    try {
        //extract all sound files embedded in this presentation
        HSLFSoundData[] sound = ppt.getSoundData();
        for (int i = 0; i < sound.length; i++) {
            //*.wav
            String type = sound[i].getSoundType();
            //typically file name
            String name = sound[i].getSoundName();
            //raw bytes
            byte[] data = sound[i].getData();
            //save the sound on disk
            saveBytes(name + type, data);
        }

        // both counters are incremented before use, so the first match has index 0
        int oleIdx = -1, picIdx = -1;
        for (HSLFSlide slide : ppt.getSlides()) {
            for (HSLFShape shape : slide.getShapes()) {
                if (shape instanceof OLEShape) {
                    //extract embedded OLE documents
                    oleIdx++;
                    OLEShape ole = (OLEShape) shape;
                    HSLFObjectData data = ole.getObjectData();
                    String name = ole.getInstanceName();
                    if ("Worksheet".equals(name)) {
                        //read xls; the workbook is only parsed, so release it right away
                        HSSFWorkbook wb = new HSSFWorkbook(data.getData());
                        wb.close();
                    } else if ("Document".equals(name)) {
                        HWPFDocument doc = new HWPFDocument(data.getData());
                        try {
                            //read the word document
                            Range r = doc.getRange();
                            for (int k = 0; k < r.numParagraphs(); k++) {
                                Paragraph p = r.getParagraph(k);
                                System.out.println(p.text());
                            }
                            //save on disk
                            FileOutputStream out = new FileOutputStream(name + "-(" + (oleIdx) + ").doc");
                            try {
                                doc.write(out);
                            } finally {
                                out.close();
                            }
                        } finally {
                            doc.close();
                        }
                    } else {
                        //unknown OLE object: dump its raw content
                        InputStream dis = data.getData();
                        try {
                            saveStream(ole.getProgID() + "-" + (oleIdx + 1) + ".dat", dis);
                        } finally {
                            // close the OLE data stream itself, not the presentation stream
                            dis.close();
                        }
                    }
                } else if (shape instanceof HSLFPictureShape) {
                    //Pictures
                    picIdx++;
                    HSLFPictureShape p = (HSLFPictureShape) shape;
                    HSLFPictureData data = p.getPictureData();
                    String ext = data.getType().extension;
                    saveBytes("pict-" + picIdx + ext, data.getData());
                }
            }
        }
    } finally {
        ppt.close();
    }
}

/** Writes {@code bytes} to the file {@code fileName}, closing the file even when the write fails. */
private static void saveBytes(String fileName, byte[] bytes) throws java.io.IOException {
    FileOutputStream out = new FileOutputStream(fileName);
    try {
        out.write(bytes);
    } finally {
        out.close();
    }
}

/** Copies everything remaining in {@code in} to the file {@code fileName}; the caller closes {@code in}. */
private static void saveStream(String fileName, InputStream in) throws java.io.IOException {
    FileOutputStream out = new FileOutputStream(fileName);
    try {
        byte[] chunk = new byte[2048];
        int count;
        while ((count = in.read(chunk)) >= 0) {
            out.write(chunk, 0, count);
        }
    } finally {
        out.close();
    }
}
use of org.apache.poi.hwpf.usermodel.Range in project poi by apache.
the class HWPFDocument method getRange.
/**
 * Builds the range that covers the text of the given subdocument.
 * <p>
 * Walks {@code SubdocumentType.ORDERED}, summing the text stream length reported
 * by the file information block for each type that comes before the requested
 * one; that sum is the start of the returned range.
 *
 * @param subdocument the subdocument whose range is requested
 * @return a range from the accumulated start to start plus the subdocument's length
 * @throws UnsupportedOperationException if {@code subdocument} is not among
 *         {@code SubdocumentType.ORDERED}
 */
private Range getRange(SubdocumentType subdocument) {
    int offset = 0;
    for (SubdocumentType candidate : SubdocumentType.ORDERED) {
        final int textLength = getFileInformationBlock().getSubdocumentTextStreamLength(candidate);
        if (candidate != subdocument) {
            // not the requested one yet: its text precedes the target, skip over it
            offset += textLength;
            continue;
        }
        return new Range(offset, offset + textLength, this);
    }
    throw new UnsupportedOperationException("Subdocument type not supported: " + subdocument);
}
use of org.apache.poi.hwpf.usermodel.Range in project poi by apache.
the class AbstractWordConverter method processDocument.
/**
 * Converts a whole Word document.
 * <p>
 * The summary information is processed first on a best-effort basis: any
 * exception raised there is logged as a warning and conversion continues.
 * A document whose range has exactly one section goes through
 * {@code processSingleSection}; any other section count goes through
 * {@code processDocumentPart}. {@code afterProcess()} is called last in both cases.
 *
 * @param wordDocument the document to convert
 */
public void processDocument(HWPFDocumentCore wordDocument) {
    try {
        final SummaryInformation info = wordDocument.getSummaryInformation();
        if (info != null) {
            processDocumentInformation(info);
        }
    } catch (Exception exc) {
        logger.log(POILogger.WARN, "Unable to process document summary information: ", exc, exc);
    }

    final Range wholeDocument = wordDocument.getRange();
    final boolean singleSection = wholeDocument.numSections() == 1;
    if (singleSection) {
        processSingleSection(wordDocument, wholeDocument.getSection(0));
    } else {
        processDocumentPart(wordDocument, wholeDocument);
    }
    afterProcess();
}
use of org.apache.poi.hwpf.usermodel.Range in project poi by apache.
the class AbstractWordConverter method processCharacters.
/**
 * Processes the character runs of {@code range} and emits the result into {@code block}.
 * <p>
 * The method works in two phases:
 * <ol>
 * <li>For an {@link HWPFDocument} it collects the "structures" found in the range: bookmarks
 * that start inside it and are not already on {@code bookmarkStack}, fields that start at a
 * field-begin mark, and dead-field boundaries. If any structure moves the processing position
 * past the start of the range, the text between and after the structures is handled by
 * recursive calls on sub-ranges, each structure is handed to its dedicated
 * {@code process*} method, and the method returns {@code true}.</li>
 * <li>Otherwise every character run is processed individually: pictures, special characters
 * (footnote references, drawn objects, OLE2 objects, symbols), field marks and finally plain
 * text, which is cleaned up and passed to {@code outputCharacters}.</li>
 * </ol>
 *
 * @param wordDocument the document being converted
 * @param currentTableLevel passed through to the nested processing methods; the value
 *        {@code Integer.MIN_VALUE} disables stripping of a trailing {@code BEL_MARK}
 * @param range the range to process; may be {@code null}
 * @param block the element that receives the output
 * @return {@code false} if {@code range} is {@code null}; {@code true} if structure processing
 *         advanced past the start of the range; otherwise whether any plain-text run contained
 *         non-whitespace text
 */
protected boolean processCharacters(final HWPFDocumentCore wordDocument, final int currentTableLevel, final Range range, final Element block) {
if (range == null)
return false;
boolean haveAnyText = false;
/*
* In text there can be fields, bookmarks, may be other structures (code
* below allows extension). Those structures can overlaps, so either we
* should process char-by-char (slow) or find a correct way to
* reconstruct the structure of range -- sergey
*/
List<Structure> structures = new LinkedList<Structure>();
// Phase 1: collect structures; only done for HWPFDocument instances
if (wordDocument instanceof HWPFDocument) {
final HWPFDocument doc = (HWPFDocument) wordDocument;
Map<Integer, List<Bookmark>> rangeBookmarks = doc.getBookmarks().getBookmarksStartedBetween(range.getStartOffset(), range.getEndOffset());
if (rangeBookmarks != null) {
for (List<Bookmark> lists : rangeBookmarks.values()) {
for (Bookmark bookmark : lists) {
// bookmarks already on the stack are being processed by an enclosing call
if (!bookmarkStack.contains(bookmark))
addToStructures(structures, new Structure(bookmark));
}
}
}
// TODO: dead fields?
// NOTE(review): skipUntil is never reassigned in this method, so the
// "< skipUntil" check below cannot match a non-negative start offset
int skipUntil = -1;
for (int c = 0; c < range.numCharacterRuns(); c++) {
CharacterRun characterRun = range.getCharacterRun(c);
if (characterRun == null)
throw new AssertionError();
if (characterRun.getStartOffset() < skipUntil)
continue;
// only runs whose first character is a field-begin mark are of interest here
String text = characterRun.text();
if (text == null || text.length() == 0 || text.charAt(0) != FIELD_BEGIN_MARK)
continue;
Field aliveField = ((HWPFDocument) wordDocument).getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN, characterRun.getStartOffset());
if (aliveField != null) {
addToStructures(structures, new Structure(aliveField));
} else {
// no field registered at this offset: look for dead-field marks by hand
int[] separatorEnd = tryDeadField_lookupFieldSeparatorEnd(wordDocument, range, c);
if (separatorEnd != null) {
addToStructures(structures, new Structure(new DeadFieldBoundaries(c, separatorEnd[0], separatorEnd[1]), characterRun.getStartOffset(), range.getCharacterRun(separatorEnd[1]).getEndOffset()));
// continue scanning after the run at index separatorEnd[1]
c = separatorEnd[1];
}
}
}
}
// copy into an ArrayList before sorting
structures = new ArrayList<Structure>(structures);
Collections.sort(structures);
// "previous" is the offset up to which the range has been handled so far
int previous = range.getStartOffset();
for (Structure structure : structures) {
if (structure.start != previous) {
// text between the handled position and the structure start is processed recursively
Range subrange = new Range(previous, structure.start, range) {
@Override
public String toString() {
return "BetweenStructuresSubrange " + super.toString();
}
};
processCharacters(wordDocument, currentTableLevel, subrange, block);
}
if (structure.structure instanceof Bookmark) {
// other bookmarks with same boundaries
List<Bookmark> bookmarks = new LinkedList<Bookmark>();
// NOTE(review): iterator().next() takes only the first list of the returned map and
// assumes the map is non-null and non-empty -- confirm against getBookmarksStartedBetween
for (Bookmark bookmark : ((HWPFDocument) wordDocument).getBookmarks().getBookmarksStartedBetween(structure.start, structure.start + 1).values().iterator().next()) {
if (bookmark.getStart() == structure.start && bookmark.getEnd() == structure.end) {
bookmarks.add(bookmark);
}
}
// mark these bookmarks as in progress so nested calls do not collect them again
bookmarkStack.addAll(bookmarks);
try {
// clip the bookmark to the end of the current range
int end = Math.min(range.getEndOffset(), structure.end);
Range subrange = new Range(structure.start, end, range) {
@Override
public String toString() {
return "BookmarksSubrange " + super.toString();
}
};
processBookmarks(wordDocument, block, subrange, currentTableLevel, bookmarks);
} finally {
bookmarkStack.removeAll(bookmarks);
}
} else if (structure.structure instanceof Field) {
Field field = (Field) structure.structure;
processField((HWPFDocument) wordDocument, range, currentTableLevel, field, block);
} else if (structure.structure instanceof DeadFieldBoundaries) {
DeadFieldBoundaries boundaries = (DeadFieldBoundaries) structure.structure;
processDeadField(wordDocument, block, range, currentTableLevel, boundaries.beginMark, boundaries.separatorMark, boundaries.endMark);
} else {
throw new UnsupportedOperationException("NYI: " + structure.structure.getClass());
}
// advance the handled position, never beyond the end of the range
previous = Math.min(range.getEndOffset(), structure.end);
}
// if the handled position moved, finish the remaining tail (if any) and stop;
// the run-by-run loop below is only reached when it did not move
if (previous != range.getStartOffset()) {
if (previous > range.getEndOffset()) {
logger.log(POILogger.WARN, "Latest structure in ", range, " ended at #" + previous, " after range boundaries [", range.getStartOffset() + "; " + range.getEndOffset(), ")");
return true;
}
if (previous < range.getEndOffset()) {
Range subrange = new Range(previous, range.getEndOffset(), range) {
@Override
public String toString() {
return "AfterStructureSubrange " + super.toString();
}
};
processCharacters(wordDocument, currentTableLevel, subrange, block);
}
return true;
}
// Phase 2: process the range run by run
for (int c = 0; c < range.numCharacterRuns(); c++) {
CharacterRun characterRun = range.getCharacterRun(c);
if (characterRun == null)
throw new AssertionError();
// runs that carry a picture are turned into an image and produce no text
if (wordDocument instanceof HWPFDocument && ((HWPFDocument) wordDocument).getPicturesTable().hasPicture(characterRun)) {
HWPFDocument newFormat = (HWPFDocument) wordDocument;
Picture picture = newFormat.getPicturesTable().extractPicture(characterRun, true);
processImage(block, characterRun.text().charAt(0) == 0x01, picture);
continue;
}
String text = characterRun.text();
if (text.isEmpty())
continue;
// special characters are dispatched to dedicated handlers (HWPFDocument only)
if (characterRun.isSpecialCharacter()) {
if (text.charAt(0) == SPECCHAR_AUTONUMBERED_FOOTNOTE_REFERENCE && (wordDocument instanceof HWPFDocument)) {
HWPFDocument doc = (HWPFDocument) wordDocument;
processNoteAnchor(doc, characterRun, block);
continue;
}
if (text.charAt(0) == SPECCHAR_DRAWN_OBJECT && (wordDocument instanceof HWPFDocument)) {
HWPFDocument doc = (HWPFDocument) wordDocument;
processDrawnObject(doc, characterRun, block);
continue;
}
if (characterRun.isOle2() && (wordDocument instanceof HWPFDocument)) {
HWPFDocument doc = (HWPFDocument) wordDocument;
processOle2(doc, characterRun, block);
continue;
}
if (characterRun.isSymbol() && (wordDocument instanceof HWPFDocument)) {
HWPFDocument doc = (HWPFDocument) wordDocument;
processSymbol(doc, characterRun, block);
continue;
}
}
if (text.charAt(0) == FIELD_BEGIN_MARK) {
if (wordDocument instanceof HWPFDocument) {
Field aliveField = ((HWPFDocument) wordDocument).getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN, characterRun.getStartOffset());
if (aliveField != null) {
processField(((HWPFDocument) wordDocument), range, currentTableLevel, aliveField, block);
// skip every run that ends at or before the end of the field
int continueAfter = aliveField.getFieldEndOffset();
while (c < range.numCharacterRuns() && range.getCharacterRun(c).getEndOffset() <= continueAfter) c++;
// step back one so that the loop's own c++ lands on the first run after the field
if (c < range.numCharacterRuns())
c--;
continue;
}
}
// no alive field: try to handle it as a dead field, which may move the run index
int skipTo = tryDeadField(wordDocument, range, currentTableLevel, c, block);
if (skipTo != c) {
c = skipTo;
continue;
}
continue;
}
if (text.charAt(0) == FIELD_SEPARATOR_MARK) {
// shall not appear without FIELD_BEGIN_MARK
continue;
}
if (text.charAt(0) == FIELD_END_MARK) {
// shall not appear without FIELD_BEGIN_MARK
continue;
}
// remaining special, object and OLE2 runs are not output as text
if (characterRun.isSpecialCharacter() || characterRun.isObj() || characterRun.isOle2()) {
continue;
}
// drop a trailing "\r"; a trailing BEL_MARK is dropped too unless
// currentTableLevel is Integer.MIN_VALUE
if (text.endsWith("\r") || (text.charAt(text.length() - 1) == BEL_MARK && currentTableLevel != Integer.MIN_VALUE))
text = text.substring(0, text.length() - 1);
{
// line breaks
StringBuilder stringBuilder = new StringBuilder();
for (char charChar : text.toCharArray()) {
if (charChar == 11) {
// character 11: flush the text collected so far, then emit a line break
if (stringBuilder.length() > 0) {
outputCharacters(block, characterRun, stringBuilder.toString());
stringBuilder.setLength(0);
}
processLineBreak(block, characterRun);
} else if (charChar == 30) {
// Non-breaking hyphens are stored as ASCII 30
stringBuilder.append(UNICODECHAR_NONBREAKING_HYPHEN);
} else if (charChar == 31) {
// Non-required hyphens to zero-width space
stringBuilder.append(UNICODECHAR_ZERO_WIDTH_SPACE);
} else if (charChar >= 0x20 || charChar == 0x09 || charChar == 0x0A || charChar == 0x0D) {
// other characters below 0x20 (except tab, LF and CR) are silently dropped
stringBuilder.append(charChar);
}
}
// flush whatever text is left after the last line break
if (stringBuilder.length() > 0) {
outputCharacters(block, characterRun, stringBuilder.toString());
stringBuilder.setLength(0);
}
}
// whitespace-only runs do not count as text
haveAnyText |= text.trim().length() != 0;
}
return haveAnyText;
}
use of org.apache.poi.hwpf.usermodel.Range in project poi by apache.
the class AbstractWordConverter method processDeadField.
/**
 * Handles a field described only by the indexes of its begin, separator and end
 * mark runs within {@code range}.
 * <p>
 * When at least one run lies between the begin and separator marks and at least
 * one between the separator and end marks, the text of the runs between begin
 * and separator is matched against {@code PATTERN_HYPERLINK_LOCAL}; on a match
 * the runs between separator and end are passed to {@code processPageref} and
 * the method returns. In every other case a warning listing the runs from
 * {@code beginMark} to {@code endMark} is logged and, if there is at least one
 * run between the separator and end marks, the field value is processed as
 * ordinary characters.
 *
 * @param wordDocument the document being converted
 * @param currentBlock the element that receives the output
 * @param range the range whose character runs the mark indexes refer to
 * @param currentTableLevel passed through to the nested processing methods
 * @param beginMark index of the run holding the field-begin mark
 * @param separatorMark index of the run holding the field-separator mark
 * @param endMark index of the run holding the field-end mark
 */
protected void processDeadField(HWPFDocumentCore wordDocument, Element currentBlock, Range range, int currentTableLevel, int beginMark, int separatorMark, int endMark) {
    final boolean runsBeforeSeparator = beginMark + 1 < separatorMark;
    final boolean runsAfterSeparator = separatorMark + 1 < endMark;

    if (runsBeforeSeparator && runsAfterSeparator) {
        final int formulaFrom = range.getCharacterRun(beginMark + 1).getStartOffset();
        final int formulaTo = range.getCharacterRun(separatorMark - 1).getEndOffset();
        Range formulaRange = new Range(formulaFrom, formulaTo, range) {
            @Override
            public String toString() {
                return "Dead field formula subrange: " + super.toString();
            }
        };

        final int valueFrom = range.getCharacterRun(separatorMark + 1).getStartOffset();
        final int valueTo = range.getCharacterRun(endMark - 1).getEndOffset();
        Range valueRange = new Range(valueFrom, valueTo, range) {
            @Override
            public String toString() {
                return "Dead field value subrange: " + super.toString();
            }
        };

        final Matcher localLink = PATTERN_HYPERLINK_LOCAL.matcher(formulaRange.text());
        if (localLink.matches()) {
            processPageref(wordDocument, currentBlock, valueRange, currentTableLevel, localLink.group(1));
            return;
        }
    }

    // not a supported field: log the runs that make it up
    StringBuilder debug = new StringBuilder("Unsupported field type: \n");
    for (int runIndex = beginMark; runIndex <= endMark; runIndex++) {
        debug.append("\t").append(range.getCharacterRun(runIndex)).append("\n");
    }
    logger.log(POILogger.WARN, debug);

    // the sub-range is built before the check below, exactly as the original ordering does
    final int fallbackFrom = range.getCharacterRun(separatorMark).getStartOffset() + 1;
    final int fallbackTo = range.getCharacterRun(endMark).getStartOffset();
    Range fallbackValue = new Range(fallbackFrom, fallbackTo, range) {
        @Override
        public String toString() {
            return "DeadFieldValueSubrange (" + super.toString() + ")";
        }
    };

    // just output field value
    if (runsAfterSeparator) {
        processCharacters(wordDocument, currentTableLevel, fallbackValue, currentBlock);
    }
}
Aggregations