use of org.apache.poi.hwpf.usermodel.Paragraph in project poi by apache.
the class DataExtraction method main.
/**
 * Extracts embedded content from the PowerPoint file named by {@code args[0]}
 * and saves it to files in the current working directory: sound data,
 * embedded OLE objects and pictures. For embedded Word documents the text of
 * every paragraph is also printed to standard output.
 *
 * @param args command-line arguments; {@code args[0]} is the path of the
 *             presentation to read. When empty, usage is printed instead.
 * @throws Exception if the presentation cannot be read or an extracted file
 *                   cannot be written
 */
public static void main(String[] args) throws Exception {
    if (args.length == 0) {
        usage();
        return;
    }

    HSLFSlideShow ppt;
    FileInputStream is = new FileInputStream(args[0]);
    try {
        ppt = new HSLFSlideShow(is);
    } finally {
        // the input file is released even when parsing fails
        is.close();
    }

    try {
        //extract all sound files embedded in this presentation
        for (HSLFSoundData sound : ppt.getSoundData()) {
            //*.wav
            String type = sound.getSoundType();
            //typically file name
            String name = sound.getSoundName();
            //save the raw sound bytes on disk
            writeFile(name + type, sound.getData());
        }

        int oleIdx = -1, picIdx = -1;
        for (HSLFSlide slide : ppt.getSlides()) {
            for (HSLFShape shape : slide.getShapes()) {
                if (shape instanceof OLEShape) {
                    //extract embedded OLE documents
                    oleIdx++;
                    OLEShape ole = (OLEShape) shape;
                    HSLFObjectData data = ole.getObjectData();
                    String name = ole.getInstanceName();
                    if ("Worksheet".equals(name)) {
                        //read xls
                        @SuppressWarnings({ "unused", "resource" }) HSSFWorkbook wb = new HSSFWorkbook(data.getData());
                    } else if ("Document".equals(name)) {
                        HWPFDocument doc = new HWPFDocument(data.getData());
                        try {
                            //read the word document
                            Range r = doc.getRange();
                            for (int k = 0; k < r.numParagraphs(); k++) {
                                Paragraph p = r.getParagraph(k);
                                System.out.println(p.text());
                            }
                            //save on disk
                            FileOutputStream out = new FileOutputStream(name + "-(" + (oleIdx) + ").doc");
                            try {
                                doc.write(out);
                            } finally {
                                out.close();
                            }
                        } finally {
                            doc.close();
                        }
                    } else {
                        //any other OLE object: dump its raw data stream
                        InputStream dis = data.getData();
                        try {
                            copyToFile(dis, ole.getProgID() + "-" + (oleIdx + 1) + ".dat");
                        } finally {
                            // close the OLE data stream itself; the presentation's
                            // input stream was already closed right after loading
                            dis.close();
                        }
                    }
                } else if (shape instanceof HSLFPictureShape) {
                    //Pictures
                    picIdx++;
                    HSLFPictureShape p = (HSLFPictureShape) shape;
                    HSLFPictureData data = p.getPictureData();
                    String ext = data.getType().extension;
                    writeFile("pict-" + picIdx + ext, data.getData());
                }
            }
        }
    } finally {
        ppt.close();
    }
}

/** Writes {@code content} to the file {@code fileName}, always closing the file. */
private static void writeFile(String fileName, byte[] content) throws java.io.IOException {
    FileOutputStream out = new FileOutputStream(fileName);
    try {
        out.write(content);
    } finally {
        out.close();
    }
}

/** Copies everything readable from {@code in} to the file {@code fileName}; {@code in} is left open for the caller to close. */
private static void copyToFile(InputStream in, String fileName) throws java.io.IOException {
    FileOutputStream out = new FileOutputStream(fileName);
    try {
        byte[] chunk = new byte[2048];
        int count;
        while ((count = in.read(chunk)) >= 0) {
            out.write(chunk, 0, count);
        }
    } finally {
        out.close();
    }
}
use of org.apache.poi.hwpf.usermodel.Paragraph in project poi by apache.
the class AbstractWordConverter method processParagraphes.
/**
 * Processes every paragraph of {@code range} in order, handing each one to
 * {@code processParagraph}, or to {@code processTable} when the paragraph
 * starts a table nested at a different level than {@code currentTableLevel}.
 *
 * @param wordDocument      the document being converted
 * @param flow              the element passed through to the processing callbacks
 * @param range             the range whose paragraphs are processed
 * @param currentTableLevel the table nesting level the caller is currently in
 * @throws IllegalStateException if a table paragraph has a table level lower
 *                               than {@code currentTableLevel}
 */
protected void processParagraphes(HWPFDocumentCore wordDocument, Element flow, Range range, int currentTableLevel) {
    final int paragraphs = range.numParagraphs();
    for (int p = 0; p < paragraphs; p++) {
        Paragraph paragraph = range.getParagraph(p);

        if (paragraph.isInTable() && paragraph.getTableLevel() != currentTableLevel) {
            if (paragraph.getTableLevel() < currentTableLevel) {
                throw new IllegalStateException("Trying to process table cell with higher level (" + paragraph.getTableLevel() + ") than current table level (" + currentTableLevel + ") as inner table part");
            }
            Table table = range.getTable(paragraph);
            processTable(wordDocument, flow, table);
            // skip the paragraphs consumed by the table; the -1 compensates
            // for the p++ of the loop header
            p += table.numParagraphs() - 1;
            continue;
        }

        // A paragraph consisting only of the form feed character (U+000C)
        // marks a page break. Comparing against the empty string would not
        // detect it.
        if ("\f".equals(paragraph.text())) {
            processPageBreak(wordDocument, flow);
        }

        boolean processed = false;
        if (paragraph.isInList()) {
            try {
                HWPFList hwpfList = paragraph.getList();
                String label = AbstractWordUtils.getBulletText(numberingState, hwpfList, (char) paragraph.getIlvl());
                processParagraph(wordDocument, flow, currentTableLevel, paragraph, label);
                processed = true;
            } catch (Exception exc) {
                // best effort: fall through and process it as a plain paragraph
                log.log(POILogger.WARN, "Can't process paragraph as list entry, will be processed without list information", exc);
            }
        }
        if (!processed) {
            processParagraph(wordDocument, flow, currentTableLevel, paragraph, AbstractWordUtils.EMPTY);
        }
    }
}
use of org.apache.poi.hwpf.usermodel.Paragraph in project poi by apache.
the class HWPFLister method dumpPapx.
/**
 * Prints the PAPX entries of {@code _doc} to standard output.
 * <p>
 * When {@code _doc} is an {@link HWPFDocument}, the PAPX entries of each
 * formatted disk page referenced by the bin table are printed first, followed
 * by the same entries after sorting. In all cases the entries of the
 * document's paragraph table are then printed together with their sprms.
 *
 * @param withProperties whether to also print the properties of the paragraph
 *                       built from each paragraph-table entry
 * @param withSprms      whether to dump the sprms of the entries printed in
 *                       the binary section
 * @throws Exception if dumping fails
 */
public void dumpPapx(boolean withProperties, boolean withSprms) throws Exception {
    if (_doc instanceof HWPFDocument) {
        System.out.println("binary PAP pages ");

        final HWPFDocument binaryDoc = (HWPFDocument) _doc;
        final byte[] mainStream = _doc.getMainStream();
        final PlexOfCps binTable = new PlexOfCps(
                binaryDoc.getTableStream(),
                binaryDoc.getFileInformationBlock().getFcPlcfbtePapx(),
                binaryDoc.getFileInformationBlock().getLcbPlcfbtePapx(),
                4);

        final List<PAPX> collected = new ArrayList<PAPX>();
        final int entryCount = binTable.length();
        for (int entry = 0; entry < entryCount; entry++) {
            final GenericPropertyNode node = binTable.getProperty(entry);
            // each bin table entry holds a page number; convert it to a byte offset
            final int pageNum = LittleEndian.getInt(node.getBytes());
            final int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;

            final PAPFormattedDiskPage page = new PAPFormattedDiskPage(
                    mainStream, binaryDoc.getDataStream(), pageOffset, binaryDoc.getTextTable());
            System.out.println("* PFKP: " + page);

            for (PAPX pagePapx : page.getPAPXs()) {
                System.out.println("** " + pagePapx);
                collected.add(pagePapx);
                if (withSprms && pagePapx != null) {
                    dumpSprms(new SprmIterator(pagePapx.getGrpprl(), 2), "*** ");
                }
            }
        }

        Collections.sort(collected);
        System.out.println("* Sorted by END");
        for (PAPX sortedPapx : collected) {
            System.out.println("** " + sortedPapx);
            if (withSprms && sortedPapx != null) {
                dumpSprms(new SprmIterator(sortedPapx.getGrpprl(), 2), "*** ");
            }
        }
    }

    // entries of the paragraph table, available for every document type
    for (PAPX tablePapx : _doc.getParagraphTable().getParagraphs()) {
        System.out.println(tablePapx);
        if (withProperties) {
            final Paragraph built = Paragraph.newParagraph(_doc.getOverallRange(), tablePapx);
            System.out.println(built.getProps());
        }
        dumpSprms(new SprmIterator(tablePapx.getGrpprl(), 2), "\t");
    }
}
use of org.apache.poi.hwpf.usermodel.Paragraph in project poi by apache.
the class TestDifferentRoutes method testExtractFromModel.
/**
 * Test model based extraction: the text of every paragraph in the
 * document's range must match the expected paragraph texts.
 */
@Test
public void testExtractFromModel() {
    final Range range = doc.getRange();
    final String[] actual = new String[range.numParagraphs()];

    int index = 0;
    while (index < range.numParagraphs()) {
        actual[index] = range.getParagraph(index).text();
        index++;
    }

    assertArrayEquals(p_text, actual);
}
use of org.apache.poi.hwpf.usermodel.Paragraph in project poi by apache.
the class TestBug46610 method runExtract.
/**
 * Opens the named sample document, concatenates the text of all its
 * paragraphs (each followed by a newline) and touches the text of every
 * character run along the way.
 *
 * @param sampleName name of the sample file to open
 * @return the text of all paragraphs, each followed by {@code "\n"}
 * @throws Exception if the sample cannot be opened or read
 */
private static String runExtract(String sampleName) throws Exception {
    HWPFDocument doc = HWPFTestDataSamples.openSampleFile(sampleName);
    try {
        StringBuilder out = new StringBuilder();
        Range globalRange = doc.getRange();
        for (int i = 0; i < globalRange.numParagraphs(); i++) {
            Paragraph p = globalRange.getParagraph(i);
            out.append(p.text());
            out.append("\n");
            for (int j = 0; j < p.numCharacterRuns(); j++) {
                CharacterRun characterRun = p.getCharacterRun(j);
                // result intentionally unused: the call only has to succeed
                characterRun.text();
            }
        }
        return out.toString();
    } finally {
        // close exactly once, after all paragraphs were read, and also when
        // the document is empty or reading fails
        doc.close();
    }
}
Aggregations