Search in sources :

Example 1 with Paragraph

use of org.apache.poi.hwpf.usermodel.Paragraph in project poi by apache.

the class DataExtraction method main.

/**
 * Extracts the content embedded in a PowerPoint file: sound clips, OLE objects
 * (Word documents are re-saved, unknown objects are dumped as raw data) and
 * pictures. Each extracted item is written to its own output file.
 *
 * @param args command-line arguments; {@code args[0]} is the path of the presentation to read.
 *             When no argument is given, the usage text is printed and nothing is extracted.
 * @throws Exception if the presentation cannot be read or an output file cannot be written
 */
public static void main(String[] args) throws Exception {
    if (args.length == 0) {
        usage();
        return;
    }
    HSLFSlideShow ppt;
    FileInputStream is = new FileInputStream(args[0]);
    try {
        ppt = new HSLFSlideShow(is);
    } finally {
        // the slide show is not read from this stream again, so release it right away
        is.close();
    }
    try {
        //extract all sound files embedded in this presentation
        for (HSLFSoundData sound : ppt.getSoundData()) {
            //*.wav
            String type = sound.getSoundType();
            //typically file name
            String name = sound.getSoundName();
            //raw bytes
            byte[] data = sound.getData();
            //save the sound on disk
            FileOutputStream out = new FileOutputStream(name + type);
            try {
                out.write(data);
            } finally {
                out.close();
            }
        }
        // running counters used to give every extracted object/picture a distinct file name
        int oleIdx = -1, picIdx = -1;
        for (HSLFSlide slide : ppt.getSlides()) {
            for (HSLFShape shape : slide.getShapes()) {
                if (shape instanceof OLEShape) {
                    //extract embedded OLE documents
                    oleIdx++;
                    OLEShape ole = (OLEShape) shape;
                    HSLFObjectData data = ole.getObjectData();
                    String name = ole.getInstanceName();
                    if ("Worksheet".equals(name)) {
                        //read xls (only parsed here, nothing is written to disk)
                        @SuppressWarnings({ "unused", "resource" }) HSSFWorkbook wb = new HSSFWorkbook(data.getData());
                    } else if ("Document".equals(name)) {
                        HWPFDocument doc = new HWPFDocument(data.getData());
                        try {
                            //read the word document
                            Range r = doc.getRange();
                            for (int k = 0; k < r.numParagraphs(); k++) {
                                Paragraph p = r.getParagraph(k);
                                System.out.println(p.text());
                            }
                            //save on disk
                            // NOTE(review): this name uses oleIdx while the .dat branch uses oleIdx + 1 - confirm which numbering is intended
                            FileOutputStream out = new FileOutputStream(name + "-(" + (oleIdx) + ").doc");
                            try {
                                doc.write(out);
                            } finally {
                                out.close();
                            }
                        } finally {
                            doc.close();
                        }
                    } else {
                        // unknown OLE type: copy the raw object data to a .dat file
                        FileOutputStream out = new FileOutputStream(ole.getProgID() + "-" + (oleIdx + 1) + ".dat");
                        try {
                            InputStream dis = data.getData();
                            try {
                                byte[] chunk = new byte[2048];
                                int count;
                                while ((count = dis.read(chunk)) >= 0) {
                                    out.write(chunk, 0, count);
                                }
                            } finally {
                                // close the object data stream (previously the already-closed file stream was closed instead)
                                dis.close();
                            }
                        } finally {
                            out.close();
                        }
                    }
                } else if (shape instanceof HSLFPictureShape) {
                    //Pictures
                    picIdx++;
                    HSLFPictureShape p = (HSLFPictureShape) shape;
                    HSLFPictureData data = p.getPictureData();
                    String ext = data.getType().extension;
                    FileOutputStream out = new FileOutputStream("pict-" + picIdx + ext);
                    try {
                        out.write(data.getData());
                    } finally {
                        out.close();
                    }
                }
            }
        }
    } finally {
        ppt.close();
    }
}
Also used : FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) HSLFObjectData(org.apache.poi.hslf.usermodel.HSLFObjectData) Range(org.apache.poi.hwpf.usermodel.Range) HSLFSlideShow(org.apache.poi.hslf.usermodel.HSLFSlideShow) FileInputStream(java.io.FileInputStream) OLEShape(org.apache.poi.hslf.model.OLEShape) HSSFWorkbook(org.apache.poi.hssf.usermodel.HSSFWorkbook) Paragraph(org.apache.poi.hwpf.usermodel.Paragraph) HWPFDocument(org.apache.poi.hwpf.HWPFDocument) HSLFShape(org.apache.poi.hslf.usermodel.HSLFShape) HSLFPictureShape(org.apache.poi.hslf.usermodel.HSLFPictureShape) FileOutputStream(java.io.FileOutputStream) HSLFSoundData(org.apache.poi.hslf.usermodel.HSLFSoundData) HSLFPictureData(org.apache.poi.hslf.usermodel.HSLFPictureData) HSLFSlide(org.apache.poi.hslf.usermodel.HSLFSlide)

Example 2 with Paragraph

use of org.apache.poi.hwpf.usermodel.Paragraph in project poi by apache.

the class AbstractWordConverter method processParagraphes.

/**
 * Walks through all paragraphs of the given range and converts each of them.
 * Paragraphs that belong to a table nested deeper than {@code currentTableLevel}
 * are handed over to {@code processTable} as a whole; list paragraphs are
 * processed together with their bullet/number label.
 *
 * @param wordDocument      the document being converted
 * @param flow              the output element that receives the converted content
 * @param range             the range whose paragraphs are processed
 * @param currentTableLevel the table nesting level this call is working at
 * @throws IllegalStateException if a table paragraph has a lower table level than {@code currentTableLevel}
 */
protected void processParagraphes(HWPFDocumentCore wordDocument, Element flow, Range range, int currentTableLevel) {
    final int paragraphs = range.numParagraphs();
    for (int p = 0; p < paragraphs; p++) {
        Paragraph paragraph = range.getParagraph(p);
        if (paragraph.isInTable() && paragraph.getTableLevel() != currentTableLevel) {
            if (paragraph.getTableLevel() < currentTableLevel)
                // the check is for a LOWER level, so the message says so (it used to claim "higher")
                throw new IllegalStateException("Trying to process table cell with lower level (" + paragraph.getTableLevel() + ") than current table level (" + currentTableLevel + ") as inner table part");
            Table table = range.getTable(paragraph);
            processTable(wordDocument, flow, table);
            // skip the paragraphs consumed by the table; the extra decrement
            // compensates for the p++ of the loop header
            p += table.numParagraphs();
            p--;
            continue;
        }
        // A paragraph consisting of a single form feed (U+000C) is a page break.
        // The comparison used to be against an empty string, which looks like a
        // form-feed literal that got lost - NOTE(review): confirm against upstream.
        if (paragraph.text().equals("\f")) {
            processPageBreak(wordDocument, flow);
        }
        boolean processed = false;
        if (paragraph.isInList()) {
            try {
                HWPFList hwpfList = paragraph.getList();
                String label = AbstractWordUtils.getBulletText(numberingState, hwpfList, (char) paragraph.getIlvl());
                processParagraph(wordDocument, flow, currentTableLevel, paragraph, label);
                processed = true;
            } catch (Exception exc) {
                // best effort: fall through and process the paragraph without its list label
                log.log(POILogger.WARN, "Can't process paragraph as list entry, will be processed without list information", exc);
            }
        }
        if (!processed) {
            processParagraph(wordDocument, flow, currentTableLevel, paragraph, AbstractWordUtils.EMPTY);
        }
    }
}
Also used : Table(org.apache.poi.hwpf.usermodel.Table) HWPFList(org.apache.poi.hwpf.usermodel.HWPFList) Paragraph(org.apache.poi.hwpf.usermodel.Paragraph)

Example 3 with Paragraph

use of org.apache.poi.hwpf.usermodel.Paragraph in project poi by apache.

the class HWPFLister method dumpPapx.

/**
 * Prints the paragraph property exceptions (PAPX) of the document to standard output.
 * For a binary {@code HWPFDocument} the PAPX entries are first listed page by page
 * as found through the bin table, then once more after sorting. Finally, for every
 * document type, the entries of the paragraph table are printed together with their sprms.
 *
 * @param withProperties whether to also print the paragraph properties of each paragraph-table entry
 * @param withSprms      whether to dump the sprms of the entries listed in the binary-page sections
 * @throws Exception if reading or dumping the structures fails
 */
public void dumpPapx(boolean withProperties, boolean withSprms) throws Exception {
    if (_doc instanceof HWPFDocument) {
        System.out.println("binary PAP pages ");
        HWPFDocument binaryDoc = (HWPFDocument) _doc;
        byte[] mainStream = _doc.getMainStream();
        PlexOfCps binTable = new PlexOfCps(binaryDoc.getTableStream(), binaryDoc.getFileInformationBlock().getFcPlcfbtePapx(), binaryDoc.getFileInformationBlock().getLcbPlcfbtePapx(), 4);
        List<PAPX> collected = new ArrayList<PAPX>();
        int entryCount = binTable.length();
        int entry = 0;
        while (entry < entryCount) {
            GenericPropertyNode node = binTable.getProperty(entry);
            // each bin table entry holds a page number; the page starts at pageNumber * block size
            int pageNumber = LittleEndian.getInt(node.getBytes());
            int offset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNumber;
            PAPFormattedDiskPage page = new PAPFormattedDiskPage(mainStream, binaryDoc.getDataStream(), offset, binaryDoc.getTextTable());
            System.out.println("* PFKP: " + page);
            for (PAPX pagePapx : page.getPAPXs()) {
                System.out.println("** " + pagePapx);
                collected.add(pagePapx);
                if (withSprms && pagePapx != null) {
                    dumpSprms(new SprmIterator(pagePapx.getGrpprl(), 2), "*** ");
                }
            }
            entry++;
        }
        Collections.sort(collected);
        System.out.println("* Sorted by END");
        for (PAPX sortedPapx : collected) {
            System.out.println("** " + sortedPapx);
            if (withSprms && sortedPapx != null) {
                dumpSprms(new SprmIterator(sortedPapx.getGrpprl(), 2), "*** ");
            }
        }
    }
    // the paragraph table is dumped for every document type; sprms are always printed here
    for (PAPX tablePapx : _doc.getParagraphTable().getParagraphs()) {
        System.out.println(tablePapx);
        if (withProperties) {
            System.out.println(Paragraph.newParagraph(_doc.getOverallRange(), tablePapx).getProps());
        }
        dumpSprms(new SprmIterator(tablePapx.getGrpprl(), 2), "\t");
    }
}
Also used : HWPFDocument(org.apache.poi.hwpf.HWPFDocument) SprmIterator(org.apache.poi.hwpf.sprm.SprmIterator) PlexOfCps(org.apache.poi.hwpf.model.PlexOfCps) ArrayList(java.util.ArrayList) PAPFormattedDiskPage(org.apache.poi.hwpf.model.PAPFormattedDiskPage) PAPX(org.apache.poi.hwpf.model.PAPX) GenericPropertyNode(org.apache.poi.hwpf.model.GenericPropertyNode) Paragraph(org.apache.poi.hwpf.usermodel.Paragraph)

Example 4 with Paragraph

use of org.apache.poi.hwpf.usermodel.Paragraph in project poi by apache.

the class TestDifferentRoutes method testExtractFromModel.

/**
 * Reads the text of every paragraph through the usermodel
 * ({@code Range} / {@code Paragraph}) and checks that it matches
 * the expected paragraph texts.
 */
@Test
public void testExtractFromModel() {
    Range range = doc.getRange();
    String[] actual = new String[range.numParagraphs()];
    int idx = 0;
    while (idx < range.numParagraphs()) {
        actual[idx] = range.getParagraph(idx).text();
        idx++;
    }
    assertArrayEquals(p_text, actual);
}
Also used : Range(org.apache.poi.hwpf.usermodel.Range) Paragraph(org.apache.poi.hwpf.usermodel.Paragraph) Test(org.junit.Test)

Example 5 with Paragraph

use of org.apache.poi.hwpf.usermodel.Paragraph in project poi by apache.

the class TestBug46610 method runExtract.

/**
 * Opens the named sample document and concatenates the text of all its
 * paragraphs, each followed by a newline. Every character run of every
 * paragraph is read as well.
 *
 * @param sampleName name of the sample file to open
 * @return the text of all paragraphs, one per line
 * @throws Exception if the sample cannot be opened or read
 */
private static String runExtract(String sampleName) throws Exception {
    HWPFDocument doc = HWPFTestDataSamples.openSampleFile(sampleName);
    try {
        StringBuilder out = new StringBuilder();
        Range globalRange = doc.getRange();
        for (int i = 0; i < globalRange.numParagraphs(); i++) {
            Paragraph p = globalRange.getParagraph(i);
            out.append(p.text());
            out.append("\n");
            for (int j = 0; j < p.numCharacterRuns(); j++) {
                CharacterRun characterRun = p.getCharacterRun(j);
                // result intentionally unused; presumably the call only has to complete without throwing
                characterRun.text();
            }
        }
        return out.toString();
    } finally {
        // close exactly once, after all paragraphs were read
        // (the close used to sit inside the paragraph loop)
        doc.close();
    }
}
Also used : HWPFDocument(org.apache.poi.hwpf.HWPFDocument) CharacterRun(org.apache.poi.hwpf.usermodel.CharacterRun) Range(org.apache.poi.hwpf.usermodel.Range) Paragraph(org.apache.poi.hwpf.usermodel.Paragraph)

Aggregations

Paragraph (org.apache.poi.hwpf.usermodel.Paragraph)12 Range (org.apache.poi.hwpf.usermodel.Range)8 HWPFDocument (org.apache.poi.hwpf.HWPFDocument)4 CharacterRun (org.apache.poi.hwpf.usermodel.CharacterRun)3 FileInputStream (java.io.FileInputStream)2 PicturesTable (org.apache.poi.hwpf.model.PicturesTable)2 Picture (org.apache.poi.hwpf.usermodel.Picture)2 Table (org.apache.poi.hwpf.usermodel.Table)2 FileNotFoundException (java.io.FileNotFoundException)1 FileOutputStream (java.io.FileOutputStream)1 InputStream (java.io.InputStream)1 ArrayList (java.util.ArrayList)1 OLEShape (org.apache.poi.hslf.model.OLEShape)1 HSLFObjectData (org.apache.poi.hslf.usermodel.HSLFObjectData)1 HSLFPictureData (org.apache.poi.hslf.usermodel.HSLFPictureData)1 HSLFPictureShape (org.apache.poi.hslf.usermodel.HSLFPictureShape)1 HSLFShape (org.apache.poi.hslf.usermodel.HSLFShape)1 HSLFSlide (org.apache.poi.hslf.usermodel.HSLFSlide)1 HSLFSlideShow (org.apache.poi.hslf.usermodel.HSLFSlideShow)1 HSLFSoundData (org.apache.poi.hslf.usermodel.HSLFSoundData)1