Search in sources :

Example 1 with Range

use of org.apache.poi.hwpf.usermodel.Range in project poi by apache.

the class DataExtraction method main.

/**
 * Extracts embedded content from the PowerPoint file named by {@code args[0]}:
 * sound clips, embedded OLE objects (Excel worksheets, Word documents and
 * anything else as raw data) and pictures. Extracted items are written to
 * files in the current working directory; the text of embedded Word
 * documents is additionally printed to standard output.
 *
 * <p>Every stream and document opened here is released in a {@code finally}
 * block so that nothing leaks when an I/O error interrupts the extraction.
 *
 * @param args command line arguments; {@code args[0]} is the presentation to read.
 *             When no argument is given, {@code usage()} is called and the method returns.
 * @throws Exception if the presentation cannot be read or an output file cannot be written
 */
public static void main(String[] args) throws Exception {
    if (args.length == 0) {
        usage();
        return;
    }
    HSLFSlideShow ppt;
    FileInputStream is = new FileInputStream(args[0]);
    try {
        ppt = new HSLFSlideShow(is);
    } finally {
        // the input file is no longer needed once the slide show has been constructed
        is.close();
    }
    try {
        //extract all sound files embedded in this presentation
        for (HSLFSoundData sound : ppt.getSoundData()) {
            //*.wav
            String type = sound.getSoundType();
            //typically file name
            String name = sound.getSoundName();
            //save the raw bytes of the sound on disk
            writeBytes(name + type, sound.getData());
        }
        int oleIdx = -1, picIdx = -1;
        for (HSLFSlide slide : ppt.getSlides()) {
            for (HSLFShape shape : slide.getShapes()) {
                if (shape instanceof OLEShape) {
                    //extract embedded OLE documents
                    oleIdx++;
                    OLEShape ole = (OLEShape) shape;
                    HSLFObjectData data = ole.getObjectData();
                    String name = ole.getInstanceName();
                    if ("Worksheet".equals(name)) {
                        //read xls; the workbook is only opened as a demonstration, so release it right away
                        HSSFWorkbook wb = new HSSFWorkbook(data.getData());
                        wb.close();
                    } else if ("Document".equals(name)) {
                        HWPFDocument doc = new HWPFDocument(data.getData());
                        try {
                            //read the word document
                            Range r = doc.getRange();
                            for (int k = 0; k < r.numParagraphs(); k++) {
                                Paragraph p = r.getParagraph(k);
                                System.out.println(p.text());
                            }
                            //save on disk
                            FileOutputStream out = new FileOutputStream(name + "-(" + (oleIdx) + ").doc");
                            try {
                                doc.write(out);
                            } finally {
                                out.close();
                            }
                        } finally {
                            doc.close();
                        }
                    } else {
                        //unknown OLE type: dump the raw object data
                        FileOutputStream out = new FileOutputStream(ole.getProgID() + "-" + (oleIdx + 1) + ".dat");
                        try {
                            InputStream dis = data.getData();
                            try {
                                byte[] chunk = new byte[2048];
                                int count;
                                while ((count = dis.read(chunk)) >= 0) {
                                    out.write(chunk, 0, count);
                                }
                            } finally {
                                // close the object data stream itself (not the already closed input file)
                                dis.close();
                            }
                        } finally {
                            out.close();
                        }
                    }
                } else if (shape instanceof HSLFPictureShape) {
                    //Pictures
                    picIdx++;
                    HSLFPictureShape p = (HSLFPictureShape) shape;
                    HSLFPictureData data = p.getPictureData();
                    String ext = data.getType().extension;
                    writeBytes("pict-" + picIdx + ext, data.getData());
                }
            }
        }
    } finally {
        ppt.close();
    }
}

/**
 * Writes {@code data} to a new file called {@code fileName}, closing the
 * file even when the write fails.
 */
private static void writeBytes(String fileName, byte[] data) throws java.io.IOException {
    FileOutputStream out = new FileOutputStream(fileName);
    try {
        out.write(data);
    } finally {
        out.close();
    }
}
Also used : FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) HSLFObjectData(org.apache.poi.hslf.usermodel.HSLFObjectData) Range(org.apache.poi.hwpf.usermodel.Range) HSLFSlideShow(org.apache.poi.hslf.usermodel.HSLFSlideShow) FileInputStream(java.io.FileInputStream) OLEShape(org.apache.poi.hslf.model.OLEShape) HSSFWorkbook(org.apache.poi.hssf.usermodel.HSSFWorkbook) Paragraph(org.apache.poi.hwpf.usermodel.Paragraph) HWPFDocument(org.apache.poi.hwpf.HWPFDocument) HSLFShape(org.apache.poi.hslf.usermodel.HSLFShape) HSLFPictureShape(org.apache.poi.hslf.usermodel.HSLFPictureShape) FileOutputStream(java.io.FileOutputStream) HSLFSoundData(org.apache.poi.hslf.usermodel.HSLFSoundData) HSLFPictureData(org.apache.poi.hslf.usermodel.HSLFPictureData) HSLFSlide(org.apache.poi.hslf.usermodel.HSLFSlide)

Example 2 with Range

use of org.apache.poi.hwpf.usermodel.Range in project poi by apache.

the class HWPFDocument method getRange.

/**
 * Builds the {@link Range} covering the text of the requested subdocument.
 * The subdocument types are visited in {@code SubdocumentType.ORDERED}
 * order, summing their text stream lengths until the requested type is
 * reached; that running total is the start of the returned range.
 *
 * @param subdocument the subdocument type to locate
 * @return a range from the subdocument's start to its start plus its text stream length
 * @throws UnsupportedOperationException if {@code subdocument} is not among {@code SubdocumentType.ORDERED}
 */
private Range getRange(SubdocumentType subdocument) {
    int offset = 0;
    for (SubdocumentType candidate : SubdocumentType.ORDERED) {
        final int streamLength = getFileInformationBlock().getSubdocumentTextStreamLength(candidate);
        final int end = offset + streamLength;
        if (candidate == subdocument) {
            return new Range(offset, end, this);
        }
        // not the one we want: its text precedes the target, so skip past it
        offset = end;
    }
    throw new UnsupportedOperationException("Subdocument type not supported: " + subdocument);
}
Also used : Range(org.apache.poi.hwpf.usermodel.Range)

Example 3 with Range

use of org.apache.poi.hwpf.usermodel.Range in project poi by apache.

the class AbstractWordConverter method processDocument.

/**
 * Converts a whole Word document. The summary information is processed
 * first on a best-effort basis (failures are logged and ignored), then the
 * document body, and finally {@code afterProcess()} is invoked.
 *
 * @param wordDocument the document to convert
 */
public void processDocument(HWPFDocumentCore wordDocument) {
    try {
        final SummaryInformation info = wordDocument.getSummaryInformation();
        if (info != null) {
            processDocumentInformation(info);
        }
    } catch (Exception exc) {
        // summary information is optional for the conversion, so only warn
        logger.log(POILogger.WARN, "Unable to process document summary information: ", exc, exc);
    }

    final Range docRange = wordDocument.getRange();
    if (docRange.numSections() != 1) {
        processDocumentPart(wordDocument, docRange);
    } else {
        // exactly one section: hand it to the single-section handler
        processSingleSection(wordDocument, docRange.getSection(0));
    }
    afterProcess();
}
Also used : SummaryInformation(org.apache.poi.hpsf.SummaryInformation) Range(org.apache.poi.hwpf.usermodel.Range)

Example 4 with Range

use of org.apache.poi.hwpf.usermodel.Range in project poi by apache.

the class AbstractWordConverter method processCharacters.

/**
 * Processes the character content of {@code range}, emitting output into
 * {@code block}.
 *
 * <p>The method works in two phases. First, when {@code wordDocument} is a
 * {@link HWPFDocument}, it collects the "structures" found in the range
 * (bookmarks that start inside it, live fields and dead fields), sorts
 * them, and handles each one together with the plain sub-ranges between
 * them by recursing into this method. If that phase advanced past the start
 * of the range, the method returns {@code true}. Otherwise, the second phase
 * walks the character runs one by one and dispatches pictures, special
 * characters, fields and plain text.
 *
 * @param wordDocument      the document that owns {@code range}
 * @param currentTableLevel table nesting level, passed through to the nested processing calls
 * @param range             the range to process; may be {@code null}
 * @param block             the element that receives the output
 * @return {@code false} if {@code range} is {@code null}; {@code true} if
 *         structure processing moved past the start of the range; otherwise
 *         whether any run contributed non-whitespace text
 */
protected boolean processCharacters(final HWPFDocumentCore wordDocument, final int currentTableLevel, final Range range, final Element block) {
    if (range == null)
        return false;
    boolean haveAnyText = false;
    /*
         * In text there can be fields, bookmarks, may be other structures (code
         * below allows extension). Those structures can overlaps, so either we
         * should process char-by-char (slow) or find a correct way to
         * reconstruct the structure of range -- sergey
         */
    List<Structure> structures = new LinkedList<Structure>();
    if (wordDocument instanceof HWPFDocument) {
        final HWPFDocument doc = (HWPFDocument) wordDocument;
        // bookmarks starting inside this range, except those already being processed further up the call chain
        Map<Integer, List<Bookmark>> rangeBookmarks = doc.getBookmarks().getBookmarksStartedBetween(range.getStartOffset(), range.getEndOffset());
        if (rangeBookmarks != null) {
            for (List<Bookmark> lists : rangeBookmarks.values()) {
                for (Bookmark bookmark : lists) {
                    if (!bookmarkStack.contains(bookmark))
                        addToStructures(structures, new Structure(bookmark));
                }
            }
        }
        // TODO: dead fields?
        // NOTE(review): skipUntil is never reassigned in this method, so it stays at -1
        int skipUntil = -1;
        for (int c = 0; c < range.numCharacterRuns(); c++) {
            CharacterRun characterRun = range.getCharacterRun(c);
            if (characterRun == null)
                throw new AssertionError();
            if (characterRun.getStartOffset() < skipUntil)
                continue;
            String text = characterRun.text();
            // only runs that start with the field-begin mark can open a field
            if (text == null || text.length() == 0 || text.charAt(0) != FIELD_BEGIN_MARK)
                continue;
            Field aliveField = ((HWPFDocument) wordDocument).getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN, characterRun.getStartOffset());
            if (aliveField != null) {
                addToStructures(structures, new Structure(aliveField));
            } else {
                // no registered field starts here: try to find the separator/end runs of a "dead" field
                int[] separatorEnd = tryDeadField_lookupFieldSeparatorEnd(wordDocument, range, c);
                if (separatorEnd != null) {
                    addToStructures(structures, new Structure(new DeadFieldBoundaries(c, separatorEnd[0], separatorEnd[1]), characterRun.getStartOffset(), range.getCharacterRun(separatorEnd[1]).getEndOffset()));
                    // continue scanning after the run that ends the dead field
                    c = separatorEnd[1];
                }
            }
        }
    }
    // copy into an ArrayList before sorting
    structures = new ArrayList<Structure>(structures);
    Collections.sort(structures);
    // offset up to which the range has been handled so far
    int previous = range.getStartOffset();
    for (Structure structure : structures) {
        if (structure.start != previous) {
            // plain content between the previous structure (or range start) and this structure
            Range subrange = new Range(previous, structure.start, range) {

                @Override
                public String toString() {
                    return "BetweenStructuresSubrange " + super.toString();
                }
            };
            processCharacters(wordDocument, currentTableLevel, subrange, block);
        }
        if (structure.structure instanceof Bookmark) {
            // other bookmarks with same boundaries
            List<Bookmark> bookmarks = new LinkedList<Bookmark>();
            for (Bookmark bookmark : ((HWPFDocument) wordDocument).getBookmarks().getBookmarksStartedBetween(structure.start, structure.start + 1).values().iterator().next()) {
                if (bookmark.getStart() == structure.start && bookmark.getEnd() == structure.end) {
                    bookmarks.add(bookmark);
                }
            }
            // mark these bookmarks as in progress so nested calls do not add them again
            bookmarkStack.addAll(bookmarks);
            try {
                // clip the bookmark to the end of the current range
                int end = Math.min(range.getEndOffset(), structure.end);
                Range subrange = new Range(structure.start, end, range) {

                    @Override
                    public String toString() {
                        return "BookmarksSubrange " + super.toString();
                    }
                };
                processBookmarks(wordDocument, block, subrange, currentTableLevel, bookmarks);
            } finally {
                bookmarkStack.removeAll(bookmarks);
            }
        } else if (structure.structure instanceof Field) {
            Field field = (Field) structure.structure;
            processField((HWPFDocument) wordDocument, range, currentTableLevel, field, block);
        } else if (structure.structure instanceof DeadFieldBoundaries) {
            DeadFieldBoundaries boundaries = (DeadFieldBoundaries) structure.structure;
            processDeadField(wordDocument, block, range, currentTableLevel, boundaries.beginMark, boundaries.separatorMark, boundaries.endMark);
        } else {
            throw new UnsupportedOperationException("NYI: " + structure.structure.getClass());
        }
        previous = Math.min(range.getEndOffset(), structure.end);
    }
    // structure processing advanced past the range start: finish the tail (if any) and stop here
    if (previous != range.getStartOffset()) {
        if (previous > range.getEndOffset()) {
            logger.log(POILogger.WARN, "Latest structure in ", range, " ended at #" + previous, " after range boundaries [", range.getStartOffset() + "; " + range.getEndOffset(), ")");
            return true;
        }
        if (previous < range.getEndOffset()) {
            // remaining content after the last structure
            Range subrange = new Range(previous, range.getEndOffset(), range) {

                @Override
                public String toString() {
                    return "AfterStructureSubrange " + super.toString();
                }
            };
            processCharacters(wordDocument, currentTableLevel, subrange, block);
        }
        return true;
    }
    // second phase: handle the range run by run
    for (int c = 0; c < range.numCharacterRuns(); c++) {
        CharacterRun characterRun = range.getCharacterRun(c);
        if (characterRun == null)
            throw new AssertionError();
        if (wordDocument instanceof HWPFDocument && ((HWPFDocument) wordDocument).getPicturesTable().hasPicture(characterRun)) {
            HWPFDocument newFormat = (HWPFDocument) wordDocument;
            Picture picture = newFormat.getPicturesTable().extractPicture(characterRun, true);
            // NOTE(review): the 0x01 test presumably distinguishes inline pictures - confirm against processImage
            processImage(block, characterRun.text().charAt(0) == 0x01, picture);
            continue;
        }
        String text = characterRun.text();
        if (text.isEmpty())
            continue;
        if (characterRun.isSpecialCharacter()) {
            // the special-character handlers below are only available for HWPFDocument
            if (text.charAt(0) == SPECCHAR_AUTONUMBERED_FOOTNOTE_REFERENCE && (wordDocument instanceof HWPFDocument)) {
                HWPFDocument doc = (HWPFDocument) wordDocument;
                processNoteAnchor(doc, characterRun, block);
                continue;
            }
            if (text.charAt(0) == SPECCHAR_DRAWN_OBJECT && (wordDocument instanceof HWPFDocument)) {
                HWPFDocument doc = (HWPFDocument) wordDocument;
                processDrawnObject(doc, characterRun, block);
                continue;
            }
            if (characterRun.isOle2() && (wordDocument instanceof HWPFDocument)) {
                HWPFDocument doc = (HWPFDocument) wordDocument;
                processOle2(doc, characterRun, block);
                continue;
            }
            if (characterRun.isSymbol() && (wordDocument instanceof HWPFDocument)) {
                HWPFDocument doc = (HWPFDocument) wordDocument;
                processSymbol(doc, characterRun, block);
                continue;
            }
        }
        if (text.charAt(0) == FIELD_BEGIN_MARK) {
            if (wordDocument instanceof HWPFDocument) {
                Field aliveField = ((HWPFDocument) wordDocument).getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN, characterRun.getStartOffset());
                if (aliveField != null) {
                    processField(((HWPFDocument) wordDocument), range, currentTableLevel, aliveField, block);
                    // advance c past every run that ends inside the field
                    int continueAfter = aliveField.getFieldEndOffset();
                    while (c < range.numCharacterRuns() && range.getCharacterRun(c).getEndOffset() <= continueAfter) c++;
                    // step back one so the loop's own c++ lands on the first run after the field
                    if (c < range.numCharacterRuns())
                        c--;
                    continue;
                }
            }
            // not a live field: let tryDeadField handle it and report the run index to resume from
            int skipTo = tryDeadField(wordDocument, range, currentTableLevel, c, block);
            if (skipTo != c) {
                c = skipTo;
                continue;
            }
            continue;
        }
        if (text.charAt(0) == FIELD_SEPARATOR_MARK) {
            // shall not appear without FIELD_BEGIN_MARK
            continue;
        }
        if (text.charAt(0) == FIELD_END_MARK) {
            // shall not appear without FIELD_BEGIN_MARK
            continue;
        }
        // any remaining special/object/OLE2 run is not output as text
        if (characterRun.isSpecialCharacter() || characterRun.isObj() || characterRun.isOle2()) {
            continue;
        }
        // drop a trailing "\r", or a trailing BEL_MARK unless currentTableLevel is Integer.MIN_VALUE
        if (text.endsWith("\r") || (text.charAt(text.length() - 1) == BEL_MARK && currentTableLevel != Integer.MIN_VALUE))
            text = text.substring(0, text.length() - 1);
        {
            // line breaks
            StringBuilder stringBuilder = new StringBuilder();
            for (char charChar : text.toCharArray()) {
                if (charChar == 11) {
                    // char 11: flush the text collected so far, then emit a line break
                    if (stringBuilder.length() > 0) {
                        outputCharacters(block, characterRun, stringBuilder.toString());
                        stringBuilder.setLength(0);
                    }
                    processLineBreak(block, characterRun);
                } else if (charChar == 30) {
                    // Non-breaking hyphens are stored as ASCII 30
                    stringBuilder.append(UNICODECHAR_NONBREAKING_HYPHEN);
                } else if (charChar == 31) {
                    // Non-required hyphens to zero-width space
                    stringBuilder.append(UNICODECHAR_ZERO_WIDTH_SPACE);
                } else if (charChar >= 0x20 || charChar == 0x09 || charChar == 0x0A || charChar == 0x0D) {
                    // keep printable characters plus tab, LF and CR; other control characters are dropped
                    stringBuilder.append(charChar);
                }
            }
            if (stringBuilder.length() > 0) {
                outputCharacters(block, characterRun, stringBuilder.toString());
                stringBuilder.setLength(0);
            }
        }
        haveAnyText |= text.trim().length() != 0;
    }
    return haveAnyText;
}
Also used : CharacterRun(org.apache.poi.hwpf.usermodel.CharacterRun) Range(org.apache.poi.hwpf.usermodel.Range) LinkedList(java.util.LinkedList) HWPFDocument(org.apache.poi.hwpf.HWPFDocument) Field(org.apache.poi.hwpf.usermodel.Field) Bookmark(org.apache.poi.hwpf.usermodel.Bookmark) Picture(org.apache.poi.hwpf.usermodel.Picture) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) List(java.util.List) HWPFList(org.apache.poi.hwpf.usermodel.HWPFList)

Example 5 with Range

use of org.apache.poi.hwpf.usermodel.Range in project poi by apache.

the class AbstractWordConverter method processDeadField.

/**
 * Handles a "dead" field, identified by the indexes of its begin,
 * separator and end character runs within {@code range}.
 *
 * <p>When both the formula part (between begin and separator) and the value
 * part (between separator and end) are non-empty and the formula matches
 * {@code PATTERN_HYPERLINK_LOCAL}, the field is handed to
 * {@code processPageref}. In every other case the field is logged as
 * unsupported and, if it has a value part, that value is processed as
 * ordinary characters.
 *
 * @param wordDocument      the document being converted
 * @param currentBlock      the element that receives the output
 * @param range             the range whose character runs the mark indexes refer to
 * @param currentTableLevel table nesting level, passed through to the nested processing calls
 * @param beginMark         index of the run holding the field begin mark
 * @param separatorMark     index of the run holding the field separator mark
 * @param endMark           index of the run holding the field end mark
 */
protected void processDeadField(HWPFDocumentCore wordDocument, Element currentBlock, Range range, int currentTableLevel, int beginMark, int separatorMark, int endMark) {
    final boolean hasFormula = beginMark + 1 < separatorMark;
    final boolean hasValue = separatorMark + 1 < endMark;

    if (hasFormula && hasValue) {
        final int formulaStart = range.getCharacterRun(beginMark + 1).getStartOffset();
        final int formulaEnd = range.getCharacterRun(separatorMark - 1).getEndOffset();
        Range formulaRange = new Range(formulaStart, formulaEnd, range) {

            @Override
            public String toString() {
                return "Dead field formula subrange: " + super.toString();
            }
        };
        final int valueStart = range.getCharacterRun(separatorMark + 1).getStartOffset();
        final int valueEnd = range.getCharacterRun(endMark - 1).getEndOffset();
        Range valueRange = new Range(valueStart, valueEnd, range) {

            @Override
            public String toString() {
                return "Dead field value subrange: " + super.toString();
            }
        };
        final Matcher matcher = PATTERN_HYPERLINK_LOCAL.matcher(formulaRange.text());
        if (matcher.matches()) {
            // local hyperlink: the first group carries the referenced name
            processPageref(wordDocument, currentBlock, valueRange, currentTableLevel, matcher.group(1));
            return;
        }
    }

    // anything else is unsupported: dump the runs of the field for diagnostics
    StringBuilder debug = new StringBuilder("Unsupported field type: \n");
    for (int runIdx = beginMark; runIdx <= endMark; runIdx++) {
        debug.append("\t").append(range.getCharacterRun(runIdx)).append("\n");
    }
    logger.log(POILogger.WARN, debug);

    final int fallbackStart = range.getCharacterRun(separatorMark).getStartOffset() + 1;
    final int fallbackEnd = range.getCharacterRun(endMark).getStartOffset();
    Range fallbackValueRange = new Range(fallbackStart, fallbackEnd, range) {

        @Override
        public String toString() {
            return "DeadFieldValueSubrange (" + super.toString() + ")";
        }
    };
    // just output field value
    if (hasValue) {
        processCharacters(wordDocument, currentTableLevel, fallbackValueRange, currentBlock);
    }
}
Also used : Matcher(java.util.regex.Matcher) Range(org.apache.poi.hwpf.usermodel.Range)

Aggregations

Range (org.apache.poi.hwpf.usermodel.Range): 24, HWPFDocument (org.apache.poi.hwpf.HWPFDocument): 9, Paragraph (org.apache.poi.hwpf.usermodel.Paragraph): 8, Bookmark (org.apache.poi.hwpf.usermodel.Bookmark): 4, CharacterRun (org.apache.poi.hwpf.usermodel.CharacterRun): 4, Picture (org.apache.poi.hwpf.usermodel.Picture): 3, FileInputStream (java.io.FileInputStream): 2, IOException (java.io.IOException): 2, ArrayList (java.util.ArrayList): 2, FileNotFoundException (java.io.FileNotFoundException): 1, FileOutputStream (java.io.FileOutputStream): 1, InputStream (java.io.InputStream): 1, LinkedList (java.util.LinkedList): 1, List (java.util.List): 1, Matcher (java.util.regex.Matcher): 1, SummaryInformation (org.apache.poi.hpsf.SummaryInformation): 1, OLEShape (org.apache.poi.hslf.model.OLEShape): 1, HSLFObjectData (org.apache.poi.hslf.usermodel.HSLFObjectData): 1, HSLFPictureData (org.apache.poi.hslf.usermodel.HSLFPictureData): 1, HSLFPictureShape (org.apache.poi.hslf.usermodel.HSLFPictureShape): 1