Search in sources:

Example 1 with CharacterRun

use of org.apache.poi.hwpf.usermodel.CharacterRun in project poi by apache.

the class AbstractWordConverter method processCharacters.

/**
 * Converts the character runs of {@code range} into output appended to
 * {@code block}.
 * <p>
 * The method works in two phases. First, when {@code wordDocument} is an
 * {@link HWPFDocument}, it collects the "structures" that start inside the
 * range (bookmarks, alive fields and dead fields), sorts them, and processes
 * the range piecewise: gaps between structures are handled by recursive calls
 * and every structure by its dedicated handler. If that phase moved the
 * processing cursor, the method returns {@code true}. Otherwise it falls back
 * to a run-by-run loop that handles pictures, special characters, field marks
 * and plain text.
 *
 * @param wordDocument      document being converted; bookmarks, fields,
 *                          pictures and special characters are only handled
 *                          when it is an {@link HWPFDocument}
 * @param currentTableLevel passed through to the nested handlers; a trailing
 *                          {@code BEL_MARK} is stripped from run text unless
 *                          this equals {@code Integer.MIN_VALUE}
 * @param range             range whose character runs are processed; may be
 *                          {@code null}
 * @param block             element that receives the generated content
 * @return {@code false} if {@code range} is {@code null}; {@code true} if the
 *         structure phase advanced past the start of the range; otherwise
 *         whether any plain-text run contained non-whitespace characters
 */
protected boolean processCharacters(final HWPFDocumentCore wordDocument, final int currentTableLevel, final Range range, final Element block) {
    if (range == null)
        return false;
    boolean haveAnyText = false;
    /*
     * In the text there can be fields, bookmarks and maybe other structures
     * (the code below allows extension). Those structures can overlap, so
     * either we should process char-by-char (slow) or find a correct way to
     * reconstruct the structure of the range -- sergey
     */
    List<Structure> structures = new LinkedList<Structure>();
    if (wordDocument instanceof HWPFDocument) {
        final HWPFDocument doc = (HWPFDocument) wordDocument;
        // Bookmarks that start within this range.
        Map<Integer, List<Bookmark>> rangeBookmarks = doc.getBookmarks().getBookmarksStartedBetween(range.getStartOffset(), range.getEndOffset());
        if (rangeBookmarks != null) {
            for (List<Bookmark> lists : rangeBookmarks.values()) {
                for (Bookmark bookmark : lists) {
                    // Bookmarks on bookmarkStack are already being processed by
                    // an enclosing call (see the push/pop further below).
                    if (!bookmarkStack.contains(bookmark))
                        addToStructures(structures, new Structure(bookmark));
                }
            }
        }
        // TODO: dead fields?
        // NOTE(review): skipUntil is never reassigned in this method, so the
        // check against it below looks like dead code -- confirm.
        int skipUntil = -1;
        for (int c = 0; c < range.numCharacterRuns(); c++) {
            CharacterRun characterRun = range.getCharacterRun(c);
            if (characterRun == null)
                throw new AssertionError();
            if (characterRun.getStartOffset() < skipUntil)
                continue;
            String text = characterRun.text();
            // Only runs that start with a field-begin mark can open a field.
            if (text == null || text.length() == 0 || text.charAt(0) != FIELD_BEGIN_MARK)
                continue;
            Field aliveField = ((HWPFDocument) wordDocument).getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN, characterRun.getStartOffset());
            if (aliveField != null) {
                addToStructures(structures, new Structure(aliveField));
            } else {
                // No field registered at this offset: try to locate the
                // separator and end mark runs by scanning the following runs.
                int[] separatorEnd = tryDeadField_lookupFieldSeparatorEnd(wordDocument, range, c);
                if (separatorEnd != null) {
                    addToStructures(structures, new Structure(new DeadFieldBoundaries(c, separatorEnd[0], separatorEnd[1]), characterRun.getStartOffset(), range.getCharacterRun(separatorEnd[1]).getEndOffset()));
                    // Resume scanning after the run holding the end mark.
                    c = separatorEnd[1];
                }
            }
        }
    }
    // Copy into an ArrayList before sorting.
    structures = new ArrayList<Structure>(structures);
    Collections.sort(structures);
    // Offset up to which the range has been processed so far.
    int previous = range.getStartOffset();
    for (Structure structure : structures) {
        if (structure.start != previous) {
            // Content between the previous structure and this one.
            Range subrange = new Range(previous, structure.start, range) {

                @Override
                public String toString() {
                    return "BetweenStructuresSubrange " + super.toString();
                }
            };
            processCharacters(wordDocument, currentTableLevel, subrange, block);
        }
        if (structure.structure instanceof Bookmark) {
            // other bookmarks with same boundaries
            List<Bookmark> bookmarks = new LinkedList<Bookmark>();
            // NOTE(review): iterator().next() throws NoSuchElementException if
            // the returned map is empty, and only the first list of the map is
            // inspected -- confirm the lookup always yields an entry here.
            for (Bookmark bookmark : ((HWPFDocument) wordDocument).getBookmarks().getBookmarksStartedBetween(structure.start, structure.start + 1).values().iterator().next()) {
                if (bookmark.getStart() == structure.start && bookmark.getEnd() == structure.end) {
                    bookmarks.add(bookmark);
                }
            }
            // Mark these bookmarks as in progress so that nested calls skip
            // them when collecting structures; always undone in finally.
            bookmarkStack.addAll(bookmarks);
            try {
                // Clamp the bookmark end to the end of the current range.
                int end = Math.min(range.getEndOffset(), structure.end);
                Range subrange = new Range(structure.start, end, range) {

                    @Override
                    public String toString() {
                        return "BookmarksSubrange " + super.toString();
                    }
                };
                processBookmarks(wordDocument, block, subrange, currentTableLevel, bookmarks);
            } finally {
                bookmarkStack.removeAll(bookmarks);
            }
        } else if (structure.structure instanceof Field) {
            Field field = (Field) structure.structure;
            processField((HWPFDocument) wordDocument, range, currentTableLevel, field, block);
        } else if (structure.structure instanceof DeadFieldBoundaries) {
            DeadFieldBoundaries boundaries = (DeadFieldBoundaries) structure.structure;
            processDeadField(wordDocument, block, range, currentTableLevel, boundaries.beginMark, boundaries.separatorMark, boundaries.endMark);
        } else {
            throw new UnsupportedOperationException("NYI: " + structure.structure.getClass());
        }
        // Advance the cursor, never beyond the end of the range.
        previous = Math.min(range.getEndOffset(), structure.end);
    }
    // The cursor moved, so at least one structure was handled above; finish
    // the remainder of the range (if any) recursively and report success.
    if (previous != range.getStartOffset()) {
        if (previous > range.getEndOffset()) {
            logger.log(POILogger.WARN, "Latest structure in ", range, " ended at #" + previous, " after range boundaries [", range.getStartOffset() + "; " + range.getEndOffset(), ")");
            return true;
        }
        if (previous < range.getEndOffset()) {
            Range subrange = new Range(previous, range.getEndOffset(), range) {

                @Override
                public String toString() {
                    return "AfterStructureSubrange " + super.toString();
                }
            };
            processCharacters(wordDocument, currentTableLevel, subrange, block);
        }
        return true;
    }
    // The cursor is still at the start of the range: process run by run.
    for (int c = 0; c < range.numCharacterRuns(); c++) {
        CharacterRun characterRun = range.getCharacterRun(c);
        if (characterRun == null)
            throw new AssertionError();
        if (wordDocument instanceof HWPFDocument && ((HWPFDocument) wordDocument).getPicturesTable().hasPicture(characterRun)) {
            HWPFDocument newFormat = (HWPFDocument) wordDocument;
            Picture picture = newFormat.getPicturesTable().extractPicture(characterRun, true);
            // The flag passed to processImage is true when the run text
            // starts with character 0x01.
            processImage(block, characterRun.text().charAt(0) == 0x01, picture);
            continue;
        }
        String text = characterRun.text();
        if (text.isEmpty())
            continue;
        if (characterRun.isSpecialCharacter()) {
            // Special characters get dedicated handlers, all of which require
            // an HWPFDocument.
            if (text.charAt(0) == SPECCHAR_AUTONUMBERED_FOOTNOTE_REFERENCE && (wordDocument instanceof HWPFDocument)) {
                HWPFDocument doc = (HWPFDocument) wordDocument;
                processNoteAnchor(doc, characterRun, block);
                continue;
            }
            if (text.charAt(0) == SPECCHAR_DRAWN_OBJECT && (wordDocument instanceof HWPFDocument)) {
                HWPFDocument doc = (HWPFDocument) wordDocument;
                processDrawnObject(doc, characterRun, block);
                continue;
            }
            if (characterRun.isOle2() && (wordDocument instanceof HWPFDocument)) {
                HWPFDocument doc = (HWPFDocument) wordDocument;
                processOle2(doc, characterRun, block);
                continue;
            }
            if (characterRun.isSymbol() && (wordDocument instanceof HWPFDocument)) {
                HWPFDocument doc = (HWPFDocument) wordDocument;
                processSymbol(doc, characterRun, block);
                continue;
            }
        }
        if (text.charAt(0) == FIELD_BEGIN_MARK) {
            if (wordDocument instanceof HWPFDocument) {
                Field aliveField = ((HWPFDocument) wordDocument).getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN, characterRun.getStartOffset());
                if (aliveField != null) {
                    processField(((HWPFDocument) wordDocument), range, currentTableLevel, aliveField, block);
                    // Skip every run that ends at or before the end of the
                    // field that was just processed.
                    int continueAfter = aliveField.getFieldEndOffset();
                    while (c < range.numCharacterRuns() && range.getCharacterRun(c).getEndOffset() <= continueAfter) c++;
                    // Step back one run because the for loop increments c again.
                    if (c < range.numCharacterRuns())
                        c--;
                    continue;
                }
            }
            int skipTo = tryDeadField(wordDocument, range, currentTableLevel, c, block);
            // NOTE(review): both paths continue; the condition only guards a
            // no-op assignment (c = skipTo when skipTo == c).
            if (skipTo != c) {
                c = skipTo;
                continue;
            }
            continue;
        }
        if (text.charAt(0) == FIELD_SEPARATOR_MARK) {
            // shall not appear without FIELD_BEGIN_MARK
            continue;
        }
        if (text.charAt(0) == FIELD_END_MARK) {
            // shall not appear without FIELD_BEGIN_MARK
            continue;
        }
        // Any special/object/OLE2 run not handled above produces no output.
        if (characterRun.isSpecialCharacter() || characterRun.isObj() || characterRun.isOle2()) {
            continue;
        }
        // Drop a trailing "\r", or a trailing BEL_MARK unless
        // currentTableLevel is Integer.MIN_VALUE.
        if (text.endsWith("\r") || (text.charAt(text.length() - 1) == BEL_MARK && currentTableLevel != Integer.MIN_VALUE))
            text = text.substring(0, text.length() - 1);
        {
            // line breaks
            StringBuilder stringBuilder = new StringBuilder();
            for (char charChar : text.toCharArray()) {
                if (charChar == 11) {
                    // Character 11: flush the buffered text, then emit a line break.
                    if (stringBuilder.length() > 0) {
                        outputCharacters(block, characterRun, stringBuilder.toString());
                        stringBuilder.setLength(0);
                    }
                    processLineBreak(block, characterRun);
                } else if (charChar == 30) {
                    // Non-breaking hyphens are stored as ASCII 30
                    stringBuilder.append(UNICODECHAR_NONBREAKING_HYPHEN);
                } else if (charChar == 31) {
                    // Non-required hyphens to zero-width space
                    stringBuilder.append(UNICODECHAR_ZERO_WIDTH_SPACE);
                } else if (charChar >= 0x20 || charChar == 0x09 || charChar == 0x0A || charChar == 0x0D) {
                    // Keep printable characters plus tab, LF and CR; any other
                    // character below 0x20 is silently dropped.
                    stringBuilder.append(charChar);
                }
            }
            // Flush whatever text remains after the last line break.
            if (stringBuilder.length() > 0) {
                outputCharacters(block, characterRun, stringBuilder.toString());
                stringBuilder.setLength(0);
            }
        }
        // Evaluated on the trimmed run text, not on the filtered output.
        haveAnyText |= text.trim().length() != 0;
    }
    return haveAnyText;
}
Also used : CharacterRun(org.apache.poi.hwpf.usermodel.CharacterRun) Range(org.apache.poi.hwpf.usermodel.Range) LinkedList(java.util.LinkedList) HWPFDocument(org.apache.poi.hwpf.HWPFDocument) Field(org.apache.poi.hwpf.usermodel.Field) Bookmark(org.apache.poi.hwpf.usermodel.Bookmark) Picture(org.apache.poi.hwpf.usermodel.Picture) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) List(java.util.List) HWPFList(org.apache.poi.hwpf.usermodel.HWPFList)

Example 2 with CharacterRun

use of org.apache.poi.hwpf.usermodel.CharacterRun in project poi by apache.

the class AbstractWordConverter method tryDeadField_lookupFieldSeparatorEnd.

/**
 * Scans the character runs that follow a field-begin mark for the matching
 * separator and end marks.
 * <p>
 * A nested field-begin mark triggers a recursive lookup; when that lookup
 * succeeds, scanning resumes after the nested end mark. The scan stops at the
 * first end mark found at this nesting level.
 *
 * @param wordDocument document being converted (only passed on to the
 *                     recursive call)
 * @param range        range whose character runs are scanned
 * @param beginMark    index of the character run holding the begin mark
 * @return a two-element array {@code { separatorRunIndex, endRunIndex }}, or
 *         {@code null} when a second separator is met, or when the separator
 *         or the end mark is missing
 */
private int[] tryDeadField_lookupFieldSeparatorEnd(HWPFDocumentCore wordDocument, Range range, int beginMark) {
    int separatorIndex = -1;
    int index = beginMark + 1;
    while (index < range.numCharacterRuns()) {
        final String runText = range.getCharacterRun(index).text();
        if (!runText.isEmpty()) {
            final char marker = runText.charAt(0);
            if (marker == FIELD_BEGIN_MARK) {
                // Nested field: jump to its end mark when it is well-formed.
                final int[] inner = tryDeadField_lookupFieldSeparatorEnd(wordDocument, range, index);
                if (inner != null) {
                    index = inner[1];
                }
            } else if (marker == FIELD_SEPARATOR_MARK) {
                if (separatorIndex != -1) {
                    // second separator at the same level; incorrect format
                    return null;
                }
                separatorIndex = index;
            } else if (marker == FIELD_END_MARK) {
                // The first end mark closes the field; it is only usable when
                // a separator was seen before it.
                return separatorIndex == -1 ? null : new int[] { separatorIndex, index };
            }
        }
        index++;
    }
    // Ran out of runs without meeting an end mark.
    return null;
}
Also used : CharacterRun(org.apache.poi.hwpf.usermodel.CharacterRun)

Example 3 with CharacterRun

use of org.apache.poi.hwpf.usermodel.CharacterRun in project poi by apache.

the class WordToHtmlConverter method processParagraph.

/**
 * Renders one paragraph: creates a paragraph element under
 * {@code parentElement}, optionally emits the bullet text, delegates the
 * character runs to {@code processCharacters} and finally attaches the
 * collected CSS as a style class.
 * <p>
 * If the paragraph has no character runs, the method returns right after
 * appending the (empty) paragraph element; no style class is attached in that
 * case.
 *
 * @param hwpfDocument      document being converted
 * @param parentElement     element that receives the new paragraph element
 * @param currentTableLevel passed through to {@code processCharacters}
 * @param paragraph         paragraph to render
 * @param bulletText        bullet/numbering text to emit first; ignored when
 *                          {@code AbstractWordUtils.isNotEmpty} rejects it
 */
@Override
protected void processParagraph(HWPFDocumentCore hwpfDocument, Element parentElement, int currentTableLevel, Paragraph paragraph, String bulletText) {
    final Element paragraphElement = htmlDocumentFacade.createParagraph();
    parentElement.appendChild(paragraphElement);

    final StringBuilder css = new StringBuilder();
    WordToHtmlUtils.addParagraphProperties(paragraph, css);

    if (paragraph.numCharacterRuns() == 0) {
        return;
    }

    // Font of the first run becomes the block-level font pushed on the
    // properties stack for the duration of the character processing.
    final String blockFontName;
    final int blockFontSize;
    final CharacterRun firstRun = paragraph.getCharacterRun(0);
    if (firstRun == null) {
        blockFontSize = -1;
        blockFontName = AbstractWordUtils.EMPTY;
    } else {
        final Triplet fontTriplet = getCharacterRunTriplet(firstRun);
        blockFontSize = firstRun.getFontSize() / 2;
        blockFontName = fontTriplet.fontName;
        WordToHtmlUtils.addFontFamily(blockFontName, css);
        WordToHtmlUtils.addFontSize(blockFontSize, css);
    }
    blocksProperies.push(new BlockProperies(blockFontName, blockFontSize));

    try {
        if (AbstractWordUtils.isNotEmpty(bulletText)) {
            if (!bulletText.endsWith("\t")) {
                // NOTE(review): the last character is dropped here although the
                // text does not end with a tab -- confirm this is intended.
                Text bulletNode = htmlDocumentFacade.createText(bulletText.substring(0, bulletText.length() - 1));
                paragraphElement.appendChild(bulletNode);
            } else {
                /*
                 * Not every case can be expressed in HTML; only the simplest
                 * one is handled: pad the bullet up to the next default tab
                 * stop using an inline-block span with a minimum width.
                 */
                final float tabWidth = TWIPS_PER_INCH / 2;
                // 20 extra twips so that the bullet character has some space
                float bulletOffset = paragraph.getIndentFromLeft() + paragraph.getFirstLineIndent() + 20f;
                float followingTabStop = (float) (Math.ceil(bulletOffset / tabWidth) * tabWidth);
                final float minWidth = followingTabStop - bulletOffset;

                Element bulletSpan = htmlDocumentFacade.getDocument().createElement("span");
                String spanCss = "display: inline-block; text-indent: 0; min-width: " + (minWidth / TWIPS_PER_INCH) + "in;";
                htmlDocumentFacade.addStyleClass(bulletSpan, "s", spanCss);
                paragraphElement.appendChild(bulletSpan);

                // Replace the trailing tab with a zero-width space followed by
                // a no-break space.
                String bulletContent = bulletText.substring(0, bulletText.length() - 1) + UNICODECHAR_ZERO_WIDTH_SPACE + UNICODECHAR_NO_BREAK_SPACE;
                Text bulletNode = htmlDocumentFacade.createText(bulletContent);
                bulletSpan.appendChild(bulletNode);
            }
        }
        processCharacters(hwpfDocument, currentTableLevel, paragraph, paragraphElement);
    } finally {
        // Always undo the push above, even when processing fails.
        blocksProperies.pop();
    }

    if (css.length() > 0) {
        htmlDocumentFacade.addStyleClass(paragraphElement, "p", css.toString());
    }
    WordToHtmlUtils.compactSpans(paragraphElement);
}
Also used : Triplet(org.apache.poi.hwpf.converter.FontReplacer.Triplet) Element(org.w3c.dom.Element) CharacterRun(org.apache.poi.hwpf.usermodel.CharacterRun) Text(org.w3c.dom.Text)

Example 4 with CharacterRun

use of org.apache.poi.hwpf.usermodel.CharacterRun in project poi by apache.

the class TestBug46610 method runExtract.

/**
 * Opens the named sample document and returns the text of all its
 * paragraphs, each followed by a newline.
 * <p>
 * The text of every character run is also read (and discarded) so that the
 * run-level extraction code is exercised as well. The document is closed
 * exactly once, after all paragraphs have been read, including when reading
 * fails.
 *
 * @param sampleName name of the sample file to open
 * @return concatenated paragraph texts, one per line
 * @throws Exception if the sample cannot be opened, read or closed
 */
private static String runExtract(String sampleName) throws Exception {
    HWPFDocument doc = HWPFTestDataSamples.openSampleFile(sampleName);
    try {
        StringBuilder out = new StringBuilder();
        Range globalRange = doc.getRange();
        for (int i = 0; i < globalRange.numParagraphs(); i++) {
            Paragraph p = globalRange.getParagraph(i);
            out.append(p.text());
            out.append("\n");
            for (int j = 0; j < p.numCharacterRuns(); j++) {
                CharacterRun characterRun = p.getCharacterRun(j);
                // Result intentionally ignored: only the extraction is exercised.
                characterRun.text();
            }
        }
        return out.toString();
    } finally {
        // Close once, after the loop; closing inside the loop would release
        // the document while later paragraphs are still being read.
        doc.close();
    }
}
Also used : HWPFDocument(org.apache.poi.hwpf.HWPFDocument) CharacterRun(org.apache.poi.hwpf.usermodel.CharacterRun) Range(org.apache.poi.hwpf.usermodel.Range) Paragraph(org.apache.poi.hwpf.usermodel.Paragraph)

Example 5 with CharacterRun

use of org.apache.poi.hwpf.usermodel.CharacterRun in project poi by apache.

the class PicturesTable method getAllPictures.

/**
 * Collects the pictures of the current document.
 * <p>
 * Not every document keeps all of its images concatenated in the data
 * stream (although MS claims so), therefore every character run of the
 * overall range is inspected. The Escher records of the drawing group are
 * searched afterwards and may contribute further pictures.
 *
 * @return a list of Picture objects found in current document
 */
public List<Picture> getAllPictures() {
    final ArrayList<Picture> found = new ArrayList<Picture>();
    final Range overall = _document.getOverallRange();
    for (int index = 0; index < overall.numCharacterRuns(); index++) {
        final CharacterRun candidate = overall.getCharacterRun(index);
        if (candidate != null) {
            final Picture extracted = extractPicture(candidate, false);
            if (extracted != null) {
                found.add(extracted);
            }
        }
    }
    searchForPictures(_dgg.getEscherRecords(), found);
    return found;
}
Also used : Picture(org.apache.poi.hwpf.usermodel.Picture) CharacterRun(org.apache.poi.hwpf.usermodel.CharacterRun) ArrayList(java.util.ArrayList) Range(org.apache.poi.hwpf.usermodel.Range)

Aggregations

CharacterRun (org.apache.poi.hwpf.usermodel.CharacterRun): 8; Picture (org.apache.poi.hwpf.usermodel.Picture): 4; Range (org.apache.poi.hwpf.usermodel.Range): 4; ArrayList (java.util.ArrayList): 3; Paragraph (org.apache.poi.hwpf.usermodel.Paragraph): 3; HWPFDocument (org.apache.poi.hwpf.HWPFDocument): 2; Field (org.apache.poi.hwpf.usermodel.Field): 2; FileInputStream (java.io.FileInputStream): 1; LinkedList (java.util.LinkedList): 1; List (java.util.List): 1; Triplet (org.apache.poi.hwpf.converter.FontReplacer.Triplet): 1; PicturesTable (org.apache.poi.hwpf.model.PicturesTable): 1; SavedByTable (org.apache.poi.hwpf.model.SavedByTable): 1; StyleDescription (org.apache.poi.hwpf.model.StyleDescription): 1; Bookmark (org.apache.poi.hwpf.usermodel.Bookmark): 1; HWPFList (org.apache.poi.hwpf.usermodel.HWPFList): 1; Section (org.apache.poi.hwpf.usermodel.Section): 1; Table (org.apache.poi.hwpf.usermodel.Table): 1; TableCell (org.apache.poi.hwpf.usermodel.TableCell): 1; TableRow (org.apache.poi.hwpf.usermodel.TableRow): 1