Search in sources :

Example 1 with Field

use of org.apache.poi.hwpf.usermodel.Field in project poi by apache.

the class AbstractWordConverter method processCharacters.

/**
 * Converts the character runs of {@code range} into output appended to {@code block}.
 *
 * <p>The method works in two stages:
 * <ol>
 * <li>When {@code wordDocument} is an {@link HWPFDocument}, the bookmarks started inside the
 * range (and not already on {@code bookmarkStack}), the alive fields and the dead fields are
 * collected as {@code Structure}s. If any were found, the range is split around them: the text
 * between structures is handled by recursive calls, each structure is dispatched to
 * {@code processBookmarks}, {@code processField} or {@code processDeadField}, and the method
 * returns {@code true}.</li>
 * <li>Otherwise every character run is handled individually: pictures, special characters
 * (footnote references, drawn objects, OLE2 objects, symbols), fields and finally plain text,
 * which is split on line-break characters and cleaned of unsupported control characters.</li>
 * </ol>
 *
 * @param wordDocument document that owns {@code range}
 * @param currentTableLevel current table nesting level; {@code Integer.MIN_VALUE} disables the
 *        stripping of a trailing {@code BEL_MARK}
 * @param range range to process; {@code null} is tolerated
 * @param block element that receives the generated output
 * @return {@code false} if {@code range} is {@code null}; {@code true} if the range was split
 *         around at least one structure; otherwise whether any run contained non-blank text
 * @throws AssertionError if the range hands out a {@code null} character run
 * @throws UnsupportedOperationException if a collected structure has an unknown payload type
 */
protected boolean processCharacters(final HWPFDocumentCore wordDocument, final int currentTableLevel, final Range range, final Element block) {
    if (range == null) {
        return false;
    }
    // Resolved once: null means "old format" document without bookmarks/fields/pictures tables.
    final HWPFDocument doc = wordDocument instanceof HWPFDocument ? (HWPFDocument) wordDocument : null;
    boolean haveAnyText = false;
    /*
     * In text there can be fields, bookmarks, may be other structures (code
     * below allows extension). Those structures can overlap, so either we
     * should process char-by-char (slow) or find a correct way to
     * reconstruct the structure of range -- sergey
     */
    List<Structure> structures = new LinkedList<Structure>();
    if (doc != null) {
        Map<Integer, List<Bookmark>> rangeBookmarks = doc.getBookmarks().getBookmarksStartedBetween(range.getStartOffset(), range.getEndOffset());
        if (rangeBookmarks != null) {
            for (List<Bookmark> lists : rangeBookmarks.values()) {
                for (Bookmark bookmark : lists) {
                    // Bookmarks on the stack are already being processed by an outer call.
                    if (!bookmarkStack.contains(bookmark)) {
                        addToStructures(structures, new Structure(bookmark));
                    }
                }
            }
        }
        // TODO: dead fields?
        for (int c = 0; c < range.numCharacterRuns(); c++) {
            CharacterRun characterRun = range.getCharacterRun(c);
            if (characterRun == null) {
                throw new AssertionError();
            }
            String text = characterRun.text();
            if (text == null || text.length() == 0 || text.charAt(0) != FIELD_BEGIN_MARK) {
                continue;
            }
            Field aliveField = doc.getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN, characterRun.getStartOffset());
            if (aliveField != null) {
                addToStructures(structures, new Structure(aliveField));
            } else {
                // No field record for this mark: try to locate separator/end marks by scanning runs.
                int[] separatorEnd = tryDeadField_lookupFieldSeparatorEnd(wordDocument, range, c);
                if (separatorEnd != null) {
                    addToStructures(structures, new Structure(new DeadFieldBoundaries(c, separatorEnd[0], separatorEnd[1]), characterRun.getStartOffset(), range.getCharacterRun(separatorEnd[1]).getEndOffset()));
                    // Resume scanning after the run holding the dead field's end mark.
                    c = separatorEnd[1];
                }
            }
        }
    }
    structures = new ArrayList<Structure>(structures);
    Collections.sort(structures);
    int previous = range.getStartOffset();
    for (Structure structure : structures) {
        if (structure.start != previous) {
            // Plain content between the previous structure (or range start) and this one.
            Range subrange = new Range(previous, structure.start, range) {

                @Override
                public String toString() {
                    return "BetweenStructuresSubrange " + super.toString();
                }
            };
            processCharacters(wordDocument, currentTableLevel, subrange, block);
        }
        if (structure.structure instanceof Bookmark) {
            // other bookmarks with same boundaries
            List<Bookmark> bookmarks = new LinkedList<Bookmark>();
            for (Bookmark bookmark : doc.getBookmarks().getBookmarksStartedBetween(structure.start, structure.start + 1).values().iterator().next()) {
                if (bookmark.getStart() == structure.start && bookmark.getEnd() == structure.end) {
                    bookmarks.add(bookmark);
                }
            }
            // Mark them as in-progress so nested calls do not pick them up again.
            bookmarkStack.addAll(bookmarks);
            try {
                int end = Math.min(range.getEndOffset(), structure.end);
                Range subrange = new Range(structure.start, end, range) {

                    @Override
                    public String toString() {
                        return "BookmarksSubrange " + super.toString();
                    }
                };
                processBookmarks(wordDocument, block, subrange, currentTableLevel, bookmarks);
            } finally {
                bookmarkStack.removeAll(bookmarks);
            }
        } else if (structure.structure instanceof Field) {
            Field field = (Field) structure.structure;
            processField(doc, range, currentTableLevel, field, block);
        } else if (structure.structure instanceof DeadFieldBoundaries) {
            DeadFieldBoundaries boundaries = (DeadFieldBoundaries) structure.structure;
            processDeadField(wordDocument, block, range, currentTableLevel, boundaries.beginMark, boundaries.separatorMark, boundaries.endMark);
        } else {
            throw new UnsupportedOperationException("NYI: " + structure.structure.getClass());
        }
        // A structure may end beyond this range; never advance past the range end.
        previous = Math.min(range.getEndOffset(), structure.end);
    }
    if (previous != range.getStartOffset()) {
        // At least one structure consumed content; finish with whatever follows the last one.
        if (previous > range.getEndOffset()) {
            logger.log(POILogger.WARN, "Latest structure in ", range, " ended at #" + previous, " after range boundaries [", range.getStartOffset() + "; " + range.getEndOffset(), ")");
            return true;
        }
        if (previous < range.getEndOffset()) {
            Range subrange = new Range(previous, range.getEndOffset(), range) {

                @Override
                public String toString() {
                    return "AfterStructureSubrange " + super.toString();
                }
            };
            processCharacters(wordDocument, currentTableLevel, subrange, block);
        }
        return true;
    }
    for (int c = 0; c < range.numCharacterRuns(); c++) {
        CharacterRun characterRun = range.getCharacterRun(c);
        if (characterRun == null) {
            throw new AssertionError();
        }
        if (doc != null && doc.getPicturesTable().hasPicture(characterRun)) {
            Picture picture = doc.getPicturesTable().extractPicture(characterRun, true);
            processImage(block, characterRun.text().charAt(0) == 0x01, picture);
            continue;
        }
        String text = characterRun.text();
        if (text.isEmpty()) {
            continue;
        }
        if (characterRun.isSpecialCharacter()) {
            if (text.charAt(0) == SPECCHAR_AUTONUMBERED_FOOTNOTE_REFERENCE && doc != null) {
                processNoteAnchor(doc, characterRun, block);
                continue;
            }
            if (text.charAt(0) == SPECCHAR_DRAWN_OBJECT && doc != null) {
                processDrawnObject(doc, characterRun, block);
                continue;
            }
            if (characterRun.isOle2() && doc != null) {
                processOle2(doc, characterRun, block);
                continue;
            }
            if (characterRun.isSymbol() && doc != null) {
                processSymbol(doc, characterRun, block);
                continue;
            }
        }
        if (text.charAt(0) == FIELD_BEGIN_MARK) {
            if (doc != null) {
                Field aliveField = doc.getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN, characterRun.getStartOffset());
                if (aliveField != null) {
                    processField(doc, range, currentTableLevel, aliveField, block);
                    // Skip every run that lies completely inside the processed field.
                    int continueAfter = aliveField.getFieldEndOffset();
                    while (c < range.numCharacterRuns() && range.getCharacterRun(c).getEndOffset() <= continueAfter) {
                        c++;
                    }
                    // Compensate for the loop's own c++ so the first run after the field is not skipped.
                    if (c < range.numCharacterRuns()) {
                        c--;
                    }
                    continue;
                }
            }
            // Continue from the index reported by tryDeadField; the begin-mark run itself is
            // never emitted as text, whether or not a dead field was recognised.
            c = tryDeadField(wordDocument, range, currentTableLevel, c, block);
            continue;
        }
        if (text.charAt(0) == FIELD_SEPARATOR_MARK || text.charAt(0) == FIELD_END_MARK) {
            // shall not appear without FIELD_BEGIN_MARK
            continue;
        }
        if (characterRun.isSpecialCharacter() || characterRun.isObj() || characterRun.isOle2()) {
            continue;
        }
        // Drop the trailing paragraph mark, or the trailing BEL mark unless the table level is the
        // Integer.MIN_VALUE sentinel.
        if (text.endsWith("\r") || (text.charAt(text.length() - 1) == BEL_MARK && currentTableLevel != Integer.MIN_VALUE)) {
            text = text.substring(0, text.length() - 1);
        }
        {
            // line breaks
            StringBuilder stringBuilder = new StringBuilder();
            for (char charChar : text.toCharArray()) {
                if (charChar == 11) {
                    // Flush the text gathered so far, then emit the line break itself.
                    if (stringBuilder.length() > 0) {
                        outputCharacters(block, characterRun, stringBuilder.toString());
                        stringBuilder.setLength(0);
                    }
                    processLineBreak(block, characterRun);
                } else if (charChar == 30) {
                    // Non-breaking hyphens are stored as ASCII 30
                    stringBuilder.append(UNICODECHAR_NONBREAKING_HYPHEN);
                } else if (charChar == 31) {
                    // Non-required hyphens to zero-width space
                    stringBuilder.append(UNICODECHAR_ZERO_WIDTH_SPACE);
                } else if (charChar >= 0x20 || charChar == 0x09 || charChar == 0x0A || charChar == 0x0D) {
                    // Remaining control characters below 0x20 are silently dropped.
                    stringBuilder.append(charChar);
                }
            }
            if (stringBuilder.length() > 0) {
                outputCharacters(block, characterRun, stringBuilder.toString());
                stringBuilder.setLength(0);
            }
        }
        haveAnyText |= text.trim().length() != 0;
    }
    return haveAnyText;
}
Also used : CharacterRun(org.apache.poi.hwpf.usermodel.CharacterRun) Range(org.apache.poi.hwpf.usermodel.Range) LinkedList(java.util.LinkedList) HWPFDocument(org.apache.poi.hwpf.HWPFDocument) Field(org.apache.poi.hwpf.usermodel.Field) Bookmark(org.apache.poi.hwpf.usermodel.Bookmark) Picture(org.apache.poi.hwpf.usermodel.Picture) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) List(java.util.List) HWPFList(org.apache.poi.hwpf.usermodel.HWPFList)

Example 2 with Field

use of org.apache.poi.hwpf.usermodel.Field in project tika by apache.

the class WordExtractor method handleParagraph.

/**
 * Writes one paragraph to {@code xhtml}.
 *
 * <p>A paragraph that opens a table (seen from the top level) causes the whole table to be
 * written as {@code table/tbody/tr/td} elements, with every cell paragraph handled by a
 * recursive call. Any other non-empty paragraph is written as a single element chosen from its
 * style, optionally preceded by its list numbering, followed by the output of each character
 * run (embedded-object placeholders, special runs, floating/inline pictures, plain text).
 *
 * @param p paragraph to write
 * @param parentTableLevel table level of the caller; tables are only expanded when this is 0
 * @param r range that contains {@code p}; used to look up the table and field separator runs
 * @param document document that owns the paragraph
 * @param docPart document part used for field lookups
 * @param pictures source of pictures for floating and inline picture runs
 * @param pictureTable table used to test whether a run carries an inline picture
 * @param listManager provider of formatted list numbers
 * @param xhtml handler that receives the generated elements and text
 * @return {@code t.numParagraphs() - 1} when a table was written (the extra paragraphs the
 *         table consumed), otherwise 0
 * @throws SAXException if the content handler fails
 * @throws IOException declared for the helpers invoked while handling runs
 * @throws TikaException declared for the helpers invoked while handling runs
 */
private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document, FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable, ListManager listManager, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    // Tables are only expanded when entered from the top level (parentTableLevel == 0);
    // we do not recurse into nested tables, so currently we don't
    if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) {
        Table t = r.getTable(p);
        xhtml.startElement("table");
        xhtml.startElement("tbody");
        for (int rn = 0; rn < t.numRows(); rn++) {
            TableRow row = t.getRow(rn);
            xhtml.startElement("tr");
            for (int cn = 0; cn < row.numCells(); cn++) {
                TableCell cell = row.getCell(cn);
                xhtml.startElement("td");
                for (int pn = 0; pn < cell.numParagraphs(); pn++) {
                    Paragraph cellP = cell.getParagraph(pn);
                    handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, listManager, xhtml);
                }
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");
        xhtml.endElement("table");
        // Tell the caller how many further paragraphs belong to the table just written.
        return (t.numParagraphs() - 1);
    }
    String text = p.text();
    if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) {
        // Skip empty paragraphs
        return 0;
    }
    TagAndStyle tas;
    String numbering = null;
    if (document.getStyleSheet().numStyles() > p.getStyleIndex()) {
        StyleDescription style = document.getStyleSheet().getStyleDescription(p.getStyleIndex());
        if (style != null && style.getName() != null && style.getName().length() > 0) {
            if (p.isInList()) {
                numbering = listManager.getFormattedNumber(p);
            }
            tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0));
        } else {
            tas = new TagAndStyle("p", null);
        }
    } else {
        // Style index outside the style sheet: fall back to a plain paragraph.
        tas = new TagAndStyle("p", null);
    }
    if (tas.getStyleClass() != null) {
        xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
    } else {
        xhtml.startElement(tas.getTag());
    }
    if (numbering != null) {
        xhtml.characters(numbering);
    }
    for (int j = 0; j < p.numCharacterRuns(); j++) {
        CharacterRun cr = p.getCharacterRun(j);
        String runText = cr.text();
        // FIELD_BEGIN_MARK (0x13); the emptiness check avoids indexing into an empty run
        if (!runText.isEmpty() && runText.charAt(0) == 0x13) {
            Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset());
            // 58 is an embedded object
            // 56 is a document link
            if (field != null && (field.getType() == 58 || field.getType() == 56)) {
                // Embedded Object: add a <div
                // class="embedded" id="_X"/> so consumer can see where
                // in the main text each embedded document
                // occurred:
                String id = "_unknown_id";
                //this can return null (TIKA-1956)
                CharacterRun mscr = field.getMarkSeparatorCharacterRun(r);
                if (mscr != null) {
                    id = "_" + mscr.getPicOffset();
                }
                AttributesImpl attributes = new AttributesImpl();
                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                attributes.addAttribute("", "id", "id", "CDATA", id);
                xhtml.startElement("div", attributes);
                xhtml.endElement("div");
            }
        }
        if (runText.equals("\u0013")) {
            // A run consisting solely of the field begin mark starts a special run sequence;
            // the helper reports how many following runs it consumed.
            j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml);
        } else if (runText.startsWith("\u0008")) {
            // Floating Picture(s)
            for (int pn = 0; pn < runText.length(); pn++) {
                // Assume they're in the order from the unclaimed list...
                Picture picture = pictures.nextUnclaimed();
                // Output
                handlePictureCharacterRun(cr, picture, pictures, xhtml);
            }
        } else if (pictureTable.hasPicture(cr)) {
            // Inline Picture
            Picture picture = pictures.getFor(cr);
            handlePictureCharacterRun(cr, picture, pictures, xhtml);
        } else {
            handleCharacterRun(cr, tas.isHeading(), xhtml);
        }
    }
    // Close any still open style tags
    if (curStrikeThrough) {
        xhtml.endElement("s");
        curStrikeThrough = false;
    }
    if (curItalic) {
        xhtml.endElement("i");
        curItalic = false;
    }
    if (curBold) {
        xhtml.endElement("b");
        curBold = false;
    }
    xhtml.endElement(tas.getTag());
    return 0;
}
Also used : Field(org.apache.poi.hwpf.usermodel.Field) PicturesTable(org.apache.poi.hwpf.model.PicturesTable) SavedByTable(org.apache.poi.hwpf.model.SavedByTable) Table(org.apache.poi.hwpf.usermodel.Table) TableCell(org.apache.poi.hwpf.usermodel.TableCell) AttributesImpl(org.xml.sax.helpers.AttributesImpl) Picture(org.apache.poi.hwpf.usermodel.Picture) TableRow(org.apache.poi.hwpf.usermodel.TableRow) CharacterRun(org.apache.poi.hwpf.usermodel.CharacterRun) StyleDescription(org.apache.poi.hwpf.model.StyleDescription) Paragraph(org.apache.poi.hwpf.usermodel.Paragraph)

Aggregations

CharacterRun (org.apache.poi.hwpf.usermodel.CharacterRun)2 Field (org.apache.poi.hwpf.usermodel.Field)2 Picture (org.apache.poi.hwpf.usermodel.Picture)2 ArrayList (java.util.ArrayList)1 LinkedList (java.util.LinkedList)1 List (java.util.List)1 HWPFDocument (org.apache.poi.hwpf.HWPFDocument)1 PicturesTable (org.apache.poi.hwpf.model.PicturesTable)1 SavedByTable (org.apache.poi.hwpf.model.SavedByTable)1 StyleDescription (org.apache.poi.hwpf.model.StyleDescription)1 Bookmark (org.apache.poi.hwpf.usermodel.Bookmark)1 HWPFList (org.apache.poi.hwpf.usermodel.HWPFList)1 Paragraph (org.apache.poi.hwpf.usermodel.Paragraph)1 Range (org.apache.poi.hwpf.usermodel.Range)1 Table (org.apache.poi.hwpf.usermodel.Table)1 TableCell (org.apache.poi.hwpf.usermodel.TableCell)1 TableRow (org.apache.poi.hwpf.usermodel.TableRow)1 AttributesImpl (org.xml.sax.helpers.AttributesImpl)1