Search in sources :

Example 6 with CharacterRun

use of org.apache.poi.hwpf.usermodel.CharacterRun in project poi by apache.

the class QuickTest method main.

/**
 * Prints the text of every character run of the Word document named by the first
 * command-line argument, section by section and paragraph by paragraph, writing a
 * line break after each paragraph.
 *
 * @param args command-line arguments; {@code args[0]} is the path of the document to read
 * @throws IOException if the file cannot be opened, read, or closed
 * @throws IllegalArgumentException if no document path was supplied
 */
public static void main(String[] args) throws IOException {
    if (args.length < 1) {
        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
        throw new IllegalArgumentException("Usage: QuickTest <path-to-word-document>");
    }
    // try-with-resources releases both the stream and the document even when
    // reading fails part-way through; the original leaked both on an exception
    // and never closed the stream at all.
    try (FileInputStream in = new FileInputStream(args[0]);
         HWPFDocument doc = new HWPFDocument(in)) {
        Range r = doc.getRange();
        System.out.println("Example you supplied:");
        System.out.println("---------------------");
        for (int x = 0; x < r.numSections(); x++) {
            Section s = r.getSection(x);
            for (int y = 0; y < s.numParagraphs(); y++) {
                Paragraph p = s.getParagraph(y);
                for (int z = 0; z < p.numCharacterRuns(); z++) {
                    // Each run is printed without a separator so the paragraph reads as one line.
                    CharacterRun run = p.getCharacterRun(z);
                    System.out.print(run.text());
                }
                // use a new line at the paragraph break
                System.out.println();
            }
        }
    }
}
Also used : CharacterRun(org.apache.poi.hwpf.usermodel.CharacterRun) Range(org.apache.poi.hwpf.usermodel.Range) Section(org.apache.poi.hwpf.usermodel.Section) FileInputStream(java.io.FileInputStream) Paragraph(org.apache.poi.hwpf.usermodel.Paragraph)

Example 7 with CharacterRun

use of org.apache.poi.hwpf.usermodel.CharacterRun in project tika by apache.

the class WordExtractor method handleParagraph.

/**
 * Writes one paragraph to the XHTML output.
 *
 * <p>If the paragraph opens a table (seen from table level 0), the whole table is
 * emitted as {@code table/tbody/tr/td} and each cell paragraph is handled
 * recursively. Otherwise the paragraph is emitted as a single element whose tag and
 * optional {@code class} come from the paragraph's style, followed by its list
 * numbering (if any) and its character runs. Paragraphs containing only whitespace
 * are skipped.
 *
 * @param p the paragraph to output
 * @param parentTableLevel table level of the caller; 0 when not inside a table
 * @param r the range the paragraph belongs to (used to look up its table and field separators)
 * @param document the document being extracted
 * @param docPart the document part used when looking up fields by offset
 * @param pictures source of inline and unclaimed (floating) pictures
 * @param pictureTable used to test whether a character run carries a picture
 * @param listManager produces the formatted list number for list paragraphs
 * @param xhtml the output handler
 * @return {@code t.numParagraphs() - 1} when a table was emitted, otherwise 0
 *         (presumably the number of following paragraphs the caller should skip; confirm against caller)
 */
private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document, FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable, ListManager listManager, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    // A table is only started from the top level (parentTableLevel == 0): this code
    // does not recurse into nested tables, so currently we don't start a new table
    // for a deeper table level.
    if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) {
        Table t = r.getTable(p);
        xhtml.startElement("table");
        xhtml.startElement("tbody");
        for (int rn = 0; rn < t.numRows(); rn++) {
            TableRow row = t.getRow(rn);
            xhtml.startElement("tr");
            for (int cn = 0; cn < row.numCells(); cn++) {
                TableCell cell = row.getCell(cn);
                xhtml.startElement("td");
                for (int pn = 0; pn < cell.numParagraphs(); pn++) {
                    Paragraph cellP = cell.getParagraph(pn);
                    handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, listManager, xhtml);
                }
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");
        xhtml.endElement("table");
        return (t.numParagraphs() - 1);
    }
    String text = p.text();
    if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) {
        // Skip empty paragraphs
        return 0;
    }
    // Pick the output tag/class from the paragraph style; fall back to a plain <p>
    // when the style index is out of range or the style has no usable name.
    TagAndStyle tas;
    String numbering = null;
    if (document.getStyleSheet().numStyles() > p.getStyleIndex()) {
        StyleDescription style = document.getStyleSheet().getStyleDescription(p.getStyleIndex());
        if (style != null && style.getName() != null && style.getName().length() > 0) {
            if (p.isInList()) {
                numbering = listManager.getFormattedNumber(p);
            }
            tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0));
        } else {
            tas = new TagAndStyle("p", null);
        }
    } else {
        tas = new TagAndStyle("p", null);
    }
    if (tas.getStyleClass() != null) {
        xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
    } else {
        xhtml.startElement(tas.getTag());
    }
    if (numbering != null) {
        xhtml.characters(numbering);
    }
    for (int j = 0; j < p.numCharacterRuns(); j++) {
        CharacterRun cr = p.getCharacterRun(j);
        String runText = cr.text();
        // FIELD_BEGIN_MARK (0x13). Guard against an empty run, which would otherwise
        // throw when indexing its first byte.
        if (!runText.isEmpty() && runText.charAt(0) == 0x13) {
            Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset());
            // 58 is presumably an embedded object (TODO confirm against the field type table)
            // 56 is a document link
            if (field != null && (field.getType() == 58 || field.getType() == 56)) {
                // Embedded Object: add a <div
                // class="embedded" id="_X"/> so consumer can see where
                // in the main text each embedded document
                // occurred:
                String id = "_unknown_id";
                //this can return null (TIKA-1956)
                CharacterRun mscr = field.getMarkSeparatorCharacterRun(r);
                if (mscr != null) {
                    id = "_" + mscr.getPicOffset();
                }
                AttributesImpl attributes = new AttributesImpl();
                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                attributes.addAttribute("", "id", "id", "CDATA", id);
                xhtml.startElement("div", attributes);
                xhtml.endElement("div");
            }
        }
        // The control-character literals below are written as escapes; as empty
        // strings, startsWith("") is always true and the later branches are dead.
        if (runText.equals("\u0013")) {
            // Field begin mark: hand the whole field over and skip the runs it consumed.
            j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml);
        } else if (runText.startsWith("\u0008")) {
            // Floating Picture(s): one picture per character in the run
            for (int pn = 0; pn < runText.length(); pn++) {
                // Assume they're in the order from the unclaimed list...
                Picture picture = pictures.nextUnclaimed();
                // Output
                handlePictureCharacterRun(cr, picture, pictures, xhtml);
            }
        } else if (pictureTable.hasPicture(cr)) {
            // Inline Picture
            Picture picture = pictures.getFor(cr);
            handlePictureCharacterRun(cr, picture, pictures, xhtml);
        } else {
            handleCharacterRun(cr, tas.isHeading(), xhtml);
        }
    }
    // Close any still open style tags
    if (curStrikeThrough) {
        xhtml.endElement("s");
        curStrikeThrough = false;
    }
    if (curItalic) {
        xhtml.endElement("i");
        curItalic = false;
    }
    if (curBold) {
        xhtml.endElement("b");
        curBold = false;
    }
    xhtml.endElement(tas.getTag());
    return 0;
}
Also used : Field(org.apache.poi.hwpf.usermodel.Field) PicturesTable(org.apache.poi.hwpf.model.PicturesTable) SavedByTable(org.apache.poi.hwpf.model.SavedByTable) Table(org.apache.poi.hwpf.usermodel.Table) TableCell(org.apache.poi.hwpf.usermodel.TableCell) AttributesImpl(org.xml.sax.helpers.AttributesImpl) Picture(org.apache.poi.hwpf.usermodel.Picture) TableRow(org.apache.poi.hwpf.usermodel.TableRow) CharacterRun(org.apache.poi.hwpf.usermodel.CharacterRun) StyleDescription(org.apache.poi.hwpf.model.StyleDescription) Paragraph(org.apache.poi.hwpf.usermodel.Paragraph)

Example 8 with CharacterRun

use of org.apache.poi.hwpf.usermodel.CharacterRun in project tika by apache.

the class WordExtractor method handleSpecialCharacterRuns.

/**
 * Handles a field that starts at the character run {@code index}.
 *
 * <p>The runs can be 0x13..text..0x15 or 0x13..control..0x14..text..0x15 .
 * Nesting is allowed. Runs before the 0x14 separator are collected as control
 * runs and runs after it as text runs; when there is no separator, everything up
 * to the 0x15 end mark is treated as text. A control string starting with
 * {@code HYPERLINK} and containing a quoted target is emitted as an
 * {@code <a href="...">} element wrapping the text runs; any other field just
 * outputs its text runs (as pictures where the run has one).
 *
 * @param p the paragraph containing the field
 * @param index index of the run holding the field begin mark
 * @param skipStyling passed through to the character-run and style-closing helpers
 * @param pictures source used to resolve pictures attached to text runs
 * @param xhtml the output handler
 * @return how many character runs the caller should skip over
 */
private int handleSpecialCharacterRuns(Paragraph p, int index, boolean skipStyling, PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException, TikaException, IOException {
    List<CharacterRun> controls = new ArrayList<CharacterRun>();
    List<CharacterRun> texts = new ArrayList<CharacterRun>();
    boolean has14 = false;
    // Split it into before and after the 14
    // The three marks are written as escapes; as empty strings only the first
    // branch could ever match and the separator/end branches were unreachable.
    int i;
    for (i = index + 1; i < p.numCharacterRuns(); i++) {
        CharacterRun cr = p.getCharacterRun(i);
        String runText = cr.text();
        if (runText.equals("\u0013")) {
            // Nested, oh joy...
            // NOTE(review): the nested call is given i + 1 although the begin mark is
            // at i, while the top-level caller passes the index of the mark itself.
            // This looks like it may be off by one; kept as-is, confirm before changing.
            int increment = handleSpecialCharacterRuns(p, i + 1, skipStyling, pictures, xhtml);
            i += increment;
        } else if (runText.equals("\u0014")) {
            has14 = true;
        } else if (runText.equals("\u0015")) {
            if (!has14) {
                // No separator seen: what we collected was text, not control data.
                texts = controls;
                controls = new ArrayList<CharacterRun>();
            }
            break;
        } else {
            if (has14) {
                texts.add(cr);
            } else {
                controls.add(cr);
            }
        }
    }
    // Do we need to do something special with this?
    if (!controls.isEmpty()) {
        // Join the control runs into the field instruction string.
        StringBuilder instruction = new StringBuilder();
        for (CharacterRun control : controls) {
            instruction.append(control.text());
        }
        String text = instruction.toString();
        if ((text.startsWith("HYPERLINK") || text.startsWith(" HYPERLINK")) && text.indexOf('"') > -1) {
            // The link target starts just after the first double quote.
            int start = text.indexOf('"') + 1;
            int end = findHyperlinkEnd(text, start);
            String url = "";
            if (start >= 0 && start < end && end <= text.length()) {
                url = text.substring(start, end);
            }
            xhtml.startElement("a", "href", url);
            closeStyleElements(skipStyling, xhtml);
            for (CharacterRun cr : texts) {
                handleCharacterRun(cr, skipStyling, xhtml);
            }
            closeStyleElements(skipStyling, xhtml);
            xhtml.endElement("a");
        } else {
            // Just output the text ones
            for (CharacterRun cr : texts) {
                if (pictures.hasPicture(cr)) {
                    Picture picture = pictures.getFor(cr);
                    handlePictureCharacterRun(cr, picture, pictures, xhtml);
                } else {
                    handleCharacterRun(cr, skipStyling, xhtml);
                }
            }
        }
    } else {
        // Output as-is
        for (CharacterRun cr : texts) {
            handleCharacterRun(cr, skipStyling, xhtml);
        }
    }
    // Tell them how many to skip over
    return i - index;
}
Also used : Picture(org.apache.poi.hwpf.usermodel.Picture) CharacterRun(org.apache.poi.hwpf.usermodel.CharacterRun) ArrayList(java.util.ArrayList)

Aggregations

CharacterRun (org.apache.poi.hwpf.usermodel.CharacterRun)8 Picture (org.apache.poi.hwpf.usermodel.Picture)4 Range (org.apache.poi.hwpf.usermodel.Range)4 ArrayList (java.util.ArrayList)3 Paragraph (org.apache.poi.hwpf.usermodel.Paragraph)3 HWPFDocument (org.apache.poi.hwpf.HWPFDocument)2 Field (org.apache.poi.hwpf.usermodel.Field)2 FileInputStream (java.io.FileInputStream)1 LinkedList (java.util.LinkedList)1 List (java.util.List)1 Triplet (org.apache.poi.hwpf.converter.FontReplacer.Triplet)1 PicturesTable (org.apache.poi.hwpf.model.PicturesTable)1 SavedByTable (org.apache.poi.hwpf.model.SavedByTable)1 StyleDescription (org.apache.poi.hwpf.model.StyleDescription)1 Bookmark (org.apache.poi.hwpf.usermodel.Bookmark)1 HWPFList (org.apache.poi.hwpf.usermodel.HWPFList)1 Section (org.apache.poi.hwpf.usermodel.Section)1 Table (org.apache.poi.hwpf.usermodel.Table)1 TableCell (org.apache.poi.hwpf.usermodel.TableCell)1 TableRow (org.apache.poi.hwpf.usermodel.TableRow)1