Search in sources :

Example 6 with Picture

use of org.apache.poi.hwpf.usermodel.Picture in project poi by apache.

the class PicturesTable method getAllPictures.

/**
 * Collects the pictures of the current document.
 * <p>
 * Not all documents keep all of their images concatenated in the data
 * stream (although MS claims so), which is why every character run of the
 * overall range is inspected first. Afterwards the escher records are
 * searched as well and whatever is found there is appended to the result.
 *
 * @return a list of Picture objects found in current document
 */
public List<Picture> getAllPictures() {
    List<Picture> found = new ArrayList<Picture>();
    Range overall = _document.getOverallRange();

    // Pass 1: pictures attached to character runs.
    for (int runIndex = 0; runIndex < overall.numCharacterRuns(); runIndex++) {
        CharacterRun current = overall.getCharacterRun(runIndex);
        // A missing run simply yields no picture.
        Picture extracted = (current == null) ? null : extractPicture(current, false);
        if (extracted != null) {
            found.add(extracted);
        }
    }

    // Pass 2: pictures referenced from the escher records.
    searchForPictures(_dgg.getEscherRecords(), found);
    return found;
}
Also used : Picture(org.apache.poi.hwpf.usermodel.Picture) CharacterRun(org.apache.poi.hwpf.usermodel.CharacterRun) ArrayList(java.util.ArrayList) Range(org.apache.poi.hwpf.usermodel.Range)

Example 7 with Picture

use of org.apache.poi.hwpf.usermodel.Picture in project poi by apache.

the class PicturesTable method searchForPictures.

/**
 * Recursively walks the given escher records and appends a picture for every
 * BSE record whose blip can be obtained.
 * <p>
 * A BSE record either carries its blip directly, or (when its offset is
 * positive) the blip is read from the main stream at that offset. Failures
 * while reading from the main stream are logged as warnings and do not
 * interrupt the search. Child records are visited after their parent.
 *
 * @param escherRecords the escher records.
 * @param pictures the list to populate with the pictures.
 */
private void searchForPictures(List<EscherRecord> escherRecords, List<Picture> pictures) {
    for (EscherRecord current : escherRecords) {
        if (current instanceof EscherBSERecord) {
            EscherBSERecord bseRecord = (EscherBSERecord) current;
            EscherBlipRecord inlineBlip = bseRecord.getBlipRecord();
            if (inlineBlip == null) {
                if (bseRecord.getOffset() > 0) {
                    // The blip lives in the delay stream, which for a word
                    // document is the main stream.
                    try {
                        EscherRecordFactory factory = new DefaultEscherRecordFactory();
                        EscherRecord delayed = factory.createRecord(_mainStream, bseRecord.getOffset());
                        if (delayed instanceof EscherBlipRecord) {
                            delayed.fillFields(_mainStream, bseRecord.getOffset(), factory);
                            pictures.add(new Picture((EscherBlipRecord) delayed));
                        }
                    } catch (Exception failure) {
                        logger.log(POILogger.WARN, "Unable to load picture from BLIB record at offset #", Integer.valueOf(bseRecord.getOffset()), failure);
                    }
                }
            } else {
                pictures.add(new Picture(inlineBlip));
            }
        }
        // Descend into the children of every record, BSE or not.
        searchForPictures(current.getChildRecords(), pictures);
    }
}
Also used : Picture(org.apache.poi.hwpf.usermodel.Picture) EscherRecord(org.apache.poi.ddf.EscherRecord) DefaultEscherRecordFactory(org.apache.poi.ddf.DefaultEscherRecordFactory) EscherBSERecord(org.apache.poi.ddf.EscherBSERecord) EscherBlipRecord(org.apache.poi.ddf.EscherBlipRecord) EscherRecordFactory(org.apache.poi.ddf.EscherRecordFactory)

Example 8 with Picture

use of org.apache.poi.hwpf.usermodel.Picture in project tika by apache.

the class WordExtractor method handleParagraph.

/**
 * Emits XHTML for a single paragraph, or for the whole table the paragraph
 * starts when it is the first paragraph of a top-level table.
 *
 * @param p                the paragraph to output
 * @param parentTableLevel table level of the caller; 0 when called for the
 *                         main text, the cell's table level when called for
 *                         a paragraph inside a table cell
 * @param r                the range containing the paragraph (used to look
 *                         up tables and field separator runs)
 * @param document         the document being parsed
 * @param docPart          the document part used for field lookups
 * @param pictures         source of pictures, both claimed and unclaimed
 * @param pictureTable     used to test whether a run holds an inline picture
 * @param listManager      provides formatted list numbering
 * @param xhtml            the handler receiving the output
 * @return the number of additional paragraphs consumed (non-zero only when a
 *         table was output), which the caller adds to its paragraph index
 */
private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document, FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable, ListManager listManager, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    // Only tables encountered from the top level (parentTableLevel == 0) are
    //  expanded here; nested tables are deliberately not recursed into, so
    //  their paragraphs are output as plain paragraphs of the enclosing cell.
    if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) {
        Table t = r.getTable(p);
        xhtml.startElement("table");
        xhtml.startElement("tbody");
        for (int rn = 0; rn < t.numRows(); rn++) {
            TableRow row = t.getRow(rn);
            xhtml.startElement("tr");
            for (int cn = 0; cn < row.numCells(); cn++) {
                TableCell cell = row.getCell(cn);
                xhtml.startElement("td");
                for (int pn = 0; pn < cell.numParagraphs(); pn++) {
                    Paragraph cellP = cell.getParagraph(pn);
                    handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, listManager, xhtml);
                }
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");
        xhtml.endElement("table");
        // The table's remaining paragraphs were handled above; tell the
        // caller to skip them.
        return (t.numParagraphs() - 1);
    }
    String text = p.text();
    if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) {
        // Skip empty paragraphs
        return 0;
    }
    TagAndStyle tas;
    String numbering = null;
    if (document.getStyleSheet().numStyles() > p.getStyleIndex()) {
        StyleDescription style = document.getStyleSheet().getStyleDescription(p.getStyleIndex());
        if (style != null && style.getName() != null && style.getName().length() > 0) {
            if (p.isInList()) {
                numbering = listManager.getFormattedNumber(p);
            }
            tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0));
        } else {
            tas = new TagAndStyle("p", null);
        }
    } else {
        // Style index is out of range for the style sheet: plain paragraph.
        tas = new TagAndStyle("p", null);
    }
    if (tas.getStyleClass() != null) {
        xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
    } else {
        xhtml.startElement(tas.getTag());
    }
    if (numbering != null) {
        xhtml.characters(numbering);
    }
    for (int j = 0; j < p.numCharacterRuns(); j++) {
        CharacterRun cr = p.getCharacterRun(j);
        String crText = cr.text();
        // FIELD_BEGIN_MARK (U+0013). The emptiness guard avoids indexing
        // into an empty run.
        if (!crText.isEmpty() && crText.charAt(0) == '\u0013') {
            Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset());
            // Field types 58 and 56 are treated as embedded objects;
            // 56 is a document link
            if (field != null && (field.getType() == 58 || field.getType() == 56)) {
                // Embedded Object: add a <div
                // class="embedded" id="_X"/> so consumer can see where
                // in the main text each embedded document
                // occurred:
                String id = "_unknown_id";
                //this can return null (TIKA-1956)
                CharacterRun mscr = field.getMarkSeparatorCharacterRun(r);
                if (mscr != null) {
                    id = "_" + mscr.getPicOffset();
                }
                AttributesImpl attributes = new AttributesImpl();
                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                attributes.addAttribute("", "id", "id", "CDATA", id);
                xhtml.startElement("div", attributes);
                xhtml.endElement("div");
            }
        }
        // Control characters are written as unicode escapes so they cannot
        // be lost or mangled by editors and tooling.
        if (crText.equals("\u0013")) {
            // Start of a field: the helper consumes the field's runs and
            // reports how many to skip.
            j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml);
        } else if (crText.startsWith("\u0008")) {
            // Floating Picture(s), one marker character per picture
            for (int pn = 0; pn < crText.length(); pn++) {
                // Assume they're in the order from the unclaimed list...
                Picture picture = pictures.nextUnclaimed();
                // Output
                handlePictureCharacterRun(cr, picture, pictures, xhtml);
            }
        } else if (pictureTable.hasPicture(cr)) {
            // Inline Picture
            Picture picture = pictures.getFor(cr);
            handlePictureCharacterRun(cr, picture, pictures, xhtml);
        } else {
            handleCharacterRun(cr, tas.isHeading(), xhtml);
        }
    }
    // Close any still open style tags
    if (curStrikeThrough) {
        xhtml.endElement("s");
        curStrikeThrough = false;
    }
    if (curItalic) {
        xhtml.endElement("i");
        curItalic = false;
    }
    if (curBold) {
        xhtml.endElement("b");
        curBold = false;
    }
    xhtml.endElement(tas.getTag());
    return 0;
}
Also used : Field(org.apache.poi.hwpf.usermodel.Field) PicturesTable(org.apache.poi.hwpf.model.PicturesTable) SavedByTable(org.apache.poi.hwpf.model.SavedByTable) Table(org.apache.poi.hwpf.usermodel.Table) TableCell(org.apache.poi.hwpf.usermodel.TableCell) AttributesImpl(org.xml.sax.helpers.AttributesImpl) Picture(org.apache.poi.hwpf.usermodel.Picture) TableRow(org.apache.poi.hwpf.usermodel.TableRow) CharacterRun(org.apache.poi.hwpf.usermodel.CharacterRun) StyleDescription(org.apache.poi.hwpf.model.StyleDescription) Paragraph(org.apache.poi.hwpf.usermodel.Paragraph)

Example 9 with Picture

use of org.apache.poi.hwpf.usermodel.Picture in project tika by apache.

the class WordExtractor method parse.

/**
 * Parses a Word document stored under the given directory node and writes
 * its content to the XHTML handler.
 * <p>
 * Output order: headers, main paragraphs, textbox text (only when shape
 * based content is enabled), footnotes, comments, endnotes, footers, any
 * pictures not yet output, and finally embedded office documents found
 * under the "ObjectPool" entry. Old-format documents are delegated to
 * {@code parseWord6}.
 *
 * @param root  the directory node holding the document
 * @param xhtml the handler receiving the output
 * @throws EncryptedDocumentException if POI reports the document as encrypted
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    HWPFDocument document;
    try {
        document = new HWPFDocument(root);
    } catch (org.apache.poi.EncryptedDocumentException encrypted) {
        // Translate POI's exception into Tika's own type.
        throw new EncryptedDocumentException(encrypted);
    } catch (OldWordFileFormatException oldFormat) {
        parseWord6(root, xhtml);
        return;
    }

    extractSavedByMetadata(document);

    org.apache.poi.hwpf.extractor.WordExtractor poiExtractor = new org.apache.poi.hwpf.extractor.WordExtractor(document);
    HeaderStories stories = new HeaderStories(document);

    // Grab the list of pictures. As far as we can tell,
    //  the pictures should be in order, and may be directly
    //  placed or referenced from an anchor
    PicturesTable pictureTable = document.getPicturesTable();
    PicturesSource pictures = new PicturesSource(document);

    // Headers first, if present
    Range[] headerRanges = new Range[] {
        stories.getFirstHeaderSubrange(),
        stories.getEvenHeaderSubrange(),
        stories.getOddHeaderSubrange()
    };
    handleHeaderFooter(headerRanges, "header", document, pictures, pictureTable, xhtml);

    // Main paragraph text. handleParagraph reports how many extra
    // paragraphs it consumed, so those are skipped here.
    Range mainRange = document.getRange();
    ListManager listManager = new ListManager(document);
    int paragraphIndex = 0;
    while (paragraphIndex < mainRange.numParagraphs()) {
        Paragraph current = mainRange.getParagraph(paragraphIndex);
        int consumed = handleParagraph(current, 0, mainRange, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager, xhtml);
        paragraphIndex += consumed + 1;
    }

    // Everything else
    if (officeParserConfig.getIncludeShapeBasedContent()) {
        for (String textboxText : poiExtractor.getMainTextboxText()) {
            xhtml.element("p", textboxText);
        }
    }
    for (String footnote : poiExtractor.getFootnoteText()) {
        xhtml.element("p", footnote);
    }
    for (String comment : poiExtractor.getCommentsText()) {
        xhtml.element("p", comment);
    }
    for (String endnote : poiExtractor.getEndnoteText()) {
        xhtml.element("p", endnote);
    }

    // Footers, if present
    Range[] footerRanges = new Range[] {
        stories.getFirstFooterSubrange(),
        stories.getEvenFooterSubrange(),
        stories.getOddFooterSubrange()
    };
    handleHeaderFooter(footerRanges, "footer", document, pictures, pictureTable, xhtml);

    // Output any pictures nobody claimed so far
    Picture leftover = pictures.nextUnclaimed();
    while (leftover != null) {
        handlePictureCharacterRun(null, leftover, pictures, xhtml);
        leftover = pictures.nextUnclaimed();
    }

    // Handle any embedded office documents
    try {
        DirectoryEntry objectPool = (DirectoryEntry) root.getEntry("ObjectPool");
        for (Entry child : objectPool) {
            if (child.getName().startsWith("_") && child instanceof DirectoryEntry) {
                handleEmbeddedOfficeDoc((DirectoryEntry) child, xhtml);
            }
        }
    } catch (FileNotFoundException ignored) {
        // Deliberately ignored: presumably the document has no "ObjectPool"
        // entry, in which case there is nothing embedded to handle.
    }
}
Also used : EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) FileNotFoundException(java.io.FileNotFoundException) PicturesTable(org.apache.poi.hwpf.model.PicturesTable) Range(org.apache.poi.hwpf.usermodel.Range) DirectoryEntry(org.apache.poi.poifs.filesystem.DirectoryEntry) Paragraph(org.apache.poi.hwpf.usermodel.Paragraph) HWPFDocument(org.apache.poi.hwpf.HWPFDocument) HeaderStories(org.apache.poi.hwpf.usermodel.HeaderStories) Entry(org.apache.poi.poifs.filesystem.Entry) SavedByEntry(org.apache.poi.hwpf.model.SavedByEntry) Picture(org.apache.poi.hwpf.usermodel.Picture) OldWordFileFormatException(org.apache.poi.hwpf.OldWordFileFormatException)

Example 10 with Picture

use of org.apache.poi.hwpf.usermodel.Picture in project tika by apache.

the class WordExtractor method handleSpecialCharacterRuns.

/**
 * Handles the character runs of a field, starting after the field-begin
 * run at {@code index}.
 * <p>
 * Can be \13..text..\15 or \13..control..\14..text..\15 .
 * Nesting is allowed. Runs before the separator (0x14) are control runs,
 * runs after it are text runs; when there is no separator, all runs up to
 * the end mark (0x15) are treated as text. A control string starting with
 * HYPERLINK and containing a quote is output as an {@code <a href>} element
 * wrapping the text runs; otherwise the text runs are output directly.
 *
 * @param p           the paragraph holding the runs
 * @param index       index of the field-begin run
 * @param skipStyling passed through to the character run and style handlers
 * @param pictures    source used to resolve pictures attached to text runs
 * @param xhtml       the handler receiving the output
 * @return the number of runs the caller should skip over
 */
private int handleSpecialCharacterRuns(Paragraph p, int index, boolean skipStyling, PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException, TikaException, IOException {
    List<CharacterRun> controls = new ArrayList<CharacterRun>();
    List<CharacterRun> texts = new ArrayList<CharacterRun>();
    boolean has14 = false;
    // Split it into before and after the 14.
    // The field marks are written as unicode escapes so the control
    // characters cannot be lost or mangled by editors and tooling.
    int i;
    for (i = index + 1; i < p.numCharacterRuns(); i++) {
        CharacterRun cr = p.getCharacterRun(i);
        String crText = cr.text();
        if (crText.equals("\u0013")) {
            // Nested, oh joy...
            int increment = handleSpecialCharacterRuns(p, i + 1, skipStyling, pictures, xhtml);
            i += increment;
        } else if (crText.equals("\u0014")) {
            // Field separator: everything from here on is text
            has14 = true;
        } else if (crText.equals("\u0015")) {
            // Field end. Without a separator, what was collected as
            // controls is really the text.
            if (!has14) {
                texts = controls;
                controls = new ArrayList<CharacterRun>();
            }
            break;
        } else {
            if (has14) {
                texts.add(cr);
            } else {
                controls.add(cr);
            }
        }
    }
    // Do we need to do something special with this?
    if (!controls.isEmpty()) {
        StringBuilder controlText = new StringBuilder();
        for (CharacterRun control : controls) {
            controlText.append(control.text());
        }
        String text = controlText.toString();
        if ((text.startsWith("HYPERLINK") || text.startsWith(" HYPERLINK")) && text.indexOf('"') > -1) {
            int start = text.indexOf('"') + 1;
            int end = findHyperlinkEnd(text, start);
            // Fall back to an empty href when the bounds are not usable
            String url = "";
            if (start >= 0 && start < end && end <= text.length()) {
                url = text.substring(start, end);
            }
            xhtml.startElement("a", "href", url);
            closeStyleElements(skipStyling, xhtml);
            for (CharacterRun cr : texts) {
                handleCharacterRun(cr, skipStyling, xhtml);
            }
            closeStyleElements(skipStyling, xhtml);
            xhtml.endElement("a");
        } else {
            // Just output the text ones
            for (CharacterRun cr : texts) {
                if (pictures.hasPicture(cr)) {
                    Picture picture = pictures.getFor(cr);
                    handlePictureCharacterRun(cr, picture, pictures, xhtml);
                } else {
                    handleCharacterRun(cr, skipStyling, xhtml);
                }
            }
        }
    } else {
        // Output as-is
        for (CharacterRun cr : texts) {
            handleCharacterRun(cr, skipStyling, xhtml);
        }
    }
    // Tell them how many to skip over
    return i - index;
}
Also used : Picture(org.apache.poi.hwpf.usermodel.Picture) CharacterRun(org.apache.poi.hwpf.usermodel.CharacterRun) ArrayList(java.util.ArrayList)

Aggregations

Picture (org.apache.poi.hwpf.usermodel.Picture)13 PicturesTable (org.apache.poi.hwpf.model.PicturesTable)6 HWPFDocument (org.apache.poi.hwpf.HWPFDocument)4 CharacterRun (org.apache.poi.hwpf.usermodel.CharacterRun)4 ArrayList (java.util.ArrayList)3 Range (org.apache.poi.hwpf.usermodel.Range)3 FileNotFoundException (java.io.FileNotFoundException)2 List (java.util.List)2 Transformer (javax.xml.transform.Transformer)2 TransformerFactory (javax.xml.transform.TransformerFactory)2 DOMSource (javax.xml.transform.dom.DOMSource)2 StreamResult (javax.xml.transform.stream.StreamResult)2 PicturesManager (org.apache.poi.hwpf.converter.PicturesManager)2 WordToHtmlConverter (org.apache.poi.hwpf.converter.WordToHtmlConverter)2 Field (org.apache.poi.hwpf.usermodel.Field)2 Paragraph (org.apache.poi.hwpf.usermodel.Paragraph)2 PictureType (org.apache.poi.hwpf.usermodel.PictureType)2 Document (org.w3c.dom.Document)2 BufferedImage (java.awt.image.BufferedImage)1 ByteArrayInputStream (java.io.ByteArrayInputStream)1