Search in sources :

Example 1 with HeaderStories

use of org.apache.poi.hwpf.usermodel.HeaderStories in project poi by apache.

In the class WordExtractor, the method getText.

/**
 * Extracts the document text via {@link WordToTextConverter}.
 * <p>
 * Parts are fed to the converter in this order: first/even/odd headers
 * (each only when its subrange is non-null), the document itself, the main
 * text box range, and finally the first/even/odd footers (again only when
 * non-null). The result shouldn't include any crud, but this is slower
 * than {@code getTextFromPieces()}.
 *
 * @return the text accumulated by the converter
 * @throws RuntimeException rethrown unchanged if one is raised during
 *         conversion; any other exception is wrapped in a new
 *         {@code RuntimeException} carrying it as the cause
 */
public String getText() {
    try {
        WordToTextConverter converter = new WordToTextConverter();
        HeaderStories stories = new HeaderStories(doc);

        // Headers: first, even, odd -- skipping any that are absent.
        if (stories.getFirstHeaderSubrange() != null) {
            converter.processDocumentPart(doc, stories.getFirstHeaderSubrange());
        }
        if (stories.getEvenHeaderSubrange() != null) {
            converter.processDocumentPart(doc, stories.getEvenHeaderSubrange());
        }
        if (stories.getOddHeaderSubrange() != null) {
            converter.processDocumentPart(doc, stories.getOddHeaderSubrange());
        }

        // The document itself, followed by the main text box range.
        converter.processDocument(doc);
        converter.processDocumentPart(doc, doc.getMainTextboxRange());

        // Footers: first, even, odd -- skipping any that are absent.
        if (stories.getFirstFooterSubrange() != null) {
            converter.processDocumentPart(doc, stories.getFirstFooterSubrange());
        }
        if (stories.getEvenFooterSubrange() != null) {
            converter.processDocumentPart(doc, stories.getEvenFooterSubrange());
        }
        if (stories.getOddFooterSubrange() != null) {
            converter.processDocumentPart(doc, stories.getOddFooterSubrange());
        }

        return converter.getText();
    } catch (RuntimeException unchecked) {
        // Already unchecked: propagate as-is rather than double-wrapping.
        throw unchecked;
    } catch (Exception checked) {
        throw new RuntimeException(checked);
    }
}
Also used : HeaderStories(org.apache.poi.hwpf.usermodel.HeaderStories) WordToTextConverter(org.apache.poi.hwpf.converter.WordToTextConverter) IOException(java.io.IOException)

Example 2 with HeaderStories

use of org.apache.poi.hwpf.usermodel.HeaderStories in project poi by apache.

In the class WordExtractor, the method getHeaderText.

/**
 * Grabs the text from the headers.
 * <p>
 * The first, even and odd headers are visited in that order; each one
 * that is non-null is passed to {@code appendHeaderFooter} together with
 * the shared output buffer.
 *
 * @return the contents of the buffer after all present headers have been
 *         appended (an empty string if nothing was appended)
 * @deprecated 3.8 beta 4
 */
@Deprecated
public String getHeaderText() {
    HeaderStories stories = new HeaderStories(doc);
    StringBuffer headerText = new StringBuffer();

    // First header.
    if (stories.getFirstHeader() != null) {
        appendHeaderFooter(stories.getFirstHeader(), headerText);
    }
    // Even header.
    if (stories.getEvenHeader() != null) {
        appendHeaderFooter(stories.getEvenHeader(), headerText);
    }
    // Odd header.
    if (stories.getOddHeader() != null) {
        appendHeaderFooter(stories.getOddHeader(), headerText);
    }

    return headerText.toString();
}
Also used : HeaderStories(org.apache.poi.hwpf.usermodel.HeaderStories)

Example 3 with HeaderStories

use of org.apache.poi.hwpf.usermodel.HeaderStories in project poi by apache.

In the class WordExtractor, the method getFooterText.

/**
 * Grabs the text from the footers.
 * <p>
 * The first, even and odd footers are visited in that order; each one
 * that is non-null is passed to {@code appendHeaderFooter} together with
 * the shared output buffer.
 *
 * @return the contents of the buffer after all present footers have been
 *         appended (an empty string if nothing was appended)
 * @deprecated 3.8 beta 4
 */
@Deprecated
public String getFooterText() {
    HeaderStories stories = new HeaderStories(doc);
    StringBuffer footerText = new StringBuffer();

    // First footer.
    if (stories.getFirstFooter() != null) {
        appendHeaderFooter(stories.getFirstFooter(), footerText);
    }
    // Even footer.
    if (stories.getEvenFooter() != null) {
        appendHeaderFooter(stories.getEvenFooter(), footerText);
    }
    // Odd footer.
    if (stories.getOddFooter() != null) {
        appendHeaderFooter(stories.getOddFooter(), footerText);
    }

    return footerText.toString();
}
Also used : HeaderStories(org.apache.poi.hwpf.usermodel.HeaderStories)

Example 4 with HeaderStories

use of org.apache.poi.hwpf.usermodel.HeaderStories in project tika by apache.

In the class WordExtractor, the method parse.

/**
 * Parses the Word document stored under {@code root} and emits its
 * content to {@code xhtml}.
 * <p>
 * Output order: headers, main paragraphs, main text box text (only when
 * shape-based content is enabled in the parser config), footnotes,
 * comments, endnotes, footers, any pictures not yet claimed, and finally
 * embedded office documents found under the "ObjectPool" entry.
 * <p>
 * If opening the document raises {@code OldWordFileFormatException},
 * parsing is delegated to {@code parseWord6} and this method returns.
 *
 * @param root  directory node the document is read from
 * @param xhtml handler receiving the generated content
 * @throws IOException   declared by this method; see the called helpers
 * @throws SAXException  declared by this method; see the called helpers
 * @throws TikaException declared by this method; in particular an
 *         {@code EncryptedDocumentException} is thrown when POI reports
 *         the document as encrypted
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    HWPFDocument document;
    try {
        document = new HWPFDocument(root);
    } catch (org.apache.poi.EncryptedDocumentException encrypted) {
        // Translate POI's exception into the Tika one.
        throw new EncryptedDocumentException(encrypted);
    } catch (OldWordFileFormatException oldFormat) {
        // Old format: hand over to the Word 6 code path and stop here.
        parseWord6(root, xhtml);
        return;
    }

    extractSavedByMetadata(document);

    org.apache.poi.hwpf.extractor.WordExtractor poiExtractor =
            new org.apache.poi.hwpf.extractor.WordExtractor(document);
    HeaderStories stories = new HeaderStories(document);

    // Grab the list of pictures. As far as we can tell, the pictures
    // should be in order, and may be directly placed or referenced
    // from an anchor.
    PicturesTable picturesTable = document.getPicturesTable();
    PicturesSource pictures = new PicturesSource(document);

    // Headers, if present.
    Range[] headers = {
        stories.getFirstHeaderSubrange(),
        stories.getEvenHeaderSubrange(),
        stories.getOddHeaderSubrange()
    };
    handleHeaderFooter(headers, "header", document, pictures, picturesTable, xhtml);

    // Main paragraph text. handleParagraph's return value is added to the
    // index on top of the normal single step.
    Range mainRange = document.getRange();
    ListManager listManager = new ListManager(document);
    int index = 0;
    while (index < mainRange.numParagraphs()) {
        Paragraph para = mainRange.getParagraph(index);
        index += handleParagraph(para, 0, mainRange, document, FieldsDocumentPart.MAIN, pictures, picturesTable, listManager, xhtml);
        index++;
    }

    // Text boxes are only emitted when shape-based content is enabled.
    if (officeParserConfig.getIncludeShapeBasedContent()) {
        for (String text : poiExtractor.getMainTextboxText()) {
            xhtml.element("p", text);
        }
    }

    // Footnotes, comments and endnotes, one <p> per entry.
    for (String text : poiExtractor.getFootnoteText()) {
        xhtml.element("p", text);
    }
    for (String text : poiExtractor.getCommentsText()) {
        xhtml.element("p", text);
    }
    for (String text : poiExtractor.getEndnoteText()) {
        xhtml.element("p", text);
    }

    // Footers, if present.
    Range[] footers = {
        stories.getFirstFooterSubrange(),
        stories.getEvenFooterSubrange(),
        stories.getOddFooterSubrange()
    };
    handleHeaderFooter(footers, "footer", document, pictures, picturesTable, xhtml);

    // Handle any pictures that we haven't output yet.
    Picture unclaimed = pictures.nextUnclaimed();
    while (unclaimed != null) {
        handlePictureCharacterRun(null, unclaimed, pictures, xhtml);
        unclaimed = pictures.nextUnclaimed();
    }

    // Handle any embedded office documents.
    try {
        DirectoryEntry objectPool = (DirectoryEntry) root.getEntry("ObjectPool");
        for (Entry child : objectPool) {
            if (child.getName().startsWith("_") && child instanceof DirectoryEntry) {
                handleEmbeddedOfficeDoc((DirectoryEntry) child, xhtml);
            }
        }
    } catch (FileNotFoundException ignored) {
        // Deliberately ignored. NOTE(review): presumably raised when the
        // document has no "ObjectPool" entry -- confirm against POI.
    }
}
Also used : EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) FileNotFoundException(java.io.FileNotFoundException) PicturesTable(org.apache.poi.hwpf.model.PicturesTable) Range(org.apache.poi.hwpf.usermodel.Range) DirectoryEntry(org.apache.poi.poifs.filesystem.DirectoryEntry) Paragraph(org.apache.poi.hwpf.usermodel.Paragraph) HWPFDocument(org.apache.poi.hwpf.HWPFDocument) HeaderStories(org.apache.poi.hwpf.usermodel.HeaderStories) Entry(org.apache.poi.poifs.filesystem.Entry) DirectoryEntry(org.apache.poi.poifs.filesystem.DirectoryEntry) SavedByEntry(org.apache.poi.hwpf.model.SavedByEntry) Picture(org.apache.poi.hwpf.usermodel.Picture) OldWordFileFormatException(org.apache.poi.hwpf.OldWordFileFormatException)

Aggregations

HeaderStories (org.apache.poi.hwpf.usermodel.HeaderStories)4 FileNotFoundException (java.io.FileNotFoundException)1 IOException (java.io.IOException)1 HWPFDocument (org.apache.poi.hwpf.HWPFDocument)1 OldWordFileFormatException (org.apache.poi.hwpf.OldWordFileFormatException)1 WordToTextConverter (org.apache.poi.hwpf.converter.WordToTextConverter)1 PicturesTable (org.apache.poi.hwpf.model.PicturesTable)1 SavedByEntry (org.apache.poi.hwpf.model.SavedByEntry)1 Paragraph (org.apache.poi.hwpf.usermodel.Paragraph)1 Picture (org.apache.poi.hwpf.usermodel.Picture)1 Range (org.apache.poi.hwpf.usermodel.Range)1 DirectoryEntry (org.apache.poi.poifs.filesystem.DirectoryEntry)1 Entry (org.apache.poi.poifs.filesystem.Entry)1 EncryptedDocumentException (org.apache.tika.exception.EncryptedDocumentException)1