use of org.apache.poi.hwpf.usermodel.HeaderStories in project poi by apache.
the class WordExtractor method getText.
/**
 * Extracts the document text by running it through a
 * {@link WordToTextConverter}.
 * <p>
 * Parts are fed to the converter in this order: first/even/odd headers,
 * the main document, the main text box range, then first/even/odd
 * footers. Any header or footer subrange that is {@code null} is skipped.
 * The result should not include any crud, but this is slower than
 * getTextFromPieces().
 *
 * @return the text accumulated by the converter
 * @throws RuntimeException unchanged if the conversion raised an unchecked
 *         exception, otherwise wrapping the checked exception as its cause
 */
public String getText() {
    try {
        WordToTextConverter converter = new WordToTextConverter();
        HeaderStories stories = new HeaderStories(doc);

        // Headers come first; each one is optional.
        if (stories.getFirstHeaderSubrange() != null) {
            converter.processDocumentPart(doc, stories.getFirstHeaderSubrange());
        }
        if (stories.getEvenHeaderSubrange() != null) {
            converter.processDocumentPart(doc, stories.getEvenHeaderSubrange());
        }
        if (stories.getOddHeaderSubrange() != null) {
            converter.processDocumentPart(doc, stories.getOddHeaderSubrange());
        }

        // Main document, followed by the main text box range.
        converter.processDocument(doc);
        converter.processDocumentPart(doc, doc.getMainTextboxRange());

        // Footers go last; each one is optional.
        if (stories.getFirstFooterSubrange() != null) {
            converter.processDocumentPart(doc, stories.getFirstFooterSubrange());
        }
        if (stories.getEvenFooterSubrange() != null) {
            converter.processDocumentPart(doc, stories.getEvenFooterSubrange());
        }
        if (stories.getOddFooterSubrange() != null) {
            converter.processDocumentPart(doc, stories.getOddFooterSubrange());
        }

        return converter.getText();
    } catch (Exception exc) {
        // Unchecked exceptions propagate as-is; only checked ones get wrapped.
        if (exc instanceof RuntimeException) {
            throw (RuntimeException) exc;
        }
        throw new RuntimeException(exc);
    }
}
use of org.apache.poi.hwpf.usermodel.HeaderStories in project poi by apache.
the class WordExtractor method getHeaderText.
/**
 * Grabs the text of the first, even and odd headers, in that order.
 * Headers reported as {@code null} by {@link HeaderStories} are skipped;
 * every other one is passed through appendHeaderFooter.
 *
 * @return the accumulated header text, empty if no header was present
 * @deprecated 3.8 beta 4
 */
@Deprecated
public String getHeaderText() {
    HeaderStories stories = new HeaderStories(doc);
    String[] headers = {
        stories.getFirstHeader(),
        stories.getEvenHeader(),
        stories.getOddHeader()
    };

    // StringBuffer (not StringBuilder) because appendHeaderFooter is handed this buffer.
    StringBuffer text = new StringBuffer();
    for (String header : headers) {
        if (header != null) {
            appendHeaderFooter(header, text);
        }
    }
    return text.toString();
}
use of org.apache.poi.hwpf.usermodel.HeaderStories in project poi by apache.
the class WordExtractor method getFooterText.
/**
 * Grabs the text of the first, even and odd footers, in that order.
 * Footers reported as {@code null} by {@link HeaderStories} are skipped;
 * every other one is passed through appendHeaderFooter.
 *
 * @return the accumulated footer text, empty if no footer was present
 * @deprecated 3.8 beta 4
 */
@Deprecated
public String getFooterText() {
    HeaderStories stories = new HeaderStories(doc);
    String[] footers = {
        stories.getFirstFooter(),
        stories.getEvenFooter(),
        stories.getOddFooter()
    };

    // StringBuffer (not StringBuilder) because appendHeaderFooter is handed this buffer.
    StringBuffer text = new StringBuffer();
    for (String footer : footers) {
        if (footer != null) {
            appendHeaderFooter(footer, text);
        }
    }
    return text.toString();
}
use of org.apache.poi.hwpf.usermodel.HeaderStories in project tika by apache.
the class WordExtractor method parse.
/**
 * Parses the Word document stored under {@code root} and emits its content
 * to {@code xhtml}.
 * <p>
 * Content is written in this order: headers, main paragraphs, main text box
 * text (only when the parser config enables shape-based content), footnotes,
 * comments, endnotes, footers, any pictures not output so far, and finally
 * embedded documents found under the "ObjectPool" entry.
 * <p>
 * If the document is in an old Word format
 * ({@code OldWordFileFormatException}), parsing is delegated to
 * {@code parseWord6} and nothing else is done here. An encrypted document is
 * reported by rethrowing POI's exception wrapped in the unqualified
 * {@code EncryptedDocumentException} type.
 *
 * @param root  directory node holding the Word document
 * @param xhtml handler receiving the generated XHTML events
 * @throws IOException   declared; may be raised while reading the document
 * @throws SAXException  declared; may be raised while emitting XHTML
 * @throws TikaException declared; may be raised by the parsing helpers
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    HWPFDocument document;
    try {
        document = new HWPFDocument(root);
    } catch (org.apache.poi.EncryptedDocumentException e) {
        throw new EncryptedDocumentException(e);
    } catch (OldWordFileFormatException e) {
        // Old-format files take a separate code path.
        parseWord6(root, xhtml);
        return;
    }
    extractSavedByMetadata(document);

    org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
            new org.apache.poi.hwpf.extractor.WordExtractor(document);
    HeaderStories headerFooter = new HeaderStories(document);

    // Grab the list of pictures. As far as we can tell,
    // the pictures should be in order, and may be directly
    // placed or referenced from an anchor
    PicturesTable pictureTable = document.getPicturesTable();
    PicturesSource pictures = new PicturesSource(document);

    // Do any headers, if present
    Range[] headers = new Range[] {
        headerFooter.getFirstHeaderSubrange(),
        headerFooter.getEvenHeaderSubrange(),
        headerFooter.getOddHeaderSubrange()
    };
    handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml);

    // Do the main paragraph text
    Range r = document.getRange();
    ListManager listManager = new ListManager(document);
    for (int i = 0; i < r.numParagraphs(); i++) {
        Paragraph p = r.getParagraph(i);
        // The return value advances the index further; presumably it is the
        // number of additional paragraphs handleParagraph consumed.
        i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager, xhtml);
    }

    if (officeParserConfig.getIncludeShapeBasedContent()) {
        // Do everything else
        for (String paragraph : wordExtractor.getMainTextboxText()) {
            xhtml.element("p", paragraph);
        }
    }
    for (String paragraph : wordExtractor.getFootnoteText()) {
        xhtml.element("p", paragraph);
    }
    for (String paragraph : wordExtractor.getCommentsText()) {
        xhtml.element("p", paragraph);
    }
    for (String paragraph : wordExtractor.getEndnoteText()) {
        xhtml.element("p", paragraph);
    }

    // Do any footers, if present
    Range[] footers = new Range[] {
        headerFooter.getFirstFooterSubrange(),
        headerFooter.getEvenFooterSubrange(),
        headerFooter.getOddFooterSubrange()
    };
    handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml);

    // Handle any pictures that we haven't output yet
    for (Picture p = pictures.nextUnclaimed(); p != null; p = pictures.nextUnclaimed()) {
        handlePictureCharacterRun(null, p, pictures, xhtml);
    }

    // Handle any embedded office documents
    try {
        Entry objectPool = root.getEntry("ObjectPool");
        // getEntry returns a plain Entry; check the type before casting so a
        // file whose "ObjectPool" is not a directory does not fail with a
        // ClassCastException.
        if (objectPool instanceof DirectoryEntry) {
            for (Entry entry : (DirectoryEntry) objectPool) {
                if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) {
                    handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
                }
            }
        }
    } catch (FileNotFoundException ignored) {
        // No "ObjectPool" entry: the document has no embedded objects to handle.
    }
}
Aggregations