Use of org.apache.poi.hwpf.usermodel.Paragraph in the Apache Tika project.
From the class WordExtractor, method handleParagraph.
/**
 * Writes one paragraph to the XHTML output, or a whole table when the
 * paragraph belongs to a table that this call is allowed to expand.
 *
 * <p>A paragraph that sits in a table at a deeper level than
 * {@code parentTableLevel} is expanded into {@code table/tbody/tr/td}
 * elements only when {@code parentTableLevel} is 0; every cell paragraph is
 * then handled by a recursive call. Nested tables are not expanded: their
 * paragraphs fall through to the plain paragraph handling.
 *
 * <p>Paragraphs whose text is only whitespace produce no output.
 *
 * @param p                the paragraph to output
 * @param parentTableLevel table level of the calling context; {@code parse}
 *                         passes 0, the recursive call passes the table
 *                         level of the paragraph being expanded
 * @param r                range containing {@code p}; used to look up the
 *                         table and the field separator run
 * @param document         document consulted for the style sheet and fields
 * @param docPart          document part used when looking up fields
 * @param pictures         source of inline and not-yet-claimed pictures
 * @param pictureTable     used to test whether a character run has a picture
 * @param listManager      supplies the formatted number for list paragraphs
 * @param xhtml            handler receiving the generated elements and text
 * @return the number of additional paragraphs consumed beyond {@code p}:
 *         the table's paragraph count minus one when a table was written,
 *         otherwise 0 (callers add this to their paragraph index)
 * @throws SAXException  propagated from the content handler
 * @throws IOException   propagated from the picture/run helpers
 * @throws TikaException propagated from the picture/run helpers
 */
private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document, FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable, ListManager listManager, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    // Only tables seen from the top level (parentTableLevel == 0) are
    // expanded; we do not recurse into nested tables, so currently we don't
    // emit table markup for them.
    if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) {
        Table t = r.getTable(p);
        xhtml.startElement("table");
        xhtml.startElement("tbody");
        for (int rn = 0; rn < t.numRows(); rn++) {
            TableRow row = t.getRow(rn);
            xhtml.startElement("tr");
            for (int cn = 0; cn < row.numCells(); cn++) {
                TableCell cell = row.getCell(cn);
                xhtml.startElement("td");
                for (int pn = 0; pn < cell.numParagraphs(); pn++) {
                    Paragraph cellP = cell.getParagraph(pn);
                    // The cell is the enclosing range for its own paragraphs.
                    handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, listManager, xhtml);
                }
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");
        xhtml.endElement("table");
        // Tell the caller how many further paragraphs the table covered.
        return (t.numParagraphs() - 1);
    }

    String text = p.text();
    if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) {
        // Skip empty paragraphs
        return 0;
    }

    // Pick the output tag/class from the paragraph's named style, falling
    // back to a plain <p> when the style index is out of range or unnamed.
    TagAndStyle tas;
    String numbering = null;
    if (document.getStyleSheet().numStyles() > p.getStyleIndex()) {
        StyleDescription style = document.getStyleSheet().getStyleDescription(p.getStyleIndex());
        if (style != null && style.getName() != null && style.getName().length() > 0) {
            if (p.isInList()) {
                numbering = listManager.getFormattedNumber(p);
            }
            tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0));
        } else {
            tas = new TagAndStyle("p", null);
        }
    } else {
        tas = new TagAndStyle("p", null);
    }

    if (tas.getStyleClass() != null) {
        xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
    } else {
        xhtml.startElement(tas.getTag());
    }
    if (numbering != null) {
        xhtml.characters(numbering);
    }

    for (int j = 0; j < p.numCharacterRuns(); j++) {
        CharacterRun cr = p.getCharacterRun(j);
        String runText = cr.text();

        // FIELD_BEGIN_MARK (U+0013). Guard against empty runs, which would
        // otherwise fail when looking at the first character.
        if (!runText.isEmpty() && runText.charAt(0) == 0x13) {
            Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset());
            // 56 is a document link; 58 is treated the same way here
            // (NOTE(review): presumably an embedded object - confirm against POI).
            if (field != null && (field.getType() == 58 || field.getType() == 56)) {
                // Embedded Object: add a <div
                // class="embedded" id="_X"/> so consumer can see where
                // in the main text each embedded document
                // occurred:
                String id = "_unknown_id";
                // this can return null (TIKA-1956)
                CharacterRun mscr = field.getMarkSeparatorCharacterRun(r);
                if (mscr != null) {
                    id = "_" + mscr.getPicOffset();
                }
                AttributesImpl attributes = new AttributesImpl();
                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                attributes.addAttribute("", "id", "id", "CDATA", id);
                xhtml.startElement("div", attributes);
                xhtml.endElement("div");
            }
        }

        // The control characters are spelled as explicit escapes: with empty
        // string literals the startsWith test is always true and the inline
        // picture / plain text branches below could never run.
        if (runText.equals("\u0013")) {
            // A run holding only the field-begin mark: the helper reports how
            // many following runs it consumed.
            j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml);
        } else if (runText.startsWith("\u0008")) {
            // Floating Picture(s) - NOTE(review): U+0008 as the marker is
            // restored from the upstream code; confirm.
            for (int pn = 0; pn < runText.length(); pn++) {
                // Assume they're in the order from the unclaimed list...
                Picture picture = pictures.nextUnclaimed();
                // Output
                handlePictureCharacterRun(cr, picture, pictures, xhtml);
            }
        } else if (pictureTable.hasPicture(cr)) {
            // Inline Picture
            Picture picture = pictures.getFor(cr);
            handlePictureCharacterRun(cr, picture, pictures, xhtml);
        } else {
            handleCharacterRun(cr, tas.isHeading(), xhtml);
        }
    }

    // Close any still open style tags
    if (curStrikeThrough) {
        xhtml.endElement("s");
        curStrikeThrough = false;
    }
    if (curItalic) {
        xhtml.endElement("i");
        curItalic = false;
    }
    if (curBold) {
        xhtml.endElement("b");
        curBold = false;
    }
    xhtml.endElement(tas.getTag());
    return 0;
}
Use of org.apache.poi.hwpf.usermodel.Paragraph in the Apache Tika project.
From the class WordExtractor, method parse.
/**
 * Parses the Word document stored under {@code root} and writes its content
 * to {@code xhtml}.
 *
 * <p>Output order: headers, main paragraphs (tables included), textbox text
 * (only when shape-based content is enabled in the parser config), footnotes,
 * comments, endnotes, footers, any pictures not yet output, and finally the
 * embedded office documents found under the {@code ObjectPool} directory.
 *
 * <p>Documents in the old Word format are delegated to {@code parseWord6};
 * encrypted documents are reported by rethrowing as
 * {@code EncryptedDocumentException}.
 *
 * @param root  directory node holding the Word document
 * @param xhtml handler receiving the generated XHTML
 * @throws IOException   propagated from reading the document
 * @throws SAXException  propagated from the content handler
 * @throws TikaException propagated from the helpers; also thrown for
 *                       encrypted documents
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    HWPFDocument document;
    try {
        document = new HWPFDocument(root);
    } catch (org.apache.poi.EncryptedDocumentException e) {
        throw new EncryptedDocumentException(e);
    } catch (OldWordFileFormatException e) {
        // Old-format files cannot be opened as HWPFDocument; use the
        // dedicated code path instead.
        parseWord6(root, xhtml);
        return;
    }
    extractSavedByMetadata(document);

    org.apache.poi.hwpf.extractor.WordExtractor wordExtractor = new org.apache.poi.hwpf.extractor.WordExtractor(document);
    HeaderStories headerFooter = new HeaderStories(document);

    // Grab the list of pictures. As far as we can tell,
    // the pictures should be in order, and may be directly
    // placed or referenced from an anchor
    PicturesTable pictureTable = document.getPicturesTable();
    PicturesSource pictures = new PicturesSource(document);

    // Do any headers, if present
    Range[] headers = new Range[] { headerFooter.getFirstHeaderSubrange(), headerFooter.getEvenHeaderSubrange(), headerFooter.getOddHeaderSubrange() };
    handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml);

    // Do the main paragraph text
    Range r = document.getRange();
    ListManager listManager = new ListManager(document);
    for (int i = 0; i < r.numParagraphs(); i++) {
        Paragraph p = r.getParagraph(i);
        // handleParagraph reports how many extra paragraphs it consumed
        // (e.g. the rest of a table), so skip over those.
        i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager, xhtml);
    }

    if (officeParserConfig.getIncludeShapeBasedContent()) {
        // Do everything else
        for (String paragraph : wordExtractor.getMainTextboxText()) {
            xhtml.element("p", paragraph);
        }
    }
    for (String paragraph : wordExtractor.getFootnoteText()) {
        xhtml.element("p", paragraph);
    }
    for (String paragraph : wordExtractor.getCommentsText()) {
        xhtml.element("p", paragraph);
    }
    for (String paragraph : wordExtractor.getEndnoteText()) {
        xhtml.element("p", paragraph);
    }

    // Do any footers, if present
    Range[] footers = new Range[] { headerFooter.getFirstFooterSubrange(), headerFooter.getEvenFooterSubrange(), headerFooter.getOddFooterSubrange() };
    handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml);

    // Handle any pictures that we haven't output yet
    Picture unclaimed = pictures.nextUnclaimed();
    while (unclaimed != null) {
        handlePictureCharacterRun(null, unclaimed, pictures, xhtml);
        unclaimed = pictures.nextUnclaimed();
    }

    // Handle any embedded office documents
    try {
        Entry objectPool = root.getEntry("ObjectPool");
        // Only iterate when the entry really is a directory; a malformed
        // file could store something else under this name, and an unchecked
        // cast would then abort the parse with a ClassCastException.
        if (objectPool instanceof DirectoryEntry) {
            for (Entry entry : (DirectoryEntry) objectPool) {
                if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) {
                    handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
                }
            }
        }
    } catch (FileNotFoundException ignored) {
        // No ObjectPool entry: the document has no embedded office
        // documents stored there, so there is nothing to do.
    }
}
Aggregations