use of org.apache.poi.xssf.usermodel.XSSFShape in project tika by apache.
the class XSSFBExcelExtractorDecorator method buildXHTML.
/**
 * Writes every sheet of the workbook package to the given XHTML handler.
 * <p>
 * For each sheet a {@code div} is emitted containing an {@code h1} with the
 * sheet name and a {@code table}/{@code tbody} with the sheet contents,
 * followed by the collected header and footer strings, the text of the
 * sheet's shapes and finally the sheet's hyperlinks.
 *
 * @param xhtml handler that receives the generated XHTML
 * @throws SAXException propagated from the XHTML handler or the sheet helpers
 * @throws XmlException if POI raises an {@code OpenXML4JException} while
 *                      opening the reader, styles or shared strings
 * @throws IOException  propagated from reading or closing a sheet stream
 * @see org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor#getText()
 */
@Override
protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, XmlException, IOException {
    OPCPackage container = extractor.getPackage();
    XSSFBSharedStringsTable strings;
    XSSFBReader.SheetIterator iter;
    XSSFBStylesTable styles;
    try {
        XSSFBReader xssfReader = new XSSFBReader(container);
        styles = xssfReader.getXSSFBStylesTable();
        iter = (XSSFBReader.SheetIterator) xssfReader.getSheetsData();
        strings = new XSSFBSharedStringsTable(container);
    } catch (OpenXML4JException e) {
        // InvalidFormatException is a subclass of OpenXML4JException and was
        // handled identically, so a single handler covers both.
        throw new XmlException(e);
    }
    while (iter.hasNext()) {
        PackagePart sheetPart;
        SheetTextAsHTML sheetExtractor;
        // try-with-resources guarantees the sheet stream is closed, even when
        // processing the sheet throws.
        try (InputStream stream = iter.next()) {
            sheetPart = iter.getSheetPart();
            addDrawingHyperLinks(sheetPart);
            sheetParts.add(sheetPart);
            sheetExtractor = new SheetTextAsHTML(xhtml);
            XSSFBCommentsTable comments = iter.getXSSFBSheetComments();
            // Start, and output the sheet name
            xhtml.startElement("div");
            xhtml.element("h1", iter.getSheetName());
            // Extract the main sheet contents
            xhtml.startElement("table");
            xhtml.startElement("tbody");
            processSheet(sheetExtractor, comments, styles, strings, stream);
        }
        xhtml.endElement("tbody");
        xhtml.endElement("table");
        // Headers and footers were collected while processing the sheet;
        // they are written out after the table.
        for (String header : sheetExtractor.headers) {
            extractHeaderFooter(header, xhtml);
        }
        for (String footer : sheetExtractor.footers) {
            extractHeaderFooter(footer, xhtml);
        }
        List<XSSFShape> shapes = iter.getShapes();
        processShapes(shapes, xhtml);
        //for now dump sheet hyperlinks at bottom of page
        //consider a double-pass of the inputstream to reunite hyperlinks with cells/textboxes
        //step 1: extract hyperlink info from bottom of page
        //step 2: process as we do now, but with cached hyperlink relationship info
        extractHyperLinks(sheetPart, xhtml);
        // All done with this sheet
        xhtml.endElement("div");
    }
}
use of org.apache.poi.xssf.usermodel.XSSFShape in project tika by apache.
the class XSSFExcelExtractorDecorator method processShapes.
/**
 * Emits the text and hyperlinks of the simple shapes in {@code shapes}.
 * <p>
 * A {@code null} list is ignored. Shapes that are not {@code XSSFSimpleShape}
 * instances are skipped. A shape's text is written as a {@code p} element
 * only when it is non-null and non-empty; the shape's CT shape is always
 * handed to {@code extractHyperLinksFromShape}.
 *
 * @param shapes shapes to inspect, may be {@code null}
 * @param xhtml  handler that receives the generated XHTML
 * @throws SAXException propagated from the XHTML handler or hyperlink extraction
 */
private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException {
    if (shapes != null) {
        for (XSSFShape candidate : shapes) {
            if (!(candidate instanceof XSSFSimpleShape)) {
                continue;
            }
            XSSFSimpleShape simple = (XSSFSimpleShape) candidate;
            String shapeText = simple.getText();
            boolean hasText = shapeText != null && !shapeText.isEmpty();
            if (hasText) {
                xhtml.element("p", shapeText);
            }
            extractHyperLinksFromShape(simple.getCTShape(), xhtml);
        }
    }
}
use of org.apache.poi.xssf.usermodel.XSSFShape in project tika by apache.
the class XSSFExcelExtractorDecorator method buildXHTML.
/**
 * Writes every distinct sheet of the workbook package to the given XHTML handler.
 * <p>
 * Each sheet becomes a {@code div} with an {@code h1} (sheet name) and a
 * {@code table}/{@code tbody} (sheet contents), followed by the collected
 * header and footer strings, optionally the shape text, and the sheet's
 * hyperlinks. A sheet whose part name has already been seen is skipped.
 *
 * @param xhtml handler that receives the generated XHTML
 * @throws SAXException propagated from the XHTML handler or the sheet helpers
 * @throws XmlException if POI raises an {@code OpenXML4JException} while
 *                      opening the reader, styles or shared strings
 * @throws IOException  propagated from reading or closing a sheet stream
 * @see org.apache.poi.xssf.extractor.XSSFExcelExtractor#getText()
 */
@Override
protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, XmlException, IOException {
    final OPCPackage pkg = extractor.getPackage();
    final ReadOnlySharedStringsTable sharedStrings;
    final XSSFReader.SheetIterator sheets;
    final StylesTable styleTable;
    try {
        XSSFReader reader = new XSSFReader(pkg);
        styleTable = reader.getStylesTable();
        sheets = (XSSFReader.SheetIterator) reader.getSheetsData();
        sharedStrings = new ReadOnlySharedStringsTable(pkg);
    } catch (OpenXML4JException e) {
        // Also covers InvalidFormatException, which is a subclass.
        throw new XmlException(e);
    }

    //temporary workaround for POI-61034
    //remove once POI 3.17-beta1 is released
    final Set<String> visitedParts = new HashSet<>();

    while (sheets.hasNext()) {
        SheetTextAsHTML sheetHandler = new SheetTextAsHTML(xhtml);
        PackagePart part = null;
        try (InputStream sheetStream = sheets.next()) {
            part = sheets.getSheetPart();
            // Set.add returns false when the part name was already recorded.
            if (!visitedParts.add(part.getPartName().toString())) {
                continue;
            }
            addDrawingHyperLinks(part);
            sheetParts.add(part);
            CommentsTable sheetComments = sheets.getSheetComments();

            // Open the sheet container and print its name
            xhtml.startElement("div");
            xhtml.element("h1", sheets.getSheetName());

            // Main sheet contents go into a table body
            xhtml.startElement("table");
            xhtml.startElement("tbody");
            processSheet(sheetHandler, sheetComments, styleTable, sharedStrings, sheetStream);
        }
        xhtml.endElement("tbody");
        xhtml.endElement("table");

        // Headers and footers collected during sheet processing follow the table
        for (String headerText : sheetHandler.headers) {
            extractHeaderFooter(headerText, xhtml);
        }
        for (String footerText : sheetHandler.footers) {
            extractHeaderFooter(footerText, xhtml);
        }

        // Text held in shapes is only emitted when the configuration asks for it
        if (config.getIncludeShapeBasedContent()) {
            processShapes(sheets.getShapes(), xhtml);
        }

        //for now dump sheet hyperlinks at bottom of page
        //consider a double-pass of the inputstream to reunite hyperlinks with cells/textboxes
        //step 1: extract hyperlink info from bottom of page
        //step 2: process as we do now, but with cached hyperlink relationship info
        extractHyperLinks(part, xhtml);

        // Close the sheet container
        xhtml.endElement("div");
    }
}
use of org.apache.poi.xssf.usermodel.XSSFShape in project tika by apache.
the class XSSFBExcelExtractorDecorator method processShapes.
/**
 * Writes out the text and hyperlinks held by simple shapes.
 * <p>
 * Does nothing when {@code shapes} is {@code null}. Only
 * {@code XSSFSimpleShape} instances are handled: non-null, non-empty text is
 * emitted as a {@code p} element, and the shape's CT shape is then passed
 * to {@code extractHyperLinksFromShape} regardless of whether text was written.
 *
 * @param shapes shapes to inspect, may be {@code null}
 * @param xhtml  handler that receives the generated XHTML
 * @throws SAXException propagated from the XHTML handler or hyperlink extraction
 */
private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException {
    if (shapes == null) {
        return;
    }
    for (XSSFShape current : shapes) {
        if (current instanceof XSSFSimpleShape) {
            XSSFSimpleShape simpleShape = (XSSFSimpleShape) current;
            String content = simpleShape.getText();
            if (content != null && !content.isEmpty()) {
                xhtml.element("p", content);
            }
            extractHyperLinksFromShape(simpleShape.getCTShape(), xhtml);
        }
    }
}
use of org.apache.poi.xssf.usermodel.XSSFShape in project poi by apache.
the class XSSFExcelExtractor method getText.
/**
 * Retrieves the text contents of the file.
 * <p>
 * Sheets are processed in workbook order. Depending on the extractor's
 * flags, each sheet contributes its name, its headers, one line per row
 * (cells separated by tabs, optionally followed by the cell comment), the
 * text of its simple shapes and its footers.
 *
 * @return the extracted text
 */
public String getText() {
    final DataFormatter formatter =
            (locale == null) ? new DataFormatter() : new DataFormatter(locale);

    StringBuffer out = new StringBuffer();
    for (Sheet sh : workbook) {
        XSSFSheet sheet = (XSSFSheet) sh;

        if (includeSheetNames) {
            out.append(sheet.getSheetName()).append("\n");
        }

        // Header(s) come first, when requested
        if (includeHeadersFooters) {
            out.append(extractHeaderFooter(sheet.getFirstHeader()));
            out.append(extractHeaderFooter(sheet.getOddHeader()));
            out.append(extractHeaderFooter(sheet.getEvenHeader()));
        }

        // One output line per row
        for (Object rawR : sheet) {
            Row row = (Row) rawR;
            Iterator<Cell> cells = row.cellIterator();
            while (cells.hasNext()) {
                Cell cell = cells.next();

                boolean isFormula = cell.getCellTypeEnum() == CellType.FORMULA;
                if (isFormula && formulasNotResults) {
                    // Emit the formula itself rather than its cached result
                    String formulaText = cell.getCellFormula();
                    checkMaxTextSize(out, formulaText);
                    out.append(formulaText);
                } else {
                    // For formulas the cached result type decides the handling
                    CellType effectiveType = isFormula
                            ? cell.getCachedFormulaResultTypeEnum()
                            : cell.getCellTypeEnum();
                    if (effectiveType == CellType.STRING) {
                        handleStringCell(out, cell);
                    } else {
                        handleNonStringCell(out, cell, formatter);
                    }
                }

                // Append the cell comment when requested and present
                Comment comment = cell.getCellComment();
                if (includeCellComments && comment != null) {
                    // Newlines inside the comment would break the
                    // one-line-per-row layout, so they become spaces
                    String commentText = comment.getString().getString().replace('\n', ' ');
                    checkMaxTextSize(out, commentText);
                    out.append(" Comment by ")
                            .append(comment.getAuthor())
                            .append(": ")
                            .append(commentText);
                }

                // Tab between cells, none after the last one
                if (cells.hasNext()) {
                    out.append("\t");
                }
            }
            out.append("\n");
        }

        // Text boxes: the drawing is only looked up when they are requested
        XSSFDrawing drawing = includeTextBoxes ? sheet.getDrawingPatriarch() : null;
        if (drawing != null) {
            for (XSSFShape shape : drawing.getShapes()) {
                if (!(shape instanceof XSSFSimpleShape)) {
                    continue;
                }
                String boxText = ((XSSFSimpleShape) shape).getText();
                if (!boxText.isEmpty()) {
                    out.append(boxText).append('\n');
                }
            }
        }

        // Footer(s) close the sheet, when requested
        if (includeHeadersFooters) {
            out.append(extractHeaderFooter(sheet.getFirstFooter()));
            out.append(extractHeaderFooter(sheet.getOddFooter()));
            out.append(extractHeaderFooter(sheet.getEvenFooter()));
        }
    }
    return out.toString();
}
Aggregations