use of org.apache.tika.exception.TikaException in project tika by apache.
the class SummaryExtractor method parseSummaryEntryIfExists.
private void parseSummaryEntryIfExists(DirectoryNode root, String entryName) throws IOException, TikaException {
try {
DocumentEntry entry = (DocumentEntry) root.getEntry(entryName);
PropertySet properties = new PropertySet(new DocumentInputStream(entry));
if (properties.isSummaryInformation()) {
parse(new SummaryInformation(properties));
}
if (properties.isDocumentSummaryInformation()) {
parse(new DocumentSummaryInformation(properties));
}
} catch (FileNotFoundException e) {
// entry does not exist, just skip it
} catch (NoPropertySetStreamException e) {
// no property stream, just skip it
} catch (UnexpectedPropertySetTypeException e) {
throw new TikaException("Unexpected HPSF document", e);
} catch (MarkUnsupportedException e) {
throw new TikaException("Invalid DocumentInputStream", e);
} catch (Exception e) {
LOG.warn("Ignoring unexpected exception while parsing summary entry {}", entryName, e);
}
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class WMFParser method parse.
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
try {
HwmfPicture picture = new HwmfPicture(stream);
//to determine when to keep two text parts on the same line
for (HwmfRecord record : picture.getRecords()) {
Charset charset = LocaleUtil.CHARSET_1252;
//This fix should be done within POI
if (record.getRecordType().equals(HwmfRecordType.createFontIndirect)) {
HwmfFont font = ((HwmfText.WmfCreateFontIndirect) record).getFont();
charset = (font.getCharSet() == null || font.getCharSet().getCharset() == null) ? LocaleUtil.CHARSET_1252 : font.getCharSet().getCharset();
}
if (record.getRecordType().equals(HwmfRecordType.extTextOut)) {
HwmfText.WmfExtTextOut textOut = (HwmfText.WmfExtTextOut) record;
xhtml.startElement("p");
xhtml.characters(textOut.getText(charset));
xhtml.endElement("p");
} else if (record.getRecordType().equals(HwmfRecordType.textOut)) {
HwmfText.WmfTextOut textOut = (HwmfText.WmfTextOut) record;
xhtml.startElement("p");
xhtml.characters(textOut.getText(charset));
xhtml.endElement("p");
}
}
} catch (RecordFormatException e) {
//POI's hwmfparser can throw these for "parse exceptions"
throw new TikaException(e.getMessage(), e);
} catch (RuntimeException e) {
//convert Runtime to RecordFormatExceptions
throw new TikaException(e.getMessage(), e);
} catch (AssertionError e) {
//POI's hwmfparser can throw these for parse exceptions
throw new TikaException(e.getMessage(), e);
}
xhtml.endDocument();
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class ExcelExtractor method parse.
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml, Locale locale) throws IOException, SAXException, TikaException {
if (!root.hasEntry(WORKBOOK_ENTRY)) {
if (root.hasEntry(BOOK_ENTRY)) {
// Excel 5 / Excel 95 file
// Records are in a different structure so needs a
// different parser to process them
OldExcelExtractor extractor = new OldExcelExtractor(root);
OldExcelParser.parse(extractor, xhtml);
return;
} else {
// Corrupt file / very old file, just skip text extraction
return;
}
}
// If a password was supplied, use it, otherwise the default
Biff8EncryptionKey.setCurrentUserPassword(getPassword());
// Have the file processed in event mode
TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this);
listener.processFile(root, isListenForAllRecords());
listener.throwStoredException();
for (Entry entry : root) {
if (entry.getName().startsWith("MBD") && entry instanceof DirectoryEntry) {
try {
handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
} catch (TikaException e) {
// ignore parse errors from embedded documents
}
}
}
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class HSLFExtractor method handleSlideEmbeddedResources.
private void handleSlideEmbeddedResources(HSLFSlide slide, XHTMLContentHandler xhtml) throws TikaException, SAXException, IOException {
List<HSLFShape> shapes;
try {
shapes = slide.getShapes();
} catch (NullPointerException e) {
// Sometimes HSLF hits problems
// Please open POI bugs for any you come across!
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
return;
}
for (HSLFShape shape : shapes) {
if (shape instanceof OLEShape) {
OLEShape oleShape = (OLEShape) shape;
HSLFObjectData data = null;
try {
data = oleShape.getObjectData();
} catch (NullPointerException e) {
/* getObjectData throws NPE some times. */
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
continue;
}
if (data != null) {
String objID = Integer.toString(oleShape.getObjectID());
// Embedded Object: add a <div
// class="embedded" id="X"/> so consumer can see where
// in the main text each embedded document
// occurred:
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "class", "class", "CDATA", "embedded");
attributes.addAttribute("", "id", "id", "CDATA", objID);
xhtml.startElement("div", attributes);
xhtml.endElement("div");
InputStream dataStream = null;
try {
dataStream = data.getData();
} catch (Exception e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
continue;
}
try (TikaInputStream stream = TikaInputStream.get(dataStream)) {
String mediaType = null;
if ("Excel.Chart.8".equals(oleShape.getProgID())) {
mediaType = "application/vnd.ms-excel";
} else {
MediaType mt = getTikaConfig().getDetector().detect(stream, new Metadata());
mediaType = mt.toString();
}
if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")) {
try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(stream))) {
handleEmbeddedOfficeDoc(npoifs.getRoot(), objID, xhtml);
}
} else {
handleEmbeddedResource(stream, objID, objID, mediaType, xhtml, false);
}
} catch (IOException e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
}
}
}
}
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class JackcessExtractor method handleCompoundContent.
private void handleCompoundContent(OleBlob.CompoundContent cc, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
InputStream is = null;
NPOIFSFileSystem nfs = null;
try {
try {
is = cc.getStream();
} catch (IOException e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
return;
}
try {
nfs = new NPOIFSFileSystem(is);
} catch (Exception e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
return;
}
handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml);
} finally {
if (nfs != null) {
try {
nfs.close();
} catch (IOException e) {
//swallow
}
}
if (is != null) {
IOUtils.closeQuietly(is);
}
}
}
Aggregations