use of org.apache.tika.exception.TikaException in project tika by apache.
the class OCR2XHTML method process.
/**
 * Drives an {@code OCR2XHTML} instance over the given PDF document so that
 * the resulting XHTML SAX events are delivered to the given content handler.
 * <p>
 * The {@link Writer} handed to {@code writeText} discards everything it
 * receives.
 *
 * @param document PDF document
 * @param handler SAX content handler
 * @param context parse context handed to the {@code OCR2XHTML} constructor
 * @param metadata PDF metadata
 * @param config PDF parser configuration handed to the {@code OCR2XHTML} constructor
 * @throws SAXException if an {@link IOException} raised during processing
 *         carries a {@code SAXException} as its cause (the cause is rethrown)
 * @throws TikaException if processing raised any other {@link IOException},
 *         or if the converter recorded exceptions; in the latter case only
 *         the first recorded exception is attached as the cause
 */
public static void process(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, PDFParserConfig config) throws SAXException, TikaException {
    // Sink whose methods are all no-ops: nothing written to it is kept.
    Writer discardingWriter = new Writer() {
        @Override
        public void write(char[] cbuf, int off, int len) {
        }

        @Override
        public void flush() {
        }

        @Override
        public void close() {
        }
    };

    OCR2XHTML converter;
    try {
        converter = new OCR2XHTML(document, handler, context, metadata, config);
        converter.writeText(document, discardingWriter);
    } catch (IOException ioe) {
        Throwable cause = ioe.getCause();
        if (cause instanceof SAXException) {
            // Surface the wrapped SAX failure directly.
            throw (SAXException) cause;
        }
        throw new TikaException("Unable to extract PDF content", ioe);
    }

    // Exceptions collected by the converter are reported after the run;
    // only the first one is propagated.
    if (!converter.exceptions.isEmpty()) {
        throw new TikaException("Unable to extract all PDF content", converter.exceptions.get(0));
    }
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class PDF2XHTML method process.
/**
 * Drives a {@code PDF2XHTML} instance over the given PDF document so that
 * the resulting XHTML SAX events are delivered to the given content handler.
 * <p>
 * The {@link Writer} handed to {@code writeText} is a dummy that discards
 * everything it receives.
 *
 * @param document PDF document
 * @param handler SAX content handler
 * @param context parse context handed to the {@code PDF2XHTML} constructor
 * @param metadata PDF metadata
 * @param config PDF parser configuration; handed to the constructor and then
 *        applied to the new instance via {@code config.configure(...)}
 * @throws SAXException if an {@link IOException} raised during processing
 *         carries a {@code SAXException} as its cause (the cause is rethrown)
 * @throws TikaException if processing raised any other {@link IOException},
 *         or if the converter recorded exceptions; in the latter case only
 *         the first recorded exception is attached as the cause
 */
public static void process(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, PDFParserConfig config) throws SAXException, TikaException {
    // Dummy sink: the key methods of the converter are overridden to output
    // to the content handler, so nothing written here is kept.
    Writer discardingWriter = new Writer() {
        @Override
        public void write(char[] cbuf, int off, int len) {
        }

        @Override
        public void flush() {
        }

        @Override
        public void close() {
        }
    };

    PDF2XHTML converter;
    try {
        converter = new PDF2XHTML(document, handler, context, metadata, config);
        config.configure(converter);
        converter.writeText(document, discardingWriter);
    } catch (IOException ioe) {
        Throwable cause = ioe.getCause();
        if (cause instanceof SAXException) {
            // Surface the wrapped SAX failure directly.
            throw (SAXException) cause;
        }
        throw new TikaException("Unable to extract PDF content", ioe);
    }

    // Exceptions collected by the converter are reported after the run;
    // only the first one is propagated.
    if (!converter.exceptions.isEmpty()) {
        throw new TikaException("Unable to extract PDF content", converter.exceptions.get(0));
    }
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class PDFParser method loadDOM.
/**
 * Parses the XMP metadata exported by {@code pdMetadata} into a DOM document.
 * <p>
 * Can return {@code null}: when {@code pdMetadata} is {@code null}, when the
 * XMP stream cannot be exported, or when building the parser or parsing the
 * stream fails. Failures are recorded on {@code metadata} via
 * {@code EmbeddedDocumentUtil} rather than thrown.
 *
 * @param pdMetadata source of the XMP stream; may be {@code null}
 * @param metadata metadata object on which failures are recorded
 * @param context parse context that supplies the {@code DocumentBuilder}
 * @return the parsed document, or {@code null} as described above
 */
private Document loadDOM(PDMetadata pdMetadata, Metadata metadata, ParseContext context) {
    if (pdMetadata == null) {
        return null;
    }

    // Step 1: obtain the raw XMP stream. Nothing is open yet if this fails,
    // so there is nothing to close.
    InputStream xmpStream;
    try {
        xmpStream = pdMetadata.exportXMPMetadata();
    } catch (IOException exportFailure) {
        EmbeddedDocumentUtil.recordEmbeddedStreamException(exportFailure, metadata);
        return null;
    }

    // Step 2: parse it, always closing the stream afterwards.
    Document dom = null;
    try {
        DocumentBuilder builder = context.getDocumentBuilder();
        // Explicitly clear the builder's error handler before parsing.
        builder.setErrorHandler((ErrorHandler) null);
        dom = builder.parse(xmpStream);
    } catch (IOException | SAXException | TikaException parseFailure) {
        EmbeddedDocumentUtil.recordException(parseFailure, metadata);
    } finally {
        IOUtils.closeQuietly(xmpStream);
    }
    return dom;
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class RarParser method parse.
/**
 * Opens the incoming stream as a RAR archive and hands every
 * non-directory entry to the embedded document extractor.
 * <p>
 * Iteration over the archive stops early if the current thread is
 * interrupted. {@code xhtml.endDocument()} is only reached when no
 * exception was thrown.
 *
 * @param stream the RAR content
 * @param handler SAX content handler
 * @param metadata document metadata
 * @param context parse context used to look up the embedded document extractor
 * @throws IOException propagated from the underlying stream and archive operations
 * @throws SAXException propagated from the content handler
 * @throws TikaException wrapping any {@code RarException}; an
 *         {@code EncryptedDocumentException} is thrown if the archive
 *         reports itself as encrypted
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);

    Archive archive = null;
    // Note: the try-with-resources closes tmp before the finally block
    // below closes the archive.
    try (TemporaryResources tmp = new TemporaryResources()) {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        archive = new Archive(tis.getFile());
        if (archive.isEncrypted()) {
            throw new EncryptedDocumentException();
        }

        //Without this BodyContentHandler does not work
        xhtml.element("div", " ");

        for (FileHeader header = archive.nextFileHeader();
                header != null && !Thread.currentThread().isInterrupted();
                header = archive.nextFileHeader()) {
            if (header.isDirectory()) {
                continue;
            }
            try (InputStream entryStream = archive.getInputStream(header)) {
                // Use the plain file name only when the wide-character name is empty.
                String entryName;
                if ("".equals(header.getFileNameW())) {
                    entryName = header.getFileNameString();
                } else {
                    entryName = header.getFileNameW();
                }
                Metadata entryMetadata = PackageParser.handleEntryMetadata(entryName, header.getCTime(), header.getMTime(), header.getFullUnpackSize(), xhtml);
                if (extractor.shouldParseEmbedded(entryMetadata)) {
                    extractor.parseEmbedded(entryStream, handler, entryMetadata, true);
                }
            }
        }
    } catch (RarException e) {
        throw new TikaException("RarParser Exception", e);
    } finally {
        if (archive != null) {
            archive.close();
        }
    }

    xhtml.endDocument();
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class LanguageProfilerBuilder method getSimilarity.
/**
 * Calculates a score for how well two NGramProfiles match each other.
 * <p>
 * The score is accumulated in two passes. First, for every entry in the
 * other profile's sorted list: if this profile contains the same sequence,
 * half of the absolute frequency difference is added, otherwise the entry's
 * full frequency is added. The second pass does the same with the roles of
 * the two profiles swapped.
 *
 * @param another
 *            ngram profile to compare against
 * @return similarity 0=exact match
 * @throws TikaException
 *             if the score could not be calculated; the underlying
 *             exception is attached as the cause
 */
public float getSimilarity(LanguageProfilerBuilder another) throws TikaException {
    float sum = 0;
    try {
        // Pass 1: entries of the other profile measured against this one.
        for (NGramEntry other : another.getSorted()) {
            if (ngrams.containsKey(other.seq)) {
                sum += Math.abs((other.frequency - ngrams.get(other.seq).frequency)) / 2;
            } else {
                sum += other.frequency;
            }
        }
        // Pass 2: entries of this profile measured against the other one.
        for (NGramEntry own : getSorted()) {
            if (another.ngrams.containsKey(own.seq)) {
                sum += Math.abs((own.frequency - another.ngrams.get(own.seq).frequency)) / 2;
            } else {
                sum += own.frequency;
            }
        }
    } catch (Exception e) {
        // Keep the original failure as the cause so the root problem
        // (and its stack trace) is not lost.
        throw new TikaException("Could not calculate a score how well NGramProfiles match each other", e);
    }
    return sum;
}
Aggregations