use of org.apache.tika.exception.TikaException in project tika by apache.
the class RFC822ParserTest method testExtractAttachments.
/**
 * Parses an e-mail carrying a PNG attachment twice: first with a plain
 * context to verify the message metadata, then with a recording
 * {@link EmbeddedDocumentExtractor} registered to verify that both embedded
 * parts (the plain-text body and the PNG) are detected and parsed.
 */
@Test
public void testExtractAttachments() throws Exception {
    final String emlResource = "test-documents/testEmailWithPNGAtt.eml";

    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    Parser rfc822Parser = new RFC822Parser();
    ParseContext context = new ParseContext();

    // First pass: no embedded document extractor registered.
    try (InputStream mail = getStream(emlResource)) {
        rfc822Parser.parse(mail, handler, metadata, context);
    }

    // Check we got the metadata
    assertEquals("Tika Test <XXXX@apache.org>", metadata.get(Metadata.MESSAGE_FROM));
    assertEquals("Test Attachment Email", metadata.get(TikaCoreProperties.TITLE));

    // Second pass: detect and parse every embedded part, recording the results.
    final Detector detector = new DefaultDetector();
    final Parser extParser = new AutoDetectParser();
    final List<MediaType> detectedTypes = new ArrayList<MediaType>();
    final List<String> extractedText = new ArrayList<String>();

    EmbeddedDocumentExtractor recordingExtractor = new EmbeddedDocumentExtractor() {
        @Override
        public boolean shouldParseEmbedded(Metadata embeddedMetadata) {
            // Every embedded part is of interest to this test.
            return true;
        }

        @Override
        public void parseEmbedded(InputStream embeddedStream, ContentHandler ignoredHandler,
                Metadata embeddedMetadata, boolean outputHtml) throws SAXException, IOException {
            MediaType detected = detector.detect(embeddedStream, embeddedMetadata);
            detectedTypes.add(detected);

            ContentHandler textCollector = new BodyContentHandler();
            try {
                extParser.parse(embeddedStream, textCollector, embeddedMetadata, new ParseContext());
            } catch (TikaException e) {
                // The extractor interface does not declare TikaException, so surface it unchecked.
                throw new RuntimeException(e);
            }
            extractedText.add(textCollector.toString());
        }
    };
    context.set(EmbeddedDocumentExtractor.class, recordingExtractor);

    try (InputStream mail = getStream(emlResource)) {
        rfc822Parser.parse(mail, handler, metadata, context);
    }

    // Check we got the metadata
    assertEquals("Tika Test <XXXX@apache.org>", metadata.get(Metadata.MESSAGE_FROM));
    assertEquals("Test Attachment Email", metadata.get(TikaCoreProperties.TITLE));

    // Check attachments
    assertEquals(2, detectedTypes.size());
    assertEquals(2, extractedText.size());
    assertEquals("text/plain", detectedTypes.get(0).toString());
    assertEquals("image/png", detectedTypes.get(1).toString());
    assertEquals("This email has a PNG attachment included in it\n\n", extractedText.get(0));
}
use of org.apache.tika.exception.TikaException in project cxf by apache.
the class TikaContentExtractor method extract.
/**
 * Extracts the content and metadata from the input stream, optionally
 * guided by a media type hint.
 * <p>
 * When no hint is given, the media type is detected from the stream, but
 * only if a detector is configured and the stream supports mark/reset.
 * If exactly one parser is configured it is always used; otherwise the
 * first parser whose supported types contain the media type is used (or
 * simply the first parser when the media type is unknown).
 *
 * @param in input stream to extract the content and metadata from
 * @param handler custom ContentHandler; if {@code null} and the parser
 *        fails, parsing is retried once with an IgnoreContentHandler
 * @param mtHint JAX-RS MediaType hint for the stream content, may be {@code null}
 * @param context custom parse context; a new one is created when {@code null}
 * @return the extracted content and metadata or null if extraction is not possible
 *         or was unsuccessful
 */
public TikaContent extract(final InputStream in, ContentHandler handler, javax.ws.rs.core.MediaType mtHint, ParseContext context) {
    if (in == null) {
        return null;
    }
    // Default the context up front so that parser selection below never hands
    // a null context to Parser#getSupportedTypes.
    if (context == null) {
        context = new ParseContext();
    }
    final Metadata metadata = new Metadata();
    try {
        // Try to validate that input stream media type is supported by the parser
        MediaType mediaType = null;
        if (mtHint != null) {
            mediaType = MediaType.parse(mtHint.toString());
        } else if (detector != null && in.markSupported()) {
            mediaType = detector.detect(in, metadata);
        }
        if (mediaType != null) {
            metadata.set(HttpHeaders.CONTENT_TYPE, mediaType.toString());
        }
        Parser parser = null;
        if (parsers.size() == 1) {
            parser = parsers.get(0);
        } else {
            for (Parser p : parsers) {
                if (mediaType != null && !p.getSupportedTypes(context).contains(mediaType)) {
                    continue;
                }
                parser = p;
                break;
            }
        }
        if (parser == null) {
            return null;
        }
        if (context.get(Parser.class) == null) {
            // to process the embedded attachments
            context.set(Parser.class, parser instanceof AutoDetectParser ? parser : new AutoDetectParser());
        }
        try {
            parser.parse(in, handler, metadata, context);
        } catch (Exception ex) {
            // not ready to accept null handlers so lets retry with IgnoreContentHandler.
            if (handler == null) {
                handler = new IgnoreContentHandler();
                parser.parse(in, handler, metadata, context);
            } else {
                throw ex;
            }
        }
        return new TikaContent(handler, metadata, mediaType);
    } catch (final IOException ex) {
        LOG.log(Level.WARNING, "Unable to extract media type from input stream", ex);
    } catch (final SAXException | TikaException ex) {
        LOG.log(Level.WARNING, "Unable to parse input stream", ex);
    }
    return null;
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class Tika method parseToString.
/**
 * Extracts the text content of the given document and returns it as a string.
 * <p>
 * The result is capped at {@link #getMaxStringLength()} characters so that
 * memory use stays bounded; call {@link #setMaxStringLength(int)} to change
 * the cap. Hitting the cap is not an error: the text collected so far is
 * returned.
 * <p>
 * <strong>NOTE:</strong> this method always closes the given stream before
 * returning, whether parsing succeeded or failed. Most other Tika methods
 * that accept an {@link InputStream} leave closing to the caller.
 *
 * @param stream the document to be parsed; closed by this method
 * @param metadata document metadata
 * @return extracted text content
 * @throws IOException if the document can not be read
 * @throws TikaException if the document can not be parsed
 */
public String parseToString(InputStream stream, Metadata metadata) throws IOException, TikaException {
    WriteOutContentHandler boundedSink = new WriteOutContentHandler(maxStringLength);
    try {
        ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, parser);
        BodyContentHandler bodyOnly = new BodyContentHandler(boundedSink);
        parser.parse(stream, bodyOnly, metadata, parseContext);
    } catch (SAXException e) {
        // A SAXException raised by the write limit just means the output was
        // truncated; anything else is a genuine failure.
        boolean truncatedByLimit = boundedSink.isWriteLimitReached(e);
        if (!truncatedByLimit) {
            // This should never happen with BodyContentHandler...
            throw new TikaException("Unexpected SAX processing failure", e);
        }
    } finally {
        stream.close();
    }
    return boundedSink.toString();
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class ExternalEmbedderTest method embedInTempFile.
/**
 * Tests embedding metadata then reading metadata to verify the results.
 * <p>
 * The metadata is embedded into a temporary copy of the source document,
 * the copy is parsed again, and every non-null value that was embedded is
 * looked up either in the parser's text output or in the re-read metadata.
 * The test is skipped on Windows.
 *
 * @param sourceInputStream stream of the document to embed metadata into;
 *        it is not closed by this method
 * @param isResultExpectedInOutput whether or not results are expected in command line output
 */
protected void embedInTempFile(InputStream sourceInputStream, boolean isResultExpectedInOutput) {
    Embedder embedder = getEmbedder();
    // TODO Move this check to ExternalEmbedder
    String os = System.getProperty("os.name", "");
    if (os.contains("Windows")) {
        // Skip test on Windows
        return;
    }
    Date timestamp = new Date();
    Metadata metadataToEmbed = getMetadataToEmbed(timestamp);
    try {
        File tempOutputFile = tmp.createTemporaryFile();
        // Embed the metadata into a copy of the original output stream.
        // The stream is closed before the file is re-read below.
        try (FileOutputStream tempFileOutputStream = new FileOutputStream(tempOutputFile)) {
            embedder.embed(metadataToEmbed, sourceInputStream, tempFileOutputStream, null);
        }
        ParseContext context = new ParseContext();
        Parser parser = getParser();
        context.set(Parser.class, parser);
        // Setup the extracting content handler
        ByteArrayOutputStream result = new ByteArrayOutputStream();
        OutputStreamWriter outputWriter = new OutputStreamWriter(result, DEFAULT_CHARSET);
        ContentHandler handler = new BodyContentHandler(outputWriter);
        // Create a new metadata object to read the new metadata into
        Metadata embeddedMetadata = new Metadata();
        // Re-read the now embedded temp file, releasing the file handle
        // before the temporary resources are disposed.
        try (FileInputStream embeddedFileInputStream = new FileInputStream(tempOutputFile)) {
            parser.parse(embeddedFileInputStream, handler, embeddedMetadata, context);
        }
        tmp.dispose();
        String outputString = null;
        if (isResultExpectedInOutput) {
            outputString = result.toString(DEFAULT_CHARSET);
        } else {
            assertTrue("no metadata found", embeddedMetadata.size() > 0);
        }
        // Check each metadata property for the expected value
        for (String metadataName : metadataToEmbed.names()) {
            if (metadataToEmbed.get(metadataName) != null) {
                String expectedValue = metadataToEmbed.get(metadataName);
                boolean foundExpectedValue = false;
                if (isResultExpectedInOutput) {
                    // just check that the entire output contains the expected string
                    foundExpectedValue = outputString.contains(expectedValue);
                } else {
                    if (embeddedMetadata.isMultiValued(metadataName)) {
                        for (String embeddedValue : embeddedMetadata.getValues(metadataName)) {
                            if (embeddedValue != null) {
                                if (embeddedValue.contains(expectedValue)) {
                                    foundExpectedValue = true;
                                    break;
                                }
                            }
                        }
                    } else {
                        String embeddedValue = embeddedMetadata.get(metadataName);
                        assertNotNull("expected metadata for " + metadataName + " not found", embeddedValue);
                        foundExpectedValue = embeddedValue.contains(expectedValue);
                    }
                }
                assertTrue("result did not contain expected appended metadata " + metadataName + "=" + expectedValue, foundExpectedValue);
            }
        }
    } catch (IOException | TikaException | SAXException e) {
        // Fail the test but keep the cause so the stack trace shows up in the report.
        throw new AssertionError("embedding round trip failed: " + e, e);
    }
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class AutoDetectParserTest method testZipBombPrevention.
/**
 * Verifies that parsing the TIKA-216 archive is aborted with a
 * {@link TikaException} instead of running to completion, i.e. that
 * zip bomb attacks are prevented.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-216">TIKA-216</a>
 */
@Test
public void testZipBombPrevention() throws Exception {
    boolean bombDetected = false;
    try (InputStream archive = AutoDetectParserTest.class.getResourceAsStream("/test-documents/TIKA-216.tgz")) {
        Metadata archiveMetadata = new Metadata();
        // -1 is passed as the handler's write limit argument
        ContentHandler unboundedHandler = new BodyContentHandler(-1);
        AutoDetectParser autoParser = new AutoDetectParser(tika);
        autoParser.parse(archive, unboundedHandler, archiveMetadata);
    } catch (TikaException expected) {
        // expected: the parser refused to process the bomb
        bombDetected = true;
    }
    if (!bombDetected) {
        fail("Zip bomb was not detected");
    }
}
Aggregations