use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class SQLite3ParserTest method testNotAddingEmbeddedParserToParseContext.
/**
 * Checks the XHTML output when the caller does not want embedded documents handled:
 * an {@code EmptyParser} is registered as the {@code Parser} in the parse context,
 * so only the wrapper markup for each embedded document is expected, not its content.
 */
@Test
public void testNotAddingEmbeddedParserToParseContext() throws Exception {
    // Register an EmptyParser so embedded documents are not parsed by a real parser.
    ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, new EmptyParser());

    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);

    ContentHandler handler = new ToXMLContentHandler();
    Parser autoDetect = new AutoDetectParser();
    try (InputStream input = getResourceAsStream(TEST_FILE1)) {
        autoDetect.parse(input, handler, metadata, parseContext);
    }
    String output = handler.toString();

    // The table structure and the embedded-document headers are still emitted ...
    assertContains("<table name=\"my_table1\"><thead><tr>", output);
    assertContains("<td><span type=\"blob\" column_name=\"BYTES_COL\" row_number=\"0\"><div class=\"package-entry\"><h1>BYTES_COL_0.doc</h1>", output);

    // ... but none of the embedded documents' own content is.
    assertNotContained("dog", output);
    assertNotContained("alt=\"image1.png\"", output);
    // Image tag belonging to the second embedded document.
    assertNotContained("alt=\"A description...\"", output);
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class RFC822ParserTest method testExtractAttachments.
/**
 * Parses an e-mail with a PNG attachment twice: first with a plain parse context,
 * then with a custom {@code EmbeddedDocumentExtractor} that records the detected
 * media type and extracted text of every embedded part.
 */
@Test
public void testExtractAttachments() throws Exception {
    Parser p = new RFC822Parser();
    ParseContext context = new ParseContext();
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();

    try (InputStream mail = getStream("test-documents/testEmailWithPNGAtt.eml")) {
        p.parse(mail, handler, metadata, context);
    }

    // Check we got the metadata
    assertEquals("Tika Test <XXXX@apache.org>", metadata.get(Metadata.MESSAGE_FROM));
    assertEquals("Test Attachment Email", metadata.get(TikaCoreProperties.TITLE));

    // Try again with attachment detecting and fetching
    final List<String> seenText = new ArrayList<>();
    final List<MediaType> seenTypes = new ArrayList<>();
    final Parser extParser = new AutoDetectParser();
    final Detector detector = new DefaultDetector();

    EmbeddedDocumentExtractor recordingExtractor = new EmbeddedDocumentExtractor() {
        @Override
        public boolean shouldParseEmbedded(Metadata embeddedMetadata) {
            // Every embedded part is of interest to this test.
            return true;
        }

        @Override
        public void parseEmbedded(InputStream embedded, ContentHandler ignoredHandler,
                Metadata embeddedMetadata, boolean outputHtml) throws SAXException, IOException {
            // Record the detected type first, then the text extracted by the auto-detect parser.
            MediaType detected = detector.detect(embedded, embeddedMetadata);
            seenTypes.add(detected);

            ContentHandler body = new BodyContentHandler();
            try {
                extParser.parse(embedded, body, embeddedMetadata, new ParseContext());
            } catch (TikaException e) {
                // parseEmbedded cannot declare TikaException, so surface it unchecked.
                throw new RuntimeException(e);
            }
            seenText.add(body.toString());
        }
    };
    context.set(EmbeddedDocumentExtractor.class, recordingExtractor);

    try (InputStream mail = getStream("test-documents/testEmailWithPNGAtt.eml")) {
        p.parse(mail, handler, metadata, context);
    }

    // Check we got the metadata
    assertEquals("Tika Test <XXXX@apache.org>", metadata.get(Metadata.MESSAGE_FROM));
    assertEquals("Test Attachment Email", metadata.get(TikaCoreProperties.TITLE));

    // Check attachments
    assertEquals(2, seenTypes.size());
    assertEquals(2, seenText.size());
    assertEquals("text/plain", seenTypes.get(0).toString());
    assertEquals("image/png", seenTypes.get(1).toString());
    assertEquals("This email has a PNG attachment included in it\n\n", seenText.get(0));
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class IWorkParserTest method setUp.
/**
 * Creates the iWork package parser under test and a parse context whose
 * {@code Parser} entry is an {@code AutoDetectParser}.
 */
@Before
public void setUp() {
    ParseContext context = new ParseContext();
    context.set(Parser.class, new AutoDetectParser());
    parseContext = context;
    iWorkParser = new IWorkPackageParser();
}
use of org.apache.tika.parser.AutoDetectParser in project winery by eclipse.
the class BackendUtils method getMimeType.
/**
 * Detects the media type of the given stream using the detector of a freshly
 * created {@code AutoDetectParser}. The file name is passed along as a hint via
 * the {@code RESOURCE_NAME_KEY} metadata entry.
 * <p>
 * The stream is marked at the beginning and reset at the end of detection.
 *
 * @param bis the stream
 * @param fn the fileName of the file belonging to the stream
 * @return the media type reported by the detector
 * @throws IOException if the detector fails to read from the stream
 */
public static MediaType getMimeType(BufferedInputStream bis, String fn) throws IOException {
    Metadata hints = new Metadata();
    hints.add(Metadata.RESOURCE_NAME_KEY, fn);
    return new AutoDetectParser().getDetector().detect(bis, hints);
}
use of org.apache.tika.parser.AutoDetectParser in project cxf by apache.
the class TikaContentExtractor method extract.
/**
 * Extract the content and metadata from the input stream, optionally using a media
 * type hint describing the type of content.
 * <p>
 * If no hint is given, the configured detector (when present) is used to determine the
 * media type, provided the stream supports mark/reset. When more than one parser is
 * registered, the first one whose supported types contain the media type is used; if the
 * media type is unknown, the first registered parser is used.
 *
 * @param in input stream to extract the content and metadata from
 * @param handler custom ContentHandler; may be null, in which case the parse is retried
 * with an {@code IgnoreContentHandler} if the parser fails with the null handler
 * @param mtHint JAX-RS MediaType of the stream content, or null to rely on detection
 * @param context custom context; a new {@code ParseContext} is used if null
 * @return the extracted content and metadata or null if extraction is not possible
 * or was unsuccessful
 */
public TikaContent extract(final InputStream in, ContentHandler handler, javax.ws.rs.core.MediaType mtHint, ParseContext context) {
    if (in == null) {
        return null;
    }
    // Create the default context up front so that parser selection below never hands a
    // null context to Parser#getSupportedTypes.
    if (context == null) {
        context = new ParseContext();
    }
    final Metadata metadata = new Metadata();
    try {
        // Try to validate that input stream media type is supported by the parser
        MediaType mediaType = null;
        if (mtHint != null) {
            mediaType = MediaType.parse(mtHint.toString());
        } else if (detector != null && in.markSupported()) {
            mediaType = detector.detect(in, metadata);
        }
        if (mediaType != null) {
            metadata.set(HttpHeaders.CONTENT_TYPE, mediaType.toString());
        }

        Parser parser = null;
        if (parsers.size() == 1) {
            // A single registered parser is used unconditionally.
            parser = parsers.get(0);
        } else {
            for (Parser p : parsers) {
                if (mediaType == null || p.getSupportedTypes(context).contains(mediaType)) {
                    parser = p;
                    break;
                }
            }
        }
        if (parser == null) {
            return null;
        }

        if (context.get(Parser.class) == null) {
            // to process the embedded attachments
            context.set(Parser.class, parser instanceof AutoDetectParser ? parser : new AutoDetectParser());
        }
        try {
            parser.parse(in, handler, metadata, context);
        } catch (Exception ex) {
            // not ready to accept null handlers so lets retry with IgnoreContentHandler.
            if (handler == null) {
                handler = new IgnoreContentHandler();
                parser.parse(in, handler, metadata, context);
            } else {
                throw ex;
            }
        }
        return new TikaContent(handler, metadata, mediaType);
    } catch (final IOException ex) {
        LOG.log(Level.WARNING, "Unable to extract media type from input stream", ex);
    } catch (final SAXException | TikaException ex) {
        LOG.log(Level.WARNING, "Unable to parse input stream", ex);
    }
    return null;
}
Aggregations