use of org.apache.tika.detect.Detector in project tika by apache.
the class TikaDetectors method detectorAsHTML.
/**
 * Appends an HTML description of a detector to the given buffer.
 *
 * <p>The output is a heading at the requested level containing the simple
 * class name, followed by a paragraph with the fully qualified class name.
 * For a {@link CompositeDetector}, a marker paragraph is added and every
 * child detector is rendered recursively one heading level deeper.
 *
 * @param d     the detector to describe
 * @param html  the buffer the markup is appended to
 * @param level the heading level used for this detector's title
 */
private void detectorAsHTML(Detector d, StringBuffer html, int level) {
    html.append("<h").append(level).append(">");

    final String className = d.getClass().getName();
    // Everything after the last dot is the simple (unqualified) name.
    final String simpleName = className.substring(className.lastIndexOf('.') + 1);
    html.append(simpleName);
    html.append("</h").append(level).append(">");

    html.append("<p>Class: ").append(className).append("</p>");

    if (!(d instanceof CompositeDetector)) {
        return;
    }

    html.append("<p>Composite Detector</p>");
    for (Detector child : ((CompositeDetector) d).getDetectors()) {
        detectorAsHTML(child, html, level + 1);
    }
}
use of org.apache.tika.detect.Detector in project tika by apache.
the class TikaDetectors method renderDetector.
/**
 * Appends a plain-text line describing a detector to the given buffer.
 *
 * <p>The line is prefixed with {@code indent} spaces and contains the fully
 * qualified class name. A {@link CompositeDetector} is additionally labelled
 * as such, and each of its child detectors is rendered recursively with the
 * indentation increased by one.
 *
 * @param d      the detector to describe
 * @param text   the buffer the text is appended to
 * @param indent the number of leading spaces for this detector's line
 */
private void renderDetector(Detector d, StringBuffer text, int indent) {
    final String className = d.getClass().getName();

    for (int pad = 0; pad < indent; pad++) {
        text.append(" ");
    }
    text.append(className);

    if (!(d instanceof CompositeDetector)) {
        text.append("\n");
        return;
    }

    text.append(" (Composite Detector):\n");
    for (Detector child : ((CompositeDetector) d).getDetectors()) {
        renderDetector(child, text, indent + 1);
    }
}
use of org.apache.tika.detect.Detector in project tika by apache.
the class RFC822ParserTest method testExtractAttachments.
/**
 * Parses an email with a PNG attachment twice: first with a plain parse
 * context, then with an {@link EmbeddedDocumentExtractor} registered that
 * records the detected type and the extracted text of every embedded part.
 */
@Test
public void testExtractAttachments() throws Exception {
    final String emailResource = "test-documents/testEmailWithPNGAtt.eml";

    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata emailMetadata = new Metadata();
    Parser rfc822Parser = new RFC822Parser();
    ParseContext parseContext = new ParseContext();

    try (InputStream in = getStream(emailResource)) {
        rfc822Parser.parse(in, bodyHandler, emailMetadata, parseContext);
    }

    // Check we got the metadata
    assertEquals("Tika Test <XXXX@apache.org>", emailMetadata.get(Metadata.MESSAGE_FROM));
    assertEquals("Test Attachment Email", emailMetadata.get(TikaCoreProperties.TITLE));

    // Try again, this time detecting and parsing each attachment
    final Detector detector = new DefaultDetector();
    final Parser extParser = new AutoDetectParser();
    final List<MediaType> seenTypes = new ArrayList<>();
    final List<String> seenText = new ArrayList<>();

    EmbeddedDocumentExtractor recordingExtractor = new EmbeddedDocumentExtractor() {
        @Override
        public boolean shouldParseEmbedded(Metadata metadata) {
            return true;
        }

        @Override
        public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata,
                                  boolean outputHtml) throws SAXException, IOException {
            // Record the detected type first, then the text the auto-detect parser extracts.
            MediaType detectedType = detector.detect(stream, metadata);
            seenTypes.add(detectedType);

            ContentHandler embeddedHandler = new BodyContentHandler();
            try {
                extParser.parse(stream, embeddedHandler, metadata, new ParseContext());
            } catch (TikaException e) {
                throw new RuntimeException(e);
            }
            seenText.add(embeddedHandler.toString());
        }
    };
    parseContext.set(EmbeddedDocumentExtractor.class, recordingExtractor);

    try (InputStream in = getStream(emailResource)) {
        rfc822Parser.parse(in, bodyHandler, emailMetadata, parseContext);
    }

    // Check we got the metadata
    assertEquals("Tika Test <XXXX@apache.org>", emailMetadata.get(Metadata.MESSAGE_FROM));
    assertEquals("Test Attachment Email", emailMetadata.get(TikaCoreProperties.TITLE));

    // Check attachments
    assertEquals(2, seenTypes.size());
    assertEquals(2, seenText.size());
    assertEquals("text/plain", seenTypes.get(0).toString());
    assertEquals("image/png", seenTypes.get(1).toString());
    assertEquals("This email has a PNG attachment included in it\n\n", seenText.get(0));
}
use of org.apache.tika.detect.Detector in project ddf by codice.
the class OperationsMetacardSupport method guessMimeType.
/**
 * Tries to replace a generic default MIME type with a more specific one.
 *
 * <p>If {@code mimeTypeRaw} is not {@code ContentItem.DEFAULT_MIME_TYPE} it is
 * returned unchanged. Otherwise the following steps are applied in order:
 * <ol>
 *   <li>the framework's MIME type mapper is asked for a guess based on the
 *       content and the extension of {@code fileName};</li>
 *   <li>if the type is still the default, a {@code DefaultProbDetector} is
 *       run over the content;</li>
 *   <li>if the type is now {@code text/plain}, the first non-blank line is
 *       inspected: a leading {@code <} yields {@code text/xml}, a leading
 *       <code>{</code> or {@code [} yields {@code application/json}.</li>
 * </ol>
 * Failures in the individual steps are logged at debug level and the current
 * value is kept, except for an {@link IOException} raised while opening or
 * closing the content for the mapper in step 1, which propagates.
 *
 * @param mimeTypeRaw    the MIME type known so far
 * @param fileName       the file name whose extension is passed to the mapper
 * @param tmpContentPath path of the file holding the content
 * @return the refined MIME type, or {@code mimeTypeRaw} if no better guess was found
 * @throws IOException if the content cannot be opened or closed in step 1
 */
// package-private for unit testing
String guessMimeType(String mimeTypeRaw, String fileName, Path tmpContentPath) throws IOException {
    if (!ContentItem.DEFAULT_MIME_TYPE.equals(mimeTypeRaw)) {
        return mimeTypeRaw;
    }

    // Step 1: mapper-based guess from content plus file extension.
    try (InputStream inputStreamMessageCopy =
            com.google.common.io.Files.asByteSource(tmpContentPath.toFile()).openStream()) {
        String mimeTypeGuess = frameworkProperties.getMimeTypeMapper()
                .guessMimeType(inputStreamMessageCopy, FilenameUtils.getExtension(fileName));
        if (StringUtils.isNotEmpty(mimeTypeGuess)) {
            mimeTypeRaw = mimeTypeGuess;
        }
    } catch (MimeTypeResolutionException e) {
        LOGGER.debug(MIME_TYPE_MSG, e);
    }

    // Step 2: content-based detection, only if the mapper did not help.
    if (ContentItem.DEFAULT_MIME_TYPE.equals(mimeTypeRaw)) {
        Detector detector = new DefaultProbDetector();
        try (InputStream inputStreamMessageCopy = TikaInputStream.get(tmpContentPath)) {
            MediaType mediaType = detector.detect(inputStreamMessageCopy, new Metadata());
            mimeTypeRaw = mediaType.toString();
        } catch (IOException e) {
            LOGGER.debug(MIME_TYPE_MSG, e);
        }
    }

    // Step 3: plain text may really be XML or JSON; peek at the first non-blank line.
    if (mimeTypeRaw.equals("text/plain")) {
        try (InputStream inputStreamMessageCopy =
                    com.google.common.io.Files.asByteSource(tmpContentPath.toFile()).openStream();
                BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(
                        inputStreamMessageCopy, java.nio.charset.StandardCharsets.UTF_8))) {
            String line = bufferedReader.lines()
                    .map(String::trim)
                    .filter(StringUtils::isNotEmpty)
                    .findFirst()
                    .orElse("");
            if (line.startsWith("<")) {
                mimeTypeRaw = "text/xml";
            } else if (line.startsWith("{") || line.startsWith("[")) {
                mimeTypeRaw = "application/json";
            }
        } catch (IOException | java.io.UncheckedIOException e) {
            // BufferedReader.lines() reports read failures as UncheckedIOException;
            // treat them like any other I/O failure in this best-effort step.
            LOGGER.debug(MIME_TYPE_MSG, e);
        }
    }

    return mimeTypeRaw;
}
use of org.apache.tika.detect.Detector in project winery by eclipse.
the class BackendUtils method getMimeType.
/**
 * Detects the media type of a stream using the detector of a freshly created
 * {@link AutoDetectParser}. The file name is handed to the detector as the
 * resource-name metadata entry.
 *
 * <p>NOTE(review): this method does not mark or reset the stream itself; any
 * mark/reset handling is left to the detector. Confirm against the Tika
 * {@code Detector} contract before relying on the stream position afterwards.
 *
 * @param bis the stream whose content is examined
 * @param fn  the file name of the file belonging to the stream
 * @return the media type reported by the detector
 * @throws IOException if the detector fails to read from the stream
 */
public static MediaType getMimeType(BufferedInputStream bis, String fn) throws IOException {
    final Detector autoDetector = new AutoDetectParser().getDetector();

    final Metadata hints = new Metadata();
    hints.add(Metadata.RESOURCE_NAME_KEY, fn);

    return autoDetector.detect(bis, hints);
}
Aggregations