use of org.apache.tika.io.TikaInputStream in project Xponents by OpenSextant.
the class EmbeddedContentConverter method conversionImplementation.
/**
 * Convert Embedded documents in the supported types to a folder of the embedded items.
 * Trivial embedded icons and other components will not be extracted.
 *
 * <p>The parent conversion is always performed first via {@code super}. When the file
 * extension is supported, the file is re-opened from {@code doc} (not from {@code in})
 * and handed to the container extractor. If that yields raw children, their rendered
 * text is appended to the parent text under an "==Embedded Objects==" separator.
 *
 * @param in  stream passed through to the parent conversion
 * @param doc file on disk; its name decides whether embedded extraction is attempted
 * @return the converted parent document, possibly enriched with embedded-object text
 * @throws IOException if the parent conversion fails, or wrapping any failure while
 *                     opening, parsing or closing the Tika stream
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException {
    ConvertedDocument compoundDoc = super.conversionImplementation(in, doc);
    String ext = FilenameUtils.getExtension(doc.getName());
    if (!isSupported(ext)) {
        // Not really compound by our standards here.
        return compoundDoc;
    }

    ParserContainerExtractor extractor = new ParserContainerExtractor();
    EmbeddedObjectExtractor objExtractor = new EmbeddedObjectExtractor(compoundDoc, true);

    // try-with-resources: if TikaInputStream.get() itself fails there is nothing to close,
    // so the original failure is reported instead of a NullPointerException from close().
    try (TikaInputStream tikaStream = TikaInputStream.get(doc.toPath())) {
        extractor.extract(tikaStream, extractor, objExtractor);
        compoundDoc.is_converted = true;

        if (compoundDoc.hasRawChildren()) {
            // Create text buffer for this compound document here.
            // If raw children should be post-processed by some other means, that is up to caller.
            // This parent document at least contains a complete text representation of the
            // content in the original doc.
            StringBuilder completeText = new StringBuilder();
            completeText.append(compoundDoc.getText());
            completeText.append("\n==Embedded Objects==\n");
            completeText.append(renderText(compoundDoc.getRawChildren()));
            compoundDoc.setText(completeText.toString());
        }
        // With no raw children the plain parent conversion is returned as-is.
        return compoundDoc;
    } catch (Exception e) {
        throw new IOException("Stream parsing problem", e);
    }
}
use of org.apache.tika.io.TikaInputStream in project nifi by apache.
the class ExtractMediaMetadata method tika_parse.
/**
 * Parses the given stream with the auto-detecting Tika parser and returns the
 * extracted metadata as a flat attribute map.
 *
 * <p>Keys that do not match the configured key filter (when one is set) are skipped.
 * Multi-valued entries are joined with ", "; once adding the next value would reach
 * {@code maxAttribLen}, "..." is appended and the remaining values are dropped.
 * Single-valued entries are copied without length limiting. All values are trimmed.
 *
 * @param sourceStream content to parse; wrapped in a TikaInputStream that is closed here
 * @param prefix       optional prefix prepended to every result key, may be {@code null}
 * @param maxAttribs   optional cap on the number of returned entries, may be {@code null}
 * @param maxAttribLen length limit for joined multi-valued entries; must not be
 *                     {@code null} because it is unboxed when a multi-valued key is met
 * @return map of (optionally prefixed) metadata key to value
 * @throws IOException   if reading or closing the stream fails
 * @throws TikaException if Tika cannot parse the content
 * @throws SAXException  if the SAX handler reports an error
 */
private Map<String, String> tika_parse(InputStream sourceStream, String prefix, Integer maxAttribs, Integer maxAttribLen) throws IOException, TikaException, SAXException {
    final Metadata metadata = new Metadata();
    try (TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream)) {
        autoDetectParser.parse(tikaInputStream, new DefaultHandler(), metadata);
    }

    final Map<String, String> results = new HashMap<>();
    final Pattern metadataKeyFilter = metadataKeyFilterRef.get();
    final StringBuilder dataBuilder = new StringBuilder();

    for (final String key : metadata.names()) {
        if (metadataKeyFilter != null && !metadataKeyFilter.matcher(key).matches()) {
            continue;
        }

        // the builder is reused across keys
        dataBuilder.setLength(0);
        if (metadata.isMultiValued(key)) {
            for (String val : metadata.getValues(key)) {
                // Separate from whatever is already buffered. Checking for "> 0" (not "> 1")
                // keeps the separator when the first value is a single character.
                if (dataBuilder.length() > 0) {
                    dataBuilder.append(", ");
                }
                if (dataBuilder.length() + val.length() < maxAttribLen) {
                    dataBuilder.append(val);
                } else {
                    dataBuilder.append("...");
                    break;
                }
            }
        } else {
            dataBuilder.append(metadata.get(key));
        }

        final String attributeName = (prefix == null) ? key : prefix + key;
        results.put(attributeName, dataBuilder.toString().trim());

        // cutoff at max if provided
        if (maxAttribs != null && results.size() >= maxAttribs) {
            break;
        }
    }
    return results;
}
use of org.apache.tika.io.TikaInputStream in project nifi by apache.
the class IdentifyMimeType method onTrigger.
/**
 * Detects the MIME type of the next FlowFile's content and records it, together with
 * the matching file extension, as FlowFile attributes before routing to success.
 *
 * <p>When no MIME type was captured, the FlowFile is tagged as
 * "application/octet-stream" with an empty extension.
 *
 * @param context provides the USE_FILENAME_IN_DETECTION property
 * @param session session used to read, update and transfer the FlowFile
 */
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }

    final ComponentLog logger = getLogger();
    final AtomicReference<String> mimeTypeRef = new AtomicReference<>(null);
    final String filename = flowFile.getAttribute(CoreAttributes.FILENAME.key());

    session.read(flowFile, new InputStreamCallback() {
        @Override
        public void process(final InputStream stream) throws IOException {
            // The TikaInputStream is managed alongside the buffered stream so that anything
            // it allocates during detection is released when the callback finishes.
            try (final InputStream in = new BufferedInputStream(stream);
                    final TikaInputStream tikaStream = TikaInputStream.get(in)) {
                Metadata metadata = new Metadata();
                if (filename != null && context.getProperty(USE_FILENAME_IN_DETECTION).asBoolean()) {
                    // give the detector the file name as an additional hint
                    metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
                }
                // Get mime type
                MediaType mediatype = detector.detect(tikaStream, metadata);
                mimeTypeRef.set(mediatype.toString());
            }
        }
    });

    String mimeType = mimeTypeRef.get();
    String extension = "";
    try {
        MimeType mimetype = config.getMimeRepository().forName(mimeType);
        extension = mimetype.getExtension();
    } catch (MimeTypeException ex) {
        logger.warn("MIME type extension lookup failed: {}", new Object[] { ex });
    }

    // Workaround for bug in Tika - https://issues.apache.org/jira/browse/TIKA-1563
    if ("application/gzip".equals(mimeType) && ".tgz".equals(extension)) {
        extension = ".gz";
    }

    if (mimeType == null) {
        flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/octet-stream");
        flowFile = session.putAttribute(flowFile, "mime.extension", "");
        logger.info("Unable to identify MIME Type for {}; setting to application/octet-stream", new Object[] { flowFile });
    } else {
        flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), mimeType);
        flowFile = session.putAttribute(flowFile, "mime.extension", extension);
        logger.info("Identified {} as having MIME Type {}", new Object[] { flowFile, mimeType });
    }

    session.getProvenanceReporter().modifyAttributes(flowFile);
    session.transfer(flowFile, REL_SUCCESS);
}
use of org.apache.tika.io.TikaInputStream in project nifi by apache.
the class ContentViewerController method doGet.
/**
 * Gets the content and defers to registered viewers to generate the markup.
 *
 * <p>Flow: resolve the content request, fetch the content, pick the display mode
 * (defaulting to {@code Original}), determine the MIME type (detected with Tika when the
 * reported type is missing or octet-stream), render the header, then either the hex view
 * or the viewer registered for the normalized MIME type as a servlet-context init
 * parameter, and finally the footer. Any failure along the way is forwarded to the
 * "/message" page of the "/nifi" context.
 *
 * @param request servlet request
 * @param response servlet response
 * @throws ServletException if a servlet-specific error occurs
 * @throws IOException if an I/O error occurs
 */
@Override
protected void doGet(final HttpServletRequest request, final HttpServletResponse response) throws ServletException, IOException {
    // specify the charset in a response header
    response.addHeader("Content-Type", "text/html; charset=UTF-8");

    // get the content
    final ServletContext servletContext = request.getServletContext();
    final ContentAccess contentAccess = (ContentAccess) servletContext.getAttribute("nifi-content-access");

    final ContentRequestContext contentRequest;
    try {
        contentRequest = getContentRequest(request);
    } catch (final Exception e) {
        forwardToMessagePage(request, response, servletContext, "Error", "Unable to interpret content request.");
        return;
    }

    if (contentRequest.getDataUri() == null) {
        forwardToMessagePage(request, response, servletContext, "Error", "The data reference must be specified.");
        return;
    }

    // get the content
    final DownloadableContent downloadableContent;
    try {
        downloadableContent = contentAccess.getContent(contentRequest);
    } catch (final ResourceNotFoundException rnfe) {
        forwardToMessagePage(request, response, servletContext, "Error", "Unable to find the specified content");
        return;
    } catch (final AccessDeniedException ade) {
        forwardToMessagePage(request, response, servletContext, "Access Denied",
                "Unable to approve access to the specified content: " + ade.getMessage());
        return;
    } catch (final Exception e) {
        forwardToMessagePage(request, response, servletContext, "Error",
                "An unexpected error has occurred: " + e.getMessage());
        return;
    }

    // determine how we want to view the data
    String mode = request.getParameter("mode");

    // if the name isn't set, use original
    if (mode == null) {
        mode = DisplayMode.Original.name();
    }

    // determine the display mode
    final DisplayMode displayMode;
    try {
        displayMode = DisplayMode.valueOf(mode);
    } catch (final IllegalArgumentException iae) {
        forwardToMessagePage(request, response, servletContext, "Error", "Invalid display mode: " + mode);
        return;
    }

    // buffer the content to support resetting in case we need to detect the content type or char encoding
    try (final BufferedInputStream bis = new BufferedInputStream(downloadableContent.getContent())) {
        final String mimeType;
        final String normalizedMimeType;

        // when clustered and we don't know the type set to octet stream since the content was retrieved from the node's rest endpoint
        if (downloadableContent.getType() == null || StringUtils.startsWithIgnoreCase(downloadableContent.getType(), MediaType.OCTET_STREAM.toString())) {
            // attempt to detect the content stream if we don't know what it is
            final DefaultDetector detector = new DefaultDetector();

            // create the stream for tika to process, buffered to support resetting.
            // It is intentionally not closed here: closing it would also close bis,
            // which is still needed to render the content below.
            final TikaInputStream tikaStream = TikaInputStream.get(bis);

            // provide a hint based on the filename
            final Metadata metadata = new Metadata();
            metadata.set(Metadata.RESOURCE_NAME_KEY, downloadableContent.getFilename());

            // Get mime type
            final MediaType mediatype = detector.detect(tikaStream, metadata);
            mimeType = mediatype.toString();
        } else {
            mimeType = downloadableContent.getType();
        }

        // Extract only mime type and subtype from content type (anything after the first ; are parameters)
        // Lowercase so subsequent code does not need to implement case insensitivity.
        // Locale.ROOT keeps the result independent of the JVM's default locale.
        normalizedMimeType = mimeType.split(";", 2)[0].toLowerCase(java.util.Locale.ROOT);

        // add attributes needed for the header
        request.setAttribute("filename", downloadableContent.getFilename());
        request.setAttribute("contentType", mimeType);

        // generate the header
        request.getRequestDispatcher("/WEB-INF/jsp/header.jsp").include(request, response);

        // remove the attributes needed for the header
        request.removeAttribute("filename");
        request.removeAttribute("contentType");

        // generate the markup for the content based on the display mode
        if (DisplayMode.Hex.equals(displayMode)) {
            final byte[] buffer = new byte[BUFFER_LENGTH];
            final int read = StreamUtils.fillBuffer(bis, buffer, false);

            // trim the byte array if necessary
            byte[] bytes = buffer;
            if (read != buffer.length) {
                bytes = new byte[read];
                System.arraycopy(buffer, 0, bytes, 0, read);
            }

            // convert bytes into the base 64 bytes
            final String base64 = Base64.encodeBase64String(bytes);

            // defer to the jsp
            request.setAttribute("content", base64);
            request.getRequestDispatcher("/WEB-INF/jsp/hexview.jsp").include(request, response);
        } else {
            // lookup a viewer for the content
            final String contentViewerUri = servletContext.getInitParameter(normalizedMimeType);

            // handle no viewer for content type
            if (contentViewerUri == null) {
                request.getRequestDispatcher("/WEB-INF/jsp/no-viewer.jsp").include(request, response);
            } else {
                // create a request attribute for accessing the content
                request.setAttribute(ViewableContent.CONTENT_REQUEST_ATTRIBUTE, new ViewableContent() {
                    @Override
                    public InputStream getContentStream() {
                        return bis;
                    }

                    @Override
                    public String getContent() throws IOException {
                        // detect the charset
                        final CharsetDetector detector = new CharsetDetector();
                        detector.setText(bis);
                        detector.enableInputFilter(true);
                        final CharsetMatch match = detector.detect();

                        // ensure we were able to detect the charset
                        if (match == null) {
                            throw new IOException("Unable to detect character encoding.");
                        }

                        // convert the stream using the detected charset
                        return IOUtils.toString(bis, match.getName());
                    }

                    @Override
                    public ViewableContent.DisplayMode getDisplayMode() {
                        return displayMode;
                    }

                    @Override
                    public String getFileName() {
                        return downloadableContent.getFilename();
                    }

                    @Override
                    public String getContentType() {
                        return normalizedMimeType;
                    }

                    @Override
                    public String getRawContentType() {
                        return mimeType;
                    }
                });

                try {
                    // generate the content
                    final ServletContext viewerContext = servletContext.getContext(contentViewerUri);
                    viewerContext.getRequestDispatcher("/view-content").include(request, response);
                } catch (final Exception e) {
                    String message = e.getMessage() != null ? e.getMessage() : e.toString();
                    message = "Unable to generate view of data: " + message;

                    // log the error; the stack trace is only emitted when debug is enabled
                    logger.error(message);
                    if (logger.isDebugEnabled()) {
                        logger.error(StringUtils.EMPTY, e);
                    }

                    forwardToMessagePage(request, response, servletContext, "Error", message);
                    return;
                }

                // remove the request attribute
                request.removeAttribute(ViewableContent.CONTENT_REQUEST_ATTRIBUTE);
            }
        }

        // generate footer
        request.getRequestDispatcher("/WEB-INF/jsp/footer.jsp").include(request, response);
    }
}

/**
 * Populates the "title" and "messages" request attributes and forwards the request to
 * the "/message" page of the "/nifi" servlet context.
 *
 * @param request servlet request
 * @param response servlet response
 * @param servletContext context of the current request, used to resolve "/nifi"
 * @param title value for the "title" request attribute
 * @param message value for the "messages" request attribute
 * @throws ServletException if the forward fails with a servlet-specific error
 * @throws IOException if the forward fails with an I/O error
 */
private void forwardToMessagePage(final HttpServletRequest request, final HttpServletResponse response,
        final ServletContext servletContext, final String title, final String message) throws ServletException, IOException {
    request.setAttribute("title", title);
    request.setAttribute("messages", message);

    // forward to the error page
    final ServletContext viewerContext = servletContext.getContext("/nifi");
    viewerContext.getRequestDispatcher("/message").forward(request, response);
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class ExternalParser method parse.
/**
 * Runs the configured external command against the given stream and feeds its output
 * into the XHTML handler and/or the metadata object.
 *
 * <p>Command tokens containing INPUT_FILE_TOKEN are rewritten to the path of the stream's
 * backing file (the input is then not piped to stdin). Tokens containing
 * OUTPUT_FILE_TOKEN are rewritten to a fresh temporary file, which is read back after the
 * process has finished instead of consuming stdout as content.
 *
 * @param stream   document to hand to the external command
 * @param xhtml    handler receiving the extracted output
 * @param metadata metadata object populated when metadata patterns are configured
 * @param tmp      source of the temporary output file
 * @throws IOException   if the command cannot be started or a stream operation fails
 * @throws SAXException  if the content handler reports an error
 * @throws TikaException if extraction fails
 */
private void parse(TikaInputStream stream, XHTMLContentHandler xhtml, Metadata metadata, TemporaryResources tmp) throws IOException, SAXException, TikaException {
    boolean inputToStdIn = true;
    boolean outputFromStdOut = true;
    boolean hasPatterns = (metadataPatterns != null && !metadataPatterns.isEmpty());
    File output = null;

    // Build our command
    String[] cmd;
    if (command.length == 1) {
        cmd = command[0].split(" ");
    } else {
        cmd = new String[command.length];
        System.arraycopy(command, 0, cmd, 0, command.length);
    }
    for (int i = 0; i < cmd.length; i++) {
        if (cmd[i].contains(INPUT_FILE_TOKEN)) {
            cmd[i] = cmd[i].replace(INPUT_FILE_TOKEN, stream.getFile().getPath());
            inputToStdIn = false;
        }
        if (cmd[i].contains(OUTPUT_FILE_TOKEN)) {
            output = tmp.createTemporaryFile();
            outputFromStdOut = false;
            cmd[i] = cmd[i].replace(OUTPUT_FILE_TOKEN, output.getPath());
        }
    }

    // Execute. A start-up failure is reported to the caller instead of being printed and
    // then surfacing later as a NullPointerException on the missing process.
    final Process process;
    try {
        if (cmd.length == 1) {
            process = Runtime.getRuntime().exec(cmd[0]);
        } else {
            process = Runtime.getRuntime().exec(cmd);
        }
    } catch (IOException e) {
        throw new IOException("Unable to start external parser command: " + cmd[0], e);
    }

    try {
        if (inputToStdIn) {
            sendInput(process, stream);
        } else {
            // nothing to pipe in; close stdin so the child does not wait for input
            process.getOutputStream().close();
        }

        InputStream out = process.getInputStream();
        InputStream err = process.getErrorStream();

        if (hasPatterns) {
            extractMetadata(err, metadata);
            if (outputFromStdOut) {
                extractOutput(out, xhtml);
            } else {
                extractMetadata(out, metadata);
            }
        } else {
            ignoreStream(err);
            if (outputFromStdOut) {
                extractOutput(out, xhtml);
            } else {
                ignoreStream(out);
            }
        }
    } finally {
        try {
            process.waitFor();
        } catch (InterruptedException ignored) {
            // keep the original best-effort behaviour, but restore the interrupt flag so
            // callers further up the stack can still observe the interruption
            Thread.currentThread().interrupt();
        }
    }

    // Grab the output if we haven't already
    if (!outputFromStdOut) {
        // NOTE(review): extractOutput may already close the stream it is given; closing it
        // here as well guarantees the file handle is released either way.
        try (InputStream fileOutput = new FileInputStream(output)) {
            extractOutput(fileOutput, xhtml);
        }
    }
}
Aggregations