use of com.ibm.icu.text.CharsetMatch in project Xponents by OpenSextant.
the class TextTranscodingConverter method conversionImplementation.
/**
 * Reads a plain-text payload, works out its character encoding and decodes it
 * into a {@link ConvertedDocument}.
 *
 * <ul>
 * <li>Pure ASCII: the text is accepted as is, the encoding is recorded as
 * "ASCII" and {@code do_convert} is set to false.</li>
 * <li>Detected charset equals {@code ConvertedDocument.OUTPUT_ENCODING}:
 * {@code do_convert} is set to false.</li>
 * <li>CAVEAT: payload shorter than {@code IGNORE_THRESHOLD_SIZE} AND detection
 * confidence below {@code IGNORE_THRESHOLD_CONF}: {@code do_convert} is also set
 * to false; the payload is treated as a plain text file.</li>
 * <li>Otherwise {@code do_convert} is left untouched by this method.</li>
 * </ul>
 *
 * In the non-ASCII cases the text is decoded with the detected charset and that
 * charset name is recorded on the document.
 *
 * @param in  stream to read from when no file is given; closed by this method
 *            when non-null
 * @param doc file to read from; takes precedence over {@code in} when non-null
 * @return the document carrying the decoded text and encoding metadata
 * @throws IOException if neither source is supplied, the payload cannot be read,
 *                     no charset can be detected, or the detected charset is not
 *                     supported by the JVM
 */
@Override
protected ConvertedDocument conversionImplementation(java.io.InputStream in, java.io.File doc) throws IOException {
    ConvertedDocument textdoc = new ConvertedDocument(doc);

    byte[] data;
    // try-with-resources tolerates a null resource and guarantees the caller's
    // stream is closed even when reading fails part-way.
    try (java.io.InputStream closing = in) {
        if (doc != null) {
            // The file is the preferred source whenever it is available.
            data = FileUtility.readBytesFrom(doc);
        } else if (in != null) {
            data = IOUtils.toByteArray(in);
        } else {
            throw new IOException("No input stream or file was provided for conversion");
        }
    }

    // Encoding heuristics.
    //
    // Objective: mark small plain-text payloads with an unknown character set
    // as not worth converting and leave them as plain text; they might even be
    // straight Unicode.
    //
    // Test for ASCII first, otherwise try to detect the best charset for the text.
    textdoc.is_plaintext = true;

    if (TextUtils.isASCII(data)) {
        textdoc.do_convert = false;
        textdoc.setEncoding("ASCII");
        // Decode explicitly as US-ASCII rather than with the platform default charset.
        textdoc.setText(new String(data, java.nio.charset.StandardCharsets.US_ASCII));
        return textdoc;
    }

    chardet.setText(data);
    CharsetMatch cs = chardet.detect();
    if (cs == null) {
        // Without a match there is no charset to decode with.
        throw new IOException("Unable to detect character encoding.");
    }

    String charsetName = cs.getName();
    boolean alreadyOutputEncoding = ConvertedDocument.OUTPUT_ENCODING.equalsIgnoreCase(charsetName);
    boolean shortAndUncertain = data.length < IGNORE_THRESHOLD_SIZE && cs.getConfidence() < IGNORE_THRESHOLD_CONF;
    if (alreadyOutputEncoding || shortAndUncertain) {
        textdoc.do_convert = false;
    }

    textdoc.setEncoding(charsetName);
    textdoc.setText(new String(data, charsetName));
    return textdoc;
}
use of com.ibm.icu.text.CharsetMatch in project nifi by apache.
the class ContentViewerController method doGet.
/**
 * Gets the content and defers to registered viewers to generate the markup.
 *
 * The request is processed in this order: resolve the content request, fetch the
 * content, resolve the display mode, determine the mime type (detecting it when
 * it is unknown or octet-stream), render the header, render either the hex view
 * or the viewer registered for the normalized mime type, then render the footer.
 * Any failure along the way forwards to the message page and ends the request.
 *
 * @param request servlet request
 * @param response servlet response
 * @throws ServletException if a servlet-specific error occurs
 * @throws IOException if an I/O error occurs
 */
@Override
protected void doGet(final HttpServletRequest request, final HttpServletResponse response) throws ServletException, IOException {
    // specify the charset in a response header
    response.addHeader("Content-Type", "text/html; charset=UTF-8");

    // get the content
    final ServletContext servletContext = request.getServletContext();
    final ContentAccess contentAccess = (ContentAccess) servletContext.getAttribute("nifi-content-access");

    final ContentRequestContext contentRequest;
    try {
        contentRequest = getContentRequest(request);
    } catch (final Exception e) {
        forwardToMessagePage(request, response, servletContext, "Error", "Unable to interpret content request.");
        return;
    }

    if (contentRequest.getDataUri() == null) {
        forwardToMessagePage(request, response, servletContext, "Error", "The data reference must be specified.");
        return;
    }

    // get the content
    final DownloadableContent downloadableContent;
    try {
        downloadableContent = contentAccess.getContent(contentRequest);
    } catch (final ResourceNotFoundException rnfe) {
        forwardToMessagePage(request, response, servletContext, "Error", "Unable to find the specified content");
        return;
    } catch (final AccessDeniedException ade) {
        forwardToMessagePage(request, response, servletContext, "Access Denied",
                "Unable to approve access to the specified content: " + ade.getMessage());
        return;
    } catch (final Exception e) {
        forwardToMessagePage(request, response, servletContext, "Error", "An unexpected error has occurred: " + e.getMessage());
        return;
    }

    // determine how we want to view the data
    String mode = request.getParameter("mode");

    // if the name isn't set, use original
    if (mode == null) {
        mode = DisplayMode.Original.name();
    }

    // determine the display mode
    final DisplayMode displayMode;
    try {
        displayMode = DisplayMode.valueOf(mode);
    } catch (final IllegalArgumentException iae) {
        forwardToMessagePage(request, response, servletContext, "Error", "Invalid display mode: " + mode);
        return;
    }

    // buffer the content to support resetting in case we need to detect the content type or char encoding
    try (final BufferedInputStream bis = new BufferedInputStream(downloadableContent.getContent())) {
        final String mimeType;
        final String normalizedMimeType;

        // when clustered and we don't know the type set to octet stream since the content was retrieved from the node's rest endpoint
        if (downloadableContent.getType() == null || StringUtils.startsWithIgnoreCase(downloadableContent.getType(), MediaType.OCTET_STREAM.toString())) {
            // attempt to detect the content stream if we don't know what it is
            final DefaultDetector detector = new DefaultDetector();

            // create the stream for tika to process, buffered to support resetting
            final TikaInputStream tikaStream = TikaInputStream.get(bis);

            // provide a hint based on the filename
            final Metadata metadata = new Metadata();
            metadata.set(Metadata.RESOURCE_NAME_KEY, downloadableContent.getFilename());

            // Get mime type
            final MediaType mediatype = detector.detect(tikaStream, metadata);
            mimeType = mediatype.toString();
        } else {
            mimeType = downloadableContent.getType();
        }

        // Extract only mime type and subtype from content type (anything after the first ; are parameters).
        // Lowercase so subsequent code does not need to implement case insensitivity; Locale.ROOT keeps the
        // result independent of the JVM default locale (e.g. the Turkish dotless i).
        normalizedMimeType = mimeType.split(";", 2)[0].toLowerCase(java.util.Locale.ROOT);

        // add attributes needed for the header
        request.setAttribute("filename", downloadableContent.getFilename());
        request.setAttribute("contentType", mimeType);

        // generate the header
        request.getRequestDispatcher("/WEB-INF/jsp/header.jsp").include(request, response);

        // remove the attributes needed for the header
        request.removeAttribute("filename");
        request.removeAttribute("contentType");

        // generate the markup for the content based on the display mode
        if (DisplayMode.Hex.equals(displayMode)) {
            final byte[] buffer = new byte[BUFFER_LENGTH];
            final int read = StreamUtils.fillBuffer(bis, buffer, false);

            // trim the byte array if necessary
            byte[] bytes = buffer;
            if (read != buffer.length) {
                bytes = new byte[read];
                System.arraycopy(buffer, 0, bytes, 0, read);
            }

            // convert bytes into the base 64 bytes
            final String base64 = Base64.encodeBase64String(bytes);

            // defer to the jsp
            request.setAttribute("content", base64);
            request.getRequestDispatcher("/WEB-INF/jsp/hexview.jsp").include(request, response);
        } else {
            // lookup a viewer for the content
            final String contentViewerUri = servletContext.getInitParameter(normalizedMimeType);

            // handle no viewer for content type
            if (contentViewerUri == null) {
                request.getRequestDispatcher("/WEB-INF/jsp/no-viewer.jsp").include(request, response);
            } else {
                // create a request attribute for accessing the content
                request.setAttribute(ViewableContent.CONTENT_REQUEST_ATTRIBUTE, new ViewableContent() {
                    @Override
                    public InputStream getContentStream() {
                        return bis;
                    }

                    @Override
                    public String getContent() throws IOException {
                        // detect the charset
                        final CharsetDetector detector = new CharsetDetector();
                        detector.setText(bis);
                        detector.enableInputFilter(true);
                        final CharsetMatch match = detector.detect();

                        // ensure we were able to detect the charset
                        if (match == null) {
                            throw new IOException("Unable to detect character encoding.");
                        }

                        // convert the stream using the detected charset
                        return IOUtils.toString(bis, match.getName());
                    }

                    @Override
                    public ViewableContent.DisplayMode getDisplayMode() {
                        return displayMode;
                    }

                    @Override
                    public String getFileName() {
                        return downloadableContent.getFilename();
                    }

                    @Override
                    public String getContentType() {
                        return normalizedMimeType;
                    }

                    @Override
                    public String getRawContentType() {
                        return mimeType;
                    }
                });

                try {
                    // generate the content
                    final ServletContext viewerContext = servletContext.getContext(contentViewerUri);
                    viewerContext.getRequestDispatcher("/view-content").include(request, response);
                } catch (final Exception e) {
                    String message = e.getMessage() != null ? e.getMessage() : e.toString();
                    message = "Unable to generate view of data: " + message;

                    // log the error; the stack trace is only emitted when debug logging is on
                    logger.error(message);
                    if (logger.isDebugEnabled()) {
                        logger.error(StringUtils.EMPTY, e);
                    }

                    forwardToMessagePage(request, response, servletContext, "Error", message);
                    return;
                }

                // remove the request attribute
                request.removeAttribute(ViewableContent.CONTENT_REQUEST_ATTRIBUTE);
            }
        }

        // generate footer
        request.getRequestDispatcher("/WEB-INF/jsp/footer.jsp").include(request, response);
    }
}

/**
 * Populates the "title" and "messages" request attributes and forwards the
 * request to the "/message" resource of the "/nifi" context.
 *
 * @param request servlet request
 * @param response servlet response
 * @param servletContext context used to look up the "/nifi" context
 * @param title value for the "title" request attribute
 * @param messages value for the "messages" request attribute
 * @throws ServletException if the forward fails with a servlet-specific error
 * @throws IOException if the forward fails with an I/O error
 */
private void forwardToMessagePage(final HttpServletRequest request, final HttpServletResponse response,
        final ServletContext servletContext, final String title, final String messages) throws ServletException, IOException {
    request.setAttribute("title", title);
    request.setAttribute("messages", messages);

    // forward to the error page
    final ServletContext viewerContext = servletContext.getContext("/nifi");
    viewerContext.getRequestDispatcher("/message").forward(request, response);
}
use of com.ibm.icu.text.CharsetMatch in project UniversalMediaServer by UniversalMediaServer.
the class FileUtil method getFileCharsetName.
/**
 * Detects charset/encoding for given file. Not 100% accurate for
 * non-Unicode files.
 * <p>
 * The detected name is upper-cased with {@link java.util.Locale#ROOT}: charset
 * names are identifiers, so their casing must not vary with the user's locale
 * (a Turkish locale, for example, would upper-case {@code i} to a dotted
 * capital).
 *
 * @param file the file for which to detect charset/encoding
 * @return The upper-cased name of the detected charset or <code>null</code> if not detected
 * @throws IOException
 */
public static String getFileCharsetName(File file) throws IOException {
    CharsetMatch match = getFileCharsetMatch(file);
    if (match == null) {
        LOGGER.debug("Found no matching charset for file {}", file.getAbsolutePath());
        return null;
    }

    String charsetName = match.getName();
    LOGGER.debug("Detected charset \"{}\" in file {}", charsetName, file.getAbsolutePath());
    return charsetName.toUpperCase(java.util.Locale.ROOT);
}
use of com.ibm.icu.text.CharsetMatch in project data-access by pentaho.
the class CsvUtils method getEncoding.
/**
 * Detects the character encoding of a file by sampling up to its first 1024 bytes.
 * <p>
 * Names ending in ".tmp" are resolved under {@code TMP_FILE_PATH}; all other
 * names are resolved under the "file-upload-defaults/relative-path" system
 * setting, falling back to {@code DEFAULT_RELATIVE_UPLOAD_FILE_PATH}.
 *
 * @param fileName name of the file, appended to the resolved base path
 * @return the name of the best matching charset
 * @throws Exception if the file cannot be read or no charset can be detected;
 *                   the exception is logged before being rethrown
 */
public String getEncoding(String fileName) throws Exception {
    String path;
    if (fileName.endsWith(".tmp")) { // $NON-NLS-1$
        path = PentahoSystem.getApplicationContext().getSolutionPath(TMP_FILE_PATH);
    } else {
        String relativePath = PentahoSystem.getSystemSetting("file-upload-defaults/relative-path", // $NON-NLS-1$
                String.valueOf(DEFAULT_RELATIVE_UPLOAD_FILE_PATH));
        path = PentahoSystem.getApplicationContext().getSolutionPath(relativePath);
    }

    // NOTE(review): fileName is appended to the base path without validation. If it can
    // come from an untrusted caller, "../" segments could escape the upload directory.
    // Confirm that callers sanitise it.
    String fileLocation = path + fileName;

    // try-with-resources closes the stream even when reading or detection fails.
    try (InputStream inputStream = new FileInputStream(new File(fileLocation))) {
        byte[] buffer = new byte[1024];
        int filled = 0;
        int count;
        // A single read() may return fewer bytes than requested; keep reading until
        // the sample buffer is full or the end of the file is reached.
        while (filled < buffer.length
                && (count = inputStream.read(buffer, filled, buffer.length - filled)) != -1) {
            filled += count;
        }

        // Hand the detector only the bytes actually read, so that short files are not
        // padded with trailing NUL bytes that could skew detection.
        byte[] sample = filled == buffer.length ? buffer : java.util.Arrays.copyOf(buffer, filled);

        CharsetDetector charsetDetector = new CharsetDetector();
        charsetDetector.setText(sample);
        CharsetMatch charsetMatch = charsetDetector.detect();
        if (charsetMatch == null) {
            throw new java.io.IOException("Unable to detect character encoding of file " + fileName);
        }
        return charsetMatch.getName();
    } catch (Exception e) {
        log.error(e);
        throw e;
    }
}
use of com.ibm.icu.text.CharsetMatch in project yyl_example by Relucent.
the class Icu4jCharsetDetectorExample method getEncoding.
/**
 * Returns the name of the best charset match for the given bytes.
 *
 * @param data the bytes to analyse
 * @return the name of the first match after ordering with {@code COMPARATOR},
 *         or {@code DEFAULT_ENCODING} when the detector yields no match
 */
private static String getEncoding(byte[] data) {
    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);

    // Collect every possible charset match.
    CharsetMatch[] matches = detector.detectAll();

    // Guard before sorting: Arrays.sort would throw on a null array, which would
    // make the fallback unreachable.
    if (matches == null || matches.length == 0) {
        return DEFAULT_ENCODING;
    }

    // Sort by confidence and priority (higher confidence first; for equal
    // confidence, higher priority first).
    Arrays.sort(matches, COMPARATOR);
    return matches[0].getName();
}
Aggregations