Search in sources :

Example 1 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project Xponents by OpenSextant.

the class TextTranscodingConverter method conversionImplementation.

/**
 * Loads a text document and decides whether its bytes need transcoding.
 *
 * <ul>
 * <li>Pure ASCII content is accepted as is: no conversion, encoding "ASCII".</li>
 * <li>Content already detected as {@code ConvertedDocument.OUTPUT_ENCODING} is
 * not converted.</li>
 * <li>CAVEAT: a short payload (below {@code IGNORE_THRESHOLD_SIZE}) with a
 * low-confidence detection (below {@code IGNORE_THRESHOLD_CONF}) is also not
 * converted; it is treated as a plain text file.</li>
 * <li>Anything else keeps whatever {@code do_convert} value the new
 * {@code ConvertedDocument} started with.</li>
 * </ul>
 *
 * In every non-ASCII case the text is decoded with the detected charset.
 *
 * @param in  stream to read when no file is given; closed before returning if non-null
 * @param doc file to read the bytes from; takes precedence over {@code in} when non-null
 * @return the document carrying the decoded text, encoding name and conversion flag
 * @throws IOException if neither input is usable, reading fails, no charset can
 *                     be detected, or the detected charset is not supported
 */
@Override
protected ConvertedDocument conversionImplementation(java.io.InputStream in, java.io.File doc) throws IOException {
    ConvertedDocument textdoc = new ConvertedDocument(doc);
    byte[] data = null;
    try {
        // Get byte data from the file when available, otherwise from the stream.
        if (doc != null) {
            data = FileUtility.readBytesFrom(doc);
        } else if (in != null) {
            data = IOUtils.toByteArray(in);
        }
    } finally {
        // Close the caller's stream even when reading fails.
        if (in != null) {
            in.close();
        }
    }
    if (data == null) {
        // Fail with a declared exception instead of a NullPointerException further down.
        throw new IOException("No content could be read: neither a file nor an input stream was usable");
    }
    // Encoding heuristics here.....
    //
    // Objective:  mark small plain text payloads with unknown character set
    //             as not worthy of conversion.  Leave them as plain/text
    //             indeed they might even be straight Unicode
    //
    // Test for ASCII only first, otherwise try to detect the best charset for the text
    //
    textdoc.is_plaintext = true;
    if (TextUtils.isASCII(data)) {
        textdoc.do_convert = false;
        textdoc.setEncoding("ASCII");
        // Decode explicitly as ASCII rather than with the platform default charset.
        textdoc.setText(new String(data, java.nio.charset.StandardCharsets.US_ASCII));
        return textdoc;
    }

    chardet.setText(data);
    CharsetMatch cs = chardet.detect();
    if (cs == null) {
        // The detector may find no candidate at all.
        throw new IOException("Unable to detect character encoding");
    }
    String charsetName = cs.getName();
    boolean alreadyOutputEncoding = ConvertedDocument.OUTPUT_ENCODING.equalsIgnoreCase(charsetName);
    boolean shortAndUncertain = data.length < IGNORE_THRESHOLD_SIZE && cs.getConfidence() < IGNORE_THRESHOLD_CONF;
    if (alreadyOutputEncoding || shortAndUncertain) {
        textdoc.do_convert = false;
    }
    textdoc.setEncoding(charsetName);
    textdoc.setText(new String(data, charsetName));
    return textdoc;
}
Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) ConvertedDocument(org.opensextant.xtext.ConvertedDocument)

Example 2 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project nifi by apache.

the class ContentViewerController method doGet.

/**
 * Gets the content and defers to registered viewers to generate the markup.
 *
 * <p>Steps, in order: resolve the content request, fetch the content, read the
 * display mode from the {@code mode} parameter (defaulting to
 * {@code DisplayMode.Original}), determine the mime type (detected with Tika
 * when the reported type is missing or octet-stream), then include the header
 * JSP, either the hex view or the viewer registered for the normalized mime
 * type, and finally the footer JSP.
 *
 * <p>Every failure handled here sets the {@code title} and {@code messages}
 * request attributes, forwards to {@code /message} in the {@code /nifi}
 * context and returns.
 *
 * @param request servlet request
 * @param response servlet response
 * @throws ServletException if a servlet-specific error occurs
 * @throws IOException if an I/O error occurs
 */
@Override
protected void doGet(final HttpServletRequest request, final HttpServletResponse response) throws ServletException, IOException {
    // specify the charset in a response header
    response.addHeader("Content-Type", "text/html; charset=UTF-8");
    // get the content
    final ServletContext servletContext = request.getServletContext();
    final ContentAccess contentAccess = (ContentAccess) servletContext.getAttribute("nifi-content-access");
    final ContentRequestContext contentRequest;
    try {
        contentRequest = getContentRequest(request);
    } catch (final Exception e) {
        request.setAttribute("title", "Error");
        request.setAttribute("messages", "Unable to interpret content request.");
        // forward to the error page
        final ServletContext viewerContext = servletContext.getContext("/nifi");
        viewerContext.getRequestDispatcher("/message").forward(request, response);
        return;
    }
    // a request without a data URI cannot be resolved to any content
    if (contentRequest.getDataUri() == null) {
        request.setAttribute("title", "Error");
        request.setAttribute("messages", "The data reference must be specified.");
        // forward to the error page
        final ServletContext viewerContext = servletContext.getContext("/nifi");
        viewerContext.getRequestDispatcher("/message").forward(request, response);
        return;
    }
    // get the content
    final DownloadableContent downloadableContent;
    try {
        downloadableContent = contentAccess.getContent(contentRequest);
    } catch (final ResourceNotFoundException rnfe) {
        request.setAttribute("title", "Error");
        request.setAttribute("messages", "Unable to find the specified content");
        // forward to the error page
        final ServletContext viewerContext = servletContext.getContext("/nifi");
        viewerContext.getRequestDispatcher("/message").forward(request, response);
        return;
    } catch (final AccessDeniedException ade) {
        request.setAttribute("title", "Access Denied");
        request.setAttribute("messages", "Unable to approve access to the specified content: " + ade.getMessage());
        // forward to the error page
        final ServletContext viewerContext = servletContext.getContext("/nifi");
        viewerContext.getRequestDispatcher("/message").forward(request, response);
        return;
    } catch (final Exception e) {
        request.setAttribute("title", "Error");
        request.setAttribute("messages", "An unexpected error has occurred: " + e.getMessage());
        // forward to the error page
        final ServletContext viewerContext = servletContext.getContext("/nifi");
        viewerContext.getRequestDispatcher("/message").forward(request, response);
        return;
    }
    // determine how we want to view the data
    String mode = request.getParameter("mode");
    // if the name isn't set, use original
    if (mode == null) {
        mode = DisplayMode.Original.name();
    }
    // determine the display mode
    final DisplayMode displayMode;
    try {
        // valueOf is an exact, case-sensitive match against the enum constant names
        displayMode = DisplayMode.valueOf(mode);
    } catch (final IllegalArgumentException iae) {
        request.setAttribute("title", "Error");
        request.setAttribute("messages", "Invalid display mode: " + mode);
        // forward to the error page
        final ServletContext viewerContext = servletContext.getContext("/nifi");
        viewerContext.getRequestDispatcher("/message").forward(request, response);
        return;
    }
    // buffer the content to support resetting in case we need to detect the content type or char encoding
    try (final BufferedInputStream bis = new BufferedInputStream(downloadableContent.getContent())) {
        final String mimeType;
        final String normalizedMimeType;
        // when clustered and we don't know the type set to octet stream since the content was retrieved from the node's rest endpoint
        if (downloadableContent.getType() == null || StringUtils.startsWithIgnoreCase(downloadableContent.getType(), MediaType.OCTET_STREAM.toString())) {
            // attempt to detect the content stream if we don't know what it is ()
            final DefaultDetector detector = new DefaultDetector();
            // create the stream for tika to process, buffered to support resetting
            // NOTE(review): tikaStream is never closed here; it wraps bis, which the
            // enclosing try-with-resources closes - confirm nothing else needs releasing
            final TikaInputStream tikaStream = TikaInputStream.get(bis);
            // provide a hint based on the filename
            final Metadata metadata = new Metadata();
            metadata.set(Metadata.RESOURCE_NAME_KEY, downloadableContent.getFilename());
            // Get mime type
            final MediaType mediatype = detector.detect(tikaStream, metadata);
            mimeType = mediatype.toString();
        } else {
            mimeType = downloadableContent.getType();
        }
        // Extract only mime type and subtype from content type (anything after the first ; are parameters)
        // Lowercase so subsequent code does not need to implement case insensitivity
        normalizedMimeType = mimeType.split(";", 2)[0].toLowerCase();
        // add attributes needed for the header
        request.setAttribute("filename", downloadableContent.getFilename());
        request.setAttribute("contentType", mimeType);
        // generate the header
        request.getRequestDispatcher("/WEB-INF/jsp/header.jsp").include(request, response);
        // remove the attributes needed for the header
        request.removeAttribute("filename");
        request.removeAttribute("contentType");
        // generate the markup for the content based on the display mode
        if (DisplayMode.Hex.equals(displayMode)) {
            // the hex view shows at most BUFFER_LENGTH bytes of the content
            final byte[] buffer = new byte[BUFFER_LENGTH];
            final int read = StreamUtils.fillBuffer(bis, buffer, false);
            // trim the byte array if necessary
            byte[] bytes = buffer;
            if (read != buffer.length) {
                bytes = new byte[read];
                System.arraycopy(buffer, 0, bytes, 0, read);
            }
            // convert bytes into the base 64 bytes
            final String base64 = Base64.encodeBase64String(bytes);
            // defer to the jsp
            request.setAttribute("content", base64);
            request.getRequestDispatcher("/WEB-INF/jsp/hexview.jsp").include(request, response);
        } else {
            // lookup a viewer for the content
            // (the servlet context init parameters are keyed by normalized mime type)
            final String contentViewerUri = servletContext.getInitParameter(normalizedMimeType);
            // handle no viewer for content type
            if (contentViewerUri == null) {
                request.getRequestDispatcher("/WEB-INF/jsp/no-viewer.jsp").include(request, response);
            } else {
                // create a request attribute for accessing the content
                request.setAttribute(ViewableContent.CONTENT_REQUEST_ATTRIBUTE, new ViewableContent() {

                    @Override
                    public InputStream getContentStream() {
                        // hands out the shared buffered stream itself, not a copy
                        return bis;
                    }

                    @Override
                    public String getContent() throws IOException {
                        // detect the charset
                        // NOTE(review): detection and the read below both use bis; this relies on
                        // the detector leaving the stream positioned at the start - confirm
                        final CharsetDetector detector = new CharsetDetector();
                        detector.setText(bis);
                        detector.enableInputFilter(true);
                        final CharsetMatch match = detector.detect();
                        // ensure we were able to detect the charset
                        if (match == null) {
                            throw new IOException("Unable to detect character encoding.");
                        }
                        // convert the stream using the detected charset
                        return IOUtils.toString(bis, match.getName());
                    }

                    @Override
                    public ViewableContent.DisplayMode getDisplayMode() {
                        return displayMode;
                    }

                    @Override
                    public String getFileName() {
                        return downloadableContent.getFilename();
                    }

                    @Override
                    public String getContentType() {
                        return normalizedMimeType;
                    }

                    @Override
                    public String getRawContentType() {
                        return mimeType;
                    }
                });
                try {
                    // generate the content
                    final ServletContext viewerContext = servletContext.getContext(contentViewerUri);
                    viewerContext.getRequestDispatcher("/view-content").include(request, response);
                } catch (final Exception e) {
                    String message = e.getMessage() != null ? e.getMessage() : e.toString();
                    message = "Unable to generate view of data: " + message;
                    // log the error
                    logger.error(message);
                    // the stack trace is only logged when debug logging is enabled
                    if (logger.isDebugEnabled()) {
                        logger.error(StringUtils.EMPTY, e);
                    }
                    // populate the request attributes
                    request.setAttribute("title", "Error");
                    request.setAttribute("messages", message);
                    // forward to the error page
                    final ServletContext viewerContext = servletContext.getContext("/nifi");
                    viewerContext.getRequestDispatcher("/message").forward(request, response);
                    // returning here skips both the attribute removal and the footer below
                    return;
                }
                // remove the request attribute
                request.removeAttribute(ViewableContent.CONTENT_REQUEST_ATTRIBUTE);
            }
        }
        // generate footer
        request.getRequestDispatcher("/WEB-INF/jsp/footer.jsp").include(request, response);
    }
}
Also used : DefaultDetector(org.apache.tika.detect.DefaultDetector) AccessDeniedException(org.apache.nifi.authorization.AccessDeniedException) BufferedInputStream(java.io.BufferedInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) CharsetDetector(com.ibm.icu.text.CharsetDetector) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) IOException(java.io.IOException) ServletException(javax.servlet.ServletException) AccessDeniedException(org.apache.nifi.authorization.AccessDeniedException) IOException(java.io.IOException) DisplayMode(org.apache.nifi.web.ViewableContent.DisplayMode) CharsetMatch(com.ibm.icu.text.CharsetMatch) BufferedInputStream(java.io.BufferedInputStream) ServletContext(javax.servlet.ServletContext) MediaType(org.apache.tika.mime.MediaType)

Example 3 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project UniversalMediaServer by UniversalMediaServer.

the class FileUtil method getFileCharsetName.

/**
 * Detects charset/encoding for given file. Not 100% accurate for
 * non-Unicode files.
 *
 * <p>The detected name is returned in upper case. Upper-casing uses
 * {@link java.util.Locale#ROOT} because charset names are ASCII identifiers:
 * a locale-sensitive conversion (for example Turkish, where {@code i}
 * becomes {@code İ}) would produce a name that no longer matches any charset.
 *
 * @param file the file for which to detect charset/encoding
 * @return The upper-cased name of the detected charset or <code>null</code> if not detected
 * @throws IOException if the underlying charset detection fails to read the file
 */
public static String getFileCharsetName(File file) throws IOException {
    CharsetMatch match = getFileCharsetMatch(file);
    if (match == null) {
        LOGGER.debug("Found no matching charset for file {}", file.getAbsolutePath());
        return null;
    }
    String charsetName = match.getName();
    LOGGER.debug("Detected charset \"{}\" in file {}", charsetName, file.getAbsolutePath());
    return charsetName.toUpperCase(java.util.Locale.ROOT);
}
Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch)

Example 4 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project data-access by pentaho.

the class CsvUtils method getEncoding.

/**
 * Detects the character encoding of an uploaded file from a sample of its
 * leading bytes (at most 1024).
 *
 * <p>Files whose name ends in {@code .tmp} are resolved against
 * {@code TMP_FILE_PATH}; all others against the
 * {@code file-upload-defaults/relative-path} system setting, falling back to
 * {@code DEFAULT_RELATIVE_UPLOAD_FILE_PATH}.
 *
 * @param fileName name of the file, appended as is to the resolved directory path
 * @return the name of the best-matching charset
 * @throws Exception if the file cannot be opened or read, or no charset could
 *                   be detected; the failure is logged before being rethrown
 */
public String getEncoding(String fileName) throws Exception {
    String path;
    if (fileName.endsWith(".tmp")) {
        // $NON-NLS-1$
        path = PentahoSystem.getApplicationContext().getSolutionPath(TMP_FILE_PATH);
    } else {
        String relativePath = PentahoSystem.getSystemSetting("file-upload-defaults/relative-path", // $NON-NLS-1$
        String.valueOf(DEFAULT_RELATIVE_UPLOAD_FILE_PATH));
        path = PentahoSystem.getApplicationContext().getSolutionPath(relativePath);
    }
    String fileLocation = path + fileName;
    String encoding;
    // try-with-resources closes the stream even when reading or detection fails
    try (InputStream inputStream = new FileInputStream(new File(fileLocation))) {
        byte[] buffer = new byte[1024];
        int total = 0;
        int count;
        // A single read() may return fewer bytes than requested, so keep reading
        // until the sample buffer is full or the end of the file is reached.
        while (total < buffer.length
                && (count = inputStream.read(buffer, total, buffer.length - total)) > 0) {
            total += count;
        }
        // Only hand over the bytes actually read; otherwise the unused zero-filled
        // tail of the buffer would be analysed as if it were file content.
        byte[] sample = (total == buffer.length) ? buffer : java.util.Arrays.copyOf(buffer, total);
        CharsetDetector charsetDetector = new CharsetDetector();
        charsetDetector.setText(sample);
        CharsetMatch charsetMatch = charsetDetector.detect();
        if (charsetMatch == null) {
            throw new IOException("Unable to detect character encoding of " + fileLocation);
        }
        encoding = charsetMatch.getName();
    } catch (Exception e) {
        log.error(e);
        throw e;
    }
    return encoding;
}
Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) CharsetDetector(com.ibm.icu.text.CharsetDetector) File(java.io.File) FileInputStream(java.io.FileInputStream) IOException(java.io.IOException) CsvParseException(org.pentaho.platform.dataaccess.datasource.wizard.models.CsvParseException) FileNotFoundException(java.io.FileNotFoundException)

Example 5 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project yyl_example by Relucent.

the class Icu4jCharsetDetectorExample method getEncoding.

/**
 * Returns the name of the most plausible charset for the given bytes.
 *
 * @param data the raw bytes to analyse
 * @return the name of the first candidate after sorting with {@code COMPARATOR},
 *         or {@code DEFAULT_ENCODING} when the detector yields no candidates
 */
private static String getEncoding(byte[] data) {
    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);
    // Get every possible encoding match
    CharsetMatch[] matches = detector.detectAll();
    // Check for "no candidates" BEFORE sorting: Arrays.sort would throw a
    // NullPointerException on a null array, making a later check pointless.
    if (matches == null || matches.length == 0) {
        return DEFAULT_ENCODING;
    }
    // Sort by confidence and priority (higher confidence first; for equal
    // confidence, higher priority first)
    Arrays.sort(matches, COMPARATOR);
    return matches[0].getName();
}
Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) CharsetDetector(com.ibm.icu.text.CharsetDetector)

Aggregations

CharsetMatch (com.ibm.icu.text.CharsetMatch)15 CharsetDetector (com.ibm.icu.text.CharsetDetector)9 IOException (java.io.IOException)6 InputStream (java.io.InputStream)3 FileInputStream (java.io.FileInputStream)2 UserServletException (com.zimbra.cs.service.UserServletException)1 ItemId (com.zimbra.cs.service.util.ItemId)1 BufferedInputStream (java.io.BufferedInputStream)1 BufferedReader (java.io.BufferedReader)1 File (java.io.File)1 FileNotFoundException (java.io.FileNotFoundException)1 InputStreamReader (java.io.InputStreamReader)1 PushbackInputStream (java.io.PushbackInputStream)1 Charset (java.nio.charset.Charset)1 IllegalCharsetNameException (java.nio.charset.IllegalCharsetNameException)1 Map (java.util.Map)1 ServletContext (javax.servlet.ServletContext)1 ServletException (javax.servlet.ServletException)1 AccessDeniedException (org.apache.nifi.authorization.AccessDeniedException)1 DisplayMode (org.apache.nifi.web.ViewableContent.DisplayMode)1