Use of com.ibm.icu.text.CharsetMatch in project zm-mailbox by Zimbra:
the CsvFormatter class, method saveCallback.
/**
 * Imports an uploaded CSV contacts file into the given folder.
 * <p>
 * The charset of the upload is unknown, so the first READ_AHEAD_BUFFER_SIZE
 * bytes are sampled, run through ICU's CharsetDetector, and then pushed back
 * onto the stream so the CSV parser sees the complete content. Detection
 * failures fall back to UTF-8.
 *
 * @param context     servlet context carrying the request, mailbox and params
 * @param contentType MIME type of the upload (unused here)
 * @param folder      destination folder for the imported contacts
 * @param filename    name of the uploaded file (unused here)
 * @throws UserServletException with SC_BAD_REQUEST if the CSV cannot be parsed
 * @throws ServiceException     on mailbox/import failures
 * @throws IOException          on stream errors
 */
@Override
public void saveCallback(UserServletContext context, String contentType, Folder folder, String filename) throws UserServletException, ServiceException, IOException {
    // Disable the jetty timeout
    disableJettyTimeout(context);
    // Detect the charset of the uploaded file from an initial sample.
    PushbackInputStream pis = new PushbackInputStream(context.getRequestInputStream(), READ_AHEAD_BUFFER_SIZE);
    byte[] buf = new byte[READ_AHEAD_BUFFER_SIZE];
    int bytesRead = pis.read(buf, 0, READ_AHEAD_BUFFER_SIZE);
    Charset charset = Charsets.UTF_8; // default when detection fails or stream is empty
    if (bytesRead > 0) {
        // Feed only the bytes actually read: the original passed the whole
        // buffer, so uploads shorter than READ_AHEAD_BUFFER_SIZE had their
        // detection skewed by trailing zero bytes.
        byte[] sample = new byte[bytesRead];
        System.arraycopy(buf, 0, sample, 0, bytesRead);
        CharsetDetector detector = new CharsetDetector();
        detector.setText(sample);
        CharsetMatch match = detector.detect();
        String guess = (match != null) ? match.getName() : null;
        if (guess != null) {
            try {
                charset = Charset.forName(guess);
            } catch (IllegalArgumentException e) {
                // Detected name is illegal or unsupported by the JVM; keep UTF-8.
                charset = Charsets.UTF_8;
            }
        }
        // Push the sampled bytes back so the parser reads the full stream.
        pis.unread(buf, 0, bytesRead);
    }
    // try-with-resources guarantees the reader (and underlying stream) is closed.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(pis, charset))) {
        String format = context.params.get(UserServlet.QP_CSVFORMAT);
        String locale = context.req.getParameter(UserServlet.QP_CSVLOCALE);
        if (locale == null) {
            locale = context.getLocale().toString();
        }
        List<Map<String, String>> contacts = ContactCSV.getContacts(reader, format, locale);
        ItemId iidFolder = new ItemId(folder);
        ImportContacts.ImportCsvContacts(context.opContext, context.targetMailbox, iidFolder, contacts);
    } catch (ContactCSV.ParseException e) {
        ZimbraLog.misc.debug("ContactCSV - ParseException thrown", e);
        throw new UserServletException(HttpServletResponse.SC_BAD_REQUEST, "Could not parse csv file - Reason : " + e.getMessage());
    }
}
Use of com.ibm.icu.text.CharsetMatch in project Xponents by OpenSextant:
the TextTranscodingConverter class, method setTextAndEncoding.
/**
 * If you have a buffer of text for a document and are unable to get a provided charset,
 * try this static method. Better than nothing. This does not imply that the original document
 * is a plain text doc. It could be an object that was parsed ad hoc. We cannot make any
 * assumption about the state of the conversion. This only sets the String buffer and charset.
 *
 * @param doc the doc whose text and encoding are set
 * @param data the byte data to test
 * @throws UnsupportedEncodingException if the detected charset name cannot be used for decoding
 */
public static void setTextAndEncoding(ConvertedDocument doc, byte[] data) throws UnsupportedEncodingException {
    boolean is_ascii = TextUtils.isASCII(data);
    if (is_ascii) {
        doc.setEncoding("ASCII");
        // Decode with an explicit charset. The original used new String(data),
        // which decodes with the platform default charset -- not guaranteed to
        // be ASCII-compatible (e.g. EBCDIC platforms, exotic file.encoding).
        doc.setText(new String(data, "ASCII"));
        return;
    }
    // Non-ASCII content: let ICU guess the most likely charset from the bytes.
    // NOTE(review): chardet is a shared detector defined elsewhere in this
    // class -- presumably not thread-safe; confirm callers serialize access.
    chardet.setText(data);
    CharsetMatch cs = chardet.detect();
    doc.setEncoding(cs.getName());
    doc.setText(new String(data, cs.getName()));
}
Use of com.ibm.icu.text.CharsetMatch in project UniversalMediaServer by UniversalMediaServer:
the DLNAMediaSubtitle class, method setFileSubsCharacterSet.
/**
 * Detects and sets the character set and language of the subtitles file. When
 * {@code forcedLang} is not {@code null} it has priority over the detected language.
 *
 * @param forcedLang forced language, or {@code null} to use the detected one
 */
private void setFileSubsCharacterSet(String forcedLang) {
    if (type.isPicture()) {
        // Picture-based subtitles carry no text, hence no character set.
        subsCharacterSet = null;
    } else {
        try {
            CharsetMatch match = FileUtil.getFileCharsetMatch(externalFile);
            if (match != null) {
                subsCharacterSet = match.getName().toUpperCase(Locale.ROOT);
                // FFmpeg video filter knows only ISO-8859-8 so extract the additional "-I".
                if (subsCharacterSet.split("-").length > 3) {
                    subsCharacterSet = subsCharacterSet.substring(0, subsCharacterSet.lastIndexOf("-"));
                }
                if (forcedLang == null) {
                    // set the detected language when the language is not specified in the filename
                    lang = match.getLanguage();
                }
                LOGGER.debug("Set detected charset \"{}\" and language \"{}\" for {}", subsCharacterSet, lang, externalFile.getAbsolutePath());
            } else {
                subsCharacterSet = null;
                LOGGER.debug("No charset detected for {}", externalFile.getAbsolutePath());
            }
        } catch (IOException ex) {
            subsCharacterSet = null;
            // Pass the Throwable itself so SLF4J logs the stack trace. The
            // original passed ex.getMessage() with no "{}" placeholder, so
            // the argument was silently dropped and no detail was logged.
            LOGGER.warn("Exception during external file charset detection: ", ex);
        }
    }
}
Use of com.ibm.icu.text.CharsetMatch in project UniversalMediaServer by UniversalMediaServer:
the FileUtil class, method getFileCharset.
/**
 * Detects charset/encoding for the given file. Not 100% accurate for
 * non-Unicode files.
 *
 * @param file the file for which to detect charset/encoding
 * @return The detected <code>Charset</code> or <code>null</code> if not detected
 * @throws IOException if the file cannot be read
 */
public static Charset getFileCharset(File file) throws IOException {
    CharsetMatch match = getFileCharsetMatch(file);
    if (match != null) {
        // Hoisted: the detected name is used in every branch below.
        String charsetName = match.getName();
        try {
            if (Charset.isSupported(charsetName)) {
                LOGGER.debug("Detected charset \"{}\" in file {}", charsetName, file.getAbsolutePath());
                return Charset.forName(charsetName);
            }
            LOGGER.debug("Detected charset \"{}\" in file {}, but cannot use it because it's not supported by the Java Virtual Machine", charsetName, file.getAbsolutePath());
            return null;
        } catch (IllegalCharsetNameException e) {
            // Charset.isSupported itself throws for syntactically illegal names.
            LOGGER.debug("Illegal charset detected \"{}\" in file {}", charsetName, file.getAbsolutePath());
        }
    }
    LOGGER.debug("Found no matching charset for file {}", file.getAbsolutePath());
    return null;
}
Use of com.ibm.icu.text.CharsetMatch in project polymap4-core by Polymap4:
the FileEncodingGuesserTest class, method detect.
/**
 * Reads the given file, runs ICU charset detection on its bytes, and asserts
 * that the detected charset name equals {@code expectedCharset}.
 *
 * @param file            the file whose encoding is guessed
 * @param expectedCharset the charset name the detector is expected to report
 * @throws IOException if the file cannot be read
 */
private void detect(File file, String expectedCharset) throws IOException {
    // Buffer sized from File.length(); test fixtures are small, so the
    // int cast is safe here.
    byte[] data = new byte[(int) file.length()];
    // try-with-resources closes the stream even when the assertion below
    // fails; the original leaked the FileInputStream on any exception.
    try (FileInputStream fin = new FileInputStream(file.getPath())) {
        // A single read() is not guaranteed to fill the array -- loop
        // until the buffer is full or EOF is reached.
        int offset = 0;
        int read;
        while (offset < data.length && (read = fin.read(data, offset, data.length - offset)) != -1) {
            offset += read;
        }
    }
    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);
    CharsetMatch cm = detector.detect();
    int confidence = cm.getConfidence();
    String name = cm.getName();
    assertEquals(expectedCharset, name);
    System.out.println("File: " + file.getName() + " - Encoding: " + cm.getName() + ":" + cm.getLanguage() + " - Confidence: " + confidence + "%");
}
Aggregations