Search in sources :

Example 6 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project goobi-viewer-indexer by intranda.

the class FileTools method getCharset.

/**
 * Determines the charset of the given InputStream by running ICU4J's {@link CharsetDetector} on it.
 * <p>
 * The stream is wrapped in a {@link java.io.BufferedInputStream} for the detection; that wrapper, and with it
 * {@code input}, is closed before this method returns.
 *
 * @param input a {@link java.io.InputStream} object.
 * @return Name of the best matching charset; null if the detector reports no match.
 * @throws java.io.IOException
 * @should detect charset correctly
 */
public static String getCharset(InputStream input) throws IOException {
    CharsetDetector detector = new CharsetDetector();
    try (BufferedInputStream buffered = new BufferedInputStream(input)) {
        CharsetMatch bestMatch = detector.setText(buffered).detect();
        return bestMatch == null ? null : bestMatch.getName();
    }
}
Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) BufferedInputStream(java.io.BufferedInputStream) CharsetDetector(com.ibm.icu.text.CharsetDetector)

Example 7 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project Portugol-Studio by UNIVALI-LITE.

the class EncodingDetector method detect.

/**
 * Detects the character encoding of the data provided by {@code fin}.
 * <p>
 * The stream is read into {@code fileContent} until that buffer is full or the stream ends, and only the
 * bytes actually read are handed to ICU4J's {@link CharsetDetector}. The detector's best match is accepted
 * only when its confidence is above 50; in every other case the default {@code "ISO-8859-1"} is returned.
 *
 * @param fin stream to read the data from; it is not closed by this method
 * @param fileContent buffer that receives the bytes read from {@code fin}
 * @return name of the detected charset, or {@code "ISO-8859-1"} if nothing was detected with enough confidence
 * @throws IOException if reading from {@code fin} fails
 */
public String detect(InputStream fin, byte[] fileContent) throws IOException {
    String charset = "ISO-8859-1";

    // A single read() may deliver fewer bytes than requested, so keep reading until the buffer is full or EOF.
    int total = 0;
    while (total < fileContent.length) {
        int count = fin.read(fileContent, total, fileContent.length - total);
        if (count < 0) {
            break;
        }
        total += count;
    }
    if (total == 0) {
        // Nothing to analyse; stay with the default.
        return charset;
    }

    // Hand the detector only the bytes that were read, so unfilled buffer space cannot skew the result.
    byte[] data = fileContent;
    if (total < fileContent.length) {
        data = new byte[total];
        System.arraycopy(fileContent, 0, data, 0, total);
    }

    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);
    CharsetMatch cm = detector.detect();
    if (cm != null && cm.getConfidence() > 50) {
        charset = cm.getName();
    }
    return charset;
}
Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) CharsetDetector(com.ibm.icu.text.CharsetDetector)

Example 8 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project data-access by pentaho.

the class CsvUtils method getEncoding.

/**
 * Detects the character encoding of an uploaded file from a sample of its first bytes.
 * <p>
 * File names ending in {@code ".tmp"} are resolved against {@code TMP_FILE_PATH}; all other names are resolved
 * against the {@code file-upload-defaults/relative-path} system setting (falling back to
 * {@code DEFAULT_RELATIVE_UPLOAD_FILE_PATH}). At most 1024 bytes are read and passed to ICU4J's
 * {@link CharsetDetector}.
 *
 * @param fileName name of the file, appended to the resolved solution path
 * @return name of the charset reported as the best match
 * @throws Exception if the file cannot be read or no encoding can be detected; the exception is logged
 *         before it is rethrown
 */
public String getEncoding(String fileName) throws Exception {
    String path;
    if (fileName.endsWith(".tmp")) { // $NON-NLS-1$
        path = PentahoSystem.getApplicationContext().getSolutionPath(TMP_FILE_PATH);
    } else {
        String relativePath = PentahoSystem.getSystemSetting("file-upload-defaults/relative-path", // $NON-NLS-1$
                String.valueOf(DEFAULT_RELATIVE_UPLOAD_FILE_PATH));
        path = PentahoSystem.getApplicationContext().getSolutionPath(relativePath);
    }
    String fileLocation = path + fileName;
    String encoding;
    // try-with-resources closes the stream on every path, including read and detection failures.
    try (InputStream inputStream = new FileInputStream(new File(fileLocation))) {
        byte[] buffer = new byte[1024];
        // A single read() may return fewer bytes than requested; fill the sample buffer or stop at EOF.
        int total = 0;
        while (total < buffer.length) {
            int count = inputStream.read(buffer, total, buffer.length - total);
            if (count < 0) {
                break;
            }
            total += count;
        }
        // Only analyse bytes that belong to the file; zero padding would distort the detection.
        byte[] sample = buffer;
        if (total < buffer.length) {
            sample = new byte[total];
            System.arraycopy(buffer, 0, sample, 0, total);
        }
        CharsetDetector charsetDetector = new CharsetDetector();
        charsetDetector.setText(sample);
        CharsetMatch charsetMatch = charsetDetector.detect();
        if (charsetMatch == null) {
            // Fail with a meaningful message instead of a bare NullPointerException.
            throw new IOException("Unable to detect encoding of file " + fileLocation);
        }
        encoding = charsetMatch.getName();
    } catch (Exception e) {
        log.error(e);
        throw e;
    }
    return encoding;
}
Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) CharsetDetector(com.ibm.icu.text.CharsetDetector) File(java.io.File) IOException(java.io.IOException) CsvParseException(org.pentaho.platform.dataaccess.datasource.wizard.models.CsvParseException) FileNotFoundException(java.io.FileNotFoundException)

Example 9 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project DigitalMediaServer by DigitalMediaServer.

the class FileUtil method getFileCharset.

/**
 * Detects charset/encoding for given file. Not 100% accurate for
 * non-Unicode files.
 * <p>
 * The detection itself is delegated to {@code getFileCharsetMatch(File)}; this method only converts the
 * reported charset name into a {@link Charset} when the running JVM supports it.
 *
 * @param file the file for which to detect charset/encoding.
 * @return The detected {@link Charset} or {@code null} if {@code file} is {@code null}, nothing was
 *         detected, or the detected charset is illegal or unsupported.
 * @throws IOException If an IO error occurs during the operation.
 */
@Nullable
public static Charset getFileCharset(@Nullable File file) throws IOException {
    if (file == null) {
        return null;
    }
    CharsetMatch match = getFileCharsetMatch(file);
    // NOTE(review): getFileCharsetMatch is not visible here; guard in case it reports "no match" as null.
    if (match != null) {
        String name = match.getName();
        try {
            if (Charset.isSupported(name)) {
                LOGGER.debug("Detected charset \"{}\" in file \"{}\"", name, file.getAbsolutePath());
                return Charset.forName(name);
            }
            LOGGER.debug("Detected charset \"{}\" in file \"{}\", but cannot use it because it's not supported by the Java Virual Machine", name, file.getAbsolutePath());
            return null;
        } catch (IllegalCharsetNameException e) {
            // Charset.isSupported rejects syntactically illegal names with this exception.
            LOGGER.debug("Illegal charset \"{}\" deteceted in file \"{}\"", name, file.getAbsolutePath());
        }
    }
    LOGGER.debug("Found no matching charset for file \"{}\"", file.getAbsolutePath());
    return null;
}
Also used : IllegalCharsetNameException(java.nio.charset.IllegalCharsetNameException) CharsetMatch(com.ibm.icu.text.CharsetMatch) Nullable(javax.annotation.Nullable)

Example 10 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project mucommander by mucommander.

the class EncodingDetector method detectEncoding.

/**
 * Guesses the character encoding of the given bytes and returns the detector's best match, or
 * <code>null</code> if there is none (not enough data or confidence).
 * The returned encoding is not guaranteed to be available on the Java runtime -- check with
 * <code>java.nio.Charset#isSupported(String)</code> before using it.
 *
 * <p>No more than {@link #MAX_RECOMMENDED_BYTE_SIZE} bytes of the array are examined; anything beyond that
 * is ignored.</p>
 *
 * @param bytes the bytes for which to detect the encoding
 * @param off the array offset at which the data to process starts
 * @param len length of the data in the array
 * @return the best guess at the encoding, null if there is none (not enough data or confidence)
 */
public static String detectEncoding(byte[] bytes, int off, int len) {
    // Fewer than 4 bytes are not handed to the detector at all
    if (len < 4) {
        return null;
    }

    // Cap the amount of data that is examined
    final int length = len > MAX_RECOMMENDED_BYTE_SIZE ? MAX_RECOMMENDED_BYTE_SIZE : len;

    // The detector is given a whole array, so carve out an exact-size copy unless the
    // supplied array already starts at 0 and is no longer than the data to examine
    final byte[] data;
    if (off <= 0 && length >= bytes.length) {
        data = bytes;
    } else {
        data = new byte[length];
        System.arraycopy(bytes, off, data, 0, length);
    }

    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);
    CharsetMatch bestMatch = detector.detect();

    if (bestMatch == null) {
        // Debug info
        LOGGER.trace("bestMatch getName()={}, getConfidence()={}", "null", "null");
        return null;
    }

    String encoding = bestMatch.getName();
    // Debug info
    LOGGER.trace("bestMatch getName()={}, getConfidence()={}", encoding, Integer.toString(bestMatch.getConfidence()));
    return encoding;
}
Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) CharsetDetector(com.ibm.icu.text.CharsetDetector)

Aggregations

CharsetMatch (com.ibm.icu.text.CharsetMatch)30 CharsetDetector (com.ibm.icu.text.CharsetDetector)21 IOException (java.io.IOException)9 BufferedInputStream (java.io.BufferedInputStream)3 InputStream (java.io.InputStream)3 IllegalCharsetNameException (java.nio.charset.IllegalCharsetNameException)3 BufferedReader (java.io.BufferedReader)2 File (java.io.File)2 FileInputStream (java.io.FileInputStream)2 Nullable (javax.annotation.Nullable)2 DocumentFile (androidx.documentfile.provider.DocumentFile)1 Getter (burp.Getter)1 IExtensionHelpers (burp.IExtensionHelpers)1 UserServletException (com.zimbra.cs.service.UserServletException)1 ItemId (com.zimbra.cs.service.util.ItemId)1 BufferedWriter (java.io.BufferedWriter)1 FileNotFoundException (java.io.FileNotFoundException)1 FileWriter (java.io.FileWriter)1 InputStreamReader (java.io.InputStreamReader)1 PushbackInputStream (java.io.PushbackInputStream)1