Use of com.ibm.icu.text.CharsetMatch in project goobi-viewer-indexer by intranda.
Class FileTools, method getCharset.
/**
 * Determines the charset of the supplied stream's content by means of ICU4J.
 * <p>
 * The stream is wrapped in a {@link java.io.BufferedInputStream} which is closed
 * (together with the wrapped stream) before this method returns.
 *
 * @param input a {@link java.io.InputStream} object.
 * @return name of the detected charset; null if none was detected.
 * @throws java.io.IOException
 * @should detect charset correctly
 */
public static String getCharset(InputStream input) throws IOException {
    CharsetDetector detector = new CharsetDetector();
    try (BufferedInputStream buffered = new BufferedInputStream(input)) {
        detector.setText(buffered);
        CharsetMatch bestMatch = detector.detect();
        // No match at all is reported to the caller as null.
        return bestMatch == null ? null : bestMatch.getName();
    }
}
Use of com.ibm.icu.text.CharsetMatch in project Portugol-Studio by UNIVALI-LITE.
Class EncodingDetector, method detect.
/**
 * Fills {@code fileContent} from the stream and guesses the charset of the bytes read using ICU4J.
 * <p>
 * The stream is read until the buffer is full or the end of the stream is reached, and only
 * the bytes that were actually read are handed to the detector. The stream is not closed.
 *
 * @param fin the stream to read from
 * @param fileContent buffer that receives the bytes read from {@code fin}
 * @return the detected charset name if the detector's confidence is above 50,
 *         otherwise {@code "ISO-8859-1"}
 * @throws IOException if reading from the stream fails
 */
public String detect(InputStream fin, byte[] fileContent) throws IOException {
    String charset = "ISO-8859-1";

    // A single read() may return fewer bytes than requested, so keep reading
    // until the buffer is full or the stream is exhausted.
    int filled = 0;
    while (filled < fileContent.length) {
        int count = fin.read(fileContent, filled, fileContent.length - filled);
        if (count < 0) {
            break;
        }
        filled += count;
    }

    // Nothing to analyse: keep the default.
    if (filled == 0) {
        return charset;
    }

    // Hand the detector only the bytes that were read; the unused tail of the
    // buffer is zero padding, not file content, and must not influence the guess.
    byte[] data = fileContent;
    if (filled < fileContent.length) {
        data = new byte[filled];
        System.arraycopy(fileContent, 0, data, 0, filled);
    }

    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);
    CharsetMatch cm = detector.detect();
    // Only trust the guess when the detector is reasonably confident.
    if (cm != null && cm.getConfidence() > 50) {
        charset = cm.getName();
    }
    return charset;
}
Use of com.ibm.icu.text.CharsetMatch in project data-access by pentaho.
Class CsvUtils, method getEncoding.
/**
 * Guesses the character encoding of an uploaded file from its first bytes (at most 1024) using ICU4J.
 * <p>
 * Files whose name ends in {@code .tmp} are resolved against {@code TMP_FILE_PATH}; all others
 * against the {@code file-upload-defaults/relative-path} system setting (falling back to
 * {@code DEFAULT_RELATIVE_UPLOAD_FILE_PATH}).
 *
 * @param fileName name of the file, appended to the resolved directory path
 * @return the name of the detected charset
 * @throws Exception if the file cannot be read or no charset can be detected; the exception
 *         is logged before being rethrown
 */
public String getEncoding(String fileName) throws Exception {
    String path;
    if (fileName.endsWith(".tmp")) {
        // $NON-NLS-1$
        path = PentahoSystem.getApplicationContext().getSolutionPath(TMP_FILE_PATH);
    } else {
        String relativePath = PentahoSystem.getSystemSetting("file-upload-defaults/relative-path", // $NON-NLS-1$
            String.valueOf(DEFAULT_RELATIVE_UPLOAD_FILE_PATH));
        path = PentahoSystem.getApplicationContext().getSolutionPath(relativePath);
    }
    // NOTE(review): fileName is concatenated as-is; if it can come from an untrusted
    // client, confirm that callers reject path-traversal sequences such as "../".
    String fileLocation = path + fileName;

    // try-with-resources guarantees the stream is closed even when reading or detection fails.
    try (InputStream inputStream = new FileInputStream(new File(fileLocation))) {
        byte[] buffer = new byte[1024];

        // read() may deliver fewer bytes than requested; loop until full or EOF.
        int filled = 0;
        while (filled < buffer.length) {
            int count = inputStream.read(buffer, filled, buffer.length - filled);
            if (count < 0) {
                break;
            }
            filled += count;
        }

        // Pass only real file content to the detector, not the zero-filled remainder.
        byte[] bytes = buffer;
        if (filled < buffer.length) {
            bytes = new byte[filled];
            System.arraycopy(buffer, 0, bytes, 0, filled);
        }

        CharsetDetector charsetDetector = new CharsetDetector();
        charsetDetector.setText(bytes);
        CharsetMatch charsetMatch = charsetDetector.detect();
        if (charsetMatch == null) {
            // Fail with a meaningful message instead of a bare NullPointerException.
            throw new IllegalStateException("Unable to detect the encoding of file " + fileLocation);
        }
        return charsetMatch.getName();
    } catch (Exception e) {
        log.error(e);
        throw e;
    }
}
Use of com.ibm.icu.text.CharsetMatch in project DigitalMediaServer by DigitalMediaServer.
Class FileUtil, method getFileCharset.
/**
 * Detects charset/encoding for given file. Not 100% accurate for
 * non-Unicode files.
 * <p>
 * {@code null} is returned when {@code file} is {@code null}, when no match
 * is available, when the detected charset name is illegal, or when the
 * detected charset isn't supported by the running Java Virtual Machine.
 *
 * @param file the file for which to detect charset/encoding.
 * @return The detected {@link Charset} or {@code null} if not detected.
 * @throws IOException If an IO error occurs during the operation.
 */
@Nullable
public static Charset getFileCharset(@Nullable File file) throws IOException {
    if (file == null) {
        return null;
    }
    CharsetMatch match = getFileCharsetMatch(file);
    // Guard against a missing match so that "not detected" yields null as
    // documented rather than a NullPointerException.
    // NOTE(review): assumes getFileCharsetMatch() can return null - confirm.
    if (match == null) {
        LOGGER.debug("Found no matching charset for file \"{}\"", file.getAbsolutePath());
        return null;
    }
    String charsetName = match.getName();
    try {
        if (Charset.isSupported(charsetName)) {
            LOGGER.debug("Detected charset \"{}\" in file \"{}\"", charsetName, file.getAbsolutePath());
            return Charset.forName(charsetName);
        }
        LOGGER.debug("Detected charset \"{}\" in file \"{}\", but cannot use it because it's not supported by the Java Virual Machine", charsetName, file.getAbsolutePath());
        return null;
    } catch (IllegalCharsetNameException e) {
        // The detector produced a name that isn't a legal charset name; treat as "not detected".
        LOGGER.debug("Illegal charset \"{}\" deteceted in file \"{}\"", charsetName, file.getAbsolutePath());
    }
    LOGGER.debug("Found no matching charset for file \"{}\"", file.getAbsolutePath());
    return null;
}
Use of com.ibm.icu.text.CharsetMatch in project mucommander by mucommander.
Class EncodingDetector, method detectEncoding.
/**
 * Attempts to guess the character encoding of the given bytes and returns the best guess, or
 * <code>null</code> if there is none (not enough data or confidence).
 * The returned encoding is not guaranteed to be available on the Java runtime -- use
 * <code>java.nio.charset.Charset#isSupported(String)</code> to find out whether it is.
 *
 * <p>At most {@link #MAX_RECOMMENDED_BYTE_SIZE} bytes are examined. Any bytes beyond that limit
 * are ignored.</p>
 *
 * @param bytes the bytes for which to detect the encoding
 * @param off the array offset at which the data to process starts
 * @param len length of the data in the array
 * @return the best guess at the encoding, null if there is none (not enough data or confidence)
 */
public static String detectEncoding(byte[] bytes, int off, int len) {
    // Inputs shorter than 4 bytes are rejected up front: there is no guess to return.
    if (len < 4) {
        return null;
    }

    // Having more bytes than the recommended maximum won't help any further, so cap the length.
    int usable = Math.min(len, MAX_RECOMMENDED_BYTE_SIZE);

    // The detector is given a whole array, so unless the requested window already is the
    // whole array, copy the window into a new array that fits the data exactly.
    byte[] sample = bytes;
    boolean coversWholeArray = off <= 0 && usable >= bytes.length;
    if (!coversWholeArray) {
        sample = new byte[usable];
        System.arraycopy(bytes, off, sample, 0, usable);
    }

    CharsetDetector detector = new CharsetDetector();
    detector.setText(sample);
    CharsetMatch cm = detector.detect();

    // Debug info
    String matchName = cm == null ? "null" : cm.getName();
    String matchConfidence = cm == null ? "null" : Integer.toString(cm.getConfidence());
    LOGGER.trace("bestMatch getName()={}, getConfidence()={}", matchName, matchConfidence);

    if (cm == null) {
        return null;
    }
    return cm.getName();
}
Aggregations