use of com.ibm.icu.text.CharsetMatch in project dataio by DBCDK.
the class EncodingDetector method detect.
/**
 * Detects the charset that best matches the supplied input data.
 *
 * @param bytes the input text of unknown encoding
 * @return the best-matching charset, or empty if the detector produced no
 *         match or the detected charset name cannot be resolved to a
 *         {@link Charset} on the running JVM
 */
public Optional<Charset> detect(byte[] bytes) {
    charsetDetector.setText(bytes);
    final CharsetMatch match = charsetDetector.detect();
    if (match == null) {
        return Optional.empty();
    }
    try {
        return Optional.of(Charset.forName(match.getName()));
    } catch (IllegalArgumentException e) {
        // Charset.forName signals an illegal or unsupported name with
        // IllegalCharsetNameException / UnsupportedCharsetException, both
        // IllegalArgumentException subclasses. The detector's name is not
        // guaranteed to be one this JVM provides, so honour the documented
        // "charset or empty" contract instead of propagating the exception.
        return Optional.empty();
    }
}
use of com.ibm.icu.text.CharsetMatch in project DigitalMediaServer by DigitalMediaServer.
the class DLNAMediaSubtitle method setFileSubsCharacterSet.
/**
 * Detects and sets the character set and language of the subtitles file.
 * When {@code forcedLang} is not {@code null}, it takes priority over the
 * detected language and the language field is left untouched.
 * <p>
 * The character set is reset to {@code null} for picture-based subtitles,
 * when nothing could be detected, and when reading the file fails.
 *
 * @param forcedLang forced language
 */
private void setFileSubsCharacterSet(String forcedLang) {
    if (type.isPicture()) {
        subsCharacterSet = null;
        return;
    }

    final CharsetMatch detected;
    try {
        detected = FileUtil.getFileCharsetMatch(externalFile);
    } catch (IOException ex) {
        subsCharacterSet = null;
        LOGGER.warn("Exception during external file charset detection: {}", ex.getMessage());
        return;
    }

    if (detected == null) {
        subsCharacterSet = null;
        LOGGER.debug("No charset detected for {}", externalFile.getAbsolutePath());
        return;
    }

    String charsetName = detected.getName().toUpperCase(Locale.ROOT);
    // FFmpeg video filter knows only ISO-8859-8, so strip the trailing
    // "-I" style suffix from names made of more than three dash-separated parts.
    if (charsetName.split("-").length > 3) {
        charsetName = charsetName.substring(0, charsetName.lastIndexOf("-"));
    }
    subsCharacterSet = charsetName;

    // Only adopt the detected language when none was forced by the caller.
    if (forcedLang == null) {
        lang = detected.getLanguage();
    }
    LOGGER.debug("Set detected charset \"{}\" and language \"{}\" for {}", subsCharacterSet, lang, externalFile.getAbsolutePath());
}
use of com.ibm.icu.text.CharsetMatch in project DigitalMediaServer by DigitalMediaServer.
the class FileUtil method getFileCharsetName.
/**
 * Detects charset/encoding for given file. Not 100% accurate for
 * non-Unicode files.
 *
 * @param file the file for which to detect charset/encoding.
 * @return The upper-cased name of the detected charset, or {@code null} if
 *         {@code file} is {@code null}, no charset was detected, or the
 *         detected charset is illegal or not supported by the JVM.
 * @throws IOException If an IO error occurs during the operation.
 */
@Nullable
public static String getFileCharsetName(@Nullable File file) throws IOException {
    if (file == null) {
        return null;
    }
    CharsetMatch match = getFileCharsetMatch(file);
    // Detection may yield no match at all; skip straight to the
    // "no matching charset" path instead of dereferencing null.
    if (match != null) {
        try {
            if (Charset.isSupported(match.getName())) {
                LOGGER.debug("Detected charset \"{}\" in file \"{}\"", match.getName(), file.getAbsolutePath());
                return match.getName().toUpperCase(Locale.ROOT);
            }
            LOGGER.debug("Detected charset \"{}\" in file \"{}\", but cannot use it because it's not supported by the Java Virual Machine", match.getName(), file.getAbsolutePath());
            return null;
        } catch (IllegalCharsetNameException e) {
            // Charset.isSupported rejects syntactically illegal names by throwing.
            LOGGER.debug("Illegal charset \"{}\" deteceted in file \"{}\"", match.getName(), file.getAbsolutePath());
        }
    }
    LOGGER.debug("Found no matching charset for file \"{}\"", file.getAbsolutePath());
    return null;
}
use of com.ibm.icu.text.CharsetMatch in project nutch by apache.
the class EncodingDetector method autoDetectClues.
/**
 * Collects encoding clues for the given content.
 * <p>
 * When detection is enabled ({@code minConfidence >= 0}), the content type
 * is one of {@code DETECTABLES} and the payload is longer than
 * {@code MIN_LENGTH}, every charset candidate reported by the detector is
 * added as a "detect" clue together with its confidence. The charset parsed
 * from the Content-Type metadata entry is always added as a "header" clue.
 *
 * @param content the content whose encoding should be guessed
 * @param filter  passed to the detector's input filter switch before detection
 */
public void autoDetectClues(Content content, boolean filter) {
    final byte[] payload = content.getContent();
    final boolean worthDetecting = minConfidence >= 0
            && DETECTABLES.contains(content.getContentType())
            && payload.length > MIN_LENGTH;

    if (worthDetecting) {
        CharsetMatch[] candidates;
        try {
            detector.enableInputFilter(filter);
            detector.setText(payload);
            candidates = detector.detectAll();
        } catch (Exception e) {
            // The detector will sometimes throw; treat that as "no candidates".
            LOG.debug("Exception from ICU4J (ignoring): ", e);
            candidates = null;
        }

        if (candidates != null) {
            for (int i = 0; i < candidates.length; i++) {
                final CharsetMatch candidate = candidates[i];
                addClue(candidate.getName(), "detect", candidate.getConfidence());
            }
        }
    }

    // add character encoding coming from HTTP response header
    final String headerValue = content.getMetadata().get(Response.CONTENT_TYPE);
    addClue(parseCharacterEncoding(headerValue), "header");
}
use of com.ibm.icu.text.CharsetMatch in project jabref by JabRef.
the class Importer method getCharset.
/**
 * Guesses the charset of the data provided by the given stream.
 * <p>
 * UTF-8 is returned when nothing is detected or when UTF-8 is among the
 * detected candidates; UTF-16 is returned when it is among the candidates;
 * otherwise the first candidate is used. If reading the stream fails, or the
 * first candidate's name cannot be resolved to a {@link Charset} on this JVM,
 * UTF-8 is returned as the default.
 *
 * @param bufferedInputStream the stream to inspect
 * @return the detected charset, or UTF-8 as the default
 */
protected static Charset getCharset(BufferedInputStream bufferedInputStream) {
    Charset defaultCharSet = StandardCharsets.UTF_8;
    // This reads the first 8000 bytes only, thus the default size of 8192 of the bufferedInputStream is OK.
    // See https://github.com/unicode-org/icu/blob/06ef8867f35befee7340e35082fefc9d3561d230/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java#L125 for details
    CharsetDetector charsetDetector = new CharsetDetector();
    try {
        charsetDetector.setText(bufferedInputStream);
        CharsetMatch[] matches = charsetDetector.detectAll();
        if ((matches == null) || (matches.length == 0)) {
            return defaultCharSet;
        }
        if (Arrays.stream(matches).anyMatch(singleCharset -> singleCharset.getName().equals(defaultCharSet.toString()))) {
            return defaultCharSet;
        }
        if (Arrays.stream(matches).anyMatch(singleCharset -> singleCharset.getName().equals(StandardCharsets.UTF_16.toString()))) {
            return StandardCharsets.UTF_16;
        }
        if (matches[0] != null) {
            try {
                return Charset.forName(matches[0].getName());
            } catch (IllegalArgumentException e) {
                // Charset.forName throws IllegalCharsetNameException /
                // UnsupportedCharsetException (both IllegalArgumentException
                // subclasses) when the name is not usable on this JVM.
                // Fall back to the default like every other failure path here.
                LOGGER.warn("Detected charset is not supported. Using default one.", e);
            }
        }
    } catch (IOException e) {
        LOGGER.error("Could not determine charset. Using default one.", e);
    }
    return defaultCharSet;
}
Aggregations