Use of com.ibm.icu.text.CharsetMatch in project zm-mailbox by Zimbra:
the CsvFormatter class, method saveCallback.
/**
 * Imports an uploaded CSV contacts file into the given folder.
 * <p>
 * The charset of the upload is unknown, so the first READ_AHEAD_BUFFER_SIZE
 * bytes are sampled, run through ICU's CharsetDetector, and then pushed back
 * onto the stream so the CSV parser sees the complete content. Detection
 * failures fall back to UTF-8.
 *
 * @param context     servlet context carrying the request, mailbox and params
 * @param contentType MIME type of the upload (unused here)
 * @param folder      destination folder for the imported contacts
 * @param filename    name of the uploaded file (unused here)
 * @throws UserServletException with SC_BAD_REQUEST if the CSV cannot be parsed
 * @throws ServiceException     on mailbox/import failures
 * @throws IOException          on stream errors
 */
@Override
public void saveCallback(UserServletContext context, String contentType, Folder folder, String filename) throws UserServletException, ServiceException, IOException {
    // Disable the jetty timeout
    disableJettyTimeout(context);
    // Detect the charset of the uploaded file from an initial sample.
    PushbackInputStream pis = new PushbackInputStream(context.getRequestInputStream(), READ_AHEAD_BUFFER_SIZE);
    byte[] buf = new byte[READ_AHEAD_BUFFER_SIZE];
    int bytesRead = pis.read(buf, 0, READ_AHEAD_BUFFER_SIZE);
    Charset charset = Charsets.UTF_8; // default when detection fails or stream is empty
    if (bytesRead > 0) {
        // Feed only the bytes actually read: the original passed the whole
        // buffer, so uploads shorter than READ_AHEAD_BUFFER_SIZE had their
        // detection skewed by trailing zero bytes.
        byte[] sample = new byte[bytesRead];
        System.arraycopy(buf, 0, sample, 0, bytesRead);
        CharsetDetector detector = new CharsetDetector();
        detector.setText(sample);
        CharsetMatch match = detector.detect();
        String guess = (match != null) ? match.getName() : null;
        if (guess != null) {
            try {
                charset = Charset.forName(guess);
            } catch (IllegalArgumentException e) {
                // Detected name is illegal or unsupported by the JVM; keep UTF-8.
                charset = Charsets.UTF_8;
            }
        }
        // Push the sampled bytes back so the parser reads the full stream.
        pis.unread(buf, 0, bytesRead);
    }
    // try-with-resources guarantees the reader (and underlying stream) is closed.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(pis, charset))) {
        String format = context.params.get(UserServlet.QP_CSVFORMAT);
        String locale = context.req.getParameter(UserServlet.QP_CSVLOCALE);
        if (locale == null) {
            locale = context.getLocale().toString();
        }
        List<Map<String, String>> contacts = ContactCSV.getContacts(reader, format, locale);
        ItemId iidFolder = new ItemId(folder);
        ImportContacts.ImportCsvContacts(context.opContext, context.targetMailbox, iidFolder, contacts);
    } catch (ContactCSV.ParseException e) {
        ZimbraLog.misc.debug("ContactCSV - ParseException thrown", e);
        throw new UserServletException(HttpServletResponse.SC_BAD_REQUEST, "Could not parse csv file - Reason : " + e.getMessage());
    }
}
Use of com.ibm.icu.text.CharsetMatch in project Xponents by OpenSextant:
the TextTranscodingConverter class, method setTextAndEncoding.
/**
 * If you have a buffer of text for a document and are unable to get a provided charset,
 * try this static method. Better than nothing. This does not imply that the original document
 * is a plain text doc. It could be an object that was parsed ad hoc. We cannot make any
 * assumption about the state of the conversion. This only sets the String buffer and charset.
 *
 * @param doc the doc whose text and encoding are set
 * @param data the byte data to test
 * @throws UnsupportedEncodingException if the detected charset name cannot be used for decoding
 */
public static void setTextAndEncoding(ConvertedDocument doc, byte[] data) throws UnsupportedEncodingException {
    boolean is_ascii = TextUtils.isASCII(data);
    if (is_ascii) {
        doc.setEncoding("ASCII");
        // Decode with an explicit charset. The original used new String(data),
        // which decodes with the platform default charset -- not guaranteed to
        // be ASCII-compatible (e.g. EBCDIC platforms, exotic file.encoding).
        doc.setText(new String(data, "ASCII"));
        return;
    }
    // Non-ASCII content: let ICU guess the most likely charset from the bytes.
    // NOTE(review): chardet is a shared detector defined elsewhere in this
    // class -- presumably not thread-safe; confirm callers serialize access.
    chardet.setText(data);
    CharsetMatch cs = chardet.detect();
    doc.setEncoding(cs.getName());
    doc.setText(new String(data, cs.getName()));
}
Use of com.ibm.icu.text.CharsetMatch in project UniversalMediaServer by UniversalMediaServer:
the DLNAMediaSubtitle class, method setFileSubsCharacterSet.
/**
 * Detects and sets the character set and language of the subtitles file. When
 * {@code forcedLang} is not {@code null} it has priority over the detected language.
 *
 * @param forcedLang forced language, or {@code null} to use the detected one
 */
private void setFileSubsCharacterSet(String forcedLang) {
    if (type.isPicture()) {
        // Picture-based subtitles carry no text, hence no character set.
        subsCharacterSet = null;
    } else {
        try {
            CharsetMatch match = FileUtil.getFileCharsetMatch(externalFile);
            if (match != null) {
                subsCharacterSet = match.getName().toUpperCase(Locale.ROOT);
                // FFmpeg video filter knows only ISO-8859-8 so extract the additional "-I".
                if (subsCharacterSet.split("-").length > 3) {
                    subsCharacterSet = subsCharacterSet.substring(0, subsCharacterSet.lastIndexOf("-"));
                }
                if (forcedLang == null) {
                    // set the detected language when the language is not specified in the filename
                    lang = match.getLanguage();
                }
                LOGGER.debug("Set detected charset \"{}\" and language \"{}\" for {}", subsCharacterSet, lang, externalFile.getAbsolutePath());
            } else {
                subsCharacterSet = null;
                LOGGER.debug("No charset detected for {}", externalFile.getAbsolutePath());
            }
        } catch (IOException ex) {
            subsCharacterSet = null;
            // Pass the Throwable itself so SLF4J logs the stack trace. The
            // original passed ex.getMessage() with no "{}" placeholder, so
            // the argument was silently dropped and no detail was logged.
            LOGGER.warn("Exception during external file charset detection: ", ex);
        }
    }
}
Use of com.ibm.icu.text.CharsetMatch in project UniversalMediaServer by UniversalMediaServer:
the FileUtil class, method getFileCharset.
/**
 * Detects charset/encoding for the given file. Not 100% accurate for
 * non-Unicode files.
 *
 * @param file the file for which to detect charset/encoding
 * @return The detected <code>Charset</code> or <code>null</code> if not detected
 * @throws IOException if the file cannot be read
 */
public static Charset getFileCharset(File file) throws IOException {
    CharsetMatch match = getFileCharsetMatch(file);
    if (match != null) {
        // Hoisted: the detected name is used in every branch below.
        String charsetName = match.getName();
        try {
            if (Charset.isSupported(charsetName)) {
                LOGGER.debug("Detected charset \"{}\" in file {}", charsetName, file.getAbsolutePath());
                return Charset.forName(charsetName);
            }
            LOGGER.debug("Detected charset \"{}\" in file {}, but cannot use it because it's not supported by the Java Virtual Machine", charsetName, file.getAbsolutePath());
            return null;
        } catch (IllegalCharsetNameException e) {
            // Charset.isSupported itself throws for syntactically illegal names.
            LOGGER.debug("Illegal charset detected \"{}\" in file {}", charsetName, file.getAbsolutePath());
        }
    }
    LOGGER.debug("Found no matching charset for file {}", file.getAbsolutePath());
    return null;
}
Use of com.ibm.icu.text.CharsetMatch in project polymap4-core by Polymap4:
the FileEncodingGuesserTest class, method detect.
/**
 * Reads the given file, runs ICU charset detection on its bytes, and asserts
 * that the detected charset name equals {@code expectedCharset}.
 *
 * @param file            the file whose encoding is guessed
 * @param expectedCharset the charset name the detector is expected to report
 * @throws IOException if the file cannot be read
 */
private void detect(File file, String expectedCharset) throws IOException {
    // Buffer sized from File.length(); test fixtures are small, so the
    // int cast is safe here.
    byte[] data = new byte[(int) file.length()];
    // try-with-resources closes the stream even when the assertion below
    // fails; the original leaked the FileInputStream on any exception.
    try (FileInputStream fin = new FileInputStream(file.getPath())) {
        // A single read() is not guaranteed to fill the array -- loop
        // until the buffer is full or EOF is reached.
        int offset = 0;
        int read;
        while (offset < data.length && (read = fin.read(data, offset, data.length - offset)) != -1) {
            offset += read;
        }
    }
    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);
    CharsetMatch cm = detector.detect();
    int confidence = cm.getConfidence();
    String name = cm.getName();
    assertEquals(expectedCharset, name);
    System.out.println("File: " + file.getName() + " - Encoding: " + cm.getName() + ":" + cm.getLanguage() + " - Confidence: " + confidence + "%");
}
Aggregations