Use of org.mozilla.universalchardet.UniversalDetector in project Asqatasun by Asqatasun.
Class CrawlUtils, method extractCharset.
/**
 * Guesses the character encoding of the bytes supplied by a stream.
 * <p>
 * The stream is consumed in 4096-byte chunks that are handed to a
 * {@code UniversalDetector} until either the stream is exhausted or the
 * detector reports that it is done. The stream is not closed by this method.
 *
 * @param is the stream to sample; it is read from its current position
 * @return the detected charset name when one was found and
 *         {@code CrawlUtils.isValidCharset} accepts it, otherwise
 *         {@code DEFAULT_CHARSET}
 * @throws java.io.IOException if reading from {@code is} fails
 */
public static String extractCharset(InputStream is) throws java.io.IOException {
    UniversalDetector charsetDetector = new UniversalDetector(null);
    byte[] chunk = new byte[4096];

    // Read first, then ask the detector whether it still wants data: the
    // same evaluation order as a "read && !isDone" while-condition.
    for (int count = is.read(chunk);
            count > 0 && !charsetDetector.isDone();
            count = is.read(chunk)) {
        charsetDetector.handleData(chunk, 0, count);
    }
    charsetDetector.dataEnd();

    String detected = charsetDetector.getDetectedCharset();
    LOGGER.debug(detected == null
            ? "No encoding detected."
            : "Detected encoding = " + detected);
    charsetDetector.reset();

    if (detected == null || !CrawlUtils.isValidCharset(detected)) {
        return DEFAULT_CHARSET;
    }
    return detected;
}
Use of org.mozilla.universalchardet.UniversalDetector in project Jota-Text-Editor-old by jiro-aqua.
Class TextLoadTask, method openFile.
/**
 * Loads the full content of {@code input} as text.
 * <p>
 * The first 64KB are pre-read to determine the charset (only when
 * {@code encode} is null or empty) and the line-break style; the stream is
 * then rewound and decoded line by line. Every line is appended to the
 * result followed by a single {@code '\n'}, and a leading U+FEFF (BOM) on
 * the first line is dropped.
 * <p>
 * Side effects: {@code mCharset} and {@code mLinebreak} are set to the
 * charset name and line-break style that were used, and
 * {@code mLineToChar} receives the character offset at which the
 * (1-based) line {@code mLine} starts, if that line exists. The stream is
 * always closed before returning.
 *
 * @param input  the stream to read; closed by this method
 * @param encode charset name to use, or null/empty to auto-detect
 *               (falling back to "utf-8")
 * @return the loaded text, an empty builder when the stream yields no
 *         data, or {@code null} when an {@link IOException} occurs
 */
protected SpannableStringBuilder openFile(InputStream input, String encode) {
    SpannableStringBuilder result = new SpannableStringBuilder();
    mCharset = "utf-8";
    mLinebreak = LineBreak.LF;
    InputStream is = new BufferedInputStream(input, 65536);
    try {
        is.mark(65536);
        // preread leading 64KB
        byte[] buff = new byte[64 * 1024];
        int nread = is.read(buff);
        if (nread <= 0) {
            // encode may legitimately be null here (auto-detect requested).
            if (encode != null && encode.length() != 0) {
                mCharset = encode;
            }
            return new SpannableStringBuilder("");
        }

        // Detect charset
        if (encode == null || encode.length() == 0) {
            try {
                UniversalDetector detector = new UniversalDetector();
                detector.handleData(buff, 0, nread);
                detector.dataEnd();
                encode = detector.getCharset();
                detector.destroy();
            } catch (DetectorException ignored) {
                // Detection is best-effort: on failure encode stays
                // null/empty and the utf-8 default below is used.
            }
        }
        is.reset();

        if (encode == null || encode.length() == 0) {
            encode = "utf-8";
        }
        Charset charset;
        try {
            charset = Charset.forName(encode);
        } catch (IllegalArgumentException e) {
            // Covers both IllegalCharsetNameException and
            // UnsupportedCharsetException: fall back instead of crashing
            // on a bogus detected or user-supplied name.
            encode = "utf-8";
            charset = Charset.forName(encode);
        }

        // detect linebreak code
        // ASCII defaults are kept for decode-only charsets, whose
        // encode() would throw UnsupportedOperationException.
        byte[] cr = new byte[] { '\r' };
        byte[] lf = new byte[] { '\n' };
        if (charset.canEncode()) {
            ByteBuffer bb = charset.encode("\r");
            cr = new byte[bb.limit()];
            bb.get(cr);
            bb = charset.encode("\n");
            lf = new byte[bb.limit()];
            bb.get(lf);
        }

        // Walk the sample one encoded unit at a time; the first CR or LF
        // found decides the style. All comparisons are bounded by nread so
        // stale bytes past the data actually read are never inspected.
        int linebreak = LineBreak.LF;
        int width = Math.max(1, cr.length);
        for (int i = 0; i + width <= nread; i += width) {
            if (matchesAt(buff, nread, i, lf)) {
                linebreak = LineBreak.LF;
                break;
            }
            if (matchesAt(buff, nread, i, cr)) {
                linebreak = matchesAt(buff, nread, i + width, lf)
                        ? LineBreak.CRLF
                        : LineBreak.CR;
                break;
            }
        }

        mCharset = encode;
        mLinebreak = linebreak;

        BufferedReader br = new BufferedReader(new InputStreamReader(is, encode), 8192 * 2);
        int line = 0;
        String text;
        while ((text = br.readLine()) != null) {
            // remove BOM
            if (line == 0 && text.length() > 0 && text.charAt(0) == 0xfeff) {
                text = text.substring(1);
            }
            line++;
            if (line == mLine) {
                // Offset of the start of the requested line.
                mLineToChar = result.length();
            }
            result.append(text);
            result.append('\n');
        }
        return result;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    } finally {
        // Closing the buffered stream also releases the reader's source,
        // on success, on the empty-input return and on failure alike.
        try {
            is.close();
        } catch (IOException ignored) {
            // Nothing useful can be done if close itself fails.
        }
    }
}

/**
 * Tells whether {@code pattern} occurs in {@code data} at {@code offset},
 * without looking at or beyond index {@code limit}.
 */
private boolean matchesAt(byte[] data, int limit, int offset, byte[] pattern) {
    if (offset < 0 || offset + pattern.length > limit) {
        return false;
    }
    for (int k = 0; k < pattern.length; k++) {
        if (data[offset + k] != pattern[k]) {
            return false;
        }
    }
    return true;
}
Use of org.mozilla.universalchardet.UniversalDetector in project gerrit by GerritCodeReview.
Class Text, method charset.
/**
 * Resolves the charset to use for a piece of content.
 * <p>
 * When {@code encoding} is null the name is obtained by running a
 * {@code UniversalDetector} over the whole of {@code content}. If no name
 * is available, or the name is illegal or unsupported on this JVM,
 * {@code ISO_8859_1} is returned (the latter two cases are logged as
 * errors).
 *
 * @param content  the raw bytes, inspected only when {@code encoding} is null
 * @param encoding an explicit charset name, or null to auto-detect
 * @return the resolved charset, never null
 */
private static Charset charset(byte[] content, String encoding) {
    String charsetName = encoding;
    if (charsetName == null) {
        UniversalDetector detector = new UniversalDetector(null);
        detector.handleData(content, 0, content.length);
        detector.dataEnd();
        charsetName = detector.getDetectedCharset();
        if (charsetName == null) {
            // Nothing supplied and nothing detected.
            return ISO_8859_1;
        }
    }

    Charset resolved;
    try {
        resolved = Charset.forName(charsetName);
    } catch (IllegalCharsetNameException err) {
        log.error("Invalid detected charset name '" + charsetName + "': " + err);
        resolved = ISO_8859_1;
    } catch (UnsupportedCharsetException err) {
        log.error("Detected charset '" + charsetName + "' not supported: " + err);
        resolved = ISO_8859_1;
    }
    return resolved;
}
Aggregations