use of com.ibm.icu.text.CharsetMatch in project domain_hunter_pro by bit4woo.
the class Commons method detectCharset.
/**
 * Determines the character set of a raw HTTP request or response.
 *
 * <p>Sources are consulted in this order:
 * <ol>
 *   <li>the {@code charset=} parameter of the Content-Type header (returned lower-cased,
 *       so both "utf8" and "utf-8" spellings are acceptable results);</li>
 *   <li>for responses only, {@code detectCharsetInBody};</li>
 *   <li>ICU4J statistical detection;</li>
 *   <li>the HTTP POST default, {@code ISO-8859-1}.</li>
 * </ol>
 *
 * @param requestOrResponse raw bytes of the HTTP message
 * @return the name of the charset that was declared, detected, or defaulted
 */
public static String detectCharset(byte[] requestOrResponse) {
    IExtensionHelpers helpers = BurpExtender.getCallbacks().getHelpers();
    Getter getter = new Getter(helpers);
    boolean isRequest = true;
    if (new String(requestOrResponse).startsWith("HTTP/")) {
        // A message that begins with a status line ("HTTP/...") is a response.
        isRequest = false;
    }
    String contentType = getter.getHeaderValueOf(isRequest, requestOrResponse, "Content-Type");
    // 1. Try to read the charset from the Content-Type header.
    if (contentType != null) {
        String lowered = contentType.toLowerCase();
        int marker = lowered.indexOf("charset=");
        if (marker >= 0) {
            // substring() instead of split()[1]: split drops trailing empty strings, so a
            // header ending in "charset=" used to raise ArrayIndexOutOfBoundsException.
            String declared = lowered.substring(marker + "charset=".length());
            // Drop any further parameters, e.g. "utf-8; boundary=xyz".
            int end = declared.indexOf(';');
            if (end >= 0) {
                declared = declared.substring(0, end);
            }
            declared = declared.trim();
            // The parameter value may be a quoted string: charset="utf-8".
            if (declared.length() >= 2 && declared.startsWith("\"") && declared.endsWith("\"")) {
                declared = declared.substring(1, declared.length() - 1).trim();
            }
            if (!declared.isEmpty()) {
                return declared;
            }
        }
    }
    if (!isRequest) {
        String tmpCharset = detectCharsetInBody(requestOrResponse);
        System.out.println("响应包中编码识别结果:" + tmpCharset);
        if (null != tmpCharset) {
            return tmpCharset;
        }
    }
    // 2. Try to detect the encoding with ICU4J.
    CharsetDetector detector = new CharsetDetector();
    detector.setText(requestOrResponse);
    CharsetMatch cm = detector.detect();
    // detect() yields null when nothing matches, so only touch cm after the null check.
    if (cm != null) {
        System.out.println("ICU4J检测到编码:" + cm.getName());
        return cm.getName();
    }
    // 3. Default encoding for HTTP POST.
    return "ISO-8859-1";
}
use of com.ibm.icu.text.CharsetMatch in project domain_hunter_pro by bit4woo.
the class HttpMessageCharSet method getCharset.
/**
 * Determines the character set of a raw HTTP request or response and normalizes it
 * to a common lower-case name where possible.
 *
 * <p>The {@code charset=} parameter of the Content-Type header is used when present;
 * otherwise ICU4J detection is attempted; if neither yields a name, the HTTP POST
 * default {@code ISO-8859-1} is used. The chosen name is lower-cased, and if it contains
 * one of the common charset names (ascii, ansi, gbk, gb2312, utf-8, gb18030, unicode,
 * utf8) it is replaced by that name, with "utf8" reported as "utf-8".
 *
 * @param requestOrResponse raw bytes of the HTTP message
 * @return the lower-cased charset name
 */
public static String getCharset(byte[] requestOrResponse) {
    IExtensionHelpers helpers = BurpExtender.getCallbacks().getHelpers();
    Getter getter = new Getter(helpers);
    boolean isRequest = true;
    if (new String(requestOrResponse).startsWith("HTTP/")) {
        // A message that begins with a status line ("HTTP/...") is a response.
        isRequest = false;
    }
    String contentType = getter.getHeaderValueOf(isRequest, requestOrResponse, "Content-Type");
    // Start with "unknown" so the ICU4J fallback below is actually reachable; the
    // previous code pre-set the default here, which made the detection dead code.
    String tmpcharSet = null;
    if (contentType != null) {
        // 1. Try to read the charset from the Content-Type header.
        String lowered = contentType.toLowerCase();
        int marker = lowered.indexOf("charset=");
        if (marker >= 0) {
            // substring() instead of split()[1]: a header ending in "charset=" used to
            // raise ArrayIndexOutOfBoundsException.
            String declared = lowered.substring(marker + "charset=".length()).trim();
            if (!declared.isEmpty()) {
                tmpcharSet = declared;
            }
        }
    }
    if (tmpcharSet == null) {
        // 2. Try to detect the encoding with ICU4J.
        CharsetDetector detector = new CharsetDetector();
        detector.setText(requestOrResponse);
        CharsetMatch cm = detector.detect();
        // detect() yields null when nothing matches.
        if (cm != null) {
            tmpcharSet = cm.getName();
        }
    }
    if (tmpcharSet == null) {
        // 3. Default encoding for HTTP POST.
        tmpcharSet = "ISO-8859-1";
    }
    tmpcharSet = tmpcharSet.toLowerCase().trim();
    // Common encodings include ASCII, ANSI, GBK, GB2312, UTF-8, GB18030 and UNICODE.
    // Order matters: a later match overrides an earlier one.
    String[] commonCharSet = {"ascii", "ansi", "gbk", "gb2312", "utf-8", "gb18030", "unicode", "utf8"};
    for (String item : commonCharSet) {
        if (tmpcharSet.contains(item)) {
            tmpcharSet = item;
        }
    }
    if (tmpcharSet.equals("utf8")) {
        tmpcharSet = "utf-8";
    }
    return tmpcharSet;
}
use of com.ibm.icu.text.CharsetMatch in project hale by halestudio.
the class CharsetConfigurationPage method detectCharset.
/**
 * Try to detect the character encoding of the given source and reflect the outcome in
 * the page: on success the charset combo is set to the detected name and an
 * informational message with the confidence is shown; otherwise a warning is shown.
 *
 * @param source the source
 * @throws IOException if the resource cannot be read
 */
protected void detectCharset(LocatableInputSupplier<? extends InputStream> source) throws IOException {
    CharsetMatch cm;
    // The stream is opened here, so it is closed here as well (it used to leak).
    try (InputStream raw = source.getInput()) {
        // CharsetDetector.setText(InputStream) marks and resets the stream; wrap streams
        // that do not support mark/reset so detection does not fail on them.
        InputStream input = raw.markSupported() ? raw : new java.io.BufferedInputStream(raw);
        CharsetDetector cd = new CharsetDetector();
        cd.setText(input);
        cm = cd.detect();
    }
    if (cm != null) {
        charsetCombo.setText(cm.getName());
        update();
        setMessage(MessageFormat.format("Character encoding {0} detected with {1}% confidence.", cm.getName(), cm.getConfidence()), INFORMATION);
    } else {
        setMessage("Character encoding detection yielded no result.", WARNING);
    }
}
use of com.ibm.icu.text.CharsetMatch in project stanbol by apache.
the class CharsetRecognizer method detect.
/**
 * Detects the character encoding of the content supplied by a stream.
 *
 * <p>If a format is given, {@code checkFormat} is consulted first and its non-null
 * result is returned directly. Otherwise ICU4J detection is run, using the declared
 * encoding (if any) as a hint.
 *
 * @param in the content stream; must support mark/reset
 * @param format the content format, or {@code null} to skip the format-specific check
 * @param encoding the declared encoding used as a detection hint, or {@code null}
 * @return the detected charset name; if ICU4J finds no match, the declared
 *         {@code encoding} is returned instead (which may be {@code null})
 * @throws IOException if the stream does not support marks or cannot be read
 */
public static String detect(InputStream in, String format, String encoding) throws IOException {
    // the input stream must support marks
    if (!in.markSupported()) {
        throw new IOException("Mark not supported by input stream");
    }
    String result = null;
    if (format != null) {
        result = checkFormat(format, in);
        if (result != null) {
            return result;
        }
    }
    // in case of HTML or XML check whether there is a charset
    // specification; might be too fragile
    CharsetDetector detector = new CharsetDetector();
    if (encoding != null) {
        detector.setDeclaredEncoding(encoding);
    }
    detector.setText(in);
    CharsetMatch found = detector.detect();
    // detect() yields null when nothing matches; fall back to the declared encoding
    // instead of failing with a NullPointerException.
    result = (found != null) ? found.getName() : encoding;
    LOG.debug("Encoding: " + result);
    return result;
}
use of com.ibm.icu.text.CharsetMatch in project nutch by apache.
the class EncodingDetector method autoDetectClues.
/**
 * Collects charset clues for the given content.
 *
 * <p>When detection is enabled ({@code minConfidence >= 0}), the content type is one of
 * {@code DETECTABLES} and the content is longer than {@code MIN_LENGTH}, every ICU4J
 * match is added as a "detect" clue with its confidence. Independently of that, the
 * charset parsed from the Content-Type response header is always added as a "header" clue.
 *
 * @param content the fetched content to inspect
 * @param filter whether the detector's input filter is enabled
 */
public void autoDetectClues(Content content, boolean filter) {
    final byte[] raw = content.getContent();
    final boolean worthDetecting = minConfidence >= 0
            && DETECTABLES.contains(content.getContentType())
            && raw.length > MIN_LENGTH;

    if (worthDetecting) {
        CharsetMatch[] candidates = null;
        try {
            // ICU4J will sometimes throw; detection is best-effort only
            detector.enableInputFilter(filter);
            detector.setText(raw);
            candidates = detector.detectAll();
        } catch (Exception e) {
            LOG.debug("Exception from ICU4J (ignoring): ", e);
        }

        if (candidates != null) {
            for (int i = 0; i < candidates.length; i++) {
                final CharsetMatch candidate = candidates[i];
                addClue(candidate.getName(), "detect", candidate.getConfidence());
            }
        }
    }

    // the encoding announced in the HTTP response header is always recorded
    addClue(parseCharacterEncoding(content.getMetadata().get(Response.CONTENT_TYPE)), "header");
}
Aggregations