Examples with CharsetMatch - com.ibm.icu.text.CharsetMatch

Example 11 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project domain_hunter_pro by bit4woo.

the class Commons method detectCharset.

/**
 * Determines the character set of a raw HTTP request or response.
 * <p>
 * The sources are tried in this order:
 * <ol>
 *   <li>the {@code charset=} parameter of the {@code Content-Type} header;</li>
 *   <li>for responses only, {@code detectCharsetInBody};</li>
 *   <li>ICU4J statistical detection over the whole message;</li>
 *   <li>the fallback {@code "ISO-8859-1"}.</li>
 * </ol>
 * A charset taken from the header is returned lower-cased, so both "utf8" and "utf-8"
 * may be returned depending on what the peer sent.
 *
 * @param requestOrResponse the raw bytes of the HTTP message; a message whose text starts
 *                          with {@code "HTTP/"} is treated as a response, anything else as
 *                          a request
 * @return the detected charset name, never {@code null}
 */
public static String detectCharset(byte[] requestOrResponse) {
    IExtensionHelpers helpers = BurpExtender.getCallbacks().getHelpers();
    Getter getter = new Getter(helpers);
    boolean isRequest = true;
    if (new String(requestOrResponse).startsWith("HTTP/")) {
        // response
        isRequest = false;
    }
    String contentType = getter.getHeaderValueOf(isRequest, requestOrResponse, "Content-Type");
    // 1. Try the charset parameter of the Content-Type header.
    if (contentType != null) {
        String lowered = contentType.toLowerCase();
        int paramStart = lowered.indexOf("charset=");
        if (paramStart >= 0) {
            // Using indexOf/substring instead of split(...)[1] avoids an
            // ArrayIndexOutOfBoundsException when the header ends with "charset=".
            String declared = lowered.substring(paramStart + "charset=".length());
            // Drop any following media-type parameters (e.g. "; boundary=...").
            int paramEnd = declared.indexOf(';');
            if (paramEnd >= 0) {
                declared = declared.substring(0, paramEnd);
            }
            // The parameter value may be a quoted string.
            declared = declared.replace("\"", "").trim();
            if (!declared.isEmpty()) {
                return declared;
            }
        }
    }
    if (!isRequest) {
        String tmpCharset = detectCharsetInBody(requestOrResponse);
        System.out.println("响应包中编码识别结果：" + tmpCharset);
        if (null != tmpCharset) {
            return tmpCharset;
        }
    }
    // 2. Try ICU4J charset detection.
    CharsetDetector detector = new CharsetDetector();
    detector.setText(requestOrResponse);
    CharsetMatch cm = detector.detect();
    // detect() may yield no match; check before dereferencing so the fallback below is reachable.
    if (cm != null) {
        System.out.println("ICU4J检测到编码：" + cm.getName());
        return cm.getName();
    }
    // 3. Default encoding for HTTP POST.
    return "ISO-8859-1";
}

Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) CharsetDetector(com.ibm.icu.text.CharsetDetector)

Example 12 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project domain_hunter_pro by bit4woo.

the class HttpMessageCharSet method getCharset.

/**
 * Determines the character set of a raw HTTP request or response and normalises it to a
 * lower-case name.
 * <p>
 * The {@code charset=} parameter of the {@code Content-Type} header is used when present;
 * otherwise ICU4J detection is attempted; if that yields nothing, {@code "iso-8859-1"} is used.
 * If the resulting name contains one of the common charset names listed in the body, that
 * common name is returned instead, and {@code "utf8"} is rewritten to {@code "utf-8"}.
 *
 * @param requestOrResponse the raw bytes of the HTTP message; a message whose text starts
 *                          with {@code "HTTP/"} is treated as a response, anything else as
 *                          a request
 * @return the lower-case charset name, never {@code null}
 */
public static String getCharset(byte[] requestOrResponse) {
    IExtensionHelpers helpers = BurpExtender.getCallbacks().getHelpers();
    Getter getter = new Getter(helpers);
    boolean isRequest = true;
    if (new String(requestOrResponse).startsWith("HTTP/")) {
        // response
        isRequest = false;
    }
    String contentType = getter.getHeaderValueOf(isRequest, requestOrResponse, "Content-Type");
    // Starts as null so the ICU4J step below is reachable when the header gives no charset.
    String tmpcharSet = null;
    if (contentType != null) {
        // 1. Try the charset parameter of the Content-Type header.
        String[] parts = contentType.toLowerCase().split("charset=");
        // split() drops trailing empty strings, so a header ending in "charset=" has no parts[1].
        if (parts.length > 1 && !parts[1].trim().isEmpty()) {
            tmpcharSet = parts[1];
        }
    }
    if (tmpcharSet == null) {
        // 2. Try ICU4J charset detection.
        CharsetDetector detector = new CharsetDetector();
        detector.setText(requestOrResponse);
        CharsetMatch cm = detector.detect();
        if (cm != null) {
            tmpcharSet = cm.getName();
        }
    }
    if (tmpcharSet == null) {
        // 3. Default encoding for HTTP POST.
        tmpcharSet = "ISO-8859-1";
    }
    tmpcharSet = tmpcharSet.toLowerCase().trim();
    // Common encodings include ASCII, ANSI, GBK, GB2312, UTF-8, GB18030 and UNICODE.
    List<String> commonCharSet = Arrays.asList("ASCII,ANSI,GBK,GB2312,UTF-8,GB18030,UNICODE,utf8".toLowerCase().split(","));
    for (String item : commonCharSet) {
        // Collapse values such as "utf-8; boundary=x" to the bare common name.
        if (tmpcharSet.contains(item)) {
            tmpcharSet = item;
        }
    }
    if (tmpcharSet.equals("utf8")) {
        tmpcharSet = "utf-8";
    }
    return tmpcharSet;
}

Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) CharsetDetector(com.ibm.icu.text.CharsetDetector)

Example 13 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project hale by halestudio.

the class CharsetConfigurationPage method detectCharset.

/**
 * Try to detect the character encoding of the given source and show the result on the page.
 * <p>
 * On success the detected name is written to the charset combo, {@code update()} is called
 * and an informational message with the confidence is shown; otherwise a warning message
 * is shown. The stream obtained from the source is always closed before returning.
 *
 * @param source the source
 * @throws IOException if the resource cannot be read
 */
protected void detectCharset(LocatableInputSupplier<? extends InputStream> source) throws IOException {
    CharsetMatch cm;
    // try-with-resources: the stream opened here was previously never closed.
    try (InputStream raw = source.getInput()) {
        // CharsetDetector.setText(InputStream) relies on mark()/reset(); wrap streams that lack it.
        InputStream input = raw.markSupported() ? raw : new java.io.BufferedInputStream(raw);
        CharsetDetector cd = new CharsetDetector();
        cd.setText(input);
        cm = cd.detect();
    }
    if (cm != null) {
        charsetCombo.setText(cm.getName());
        update();
        setMessage(MessageFormat.format("Character encoding {0} detected with {1}% confidence.", cm.getName(), cm.getConfidence()), INFORMATION);
    } else {
        setMessage("Character encoding detection yielded no result.", WARNING);
    }
}

Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) InputStream(java.io.InputStream) CharsetDetector(com.ibm.icu.text.CharsetDetector)

Example 14 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project stanbol by apache.

the class CharsetRecognizer method detect.

/**
 * Detects the character encoding of the data in the given stream.
 * <p>
 * If a format is given, {@code checkFormat} is consulted first and its non-null result is
 * returned directly. Otherwise ICU4J detection is run, with {@code encoding} passed to the
 * detector as the declared encoding when it is not null.
 *
 * @param in       the input stream; must support marks
 * @param format   format passed to {@code checkFormat}, or {@code null} to skip that step
 * @param encoding declared encoding handed to the detector, or {@code null} if none is known
 * @return the detected charset name; if the detector finds no match, the declared
 *         {@code encoding} is returned, which may be {@code null}
 * @throws IOException if the stream does not support marks or cannot be read
 */
public static String detect(InputStream in, String format, String encoding) throws IOException {
    // the input stream must support marks
    if (!in.markSupported()) {
        throw new IOException("Mark not supported by input stream");
    }
    String result = null;
    if (format != null) {
        result = checkFormat(format, in);
        if (result != null) {
            return result;
        }
    }
    // in case of HTML or XML check whether there is a charset
    // specification; might be too fragile
    CharsetDetector detector = new CharsetDetector();
    if (encoding != null) {
        detector.setDeclaredEncoding(encoding);
    }
    detector.setText(in);
    CharsetMatch found = detector.detect();
    // detect() may yield no match; fall back to the caller's declared encoding instead of
    // failing with a NullPointerException.
    result = (found != null) ? found.getName() : encoding;
    LOG.debug("Encoding: " + result);
    return result;
}

Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) CharsetDetector(com.ibm.icu.text.CharsetDetector) IOException(java.io.IOException)

Example 15 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project nutch by apache.

the class EncodingDetector method autoDetectClues.

/**
 * Collects charset clues for the given content.
 * <p>
 * When {@code minConfidence} is non-negative, the content type is in {@code DETECTABLES}
 * and the content is longer than {@code MIN_LENGTH}, every ICU4J match is added as a
 * "detect" clue together with its confidence. A clue parsed from the Content-Type
 * metadata entry is always added afterwards as a "header" clue.
 *
 * @param content the content to inspect
 * @param filter  value passed to the detector's {@code enableInputFilter}
 */
public void autoDetectClues(Content content, boolean filter) {
    final byte[] bytes = content.getContent();
    final boolean runDetector = minConfidence >= 0
            && DETECTABLES.contains(content.getContentType())
            && bytes.length > MIN_LENGTH;
    if (runDetector) {
        CharsetMatch[] candidates;
        try {
            detector.enableInputFilter(filter);
            detector.setText(bytes);
            candidates = detector.detectAll();
        } catch (Exception e) {
            // ICU4J sometimes throws here; detection is best-effort, so log and carry on.
            LOG.debug("Exception from ICU4J (ignoring): ", e);
            candidates = null;
        }
        if (candidates != null) {
            for (int i = 0; i < candidates.length; i++) {
                final CharsetMatch candidate = candidates[i];
                addClue(candidate.getName(), "detect", candidate.getConfidence());
            }
        }
    }
    // The encoding announced in the HTTP response header is always recorded as a clue.
    final String headerValue = content.getMetadata().get(Response.CONTENT_TYPE);
    addClue(parseCharacterEncoding(headerValue), "header");
}

Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) IOException(java.io.IOException)

Aggregations

CharsetMatch (com.ibm.icu.text.CharsetMatch)30 CharsetDetector (com.ibm.icu.text.CharsetDetector)21 IOException (java.io.IOException)9 BufferedInputStream (java.io.BufferedInputStream)3 InputStream (java.io.InputStream)3 IllegalCharsetNameException (java.nio.charset.IllegalCharsetNameException)3 BufferedReader (java.io.BufferedReader)2 File (java.io.File)2 FileInputStream (java.io.FileInputStream)2 Nullable (javax.annotation.Nullable)2 DocumentFile (androidx.documentfile.provider.DocumentFile)1 Getter (burp.Getter)1 IExtensionHelpers (burp.IExtensionHelpers)1 UserServletException (com.zimbra.cs.service.UserServletException)1 ItemId (com.zimbra.cs.service.util.ItemId)1 BufferedWriter (java.io.BufferedWriter)1 FileNotFoundException (java.io.FileNotFoundException)1 FileWriter (java.io.FileWriter)1 InputStreamReader (java.io.InputStreamReader)1 PushbackInputStream (java.io.PushbackInputStream)1