Search in sources :

Example 16 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project mucommander by mucommander.

the class EncodingDetector method detectEncoding.

/**
 * Guesses the character encoding of a range of bytes and returns the name of the best match, or
 * <code>null</code> when no guess can be made (not enough data or confidence).
 * The returned name is not guaranteed to be supported by the running JVM -- check it with
 * <code>java.nio.charset.Charset#isSupported(String)</code> before using it.
 *
 * <p>At most {@link #MAX_RECOMMENDED_BYTE_SIZE} bytes are examined; anything past that limit is
 * ignored.</p>
 *
 * @param bytes the bytes for which to detect the encoding
 * @param off the array offset at which the data to process starts
 * @param len length of the data in the array
 * @return the best guess at the encoding, null if there is none (not enough data or confidence)
 */
public static String detectEncoding(byte[] bytes, int off, int len) {
    // Inputs shorter than 4 bytes are never handed to the detector: there is no guess for them.
    if (len < 4) {
        return null;
    }

    // Cap the examined window; bytes beyond the recommended size are not looked at.
    int window = len;
    if (window > MAX_RECOMMENDED_BYTE_SIZE) {
        window = MAX_RECOMMENDED_BYTE_SIZE;
    }

    // The detector is given a whole array, so unless the window already spans the entire
    // array from index 0, copy the window into an array of exactly that size.
    byte[] data = bytes;
    boolean spansWholeArray = off <= 0 && window >= bytes.length;
    if (!spansWholeArray) {
        data = new byte[window];
        System.arraycopy(bytes, off, data, 0, window);
    }

    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);
    CharsetMatch best = detector.detect();

    // Debug info
    if (best == null) {
        LOGGER.trace("bestMatch getName()={}, getConfidence()={}", "null", "null");
        return null;
    }
    String name = best.getName();
    LOGGER.trace("bestMatch getName()={}, getConfidence()={}", name, Integer.toString(best.getConfidence()));
    return name;
}
Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) CharsetDetector(com.ibm.icu.text.CharsetDetector)

Example 17 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project domain_hunter_pro by bit4woo.

the class Commons method detectCharset.

/**
 * Determines the character set of a raw HTTP request or response.
 *
 * <p>The sources are consulted in this order and the first usable answer wins:</p>
 * <ol>
 *   <li>the <code>charset=</code> parameter of the <code>Content-Type</code> header
 *       (lower-cased and trimmed);</li>
 *   <li>for responses only, whatever <code>detectCharsetInBody</code> reports;</li>
 *   <li>the best match reported by ICU4J's <code>CharsetDetector</code>;</li>
 *   <li>the fallback <code>"ISO-8859-1"</code>.</li>
 * </ol>
 *
 * <p>The name is returned as found, so both spellings such as <code>utf8</code> and
 * <code>utf-8</code> may come back.</p>
 *
 * @param requestOrResponse the raw bytes of the HTTP message
 * @return the charset name; never <code>null</code>
 */
public static String detectCharset(byte[] requestOrResponse) {
    IExtensionHelpers helpers = BurpExtender.getCallbacks().getHelpers();
    Getter getter = new Getter(helpers);
    // A message starting with "HTTP/" is a response; anything else is treated as a request.
    boolean isRequest = !new String(requestOrResponse).startsWith("HTTP/");
    String contentType = getter.getHeaderValueOf(isRequest, requestOrResponse, "Content-Type");
    // 1. Try to take the charset from the Content-Type header.
    if (contentType != null) {
        // Locale.ROOT keeps the lower-casing stable (e.g. no dotless i under a Turkish locale).
        String[] parts = contentType.toLowerCase(java.util.Locale.ROOT).split("charset=");
        // split() drops trailing empty strings, so a header ending in "charset=" has no
        // second element; guard the index instead of throwing.
        if (parts.length > 1) {
            String declared = parts[1].trim();
            if (!declared.isEmpty()) {
                return declared;
            }
        }
    }
    if (!isRequest) {
        String tmpCharset = detectCharsetInBody(requestOrResponse);
        System.out.println("响应包中编码识别结果:" + tmpCharset);
        if (null != tmpCharset) {
            return tmpCharset;
        }
    }
    // 2. Try to detect the encoding with ICU4J.
    CharsetDetector detector = new CharsetDetector();
    detector.setText(requestOrResponse);
    CharsetMatch cm = detector.detect();
    // detect() may yield no match; only touch cm after the null check so the default
    // below stays reachable.
    if (cm != null) {
        System.out.println("ICU4J检测到编码:" + cm.getName());
        return cm.getName();
    }
    // 3. Default encoding for HTTP POST.
    return "ISO-8859-1";
}
Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) CharsetDetector(com.ibm.icu.text.CharsetDetector)

Example 18 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project domain_hunter_pro by bit4woo.

the class HttpMessageCharSet method getCharset.

/**
 * Determines the character set of a raw HTTP request or response and normalises it to a
 * lower-case name.
 *
 * <p>The charset is taken from the <code>charset=</code> parameter of the
 * <code>Content-Type</code> header when present; otherwise ICU4J's <code>CharsetDetector</code>
 * is asked for its best match; if that yields nothing either, <code>"ISO-8859-1"</code> is
 * used. The result is lower-cased and trimmed. If it contains one of the common names
 * (ascii, ansi, gbk, gb2312, utf-8, gb18030, unicode, utf8) it is reduced to that name, and
 * <code>utf8</code> is reported as <code>utf-8</code>.</p>
 *
 * @param requestOrResponse the raw bytes of the HTTP message
 * @return the lower-case charset name; never <code>null</code>
 */
public static String getCharset(byte[] requestOrResponse) {
    IExtensionHelpers helpers = BurpExtender.getCallbacks().getHelpers();
    Getter getter = new Getter(helpers);
    // A message starting with "HTTP/" is a response; anything else is treated as a request.
    boolean isRequest = !new String(requestOrResponse).startsWith("HTTP/");
    String contentType = getter.getHeaderValueOf(isRequest, requestOrResponse, "Content-Type");

    // Start with "unknown" so the ICU4J step below can actually run; the default is
    // applied only after both sources have failed.
    String tmpcharSet = null;
    if (contentType != null) {
        // 1. Try to take the charset from the Content-Type header.
        String[] parts = contentType.toLowerCase(java.util.Locale.ROOT).split("charset=");
        // split() drops trailing empty strings, so guard the index for "...; charset=".
        if (parts.length > 1 && !parts[1].trim().isEmpty()) {
            tmpcharSet = parts[1];
        }
    }
    if (tmpcharSet == null) {
        // 2. Try to detect the encoding with ICU4J.
        CharsetDetector detector = new CharsetDetector();
        detector.setText(requestOrResponse);
        CharsetMatch cm = detector.detect();
        if (cm != null) {
            tmpcharSet = cm.getName();
        }
    }
    if (tmpcharSet == null) {
        // 3. Default encoding for HTTP POST.
        tmpcharSet = "ISO-8859-1";
    }
    tmpcharSet = tmpcharSet.toLowerCase(java.util.Locale.ROOT).trim();
    // Common encodings include ASCII, ANSI, GBK, GB2312, UTF-8, GB18030 and UNICODE.
    String[] commonCharSet = {"ascii", "ansi", "gbk", "gb2312", "utf-8", "gb18030", "unicode", "utf8"};
    for (String item : commonCharSet) {
        if (tmpcharSet.contains(item)) {
            // None of the common names contains another, so the first hit is final.
            tmpcharSet = item;
            break;
        }
    }
    if (tmpcharSet.equals("utf8")) {
        tmpcharSet = "utf-8";
    }
    return tmpcharSet;
}
Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) CharsetDetector(com.ibm.icu.text.CharsetDetector)

Example 19 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project hale by halestudio.

the class CharsetConfigurationPage method detectCharset.

/**
 * Try to detect the character encoding of the given source and reflect the outcome in the
 * page: on success the detected name is put into the charset combo, {@code update()} is
 * called and an informational message with the confidence is shown; otherwise a warning
 * message is shown.
 *
 * <p>The stream obtained from the source is always closed before this method returns.</p>
 *
 * @param source the source
 * @throws IOException if the resource cannot be read
 */
protected void detectCharset(LocatableInputSupplier<? extends InputStream> source) throws IOException {
    CharsetMatch cm;
    // Close the stream once detection is done; the match does not need it afterwards.
    try (InputStream raw = source.getInput()) {
        // CharsetDetector.setText(InputStream) needs mark/reset support, so buffer streams
        // that do not provide it. Closing raw is enough: the wrapper holds no other resource.
        InputStream input = raw.markSupported() ? raw : new java.io.BufferedInputStream(raw);
        CharsetDetector cd = new CharsetDetector();
        cd.setText(input);
        cm = cd.detect();
    }
    if (cm != null) {
        charsetCombo.setText(cm.getName());
        update();
        setMessage(MessageFormat.format("Character encoding {0} detected with {1}% confidence.", cm.getName(), cm.getConfidence()), INFORMATION);
    } else {
        setMessage("Character encoding detection yielded no result.", WARNING);
    }
}
Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) InputStream(java.io.InputStream) CharsetDetector(com.ibm.icu.text.CharsetDetector)

Example 20 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project stanbol by apache.

the class CharsetRecognizer method detect.

/**
 * Detects the character encoding of the data in the given stream.
 *
 * <p>When a format is given, {@code checkFormat} is consulted first and its non-null answer
 * is returned directly. Otherwise ICU4J's {@code CharsetDetector} is used, with
 * {@code encoding} passed as the declared-encoding hint when it is not null.</p>
 *
 * @param in the stream to inspect; must support mark/reset
 * @param format the format of the content, or null if unknown
 * @param encoding a declared encoding to pass to the detector as a hint, or null
 * @return the name of the detected encoding, or null if the detector finds no match
 * @throws IOException if the stream does not support marks or cannot be read
 */
public static String detect(InputStream in, String format, String encoding) throws IOException {
    // the input stream must support marks
    if (!in.markSupported()) {
        throw new IOException("Mark not supported by input stream");
    }
    String result = null;
    if (format != null) {
        result = checkFormat(format, in);
        if (result != null) {
            return result;
        }
    }
    // in case of HTML or XML check whether there is a charset
    // specification; might be too fragile
    CharsetDetector detector = new CharsetDetector();
    if (encoding != null) {
        detector.setDeclaredEncoding(encoding);
    }
    detector.setText(in);
    CharsetMatch found = detector.detect();
    // detect() yields null when nothing matches; report "no result" instead of failing
    // with a NullPointerException.
    result = (found != null) ? found.getName() : null;
    LOG.debug("Encoding: " + result);
    return result;
}
Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) CharsetDetector(com.ibm.icu.text.CharsetDetector) IOException(java.io.IOException)

Aggregations

CharsetMatch (com.ibm.icu.text.CharsetMatch)43 CharsetDetector (com.ibm.icu.text.CharsetDetector)28 IOException (java.io.IOException)12 BufferedInputStream (java.io.BufferedInputStream)8 InputStream (java.io.InputStream)5 File (java.io.File)4 FileInputStream (java.io.FileInputStream)4 IllegalCharsetNameException (java.nio.charset.IllegalCharsetNameException)3 BufferedReader (java.io.BufferedReader)2 Charset (java.nio.charset.Charset)2 Nullable (javax.annotation.Nullable)2 ServletException (javax.servlet.ServletException)2 SneakyThrows (lombok.SneakyThrows)2 DocumentFile (androidx.documentfile.provider.DocumentFile)1 Getter (burp.Getter)1 IExtensionHelpers (burp.IExtensionHelpers)1 OKMDocument (com.openkm.api.OKMDocument)1 AutomationException (com.openkm.automation.AutomationException)1 Document (com.openkm.bean.Document)1 OKMException (com.openkm.frontend.client.OKMException)1