Search in sources :

Example 1 with UniversalDetector

use of org.mozilla.universalchardet.UniversalDetector in project Asqatasun by Asqatasun.

the class CrawlUtils method extractCharset.

/**
 * Guesses the character encoding of the bytes delivered by a stream.
 * <p>
 * The stream is fed to a {@link UniversalDetector} in 4 KB chunks until the
 * stream is exhausted or the detector reports that it is done. The stream is
 * consumed but not closed by this method.
 *
 * @param is the stream whose content is analysed
 * @return the detected charset name when one was found and
 *         {@code CrawlUtils.isValidCharset} accepts it; otherwise
 *         {@code DEFAULT_CHARSET}
 * @throws java.io.IOException if reading from the stream fails
 */
public static String extractCharset(InputStream is) throws java.io.IOException {
    UniversalDetector charsetDetector = new UniversalDetector(null);
    byte[] chunk = new byte[4096];
    for (;;) {
        int count = is.read(chunk);
        // Stop at end of stream, or as soon as the detector has made up its mind.
        if (count <= 0 || charsetDetector.isDone()) {
            break;
        }
        charsetDetector.handleData(chunk, 0, count);
    }
    charsetDetector.dataEnd();
    String detected = charsetDetector.getDetectedCharset();
    LOGGER.debug(detected != null ? "Detected encoding = " + detected : "No encoding detected.");
    charsetDetector.reset();
    boolean usable = detected != null && CrawlUtils.isValidCharset(detected);
    return usable ? detected : DEFAULT_CHARSET;
}
Also used : UniversalDetector(org.mozilla.universalchardet.UniversalDetector)

Example 2 with UniversalDetector

use of org.mozilla.universalchardet.UniversalDetector in project Jota-Text-Editor-old by jiro-aqua.

the class TextLoadTask method openFile.

/**
 * Loads the whole content of a stream as text.
 * <p>
 * The first 64 KB are pre-read to detect the charset (only when
 * {@code encode} is null or empty) and the line-break style. The stream is
 * then rewound and decoded line by line; every line is appended to the result
 * followed by a single {@code '\n'}, and a leading BOM character is dropped.
 * <p>
 * Side effects: sets {@code mCharset} and {@code mLinebreak}, and sets
 * {@code mLineToChar} to the result offset reached when line number
 * {@code mLine} is about to be appended. The stream is always closed before
 * this method returns.
 *
 * @param input  the stream to read
 * @param encode the charset name to decode with; null or empty requests
 *               auto-detection, falling back to {@code "utf-8"}
 * @return the decoded text, an empty builder when the stream yields no
 *         bytes, or {@code null} when an {@link IOException} occurs
 */
protected SpannableStringBuilder openFile(InputStream input, String encode) {
    SpannableStringBuilder result = new SpannableStringBuilder();
    InputStream is = null;
    BufferedReader br = null;
    try {
        mCharset = "utf-8";
        mLinebreak = LineBreak.LF;
        is = new BufferedInputStream(input, 65536);
        is.mark(65536);
        // preread leading 64KB
        byte[] buff = new byte[64 * 1024];
        int nread = is.read(buff);
        if (nread <= 0) {
            // encode is nullable (see the detection step below), so guard before length().
            if (encode != null && encode.length() != 0) {
                mCharset = encode;
            }
            return new SpannableStringBuilder("");
        }
        // Detect charset
        if (encode == null || encode.length() == 0) {
            try {
                UniversalDetector detector = new UniversalDetector();
                detector.handleData(buff, 0, nread);
                detector.dataEnd();
                encode = detector.getCharset();
                detector.destroy();
            } catch (DetectorException ignored) {
                // Detection is best effort: on failure encode stays unset and
                // the utf-8 fallback below applies.
            }
        }
        is.reset();
        if (encode == null || encode.length() == 0) {
            encode = "utf-8";
        }
        // Encode CR and LF in the target charset so that multi-byte encodings
        // are matched on their real byte patterns.
        Charset charset = Charset.forName(encode);
        ByteBuffer bb = charset.encode("\r");
        byte[] cr = new byte[bb.limit()];
        bb.get(cr);
        bb = charset.encode("\n");
        byte[] lf = new byte[bb.limit()];
        bb.get(lf);
        int linebreak = detectLineBreak(buff, nread, cr, lf);
        mCharset = encode;
        mLinebreak = linebreak;
        br = new BufferedReader(new InputStreamReader(is, encode), 8192 * 2);
        int line = 0;
        String text;
        while ((text = br.readLine()) != null) {
            // remove BOM
            if (line == 0) {
                if (text.length() > 0 && text.charAt(0) == 0xfeff) {
                    text = text.substring(1);
                }
            }
            line++;
            if (line == mLine) {
                mLineToChar = result.length();
            }
            result.append(text);
            result.append('\n');
        }
        return result;
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Closing the reader also closes the wrapped stream; fall back to the
        // stream itself when the reader was never created.
        try {
            if (br != null) {
                br.close();
            } else if (is != null) {
                is.close();
            }
        } catch (IOException ignored) {
            // Nothing useful can be done if close fails.
        }
    }
    return null;
}

/**
 * Determines the line-break style from the first line break found in the
 * pre-read bytes.
 *
 * @param buff  the pre-read bytes
 * @param nread the number of valid bytes in {@code buff}
 * @param cr    the encoded form of {@code "\r"}
 * @param lf    the encoded form of {@code "\n"}
 * @return {@code LineBreak.LF}, {@code LineBreak.CR} or
 *         {@code LineBreak.CRLF}; {@code LineBreak.LF} when none is found
 */
private int detectLineBreak(byte[] buff, int nread, byte[] cr, byte[] lf) {
    if (cr.length == 1) {
        for (int i = 0; i < nread - 1; i++) {
            if (buff[i] == lf[0]) {
                return LineBreak.LF;
            }
            if (buff[i] == cr[0]) {
                return buff[i + 1] == lf[0] ? LineBreak.CRLF : LineBreak.CR;
            }
        }
        return LineBreak.LF;
    }
    // Two-byte code units; encodings where cr.length > 2 are not handled.
    for (int i = 0; i < nread - 2; i += 2) {
        if (buff[i] == lf[0] && buff[i + 1] == lf[1]) {
            return LineBreak.LF;
        }
        if (buff[i] == cr[0] && buff[i + 1] == cr[1]) {
            // Only look at the following unit when it lies fully inside the
            // bytes that were actually read.
            boolean lfFollows = i + 3 < nread
                    && buff[i + 2] == lf[0]
                    && buff[i + 3] == lf[1];
            return lfFollows ? LineBreak.CRLF : LineBreak.CR;
        }
    }
    return LineBreak.LF;
}
Also used : InputStreamReader(java.io.InputStreamReader) BufferedInputStream(java.io.BufferedInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Charset(java.nio.charset.Charset) DetectorException(org.mozilla.universalchardet.UniversalDetector.DetectorException) IOException(java.io.IOException) ByteBuffer(java.nio.ByteBuffer) UniversalDetector(org.mozilla.universalchardet.UniversalDetector) BufferedInputStream(java.io.BufferedInputStream) BufferedReader(java.io.BufferedReader) SpannableStringBuilder(jp.sblo.pandora.jota.text.SpannableStringBuilder)

Example 3 with UniversalDetector

use of org.mozilla.universalchardet.UniversalDetector in project gerrit by GerritCodeReview.

the class Text method charset.

/**
 * Resolves the charset to use for the given content.
 * <p>
 * When {@code encoding} is null the name is guessed from the bytes with a
 * {@link UniversalDetector}. {@code ISO_8859_1} is returned when no name is
 * available, or when the name is illegal or unsupported (both cases are
 * logged as errors).
 *
 * @param content  the bytes to inspect when no encoding is given
 * @param encoding the charset name to use, or null to auto-detect
 * @return the resolved charset, or {@code ISO_8859_1} as the fallback
 */
private static Charset charset(byte[] content, String encoding) {
    String name = encoding;
    if (name == null) {
        UniversalDetector detector = new UniversalDetector(null);
        detector.handleData(content, 0, content.length);
        detector.dataEnd();
        name = detector.getDetectedCharset();
        if (name == null) {
            return ISO_8859_1;
        }
    }
    Charset resolved;
    try {
        resolved = Charset.forName(name);
    } catch (IllegalCharsetNameException err) {
        log.error("Invalid detected charset name '" + name + "': " + err);
        resolved = ISO_8859_1;
    } catch (UnsupportedCharsetException err) {
        log.error("Detected charset '" + name + "' not supported: " + err);
        resolved = ISO_8859_1;
    }
    return resolved;
}
Also used : IllegalCharsetNameException(java.nio.charset.IllegalCharsetNameException) UnsupportedCharsetException(java.nio.charset.UnsupportedCharsetException) UniversalDetector(org.mozilla.universalchardet.UniversalDetector)

Aggregations

UniversalDetector (org.mozilla.universalchardet.UniversalDetector)3 BufferedInputStream (java.io.BufferedInputStream)1 BufferedReader (java.io.BufferedReader)1 FileInputStream (java.io.FileInputStream)1 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 InputStreamReader (java.io.InputStreamReader)1 ByteBuffer (java.nio.ByteBuffer)1 Charset (java.nio.charset.Charset)1 IllegalCharsetNameException (java.nio.charset.IllegalCharsetNameException)1 UnsupportedCharsetException (java.nio.charset.UnsupportedCharsetException)1 SpannableStringBuilder (jp.sblo.pandora.jota.text.SpannableStringBuilder)1 DetectorException (org.mozilla.universalchardet.UniversalDetector.DetectorException)1