Search in sources :

Example 31 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project knife by bit4woo.

the class HttpMessageCharSet method getCharset.

/**
 * Determines the character set of a raw HTTP request or response.
 *
 * <p>Resolution order:
 * <ol>
 *   <li>the {@code charset=} parameter of the {@code Content-Type} header, if present and
 *       non-empty;</li>
 *   <li>ICU4J content-based detection over the raw bytes;</li>
 *   <li>{@code ISO-8859-1} as the last-resort default.</li>
 * </ol>
 * The result is lower-cased, and collapsed to one of a handful of well-known charset names when
 * it contains one of them ({@code utf8} is normalised to {@code utf-8}).
 *
 * @param requestOrResponse the raw bytes of the HTTP message
 * @return the lower-case charset name
 */
public static String getCharset(byte[] requestOrResponse) {
    IExtensionHelpers helpers = BurpExtender.getCallbacks().getHelpers();
    Getter getter = new Getter(helpers);
    boolean isRequest = true;
    if (new String(requestOrResponse).startsWith("HTTP/")) {
        // A message that begins with "HTTP/" is treated as a response.
        isRequest = false;
    }
    String contentType = getter.getHeaderValueOf(isRequest, requestOrResponse, "Content-Type");
    // Left null until a charset is actually found, so that the fallbacks below can run.
    String tmpcharSet = null;
    if (contentType != null) {
        // 1. Try to read the charset from the Content-Type header.
        String lowerContentType = contentType.toLowerCase();
        if (lowerContentType.contains("charset=")) {
            // Limit -1 keeps trailing empty strings, so index 1 exists even for "...; charset=".
            String declared = lowerContentType.split("charset=", -1)[1];
            int semicolon = declared.indexOf(';');
            if (semicolon >= 0) {
                // Drop any parameters that follow the charset value.
                declared = declared.substring(0, semicolon);
            }
            declared = declared.replace("\"", "").trim();
            if (declared.length() > 0) {
                tmpcharSet = declared;
            }
        }
    }
    if (tmpcharSet == null) {
        // 2. Fall back to ICU4J content-based detection.
        CharsetDetector detector = new CharsetDetector();
        detector.setText(requestOrResponse);
        CharsetMatch cm = detector.detect();
        if (cm != null) {
            tmpcharSet = cm.getName();
        }
    }
    if (tmpcharSet == null) {
        // 3. Default encoding for HTTP POST.
        tmpcharSet = "ISO-8859-1";
    }
    tmpcharSet = tmpcharSet.toLowerCase().trim();
    // Common encodings: ASCII, ANSI, GBK, GB2312, UTF-8, GB18030, UNICODE, etc.
    List<String> commonCharSet = Arrays.asList("ASCII,ANSI,GBK,GB2312,UTF-8,GB18030,UNICODE,utf8".toLowerCase().split(","));
    // If the value contains a well-known name, reduce it to that name; when several names
    // match, the last one in list order wins.
    for (String item : commonCharSet) {
        if (tmpcharSet.contains(item)) {
            tmpcharSet = item;
        }
    }
    if (tmpcharSet.equals("utf8")) {
        tmpcharSet = "utf-8";
    }
    return tmpcharSet;
}
Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) CharsetDetector(com.ibm.icu.text.CharsetDetector)

Example 32 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project knife by bit4woo.

the class CharSetHelper method detectCharset.

/**
 * Detects the character set of a raw HTTP request or response.
 *
 * <p>Resolution order: the {@code charset=} parameter of the {@code Content-Type} header, then
 * ICU4J content-based detection, then {@code ISO-8859-1} as the default. A charset taken from
 * the header is returned lower-cased, without surrounding quotes, whitespace or trailing
 * parameters. Both {@code utf8} and {@code utf-8} are acceptable results.
 *
 * @param requestOrResponse the raw bytes of the HTTP message
 * @return the detected charset name, never {@code null}
 */
public static String detectCharset(byte[] requestOrResponse) {
    IExtensionHelpers helpers = BurpExtender.getCallbacks().getHelpers();
    Getter getter = new Getter(helpers);
    boolean isRequest = true;
    if (new String(requestOrResponse).startsWith("HTTP/")) {
        // A message that begins with "HTTP/" is treated as a response.
        isRequest = false;
    }
    String contentType = getter.getHeaderValueOf(isRequest, requestOrResponse, "Content-Type");
    // 1. Try to read the charset from the Content-Type header.
    if (contentType != null) {
        String lowerContentType = contentType.toLowerCase();
        if (lowerContentType.contains("charset=")) {
            // Limit -1 keeps trailing empty strings, so index 1 exists even for "...; charset=".
            String tmpcharSet = lowerContentType.split("charset=", -1)[1];
            int semicolon = tmpcharSet.indexOf(';');
            if (semicolon >= 0) {
                // Drop any parameters that follow the charset value.
                tmpcharSet = tmpcharSet.substring(0, semicolon);
            }
            tmpcharSet = tmpcharSet.replace("\"", "").trim();
            if (tmpcharSet.length() > 0) {
                return tmpcharSet;
            }
        }
    }
    // 2. Fall back to ICU4J content-based detection.
    CharsetDetector detector = new CharsetDetector();
    detector.setText(requestOrResponse);
    CharsetMatch cm = detector.detect();
    if (cm != null) {
        return cm.getName();
    }
    // 3. Default encoding for HTTP POST.
    return "ISO-8859-1";
}
Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) IExtensionHelpers(burp.IExtensionHelpers) Getter(burp.Getter) CharsetDetector(com.ibm.icu.text.CharsetDetector)

Example 33 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project tablesaw by jtablesaw.

the class Source method getCharSet.

/**
 * Returns the likely charset for the given byte[], if it can be determined. A confidence score is
 * calculated. If no match is found, the score is less than 60 (on a 0 to 100 interval), or the
 * detected charset name is not supported by this JVM, the system default charset is returned
 * instead.
 *
 * @param buffer The byte array to evaluate
 * @return The likely charset, or the system default charset
 */
private static Charset getCharSet(byte[] buffer) {
    CharsetDetector detector = new CharsetDetector();
    detector.setText(buffer);
    CharsetMatch match = detector.detect();
    if (match == null || match.getConfidence() < 60) {
        return Charset.defaultCharset();
    }
    try {
        return Charset.forName(match.getName());
    } catch (IllegalArgumentException e) {
        // Charset.forName throws IllegalCharsetNameException or UnsupportedCharsetException (both
        // IllegalArgumentException subclasses) when the detector reports a name this JVM cannot
        // resolve; honour the documented contract and fall back to the default.
        return Charset.defaultCharset();
    }
}
Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) CharsetDetector(com.ibm.icu.text.CharsetDetector)

Example 34 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project htmlparser by validator.

the class IcuDetectorSniffer method sniff.

/**
 * Runs the ICU charset detector over this object and maps the best match to an
 * {@link Encoding}.
 *
 * @return the detected encoding (replaced by its actual HTML encoding when one is defined) if it
 *         is an ASCII superset other than windows-1252; {@code null} otherwise, including when
 *         anything goes wrong during detection
 * @throws IOException declared by the signature; exceptions raised inside the body are caught
 *         and turned into a {@code null} result
 */
public Encoding sniff() throws IOException {
    try {
        CharsetDetector icuDetector = new CharsetDetector();
        icuDetector.setText(this);
        CharsetMatch bestMatch = icuDetector.detect();
        // If there is no match, the dereference below throws and the catch returns null.
        Encoding detected = Encoding.forName(bestMatch.getName());
        Encoding htmlEncoding = detected.getActualHtmlEncoding();
        Encoding candidate = (htmlEncoding != null) ? htmlEncoding : detected;
        boolean acceptable = candidate != Encoding.WINDOWS1252 && candidate.isAsciiSuperset();
        return acceptable ? candidate : null;
    } catch (Exception e) {
        // Best effort: any failure means "could not sniff".
        return null;
    }
}
Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) CharsetDetector(com.ibm.icu.text.CharsetDetector) Encoding(nu.validator.htmlparser.io.Encoding) IOException(java.io.IOException)

Example 35 with CharsetMatch

use of com.ibm.icu.text.CharsetMatch in project ultimate-cube by G3G4X5X6.

the class EncodeConversion method addToolBarActionListener.

/**
 * Registers the action listeners for the import, export, conversion and clean toolbar buttons.
 *
 * <p>Import and conversion run their file work in a {@link SwingWorker}; every file stream opened
 * for charset detection is closed via try-with-resources.
 */
private void addToolBarActionListener() {
    importBtn.addActionListener(new AbstractAction() {

        @SneakyThrows
        @Override
        public void actionPerformed(ActionEvent e) {
            log.debug("导入待转换文件");
            // Create a default file chooser
            JFileChooser fileChooser = new JFileChooser();
            // Allow multiple selection
            fileChooser.setMultiSelectionEnabled(true);
            // Selection mode: both files and directories may be chosen
            fileChooser.setFileSelectionMode(JFileChooser.FILES_AND_DIRECTORIES);
            // Open the chooser dialog (blocks until the dialog is closed)
            int result = fileChooser.showOpenDialog(App.mainFrame);
            if (result == JFileChooser.APPROVE_OPTION) {
                File[] files = fileChooser.getSelectedFiles();
                // Show an indeterminate progress bar while importing
                progressBar.setVisible(true);
                progressBar.setStringPainted(false);
                progressBar.setIndeterminate(true);
                // Create the background task
                SwingWorker<String, Object> task = new SwingWorker<String, Object>() {

                    @Override
                    protected String doInBackground() throws Exception {
                        // Runs on a SwingWorker worker thread.
                        // NOTE(review): leftModel and progressBar are updated from this worker
                        // thread; Swing normally expects such updates on the event dispatch
                        // thread (e.g. via publish/process) - confirm this is intended.
                        for (File file : files) {
                            if (file.isDirectory()) {
                                log.debug("Directory: " + file.getAbsolutePath());
                                readDir(file);
                            } else {
                                log.debug("File: " + file.getPath());
                                // try-with-resources so the file handle is released after detection
                                try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(file))) {
                                    CharsetMatch cm = CommonUtil.checkCharset(in);
                                    log.debug("CheckCharset:" + cm.getName());
                                    leftModel.addRow(new String[] { file.getName(), cm.getName(), String.valueOf(cm.getConfidence()) });
                                }
                                globalFile.add(file);
                                progressBar.setValue(globalFile.size());
                            }
                        }
                        return "Hello";
                    }

                    @Override
                    protected void done() {
                        // Called on the event dispatch thread once the background task has finished
                        progressBar.setIndeterminate(false);
                        progressBar.setMaximum(globalFile.size());
                        progressBar.setValue(globalFile.size());
                    }
                };
                // Start the task
                task.execute();
            }
        }
    });
    exportBtn.addActionListener(new AbstractAction() {

        @Override
        public void actionPerformed(ActionEvent e) {
            // Create a default file chooser
            JFileChooser fileChooser = new JFileChooser();
            // Single selection only
            fileChooser.setMultiSelectionEnabled(false);
            // Selection mode: directories only
            fileChooser.setFileSelectionMode(JFileChooser.DIRECTORIES_ONLY);
            // Open the chooser dialog (blocks until the dialog is closed)
            int result = fileChooser.showOpenDialog(App.mainFrame);
            if (result == JFileChooser.APPROVE_OPTION) {
                outputDir = fileChooser.getSelectedFile();
            }
        }
    });
    conversionBtn.addActionListener(new AbstractAction() {

        @SneakyThrows
        @Override
        public void actionPerformed(ActionEvent e) {
            log.debug("开始转换文件编码");
            if (globalFile.size() > 0) {
                // TODO: consider caching when a conversion is re-run
                rightModel.setRowCount(0);
                // Set up a determinate progress bar
                progressPane.add(progressBar);
                progressBar.setMaximum(globalFile.size());
                progressBar.setValue(0);
                progressBar.setVisible(true);
                progressBar.setStringPainted(true);
                // Create the background task
                SwingWorker<String, Object> task = new SwingWorker<String, Object>() {

                    @Override
                    protected String doInBackground() throws Exception {
                        // Runs on a SwingWorker worker thread.
                        // NOTE(review): within this method outputDir is only assigned by the
                        // export button handler; if it is still unset here the copy below fails
                        // inside the worker - confirm it is initialised elsewhere.
                        Iterator<File> iterator = globalFile.iterator();
                        int i = 1;
                        // TODO: convert and save the file encoding
                        while (iterator.hasNext()) {
                            File file = iterator.next();
                            log.debug(file.getPath());
                            // try-with-resources so the source file handle is released before copying
                            try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(file))) {
                                CharsetMatch cm = CommonUtil.checkCharset(in);
                                log.debug("CheckCharset:" + cm.getName());
                            }
                            File target = new File(outputDir.getAbsolutePath() + "/" + file.getName());
                            // The file is currently copied unchanged; the actual conversion call is disabled:
                            // FileUtil.convertCharset(target, Charset.forName(cm.getName()), Charset.forName(dstComboBox.getSelectedItem().toString()));
                            Files.copy(file, target);
                            // Re-detect the charset of the written file and report it in the right-hand table
                            try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(target))) {
                                CharsetMatch tmp = CommonUtil.checkCharset(in);
                                rightModel.addRow(new String[] { file.getName(), tmp.getName(), String.valueOf(tmp.getConfidence()) });
                            }
                            progressBar.setValue(i);
                            i++;
                        }
                        return "Hello";
                    }

                    @Override
                    protected void done() {
                        // Called on the event dispatch thread once the background task has finished
                        log.debug("文件编码转换完成");
                    }
                };
                // Start the task
                task.execute();
            }
        }
    });
    cleanBtn.addActionListener(new AbstractAction() {

        @Override
        public void actionPerformed(ActionEvent e) {
            log.debug("清除缓存");
            // Reset the file list, both tables and the progress bar
            globalFile.clear();
            leftModel.setRowCount(0);
            rightModel.setRowCount(0);
            progressBar.setValue(0);
            progressBar.setVisible(false);
        }
    });
}
Also used : ActionEvent(java.awt.event.ActionEvent) SneakyThrows(lombok.SneakyThrows) FileInputStream(java.io.FileInputStream) CharsetMatch(com.ibm.icu.text.CharsetMatch) BufferedInputStream(java.io.BufferedInputStream) Iterator(java.util.Iterator) File(java.io.File)

Aggregations

CharsetMatch (com.ibm.icu.text.CharsetMatch)43 CharsetDetector (com.ibm.icu.text.CharsetDetector)28 IOException (java.io.IOException)12 BufferedInputStream (java.io.BufferedInputStream)8 InputStream (java.io.InputStream)5 File (java.io.File)4 FileInputStream (java.io.FileInputStream)4 IllegalCharsetNameException (java.nio.charset.IllegalCharsetNameException)3 BufferedReader (java.io.BufferedReader)2 Charset (java.nio.charset.Charset)2 Nullable (javax.annotation.Nullable)2 ServletException (javax.servlet.ServletException)2 SneakyThrows (lombok.SneakyThrows)2 DocumentFile (androidx.documentfile.provider.DocumentFile)1 Getter (burp.Getter)1 IExtensionHelpers (burp.IExtensionHelpers)1 OKMDocument (com.openkm.api.OKMDocument)1 AutomationException (com.openkm.automation.AutomationException)1 Document (com.openkm.bean.Document)1 OKMException (com.openkm.frontend.client.OKMException)1