Search in sources :

Example 1 with CharsetMatch

use of org.apache.tika.parser.txt.CharsetMatch in project ddf by codice.

the class KlvEncodingDetectedString method decodeValue.

/**
 * Decodes the raw bytes of {@code klv} into {@code value}.
 *
 * <p>The bytes are handed to a {@link CharsetDetector}; the first detected match whose
 * name appears in {@code possibleCharsets} decides the charset. When no such match exists,
 * or the matched name cannot be resolved to a {@link Charset}, the {@code utf8} default
 * is used instead.
 *
 * @param klv the element whose value bytes are decoded
 */
@Override
protected void decodeValue(Klv klv) {
    final byte[] rawBytes = klv.getValue();

    final CharsetDetector detector = new CharsetDetector();
    detector.setText(rawBytes);

    Charset resolved = utf8;
    for (CharsetMatch candidate : detector.detectAll()) {
        if (!possibleCharsets.contains(candidate.getName())) {
            continue;
        }
        // Only the first acceptable candidate is considered; later ones are never tried.
        try {
            resolved = Charset.forName(candidate.getName());
        } catch (IllegalArgumentException e) {
            LOGGER.trace("Unsupported encoding, falling back to default encoding");
        }
        break;
    }

    value = new String(rawBytes, resolved);
}
Also used : Arrays(java.util.Arrays) List(java.util.List) KlvDataElement(org.codice.ddf.libs.klv.KlvDataElement) Logger(org.slf4j.Logger) Charset(java.nio.charset.Charset) CharsetMatch(org.apache.tika.parser.txt.CharsetMatch) Klv(org.codice.ddf.libs.klv.data.Klv) LoggerFactory(org.slf4j.LoggerFactory) Optional(java.util.Optional) CharsetDetector(org.apache.tika.parser.txt.CharsetDetector) CharsetMatch(org.apache.tika.parser.txt.CharsetMatch) CharsetDetector(org.apache.tika.parser.txt.CharsetDetector) Charset(java.nio.charset.Charset)

Example 2 with CharsetMatch

use of org.apache.tika.parser.txt.CharsetMatch in project tika by apache.

the class OutlookExtractor method guess7BitEncoding.

/**
 * Attempts to work out which encoding applies to the 7-bit (non-unicode)
 * strings of a message.
 * <p>Many messages keep their strings as unicode, which needs no guessing.
 * Others use a one-byte encoding without recording it anywhere convenient
 * in the file.</p>
 * <p>Candidates are tried in this order, stopping at the first one that
 * {@code tryToSet7BitEncoding} accepts:</p>
 * <ol>
 *   <li>the {@code MESSAGE_CODEPAGE} and {@code INTERNET_CPID} codepage properties;</li>
 *   <li>a {@code charset=} value on a {@code Content-Type} header line;</li>
 *   <li>the charset detected from the HTML body;</li>
 *   <li>the charset detected from the raw text body, provided the detector's
 *       confidence is above 35.</li>
 * </ol>
 * <p>Bug #49441 has more on why this is needed.</p>
 * <p>This is taken verbatim from POI (TIKA-1238)
 * as a temporary workaround to prevent unsupported encoding exceptions.</p>
 *
 * @param msg the message to inspect and, on success, update
 */
private void guess7BitEncoding(MAPIMessage msg) {
    final Chunks chunks = msg.getMainChunks();
    if (chunks == null) {
        // Nothing to inspect.
        return;
    }

    // Stage 1: codepage properties.
    final Map<MAPIProperty, List<PropertyValue>> properties = chunks.getProperties();
    if (properties != null) {
        final MAPIProperty[] codepageProperties = { MAPIProperty.MESSAGE_CODEPAGE, MAPIProperty.INTERNET_CPID };
        for (MAPIProperty codepageProperty : codepageProperties) {
            final List<PropertyValue> values = properties.get(codepageProperty);
            if (values == null || values.isEmpty()) {
                continue;
            }
            final int codepage = ((PropertyValue.LongPropertyValue) values.get(0)).getValue();
            String encodingName = null;
            try {
                encodingName = CodePageUtil.codepageToEncoding(codepage, true);
            } catch (UnsupportedEncodingException e) {
                // Deliberately ignored: encodingName stays null and is still offered below.
            }
            if (tryToSet7BitEncoding(msg, encodingName)) {
                return;
            }
        }
    }

    // Stage 2: charset declared on a Content-Type header line.
    try {
        final String[] headerLines = msg.getHeaders();
        if (headerLines != null && headerLines.length > 0) {
            final Pattern charsetPattern = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE);
            for (String line : headerLines) {
                if (!line.startsWith("Content-Type")) {
                    continue;
                }
                final Matcher matcher = charsetPattern.matcher(line);
                // The whole line must match the pattern; group 1 is the charset name.
                if (matcher.matches() && tryToSet7BitEncoding(msg, matcher.group(1))) {
                    return;
                }
            }
        }
    } catch (ChunkNotFoundException e) {
        // Deliberately ignored: fall through to the next stage.
    }

    // Stage 3: charset detected from the HTML body, if there is one.
    try {
        final String htmlBody = msg.getHtmlBody();
        if (htmlBody != null && !htmlBody.isEmpty()) {
            Charset detected = null;
            try {
                detected = detector.detect(new ByteArrayInputStream(htmlBody.getBytes(UTF_8)), EMPTY_METADATA);
            } catch (IOException e) {
                // Deliberately ignored: detected stays null.
            }
            if (detected != null && tryToSet7BitEncoding(msg, detected.name())) {
                return;
            }
        }
    } catch (ChunkNotFoundException e) {
        // Deliberately ignored: fall through to the last resort.
    }

    // Stage 4 (last resort): statistical detection on the raw text body.
    final StringChunk textBody = chunks.getTextBodyChunk();
    if (textBody == null) {
        return;
    }
    final CharsetDetector rawTextDetector = new CharsetDetector();
    rawTextDetector.setText(textBody.getRawValue());
    final CharsetMatch bestMatch = rawTextDetector.detect();
    if (bestMatch != null && bestMatch.getConfidence() > 35) {
        tryToSet7BitEncoding(msg, bestMatch.getName());
    }
}
Also used : ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException) Pattern(java.util.regex.Pattern) Chunks(org.apache.poi.hsmf.datatypes.Chunks) RecipientChunks(org.apache.poi.hsmf.datatypes.RecipientChunks) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks) Matcher(java.util.regex.Matcher) CharsetDetector(org.apache.tika.parser.txt.CharsetDetector) PropertyValue(org.apache.poi.hsmf.datatypes.PropertyValue) UnsupportedEncodingException(java.io.UnsupportedEncodingException) Charset(java.nio.charset.Charset) IOException(java.io.IOException) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) CharsetMatch(org.apache.tika.parser.txt.CharsetMatch) ByteArrayInputStream(java.io.ByteArrayInputStream) List(java.util.List) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) MAPIProperty(org.apache.poi.hsmf.datatypes.MAPIProperty)

Example 3 with CharsetMatch

use of org.apache.tika.parser.txt.CharsetMatch in project Java-readability by basis-technology-corp.

the class TikaCharsetDetector method detect.

/**
 * Guesses the character encoding of {@code data} with a {@link CharsetDetector}.
 *
 * @param data the raw bytes to analyse
 * @param hint a declared encoding passed to the detector as a hint; may be {@code null},
 *             in which case no declared encoding is set
 * @return the name of the detector's best match, or {@code hint} (possibly {@code null})
 *         when the detector produces no match at all
 */
@Override
public String detect(byte[] data, String hint) {
    CharsetDetector detector = new CharsetDetector();
    if (hint != null) {
        detector.setDeclaredEncoding(hint);
    }
    detector.setText(data);
    CharsetMatch match = detector.detect();
    if (match == null) {
        // detect() yields null when nothing matched; fall back to the declared
        // encoding instead of failing with a NullPointerException.
        return hint;
    }
    return match.getName();
}
Also used : CharsetMatch(org.apache.tika.parser.txt.CharsetMatch) CharsetDetector(org.apache.tika.parser.txt.CharsetDetector)

Aggregations

CharsetDetector (org.apache.tika.parser.txt.CharsetDetector)3 CharsetMatch (org.apache.tika.parser.txt.CharsetMatch)3 Charset (java.nio.charset.Charset)2 List (java.util.List)2 ByteArrayInputStream (java.io.ByteArrayInputStream)1 IOException (java.io.IOException)1 UnsupportedEncodingException (java.io.UnsupportedEncodingException)1 ArrayList (java.util.ArrayList)1 Arrays (java.util.Arrays)1 LinkedList (java.util.LinkedList)1 Optional (java.util.Optional)1 Matcher (java.util.regex.Matcher)1 Pattern (java.util.regex.Pattern)1 AttachmentChunks (org.apache.poi.hsmf.datatypes.AttachmentChunks)1 Chunks (org.apache.poi.hsmf.datatypes.Chunks)1 MAPIProperty (org.apache.poi.hsmf.datatypes.MAPIProperty)1 PropertyValue (org.apache.poi.hsmf.datatypes.PropertyValue)1 RecipientChunks (org.apache.poi.hsmf.datatypes.RecipientChunks)1 StringChunk (org.apache.poi.hsmf.datatypes.StringChunk)1 ChunkNotFoundException (org.apache.poi.hsmf.exceptions.ChunkNotFoundException)1