Use of org.apache.tika.detect.EncodingDetector in the Apache Tika project.
Class DBFParser, method getCharset.
/**
 * Picks the charset to use for this file's character columns.
 * <p>
 * The raw bytes of every character-typed ({@code ColType.C}) cell in
 * {@code firstRows} are concatenated into a sample, stopping as soon as the
 * sample grows past {@code MAX_CHARS_FOR_CHARSET_DETECTION} bytes. A sample
 * longer than 20 bytes is handed to an {@link Icu4jEncodingDetector};
 * otherwise, or if the detector returns {@code null}, {@code DEFAULT_CHARSET}
 * is returned.
 *
 * @param firstRows rows whose character cells are sampled
 * @param header    file header; not consulted yet (see the TODO below)
 * @return the detected charset, or {@code DEFAULT_CHARSET} when there is too
 *         little data or the detector reports no result
 * @throws IOException   if collecting the sample or running detection fails
 * @throws TikaException kept in the signature for callers; not thrown directly here
 */
private Charset getCharset(List<DBFRow> firstRows, DBFFileHeader header) throws IOException, TikaException {
    //TODO: potentially use codepage info in the header
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    sampling:
    for (DBFRow row : firstRows) {
        for (DBFCell cell : row.cells) {
            if (cell.getColType().equals(DBFColumnHeader.ColType.C)) {
                bos.write(cell.getBytes());
                if (bos.size() > MAX_CHARS_FOR_CHARSET_DETECTION) {
                    // Leave BOTH loops: a plain break would only end the
                    // current row and sampling would resume on the next one.
                    break sampling;
                }
            }
        }
    }
    byte[] bytes = bos.toByteArray();
    if (bytes.length <= 20) {
        // Too little text for detection to be meaningful.
        return DEFAULT_CHARSET;
    }
    EncodingDetector detector = new Icu4jEncodingDetector();
    // Single detection pass over an in-memory stream (nothing to close).
    Charset detected = detector.detect(new ByteArrayInputStream(bytes), new Metadata());
    // Guard against a null result so callers always get a usable charset.
    return detected != null ? detected : DEFAULT_CHARSET;
}
Use of org.apache.tika.detect.EncodingDetector in the Apache Tika project.
Class HtmlParserTest, method testMultiThreadingEncodingDetection.
/**
 * Runs {@code testDetector} against every {@link EncodingDetector} that the
 * service loader finds through {@code AutoDetectReader}'s class loader.
 */
@Test
public void testMultiThreadingEncodingDetection() throws Exception {
    ServiceLoader serviceLoader = new ServiceLoader(AutoDetectReader.class.getClassLoader());
    // Copy the providers into a list of our own before iterating.
    List<EncodingDetector> available =
            new ArrayList<>(serviceLoader.loadServiceProviders(EncodingDetector.class));
    for (EncodingDetector candidate : available) {
        testDetector(candidate);
    }
}
Use of org.apache.tika.detect.EncodingDetector in the Apache Tika project.
Class TikaConfigSerializer, method addEncodingDetectors.
/**
 * Appends the encoding-detector section of the configuration to
 * {@code rootElement}.
 * <ul>
 *   <li>In {@code Mode.MINIMAL} with a {@link DefaultEncodingDetector}, only an
 *       XML comment showing an example configuration is appended.</li>
 *   <li>Otherwise an {@code encodingDetectors} element is appended. It holds a
 *       single {@code encodingDetector} child when the configured detector is
 *       the default one in {@code Mode.CURRENT} or is not a
 *       {@link CompositeEncodingDetector}; else one child per detector
 *       returned by the composite's {@code getDetectors()}.</li>
 * </ul>
 *
 * @param mode        serialization mode
 * @param rootElement element that receives the comment or the new section
 * @param doc         document used to create the nodes
 * @param config      configuration whose encoding detector is serialized
 * @throws Exception declared for callers; propagated from the calls made here
 */
private static void addEncodingDetectors(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception {
    EncodingDetector encDetector = config.getEncodingDetector();
    if (mode == Mode.MINIMAL && encDetector instanceof DefaultEncodingDetector) {
        // All defaults: emit no element, just a well-formed example as a hint.
        Node detComment = doc.createComment("for example: <encodingDetectors><encodingDetector class=\"" + "org.apache.tika.detect.DefaultEncodingDetector\"/></encodingDetectors>");
        rootElement.appendChild(detComment);
        return;
    }
    Element encDetectorsElement = doc.createElement("encodingDetectors");
    // Parentheses only make the existing && / || precedence explicit.
    boolean writeAsSingle = (mode == Mode.CURRENT && encDetector instanceof DefaultEncodingDetector)
            || !(encDetector instanceof CompositeEncodingDetector);
    if (writeAsSingle) {
        Element encDetectorElement = doc.createElement("encodingDetector");
        encDetectorElement.setAttribute("class", encDetector.getClass().getCanonicalName());
        encDetectorsElement.appendChild(encDetectorElement);
    } else {
        // Composite detector: list each child in the order it is returned.
        List<EncodingDetector> children = ((CompositeEncodingDetector) encDetector).getDetectors();
        for (EncodingDetector d : children) {
            Element encDetectorElement = doc.createElement("encodingDetector");
            encDetectorElement.setAttribute("class", d.getClass().getCanonicalName());
            encDetectorsElement.appendChild(encDetectorElement);
        }
    }
    rootElement.appendChild(encDetectorsElement);
}
Use of org.apache.tika.detect.EncodingDetector in the Apache Tika project.
Class TikaEncodingDetectorTest, method testEncodingDetectorsAreLoaded.
/**
 * Checks that a freshly constructed {@code TXTParser} exposes a
 * {@link CompositeEncodingDetector} as its encoding detector.
 */
@Test
public void testEncodingDetectorsAreLoaded() {
    AbstractEncodingDetectorParser txtParser = (AbstractEncodingDetectorParser) new TXTParser();
    EncodingDetector loaded = txtParser.getEncodingDetector();
    boolean isComposite = loaded instanceof CompositeEncodingDetector;
    assertTrue(isComposite);
}
Aggregations