Use of com.optimaize.langdetect.LanguageDetectorBuilder in the Apache Tika project.
Class OptimaizeLangDetector, method createDetector.
/**
 * Builds an Optimaize language detector from the supplied profiles.
 *
 * <p>When {@code languageProbabilities} is non-null, every entry in it is converted to an
 * {@code LdLocale} key with a {@code double} weight and handed to the builder as the
 * language priorities. When it is null, the builder is used without priorities.
 *
 * @param languageProfiles the profiles passed to the builder via {@code withProfiles}
 * @return the detector produced by the configured builder
 */
private com.optimaize.langdetect.LanguageDetector createDetector(List<LanguageProfile> languageProfiles) {
    // FUTURE: the short text algorithm currently does not normalize probabilities until the
    // end, so 0 probabilities are common. A very short length is therefore used as the limit.
    LanguageDetectorBuilder detectorBuilder = LanguageDetectorBuilder
            .create(NgramExtractors.standard())
            .shortTextAlgorithm(30)
            .withProfiles(languageProfiles);

    // No configured priors: build with the profiles alone.
    if (languageProbabilities == null) {
        return detectorBuilder.build();
    }

    Map<LdLocale, Double> priorities = new HashMap<>(languageProbabilities.size());
    for (String code : languageProbabilities.keySet()) {
        // Widen the stored probability to double before registering it for the locale.
        Double weight = (double) languageProbabilities.get(code);
        priorities.put(LdLocale.fromString(code), weight);
    }
    detectorBuilder.languagePriorities(priorities);

    return detectorBuilder.build();
}
Aggregations