use of com.cybozu.labs.langdetect.LangDetectException in project Asqatasun by Asqatasun.
the class LanguageDetector method initProfiles.
/**
 * Initialise the language profiles needed by the detector. This
 * initialisation has to be performed only once.
 * <p>
 * Every resource matched by {@code profilePath} is read completely and the
 * collected profile texts are passed to {@code DetectorFactory.loadProfile}
 * in a single call. An {@link IOException} or {@code LangDetectException}
 * raised along the way is logged at WARN level and not rethrown.
 */
private void initProfiles() {
    PathMatchingResourcePatternResolver resolver = new PathMatchingResourcePatternResolver();
    List<String> profiles = new ArrayList<>();
    // Fixed seed; presumably so that detection results are reproducible.
    DetectorFactory.setSeed(0L);
    try {
        for (Resource rs : resolver.getResources(profilePath)) {
            // Close every profile stream as soon as it has been read so that
            // no file/jar handle leaks, even when reading fails half-way.
            try (java.io.InputStream in = rs.getInputStream()) {
                // Profiles are JSON documents; decode them explicitly as UTF-8
                // instead of relying on the platform default charset.
                profiles.add(IOUtils.toString(in, java.nio.charset.StandardCharsets.UTF_8));
            }
        }
        DetectorFactory.loadProfile(profiles);
    } catch (IOException | LangDetectException ex) {
        LOGGER.warn("error occurred when loading language profiles", ex);
    }
}
use of com.cybozu.labs.langdetect.LangDetectException in project lucene-solr by apache.
the class LangDetectLanguageIdentifierUpdateProcessor method detectLanguage.
/**
 * Runs language detection over the configured input fields of a document.
 * <p>
 * Each {@code String} value of every field named in {@code inputFields} is
 * appended to a fresh detector, cut to at most {@code maxFieldValueChars}
 * characters and followed by a single space. Non-String values are skipped
 * with a warning.
 *
 * @param doc the document whose field values are examined
 * @return one {@code DetectedLanguage} per candidate reported by the
 *         detector, or an empty list when the detector throws a
 *         {@code LangDetectException}
 */
@Override
protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
    try {
        Detector detector = DetectorFactory.create();
        detector.setMaxTextLength(maxTotalChars);

        for (String fieldName : inputFields) {
            log.debug("Appending field " + fieldName);
            if (!doc.containsKey(fieldName)) {
                continue;
            }
            Collection<Object> values = doc.getFieldValues(fieldName);
            if (values == null) {
                continue;
            }
            for (Object value : values) {
                if (!(value instanceof String)) {
                    log.warn("Field " + fieldName + " not a String value, not including in detection");
                    continue;
                }
                String text = (String) value;
                boolean tooLong = text.length() > maxFieldValueChars;
                detector.append(tooLong ? text.substring(0, maxFieldValueChars) : text);
                // Separator so that consecutive values do not run together.
                detector.append(" ");
            }
        }

        // Translate the detector's candidates into the Solr result type.
        ArrayList<DetectedLanguage> detected = new ArrayList<>();
        for (Language candidate : detector.getProbabilities()) {
            detected.add(new DetectedLanguage(candidate.lang, candidate.prob));
        }
        return detected;
    } catch (LangDetectException e) {
        log.debug("Could not determine language, returning empty list: ", e);
        return Collections.emptyList();
    }
}
use of com.cybozu.labs.langdetect.LangDetectException in project validator by validator.
the class LanguageDetectingChecker method detectLanguageAndCheckAgainstDeclaredLanguage.
/**
 * Detects the language of the collected document text and runs the
 * lang/dir/Content-Language checks against the detected language.
 * <p>
 * Detection is skipped when fewer than {@code MIN_CHARS} non-whitespace
 * characters were collected (only {@code warnIfMissingLang()} runs), when the
 * declared language is {@code zxx}, {@code eo} or {@code la}, or when the
 * declared language is listed in {@code LANG_TAGS_BY_TLD} for the current
 * {@code tld}.
 *
 * @throws SAXException not thrown directly here; presumably propagated from
 *         the check/warn helpers this method calls
 */
private void detectLanguageAndCheckAgainstDeclaredLanguage() throws SAXException {
    if (nonWhitespaceCharacterCount < MIN_CHARS) {
        warnIfMissingLang();
        return;
    }
    if ("zxx".equals(declaredLangCode) // "No Linguistic Content"
            || "eo".equals(declaredLangCode) // Esperanto
            || "la".equals(declaredLangCode)) { // Latin
        return;
    }
    // NOTE(review): binarySearch assumes each array in LANG_TAGS_BY_TLD is
    // sorted - confirm where the map is populated.
    if (LANG_TAGS_BY_TLD.containsKey(tld) && Arrays.binarySearch(LANG_TAGS_BY_TLD.get(tld), declaredLangCode) >= 0) {
        return;
    }
    try {
        String textContent = documentContent.toString().replaceAll("\\s+", " ");
        String detectedLanguage = "";
        Detector detector = DetectorFactory.create();
        detector.append(textContent);
        ArrayList<String> possibleLanguages = new ArrayList<>();
        // Fetched once; the earlier extra call discarded its result.
        ArrayList<Language> possibilities = detector.getProbabilities();
        for (Language possibility : possibilities) {
            possibleLanguages.add(possibility.lang);
            if (possibility.prob > MIN_PROBABILITY) {
                detectedLanguage = possibility.lang;
                setDocumentLanguage(detectedLanguage);
                continue;
            }
            // Low-confidence candidate: only act on it when at least two of
            // Croatian, Serbian (Latin) and Bosnian have been seen so far.
            boolean hasCroatian = possibleLanguages.contains("hr");
            boolean hasSerbianLatin = possibleLanguages.contains("sr-latn");
            boolean hasBosnian = possibleLanguages.contains("bs");
            boolean serboCroatianMix = (hasCroatian && hasSerbianLatin)
                    || (hasCroatian && hasBosnian)
                    || (hasSerbianLatin && hasBosnian);
            if (!serboCroatianMix) {
                continue;
            }
            if (htmlElementHasLang || systemId != null) {
                detectedLanguage = getDetectedLanguageSerboCroatian();
                setDocumentLanguage(detectedLanguage);
            }
            if ("sh".equals(detectedLanguage)) {
                checkLangAttributeSerboCroatian();
                return;
            }
        }
        if ("".equals(detectedLanguage)) {
            // No candidate was accepted.
            warnIfMissingLang();
            return;
        }
        String detectedLanguageName = "";
        String preferredLanguageCode = "";
        ULocale locale = new ULocale(detectedLanguage);
        String detectedLanguageCode = locale.getLanguage();
        if ("no".equals(detectedLanguage)) {
            // Norwegian has dedicated checks; the language name is still
            // empty at this point and is passed through as such.
            checkLangAttributeNorwegian();
            checkContentLanguageHeaderNorwegian(detectedLanguage, detectedLanguageName, detectedLanguageCode);
            return;
        }
        // Detector codes with a fixed display name / preferred code; anything
        // else falls back to what ULocale reports.
        if ("zh-hans".equals(detectedLanguage)) {
            detectedLanguageName = "Simplified Chinese";
            preferredLanguageCode = "zh-hans";
        } else if ("zh-hant".equals(detectedLanguage)) {
            detectedLanguageName = "Traditional Chinese";
            preferredLanguageCode = "zh-hant";
        } else if ("mhr".equals(detectedLanguage)) {
            detectedLanguageName = "Meadow Mari";
            preferredLanguageCode = "mhr";
        } else if ("mrj".equals(detectedLanguage)) {
            detectedLanguageName = "Hill Mari";
            preferredLanguageCode = "mrj";
        } else if ("nah".equals(detectedLanguage)) {
            detectedLanguageName = "Nahuatl";
            preferredLanguageCode = "nah";
        } else if ("pnb".equals(detectedLanguage)) {
            detectedLanguageName = "Western Panjabi";
            preferredLanguageCode = "pnb";
        } else if ("sr-cyrl".equals(detectedLanguage)) {
            detectedLanguageName = "Serbian";
            preferredLanguageCode = "sr";
        } else if ("sr-latn".equals(detectedLanguage)) {
            detectedLanguageName = "Serbian";
            preferredLanguageCode = "sr";
        } else if ("uz-cyrl".equals(detectedLanguage)) {
            detectedLanguageName = "Uzbek";
            preferredLanguageCode = "uz";
        } else if ("uz-latn".equals(detectedLanguage)) {
            detectedLanguageName = "Uzbek";
            preferredLanguageCode = "uz";
        } else if ("zxx".equals(detectedLanguage)) {
            detectedLanguageName = "Lorem ipsum text";
            preferredLanguageCode = "zxx";
        } else {
            detectedLanguageName = locale.getDisplayName();
            preferredLanguageCode = detectedLanguageCode;
        }
        checkLangAttribute(detectedLanguage, detectedLanguageName, detectedLanguageCode, preferredLanguageCode);
        checkDirAttribute(detectedLanguage, detectedLanguageName, detectedLanguageCode, preferredLanguageCode);
        checkContentLanguageHeader(detectedLanguage, detectedLanguageName, detectedLanguageCode, preferredLanguageCode);
    } catch (LangDetectException ignored) {
        // Deliberately ignored: when the detector fails, none of the
        // language checks above are reported for this document.
    }
}
use of com.cybozu.labs.langdetect.LangDetectException in project opencms-core by alkacon.
the class CmsStringUtil method getLocaleForText.
/**
 * Returns the locale for the given text based on the language detection library.<p>
 *
 * The result will be <code>null</code> if the text is empty or whitespace only, if the
 * detection fails, or if the detected locale is not configured
 * in the 'opencms-system.xml' as available locale.<p>
 *
 * The code reported by the detector is parsed as a language tag, so that region-qualified
 * codes such as <code>zh-cn</code> are turned into a locale with separate language and
 * country parts instead of a single, never matching language string.<p>
 *
 * @param text the text to retrieve the locale for
 *
 * @return the detected locale for the given text, or <code>null</code>
 */
public static Locale getLocaleForText(String text) {
    if (!isNotEmptyOrWhitespaceOnly(text)) {
        return null;
    }
    // try to detect locale by language detector
    try {
        Detector detector = DetectorFactory.create();
        detector.append(text);
        String lang = detector.detect();
        // forLanguageTag splits "xx-yy" into language and country; for plain
        // two-letter codes it yields the same locale as new Locale(lang)
        Locale loc = Locale.forLanguageTag(lang);
        if (OpenCms.getLocaleManager().getAvailableLocales().contains(loc)) {
            return loc;
        }
    } catch (LangDetectException e) {
        // detection failure is not an error for the caller, fall through to null
        LOG.debug(e.getLocalizedMessage(), e);
    }
    return null;
}
use of com.cybozu.labs.langdetect.LangDetectException in project mojito by box.
the class LanguageDetectionService method detect.
/**
 * Detects the language of input text taking in account the expected
 * language of the text.
 * <p>
 * When the detector throws a {@code LangDetectException} the failure is
 * logged and stored on the returned result instead of being rethrown. When
 * the detector reports no candidate at all, the result's probability is
 * left unset rather than failing with an {@link IndexOutOfBoundsException}.
 *
 * @param text text for language detection
 * @param expectedBcp47Tag the expected bcp47 tag of the text (can contain a locale)
 * @return a {@link LanguageDetectionResult}
 * @throws UnsupportedLanguageException if the language is not supported,
 * see {@link #isSupportedBcp47Tag(java.lang.String) } to check supported
 * languages.
 */
public LanguageDetectionResult detect(String text, String expectedBcp47Tag) throws UnsupportedLanguageException {
    LanguageDetectionResult languageDetectionResult = new LanguageDetectionResult();
    try {
        String detectorLanguageForBcp47Tag = getDetectorLanguageForBcp47Tag(expectedBcp47Tag);
        Detector detector = getDetectorForLanguage(detectorLanguageForBcp47Tag);
        detector.append(text);
        String detect = detector.detect();
        ArrayList<Language> probabilities = detector.getProbabilities();
        // Record how likely the expected language is, if it is among the candidates.
        for (Language probability : probabilities) {
            if (probability.lang.equals(detectorLanguageForBcp47Tag)) {
                languageDetectionResult.setProbabilityExpected(probability.prob);
            }
        }
        // The first candidate is the one whose probability is reported; the
        // list may be empty, in which case there is nothing to report.
        if (!probabilities.isEmpty()) {
            languageDetectionResult.setProbability(probabilities.get(0).prob);
        }
        languageDetectionResult.setDetected(detect);
        languageDetectionResult.setExpected(detectorLanguageForBcp47Tag);
        languageDetectionResult.setDetector(detector);
    } catch (LangDetectException lde) {
        // Pass the exception as the last argument so the cause is logged too.
        logger.error("language detection failed\ntext: {}", text, lde);
        languageDetectionResult.setLangDetectException(lde);
    }
    return languageDetectionResult;
}
Aggregations