Use of com.cybozu.labs.langdetect.Language in the project validator, by validator.
From the class LanguageDetectingChecker, method detectLanguageAndCheckAgainstDeclaredLanguage.
/**
 * Detects the language of the collected document text and runs the
 * language-related checks against the detection result.
 *
 * <p>The method returns early, without running detection, when:
 * <ul>
 * <li>fewer than {@code MIN_CHARS} non-whitespace characters were collected
 * (in which case only {@code warnIfMissingLang()} is called);</li>
 * <li>the declared language code is {@code zxx}, {@code eo} or {@code la};</li>
 * <li>{@code LANG_TAGS_BY_TLD} has an entry for the current {@code tld} and
 * that entry contains the declared language code.</li>
 * </ul>
 *
 * <p>Otherwise every candidate returned by the detector is examined. A
 * candidate whose probability exceeds {@code MIN_PROBABILITY} becomes the
 * detected language; a Croatian/Serbian (Latin)/Bosnian ambiguity is handed
 * to {@code getDetectedLanguageSerboCroatian()}. If nothing was detected,
 * {@code warnIfMissingLang()} is called; otherwise the {@code lang}
 * attribute, {@code dir} attribute and Content-Language header checks are
 * invoked with the detected language.
 *
 * @throws SAXException declared by this method; presumably raised by the
 *         invoked check/warn helpers (their bodies are not visible here)
 */
private void detectLanguageAndCheckAgainstDeclaredLanguage() throws SAXException {
    if (nonWhitespaceCharacterCount < MIN_CHARS) {
        warnIfMissingLang();
        return;
    }
    // Declared codes for which the detection result is never compared:
    // "zxx" (No Linguistic Content), "eo" (Esperanto), "la" (Latin).
    if ("zxx".equals(declaredLangCode)
            || "eo".equals(declaredLangCode)
            || "la".equals(declaredLangCode)) {
        return;
    }
    // NOTE(review): binarySearch assumes the per-TLD tag arrays are sorted.
    if (LANG_TAGS_BY_TLD.containsKey(tld)
            && Arrays.binarySearch(LANG_TAGS_BY_TLD.get(tld), declaredLangCode) >= 0) {
        return;
    }
    try {
        // Collapse every run of whitespace into a single space before detection.
        String textContent = documentContent.toString().replaceAll("\\s+", " ");
        String detectedLanguage = "";
        Detector detector = DetectorFactory.create();
        detector.append(textContent);
        // Language codes of all candidates seen so far in the loop.
        ArrayList<String> possibleLanguages = new ArrayList<>();
        for (Language possibility : detector.getProbabilities()) {
            possibleLanguages.add(possibility.lang);
            // Log candidates that are not in COMMON_LANGS when a system ID is known.
            if (Arrays.binarySearch(COMMON_LANGS, possibility.lang) < 0 && systemId != null) {
                ULocale candidateLocale = new ULocale(possibility.lang);
                log4j.info(String.format("%s %s %s", candidateLocale.getDisplayName(),
                        possibility.prob, systemId));
            }
            if (possibility.prob > MIN_PROBABILITY) {
                detectedLanguage = possibility.lang;
                setDocumentLanguage(detectedLanguage);
            } else if (hasSerboCroatianAmbiguity(possibleLanguages)) {
                if (htmlElementHasLang || systemId != null) {
                    detectedLanguage = getDetectedLanguageSerboCroatian();
                    setDocumentLanguage(detectedLanguage);
                }
                if ("sh".equals(detectedLanguage)) {
                    checkLangAttributeSerboCroatian();
                    return;
                }
            }
        }
        if ("".equals(detectedLanguage)) {
            warnIfMissingLang();
            return;
        }
        String detectedLanguageName = "";
        String preferredLanguageCode = "";
        ULocale locale = new ULocale(detectedLanguage);
        String detectedLanguageCode = locale.getLanguage();
        if ("no".equals(detectedLanguage)) {
            // Norwegian has dedicated checks; the language name passed on is
            // still the empty string at this point.
            checkLangAttributeNorwegian();
            checkContentLanguageHeaderNorwegian(detectedLanguage, detectedLanguageName,
                    detectedLanguageCode);
            return;
        }
        // Fixed display names / preferred codes for specific detector codes;
        // everything else falls back to the ULocale display name and language code.
        if ("zh-hans".equals(detectedLanguage)) {
            detectedLanguageName = "Simplified Chinese";
            preferredLanguageCode = "zh-hans";
        } else if ("zh-hant".equals(detectedLanguage)) {
            detectedLanguageName = "Traditional Chinese";
            preferredLanguageCode = "zh-hant";
        } else if ("mhr".equals(detectedLanguage)) {
            detectedLanguageName = "Meadow Mari";
            preferredLanguageCode = "mhr";
        } else if ("mrj".equals(detectedLanguage)) {
            detectedLanguageName = "Hill Mari";
            preferredLanguageCode = "mrj";
        } else if ("nah".equals(detectedLanguage)) {
            detectedLanguageName = "Nahuatl";
            preferredLanguageCode = "nah";
        } else if ("pnb".equals(detectedLanguage)) {
            detectedLanguageName = "Western Panjabi";
            preferredLanguageCode = "pnb";
        } else if ("sr-cyrl".equals(detectedLanguage) || "sr-latn".equals(detectedLanguage)) {
            detectedLanguageName = "Serbian";
            preferredLanguageCode = "sr";
        } else if ("uz-cyrl".equals(detectedLanguage) || "uz-latn".equals(detectedLanguage)) {
            detectedLanguageName = "Uzbek";
            preferredLanguageCode = "uz";
        } else if ("zxx".equals(detectedLanguage)) {
            detectedLanguageName = "Lorem ipsum text";
            preferredLanguageCode = "zxx";
        } else {
            detectedLanguageName = locale.getDisplayName();
            preferredLanguageCode = detectedLanguageCode;
        }
        checkLangAttribute(detectedLanguage, detectedLanguageName, detectedLanguageCode,
                preferredLanguageCode);
        checkDirAttribute(detectedLanguage, detectedLanguageName, detectedLanguageCode,
                preferredLanguageCode);
        checkContentLanguageHeader(detectedLanguage, detectedLanguageName, detectedLanguageCode,
                preferredLanguageCode);
    } catch (LangDetectException ignored) {
        // A detection failure ends the method without running any check.
        // NOTE(review): presumably best-effort by design — confirm whether
        // this should be logged.
    }
}

/**
 * Tells whether at least two of the codes {@code hr}, {@code sr-latn} and
 * {@code bs} are present in the given candidate list.
 */
private static boolean hasSerboCroatianAmbiguity(ArrayList<String> candidateLanguages) {
    int matches = 0;
    if (candidateLanguages.contains("hr")) {
        matches++;
    }
    if (candidateLanguages.contains("sr-latn")) {
        matches++;
    }
    if (candidateLanguages.contains("bs")) {
        matches++;
    }
    return matches >= 2;
}
Aggregations