Search in sources :

Example 6 with Language

Use of com.cybozu.labs.langdetect.Language in the project "validator" (by validator).

From the class LanguageDetectingChecker, the method detectLanguageAndCheckAgainstDeclaredLanguage.

/**
 * Runs language detection over the accumulated document text and, when a
 * language is detected, hands it to the lang-attribute, dir-attribute and
 * Content-Language-header checks.
 *
 * <p>No detection is attempted when:
 * <ul>
 * <li>fewer than {@code MIN_CHARS} non-whitespace characters were collected
 * (only {@code warnIfMissingLang()} is called);</li>
 * <li>the declared language code is {@code zxx}, {@code eo} or {@code la};</li>
 * <li>the declared language code is found in the {@code LANG_TAGS_BY_TLD}
 * entry for the current {@code tld}.</li>
 * </ul>
 *
 * <p>A {@code LangDetectException} raised by the detector is logged at debug
 * level and otherwise ignored, so a detection failure never produces
 * validation messages.
 *
 * @throws SAXException propagated from the check/warn helpers this method calls
 */
private void detectLanguageAndCheckAgainstDeclaredLanguage() throws SAXException {
    if (nonWhitespaceCharacterCount < MIN_CHARS) {
        warnIfMissingLang();
        return;
    }
    // Declared languages that are never compared against the detected one.
    if ("zxx".equals(declaredLangCode) // "No Linguistic Content"
            || "eo".equals(declaredLangCode) // Esperanto
            || "la".equals(declaredLangCode)) { // Latin
        return;
    }
    // NOTE(review): binarySearch presumes each LANG_TAGS_BY_TLD array is sorted -- confirm at its definition.
    if (LANG_TAGS_BY_TLD.containsKey(tld) && Arrays.binarySearch(LANG_TAGS_BY_TLD.get(tld), declaredLangCode) >= 0) {
        return;
    }
    try {
        // Collapse every whitespace run to a single space before detection.
        String textContent = documentContent.toString().replaceAll("\\s+", " ");
        String detectedLanguage = "";
        Detector detector = DetectorFactory.create();
        detector.append(textContent);
        ArrayList<String> candidateLanguages = new ArrayList<>();
        ArrayList<Language> possibilities = detector.getProbabilities();
        for (Language possibility : possibilities) {
            candidateLanguages.add(possibility.lang);
            if (Arrays.binarySearch(COMMON_LANGS, possibility.lang) < 0 && systemId != null) {
                // Log candidates that are not in COMMON_LANGS, together with the document's system ID.
                ULocale plocale = new ULocale(possibility.lang);
                log4j.info(String.format("%s %s %s", plocale.getDisplayName(), possibility.prob, systemId));
            }
            if (possibility.prob > MIN_PROBABILITY) {
                detectedLanguage = possibility.lang;
                setDocumentLanguage(detectedLanguage);
            } else {
                boolean hasHr = candidateLanguages.contains("hr");
                boolean hasSrLatn = candidateLanguages.contains("sr-latn");
                boolean hasBs = candidateLanguages.contains("bs");
                // True when at least two of hr / sr-latn / bs have been seen so far.
                boolean serboCroatianCandidates = (hasHr && (hasSrLatn || hasBs)) || (hasSrLatn && hasBs);
                if (serboCroatianCandidates) {
                    if (htmlElementHasLang || systemId != null) {
                        detectedLanguage = getDetectedLanguageSerboCroatian();
                        setDocumentLanguage(detectedLanguage);
                    }
                    if ("sh".equals(detectedLanguage)) {
                        checkLangAttributeSerboCroatian();
                        return;
                    }
                }
            }
        }
        if ("".equals(detectedLanguage)) {
            // No candidate qualified; fall back to the missing-lang warning only.
            warnIfMissingLang();
            return;
        }
        String detectedLanguageName = "";
        String preferredLanguageCode = "";
        ULocale locale = new ULocale(detectedLanguage);
        String detectedLanguageCode = locale.getLanguage();
        if ("no".equals(detectedLanguage)) {
            checkLangAttributeNorwegian();
            // detectedLanguageName is still the empty string at this point.
            checkContentLanguageHeaderNorwegian(detectedLanguage, detectedLanguageName, detectedLanguageCode);
            return;
        }
        // Codes with a hard-coded display name and preferred code; everything
        // else takes both from the ULocale built above.
        if ("zh-hans".equals(detectedLanguage)) {
            detectedLanguageName = "Simplified Chinese";
            preferredLanguageCode = "zh-hans";
        } else if ("zh-hant".equals(detectedLanguage)) {
            detectedLanguageName = "Traditional Chinese";
            preferredLanguageCode = "zh-hant";
        } else if ("mhr".equals(detectedLanguage)) {
            detectedLanguageName = "Meadow Mari";
            preferredLanguageCode = "mhr";
        } else if ("mrj".equals(detectedLanguage)) {
            detectedLanguageName = "Hill Mari";
            preferredLanguageCode = "mrj";
        } else if ("nah".equals(detectedLanguage)) {
            detectedLanguageName = "Nahuatl";
            preferredLanguageCode = "nah";
        } else if ("pnb".equals(detectedLanguage)) {
            detectedLanguageName = "Western Panjabi";
            preferredLanguageCode = "pnb";
        } else if ("sr-cyrl".equals(detectedLanguage) || "sr-latn".equals(detectedLanguage)) {
            // Both script variants map to the plain "sr" code.
            detectedLanguageName = "Serbian";
            preferredLanguageCode = "sr";
        } else if ("uz-cyrl".equals(detectedLanguage) || "uz-latn".equals(detectedLanguage)) {
            // Both script variants map to the plain "uz" code.
            detectedLanguageName = "Uzbek";
            preferredLanguageCode = "uz";
        } else if ("zxx".equals(detectedLanguage)) {
            detectedLanguageName = "Lorem ipsum text";
            preferredLanguageCode = "zxx";
        } else {
            detectedLanguageName = locale.getDisplayName();
            preferredLanguageCode = detectedLanguageCode;
        }
        checkLangAttribute(detectedLanguage, detectedLanguageName, detectedLanguageCode, preferredLanguageCode);
        checkDirAttribute(detectedLanguage, detectedLanguageName, detectedLanguageCode, preferredLanguageCode);
        checkContentLanguageHeader(detectedLanguage, detectedLanguageName, detectedLanguageCode, preferredLanguageCode);
    } catch (LangDetectException e) {
        // Best effort: a detector failure skips the language checks, but is
        // recorded so it is not completely invisible.
        log4j.debug("Language detection failed; skipping language checks", e);
    }
}
Also used : Detector(com.cybozu.labs.langdetect.Detector) ULocale(com.ibm.icu.util.ULocale) Language(com.cybozu.labs.langdetect.Language) ArrayList(java.util.ArrayList) LangDetectException(com.cybozu.labs.langdetect.LangDetectException)

Aggregations

Language (com.cybozu.labs.langdetect.Language)6 LangDetectException (com.cybozu.labs.langdetect.LangDetectException)5 Detector (com.cybozu.labs.langdetect.Detector)4 ArrayList (java.util.ArrayList)3 ULocale (com.ibm.icu.util.ULocale)1 IOException (java.io.IOException)1 Graph (org.apache.clerezza.commons.rdf.Graph)1 IRI (org.apache.clerezza.commons.rdf.IRI)1 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)1 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)1 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)1 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)1 InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException)1