Search in sources :

Example 6 with LangDetectException

use of com.cybozu.labs.langdetect.LangDetectException in project Asqatasun by Asqatasun.

the class LanguageDetector method initProfiles.

/**
 * Initialise the language profiles needed by the detector. This
 * initialisation has to be performed only once.
 * <p>
 * Every resource matching {@code profilePath} is read fully into a string
 * and the resulting list is handed to {@link DetectorFactory#loadProfile}.
 * Failures while reading or loading are logged as warnings and not
 * propagated to the caller.
 */
private void initProfiles() {
    PathMatchingResourcePatternResolver resolver = new PathMatchingResourcePatternResolver();
    List<String> profiles = new ArrayList<>();
    // fixed seed so that detection results are reproducible between runs
    DetectorFactory.setSeed(0L);
    try {
        for (Resource rs : resolver.getResources(profilePath)) {
            StringWriter writer = new StringWriter();
            // close each stream as soon as it has been read so that file
            // handles are not leaked, even when the copy fails
            try (java.io.InputStream in = rs.getInputStream()) {
                // decode explicitly as UTF-8 instead of relying on the
                // platform default charset
                IOUtils.copy(in, writer, "UTF-8");
            }
            profiles.add(writer.toString());
        }
        DetectorFactory.loadProfile(profiles);
    } catch (IOException | LangDetectException ex) {
        LOGGER.warn("error occurred when loading language detection profiles", ex);
    }
}
Also used : StringWriter(java.io.StringWriter) ArrayList(java.util.ArrayList) Resource(org.springframework.core.io.Resource) IOException(java.io.IOException) PathMatchingResourcePatternResolver(org.springframework.core.io.support.PathMatchingResourcePatternResolver) LangDetectException(com.cybozu.labs.langdetect.LangDetectException)

Example 7 with LangDetectException

use of com.cybozu.labs.langdetect.LangDetectException in project lucene-solr by apache.

the class LangDetectLanguageIdentifierUpdateProcessor method detectLanguage.

/**
 * Feeds the string values of the configured input fields of the document
 * into a fresh detector and converts the detector's probabilities into
 * {@code DetectedLanguage} entries.
 *
 * Each value is truncated to {@code maxFieldValueChars} characters and
 * followed by a single space. Non-string values are skipped with a warning.
 *
 * @param doc the document whose fields are inspected
 * @return the detected languages, or an empty list when detection fails
 */
@Override
protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
    try {
        Detector langDetector = DetectorFactory.create();
        langDetector.setMaxTextLength(maxTotalChars);
        for (String fieldName : inputFields) {
            log.debug("Appending field " + fieldName);
            if (!doc.containsKey(fieldName)) {
                continue;
            }
            Collection<Object> values = doc.getFieldValues(fieldName);
            if (values == null) {
                continue;
            }
            for (Object value : values) {
                if (!(value instanceof String)) {
                    log.warn("Field " + fieldName + " not a String value, not including in detection");
                    continue;
                }
                String text = (String) value;
                // cap the contribution of a single field value
                String clipped = text.length() > maxFieldValueChars
                        ? text.substring(0, maxFieldValueChars)
                        : text;
                langDetector.append(clipped);
                // separator so that consecutive values do not run together
                langDetector.append(" ");
            }
        }
        List<DetectedLanguage> detected = new ArrayList<>();
        for (Language candidate : langDetector.getProbabilities()) {
            detected.add(new DetectedLanguage(candidate.lang, candidate.prob));
        }
        return detected;
    } catch (LangDetectException e) {
        log.debug("Could not determine language, returning empty list: ", e);
        return Collections.emptyList();
    }
}
Also used : Detector(com.cybozu.labs.langdetect.Detector) Language(com.cybozu.labs.langdetect.Language) ArrayList(java.util.ArrayList) LangDetectException(com.cybozu.labs.langdetect.LangDetectException)

Example 8 with LangDetectException

use of com.cybozu.labs.langdetect.LangDetectException in project validator by validator.

the class LanguageDetectingChecker method detectLanguageAndCheckAgainstDeclaredLanguage.

/**
 * Runs language detection over the collected document text and checks the
 * result against the declared language.
 * <p>
 * Detection is skipped (after {@code warnIfMissingLang()}) when fewer than
 * {@code MIN_CHARS} non-whitespace characters were collected. It is skipped
 * silently when the declared language is "zxx", "eo" or "la", or when the
 * declared language is listed in {@code LANG_TAGS_BY_TLD} for the
 * document's TLD. Otherwise the detected language is passed to the
 * lang-attribute, dir-attribute and Content-Language checks.
 *
 * @throws SAXException propagated from the invoked check methods
 */
private void detectLanguageAndCheckAgainstDeclaredLanguage() throws SAXException {
    if (nonWhitespaceCharacterCount < MIN_CHARS) {
        warnIfMissingLang();
        return;
    }
    if (// "No Linguistic Content"
    "zxx".equals(declaredLangCode) || // Esperanto
    "eo".equals(declaredLangCode) || // Latin
    "la".equals(declaredLangCode)) {
        return;
    }
    if (LANG_TAGS_BY_TLD.containsKey(tld) && Arrays.binarySearch(LANG_TAGS_BY_TLD.get(tld), declaredLangCode) >= 0) {
        return;
    }
    try {
        // collapse every run of whitespace into a single space
        String textContent = documentContent.toString().replaceAll("\\s+", " ");
        String detectedLanguage = "";
        Detector detector = DetectorFactory.create();
        detector.append(textContent);
        ArrayList<String> possibleLanguages = new ArrayList<>();
        ArrayList<Language> possibilities = detector.getProbabilities();
        for (Language possibility : possibilities) {
            possibleLanguages.add(possibility.lang);
            if (possibility.prob > MIN_PROBABILITY) {
                detectedLanguage = possibility.lang;
                setDocumentLanguage(detectedLanguage);
            } else {
                // Low-confidence candidate: check whether at least two of
                // Croatian, Serbian (Latin) and Bosnian have been seen so far.
                boolean hasCroatian = possibleLanguages.contains("hr");
                boolean hasSerbianLatin = possibleLanguages.contains("sr-latn");
                boolean hasBosnian = possibleLanguages.contains("bs");
                boolean atLeastTwoOfThree = (hasCroatian && (hasSerbianLatin || hasBosnian))
                        || (hasSerbianLatin && hasBosnian);
                if (atLeastTwoOfThree) {
                    if (htmlElementHasLang || systemId != null) {
                        detectedLanguage = getDetectedLanguageSerboCroatian();
                        setDocumentLanguage(detectedLanguage);
                    }
                    if ("sh".equals(detectedLanguage)) {
                        checkLangAttributeSerboCroatian();
                        return;
                    }
                }
            }
        }
        if ("".equals(detectedLanguage)) {
            // no candidate was accepted
            warnIfMissingLang();
            return;
        }
        String detectedLanguageName = "";
        String preferredLanguageCode = "";
        ULocale locale = new ULocale(detectedLanguage);
        String detectedLanguageCode = locale.getLanguage();
        if ("no".equals(detectedLanguage)) {
            // Norwegian has its own dedicated checks
            checkLangAttributeNorwegian();
            checkContentLanguageHeaderNorwegian(detectedLanguage, detectedLanguageName, detectedLanguageCode);
            return;
        }
        // Explicit display names / preferred codes for specific detector
        // tags; everything else falls back to the ICU locale data.
        if ("zh-hans".equals(detectedLanguage)) {
            detectedLanguageName = "Simplified Chinese";
            preferredLanguageCode = "zh-hans";
        } else if ("zh-hant".equals(detectedLanguage)) {
            detectedLanguageName = "Traditional Chinese";
            preferredLanguageCode = "zh-hant";
        } else if ("mhr".equals(detectedLanguage)) {
            detectedLanguageName = "Meadow Mari";
            preferredLanguageCode = "mhr";
        } else if ("mrj".equals(detectedLanguage)) {
            detectedLanguageName = "Hill Mari";
            preferredLanguageCode = "mrj";
        } else if ("nah".equals(detectedLanguage)) {
            detectedLanguageName = "Nahuatl";
            preferredLanguageCode = "nah";
        } else if ("pnb".equals(detectedLanguage)) {
            detectedLanguageName = "Western Panjabi";
            preferredLanguageCode = "pnb";
        } else if ("sr-cyrl".equals(detectedLanguage)) {
            detectedLanguageName = "Serbian";
            preferredLanguageCode = "sr";
        } else if ("sr-latn".equals(detectedLanguage)) {
            detectedLanguageName = "Serbian";
            preferredLanguageCode = "sr";
        } else if ("uz-cyrl".equals(detectedLanguage)) {
            detectedLanguageName = "Uzbek";
            preferredLanguageCode = "uz";
        } else if ("uz-latn".equals(detectedLanguage)) {
            detectedLanguageName = "Uzbek";
            preferredLanguageCode = "uz";
        } else if ("zxx".equals(detectedLanguage)) {
            detectedLanguageName = "Lorem ipsum text";
            preferredLanguageCode = "zxx";
        } else {
            detectedLanguageName = locale.getDisplayName();
            preferredLanguageCode = detectedLanguageCode;
        }
        checkLangAttribute(detectedLanguage, detectedLanguageName, detectedLanguageCode, preferredLanguageCode);
        checkDirAttribute(detectedLanguage, detectedLanguageName, detectedLanguageCode, preferredLanguageCode);
        checkContentLanguageHeader(detectedLanguage, detectedLanguageName, detectedLanguageCode, preferredLanguageCode);
    } catch (LangDetectException ignored) {
        // Language detection is best-effort here: when the detector cannot
        // be created or cannot produce a result, no language-related
        // checks are reported for this document.
    }
}
Also used : Detector(com.cybozu.labs.langdetect.Detector) ULocale(com.ibm.icu.util.ULocale) Language(com.cybozu.labs.langdetect.Language) ArrayList(java.util.ArrayList) LangDetectException(com.cybozu.labs.langdetect.LangDetectException)

Example 9 with LangDetectException

use of com.cybozu.labs.langdetect.LangDetectException in project opencms-core by alkacon.

the class CmsStringUtil method getLocaleForText.

/**
 * Determines the locale of a text by running it through the language detection library.<p>
 *
 * <code>null</code> is returned when the text is empty or whitespace only, when the
 * detection fails, or when the detected locale is not among the available locales
 * reported by the OpenCms locale manager.<p>
 *
 * @param text the text to retrieve the locale for
 *
 * @return the detected locale for the given text, or <code>null</code>
 */
public static Locale getLocaleForText(String text) {
    // nothing to detect on blank input
    if (!isNotEmptyOrWhitespaceOnly(text)) {
        return null;
    }
    try {
        Detector languageDetector = DetectorFactory.create();
        languageDetector.append(text);
        Locale detected = new Locale(languageDetector.detect());
        // only hand out locales the system is configured for
        boolean available = OpenCms.getLocaleManager().getAvailableLocales().contains(detected);
        return available ? detected : null;
    } catch (LangDetectException e) {
        LOG.debug(e.getLocalizedMessage(), e);
        return null;
    }
}
Also used : Locale(java.util.Locale) Detector(com.cybozu.labs.langdetect.Detector) LangDetectException(com.cybozu.labs.langdetect.LangDetectException)

Example 10 with LangDetectException

use of com.cybozu.labs.langdetect.LangDetectException in project mojito by box.

the class LanguageDetectionService method detect.

/**
 * Detects the language of input text taking in account the expected
 * language of the text.
 * <p>
 * The result carries the detected language, the expected detector
 * language, the probability of the top candidate and, when the expected
 * language is among the candidates, its probability. If the detector
 * returns no candidates, the top-candidate probability is left unset. A
 * {@link LangDetectException} is not thrown to the caller; it is logged
 * and stored on the returned result instead.
 *
 * @param text text for language detection
 * @param expectedBcp47Tag the expected bcp47 tag of the text (can contain a locale)
 * @return a {@link LanguageDetectionResult}
 * @throws UnsupportedLanguageException if the language is not supported,
 * see {@link #isSupportedBcp47Tag(java.lang.String) } to check supported
 * languages.
 */
public LanguageDetectionResult detect(String text, String expectedBcp47Tag) throws UnsupportedLanguageException {
    LanguageDetectionResult languageDetectionResult = new LanguageDetectionResult();
    try {
        String detectorLanguageForBcp47Tag = getDetectorLanguageForBcp47Tag(expectedBcp47Tag);
        Detector detector = getDetectorForLanguage(detectorLanguageForBcp47Tag);
        detector.append(text);
        String detect = detector.detect();
        ArrayList<Language> probabilities = detector.getProbabilities();
        for (Language probability : probabilities) {
            if (probability.lang.equals(detectorLanguageForBcp47Tag)) {
                languageDetectionResult.setProbabilityExpected(probability.prob);
            }
        }
        // The candidate list can be empty; guard before reading the top
        // entry so that an IndexOutOfBoundsException is not thrown.
        if (!probabilities.isEmpty()) {
            languageDetectionResult.setProbability(probabilities.get(0).prob);
        }
        languageDetectionResult.setDetected(detect);
        languageDetectionResult.setExpected(detectorLanguageForBcp47Tag);
        languageDetectionResult.setDetector(detector);
    } catch (LangDetectException lde) {
        logger.error("language detection failed\ntext: {}", text);
        languageDetectionResult.setLangDetectException(lde);
    }
    return languageDetectionResult;
}
Also used : Detector(com.cybozu.labs.langdetect.Detector) Language(com.cybozu.labs.langdetect.Language) LangDetectException(com.cybozu.labs.langdetect.LangDetectException)

Aggregations

LangDetectException (com.cybozu.labs.langdetect.LangDetectException)10 Detector (com.cybozu.labs.langdetect.Detector)8 Language (com.cybozu.labs.langdetect.Language)6 ArrayList (java.util.ArrayList)5 IOException (java.io.IOException)2 ULocale (com.ibm.icu.util.ULocale)1 StringWriter (java.io.StringWriter)1 Locale (java.util.Locale)1 Graph (org.apache.clerezza.commons.rdf.Graph)1 IRI (org.apache.clerezza.commons.rdf.IRI)1 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)1 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)1 IndexingException (org.apache.nutch.indexer.IndexingException)1 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)1 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)1 InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException)1 Resource (org.springframework.core.io.Resource)1 PathMatchingResourcePatternResolver (org.springframework.core.io.support.PathMatchingResourcePatternResolver)1