Search in sources :

Example 1 with Language

use of com.cybozu.labs.langdetect.Language in project SearchServices by Alfresco.

the class AbstractQParser method detectLanguage.

/**
 * Detects candidate languages for the given text.
 *
 * <p>Whitespace-only input yields an empty list without invoking the
 * detector. Otherwise every language reported by the detector is converted to
 * a {@code DetectedLanguage}; when {@code autoDetectQueryLocales} is
 * non-empty, only languages whose code is contained in it are kept.
 *
 * @param content the text to analyse
 * @return the accepted languages in the order the detector reported them, or
 *         an empty list if the input is blank or detection fails
 */
private List<DetectedLanguage> detectLanguage(String content) {
    if (content.trim().isEmpty()) {
        // to be consistent with the tika impl?
        log.debug("No input text to detect language from, returning empty list");
        return Collections.emptyList();
    }

    ArrayList<Language> candidates;
    try {
        Detector langDetector = DetectorFactory.create();
        langDetector.append(content);
        candidates = langDetector.getProbabilities();
    } catch (LangDetectException e) {
        log.debug("Could not determine language, returning empty list: ", e);
        return Collections.emptyList();
    }

    ArrayList<DetectedLanguage> accepted = new ArrayList<>();
    for (Language candidate : candidates) {
        // An empty locale filter means "accept everything".
        if (autoDetectQueryLocales.size() != 0 && !autoDetectQueryLocales.contains(candidate.lang)) {
            continue;
        }
        accepted.add(new DetectedLanguage(candidate.lang, candidate.prob));
    }
    return accepted;
}
Also used : Detector(com.cybozu.labs.langdetect.Detector) Language(com.cybozu.labs.langdetect.Language) ArrayList(java.util.ArrayList) LangDetectException(com.cybozu.labs.langdetect.LangDetectException)

Example 2 with Language

use of com.cybozu.labs.langdetect.Language in project stanbol by apache.

the class LanguageDetectionEnhancementEngine method computeEnhancements.

/**
 * Identifies the language(s) of the content item's text and records each
 * hypothesis as a text enhancement in the item's metadata graph.
 *
 * <p>If {@code probeLength} is positive and the text is longer than that,
 * only a slice of roughly {@code probeLength} characters taken from the
 * middle of the text is passed to the language identifier. At most
 * {@code maxSuggestedLanguages} hypotheses are written.
 *
 * @param ci the content item to enhance
 * @throws IllegalStateException if no content part with a supported mime
 *         type is found
 * @throws EngineException if the text cannot be read, or if language
 *         identification fails with an error code other than 0
 *         (NoTextError) or 5 (CantDetectError)
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    String text = "";
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    // do not call trim() on long texts to check if the text is empty
    if (text.length() < 50 && text.trim().length() == 0) {
        log.info("No text contained in ContentPart {} of ContentItem {}", contentPart.getKey(), ci.getUri());
        return;
    }
    // truncate text to some piece from the middle if probeLength > 0
    int checkLength = probeLength;
    if (checkLength > 0 && text.length() > checkLength) {
        text = text.substring(text.length() / 2 - checkLength / 2, text.length() / 2 + checkLength / 2);
    }
    List<Language> languages = null;
    try {
        languages = languageIdentifier.getLanguages(text);
        log.debug("language identified: {}", languages);
    } catch (LangDetectException e) {
        Enum<?> errorCode = e.getCode();
        // ignore " 0 - NoTextError" and "5 - CantDetectError"
        if (errorCode.ordinal() != 0 && errorCode.ordinal() != 5) {
            StringBuilder msg = new StringBuilder("Could not identify language of text: ");
            // Keep the exception message bounded: include at most the first 199 characters.
            if (text.length() < 200) {
                msg.append(text);
            } else {
                msg.append(text.subSequence(0, 199)).append("...");
            }
            msg.append(" (Error Code: ").append(errorCode.ordinal()).append(" - ").append(errorCode.name()).append(")");
            throw new EngineException(this, ci, msg.toString(), e);
        } else {
            // The message needs a placeholder, otherwise the argument is never rendered.
            log.debug("No text to detect the language from present in ContentItem {}", ci.getUri());
        }
    }
    // add language to metadata
    if (languages != null) {
        Graph g = ci.getMetadata();
        // Metadata is modified under the content item's write lock.
        ci.getLock().writeLock().lock();
        try {
            for (int i = 0; i < maxSuggestedLanguages && i < languages.size(); i++) {
                // add a hypothesis
                Language hypothesis = languages.get(i);
                IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
                g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(hypothesis.lang)));
                // The confidence triple is added once; the original repeated this exact statement.
                g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(hypothesis.prob)));
                g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) Graph(org.apache.clerezza.commons.rdf.Graph) Language(com.cybozu.labs.langdetect.Language) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) LangDetectException(com.cybozu.labs.langdetect.LangDetectException)

Example 3 with Language

use of com.cybozu.labs.langdetect.Language in project lucene-solr by apache.

the class LangDetectLanguageIdentifierUpdateProcessor method detectLanguage.

/**
 * Feeds the String values of the configured input fields into a language
 * detector and returns the detector's probabilities.
 *
 * <p>Each String value is cut to {@code maxFieldValueChars} characters before
 * being appended, followed by a single space. Non-String values are skipped
 * with a warning.
 *
 * @param doc the document whose fields are inspected
 * @return one {@code DetectedLanguage} per language reported by the detector,
 *         or an empty list if detection fails
 */
@Override
protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
    try {
        Detector langDetector = DetectorFactory.create();
        langDetector.setMaxTextLength(maxTotalChars);

        for (String fieldName : inputFields) {
            log.debug("Appending field " + fieldName);
            if (!doc.containsKey(fieldName)) {
                continue;
            }
            Collection<Object> values = doc.getFieldValues(fieldName);
            if (values == null) {
                continue;
            }
            for (Object value : values) {
                if (!(value instanceof String)) {
                    log.warn("Field " + fieldName + " not a String value, not including in detection");
                    continue;
                }
                String fieldText = (String) value;
                String capped = fieldText.length() > maxFieldValueChars
                        ? fieldText.substring(0, maxFieldValueChars)
                        : fieldText;
                langDetector.append(capped);
                // Separate consecutive values with a space.
                langDetector.append(" ");
            }
        }

        ArrayList<Language> probabilities = langDetector.getProbabilities();
        ArrayList<DetectedLanguage> detected = new ArrayList<>();
        for (Language candidate : probabilities) {
            detected.add(new DetectedLanguage(candidate.lang, candidate.prob));
        }
        return detected;
    } catch (LangDetectException e) {
        log.debug("Could not determine language, returning empty list: ", e);
        return Collections.emptyList();
    }
}
Also used : Detector(com.cybozu.labs.langdetect.Detector) Language(com.cybozu.labs.langdetect.Language) ArrayList(java.util.ArrayList) LangDetectException(com.cybozu.labs.langdetect.LangDetectException)

Example 4 with Language

use of com.cybozu.labs.langdetect.Language in project Asqatasun by Asqatasun.

the class LanguageDetector method detectLanguage.

/**
 * Perform the detection.
 *
 * <p>The text is lower-cased before being handed to the detector. The
 * lower-casing uses {@link java.util.Locale#ROOT} so that the detector input
 * does not depend on the JVM's default locale.
 *
 * @param text to test
 * @return the detection result, holding the language with the highest
 *         probability, the original text, and a flag that is {@code true}
 *         when the detector reported more than one language; {@code null} if
 *         the detector throws a {@code LangDetectException}
 */
public LanguageDetectionResult detectLanguage(String text) {
    try {
        Detector detector = DetectorFactory.create(0.15);
        // issue#47 correction
        detector.append(text.toLowerCase(java.util.Locale.ROOT));
        ArrayList<Language> languages = detector.getProbabilities();
        Language detectedLanguage = extractLangWithHighestProbability(languages);
        return new LanguageDetectionResult(detectedLanguage, text, languages.size() > 1);
    } catch (LangDetectException ex) {
        LOGGER.warn("error occured when detecting lang", ex);
    }
    return null;
}
Also used : Detector(com.cybozu.labs.langdetect.Detector) Language(com.cybozu.labs.langdetect.Language) LangDetectException(com.cybozu.labs.langdetect.LangDetectException)

Example 5 with Language

use of com.cybozu.labs.langdetect.Language in project Asqatasun by Asqatasun.

the class LanguageDetector method extractLangWithHighestProbability.

/**
 * Picks the entry with the greatest {@code prob} value from the detector's
 * result list.
 *
 * <p>When several entries share the greatest value, the one that appears
 * first in the list is returned. Entries whose {@code prob} is not greater
 * than -1 are never selected.
 *
 * @param languages the candidate languages
 * @return the language with the highest probability, or {@code null} if no
 *         entry qualifies (for example when the list is empty)
 */
private Language extractLangWithHighestProbability(ArrayList<Language> languages) {
    Language winner = null;
    for (int i = 0; i < languages.size(); i++) {
        Language candidate = languages.get(i);
        // Until a winner exists, the score to beat is -1.
        double scoreToBeat = (winner == null) ? -1.0 : winner.prob;
        if (candidate.prob > scoreToBeat) {
            winner = candidate;
        }
    }
    return winner;
}
Also used : Language(com.cybozu.labs.langdetect.Language)

Aggregations

Language (com.cybozu.labs.langdetect.Language): 6, LangDetectException (com.cybozu.labs.langdetect.LangDetectException): 5, Detector (com.cybozu.labs.langdetect.Detector): 4, ArrayList (java.util.ArrayList): 3, ULocale (com.ibm.icu.util.ULocale): 1, IOException (java.io.IOException): 1, Graph (org.apache.clerezza.commons.rdf.Graph): 1, IRI (org.apache.clerezza.commons.rdf.IRI): 1, PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl): 1, TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl): 1, Blob (org.apache.stanbol.enhancer.servicesapi.Blob): 1, EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException): 1, InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException): 1