Use of com.cybozu.labs.langdetect.LangDetectException in the SearchServices project by Alfresco: the detectLanguage method of the AbstractQParser class.
/**
 * Runs language detection over the given text and converts the results into
 * {@link DetectedLanguage} entries. Only languages contained in
 * {@code autoDetectQueryLocales} are kept; when that collection is empty,
 * every detected language is accepted.
 *
 * @param content the text to analyse
 * @return the detected languages with their probabilities; an empty list when
 *         the input is blank or detection fails
 */
private List<DetectedLanguage> detectLanguage(String content) {
    if (content.trim().length() == 0) {
        // to be consistent with the tika impl?
        log.debug("No input text to detect language from, returning empty list");
        return Collections.emptyList();
    }
    try {
        Detector detector = DetectorFactory.create();
        detector.append(content);
        ArrayList<DetectedLanguage> result = new ArrayList<>();
        // An empty locale filter means "accept everything".
        boolean acceptAll = autoDetectQueryLocales.size() == 0;
        for (Language candidate : detector.getProbabilities()) {
            if (acceptAll || autoDetectQueryLocales.contains(candidate.lang)) {
                result.add(new DetectedLanguage(candidate.lang, candidate.prob));
            }
        }
        return result;
    } catch (LangDetectException e) {
        log.debug("Could not determine language, returning empty list: ", e);
        return Collections.emptyList();
    }
}
Use of com.cybozu.labs.langdetect.LangDetectException in the Stanbol project by Apache: the computeEnhancements method of the LanguageDetectionEnhancementEngine class.
/**
 * Detects the language(s) of the plain-text content of the given content item
 * and records each language hypothesis as a text enhancement carrying the
 * detected language, its confidence, and the linguistic-system type.
 * <p>
 * Fixes relative to the previous version: the ENHANCER_CONFIDENCE triple was
 * added twice per hypothesis (duplicate statement removed), and the "no text"
 * debug message was missing its {} placeholder so the ContentItem argument was
 * never rendered.
 *
 * @param ci the content item to enhance; must expose a text/plain blob
 * @throws EngineException if language identification fails with an error code
 *         other than "no text" (0) or "can't detect" (5)
 * @throws InvalidContentException if the blob's text cannot be read
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    String text = "";
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    // do not call trim() on long texts to check if the text is empty
    if (text.length() < 50 && text.trim().length() == 0) {
        log.info("No text contained in ContentPart {} of ContentItem {}", contentPart.getKey(), ci.getUri());
        return;
    }
    // truncate text to some piece from the middle if probeLength > 0
    int checkLength = probeLength;
    if (checkLength > 0 && text.length() > checkLength) {
        text = text.substring(text.length() / 2 - checkLength / 2, text.length() / 2 + checkLength / 2);
    }
    List<Language> languages = null;
    try {
        languages = languageIdentifier.getLanguages(text);
        log.debug("language identified: {}", languages);
    } catch (LangDetectException e) {
        Enum<?> errorCode = e.getCode();
        // ignore " 0 - NoTextError" and "5 - CantDetectError"
        if (errorCode.ordinal() != 0 && errorCode.ordinal() != 5) {
            StringBuilder msg = new StringBuilder("Could not identify language of text: ");
            if (text.length() < 200) {
                msg.append(text);
            } else {
                msg.append(text.subSequence(0, 199)).append("...");
            }
            msg.append(" (Error Code: ").append(errorCode.ordinal()).append(" - ").append(errorCode.name()).append(")");
            throw new EngineException(this, ci, msg.toString(), e);
        } else {
            // Fixed: placeholder was missing, so `ci` was never logged.
            log.debug("No text to detect the language from present in ContentItem {}", ci);
        }
    }
    // add language to metadata
    if (languages != null) {
        Graph g = ci.getMetadata();
        ci.getLock().writeLock().lock();
        try {
            for (int i = 0; i < maxSuggestedLanguages && i < languages.size(); i++) {
                // add a hypothesis
                Language hypothesis = languages.get(i);
                IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
                g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(hypothesis.lang)));
                // Confidence is recorded exactly once (previous version added
                // this identical triple twice).
                g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(hypothesis.prob)));
                g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
}
Use of com.cybozu.labs.langdetect.LangDetectException in the lucene-solr project by Apache: the detectLanguage method of the LangDetectLanguageIdentifierUpdateProcessor class.
/**
 * Detects the language of a Solr document by concatenating the String values
 * of the configured input fields (each truncated to {@code maxFieldValueChars})
 * and feeding them to a langdetect {@link Detector} capped at
 * {@code maxTotalChars}.
 * <p>
 * Fix: logging now uses SLF4J parameterized messages instead of string
 * concatenation, so the message is only built when the level is enabled.
 *
 * @param doc the document whose fields are analysed
 * @return the detected languages with probabilities, or an empty list when
 *         detection fails
 */
@Override
protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
    try {
        Detector detector = DetectorFactory.create();
        detector.setMaxTextLength(maxTotalChars);
        for (String fieldName : inputFields) {
            log.debug("Appending field {}", fieldName);
            if (doc.containsKey(fieldName)) {
                Collection<Object> fieldValues = doc.getFieldValues(fieldName);
                if (fieldValues != null) {
                    for (Object content : fieldValues) {
                        if (content instanceof String) {
                            String stringContent = (String) content;
                            // Cap each individual field value so one huge field
                            // cannot dominate the probe text.
                            if (stringContent.length() > maxFieldValueChars) {
                                detector.append(stringContent.substring(0, maxFieldValueChars));
                            } else {
                                detector.append(stringContent);
                            }
                            // Separate fields so tokens do not run together.
                            detector.append(" ");
                        } else {
                            log.warn("Field {} not a String value, not including in detection", fieldName);
                        }
                    }
                }
            }
        }
        ArrayList<Language> langlist = detector.getProbabilities();
        ArrayList<DetectedLanguage> solrLangList = new ArrayList<>();
        for (Language l : langlist) {
            solrLangList.add(new DetectedLanguage(l.lang, l.prob));
        }
        return solrLangList;
    } catch (LangDetectException e) {
        log.debug("Could not determine language, returning empty list: ", e);
        return Collections.emptyList();
    }
}
Use of com.cybozu.labs.langdetect.LangDetectException in the validator project by validator: the detectLanguageAndCheckAgainstDeclaredLanguage method of the LanguageDetectingChecker class.
/**
 * Detects the document's language from its accumulated text content and
 * compares it against the declared lang attribute / Content-Language header,
 * emitting warnings where they disagree. Documents that are too short, declare
 * a language with no linguistic content, or whose TLD is known to host the
 * declared language are skipped.
 * <p>
 * Fix: a redundant {@code detector.getProbabilities()} call whose result was
 * discarded has been removed (the probabilities are computed once below).
 *
 * @throws SAXException propagated from the warning/check helpers
 */
private void detectLanguageAndCheckAgainstDeclaredLanguage() throws SAXException {
    if (nonWhitespaceCharacterCount < MIN_CHARS) {
        warnIfMissingLang();
        return;
    }
    if (// "No Linguistic Content"
    "zxx".equals(declaredLangCode) || // Esperanto
    "eo".equals(declaredLangCode) || // Latin
    "la".equals(declaredLangCode)) {
        return;
    }
    if (LANG_TAGS_BY_TLD.containsKey(tld) && Arrays.binarySearch(LANG_TAGS_BY_TLD.get(tld), declaredLangCode) >= 0) {
        return;
    }
    try {
        // Collapse whitespace runs so the detector sees clean prose.
        String textContent = //
        documentContent.toString().replaceAll("\\s+", " ");
        String detectedLanguage = "";
        Detector detector = DetectorFactory.create();
        detector.append(textContent);
        ArrayList<String> possibleLanguages = new ArrayList<>();
        ArrayList<Language> possibilities = detector.getProbabilities();
        for (Language possibility : possibilities) {
            possibleLanguages.add(possibility.lang);
            ULocale plocale = new ULocale(possibility.lang);
            // Log sightings of uncommon languages for corpus analysis.
            if (Arrays.binarySearch(COMMON_LANGS, possibility.lang) < 0 && systemId != null) {
                log4j.info(String.format("%s %s %s", plocale.getDisplayName(), possibility.prob, systemId));
            }
            if (possibility.prob > MIN_PROBABILITY) {
                detectedLanguage = possibility.lang;
                setDocumentLanguage(detectedLanguage);
            } else if ((possibleLanguages.contains("hr") && (possibleLanguages.contains("sr-latn") || possibleLanguages.contains("bs"))) || (possibleLanguages.contains("sr-latn") && (possibleLanguages.contains("hr") || possibleLanguages.contains("bs"))) || (possibleLanguages.contains("bs") && (possibleLanguages.contains("hr") || possibleLanguages.contains("sr-latn")))) {
                // Croatian / Serbian (Latin) / Bosnian are too close for the
                // detector to separate reliably; treat as Serbo-Croatian.
                if (htmlElementHasLang || systemId != null) {
                    detectedLanguage = getDetectedLanguageSerboCroatian();
                    setDocumentLanguage(detectedLanguage);
                }
                if ("sh".equals(detectedLanguage)) {
                    checkLangAttributeSerboCroatian();
                    return;
                }
            }
        }
        if ("".equals(detectedLanguage)) {
            warnIfMissingLang();
            return;
        }
        String detectedLanguageName = "";
        String preferredLanguageCode = "";
        ULocale locale = new ULocale(detectedLanguage);
        String detectedLanguageCode = locale.getLanguage();
        if ("no".equals(detectedLanguage)) {
            checkLangAttributeNorwegian();
            checkContentLanguageHeaderNorwegian(detectedLanguage, detectedLanguageName, detectedLanguageCode);
            return;
        }
        // Map detector tags to display names and the preferred BCP 47 code.
        if ("zh-hans".equals(detectedLanguage)) {
            detectedLanguageName = "Simplified Chinese";
            preferredLanguageCode = "zh-hans";
        } else if ("zh-hant".equals(detectedLanguage)) {
            detectedLanguageName = "Traditional Chinese";
            preferredLanguageCode = "zh-hant";
        } else if ("mhr".equals(detectedLanguage)) {
            detectedLanguageName = "Meadow Mari";
            preferredLanguageCode = "mhr";
        } else if ("mrj".equals(detectedLanguage)) {
            detectedLanguageName = "Hill Mari";
            preferredLanguageCode = "mrj";
        } else if ("nah".equals(detectedLanguage)) {
            detectedLanguageName = "Nahuatl";
            preferredLanguageCode = "nah";
        } else if ("pnb".equals(detectedLanguage)) {
            detectedLanguageName = "Western Panjabi";
            preferredLanguageCode = "pnb";
        } else if ("sr-cyrl".equals(detectedLanguage)) {
            detectedLanguageName = "Serbian";
            preferredLanguageCode = "sr";
        } else if ("sr-latn".equals(detectedLanguage)) {
            detectedLanguageName = "Serbian";
            preferredLanguageCode = "sr";
        } else if ("uz-cyrl".equals(detectedLanguage)) {
            detectedLanguageName = "Uzbek";
            preferredLanguageCode = "uz";
        } else if ("uz-latn".equals(detectedLanguage)) {
            detectedLanguageName = "Uzbek";
            preferredLanguageCode = "uz";
        } else if ("zxx".equals(detectedLanguage)) {
            detectedLanguageName = "Lorem ipsum text";
            preferredLanguageCode = "zxx";
        } else {
            detectedLanguageName = locale.getDisplayName();
            preferredLanguageCode = detectedLanguageCode;
        }
        checkLangAttribute(detectedLanguage, detectedLanguageName, detectedLanguageCode, preferredLanguageCode);
        checkDirAttribute(detectedLanguage, detectedLanguageName, detectedLanguageCode, preferredLanguageCode);
        checkContentLanguageHeader(detectedLanguage, detectedLanguageName, detectedLanguageCode, preferredLanguageCode);
    } catch (LangDetectException e) {
        // Detection is best-effort: a failure here must not abort document
        // checking, so the exception is intentionally ignored.
    }
}
Use of com.cybozu.labs.langdetect.LangDetectException in the Asqatasun project by Asqatasun: the detectLanguage method of the LanguageDetector class.
/**
 * Perform the detection
 *
 * @param text to test
 * @return the detected language
 */
public LanguageDetectionResult detectLanguage(String text) {
    try {
        Detector detector = DetectorFactory.create(0.15);
        // issue#47 correction: detection runs on the lower-cased text
        // (NOTE(review): toLowerCase() uses the default locale — presumably
        // intentional, but worth confirming for Turkish-locale hosts)
        detector.append(text.toLowerCase());
        ArrayList<Language> candidates = detector.getProbabilities();
        Language best = extractLangWithHighestProbability(candidates);
        boolean severalCandidates = candidates.size() > 1;
        return new LanguageDetectionResult(best, text, severalCandidates);
    } catch (LangDetectException ex) {
        LOGGER.warn("error occured when detecting lang", ex);
        return null;
    }
}
Aggregations