use of com.yahoo.language.Language in project vespa by vespa-engine.
the class YqlParser method resegment.
/**
 * Re-segments the text behind a word node and wraps the result in a query item.
 *
 * <p>The text to segment is the origin substring of {@code ast} when one is
 * available, otherwise {@code wordData}. Segmentation uses {@code language},
 * falling back to the language of the query currently being parsed when
 * {@code language} is {@code null}.
 *
 * @param field     index name set on the phrase item and passed on to {@code prepareWord}
 * @param ast       the node the word originates from
 * @param wordData  the word text; used as-is when segmentation yields nothing
 * @param fromQuery passed through to every item created here
 * @param parent    item class consulted via {@code phraseArgumentSupported}
 * @param language  language to segment with, or {@code null} to use the current query's
 * @return a single {@code WordItem}, or a locked {@code PhraseSegmentItem} holding one
 *         word per segment
 */
@NonNull
private TaggableItem resegment(String field, OperatorNode<ExpressionOperator> ast, String wordData, boolean fromQuery, Class<?> parent, Language language) {
    Substring origin = getOrigin(ast);
    Language effectiveLanguage = (language != null) ? language : currentlyParsing.getLanguage();
    String text = (origin == null) ? wordData : origin.getValue();

    List<String> tokens = segmenter.segment(text, effectiveLanguage);

    // Nothing came out of segmentation: keep the word data untouched.
    if (tokens.isEmpty()) {
        return new WordItem(wordData, fromQuery);
    }

    // One token, or a parent that cannot take a phrase: only the first token is used.
    if (tokens.size() == 1 || !phraseArgumentSupported(parent)) {
        return new WordItem(tokens.get(0), fromQuery);
    }

    PhraseSegmentItem phrase = new PhraseSegmentItem(text, fromQuery, false);
    phrase.setIndexName(field);
    for (String token : tokens) {
        WordItem part = new WordItem(token, fromQuery);
        prepareWord(field, ast, fromQuery, part);
        phrase.addItem(part);
    }
    phrase.lock();
    return phrase;
}
use of com.yahoo.language.Language in project vespa by vespa-engine.
the class ExecutionContext method resolveLanguage.
/**
 * Resolves the language to use for this context.
 *
 * <p>An explicitly set language (anything other than {@code null} or
 * {@link Language#UNKNOWN}) is returned directly. Otherwise the detector of the
 * given linguistics is run on the string form of the current value.
 * {@link Language#ENGLISH} is returned when no linguistics is supplied, when
 * detection produces no result, or when the detected language is
 * {@link Language#UNKNOWN}.
 *
 * @param linguistics source of the language detector; may be {@code null}
 * @return the resolved language
 */
public Language resolveLanguage(Linguistics linguistics) {
    boolean explicitlySet = language != null && language != Language.UNKNOWN;
    if (explicitlySet) {
        return language;
    }
    if (linguistics == null) {
        return Language.ENGLISH;
    }

    Detection detection = linguistics.getDetector().detect(String.valueOf(value), null);
    if (detection == null) {
        return Language.ENGLISH;
    }

    Language detected = detection.getLanguage();
    return (detected == Language.UNKNOWN) ? Language.ENGLISH : detected;
}
use of com.yahoo.language.Language in project vespa by vespa-engine.
the class SimpleDetector method guessLanguage.
/**
 * Guesses the language of a text from the Unicode ranges of its characters.
 *
 * <p>The text is scanned code point by code point (not UTF-16 unit by unit), so
 * that characters outside the Basic Multilingual Plane, such as CJK Unified
 * Ideographs Extension B, are classified by their real block rather than as
 * surrogate halves. The first character that unambiguously identifies a
 * language decides the result; CJK ideographs alone only establish a tentative
 * guess which is returned if nothing more specific is found.
 *
 * @param input the text to inspect; may be {@code null}
 * @return {@link Language#KOREAN}, {@link Language#JAPANESE},
 *         {@link Language#CHINESE_TRADITIONAL} or {@link Language#THAI} when a
 *         matching character is found, otherwise {@link Language#UNKNOWN}
 */
public static Language guessLanguage(String input) {
    if (input == null || input.isEmpty()) {
        return Language.UNKNOWN;
    }

    // Records the current theory of the language, in case only ambiguous characters
    // (such as ideographs shared between Chinese and Japanese) are seen.
    Language soFar = Language.UNKNOWN;

    int i = 0;
    while (i < input.length()) {
        int c = input.codePointAt(i);
        i += Character.charCount(c);
        Character.UnicodeBlock block = Character.UnicodeBlock.of(c);

        // Hangul does not overlap with Japanese or Chinese, so this is a good test.
        if ((c >= 0x3200 && c < 0x3220)        // parenthesized hangul
                || (c >= 0x3260 && c < 0x3280) // circled hangul
                || (c >= 0xFFA0 && c < 0xFFE0) // halfwidth hangul
                || c == 0x302E || c == 0x302F  // hangul tone marks
                // standard Hangul character blocks
                || block == Character.UnicodeBlock.HANGUL_SYLLABLES
                || block == Character.UnicodeBlock.HANGUL_JAMO
                || block == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO) {
            return Language.KOREAN;
        }

        // Kana is specific to Japanese.
        if ((0x31F0 <= c && c <= 0x31FF)       // katakana phonetic extensions
                // standard character blocks for Japanese characters
                || block == Character.UnicodeBlock.HIRAGANA
                || block == Character.UnicodeBlock.KATAKANA
                || block == Character.UnicodeBlock.KANBUN) {
            return Language.JAPANESE;
        }

        if (block == Character.UnicodeBlock.CJK_COMPATIBILITY
                || block == Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS
                || block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT
                || block == Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT
                || block == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B) {
            // Seeing one of these, assume the text is Chinese until more concrete
            // evidence is found later in the input.
            soFar = Language.CHINESE_TRADITIONAL;
        }

        if (block == Character.UnicodeBlock.BOPOMOFO
                || block == Character.UnicodeBlock.BOPOMOFO_EXTENDED) {
            return Language.CHINESE_TRADITIONAL;
        }

        if (block == Character.UnicodeBlock.THAI) {
            return Language.THAI;
        }
    }

    // Got to the end, so return the current best guess.
    return soFar;
}
Aggregations