Search in sources :

Example 11 with Language

use of com.yahoo.language.Language in project vespa by vespa-engine.

the class YqlParser method resegment.

/**
 * Runs the segmenter over the text of a term and wraps the outcome in a query item.
 *
 * The text handed to the segmenter is the origin substring of {@code ast} when
 * {@code getOrigin} yields one, otherwise {@code wordData}. The language is
 * {@code language} when non-null, otherwise the language of {@code currentlyParsing}.
 *
 * @param field     index name set on a resulting phrase item and passed to {@code prepareWord}
 * @param ast       the operator node the term comes from
 * @param wordData  the raw term; used as the item text when segmentation yields nothing
 * @param fromQuery forwarded unchanged to every item that is created
 * @param parent    passed to {@code phraseArgumentSupported} to decide whether a phrase may be built
 * @param language  the language to segment in, or null to use the language currently being parsed
 * @return a {@code WordItem} for zero or one segment (or when a phrase is not supported),
 *         otherwise a locked {@code PhraseSegmentItem} holding one word per segment
 */
@NonNull
private TaggableItem resegment(String field, OperatorNode<ExpressionOperator> ast, String wordData, boolean fromQuery, Class<?> parent, Language language) {
    Substring origin = getOrigin(ast);
    Language segmentLanguage = (language != null) ? language : currentlyParsing.getLanguage();
    String text = (origin != null) ? origin.getValue() : wordData;
    List<String> segments = segmenter.segment(text, segmentLanguage);

    // Nothing came out of the segmenter: fall back to the raw word data.
    if (segments.isEmpty()) {
        return new WordItem(wordData, fromQuery);
    }
    // A single segment, or a parent that cannot take a phrase: only the first segment is used.
    if (segments.size() == 1 || !phraseArgumentSupported(parent)) {
        return new WordItem(segments.get(0), fromQuery);
    }

    PhraseSegmentItem phrase = new PhraseSegmentItem(text, fromQuery, false);
    phrase.setIndexName(field);
    for (String segment : segments) {
        WordItem word = new WordItem(segment, fromQuery);
        prepareWord(field, ast, fromQuery, word);
        phrase.addItem(word);
    }
    phrase.lock();
    return phrase;
}
Also used : Substring(com.yahoo.prelude.query.Substring) TaggableItem(com.yahoo.prelude.query.TaggableItem) Language(com.yahoo.language.Language) WordItem(com.yahoo.prelude.query.WordItem) PhraseSegmentItem(com.yahoo.prelude.query.PhraseSegmentItem) NonNull(edu.umd.cs.findbugs.annotations.NonNull)

Example 12 with Language

use of com.yahoo.language.Language in project vespa by vespa-engine.

the class ExecutionContext method resolveLanguage.

/**
 * Determines the language to use for this context.
 *
 * The {@code language} field wins when it is set to something other than
 * {@code Language.UNKNOWN}. Otherwise the detector of the given linguistics is run
 * on the string form of {@code value}. English is the fallback when no linguistics
 * is supplied, when detection yields no result, or when the detected language is
 * {@code Language.UNKNOWN}.
 *
 * @param linguistics the linguistics whose detector is consulted; may be null
 * @return the explicitly set language, the detected language, or {@code Language.ENGLISH}
 */
public Language resolveLanguage(Linguistics linguistics) {
    boolean explicitlySet = language != null && language != Language.UNKNOWN;
    if (explicitlySet) {
        return language;
    }
    if (linguistics == null) {
        return Language.ENGLISH;
    }
    Detection detection = linguistics.getDetector().detect(String.valueOf(value), null);
    // A missing detection is treated like an UNKNOWN one: both end up as English.
    Language detected = (detection == null) ? Language.UNKNOWN : detection.getLanguage();
    return (detected == Language.UNKNOWN) ? Language.ENGLISH : detected;
}
Also used : Language(com.yahoo.language.Language) Detection(com.yahoo.language.detect.Detection)

Example 13 with Language

use of com.yahoo.language.Language in project vespa by vespa-engine.

the class SimpleDetector method guessLanguage.

/**
 * Guesses the language of a text from the Unicode ranges of its characters.
 *
 * The text is scanned from the start by Unicode code point. The first character
 * that is unambiguous evidence for Korean, Japanese, Traditional Chinese (Bopomofo)
 * or Thai decides the result immediately. CJK ideographs and symbols are shared
 * between languages, so they only set a tentative guess of Traditional Chinese that
 * is returned if no stronger evidence is found before the end of the text.
 *
 * @param input the text to inspect; may be null or empty
 * @return the guessed language, or {@code Language.UNKNOWN} when the input is null,
 *         empty, or contains none of the recognized characters
 */
public static Language guessLanguage(String input) {
    if (input == null || input.isEmpty()) {
        return Language.UNKNOWN;
    }
    // used to record the current theory of language guess, in case of ambiguous characters, such as Chinese
    Language soFar = Language.UNKNOWN;
    // Iterate by code point, not by char: the CJK extension B and compatibility ideographs
    // supplement blocks lie outside the BMP and would otherwise be seen only as surrogate halves.
    for (int i = 0; i < input.length(); ) {
        int c = input.codePointAt(i);
        i += Character.charCount(c);
        Character.UnicodeBlock block = Character.UnicodeBlock.of(c);

        // Hangul characters do not overlap with Japanese or Chinese, so this is a good test.
        boolean hangul = (c >= 0x3200 && c < 0x3220)   // parenthesized hangul
                || (c >= 0x3260 && c < 0x3280)         // circled hangul
                || (c >= 0xFFA0 && c < 0xFFE0)         // halfwidth hangul
                || c == 0x302E || c == 0x302F          // hangul tone marks
                // standard Hangul character blocks
                || block == Character.UnicodeBlock.HANGUL_SYLLABLES
                || block == Character.UnicodeBlock.HANGUL_JAMO
                || block == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO;
        if (hangul) {
            return Language.KOREAN;
        }

        boolean japanese = (0x31f0 <= c && c <= 0x31ff)  // katakana phonetic extensions
                // these are standard character blocks for japanese characters.
                || block == Character.UnicodeBlock.HIRAGANA
                || block == Character.UnicodeBlock.KATAKANA
                || block == Character.UnicodeBlock.KANBUN;
        if (japanese) {
            return Language.JAPANESE;
        }

        boolean cjk = block == Character.UnicodeBlock.CJK_COMPATIBILITY
                || block == Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS
                || block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT
                || block == Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT
                || block == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B;
        if (cjk) {
            // seeing one of these chars, we assume that the text is Chinese, until more concrete evidence is found
            soFar = Language.CHINESE_TRADITIONAL;
        }

        if (block == Character.UnicodeBlock.BOPOMOFO || block == Character.UnicodeBlock.BOPOMOFO_EXTENDED) {
            return Language.CHINESE_TRADITIONAL;
        }
        if (block == Character.UnicodeBlock.THAI) {
            return Language.THAI;
        }
    }
    // got to the end, so return the current best guess
    return soFar;
}
Also used : Language(com.yahoo.language.Language) Hint(com.yahoo.language.detect.Hint)

Aggregations

Language (com.yahoo.language.Language)13 IndexFacts (com.yahoo.prelude.IndexFacts)2 AndItem (com.yahoo.prelude.query.AndItem)2 CompositeItem (com.yahoo.prelude.query.CompositeItem)2 Item (com.yahoo.prelude.query.Item)2 NullItem (com.yahoo.prelude.query.NullItem)2 PhraseSegmentItem (com.yahoo.prelude.query.PhraseSegmentItem)2 TaggableItem (com.yahoo.prelude.query.TaggableItem)2 WordItem (com.yahoo.prelude.query.WordItem)2 NonNull (edu.umd.cs.findbugs.annotations.NonNull)2 StringFieldValue (com.yahoo.document.datatypes.StringFieldValue)1 Linguistics (com.yahoo.language.Linguistics)1 Detection (com.yahoo.language.detect.Detection)1 Hint (com.yahoo.language.detect.Hint)1 StemMode (com.yahoo.language.process.StemMode)1 Index (com.yahoo.prelude.Index)1 AndSegmentItem (com.yahoo.prelude.query.AndSegmentItem)1 DotProductItem (com.yahoo.prelude.query.DotProductItem)1 EquivItem (com.yahoo.prelude.query.EquivItem)1 ExactStringItem (com.yahoo.prelude.query.ExactStringItem)1