Search in sources :

Example 6 with TurkicLetter

use of zemberek.core.turkish.TurkicLetter in project zemberek-nlp by ahmetaa.

the class AttributesHelper method getMorphemicAttributes.

/**
 * Builds the phonetic attribute set for the letter sequence {@code seq}.
 *
 * <p>An empty sequence yields a copy of {@code predecessorAttrs}. When the sequence contains a
 * vowel, the last-letter, last-vowel and first-letter attributes are derived from the sequence
 * itself. Otherwise the predecessor's attributes are copied, {@code NO_VOWEL_ATTRIBUTES} is added
 * and {@code LastLetterVowel} and {@code ExpectsConsonant} are removed. In both non-empty cases
 * the voicing attributes of the last letter are added at the end.
 *
 * @param seq the letters to analyze.
 * @param predecessorAttrs attributes of the preceding surface form; used when {@code seq} is
 *     empty or contains no vowel.
 * @return a new attribute set describing {@code seq}.
 */
public static AttributeSet<PhoneticAttribute> getMorphemicAttributes(CharSequence seq, AttributeSet<PhoneticAttribute> predecessorAttrs) {
    if (seq.length() == 0) {
        return predecessorAttrs.copy();
    }
    AttributeSet<PhoneticAttribute> result = new AttributeSet<>();
    if (!alphabet.containsVowel(seq)) {
        // No vowel in seq: the vowel attributes are carried over from the predecessor.
        result.copyFrom(predecessorAttrs);
        result.addAll(NO_VOWEL_ATTRIBUTES);
        result.remove(LastLetterVowel);
        result.remove(ExpectsConsonant);
    } else {
        TurkicLetter tail = alphabet.getLastLetter(seq);
        TurkicLetter closingVowel;
        if (tail.isVowel()) {
            result.add(LastLetterVowel);
            // The last letter is itself the last vowel; no extra lookup is needed.
            closingVowel = tail;
        } else {
            result.add(LastLetterConsonant);
            closingVowel = alphabet.getLastVowel(seq);
        }
        result.add(closingVowel.isFrontal() ? LastVowelFrontal : LastVowelBack);
        result.add(closingVowel.isRounded() ? LastVowelRounded : LastVowelUnrounded);
        result.add(alphabet.getFirstLetter(seq).isVowel() ? FirstLetterVowel : FirstLetterConsonant);
    }
    // Voicing of the final letter is recorded for both the vowel and the no-vowel case.
    TurkicLetter ending = alphabet.getLastLetter(seq);
    if (!ending.isVoiceless()) {
        result.add(LastLetterVoiced);
    } else {
        result.add(LastLetterVoiceless);
        if (ending.isStopConsonant()) {
            // e.g. "kitap"
            result.add(LastLetterVoicelessStop);
        }
    }
    return result;
}
Also used : TurkicLetter(zemberek.core.turkish.TurkicLetter) AttributeSet(zemberek.morphology.morphotactics.AttributeSet) PhoneticAttribute(zemberek.core.turkish.PhoneticAttribute)

Example 7 with TurkicLetter

use of zemberek.core.turkish.TurkicLetter in project zemberek-nlp by ahmetaa.

the class StrictTurkishSyllableParser method letterCountForLastSyllable.

/**
 * Returns the letter count of the last syllable of a given word. This algorithm does not work for
 * words that start with [tr-, st-, pr-] or end with [-trak]. Words containing foreign letters
 * cannot be processed either.
 *
 * <p>Letters are read from the end of the sequence. Each letter is fetched only after the length
 * check that guarantees its index is valid, so no negative index is passed to
 * {@code seq.getLetter}.
 *
 * @param seq the letter sequence to inspect.
 * @return size of the last syllable; it can be 1, 2, 3 or 4. Returns -1 if the sequence is empty
 *     or the syllable rules are not met.
 */
private int letterCountForLastSyllable(TurkishLetterSequence seq) {
    final int length = seq.length();
    if (length == 0) {
        return -1;
    }
    final TurkicLetter current = seq.getLetter(length - 1);
    if (length == 1) {
        // A single vowel forms a syllable on its own; a single consonant does not.
        return current.isVowel() ? 1 : -1;
    }
    final TurkicLetter previous = seq.getLetter(length - 2);
    if (current.isVowel()) {
        // current and previous letters are vowels. Eg. "saa"
        if (previous.isVowel()) {
            return 1;
        }
        // length is two and previous is a consonant. Eg. "ya"
        if (length == 2) {
            return 2;
        }
        final TurkicLetter twoBefore = seq.getLetter(length - 3);
        // ste-tos-kop -> ste
        if (!twoBefore.isVowel() && length == 3) {
            return 3;
        }
        return 2;
    }
    // From here on the last letter is a consonant.
    if (previous.isVowel()) {
        // Two letter words like [el, al].
        if (length == 2) {
            return 2;
        }
        // The letter two before is a vowel. (`at` in sa-at)
        if (seq.getLetter(length - 3).isVowel()) {
            return 2;
        }
        // seq has three letters (kal, sel).
        if (length == 3) {
            return 3;
        }
        // The letter three before is a vowel (kanat).
        if (seq.getLetter(length - 4).isVowel()) {
            return 3;
        }
        // Such as tren, strateji, krank, angstrom.
        if (length == 4) {
            return -1;
        }
        // Five or more letters: the result is 3 regardless of the letter four positions before
        // the end, so that letter is not inspected.
        return 3;
    }
    // The last two letters are both consonants.
    if (length == 2) {
        return -1;
    }
    if (!seq.getLetter(length - 3).isVowel()) {
        return -1;
    }
    // Vowel followed by two consonants; a consonant in front of the vowel joins the syllable.
    if (length > 3 && !seq.getLetter(length - 4).isVowel()) {
        return 4;
    }
    return 3;
}
Also used : TurkicLetter(zemberek.core.turkish.TurkicLetter)

Aggregations

TurkicLetter (zemberek.core.turkish.TurkicLetter)7 PhoneticAttribute (zemberek.core.turkish.PhoneticAttribute)6 RootAttribute (zemberek.core.turkish.RootAttribute)3 LexiconException (zemberek.morphology.lexicon.LexiconException)3 TurkishLetterSequence (zemberek.core.turkish.TurkishLetterSequence)2 ArrayList (java.util.ArrayList)1 PhoneticExpectation (zemberek.core.turkish.PhoneticExpectation)1 AttributeSet (zemberek.morphology._morphotactics.AttributeSet)1 MorphemeState (zemberek.morphology._morphotactics.MorphemeState)1 StemTransition (zemberek.morphology._morphotactics.StemTransition)1 StemNode (zemberek.morphology.lexicon.graph.StemNode)1 SuffixData (zemberek.morphology.lexicon.graph.SuffixData)1 SuffixSurfaceNode (zemberek.morphology.lexicon.graph.SuffixSurfaceNode)1 AttributeSet (zemberek.morphology.morphotactics.AttributeSet)1 MorphemeState (zemberek.morphology.morphotactics.MorphemeState)1 StemTransition (zemberek.morphology.morphotactics.StemTransition)1