Search in sources :

Example 1 with MorphemeState

use of zemberek.morphology.morphotactics.MorphemeState in project zemberek-nlp by ahmetaa.

In the class StemTransitionsBase, method generateModifiedRootNodes.

private List<StemTransition> generateModifiedRootNodes(DictionaryItem dicItem) {
    StringBuilder modifiedSeq = new StringBuilder(dicItem.pronunciation);
    AttributeSet<PhoneticAttribute> originalAttrs = calculateAttributes(dicItem.pronunciation);
    AttributeSet<PhoneticAttribute> modifiedAttrs = originalAttrs.copy();
    MorphemeState modifiedRootState = null;
    MorphemeState unmodifiedRootState = null;
    for (RootAttribute attribute : dicItem.attributes) {
        // generate other boundary attributes and modified root state.
        switch(attribute) {
            case Voicing:
                char last = alphabet.lastChar(modifiedSeq);
                char voiced = alphabet.voice(last);
                if (last == voiced) {
                    throw new LexiconException("Voicing letter is not proper in:" + dicItem);
                }
                if (dicItem.lemma.endsWith("nk")) {
                    voiced = 'g';
                }
                modifiedSeq.setCharAt(modifiedSeq.length() - 1, voiced);
                modifiedAttrs.remove(PhoneticAttribute.LastLetterVoicelessStop);
                originalAttrs.add(PhoneticAttribute.ExpectsConsonant);
                modifiedAttrs.add(PhoneticAttribute.ExpectsVowel);
                // TODO: find a better way for this.
                modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
                break;
            case Doubling:
                modifiedSeq.append(alphabet.lastChar(modifiedSeq));
                originalAttrs.add(PhoneticAttribute.ExpectsConsonant);
                modifiedAttrs.add(PhoneticAttribute.ExpectsVowel);
                modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
                break;
            case LastVowelDrop:
                TurkicLetter lastLetter = alphabet.getLastLetter(modifiedSeq);
                if (lastLetter.isVowel()) {
                    modifiedSeq.deleteCharAt(modifiedSeq.length() - 1);
                    modifiedAttrs.add(PhoneticAttribute.ExpectsConsonant);
                    modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
                } else {
                    modifiedSeq.deleteCharAt(modifiedSeq.length() - 2);
                    if (!dicItem.primaryPos.equals(PrimaryPos.Verb)) {
                        originalAttrs.add(PhoneticAttribute.ExpectsConsonant);
                    } else {
                        unmodifiedRootState = morphotactics.verbLastVowelDropUnmodRoot_S;
                        modifiedRootState = morphotactics.verbLastVowelDropModRoot_S;
                    }
                    modifiedAttrs.add(PhoneticAttribute.ExpectsVowel);
                    modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
                }
                break;
            case InverseHarmony:
                originalAttrs.add(PhoneticAttribute.LastVowelFrontal);
                originalAttrs.remove(PhoneticAttribute.LastVowelBack);
                modifiedAttrs.add(PhoneticAttribute.LastVowelFrontal);
                modifiedAttrs.remove(PhoneticAttribute.LastVowelBack);
                break;
            case ProgressiveVowelDrop:
                if (modifiedSeq.length() > 1) {
                    modifiedSeq.deleteCharAt(modifiedSeq.length() - 1);
                    if (alphabet.containsVowel(modifiedSeq)) {
                        modifiedAttrs = calculateAttributes(modifiedSeq);
                    }
                    modifiedAttrs.add(PhoneticAttribute.LastLetterDropped);
                }
                break;
            default:
                break;
        }
    }
    if (unmodifiedRootState == null) {
        unmodifiedRootState = morphotactics.getRootState(dicItem, originalAttrs);
    }
    StemTransition original = new StemTransition(dicItem.root, dicItem, originalAttrs, unmodifiedRootState);
    // if modified root state is not defined in the switch block, get it from morphotactics.
    if (modifiedRootState == null) {
        modifiedRootState = morphotactics.getRootState(dicItem, modifiedAttrs);
    }
    StemTransition modified = new StemTransition(modifiedSeq.toString(), dicItem, modifiedAttrs, modifiedRootState);
    if (original.equals(modified)) {
        return Collections.singletonList(original);
    }
    return Lists.newArrayList(original, modified);
}
Also used : RootAttribute(zemberek.core.turkish.RootAttribute) TurkicLetter(zemberek.core.turkish.TurkicLetter) StemTransition(zemberek.morphology.morphotactics.StemTransition) LexiconException(zemberek.morphology.lexicon.LexiconException) PhoneticAttribute(zemberek.core.turkish.PhoneticAttribute) MorphemeState(zemberek.morphology.morphotactics.MorphemeState)

Example 2 with MorphemeState

use of zemberek.morphology.morphotactics.MorphemeState in project zemberek-nlp by ahmetaa.

In the class StemTransitionsBase, method handleSpecialRoots.

private List<StemTransition> handleSpecialRoots(DictionaryItem item) {
    String id = item.getId();
    AttributeSet<PhoneticAttribute> originalAttrs = calculateAttributes(item.pronunciation);
    StemTransition original, modified;
    MorphemeState unmodifiedRootState = morphotactics.getRootState(item, originalAttrs);
    switch(id) {
        case "içeri_Noun":
        case "içeri_Adj":
        case "dışarı_Adj":
        case "dışarı_Noun":
        case "dışarı_Postp":
        case "yukarı_Noun":
        case "ileri_Noun":
        case "yukarı_Adj":
        case "şura_Noun":
        case "bura_Noun":
        case "ora_Noun":
            original = new StemTransition(item.root, item, originalAttrs, unmodifiedRootState);
            MorphemeState rootForModified;
            switch(item.primaryPos) {
                case Noun:
                    rootForModified = morphotactics.nounLastVowelDropRoot_S;
                    break;
                case Adjective:
                    rootForModified = morphotactics.adjLastVowelDropRoot_S;
                    break;
                // TODO: check postpositive case. Maybe it is not required.
                case PostPositive:
                    rootForModified = morphotactics.adjLastVowelDropRoot_S;
                    break;
                default:
                    throw new IllegalStateException("No root morpheme state found for " + item);
            }
            String m = item.root.substring(0, item.root.length() - 1);
            modified = new StemTransition(m, item, calculateAttributes(m), rootForModified);
            modified.getPhoneticAttributes().add(PhoneticAttribute.ExpectsConsonant);
            modified.getPhoneticAttributes().add(PhoneticAttribute.CannotTerminate);
            return Lists.newArrayList(original, modified);
        case "ben_Pron_Pers":
        case "sen_Pron_Pers":
            original = new StemTransition(item.root, item, originalAttrs, unmodifiedRootState);
            if (item.lemma.equals("ben")) {
                modified = new StemTransition("ban", item, calculateAttributes("ban"), morphotactics.pronPers_Mod_S);
            } else {
                modified = new StemTransition("san", item, calculateAttributes("san"), morphotactics.pronPers_Mod_S);
            }
            original.getPhoneticAttributes().add(PhoneticAttribute.UnModifiedPronoun);
            modified.getPhoneticAttributes().add(PhoneticAttribute.ModifiedPronoun);
            return Lists.newArrayList(original, modified);
        case "demek_Verb":
        case "yemek_Verb":
            original = new StemTransition(item.root, item, originalAttrs, morphotactics.vDeYeRoot_S);
            switch(item.lemma) {
                case "demek":
                    modified = new StemTransition("di", item, calculateAttributes("di"), morphotactics.vDeYeRoot_S);
                    break;
                default:
                    modified = new StemTransition("yi", item, calculateAttributes("yi"), morphotactics.vDeYeRoot_S);
            }
            return Lists.newArrayList(original, modified);
        case "imek_Verb":
            original = new StemTransition(item.root, item, originalAttrs, morphotactics.imekRoot_S);
            return Lists.newArrayList(original);
        case "birbiri_Pron_Quant":
        case "çoğu_Pron_Quant":
        case "öbürü_Pron_Quant":
        case "birçoğu_Pron_Quant":
            original = new StemTransition(item.root, item, originalAttrs, morphotactics.pronQuant_S);
            switch(item.lemma) {
                case "birbiri":
                    modified = new StemTransition("birbir", item, calculateAttributes("birbir"), morphotactics.pronQuantModified_S);
                    break;
                case "çoğu":
                    modified = new StemTransition("çok", item, calculateAttributes("çok"), morphotactics.pronQuantModified_S);
                    break;
                case "öbürü":
                    modified = new StemTransition("öbür", item, calculateAttributes("öbür"), morphotactics.pronQuantModified_S);
                    break;
                default:
                    modified = new StemTransition("birçok", item, calculateAttributes("birçok"), morphotactics.pronQuantModified_S);
                    break;
            }
            original.getPhoneticAttributes().add(PhoneticAttribute.UnModifiedPronoun);
            modified.getPhoneticAttributes().add(PhoneticAttribute.ModifiedPronoun);
            return Lists.newArrayList(original, modified);
        default:
            throw new IllegalArgumentException("Lexicon Item with special stem change cannot be handled:" + item);
    }
}
Also used : StemTransition(zemberek.morphology.morphotactics.StemTransition) PhoneticAttribute(zemberek.core.turkish.PhoneticAttribute) MorphemeState(zemberek.morphology.morphotactics.MorphemeState)

Aggregations

PhoneticAttribute (zemberek.core.turkish.PhoneticAttribute): 2, MorphemeState (zemberek.morphology.morphotactics.MorphemeState): 2, StemTransition (zemberek.morphology.morphotactics.StemTransition): 2, RootAttribute (zemberek.core.turkish.RootAttribute): 1, TurkicLetter (zemberek.core.turkish.TurkicLetter): 1, LexiconException (zemberek.morphology.lexicon.LexiconException): 1