Search in sources :

Example 1 with RootAttribute

use of zemberek.core.turkish.RootAttribute in project zemberek-nlp by ahmetaa.

the class StemTransitionGenerator method generateModifiedRootNodes.

/**
 * Builds the stem transitions for a dictionary item whose root attributes may alter the
 * surface form of the stem.
 *
 * <p>Two attribute sets are computed from {@code dicItem.pronunciation}: one for the
 * unmodified root and one for the modified root. Each {@link RootAttribute} of the item
 * is then applied in iteration order, editing the modified character sequence and/or the
 * two attribute sets.
 *
 * @param dicItem the dictionary item to generate transitions for
 * @return a single-element list holding the original transition when the original and
 *     modified transitions are equal; otherwise a list of the original followed by the
 *     modified transition
 * @throws LexiconException if the item has the {@code Voicing} attribute but
 *     {@code alphabet.voice} returns the last character unchanged
 */
private List<StemTransition> generateModifiedRootNodes(DictionaryItem dicItem) {
    StringBuilder modifiedSeq = new StringBuilder(dicItem.pronunciation);
    AttributeSet<PhoneticAttribute> originalAttrs = calculateAttributes(dicItem.pronunciation);
    AttributeSet<PhoneticAttribute> modifiedAttrs = originalAttrs.copy();
    // These stay null unless a case below selects a specific root state.
    MorphemeState modifiedRootState = null;
    MorphemeState unmodifiedRootState = null;
    for (RootAttribute attribute : dicItem.attributes) {
        // generate other boundary attributes and modified root state.
        switch (attribute) {
            case Voicing:
                char last = alphabet.getLastChar(modifiedSeq);
                char voiced = alphabet.voice(last);
                if (last == voiced) {
                    throw new LexiconException("Voicing letter is not proper in:" + dicItem);
                }
                // Lemmas ending in "nk" always take 'g' as the replacement character.
                if (dicItem.lemma.endsWith("nk")) {
                    voiced = 'g';
                }
                modifiedSeq.setCharAt(modifiedSeq.length() - 1, voiced);
                modifiedAttrs.remove(PhoneticAttribute.LastLetterVoicelessStop);
                originalAttrs.add(PhoneticAttribute.ExpectsConsonant);
                modifiedAttrs.add(PhoneticAttribute.ExpectsVowel);
                // TODO: find a better way for this.
                modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
                break;
            case Doubling:
                // Repeat the last character on the modified form.
                modifiedSeq.append(alphabet.getLastChar(modifiedSeq));
                originalAttrs.add(PhoneticAttribute.ExpectsConsonant);
                modifiedAttrs.add(PhoneticAttribute.ExpectsVowel);
                modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
                break;
            case LastVowelDrop:
                TurkicLetter lastLetter = alphabet.getLastLetter(modifiedSeq);
                if (lastLetter.isVowel()) {
                    // The stem ends with a vowel: drop that final character.
                    modifiedSeq.deleteCharAt(modifiedSeq.length() - 1);
                    modifiedAttrs.add(PhoneticAttribute.ExpectsConsonant);
                    modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
                } else {
                    // Otherwise drop the character just before the final one.
                    modifiedSeq.deleteCharAt(modifiedSeq.length() - 2);
                    if (!dicItem.primaryPos.equals(PrimaryPos.Verb)) {
                        originalAttrs.add(PhoneticAttribute.ExpectsConsonant);
                    } else {
                        // Verbs use dedicated root states instead of the lookup below.
                        unmodifiedRootState = morphotactics.verbLastVowelDropUnmodRoot_S;
                        modifiedRootState = morphotactics.verbLastVowelDropModRoot_S;
                    }
                    modifiedAttrs.add(PhoneticAttribute.ExpectsVowel);
                    modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
                }
                break;
            case InverseHarmony:
                originalAttrs.add(PhoneticAttribute.LastVowelFrontal);
                originalAttrs.remove(PhoneticAttribute.LastVowelBack);
                modifiedAttrs.add(PhoneticAttribute.LastVowelFrontal);
                modifiedAttrs.remove(PhoneticAttribute.LastVowelBack);
                break;
            case ProgressiveVowelDrop:
                // Only drop the last character when something remains afterwards. Without
                // this guard an empty sequence makes deleteCharAt throw
                // StringIndexOutOfBoundsException, and a one-character sequence yields an
                // empty modified stem.
                if (modifiedSeq.length() > 1) {
                    modifiedSeq.deleteCharAt(modifiedSeq.length() - 1);
                    if (alphabet.containsVowel(modifiedSeq)) {
                        // Recomputing replaces any attributes added to modifiedAttrs earlier.
                        modifiedAttrs = calculateAttributes(modifiedSeq);
                    }
                    modifiedAttrs.add(PhoneticAttribute.LastLetterDropped);
                }
                break;
            default:
                break;
        }
    }
    if (unmodifiedRootState == null) {
        unmodifiedRootState = morphotactics.getRootState(dicItem, originalAttrs);
    }
    StemTransition original = new StemTransition(dicItem.root, dicItem, originalAttrs, unmodifiedRootState);
    // if modified root state is not defined in the switch block, get it from morphotactics.
    if (modifiedRootState == null) {
        modifiedRootState = morphotactics.getRootState(dicItem, modifiedAttrs);
    }
    StemTransition modified = new StemTransition(modifiedSeq.toString(), dicItem, modifiedAttrs, modifiedRootState);
    if (original.equals(modified)) {
        return Collections.singletonList(original);
    }
    return Lists.newArrayList(original, modified);
}
Also used : RootAttribute(zemberek.core.turkish.RootAttribute) TurkicLetter(zemberek.core.turkish.TurkicLetter) StemTransition(zemberek.morphology._morphotactics.StemTransition) LexiconException(zemberek.morphology.lexicon.LexiconException) PhoneticAttribute(zemberek.core.turkish.PhoneticAttribute) MorphemeState(zemberek.morphology._morphotactics.MorphemeState)

Example 2 with RootAttribute

use of zemberek.core.turkish.RootAttribute in project zemberek-nlp by ahmetaa.

the class StemNodeGenerator method generateModifiedRootNodes.

/**
 * Creates the stem node(s) for a dictionary item.
 *
 * <p>Items carrying the {@code Special} attribute are delegated to
 * {@code handleSpecialStems}. For all others, an "original" node is built from
 * {@code dicItem.root} and a "modified" node from the pronunciation after each
 * {@link RootAttribute} of the item has been applied in iteration order.
 *
 * @param dicItem the dictionary item to generate nodes for
 * @return a one-element array with the original node when both nodes are equal; otherwise
 *     the original node followed by the modified node
 * @throws LexiconException if the item has the {@code Voicing} attribute but
 *     {@code alphabet.voice} returns {@code null} for the last letter
 */
private StemNode[] generateModifiedRootNodes(DictionaryItem dicItem) {
    if (dicItem.hasAttribute(Special)) {
        return handleSpecialStems(dicItem);
    }

    TurkishLetterSequence letters = new TurkishLetterSequence(dicItem.pronunciation, alphabet);
    EnumSet<PhoneticAttribute> baseAttrs = calculateAttributes(dicItem.pronunciation);
    EnumSet<PhoneticAttribute> changedAttrs = baseAttrs.clone();
    EnumSet<PhoneticExpectation> baseExpectations = EnumSet.noneOf(PhoneticExpectation.class);
    EnumSet<PhoneticExpectation> changedExpectations = EnumSet.noneOf(PhoneticExpectation.class);

    for (RootAttribute rootAttr : dicItem.attributes) {
        switch (rootAttr) {
            case Voicing: {
                TurkicLetter voicedLetter = alphabet.voice(letters.lastLetter());
                if (voicedLetter == null) {
                    throw new LexiconException("Voicing letter is not proper in:" + dicItem);
                }
                // Lemmas ending in "nk" always take L_g as the replacement letter.
                TurkicLetter replacement =
                        dicItem.lemma.endsWith("nk") ? TurkishAlphabet.L_g : voicedLetter;
                letters.changeLetter(letters.length() - 1, replacement);
                changedAttrs.remove(PhoneticAttribute.LastLetterVoicelessStop);
                baseExpectations.add(PhoneticExpectation.ConsonantStart);
                changedExpectations.add(PhoneticExpectation.VowelStart);
                break;
            }
            case Doubling: {
                // Repeat the last letter on the modified form.
                letters.append(letters.lastLetter());
                baseExpectations.add(PhoneticExpectation.ConsonantStart);
                changedExpectations.add(PhoneticExpectation.VowelStart);
                break;
            }
            case LastVowelDrop: {
                if (letters.lastLetter().isVowel()) {
                    // Ends with a vowel: remove the final letter.
                    letters.delete(letters.length() - 1);
                    changedExpectations.add(PhoneticExpectation.ConsonantStart);
                } else {
                    // Otherwise remove the letter just before the final one.
                    letters.delete(letters.length() - 2);
                    boolean isVerb = dicItem.primaryPos.equals(PrimaryPos.Verb);
                    if (!isVerb) {
                        baseExpectations.add(PhoneticExpectation.ConsonantStart);
                    }
                    changedExpectations.add(PhoneticExpectation.VowelStart);
                }
                break;
            }
            case InverseHarmony: {
                baseAttrs.add(PhoneticAttribute.LastVowelFrontal);
                baseAttrs.remove(PhoneticAttribute.LastVowelBack);
                changedAttrs.add(PhoneticAttribute.LastVowelFrontal);
                changedAttrs.remove(PhoneticAttribute.LastVowelBack);
                break;
            }
            case ProgressiveVowelDrop: {
                letters.delete(letters.length() - 1);
                if (letters.hasVowel()) {
                    // Recomputing replaces whatever changedAttrs held before.
                    changedAttrs = calculateAttributes(letters);
                }
                break;
            }
            default:
                break;
        }
    }

    StemNode originalNode = new StemNode(dicItem.root, dicItem, baseAttrs, baseExpectations);
    StemNode modifiedNode =
            new StemNode(letters.toString(), dicItem, changedAttrs, changedExpectations);

    // Index 0 is assigned to the original node, index 1 to the modified node.
    SuffixData[] successorSets = suffixProvider.defineSuccessorSuffixes(dicItem);
    originalNode.exclusiveSuffixData = successorSets[0];
    modifiedNode.exclusiveSuffixData = successorSets[1];

    if (originalNode.equals(modifiedNode)) {
        return new StemNode[] { originalNode };
    }

    modifiedNode.setTermination(TerminationType.NON_TERMINAL);
    if (dicItem.hasAttribute(RootAttribute.CompoundP3sgRoot)) {
        originalNode.setTermination(TerminationType.NON_TERMINAL);
    }
    return new StemNode[] { originalNode, modifiedNode };
}
Also used : RootAttribute(zemberek.core.turkish.RootAttribute) TurkicLetter(zemberek.core.turkish.TurkicLetter) PhoneticExpectation(zemberek.core.turkish.PhoneticExpectation) TurkishLetterSequence(zemberek.core.turkish.TurkishLetterSequence) LexiconException(zemberek.morphology.lexicon.LexiconException) StemNode(zemberek.morphology.lexicon.graph.StemNode) PhoneticAttribute(zemberek.core.turkish.PhoneticAttribute) SuffixData(zemberek.morphology.lexicon.graph.SuffixData)

Example 3 with RootAttribute

use of zemberek.core.turkish.RootAttribute in project zemberek-nlp by ahmetaa.

the class TurkishSuffixes method getForVerb.

/**
 * Fills the suffix data of a verb's original and modified roots.
 *
 * <p>Both sets start from {@code Verb_TEMPLATE.allConnections()} with {@code Caus_t}
 * removed, and are then adjusted for every {@link RootAttribute} of the item, in
 * iteration order.
 *
 * @param item the dictionary item whose attributes drive the adjustments
 * @param original suffix data for the unmodified root; updated in place
 * @param modified suffix data for the modified root; updated in place
 */
private void getForVerb(DictionaryItem item, SuffixData original, SuffixData modified) {
    original.add(Verb_TEMPLATE.allConnections().remove(Caus_t));
    modified.add(Verb_TEMPLATE.allConnections().remove(Caus_t));

    // Several cases leave the modified root untouched when the item also has
    // ProgressiveVowelDrop; the attribute set does not change inside the loop.
    final boolean noProgressiveVowelDrop =
            !item.attributes.contains(RootAttribute.ProgressiveVowelDrop);

    for (RootAttribute rootAttr : item.attributes) {
        switch (rootAttr) {
            case Aorist_A: {
                original.add(Aor_Ar, AorPart_Ar_2Adj);
                original.remove(Aor_Ir, AorPart_Ir_2Adj);
                if (noProgressiveVowelDrop) {
                    modified.add(Aor_Ar, AorPart_Ar_2Adj);
                    modified.remove(Aor_Ir, AorPart_Ir_2Adj);
                }
                break;
            }
            case Aorist_I: {
                original.add(Aor_Ir, AorPart_Ir_2Adj);
                original.remove(Aor_Ar, AorPart_Ar_2Adj);
                if (noProgressiveVowelDrop) {
                    modified.add(Aor_Ir, AorPart_Ir_2Adj);
                    modified.remove(Aor_Ar, AorPart_Ar_2Adj);
                }
                break;
            }
            case Voicing: {
                modified.remove(Pass_In);
                modified.remove(Pass_InIl);
                break;
            }
            case Passive_In: {
                original.add(Pass_In);
                original.add(Pass_InIl);
                original.remove(Pass_nIl);
                break;
            }
            case LastVowelDrop: {
                original.remove(Pass_nIl);
                // The modified root is reset to just these two entries.
                modified.clear().add(Pass_nIl, Verb2Verb);
                break;
            }
            case ProgressiveVowelDrop: {
                original.remove(Prog_Iyor);
                // The modified root is reset to just these two entries.
                modified.clear().add(Pos_EMPTY, Prog_Iyor);
                break;
            }
            case Reflexive: {
                original.add(Reflex_In);
                modified.add(Reflex_In);
                break;
            }
            // NOTE: a Reciprocal case (adding Recip_Is to both sets) is currently disabled.
            case Causative_t: {
                original.remove(Caus_tIr);
                original.add(Caus_t);
                if (noProgressiveVowelDrop) {
                    modified.remove(Caus_tIr);
                    modified.add(Caus_t);
                }
                break;
            }
            default:
                break;
        }
    }
}
Also used : RootAttribute(zemberek.core.turkish.RootAttribute)

Example 4 with RootAttribute

use of zemberek.core.turkish.RootAttribute in project zemberek-nlp by ahmetaa.

the class DictionarySerializer method convertToDictionaryItem.

/**
 * Converts a protobuf dictionary item into a {@link DictionaryItem}.
 *
 * <p>Root attributes that the converter cannot map become {@code RootAttribute.Unknown}.
 * When the protobuf root or pronunciation is empty, the lowercased lemma is used instead;
 * lowercasing uses {@code Locale.ENGLISH} if the item has the {@code LocaleEn} attribute
 * and {@code Turkish.LOCALE} otherwise.
 *
 * @param item the protobuf item to convert
 * @return the converted dictionary item
 */
private static DictionaryItem convertToDictionaryItem(LexiconProto.DictionaryItem item) {
    EnumSet<RootAttribute> attributes = EnumSet.noneOf(RootAttribute.class);
    for (LexiconProto.RootAttribute protoAttr : item.getRootAttributesList()) {
        attributes.add(rootAttributeConverter.convertBack(protoAttr, RootAttribute.Unknown));
    }

    Locale locale;
    if (attributes.contains(RootAttribute.LocaleEn)) {
        locale = Locale.ENGLISH;
    } else {
        locale = Turkish.LOCALE;
    }

    String lemma = item.getLemma();
    String lowercaseLemma = lemma.toLowerCase(locale);

    // Empty root / pronunciation fall back to the lowercased lemma.
    String root = item.getRoot().isEmpty() ? lowercaseLemma : item.getRoot();
    String pronunciation =
            item.getPronunciation().isEmpty() ? lowercaseLemma : item.getPronunciation();

    PrimaryPos primaryPos =
            primaryPosConverter.convertBack(item.getPrimaryPos(), PrimaryPos.Unknown);

    // The proto's "unknown" secondary POS is mapped to None, not passed to the converter.
    SecondaryPos secondaryPos;
    if (item.getSecondaryPos() == LexiconProto.SecondaryPos.SecondaryPos_Unknown) {
        secondaryPos = SecondaryPos.None;
    } else {
        secondaryPos =
                secondaryPosConverter.convertBack(item.getSecondaryPos(), SecondaryPos.UnknownSec);
    }

    return new DictionaryItem(
            lemma, root, pronunciation, primaryPos, secondaryPos, attributes, item.getIndex());
}
Also used : LexiconProto(zemberek.morphology.lexicon.proto.LexiconProto) Locale(java.util.Locale) RootAttribute(zemberek.core.turkish.RootAttribute)

Example 5 with RootAttribute

use of zemberek.core.turkish.RootAttribute in project zemberek-nlp by ahmetaa.

the class StemTransitionsBase method generateModifiedRootNodes.

/**
 * Produces the stem transitions of a dictionary item, taking into account root
 * attributes that change the stem's surface form.
 *
 * <p>Attributes for the unmodified and the modified stem both start from
 * {@code calculateAttributes(dicItem.pronunciation)}. Every {@link RootAttribute} of the
 * item is then applied in iteration order to the working character sequence and to the
 * two attribute sets.
 *
 * @param dicItem the dictionary item to generate transitions for
 * @return a singleton list with the original transition if it equals the modified one;
 *     otherwise a list containing the original and then the modified transition
 * @throws LexiconException if the item has the {@code Voicing} attribute but
 *     {@code alphabet.voice} returns the last character unchanged
 */
private List<StemTransition> generateModifiedRootNodes(DictionaryItem dicItem) {
    StringBuilder stem = new StringBuilder(dicItem.pronunciation);
    AttributeSet<PhoneticAttribute> baseAttrs = calculateAttributes(dicItem.pronunciation);
    AttributeSet<PhoneticAttribute> changedAttrs = baseAttrs.copy();
    // Root states chosen explicitly by a case below; null means "look it up afterwards".
    MorphemeState forcedChangedState = null;
    MorphemeState forcedBaseState = null;

    for (RootAttribute rootAttr : dicItem.attributes) {
        switch (rootAttr) {
            case Voicing: {
                char finalChar = alphabet.lastChar(stem);
                char voicedChar = alphabet.voice(finalChar);
                if (voicedChar == finalChar) {
                    throw new LexiconException("Voicing letter is not proper in:" + dicItem);
                }
                // Lemmas ending in "nk" always take 'g' as the replacement character.
                char replacement = dicItem.lemma.endsWith("nk") ? 'g' : voicedChar;
                stem.setCharAt(stem.length() - 1, replacement);
                changedAttrs.remove(PhoneticAttribute.LastLetterVoicelessStop);
                baseAttrs.add(PhoneticAttribute.ExpectsConsonant);
                changedAttrs.add(PhoneticAttribute.ExpectsVowel);
                // TODO: find a better way for this.
                changedAttrs.add(PhoneticAttribute.CannotTerminate);
                break;
            }
            case Doubling: {
                // Repeat the last character on the modified form.
                stem.append(alphabet.lastChar(stem));
                baseAttrs.add(PhoneticAttribute.ExpectsConsonant);
                changedAttrs.add(PhoneticAttribute.ExpectsVowel);
                changedAttrs.add(PhoneticAttribute.CannotTerminate);
                break;
            }
            case LastVowelDrop: {
                boolean endsWithVowel = alphabet.getLastLetter(stem).isVowel();
                if (endsWithVowel) {
                    // Drop the final character.
                    stem.deleteCharAt(stem.length() - 1);
                    changedAttrs.add(PhoneticAttribute.ExpectsConsonant);
                } else {
                    // Drop the character just before the final one.
                    stem.deleteCharAt(stem.length() - 2);
                    if (dicItem.primaryPos.equals(PrimaryPos.Verb)) {
                        // Verbs use dedicated root states instead of the later lookup.
                        forcedBaseState = morphotactics.verbLastVowelDropUnmodRoot_S;
                        forcedChangedState = morphotactics.verbLastVowelDropModRoot_S;
                    } else {
                        baseAttrs.add(PhoneticAttribute.ExpectsConsonant);
                    }
                    changedAttrs.add(PhoneticAttribute.ExpectsVowel);
                }
                // Added last in both branches.
                changedAttrs.add(PhoneticAttribute.CannotTerminate);
                break;
            }
            case InverseHarmony: {
                baseAttrs.add(PhoneticAttribute.LastVowelFrontal);
                baseAttrs.remove(PhoneticAttribute.LastVowelBack);
                changedAttrs.add(PhoneticAttribute.LastVowelFrontal);
                changedAttrs.remove(PhoneticAttribute.LastVowelBack);
                break;
            }
            case ProgressiveVowelDrop: {
                // Nothing is dropped unless at least one character would remain.
                if (stem.length() > 1) {
                    stem.deleteCharAt(stem.length() - 1);
                    if (alphabet.containsVowel(stem)) {
                        // Recomputing replaces whatever changedAttrs held before.
                        changedAttrs = calculateAttributes(stem);
                    }
                    changedAttrs.add(PhoneticAttribute.LastLetterDropped);
                }
                break;
            }
            default:
                break;
        }
    }

    MorphemeState baseState = forcedBaseState != null
            ? forcedBaseState
            : morphotactics.getRootState(dicItem, baseAttrs);
    StemTransition originalTransition =
            new StemTransition(dicItem.root, dicItem, baseAttrs, baseState);

    // Fall back to morphotactics when no case above picked a modified root state.
    MorphemeState changedState = forcedChangedState != null
            ? forcedChangedState
            : morphotactics.getRootState(dicItem, changedAttrs);
    StemTransition modifiedTransition =
            new StemTransition(stem.toString(), dicItem, changedAttrs, changedState);

    if (originalTransition.equals(modifiedTransition)) {
        return Collections.singletonList(originalTransition);
    }
    return Lists.newArrayList(originalTransition, modifiedTransition);
}
Also used : RootAttribute(zemberek.core.turkish.RootAttribute) TurkicLetter(zemberek.core.turkish.TurkicLetter) StemTransition(zemberek.morphology.morphotactics.StemTransition) LexiconException(zemberek.morphology.lexicon.LexiconException) PhoneticAttribute(zemberek.core.turkish.PhoneticAttribute) MorphemeState(zemberek.morphology.morphotactics.MorphemeState)

Aggregations

RootAttribute (zemberek.core.turkish.RootAttribute): 6, PhoneticAttribute (zemberek.core.turkish.PhoneticAttribute): 3, TurkicLetter (zemberek.core.turkish.TurkicLetter): 3, LexiconException (zemberek.morphology.lexicon.LexiconException): 3, Locale (java.util.Locale): 1, PhoneticExpectation (zemberek.core.turkish.PhoneticExpectation): 1, TurkishLetterSequence (zemberek.core.turkish.TurkishLetterSequence): 1, MorphemeState (zemberek.morphology._morphotactics.MorphemeState): 1, StemTransition (zemberek.morphology._morphotactics.StemTransition): 1, StemNode (zemberek.morphology.lexicon.graph.StemNode): 1, SuffixData (zemberek.morphology.lexicon.graph.SuffixData): 1, LexiconProto (zemberek.morphology.lexicon.proto.LexiconProto): 1, MorphemeState (zemberek.morphology.morphotactics.MorphemeState): 1, StemTransition (zemberek.morphology.morphotactics.StemTransition): 1