Search in sources :

Example 11 with PhoneticAttribute

use of zemberek.core.turkish.PhoneticAttribute in project zemberek-nlp by ahmetaa.

the class StemTransitionsBase method handleSpecialRoots.

/**
 * Builds the stem transitions for dictionary items whose stems are listed explicitly by id
 * instead of being derived by the general rules.
 *
 * <p>Every handled case produces a transition for the unmodified root ({@code item.root}).
 * All cases except {@code imek_Verb} additionally produce a second transition for a modified
 * surface form of the root.
 *
 * @param item dictionary item; its id selects the case
 * @return a list with the unmodified transition first, followed by the modified one if the
 *     case defines it
 * @throws IllegalStateException if a last-vowel-dropping item has a primary POS other than
 *     Noun, Adjective or PostPositive
 * @throws IllegalArgumentException if the id of the item is not one of the handled cases
 */
private List<StemTransition> handleSpecialRoots(DictionaryItem item) {
    String id = item.getId();
    AttributeSet<PhoneticAttribute> baseAttrs = calculateAttributes(item.pronunciation);
    MorphemeState plainRootState = morphotactics.getRootState(item, baseAttrs);
    StemTransition plain;
    StemTransition altered;
    // surface form of the modified stem; assigned per case.
    String alteredRoot;
    switch (id) {
        case "içeri_Noun":
        case "içeri_Adj":
        case "dışarı_Adj":
        case "dışarı_Noun":
        case "dışarı_Postp":
        case "yukarı_Noun":
        case "ileri_Noun":
        case "yukarı_Adj":
        case "şura_Noun":
        case "bura_Noun":
        case "ora_Noun":
            plain = new StemTransition(item.root, item, baseAttrs, plainRootState);
            MorphemeState vowelDropState;
            switch (item.primaryPos) {
                case Noun:
                    vowelDropState = morphotactics.nounLastVowelDropRoot_S;
                    break;
                case Adjective:
                // TODO: check postpositive case. Maybe it is not required.
                // PostPositive shares the adjective state.
                case PostPositive:
                    vowelDropState = morphotactics.adjLastVowelDropRoot_S;
                    break;
                default:
                    throw new IllegalStateException("No root morpheme state found for " + item);
            }
            // modified stem is the root without its last character.
            alteredRoot = item.root.substring(0, item.root.length() - 1);
            altered = new StemTransition(
                alteredRoot, item, calculateAttributes(alteredRoot), vowelDropState);
            altered.getPhoneticAttributes().add(PhoneticAttribute.ExpectsConsonant);
            altered.getPhoneticAttributes().add(PhoneticAttribute.CannotTerminate);
            return Lists.newArrayList(plain, altered);

        case "ben_Pron_Pers":
        case "sen_Pron_Pers":
            plain = new StemTransition(item.root, item, baseAttrs, plainRootState);
            alteredRoot = item.lemma.equals("ben") ? "ban" : "san";
            altered = new StemTransition(
                alteredRoot, item, calculateAttributes(alteredRoot), morphotactics.pronPers_Mod_S);
            plain.getPhoneticAttributes().add(PhoneticAttribute.UnModifiedPronoun);
            altered.getPhoneticAttributes().add(PhoneticAttribute.ModifiedPronoun);
            return Lists.newArrayList(plain, altered);

        case "demek_Verb":
        case "yemek_Verb":
            // both stems of these verbs share the same root state.
            plain = new StemTransition(item.root, item, baseAttrs, morphotactics.vDeYeRoot_S);
            alteredRoot = item.lemma.equals("demek") ? "di" : "yi";
            altered = new StemTransition(
                alteredRoot, item, calculateAttributes(alteredRoot), morphotactics.vDeYeRoot_S);
            return Lists.newArrayList(plain, altered);

        case "imek_Verb":
            // only the unmodified transition is produced for this item.
            plain = new StemTransition(item.root, item, baseAttrs, morphotactics.imekRoot_S);
            return Lists.newArrayList(plain);

        case "birbiri_Pron_Quant":
        case "çoğu_Pron_Quant":
        case "öbürü_Pron_Quant":
        case "birçoğu_Pron_Quant":
            plain = new StemTransition(item.root, item, baseAttrs, morphotactics.pronQuant_S);
            switch (item.lemma) {
                case "birbiri":
                    alteredRoot = "birbir";
                    break;
                case "çoğu":
                    alteredRoot = "çok";
                    break;
                case "öbürü":
                    alteredRoot = "öbür";
                    break;
                default:
                    alteredRoot = "birçok";
                    break;
            }
            altered = new StemTransition(
                alteredRoot, item, calculateAttributes(alteredRoot),
                morphotactics.pronQuantModified_S);
            plain.getPhoneticAttributes().add(PhoneticAttribute.UnModifiedPronoun);
            altered.getPhoneticAttributes().add(PhoneticAttribute.ModifiedPronoun);
            return Lists.newArrayList(plain, altered);

        default:
            throw new IllegalArgumentException("Lexicon Item with special stem change cannot be handled:" + item);
    }
}
Also used : StemTransition(zemberek.morphology.morphotactics.StemTransition) PhoneticAttribute(zemberek.core.turkish.PhoneticAttribute) MorphemeState(zemberek.morphology.morphotactics.MorphemeState)

Example 12 with PhoneticAttribute

use of zemberek.core.turkish.PhoneticAttribute in project zemberek-nlp by ahmetaa.

the class AttributesHelper method getMorphemicAttributes.

/**
 * Computes the phonetic attributes for a character sequence, falling back to the attributes
 * of the preceding morpheme where the sequence itself carries no vowel information.
 *
 * @param seq character sequence to inspect
 * @param predecessorAttrs attributes of what comes before {@code seq}
 * @return a copy of {@code predecessorAttrs} when {@code seq} is empty; otherwise a new
 *     attribute set computed for {@code seq}
 */
public static AttributeSet<PhoneticAttribute> getMorphemicAttributes(CharSequence seq, AttributeSet<PhoneticAttribute> predecessorAttrs) {
    if (seq.length() == 0) {
        return predecessorAttrs.copy();
    }
    AttributeSet<PhoneticAttribute> result = new AttributeSet<>();
    if (!alphabet.containsVowel(seq)) {
        // no vowel in the sequence: vowel attributes are transferred from the predecessor.
        result.copyFrom(predecessorAttrs);
        result.addAll(NO_VOWEL_ATTRIBUTES);
        result.remove(LastLetterVowel);
        result.remove(ExpectsConsonant);
    } else {
        TurkicLetter finalLetter = alphabet.getLastLetter(seq);
        result.add(finalLetter.isVowel() ? LastLetterVowel : LastLetterConsonant);

        // the last vowel is looked up only when the last letter is not itself a vowel.
        TurkicLetter finalVowel = finalLetter.isVowel() ? finalLetter : alphabet.getLastVowel(seq);
        result.add(finalVowel.isFrontal() ? LastVowelFrontal : LastVowelBack);
        result.add(finalVowel.isRounded() ? LastVowelRounded : LastVowelUnrounded);

        result.add(alphabet.getFirstLetter(seq).isVowel() ? FirstLetterVowel : FirstLetterConsonant);
    }

    // voicing attributes always come from the last letter of the sequence itself.
    TurkicLetter lastLetter = alphabet.getLastLetter(seq);
    if (!lastLetter.isVoiceless()) {
        result.add(LastLetterVoiced);
        return result;
    }
    result.add(LastLetterVoiceless);
    if (lastLetter.isStopConsonant()) {
        // kitap
        result.add(LastLetterVoicelessStop);
    }
    return result;
}
Also used : TurkicLetter(zemberek.core.turkish.TurkicLetter) AttributeSet(zemberek.morphology.morphotactics.AttributeSet) PhoneticAttribute(zemberek.core.turkish.PhoneticAttribute)

Example 13 with PhoneticAttribute

use of zemberek.core.turkish.PhoneticAttribute in project zemberek-nlp by ahmetaa.

the class StemNodeGenerator method handleSpecialStems.

// Builds the stem nodes of words with irregular stems, such as demek-diyecek , beni-bana.
// Handles only "yemek_Verb", "demek_Verb", "ben_Pron_Pers" and "sen_Pron_Pers";
// any other item id is rejected with an IllegalArgumentException.
private StemNode[] handleSpecialStems(DictionaryItem item) {
    TurkishSuffixes suffixes = (TurkishSuffixes) suffixProvider;
    String id = item.getId();
    switch (id) {
        case "yemek_Verb": {
            StemNode fullStem = new StemNode("ye", item, TerminationType.TERMINAL, calculateAttributes(item.root));
            fullStem.exclusiveSuffixData.add(suffixes.Verb_Ye.allConnections());

            // single-consonant stem: the root attributes are reused but with the last-letter
            // attribute flipped from vowel to consonant.
            EnumSet<PhoneticAttribute> consonantAttrs = calculateAttributes(item.root);
            consonantAttrs.remove(PhoneticAttribute.LastLetterVowel);
            consonantAttrs.add(PhoneticAttribute.LastLetterConsonant);
            StemNode consonantStem = new StemNode("y", item, TerminationType.NON_TERMINAL, consonantAttrs, EnumSet.noneOf(PhoneticExpectation.class));
            consonantStem.exclusiveSuffixData.add(suffixes.Verb_De_Ye_Prog.allConnections());

            StemNode narrowStem = new StemNode("yi", item, TerminationType.NON_TERMINAL, calculateAttributes(item.root));
            narrowStem.exclusiveSuffixData.add(suffixes.Verb_Yi.allConnections());

            return new StemNode[] {fullStem, consonantStem, narrowStem};
        }
        case "demek_Verb": {
            StemNode fullStem = new StemNode("de", item, TerminationType.TERMINAL, calculateAttributes(item.root));
            fullStem.exclusiveSuffixData.add(suffixes.Verb_De.allConnections());

            // same attribute adjustment as for the "y" stem of yemek.
            EnumSet<PhoneticAttribute> consonantAttrs = calculateAttributes(item.root);
            consonantAttrs.remove(PhoneticAttribute.LastLetterVowel);
            consonantAttrs.add(PhoneticAttribute.LastLetterConsonant);
            StemNode consonantStem = new StemNode("d", item, TerminationType.NON_TERMINAL, consonantAttrs, EnumSet.noneOf(PhoneticExpectation.class));
            consonantStem.exclusiveSuffixData.add(suffixes.Verb_De_Ye_Prog.allConnections());

            StemNode narrowStem = new StemNode("di", item, TerminationType.NON_TERMINAL, calculateAttributes(item.root));
            narrowStem.exclusiveSuffixData.add(suffixes.Verb_Di.allConnections());

            return new StemNode[] {fullStem, consonantStem, narrowStem};
        }
        case "ben_Pron_Pers":
        case "sen_Pron_Pers": {
            boolean firstPerson = item.lemma.equals("ben");

            StemNode plainStem = new StemNode(item.root, item, TerminationType.TERMINAL, calculateAttributes(item.root));
            if (firstPerson) {
                plainStem.exclusiveSuffixData.add(suffixes.PersPron_Ben.allConnections());
            } else {
                plainStem.exclusiveSuffixData.add(suffixes.PersPron_Sen.allConnections());
            }

            String alteredRoot = firstPerson ? "ban" : "san";
            StemNode alteredStem = new StemNode(alteredRoot, item, TerminationType.NON_TERMINAL, calculateAttributes(alteredRoot));
            alteredStem.exclusiveSuffixData.add(suffixes.PersPron_BanSan);

            return new StemNode[] {plainStem, alteredStem};
        }
        default:
            throw new IllegalArgumentException("Lexicon Item with special stem change cannot be handled:" + item);
    }
}
Also used : PhoneticExpectation(zemberek.core.turkish.PhoneticExpectation) EnumSet(java.util.EnumSet) StemNode(zemberek.morphology.lexicon.graph.StemNode) PhoneticAttribute(zemberek.core.turkish.PhoneticAttribute)

Example 14 with PhoneticAttribute

use of zemberek.core.turkish.PhoneticAttribute in project zemberek-nlp by ahmetaa.

the class StemTransitionGenerator method generate.

/**
 * Generates the StemTransition objects of a dictionary item.
 *
 * <p>Items whose id is in {@code specialRoots} are delegated to {@code handleSpecialRoots};
 * items for which {@code hasModifierAttribute} holds are delegated to
 * {@code generateModifiedRootNodes}. Every other item yields exactly one transition, built
 * from the item root and the attributes calculated from its pronunciation.
 *
 * @param item DictionaryItem
 * @return one or more StemTransition objects.
 */
public List<StemTransition> generate(DictionaryItem item) {
    if (specialRoots.contains(item.id)) {
        return handleSpecialRoots(item);
    }
    if (hasModifierAttribute(item)) {
        return generateModifiedRootNodes(item);
    }
    // regular item: a single transition for the unmodified root.
    AttributeSet<PhoneticAttribute> attrs = calculateAttributes(item.pronunciation);
    return Lists.newArrayList(
        new StemTransition(item.root, item, attrs, morphotactics.getRootState(item, attrs)));
}
Also used : StemTransition(zemberek.morphology._morphotactics.StemTransition) PhoneticAttribute(zemberek.core.turkish.PhoneticAttribute)

Example 15 with PhoneticAttribute

use of zemberek.core.turkish.PhoneticAttribute in project zemberek-nlp by ahmetaa.

the class RuleBasedAnalyzer method advance.

// Generates a new search path for every outgoing transition of the current state that is
// allowed and matches the remaining input.
// A transition is followed only if its generated surface is a prefix of the path's tail
// and its `conditions` let the `search path` pass.
// When debugMode is on, the reason for each rejection is recorded in debugData.
private List<SearchPath> advance(SearchPath path) {
    List<SearchPath> successors = new ArrayList<>(2);

    for (MorphemeTransition outgoing : path.currentState.getOutgoing()) {
        SuffixTransition candidate = (SuffixTransition) outgoing;

        // nothing is left to consume, so a transition that has a surface form cannot match.
        if (path.tail.isEmpty() && candidate.hasSurfaceForm()) {
            if (debugMode) {
                debugData.rejectedTransitions.put(path, new RejectedTransition(candidate, "Empty surface expected."));
            }
            continue;
        }

        String generated = SurfaceTransition.generateSurface(candidate, path.phoneticAttributes);

        // the generated surface must be a prefix of the tail, otherwise this transition is out.
        boolean isPrefixOfTail;
        if (asciiTolerant) {
            isPrefixOfTail = TurkishAlphabet.INSTANCE.startsWithIgnoreDiacritics(path.tail, generated);
        } else {
            isPrefixOfTail = path.tail.startsWith(generated);
        }
        if (!isPrefixOfTail) {
            if (debugMode) {
                debugData.rejectedTransitions.put(path, new RejectedTransition(candidate, "Surface Mismatch:" + generated));
            }
            continue;
        }

        // debug only: record which condition fails. The actual gate is canPass() below.
        if (debugMode && candidate.getCondition() != null) {
            Condition condition = candidate.getCondition();
            Condition failing = condition instanceof CombinedCondition
                ? ((CombinedCondition) condition).getFailingCondition(path)
                : (condition.accept(path) ? null : condition);
            if (failing != null) {
                debugData.rejectedTransitions.put(path, new RejectedTransition(candidate, "Condition → " + failing.toString()));
            }
        }

        // check conditions.
        if (!candidate.canPass(path)) {
            continue;
        }

        // epsilon (empty) transition: nothing is consumed, existing attributes are reused.
        if (!candidate.hasSurfaceForm()) {
            successors.add(path.getCopy(new SurfaceTransition("", candidate), path.phoneticAttributes));
            continue;
        }

        SurfaceTransition surfaceTransition = new SurfaceTransition(generated, candidate);

        // if the surface consumes the whole tail, phonetic attributes are not recalculated;
        // a copy of the current ones is used.
        boolean consumesWholeTail;
        if (asciiTolerant) {
            consumesWholeTail = TurkishAlphabet.INSTANCE.equalsIgnoreDiacritics(path.tail, generated);
        } else {
            consumesWholeTail = path.tail.equals(generated);
        }
        AttributeSet<PhoneticAttribute> nextAttrs;
        if (consumesWholeTail) {
            nextAttrs = path.phoneticAttributes.copy();
        } else {
            nextAttrs = AttributesHelper.getMorphemicAttributes(generated, path.phoneticAttributes);
        }

        nextAttrs.remove(PhoneticAttribute.CannotTerminate);

        // This is required for suffixes like `cik` and `ciğ`:
        // an extra attribute is added depending on the type of the last template token.
        // LAST_VOICED adds ExpectsConsonant, so only a consonant starting suffix can follow.
        // LAST_NOT_VOICED adds ExpectsVowel and marks the path as CannotTerminate.
        SuffixTemplateToken lastToken = candidate.getLastTemplateToken();
        if (lastToken.type == TemplateTokenType.LAST_VOICED) {
            nextAttrs.add(PhoneticAttribute.ExpectsConsonant);
        } else if (lastToken.type == TemplateTokenType.LAST_NOT_VOICED) {
            nextAttrs.add(PhoneticAttribute.ExpectsVowel);
            nextAttrs.add(PhoneticAttribute.CannotTerminate);
        }

        successors.add(path.getCopy(surfaceTransition, nextAttrs));
    }
    return successors;
}
Also used : RejectedTransition(zemberek.morphology.analysis.AnalysisDebugData.RejectedTransition) CombinedCondition(zemberek.morphology.morphotactics.CombinedCondition) Condition(zemberek.morphology.morphotactics.Condition) SuffixTransition(zemberek.morphology.morphotactics.SuffixTransition) ArrayList(java.util.ArrayList) CombinedCondition(zemberek.morphology.morphotactics.CombinedCondition) MorphemeTransition(zemberek.morphology.morphotactics.MorphemeTransition) PhoneticAttribute(zemberek.core.turkish.PhoneticAttribute) SuffixTemplateToken(zemberek.morphology.analysis.SurfaceTransition.SuffixTemplateToken)

Aggregations

PhoneticAttribute (zemberek.core.turkish.PhoneticAttribute): 16; TurkicLetter (zemberek.core.turkish.TurkicLetter): 6; ArrayList (java.util.ArrayList): 4; PhoneticExpectation (zemberek.core.turkish.PhoneticExpectation): 3; RootAttribute (zemberek.core.turkish.RootAttribute): 3; StemTransition (zemberek.morphology._morphotactics.StemTransition): 3; LexiconException (zemberek.morphology.lexicon.LexiconException): 3; StemNode (zemberek.morphology.lexicon.graph.StemNode): 3; StemTransition (zemberek.morphology.morphotactics.StemTransition): 3; TurkishLetterSequence (zemberek.core.turkish.TurkishLetterSequence): 2; MorphemeState (zemberek.morphology._morphotactics.MorphemeState): 2; RejectedTransition (zemberek.morphology.analysis.AnalysisDebugData.RejectedTransition): 2; SuffixTemplateToken (zemberek.morphology.analysis.SurfaceTransition.SuffixTemplateToken): 2; SuffixData (zemberek.morphology.lexicon.graph.SuffixData): 2; CombinedCondition (zemberek.morphology.morphotactics.CombinedCondition): 2; Condition (zemberek.morphology.morphotactics.Condition): 2; MorphemeState (zemberek.morphology.morphotactics.MorphemeState): 2; MorphemeTransition (zemberek.morphology.morphotactics.MorphemeTransition): 2; SuffixTransition (zemberek.morphology.morphotactics.SuffixTransition): 2; EnumSet (java.util.EnumSet): 1