Search in sources:

Example 1 with StemNode

use of zemberek.morphology.lexicon.graph.StemNode in project zemberek-nlp by ahmetaa.

the class WordAnalyzer method analyze.

/**
 * Analyzes a word by trying every prefix of it as a stem and then walking the suffix graph
 * over the remaining characters.
 *
 * @param input the word to analyze
 * @return the analyses collected by {@code traverseSuffixes}; empty when nothing matched
 */
public List<WordAnalysis> analyze(String input) {
    // Look up every non-empty prefix of the input in the stem graph.
    List<StemNode> stemNodes = Lists.newArrayListWithCapacity(3);
    final int inputLength = input.length();
    for (int end = 1; end <= inputLength; end++) {
        stemNodes.addAll(graph.getMatchingStemNodes(input.substring(0, end)));
    }

    // Seed one token per stem: it starts at the stem's suffix root node and still has to
    // consume whatever follows the stem's surface form.
    List<Token> seeds = Lists.newArrayListWithCapacity(5);
    for (int i = 0; i < stemNodes.size(); i++) {
        StemNode stemNode = stemNodes.get(i);
        int stemLength = stemNode.surfaceForm.length();
        String remainder = input.substring(stemLength);
        seeds.add(new Token(stemNode, Lists.newArrayList(stemNode.getSuffixRootSurfaceNode()), remainder));
    }

    // The traversal fills the list it is handed.
    List<WordAnalysis> analyses = Lists.newArrayListWithCapacity(3);
    traverseSuffixes(seeds, analyses);
    return analyses;
}
Also used : StemNode(zemberek.morphology.lexicon.graph.StemNode)

Example 2 with StemNode

use of zemberek.morphology.lexicon.graph.StemNode in project zemberek-nlp by ahmetaa.

the class SimpleGenerator method getTokens.

/**
 * Creates generation tokens for a dictionary item and runs them through the suffix graph.
 *
 * @param item     dictionary item whose stem nodes are looked up
 * @param suffixes suffixes handed to every starting token
 * @return the tokens collected by {@code traverseSuffixes}; empty when the item has no
 *         registered stem node
 */
private List<GenerationToken> getTokens(DictionaryItem item, List<Suffix> suffixes) {
    // Resolve the item's stem nodes; the single-stem table is consulted first and the
    // multi-stem table only when the item is absent from it.
    List<StemNode> stems = new ArrayList<>();
    boolean hasSingleStem = singeStems.containsKey(item);
    if (hasSingleStem) {
        stems.add(singeStems.get(item));
    } else {
        if (multiStems.containsKey(item)) {
            stems.addAll(multiStems.get(item));
        }
    }

    // One starting token per stem node.
    List<GenerationToken> seeds = new ArrayList<>(2);
    for (int i = 0; i < stems.size(); i++) {
        seeds.add(new GenerationToken(stems.get(i), suffixes));
    }

    // The traversal fills the list it is handed.
    List<GenerationToken> generated = new ArrayList<>(2);
    traverseSuffixes(seeds, generated);
    return generated;
}
Also used : StemNode(zemberek.morphology.lexicon.graph.StemNode) ArrayList(java.util.ArrayList)

Example 3 with StemNode

use of zemberek.morphology.lexicon.graph.StemNode in project zemberek-nlp by ahmetaa.

the class StemNodeGenerator method generateModifiedRootNodes.

/**
 * Builds the stem nodes for a dictionary item whose root attributes alter its surface form.
 * <p>Two nodes are prepared side by side: an "original" node that keeps {@code dicItem.root}
 * and a "modified" node whose letters, phonetic attributes and phonetic expectations are
 * adjusted by each root attribute of the item, applied in the iteration order of
 * {@code dicItem.attributes}. Items carrying the {@code Special} attribute are delegated to
 * {@link #handleSpecialStems} instead.
 *
 * @param dicItem dictionary item to generate stem nodes for
 * @return the result of {@code handleSpecialStems} for special items; otherwise a one-element
 *         array when the original and modified nodes are equal, or a two-element array
 *         {@code {original, modified}}
 * @throws LexiconException if the item has the {@code Voicing} attribute but
 *         {@code alphabet.voice} returns {@code null} for its last letter
 */
private StemNode[] generateModifiedRootNodes(DictionaryItem dicItem) {
    // Special items are built case by case elsewhere.
    if (dicItem.hasAttribute(Special)) {
        return handleSpecialStems(dicItem);
    }
    // The modified form starts from the pronunciation and is edited in place below.
    TurkishLetterSequence modifiedSeq = new TurkishLetterSequence(dicItem.pronunciation, alphabet);
    // Both attribute sets start out identical; the clone keeps later edits independent.
    EnumSet<PhoneticAttribute> originalAttrs = calculateAttributes(dicItem.pronunciation);
    EnumSet<PhoneticAttribute> modifiedAttrs = originalAttrs.clone();
    EnumSet<PhoneticExpectation> originalExpectations = EnumSet.noneOf(PhoneticExpectation.class);
    EnumSet<PhoneticExpectation> modifiedExpectations = EnumSet.noneOf(PhoneticExpectation.class);
    for (RootAttribute attribute : dicItem.attributes) {
        // Each attribute adjusts the modified letters and/or the attribute and expectation sets.
        // Effects accumulate on the shared modifiedSeq across iterations.
        switch(attribute) {
            case Voicing:
                // Replace the last letter with the letter alphabet.voice maps it to.
                TurkicLetter last = modifiedSeq.lastLetter();
                TurkicLetter modifiedLetter = alphabet.voice(last);
                if (modifiedLetter == null) {
                    throw new LexiconException("Voicing letter is not proper in:" + dicItem);
                }
                // Lemmas ending in "nk" use L_g instead of the default mapping.
                // The null check above has already run on the default mapping.
                if (dicItem.lemma.endsWith("nk")) {
                    modifiedLetter = TurkishAlphabet.L_g;
                }
                modifiedSeq.changeLetter(modifiedSeq.length() - 1, modifiedLetter);
                modifiedAttrs.remove(PhoneticAttribute.LastLetterVoicelessStop);
                // Original form expects ConsonantStart, modified form expects VowelStart.
                originalExpectations.add(PhoneticExpectation.ConsonantStart);
                modifiedExpectations.add(PhoneticExpectation.VowelStart);
                break;
            case Doubling:
                // Append the last letter once more; expectations are set as in Voicing.
                modifiedSeq.append(modifiedSeq.lastLetter());
                originalExpectations.add(PhoneticExpectation.ConsonantStart);
                modifiedExpectations.add(PhoneticExpectation.VowelStart);
                break;
            case LastVowelDrop:
                if (modifiedSeq.lastLetter().isVowel()) {
                    // Sequence ends in a vowel: remove that final letter.
                    // Only the modified form gets an expectation here.
                    modifiedSeq.delete(modifiedSeq.length() - 1);
                    modifiedExpectations.add(PhoneticExpectation.ConsonantStart);
                } else {
                    // Sequence ends in a non-vowel: remove the second-to-last letter.
                    // NOTE(review): assumes delete(int) takes a letter index - confirm.
                    modifiedSeq.delete(modifiedSeq.length() - 2);
                    // Verbs do not get the ConsonantStart expectation on the original form.
                    if (!dicItem.primaryPos.equals(PrimaryPos.Verb)) {
                        originalExpectations.add(PhoneticExpectation.ConsonantStart);
                    }
                    modifiedExpectations.add(PhoneticExpectation.VowelStart);
                }
                break;
            case InverseHarmony:
                // Both forms are flagged LastVowelFrontal instead of LastVowelBack,
                // overriding what calculateAttributes derived from the letters.
                originalAttrs.add(PhoneticAttribute.LastVowelFrontal);
                originalAttrs.remove(PhoneticAttribute.LastVowelBack);
                modifiedAttrs.add(PhoneticAttribute.LastVowelFrontal);
                modifiedAttrs.remove(PhoneticAttribute.LastVowelBack);
                break;
            case ProgressiveVowelDrop:
                // Remove the final letter. Attributes are recomputed from the shortened
                // sequence only if it still has a vowel; otherwise the current ones are kept.
                // A recompute replaces modifiedAttrs entirely, discarding earlier edits to it.
                modifiedSeq.delete(modifiedSeq.length() - 1);
                if (modifiedSeq.hasVowel()) {
                    modifiedAttrs = calculateAttributes(modifiedSeq);
                }
                break;
            default:
                // All other root attributes leave the stem forms untouched.
                break;
        }
    }
    // The original node uses dicItem.root as its surface form; its attributes were computed
    // from dicItem.pronunciation above.
    StemNode original = new StemNode(dicItem.root, dicItem, originalAttrs, originalExpectations);
    StemNode modified = new StemNode(modifiedSeq.toString(), dicItem, modifiedAttrs, modifiedExpectations);
    // Index 0 is assigned to the original node and index 1 to the modified node.
    SuffixData[] roots = suffixProvider.defineSuccessorSuffixes(dicItem);
    original.exclusiveSuffixData = roots[0];
    modified.exclusiveSuffixData = roots[1];
    // If the attributes produced no difference according to StemNode.equals, one node is enough.
    if (original.equals(modified)) {
        return new StemNode[] { original };
    }
    // The modified node is always marked NON_TERMINAL.
    modified.setTermination(TerminationType.NON_TERMINAL);
    // For CompoundP3sgRoot items the original node is marked NON_TERMINAL as well.
    if (dicItem.hasAttribute(RootAttribute.CompoundP3sgRoot)) {
        original.setTermination(TerminationType.NON_TERMINAL);
    }
    return new StemNode[] { original, modified };
}
Also used : RootAttribute(zemberek.core.turkish.RootAttribute) TurkicLetter(zemberek.core.turkish.TurkicLetter) PhoneticExpectation(zemberek.core.turkish.PhoneticExpectation) TurkishLetterSequence(zemberek.core.turkish.TurkishLetterSequence) LexiconException(zemberek.morphology.lexicon.LexiconException) StemNode(zemberek.morphology.lexicon.graph.StemNode) PhoneticAttribute(zemberek.core.turkish.PhoneticAttribute) SuffixData(zemberek.morphology.lexicon.graph.SuffixData)

Example 4 with StemNode

use of zemberek.morphology.lexicon.graph.StemNode in project zemberek-nlp by ahmetaa.

the class StemNodeGenerator method generate.

/**
 * Creates the stem node(s) for a dictionary item.
 * <p>Items with a modifier attribute are handed to {@code generateModifiedRootNodes};
 * every other item yields exactly one terminal StemNode built from its root.
 *
 * @param item DictionaryItem to generate nodes for
 * @return an array holding one or more StemNode objects.
 */
public StemNode[] generate(DictionaryItem item) {
    // Items whose attributes change the stem take the multi-node path.
    if (hasModifierAttribute(item)) {
        return generateModifiedRootNodes(item);
    }

    // Plain item: one terminal node with no phonetic expectations.
    SuffixData[] successorRoots = suffixProvider.defineSuccessorSuffixes(item);
    EnumSet<PhoneticAttribute> attrs = calculateAttributes(item.pronunciation);
    StemNode node = new StemNode(
            item.root,
            item,
            TerminationType.TERMINAL,
            attrs,
            EnumSet.noneOf(PhoneticExpectation.class));
    node.exclusiveSuffixData = successorRoots[0];
    return new StemNode[] { node };
}
Also used : PhoneticExpectation(zemberek.core.turkish.PhoneticExpectation) StemNode(zemberek.morphology.lexicon.graph.StemNode) SuffixData(zemberek.morphology.lexicon.graph.SuffixData) PhoneticAttribute(zemberek.core.turkish.PhoneticAttribute)

Example 5 with StemNode

use of zemberek.morphology.lexicon.graph.StemNode in project zemberek-nlp by ahmetaa.

the class StemNodeGenerator method handleSpecialStems.

/**
 * Builds the hand-written stem nodes for special words such as demek-diyecek, beni-bana.
 * <p>Supported item ids are {@code yemek_Verb}, {@code demek_Verb}, {@code ben_Pron_Pers}
 * and {@code sen_Pron_Pers}.
 *
 * @param item dictionary item flagged as special
 * @return three stem nodes for the two verbs, two stem nodes for the two pronouns
 * @throws IllegalArgumentException if the item id is not one of the supported ids
 */
private StemNode[] handleSpecialStems(DictionaryItem item) {
    TurkishSuffixes suffixes = (TurkishSuffixes) suffixProvider;
    String id = item.getId();

    if (id.equals("yemek_Verb")) {
        // "ye": the terminal stem.
        StemNode full = new StemNode("ye", item, TerminationType.TERMINAL, calculateAttributes(item.root));
        full.exclusiveSuffixData.add(suffixes.Verb_Ye.allConnections());

        // "y": root attributes with the last-letter flag switched from vowel to consonant.
        EnumSet<PhoneticAttribute> consonantEnding = calculateAttributes(item.root);
        consonantEnding.remove(PhoneticAttribute.LastLetterVowel);
        consonantEnding.add(PhoneticAttribute.LastLetterConsonant);
        StemNode bare = new StemNode("y", item, TerminationType.NON_TERMINAL, consonantEnding, EnumSet.noneOf(PhoneticExpectation.class));
        bare.exclusiveSuffixData.add(suffixes.Verb_De_Ye_Prog.allConnections());

        // "yi": keeps the attributes computed from the root.
        StemNode narrowed = new StemNode("yi", item, TerminationType.NON_TERMINAL, calculateAttributes(item.root));
        narrowed.exclusiveSuffixData.add(suffixes.Verb_Yi.allConnections());

        return new StemNode[] { full, bare, narrowed };
    }

    if (id.equals("demek_Verb")) {
        // "de": the terminal stem.
        StemNode full = new StemNode("de", item, TerminationType.TERMINAL, calculateAttributes(item.root));
        full.exclusiveSuffixData.add(suffixes.Verb_De.allConnections());

        // "d": root attributes with the last-letter flag switched from vowel to consonant.
        EnumSet<PhoneticAttribute> consonantEnding = calculateAttributes(item.root);
        consonantEnding.remove(PhoneticAttribute.LastLetterVowel);
        consonantEnding.add(PhoneticAttribute.LastLetterConsonant);
        StemNode bare = new StemNode("d", item, TerminationType.NON_TERMINAL, consonantEnding, EnumSet.noneOf(PhoneticExpectation.class));
        bare.exclusiveSuffixData.add(suffixes.Verb_De_Ye_Prog.allConnections());

        // "di": keeps the attributes computed from the root.
        StemNode narrowed = new StemNode("di", item, TerminationType.NON_TERMINAL, calculateAttributes(item.root));
        narrowed.exclusiveSuffixData.add(suffixes.Verb_Di.allConnections());

        return new StemNode[] { full, bare, narrowed };
    }

    if (id.equals("ben_Pron_Pers") || id.equals("sen_Pron_Pers")) {
        StemNode[] stems = new StemNode[2];
        boolean isBen = item.lemma.equals("ben");

        // Terminal stem built from the item's own root; only its suffix set differs.
        stems[0] = new StemNode(item.root, item, TerminationType.TERMINAL, calculateAttributes(item.root));
        if (isBen) {
            stems[0].exclusiveSuffixData.add(suffixes.PersPron_Ben.allConnections());
        } else {
            stems[0].exclusiveSuffixData.add(suffixes.PersPron_Sen.allConnections());
        }

        // Non-terminal alternate stem: "ban" for ben, "san" otherwise.
        String alternate;
        if (isBen) {
            alternate = "ban";
        } else {
            alternate = "san";
        }
        stems[1] = new StemNode(alternate, item, TerminationType.NON_TERMINAL, calculateAttributes(alternate));
        stems[1].exclusiveSuffixData.add(suffixes.PersPron_BanSan);
        return stems;
    }

    throw new IllegalArgumentException("Lexicon Item with special stem change cannot be handled:" + item);
}
Also used : PhoneticExpectation(zemberek.core.turkish.PhoneticExpectation) EnumSet(java.util.EnumSet) StemNode(zemberek.morphology.lexicon.graph.StemNode) PhoneticAttribute(zemberek.core.turkish.PhoneticAttribute)

Aggregations

StemNode (zemberek.morphology.lexicon.graph.StemNode): 6, PhoneticAttribute (zemberek.core.turkish.PhoneticAttribute): 3, PhoneticExpectation (zemberek.core.turkish.PhoneticExpectation): 3, SuffixData (zemberek.morphology.lexicon.graph.SuffixData): 2, ArrayList (java.util.ArrayList): 1, EnumSet (java.util.EnumSet): 1, RootAttribute (zemberek.core.turkish.RootAttribute): 1, TurkicLetter (zemberek.core.turkish.TurkicLetter): 1, TurkishLetterSequence (zemberek.core.turkish.TurkishLetterSequence): 1, LexiconException (zemberek.morphology.lexicon.LexiconException): 1