Search in sources :

Example 1 with SuffixSurfaceNode

use of zemberek.morphology.lexicon.graph.SuffixSurfaceNode in project zemberek-nlp by ahmetaa.

the class SuffixSurfaceNodeGeneratorTest method emptyFormTest.

/**
 * Checks that an empty generation string ("") yields a first node whose surface form is
 * empty and whose attributes still contain every attribute that was passed in.
 */
@Test
public void emptyFormTest() {
    SuffixSurfaceNodeGenerator generator = new SuffixSurfaceNodeGenerator();
    SuffixSurfaceNode firstNode = getFirstNodeNoExpectatios(
            generator,
            set(LastVowelBack, LastVowelRounded, LastLetterConsonant),
            "");
    // Nothing was generated, so the surface must be the empty string.
    Assert.assertEquals("", firstNode.surfaceForm);
    // The node is expected to carry all three input attributes.
    boolean hasAllExpected = firstNode.getAttributes()
            .containsAll(Arrays.asList(LastVowelBack, LastVowelRounded, LastLetterConsonant));
    Assert.assertTrue(hasAllExpected);
}
Also used : SuffixSurfaceNode(zemberek.morphology.lexicon.graph.SuffixSurfaceNode) Test(org.junit.Test)

Example 2 with SuffixSurfaceNode

use of zemberek.morphology.lexicon.graph.SuffixSurfaceNode in project zemberek-nlp by ahmetaa.

the class SuffixSurfaceNodeGeneratorTest method novowelFormTest.

/**
 * Checks a suffix form made of a single consonant ("m") generated after a stem described
 * as ending in a vowel: the surface form must be "m", and the node attributes must contain
 * LastVowelBack, LastVowelRounded and LastLetterConsonant.
 */
@Test
public void novowelFormTest() {
    SuffixSurfaceNodeGenerator generator = new SuffixSurfaceNodeGenerator();
    SuffixSurfaceNode firstNode = getFirstNodeNoExpectatios(
            generator,
            set(LastVowelBack, LastVowelRounded, LastLetterVowel),
            "m");
    Assert.assertEquals("m", firstNode.surfaceForm);
    // Note the expectation is LastLetterConsonant although the input carried LastLetterVowel.
    boolean hasAllExpected = firstNode.getAttributes()
            .containsAll(Arrays.asList(LastVowelBack, LastVowelRounded, LastLetterConsonant));
    Assert.assertTrue(hasAllExpected);
}
Also used : SuffixSurfaceNode(zemberek.morphology.lexicon.graph.SuffixSurfaceNode) Test(org.junit.Test)

Example 3 with SuffixSurfaceNode

use of zemberek.morphology.lexicon.graph.SuffixSurfaceNode in project zemberek-nlp by ahmetaa.

the class SuffixSurfaceNodeGeneratorTest method suffixFormAHarmonyTest.

/**
 * Checks how the "A" placeholder in the form "lAr" is realised for different input
 * attributes: "lar" for LastVowelBack (with or without LastVowelRounded) and "ler" for
 * LastVowelFrontal. For the first case the resulting node attributes are also verified.
 */
@Test
public void suffixFormAHarmonyTest() {
    SuffixSurfaceNodeGenerator generator = new SuffixSurfaceNodeGenerator();

    // Back vowel only -> "lar", and the node reports consonant ending / back / unrounded.
    SuffixSurfaceNode node = getFirstNodeNoExpectatios(generator, set(LastVowelBack), "lAr");
    Assert.assertEquals("lar", node.surfaceForm);
    boolean hasAllExpected = node.getAttributes()
            .containsAll(Arrays.asList(LastLetterConsonant, LastVowelBack, LastVowelUnrounded));
    Assert.assertTrue(hasAllExpected);

    // Back + rounded -> still "lar".
    node = getFirstNodeNoExpectatios(generator, set(LastVowelBack, LastVowelRounded), "lAr");
    Assert.assertEquals("lar", node.surfaceForm);

    // Frontal + rounded -> "ler".
    node = getFirstNodeNoExpectatios(generator, set(LastVowelFrontal, LastVowelRounded), "lAr");
    Assert.assertEquals("ler", node.surfaceForm);
}
Also used : SuffixSurfaceNode(zemberek.morphology.lexicon.graph.SuffixSurfaceNode) Test(org.junit.Test)

Example 4 with SuffixSurfaceNode

use of zemberek.morphology.lexicon.graph.SuffixSurfaceNode in project zemberek-nlp by ahmetaa.

the class SuffixSurfaceNodeGenerator method generate.

/**
 * Builds the surface node(s) for a suffix form by walking the tokens of
 * {@code suffixForm.generation} and appending one letter per token to a growing sequence.
 *
 * <p>Normally a single node is produced when the last token is reached. A final
 * {@code VOICE_LAST} token (situations like cI~k) produces two nodes: one with the letter as
 * written, expecting a consonant start, and one with the voiced letter, expecting a vowel
 * start and marked {@code NON_TERMINAL}. No node is added when the final token is a leading
 * A/I vowel that gets skipped because {@code attrs} contains {@code LastLetterVowel}.
 *
 * @param attrs        phonetic attributes used as the starting point; cloned for the
 *                     empty-form node and passed to {@code defineMorphemicAttributes} otherwise
 * @param expectations phonetic expectations; only used (cloned) for the empty-form node
 * @param suffixData   passed through to the empty-form node and to the VOICE_LAST nodes
 * @param suffixForm   the form whose {@code generation} string is tokenized
 * @return the generated nodes, in the order they were created
 * @throws IllegalArgumentException if an A or I vowel cannot be chosen from the current
 *                                  attributes
 */
public List<SuffixSurfaceNode> generate(EnumSet<PhoneticAttribute> attrs, EnumSet<PhoneticExpectation> expectations, SuffixData suffixData, SuffixForm suffixForm) {
    List<SuffixToken> tokens = Lists.newArrayList(new SuffixStringTokenizer(suffixForm.generation));

    // Zero tokens: return a single node with an empty surface and copies of the inputs.
    if (tokens.isEmpty()) {
        SuffixSurfaceNode emptyNode = new SuffixSurfaceNode(suffixForm, "", attrs.clone(), expectations.clone(), suffixData, suffixForm.terminationType);
        return Lists.newArrayList(emptyNode);
    }

    List<SuffixSurfaceNode> nodes = new ArrayList<SuffixSurfaceNode>(1);
    TurkishLetterSequence surface = new TurkishLetterSequence();
    final int lastPosition = tokens.size() - 1;

    for (int position = 0; position <= lastPosition; position++) {
        SuffixToken token = tokens.get(position);
        final boolean isLastToken = position == lastPosition;
        // Attributes as they stand before this token's letter is appended.
        EnumSet<PhoneticAttribute> currentAttrs = defineMorphemicAttributes(surface, attrs);
        // Set by every case that finishes with the ordinary single-node behaviour.
        boolean emitPlainNode = false;

        switch (token.type) {
            case LETTER: {
                surface.append(token.letter);
                emitPlainNode = true;
                break;
            }
            case A_WOVEL: {
                // A leading vowel is skipped entirely when the input says the last letter is a vowel.
                if (position == 0 && attrs.contains(LastLetterVowel)) {
                    break;
                }
                TurkicLetter aVowel = currentAttrs.contains(LastVowelBack)
                        ? L_a
                        : (currentAttrs.contains(LastVowelFrontal) ? L_e : TurkicLetter.UNDEFINED);
                if (aVowel == TurkicLetter.UNDEFINED) {
                    throw new IllegalArgumentException("Cannot generate A form!");
                }
                surface.append(aVowel);
                emitPlainNode = true;
                break;
            }
            case I_WOVEL: {
                // Same leading-vowel skip as for A_WOVEL.
                if (position == 0 && attrs.contains(LastLetterVowel)) {
                    break;
                }
                final boolean back = currentAttrs.contains(LastVowelBack);
                final boolean frontal = currentAttrs.contains(LastVowelFrontal);
                final boolean rounded = currentAttrs.contains(LastVowelRounded);
                final boolean unrounded = currentAttrs.contains(LastVowelUnrounded);
                TurkicLetter iVowel = TurkicLetter.UNDEFINED;
                if (back && rounded) {
                    iVowel = L_u;
                } else if (back && unrounded) {
                    iVowel = L_ii;
                } else if (frontal && rounded) {
                    iVowel = L_uu;
                } else if (frontal && unrounded) {
                    iVowel = L_i;
                }
                if (iVowel == TurkicLetter.UNDEFINED) {
                    throw new IllegalArgumentException("Cannot generate I form!");
                }
                surface.append(iVowel);
                emitPlainNode = true;
                break;
            }
            case APPEND: {
                // The letter is only added when the sequence so far ends in a vowel.
                if (currentAttrs.contains(LastLetterVowel)) {
                    surface.append(token.letter);
                }
                emitPlainNode = true;
                break;
            }
            case DEVOICE_FIRST: {
                TurkicLetter firstLetter = currentAttrs.contains(LastLetterVoiceless)
                        ? Turkish.Alphabet.devoice(token.letter)
                        : token.letter;
                surface.append(firstLetter);
                emitPlainNode = true;
                break;
            }
            case VOICE_LAST: {
                surface.append(token.letter);
                if (isLastToken) {
                    // First the form as written, to be followed by a consonant ...
                    nodes.add(new SuffixSurfaceNode(suffixForm, surface.toString(), defineMorphemicAttributes(surface, attrs), EnumSet.of(PhoneticExpectation.ConsonantStart), suffixData, suffixForm.terminationType));
                    // ... then the voiced variant, to be followed by a vowel; never terminal.
                    surface.changeLast(Turkish.Alphabet.voice(token.letter));
                    nodes.add(new SuffixSurfaceNode(suffixForm, surface.toString(), defineMorphemicAttributes(surface, attrs), EnumSet.of(PhoneticExpectation.VowelStart), suffixData, TerminationType.NON_TERMINAL));
                }
                break;
            }
        }

        if (emitPlainNode && isLastToken) {
            nodes.add(new SuffixSurfaceNode(suffixForm, surface.toString(), defineMorphemicAttributes(surface, attrs), suffixForm.terminationType));
        }
    }
    return nodes;
}
Also used : TurkicLetter(zemberek.core.turkish.TurkicLetter) TurkishLetterSequence(zemberek.core.turkish.TurkishLetterSequence) ArrayList(java.util.ArrayList) PhoneticAttribute(zemberek.core.turkish.PhoneticAttribute) SuffixSurfaceNode(zemberek.morphology.lexicon.graph.SuffixSurfaceNode)

Example 5 with SuffixSurfaceNode

use of zemberek.morphology.lexicon.graph.SuffixSurfaceNode in project zemberek-nlp by ahmetaa.

the class WordAnalyzer method dumpTraverse.

/**
 * Debugging variant of the traversal: advances every token in {@code current} over all
 * successors whose surface form is a prefix of the token's remaining tail, printing the id
 * of each matching suffix form, and recurses on the newly created tokens until none remain.
 *
 * <p>A token that cannot be advanced is either recorded in {@code completed} (when its tail
 * is empty and it is terminal) or reported on standard output as failed.
 *
 * @param current   tokens to advance in this step; pruned via {@code pruneCyclicPaths} when
 *                  there are more than 50
 * @param completed receives the result of every token that finishes successfully
 */
private void dumpTraverse(List<Token> current, List<WordAnalysis> completed) {
    List<Token> frontier = current.size() > 50 ? pruneCyclicPaths(current) : current;
    List<Token> advanced = Lists.newArrayList();

    for (Token token : frontier) {
        boolean progressed = false;
        for (SuffixSurfaceNode successor : token.currentSurfaceNode.getSuccessors()) {
            if (!token.tail.startsWith(successor.surfaceForm)) {
                continue;
            }
            System.out.println(successor.getSuffixForm().getId());
            final Token next = token.getCopy(successor);
            // With input left any match is followed; with none left only a
            // successor that is not NON_TERMINAL may be followed.
            if (token.tail.length() > 0 || successor.termination != TerminationType.NON_TERMINAL) {
                advanced.add(next);
                progressed = true;
            }
        }
        if (progressed) {
            continue;
        }
        if (token.tail.length() == 0 && token.terminal) {
            completed.add(token.getResult());
        } else {
            System.out.println("Failed:" + token.getResult());
        }
    }

    if (advanced.isEmpty()) {
        return;
    }
    dumpTraverse(advanced, completed);
}
Also used : SuffixSurfaceNode(zemberek.morphology.lexicon.graph.SuffixSurfaceNode)

Aggregations

SuffixSurfaceNode (zemberek.morphology.lexicon.graph.SuffixSurfaceNode): 9, Test (org.junit.Test): 5, ArrayList (java.util.ArrayList): 2, TurkishLetterSequence (zemberek.core.turkish.TurkishLetterSequence): 2, IntValueMap (zemberek.core.collections.IntValueMap): 1, PhoneticAttribute (zemberek.core.turkish.PhoneticAttribute): 1, TurkicLetter (zemberek.core.turkish.TurkicLetter): 1