Use of zemberek.morphology.lexicon.graph.SuffixSurfaceNode in the project zemberek-nlp by ahmetaa.
From the class SuffixSurfaceNodeGeneratorTest, method emptyFormTest.
/**
 * Checks the node produced for an empty generation string: its surface form
 * must be empty and its attributes must include every attribute that was
 * passed in.
 */
@Test
public void emptyFormTest() {
    SuffixSurfaceNodeGenerator generator = new SuffixSurfaceNodeGenerator();
    SuffixSurfaceNode node = getFirstNodeNoExpectatios(
            generator,
            set(LastVowelBack, LastVowelRounded, LastLetterConsonant),
            "");

    Assert.assertEquals("", node.surfaceForm);

    // All three input attributes are expected on the resulting node.
    boolean carriesInputAttributes = node.getAttributes().containsAll(
            Arrays.asList(LastVowelBack, LastVowelRounded, LastLetterConsonant));
    Assert.assertTrue(carriesInputAttributes);
}
Use of zemberek.morphology.lexicon.graph.SuffixSurfaceNode in the project zemberek-nlp by ahmetaa.
From the class SuffixSurfaceNodeGeneratorTest, method novowelFormTest.
/**
 * Checks the node produced for the vowel-less generation string "m" when the
 * incoming attributes say the previous letter is a vowel: the surface form
 * must be "m", the vowel attributes must be retained, and the node must be
 * marked as ending in a consonant.
 */
@Test
public void novowelFormTest() {
    SuffixSurfaceNodeGenerator generator = new SuffixSurfaceNodeGenerator();
    SuffixSurfaceNode node = getFirstNodeNoExpectatios(
            generator,
            set(LastVowelBack, LastVowelRounded, LastLetterVowel),
            "m");

    Assert.assertEquals("m", node.surfaceForm);

    // Input carried LastLetterVowel; the expectation here is LastLetterConsonant.
    boolean hasExpectedAttributes = node.getAttributes().containsAll(
            Arrays.asList(LastVowelBack, LastVowelRounded, LastLetterConsonant));
    Assert.assertTrue(hasExpectedAttributes);
}
Use of zemberek.morphology.lexicon.graph.SuffixSurfaceNode in the project zemberek-nlp by ahmetaa.
From the class SuffixSurfaceNodeGeneratorTest, method suffixFormAHarmonyTest.
/**
 * Checks how the "A" placeholder in the generation string "lAr" is resolved:
 * "lar" is expected for back-vowel attribute sets and "ler" for a
 * frontal-vowel attribute set.
 */
@Test
public void suffixFormAHarmonyTest() {
    SuffixSurfaceNodeGenerator generator = new SuffixSurfaceNodeGenerator();

    // Back vowel only: expect "lar" plus the attributes of the generated form.
    SuffixSurfaceNode node = getFirstNodeNoExpectatios(generator, set(LastVowelBack), "lAr");
    Assert.assertEquals("lar", node.surfaceForm);
    boolean hasExpectedAttributes = node.getAttributes().containsAll(
            Arrays.asList(LastLetterConsonant, LastVowelBack, LastVowelUnrounded));
    Assert.assertTrue(hasExpectedAttributes);

    // Back + rounded: still "lar".
    node = getFirstNodeNoExpectatios(generator, set(LastVowelBack, LastVowelRounded), "lAr");
    Assert.assertEquals("lar", node.surfaceForm);

    // Frontal + rounded: "ler".
    node = getFirstNodeNoExpectatios(generator, set(LastVowelFrontal, LastVowelRounded), "lAr");
    Assert.assertEquals("ler", node.surfaceForm);
}
Use of zemberek.morphology.lexicon.graph.SuffixSurfaceNode in the project zemberek-nlp by ahmetaa.
From the class SuffixSurfaceNodeGenerator, method generate.
/**
 * Builds the surface node(s) for a suffix form given the phonetic attributes
 * of what precedes it.
 *
 * <p>The form's {@code generation} string is tokenized and the tokens are
 * appended one by one to a letter sequence. Before each token the current
 * attributes are recomputed with {@code defineMorphemicAttributes}. A node is
 * only created while handling the last token, so normally one node is
 * returned; a trailing {@code VOICE_LAST} token (e.g. cI~k) yields two.
 *
 * <ul>
 *   <li>{@code LETTER}: the letter is appended as is.</li>
 *   <li>{@code A_WOVEL} / {@code I_WOVEL}: skipped entirely when it is the
 *       first token and {@code attrs} contains {@code LastLetterVowel} (no
 *       node is created for a skipped token, even if it is the last one);
 *       otherwise resolved from the back/frontal and rounded/unrounded
 *       attributes.</li>
 *   <li>{@code APPEND}: the letter is appended only when the current
 *       attributes contain {@code LastLetterVowel}.</li>
 *   <li>{@code DEVOICE_FIRST}: the letter is devoiced when the current
 *       attributes contain {@code LastLetterVoiceless}.</li>
 *   <li>{@code VOICE_LAST}: the letter is appended; as last token it produces
 *       a node expecting a consonant start and a second, non-terminal node
 *       with the last letter voiced that expects a vowel start.</li>
 * </ul>
 *
 * <p>Note that {@code expectations} is only used (cloned) for the empty form,
 * and {@code suffixData} only for the empty form and the {@code VOICE_LAST}
 * nodes; the other nodes are built without them.
 *
 * @param attrs phonetic attributes of the preceding content
 * @param expectations phonetic expectations, copied into the empty-form node
 * @param suffixData suffix data passed to the empty-form and VOICE_LAST nodes
 * @param suffixForm the form whose {@code generation} string is expanded
 * @return the generated nodes, in creation order
 * @throws IllegalArgumentException if an A or I vowel cannot be resolved from
 *     the current attributes
 */
public List<SuffixSurfaceNode> generate(EnumSet<PhoneticAttribute> attrs, EnumSet<PhoneticExpectation> expectations, SuffixData suffixData, SuffixForm suffixForm) {
    List<SuffixToken> tokens = Lists.newArrayList(new SuffixStringTokenizer(suffixForm.generation));

    // Nothing to generate: a single node with an empty surface form.
    if (tokens.isEmpty()) {
        return Lists.newArrayList(new SuffixSurfaceNode(suffixForm, "", attrs.clone(), expectations.clone(), suffixData, suffixForm.terminationType));
    }

    List<SuffixSurfaceNode> result = new ArrayList<SuffixSurfaceNode>(1);
    TurkishLetterSequence letters = new TurkishLetterSequence();
    final int lastPosition = tokens.size() - 1;
    int position = 0;

    for (SuffixToken token : tokens) {
        final boolean isFirst = position == 0;
        final boolean isLast = position == lastPosition;
        position++;

        // Attributes as seen before this token is applied.
        EnumSet<PhoneticAttribute> current = defineMorphemicAttributes(letters, attrs);

        switch (token.type) {
            case LETTER:
                letters.append(token.letter);
                break;

            case A_WOVEL: {
                if (isFirst && attrs.contains(LastLetterVowel)) {
                    // Vowel is dropped after a vowel; no node is emitted for it.
                    continue;
                }
                TurkicLetter aVowel = current.contains(LastVowelBack)
                        ? L_a
                        : (current.contains(LastVowelFrontal) ? L_e : TurkicLetter.UNDEFINED);
                if (aVowel == TurkicLetter.UNDEFINED) {
                    throw new IllegalArgumentException("Cannot generate A form!");
                }
                letters.append(aVowel);
                break;
            }

            case I_WOVEL: {
                if (isFirst && attrs.contains(LastLetterVowel)) {
                    // Vowel is dropped after a vowel; no node is emitted for it.
                    continue;
                }
                final boolean back = current.contains(LastVowelBack);
                final boolean frontal = current.contains(LastVowelFrontal);
                final boolean rounded = current.contains(LastVowelRounded);
                final boolean unrounded = current.contains(LastVowelUnrounded);

                TurkicLetter iVowel = TurkicLetter.UNDEFINED;
                if (back && rounded) {
                    iVowel = L_u;
                } else if (back && unrounded) {
                    iVowel = L_ii;
                } else if (frontal && rounded) {
                    iVowel = L_uu;
                } else if (frontal && unrounded) {
                    iVowel = L_i;
                }
                if (iVowel == TurkicLetter.UNDEFINED) {
                    throw new IllegalArgumentException("Cannot generate I form!");
                }
                letters.append(iVowel);
                break;
            }

            case APPEND:
                if (current.contains(LastLetterVowel)) {
                    letters.append(token.letter);
                }
                break;

            case DEVOICE_FIRST:
                letters.append(current.contains(LastLetterVoiceless)
                        ? Turkish.Alphabet.devoice(token.letter)
                        : token.letter);
                break;

            case VOICE_LAST:
                letters.append(token.letter);
                if (isLast) {
                    // Unvoiced variant, to be followed by a consonant.
                    result.add(new SuffixSurfaceNode(suffixForm, letters.toString(), defineMorphemicAttributes(letters, attrs), EnumSet.of(PhoneticExpectation.ConsonantStart), suffixData, suffixForm.terminationType));
                    // Voiced variant, to be followed by a vowel; never terminal.
                    letters.changeLast(Turkish.Alphabet.voice(token.letter));
                    result.add(new SuffixSurfaceNode(suffixForm, letters.toString(), defineMorphemicAttributes(letters, attrs), EnumSet.of(PhoneticExpectation.VowelStart), suffixData, TerminationType.NON_TERMINAL));
                }
                continue;

            default:
                // Token types not listed above contribute nothing.
                continue;
        }

        // Shared tail for the single-node token types.
        if (isLast) {
            result.add(new SuffixSurfaceNode(suffixForm, letters.toString(), defineMorphemicAttributes(letters, attrs), suffixForm.terminationType));
        }
    }
    return result;
}
Use of zemberek.morphology.lexicon.graph.SuffixSurfaceNode in the project zemberek-nlp by ahmetaa.
From the class WordAnalyzer, method dumpTraverse.
/**
 * Advances every token in {@code current} by one suffix node, printing the id
 * of each matching suffix form and every dead end to standard output, then
 * recurses on the advanced tokens until none are left.
 *
 * <p>A successor matches when the token's tail starts with its surface form.
 * A matching successor is followed if the tail is non-empty, or if the tail
 * is empty and the successor is not {@code NON_TERMINAL}. Tokens that could
 * not be advanced are added to {@code completed} when their tail is empty and
 * they are terminal; otherwise a "Failed:" line is printed.
 *
 * @param current tokens to advance; pruned with {@code pruneCyclicPaths}
 *     when there are more than 50
 * @param completed receives the results of finished tokens
 */
private void dumpTraverse(List<Token> current, List<WordAnalysis> completed) {
    if (current.size() > 50) {
        current = pruneCyclicPaths(current);
    }

    List<Token> advancedTokens = Lists.newArrayList();
    for (Token token : current) {
        boolean advanced = false;

        for (SuffixSurfaceNode candidate : token.currentSurfaceNode.getSuccessors()) {
            if (!token.tail.startsWith(candidate.surfaceForm)) {
                continue;
            }
            System.out.println(candidate.getSuffixForm().getId());
            final Token next = token.getCopy(candidate);

            // With nothing left to consume, only non-NON_TERMINAL nodes are followed.
            if (token.tail.length() > 0 || candidate.termination != TerminationType.NON_TERMINAL) {
                advancedTokens.add(next);
                advanced = true;
            }
        }

        if (advanced) {
            continue;
        }
        if (token.tail.length() == 0 && token.terminal) {
            completed.add(token.getResult());
        } else {
            System.out.println("Failed:" + token.getResult());
        }
    }

    if (!advancedTokens.isEmpty()) {
        dumpTraverse(advancedTokens, completed);
    }
}
Aggregations