Use of zemberek.morphology.lexicon.graph.SuffixSurfaceNode in project zemberek-nlp by ahmetaa.
From the class WordAnalyzer, method pruneCyclicPaths.
/**
 * Filters out tokens whose surface node history repeats a suffix form too often, to prevent
 * excessive branching during search.
 *
 * <p>For each token, occurrences of every suffix form id in {@code surfaceNodeHistory} are
 * tallied; as soon as the value returned by {@code addOrIncrement} for an id exceeds
 * {@code MAX_REPEATING_SUFFIX_TYPE_COUNT}, the token is dropped. (Presumably that return value is
 * the updated occurrence count for the id; confirm against {@code IntValueMap}.)
 *
 * @param tokens candidate tokens to filter; this list is not modified
 * @return a new list holding the surviving tokens in their original order
 */
private List<Token> pruneCyclicPaths(List<Token> tokens) {
    List<Token> survivors = new ArrayList<>();
    nextToken:
    for (Token candidate : tokens) {
        // A fresh tally per token: repetition is judged within a single path only.
        IntValueMap<String> formTally = new IntValueMap<>(10);
        for (SuffixSurfaceNode visited : candidate.surfaceNodeHistory) {
            if (formTally.addOrIncrement(visited.getSuffixForm().id) > MAX_REPEATING_SUFFIX_TYPE_COUNT) {
                // Too many repetitions of one suffix form: skip this token entirely.
                continue nextToken;
            }
        }
        survivors.add(candidate);
    }
    return survivors;
}
Use of zemberek.morphology.lexicon.graph.SuffixSurfaceNode in project zemberek-nlp by ahmetaa.
From the class WordAnalyzer, method traverseSuffixes.
/**
 * Expands the given tokens level by level along their successor surface nodes and collects the
 * results of tokens that finish.
 *
 * <p>At each level:
 * <ul>
 *   <li>if more than 50 tokens are pending, they are first filtered through
 *       {@code pruneCyclicPaths};</li>
 *   <li>a token is advanced to every successor whose surface form is a prefix of the token's
 *       tail, provided the tail is non-empty or the successor is not
 *       {@code TerminationType.NON_TERMINAL};</li>
 *   <li>a token that could not be advanced, has an empty tail and is flagged {@code terminal}
 *       contributes its result to {@code completed}.</li>
 * </ul>
 * Processing stops when a level produces no new tokens.
 *
 * @param current tokens to start expanding from
 * @param completed output list that receives the results of finished tokens
 */
private void traverseSuffixes(List<Token> current, List<WordAnalysis> completed) {
    List<Token> frontier = current;
    do {
        if (frontier.size() > 50) {
            frontier = pruneCyclicPaths(frontier);
        }
        List<Token> expanded = Lists.newArrayList();
        for (Token path : frontier) {
            boolean advanced = false;
            for (SuffixSurfaceNode candidate : path.currentSurfaceNode.getSuccessors()) {
                if (!path.tail.startsWith(candidate.surfaceForm)) {
                    continue;
                }
                // With an empty tail only successors that may terminate are followed.
                if (path.tail.length() > 0 || candidate.termination != TerminationType.NON_TERMINAL) {
                    expanded.add(path.getCopy(candidate));
                    advanced = true;
                }
            }
            if (!advanced && path.tail.length() == 0 && path.terminal) {
                completed.add(path.getResult());
            }
        }
        frontier = expanded;
    } while (!frontier.isEmpty());
}
Use of zemberek.morphology.lexicon.graph.SuffixSurfaceNode in project zemberek-nlp by ahmetaa.
From the class SuffixSurfaceNodeGeneratorTest, method suffixFormIHarmonyTest.
/**
 * Checks how the "I" placeholder in a generation word is realized for two attribute sets:
 * "sIn" with back/unrounded attributes must yield "sın" (and carry the expected attributes),
 * and "sInIz" with back/rounded attributes must yield "sunuz".
 */
@Test
public void suffixFormIHarmonyTest() {
    SuffixSurfaceNodeGenerator generator = new SuffixSurfaceNodeGenerator();

    // Back + unrounded last vowel.
    SuffixSurfaceNode unroundedNode =
        getFirstNodeNoExpectatios(generator, set(LastVowelBack, LastVowelUnrounded), "sIn");
    Assert.assertEquals("sın", unroundedNode.surfaceForm);
    boolean hasExpectedAttributes = unroundedNode.getAttributes()
        .containsAll(Arrays.asList(LastLetterConsonant, LastVowelBack, LastVowelUnrounded));
    Assert.assertTrue(hasExpectedAttributes);

    // Back + rounded last vowel.
    SuffixSurfaceNode roundedNode =
        getFirstNodeNoExpectatios(generator, set(LastVowelBack, LastVowelRounded), "sInIz");
    Assert.assertEquals("sunuz", roundedNode.surfaceForm);
}
Use of zemberek.morphology.lexicon.graph.SuffixSurfaceNode in project zemberek-nlp by ahmetaa.
From the class SuffixSurfaceNodeGeneratorTest, method surfaceFormFunctionalTest.
/**
 * Table-driven check of surface form generation. Each {@code Triple} is
 * (predecessor word, generation word, expected surface form): the morphemic attributes are
 * derived from the predecessor, the first generated node for the generation word is fetched,
 * and its surface form is compared with the expectation.
 */
@Test
public void surfaceFormFunctionalTest() {
    Triple[] cases = {
        new Triple("kalem", "lAr", "ler"),
        new Triple("kalem", "lArA", "lere"),
        new Triple("kan", "lAr", "lar"),
        new Triple("kan", "lArAt", "larat"),
        new Triple("kan", "Ar", "ar"),
        new Triple("kaba", "lAr", "lar"),
        new Triple("kaba", "Ar", "r"),
        new Triple("kedi", "lAr", "ler"),
        new Triple("kedi", "lArA", "lere"),
        new Triple("kart", "lAr", "lar"),
        new Triple("a", "lAr", "lar"),
        new Triple("ee", "lAr", "ler"),
        new Triple("kalem", "lIk", "lik"),
        new Triple("kedi", "lIk", "lik"),
        new Triple("kabak", "lIk", "lık"),
        new Triple("kuzu", "lIk", "luk"),
        new Triple("göz", "lIk", "lük"),
        new Triple("gö", "lIk", "lük"),
        new Triple("ö", "lIk", "lük"),
        new Triple("kalem", "lArI", "leri"),
        new Triple("arı", "lArI", "ları"),
        new Triple("odun", "lArI", "ları"),
        new Triple("odun", "lIrA", "lura"),
        new Triple("kale", "+yA", "ye"),
        new Triple("kale", "+nA", "ne"),
        new Triple("kalem", "+yA", "e"),
        new Triple("kale", "+yI", "yi"),
        new Triple("kalem", "+yI", "i"),
        new Triple("kale", "+yIr", "yir"),
        new Triple("kale", "+yAr", "yer"),
        new Triple("kale", "+In", "n"),
        new Triple("kale", "+An", "n"),
        new Triple("kalem", "InA", "ine"),
        new Triple("kale", "InI", "ni"),
        new Triple("kitap", ">cA", "ça"),
        new Triple("sarraf", ">cA", "ça"),
        new Triple("makas", ">cA", "ça"),
        new Triple("tokat", ">cA", "ça"),
        new Triple("kaş", ">cA", "ça"),
        new Triple("fok", ">cA", "ça"),
        new Triple("gitar", ">cA", "ca"),
        new Triple("kalem", ">cA", "ce"),
        new Triple("kale", ">cA", "ce"),
        new Triple("kitap", ">dAn", "tan"),
        new Triple("gitar", ">dIn", "dın"),
        new Triple("kalem", ">dA", "de"),
        new Triple("kale", ">dArI", "deri"),
        new Triple("kale", "+y>cI", "yci"),
        new Triple("kitap", "+y>cI", "çı")
    };
    SuffixSurfaceNodeGenerator generator = new SuffixSurfaceNodeGenerator();
    for (Triple testCase : cases) {
        // Attributes come from the predecessor word; they drive how the generation word is realized.
        TurkishLetterSequence predecessorLetters =
            new TurkishLetterSequence(testCase.predecessor, alphabet);
        SuffixSurfaceNode generated = getFirstNodeNoExpectatios(
            generator,
            generator.defineMorphemicAttributes(predecessorLetters),
            testCase.generationWord);
        Assert.assertEquals("Error in:" + testCase, testCase.expectedSurface, generated.surfaceForm);
    }
}
Aggregations