use of zemberek.morphology.morphotactics.StemTransition in project zemberek-nlp by ahmetaa.
the class WordGenerator method generate.
/**
 * Runs word generation for the given stem candidates.
 *
 * <p>One {@code GenerationPath} is seeded per candidate, the seeded paths are handed to
 * {@code search}, and every path that comes back is converted to a {@code SingleAnalysis}
 * whose surface form is wrapped in a {@code Result}. When {@code debugMode} is set, the input,
 * the candidates and the produced analyses are also recorded in a fresh {@code debugData}.
 *
 * @param input      the input text; only stored in the debug data here
 * @param candidates stem transitions to start generation from
 * @param morphemes  morphemes the generated word should go through, in order
 * @return one result per path returned by {@code search}
 */
private List<Result> generate(String input, List<StemTransition> candidates, List<Morpheme> morphemes) {
  if (debugMode) {
    debugData = new AnalysisDebugData();
    debugData.input = input;
    debugData.candidateStemTransitions.addAll(candidates);
  }

  // Seed one generation path per candidate stem.
  List<GenerationPath> seeds = new ArrayList<>();
  for (StemTransition stem : candidates) {
    // The tail is " " because some morphotactics conditions inspect the tail's size during
    // the graph walk; for generation that condition should always pass.
    SearchPath start = SearchPath.initialPath(stem, " ");

    List<Morpheme> remaining;
    if (morphemes.size() <= 0) {
      remaining = new ArrayList<>(0);
    } else if (morphemes.get(0).equals(start.getCurrentState().morpheme)) {
      // The first requested morpheme is already covered by the initial state; skip it.
      remaining = morphemes.subList(1, morphemes.size());
    } else {
      remaining = new ArrayList<>(morphemes);
    }
    seeds.add(new GenerationPath(start, remaining));
  }

  // Walk the graph.
  List<GenerationPath> successful = search(seeds);

  // Turn every successful path into a result.
  List<Result> results = new ArrayList<>(successful.size());
  for (GenerationPath generated : successful) {
    SingleAnalysis analysis = SingleAnalysis.fromSearchPath(generated.path);
    String surface = analysis.surfaceForm();
    results.add(new Result(surface, analysis));
    if (debugMode) {
      debugData.results.add(analysis);
    }
  }
  return results;
}
use of zemberek.morphology.morphotactics.StemTransition in project zemberek-nlp by ahmetaa.
the class StemTransitionTrieBasedTest method testPrefix.
/**
 * Checks {@code getPrefixMatches} on the test lexicon: for each queried word the number of
 * returned stem transitions and the lemmas they belong to.
 */
@Test
public void testPrefix() {
  RootLexicon lexicon = getLexicon();
  TurkishMorphotactics morphotactics = new TurkishMorphotactics(lexicon);
  StemTransitionsTrieBased trie = new StemTransitionsTrieBased(lexicon, morphotactics);

  // "kabağa"
  List<StemTransition> forKabaga = trie.getPrefixMatches("kabağa", false);
  Assert.assertEquals(3, forKabaga.size());
  Set<String> kabagaLemmas = forKabaga.stream()
      .map(transition -> transition.item.lemma)
      .collect(Collectors.toSet());
  Assert.assertTrue(TestUtil.containsAll(kabagaLemmas, "kaba", "kabağ", "kabak"));

  // "kabak"
  List<StemTransition> forKabak = trie.getPrefixMatches("kabak", false);
  Assert.assertEquals(2, forKabak.size());
  Set<String> kabakLemmas = forKabak.stream()
      .map(transition -> transition.item.lemma)
      .collect(Collectors.toSet());
  Assert.assertTrue(TestUtil.containsAll(kabakLemmas, "kaba", "kabak"));

  // "kapak"
  List<StemTransition> forKapak = trie.getPrefixMatches("kapak", false);
  Assert.assertEquals(3, forKapak.size());
  Set<String> kapakLemmas = forKapak.stream()
      .map(transition -> transition.item.lemma)
      .collect(Collectors.toSet());
  Assert.assertTrue(TestUtil.containsAll(kapakLemmas, "kapak"));
}
use of zemberek.morphology.morphotactics.StemTransition in project zemberek-nlp by ahmetaa.
the class StemTransitionTrieBasedTest method testItem.
/**
 * Checks that looking up the transitions of the dictionary item {@code kapak_Noun} yields
 * exactly two stem transitions, with surfaces "kapak" and "kapağ".
 */
@Test
public void testItem() {
  RootLexicon lexicon = getLexicon();
  TurkishMorphotactics morphotactics = new TurkishMorphotactics(lexicon);
  StemTransitionsTrieBased trie = new StemTransitionsTrieBased(lexicon, morphotactics);

  DictionaryItem kapakNoun = lexicon.getItemById("kapak_Noun");
  List<StemTransition> found = trie.getTransitions(kapakNoun);
  Assert.assertEquals(2, found.size());

  Set<String> surfaceForms = found.stream()
      .map(transition -> transition.surface)
      .collect(Collectors.toSet());
  Assert.assertTrue(TestUtil.containsAll(surfaceForms, "kapak", "kapağ"));
}
use of zemberek.morphology.morphotactics.StemTransition in project zemberek-nlp by ahmetaa.
the class StemTransitionsMapBased method generateAsciiTolerantMap.
// TODO: this is kind of a hack. Because StemTransitions may be shared between
// analyzer classes, this may be necessary when one of them happens to be ascii tolerant
// and other is not.
/**
 * Rebuilds {@code asciiKeys}, a multimap from the ASCII form of a stem surface to the
 * original surface(s) it was derived from. Only surfaces for which
 * {@code containsAsciiRelated} returns true are recorded. Keys come from
 * {@code singleStems} and from the surfaces of the transitions in {@code multiStems}.
 *
 * <p>The whole rebuild, including the allocation of the new multimap, runs under the write
 * lock; the lock is released in {@code finally} so that a failure at any point cannot leave
 * it held.
 */
private void generateAsciiTolerantMap() {
  lock.writeLock().lock();
  try {
    // Allocate inside the try block: anything that throws after lock() must still reach
    // the unlock() in finally.
    asciiKeys = HashMultimap.create(1000, 2);
    TurkishAlphabet alphabet = TurkishAlphabet.INSTANCE;

    // generate MultiMap for ascii tolerant keys
    for (String s : singleStems.keySet()) {
      if (alphabet.containsAsciiRelated(s)) {
        asciiKeys.put(alphabet.toAscii(s), s);
      }
    }
    for (StemTransition st : multiStems.values()) {
      String s = st.surface;
      if (alphabet.containsAsciiRelated(s)) {
        asciiKeys.put(alphabet.toAscii(s), s);
      }
    }
  } finally {
    lock.writeLock().unlock();
  }
}
use of zemberek.morphology.morphotactics.StemTransition in project zemberek-nlp by ahmetaa.
the class StemTransitionsBase method generateModifiedRootNodes.
/**
 * Builds the stem transitions of a dictionary item whose root attributes may alter its stem.
 *
 * <p>Two attribute sets are derived from {@code dicItem.pronunciation}: one for the unmodified
 * stem and a copy for the modified stem. A working copy of the pronunciation is then edited,
 * and both attribute sets adjusted, once per entry of {@code dicItem.attributes}, in iteration
 * order. Finally an "original" transition (surface {@code dicItem.root}) and a "modified"
 * transition (surface = the edited sequence) are created.
 *
 * @param dicItem the dictionary item to generate transitions for
 * @return a single-element list holding the original transition when it equals the modified
 *     one, otherwise a list of the original followed by the modified transition
 * @throws LexiconException if the item has the Voicing attribute but {@code alphabet.voice}
 *     returns the last character unchanged
 */
private List<StemTransition> generateModifiedRootNodes(DictionaryItem dicItem) {
// Working copy of the pronunciation; the cases below edit it in place.
StringBuilder modifiedSeq = new StringBuilder(dicItem.pronunciation);
AttributeSet<PhoneticAttribute> originalAttrs = calculateAttributes(dicItem.pronunciation);
AttributeSet<PhoneticAttribute> modifiedAttrs = originalAttrs.copy();
// Root states stay null unless a case below picks a specific one; null means
// "ask morphotactics" after the loop.
MorphemeState modifiedRootState = null;
MorphemeState unmodifiedRootState = null;
for (RootAttribute attribute : dicItem.attributes) {
// generate other boundary attributes and modified root state.
switch(attribute) {
case Voicing:
// Replace the last character with the result of alphabet.voice(last).
char last = alphabet.lastChar(modifiedSeq);
char voiced = alphabet.voice(last);
if (last == voiced) {
throw new LexiconException("Voicing letter is not proper in:" + dicItem);
}
// Lemmas ending in "nk" always get 'g', overriding what alphabet.voice returned.
if (dicItem.lemma.endsWith("nk")) {
voiced = 'g';
}
modifiedSeq.setCharAt(modifiedSeq.length() - 1, voiced);
modifiedAttrs.remove(PhoneticAttribute.LastLetterVoicelessStop);
originalAttrs.add(PhoneticAttribute.ExpectsConsonant);
modifiedAttrs.add(PhoneticAttribute.ExpectsVowel);
// TODO: find a better way for this.
modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
break;
case Doubling:
// Append a second copy of the current last character.
modifiedSeq.append(alphabet.lastChar(modifiedSeq));
originalAttrs.add(PhoneticAttribute.ExpectsConsonant);
modifiedAttrs.add(PhoneticAttribute.ExpectsVowel);
modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
break;
case LastVowelDrop:
TurkicLetter lastLetter = alphabet.getLastLetter(modifiedSeq);
if (lastLetter.isVowel()) {
// Sequence ends in a vowel: drop that final character.
modifiedSeq.deleteCharAt(modifiedSeq.length() - 1);
modifiedAttrs.add(PhoneticAttribute.ExpectsConsonant);
modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
} else {
// Sequence does not end in a vowel: drop the second-to-last character.
// NOTE(review): no length guard here; a one-character sequence would make
// deleteCharAt throw. Presumably the lexicon never contains such an item - confirm.
modifiedSeq.deleteCharAt(modifiedSeq.length() - 2);
if (!dicItem.primaryPos.equals(PrimaryPos.Verb)) {
originalAttrs.add(PhoneticAttribute.ExpectsConsonant);
} else {
// Verbs use dedicated root states instead of the attribute on the original stem.
unmodifiedRootState = morphotactics.verbLastVowelDropUnmodRoot_S;
modifiedRootState = morphotactics.verbLastVowelDropModRoot_S;
}
modifiedAttrs.add(PhoneticAttribute.ExpectsVowel);
modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
}
break;
case InverseHarmony:
// Does not change the sequence; both stems are marked frontal instead of back.
originalAttrs.add(PhoneticAttribute.LastVowelFrontal);
originalAttrs.remove(PhoneticAttribute.LastVowelBack);
modifiedAttrs.add(PhoneticAttribute.LastVowelFrontal);
modifiedAttrs.remove(PhoneticAttribute.LastVowelBack);
break;
case ProgressiveVowelDrop:
// Only applied when more than one character is present.
if (modifiedSeq.length() > 1) {
modifiedSeq.deleteCharAt(modifiedSeq.length() - 1);
if (alphabet.containsVowel(modifiedSeq)) {
// NOTE(review): this replaces modifiedAttrs wholesale, so changes made to it by
// attributes handled earlier in the loop are discarded - confirm this is intended.
modifiedAttrs = calculateAttributes(modifiedSeq);
}
modifiedAttrs.add(PhoneticAttribute.LastLetterDropped);
}
break;
default:
// Any other root attribute leaves the sequence and both attribute sets untouched.
break;
}
}
// if unmodified root state is not defined in the switch block, get it from morphotactics.
if (unmodifiedRootState == null) {
unmodifiedRootState = morphotactics.getRootState(dicItem, originalAttrs);
}
// The original transition uses dicItem.root as its surface, not the pronunciation.
StemTransition original = new StemTransition(dicItem.root, dicItem, originalAttrs, unmodifiedRootState);
// if modified root state is not defined in the switch block, get it from morphotactics.
if (modifiedRootState == null) {
modifiedRootState = morphotactics.getRootState(dicItem, modifiedAttrs);
}
StemTransition modified = new StemTransition(modifiedSeq.toString(), dicItem, modifiedAttrs, modifiedRootState);
// When both transitions compare equal, return only one of them.
if (original.equals(modified)) {
return Collections.singletonList(original);
}
return Lists.newArrayList(original, modified);
}
Aggregations