Use of zemberek.morphology.morphotactics.MorphemeState in the project zemberek-nlp by ahmetaa.
From the class StemTransitionsBase, method generateModifiedRootNodes.
/**
 * Builds the stem transitions for a dictionary item whose root attributes may alter its stem.
 *
 * <p>A working copy of the item's pronunciation and two phonetic attribute sets (one for the
 * unmodified root, one for the modified root) are updated by each root attribute of the item,
 * in iteration order. Two transitions are then created: one from {@code dicItem.root} and one
 * from the altered working copy.
 *
 * @param dicItem dictionary item to generate transitions for
 * @return a single-element list with only the unmodified transition when the modified
 *     transition equals it; otherwise a list holding the unmodified transition followed by the
 *     modified one
 * @throws LexiconException if {@code Voicing} is applied and {@code alphabet.voice} returns the
 *     last character of the working stem unchanged
 */
private List<StemTransition> generateModifiedRootNodes(DictionaryItem dicItem) {
  final StringBuilder stem = new StringBuilder(dicItem.pronunciation);

  final AttributeSet<PhoneticAttribute> baseAttrs = calculateAttributes(dicItem.pronunciation);
  AttributeSet<PhoneticAttribute> alteredAttrs = baseAttrs.copy();

  // Only set by the verb branch of LastVowelDrop; otherwise resolved via morphotactics below.
  MorphemeState presetBaseState = null;
  MorphemeState presetAlteredState = null;

  for (RootAttribute rootAttr : dicItem.attributes) {
    // generate other boundary attributes and modified root state.
    switch (rootAttr) {
      case Voicing: {
        final char tail = alphabet.lastChar(stem);
        final char voicedTail = alphabet.voice(tail);
        if (voicedTail == tail) {
          throw new LexiconException("Voicing letter is not proper in:" + dicItem);
        }
        // Lemmas ending with "nk" always get 'g' as the replacement letter.
        final char replacement = dicItem.lemma.endsWith("nk") ? 'g' : voicedTail;
        stem.setCharAt(stem.length() - 1, replacement);
        alteredAttrs.remove(PhoneticAttribute.LastLetterVoicelessStop);
        baseAttrs.add(PhoneticAttribute.ExpectsConsonant);
        alteredAttrs.add(PhoneticAttribute.ExpectsVowel);
        // TODO: find a better way for this.
        alteredAttrs.add(PhoneticAttribute.CannotTerminate);
        break;
      }
      case Doubling: {
        // Repeat the current last character of the working stem.
        stem.append(alphabet.lastChar(stem));
        baseAttrs.add(PhoneticAttribute.ExpectsConsonant);
        alteredAttrs.add(PhoneticAttribute.ExpectsVowel);
        alteredAttrs.add(PhoneticAttribute.CannotTerminate);
        break;
      }
      case LastVowelDrop: {
        if (alphabet.getLastLetter(stem).isVowel()) {
          // The final letter is the vowel to drop.
          stem.deleteCharAt(stem.length() - 1);
          alteredAttrs.add(PhoneticAttribute.ExpectsConsonant);
        } else {
          // Otherwise the second-to-last character is removed.
          stem.deleteCharAt(stem.length() - 2);
          if (dicItem.primaryPos.equals(PrimaryPos.Verb)) {
            // Verbs use dedicated root states for both variants.
            presetBaseState = morphotactics.verbLastVowelDropUnmodRoot_S;
            presetAlteredState = morphotactics.verbLastVowelDropModRoot_S;
          } else {
            baseAttrs.add(PhoneticAttribute.ExpectsConsonant);
          }
          alteredAttrs.add(PhoneticAttribute.ExpectsVowel);
        }
        // Both branches mark the modified stem as unable to terminate.
        alteredAttrs.add(PhoneticAttribute.CannotTerminate);
        break;
      }
      case InverseHarmony: {
        // Both variants are flagged frontal instead of back.
        baseAttrs.add(PhoneticAttribute.LastVowelFrontal);
        baseAttrs.remove(PhoneticAttribute.LastVowelBack);
        alteredAttrs.add(PhoneticAttribute.LastVowelFrontal);
        alteredAttrs.remove(PhoneticAttribute.LastVowelBack);
        break;
      }
      case ProgressiveVowelDrop: {
        if (stem.length() <= 1) {
          // Stems of length 0 or 1 are left untouched.
          break;
        }
        stem.deleteCharAt(stem.length() - 1);
        if (alphabet.containsVowel(stem)) {
          // Recompute attributes only when the shortened stem still has a vowel.
          alteredAttrs = calculateAttributes(stem);
        }
        alteredAttrs.add(PhoneticAttribute.LastLetterDropped);
        break;
      }
      default:
        break;
    }
  }

  final MorphemeState baseState = presetBaseState != null
      ? presetBaseState
      : morphotactics.getRootState(dicItem, baseAttrs);
  final StemTransition baseTransition =
      new StemTransition(dicItem.root, dicItem, baseAttrs, baseState);

  // if modified root state is not defined in the switch block, get it from morphotactics.
  final MorphemeState alteredState = presetAlteredState != null
      ? presetAlteredState
      : morphotactics.getRootState(dicItem, alteredAttrs);
  final StemTransition alteredTransition =
      new StemTransition(stem.toString(), dicItem, alteredAttrs, alteredState);

  if (!baseTransition.equals(alteredTransition)) {
    return Lists.newArrayList(baseTransition, alteredTransition);
  }
  return Collections.singletonList(baseTransition);
}
Use of zemberek.morphology.morphotactics.MorphemeState in the project zemberek-nlp by ahmetaa.
From the class StemTransitionsBase, method handleSpecialRoots.
/**
 * Builds the stem transitions for a fixed set of dictionary items that are handled by id.
 *
 * <p>The item's id selects one of several hard-coded groups. Each group creates a transition
 * for {@code item.root} and, except for {@code imek_Verb}, a second transition for an alternate
 * root form.
 *
 * @param item dictionary item whose id is one of the supported special ids
 * @return the transitions for the item; one element for {@code imek_Verb}, two otherwise
 * @throws IllegalArgumentException if the item's id is not one of the handled ids
 * @throws IllegalStateException if the item is in the first id group below (içeri, dışarı, ...)
 *     but its primary POS is not Noun, Adjective or PostPositive
 */
private List<StemTransition> handleSpecialRoots(DictionaryItem item) {
  final String itemId = item.getId();
  final AttributeSet<PhoneticAttribute> baseAttrs = calculateAttributes(item.pronunciation);
  // Resolved before dispatching, for every id, even though only some groups use it.
  final MorphemeState defaultRootState = morphotactics.getRootState(item, baseAttrs);

  switch (itemId) {
    case "içeri_Noun":
    case "içeri_Adj":
    case "dışarı_Adj":
    case "dışarı_Noun":
    case "dışarı_Postp":
    case "yukarı_Noun":
    case "ileri_Noun":
    case "yukarı_Adj":
    case "şura_Noun":
    case "bura_Noun":
    case "ora_Noun": {
      final StemTransition unchanged =
          new StemTransition(item.root, item, baseAttrs, defaultRootState);

      final MorphemeState droppedState;
      switch (item.primaryPos) {
        case Noun:
          droppedState = morphotactics.nounLastVowelDropRoot_S;
          break;
        case Adjective:
        // TODO: check postpositive case. Maybe it is not required.
        case PostPositive:
          droppedState = morphotactics.adjLastVowelDropRoot_S;
          break;
        default:
          throw new IllegalStateException("No root morpheme state found for " + item);
      }

      // Alternate form is the root without its last character.
      final String shortened = item.root.substring(0, item.root.length() - 1);
      final StemTransition dropped =
          new StemTransition(shortened, item, calculateAttributes(shortened), droppedState);
      dropped.getPhoneticAttributes().add(PhoneticAttribute.ExpectsConsonant);
      dropped.getPhoneticAttributes().add(PhoneticAttribute.CannotTerminate);
      return Lists.newArrayList(unchanged, dropped);
    }

    case "ben_Pron_Pers":
    case "sen_Pron_Pers": {
      final StemTransition unchanged =
          new StemTransition(item.root, item, baseAttrs, defaultRootState);
      // "ben" maps to "ban"; the other id in this group maps to "san".
      final String altRoot = item.lemma.equals("ben") ? "ban" : "san";
      final StemTransition altered =
          new StemTransition(altRoot, item, calculateAttributes(altRoot), morphotactics.pronPers_Mod_S);
      unchanged.getPhoneticAttributes().add(PhoneticAttribute.UnModifiedPronoun);
      altered.getPhoneticAttributes().add(PhoneticAttribute.ModifiedPronoun);
      return Lists.newArrayList(unchanged, altered);
    }

    case "demek_Verb":
    case "yemek_Verb": {
      final StemTransition unchanged =
          new StemTransition(item.root, item, baseAttrs, morphotactics.vDeYeRoot_S);
      // "demek" maps to "di"; the other id in this group maps to "yi".
      final String altRoot = item.lemma.equals("demek") ? "di" : "yi";
      final StemTransition altered =
          new StemTransition(altRoot, item, calculateAttributes(altRoot), morphotactics.vDeYeRoot_S);
      return Lists.newArrayList(unchanged, altered);
    }

    case "imek_Verb": {
      // Only the unmodified root is produced for this id.
      final StemTransition unchanged =
          new StemTransition(item.root, item, baseAttrs, morphotactics.imekRoot_S);
      return Lists.newArrayList(unchanged);
    }

    case "birbiri_Pron_Quant":
    case "çoğu_Pron_Quant":
    case "öbürü_Pron_Quant":
    case "birçoğu_Pron_Quant": {
      final StemTransition unchanged =
          new StemTransition(item.root, item, baseAttrs, morphotactics.pronQuant_S);

      // Pick the alternate root by lemma; anything not listed falls back to "birçok".
      final String lemma = item.lemma;
      final String altRoot;
      if (lemma.equals("birbiri")) {
        altRoot = "birbir";
      } else if (lemma.equals("çoğu")) {
        altRoot = "çok";
      } else if (lemma.equals("öbürü")) {
        altRoot = "öbür";
      } else {
        altRoot = "birçok";
      }

      final StemTransition altered = new StemTransition(
          altRoot, item, calculateAttributes(altRoot), morphotactics.pronQuantModified_S);
      unchanged.getPhoneticAttributes().add(PhoneticAttribute.UnModifiedPronoun);
      altered.getPhoneticAttributes().add(PhoneticAttribute.ModifiedPronoun);
      return Lists.newArrayList(unchanged, altered);
    }

    default:
      throw new IllegalArgumentException("Lexicon Item with special stem change cannot be handled:" + item);
  }
}
Aggregations