use of zemberek.core.turkish.PhoneticAttribute in project zemberek-nlp by ahmetaa.
the class StemTransitionsBase method handleSpecialRoots.
/**
 * Builds the stem transitions for dictionary items that need hand-written root handling.
 *
 * <p>The item is dispatched on its id. Depending on the id, the result holds the unmodified root
 * transition alone, or the unmodified root transition followed by one alternative-surface
 * transition (for example the root without its last letter, or "ban"/"san" for "ben"/"sen").
 *
 * @param item dictionary item whose id selects the special-case rule
 * @return a new mutable list with the unmodified transition first and, where the rule defines
 *     one, the modified transition second
 * @throws IllegalArgumentException if the item id is not one of the ids handled here
 * @throws IllegalStateException if a last-vowel-dropping item has a primary POS other than
 *     Noun, Adjective or PostPositive
 */
private List<StemTransition> handleSpecialRoots(DictionaryItem item) {
  String itemId = item.getId();
  // Both values are computed up front, before the id is inspected.
  AttributeSet<PhoneticAttribute> baseAttrs = calculateAttributes(item.pronunciation);
  MorphemeState defaultRootState = morphotactics.getRootState(item, baseAttrs);

  switch (itemId) {
    case "içeri_Noun":
    case "içeri_Adj":
    case "dışarı_Noun":
    case "dışarı_Adj":
    case "dışarı_Postp":
    case "yukarı_Noun":
    case "yukarı_Adj":
    case "ileri_Noun":
    case "şura_Noun":
    case "bura_Noun":
    case "ora_Noun": {
      StemTransition plain = new StemTransition(item.root, item, baseAttrs, defaultRootState);

      MorphemeState vowelDropState;
      switch (item.primaryPos) {
        case Noun:
          vowelDropState = morphotactics.nounLastVowelDropRoot_S;
          break;
        case Adjective:
        // TODO: check postpositive case. Maybe it is not required.
        case PostPositive:
          vowelDropState = morphotactics.adjLastVowelDropRoot_S;
          break;
        default:
          throw new IllegalStateException("No root morpheme state found for " + item);
      }

      // The alternative surface is the root without its last character.
      String shortened = item.root.substring(0, item.root.length() - 1);
      StemTransition altered =
          new StemTransition(shortened, item, calculateAttributes(shortened), vowelDropState);
      altered.getPhoneticAttributes().add(PhoneticAttribute.ExpectsConsonant);
      altered.getPhoneticAttributes().add(PhoneticAttribute.CannotTerminate);
      return Lists.newArrayList(plain, altered);
    }

    case "ben_Pron_Pers":
    case "sen_Pron_Pers": {
      StemTransition plain = new StemTransition(item.root, item, baseAttrs, defaultRootState);
      String alteredRoot = item.lemma.equals("ben") ? "ban" : "san";
      StemTransition altered = new StemTransition(
          alteredRoot, item, calculateAttributes(alteredRoot), morphotactics.pronPers_Mod_S);
      plain.getPhoneticAttributes().add(PhoneticAttribute.UnModifiedPronoun);
      altered.getPhoneticAttributes().add(PhoneticAttribute.ModifiedPronoun);
      return Lists.newArrayList(plain, altered);
    }

    case "demek_Verb":
    case "yemek_Verb": {
      StemTransition plain =
          new StemTransition(item.root, item, baseAttrs, morphotactics.vDeYeRoot_S);
      // Any lemma other than "demek" gets the "yi" surface.
      String alteredRoot = item.lemma.equals("demek") ? "di" : "yi";
      StemTransition altered = new StemTransition(
          alteredRoot, item, calculateAttributes(alteredRoot), morphotactics.vDeYeRoot_S);
      return Lists.newArrayList(plain, altered);
    }

    case "imek_Verb": {
      // Only the unmodified root is produced for this item.
      StemTransition plain =
          new StemTransition(item.root, item, baseAttrs, morphotactics.imekRoot_S);
      return Lists.newArrayList(plain);
    }

    case "birbiri_Pron_Quant":
    case "çoğu_Pron_Quant":
    case "öbürü_Pron_Quant":
    case "birçoğu_Pron_Quant": {
      StemTransition plain =
          new StemTransition(item.root, item, baseAttrs, morphotactics.pronQuant_S);

      String alteredRoot;
      if (item.lemma.equals("birbiri")) {
        alteredRoot = "birbir";
      } else if (item.lemma.equals("çoğu")) {
        alteredRoot = "çok";
      } else if (item.lemma.equals("öbürü")) {
        alteredRoot = "öbür";
      } else {
        // Every remaining lemma falls back to "birçok".
        alteredRoot = "birçok";
      }
      StemTransition altered = new StemTransition(
          alteredRoot, item, calculateAttributes(alteredRoot), morphotactics.pronQuantModified_S);

      plain.getPhoneticAttributes().add(PhoneticAttribute.UnModifiedPronoun);
      altered.getPhoneticAttributes().add(PhoneticAttribute.ModifiedPronoun);
      return Lists.newArrayList(plain, altered);
    }

    default:
      throw new IllegalArgumentException("Lexicon Item with special stem change cannot be handled:" + item);
  }
}
use of zemberek.core.turkish.PhoneticAttribute in project zemberek-nlp by ahmetaa.
the class AttributesHelper method getMorphemicAttributes.
/**
 * Computes the phonetic attribute set for a morpheme surface, given the attributes of what
 * precedes it.
 *
 * <p>For an empty sequence a copy of {@code predecessorAttrs} is returned. If the sequence
 * contains a vowel, last-letter, last-vowel and first-letter attributes are derived from the
 * sequence itself. Otherwise the predecessor attributes are copied, {@code NO_VOWEL_ATTRIBUTES}
 * is added and {@code LastLetterVowel} / {@code ExpectsConsonant} are removed. In both non-empty
 * cases the voicing attributes of the last letter are added at the end.
 *
 * @param seq surface form of the morpheme
 * @param predecessorAttrs attributes of the preceding part; never modified by this method
 * @return a new attribute set
 */
public static AttributeSet<PhoneticAttribute> getMorphemicAttributes(CharSequence seq, AttributeSet<PhoneticAttribute> predecessorAttrs) {
  if (seq.length() == 0) {
    return predecessorAttrs.copy();
  }

  AttributeSet<PhoneticAttribute> result = new AttributeSet<>();

  if (!alphabet.containsVowel(seq)) {
    // No vowel in this surface: carry the predecessor's attributes over, then adjust them.
    result.copyFrom(predecessorAttrs);
    result.addAll(NO_VOWEL_ATTRIBUTES);
    result.remove(LastLetterVowel);
    result.remove(ExpectsConsonant);
  } else {
    TurkicLetter finalLetter = alphabet.getLastLetter(seq);
    result.add(finalLetter.isVowel() ? LastLetterVowel : LastLetterConsonant);

    // When the sequence ends with a vowel, that letter is also the last vowel.
    TurkicLetter vowel;
    if (finalLetter.isVowel()) {
      vowel = finalLetter;
    } else {
      vowel = alphabet.getLastVowel(seq);
    }
    result.add(vowel.isFrontal() ? LastVowelFrontal : LastVowelBack);
    result.add(vowel.isRounded() ? LastVowelRounded : LastVowelUnrounded);

    result.add(alphabet.getFirstLetter(seq).isVowel() ? FirstLetterVowel : FirstLetterConsonant);
  }

  // Voicing of the last letter is added for every non-empty sequence.
  TurkicLetter tail = alphabet.getLastLetter(seq);
  if (!tail.isVoiceless()) {
    result.add(LastLetterVoiced);
    return result;
  }
  result.add(LastLetterVoiceless);
  if (tail.isStopConsonant()) {
    // kitap
    result.add(LastLetterVoicelessStop);
  }
  return result;
}
use of zemberek.core.turkish.PhoneticAttribute in project zemberek-nlp by ahmetaa.
the class StemNodeGenerator method handleSpecialStems.
/**
 * Creates the stem nodes for the irregular items "yemek", "demek", "ben" and "sen"
 * (e.g. demek-diyecek, beni-bana).
 *
 * <p>For the two verbs three nodes are returned: the terminal root ("ye"/"de"), a non-terminal
 * single-consonant stem ("y"/"d") and a non-terminal "yi"/"di" stem. For the two pronouns two
 * nodes are returned: the terminal root and a non-terminal "ban"/"san" stem. Each node gets its
 * own exclusive suffix data.
 *
 * @param item dictionary item, dispatched on its id
 * @return the stem nodes for the item, unmodified root first
 * @throws IllegalArgumentException if the item id is not one of the ids handled here
 */
private StemNode[] handleSpecialStems(DictionaryItem item) {
  TurkishSuffixes turkishSuffixes = (TurkishSuffixes) suffixProvider;
  String itemId = item.getId();

  switch (itemId) {
    case "yemek_Verb": {
      StemNode rootNode =
          new StemNode("ye", item, TerminationType.TERMINAL, calculateAttributes(item.root));
      rootNode.exclusiveSuffixData.add(turkishSuffixes.Verb_Ye.allConnections());

      // The "y" stem reuses the root's attributes, but is marked as ending in a consonant.
      EnumSet<PhoneticAttribute> consonantAttrs = calculateAttributes(item.root);
      consonantAttrs.remove(PhoneticAttribute.LastLetterVowel);
      consonantAttrs.add(PhoneticAttribute.LastLetterConsonant);
      StemNode consonantNode = new StemNode(
          "y", item, TerminationType.NON_TERMINAL, consonantAttrs,
          EnumSet.noneOf(PhoneticExpectation.class));
      consonantNode.exclusiveSuffixData.add(turkishSuffixes.Verb_De_Ye_Prog.allConnections());

      StemNode narrowedNode =
          new StemNode("yi", item, TerminationType.NON_TERMINAL, calculateAttributes(item.root));
      narrowedNode.exclusiveSuffixData.add(turkishSuffixes.Verb_Yi.allConnections());

      return new StemNode[] {rootNode, consonantNode, narrowedNode};
    }

    case "demek_Verb": {
      StemNode rootNode =
          new StemNode("de", item, TerminationType.TERMINAL, calculateAttributes(item.root));
      rootNode.exclusiveSuffixData.add(turkishSuffixes.Verb_De.allConnections());

      // The "d" stem reuses the root's attributes, but is marked as ending in a consonant.
      EnumSet<PhoneticAttribute> consonantAttrs = calculateAttributes(item.root);
      consonantAttrs.remove(PhoneticAttribute.LastLetterVowel);
      consonantAttrs.add(PhoneticAttribute.LastLetterConsonant);
      StemNode consonantNode = new StemNode(
          "d", item, TerminationType.NON_TERMINAL, consonantAttrs,
          EnumSet.noneOf(PhoneticExpectation.class));
      consonantNode.exclusiveSuffixData.add(turkishSuffixes.Verb_De_Ye_Prog.allConnections());

      StemNode narrowedNode =
          new StemNode("di", item, TerminationType.NON_TERMINAL, calculateAttributes(item.root));
      narrowedNode.exclusiveSuffixData.add(turkishSuffixes.Verb_Di.allConnections());

      return new StemNode[] {rootNode, consonantNode, narrowedNode};
    }

    case "ben_Pron_Pers":
    case "sen_Pron_Pers": {
      boolean firstPerson = item.lemma.equals("ben");

      StemNode rootNode =
          new StemNode(item.root, item, TerminationType.TERMINAL, calculateAttributes(item.root));
      if (firstPerson) {
        rootNode.exclusiveSuffixData.add(turkishSuffixes.PersPron_Ben.allConnections());
      } else {
        rootNode.exclusiveSuffixData.add(turkishSuffixes.PersPron_Sen.allConnections());
      }

      String alteredRoot = firstPerson ? "ban" : "san";
      StemNode alteredNode = new StemNode(
          alteredRoot, item, TerminationType.NON_TERMINAL, calculateAttributes(alteredRoot));
      alteredNode.exclusiveSuffixData.add(turkishSuffixes.PersPron_BanSan);

      return new StemNode[] {rootNode, alteredNode};
    }

    default:
      throw new IllegalArgumentException("Lexicon Item with special stem change cannot be handled:" + item);
  }
}
use of zemberek.core.turkish.PhoneticAttribute in project zemberek-nlp by ahmetaa.
the class StemTransitionGenerator method generate.
/**
 * Produces the StemTransition objects for a dictionary item.
 *
 * <p>Items whose id is listed in {@code specialRoots} are delegated to
 * {@code handleSpecialRoots}; items for which {@code hasModifierAttribute} is true are delegated
 * to {@code generateModifiedRootNodes}. Every other item yields exactly one transition built
 * from its root and the phonetic attributes of its pronunciation.
 *
 * @param item DictionaryItem
 * @return one or more StemTransition objects.
 */
public List<StemTransition> generate(DictionaryItem item) {
  if (specialRoots.contains(item.id)) {
    return handleSpecialRoots(item);
  }
  if (hasModifierAttribute(item)) {
    return generateModifiedRootNodes(item);
  }
  // Regular item: a single transition over the unmodified root.
  AttributeSet<PhoneticAttribute> attrs = calculateAttributes(item.pronunciation);
  return Lists.newArrayList(
      new StemTransition(item.root, item, attrs, morphotactics.getRootState(item, attrs)));
}
use of zemberek.core.turkish.PhoneticAttribute in project zemberek-nlp by ahmetaa.
the class RuleBasedAnalyzer method advance.
/**
 * Expands a search path by one step: for every outgoing transition of the path's current state
 * that is allowed and matches, a new path is generated.
 *
 * <p>A transition is skipped when the tail is empty but the transition has a surface form, when
 * its generated surface is not a prefix of the tail, or when the transition `conditions` do not
 * let the `search path` pass. In debug mode the reason for a rejection is recorded in
 * {@code debugData.rejectedTransitions}.
 *
 * @param path the path to expand
 * @return the new paths; empty if no transition can be taken
 */
private List<SearchPath> advance(SearchPath path) {
  List<SearchPath> expanded = new ArrayList<>(2);

  for (MorphemeTransition outgoing : path.currentState.getOutgoing()) {
    SuffixTransition candidate = (SuffixTransition) outgoing;

    // Nothing is left to consume, so a transition that produces a surface cannot match.
    if (path.tail.isEmpty() && candidate.hasSurfaceForm()) {
      if (debugMode) {
        debugData.rejectedTransitions.put(
            path, new RejectedTransition(candidate, "Empty surface expected."));
      }
      continue;
    }

    String surface = SurfaceTransition.generateSurface(candidate, path.phoneticAttributes);

    // The generated surface must be a prefix of the path's tail.
    boolean prefixMatches;
    if (asciiTolerant) {
      prefixMatches = TurkishAlphabet.INSTANCE.startsWithIgnoreDiacritics(path.tail, surface);
    } else {
      prefixMatches = path.tail.startsWith(surface);
    }
    if (!prefixMatches) {
      if (debugMode) {
        debugData.rejectedTransitions.put(
            path, new RejectedTransition(candidate, "Surface Mismatch:" + surface));
      }
      continue;
    }

    // Debug only: record which condition fails. The actual filtering happens in canPass below.
    if (debugMode && candidate.getCondition() != null) {
      Condition condition = candidate.getCondition();
      Condition failed = condition instanceof CombinedCondition
          ? ((CombinedCondition) condition).getFailingCondition(path)
          : (condition.accept(path) ? null : condition);
      if (failed != null) {
        debugData.rejectedTransitions.put(
            path, new RejectedTransition(candidate, "Condition → " + failed.toString()));
      }
    }

    if (!candidate.canPass(path)) {
      continue;
    }

    // Epsilon (empty) transition: keep the existing attributes and move on.
    if (!candidate.hasSurfaceForm()) {
      SurfaceTransition epsilon = new SurfaceTransition("", candidate);
      expanded.add(path.getCopy(epsilon, path.phoneticAttributes));
      continue;
    }

    SurfaceTransition surfaceTransition = new SurfaceTransition(surface, candidate);

    // When the surface equals the whole tail, the current attributes are copied instead of
    // being recalculated.
    boolean consumesWholeTail;
    if (asciiTolerant) {
      consumesWholeTail = TurkishAlphabet.INSTANCE.equalsIgnoreDiacritics(path.tail, surface);
    } else {
      consumesWholeTail = path.tail.equals(surface);
    }
    AttributeSet<PhoneticAttribute> attrs;
    if (consumesWholeTail) {
      attrs = path.phoneticAttributes.copy();
    } else {
      attrs = AttributesHelper.getMorphemicAttributes(surface, path.phoneticAttributes);
    }

    // Needed for suffixes like `cik` and `ciğ`: an extra attribute is added depending on the
    // last template token. If "cik" is generated, ExpectsConsonant is added so that only a
    // consonant-starting suffix can follow; if "ciğ" is generated, a vowel-starting suffix is
    // expected and the path is marked CannotTerminate.
    attrs.remove(PhoneticAttribute.CannotTerminate);
    SuffixTemplateToken lastToken = candidate.getLastTemplateToken();
    if (lastToken.type == TemplateTokenType.LAST_VOICED) {
      attrs.add(PhoneticAttribute.ExpectsConsonant);
    } else if (lastToken.type == TemplateTokenType.LAST_NOT_VOICED) {
      attrs.add(PhoneticAttribute.ExpectsVowel);
      attrs.add(PhoneticAttribute.CannotTerminate);
    }

    expanded.add(path.getCopy(surfaceTransition, attrs));
  }

  return expanded;
}
Aggregations