Use of zemberek.core.turkish.PhoneticAttribute in the zemberek-nlp project by ahmetaa.
From the class SuffixSurfaceNode, method printAttributes.
/**
 * Appends the phonetic attributes held by this node to {@code sb}, formatted as
 * {@code " [A:first, second, ...]"}. Nothing is appended when {@code attributes} is empty.
 *
 * @param sb builder that receives the formatted attribute list.
 */
private void printAttributes(StringBuilder sb) {
  if (attributes.isEmpty()) {
    return;
  }
  sb.append(" [A:");
  // Emit the separator before every element except the first one.
  boolean first = true;
  for (PhoneticAttribute attribute : attributes) {
    if (!first) {
      sb.append(", ");
    }
    sb.append(attribute.getStringForm());
    first = false;
  }
  sb.append("]");
}
Use of zemberek.core.turkish.PhoneticAttribute in the zemberek-nlp project by ahmetaa.
From the class StemNodeGenerator, method generateModifiedRootNodes.
/**
 * Generates the stem nodes of a dictionary item whose root may change form.
 * <p>Two candidates are built: an "original" node that uses {@code dicItem.root} and a
 * "modified" node whose surface is derived from {@code dicItem.pronunciation} by applying,
 * in iteration order, every root attribute of the item. Each attribute may also adjust the
 * phonetic attributes and phonetic expectations of one or both candidates.
 * <p>Items carrying the {@code Special} attribute are delegated to {@code handleSpecialStems}.
 *
 * @param dicItem dictionary item to generate stem nodes for.
 * @return the result of {@code handleSpecialStems} for special items; otherwise an array
 *     holding only the original node when both candidates are equal, or the original
 *     followed by the modified node.
 * @throws LexiconException if the item has the Voicing attribute but {@code alphabet.voice}
 *     returns null for the last letter.
 */
private StemNode[] generateModifiedRootNodes(DictionaryItem dicItem) {
// Special items bypass the generic modification logic entirely.
if (dicItem.hasAttribute(Special)) {
return handleSpecialStems(dicItem);
}
// Working copy of the pronunciation; this is the sequence that gets mutated below.
TurkishLetterSequence modifiedSeq = new TurkishLetterSequence(dicItem.pronunciation, alphabet);
// Both attribute sets start from the attributes of the unmodified pronunciation.
EnumSet<PhoneticAttribute> originalAttrs = calculateAttributes(dicItem.pronunciation);
EnumSet<PhoneticAttribute> modifiedAttrs = originalAttrs.clone();
EnumSet<PhoneticExpectation> originalExpectations = EnumSet.noneOf(PhoneticExpectation.class);
EnumSet<PhoneticExpectation> modifiedExpectations = EnumSet.noneOf(PhoneticExpectation.class);
for (RootAttribute attribute : dicItem.attributes) {
// generate other boundary attributes and modified root state.
switch(attribute) {
case Voicing:
// Replace the last letter with the letter returned by alphabet.voice().
TurkicLetter last = modifiedSeq.lastLetter();
TurkicLetter modifiedLetter = alphabet.voice(last);
if (modifiedLetter == null) {
throw new LexiconException("Voicing letter is not proper in:" + dicItem);
}
// Lemmas ending with "nk" always use L_g as the replacement letter.
if (dicItem.lemma.endsWith("nk")) {
modifiedLetter = TurkishAlphabet.L_g;
}
modifiedSeq.changeLetter(modifiedSeq.length() - 1, modifiedLetter);
modifiedAttrs.remove(PhoneticAttribute.LastLetterVoicelessStop);
// Original form expects a consonant start, modified form expects a vowel start.
originalExpectations.add(PhoneticExpectation.ConsonantStart);
modifiedExpectations.add(PhoneticExpectation.VowelStart);
break;
case Doubling:
// Append the last letter once more to the modified form.
modifiedSeq.append(modifiedSeq.lastLetter());
originalExpectations.add(PhoneticExpectation.ConsonantStart);
modifiedExpectations.add(PhoneticExpectation.VowelStart);
break;
case LastVowelDrop:
if (modifiedSeq.lastLetter().isVowel()) {
// The sequence ends with a vowel: the delete call targets the last index.
modifiedSeq.delete(modifiedSeq.length() - 1);
modifiedExpectations.add(PhoneticExpectation.ConsonantStart);
} else {
// Otherwise the delete call targets the second-to-last index.
// NOTE(review): assumes the sequence has at least two letters here - confirm.
modifiedSeq.delete(modifiedSeq.length() - 2);
// Only non-verb items restrict the original form to a consonant start.
if (!dicItem.primaryPos.equals(PrimaryPos.Verb)) {
originalExpectations.add(PhoneticExpectation.ConsonantStart);
}
modifiedExpectations.add(PhoneticExpectation.VowelStart);
}
break;
case InverseHarmony:
// Both forms are marked LastVowelFrontal instead of LastVowelBack.
originalAttrs.add(PhoneticAttribute.LastVowelFrontal);
originalAttrs.remove(PhoneticAttribute.LastVowelBack);
modifiedAttrs.add(PhoneticAttribute.LastVowelFrontal);
modifiedAttrs.remove(PhoneticAttribute.LastVowelBack);
break;
case ProgressiveVowelDrop:
modifiedSeq.delete(modifiedSeq.length() - 1);
// Recalculating replaces modifiedAttrs, so changes made to it by earlier
// attributes in this loop are discarded when the sequence still has a vowel.
if (modifiedSeq.hasVowel()) {
modifiedAttrs = calculateAttributes(modifiedSeq);
}
break;
default:
break;
}
}
StemNode original = new StemNode(dicItem.root, dicItem, originalAttrs, originalExpectations);
StemNode modified = new StemNode(modifiedSeq.toString(), dicItem, modifiedAttrs, modifiedExpectations);
// The first element of the returned suffix data goes to the original node,
// the second to the modified node.
SuffixData[] roots = suffixProvider.defineSuccessorSuffixes(dicItem);
original.exclusiveSuffixData = roots[0];
modified.exclusiveSuffixData = roots[1];
// When no attribute produced a distinguishable modified node, a single node is returned.
if (original.equals(modified)) {
return new StemNode[] { original };
}
// The modified node is always marked NON_TERMINAL.
modified.setTermination(TerminationType.NON_TERMINAL);
// For CompoundP3sgRoot items the original node is marked NON_TERMINAL as well.
if (dicItem.hasAttribute(RootAttribute.CompoundP3sgRoot)) {
original.setTermination(TerminationType.NON_TERMINAL);
}
return new StemNode[] { original, modified };
}
Use of zemberek.core.turkish.PhoneticAttribute in the zemberek-nlp project by ahmetaa.
From the class StemNodeGenerator, method generate.
/**
 * Creates the StemNode objects that belong to a dictionary item.
 * <p>When {@code hasModifierAttribute(item)} holds, the work is handed over to
 * {@code generateModifiedRootNodes}. Otherwise a single node is created with
 * {@code TerminationType.TERMINAL}, the phonetic attributes calculated from the item's
 * pronunciation and an empty set of phonetic expectations.
 *
 * @param item DictionaryItem
 * @return one or more StemNode objects.
 */
public StemNode[] generate(DictionaryItem item) {
  if (hasModifierAttribute(item)) {
    return generateModifiedRootNodes(item);
  }
  // Successor suffixes are resolved first; only the first entry is used for a plain stem.
  SuffixData[] successorData = suffixProvider.defineSuccessorSuffixes(item);
  StemNode node = new StemNode(
      item.root,
      item,
      TerminationType.TERMINAL,
      calculateAttributes(item.pronunciation),
      EnumSet.noneOf(PhoneticExpectation.class));
  node.exclusiveSuffixData = successorData[0];
  return new StemNode[] { node };
}
Use of zemberek.core.turkish.PhoneticAttribute in the zemberek-nlp project by ahmetaa.
From the class WordGenerator, method advance.
// Advances a generation path by one transition.
// For all allowed matching outgoing transitions of the path's current state, new paths
// are generated and returned. A transition is skipped when
//   - the path has no morphemes left but the transition has a surface form,
//   - gPath.matches(transition) is false,
//   - the transition cannot be passed (suffixTransition.canPass is false).
// Transition conditions are used for checking if a search path is allowed to pass a transition.
// In debug mode every rejection reason is recorded in debugData.rejectedTransitions.
private List<GenerationPath> advance(GenerationPath gPath) {
List<GenerationPath> newPaths = new ArrayList<>(2);
// for all outgoing transitions.
for (MorphemeTransition transition : gPath.path.getCurrentState().getOutgoing()) {
// NOTE(review): every outgoing transition is cast to SuffixTransition without a type check.
SuffixTransition suffixTransition = (SuffixTransition) transition;
// if there are no morphemes and this transition's surface is not empty, no need to check.
if (gPath.morphemes.isEmpty() && suffixTransition.hasSurfaceForm()) {
if (debugMode) {
debugData.rejectedTransitions.put(gPath.path, new RejectedTransition(suffixTransition, "Empty surface expected."));
}
continue;
}
// transitions that do not match the generation path are rejected.
// (original note: if transition surface is empty, here will pass.)
if (!gPath.matches(suffixTransition)) {
if (debugMode) {
debugData.rejectedTransitions.put(gPath.path, new RejectedTransition(suffixTransition, "Morpheme mismatch." + suffixTransition.to.morpheme));
}
continue;
}
// if transition condition fails, add it to debug data.
// This block only records the failure; the actual rejection happens in the canPass check below.
if (debugMode && suffixTransition.getCondition() != null) {
Condition condition = suffixTransition.getCondition();
Condition failed;
// A combined condition is asked for its failing condition;
// a plain condition is itself the failing one when it does not accept the path.
if (condition instanceof CombinedCondition) {
failed = ((CombinedCondition) condition).getFailingCondition(gPath.path);
} else {
failed = condition.accept(gPath.path) ? null : condition;
}
if (failed != null) {
debugData.rejectedTransitions.put(gPath.path, new RejectedTransition(suffixTransition, "Condition → " + failed.toString()));
}
}
// check conditions.
if (!suffixTransition.canPass(gPath.path)) {
continue;
}
// epsilon transition. Add and continue. Use existing attributes.
if (!suffixTransition.hasSurfaceForm()) {
SearchPath pCopy = gPath.path.getCopyForGeneration(new SurfaceTransition("", suffixTransition), gPath.path.getPhoneticAttributes());
newPaths.add(gPath.copy(pCopy));
continue;
}
// generate the surface of the transition using the phonetic attributes of the current path.
String surface = SurfaceTransition.generateSurface(suffixTransition, gPath.path.getPhoneticAttributes());
SurfaceTransition surfaceTransition = new SurfaceTransition(surface, suffixTransition);
// attributes of the new path are derived from the generated surface and the
// phonetic attributes of the current path.
AttributeSet<PhoneticAttribute> attributes = AttributesHelper.getMorphemicAttributes(surface, gPath.path.getPhoneticAttributes());
// This is required for suffixes like `cik` and `ciğ`
// an extra attribute is added depending on the type of the last template token.
// For LAST_VOICED, ExpectsConsonant attribute is added, so only a consonant starting
// suffix can follow. For LAST_NOT_VOICED, ExpectsVowel and CannotTerminate are added,
// so a vowel starting suffix is expected.
// CannotTerminate is cleared first and only re-added in the LAST_NOT_VOICED case.
attributes.remove(PhoneticAttribute.CannotTerminate);
// NOTE(review): assumes getLastTemplateToken() is non-null for transitions with a surface form - confirm.
SuffixTemplateToken lastToken = suffixTransition.getLastTemplateToken();
if (lastToken.getType() == TemplateTokenType.LAST_VOICED) {
attributes.add(PhoneticAttribute.ExpectsConsonant);
} else if (lastToken.getType() == TemplateTokenType.LAST_NOT_VOICED) {
attributes.add(PhoneticAttribute.ExpectsVowel);
attributes.add(PhoneticAttribute.CannotTerminate);
}
SearchPath p = gPath.path.getCopyForGeneration(surfaceTransition, attributes);
newPaths.add(gPath.copy(p));
}
return newPaths;
}
Use of zemberek.core.turkish.PhoneticAttribute in the zemberek-nlp project by ahmetaa.
From the class StemTransitionsBase, method generateModifiedRootNodes.
/**
 * Generates the stem transitions of a dictionary item whose root may change form.
 * <p>An "original" transition is created for {@code dicItem.root} and a "modified" transition
 * for a surface derived from {@code dicItem.pronunciation}. Every root attribute of the item
 * is applied in iteration order; each may edit the modified surface, adjust the phonetic
 * attributes of one or both transitions, or select dedicated root states.
 *
 * @param dicItem dictionary item to generate stem transitions for.
 * @return a singleton list with the original transition when both transitions are equal,
 *     otherwise a list holding the original followed by the modified transition.
 * @throws LexiconException if the item has the Voicing attribute but {@code alphabet.voice}
 *     returns the last character unchanged.
 */
private List<StemTransition> generateModifiedRootNodes(DictionaryItem dicItem) {
// Working copy of the pronunciation; this is the text that gets mutated below.
StringBuilder modifiedSeq = new StringBuilder(dicItem.pronunciation);
// Both attribute sets start from the attributes of the unmodified pronunciation.
AttributeSet<PhoneticAttribute> originalAttrs = calculateAttributes(dicItem.pronunciation);
AttributeSet<PhoneticAttribute> modifiedAttrs = originalAttrs.copy();
// Root states stay null unless a case below assigns them explicitly.
MorphemeState modifiedRootState = null;
MorphemeState unmodifiedRootState = null;
for (RootAttribute attribute : dicItem.attributes) {
// generate other boundary attributes and modified root state.
switch(attribute) {
case Voicing:
char last = alphabet.lastChar(modifiedSeq);
char voiced = alphabet.voice(last);
// voice() returning the same character is treated as a lexicon error.
if (last == voiced) {
throw new LexiconException("Voicing letter is not proper in:" + dicItem);
}
// Lemmas ending with "nk" always use 'g' as the replacement character.
if (dicItem.lemma.endsWith("nk")) {
voiced = 'g';
}
modifiedSeq.setCharAt(modifiedSeq.length() - 1, voiced);
modifiedAttrs.remove(PhoneticAttribute.LastLetterVoicelessStop);
// Original form expects a consonant, modified form expects a vowel.
originalAttrs.add(PhoneticAttribute.ExpectsConsonant);
modifiedAttrs.add(PhoneticAttribute.ExpectsVowel);
// TODO: find a better way for this.
modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
break;
case Doubling:
// Append the last character once more to the modified form.
modifiedSeq.append(alphabet.lastChar(modifiedSeq));
originalAttrs.add(PhoneticAttribute.ExpectsConsonant);
modifiedAttrs.add(PhoneticAttribute.ExpectsVowel);
modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
break;
case LastVowelDrop:
TurkicLetter lastLetter = alphabet.getLastLetter(modifiedSeq);
if (lastLetter.isVowel()) {
// The text ends with a vowel: drop that last character.
modifiedSeq.deleteCharAt(modifiedSeq.length() - 1);
modifiedAttrs.add(PhoneticAttribute.ExpectsConsonant);
modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
} else {
// Otherwise drop the second-to-last character.
// NOTE(review): deleteCharAt throws if the text has fewer than two characters - confirm inputs.
modifiedSeq.deleteCharAt(modifiedSeq.length() - 2);
if (!dicItem.primaryPos.equals(PrimaryPos.Verb)) {
originalAttrs.add(PhoneticAttribute.ExpectsConsonant);
} else {
// Verbs use the dedicated root states defined in morphotactics.
unmodifiedRootState = morphotactics.verbLastVowelDropUnmodRoot_S;
modifiedRootState = morphotactics.verbLastVowelDropModRoot_S;
}
modifiedAttrs.add(PhoneticAttribute.ExpectsVowel);
modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
}
break;
case InverseHarmony:
// Both forms are marked LastVowelFrontal instead of LastVowelBack.
originalAttrs.add(PhoneticAttribute.LastVowelFrontal);
originalAttrs.remove(PhoneticAttribute.LastVowelBack);
modifiedAttrs.add(PhoneticAttribute.LastVowelFrontal);
modifiedAttrs.remove(PhoneticAttribute.LastVowelBack);
break;
case ProgressiveVowelDrop:
// Nothing is changed when the text has a single character or less.
if (modifiedSeq.length() > 1) {
modifiedSeq.deleteCharAt(modifiedSeq.length() - 1);
// Recalculating replaces modifiedAttrs, so changes made to it by earlier
// attributes in this loop are discarded when the text still has a vowel.
if (alphabet.containsVowel(modifiedSeq)) {
modifiedAttrs = calculateAttributes(modifiedSeq);
}
modifiedAttrs.add(PhoneticAttribute.LastLetterDropped);
}
break;
default:
break;
}
}
// if unmodified root state is not defined in the switch block, get it from morphotactics.
if (unmodifiedRootState == null) {
unmodifiedRootState = morphotactics.getRootState(dicItem, originalAttrs);
}
StemTransition original = new StemTransition(dicItem.root, dicItem, originalAttrs, unmodifiedRootState);
// if modified root state is not defined in the switch block, get it from morphotactics.
if (modifiedRootState == null) {
modifiedRootState = morphotactics.getRootState(dicItem, modifiedAttrs);
}
StemTransition modified = new StemTransition(modifiedSeq.toString(), dicItem, modifiedAttrs, modifiedRootState);
// When no attribute produced a distinguishable modified transition, only the original is returned.
if (original.equals(modified)) {
return Collections.singletonList(original);
}
return Lists.newArrayList(original, modified);
}
Aggregations