use of zemberek.core.turkish.TurkicLetter in project zemberek-nlp by ahmetaa.
the class SuffixSurfaceNodeGenerator method generate.
/**
 * Builds the surface node(s) of a suffix form for the given phonetic context.
 *
 * <p>The generation string of {@code suffixForm} is tokenized and the tokens are applied one by
 * one to a growing letter sequence. Normally a single form is produced once the last token has
 * been applied; a generation string ending with a VOICE_LAST token (e.g. {@code cI~k}) produces
 * two forms.
 *
 * @param attrs phonetic attributes of whatever precedes this suffix
 * @param expectations phonetic expectations; only copied into the node created for a generation
 *     string that has no tokens
 * @param suffixData suffix data attached to the zero-length node and to the VOICE_LAST nodes
 * @param suffixForm the suffix form whose {@code generation} string is realized
 * @return the generated surface nodes; may be empty when the last token produces nothing
 * @throws IllegalArgumentException if an A or I vowel cannot be chosen from the attributes
 */
public List<SuffixSurfaceNode> generate(EnumSet<PhoneticAttribute> attrs, EnumSet<PhoneticExpectation> expectations, SuffixData suffixData, SuffixForm suffixForm) {
    List<SuffixToken> tokens = Lists.newArrayList(new SuffixStringTokenizer(suffixForm.generation));
    // No tokens at all: the suffix has an empty surface and inherits the incoming context.
    if (tokens.isEmpty()) {
        return Lists.newArrayList(new SuffixSurfaceNode(suffixForm, "", attrs.clone(), expectations.clone(), suffixData, suffixForm.terminationType));
    }
    List<SuffixSurfaceNode> result = new ArrayList<SuffixSurfaceNode>(1);
    TurkishLetterSequence surface = new TurkishLetterSequence();
    final int lastPosition = tokens.size() - 1;
    for (int position = 0; position < tokens.size(); position++) {
        SuffixToken token = tokens.get(position);
        // Attributes of the surface built so far, combined with the incoming attributes.
        EnumSet<PhoneticAttribute> contextAttrs = defineMorphemicAttributes(surface, attrs);
        final boolean lastToken = position == lastPosition;
        // Set by every case that ends with the common "emit a single form on the last token" step.
        boolean emitsPlainForm = false;
        switch (token.type) {
            case LETTER: {
                surface.append(token.letter);
                emitsPlainForm = true;
                break;
            }
            case A_WOVEL: {
                // A leading vowel is dropped (and nothing is emitted) after a vowel.
                if (position == 0 && attrs.contains(LastLetterVowel)) {
                    break;
                }
                TurkicLetter aVowel;
                if (contextAttrs.contains(LastVowelBack)) {
                    aVowel = L_a;
                } else if (contextAttrs.contains(LastVowelFrontal)) {
                    aVowel = L_e;
                } else {
                    throw new IllegalArgumentException("Cannot generate A form!");
                }
                surface.append(aVowel);
                emitsPlainForm = true;
                break;
            }
            case I_WOVEL: {
                // A leading vowel is dropped (and nothing is emitted) after a vowel.
                if (position == 0 && attrs.contains(LastLetterVowel)) {
                    break;
                }
                final boolean back = contextAttrs.contains(LastVowelBack);
                final boolean frontal = contextAttrs.contains(LastVowelFrontal);
                final boolean rounded = contextAttrs.contains(LastVowelRounded);
                final boolean unrounded = contextAttrs.contains(LastVowelUnrounded);
                TurkicLetter iVowel;
                if (back && rounded) {
                    iVowel = L_u;
                } else if (back && unrounded) {
                    iVowel = L_ii;
                } else if (frontal && rounded) {
                    iVowel = L_uu;
                } else if (frontal && unrounded) {
                    iVowel = L_i;
                } else {
                    throw new IllegalArgumentException("Cannot generate I form!");
                }
                surface.append(iVowel);
                emitsPlainForm = true;
                break;
            }
            case APPEND: {
                // The letter is only inserted when the context ends with a vowel.
                if (contextAttrs.contains(LastLetterVowel)) {
                    surface.append(token.letter);
                }
                emitsPlainForm = true;
                break;
            }
            case DEVOICE_FIRST: {
                TurkicLetter firstLetter = contextAttrs.contains(LastLetterVoiceless)
                        ? Turkish.Alphabet.devoice(token.letter)
                        : token.letter;
                surface.append(firstLetter);
                emitsPlainForm = true;
                break;
            }
            case VOICE_LAST: {
                surface.append(token.letter);
                if (lastToken) {
                    // Unvoiced variant: must be followed by a consonant.
                    result.add(new SuffixSurfaceNode(suffixForm, surface.toString(), defineMorphemicAttributes(surface, attrs), EnumSet.of(PhoneticExpectation.ConsonantStart), suffixData, suffixForm.terminationType));
                    // Voiced variant: must be followed by a vowel and can never terminate.
                    surface.changeLast(Turkish.Alphabet.voice(token.letter));
                    result.add(new SuffixSurfaceNode(suffixForm, surface.toString(), defineMorphemicAttributes(surface, attrs), EnumSet.of(PhoneticExpectation.VowelStart), suffixData, TerminationType.NON_TERMINAL));
                }
                break;
            }
        }
        if (emitsPlainForm && lastToken) {
            result.add(new SuffixSurfaceNode(suffixForm, surface.toString(), defineMorphemicAttributes(surface, attrs), suffixForm.terminationType));
        }
    }
    return result;
}
use of zemberek.core.turkish.TurkicLetter in project zemberek-nlp by ahmetaa.
the class AttributesHelper method getMorphemicAttributes.
/**
 * Computes the phonetic attributes of {@code seq}, using {@code predecessorAttrs} where the
 * sequence cannot provide them itself.
 *
 * @param seq the character sequence to analyse
 * @param predecessorAttrs attributes of whatever precedes {@code seq}
 * @return a new attribute set; a copy of {@code predecessorAttrs} when {@code seq} is empty
 */
public static AttributeSet<PhoneticAttribute> getMorphemicAttributes(CharSequence seq, AttributeSet<PhoneticAttribute> predecessorAttrs) {
    // Nothing to analyse: hand back a copy of what came before.
    if (seq.length() == 0) {
        return predecessorAttrs.copy();
    }
    AttributeSet<PhoneticAttribute> result = new AttributeSet<>();
    if (!alphabet.containsVowel(seq)) {
        // we transfer vowel attributes from the predecessor attributes.
        // NOTE(review): copyFrom presumably carries over every predecessor attribute, voicing
        // ones included, and nothing below clears them - confirm that this is intended.
        result.copyFrom(predecessorAttrs);
        result.addAll(NO_VOWEL_ATTRIBUTES);
        result.remove(LastLetterVowel);
        result.remove(ExpectsConsonant);
    } else {
        TurkicLetter finalLetter = alphabet.getLastLetter(seq);
        final boolean endsWithVowel = finalLetter.isVowel();
        result.add(endsWithVowel ? LastLetterVowel : LastLetterConsonant);
        // Harmony attributes are taken from the last vowel of the sequence.
        TurkicLetter harmonyVowel = endsWithVowel ? finalLetter : alphabet.getLastVowel(seq);
        result.add(harmonyVowel.isFrontal() ? LastVowelFrontal : LastVowelBack);
        result.add(harmonyVowel.isRounded() ? LastVowelRounded : LastVowelUnrounded);
        result.add(alphabet.getFirstLetter(seq).isVowel() ? FirstLetterVowel : FirstLetterConsonant);
    }
    // Voicing attributes always come from the last letter of seq itself.
    TurkicLetter ending = alphabet.getLastLetter(seq);
    if (!ending.isVoiceless()) {
        result.add(LastLetterVoiced);
        return result;
    }
    result.add(LastLetterVoiceless);
    if (ending.isStopConsonant()) {
        // kitap
        result.add(LastLetterVoicelessStop);
    }
    return result;
}
use of zemberek.core.turkish.TurkicLetter in project zemberek-nlp by ahmetaa.
the class StemTransitionGenerator method generateModifiedRootNodes.
/**
 * Creates the stem transitions of a dictionary item whose root changes on the suffix boundary.
 *
 * <p>Two attribute sets are maintained: one for the unmodified root and one for the modified
 * root. Each root attribute of the item may alter the modified surface form and/or either
 * attribute set. The root states are taken from the switch block when it sets them (verbs with
 * last vowel drop) and from {@code morphotactics.getRootState} otherwise.
 *
 * @param dicItem the dictionary item to generate transitions for
 * @return a single-element list when the modified transition equals the original one, otherwise
 *     a list holding the original transition followed by the modified one
 * @throws LexiconException if the item is marked for voicing but voicing leaves its last
 *     character unchanged
 */
private List<StemTransition> generateModifiedRootNodes(DictionaryItem dicItem) {
    StringBuilder modifiedSeq = new StringBuilder(dicItem.pronunciation);
    AttributeSet<PhoneticAttribute> originalAttrs = calculateAttributes(dicItem.pronunciation);
    AttributeSet<PhoneticAttribute> modifiedAttrs = originalAttrs.copy();
    MorphemeState modifiedRootState = null;
    MorphemeState unmodifiedRootState = null;
    for (RootAttribute attribute : dicItem.attributes) {
        // generate other boundary attributes and modified root state.
        switch (attribute) {
            case Voicing:
                char last = alphabet.getLastChar(modifiedSeq);
                char voiced = alphabet.voice(last);
                if (last == voiced) {
                    throw new LexiconException("Voicing letter is not proper in:" + dicItem);
                }
                // Lemmas ending with "nk" get 'g' instead of the regular voiced counterpart.
                if (dicItem.lemma.endsWith("nk")) {
                    voiced = 'g';
                }
                modifiedSeq.setCharAt(modifiedSeq.length() - 1, voiced);
                modifiedAttrs.remove(PhoneticAttribute.LastLetterVoicelessStop);
                originalAttrs.add(PhoneticAttribute.ExpectsConsonant);
                modifiedAttrs.add(PhoneticAttribute.ExpectsVowel);
                // TODO: find a better way for this.
                modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
                break;
            case Doubling:
                // The modified root repeats its last character.
                modifiedSeq.append(alphabet.getLastChar(modifiedSeq));
                originalAttrs.add(PhoneticAttribute.ExpectsConsonant);
                modifiedAttrs.add(PhoneticAttribute.ExpectsVowel);
                modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
                break;
            case LastVowelDrop:
                TurkicLetter lastLetter = alphabet.getLastLetter(modifiedSeq);
                if (lastLetter.isVowel()) {
                    // The root ends with the vowel: drop the final character.
                    modifiedSeq.deleteCharAt(modifiedSeq.length() - 1);
                    modifiedAttrs.add(PhoneticAttribute.ExpectsConsonant);
                    modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
                } else {
                    // Otherwise the character before the final one is dropped.
                    modifiedSeq.deleteCharAt(modifiedSeq.length() - 2);
                    if (!dicItem.primaryPos.equals(PrimaryPos.Verb)) {
                        originalAttrs.add(PhoneticAttribute.ExpectsConsonant);
                    } else {
                        // Verbs use dedicated root states instead of an attribute on the original.
                        unmodifiedRootState = morphotactics.verbLastVowelDropUnmodRoot_S;
                        modifiedRootState = morphotactics.verbLastVowelDropModRoot_S;
                    }
                    modifiedAttrs.add(PhoneticAttribute.ExpectsVowel);
                    modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
                }
                break;
            case InverseHarmony:
                // Both roots are treated as having a frontal last vowel.
                originalAttrs.add(PhoneticAttribute.LastVowelFrontal);
                originalAttrs.remove(PhoneticAttribute.LastVowelBack);
                modifiedAttrs.add(PhoneticAttribute.LastVowelFrontal);
                modifiedAttrs.remove(PhoneticAttribute.LastVowelBack);
                break;
            case ProgressiveVowelDrop:
                // Only drop the last character when something remains afterwards. Without this
                // guard a one-letter pronunciation would turn into an empty modified stem, and
                // an empty pronunciation would fail with a StringIndexOutOfBoundsException.
                if (modifiedSeq.length() > 1) {
                    modifiedSeq.deleteCharAt(modifiedSeq.length() - 1);
                    // Attributes are only recalculated when the shortened stem still has a vowel.
                    if (alphabet.containsVowel(modifiedSeq)) {
                        modifiedAttrs = calculateAttributes(modifiedSeq);
                    }
                    modifiedAttrs.add(PhoneticAttribute.LastLetterDropped);
                }
                break;
            default:
                break;
        }
    }
    if (unmodifiedRootState == null) {
        unmodifiedRootState = morphotactics.getRootState(dicItem, originalAttrs);
    }
    StemTransition original = new StemTransition(dicItem.root, dicItem, originalAttrs, unmodifiedRootState);
    // if modified root state is not defined in the switch block, get it from morphotactics.
    if (modifiedRootState == null) {
        modifiedRootState = morphotactics.getRootState(dicItem, modifiedAttrs);
    }
    StemTransition modified = new StemTransition(modifiedSeq.toString(), dicItem, modifiedAttrs, modifiedRootState);
    if (original.equals(modified)) {
        return Collections.singletonList(original);
    }
    return Lists.newArrayList(original, modified);
}
use of zemberek.core.turkish.TurkicLetter in project zemberek-nlp by ahmetaa.
the class StemNodeGenerator method generateModifiedRootNodes.
/**
 * Creates the stem nodes of a dictionary item whose root changes on the suffix boundary.
 *
 * <p>Items carrying the {@code Special} attribute are delegated to {@code handleSpecialStems}.
 * For all others an unmodified and a modified node are built; each root attribute of the item
 * may change the modified letter sequence, the attributes or the expectations of either node.
 *
 * @param dicItem the dictionary item to generate stem nodes for
 * @return only the unmodified node when both nodes are equal, otherwise the unmodified node
 *     followed by the modified (non terminal) node
 * @throws LexiconException if the item is marked for voicing but its last letter has no voiced
 *     counterpart
 */
private StemNode[] generateModifiedRootNodes(DictionaryItem dicItem) {
    if (dicItem.hasAttribute(Special)) {
        return handleSpecialStems(dicItem);
    }
    TurkishLetterSequence altSeq = new TurkishLetterSequence(dicItem.pronunciation, alphabet);
    EnumSet<PhoneticAttribute> baseAttrs = calculateAttributes(dicItem.pronunciation);
    EnumSet<PhoneticAttribute> altAttrs = baseAttrs.clone();
    EnumSet<PhoneticExpectation> baseExpectations = EnumSet.noneOf(PhoneticExpectation.class);
    EnumSet<PhoneticExpectation> altExpectations = EnumSet.noneOf(PhoneticExpectation.class);
    for (RootAttribute rootAttribute : dicItem.attributes) {
        // Each root attribute contributes boundary expectations and/or a surface change.
        switch (rootAttribute) {
            case Voicing: {
                TurkicLetter voicedLetter = alphabet.voice(altSeq.lastLetter());
                if (voicedLetter == null) {
                    throw new LexiconException("Voicing letter is not proper in:" + dicItem);
                }
                // Lemmas ending with "nk" get 'g' instead of the regular voiced counterpart.
                if (dicItem.lemma.endsWith("nk")) {
                    voicedLetter = TurkishAlphabet.L_g;
                }
                altSeq.changeLetter(altSeq.length() - 1, voicedLetter);
                altAttrs.remove(PhoneticAttribute.LastLetterVoicelessStop);
                baseExpectations.add(PhoneticExpectation.ConsonantStart);
                altExpectations.add(PhoneticExpectation.VowelStart);
                break;
            }
            case Doubling: {
                // The modified root repeats its last letter.
                altSeq.append(altSeq.lastLetter());
                baseExpectations.add(PhoneticExpectation.ConsonantStart);
                altExpectations.add(PhoneticExpectation.VowelStart);
                break;
            }
            case LastVowelDrop: {
                if (!altSeq.lastLetter().isVowel()) {
                    // Root ends with a consonant: the letter before it is dropped.
                    altSeq.delete(altSeq.length() - 2);
                    if (!dicItem.primaryPos.equals(PrimaryPos.Verb)) {
                        baseExpectations.add(PhoneticExpectation.ConsonantStart);
                    }
                    altExpectations.add(PhoneticExpectation.VowelStart);
                } else {
                    // Root ends with the vowel itself: drop the final letter.
                    altSeq.delete(altSeq.length() - 1);
                    altExpectations.add(PhoneticExpectation.ConsonantStart);
                }
                break;
            }
            case InverseHarmony: {
                // Both nodes are treated as having a frontal last vowel.
                baseAttrs.remove(PhoneticAttribute.LastVowelBack);
                baseAttrs.add(PhoneticAttribute.LastVowelFrontal);
                altAttrs.remove(PhoneticAttribute.LastVowelBack);
                altAttrs.add(PhoneticAttribute.LastVowelFrontal);
                break;
            }
            case ProgressiveVowelDrop: {
                altSeq.delete(altSeq.length() - 1);
                // Attributes are only recalculated when the shortened stem still has a vowel.
                if (altSeq.hasVowel()) {
                    altAttrs = calculateAttributes(altSeq);
                }
                break;
            }
            default:
                break;
        }
    }
    StemNode baseNode = new StemNode(dicItem.root, dicItem, baseAttrs, baseExpectations);
    StemNode altNode = new StemNode(altSeq.toString(), dicItem, altAttrs, altExpectations);
    // Index 0 holds the suffix data of the unmodified node, index 1 that of the modified node.
    SuffixData[] successors = suffixProvider.defineSuccessorSuffixes(dicItem);
    baseNode.exclusiveSuffixData = successors[0];
    altNode.exclusiveSuffixData = successors[1];
    if (baseNode.equals(altNode)) {
        return new StemNode[] { baseNode };
    }
    altNode.setTermination(TerminationType.NON_TERMINAL);
    if (dicItem.hasAttribute(RootAttribute.CompoundP3sgRoot)) {
        baseNode.setTermination(TerminationType.NON_TERMINAL);
    }
    return new StemNode[] { baseNode, altNode };
}
use of zemberek.core.turkish.TurkicLetter in project zemberek-nlp by ahmetaa.
the class StemTransitionsBase method generateModifiedRootNodes.
/**
 * Creates the stem transitions of a dictionary item whose root changes on the suffix boundary.
 *
 * <p>One attribute set is kept for the unmodified root and one for the modified root; each root
 * attribute of the item may alter the modified surface form and/or either attribute set. Root
 * states come from the switch block when it sets them (verbs with last vowel drop) and from
 * {@code morphotactics.getRootState} otherwise.
 *
 * @param dicItem the dictionary item to generate transitions for
 * @return a single-element list when the modified transition equals the original one, otherwise
 *     a list holding the original transition followed by the modified one
 * @throws LexiconException if the item is marked for voicing but voicing leaves its last
 *     character unchanged
 */
private List<StemTransition> generateModifiedRootNodes(DictionaryItem dicItem) {
    StringBuilder stem = new StringBuilder(dicItem.pronunciation);
    AttributeSet<PhoneticAttribute> originalAttrs = calculateAttributes(dicItem.pronunciation);
    AttributeSet<PhoneticAttribute> modifiedAttrs = originalAttrs.copy();
    MorphemeState stateForModified = null;
    MorphemeState stateForOriginal = null;
    for (RootAttribute rootAttribute : dicItem.attributes) {
        // Each root attribute contributes boundary attributes and possibly a surface change.
        switch (rootAttribute) {
            case Voicing: {
                char finalChar = alphabet.lastChar(stem);
                char voicedChar = alphabet.voice(finalChar);
                if (voicedChar == finalChar) {
                    throw new LexiconException("Voicing letter is not proper in:" + dicItem);
                }
                // Lemmas ending with "nk" get 'g' instead of the regular voiced counterpart.
                stem.setCharAt(stem.length() - 1, dicItem.lemma.endsWith("nk") ? 'g' : voicedChar);
                modifiedAttrs.remove(PhoneticAttribute.LastLetterVoicelessStop);
                originalAttrs.add(PhoneticAttribute.ExpectsConsonant);
                modifiedAttrs.add(PhoneticAttribute.ExpectsVowel);
                // TODO: find a better way for this.
                modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
                break;
            }
            case Doubling: {
                // The modified root repeats its last character.
                stem.append(alphabet.lastChar(stem));
                originalAttrs.add(PhoneticAttribute.ExpectsConsonant);
                modifiedAttrs.add(PhoneticAttribute.ExpectsVowel);
                modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
                break;
            }
            case LastVowelDrop: {
                if (alphabet.getLastLetter(stem).isVowel()) {
                    // The root ends with the vowel: drop the final character.
                    stem.deleteCharAt(stem.length() - 1);
                    modifiedAttrs.add(PhoneticAttribute.ExpectsConsonant);
                } else {
                    // Otherwise the character before the final one is dropped.
                    stem.deleteCharAt(stem.length() - 2);
                    if (dicItem.primaryPos.equals(PrimaryPos.Verb)) {
                        // Verbs use dedicated root states instead of an attribute on the original.
                        stateForOriginal = morphotactics.verbLastVowelDropUnmodRoot_S;
                        stateForModified = morphotactics.verbLastVowelDropModRoot_S;
                    } else {
                        originalAttrs.add(PhoneticAttribute.ExpectsConsonant);
                    }
                    modifiedAttrs.add(PhoneticAttribute.ExpectsVowel);
                }
                // In both variants the shortened root cannot terminate.
                modifiedAttrs.add(PhoneticAttribute.CannotTerminate);
                break;
            }
            case InverseHarmony: {
                // Both roots are treated as having a frontal last vowel.
                originalAttrs.remove(PhoneticAttribute.LastVowelBack);
                originalAttrs.add(PhoneticAttribute.LastVowelFrontal);
                modifiedAttrs.remove(PhoneticAttribute.LastVowelBack);
                modifiedAttrs.add(PhoneticAttribute.LastVowelFrontal);
                break;
            }
            case ProgressiveVowelDrop: {
                // Stems of at most one character are left untouched.
                if (stem.length() <= 1) {
                    break;
                }
                stem.deleteCharAt(stem.length() - 1);
                // Attributes are only recalculated when the shortened stem still has a vowel.
                if (alphabet.containsVowel(stem)) {
                    modifiedAttrs = calculateAttributes(stem);
                }
                modifiedAttrs.add(PhoneticAttribute.LastLetterDropped);
                break;
            }
            default:
                break;
        }
    }
    // Root states not fixed by the switch block are looked up from morphotactics.
    if (stateForOriginal == null) {
        stateForOriginal = morphotactics.getRootState(dicItem, originalAttrs);
    }
    StemTransition unchanged = new StemTransition(dicItem.root, dicItem, originalAttrs, stateForOriginal);
    if (stateForModified == null) {
        stateForModified = morphotactics.getRootState(dicItem, modifiedAttrs);
    }
    StemTransition changed = new StemTransition(stem.toString(), dicItem, modifiedAttrs, stateForModified);
    if (!unchanged.equals(changed)) {
        return Lists.newArrayList(unchanged, changed);
    }
    return Collections.singletonList(unchanged);
}
Aggregations