Use of zemberek.core.turkish.TurkicLetter in the project zemberek-nlp by ahmetaa.
Class AttributesHelper, method getMorphemicAttributes.
/**
 * Builds the phonetic attribute set describing the letter sequence {@code seq}.
 *
 * <p>An empty sequence yields a copy of {@code predecessorAttrs}. When the sequence contains a
 * vowel, the last-letter, last-vowel and first-letter attributes are derived from the sequence
 * itself. Otherwise the predecessor attributes are copied, {@code NO_VOWEL_ATTRIBUTES} are added
 * and {@code LastLetterVowel} / {@code ExpectsConsonant} are removed. In both non-empty cases the
 * voicing attributes of the last letter are added at the end.
 *
 * @param seq letter sequence to analyze
 * @param predecessorAttrs attributes of the preceding sequence, used when {@code seq} is empty or
 *     has no vowel
 * @return a new attribute set; {@code predecessorAttrs} itself is never returned
 */
public static AttributeSet<PhoneticAttribute> getMorphemicAttributes(CharSequence seq, AttributeSet<PhoneticAttribute> predecessorAttrs) {
  if (seq.length() == 0) {
    return predecessorAttrs.copy();
  }

  AttributeSet<PhoneticAttribute> result = new AttributeSet<>();
  TurkicLetter finalLetter = alphabet.getLastLetter(seq);

  if (!alphabet.containsVowel(seq)) {
    // No vowel in this sequence: inherit the vowel attributes from the predecessor.
    result.copyFrom(predecessorAttrs);
    result.addAll(NO_VOWEL_ATTRIBUTES);
    result.remove(LastLetterVowel);
    result.remove(ExpectsConsonant);
  } else {
    boolean endsWithVowel = finalLetter.isVowel();
    result.add(endsWithVowel ? LastLetterVowel : LastLetterConsonant);

    // When the sequence ends with a vowel, that letter is also the last vowel.
    TurkicLetter finalVowel = endsWithVowel ? finalLetter : alphabet.getLastVowel(seq);
    result.add(finalVowel.isFrontal() ? LastVowelFrontal : LastVowelBack);
    result.add(finalVowel.isRounded() ? LastVowelRounded : LastVowelUnrounded);

    boolean startsWithVowel = alphabet.getFirstLetter(seq).isVowel();
    result.add(startsWithVowel ? FirstLetterVowel : FirstLetterConsonant);
  }

  // Voicing of the last letter is always taken from this sequence.
  if (!finalLetter.isVoiceless()) {
    result.add(LastLetterVoiced);
    return result;
  }
  result.add(LastLetterVoiceless);
  if (finalLetter.isStopConsonant()) {
    // e.g. "kitap"
    result.add(LastLetterVoicelessStop);
  }
  return result;
}
Use of zemberek.core.turkish.TurkicLetter in the project zemberek-nlp by ahmetaa.
Class StrictTurkishSyllableParser, method letterCountForLastSyllable.
/**
 * Returns the letter count of the last syllable of the given word. This algorithm does not work
 * for words that start with [tr-, st-, pr-] or end with [-trak]. Words containing foreign letters
 * also cannot be processed.
 *
 * <p>Letters are only read from {@code seq} after the sequence is known to be long enough, so
 * empty and very short inputs never cause an out-of-range index to be requested.
 *
 * @param seq the letter sequence whose last syllable is measured
 * @return size of the last syllable (1, 2, 3 or 4), or -1 if the sequence is empty or the
 *     syllable rules are not met
 */
private int letterCountForLastSyllable(TurkishLetterSequence seq) {
  final int length = seq.length();
  if (length == 0) {
    return -1;
  }

  TurkicLetter current = seq.getLetter(length - 1);
  if (length == 1) {
    // A single vowel is a syllable on its own; a single consonant is not.
    return current.isVowel() ? 1 : -1;
  }

  // length >= 2 from here on.
  TurkicLetter previous = seq.getLetter(length - 2);

  if (current.isVowel()) {
    // Current and previous letters are both vowels. Eg. the final "a" in "saa".
    if (previous.isVowel()) {
      return 1;
    }
    // Two letters, consonant followed by vowel. Eg. "ya".
    if (length == 2) {
      return 2;
    }
    TurkicLetter twoBefore = seq.getLetter(length - 3);
    // Exactly three letters, two consonants then a vowel. Eg. ste-tos-kop -> "ste".
    if (!twoBefore.isVowel() && length == 3) {
      return 3;
    }
    return 2;
  }

  // The last letter is a consonant.
  if (previous.isVowel()) {
    // Two-letter words like [el, al].
    if (length == 2) {
      return 2;
    }
    TurkicLetter twoBefore = seq.getLetter(length - 3);
    // The letter two before is a vowel. Eg. "at" in sa-at.
    if (twoBefore.isVowel()) {
      return 2;
    }
    // Three-letter words (kal, sel).
    if (length == 3) {
      return 3;
    }
    TurkicLetter threeBefore = seq.getLetter(length - 4);
    // The letter three before is a vowel. Eg. "nat" in ka-nat.
    if (threeBefore.isVowel()) {
      return 3;
    }
    // Four letters starting with a consonant cluster, such as "tren".
    if (length == 4) {
      return -1;
    }
    // Longer words yield 3 regardless of the letter preceding the consonant cluster.
    return 3;
  }

  // The last two letters are both consonants.
  if (length == 2) {
    return -1;
  }
  TurkicLetter twoBefore = seq.getLetter(length - 3);
  // Three consonants in a row at the end do not form a valid syllable here.
  if (!twoBefore.isVowel()) {
    return -1;
  }
  // Vowel followed by two consonants, nothing before it.
  if (length == 3) {
    return 3;
  }
  TurkicLetter threeBefore = seq.getLetter(length - 4);
  // A consonant before the vowel joins the syllable (4 letters); a vowel does not (3 letters).
  return threeBefore.isVowel() ? 3 : 4;
}
Aggregations