use of zemberek.core.turkish.StemAndEnding in project zemberek-nlp by ahmetaa.
the class UnidentifiedTokenAnalyzer method tryNumeral.
/**
 * Attempts to analyze the text of a token as a numeral.
 *
 * <p>The lower-cased token text is split into a stem and an ending by {@code getFromNumeral}.
 * The stem is mapped to a lemma through {@code numeralEndingMachine}; when the stem ends with
 * a dot, the trailing dot is dropped before the lookup and the lemma is additionally mapped
 * through {@code ordinalMap}. For every {@code Numerals} value whose pattern is found in the
 * stem, lemma + ending is analyzed, and each analysis whose dictionary item is a Numeral is
 * copied onto a runtime dictionary item built from the stem.
 *
 * @param token the token whose text is analyzed
 * @return the analyses found; empty when no pattern matches, no Numeral analysis exists, or no
 *     lemma could be determined for the stem
 */
private List<SingleAnalysis> tryNumeral(Token token) {
  String s = token.getText();
  s = s.toLowerCase(TurkishAlphabet.TR);
  StemAndEnding se = getFromNumeral(s);
  String lemma;
  if (se.stem.endsWith(".")) {
    String ss = se.stem.substring(0, se.stem.length() - 1);
    lemma = numeralEndingMachine.find(ss);
    lemma = ordinalMap.get(lemma);
  } else {
    lemma = numeralEndingMachine.find(se.stem);
  }
  List<SingleAnalysis> results = Lists.newArrayListWithCapacity(1);
  if (lemma == null) {
    // Without a lemma there is nothing meaningful to analyze. Continuing would either throw a
    // NullPointerException below or analyze the literal text "null" + ending.
    return results;
  }

  // The surface to analyze does not depend on the Numerals value, so it is built once.
  // "dört" is written as "dörd" when the ending starts with a vowel.
  String toParse;
  if (se.ending.length() > 0 && lemma.equals("dört") && ALPHABET.isVowel(se.ending.charAt(0))) {
    toParse = "dörd" + se.ending;
  } else {
    toParse = lemma + se.ending;
  }

  for (Numerals numerals : Numerals.values()) {
    Matcher m = numerals.pattern.matcher(se.stem);
    if (!m.find()) {
      continue;
    }
    List<SingleAnalysis> res = analyzer.analyze(toParse);
    for (SingleAnalysis re : res) {
      if (re.getDictionaryItem().primaryPos != PrimaryPos.Numeral) {
        continue;
      }
      DictionaryItem runTimeItem = new DictionaryItem(
          se.stem, se.stem, s + lemma, PrimaryPos.Numeral, numerals.secondaryPos);
      runTimeItem.attributes.add(RootAttribute.Runtime);
      results.add(re.copyFor(runTimeItem, se.stem));
    }
  }
  return results;
}
use of zemberek.core.turkish.StemAndEnding in project zemberek-nlp by ahmetaa.
the class UnidentifiedTokenAnalyzer method tryWordWithApostrophe.
/**
 * Analyzes a word of the form {@code stem'ending} by treating the part before the apostrophe
 * as a noun root with the given secondary part of speech.
 *
 * <p>A dictionary item is built for the stem. If no vowel is found in the guessed
 * pronunciation, a single dummy analysis is returned. Otherwise, when the lexicon does not
 * already contain the item, it is registered in the analyzer's stem transitions only for the
 * duration of the analysis and removed afterwards, even if the analysis throws.
 *
 * @param word the input word, expected to contain an apostrophe
 * @param secondaryPos the secondary part of speech assigned to the generated dictionary item
 * @return analyses of stem + ending whose stem equals the normalized stem; an empty list when
 *     the apostrophe is missing, is the first character, or is the last character
 */
private List<SingleAnalysis> tryWordWithApostrophe(String word, SecondaryPos secondaryPos) {
  String normalized = TurkishAlphabet.INSTANCE.normalizeApostrophe(word);
  int index = normalized.indexOf('\'');
  if (index <= 0 || index == normalized.length() - 1) {
    return Collections.emptyList();
  }
  String stem = normalized.substring(0, index);
  String ending = normalized.substring(index + 1);

  // TODO: should we remove dots with normalization?
  // Literal replacement; no regular expression is needed to strip dots.
  String stemNormalized = TurkishAlphabet.INSTANCE.normalize(stem).replace(".", "");
  String endingNormalized = TurkishAlphabet.INSTANCE.normalize(ending);
  String pronunciation = guessPronunciation(stemNormalized);

  boolean capitalize =
      secondaryPos == SecondaryPos.ProperNoun || secondaryPos == SecondaryPos.Abbreviation;
  boolean pronunciationPossible = alphabet.containsVowel(pronunciation);

  DictionaryItem item = new DictionaryItem(
      capitalize ? Turkish.capitalize(normalized) : (pronunciationPossible ? stem : word),
      stemNormalized,
      pronunciation,
      PrimaryPos.Noun,
      secondaryPos);

  if (!pronunciationPossible) {
    List<SingleAnalysis> result = new ArrayList<>(1);
    result.add(SingleAnalysis.dummy(word, item));
    return result;
  }

  boolean itemDoesNotExist = !lexicon.containsItem(item);
  if (itemDoesNotExist) {
    item.attributes.add(RootAttribute.Runtime);
    analyzer.getStemTransitions().addDictionaryItem(item);
  }

  String toParse = stemNormalized + endingNormalized;
  List<SingleAnalysis> noQuotesParses;
  try {
    noQuotesParses = analyzer.analyze(toParse);
  } finally {
    // Always undo the temporary registration so a failing analysis cannot leave the runtime
    // item behind in the stem transitions.
    if (itemDoesNotExist) {
      analyzer.getStemTransitions().removeDictionaryItem(item);
    }
  }

  return noQuotesParses.stream()
      .filter(noQuotesParse -> noQuotesParse.getStem().equals(stemNormalized))
      .collect(Collectors.toList());
}
use of zemberek.core.turkish.StemAndEnding in project zemberek-nlp by ahmetaa.
the class TurkishMorphology method analyzeWordsWithApostrophe.
/**
 * Analyzes a word containing an apostrophe by analyzing it with the apostrophes removed and
 * keeping only the noun analyses that are compatible with the part before the apostrophe.
 *
 * @param word the input word
 * @return the noun analyses that either contain the {@code p3sg} morpheme or whose stem equals
 *     the normalized text before the first apostrophe; an empty list when the apostrophe is
 *     missing, is the first or last character, or when the word has no analyses
 */
public List<SingleAnalysis> analyzeWordsWithApostrophe(String word) {
  final int apostropheAt = word.indexOf('\'');
  final boolean missingOrLeading = apostropheAt < 1;
  final boolean trailing = apostropheAt == word.length() - 1;
  if (missingOrLeading || trailing) {
    return Collections.emptyList();
  }

  final String beforeApostrophe = word.substring(0, apostropheAt);
  final String afterApostrophe = word.substring(apostropheAt + 1);
  final StemAndEnding parts = new StemAndEnding(beforeApostrophe, afterApostrophe);
  final String expectedStem = TurkishAlphabet.INSTANCE.normalize(parts.stem);

  final List<SingleAnalysis> candidates = analyzer.analyze(word.replace("'", ""));
  if (candidates.isEmpty()) {
    return Collections.emptyList();
  }

  // words like "Hastanesi'ne". Should we accept Hastanesi or Hastane?
  return candidates.stream()
      .filter(candidate -> candidate.getDictionaryItem().primaryPos == PrimaryPos.Noun)
      .filter(candidate -> candidate.containsMorpheme(TurkishMorphotactics.p3sg)
          || candidate.getStem().equals(expectedStem))
      .collect(Collectors.toList());
}
use of zemberek.core.turkish.StemAndEnding in project zemberek-nlp by ahmetaa.
the class UnidentifiedTokenAnalyzer method getForRomanNumeral.
/**
 * Attempts to analyze the text of a token as a Roman numeral, optionally followed by an
 * apostrophe and an ending.
 *
 * <p>The text is split at the first apostrophe into a stem and an ending. A trailing dot on
 * the stem is dropped before conversion with {@code TurkishNumbers.romanToDecimal}. The decimal
 * value is mapped to a lemma through {@code numeralEndingMachine}; when the stem ends with a
 * dot, the lemma is additionally mapped through {@code ordinalMap}. Lemma + ending is then
 * analyzed, and each analysis whose dictionary item is a Numeral is copied onto a runtime
 * dictionary item built from the stem.
 *
 * @param token the token whose text is analyzed
 * @return the analyses found; empty when the stem does not convert to a decimal value
 *     ({@code romanToDecimal} returns -1), when no lemma could be determined, or when no
 *     Numeral analysis exists
 */
private List<SingleAnalysis> getForRomanNumeral(Token token) {
  String content = token.getText();

  // Split at the first apostrophe with a single scan of the text.
  StemAndEnding se;
  int apostropheIndex = content.indexOf('\'');
  if (apostropheIndex >= 0) {
    se = new StemAndEnding(
        content.substring(0, apostropheIndex), content.substring(apostropheIndex + 1));
  } else {
    se = new StemAndEnding(content, "");
  }

  boolean endsWithDot = se.stem.endsWith(".");
  String ss = endsWithDot ? se.stem.substring(0, se.stem.length() - 1) : se.stem;
  int decimal = TurkishNumbers.romanToDecimal(ss);
  if (decimal == -1) {
    return new ArrayList<>(0);
  }

  String lemma = numeralEndingMachine.find(String.valueOf(decimal));
  if (endsWithDot) {
    lemma = ordinalMap.get(lemma);
  }

  List<SingleAnalysis> results = Lists.newArrayListWithCapacity(1);
  if (lemma == null) {
    // Without a lemma there is nothing meaningful to analyze. Continuing would either throw a
    // NullPointerException below or analyze the literal text "null" + ending.
    return results;
  }

  // "dört" is written as "dörd" when the ending starts with a vowel.
  String toParse;
  if (se.ending.length() > 0 && lemma.equals("dört") && ALPHABET.isVowel(se.ending.charAt(0))) {
    toParse = "dörd" + se.ending;
  } else {
    toParse = lemma + se.ending;
  }

  List<SingleAnalysis> res = analyzer.analyze(toParse);
  for (SingleAnalysis re : res) {
    if (re.getDictionaryItem().primaryPos != PrimaryPos.Numeral) {
      continue;
    }
    DictionaryItem runTimeItem = new DictionaryItem(
        se.stem, se.stem, content + lemma, PrimaryPos.Numeral, SecondaryPos.RomanNumeral);
    runTimeItem.attributes.add(RootAttribute.Runtime);
    results.add(re.copyFor(runTimeItem, se.stem));
  }
  return results;
}
Aggregations