Use of zemberek.morphology.structure.StemAndEnding in the project zemberek-nlp by ahmetaa.
Class TurkishMorphology, method analyzeWordsWithApostrophe.
/**
 * Analyzes a word containing an apostrophe by analyzing the word with every apostrophe
 * removed and keeping only those analyses for which {@code getStems()} contains the
 * normalized text that precedes the first apostrophe.
 *
 * @param word the surface form to analyze
 * @return the retained analyses; an empty list when the word has no apostrophe, when the
 *     first apostrophe is the first or the last character, or when the apostrophe-free
 *     form produces no analysis
 */
private List<WordAnalysis> analyzeWordsWithApostrophe(String word) {
  final int apostropheAt = word.indexOf('\'');
  // One guard covers all rejected shapes: no apostrophe (-1), leading (0) and trailing.
  if (apostropheAt <= 0 || apostropheAt == word.length() - 1) {
    return Collections.emptyList();
  }

  StemAndEnding parts =
      new StemAndEnding(word.substring(0, apostropheAt), word.substring(apostropheAt + 1));
  String expectedStem = TurkishAlphabet.INSTANCE.normalize(parts.stem);

  // Literal replacement; the apostrophe has no special meaning as a regex either.
  String apostropheFree = word.replace("'", "");
  List<WordAnalysis> candidates = wordAnalyzer.analyze(apostropheFree);
  if (candidates.isEmpty()) {
    return Collections.emptyList();
  }

  return candidates.stream()
      .filter(candidate -> candidate.getStems().contains(expectedStem))
      .collect(Collectors.toList());
}
Use of zemberek.morphology.structure.StemAndEnding in the project zemberek-nlp by ahmetaa.
Class UnidentifiedTokenAnalyzer, method analyze.
/**
 * Attempts to analyze a token by treating it as a numeral or as a proper noun that is
 * registered in the graph only for the duration of the call.
 *
 * <ul>
 *   <li>A token containing {@code '?'} yields an empty list.</li>
 *   <li>A token containing any ASCII digit is delegated to {@link #parseNumeral(String)}.</li>
 *   <li>A token with an apostrophe that is neither its first nor its last character is split
 *       at the first apostrophe; the normalized left part is registered as a runtime proper
 *       noun and the normalized left + right parts are parsed.</li>
 *   <li>A token starting with an upper case character is normalized, registered as a runtime
 *       proper noun and parsed as a whole.</li>
 *   <li>Anything else, including the empty string, yields an empty list.</li>
 * </ul>
 *
 * @param word the token to analyze
 * @return the analyses found, possibly empty
 */
public synchronized List<WordAnalysis> analyze(String word) {
  if (word.contains("?")) {
    return Collections.emptyList();
  }
  if (!Strings.containsNone(word, "0123456789")) {
    return parseNumeral(word);
  }
  int index = word.indexOf('\'');
  if (index >= 0) {
    if (index == 0 || index == word.length() - 1) {
      return Collections.emptyList();
    }
    StemAndEnding se = new StemAndEnding(word.substring(0, index), word.substring(index + 1));
    String stem = TurkishAlphabet.INSTANCE.normalize(se.stem);
    String ending = TurkishAlphabet.INSTANCE.normalize(se.ending);
    return analyzeAsRuntimeProperNoun(stem, stem + ending);
  }
  // The emptiness check prevents charAt(0) from throwing on an empty token.
  if (!word.isEmpty() && Character.isUpperCase(word.charAt(0))) {
    String normalized = TurkishAlphabet.INSTANCE.normalize(word);
    return analyzeAsRuntimeProperNoun(normalized, normalized);
  }
  return Collections.emptyList();
}

/**
 * Registers {@code stem} in the graph as a runtime proper noun, parses {@code toParse} and
 * removes the temporary dictionary item again, even when parsing throws.
 *
 * @param stem normalized stem used as root of the temporary dictionary item
 * @param toParse the text handed to the parser while the item is registered
 * @return whatever the parser returns for {@code toParse}
 */
private List<WordAnalysis> analyzeAsRuntimeProperNoun(String stem, String toParse) {
  String pronunciation = guessPronunciation(stem);
  DictionaryItem itemProp = new DictionaryItem(
      Turkish.capitalize(stem), stem, pronunciation, PrimaryPos.Noun, SecondaryPos.ProperNoun);
  itemProp.attributes.add(RootAttribute.Runtime);
  graph.addDictionaryItem(itemProp);
  try {
    return parser.analyze(toParse);
  } finally {
    // Always undo the temporary registration so a failed parse cannot leave the item behind.
    graph.removeDictionaryItem(itemProp);
  }
}
Use of zemberek.morphology.structure.StemAndEnding in the project zemberek-nlp by ahmetaa.
Class UnidentifiedTokenAnalyzer, method parseNumeral.
/**
 * Analyzes a token that contains digits.
 *
 * <p>The token is split by {@code getFromNumeral} into a stem and an ending. A lemma is
 * looked up for the stem (through {@code ordinalMap} when the stem ends with a dot), the
 * lemma plus ending is analyzed, and every analysis whose dictionary item is a numeral is
 * re-rooted on the original stem. This is done once for each
 * {@code TurkishDictionaryLoader.Digit} whose pattern is found in the stem.
 *
 * @param s the token to analyze
 * @return the re-rooted numeral analyses; empty when no lemma is available, no digit
 *     pattern matches, or no numeral analysis is produced
 */
public List<WordAnalysis> parseNumeral(String s) {
  StemAndEnding se = getFromNumeral(s);
  String lemma;
  if (se.stem.endsWith(".")) {
    // Stem ends with a dot: look the lemma of the dot-less stem up in the ordinal map.
    String ss = se.stem.substring(0, se.stem.length() - 1);
    lemma = numeralEndingMachine.find(ss);
    lemma = ordinalMap.get(lemma);
  } else {
    lemma = numeralEndingMachine.find(se.stem);
  }
  List<WordAnalysis> results = Lists.newArrayListWithCapacity(1);
  // NOTE(review): presumably the ordinal lookup yields null for an unknown lemma — confirm.
  // Without this guard a null lemma either threw a NullPointerException in the comparison
  // below or caused the literal text "null" to be parsed.
  if (lemma == null) {
    return results;
  }

  // Does not depend on the digit, so it is computed once. "dört" becomes "dörd" when the
  // ending starts with a vowel.
  String toParse;
  if (se.ending.length() > 0
      && lemma.equals("dört")
      && TurkishAlphabet.INSTANCE.isVowel(se.ending.charAt(0))) {
    toParse = "dörd" + se.ending;
  } else {
    toParse = lemma + se.ending;
  }

  for (TurkishDictionaryLoader.Digit digit : TurkishDictionaryLoader.Digit.values()) {
    if (!digit.pattern.matcher(se.stem).find()) {
      continue;
    }
    // Analyzed anew for every matching digit because the returned analyses are mutated below.
    List<WordAnalysis> res = turkishParser.getWordAnalyzer().analyze(toParse);
    for (WordAnalysis re : res) {
      if (re.dictionaryItem.primaryPos != PrimaryPos.Numeral) {
        continue;
      }
      // The third constructor argument is built as s + lemma; kept exactly as before.
      re.dictionaryItem =
          new DictionaryItem(se.stem, se.stem, s + lemma, PrimaryPos.Numeral, digit.secondaryPos);
      re.dictionaryItem.attributes.add(RootAttribute.Runtime);
      re.root = se.stem;
      results.add(re);
    }
  }
  return results;
}
Aggregations