use of zemberek.core.turkish.SecondaryPos in project zemberek-nlp by ahmetaa.
the class _MorphologicalAmbiguityResolverExperiment method collect.
/**
 * Collects the sentences of a file in which every token could be analyzed within the given
 * ambiguity limit.
 *
 * <p>Each sentence returned by {@code getSentences(p)} is cleaned up (every whitespace run or
 * no-break space becomes one space, soft hyphens are dropped, the ellipsis character becomes
 * three dots), tokenized and analyzed token by token. Analyses are memoized in {@code cache}.
 * A sentence is abandoned at the first token that has no analysis, has more than
 * {@code maxAnalysisCount} analyses, or has no analysis left once proper-noun readings of a
 * token starting with a lower case letter are removed.
 *
 * <p>Side effects: words without any analysis that contain none of the characters
 * {@code 0123456789-.} are added to {@code failedWords}; progress is logged every 2000
 * sentences.
 *
 * @param p file handed to {@code getSentences}
 * @param maxAnalysisCount largest number of analyses a token may have for its sentence to be kept
 * @return the accepted sentences in input order, each with the retained analyses of its tokens
 * @throws IOException propagated from the calls made here (presumably from reading {@code p})
 */
private List<SingleAnalysisSentence> collect(Path p, int maxAnalysisCount) throws IOException {
  List<String> sentences = getSentences(p);
  TurkishMorphology analyzer = TurkishMorphology.createWithDefaults();
  List<SingleAnalysisSentence> accepted = new ArrayList<>();
  int seenTokens = 0;
  int seenSentences = 0;

  for (String raw : sentences) {
    String sentence = raw
        .replaceAll("\\s+|\\u00a0", " ")
        .replaceAll("[\\u00ad]", "")
        .replaceAll("[…]", "...");

    List<Single> words = new ArrayList<>();
    // Flipped to false as soon as one token disqualifies the whole sentence.
    boolean usable = true;

    for (Token token : TurkishTokenizer.DEFAULT.tokenize(sentence)) {
      seenTokens++;
      String rawWord = token.getText();
      char first = rawWord.charAt(0);

      // Keep a leading capital, otherwise lower-case with Turkish casing rules.
      String word;
      if (Character.isUpperCase(first)) {
        word = Turkish.capitalize(rawWord);
      } else {
        word = rawWord.toLowerCase(Turkish.LOCALE);
      }

      WordAnalysis analysis;
      if (!cache.containsKey(word)) {
        analysis = analyzer.analyze(word);
        cache.put(word, analysis);
      } else {
        analysis = cache.get(word);
      }

      int count = analysis.analysisCount();
      if (count == 0 && Strings.containsNone(word, "0123456789-.")) {
        failedWords.add(word);
      }
      if (count < 1 || count > maxAnalysisCount) {
        usable = false;
        break;
      }

      // A proper-noun reading is only plausible when the token was written with a capital.
      boolean startsLower = Character.isLowerCase(first);
      List<SingleAnalysis> kept = analysis.stream()
          .filter(a -> a.getDictionaryItem().secondaryPos != SecondaryPos.ProperNoun
              || !startsLower)
          .collect(Collectors.toList());
      if (kept.isEmpty()) {
        usable = false;
        break;
      }

      // The position of a word is its index among the words accepted so far.
      words.add(new Single(word, words.size(), analysis.copyFor(kept)));
    }

    if (usable) {
      accepted.add(new SingleAnalysisSentence(sentence, words));
    }

    seenSentences++;
    if (seenSentences % 2000 == 0) {
      Log.info("%d sentences %d tokens analyzed. %d found", seenSentences, seenTokens, accepted.size());
    }
  }
  return accepted;
}
use of zemberek.core.turkish.SecondaryPos in project zemberek-nlp by ahmetaa.
the class UnidentifiedTokenAnalyzer method tryWordWithApostrophe.
/**
 * Analyzes a word that contains an apostrophe by treating the part before the apostrophe as a
 * noun stem and the part after it as the ending.
 *
 * <p>If no dictionary item for the stem exists in {@code lexicon}, a temporary item marked
 * {@code RootAttribute.Runtime} is registered with the analyzer's stem transitions for the
 * duration of the analysis and removed again afterwards, even when the analysis throws.
 *
 * @param word surface form to analyze
 * @param secondaryPos secondary part of speech given to the generated dictionary item; for
 *     {@code ProperNoun} and {@code Abbreviation} the item's lemma is capitalized
 * @return an empty list when the normalized word has no apostrophe or the apostrophe is its
 *     first or last character; a single dummy analysis when the guessed pronunciation contains
 *     no vowel; otherwise the analyses of stem+ending whose stem equals the normalized stem
 */
private List<SingleAnalysis> tryWordWithApostrophe(String word, SecondaryPos secondaryPos) {
  String normalized = TurkishAlphabet.INSTANCE.normalizeApostrophe(word);
  int index = normalized.indexOf('\'');
  if (index <= 0 || index == normalized.length() - 1) {
    return Collections.emptyList();
  }

  String stem = normalized.substring(0, index);
  String ending = normalized.substring(index + 1);
  // NOTE(review): StemAndEnding is kept because its constructor may adjust the parts - confirm.
  StemAndEnding se = new StemAndEnding(stem, ending);

  // TODO: should we remove dots with normalization?
  // Literal replacement; equivalent to the regex "[.]" without compiling a pattern.
  String stemNormalized = TurkishAlphabet.INSTANCE.normalize(se.stem).replace(".", "");
  String endingNormalized = TurkishAlphabet.INSTANCE.normalize(se.ending);
  String pronunciation = guessPronunciation(stemNormalized);

  boolean capitalize =
      secondaryPos == SecondaryPos.ProperNoun || secondaryPos == SecondaryPos.Abbreviation;
  boolean pronunciationPossible = alphabet.containsVowel(pronunciation);

  DictionaryItem item = new DictionaryItem(
      capitalize ? Turkish.capitalize(normalized) : (pronunciationPossible ? stem : word),
      stemNormalized,
      pronunciation,
      PrimaryPos.Noun,
      secondaryPos);

  if (!pronunciationPossible) {
    // Without a vowel in the pronunciation only a placeholder analysis is produced.
    List<SingleAnalysis> result = new ArrayList<>(1);
    result.add(SingleAnalysis.dummy(word, item));
    return result;
  }

  boolean itemDoesNotExist = !lexicon.containsItem(item);
  if (itemDoesNotExist) {
    item.attributes.add(RootAttribute.Runtime);
    analyzer.getStemTransitions().addDictionaryItem(item);
  }

  String toParse = stemNormalized + endingNormalized;
  List<SingleAnalysis> noQuotesParses;
  try {
    noQuotesParses = analyzer.analyze(toParse);
  } finally {
    // Always undo the temporary registration so a failing analysis cannot leave the
    // runtime item behind in the analyzer's stem transitions.
    if (itemDoesNotExist) {
      analyzer.getStemTransitions().removeDictionaryItem(item);
    }
  }

  return noQuotesParses.stream()
      .filter(noQuotesParse -> noQuotesParse.getStem().equals(stemNormalized))
      .collect(Collectors.toList());
}
use of zemberek.core.turkish.SecondaryPos in project zemberek-nlp by ahmetaa.
the class UnidentifiedTokenAnalyzer method analyze.
/**
 * Analyzes a token according to the secondary part of speech guessed for it.
 *
 * <ul>
 *   <li>{@code None}: empty result if the text contains {@code '?'}; numeral analysis if it
 *       contains a digit; otherwise {@code analyzeWord} as an abbreviation (text contains a
 *       dot) or as a proper noun.</li>
 *   <li>{@code RomanNumeral}: delegated to {@code getForRomanNumeral}.</li>
 *   <li>{@code Date}, {@code Clock}: delegated to {@code tryNumeral}.</li>
 *   <li>{@code HashTag}, {@code Email}, {@code Url}, {@code Mention}: delegated to
 *       {@code analyzeWord} with the guessed type.</li>
 *   <li>Anything else: the word is analyzed with a dictionary item for it temporarily
 *       registered in the analyzer's stem transitions (unless {@code lexicon} already contains
 *       it). The registration is undone afterwards, even when the analysis throws.</li>
 * </ul>
 *
 * <p>The method is {@code synchronized}, presumably because it temporarily mutates the
 * analyzer's stem transitions.
 *
 * @param token token to analyze
 * @return the analyses found; may be empty
 */
public synchronized List<SingleAnalysis> analyze(Token token) {
  SecondaryPos sPos = guessSecondaryPosType(token);
  String word = token.getText();

  // TODO: for now, for regular words and numbers etc, use the analyze method.
  if (sPos == SecondaryPos.None) {
    if (word.contains("?")) {
      return Collections.emptyList();
    }
    if (alphabet.containsDigit(word)) {
      return tryNumeral(token);
    }
    return analyzeWord(
        word, word.contains(".") ? SecondaryPos.Abbreviation : SecondaryPos.ProperNoun);
  }

  if (sPos == SecondaryPos.RomanNumeral) {
    return getForRomanNumeral(token);
  }
  if (sPos == SecondaryPos.Date || sPos == SecondaryPos.Clock) {
    return tryNumeral(token);
  }
  // Checked before building the dictionary item: this path never uses it.
  if (sPos == SecondaryPos.HashTag
      || sPos == SecondaryPos.Email
      || sPos == SecondaryPos.Url
      || sPos == SecondaryPos.Mention) {
    return analyzeWord(word, sPos);
  }

  // TODO: consider returning analysis results without interfering with analyzer.
  String normalized = nonLettersPattern.matcher(word).replaceAll("");
  DictionaryItem item = new DictionaryItem(word, word, normalized, PrimaryPos.Noun, sPos);

  boolean itemDoesNotExist = !lexicon.containsItem(item);
  if (itemDoesNotExist) {
    item.attributes.add(RootAttribute.Runtime);
    analyzer.getStemTransitions().addDictionaryItem(item);
  }
  try {
    return analyzer.analyze(word);
  } finally {
    // Always undo the temporary registration so a failing analysis cannot leave the
    // runtime item behind in the analyzer's stem transitions.
    if (itemDoesNotExist) {
      analyzer.getStemTransitions().removeDictionaryItem(item);
    }
  }
}
Aggregations