Search in sources:

Example 1 with StemAndEnding

use of zemberek.core.turkish.StemAndEnding in project zemberek-nlp by ahmetaa.

the class UnidentifiedTokenAnalyzer method tryNumeral.

/**
 * Tries to analyze a token as a numeral.
 *
 * <p>The token text is lower-cased with {@code TurkishAlphabet.TR} and split into a stem and
 * an ending by {@code getFromNumeral}. A lemma is resolved for the stem through
 * {@code numeralEndingMachine}; when the stem ends with a dot the resolved value is further
 * mapped through {@code ordinalMap}. For every {@code Numerals} pattern that matches the stem,
 * {@code lemma + ending} is analyzed and each analysis whose dictionary item is a
 * {@code PrimaryPos.Numeral} is copied onto a runtime dictionary item built from the stem.
 *
 * @param token the token whose text is inspected
 * @return the numeral analyses found; an empty list when nothing matches or when no lemma
 *     could be resolved for the stem
 */
private List<SingleAnalysis> tryNumeral(Token token) {
    String s = token.getText();
    s = s.toLowerCase(TurkishAlphabet.TR);
    StemAndEnding se = getFromNumeral(s);
    String lemma;
    if (se.stem.endsWith(".")) {
        // Trailing dot: resolve the lemma for the part before the dot, then map it
        // through ordinalMap (presumably to its ordinal form).
        String ss = se.stem.substring(0, se.stem.length() - 1);
        lemma = ordinalMap.get(numeralEndingMachine.find(ss));
    } else {
        lemma = numeralEndingMachine.find(se.stem);
    }
    List<SingleAnalysis> results = Lists.newArrayListWithCapacity(1);
    if (lemma == null) {
        // No lemma available (e.g. ordinalMap has no entry). Without this guard the code
        // below would either throw a NullPointerException or analyze the literal "null".
        return results;
    }
    for (Numerals numerals : Numerals.values()) {
        Matcher m = numerals.pattern.matcher(se.stem);
        if (!m.find()) {
            continue;
        }
        String toParse;
        if (se.ending.length() > 0 && lemma.equals("dört") && ALPHABET.isVowel(se.ending.charAt(0))) {
            // "dört" followed by a vowel-initial ending is parsed with the stem "dörd".
            toParse = "dörd" + se.ending;
        } else {
            toParse = lemma + se.ending;
        }
        List<SingleAnalysis> res = analyzer.analyze(toParse);
        for (SingleAnalysis re : res) {
            if (re.getDictionaryItem().primaryPos != PrimaryPos.Numeral) {
                continue;
            }
            DictionaryItem runTimeItem = new DictionaryItem(se.stem, se.stem, s + lemma, PrimaryPos.Numeral, numerals.secondaryPos);
            runTimeItem.attributes.add(RootAttribute.Runtime);
            results.add(re.copyFor(runTimeItem, se.stem));
        }
    }
    return results;
}
Also used : StemAndEnding(zemberek.core.turkish.StemAndEnding) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) Matcher(java.util.regex.Matcher)

Example 2 with StemAndEnding

use of zemberek.core.turkish.StemAndEnding in project zemberek-nlp by ahmetaa.

the class UnidentifiedTokenAnalyzer method tryWordWithApostrophe.

/**
 * Tries to analyze a word that contains an apostrophe separating a stem from its ending.
 *
 * <p>The word's apostrophes are normalized, then the text is split at the first apostrophe.
 * The stem (normalized, with dots removed) is used to build a noun dictionary item with a
 * guessed pronunciation. If the guessed pronunciation contains no vowel, a single dummy
 * analysis is returned. Otherwise, when the lexicon does not already contain the item, it is
 * registered in the analyzer's stem transitions only for the duration of the analysis, and
 * {@code stem + ending} is analyzed. Only analyses whose stem equals the normalized stem
 * are kept.
 *
 * @param word the surface form, expected to contain an apostrophe
 * @param secondaryPos secondary part of speech assigned to the generated dictionary item
 * @return matching analyses; an empty list when the apostrophe is missing, leading or trailing
 */
private List<SingleAnalysis> tryWordWithApostrophe(String word, SecondaryPos secondaryPos) {
    String normalized = TurkishAlphabet.INSTANCE.normalizeApostrophe(word);
    int index = normalized.indexOf('\'');
    if (index <= 0 || index == normalized.length() - 1) {
        // No apostrophe, or nothing on one side of it.
        return Collections.emptyList();
    }
    String stem = normalized.substring(0, index);
    String ending = normalized.substring(index + 1);
    StemAndEnding se = new StemAndEnding(stem, ending);
    // TODO: should we remove dots with normalization?
    // Literal replace; equivalent to the regex "[.]" but avoids compiling a pattern.
    String stemNormalized = TurkishAlphabet.INSTANCE.normalize(se.stem).replace(".", "");
    String endingNormalized = TurkishAlphabet.INSTANCE.normalize(se.ending);
    String pronunciation = guessPronunciation(stemNormalized);
    boolean capitalize = secondaryPos == SecondaryPos.ProperNoun || secondaryPos == SecondaryPos.Abbreviation;
    boolean pronunciationPossible = alphabet.containsVowel(pronunciation);
    DictionaryItem item = new DictionaryItem(capitalize ? Turkish.capitalize(normalized) : (pronunciationPossible ? stem : word), stemNormalized, pronunciation, PrimaryPos.Noun, secondaryPos);
    if (!pronunciationPossible) {
        List<SingleAnalysis> result = new ArrayList<>(1);
        result.add(SingleAnalysis.dummy(word, item));
        return result;
    }
    boolean itemDoesNotExist = !lexicon.containsItem(item);
    if (itemDoesNotExist) {
        item.attributes.add(RootAttribute.Runtime);
        analyzer.getStemTransitions().addDictionaryItem(item);
    }
    String toParse = stemNormalized + endingNormalized;
    List<SingleAnalysis> noQuotesParses;
    try {
        noQuotesParses = analyzer.analyze(toParse);
    } finally {
        // Always undo the temporary registration, even when analyze() throws, so the
        // runtime item does not stay behind in the stem transitions.
        if (itemDoesNotExist) {
            analyzer.getStemTransitions().removeDictionaryItem(item);
        }
    }
    return noQuotesParses.stream()
            .filter(noQuotesParse -> noQuotesParse.getStem().equals(stemNormalized))
            .collect(Collectors.toList());
}
Also used : StemAndEnding(zemberek.core.turkish.StemAndEnding) StemAndEnding(zemberek.core.turkish.StemAndEnding) RootAttribute(zemberek.core.turkish.RootAttribute) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) Turkish(zemberek.core.turkish.Turkish) List(java.util.List) Token(zemberek.tokenization.Token) Lists(com.google.common.collect.Lists) Matcher(java.util.regex.Matcher) PronunciationGuesser(zemberek.morphology.analysis.tr.PronunciationGuesser) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) TurkishNumbers(zemberek.morphology.analysis.tr.TurkishNumbers) Map(java.util.Map) PrimaryPos(zemberek.core.turkish.PrimaryPos) Pattern(java.util.regex.Pattern) RootLexicon(zemberek.morphology.lexicon.RootLexicon) Collections(java.util.Collections) SecondaryPos(zemberek.core.turkish.SecondaryPos) TurkishNumeralEndingMachine(zemberek.morphology.analysis.tr.TurkishNumeralEndingMachine) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) ArrayList(java.util.ArrayList)

Example 3 with StemAndEnding

use of zemberek.core.turkish.StemAndEnding in project zemberek-nlp by ahmetaa.

the class TurkishMorphology method analyzeWordsWithApostrophe.

/**
 * Analyzes a word that contains an apostrophe by analyzing it with the apostrophes removed
 * and keeping only the noun analyses that either contain the {@code p3sg} morpheme or whose
 * stem equals the normalized text before the first apostrophe.
 *
 * @param word the surface form to analyze
 * @return the accepted analyses; an empty list when the apostrophe is missing, leading or
 *     trailing, or when the apostrophe-free form yields no analyses
 */
public List<SingleAnalysis> analyzeWordsWithApostrophe(String word) {
    final int apostropheAt = word.indexOf('\'');
    final boolean missingOrLeading = apostropheAt <= 0;
    if (missingOrLeading || apostropheAt == word.length() - 1) {
        return Collections.emptyList();
    }

    StemAndEnding parts = new StemAndEnding(
            word.substring(0, apostropheAt),
            word.substring(apostropheAt + 1));
    final String expectedStem = TurkishAlphabet.INSTANCE.normalize(parts.stem);

    List<SingleAnalysis> candidates = analyzer.analyze(word.replace("'", ""));
    if (candidates.isEmpty()) {
        return Collections.emptyList();
    }

    // words like "Hastanesi'ne". Should we accept Hastanesi or Hastane?
    return candidates.stream()
            .filter(candidate -> {
                if (candidate.getDictionaryItem().primaryPos != PrimaryPos.Noun) {
                    return false;
                }
                return candidate.containsMorpheme(TurkishMorphotactics.p3sg)
                        || candidate.getStem().equals(expectedStem);
            })
            .collect(Collectors.toList());
}
Also used : StemAndEnding(zemberek.core.turkish.StemAndEnding) AmbiguityResolver(zemberek.morphology.ambiguity.AmbiguityResolver) TurkishMorphotactics(zemberek.morphology.morphotactics.TurkishMorphotactics) StemAndEnding(zemberek.core.turkish.StemAndEnding) TextUtil(zemberek.core.text.TextUtil) Stopwatch(com.google.common.base.Stopwatch) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) ArrayList(java.util.ArrayList) Turkish(zemberek.core.turkish.Turkish) Token(zemberek.tokenization.Token) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) PrimaryPos(zemberek.core.turkish.PrimaryPos) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) AnalysisCache(zemberek.morphology.analysis.AnalysisCache) Log(zemberek.core.logging.Log) InformalTurkishMorphotactics(zemberek.morphology.morphotactics.InformalTurkishMorphotactics) RuleBasedAnalyzer(zemberek.morphology.analysis.RuleBasedAnalyzer) WordGenerator(zemberek.morphology.generator.WordGenerator) IOException(java.io.IOException) PerceptronAmbiguityResolver(zemberek.morphology.ambiguity.PerceptronAmbiguityResolver) Collectors(java.util.stream.Collectors) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) RootLexicon(zemberek.morphology.lexicon.RootLexicon) Collections(java.util.Collections) UnidentifiedTokenAnalyzer(zemberek.morphology.analysis.UnidentifiedTokenAnalyzer) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis)

Example 4 with StemAndEnding

use of zemberek.core.turkish.StemAndEnding in project zemberek-nlp by ahmetaa.

the class UnidentifiedTokenAnalyzer method getForRomanNumeral.

/**
 * Tries to analyze a token as a Roman numeral, optionally followed by an apostrophe and an
 * ending.
 *
 * <p>The text is split at the first apostrophe (if any). A trailing dot on the stem is
 * dropped before the stem is converted with {@code TurkishNumbers.romanToDecimal}. The decimal
 * value is turned into a lemma through {@code numeralEndingMachine}; when the stem ended with
 * a dot the lemma is additionally mapped through {@code ordinalMap}. {@code lemma + ending}
 * is then analyzed and every {@code PrimaryPos.Numeral} analysis is copied onto a runtime
 * dictionary item with {@code SecondaryPos.RomanNumeral}.
 *
 * @param token the token whose text is inspected
 * @return the numeral analyses found; an empty list when the stem is not a Roman numeral
 *     ({@code romanToDecimal} returned -1) or when no lemma could be resolved
 */
private List<SingleAnalysis> getForRomanNumeral(Token token) {
    String content = token.getText();
    // Single scan for the apostrophe instead of contains() followed by indexOf().
    int apostropheIndex = content.indexOf('\'');
    StemAndEnding se;
    if (apostropheIndex >= 0) {
        se = new StemAndEnding(content.substring(0, apostropheIndex), content.substring(apostropheIndex + 1));
    } else {
        se = new StemAndEnding(content, "");
    }
    boolean endsWithDot = se.stem.endsWith(".");
    String ss = endsWithDot ? se.stem.substring(0, se.stem.length() - 1) : se.stem;
    int decimal = TurkishNumbers.romanToDecimal(ss);
    if (decimal == -1) {
        return new ArrayList<>(0);
    }
    String lemma = numeralEndingMachine.find(String.valueOf(decimal));
    if (endsWithDot) {
        // Trailing dot: map the lemma through ordinalMap (presumably to its ordinal form).
        lemma = ordinalMap.get(lemma);
    }
    if (lemma == null) {
        // No lemma available (e.g. ordinalMap has no entry). Without this guard the code
        // below would either throw a NullPointerException or analyze the literal "null".
        return new ArrayList<>(0);
    }
    List<SingleAnalysis> results = Lists.newArrayListWithCapacity(1);
    String toParse;
    if (se.ending.length() > 0 && lemma.equals("dört") && ALPHABET.isVowel(se.ending.charAt(0))) {
        // "dört" followed by a vowel-initial ending is parsed with the stem "dörd".
        toParse = "dörd" + se.ending;
    } else {
        toParse = lemma + se.ending;
    }
    List<SingleAnalysis> res = analyzer.analyze(toParse);
    for (SingleAnalysis re : res) {
        if (re.getDictionaryItem().primaryPos != PrimaryPos.Numeral) {
            continue;
        }
        DictionaryItem runTimeItem = new DictionaryItem(se.stem, se.stem, content + lemma, PrimaryPos.Numeral, SecondaryPos.RomanNumeral);
        runTimeItem.attributes.add(RootAttribute.Runtime);
        results.add(re.copyFor(runTimeItem, se.stem));
    }
    return results;
}
Also used : StemAndEnding(zemberek.core.turkish.StemAndEnding) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) ArrayList(java.util.ArrayList)

Aggregations

StemAndEnding (zemberek.core.turkish.StemAndEnding): 4, ArrayList (java.util.ArrayList): 3, DictionaryItem (zemberek.morphology.lexicon.DictionaryItem): 3, Collections (java.util.Collections): 2, List (java.util.List): 2, Matcher (java.util.regex.Matcher): 2, Collectors (java.util.stream.Collectors): 2, PrimaryPos (zemberek.core.turkish.PrimaryPos): 2, Turkish (zemberek.core.turkish.Turkish): 2, TurkishAlphabet (zemberek.core.turkish.TurkishAlphabet): 2, RootLexicon (zemberek.morphology.lexicon.RootLexicon): 2, Token (zemberek.tokenization.Token): 2, Stopwatch (com.google.common.base.Stopwatch): 1, Lists (com.google.common.collect.Lists): 1, IOException (java.io.IOException): 1, Map (java.util.Map): 1, TimeUnit (java.util.concurrent.TimeUnit): 1, Pattern (java.util.regex.Pattern): 1, Log (zemberek.core.logging.Log): 1, TextUtil (zemberek.core.text.TextUtil): 1