Search in sources:

Example 1 with StemAndEnding

use of zemberek.morphology.structure.StemAndEnding in project zemberek-nlp by ahmetaa.

Method analyzeWordsWithApostrophe of the class TurkishMorphology.

/**
 * Analyzes a word that contains an apostrophe.
 *
 * <p>The word is split at its first apostrophe; the part before it is normalized and used as
 * the expected stem. The word is then analyzed with every apostrophe removed, and only the
 * analyses whose stem list contains that expected stem are kept.
 *
 * @param word the surface form to analyze
 * @return the matching analyses, or an empty list when the word has no apostrophe, when its
 *     first apostrophe is the first or last character, or when the apostrophe-free form
 *     yields no analyses
 */
private List<WordAnalysis> analyzeWordsWithApostrophe(String word) {
    int apostropheAt = word.indexOf('\'');
    // Covers "not found" (-1), a leading apostrophe (0) and a trailing apostrophe.
    if (apostropheAt <= 0 || apostropheAt == word.length() - 1) {
        return Collections.emptyList();
    }

    StemAndEnding parts = new StemAndEnding(word.substring(0, apostropheAt), word.substring(apostropheAt + 1));
    String expectedStem = TurkishAlphabet.INSTANCE.normalize(parts.stem);

    List<WordAnalysis> candidates = wordAnalyzer.analyze(word.replaceAll("'", ""));
    if (candidates.isEmpty()) {
        return Collections.emptyList();
    }

    // Keep only the analyses that agree with the stem written before the apostrophe.
    return candidates.stream()
            .filter(candidate -> candidate.getStems().contains(expectedStem))
            .collect(Collectors.toList());
}
Also used : StemAndEnding(zemberek.morphology.structure.StemAndEnding) Arrays(java.util.Arrays) LoadingCache(com.google.common.cache.LoadingCache) Serializer(zemberek.morphology.lexicon.Serializer) Stopwatch(com.google.common.base.Stopwatch) SuffixProvider(zemberek.morphology.lexicon.SuffixProvider) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) ArrayList(java.util.ArrayList) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) Lists(com.google.common.collect.Lists) SimpleGenerator(zemberek.morphology.generator.SimpleGenerator) TurkishSuffixes(zemberek.morphology.lexicon.tr.TurkishSuffixes) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) Charsets(com.google.common.base.Charsets) DynamicLexiconGraph(zemberek.morphology.lexicon.graph.DynamicLexiconGraph) WordAnalyzer(zemberek.morphology.analysis.WordAnalyzer) Resources(com.google.common.io.Resources) Files(java.nio.file.Files) Collection(java.util.Collection) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) File(java.io.File) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) CacheLoader(com.google.common.cache.CacheLoader) TimeUnit(java.util.concurrent.TimeUnit) StemAndEnding(zemberek.morphology.structure.StemAndEnding) List(java.util.List) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) Preconditions(com.google.common.base.Preconditions) CacheBuilder(com.google.common.cache.CacheBuilder) RootLexicon(zemberek.morphology.lexicon.RootLexicon) Collections(java.util.Collections) CacheStats(com.google.common.cache.CacheStats) InputStream(java.io.InputStream) WordAnalysis(zemberek.morphology.analysis.WordAnalysis)

Example 2 with StemAndEnding

use of zemberek.morphology.structure.StemAndEnding in project zemberek-nlp by ahmetaa.

Method analyze of the class UnidentifiedTokenAnalyzer.

/**
 * Attempts to analyze a token by treating it as a numeral or as a proper noun created at
 * runtime.
 *
 * <ul>
 *   <li>An empty token, or a token containing {@code '?'}, yields an empty list.</li>
 *   <li>A token containing any digit is delegated to {@link #parseNumeral(String)}.</li>
 *   <li>A token containing an apostrophe is split at the first apostrophe; the normalized
 *       part before it is registered as a temporary proper noun and the normalized
 *       stem + ending is analyzed. A leading or trailing apostrophe yields an empty list.</li>
 *   <li>A token starting with an upper-case character is normalized, registered as a
 *       temporary proper noun and analyzed as a whole.</li>
 *   <li>Anything else yields an empty list.</li>
 * </ul>
 *
 * <p>The method is {@code synchronized}; while it runs it temporarily adds a dictionary item
 * to {@code graph} and removes it again before returning.
 *
 * @param word the token to analyze; must not be {@code null}
 * @return the analyses found, possibly empty
 */
public synchronized List<WordAnalysis> analyze(String word) {
    // The empty check prevents charAt(0) below from throwing on an empty token.
    if (word.isEmpty() || word.contains("?")) {
        return Collections.emptyList();
    }
    if (!Strings.containsNone(word, "0123456789")) {
        return parseNumeral(word);
    }
    int index = word.indexOf('\'');
    if (index >= 0) {
        if (index == 0 || index == word.length() - 1) {
            return Collections.emptyList();
        }
        StemAndEnding se = new StemAndEnding(word.substring(0, index), word.substring(index + 1));
        String stem = TurkishAlphabet.INSTANCE.normalize(se.stem);
        String ending = TurkishAlphabet.INSTANCE.normalize(se.ending);
        return analyzeWithRuntimeProperNoun(stem, stem + ending);
    }
    if (Character.isUpperCase(word.charAt(0))) {
        String normalized = TurkishAlphabet.INSTANCE.normalize(word);
        return analyzeWithRuntimeProperNoun(normalized, normalized);
    }
    return Collections.emptyList();
}

/**
 * Registers {@code root} in {@code graph} as a runtime proper noun, analyzes {@code toParse}
 * and removes the temporary item again, even when the analysis throws.
 *
 * @param root the normalized root used to build the temporary dictionary item
 * @param toParse the normalized surface form handed to the parser
 * @return the analyses produced by the parser
 */
private List<WordAnalysis> analyzeWithRuntimeProperNoun(String root, String toParse) {
    String pronunciation = guessPronunciation(root);
    DictionaryItem itemProp = new DictionaryItem(Turkish.capitalize(root), root, pronunciation, PrimaryPos.Noun, SecondaryPos.ProperNoun);
    itemProp.attributes.add(RootAttribute.Runtime);
    graph.addDictionaryItem(itemProp);
    try {
        return parser.analyze(toParse);
    } finally {
        // Always undo the temporary graph change so a failed analysis cannot leave the item behind.
        graph.removeDictionaryItem(itemProp);
    }
}
Also used : StemAndEnding(zemberek.morphology.structure.StemAndEnding) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) List(java.util.List)

Example 3 with StemAndEnding

use of zemberek.morphology.structure.StemAndEnding in project zemberek-nlp by ahmetaa.

Method parseNumeral of the class UnidentifiedTokenAnalyzer.

/**
 * Analyzes a token that contains digits.
 *
 * <p>The token is split into a numeric stem and an ending via {@code getFromNumeral}. A lemma
 * is looked up for the stem through {@code numeralEndingMachine}; when the stem ends with
 * {@code '.'} the dot is dropped before the lookup and the result is mapped through
 * {@code ordinalMap}. For every {@code Digit} whose pattern matches the stem, lemma + ending
 * is analyzed, and each analysis whose primary POS is {@code Numeral} is rewritten to carry
 * a runtime dictionary item built from the numeric stem.
 *
 * @param s the token to analyze
 * @return a new mutable list with the rewritten numeral analyses; empty when no lemma could
 *     be determined or nothing matched
 */
public List<WordAnalysis> parseNumeral(String s) {
    StemAndEnding se = getFromNumeral(s);
    String lemma;
    if (se.stem.endsWith(".")) {
        // Trailing dot: look up the bare number, then map the result through ordinalMap.
        String ss = se.stem.substring(0, se.stem.length() - 1);
        lemma = numeralEndingMachine.find(ss);
        lemma = ordinalMap.get(lemma);
    } else {
        lemma = numeralEndingMachine.find(se.stem);
    }
    List<WordAnalysis> results = Lists.newArrayListWithCapacity(1);
    if (lemma == null) {
        // NOTE(review): presumably ordinalMap.get returns null for an unknown key - confirm.
        // Without this guard the code below would either throw a NullPointerException or
        // analyze the literal text "null" + ending.
        return results;
    }
    for (TurkishDictionaryLoader.Digit digit : TurkishDictionaryLoader.Digit.values()) {
        Matcher m = digit.pattern.matcher(se.stem);
        if (!m.find()) {
            continue;
        }
        String toParse;
        // "dört" is parsed as "dörd" when the ending starts with a vowel.
        if (se.ending.length() > 0 && "dört".equals(lemma) && TurkishAlphabet.INSTANCE.isVowel(se.ending.charAt(0))) {
            toParse = "dörd" + se.ending;
        } else {
            toParse = lemma + se.ending;
        }
        List<WordAnalysis> res = turkishParser.getWordAnalyzer().analyze(toParse);
        for (WordAnalysis re : res) {
            if (re.dictionaryItem.primaryPos != PrimaryPos.Numeral) {
                continue;
            }
            // Replace the lexical item with a runtime item rooted at the numeric stem.
            re.dictionaryItem = new DictionaryItem(se.stem, se.stem, s + lemma, PrimaryPos.Numeral, digit.secondaryPos);
            re.dictionaryItem.attributes.add(RootAttribute.Runtime);
            re.root = se.stem;
            results.add(re);
        }
    }
    return results;
}
Also used : StemAndEnding(zemberek.morphology.structure.StemAndEnding) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Matcher(java.util.regex.Matcher)

Aggregations

WordAnalysis (zemberek.morphology.analysis.WordAnalysis) 3, DictionaryItem (zemberek.morphology.lexicon.DictionaryItem) 3, StemAndEnding (zemberek.morphology.structure.StemAndEnding) 3, List (java.util.List) 2, TurkishDictionaryLoader (zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) 2, Charsets (com.google.common.base.Charsets) 1, Preconditions (com.google.common.base.Preconditions) 1, Stopwatch (com.google.common.base.Stopwatch) 1, CacheBuilder (com.google.common.cache.CacheBuilder) 1, CacheLoader (com.google.common.cache.CacheLoader) 1, CacheStats (com.google.common.cache.CacheStats) 1, LoadingCache (com.google.common.cache.LoadingCache) 1, Lists (com.google.common.collect.Lists) 1, Resources (com.google.common.io.Resources) 1, File (java.io.File) 1, IOException (java.io.IOException) 1, InputStream (java.io.InputStream) 1, Files (java.nio.file.Files) 1, Path (java.nio.file.Path) 1, ArrayList (java.util.ArrayList) 1