Search in sources:

Example 61 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class TurkishSentenceAnalyzer method analyze.

/**
 * Builds a {@link SentenceAnalysis} holding the morphological parses of each token in a sentence.
 *
 * <p>The sentence is first passed through {@code preProcess}, then split on single spaces;
 * tokens are trimmed and empty tokens are dropped. Every remaining token is analyzed with
 * {@code turkishMorphology} and recorded together with its parses, in sentence order.
 *
 * @param sentence the raw sentence text
 * @return the per-token analyses of the sentence
 */
public SentenceAnalysis analyze(String sentence) {
    SentenceAnalysis result = new SentenceAnalysis();
    Iterable<String> tokens = Splitter.on(" ")
            .omitEmptyStrings()
            .trimResults()
            .split(preProcess(sentence));
    for (String token : tokens) {
        result.addParse(token, turkishMorphology.analyze(token));
    }
    return result;
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis)

Example 62 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class TurkishSentenceAnalyzer method bestParse.

/**
 * Analyzes and disambiguates a sentence, then picks the first parse of every token.
 *
 * <p>The sentence is analyzed with {@link #analyze(String)} and handed to {@code disambiguate};
 * afterwards the parse at index 0 of each entry is collected, in sentence order.
 * NOTE(review): this presumably relies on disambiguation placing the preferred parse first
 * and on every entry holding at least one parse - confirm against {@code disambiguate}.
 *
 * @param sentence the sentence to parse
 * @return one parse per token, in the order the tokens appear
 */
public List<WordAnalysis> bestParse(String sentence) {
    SentenceAnalysis analysis = analyze(sentence);
    disambiguate(analysis);
    List<WordAnalysis> selected = Lists.newArrayListWithCapacity(analysis.size());
    for (SentenceAnalysis.Entry tokenEntry : analysis) {
        WordAnalysis first = tokenEntry.parses.get(0);
        selected.add(first);
    }
    return selected;
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis)

Example 63 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class UnidentifiedTokenAnalyzer method parseNumeral.

/**
 * Produces numeral analyses for a token written with digits, optionally followed by a suffix.
 *
 * <p>The token is split into a stem and an ending by {@code getFromNumeral}. A lemma is looked
 * up for the stem through {@code numeralEndingMachine}; when the stem ends with a dot, the dot
 * is stripped before the lookup and the result is mapped through {@code ordinalMap}. For every
 * {@code Digit} pattern found in the stem, the lemma plus the ending is analyzed, and each
 * resulting analysis whose primary POS is {@code Numeral} is rewritten to use the original
 * stem as its root and a runtime-generated dictionary item.
 *
 * @param s the token to analyze
 * @return the rewritten numeral analyses; empty when nothing matched
 */
public List<WordAnalysis> parseNumeral(String s) {
    StemAndEnding parts = getFromNumeral(s);
    String lemma;
    if (!parts.stem.endsWith(".")) {
        lemma = numeralEndingMachine.find(parts.stem);
    } else {
        // A trailing dot is dropped before the lookup; the found lemma is then mapped via ordinalMap.
        String withoutDot = parts.stem.substring(0, parts.stem.length() - 1);
        lemma = ordinalMap.get(numeralEndingMachine.find(withoutDot));
        // NOTE(review): lemma may be null here if ordinalMap has no entry - confirm with callers.
    }
    List<WordAnalysis> numeralAnalyses = Lists.newArrayListWithCapacity(1);
    for (TurkishDictionaryLoader.Digit digitType : TurkishDictionaryLoader.Digit.values()) {
        Matcher stemMatcher = digitType.pattern.matcher(parts.stem);
        if (!stemMatcher.find()) {
            continue;
        }
        // "dört" is replaced by "dörd" when the ending starts with a vowel.
        boolean useVoicedForm = parts.ending.length() > 0
                && lemma.equals("dört")
                && TurkishAlphabet.INSTANCE.isVowel(parts.ending.charAt(0));
        String surface = useVoicedForm ? "dörd" + parts.ending : lemma + parts.ending;
        for (WordAnalysis candidate : turkishParser.getWordAnalyzer().analyze(surface)) {
            if (candidate.dictionaryItem.primaryPos == PrimaryPos.Numeral) {
                // Swap in a runtime dictionary item keyed on the original digit stem.
                candidate.dictionaryItem = new DictionaryItem(
                        parts.stem, parts.stem, s + lemma, PrimaryPos.Numeral, digitType.secondaryPos);
                candidate.dictionaryItem.attributes.add(RootAttribute.Runtime);
                candidate.root = parts.stem;
                numeralAnalyses.add(candidate);
            }
        }
    }
    return numeralAnalyses;
}
Also used : StemAndEnding(zemberek.morphology.structure.StemAndEnding) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Matcher(java.util.regex.Matcher)

Example 64 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class AddNewDictionaryItem method printResults.

/**
 * Prints each analysis on its own line to standard output, numbered from 1.
 *
 * <p>Analyses whose dictionary item carries the {@code RootAttribute.Runtime} attribute get a
 * trailing marker appended to their line.
 *
 * @param results the analyses to print
 */
private void printResults(List<WordAnalysis> results) {
    int lineNumber = 0;
    for (WordAnalysis analysis : results) {
        lineNumber++;
        StringBuilder line = new StringBuilder()
                .append(lineNumber)
                .append(" - ")
                .append(analysis.formatLong());
        if (analysis.dictionaryItem.attributes.contains(RootAttribute.Runtime)) {
            line.append(" (Generated by UnidentifiedTokenParser)");
        }
        System.out.println(line);
    }
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis)

Example 65 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class UnidentifiedTokenAnalyzerTest method shouldParseSmallCaseProperNounsWithSingleQuote.

/**
 * Checks how {@code UnidentifiedTokenAnalyzer} analyzes proper nouns written with an
 * apostrophe, in both capitalized and lower-case spellings, against expected
 * {@code formatLong()} output.
 */
@Test
public void shouldParseSmallCaseProperNounsWithSingleQuote() throws IOException {
    TurkishMorphology morphology = TurkishMorphology.builder()
            .addTextDictionaryResources("dev-lexicon.txt")
            .build();
    UnidentifiedTokenAnalyzer analyzer = new UnidentifiedTokenAnalyzer(morphology);
    List<WordAnalysis> analyses;

    // Both spellings must yield the same two analyses.
    HashSet<String> istanbulForms = Sets.newHashSet(
            "[(İstanbul:istanbul) (Noun,Prop;A3sg+P2sg:un+Nom)]",
            "[(İstanbul:istanbul) (Noun,Prop;A3sg+Pnon+Gen:un)]");
    for (String input : new String[] {"İstanbul'un", "istanbul'un"}) {
        analyses = analyzer.analyze(input);
        Assert.assertEquals(2, analyses.size());
        for (WordAnalysis analysis : analyses) {
            Assert.assertTrue(istanbulForms.contains(analysis.formatLong()));
        }
    }

    // Both spellings must yield the same single dative analysis.
    String ankaraDative = "[(Ankara:ankara) (Noun,Prop;A3sg+Pnon+Dat:ya)]";
    for (String input : new String[] {"Ankara'ya", "ankara'ya"}) {
        analyses = analyzer.analyze(input);
        Assert.assertEquals(1, analyses.size());
        Assert.assertEquals(ankaraDative, analyses.get(0).formatLong());
    }

    // Karaman does not exist in dictionary
    analyses = analyzer.analyze("Karaman");
    Assert.assertEquals(1, analyses.size());
    Assert.assertEquals("[(Karaman:karaman) (Noun,Prop;A3sg+Pnon+Nom)]", analyses.get(0).formatLong());

    analyses = analyzer.analyze("karaman'a");
    Assert.assertEquals(1, analyses.size());
    Assert.assertEquals("[(Karaman:karaman) (Noun,Prop;A3sg+Pnon+Dat:a)]", analyses.get(0).formatLong());

    // Lower-case spelling without an apostrophe is expected to produce no analysis.
    analyses = analyzer.analyze("karaman");
    Assert.assertEquals(0, analyses.size());
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) UnidentifiedTokenAnalyzer(zemberek.morphology.analysis.tr.UnidentifiedTokenAnalyzer) Test(org.junit.Test)

Aggregations

WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 96, Test (org.junit.Test): 42, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 36, TurkishMorphology (zemberek.morphology.TurkishMorphology): 22, ArrayList (java.util.ArrayList): 21, SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 19, LinkedHashSet (java.util.LinkedHashSet): 13, Ignore (org.junit.Ignore): 13, Histogram (zemberek.core.collections.Histogram): 12, Path (java.nio.file.Path): 11, PrintWriter (java.io.PrintWriter): 10, SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 10, IOException (java.io.IOException): 6, HashSet (java.util.HashSet): 6, List (java.util.List): 6, WordAnalyzer (zemberek.morphology.analysis.WordAnalyzer): 6, SimpleGenerator (zemberek.morphology.generator.SimpleGenerator): 6, DictionaryItem (zemberek.morphology.lexicon.DictionaryItem): 6, DynamicLexiconGraph (zemberek.morphology.lexicon.graph.DynamicLexiconGraph): 6, Log (zemberek.core.logging.Log): 5