Search in sources:

Example 11 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class TurkishSentenceAnalyzerTest method doParseSentencesInCorpus.

/**
 * Runs the morphological parser over every string read from the given corpus file,
 * collects the surface forms of analyses whose dictionary item is
 * {@code DictionaryItem.UNKNOWN}, writes them with their counts to "unknown.txt", and
 * prints progress and throughput figures to standard output.
 *
 * @param ntvmsnbcCorpus corpus file, read through {@code SimpleTextReader.trimmingUTF8Reader};
 *                       each returned string is analyzed as one sentence
 * @throws IOException declared for the corpus read and the report write
 */
private void doParseSentencesInCorpus(File ntvmsnbcCorpus) throws IOException {
    List<String> lines = SimpleTextReader.trimmingUTF8Reader(ntvmsnbcCorpus).asStringList();
    Stopwatch timer = Stopwatch.createStarted();
    Histogram<String> unknownSurfaceForms = new Histogram<>();
    long totalWords = 0;
    int processed = 0;
    for (String line : lines) {
        SentenceAnalysis analysis = parser.analyze(line);
        for (SentenceAnalysis.Entry wordEntry : analysis) {
            for (WordAnalysis candidate : wordEntry.parses) {
                // Only analyses that fell back to the UNKNOWN sentinel are recorded.
                if (candidate.dictionaryItem != DictionaryItem.UNKNOWN) {
                    continue;
                }
                unknownSurfaceForms.add(candidate.getSurfaceForm());
            }
        }
        totalWords += analysis.size();
        // Disambiguation is intentionally not run here (the call was disabled in the
        // original code: parser.disambiguate(parse)).
        processed++;
        // Progress report every 10000 sentences: sentence count, then elapsed seconds.
        if (processed % 10000 == 0) {
            System.out.println(processed);
            System.out.println(timer.elapsed(TimeUnit.MILLISECONDS) / 1000d);
        }
    }
    // One line per unknown surface form: "<form> <count>", in the histogram's sorted order.
    try (PrintWriter report = new PrintWriter("unknown.txt", "utf-8")) {
        for (String form : unknownSurfaceForms.getSortedList()) {
            report.println(form + " " + unknownSurfaceForms.getCount(form));
        }
    }
    System.out.println("Word count = " + totalWords);
    System.out.println("Elapsed Time =" + timer.elapsed(TimeUnit.MILLISECONDS));
    double wordsPerSecond = (totalWords * 1000d) / (timer.elapsed(TimeUnit.MILLISECONDS));
    System.out.println("Parse and disambiguate per second = " + wordsPerSecond);
}
Also used : Histogram(zemberek.core.collections.Histogram) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Stopwatch(com.google.common.base.Stopwatch) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) PrintWriter(java.io.PrintWriter)

Example 12 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class UnidentifiedTokenAnalyzerTest method shouldCreateUnidentifiedTokenParserSuccessfully.

/**
 * Builds an {@code UnidentifiedTokenAnalyzer} on top of a default {@code TurkishMorphology}
 * and prints every analysis returned for "Ankara'ya". The method makes no assertions; it
 * only fails if construction or analysis throws.
 */
@Test
public void shouldCreateUnidentifiedTokenParserSuccessfully() throws IOException {
    UnidentifiedTokenAnalyzer analyzer =
            new UnidentifiedTokenAnalyzer(TurkishMorphology.createWithDefaults());
    List<WordAnalysis> analyses = analyzer.analyze("Ankara'ya");
    analyses.forEach(analysis -> System.out.println(analysis));
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) UnidentifiedTokenAnalyzer(zemberek.morphology.analysis.tr.UnidentifiedTokenAnalyzer) Test(org.junit.Test)

Example 13 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method testWordAnalysis.

/**
 * Scratch method (ignored by JUnit) that analyzes the word "phpye" with a default
 * {@code TurkishMorphology} and logs the long format, stems and lemmas of each result.
 */
@Test
@Ignore("Not a Test.")
public void testWordAnalysis() throws IOException {
    List<WordAnalysis> analyses = TurkishMorphology.createWithDefaults().analyze("phpye");
    analyses.forEach(analysis -> {
        Log.info(analysis.formatLong());
        Log.info("\tStems = " + analysis.getStems());
        Log.info("\tLemmas = " + analysis.getLemmas());
    });
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 14 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class TurkishMorphology method analyzeWordsWithApostrophe.

/**
 * Analyzes a word that contains an apostrophe by analyzing it with all apostrophes
 * removed and keeping only the analyses whose stems include the (normalized) text found
 * before the first apostrophe.
 *
 * @param word the surface form to analyze
 * @return the matching analyses; an empty list when the word has no apostrophe, when the
 *         first apostrophe is the first or last character, or when the apostrophe-free
 *         form yields no analysis
 */
private List<WordAnalysis> analyzeWordsWithApostrophe(String word) {
    int apostropheAt = word.indexOf('\'');
    // Covers both "no apostrophe" (-1) and "apostrophe at either end of the word".
    if (apostropheAt <= 0 || apostropheAt == word.length() - 1) {
        return Collections.emptyList();
    }
    String beforeApostrophe = word.substring(0, apostropheAt);
    String afterApostrophe = word.substring(apostropheAt + 1);
    StemAndEnding parts = new StemAndEnding(beforeApostrophe, afterApostrophe);
    String expectedStem = TurkishAlphabet.INSTANCE.normalize(parts.stem);
    List<WordAnalysis> candidates = wordAnalyzer.analyze(word.replaceAll("'", ""));
    if (candidates.isEmpty()) {
        return Collections.emptyList();
    }
    return candidates.stream()
            .filter(candidate -> candidate.getStems().contains(expectedStem))
            .collect(Collectors.toList());
}
Also used : StemAndEnding(zemberek.morphology.structure.StemAndEnding) Arrays(java.util.Arrays) LoadingCache(com.google.common.cache.LoadingCache) Serializer(zemberek.morphology.lexicon.Serializer) Stopwatch(com.google.common.base.Stopwatch) SuffixProvider(zemberek.morphology.lexicon.SuffixProvider) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) ArrayList(java.util.ArrayList) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) Lists(com.google.common.collect.Lists) SimpleGenerator(zemberek.morphology.generator.SimpleGenerator) TurkishSuffixes(zemberek.morphology.lexicon.tr.TurkishSuffixes) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) Charsets(com.google.common.base.Charsets) DynamicLexiconGraph(zemberek.morphology.lexicon.graph.DynamicLexiconGraph) WordAnalyzer(zemberek.morphology.analysis.WordAnalyzer) Resources(com.google.common.io.Resources) Files(java.nio.file.Files) Collection(java.util.Collection) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) File(java.io.File) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) CacheLoader(com.google.common.cache.CacheLoader) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) Preconditions(com.google.common.base.Preconditions) CacheBuilder(com.google.common.cache.CacheBuilder) RootLexicon(zemberek.morphology.lexicon.RootLexicon) Collections(java.util.Collections) CacheStats(com.google.common.cache.CacheStats) InputStream(java.io.InputStream)

Example 15 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class UnidentifiedTokenAnalyzer method analyze.

/**
 * Tries to analyze a token by treating it either as a numeral or as a proper noun that is
 * registered in the lexicon graph only for the duration of the call.
 *
 * <ul>
 *   <li>A word containing "?" yields an empty list.</li>
 *   <li>A word containing any decimal digit is delegated to {@code parseNumeral}.</li>
 *   <li>A word containing an apostrophe (not as its first or last character) is split at
 *       the first apostrophe; the normalized part before it is registered as a runtime
 *       proper noun and the normalized stem + ending is analyzed.</li>
 *   <li>Otherwise, a word starting with an upper-case character is normalized, registered
 *       as a runtime proper noun and analyzed as a whole.</li>
 *   <li>Anything else, including the empty string, yields an empty list.</li>
 * </ul>
 *
 * The method is {@code synchronized} because it temporarily mutates the shared
 * {@code graph}.
 *
 * @param word the token to analyze; must not be {@code null}
 * @return the analyses found, or an empty list when none of the cases above applies
 */
public synchronized List<WordAnalysis> analyze(String word) {
    if (word.contains("?")) {
        return Collections.emptyList();
    }
    if (!Strings.containsNone(word, "0123456789")) {
        return parseNumeral(word);
    }
    int index = word.indexOf('\'');
    if (index >= 0) {
        if (index == 0 || index == word.length() - 1) {
            return Collections.emptyList();
        }
        StemAndEnding se = new StemAndEnding(word.substring(0, index), word.substring(index + 1));
        String stem = TurkishAlphabet.INSTANCE.normalize(se.stem);
        String ending = TurkishAlphabet.INSTANCE.normalize(se.ending);
        return analyzeAsRuntimeProperNoun(stem, stem + ending);
    }
    // The emptiness check prevents charAt(0) from throwing on an empty token.
    if (!word.isEmpty() && Character.isUpperCase(word.charAt(0))) {
        String normalized = TurkishAlphabet.INSTANCE.normalize(word);
        return analyzeAsRuntimeProperNoun(normalized, normalized);
    }
    return Collections.emptyList();
}

/**
 * Registers {@code root} as a temporary proper-noun dictionary item, analyzes
 * {@code toParse} with it in place, and always removes the item again.
 * Only called from the synchronized {@code analyze} method.
 *
 * @param root    normalized root to register as a runtime proper noun
 * @param toParse normalized surface form handed to the parser
 * @return the analyses produced by the parser
 */
private List<WordAnalysis> analyzeAsRuntimeProperNoun(String root, String toParse) {
    String pronunciation = guessPronunciation(root);
    DictionaryItem itemProp = new DictionaryItem(Turkish.capitalize(root), root, pronunciation, PrimaryPos.Noun, SecondaryPos.ProperNoun);
    itemProp.attributes.add(RootAttribute.Runtime);
    graph.addDictionaryItem(itemProp);
    try {
        return parser.analyze(toParse);
    } finally {
        // Remove the temporary item even if the parser throws, so the graph is not
        // left polluted with runtime entries.
        graph.removeDictionaryItem(itemProp);
    }
}
Also used : StemAndEnding(zemberek.morphology.structure.StemAndEnding) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) List(java.util.List)

Aggregations

- WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 96
- Test (org.junit.Test): 42
- SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 36
- TurkishMorphology (zemberek.morphology.TurkishMorphology): 22
- ArrayList (java.util.ArrayList): 21
- SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 19
- LinkedHashSet (java.util.LinkedHashSet): 13
- Ignore (org.junit.Ignore): 13
- Histogram (zemberek.core.collections.Histogram): 12
- Path (java.nio.file.Path): 11
- PrintWriter (java.io.PrintWriter): 10
- SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 10
- IOException (java.io.IOException): 6
- HashSet (java.util.HashSet): 6
- List (java.util.List): 6
- WordAnalyzer (zemberek.morphology.analysis.WordAnalyzer): 6
- SimpleGenerator (zemberek.morphology.generator.SimpleGenerator): 6
- DictionaryItem (zemberek.morphology.lexicon.DictionaryItem): 6
- DynamicLexiconGraph (zemberek.morphology.lexicon.graph.DynamicLexiconGraph): 6
- Log (zemberek.core.logging.Log): 5