Search in sources :

Example 71 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class AmbiguityStats method ambiguousGroupStats.

/**
 * Prints statistics about groups of ambiguous words found in a file.
 *
 * <p>Every token of every line is analyzed with {@code parser}. Tokens that
 * receive more than one analysis are grouped under the key produced by
 * {@code generateKeyFromParse}; for each key both the number of occurrences
 * and a per-word histogram of the tokens that produced it are kept. Keys that
 * pass {@code Stats.overCutoff} are printed together with the member words
 * for which {@code pct(memberCount, keyCount)} exceeds 0.1.
 *
 * @param filename path of the input file, handed to {@code readAll}
 * @throws IOException declared for the file read performed by {@code readAll}
 */
public void ambiguousGroupStats(String filename) throws IOException {
    Histogram<String> keyCounts = new Histogram<>(1000000);
    Map<String, Histogram<String>> wordsByKey = Maps.newHashMap();
    int processed = 0;

    for (String line : readAll(filename)) {
        for (String word : splitter.split(line)) {
            WordAnalysis analysis = parser.analyze(word);
            processed++;
            // Progress report every 50000 tokens.
            if (processed % 50000 == 0) {
                System.out.println("Processed: " + processed);
            }
            // Only words with more than one analysis are of interest.
            if (analysis.analysisCount() <= 1) {
                continue;
            }
            String key = generateKeyFromParse(analysis);
            keyCounts.add(key);
            wordsByKey.computeIfAbsent(key, k -> new Histogram<>()).add(word);
        }
    }
    System.out.println("Total: " + processed);

    // NOTE(review): 0.1 is presumably the cutoff used by overCutoff — confirm in Stats.
    Stats stats = new Stats(0.1);
    stats.allCounts = (int) keyCounts.totalCount();
    stats.allUniques = keyCounts.size();

    for (String key : keyCounts.getSortedList()) {
        int keyCount = keyCounts.getCount(key);
        if (!stats.overCutoff(keyCount)) {
            continue;
        }
        String share = percentStr(keyCount, stats.allCounts);
        stats.significantCounts += keyCount;
        stats.significantUniques++;
        System.out.println(key + " : " + keyCount + "    " + pp(share));

        // Every key in keyCounts was also registered in wordsByKey above.
        Histogram<String> words = wordsByKey.get(key);
        for (String word : words.getSortedList()) {
            int wordCount = words.getCount(word);
            if (pct(wordCount, keyCount) > 0.1) {
                System.out.println(word + " : " + wordCount);
            }
        }
        System.out.println();
    }
    stats.dump();
}
Also used : Histogram(zemberek.core.collections.Histogram) WordAnalysis(zemberek.morphology.analysis.WordAnalysis)

Example 72 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class AmbiguityStats method noParse.

/**
 * Prints the tokens of the given files for which {@code parser} produces no
 * analysis.
 *
 * <p>Each line is split on single spaces (empty pieces dropped, pieces
 * trimmed). Tokens whose analysis count is zero are collected in a histogram.
 * After all files are processed, tokens seen more than five times are printed
 * with their counts and the summary is emitted through {@code Stats.dump()}.
 *
 * @param filename one or more input file paths, each handed to {@code readAll}
 * @throws IOException declared for the file reads performed by {@code readAll}
 */
public void noParse(String... filename) throws IOException {
    // The splitter configuration does not depend on the file, so build it once
    // rather than once per file. The distinct local name also avoids reusing the
    // name `splitter`, which another method of this class resolves from outside
    // its own body.
    Splitter tokenSplitter = Splitter.on(" ").omitEmptyStrings().trimResults();
    Histogram<String> uniques = new Histogram<>(1000000);
    int total = 0;
    for (String file : filename) {
        List<String> lines = readAll(file);
        for (String line : lines) {
            for (String s : tokenSplitter.split(line)) {
                WordAnalysis results = parser.analyze(s);
                total++;
                // Progress report every 50000 tokens.
                if (total % 50000 == 0) {
                    System.out.println("Processed: " + total);
                }
                if (results.analysisCount() == 0) {
                    uniques.add(s);
                }
            }
        }
        // Running total across all files processed so far.
        System.out.println("Total: " + total);
    }
    // NOTE(review): the value passed to Stats is not used for filtering here;
    // the fixed threshold `count > 5` below decides what is reported — confirm intent.
    Stats st = new Stats(0.0002);
    st.allCounts = (int) uniques.totalCount();
    st.allUniques = uniques.size();
    for (String s : uniques.getSortedList()) {
        int count = uniques.getCount(s);
        if (count > 5) {
            st.significantCounts += count;
            st.significantUniques++;
            System.out.println(s + " : " + count);
        }
    }
    st.dump();
}
Also used : Histogram(zemberek.core.collections.Histogram) Splitter(com.google.common.base.Splitter) WordAnalysis(zemberek.morphology.analysis.WordAnalysis)

Example 73 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class DisambiguateSentences method main.

/**
 * Demo entry point: analyzes a fixed Turkish sentence, logs every analysis of
 * every word, then logs the result of sentence-level disambiguation.
 *
 * @param args command line arguments; not used
 */
public static void main(String[] args) {
    String sentence = "Bol baharatlı bir yemek yaptıralım.";
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    Log.info("Sentence  = " + sentence);

    // All candidate analyses, word by word.
    List<WordAnalysis> wordAnalyses = morphology.analyzeSentence(sentence);
    Log.info("Sentence word analysis result:");
    for (WordAnalysis wordAnalysis : wordAnalyses) {
        Log.info("Word = " + wordAnalysis.getInput());
        wordAnalysis.forEach(single -> Log.info(single.formatLong()));
    }

    // Disambiguation result for the sentence, given the candidates above.
    SentenceAnalysis disambiguated = morphology.disambiguate(sentence, wordAnalyses);
    Log.info("\nAfter ambiguity resolution : ");
    disambiguated.bestAnalysis().forEach(Log::info);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Log(zemberek.core.logging.Log) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Example 74 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class HunspellOperations method generateAnnotationFileMultiSplit.

/**
 * Writes one annotation line per analyzable vocabulary word, listing the
 * stem/ending splits of that word.
 *
 * <p>For every analysis of a word (analyses whose dictionary item is a proper
 * noun or an abbreviation are skipped) the surface sequence is split on
 * spaces. Each prefix of the resulting tokens is concatenated into a candidate
 * stem; the remaining tokens, joined by spaces, form the ending. A
 * {@code "word stem ending"} entry is recorded when the ending is non-empty
 * and the candidate stem passes {@code isCorrectAndContainsNoProper}. The
 * entries of one word are joined with commas; a word with no entries still
 * contributes an empty line. Words whose analysis is not correct are logged
 * and produce no line.
 *
 * @param vocab           UTF-8 file with one word per line
 * @param annotationsPath destination file, written as UTF-8
 * @throws IOException if reading the vocabulary or writing the result fails
 */
private static void generateAnnotationFileMultiSplit(Path vocab, Path annotationsPath) throws IOException {
    List<String> words = Files.readAllLines(vocab, StandardCharsets.UTF_8);
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    // The splitter is the same for every analysis; build it once.
    Splitter spaceSplitter = Splitter.on(" ");
    List<String> annotations = new ArrayList<>();
    for (String word : words) {
        WordAnalysis analysis = morphology.analyze(word);
        if (!analysis.isCorrect()) {
            Log.warn("Cannot analyze %s", word);
            continue;
        }
        // Insertion-ordered set: duplicates coming from different analyses collapse.
        LinkedHashSet<String> stemEndings = new LinkedHashSet<>();
        for (SingleAnalysis s : analysis) {
            SecondaryPos secondaryPos = s.getDictionaryItem().secondaryPos;
            if (secondaryPos == SecondaryPos.ProperNoun || secondaryPos == SecondaryPos.Abbreviation) {
                continue;
            }
            String surfaces = AnalysisFormatters.SURFACE_SEQUENCE.format(s);
            List<String> tokens = spaceSplitter.splitToList(surfaces);
            String stem = tokens.get(0);
            // The split after the last token always has an empty ending and can
            // never produce an entry, so the loop stops one token earlier and the
            // stem analysis for that split is not performed.
            for (int i = 0; i < tokens.size() - 1; i++) {
                if (i > 0) {
                    stem += tokens.get(i);
                }
                String ending = String.join(" ", tokens.subList(i + 1, tokens.size()));
                // Check the cheap condition first so the stem is analyzed only
                // when an entry could actually be added.
                if (!ending.isEmpty() && isCorrectAndContainsNoProper(morphology.analyze(stem))) {
                    stemEndings.add(word + " " + stem + " " + ending);
                }
            }
        }
        annotations.add(String.join(",", stemEndings));
    }
    Files.write(annotationsPath, annotations, StandardCharsets.UTF_8);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Example 75 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class StemDisambiguationExperiment method unambiguous.

/**
 * Tells whether every token of the sentence maps to at most one distinct
 * normalized lemma across all of its analyses.
 *
 * @param sentence the sentence to tokenize and analyze
 * @return {@code false} as soon as a token with more than one distinct lemma
 *     is found, {@code true} otherwise
 */
private boolean unambiguous(String sentence) {
    for (String token : TurkishTokenizer.DEFAULT.tokenizeToStrings(sentence)) {
        Set<String> lemmas = collectLemmas(morphology.analyze(token));
        if (lemmas.size() > 1) {
            return false;
        }
    }
    return true;
}

/** Gathers the distinct normalized lemmas of all analyses of one word. */
private Set<String> collectLemmas(WordAnalysis analyses) {
    Set<String> lemmas = new HashSet<>();
    for (SingleAnalysis single : analyses) {
        lemmas.add(single.getDictionaryItem().normalizedLemma());
    }
    return lemmas;
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Aggregations

WordAnalysis (zemberek.morphology.analysis.WordAnalysis)96 Test (org.junit.Test)42 SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis)36 TurkishMorphology (zemberek.morphology.TurkishMorphology)22 ArrayList (java.util.ArrayList)21 SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis)19 LinkedHashSet (java.util.LinkedHashSet)13 Ignore (org.junit.Ignore)13 Histogram (zemberek.core.collections.Histogram)12 Path (java.nio.file.Path)11 PrintWriter (java.io.PrintWriter)10 SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis)10 IOException (java.io.IOException)6 HashSet (java.util.HashSet)6 List (java.util.List)6 WordAnalyzer (zemberek.morphology.analysis.WordAnalyzer)6 SimpleGenerator (zemberek.morphology.generator.SimpleGenerator)6 DictionaryItem (zemberek.morphology.lexicon.DictionaryItem)6 DynamicLexiconGraph (zemberek.morphology.lexicon.graph.DynamicLexiconGraph)6 Log (zemberek.core.logging.Log)5