Search in sources :

Example 56 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

The class AmbiguityStats, method noParse.

/**
 * Collects the tokens of the given files for which the word analyzer returns no
 * analysis, then prints the frequently occurring ones and dumps summary statistics.
 *
 * <p>Each line is split on single spaces (empty pieces dropped, pieces trimmed). Every
 * token is normalized with {@code TurkishAlphabet.INSTANCE} before being analyzed; the
 * original (non-normalized) token is what gets counted when no analysis is found.
 *
 * @param filename one or more files to read via {@code readAll}
 * @throws IOException declared for the file reading done by {@code readAll}
 */
public void noParse(String... filename) throws IOException {
    // The splitter configuration does not depend on the file, so build it once.
    Splitter tokenizer = Splitter.on(" ").omitEmptyStrings().trimResults();
    Histogram<String> unparsed = new Histogram<>(1000000);
    int processed = 0;

    for (String path : filename) {
        for (String line : readAll(path)) {
            for (String token : tokenizer.split(line)) {
                String normalized = TurkishAlphabet.INSTANCE.normalize(token);
                List<WordAnalysis> analyses = parser.getWordAnalyzer().analyze(normalized);
                processed++;
                // Progress report every 50000 tokens.
                if (processed % 50000 == 0) {
                    System.out.println("Processed: " + processed);
                }
                if (analyses.isEmpty()) {
                    unparsed.add(token);
                }
            }
        }
        // Running total across all files handled so far.
        System.out.println("Total: " + processed);
    }

    Stats st = new Stats(0.0002);
    st.allCounts = (int) unparsed.totalCount();
    st.allUniques = unparsed.size();
    for (String token : unparsed.getSortedList()) {
        int occurrences = unparsed.getCount(token);
        // Only tokens seen more than five times count as significant.
        if (occurrences <= 5) {
            continue;
        }
        st.significantCounts += occurrences;
        st.significantUniques++;
        System.out.println(token + " : " + occurrences);
    }
    st.dump();
}
Also used : Histogram(zemberek.core.collections.Histogram) Splitter(com.google.common.base.Splitter) WordAnalysis(zemberek.morphology.analysis.WordAnalysis)

Example 57 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

The class Z3MarkovModelDisambiguator, method disambiguate.

/**
 * Reorders the parses of each word in place so that the parse selected by
 * {@code bestSequence} ends up at index 0.
 *
 * <p>For every position, the selected parse is swapped with the parse currently at
 * index 0. Words that have exactly one parse are left untouched.
 *
 * @param sentenceParse sentence analysis whose per-word parse lists are modified
 */
@Override
public void disambiguate(SentenceAnalysis sentenceParse) {
    int[] chosen = bestSequence(getAmbiguousSequence(sentenceParse));
    for (int wordIndex = 0; wordIndex < chosen.length; wordIndex++) {
        List<WordAnalysis> candidates = sentenceParse.getParses(wordIndex);
        if (candidates.size() != 1) {
            // Swap the selected parse with whatever currently sits in front.
            WordAnalysis formerFirst = candidates.get(0);
            int selected = chosen[wordIndex];
            WordAnalysis best = candidates.get(selected);
            candidates.set(0, best);
            candidates.set(selected, formerFirst);
        }
    }
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis)

Example 58 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

The class Z3MarkovModelDisambiguator, method getAmbiguousSequence.

/**
 * Builds the array of {@code Ambiguous} entries for a sentence.
 *
 * <p>The result holds two leading {@code startWord} entries, one {@code Ambiguous} per
 * sentence entry, and one trailing {@code endWord} entry. For every parse of an entry
 * two vocabulary indexes are stored: the index of a root key in the {@code rootLm}
 * vocabulary and the index of an inflectional-group key in the {@code igLm} vocabulary.
 *
 * @param sentence analyzed sentence whose entries supply the candidate parses
 * @return array of length {@code sentence.size() + 3}
 */
public Ambiguous[] getAmbiguousSequence(SentenceAnalysis sentence) {
    // Two start markers + one slot per word + one end marker.
    Ambiguous[] awords = new Ambiguous[sentence.size() + 3];
    awords[0] = startWord;
    awords[1] = startWord;
    int i = 2;
    for (SentenceAnalysis.Entry entry : sentence) {
        int[] roots = new int[entry.parses.size()];
        int[] lastIgs = new int[entry.parses.size()];
        int j = 0;
        for (WordAnalysis parse : entry.parses) {
            // Root key: the lemma, optionally extended with first-group information.
            String rootPart = parse.dictionaryItem.lemma;
            WordAnalysis.InflectionalGroup firstIg = parse.inflectionalGroups.get(0);
            if (firstIg.suffixList.size() == 0) {
                rootPart += firstIg.formatNoSurface();
            } else {
                String s = firstIg.formatNoSurface();
                String suffixPart = Strings.subStringAfterFirst(s, ";");
                // Only this exact suffix string causes the part before ';' to be
                // appended (re-closed with ')'); otherwise the key stays the bare lemma.
                if (suffixPart.equals("A3sg+Pnon+Nom)")) {
                    rootPart += (Strings.subStringUntilFirst(s, ";") + ")");
                }
            }
            roots[j] = rootLm.getVocabulary().indexOf(rootPart);

            // Inflectional-group key: the last group, prefixed by the one before it
            // when that group has no suffixes.
            String igPart;
            int igSize = parse.inflectionalGroups.size();
            if (igSize > 1 && parse.inflectionalGroups.get(igSize - 2).suffixList.size() == 0) {
                // Both halves use formatNoSurface() so the key has the same format as
                // in the single-group branch; concatenating the group object itself
                // would splice in its toString() form instead.
                igPart = parse.inflectionalGroups.get(igSize - 2).formatNoSurface()
                        + parse.getLastIg().formatNoSurface();
            } else {
                igPart = parse.getLastIg().formatNoSurface();
            }
            lastIgs[j] = igLm.getVocabulary().indexOf(igPart);
            j++;
        }
        awords[i] = new Ambiguous(roots, lastIgs);
        i++;
    }
    awords[i] = endWord;
    return awords;
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis)

Example 59 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

The class Z3ModelA, method disambiguate.

/**
 * Moves, for each word of the sentence, the parse picked by {@code bestSequence} to
 * the front of that word's parse list.
 *
 * <p>The picked parse and the parse at index 0 exchange places; the lists are modified
 * in place. A word with exactly one parse is skipped.
 *
 * @param sentenceParse sentence analysis whose per-word parse lists are reordered
 */
@Override
public void disambiguate(SentenceAnalysis sentenceParse) {
    Ambiguous[] sequence = getAmbiguousSequence(sentenceParse);
    int[] winners = bestSequence(sequence);
    int position = 0;
    while (position < winners.length) {
        List<WordAnalysis> parses = sentenceParse.getParses(position);
        if (parses.size() != 1) {
            // Exchange the front parse with the winning one.
            WordAnalysis front = parses.get(0);
            int winner = winners[position];
            parses.set(0, parses.get(winner));
            parses.set(winner, front);
        }
        position++;
    }
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis)

Example 60 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

The class Z3ModelA, method getAmbiguousSequence.

/**
 * Builds the array of {@code Ambiguous} entries for a sentence.
 *
 * <p>The result holds two leading {@code startWord} entries, one {@code Ambiguous} per
 * sentence entry, and one trailing {@code endWord} entry. For every parse of an entry
 * the index of its lemma in the {@code rootLm} vocabulary is stored, together with the
 * {@code igLm} vocabulary index of each of its inflectional groups.
 *
 * @param sentence analyzed sentence whose entries supply the candidate parses
 * @return array of length {@code sentence.size() + 3}
 */
public Ambiguous[] getAmbiguousSequence(SentenceAnalysis sentence) {
    // Two start markers + one slot per word + one end marker.
    Ambiguous[] awords = new Ambiguous[sentence.size() + 3];
    awords[0] = startWord;
    awords[1] = startWord;
    int i = 2;
    for (SentenceAnalysis.Entry entry : sentence) {
        int[] roots = new int[entry.parses.size()];
        int[][] igs = new int[entry.parses.size()][];
        int j = 0;
        for (WordAnalysis parse : entry.parses) {
            String rootPart = parse.dictionaryItem.lemma;
            roots[j] = rootLm.getVocabulary().indexOf(rootPart);
            int igCount = parse.inflectionalGroups.size();
            igs[j] = new int[igCount];
            // The bound must be checked against k (the group index), not j (the parse
            // index); testing j either skips the loop or overruns igs[j].
            for (int k = 0; k < igCount; k++) {
                igs[j][k] = igLm.getVocabulary().indexOf(parse.inflectionalGroups.get(k).formatNoSurface());
            }
            j++;
        }
        awords[i] = new Ambiguous(roots, igs);
        i++;
    }
    awords[i] = endWord;
    return awords;
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis)

Aggregations

WordAnalysis (zemberek.morphology.analysis.WordAnalysis) 96, Test (org.junit.Test) 42, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis) 36, TurkishMorphology (zemberek.morphology.TurkishMorphology) 22, ArrayList (java.util.ArrayList) 21, SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis) 19, LinkedHashSet (java.util.LinkedHashSet) 13, Ignore (org.junit.Ignore) 13, Histogram (zemberek.core.collections.Histogram) 12, Path (java.nio.file.Path) 11, PrintWriter (java.io.PrintWriter) 10, SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis) 10, IOException (java.io.IOException) 6, HashSet (java.util.HashSet) 6, List (java.util.List) 6, WordAnalyzer (zemberek.morphology.analysis.WordAnalyzer) 6, SimpleGenerator (zemberek.morphology.generator.SimpleGenerator) 6, DictionaryItem (zemberek.morphology.lexicon.DictionaryItem) 6, DynamicLexiconGraph (zemberek.morphology.lexicon.graph.DynamicLexiconGraph) 6, Log (zemberek.core.logging.Log) 5