Search in sources :

Example 16 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

the class QuestionClassifier method replaceWordsWithLemma.

/**
 * Rewrites a labeled sentence so that every analyzable word is replaced by a lemma.
 *
 * <p>The input is split on single spaces and the first token is taken as the label. The
 * remaining tokens are re-joined and passed to {@code morphology.analyzeAndDisambiguate}.
 * For each word, the last entry of the best analysis' lemma list is emitted; words whose
 * best analysis is unknown are emitted unchanged.
 *
 * @param sentence a line of the form {@code "<label> <word> <word> ..."}
 * @return the label followed by the per-word replacements, joined with single spaces;
 *     an empty string (without the label) when nothing follows the label
 */
private String replaceWordsWithLemma(String sentence) {
    List<String> parts = Splitter.on(" ").splitToList(sentence);
    // The leading token is the label; it is kept out of the morphological analysis.
    String label = parts.get(0);
    String text = String.join(" ", parts.subList(1, parts.size()));
    if (text.isEmpty()) {
        return text;
    }
    List<String> out = new ArrayList<>();
    // The label goes back in front of the processed words.
    out.add(label);
    for (SentenceWordAnalysis wordResult : morphology.analyzeAndDisambiguate(text)) {
        SingleAnalysis chosen = wordResult.getBestAnalysis();
        if (chosen.isUnknown()) {
            // No usable analysis: fall back to the word exactly as it was given.
            out.add(wordResult.getWordAnalysis().getInput());
        } else {
            List<String> candidateLemmas = chosen.getLemmas();
            out.add(candidateLemmas.get(candidateLemmas.size() - 1));
        }
    }
    return String.join(" ", out);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) ArrayList(java.util.ArrayList) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 17 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

the class AmbiguousExampleFinder method extractSentences.

/**
 * Collects example sentences for ambiguous words and writes them, together with their
 * morphological analyses, to two files under {@code data/ambiguity}.
 *
 * <p>Up to the first 100 lines of {@code zemberek-ambigious-words.txt} are used as words.
 * For each word, the sentences returned by the finder are written to {@code sentences.txt}.
 * Each sentence is then analyzed; sentences for which {@code containsUnkown} is true are
 * skipped, and the rest are written to {@code sentences.morph.txt} with every analysis of
 * every word. When a word has more than one analysis, the one equal to the best analysis
 * is marked with a trailing {@code *}.
 *
 * @param morphology analyzer used to analyze and disambiguate each sentence
 * @param finder source of example sentences for a given word
 * @throws Exception e.g. if the word list cannot be read or an output file cannot be opened
 */
private static void extractSentences(TurkishMorphology morphology, AmbiguousExampleFinder finder) throws Exception {
    List<String> allWords = Files.readAllLines(Paths.get("data/ambiguity/zemberek-ambigious-words.txt"), StandardCharsets.UTF_8);
    // Clamp the limit: an unconditional subList(0, 100) throws IndexOutOfBoundsException
    // when the word list holds fewer than 100 lines.
    List<String> ambiguousWords = allWords.subList(0, Math.min(100, allWords.size()));
    Path out = Paths.get("data/ambiguity/sentences.txt");
    Path morph = Paths.get("data/ambiguity/sentences.morph.txt");
    try (PrintWriter pw = new PrintWriter(out.toFile(), "utf-8");
        PrintWriter pwMorph = new PrintWriter(morph.toFile(), "utf-8")) {
        for (String word : ambiguousWords) {
            Log.info(word);
            // NOTE(review): the meaning of the numeric arguments (3, 5, 10) is defined by
            // AmbiguousExampleFinder.getSentences, which is not visible here.
            List<String> sentences = finder.getSentences(word, 3, 5, 10);
            pw.println(word);
            sentences.forEach(pw::println);
            pw.println();
            for (String sentence : sentences) {
                SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(sentence);
                if (containsUnkown(analysis)) {
                    continue;
                }
                pwMorph.format("S:%s%n", sentence);
                for (SentenceWordAnalysis sw : analysis) {
                    WordAnalysis wa = sw.getWordAnalysis();
                    pwMorph.println(wa.getInput());
                    SingleAnalysis best = sw.getBestAnalysis();
                    // The count does not change while iterating, so check it once per word.
                    boolean singleAnalysisOnly = wa.analysisCount() == 1;
                    for (SingleAnalysis singleAnalysis : wa) {
                        if (singleAnalysisOnly) {
                            // Nothing to disambiguate, so no best-marker is needed.
                            pwMorph.println(singleAnalysis.formatLong());
                        } else {
                            boolean isBest = singleAnalysis.equals(best);
                            pwMorph.format("%s%s%n", singleAnalysis.formatLong(), isBest ? "*" : "");
                        }
                    }
                }
                pwMorph.println();
            }
        }
    }
}
Also used : Path(java.nio.file.Path) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) PrintWriter(java.io.PrintWriter) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 18 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

the class DistanceBasedStemmer method findStems.

/**
 * Logs, for every word of the given sentence, its candidate lemmas ranked by a
 * distance-based score, followed by the dictionary items of every word analysis.
 *
 * <p>The sentence is wrapped in two {@code <s>} and two {@code </s>} markers before
 * analysis, and the scoring loop skips the first two and last two analyzed positions.
 * NOTE(review): this presumably relies on each marker being analyzed as its own token,
 * so that two neighbours exist on each side of every real word; confirm against the analyzer.
 *
 * @param str the raw sentence to process
 */
public void findStems(String str) {
    String padded = "<s> <s> " + str + " </s> </s>";
    SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(padded);
    List<SentenceWordAnalysis> words = analysis.getWordAnalyses();
    for (int pos = 2; pos < analysis.size() - 2; pos++) {
        SentenceWordAnalysis current = words.get(pos);
        String surface = words.get(pos).getWordAnalysis().getInput();

        // Context order is: previous, second previous, next, second next.
        List<String> bigramContext = Lists.newArrayList(
            normalize(words.get(pos - 1).getWordAnalysis().getInput()),
            normalize(words.get(pos - 2).getWordAnalysis().getInput()),
            normalize(words.get(pos + 1).getWordAnalysis().getInput()),
            normalize(words.get(pos + 2).getWordAnalysis().getInput()));
        // NOTE(review): unigramContext is built but never read in this method.
        List<String> unigramContext = Lists.newArrayList(
            normalize(words.get(pos - 1).getWordAnalysis().getInput()),
            normalize(words.get(pos + 1).getWordAnalysis().getInput()));

        WordAnalysis candidates = current.getWordAnalysis();
        Set<String> stems = candidates.stream()
            .map(candidate -> normalize(candidate.getDictionaryItem().lemma))
            .collect(Collectors.toSet());

        List<ScoredItem<String>> ranked = new ArrayList<>();
        for (String stem : stems) {
            if (!this.distances.containsWord(stem)) {
                Log.info("Cannot find %s in vocab.", stem);
                continue;
            }
            List<WordDistances.Distance> neighbours = this.distances.getDistance(stem);
            float total = totalDistance(stem, bigramContext);
            int visited = 0;
            for (WordDistances.Distance neighbour : neighbours) {
                total += distance(surface, neighbour.word);
                visited++;
                // At most eleven neighbours contribute to the score.
                if (visited > 10) {
                    break;
                }
            }
            ranked.add(new ScoredItem<>(stem, total));
        }
        Collections.sort(ranked);
        Log.info("%n%s : ", surface);
        for (ScoredItem<String> entry : ranked) {
            Log.info("Lemma = %s Score = %.7f", entry.item, entry.score);
        }
    }
    Log.info("==== Z disambiguation result ===== ");
    for (SentenceWordAnalysis wordResult : analysis) {
        Log.info("%n%s : ", wordResult.getWordAnalysis().getInput());
        // Each distinct dictionary item is logged once, in first-seen order.
        LinkedHashSet<String> seen = new LinkedHashSet<>();
        for (SingleAnalysis single : wordResult.getWordAnalysis()) {
            seen.add(single.getDictionaryItem().toString());
        }
        seen.forEach(item -> Log.info("%s", item));
    }
}
Also used : TurkishMorphology(zemberek.morphology.TurkishMorphology) Scanner(java.util.Scanner) Set(java.util.Set) IOException(java.io.IOException) HashMap(java.util.HashMap) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) Collectors(java.util.stream.Collectors) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) List(java.util.List) Lists(com.google.common.collect.Lists) Paths(java.nio.file.Paths) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) Locale(java.util.Locale) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Map(java.util.Map) ScoredItem(zemberek.core.ScoredItem) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) Collections(java.util.Collections) LinkedHashSet(java.util.LinkedHashSet) LinkedHashSet(java.util.LinkedHashSet) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ScoredItem(zemberek.core.ScoredItem) ArrayList(java.util.ArrayList) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 19 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

the class ClassificationExampleBase method replaceWordsWithLemma.

/**
 * Replaces the words of a labeled sentence with lemmas while keeping the label in place.
 *
 * <p>The sentence is split on single spaces; the first token is treated as the label and
 * the rest is analyzed with {@code morphology.analyzeAndDisambiguate}. Each word is
 * replaced by the first entry of its best analysis' lemma list, unless that analysis is
 * unknown, in which case the original word is kept.
 *
 * @param sentence a line of the form {@code "<label> <word> <word> ..."}
 * @return the label and the replaced words joined with single spaces; an empty string
 *     (without the label) when the line holds nothing but the label
 */
protected String replaceWordsWithLemma(String sentence) {
    List<String> pieces = Splitter.on(" ").splitToList(sentence);
    // First token is assumed to be the label and is excluded from the analysis.
    String label = pieces.get(0);
    String body = String.join(" ", pieces.subList(1, pieces.size()));
    if (body.length() == 0) {
        return body;
    }
    SentenceAnalysis analyzed = morphology.analyzeAndDisambiguate(body);
    List<String> result = new ArrayList<>();
    // Put the label back as the first output token.
    result.add(label);
    for (SentenceWordAnalysis item : analyzed) {
        SingleAnalysis top = item.getBestAnalysis();
        // Unknown words pass through untouched; known words become their first lemma.
        String replacement = top.isUnknown()
            ? item.getWordAnalysis().getInput()
            : top.getLemmas().get(0);
        result.add(replacement);
    }
    return String.join(" ", result);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) ArrayList(java.util.ArrayList) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 20 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

the class Z3MarkovModelDisambiguator method getAmbiguousSequence.

/**
 * Converts an analyzed sentence into a sequence of {@code Ambiguous} entries holding
 * vocabulary indexes for the root part and the last inflectional-group part of every parse.
 *
 * <p>The returned array starts with two {@code startWord} entries, continues with one
 * entry per sentence word, and ends with a single {@code endWord} entry.
 *
 * @param sentence the analyzed sentence whose entries and parses are converted
 * @return an array of length {@code sentence.size() + 3}
 */
public Ambiguous[] getAmbiguousSequence(SentenceAnalysis sentence) {
    Ambiguous[] sequence = new Ambiguous[sentence.size() + 3];
    sequence[0] = startWord;
    sequence[1] = startWord;
    int slot = 2;
    for (SentenceAnalysis.Entry entry : sentence) {
        int parseCount = entry.parses.size();
        int[] rootIds = new int[parseCount];
        int[] lastIgIds = new int[parseCount];
        int p = 0;
        for (WordAnalysis parse : entry.parses) {
            // Root key: the lemma, optionally extended with the first inflectional group.
            String rootKey = parse.dictionaryItem.lemma;
            WordAnalysis.InflectionalGroup first = parse.inflectionalGroups.get(0);
            String firstFormatted = first.formatNoSurface();
            if (first.suffixList.size() == 0) {
                rootKey += firstFormatted;
            } else {
                String afterSemicolon = Strings.subStringAfterFirst(firstFormatted, ";");
                // NOTE(review): for any other suffix part the root key stays the bare
                // lemma; confirm this is intended.
                if (afterSemicolon.equals("A3sg+Pnon+Nom)")) {
                    rootKey += (Strings.subStringUntilFirst(firstFormatted, ";") + ")");
                }
            }
            rootIds[p] = rootLm.getVocabulary().indexOf(rootKey);

            // IG key: the last group, prefixed by the one before it when that group has
            // no suffixes.
            int groupCount = parse.inflectionalGroups.size();
            boolean prependPrevious = groupCount > 1
                && parse.inflectionalGroups.get(groupCount - 2).suffixList.size() == 0;
            String igKey;
            if (prependPrevious) {
                // NOTE(review): the last group is appended via string conversion here,
                // not via formatNoSurface() as in the other branch; confirm.
                igKey = parse.inflectionalGroups.get(groupCount - 2).formatNoSurface() + parse.getLastIg();
            } else {
                igKey = parse.getLastIg().formatNoSurface();
            }
            lastIgIds[p] = igLm.getVocabulary().indexOf(igKey);
            p++;
        }
        sequence[slot++] = new Ambiguous(rootIds, lastIgIds);
    }
    sequence[slot] = endWord;
    return sequence;
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis)

Aggregations

SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 35; SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 22; SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 19; WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 19; ArrayList (java.util.ArrayList): 13; TurkishMorphology (zemberek.morphology.TurkishMorphology): 10; PrintWriter (java.io.PrintWriter): 5; Histogram (zemberek.core.collections.Histogram): 5; Test (org.junit.Test): 4; Token (zemberek.tokenization.Token): 4; Stopwatch (com.google.common.base.Stopwatch): 3; IOException (java.io.IOException): 3; Path (java.nio.file.Path): 3; Ignore (org.junit.Ignore): 3; Log (zemberek.core.logging.Log): 3; Lists (com.google.common.collect.Lists): 2; Paths (java.nio.file.Paths): 2; Collections (java.util.Collections): 2; LinkedHashSet (java.util.LinkedHashSet): 2; List (java.util.List): 2