Search in sources :

Example 6 with SentenceWordAnalysis

use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.

the class Scripts method saveUnambiguous.

/**
 * Writes the analyses of every sentence that has no unknown word to {@code out} (UTF-8).
 *
 * <p>Each sentence is run through {@code morphology.analyzeAndDisambiguate}. A sentence is skipped
 * when any entry of its best-analysis list reports {@code isUnknown()}. For a kept sentence the
 * output is an {@code S:<sentence>} line, then for each word its input followed by one line per
 * candidate analysis in {@code formatLong()} form, and finally a blank line. When a word has more
 * than one candidate, the candidate equal to the chosen best analysis is suffixed with {@code *}.
 *
 * @param sentences sentences to analyze
 * @param morphology analyzer used for analysis and disambiguation
 * @param out destination file
 * @throws IOException if the output file cannot be opened
 */
public static void saveUnambiguous(List<String> sentences, TurkishMorphology morphology, Path out) throws IOException {
    try (PrintWriter writer = new PrintWriter(out.toFile(), "utf-8")) {
        for (String sentence : sentences) {
            SentenceAnalysis sentenceAnalysis = morphology.analyzeAndDisambiguate(sentence);
            boolean hasUnknown = sentenceAnalysis.bestAnalysis().stream().anyMatch(SingleAnalysis::isUnknown);
            if (!hasUnknown) {
                writer.format("S:%s%n", sentence);
                for (SentenceWordAnalysis entry : sentenceAnalysis) {
                    WordAnalysis candidates = entry.getWordAnalysis();
                    writer.println(candidates.getInput());
                    SingleAnalysis selected = entry.getBestAnalysis();
                    for (SingleAnalysis candidate : candidates) {
                        if (candidates.analysisCount() != 1) {
                            // Several candidates: flag the one the disambiguator picked.
                            String marker = candidate.equals(selected) ? "*" : "";
                            writer.format("%s%s%n", candidate.formatLong(), marker);
                        } else {
                            // A lone candidate needs no marker.
                            writer.println(candidate.formatLong());
                        }
                    }
                }
                writer.println();
            }
        }
    }
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) PrintWriter(java.io.PrintWriter)

Example 7 with SentenceWordAnalysis

use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.

the class PerceptronAmbiguityResolver method disambiguate.

/**
 * Builds a {@link SentenceAnalysis} by pairing each word's candidate analyses with the analysis
 * chosen for it by the decoder.
 *
 * <p>The decoder's {@code bestParse} is read position by position, so entry {@code i} of it is
 * paired with entry {@code i} of {@code allAnalyses}.
 *
 * @param sentence the sentence the analyses belong to
 * @param allAnalyses candidate analyses, one entry per word
 * @return the sentence together with one (best analysis, candidates) pair per word
 */
@Override
public SentenceAnalysis disambiguate(String sentence, List<WordAnalysis> allAnalyses) {
    DecodeResult decoded = decoder.bestPath(allAnalyses);
    List<SentenceWordAnalysis> paired = new ArrayList<>();
    int position = 0;
    for (WordAnalysis candidates : allAnalyses) {
        SingleAnalysis chosen = decoded.bestParse.get(position);
        paired.add(new SentenceWordAnalysis(chosen, candidates));
        position++;
    }
    return new SentenceAnalysis(sentence, paired);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis)

Example 8 with SentenceWordAnalysis

use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.

the class ClassificationConsole method replaceWordsWithLemma.

/**
 * Rewrites {@code sentence} so that each analyzed word is replaced by the last lemma of its best
 * analysis. A word whose best analysis is unknown is kept as its original input.
 *
 * @param sentence text to analyze
 * @return the replacement words joined with single spaces
 */
private String replaceWordsWithLemma(String sentence) {
    SentenceAnalysis sentenceAnalysis = morphology.analyzeAndDisambiguate(sentence);
    List<String> words = new ArrayList<>();
    for (SentenceWordAnalysis entry : sentenceAnalysis) {
        SingleAnalysis chosen = entry.getBestAnalysis();
        if (!chosen.isUnknown()) {
            List<String> lemmaList = chosen.getLemmas();
            int lastIndex = lemmaList.size() - 1;
            words.add(lemmaList.get(lastIndex));
        } else {
            // No usable analysis: fall back to the surface form.
            words.add(entry.getWordAnalysis().getInput());
        }
    }
    return String.join(" ", words);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) ArrayList(java.util.ArrayList) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 9 with SentenceWordAnalysis

use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.

the class QuestionClassifier method replaceWordsWithLemma.

/**
 * Lemmatizes a labelled line while keeping its label in front.
 *
 * <p>The line is split on single spaces; the first token is taken as the label and the rest is
 * re-joined and analyzed. Each analyzed word is replaced by the last lemma of its best analysis,
 * or kept as its original input when that analysis is unknown. If nothing remains after removing
 * the label, an empty string is returned (the label is not included in that case).
 *
 * @param sentence a line of the form {@code <label> <word> <word> ...}
 * @return the label followed by the replacement words, joined with single spaces; or an empty
 *     string when the line has no content after the label
 */
private String replaceWordsWithLemma(String sentence) {
    List<String> parts = Splitter.on(" ").splitToList(sentence);
    // The first token is assumed to be the label; it is excluded from morphological analysis.
    String label = parts.get(0);
    String content = String.join(" ", parts.subList(1, parts.size()));
    if (content.isEmpty()) {
        return content;
    }
    SentenceAnalysis sentenceAnalysis = morphology.analyzeAndDisambiguate(content);
    List<String> words = new ArrayList<>();
    // The label always leads the output.
    words.add(label);
    for (SentenceWordAnalysis entry : sentenceAnalysis) {
        SingleAnalysis chosen = entry.getBestAnalysis();
        if (!chosen.isUnknown()) {
            List<String> lemmaList = chosen.getLemmas();
            int lastIndex = lemmaList.size() - 1;
            words.add(lemmaList.get(lastIndex));
        } else {
            // No usable analysis: fall back to the surface form.
            words.add(entry.getWordAnalysis().getInput());
        }
    }
    return String.join(" ", words);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) ArrayList(java.util.ArrayList) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 10 with SentenceWordAnalysis

use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.

the class AmbiguousExampleFinder method extractSentences.

/**
 * Collects example sentences for ambiguous words and writes them, with their analyses, to disk.
 *
 * <p>Reads at most the first 100 lines of {@code data/ambiguity/zemberek-ambigious-words.txt} as
 * the word list. For each word, the sentences returned by {@code finder.getSentences} are written
 * to {@code data/ambiguity/sentences.txt} under the word, followed by a blank line. Every sentence
 * for which {@code containsUnkown} is false is also written to
 * {@code data/ambiguity/sentences.morph.txt}: an {@code S:<sentence>} line, then for each word its
 * input followed by one line per candidate analysis in {@code formatLong()} form, then a blank
 * line. When a word has more than one candidate, the candidate equal to the chosen best analysis
 * is suffixed with {@code *}.
 *
 * @param morphology analyzer used for analysis and disambiguation
 * @param finder source of example sentences for a word
 * @throws Exception if the word list cannot be read or an output file cannot be opened
 */
private static void extractSentences(TurkishMorphology morphology, AmbiguousExampleFinder finder) throws Exception {
    List<String> allWords = Files.readAllLines(Paths.get("data/ambiguity/zemberek-ambigious-words.txt"), StandardCharsets.UTF_8);
    // Clamp the upper bound so a word file with fewer than 100 lines does not make subList throw.
    List<String> ambiguousWords = allWords.subList(0, Math.min(100, allWords.size()));
    Path out = Paths.get("data/ambiguity/sentences.txt");
    Path morph = Paths.get("data/ambiguity/sentences.morph.txt");
    try (PrintWriter pw = new PrintWriter(out.toFile(), "utf-8");
        PrintWriter pwMorph = new PrintWriter(morph.toFile(), "utf-8")) {
        for (String word : ambiguousWords) {
            Log.info(word);
            // NOTE(review): the meaning of the numeric arguments (3, 5, 10) is defined by
            // AmbiguousExampleFinder.getSentences, which is not visible here.
            List<String> sentences = finder.getSentences(word, 3, 5, 10);
            pw.println(word);
            sentences.forEach(pw::println);
            pw.println();
            for (String sentence : sentences) {
                SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(sentence);
                // Sentences flagged by containsUnkown are left out of the morphology file.
                if (containsUnkown(analysis)) {
                    continue;
                }
                pwMorph.format("S:%s%n", sentence);
                for (SentenceWordAnalysis sw : analysis) {
                    WordAnalysis wa = sw.getWordAnalysis();
                    pwMorph.println(wa.getInput());
                    SingleAnalysis best = sw.getBestAnalysis();
                    for (SingleAnalysis singleAnalysis : wa) {
                        if (wa.analysisCount() == 1) {
                            // A lone candidate needs no marker.
                            pwMorph.println(singleAnalysis.formatLong());
                        } else {
                            // Several candidates: flag the one the disambiguator picked.
                            boolean isBest = singleAnalysis.equals(best);
                            pwMorph.format("%s%s%n", singleAnalysis.formatLong(), isBest ? "*" : "");
                        }
                    }
                }
                pwMorph.println();
            }
        }
    }
}
Also used : Path(java.nio.file.Path) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) PrintWriter(java.io.PrintWriter)

Aggregations

SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis)19 SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis)19 SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis)18 ArrayList (java.util.ArrayList)12 WordAnalysis (zemberek.morphology.analysis.WordAnalysis)8 TurkishMorphology (zemberek.morphology.TurkishMorphology)7 PrintWriter (java.io.PrintWriter)4 Histogram (zemberek.core.collections.Histogram)4 Path (java.nio.file.Path)3 Token (zemberek.tokenization.Token)3 Lists (com.google.common.collect.Lists)2 IOException (java.io.IOException)2 Paths (java.nio.file.Paths)2 Collections (java.util.Collections)2 LinkedHashSet (java.util.LinkedHashSet)2 List (java.util.List)2 Scanner (java.util.Scanner)2 Collectors (java.util.stream.Collectors)2 Log (zemberek.core.logging.Log)2 Files (java.nio.file.Files)1