Search in sources :

Example 1 with SentenceWordAnalysis

use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.

the class PerceptronAmbiguityResolver method disambiguate.

/**
 * Builds a {@link SentenceAnalysis} by pairing every word's candidate analyses with the
 * entry found at the same position in the decoder's best path.
 *
 * @param sentence the sentence text, passed through unchanged to the result
 * @param allAnalyses per-word candidate analyses, in sentence order
 * @return the sentence together with one selected analysis per word
 */
@Override
public SentenceAnalysis disambiguate(String sentence, List<WordAnalysis> allAnalyses) {
    DecodeResult decoded = decoder.bestPath(allAnalyses);
    List<SentenceWordAnalysis> resolved = new ArrayList<>();
    // The best path is indexed in parallel with the input list.
    int position = 0;
    for (WordAnalysis candidates : allAnalyses) {
        SingleAnalysis chosen = decoded.bestParse.get(position);
        resolved.add(new SentenceWordAnalysis(chosen, candidates));
        position++;
    }
    return new SentenceAnalysis(sentence, resolved);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis)

Example 2 with SentenceWordAnalysis

use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.

the class ClassificationConsole method replaceWordsWithLemma.

/**
 * Rewrites a sentence so that each analyzed word is replaced by the last lemma of its
 * best analysis; words whose best analysis is unknown keep their original input form.
 *
 * @param sentence the sentence to analyze and disambiguate
 * @return the replacement words joined with single spaces
 */
private String replaceWordsWithLemma(String sentence) {
    SentenceAnalysis disambiguated = morphology.analyzeAndDisambiguate(sentence);
    List<String> words = new ArrayList<>();
    for (SentenceWordAnalysis entry : disambiguated) {
        SingleAnalysis chosen = entry.getBestAnalysis();
        if (!chosen.isUnknown()) {
            List<String> lemmas = chosen.getLemmas();
            int lastIndex = lemmas.size() - 1;
            words.add(lemmas.get(lastIndex));
        } else {
            // No usable analysis: fall back to the surface form.
            words.add(entry.getWordAnalysis().getInput());
        }
    }
    return String.join(" ", words);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) ArrayList(java.util.ArrayList) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 3 with SentenceWordAnalysis

use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.

the class QuestionClassifier method replaceWordsWithLemma.

/**
 * Lemmatizes a labelled line. The first space-separated token is treated as the label
 * and is kept verbatim at the front of the result; the remaining tokens are analyzed
 * and each word is replaced by the last lemma of its best analysis (or left as-is when
 * the best analysis is unknown).
 *
 * @param sentence a line of the form {@code "<label> <word> <word> ..."}
 * @return the label followed by the replacement words, space separated; an empty string
 *     when nothing follows the label
 */
private String replaceWordsWithLemma(String sentence) {
    List<String> parts = Splitter.on(" ").splitToList(sentence);
    // The label is excluded from morphological analysis.
    String label = parts.get(0);
    String body = String.join(" ", parts.subList(1, parts.size()));
    if (body.isEmpty()) {
        return body;
    }

    SentenceAnalysis disambiguated = morphology.analyzeAndDisambiguate(body);
    List<String> output = new ArrayList<>();
    output.add(label);
    for (SentenceWordAnalysis entry : disambiguated) {
        SingleAnalysis chosen = entry.getBestAnalysis();
        if (!chosen.isUnknown()) {
            List<String> lemmas = chosen.getLemmas();
            int lastIndex = lemmas.size() - 1;
            output.add(lemmas.get(lastIndex));
        } else {
            output.add(entry.getWordAnalysis().getInput());
        }
    }
    return String.join(" ", output);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) ArrayList(java.util.ArrayList) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 4 with SentenceWordAnalysis

use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.

the class AmbiguousExampleFinder method extractSentences.

/**
 * Reads the first 100 lines of the ambiguous-word list and, for each word, writes the
 * sentences returned by {@code finder} to {@code sentences.txt} and their morphological
 * analyses to {@code sentences.morph.txt}.
 *
 * <p>Sentences for which {@code containsUnkown} reports true are left out of the morph
 * file. In the morph file, when a word has more than one analysis the one equal to the
 * best analysis is suffixed with {@code *}.
 *
 * @param morphology analyzer used to analyze and disambiguate every sentence
 * @param finder source of example sentences for a given word
 * @throws Exception if reading the word list or writing either output file fails
 */
private static void extractSentences(TurkishMorphology morphology, AmbiguousExampleFinder finder) throws Exception {
    Path wordList = Paths.get("data/ambiguity/zemberek-ambigious-words.txt");
    List<String> ambiguousWords = Files.readAllLines(wordList, StandardCharsets.UTF_8).subList(0, 100);
    Path sentenceFile = Paths.get("data/ambiguity/sentences.txt");
    Path morphFile = Paths.get("data/ambiguity/sentences.morph.txt");
    try (PrintWriter sentenceOut = new PrintWriter(sentenceFile.toFile(), "utf-8");
        PrintWriter morphOut = new PrintWriter(morphFile.toFile(), "utf-8")) {
        for (String word : ambiguousWords) {
            Log.info(word);
            List<String> sentences = finder.getSentences(word, 3, 5, 10);

            // Plain-text section: the word, its sentences, then a blank separator line.
            sentenceOut.println(word);
            for (String line : sentences) {
                sentenceOut.println(line);
            }
            sentenceOut.println();

            for (String sentence : sentences) {
                SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(sentence);
                if (!containsUnkown(analysis)) {
                    morphOut.format("S:%s%n", sentence);
                    for (SentenceWordAnalysis wordEntry : analysis) {
                        WordAnalysis candidates = wordEntry.getWordAnalysis();
                        morphOut.println(candidates.getInput());
                        SingleAnalysis chosen = wordEntry.getBestAnalysis();
                        for (SingleAnalysis candidate : candidates) {
                            boolean matchesBest = candidate.equals(chosen);
                            // A lone analysis is never marked; otherwise only the best one is.
                            String marker = "";
                            if (candidates.analysisCount() != 1 && matchesBest) {
                                marker = "*";
                            }
                            morphOut.format("%s%s%n", candidate.formatLong(), marker);
                        }
                    }
                    morphOut.println();
                }
            }
        }
    }
}
Also used : Path(java.nio.file.Path) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) PrintWriter(java.io.PrintWriter)

Example 5 with SentenceWordAnalysis

use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.

the class DistanceBasedStemmer method findStems.

/**
 * Logs, for every word of {@code str}, the candidate lemmas ranked by a distance-based
 * score, followed by the dictionary items found for each token of the padded sentence.
 *
 * <p>The input is wrapped with two {@code <s>} and two {@code </s>} markers before
 * analysis, and the scoring loop skips the first two and last two entries of the
 * analysis result. A lemma's score is {@code totalDistance(lemma, context)} plus
 * {@code distance(word, neighbour)} over at most the first eleven entries returned by
 * {@code getDistance(lemma)}. Lemmas for which {@code containsWord} is false are
 * reported and skipped.
 *
 * @param str the sentence whose words should be examined
 */
public void findStems(String str) {
    String padded = "<s> <s> " + str + " </s> </s>";
    SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(padded);
    List<SentenceWordAnalysis> entries = analysis.getWordAnalyses();
    for (int pos = 2; pos < analysis.size() - 2; pos++) {
        SentenceWordAnalysis current = entries.get(pos);
        String surface = current.getWordAnalysis().getInput();

        // Two neighbours on each side of the current position.
        List<String> bigramContext = Lists.newArrayList(
            normalize(entries.get(pos - 1).getWordAnalysis().getInput()),
            normalize(entries.get(pos - 2).getWordAnalysis().getInput()),
            normalize(entries.get(pos + 1).getWordAnalysis().getInput()),
            normalize(entries.get(pos + 2).getWordAnalysis().getInput()));
        // NOTE(review): built but not read anywhere else in this method.
        List<String> unigramContext = Lists.newArrayList(
            normalize(entries.get(pos - 1).getWordAnalysis().getInput()),
            normalize(entries.get(pos + 1).getWordAnalysis().getInput()));

        WordAnalysis candidates = current.getWordAnalysis();
        Set<String> lemmaSet = candidates.stream()
            .map(a -> normalize(a.getDictionaryItem().lemma))
            .collect(Collectors.toSet());

        List<ScoredItem<String>> ranked = new ArrayList<>();
        for (String lemma : lemmaSet) {
            if (!this.distances.containsWord(lemma)) {
                Log.info("Cannot find %s in vocab.", lemma);
                continue;
            }
            List<WordDistances.Distance> neighbours = this.distances.getDistance(lemma);
            float total = totalDistance(lemma, bigramContext);
            int visited = 0;
            for (WordDistances.Distance neighbour : neighbours) {
                total += distance(surface, neighbour.word);
                visited++;
                // Stop once eleven neighbours have contributed.
                if (visited > 10) {
                    break;
                }
            }
            ranked.add(new ScoredItem<>(lemma, total));
        }
        Collections.sort(ranked);

        Log.info("%n%s : ", surface);
        for (ScoredItem<String> entry : ranked) {
            Log.info("Lemma = %s Score = %.7f", entry.item, entry.score);
        }
    }

    Log.info("==== Z disambiguation result ===== ");
    for (SentenceWordAnalysis wordEntry : analysis) {
        Log.info("%n%s : ", wordEntry.getWordAnalysis().getInput());
        // Insertion-ordered set: each distinct dictionary item is logged once.
        LinkedHashSet<String> dictionaryItems = new LinkedHashSet<>();
        for (SingleAnalysis single : wordEntry.getWordAnalysis()) {
            dictionaryItems.add(single.getDictionaryItem().toString());
        }
        for (String dictionaryItem : dictionaryItems) {
            Log.info("%s", dictionaryItem);
        }
    }
}
Also used : TurkishMorphology(zemberek.morphology.TurkishMorphology) Scanner(java.util.Scanner) Set(java.util.Set) IOException(java.io.IOException) HashMap(java.util.HashMap) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) Collectors(java.util.stream.Collectors) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) List(java.util.List) Lists(com.google.common.collect.Lists) Paths(java.nio.file.Paths) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) Locale(java.util.Locale) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Map(java.util.Map) ScoredItem(zemberek.core.ScoredItem) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) Collections(java.util.Collections) LinkedHashSet(java.util.LinkedHashSet)

Aggregations

SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 19, SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 19, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 18, ArrayList (java.util.ArrayList): 12, WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 8, TurkishMorphology (zemberek.morphology.TurkishMorphology): 7, PrintWriter (java.io.PrintWriter): 4, Histogram (zemberek.core.collections.Histogram): 4, Path (java.nio.file.Path): 3, Token (zemberek.tokenization.Token): 3, Lists (com.google.common.collect.Lists): 2, IOException (java.io.IOException): 2, Paths (java.nio.file.Paths): 2, Collections (java.util.Collections): 2, LinkedHashSet (java.util.LinkedHashSet): 2, List (java.util.List): 2, Scanner (java.util.Scanner): 2, Collectors (java.util.stream.Collectors): 2, Log (zemberek.core.logging.Log): 2, Files (java.nio.file.Files): 1