Search in sources :

Example 26 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class QuestionClassifier method replaceWordsWithLemma.

/**
 * Rewrites a labeled sentence so that each analyzed word is replaced by a lemma.
 *
 * <p>The input is split on single spaces and the first token is treated as the label.
 * The remaining tokens are re-joined and passed to {@code morphology.analyzeAndDisambiguate}.
 * For every analyzed word, the original input is kept when the best analysis is unknown;
 * otherwise the last entry of {@code getLemmas()} is used.
 *
 * @param sentence a space separated line whose first token is the label
 * @return the label followed by the replaced words, joined with single spaces; the empty
 *     string (without the label) when nothing follows the label
 */
private String replaceWordsWithLemma(String sentence) {
    List<String> parts = Splitter.on(" ").splitToList(sentence);
    // The first token is assumed to be the label; only the rest goes to the analyzer.
    String label = parts.get(0);
    String body = String.join(" ", parts.subList(1, parts.size()));
    if (body.isEmpty()) {
        return body;
    }
    List<String> output = new ArrayList<>();
    // The label always leads the result.
    output.add(label);
    for (SentenceWordAnalysis wordResult : morphology.analyzeAndDisambiguate(body)) {
        SingleAnalysis chosen = wordResult.getBestAnalysis();
        if (chosen.isUnknown()) {
            // No usable analysis: fall back to the word as it was given.
            output.add(wordResult.getWordAnalysis().getInput());
        } else {
            List<String> lemmas = chosen.getLemmas();
            output.add(lemmas.get(lemmas.size() - 1));
        }
    }
    return String.join(" ", output);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) ArrayList(java.util.ArrayList) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 27 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class AnalyzeWords method main.

/**
 * Analyzes the word "kalemi" with a default {@code TurkishMorphology} instance and logs
 * every resulting analysis in three formats, followed by an empty log call.
 *
 * @param args command line arguments; not read
 */
public static void main(String[] args) {
    TurkishMorphology analyzer = TurkishMorphology.createWithDefaults();
    String word = "kalemi";
    Log.info("Word = " + word);
    WordAnalysis analyses = analyzer.analyze(word);
    for (SingleAnalysis analysis : analyses) {
        String longForm = analysis.formatLong();
        Log.info("Lexical and Surface : " + longForm);
        String lexicalForm = analysis.formatLexical();
        Log.info("Only Lexical        : " + lexicalForm);
        String oflazerForm = AnalysisFormatters.OFLAZER_STYLE.format(analysis);
        Log.info("Oflazer style       : " + oflazerForm);
        // Separates consecutive analyses in the log output.
        Log.info();
    }
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Example 28 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class AmbiguousExampleFinder method extractSentences.

/**
 * Writes example sentences for ambiguous words, together with their morphological analyses,
 * to two output files.
 *
 * <p>At most the first 100 lines of {@code data/ambiguity/zemberek-ambigious-words.txt} are
 * used as words. For each word, the sentences returned by {@code finder} are written to
 * {@code data/ambiguity/sentences.txt}. Every sentence is then analyzed and disambiguated;
 * sentences for which {@code containsUnkown} reports true are skipped, the others are written
 * to {@code data/ambiguity/sentences.morph.txt} with all analyses of each word. When a word
 * has more than one analysis, the one equal to the best analysis is marked with a trailing
 * {@code *}.
 *
 * @param morphology analyzer used to analyze and disambiguate each sentence
 * @param finder provider of example sentences for a word
 * @throws Exception if the word list cannot be read, an output file cannot be opened, or a
 *     callee throws
 */
private static void extractSentences(TurkishMorphology morphology, AmbiguousExampleFinder finder) throws Exception {
    List<String> allWords = Files.readAllLines(Paths.get("data/ambiguity/zemberek-ambigious-words.txt"), StandardCharsets.UTF_8);
    // Clamp the upper bound: subList(0, 100) throws IndexOutOfBoundsException when the
    // file holds fewer than 100 lines.
    List<String> ambiguousWords = allWords.subList(0, Math.min(100, allWords.size()));
    Path out = Paths.get("data/ambiguity/sentences.txt");
    Path morph = Paths.get("data/ambiguity/sentences.morph.txt");
    try (PrintWriter pw = new PrintWriter(out.toFile(), "utf-8");
        PrintWriter pwMorph = new PrintWriter(morph.toFile(), "utf-8")) {
        for (String word : ambiguousWords) {
            Log.info(word);
            // NOTE(review): the meaning of 3, 5, 10 is defined by
            // AmbiguousExampleFinder.getSentences, which is not visible here.
            List<String> sentences = finder.getSentences(word, 3, 5, 10);
            pw.println(word);
            sentences.forEach(pw::println);
            pw.println();
            for (String sentence : sentences) {
                SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(sentence);
                if (containsUnkown(analysis)) {
                    continue;
                }
                pwMorph.format("S:%s%n", sentence);
                for (SentenceWordAnalysis sw : analysis) {
                    WordAnalysis wa = sw.getWordAnalysis();
                    pwMorph.println(wa.getInput());
                    SingleAnalysis best = sw.getBestAnalysis();
                    // A single analysis needs no marker; decide once per word.
                    boolean single = wa.analysisCount() == 1;
                    for (SingleAnalysis singleAnalysis : wa) {
                        if (single) {
                            pwMorph.println(singleAnalysis.formatLong());
                        } else {
                            String marker = singleAnalysis.equals(best) ? "*" : "";
                            pwMorph.format("%s%s%n", singleAnalysis.formatLong(), marker);
                        }
                    }
                }
                pwMorph.println();
            }
        }
    }
}
Also used : Path(java.nio.file.Path) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) PrintWriter(java.io.PrintWriter) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 29 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class DistanceBasedStemmer method findStems.

/**
 * Logs, for every word of the given text, its candidate stems ordered by a distance based
 * score, and afterwards logs the dictionary items of every analyzed token.
 *
 * <p>The text is wrapped in two {@code <s>} and two {@code </s>} markers before analysis, and
 * the first two and last two analyzed entries are skipped in the scoring loop (presumably the
 * markers; this relies on how the analyzer tokenizes them).
 *
 * @param str the text to analyze
 */
public void findStems(String str) {
    String padded = "<s> <s> " + str + " </s> </s>";
    SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(padded);
    List<SentenceWordAnalysis> words = analysis.getWordAnalyses();
    int end = analysis.size() - 2;
    for (int pos = 2; pos < end; pos++) {
        SentenceWordAnalysis current = words.get(pos);
        String surface = current.getWordAnalysis().getInput();
        String prev1 = words.get(pos - 1).getWordAnalysis().getInput();
        String prev2 = words.get(pos - 2).getWordAnalysis().getInput();
        String next1 = words.get(pos + 1).getWordAnalysis().getInput();
        String next2 = words.get(pos + 2).getWordAnalysis().getInput();
        List<String> bigramContext = Lists.newArrayList(normalize(prev1), normalize(prev2), normalize(next1), normalize(next2));
        // NOTE(review): built but not read anywhere in this method.
        List<String> unigramContext = Lists.newArrayList(normalize(prev1), normalize(next1));
        WordAnalysis wordResults = current.getWordAnalysis();
        Set<String> stems = wordResults.stream().map(a -> normalize(a.getDictionaryItem().lemma)).collect(Collectors.toSet());
        List<ScoredItem<String>> scores = new ArrayList<>();
        for (String candidate : stems) {
            if (this.distances.containsWord(candidate)) {
                List<WordDistances.Distance> neighbours = this.distances.getDistance(candidate);
                float total = totalDistance(candidate, bigramContext);
                int visited = 0;
                for (WordDistances.Distance neighbour : neighbours) {
                    total += distance(surface, neighbour.word);
                    visited++;
                    // Stops once the eleventh neighbour has been added.
                    if (visited > 10) {
                        break;
                    }
                }
                scores.add(new ScoredItem<>(candidate, total));
            } else {
                Log.info("Cannot find %s in vocab.", candidate);
            }
        }
        Collections.sort(scores);
        Log.info("%n%s : ", surface);
        for (ScoredItem<String> scored : scores) {
            Log.info("Lemma = %s Score = %.7f", scored.item, scored.score);
        }
    }
    Log.info("==== Z disambiguation result ===== ");
    for (SentenceWordAnalysis entry : analysis) {
        WordAnalysis entryAnalyses = entry.getWordAnalysis();
        Log.info("%n%s : ", entryAnalyses.getInput());
        // Log each distinct dictionary item once, in first-seen order.
        Set<String> seen = new LinkedHashSet<>();
        for (SingleAnalysis single : entryAnalyses) {
            String item = single.getDictionaryItem().toString();
            if (seen.add(item)) {
                Log.info("%s", item);
            }
        }
    }
}
Also used : TurkishMorphology(zemberek.morphology.TurkishMorphology) Scanner(java.util.Scanner) Set(java.util.Set) IOException(java.io.IOException) HashMap(java.util.HashMap) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) Collectors(java.util.stream.Collectors) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) List(java.util.List) Lists(com.google.common.collect.Lists) Paths(java.nio.file.Paths) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) Locale(java.util.Locale) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Map(java.util.Map) ScoredItem(zemberek.core.ScoredItem) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) Collections(java.util.Collections) LinkedHashSet(java.util.LinkedHashSet) LinkedHashSet(java.util.LinkedHashSet) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ScoredItem(zemberek.core.ScoredItem) ArrayList(java.util.ArrayList) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 30 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class ClassificationExampleBase method replaceWordsWithLemma.

/**
 * Replaces the words of a labeled sentence with lemmas, leaving the label in place.
 *
 * <p>The sentence is split on single spaces; the first token is taken as the label and is
 * excluded from morphological analysis. Each analyzed word contributes its original input
 * when the best analysis is unknown, and the first entry of {@code getLemmas()} otherwise.
 *
 * @param sentence a space separated line that starts with a label
 * @return the label and the replaced words joined with single spaces; the empty string
 *     (without the label) when the sentence contains nothing after the label
 */
protected String replaceWordsWithLemma(String sentence) {
    List<String> pieces = Splitter.on(" ").splitToList(sentence);
    // First piece is assumed to be the label and is kept out of the analysis.
    String label = pieces.get(0);
    String remainder = String.join(" ", pieces.subList(1, pieces.size()));
    if (remainder.length() == 0) {
        return remainder;
    }
    List<String> result = new ArrayList<>();
    // Label goes first.
    result.add(label);
    for (SentenceWordAnalysis item : morphology.analyzeAndDisambiguate(remainder)) {
        SingleAnalysis best = item.getBestAnalysis();
        String replacement = best.isUnknown()
                ? item.getWordAnalysis().getInput()
                : best.getLemmas().get(0);
        result.add(replacement);
    }
    return String.join(" ", result);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) ArrayList(java.util.ArrayList) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Aggregations

SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis) 55, WordAnalysis (zemberek.morphology.analysis.WordAnalysis) 38, ArrayList (java.util.ArrayList) 25, SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis) 23, TurkishMorphology (zemberek.morphology.TurkishMorphology) 21, SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis) 18, Test (org.junit.Test) 15, LinkedHashSet (java.util.LinkedHashSet) 13, PrintWriter (java.io.PrintWriter) 10, Path (java.nio.file.Path) 10, Histogram (zemberek.core.collections.Histogram) 10, Token (zemberek.tokenization.Token) 7, IOException (java.io.IOException) 6, Ignore (org.junit.Ignore) 6, Log (zemberek.core.logging.Log) 6, HashSet (java.util.HashSet) 5, List (java.util.List) 5, Collectors (java.util.stream.Collectors) 5, Paths (java.nio.file.Paths) 4, Files (java.nio.file.Files) 3