Search in sources :

Example 11 with SentenceWordAnalysis

use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.

Method findStems of the class DistanceBasedStemmer.

/**
 * Logs, for each word of the given sentence, its candidate lemmas ordered by a
 * distance-based score, and afterwards logs the dictionary items of every analysed token.
 *
 * <p>The sentence is wrapped in two {@code <s>} and two {@code </s>} markers before it is
 * analysed. The scoring loop skips the first two and last two entries of the analysis and
 * reads the entries at offsets -2, -1, +1 and +2 around the current one.
 *
 * @param str the sentence to process
 */
public void findStems(String str) {
    String padded = "<s> <s> " + str + " </s> </s>";
    SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(padded);
    List<SentenceWordAnalysis> words = analysis.getWordAnalyses();

    for (int pos = 2; pos < analysis.size() - 2; pos++) {
        SentenceWordAnalysis current = words.get(pos);
        String surface = words.get(pos).getWordAnalysis().getInput();

        // Context order is: previous, second previous, next, second next.
        String left1 = normalize(words.get(pos - 1).getWordAnalysis().getInput());
        String left2 = normalize(words.get(pos - 2).getWordAnalysis().getInput());
        String right1 = normalize(words.get(pos + 1).getWordAnalysis().getInput());
        String right2 = normalize(words.get(pos + 2).getWordAnalysis().getInput());
        List<String> bigramContext = Lists.newArrayList(left1, left2, right1, right2);

        // Not read anywhere below; built here so the calls made per word stay the same.
        List<String> unigramContext = Lists.newArrayList(
                normalize(words.get(pos - 1).getWordAnalysis().getInput()),
                normalize(words.get(pos + 1).getWordAnalysis().getInput()));

        WordAnalysis candidates = current.getWordAnalysis();
        Set<String> stems = candidates.stream()
                .map(a -> normalize(a.getDictionaryItem().lemma))
                .collect(Collectors.toSet());

        List<ScoredItem<String>> ranked = new ArrayList<>();
        for (String stem : stems) {
            if (distances.containsWord(stem)) {
                List<WordDistances.Distance> neighbors = distances.getDistance(stem);
                float total = totalDistance(stem, bigramContext);
                int seen = 0;
                // Neighbors equal to the input word are NOT skipped. The loop stops right
                // after the neighbor at counter value 10, i.e. at most 11 are added.
                for (WordDistances.Distance neighbor : neighbors) {
                    total += distance(surface, neighbor.word);
                    if (seen == 10) {
                        break;
                    }
                    seen++;
                }
                ranked.add(new ScoredItem<>(stem, total));
            } else {
                Log.info("Cannot find %s in vocab.", stem);
            }
        }

        Collections.sort(ranked);
        Log.info("%n%s : ", surface);
        for (ScoredItem<String> entry : ranked) {
            Log.info("Lemma = %s Score = %.7f", entry.item, entry.score);
        }
    }

    Log.info("==== Z disambiguation result ===== ");
    for (SentenceWordAnalysis word : analysis) {
        Log.info("%n%s : ", word.getWordAnalysis().getInput());
        // LinkedHashSet: duplicates are dropped while first-seen order is kept.
        LinkedHashSet<String> dictionaryItems = new LinkedHashSet<>();
        for (SingleAnalysis single : word.getWordAnalysis()) {
            dictionaryItems.add(single.getDictionaryItem().toString());
        }
        for (String dictionaryItem : dictionaryItems) {
            Log.info("%s", dictionaryItem);
        }
    }
}
Also used : TurkishMorphology(zemberek.morphology.TurkishMorphology) Scanner(java.util.Scanner) Set(java.util.Set) IOException(java.io.IOException) HashMap(java.util.HashMap) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) Collectors(java.util.stream.Collectors) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) List(java.util.List) Lists(com.google.common.collect.Lists) Paths(java.nio.file.Paths) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) Locale(java.util.Locale) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Map(java.util.Map) ScoredItem(zemberek.core.ScoredItem) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) Collections(java.util.Collections) LinkedHashSet(java.util.LinkedHashSet) LinkedHashSet(java.util.LinkedHashSet) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ScoredItem(zemberek.core.ScoredItem) ArrayList(java.util.ArrayList) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 12 with SentenceWordAnalysis

use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.

Method replaceWordsWithLemma of the class ClassificationExampleBase.

/**
 * Rewrites a labelled sentence so that every word after the label is replaced by its lemma.
 *
 * <p>The first space-separated token is treated as the label and is excluded from
 * morphological analysis. For each analysed word, the first lemma of its best analysis is
 * used; if the best analysis is unknown, the word's input form is kept.
 *
 * @param sentence a line whose first space-separated token is the label
 * @return the label followed by the lemmas, joined with single spaces; an empty string when
 *     nothing remains after the label is removed
 */
protected String replaceWordsWithLemma(String sentence) {
    List<String> tokens = Splitter.on(" ").splitToList(sentence);
    String label = tokens.get(0);
    String unlabelled = String.join(" ", tokens.subList(1, tokens.size()));
    if (unlabelled.isEmpty()) {
        // Nothing to analyse; note that the label is not part of the result here.
        return unlabelled;
    }

    List<String> res = new ArrayList<>();
    res.add(label);
    for (SentenceWordAnalysis word : morphology.analyzeAndDisambiguate(unlabelled)) {
        SingleAnalysis best = word.getBestAnalysis();
        res.add(best.isUnknown() ? word.getWordAnalysis().getInput() : best.getLemmas().get(0));
    }
    return String.join(" ", res);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) ArrayList(java.util.ArrayList) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 13 with SentenceWordAnalysis

use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.

Method issue157ShouldNotThrowNPE of the class AmbiguityResolutionTests.

/**
 * Analyses and disambiguates a fixed Turkish sentence, asserts that the analysis has as many
 * entries as the default tokenizer produces tokens, and prints the best analysis of each
 * token. Going by the method name, this guards against the NullPointerException of issue 157.
 */
@Test
public void issue157ShouldNotThrowNPE() {
    String input = "Yıldız Kızlar Dünya Şampiyonası FIVB'nin düzenlediği ve 18 "
            + "yaşının altındaki voleybolcuların katılabildiği bir şampiyonadır.";
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(input);

    int expectedSize = TurkishTokenizer.DEFAULT.tokenize(input).size();
    Assert.assertEquals(expectedSize, analysis.size());

    for (SentenceWordAnalysis entry : analysis) {
        System.out.println(
                entry.getWordAnalysis().getInput() + " = " + entry.getBestAnalysis().formatLong());
    }
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) Test(org.junit.Test)

Example 14 with SentenceWordAnalysis

use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.

Method run of the class MorphologyConsole.

/**
 * Runs an interactive console: reads lines from standard input, analyses and disambiguates
 * each one, and prints every analysis of every word. When a word has more than one analysis,
 * the one chosen by the disambiguator is marked with a trailing {@code *}.
 *
 * <p>The loop ends when the line {@code quit} is entered or when standard input has no more
 * lines (for example piped input is exhausted or the stream is closed). Lines that are empty
 * after trimming are reported and skipped.
 */
@Override
public void run() {
    Builder b = TurkishMorphology.builder().setLexicon(RootLexicon.getDefault());
    if (disableUnknownAnalysis) {
        b.disableUnidentifiedTokenAnalyzer();
    }
    if (enableInformalWordAnalysis) {
        b.useInformalAnalysis();
    }
    TurkishMorphology morphology = b.build();

    System.out.println("Enter word or sentence. Type `quit` or `Ctrl+C` to exit.:");
    // The scanner is intentionally not closed: closing it would also close System.in.
    Scanner sc = new Scanner(System.in);
    // hasNextLine() guards nextLine(), which throws NoSuchElementException at end of input.
    while (sc.hasNextLine()) {
        String input = sc.nextLine();
        if (input.equals("quit")) {
            break;
        }
        if (input.trim().length() == 0) {
            System.out.println("Empty line cannot be processed.");
            continue;
        }
        SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(input);
        System.out.format("%nS:%s%n", input);
        for (SentenceWordAnalysis sw : analysis) {
            WordAnalysis wa = sw.getWordAnalysis();
            System.out.println(wa.getInput());
            SingleAnalysis best = sw.getBestAnalysis();
            // With a single analysis there is nothing to disambiguate, so no marker is shown.
            boolean single = wa.analysisCount() == 1;
            for (SingleAnalysis singleAnalysis : wa) {
                if (single) {
                    System.out.println(singleAnalysis.formatLong());
                } else {
                    String marker = singleAnalysis.equals(best) ? "*" : "";
                    System.out.format("%s%s%n", singleAnalysis.formatLong(), marker);
                }
            }
        }
        System.out.println();
    }
}
Also used : Scanner(java.util.Scanner) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Builder(zemberek.morphology.TurkishMorphology.Builder) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 15 with SentenceWordAnalysis

use of zemberek.morphology.analysis.SentenceWordAnalysis in project zemberek-nlp by ahmetaa.

Method replaceWordsWithLemma of the class PreprocessTurkishCorpus.

/**
 * Replaces every word of the sentence with a lemma.
 *
 * <p>The sentence is analysed and disambiguated; for each word the first lemma of its best
 * analysis is taken. If the best analysis is unknown, the word's input form is kept.
 *
 * @param sentence the sentence to convert
 * @return the resulting words joined with single spaces
 */
private String replaceWordsWithLemma(String sentence) {
    List<String> words = new ArrayList<>();
    for (SentenceWordAnalysis entry : morphology.analyzeAndDisambiguate(sentence)) {
        SingleAnalysis best = entry.getBestAnalysis();
        String replacement;
        if (best.isUnknown()) {
            replacement = entry.getWordAnalysis().getInput();
        } else {
            replacement = best.getLemmas().get(0);
        }
        words.add(replacement);
    }
    return String.join(" ", words);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) ArrayList(java.util.ArrayList) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Aggregations

SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 19; SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 19; SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 18; ArrayList (java.util.ArrayList): 12; WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 8; TurkishMorphology (zemberek.morphology.TurkishMorphology): 7; PrintWriter (java.io.PrintWriter): 4; Histogram (zemberek.core.collections.Histogram): 4; Path (java.nio.file.Path): 3; Token (zemberek.tokenization.Token): 3; Lists (com.google.common.collect.Lists): 2; IOException (java.io.IOException): 2; Paths (java.nio.file.Paths): 2; Collections (java.util.Collections): 2; LinkedHashSet (java.util.LinkedHashSet): 2; List (java.util.List): 2; Scanner (java.util.Scanner): 2; Collectors (java.util.stream.Collectors): 2; Log (zemberek.core.logging.Log): 2; Files (java.nio.file.Files): 1