Search in sources :

Example 41 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class HunspellOperations method generateAnnotationFileMultiSplit.

/**
 * Builds a stem/ending annotation file from a vocabulary file.
 *
 * <p>Every line of {@code vocab} is treated as one word and analyzed with a default
 * {@link TurkishMorphology}. Words whose analysis is not correct are logged and skipped
 * (no output line is produced for them). For every remaining word, each analysis that is
 * not a proper noun or abbreviation is rendered as a space-separated surface sequence;
 * each prefix of that sequence is concatenated into a candidate stem and the remaining
 * pieces become the ending. A candidate is kept as {@code "word stem ending"} when the
 * stem passes {@code isCorrectAndContainsNoProper} and the ending is non-empty.
 *
 * <p>One line per analyzed word is written to {@code annotationsPath}: the distinct
 * candidates in first-seen order, joined with commas (an empty line if there are none).
 *
 * @param vocab           UTF-8 text file, one word per line
 * @param annotationsPath destination file, written as UTF-8
 * @throws IOException if reading the vocabulary or writing the annotations fails
 */
private static void generateAnnotationFileMultiSplit(Path vocab, Path annotationsPath) throws IOException {
    List<String> vocabulary = Files.readAllLines(vocab, StandardCharsets.UTF_8);
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    List<String> outputLines = new ArrayList<>();
    for (String word : vocabulary) {
        WordAnalysis wordAnalysis = morphology.analyze(word);
        if (!wordAnalysis.isCorrect()) {
            Log.warn("Cannot analyze %s", word);
            continue;
        }
        // Insertion-ordered set: duplicates from different analyses collapse, order is stable.
        LinkedHashSet<String> splits = new LinkedHashSet<>();
        for (SingleAnalysis candidate : wordAnalysis) {
            SecondaryPos secondaryPos = candidate.getDictionaryItem().secondaryPos;
            boolean properOrAbbreviation =
                    secondaryPos == SecondaryPos.ProperNoun || secondaryPos == SecondaryPos.Abbreviation;
            if (properOrAbbreviation) {
                continue;
            }
            List<String> pieces =
                    Splitter.on(" ").splitToList(AnalysisFormatters.SURFACE_SEQUENCE.format(candidate));
            StringBuilder stemBuilder = new StringBuilder();
            int pieceCount = pieces.size();
            for (int cut = 0; cut < pieceCount; cut++) {
                // The stem grows by one surface piece per step; the rest forms the ending.
                stemBuilder.append(pieces.get(cut));
                String stem = stemBuilder.toString();
                // For the last piece the sub-list is empty, so the ending is "".
                String ending = String.join(" ", pieces.subList(cut + 1, pieceCount));
                if (isCorrectAndContainsNoProper(morphology.analyze(stem)) && !ending.isEmpty()) {
                    splits.add(word + " " + stem + " " + ending);
                }
            }
        }
        outputLines.add(String.join(",", splits));
    }
    Files.write(annotationsPath, outputLines, StandardCharsets.UTF_8);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Example 42 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class StemDisambiguationExperiment method unambiguous.

/**
 * Tells whether every token of a sentence resolves to at most one distinct lemma.
 *
 * <p>The sentence is tokenized with {@code TurkishTokenizer.DEFAULT}; each token is
 * analyzed and the normalized lemmas of the dictionary items of all its analyses are
 * collected into a set.
 *
 * @param sentence the sentence to inspect
 * @return {@code false} as soon as a token with more than one distinct normalized lemma
 *         is found, {@code true} otherwise
 */
private boolean unambiguous(String sentence) {
    for (String word : TurkishTokenizer.DEFAULT.tokenizeToStrings(sentence)) {
        Set<String> distinctLemmas = new HashSet<>();
        for (SingleAnalysis candidate : morphology.analyze(word)) {
            String lemma = candidate.getDictionaryItem().normalizedLemma();
            distinctLemmas.add(lemma);
        }
        boolean atMostOneLemma = distinctLemmas.size() <= 1;
        if (!atMostOneLemma) {
            return false;
        }
    }
    return true;
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Example 43 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class Scripts method saveUnambiguous.

/**
 * Writes sentence analyses to a text file, skipping sentences that contain an unknown
 * best analysis.
 *
 * <p>For each retained sentence the output contains an {@code S:<sentence>} line, then,
 * per word, the word input followed by the long format of each of its analyses. When a
 * word has more than one analysis, the line of the analysis equal to the best analysis
 * is suffixed with {@code *}. A blank line terminates each sentence.
 *
 * @param sentences analyses to write
 * @param out       destination file, written with the "utf-8" charset
 * @throws IOException if the output file cannot be opened
 */
public static void saveUnambiguous(List<SentenceAnalysis> sentences, Path out) throws IOException {
    try (PrintWriter writer = new PrintWriter(out.toFile(), "utf-8")) {
        for (SentenceAnalysis sentence : sentences) {
            boolean hasUnknown = sentence.bestAnalysis().stream().anyMatch(SingleAnalysis::isUnknown);
            if (hasUnknown) {
                continue;
            }
            writer.format("S:%s%n", sentence.getSentence());
            for (SentenceWordAnalysis wordEntry : sentence) {
                WordAnalysis candidates = wordEntry.getWordAnalysis();
                writer.println(candidates.getInput());
                SingleAnalysis chosen = wordEntry.getBestAnalysis();
                for (SingleAnalysis candidate : candidates) {
                    if (candidates.analysisCount() != 1) {
                        // Several analyses: flag the selected one with a trailing star.
                        String marker = candidate.equals(chosen) ? "*" : "";
                        writer.format("%s%s%n", candidate.formatLong(), marker);
                    } else {
                        // Single analysis: no marker needed.
                        writer.println(candidate.formatLong());
                    }
                }
            }
            writer.println();
        }
    }
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) PrintWriter(java.io.PrintWriter)

Example 44 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class ClassificationExampleBase method splitWords.

/**
 * Rewrites a labeled sentence so that each analyzed word is split into its first lemma
 * and an underscore-prefixed remainder.
 *
 * <p>The first space-separated token is taken as the label and is excluded from
 * morphological analysis. If nothing remains after the label, the empty remainder is
 * returned as is. Otherwise the remainder is analyzed and disambiguated; for each word,
 * an unknown best analysis contributes the word input unchanged, while a known one
 * contributes its first lemma and, when that lemma is shorter than the input, a second
 * token made of {@code "_"} plus the input characters past the lemma's length.
 *
 * @param sentence label followed by the sentence text, separated by single spaces
 * @return the label and the processed tokens joined with single spaces, or the empty
 *         string when there is no text after the label
 */
protected String splitWords(String sentence) {
    List<String> parts = Splitter.on(" ").splitToList(sentence);
    // The leading token is assumed to be the label; only the rest is analyzed.
    String label = parts.get(0);
    String text = String.join(" ", parts.subList(1, parts.size()));
    if (text.isEmpty()) {
        return text;
    }
    List<String> output = new ArrayList<>();
    // The label always comes first in the result.
    output.add(label);
    for (SentenceWordAnalysis wordEntry : morphology.analyzeAndDisambiguate(text)) {
        SingleAnalysis best = wordEntry.getBestAnalysis();
        String surface = wordEntry.getWordAnalysis().getInput();
        if (best.isUnknown()) {
            output.add(surface);
        } else {
            String lemma = best.getLemmas().get(0);
            output.add(lemma);
            if (lemma.length() < surface.length()) {
                // Whatever follows the lemma-length prefix is emitted as a separate "_suffix" token.
                output.add("_" + surface.substring(lemma.length()));
            }
        }
    }
    return String.join(" ", output);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) ArrayList(java.util.ArrayList) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 45 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class ChangeStem method main.

/**
 * Demonstrates regenerating a word with a different stem.
 *
 * <p>Analyzes the word {@code "simidime"} and, for each analysis, asks the word generator
 * to produce surface forms from the first lexicon item matching {@code "poğaça"} combined
 * with the morphemes of that analysis. The input analysis, the generated surface and the
 * generated analysis are logged for every result.
 *
 * @param args unused
 */
public static void main(String[] args) {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    DictionaryItem replacementStem = morphology.getLexicon().getMatchingItems("poğaça").get(0);
    String word = "simidime";
    Log.info("Input Word = " + word);
    for (SingleAnalysis original : morphology.analyze(word)) {
        List<Result> regenerated =
                morphology.getWordGenerator().generate(replacementStem, original.getMorphemes());
        for (Result candidate : regenerated) {
            Log.info("Input analysis: " + original.formatLong());
            Log.info("After stem change, word = " + candidate.surface);
            Log.info("After stem change, Analysis = " + candidate.analysis.formatLong());
        }
    }
}
Also used : DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) Result(zemberek.morphology.generator.WordGenerator.Result)

Aggregations

SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis)55 WordAnalysis (zemberek.morphology.analysis.WordAnalysis)38 ArrayList (java.util.ArrayList)25 SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis)23 TurkishMorphology (zemberek.morphology.TurkishMorphology)21 SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis)18 Test (org.junit.Test)15 LinkedHashSet (java.util.LinkedHashSet)13 PrintWriter (java.io.PrintWriter)10 Path (java.nio.file.Path)10 Histogram (zemberek.core.collections.Histogram)10 Token (zemberek.tokenization.Token)7 IOException (java.io.IOException)6 Ignore (org.junit.Ignore)6 Log (zemberek.core.logging.Log)6 HashSet (java.util.HashSet)5 List (java.util.List)5 Collectors (java.util.stream.Collectors)5 Paths (java.nio.file.Paths)4 Files (java.nio.file.Files)3