Search in sources :

Example 6 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class GenerateDataWithRules method extractHighlyAmbigiousWordSentences.

/**
 * Scans the regular files directly under {@code inputRoot}, analyzes their sentences and
 * collects every word analysis whose individual analyses produce more than
 * {@code minCount} distinct stems. The collected analyses are written to
 * {@code <inputRoot name>-amb.txt} under {@code outRoot}.
 *
 * @param inputRoot directory whose direct child files are processed
 * @param outRoot   directory in which the output file is created
 * @param minCount  a word is collected once its distinct stem count exceeds this value
 * @param wordCount processing stops after the file during which the number of collected
 *                  entries exceeded this value (checked once per file, not per sentence)
 * @throws IOException if listing the input directory or writing the output file fails
 */
private void extractHighlyAmbigiousWordSentences(Path inputRoot, Path outRoot, int minCount, int wordCount) throws IOException {
    List<Path> files;
    // Files.walk keeps a directory handle open until the stream is closed, so it must be
    // used inside try-with-resources.
    try (java.util.stream.Stream<Path> walk = Files.walk(inputRoot, 1)) {
        files = walk.filter(s -> s.toFile().isFile()).collect(Collectors.toList());
    }
    Histogram<WordAnalysis> wordAnalyses = new Histogram<>();
    for (Path file : files) {
        Log.info("Processing %s", file);
        LinkedHashSet<String> sentences = getSentences(file);
        // Sentences are handled in batches of 5000.
        List<List<String>> group = group(new ArrayList<>(sentences), 5000);
        for (List<String> lines : group) {
            Log.info("Collected %d words.", wordAnalyses.size());
            LinkedHashSet<String> toProcess = getAccpetableSentences(lines);
            for (String sentence : toProcess) {
                try {
                    SentenceAnalysis sentenceAnalysis = morphology.analyzeAndDisambiguate(sentence);
                    for (SentenceWordAnalysis analysis : sentenceAnalysis) {
                        HashSet<String> stems = new HashSet<>(4);
                        for (SingleAnalysis s : analysis.getWordAnalysis()) {
                            stems.add(s.getStem());
                            // Enough distinct stems seen; record the word once and move on.
                            if (stems.size() > minCount) {
                                wordAnalyses.add(analysis.getWordAnalysis());
                                break;
                            }
                        }
                    }
                } catch (Exception e) {
                    // Best effort: a sentence that fails analysis is reported and skipped.
                    Log.warn("Error in sentence %s", sentence);
                }
            }
        }
        if (wordAnalyses.size() > wordCount) {
            break;
        }
    }
    String rootName = inputRoot.toFile().getName();
    Path amb = outRoot.resolve(rootName + "-amb.txt");
    try (PrintWriter pwa = new PrintWriter(amb.toFile(), "utf-8")) {
        // One block per word: the input, then each analysis in long format, then a blank line.
        for (WordAnalysis wa : wordAnalyses.getSortedList()) {
            pwa.println(wa.getInput());
            for (SingleAnalysis analysis : wa) {
                pwa.println(analysis.formatLong());
            }
            pwa.println();
        }
    }
}
Also used : Path(java.nio.file.Path) PrintWriter(java.io.PrintWriter) ResultSentence(zemberek.morphology.ambiguity.RuleBasedDisambiguator.ResultSentence) Files(java.nio.file.Files) TextUtil(zemberek.core.text.TextUtil) Predicate(java.util.function.Predicate) AnalysisDecision(zemberek.morphology.ambiguity.RuleBasedDisambiguator.AnalysisDecision) Collection(java.util.Collection) IOException(java.io.IOException) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) Collectors(java.util.stream.Collectors) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) List(java.util.List) AmbiguityAnalysis(zemberek.morphology.ambiguity.RuleBasedDisambiguator.AmbiguityAnalysis) Lists(com.google.common.collect.Lists) Paths(java.nio.file.Paths) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) Collections(java.util.Collections) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) IOException(java.io.IOException) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) PrintWriter(java.io.PrintWriter)

Example 7 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method readmeExample2.

/**
 * Demo (ignored as a test): analyzes the word "kitabımızsa" with a default
 * {@code TurkishMorphology} and prints, for each analysis, its long format followed by
 * its stems and lemmas.
 */
@Test
@Ignore("Not a Test")
public void readmeExample2() throws IOException {
    WordAnalysis analyses = TurkishMorphology.createWithDefaults().analyze("kitabımızsa");
    analyses.forEach(single -> {
        // Same three lines per analysis, in the same order: long form, stems, lemmas.
        System.out.println(single.formatLong());
        System.out.println("\tStems = " + single.getStems());
        System.out.println("\tLemmas = " + single.getLemmas());
    });
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 8 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method ambiguousWords.

/**
 * Script (ignored as a test): reads up to the first 100,000 lines of
 * {@code out/zemberek-parses.txt} under {@code DATA_PATH}, analyzes each word and keeps
 * those whose analyses produce more than one distinct stem. The analyses are written to
 * {@code zemberek-ambigious-analyses.txt} and the words to
 * {@code zemberek-ambigious-words.txt} in the same directory.
 *
 * @throws IOException if the input cannot be read or the outputs cannot be written
 */
@Test
@Ignore("Not a Test.")
public void ambiguousWords() throws IOException {
    Path outDir = DATA_PATH.resolve("out");
    Files.createDirectories(outDir);
    Path correct = outDir.resolve("zemberek-parses.txt");
    Path outAmbAn = outDir.resolve("zemberek-ambigious-analyses.txt");
    Path outAmbWord = outDir.resolve("zemberek-ambigious-words.txt");
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    List<String> allWords = Files.readAllLines(correct);
    // Clamp the limit: subList(0, 100_000) throws IndexOutOfBoundsException when the
    // file holds fewer lines than that.
    List<String> words = allWords.subList(0, Math.min(allWords.size(), 100_000));
    List<String> ambWords = new ArrayList<>();
    List<WordAnalysis> amb = new ArrayList<>();
    for (String word : words) {
        WordAnalysis analysis = morphology.analyze(word);
        // Skip words flagged as not correct and words with exactly one analysis.
        if (!analysis.isCorrect() || analysis.analysisCount() == 1) {
            continue;
        }
        HashSet<String> stems = new HashSet<>(4);
        for (SingleAnalysis s : analysis) {
            stems.add(s.getStem());
            // A second distinct stem is enough; record the word once and stop.
            if (stems.size() > 1) {
                amb.add(analysis);
                ambWords.add(word);
                break;
            }
        }
    }
    Log.info("Writing %d words", amb.size());
    try (PrintWriter pwa = new PrintWriter(outAmbAn.toFile(), "utf-8")) {
        // One block per word: the input, then each analysis in long format, then a blank line.
        for (WordAnalysis wa : amb) {
            pwa.println(wa.getInput());
            for (SingleAnalysis analysis : wa) {
                pwa.println(analysis.formatLong());
            }
            pwa.println();
        }
    }
    Files.write(outAmbWord, ambWords, StandardCharsets.UTF_8);
}
Also used : Path(java.nio.file.Path) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) PrintWriter(java.io.PrintWriter) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 9 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method disambiguationExample.

/**
 * Demo (ignored as a test): prints every analysis of each word in a fixed sentence,
 * then disambiguates the sentence and prints the analyses returned by
 * {@code bestAnalysis()}.
 */
@Test
@Ignore("Not a Test")
public void disambiguationExample() throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    String sentence = "Yarın kar yağacak.";
    System.out.println("Sentence  = " + sentence);

    List<WordAnalysis> wordAnalyses = morphology.analyzeSentence(sentence);
    System.out.println("Before disambiguation.");
    for (WordAnalysis wordAnalysis : wordAnalyses) {
        System.out.println("Word = " + wordAnalysis.getInput());
        // Print every candidate analysis of this word.
        wordAnalysis.forEach(candidate -> System.out.println(candidate.formatLong()));
    }

    System.out.println("\nAfter disambiguation.");
    SentenceAnalysis disambiguated = morphology.disambiguate(sentence, wordAnalyses);
    disambiguated.bestAnalysis().forEach(best -> System.out.println(best.formatLong()));
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 10 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method failedWordTestIssue124.

/**
 * Script (ignored as a test): reads the words in {@code fails.txt} under
 * {@code DATA_PATH}, removes every word that has at least one analysis which is neither
 * unknown nor runtime, and writes the remaining words to {@code fails-reduced.txt}.
 *
 * @throws IOException if the input cannot be read or the output cannot be written
 */
@Test
@Ignore("Not a Test.")
public void failedWordTestIssue124() throws IOException {
    Path failPath = DATA_PATH.resolve("fails.txt");
    LinkedHashSet<String> words = new LinkedHashSet<>(Files.readAllLines(failPath, StandardCharsets.UTF_8));
    // Capture the total before accepted words are removed; the final log needs it.
    int totalCount = words.size();
    LinkedHashSet<String> accepted = new LinkedHashSet<>();
    TurkishMorphology parser = TurkishMorphology.createWithDefaults();
    for (String s : words) {
        WordAnalysis parses = parser.analyze(s);
        List<SingleAnalysis> analyses = parses.getAnalysisResults();
        for (SingleAnalysis parse : analyses) {
            if (parse.isUnknown() || parse.isRuntime()) {
                continue;
            }
            // One acceptable analysis is enough; no need to inspect the rest.
            accepted.add(s);
            break;
        }
    }
    words.removeAll(accepted);
    Path failReduced = DATA_PATH.resolve("fails-reduced.txt");
    try (PrintWriter pw = new PrintWriter(failReduced.toFile(), "utf-8")) {
        words.forEach(pw::println);
    }
    // After removal, `words` holds exactly the words that were not found.
    Log.info("Word count = %d Found = %d Not Found = %d", totalCount, accepted.size(), words.size());
}
Also used : Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) PrintWriter(java.io.PrintWriter) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 55, WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 38, ArrayList (java.util.ArrayList): 25, SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 23, TurkishMorphology (zemberek.morphology.TurkishMorphology): 21, SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 18, Test (org.junit.Test): 15, LinkedHashSet (java.util.LinkedHashSet): 13, PrintWriter (java.io.PrintWriter): 10, Path (java.nio.file.Path): 10, Histogram (zemberek.core.collections.Histogram): 10, Token (zemberek.tokenization.Token): 7, IOException (java.io.IOException): 6, Ignore (org.junit.Ignore): 6, Log (zemberek.core.logging.Log): 6, HashSet (java.util.HashSet): 5, List (java.util.List): 5, Collectors (java.util.stream.Collectors): 5, Paths (java.nio.file.Paths): 4, Files (java.nio.file.Files): 3