Search in sources :

Example 41 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method memoryStressTest.

/**
 * Manual stress run (ignored as a unit test). Reads all lines of the file
 * "dunya", builds a morphology instance from the default dictionary
 * resources, and analyzes every line 100 times. After each pass the elapsed
 * milliseconds and the parser's string form are logged; the summed analysis
 * count is logged at the end.
 *
 * @throws IOException if the word file cannot be read
 */
@Test
@Ignore("Not a Test.")
public void memoryStressTest() throws IOException {
    List<String> words = Files.readAllLines(Paths.get("dunya"));
    TurkishMorphology parser = TurkishMorphology.builder()
            .setLexicon(RootLexicon.fromResources(TurkishDictionaryLoader.DEFAULT_DICTIONARY_RESOURCES))
            .build();
    // Accumulated so the analysis results are actually consumed.
    int totalAnalyses = 0;
    for (int pass = 0; pass < 100; pass++) {
        Stopwatch timer = Stopwatch.createStarted();
        for (String word : words) {
            totalAnalyses += parser.analyze(word).analysisCount();
        }
        long elapsedMillis = timer.elapsed(TimeUnit.MILLISECONDS);
        Log.info(elapsedMillis);
        Log.info(parser.toString());
    }
    Log.info(totalAnalyses);
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Stopwatch(com.google.common.base.Stopwatch) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 42 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method readmeExample2.

/**
 * README sample (ignored as a unit test). Analyzes the word "kitabımızsa"
 * with a default morphology instance and prints, for every analysis, its
 * long format followed by its stems and its lemmas.
 *
 * @throws IOException declared by the method signature
 */
@Test
@Ignore("Not a Test")
public void readmeExample2() throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    WordAnalysis result = morphology.analyze("kitabımızsa");
    result.forEach(single -> {
        System.out.println(single.formatLong());
        System.out.println("\tStems = " + single.getStems());
        System.out.println("\tLemmas = " + single.getLemmas());
    });
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 43 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method ambiguousWords.

/**
 * Script (ignored as a unit test) that finds words whose analyses disagree on
 * the stem and writes them out.
 *
 * <p>Reads up to the first 100,000 lines of {@code out/zemberek-parses.txt}
 * under {@code DATA_PATH}, analyzes each line as a word, and keeps those
 * words for which {@code isCorrect()} is true, more than one analysis exists,
 * and at least two distinct stems occur among the analyses. The kept analyses
 * are written in long format to {@code zemberek-ambigious-analyses.txt} and
 * the kept words to {@code zemberek-ambigious-words.txt}.
 *
 * @throws IOException if the input cannot be read or an output cannot be written
 */
@Test
@Ignore("Not a Test.")
public void ambiguousWords() throws IOException {
    Path outDir = DATA_PATH.resolve("out");
    Files.createDirectories(outDir);
    Path correct = outDir.resolve("zemberek-parses.txt");
    Path outAmbAn = outDir.resolve("zemberek-ambigious-analyses.txt");
    Path outAmbWord = outDir.resolve("zemberek-ambigious-words.txt");
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    List<String> allWords = Files.readAllLines(correct);
    // Cap the workload at 100,000 words, but do not fail with an
    // IndexOutOfBoundsException when the input file is shorter than that.
    List<String> words = allWords.subList(0, Math.min(allWords.size(), 100_000));
    List<String> ambWords = new ArrayList<>();
    List<WordAnalysis> amb = new ArrayList<>();
    for (String word : words) {
        WordAnalysis analysis = morphology.analyze(word);
        // Only words flagged correct with more than one analysis can be ambiguous here.
        if (!analysis.isCorrect() || analysis.analysisCount() == 1) {
            continue;
        }
        HashSet<String> stems = new HashSet<>(4);
        for (SingleAnalysis s : analysis) {
            stems.add(s.getStem());
            // A second distinct stem is enough; stop scanning this word.
            if (stems.size() > 1) {
                amb.add(analysis);
                ambWords.add(word);
                break;
            }
        }
    }
    Log.info("Writing %d words", amb.size());
    try (PrintWriter pwa = new PrintWriter(outAmbAn.toFile(), "utf-8")) {
        for (WordAnalysis wa : amb) {
            pwa.println(wa.getInput());
            for (SingleAnalysis analysis : wa) {
                pwa.println(analysis.formatLong());
            }
            // Blank line separates one word's analyses from the next.
            pwa.println();
        }
    }
    Files.write(outAmbWord, ambWords, StandardCharsets.UTF_8);
}
Also used : Path(java.nio.file.Path) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) PrintWriter(java.io.PrintWriter) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 44 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method disambiguationExample.

/**
 * Disambiguation sample (ignored as a unit test). Analyzes the sentence
 * "Yarın kar yağacak.", prints every analysis of every word in long format,
 * then disambiguates the sentence and prints the best analysis sequence.
 *
 * @throws IOException declared by the method signature
 */
@Test
@Ignore("Not a Test")
public void disambiguationExample() throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    String sentence = "Yarın kar yağacak.";
    System.out.println("Sentence  = " + sentence);
    List<WordAnalysis> analysis = morphology.analyzeSentence(sentence);

    // All candidate analyses, word by word.
    System.out.println("Before disambiguation.");
    analysis.forEach(wordAnalysis -> {
        System.out.println("Word = " + wordAnalysis.getInput());
        for (SingleAnalysis candidate : wordAnalysis) {
            System.out.println(candidate.formatLong());
        }
    });

    // The analyses selected by the disambiguator.
    System.out.println("\nAfter disambiguation.");
    SentenceAnalysis after = morphology.disambiguate(sentence, analysis);
    after.bestAnalysis().forEach(best -> {
        System.out.println(best.formatLong());
    });
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 45 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method failedWordTestIssue124.

/**
 * Script (ignored as a unit test) that re-checks previously failing words.
 *
 * <p>Reads the distinct lines of {@code fails.txt} under {@code DATA_PATH},
 * analyzes each as a word, and treats a word as accepted when at least one of
 * its analyses is neither unknown nor runtime. The words that are still not
 * accepted are written, in their original order, to
 * {@code fails-reduced.txt}, and the total, found and not-found counts are
 * logged.
 *
 * @throws IOException if the input cannot be read or the output cannot be written
 */
@Test
@Ignore("Not a Test.")
public void failedWordTestIssue124() throws IOException {
    Path failPath = DATA_PATH.resolve("fails.txt");
    LinkedHashSet<String> words = new LinkedHashSet<>(Files.readAllLines(failPath, StandardCharsets.UTF_8));
    LinkedHashSet<String> accepted = new LinkedHashSet<>();
    TurkishMorphology parser = TurkishMorphology.createWithDefaults();
    for (String s : words) {
        WordAnalysis parses = parser.analyze(s);
        List<SingleAnalysis> analyses = parses.getAnalysisResults();
        for (SingleAnalysis parse : analyses) {
            if (parse.isUnknown() || parse.isRuntime()) {
                continue;
            }
            // One acceptable analysis is enough; no need to inspect the rest.
            accepted.add(s);
            break;
        }
    }
    // Capture the total before shrinking the set so the summary below is accurate.
    int totalCount = words.size();
    words.removeAll(accepted);
    Path failReduced = DATA_PATH.resolve("fails-reduced.txt");
    try (PrintWriter pw = new PrintWriter(failReduced.toFile(), "utf-8")) {
        words.forEach(pw::println);
    }
    // At this point 'words' holds only the words that were not accepted.
    Log.info("Word count = %d Found = %d Not Found = %d", totalCount, accepted.size(), words.size());
}
Also used : Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) PrintWriter(java.io.PrintWriter) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

WordAnalysis (zemberek.morphology.analysis.WordAnalysis) 96, Test (org.junit.Test) 42, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis) 36, TurkishMorphology (zemberek.morphology.TurkishMorphology) 22, ArrayList (java.util.ArrayList) 21, SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis) 19, LinkedHashSet (java.util.LinkedHashSet) 13, Ignore (org.junit.Ignore) 13, Histogram (zemberek.core.collections.Histogram) 12, Path (java.nio.file.Path) 11, PrintWriter (java.io.PrintWriter) 10, SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis) 10, IOException (java.io.IOException) 6, HashSet (java.util.HashSet) 6, List (java.util.List) 6, WordAnalyzer (zemberek.morphology.analysis.WordAnalyzer) 6, SimpleGenerator (zemberek.morphology.generator.SimpleGenerator) 6, DictionaryItem (zemberek.morphology.lexicon.DictionaryItem) 6, DynamicLexiconGraph (zemberek.morphology.lexicon.graph.DynamicLexiconGraph) 6, Log (zemberek.core.logging.Log) 5