Search in sources :

Example 11 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class RuleBasedDisambiguatorTest method test.

/**
 * Runs the rule based disambiguator over a single sample sentence and prints
 * the ignored count followed by the training output of every ambiguity analysis.
 *
 * <p>Other sample sentences that can be swapped in for manual experiments:
 * <pre>
 *   "ABD Açık Serena Williams'ın"
 *   "Çünkü birbirine tezat oluşturuyor."
 *   "O anda gördüm."
 *   "Aklımıza ilk gelen emeği öncelemek."
 *   "Petrolün Türkiye üzerinden dünya pazarına satılması."
 *   "Sadece partimi iktidar yaptım."
 * </pre>
 *
 * @throws IOException declared by the original signature; presumably raised while
 *     loading morphology or rule resources - confirm against those loaders
 */
@Test
public void test() throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    // A custom rule set could be built by hand instead of using the bundled one, e.g.
    //   Rules rules = new Rules();
    //   rules.pairLexRules.add(PairRule.fromLine("Aklı*|aklı* [akıl:Noun] *"));
    RuleBasedDisambiguator ruleBased =
            new RuleBasedDisambiguator(morphology, Rules.fromResources());

    ResultSentence outcome = ruleBased.disambiguate("4 Neden önemli?");

    System.out.println(outcome.allIgnoredCount());
    for (AmbiguityAnalysis ambiguity : outcome.results) {
        ambiguity.getForTrainingOutput().forEach(System.out::println);
    }
}
Also used : AmbiguityAnalysis(zemberek.morphology.ambiguity.RuleBasedDisambiguator.AmbiguityAnalysis) ResultSentence(zemberek.morphology.ambiguity.RuleBasedDisambiguator.ResultSentence) TurkishMorphology(zemberek.morphology.TurkishMorphology) RuleBasedDisambiguator(zemberek.morphology.ambiguity.RuleBasedDisambiguator) Test(org.junit.Test)

Example 12 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method findZemberekMissingOrDifferent.

/**
 * Builds a sorted "lemma [P:...]" listing from the default lexicon, writes it to
 * {@code found-in-zemberek}, and writes the entries of
 * {@code dictionary-from-analysis.txt} that are absent from that listing to
 * {@code not-found-in-zemberek}. Both files live under {@code DATA_PATH/out}.
 *
 * <p>Reference lines containing "Prop" are skipped. Every plain Noun entry is
 * also listed as an Adj entry and vice versa, so a Noun/Adj difference alone
 * does not count as missing.
 *
 * @throws IOException if reading the reference file or writing either output fails
 */
@Test
@Ignore("Not a Test.")
public void findZemberekMissingOrDifferent() throws IOException {
    Path outDir = DATA_PATH.resolve("out");

    // Reference entries in file order, without duplicates and without "Prop" lines.
    LinkedHashSet<String> expected = TextIO.loadLines(outDir.resolve("dictionary-from-analysis.txt"))
            .stream()
            .filter(line -> !line.contains("Prop"))
            .collect(Collectors.toCollection(LinkedHashSet::new));

    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    List<String> lexiconEntries = new ArrayList<>(morphology.getLexicon().size());

    for (DictionaryItem entry : morphology.getLexicon()) {
        String lemma = TurkishAlphabet.INSTANCE.normalizeCircumflex(entry.lemma);
        String primary = entry.primaryPos.shortForm;
        SecondaryPos secondary = entry.secondaryPos;

        // The secondary part of the tag is only emitted when it carries information.
        String tag;
        if (secondary == null
                || secondary == SecondaryPos.UnknownSec
                || secondary == SecondaryPos.None) {
            tag = "[P:" + primary + "]";
        } else {
            tag = "[P:" + primary + "," + secondary.shortForm + "]";
        }
        lexiconEntries.add(lemma + " " + tag);

        // Plain nouns and adjectives are registered under both tags.
        if ("[P:Noun]".equals(tag)) {
            lexiconEntries.add(lemma + " [P:Adj]");
        } else if ("[P:Adj]".equals(tag)) {
            lexiconEntries.add(lemma + " [P:Noun]");
        }
    }

    lexiconEntries.sort(turkishCollator::compare);
    Files.write(outDir.resolve("found-in-zemberek"), lexiconEntries);

    LinkedHashSet<String> known = new LinkedHashSet<>(lexiconEntries);
    expected.removeAll(known);
    Files.write(outDir.resolve("not-found-in-zemberek"), expected);
}
Also used : Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 13 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method memoryStressTest.

/**
 * Analyzes every line of the file {@code dunya} one hundred times with a
 * morphology built from the default dictionary resources. After each pass the
 * elapsed milliseconds and the morphology's {@code toString()} are logged; the
 * accumulated analysis count is logged at the end.
 *
 * @throws IOException if the word file cannot be read
 */
@Test
@Ignore("Not a Test.")
public void memoryStressTest() throws IOException {
    List<String> tokens = Files.readAllLines(Paths.get("dunya"));
    TurkishMorphology morphology = TurkishMorphology.builder()
            .setLexicon(RootLexicon.fromResources(TurkishDictionaryLoader.DEFAULT_DICTIONARY_RESOURCES))
            .build();

    int totalAnalyses = 0;
    int round = 0;
    while (round < 100) {
        Stopwatch timer = Stopwatch.createStarted();
        for (String token : tokens) {
            totalAnalyses += morphology.analyze(token).analysisCount();
        }
        Log.info(timer.elapsed(TimeUnit.MILLISECONDS));
        Log.info(morphology.toString());
        round++;
    }
    // Logging the total keeps the analysis results observably used.
    Log.info(totalAnalyses);
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Stopwatch(com.google.common.base.Stopwatch) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 14 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method readmeExample2.

/**
 * README sample: analyzes the word "kitabımızsa" with the default morphology and
 * prints, for each analysis, its long format followed by its stems and lemmas.
 *
 * @throws IOException declared by the original signature; presumably raised while
 *     loading the default morphology resources - confirm against that loader
 */
@Test
@Ignore("Not a Test")
public void readmeExample2() throws IOException {
    TurkishMorphology analyzer = TurkishMorphology.createWithDefaults();
    WordAnalysis analyses = analyzer.analyze("kitabımızsa");
    analyses.forEach(single -> {
        System.out.println(single.formatLong());
        System.out.println("\tStems = " + single.getStems());
        System.out.println("\tLemmas = " + single.getLemmas());
    });
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 15 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method ambiguousWords.

/**
 * Collects words whose analyses disagree on the stem and writes them out.
 *
 * <p>Reads at most the first 100,000 lines of {@code zemberek-parses.txt} under
 * {@code DATA_PATH/out}, analyzes each line as a word, and keeps those that are
 * analyzed correctly, have more than one analysis, and yield at least two
 * distinct stems. The kept analyses are written in long format to
 * {@code zemberek-ambigious-analyses.txt} (one blank-line separated group per
 * word) and the words themselves to {@code zemberek-ambigious-words.txt}.
 *
 * @throws IOException if the output directory cannot be created, the input cannot
 *     be read, or either output file cannot be written
 */
@Test
@Ignore("Not a Test.")
public void ambiguousWords() throws IOException {
    Path outDir = DATA_PATH.resolve("out");
    Files.createDirectories(outDir);
    Path correct = outDir.resolve("zemberek-parses.txt");
    Path outAmbAn = outDir.resolve("zemberek-ambigious-analyses.txt");
    Path outAmbWord = outDir.resolve("zemberek-ambigious-words.txt");
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();

    // Clamp the upper bound: subList(0, 100_000) on a shorter file would throw
    // IndexOutOfBoundsException instead of simply processing what is there.
    final int maxWords = 100_000;
    List<String> allLines = Files.readAllLines(correct);
    List<String> words = allLines.subList(0, Math.min(maxWords, allLines.size()));

    List<String> ambWords = new ArrayList<>();
    List<WordAnalysis> amb = new ArrayList<>();
    for (String word : words) {
        WordAnalysis analysis = morphology.analyze(word);
        // Unanalyzable words and words with a single analysis cannot be ambiguous.
        if (!analysis.isCorrect() || analysis.analysisCount() == 1) {
            continue;
        }
        HashSet<String> stems = new HashSet<>(4);
        for (SingleAnalysis s : analysis) {
            stems.add(s.getStem());
            // A second distinct stem is enough; no need to inspect the rest.
            if (stems.size() > 1) {
                amb.add(analysis);
                ambWords.add(word);
                break;
            }
        }
    }

    Log.info("Writing %d words", amb.size());
    try (PrintWriter pwa = new PrintWriter(outAmbAn.toFile(), StandardCharsets.UTF_8.name())) {
        for (WordAnalysis wa : amb) {
            pwa.println(wa.getInput());
            for (SingleAnalysis analysis : wa) {
                pwa.println(analysis.formatLong());
            }
            pwa.println();
        }
    }
    Files.write(outAmbWord, ambWords, StandardCharsets.UTF_8);
}
Also used : Path(java.nio.file.Path) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) PrintWriter(java.io.PrintWriter) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

TurkishMorphology (zemberek.morphology.TurkishMorphology)87 Test (org.junit.Test)38 Path (java.nio.file.Path)34 ArrayList (java.util.ArrayList)23 SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis)23 WordAnalysis (zemberek.morphology.analysis.WordAnalysis)23 Ignore (org.junit.Ignore)21 DictionaryItem (zemberek.morphology.lexicon.DictionaryItem)15 LinkedHashSet (java.util.LinkedHashSet)13 PrintWriter (java.io.PrintWriter)10 SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis)10 Stopwatch (com.google.common.base.Stopwatch)8 Histogram (zemberek.core.collections.Histogram)8 Token (zemberek.tokenization.Token)8 HashSet (java.util.HashSet)7 SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis)7 TurkishTokenizer (zemberek.tokenization.TurkishTokenizer)7 ScoredItem (zemberek.core.ScoredItem)6 IOException (java.io.IOException)5 BlockTextLoader (zemberek.core.text.BlockTextLoader)5