Search in sources :

Example 1 with TurkishMorphotactics

use of zemberek.morphology.morphotactics.TurkishMorphotactics in project zemberek-nlp by ahmetaa.

the class StemTransitionTrieBasedTest method testPrefix.

/**
 * Checks the stem transitions returned by prefix matching for the inputs
 * "kabağa", "kabak" and "kapak": both the number of matches and the lemmas
 * of the matched dictionary items are verified.
 */
@Test
public void testPrefix() {
    RootLexicon rootLexicon = getLexicon();
    TurkishMorphotactics morphotactics = new TurkishMorphotactics(rootLexicon);
    StemTransitionsTrieBased stemTransitions = new StemTransitionsTrieBased(rootLexicon, morphotactics);

    // "kabağa": three matches whose lemmas include kaba, kabağ and kabak.
    List<StemTransition> kabagaMatches = stemTransitions.getPrefixMatches("kabağa", false);
    Assert.assertEquals(3, kabagaMatches.size());
    Set<String> kabagaLemmas = kabagaMatches.stream()
            .map(transition -> transition.item.lemma)
            .collect(Collectors.toSet());
    Assert.assertTrue(TestUtil.containsAll(kabagaLemmas, "kaba", "kabağ", "kabak"));

    // "kabak": two matches whose lemmas include kaba and kabak.
    List<StemTransition> kabakMatches = stemTransitions.getPrefixMatches("kabak", false);
    Assert.assertEquals(2, kabakMatches.size());
    Set<String> kabakLemmas = kabakMatches.stream()
            .map(transition -> transition.item.lemma)
            .collect(Collectors.toSet());
    Assert.assertTrue(TestUtil.containsAll(kabakLemmas, "kaba", "kabak"));

    // "kapak": three matches, one of which has the lemma kapak.
    List<StemTransition> kapakMatches = stemTransitions.getPrefixMatches("kapak", false);
    Assert.assertEquals(3, kapakMatches.size());
    Set<String> kapakLemmas = kapakMatches.stream()
            .map(transition -> transition.item.lemma)
            .collect(Collectors.toSet());
    Assert.assertTrue(TestUtil.containsAll(kapakLemmas, "kapak"));
}
Also used : List(java.util.List), TurkishMorphotactics(zemberek.morphology.morphotactics.TurkishMorphotactics), Set(java.util.Set), Test(org.junit.Test), TestUtil(zemberek.core.io.TestUtil), RootLexicon(zemberek.morphology.lexicon.RootLexicon), Assert(org.junit.Assert), Collectors(java.util.stream.Collectors), TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader), StemTransition(zemberek.morphology.morphotactics.StemTransition), DictionaryItem(zemberek.morphology.lexicon.DictionaryItem)

Example 2 with TurkishMorphotactics

use of zemberek.morphology.morphotactics.TurkishMorphotactics in project zemberek-nlp by ahmetaa.

the class StemTransitionTrieBasedTest method testItem.

/**
 * Checks that the dictionary item with id "kapak_Noun" has exactly two stem
 * transitions and that their surface forms include "kapak" and "kapağ".
 */
@Test
public void testItem() {
    RootLexicon rootLexicon = getLexicon();
    TurkishMorphotactics morphotactics = new TurkishMorphotactics(rootLexicon);
    StemTransitionsTrieBased stemTransitions = new StemTransitionsTrieBased(rootLexicon, morphotactics);

    DictionaryItem kapak = rootLexicon.getItemById("kapak_Noun");
    List<StemTransition> kapakTransitions = stemTransitions.getTransitions(kapak);
    Assert.assertEquals(2, kapakTransitions.size());

    Set<String> surfaceForms = kapakTransitions.stream()
            .map(transition -> transition.surface)
            .collect(Collectors.toSet());
    Assert.assertTrue(TestUtil.containsAll(surfaceForms, "kapak", "kapağ"));
}
Also used : List(java.util.List), TurkishMorphotactics(zemberek.morphology.morphotactics.TurkishMorphotactics), Set(java.util.Set), Test(org.junit.Test), TestUtil(zemberek.core.io.TestUtil), RootLexicon(zemberek.morphology.lexicon.RootLexicon), Assert(org.junit.Assert), Collectors(java.util.stream.Collectors), TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader), StemTransition(zemberek.morphology.morphotactics.StemTransition), DictionaryItem(zemberek.morphology.lexicon.DictionaryItem)

Example 3 with TurkishMorphotactics

use of zemberek.morphology.morphotactics.TurkishMorphotactics in project zemberek-nlp by ahmetaa.

the class CoverageTest method checkCoverage.

/**
 * Runs the rule based analyzer over every word in {@code lines} and records
 * the words for which it returns no analysis.
 *
 * <p>Words are drained from {@code lines} in batches of 20,000 and each batch
 * is analyzed on a fixed-size thread pool. Before analysis a word is
 * lower-cased with the Turkish locale and apostrophes are removed; the
 * original form is what gets recorded on failure. Progress is logged as
 * results arrive. Finally the sorted failed and passed lists are written to
 * {@code ../data/zemberek-oflazer/new-analyzer-failed.txt} and
 * {@code ../data/zemberek-oflazer/new-analyzer-passed.txt}.
 *
 * <p>Passed words are not collected, so the passed file is written empty.
 *
 * @param lines words to check; the deque is emptied by this method
 * @throws IOException if loading the dictionaries or writing the result files fails
 * @throws InterruptedException if interrupted while waiting for a batch result
 * @throws java.util.concurrent.ExecutionException if a batch task fails
 */
private void checkCoverage(ArrayDeque<String> lines) throws IOException, InterruptedException, java.util.concurrent.ExecutionException {
    RootLexicon lexicon = TurkishDictionaryLoader.loadDefaultDictionaries();
    TurkishMorphotactics morphotactics = new TurkishMorphotactics(lexicon);
    RuleBasedAnalyzer analyzer = RuleBasedAnalyzer.instance(morphotactics);
    // Halving yields 0 on a single-core machine and newFixedThreadPool rejects
    // a non-positive pool size, so always keep at least one worker.
    int threadCount = Math.max(1, Runtime.getRuntime().availableProcessors() / 2);
    Log.info("Thread count = %d", threadCount);
    ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
    try {
        CompletionService<Result> service = new ExecutorCompletionService<>(executorService);
        Result allResult = new Result(new ArrayList<>(100000), new ArrayList<>(1000000), lines.size());
        Stopwatch sw = Stopwatch.createStarted();
        int batchCount = 0;
        int batchSize = 20_000;
        while (!lines.isEmpty()) {
            List<String> batch = new ArrayList<>(batchSize);
            while (batch.size() < batchSize && !lines.isEmpty()) {
                batch.add(lines.poll());
            }
            service.submit(() -> {
                List<String> failed = new ArrayList<>(batchSize / 2);
                // Passed words are not collected; this list stays empty.
                List<String> passed = new ArrayList<>();
                for (String s : batch) {
                    // Literal replace: no regex is needed to strip apostrophes.
                    String c = s.toLowerCase(Turkish.LOCALE).replace("'", "");
                    List<SingleAnalysis> results = analyzer.analyze(c);
                    if (results.isEmpty()) {
                        failed.add(s);
                    }
                }
                return new Result(failed, passed, batch.size());
            });
            batchCount++;
        }
        int total = 0;
        for (int i = 0; i < batchCount; i++) {
            Result r = service.take().get();
            allResult.failedWords.addAll(r.failedWords);
            allResult.passedWords.addAll(r.passedWords);
            total += r.wordCount;
            if (total % (batchSize * 10) == 0) {
                logResult(allResult.failedWords, total, sw);
            }
        }
        logResult(allResult.failedWords, total, sw);
        allResult.failedWords.sort(Turkish.STRING_COMPARATOR_ASC);
        allResult.passedWords.sort(Turkish.STRING_COMPARATOR_ASC);
        Files.write(Paths.get("../data/zemberek-oflazer/new-analyzer-failed.txt"), allResult.failedWords, StandardCharsets.UTF_8);
        Files.write(Paths.get("../data/zemberek-oflazer/new-analyzer-passed.txt"), allResult.passedWords, StandardCharsets.UTF_8);
    } finally {
        // Release the worker threads on every exit path. On success all tasks
        // have already completed; on failure this drops any queued batches.
        executorService.shutdownNow();
    }
}
Also used : Stopwatch(com.google.common.base.Stopwatch) ArrayList(java.util.ArrayList) ExecutorCompletionService(java.util.concurrent.ExecutorCompletionService) TurkishMorphotactics(zemberek.morphology.morphotactics.TurkishMorphotactics) ExecutorService(java.util.concurrent.ExecutorService) RootLexicon(zemberek.morphology.lexicon.RootLexicon)

Example 4 with TurkishMorphotactics

use of zemberek.morphology.morphotactics.TurkishMorphotactics in project zemberek-nlp by ahmetaa.

the class WordGeneratorTest method testGeneration4.

/**
 * Generates a word from the item "elma_Noun" with the morpheme sequence
 * Noun, A3pl, P1pl and expects the first result's surface to be "elmalarımız".
 */
@Test
public void testGeneration4() {
    TurkishMorphotactics morphotactics = getMorphotactics("elma");
    WordGenerator generator = new WordGenerator(morphotactics);
    List<String> morphemeIds = Lists.newArrayList("Noun", "A3pl", "P1pl");

    List<Result> generated = generator.generate(
            morphotactics.getRootLexicon().getItemById("elma_Noun"),
            TurkishMorphotactics.getMorphemes(morphemeIds));

    // At least one surface form must be produced, and the first one is checked.
    Assert.assertFalse(generated.isEmpty());
    Assert.assertEquals("elmalarımız", generated.get(0).surface);
}
Also used : TurkishMorphotactics(zemberek.morphology.morphotactics.TurkishMorphotactics) Result(zemberek.morphology.generator.WordGenerator.Result) Test(org.junit.Test)

Example 5 with TurkishMorphotactics

use of zemberek.morphology.morphotactics.TurkishMorphotactics in project zemberek-nlp by ahmetaa.

the class WordGeneratorTest method testGeneration5.

/**
 * Generates a word from the item "yapmak_Verb" with the morpheme sequence
 * Verb, Opt, A1pl and expects the first result's surface to be "yapalım".
 */
@Test
public void testGeneration5() {
    TurkishMorphotactics morphotactics = getMorphotactics("yapmak");
    WordGenerator generator = new WordGenerator(morphotactics);
    List<String> morphemeIds = Lists.newArrayList("Verb", "Opt", "A1pl");
    DictionaryItem yapmak = morphotactics.getRootLexicon().getItemById("yapmak_Verb");

    List<Result> generated = generator.generate(
            yapmak,
            TurkishMorphotactics.getMorphemes(morphemeIds));

    // At least one surface form must be produced, and the first one is checked.
    Assert.assertFalse(generated.isEmpty());
    Assert.assertEquals("yapalım", generated.get(0).surface);
}
Also used : DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) TurkishMorphotactics(zemberek.morphology.morphotactics.TurkishMorphotactics) Result(zemberek.morphology.generator.WordGenerator.Result) Test(org.junit.Test)

Aggregations

TurkishMorphotactics (zemberek.morphology.morphotactics.TurkishMorphotactics): 5, Test (org.junit.Test): 4, DictionaryItem (zemberek.morphology.lexicon.DictionaryItem): 3, RootLexicon (zemberek.morphology.lexicon.RootLexicon): 3, List (java.util.List): 2, Set (java.util.Set): 2, Collectors (java.util.stream.Collectors): 2, Assert (org.junit.Assert): 2, TestUtil (zemberek.core.io.TestUtil): 2, Result (zemberek.morphology.generator.WordGenerator.Result): 2, TurkishDictionaryLoader (zemberek.morphology.lexicon.tr.TurkishDictionaryLoader): 2, StemTransition (zemberek.morphology.morphotactics.StemTransition): 2, Stopwatch (com.google.common.base.Stopwatch): 1, ArrayList (java.util.ArrayList): 1, ExecutorCompletionService (java.util.concurrent.ExecutorCompletionService): 1, ExecutorService (java.util.concurrent.ExecutorService): 1