Use of zemberek.morphology.morphotactics.TurkishMorphotactics in the project zemberek-nlp by ahmetaa.
From the class StemTransitionTrieBasedTest, method testPrefix.
/**
 * Checks which stem transitions are returned as prefix matches for the inputs
 * "kabağa", "kabak" and "kapak": both the number of matches and the lemmas of
 * the matched dictionary items are verified.
 */
@Test
public void testPrefix() {
  RootLexicon lexicon = getLexicon();
  TurkishMorphotactics morphotactics = new TurkishMorphotactics(lexicon);
  StemTransitionsTrieBased stemTransitions = new StemTransitionsTrieBased(lexicon, morphotactics);

  List<StemTransition> kabagaMatches = stemTransitions.getPrefixMatches("kabağa", false);
  Assert.assertEquals(3, kabagaMatches.size());
  Assert.assertTrue(TestUtil.containsAll(lemmasOf(kabagaMatches), "kaba", "kabağ", "kabak"));

  List<StemTransition> kabakMatches = stemTransitions.getPrefixMatches("kabak", false);
  Assert.assertEquals(2, kabakMatches.size());
  Assert.assertTrue(TestUtil.containsAll(lemmasOf(kabakMatches), "kaba", "kabak"));

  List<StemTransition> kapakMatches = stemTransitions.getPrefixMatches("kapak", false);
  Assert.assertEquals(3, kapakMatches.size());
  Assert.assertTrue(TestUtil.containsAll(lemmasOf(kapakMatches), "kapak"));
}

/** Collects the distinct lemmas of the dictionary items behind the given transitions. */
private static Set<String> lemmasOf(List<StemTransition> transitions) {
  return transitions.stream()
      .map(transition -> transition.item.lemma)
      .collect(Collectors.toSet());
}
Use of zemberek.morphology.morphotactics.TurkishMorphotactics in the project zemberek-nlp by ahmetaa.
From the class StemTransitionTrieBasedTest, method testItem.
/**
 * Looks up the stem transitions registered for the dictionary item
 * "kapak_Noun" and verifies that there are exactly two, with the surface
 * forms "kapak" and "kapağ".
 */
@Test
public void testItem() {
  RootLexicon lexicon = getLexicon();
  TurkishMorphotactics morphotactics = new TurkishMorphotactics(lexicon);
  StemTransitionsTrieBased stemTransitions = new StemTransitionsTrieBased(lexicon, morphotactics);

  List<StemTransition> kapakTransitions =
      stemTransitions.getTransitions(lexicon.getItemById("kapak_Noun"));
  Assert.assertEquals(2, kapakTransitions.size());

  Set<String> surfaceForms = kapakTransitions.stream()
      .map(transition -> transition.surface)
      .collect(Collectors.toSet());
  Assert.assertTrue(TestUtil.containsAll(surfaceForms, "kapak", "kapağ"));
}
Use of zemberek.morphology.morphotactics.TurkishMorphotactics in the project zemberek-nlp by ahmetaa.
From the class CoverageTest, method checkCoverage.
/**
 * Runs the rule based analyzer over every word in {@code lines} and records the
 * words for which no analysis is produced.
 *
 * <p>The queue is drained into batches of 20,000 words; each batch is analyzed
 * on a worker thread. Before analysis a word is lower-cased with the Turkish
 * locale and apostrophes are removed; the original (unmodified) word is what
 * gets recorded on failure. Progress is logged through {@code logResult}, and
 * finally the sorted failed/passed lists are written under
 * {@code ../data/zemberek-oflazer/}.
 *
 * <p>Collection of passed words is currently disabled in the worker, so the
 * per-batch passed lists are always empty.
 *
 * @param lines words to analyze; this queue is emptied by the call
 * @throws IOException if an output file cannot be written
 * @throws InterruptedException if the thread is interrupted while waiting for a batch
 * @throws java.util.concurrent.ExecutionException if analysis of a batch throws
 */
private void checkCoverage(ArrayDeque<String> lines) throws IOException, InterruptedException, java.util.concurrent.ExecutionException {
  RootLexicon lexicon = TurkishDictionaryLoader.loadDefaultDictionaries();
  TurkishMorphotactics morphotactics = new TurkishMorphotactics(lexicon);
  RuleBasedAnalyzer analyzer = RuleBasedAnalyzer.instance(morphotactics);

  // availableProcessors() / 2 is 0 on a single-core machine and
  // newFixedThreadPool(0) throws IllegalArgumentException, so use at least one thread.
  int threadCount = Math.max(1, Runtime.getRuntime().availableProcessors() / 2);
  Log.info("Thread count = %d", threadCount);

  Result allResult = new Result(new ArrayList<>(100000), new ArrayList<>(1000000), lines.size());
  int batchSize = 20_000;

  ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
  try {
    CompletionService<Result> service = new ExecutorCompletionService<>(executorService);
    Stopwatch sw = Stopwatch.createStarted();

    // Drain the queue into fixed-size batches and submit each one for analysis.
    int batchCount = 0;
    while (!lines.isEmpty()) {
      List<String> batch = new ArrayList<>(batchSize);
      int j = 0;
      while (j < batchSize && !lines.isEmpty()) {
        batch.add(lines.poll());
        j++;
      }
      if (batch.size() > 0) {
        service.submit(() -> {
          List<String> failed = new ArrayList<>(batchSize / 2);
          List<String> passed = new ArrayList<>(batchSize);
          for (String s : batch) {
            String c = s.toLowerCase(Turkish.LOCALE).replaceAll("[']", "");
            List<SingleAnalysis> results = analyzer.analyze(c);
            if (results.size() == 0) {
              failed.add(s);
            }
            // Words that are analyzed successfully are intentionally not collected:
            // passed.add(s);
          }
          return new Result(failed, passed, batch.size());
        });
        batchCount++;
      }
    }

    // Merge batch results in completion order.
    int total = 0;
    for (int i = 0; i < batchCount; i++) {
      Result r = service.take().get();
      allResult.failedWords.addAll(r.failedWords);
      allResult.passedWords.addAll(r.passedWords);
      total += r.wordCount;
      // Periodic progress report whenever the running total hits a multiple of ten batches.
      if (total % (batchSize * 10) == 0) {
        logResult(allResult.failedWords, total, sw);
      }
    }
    logResult(allResult.failedWords, total, sw);
  } finally {
    // Always release the worker threads; on a failure this also drops batches still queued.
    executorService.shutdownNow();
  }

  allResult.failedWords.sort(Turkish.STRING_COMPARATOR_ASC);
  allResult.passedWords.sort(Turkish.STRING_COMPARATOR_ASC);
  Files.write(Paths.get("../data/zemberek-oflazer/new-analyzer-failed.txt"), allResult.failedWords, StandardCharsets.UTF_8);
  Files.write(Paths.get("../data/zemberek-oflazer/new-analyzer-passed.txt"), allResult.passedWords, StandardCharsets.UTF_8);
}
Use of zemberek.morphology.morphotactics.TurkishMorphotactics in the project zemberek-nlp by ahmetaa.
From the class WordGeneratorTest, method testGeneration4.
/**
 * Generates surface forms for the item "elma_Noun" with the morpheme sequence
 * Noun, A3pl, P1pl and expects the first result to be "elmalarımız".
 */
@Test
public void testGeneration4() {
  TurkishMorphotactics morphotactics = getMorphotactics("elma");
  WordGenerator generator = new WordGenerator(morphotactics);

  List<String> morphemeIds = Lists.newArrayList("Noun", "A3pl", "P1pl");
  DictionaryItem elma = morphotactics.getRootLexicon().getItemById("elma_Noun");

  List<Result> generated = generator.generate(elma, TurkishMorphotactics.getMorphemes(morphemeIds));

  Assert.assertFalse(generated.isEmpty());
  Assert.assertEquals("elmalarımız", generated.get(0).surface);
}
Use of zemberek.morphology.morphotactics.TurkishMorphotactics in the project zemberek-nlp by ahmetaa.
From the class WordGeneratorTest, method testGeneration5.
/**
 * Generates surface forms for the item "yapmak_Verb" with the morpheme sequence
 * Verb, Opt, A1pl and expects the first result to be "yapalım".
 */
@Test
public void testGeneration5() {
  TurkishMorphotactics morphotactics = getMorphotactics("yapmak");
  WordGenerator generator = new WordGenerator(morphotactics);

  List<String> morphemeIds = Lists.newArrayList("Verb", "Opt", "A1pl");
  List<Result> generated = generator.generate(
      morphotactics.getRootLexicon().getItemById("yapmak_Verb"),
      TurkishMorphotactics.getMorphemes(morphemeIds));

  Assert.assertFalse(generated.isEmpty());
  Assert.assertEquals("yapalım", generated.get(0).surface);
}
Aggregations