use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class DictionarySerializer method serializeDeserializeTest.
/**
 * Round-trips the default lexicon through its protobuf {@code Dictionary} form and reports
 * sizes and timings for each stage.
 *
 * <p>Steps: convert every item of the default {@code RootLexicon} with {@code convertToProto},
 * write the serialized bytes to a temporary file, read the bytes back, parse them into a
 * {@code Dictionary}, and rebuild a {@code RootLexicon} with {@code convertToDictionaryItem}.
 *
 * <p>The temporary file is written with {@link Files#write}, which closes the file itself, and
 * is deleted when the method finishes, whether it completes normally or throws.
 *
 * @throws IOException if the temporary file cannot be created, written, read or deleted, or if
 *     parsing the serialized bytes fails.
 */
private static void serializeDeserializeTest() throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  RootLexicon lexicon = morphology.getLexicon();

  // Convert every lexicon item to its protobuf counterpart.
  Dictionary.Builder builder = Dictionary.newBuilder();
  for (DictionaryItem item : lexicon.getAllItems()) {
    builder.addItems(convertToProto(item));
  }
  Dictionary dictionary = builder.build();
  System.out.println("Total size of serialized dictionary: " + dictionary.getSerializedSize());

  Path f = Files.createTempFile("lexicon", ".bin");
  try {
    // Files.write opens, writes and closes the file, so no stream can leak on failure.
    Files.write(f, dictionary.toByteArray());

    // Stage 1: raw file read.
    long start = System.currentTimeMillis();
    byte[] serialized = Files.readAllBytes(f);
    long end = System.currentTimeMillis();
    Log.info("Dictionary loaded in %d ms.", (end - start));

    // Stage 2: protobuf parsing.
    start = System.currentTimeMillis();
    Dictionary readDictionary = Dictionary.parseFrom(serialized);
    end = System.currentTimeMillis();
    Log.info("Dictionary deserialized in %d ms.", (end - start));
    System.out.println("Total size of read dictionary: " + readDictionary.getSerializedSize());

    // Stage 3: rebuilding the lexicon from the parsed items.
    start = System.currentTimeMillis();
    RootLexicon loadedLexicon = new RootLexicon();
    for (LexiconProto.DictionaryItem item : readDictionary.getItemsList()) {
      loadedLexicon.add(convertToDictionaryItem(item));
    }
    end = System.currentTimeMillis();
    Log.info("RootLexicon generated in %d ms.", (end - start));
  } finally {
    // The file path is never exposed to the caller, so leaving it behind is pure residue.
    Files.deleteIfExists(f);
  }
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class AmbiguityResolutionTests method shouldNotThrowException.
/**
 * Smoke test: takes the first 1000 lines of the "corpora/cnn-turk-10k" resource, splits each
 * line into sentences with the default sentence extractor, and runs
 * {@code analyzeAndDisambiguate} on every sentence. The results are discarded; the test has no
 * assertions and relies on no exception being thrown.
 */
@Test
public void shouldNotThrowException() throws IOException {
  List<String> paragraphs =
      TextIO.loadLinesFromResource("corpora/cnn-turk-10k").subList(0, 1000);
  TurkishMorphology analyzer = TurkishMorphology.createWithDefaults();
  for (String paragraph : paragraphs) {
    for (String sentence : TurkishSentenceExtractor.DEFAULT.fromParagraph(paragraph)) {
      analyzer.analyzeAndDisambiguate(sentence);
    }
  }
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class CharacterGraphDecoderTest method stemEndingTest2.
/**
 * Builds a {@code StemEndingGraph} from a morphology whose lexicon is "üzmek", "yüz", "güz" and
 * the single ending "düm", then requests scored suggestions for "yüzdüm". Expects exactly three
 * results and passes "yüzdüm", "üzdüm" and "güzdüm" to {@code assertContainsAll}.
 */
@Test
public void stemEndingTest2() {
  TurkishMorphology lexiconOnly = TurkishMorphology.builder()
      .setLexicon("üzmek", "yüz", "güz")
      .build();
  List<String> suffixes = Lists.newArrayList("düm");
  CharacterGraphDecoder decoder =
      new CharacterGraphDecoder(new StemEndingGraph(lexiconOnly, suffixes).stemGraph);

  List<ScoredItem<String>> suggestions = decoder.getSuggestionsWithScores("yüzdüm");

  Assert.assertEquals(3, suggestions.size());
  assertContainsAll(suggestions, "yüzdüm", "üzdüm", "güzdüm");
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class CharacterGraphDecoderTest method stemEndingTest3.
/**
 * Builds a {@code StemEndingGraph} from a morphology whose lexicon is "o", "ol", "ola" and the
 * endings "arak" and "acak", then requests scored suggestions for "olarak". Passes "olarak",
 * "olacak" and "olaarak" to {@code assertContainsAll}; the result count is not checked.
 */
@Test
public void stemEndingTest3() {
  TurkishMorphology lexiconOnly = TurkishMorphology.builder()
      .setLexicon("o", "ol", "ola")
      .build();
  List<String> suffixes = Lists.newArrayList("arak", "acak");
  CharacterGraphDecoder decoder =
      new CharacterGraphDecoder(new StemEndingGraph(lexiconOnly, suffixes).stemGraph);

  List<ScoredItem<String>> suggestions = decoder.getSuggestionsWithScores("olarak");

  assertContainsAll(suggestions, "olarak", "olacak", "olaarak");
}
use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
the class CharacterGraphDecoderTest method stemEndingTest.
/**
 * Builds a {@code StemEndingGraph} from a morphology whose lexicon is "Türkiye", "Bayram" and
 * the endings "ında" and "de", then requests scored suggestions for the lower-case input
 * "türkiyede". Passes "türkiyede" to {@code assertContainsAll}; the result count is not checked.
 */
@Test
public void stemEndingTest() {
  TurkishMorphology lexiconOnly = TurkishMorphology.builder()
      .setLexicon("Türkiye", "Bayram")
      .build();
  List<String> suffixes = Lists.newArrayList("ında", "de");
  CharacterGraphDecoder decoder =
      new CharacterGraphDecoder(new StemEndingGraph(lexiconOnly, suffixes).stemGraph);

  List<ScoredItem<String>> suggestions = decoder.getSuggestionsWithScores("türkiyede");

  assertContainsAll(suggestions, "türkiyede");
}
Aggregations