Example use of zemberek.morphology.TurkishMorphology in the project zemberek-nlp by ahmetaa.
Class NormalizationVocabularyGenerator, method getTurkishMorphology.
/**
 * Creates a {@link TurkishMorphology} backed by the dictionaries loaded from the
 * {@code tr/*.dict} resources listed below.
 *
 * <p>The builder is configured with a dynamic analysis cache created via
 * {@code dynamicCacheSize(200_000, 400_000)} and with the unidentified token analyzer
 * disabled.
 *
 * @param asciiTolerant when {@code true}, {@code ignoreDiacriticsInAnalysis()} is additionally
 *     applied to the builder before building.
 * @return the morphology instance produced by the builder.
 * @throws IOException declared by the original signature; presumably raised while loading the
 *     dictionary resources (not verifiable from this snippet).
 */
static TurkishMorphology getTurkishMorphology(boolean asciiTolerant) throws IOException {
  AnalysisCache analysisCache = AnalysisCache.builder()
      .dynamicCacheSize(200_000, 400_000)
      .build();

  RootLexicon rootLexicon = TurkishDictionaryLoader.loadFromResources(
      "tr/master-dictionary.dict",
      "tr/non-tdk.dict",
      "tr/proper.dict",
      "tr/proper-from-corpus.dict",
      "tr/abbreviations.dict",
      "tr/person-names.dict");

  TurkishMorphology.Builder morphologyBuilder = TurkishMorphology.builder()
      .setLexicon(rootLexicon)
      .disableUnidentifiedTokenAnalyzer()
      .setCache(analysisCache);

  // Diacritics are only ignored when the caller explicitly asks for ASCII tolerance.
  if (asciiTolerant) {
    morphologyBuilder.ignoreDiacriticsInAnalysis();
  }

  return morphologyBuilder.build();
}
Example use of zemberek.morphology.TurkishMorphology in the project zemberek-nlp by ahmetaa.
Class TurkishSpellCheckerTest, method suggestWordPerformanceWord.
/**
 * Manual performance run: loads up to one million words from the Oflazer word list into a
 * {@link CharacterGraph}, builds a spell checker over it and hands it, together with the
 * unigram language model, to {@code run}.
 *
 * <p>Ignored by default because it depends on a data file outside the repository resources.
 *
 * @throws Exception if the word list cannot be read or any of the helpers fails.
 */
@Test
@Ignore("Slow. Uses actual data.")
public void suggestWordPerformanceWord() throws Exception {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  CharacterGraph graph = new CharacterGraph();
  Path r = Paths.get("../data/zemberek-oflazer/oflazer-zemberek-parsed.txt");
  List<String> allWords = Files.readAllLines(r, StandardCharsets.UTF_8);
  // Cap the sample at one million words. The upper bound is clamped to the list size because
  // subList throws IndexOutOfBoundsException when toIndex exceeds size().
  List<String> words = allWords.subList(0, Math.min(allWords.size(), 1_000_000));
  Log.info("Total word count = %d", words.size());
  words.forEach(s -> graph.addWord(s, Node.TYPE_WORD));
  TurkishSpellChecker spellChecker = new TurkishSpellChecker(morphology, graph);
  NgramLanguageModel lm = getLm("lm-unigram.slm");
  run(spellChecker, lm);
}
Example use of zemberek.morphology.TurkishMorphology in the project zemberek-nlp by ahmetaa.
Class TurkishSpellCheckerTest, method suggestVerb1.
/**
 * Builds a morphology whose lexicon holds the single entry "okumak" and a stem-ending graph
 * with the single ending "dum", then asserts that the suggestions for the misspelling "okudm"
 * contain "okudum".
 */
@Test
public void suggestVerb1() {
  TurkishMorphology verbMorphology = TurkishMorphology.builder()
      .setLexicon("okumak")
      .build();

  List<String> suffixes = Lists.newArrayList("dum");
  StemEndingGraph stemEndingGraph = new StemEndingGraph(verbMorphology, suffixes);

  // The checker works on the stem graph exposed by the stem-ending graph.
  TurkishSpellChecker checker =
      new TurkishSpellChecker(verbMorphology, stemEndingGraph.stemGraph);

  List<String> suggestions = checker.suggestForWord("okudm");
  boolean containsExpected = suggestions.contains("okudum");
  Assert.assertTrue(containsExpected);
}
Example use of zemberek.morphology.TurkishMorphology in the project zemberek-nlp by ahmetaa.
Class TurkishSpellCheckerTest, method formatNumbersTest.
/**
 * Asserts that the spell checker accepts numbers written with digits: digits followed by an
 * apostrophe and a suffix (lower and upper case), decimal forms with a comma, and
 * percent-prefixed forms. The morphology is built without a cache from four numeral entries.
 *
 * @throws IOException declared by the original signature; presumably raised while the spell
 *     checker loads its resources (not verifiable from this snippet).
 */
// TODO: check for ordinals.
@Test
public void formatNumbersTest() throws IOException {
  TurkishMorphology numberMorphology = TurkishMorphology.builder()
      .disableCache()
      .setLexicon(
          "bir [P:Num]",
          "dört [P:Num;A:Voicing]",
          "üç [P:Num]",
          "beş [P:Num]")
      .build();
  TurkishSpellChecker checker = new TurkishSpellChecker(numberMorphology);

  String[] inputs = {
      // lower-case suffixes
      "1'e", "4'ten", "123'ü", "12,5'ten",
      // upper-case suffixes
      "1'E", "4'TEN", "123'Ü", "12,5'TEN",
      // percent forms
      "%1", "%1'i", "%1,3'ü"
  };

  for (String token : inputs) {
    boolean accepted = checker.check(token);
    Assert.assertTrue("Fail at " + token, accepted);
  }
}
Example use of zemberek.morphology.TurkishMorphology in the project zemberek-nlp by ahmetaa.
Class TurkishSpellCheckerTest, method runSentence.
/**
 * Manual evaluation run: for every token of every sentence in
 * {@code spell-checker-test-small.txt}, deforms the token with {@code applyDeformation}, asks
 * the spell checker for suggestions using the neighbouring tokens and the bigram language
 * model, and writes the outcome to {@code bigram-test-result.txt}.
 *
 * <p>Each sentence is written on its own line, followed by one line per token in the form
 * {@code left deformed[original] right -> suggestions}, followed by a blank line. The left or
 * right neighbour is {@code null} at the sentence boundaries.
 *
 * @throws Exception if a resource cannot be loaded or the result file cannot be written.
 */
@Test
@Ignore("Slow. Uses actual data.")
public void runSentence() throws Exception {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  TurkishSpellChecker spellChecker = new TurkishSpellChecker(morphology);
  NgramLanguageModel lm = getLm("lm-bigram.slm");
  List<String> sentences = TextIO.loadLinesFromResource("spell-checker-test-small.txt");
  // The charset is given explicitly: the single-argument PrintWriter constructor uses the
  // platform default charset, which can corrupt Turkish letters on non-UTF-8 systems.
  try (PrintWriter pw = new PrintWriter("bigram-test-result.txt", "UTF-8")) {
    for (String sentence : sentences) {
      pw.println(sentence);
      List<String> input = TurkishSpellChecker.tokenizeForSpelling(sentence);
      for (int i = 0; i < input.size(); i++) {
        // Neighbours are null at the first and last token respectively.
        String left = i == 0 ? null : input.get(i - 1);
        String right = i == input.size() - 1 ? null : input.get(i + 1);
        String word = input.get(i);
        String deformed = applyDeformation(word);
        List<String> res = spellChecker.suggestForWord(deformed, left, right, lm);
        pw.println(String.format("%s %s[%s] %s -> %s", left, deformed, word, right, res.toString()));
      }
      pw.println();
    }
  }
}
Aggregations