Use of zemberek.morphology.TurkishMorphology in the project zemberek-nlp by ahmetaa.
From the class TurkishSpellCheckerTest, method checkProperNounsTest.
/**
 * Builds a spell checker over a small custom lexicon ("Ankara", "Iphone", "Google")
 * and verifies that every form in the first list is accepted by
 * {@code spellChecker.check} while every form in the second list is rejected.
 */
@Test
public void checkProperNounsTest() throws IOException {
  // Cache is disabled and only the three lexicon entries below are known.
  TurkishSpellChecker spellChecker = new TurkishSpellChecker(
      TurkishMorphology.builder()
          .disableCache()
          .setLexicon("Ankara", "Iphone [Pr:ayfon]", "Google [Pr:gugıl]")
          .build());

  // Forms that must pass the check.
  String[] accepted = {
      "Ankara", "ANKARA", "Ankara'da", "ANKARA'DA", "ANKARA'da",
      "Iphone'umun", "Google'dan", "Iphone", "Google", "Google'sa"
  };
  for (int i = 0; i < accepted.length; i++) {
    String word = accepted[i];
    Assert.assertTrue("Fail at " + word, spellChecker.check(word));
  }

  // Forms that must be rejected.
  String[] rejected = {
      "Ankara'", "ankara", "AnKARA", "Ankarada", "ankara'DA", "-Ankara"
  };
  for (int i = 0; i < rejected.length; i++) {
    String word = rejected[i];
    Assert.assertFalse("Fail at " + word, spellChecker.check(word));
  }
}
Use of zemberek.morphology.TurkishMorphology in the project zemberek-nlp by ahmetaa.
From the class BotExperiment, method main.
/**
 * Entry point of the experiment: resolves the working paths under a hard-coded
 * root directory, builds a {@code TurkishSentenceNormalizer}, runs
 * {@code preprocess} and {@code normalize} on the bot data, and finally logs the
 * normalization of a single sample sentence.
 *
 * @param args unused
 * @throws IOException declared by this method; presumably raised by the file-based helpers
 */
public static void main(String[] args) throws IOException {
  Path root = Paths.get("/media/ahmetaa/depo/normalization");

  // Resources needed to build the normalizer.
  Path dataRoot = root.resolve("test-small");
  Path lmPath = root.resolve("lm.slm");

  // Input, intermediate and output files of the pipeline.
  Path rawLines = root.resolve("bot/raw");
  // NOTE(review): the next two paths are identical ("bot/sentences-nodup");
  // confirm against preprocess(...) whether that is intended.
  Path nodup = root.resolve("bot/sentences-nodup");
  Path sentencesNodup = root.resolve("bot/sentences-nodup");
  // Alternative tokenized input used previously: root.resolve("bot/test")
  Path sentencesNodupTokenized = root.resolve("bot/sentences-nodup-tokenized");
  Path output = root.resolve("bot/report.txt");

  TurkishSentenceNormalizer normalizer =
      new TurkishSentenceNormalizer(TurkishMorphology.createWithDefaults(), dataRoot, lmPath);

  preprocess(rawLines, nodup, sentencesNodup, sentencesNodupTokenized);
  normalize(normalizer, sentencesNodupTokenized, output);

  // Log one hand-written sentence before and after normalization.
  String input = "tmm bu akşm dönücem sana";
  Log.info(input);
  Log.info(String.join(" ", normalizer.normalize(input)));
  Log.info("Done.");
}
Use of zemberek.morphology.TurkishMorphology in the project zemberek-nlp by ahmetaa.
From the class TurkishSpellCheckerTest, method suggestWordPerformanceStemEnding.
/**
 * Invokes {@code run} with a spell checker built on the default morphology and
 * the language model loaded from "lm-unigram.slm". Skipped by default; see the
 * {@code @Ignore} reason.
 */
@Test
@Ignore("Slow. Uses actual data.")
public void suggestWordPerformanceStemEnding() throws Exception {
  TurkishSpellChecker checker =
      new TurkishSpellChecker(TurkishMorphology.createWithDefaults());
  NgramLanguageModel unigramModel = getLm("lm-unigram.slm");
  run(checker, unigramModel);
}
Use of zemberek.morphology.TurkishMorphology in the project zemberek-nlp by ahmetaa.
From the class ZemberekNlpScripts, method performance.
/**
 * Manual throughput measurement over a local corpus file (ignored as a test).
 *
 * <p>Three timed phases are logged in order: tokenization only, sentence word
 * analysis, and analysis with disambiguation. The speeds of all three phases are
 * computed from the token counts gathered in the first phase.
 */
@Test
@Ignore("Not a Test.")
public void performance() throws IOException {
  // Alternative corpus location used previously: /media/depo/data/aaa/corpora/dunya.100k
  Path corpus = Paths.get("/home/ahmetaa/data/nlp/corpora/dunya.100k");
  List<String> lines = Files.readAllLines(corpus);

  TurkishMorphology analyzer = TurkishMorphology.builder()
      .setLexicon(RootLexicon.getDefault())
      .disableUnidentifiedTokenAnalyzer()
      .disableCache()
      .build();

  Log.info(lines.size() + " lines will be processed.");
  Log.info("Dictionary has " + analyzer.getLexicon().size() + " items.");

  // ---- Phase 1: tokenization ----
  long tokenCount = 0;
  long tokenCountNoPunct = 0;
  Stopwatch clock = Stopwatch.createStarted();
  TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
  for (String line : lines) {
    for (Token token : lexer.tokenize(line)) {
      Token.Type type = token.getType();
      if (type == Token.Type.SpaceTab) {
        // Space/tab tokens are excluded from both counts.
        continue;
      }
      tokenCount++;
      if (type != Token.Type.Punctuation) {
        tokenCountNoPunct++;
      }
    }
  }
  long tokenizeMillis = clock.elapsed(TimeUnit.MILLISECONDS);
  Log.info("Elapsed Time = " + tokenizeMillis);
  Log.info("Token Count = " + tokenCount);
  Log.info("Token Count (No Punctuation) = " + tokenCountNoPunct);
  Log.info("Tokenization Speed = %.1f tokens/sec", tokenCount * 1000d / tokenizeMillis);
  Log.info("Tokenization Speed (No Punctuation) = %.1f tokens/sec ",
      tokenCountNoPunct * 1000d / tokenizeMillis);
  Log.info("");

  // ---- Phase 2: sentence word analysis ----
  Log.info("Sentence word analysis test:");
  int counter = 0;
  clock.reset().start();
  for (String line : lines) {
    try {
      List<WordAnalysis> analyses = analyzer.analyzeSentence(line);
      // Accumulate the result size so the call is not optimized away by the VM.
      counter += analyses.size();
    } catch (Exception e) {
      // Log the offending line and keep going with the rest of the corpus.
      Log.info(line);
      e.printStackTrace();
    }
  }
  long analysisMillis = clock.elapsed(TimeUnit.MILLISECONDS);
  Log.info("Elapsed Time = " + analysisMillis);
  Log.info("Tokenization + Analysis speed = %.1f tokens/sec",
      tokenCount * 1000d / analysisMillis);
  Log.info("Tokenization + Analysis speed (no punctuation) = %.1f tokens/sec",
      tokenCountNoPunct * 1000d / analysisMillis);
  Log.info(analyzer.toString());
  Log.info("");

  // ---- Phase 3: analysis + disambiguation ----
  Log.info("Disambiguation Test:");
  analyzer.invalidateCache();
  clock.reset().start();
  for (String line : lines) {
    try {
      SentenceAnalysis disambiguated = analyzer.analyzeAndDisambiguate(line);
      // Accumulate the result size so the call is not optimized away by the VM.
      counter += disambiguated.size();
    } catch (Exception e) {
      // Log the offending line and keep going with the rest of the corpus.
      Log.info(line);
      e.printStackTrace();
    }
  }
  long disambiguationMillis = clock.elapsed(TimeUnit.MILLISECONDS);
  Log.info("Elapsed Time = " + disambiguationMillis);
  Log.info("Tokenization + Analysis + Disambiguation speed = %.1f tokens/sec",
      tokenCount * 1000d / disambiguationMillis);
  Log.info("Tokenization + Analysis + Disambiguation speed (no punctuation) = %.1f tokens/sec",
      tokenCountNoPunct * 1000d / disambiguationMillis);
  Log.info(counter);
}
Use of zemberek.morphology.TurkishMorphology in the project zemberek-nlp by ahmetaa.
From the class ZemberekNlpScripts, method parseLargeVocabularyZemberek2.
/**
 * Splits a word list into words the default morphology analyzes as correct and
 * words it does not, writing each group to its own UTF-8 file under
 * {@code DATA_PATH/out}.
 *
 * <p>At most the first 2,000,000 lines of the word file are processed. If the
 * file has fewer lines, all of them are processed; the limit is clamped to the
 * list size so that {@code subList} cannot throw
 * {@link IndexOutOfBoundsException}.
 *
 * @throws IOException if the word file cannot be read or the output directory
 *     or files cannot be written
 */
@Test
@Ignore("Not a Test.")
public void parseLargeVocabularyZemberek2() throws IOException {
  Path wordFile = DATA_PATH.resolve("all-words-sorted-freq.txt");
  // Alternative input used previously: DATA_PATH.resolve("vocab-corpus-and-zemberek")
  Path outDir = DATA_PATH.resolve("out");
  Files.createDirectories(outDir);
  Path outCorrect = outDir.resolve("zemberek-parses.txt");
  Path outIncorrect = outDir.resolve("zemberek-incorrect.txt");

  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();

  List<String> allWords = Files.readAllLines(wordFile);
  // Clamp the upper bound: subList(0, 2000_000) fails on shorter files.
  int limit = Math.min(allWords.size(), 2000_000);
  List<String> words = allWords.subList(0, limit);

  List<String> correct = new ArrayList<>();
  List<String> incorrect = new ArrayList<>();
  for (String word : words) {
    if (morphology.analyze(word).isCorrect()) {
      correct.add(word);
    } else {
      incorrect.add(word);
    }
  }

  Files.write(outCorrect, correct, StandardCharsets.UTF_8);
  Files.write(outIncorrect, incorrect, StandardCharsets.UTF_8);
}
Aggregations