Search in sources:

Example 71 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

In class TurkishSpellCheckerTest, method checkProperNounsTest.

/**
 * Checks how the spell checker handles proper nouns taken from a small custom
 * lexicon with three entries ("Ankara", "Iphone [Pr:ayfon]", "Google [Pr:gugıl]").
 *
 * <p>Every form in the accepted list must pass {@code check}; every form in the
 * rejected list must be refused by it.
 *
 * @throws IOException declared by the test signature
 */
@Test
public void checkProperNounsTest() throws IOException {
    // The analysis cache is switched off on the builder before the lexicon is set.
    TurkishMorphology properNounMorphology = TurkishMorphology.builder()
        .disableCache()
        .setLexicon("Ankara", "Iphone [Pr:ayfon]", "Google [Pr:gugıl]")
        .build();
    TurkishSpellChecker checker = new TurkishSpellChecker(properNounMorphology);

    // Forms the checker is expected to accept.
    String[] accepted = {
        "Ankara", "ANKARA", "Ankara'da", "ANKARA'DA", "ANKARA'da",
        "Iphone'umun", "Google'dan", "Iphone", "Google", "Google'sa"
    };
    for (int i = 0; i < accepted.length; i++) {
        String word = accepted[i];
        Assert.assertTrue("Fail at " + word, checker.check(word));
    }

    // Forms the checker is expected to reject.
    String[] rejected = {
        "Ankara'", "ankara", "AnKARA", "Ankarada", "ankara'DA", "-Ankara"
    };
    for (int i = 0; i < rejected.length; i++) {
        String word = rejected[i];
        Assert.assertFalse("Fail at " + word, checker.check(word));
    }
}
Also used : TurkishMorphology(zemberek.morphology.TurkishMorphology) Test(org.junit.Test)

Example 72 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

In class BotExperiment, method main.

/**
 * Entry point of the bot normalization experiment.
 *
 * <p>Resolves the data locations under a fixed root directory, builds a sentence
 * normalizer from the default morphology, runs {@code preprocess} and
 * {@code normalize} over the bot data, and finally logs one sample sentence
 * together with its normalized form.
 *
 * @param args unused
 * @throws IOException declared by the method signature
 */
public static void main(String[] args) throws IOException {
    Path root = Paths.get("/media/ahmetaa/depo/normalization");

    // Model resources consumed by the normalizer.
    Path dataRoot = root.resolve("test-small");
    Path lmPath = root.resolve("lm.slm");

    // Pipeline files of the bot corpus.
    Path rawLines = root.resolve("bot/raw");
    // NOTE(review): the next two paths are identical ("bot/sentences-nodup").
    // Looks like a copy/paste slip, but the intended value is not visible here - confirm.
    Path nodup = root.resolve("bot/sentences-nodup");
    Path sentencesNodup = root.resolve("bot/sentences-nodup");
    // Alternative tokenized input used previously: root.resolve("bot/test")
    Path sentencesNodupTokenized = root.resolve("bot/sentences-nodup-tokenized");
    Path report = root.resolve("bot/report.txt");

    TurkishSentenceNormalizer sentenceNormalizer =
        new TurkishSentenceNormalizer(TurkishMorphology.createWithDefaults(), dataRoot, lmPath);

    preprocess(rawLines, nodup, sentencesNodup, sentencesNodupTokenized);
    normalize(sentenceNormalizer, sentencesNodupTokenized, report);

    // Quick check on a single informal sentence.
    String sample = "tmm bu akşm dönücem sana";
    Log.info(sample);
    Log.info(String.join(" ", sentenceNormalizer.normalize(sample)));
    Log.info("Done.");
}
Also used : Path(java.nio.file.Path) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Example 73 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

In class TurkishSpellCheckerTest, method suggestWordPerformanceStemEnding.

/**
 * Runs the suggestion routine ({@code run}) with a spell checker built on the
 * default morphology and the language model loaded from "lm-unigram.slm".
 * Ignored by default; the annotation states it is slow and uses actual data.
 *
 * @throws Exception declared by the test signature
 */
@Test
@Ignore("Slow. Uses actual data.")
public void suggestWordPerformanceStemEnding() throws Exception {
    // The spell checker is constructed first, then the language model is loaded.
    TurkishSpellChecker checker =
        new TurkishSpellChecker(TurkishMorphology.createWithDefaults());
    NgramLanguageModel unigramLm = getLm("lm-unigram.slm");
    run(checker, unigramLm);
}
Also used : NgramLanguageModel(zemberek.lm.NgramLanguageModel) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 74 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

In class ZemberekNlpScripts, method performance.

/**
 * Manual throughput benchmark (ignored as a unit test).
 *
 * <p>Reads a corpus file, then times three phases over all of its lines and logs
 * tokens-per-second figures for each:
 * <ol>
 *   <li>tokenization only,</li>
 *   <li>sentence word analysis ({@code analyzeSentence}),</li>
 *   <li>analysis plus disambiguation ({@code analyzeAndDisambiguate}).</li>
 * </ol>
 * The token counts measured in the first phase are reused as the numerator for
 * the speed figures of the later phases.
 *
 * @throws IOException if the corpus file cannot be read
 */
@Test
@Ignore("Not a Test.")
public void performance() throws IOException {
    // Alternative corpus location: Paths.get("/media/depo/data/aaa/corpora/dunya.100k")
    List<String> lines = Files.readAllLines(
        Paths.get("/home/ahmetaa/data/nlp/corpora/dunya.100k"));

    TurkishMorphology analyzer = TurkishMorphology.builder()
        .setLexicon(RootLexicon.getDefault())
        .disableUnidentifiedTokenAnalyzer()
        .disableCache()
        .build();

    Log.info(lines.size() + " lines will be processed.");
    Log.info("Dictionary has " + analyzer.getLexicon().size() + " items.");

    // ---- Phase 1: tokenization ----
    long tokenCount = 0;
    long tokenCountNoPunct = 0;
    Stopwatch stopwatch = Stopwatch.createStarted();
    TurkishTokenizer tokenizer = TurkishTokenizer.DEFAULT;
    for (String line : lines) {
        List<Token> lineTokens = tokenizer.tokenize(line);
        // Everything except SpaceTab tokens.
        tokenCount += lineTokens.stream()
            .filter(t -> t.getType() != Token.Type.SpaceTab)
            .count();
        // Everything except Punctuation and SpaceTab tokens.
        tokenCountNoPunct += lineTokens.stream()
            .filter(t -> !(t.getType() == Token.Type.Punctuation
                || t.getType() == Token.Type.SpaceTab))
            .count();
    }
    long tokenizationMillis = stopwatch.elapsed(TimeUnit.MILLISECONDS);
    Log.info("Elapsed Time = " + tokenizationMillis);
    Log.info("Token Count = " + tokenCount);
    Log.info("Token Count (No Punctuation) = " + tokenCountNoPunct);
    Log.info("Tokenization Speed = %.1f tokens/sec", tokenCount * 1000d / tokenizationMillis);
    Log.info("Tokenization Speed (No Punctuation) = %.1f tokens/sec ", tokenCountNoPunct * 1000d / tokenizationMillis);
    Log.info("");

    // ---- Phase 2: sentence word analysis ----
    Log.info("Sentence word analysis test:");
    // Accumulates result sizes so the results are actually consumed
    // (guards against the VM optimizing the work away).
    int sink = 0;
    stopwatch.reset().start();
    for (String line : lines) {
        try {
            List<WordAnalysis> analyses = analyzer.analyzeSentence(line);
            sink += analyses.size();
        } catch (Exception e) {
            // Log the offending line and keep going with the rest of the corpus.
            Log.info(line);
            e.printStackTrace();
        }
    }
    long analysisMillis = stopwatch.elapsed(TimeUnit.MILLISECONDS);
    Log.info("Elapsed Time = " + analysisMillis);
    Log.info("Tokenization + Analysis speed = %.1f tokens/sec", tokenCount * 1000d / analysisMillis);
    Log.info("Tokenization + Analysis speed (no punctuation) = %.1f tokens/sec", tokenCountNoPunct * 1000d / analysisMillis);
    Log.info(analyzer.toString());
    Log.info("");

    // ---- Phase 3: analysis + disambiguation ----
    Log.info("Disambiguation Test:");
    analyzer.invalidateCache();
    stopwatch.reset().start();
    for (String line : lines) {
        try {
            SentenceAnalysis disambiguated = analyzer.analyzeAndDisambiguate(line);
            sink += disambiguated.size();
        } catch (Exception e) {
            // Same best-effort handling as in the analysis phase.
            Log.info(line);
            e.printStackTrace();
        }
    }
    long disambiguationMillis = stopwatch.elapsed(TimeUnit.MILLISECONDS);
    Log.info("Elapsed Time = " + disambiguationMillis);
    Log.info("Tokenization + Analysis + Disambiguation speed = %.1f tokens/sec", tokenCount * 1000d / disambiguationMillis);
    Log.info("Tokenization + Analysis + Disambiguation speed (no punctuation) = %.1f tokens/sec", tokenCountNoPunct * 1000d / disambiguationMillis);
    Log.info(sink);
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Stopwatch(com.google.common.base.Stopwatch) Token(zemberek.tokenization.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) IOException(java.io.IOException) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 75 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

In class ZemberekNlpScripts, method parseLargeVocabularyZemberek2.

/**
 * Script-style job (ignored as a unit test) that splits a word list by the
 * result of morphological analysis.
 *
 * <p>Reads up to the first 2,000,000 lines of "all-words-sorted-freq.txt" under
 * {@code DATA_PATH}, analyzes each word with the default morphology, and writes
 * the words whose analysis reports {@code isCorrect()} to
 * "out/zemberek-parses.txt" and all other words to "out/zemberek-incorrect.txt".
 *
 * @throws IOException if the word file cannot be read, or the output directory
 *     or files cannot be written
 */
@Test
@Ignore("Not a Test.")
public void parseLargeVocabularyZemberek2() throws IOException {
    Path wordFile = DATA_PATH.resolve("all-words-sorted-freq.txt");
    // Alternative input: DATA_PATH.resolve("vocab-corpus-and-zemberek")
    Path outDir = DATA_PATH.resolve("out");
    Files.createDirectories(outDir);
    Path outCorrect = outDir.resolve("zemberek-parses.txt");
    Path outIncorrect = outDir.resolve("zemberek-incorrect.txt");
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();

    // Clamp the upper bound: subList(0, 2_000_000) on a shorter file would
    // throw IndexOutOfBoundsException instead of processing what is there.
    List<String> allWords = Files.readAllLines(wordFile);
    int limit = Math.min(allWords.size(), 2_000_000);
    List<String> words = allWords.subList(0, limit);

    List<String> correct = new ArrayList<>();
    List<String> incorrect = new ArrayList<>();
    for (String word : words) {
        if (morphology.analyze(word).isCorrect()) {
            correct.add(word);
        } else {
            incorrect.add(word);
        }
    }
    Files.write(outCorrect, correct, StandardCharsets.UTF_8);
    Files.write(outIncorrect, incorrect, StandardCharsets.UTF_8);
}
Also used : Path(java.nio.file.Path) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

TurkishMorphology (zemberek.morphology.TurkishMorphology): 87, Test (org.junit.Test): 38, Path (java.nio.file.Path): 34, ArrayList (java.util.ArrayList): 23, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 23, WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 23, Ignore (org.junit.Ignore): 21, DictionaryItem (zemberek.morphology.lexicon.DictionaryItem): 15, LinkedHashSet (java.util.LinkedHashSet): 13, PrintWriter (java.io.PrintWriter): 10, SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 10, Stopwatch (com.google.common.base.Stopwatch): 8, Histogram (zemberek.core.collections.Histogram): 8, Token (zemberek.tokenization.Token): 8, HashSet (java.util.HashSet): 7, SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 7, TurkishTokenizer (zemberek.tokenization.TurkishTokenizer): 7, ScoredItem (zemberek.core.ScoredItem): 6, IOException (java.io.IOException): 5, BlockTextLoader (zemberek.core.text.BlockTextLoader): 5