Search in sources :

Example 56 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class ProcessNormalizationCorpus method main.

/**
 * Runs the normalization corpus processing job with hard-coded local paths.
 *
 * <p>Builds a {@code TurkishSentenceNormalizer} from the morphology, the normalization data
 * root and the language model file, wraps it in a {@code ProcessNormalizationCorpus}, and
 * processes the corpora listed in {@code clean-list} under the corpora root, writing results
 * under the output root (which is created if it does not exist).
 *
 * @param args command line arguments; not used
 * @throws Exception if any step of the setup or processing fails
 */
public static void main(String[] args) throws Exception {
    TurkishMorphology morphology = getTurkishMorphology();
    Path normalizationDataRoot = Paths.get("/home/aaa/data/normalization/test-large");
    Path lmPath = Paths.get("/home/aaa/data/normalization/lm.slm");
    TurkishSentenceNormalizer normalizationPreprocessor = new TurkishSentenceNormalizer(morphology, normalizationDataRoot, lmPath);
    ProcessNormalizationCorpus processor = new ProcessNormalizationCorpus(normalizationPreprocessor);
    Path corporaRoot = Paths.get("/home/aaa/data/corpora");
    Path outRoot = Paths.get("/home/aaa/data/normalization/corpus/clean");
    Path rootList = corporaRoot.resolve("clean-list");
    Files.createDirectories(outRoot);
    BlockTextLoader corpusProvider = BlockTextLoader.fromDirectoryRoot(corporaRoot, rootList, BLOCK_SIZE);
    // Use half of the available processors, capped at 10. The lower bound of 1 matters on
    // single-processor machines, where the integer division would otherwise yield 0 threads.
    int halfOfProcessors = Runtime.getRuntime().availableProcessors() / 2;
    int threadCount = Math.max(1, Math.min(10, halfOfProcessors));
    processor.process(corpusProvider, threadCount, outRoot);
    Log.info("Done.");
}
Also used : Path(java.nio.file.Path) BlockTextLoader(zemberek.core.text.BlockTextLoader) TurkishMorphology(zemberek.morphology.TurkishMorphology) NormalizationVocabularyGenerator.getTurkishMorphology(zemberek.normalization.NormalizationVocabularyGenerator.getTurkishMorphology)

Example 57 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class WordAnalysisSurfaceFormatterTest method formatToCase.

/**
 * Checks surface formatting for every case type against a small lexicon that contains two
 * common nouns and two proper nouns (one of them with a pronunciation attribute).
 *
 * <p>The same inputs are passed to {@code testCaseType} once per case type, each time with the
 * expected outputs for that case type.
 */
@Test
public void formatToCase() {
    TurkishMorphology morphology = TurkishMorphology.builder()
        .disableCache()
        .setLexicon("kış", "şiir", "Aydın", "Google [Pr:gugıl]")
        .build();
    String[] words = new String[] { "aydında", "googledan", "Google", "şiirde", "kışçığa", "kış" };

    // Default case: proper nouns are capitalized and get an apostrophe before the suffix.
    String[] asDefault = new String[] { "Aydın'da", "Google'dan", "Google", "şiirde", "kışçığa", "kış" };
    testCaseType(morphology, words, asDefault, DEFAULT_CASE);

    // Everything lower-cased.
    String[] asLower = new String[] { "aydın'da", "google'dan", "google", "şiirde", "kışçığa", "kış" };
    testCaseType(morphology, words, asLower, LOWER_CASE);

    // Everything upper-cased.
    String[] asUpper = new String[] { "AYDIN'DA", "GOOGLE'DAN", "GOOGLE", "ŞİİRDE", "KIŞÇIĞA", "KIŞ" };
    testCaseType(morphology, words, asUpper, UPPER_CASE);

    // First letter capitalized.
    String[] asTitle = new String[] { "Aydın'da", "Google'dan", "Google", "Şiirde", "Kışçığa", "Kış" };
    testCaseType(morphology, words, asTitle, TITLE_CASE);

    // Root upper-cased, ending left in lower case.
    String[] asUpperRoot = new String[] { "AYDIN'da", "GOOGLE'dan", "GOOGLE", "ŞİİRde", "KIŞçığa", "KIŞ" };
    testCaseType(morphology, words, asUpperRoot, UPPER_CASE_ROOT_LOWER_CASE_ENDING);
}
Also used : TurkishMorphology(zemberek.morphology.TurkishMorphology) Test(org.junit.Test)

Example 58 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class WordAnalysisSurfaceFormatterTest method formatNumerals.

/**
 * Checks how numerals with suffixes are formatted when {@code "'"} is passed as the apostrophe.
 *
 * <p>Only analyses whose dictionary item has {@code PrimaryPos.Numeral} are asserted on; all
 * other analyses are skipped. An input that yields no numeral analysis therefore produces no
 * assertion for that input.
 */
@Test
public void formatNumerals() {
    TurkishMorphology morphology = TurkishMorphology.builder().disableCache().build();
    final String[] words = { "1e", "4ten", "123ü", "12,5ten" };
    final String[] formatted = { "1'e", "4'ten", "123'ü", "12,5ten" };
    WordAnalysisSurfaceFormatter surfaceFormatter = new WordAnalysisSurfaceFormatter();
    for (int idx = 0; idx < words.length; idx++) {
        WordAnalysis analyses = morphology.analyze(words[idx]);
        for (SingleAnalysis analysis : analyses) {
            // Non-numeral readings of the same surface form are not part of this check.
            if (analysis.getDictionaryItem().primaryPos != PrimaryPos.Numeral) {
                continue;
            }
            Assert.assertEquals(formatted[idx], surfaceFormatter.format(analysis, "'"));
        }
    }
}
Also used : TurkishMorphology(zemberek.morphology.TurkishMorphology) Test(org.junit.Test)

Example 59 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class WordAnalysisSurfaceFormatterTest method formatNonProperNoun.

/**
 * Checks that words analyzed against a lexicon of non-proper-noun entries are formatted back
 * to exactly their input form when {@code null} is passed as the apostrophe.
 *
 * <p>Every analysis returned for an input is asserted on; an input with no analyses produces
 * no assertion.
 */
@Test
public void formatNonProperNoun() {
    TurkishMorphology morphology = TurkishMorphology.builder()
        .disableCache()
        .setLexicon("elma", "kitap", "demek", "evet")
        .build();
    // NOTE(review): "kitalarımdan" looks like a typo for "kitaplarımdan"; if it yields no
    // analyses, nothing is asserted for it. Confirm against the intended test data.
    String[] words = new String[] { "elmamadaki", "elma", "kitalarımdan", "kitabımızsa", "diyebileceğimiz", "dedi", "evet" };
    WordAnalysisSurfaceFormatter surfaceFormatter = new WordAnalysisSurfaceFormatter();
    for (int idx = 0; idx < words.length; idx++) {
        String word = words[idx];
        WordAnalysis analyses = morphology.analyze(word);
        for (SingleAnalysis analysis : analyses) {
            Assert.assertEquals(word, surfaceFormatter.format(analysis, null));
        }
    }
}
Also used : TurkishMorphology(zemberek.morphology.TurkishMorphology) Test(org.junit.Test)

Example 60 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class WordAnalysisSurfaceFormatterTest method formatKnownProperNounsNoQuote.

/**
 * Checks formatting of a proper noun whose lexicon entry carries the {@code NoQuote}
 * attribute: the expected outputs are capitalized and contain no apostrophe before the suffix.
 */
@Test
public void formatKnownProperNounsNoQuote() {
    TurkishMorphology noQuoteMorphology = TurkishMorphology.builder()
        .disableCache()
        .setLexicon("Blah [A:NoQuote]")
        .build();
    check(
        noQuoteMorphology,
        new String[] { "blaha", "Blahta" },
        new String[] { "Blaha", "Blahta" },
        null);
}
Also used : TurkishMorphology(zemberek.morphology.TurkishMorphology) Test(org.junit.Test)

Aggregations

TurkishMorphology (zemberek.morphology.TurkishMorphology): 87, Test (org.junit.Test): 38, Path (java.nio.file.Path): 34, ArrayList (java.util.ArrayList): 23, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 23, WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 23, Ignore (org.junit.Ignore): 21, DictionaryItem (zemberek.morphology.lexicon.DictionaryItem): 15, LinkedHashSet (java.util.LinkedHashSet): 13, PrintWriter (java.io.PrintWriter): 10, SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 10, Stopwatch (com.google.common.base.Stopwatch): 8, Histogram (zemberek.core.collections.Histogram): 8, Token (zemberek.tokenization.Token): 8, HashSet (java.util.HashSet): 7, SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 7, TurkishTokenizer (zemberek.tokenization.TurkishTokenizer): 7, ScoredItem (zemberek.core.ScoredItem): 6, IOException (java.io.IOException): 5, BlockTextLoader (zemberek.core.text.BlockTextLoader): 5