Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class ProcessNormalizationCorpus, method main.
/**
 * Entry point: builds a {@code TurkishSentenceNormalizer} from hard-coded data paths and
 * runs {@code ProcessNormalizationCorpus#process} over the corpora listed in the
 * {@code clean-list} file under the corpora root, writing into the output root.
 *
 * @param args unused
 * @throws Exception propagated from morphology/normalizer construction, directory creation,
 *     or corpus processing
 */
public static void main(String[] args) throws Exception {
  TurkishMorphology morphology = getTurkishMorphology();

  Path normalizationDataRoot = Paths.get("/home/aaa/data/normalization/test-large");
  Path lmPath = Paths.get("/home/aaa/data/normalization/lm.slm");
  TurkishSentenceNormalizer normalizationPreprocessor =
      new TurkishSentenceNormalizer(morphology, normalizationDataRoot, lmPath);
  ProcessNormalizationCorpus processor =
      new ProcessNormalizationCorpus(normalizationPreprocessor);

  Path corporaRoot = Paths.get("/home/aaa/data/corpora");
  Path outRoot = Paths.get("/home/aaa/data/normalization/corpus/clean");
  Path rootList = corporaRoot.resolve("clean-list");
  // Make sure the output directory exists before any worker tries to write into it.
  Files.createDirectories(outRoot);

  BlockTextLoader corpusProvider =
      BlockTextLoader.fromDirectoryRoot(corporaRoot, rootList, BLOCK_SIZE);

  // Use half of the available processors, capped at 10. The lower bound of 1 matters:
  // availableProcessors() can be 1, and 1 / 2 == 0 would request zero workers.
  int halfOfCores = Runtime.getRuntime().availableProcessors() / 2;
  int threadCount = Math.max(1, Math.min(10, halfOfCores));

  processor.process(corpusProvider, threadCount, outRoot);
  Log.info("Done.");
}
Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class WordAnalysisSurfaceFormatterTest, method formatToCase.
/**
 * Runs the same six input words through {@code testCaseType} once per case type, each time
 * with the expected surface forms for that case type. The morphology is built with the
 * cache disabled and a four-entry lexicon.
 */
@Test
public void formatToCase() {
  TurkishMorphology analyzer = TurkishMorphology.builder()
      .disableCache()
      .setLexicon("kış", "şiir", "Aydın", "Google [Pr:gugıl]")
      .build();

  String[] words = { "aydında", "googledan", "Google", "şiirde", "kışçığa", "kış" };

  // Each expectation array is declared right next to the call that consumes it.
  String[] asDefault = { "Aydın'da", "Google'dan", "Google", "şiirde", "kışçığa", "kış" };
  testCaseType(analyzer, words, asDefault, DEFAULT_CASE);

  String[] asLower = { "aydın'da", "google'dan", "google", "şiirde", "kışçığa", "kış" };
  testCaseType(analyzer, words, asLower, LOWER_CASE);

  String[] asUpper = { "AYDIN'DA", "GOOGLE'DAN", "GOOGLE", "ŞİİRDE", "KIŞÇIĞA", "KIŞ" };
  testCaseType(analyzer, words, asUpper, UPPER_CASE);

  String[] asTitle = { "Aydın'da", "Google'dan", "Google", "Şiirde", "Kışçığa", "Kış" };
  testCaseType(analyzer, words, asTitle, TITLE_CASE);

  String[] asUpperRoot = { "AYDIN'da", "GOOGLE'dan", "GOOGLE", "ŞİİRde", "KIŞçığa", "KIŞ" };
  testCaseType(analyzer, words, asUpperRoot, UPPER_CASE_ROOT_LOWER_CASE_ENDING);
}
Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class WordAnalysisSurfaceFormatterTest, method formatNumerals.
/**
 * Analyzes each numeral-with-suffix input and, for every analysis whose dictionary item has
 * primary POS {@code Numeral}, asserts that formatting with an apostrophe separator yields
 * the expected string at the same index. Analyses with any other primary POS are skipped.
 */
@Test
public void formatNumerals() {
  TurkishMorphology analyzer = TurkishMorphology.builder().disableCache().build();
  WordAnalysisSurfaceFormatter surfaceFormatter = new WordAnalysisSurfaceFormatter();

  String[] inputs = { "1e", "4ten", "123ü", "12,5ten" };
  String[] expected = { "1'e", "4'ten", "123'ü", "12,5ten" };

  for (int idx = 0; idx < inputs.length; idx++) {
    WordAnalysis analyses = analyzer.analyze(inputs[idx]);
    for (SingleAnalysis analysis : analyses) {
      // Only numeral readings are relevant to this test.
      if (analysis.getDictionaryItem().primaryPos != PrimaryPos.Numeral) {
        continue;
      }
      Assert.assertEquals(expected[idx], surfaceFormatter.format(analysis, "'"));
    }
  }
}
Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class WordAnalysisSurfaceFormatterTest, method formatNonProperNoun.
/**
 * Checks that formatting an analysis with a {@code null} apostrophe argument gives back the
 * input word unchanged, for every analysis the morphology returns for each input.
 * The morphology uses a four-entry lexicon with the cache disabled.
 */
@Test
public void formatNonProperNoun() {
  TurkishMorphology analyzer = TurkishMorphology.builder()
      .disableCache()
      .setLexicon("elma", "kitap", "demek", "evet")
      .build();
  WordAnalysisSurfaceFormatter surfaceFormatter = new WordAnalysisSurfaceFormatter();

  // NOTE(review): "kitalarımdan" looks like a typo for "kitaplarımdan"; if the word yields
  // no analyses the inner loop never asserts anything for it — confirm and correct.
  String[] words = {
      "elmamadaki", "elma", "kitalarımdan", "kitabımızsa", "diyebileceğimiz", "dedi", "evet"
  };

  for (int idx = 0; idx < words.length; idx++) {
    String word = words[idx];
    for (SingleAnalysis analysis : analyzer.analyze(word)) {
      Assert.assertEquals(word, surfaceFormatter.format(analysis, null));
    }
  }
}
Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.
Class WordAnalysisSurfaceFormatterTest, method formatKnownProperNounsNoQuote.
/**
 * Passes two suffixed forms of the lexicon entry {@code "Blah [A:NoQuote]"} to {@code check}
 * with a {@code null} last argument; the expected outputs are capitalized and contain no
 * apostrophe.
 */
@Test
public void formatKnownProperNounsNoQuote() {
  TurkishMorphology analyzer = TurkishMorphology.builder()
      .disableCache()
      .setLexicon("Blah [A:NoQuote]")
      .build();

  String[] words = { "blaha", "Blahta" };
  String[] formatted = { "Blaha", "Blahta" };

  check(analyzer, words, formatted, null);
}
Aggregations