Search in sources :

Example 1 with TurkishSuffixes

Use of zemberek.morphology.lexicon.tr.TurkishSuffixes in the project lucene-solr-analysis-turkish by iorixxx.

From the class Zemberek3StemFilterFactory, method inform:

/**
 * Initialises {@code parser} once the resource loader is available.
 *
 * <p>Every file named in {@code dictionaryFiles} is read through {@code loader} and its
 * lines are pooled. If {@code dictionaryFiles} is null or blank, or the named files yield
 * no lines at all, the parser is built from the default dictionaries shipped with
 * Zemberek3. Otherwise a lexicon is loaded from the pooled lines and the parser is built
 * on a graph populated from that lexicon.
 *
 * @param loader resource loader used to read the dictionary files
 * @throws IOException declared by the overridden method; may propagate from reading the
 *     dictionary files
 */
@Override
public void inform(ResourceLoader loader) throws IOException {
    final boolean customDictionariesConfigured =
            dictionaryFiles != null && !dictionaryFiles.trim().isEmpty();

    // Pool the lines of every configured dictionary file into one list.
    final List<String> dictionaryLines = new ArrayList<>();
    if (customDictionariesConfigured) {
        for (String fileName : splitFileNames(dictionaryFiles)) {
            dictionaryLines.addAll(getLines(loader, fileName.trim()));
        }
    }

    if (dictionaryLines.isEmpty()) {
        // Nothing custom to load: use default dictionaries shipped with Zemberek3.
        this.parser = TurkishWordParserGenerator.createWithDefaults().getParser();
        return;
    }

    // The same suffix provider instance is shared by the dictionary loader and the graph.
    final SuffixProvider turkishSuffixes = new TurkishSuffixes();
    final RootLexicon customLexicon = new TurkishDictionaryLoader(turkishSuffixes).load(dictionaryLines);
    final DynamicLexiconGraph lexiconGraph = new DynamicLexiconGraph(turkishSuffixes);
    lexiconGraph.addDictionaryItems(customLexicon);
    this.parser = new WordParser(lexiconGraph);
}
Also used : SuffixProvider(zemberek.morphology.lexicon.SuffixProvider) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) TurkishSuffixes(zemberek.morphology.lexicon.tr.TurkishSuffixes) ArrayList(java.util.ArrayList) RootLexicon(zemberek.morphology.lexicon.RootLexicon) DynamicLexiconGraph(zemberek.morphology.lexicon.graph.DynamicLexiconGraph) WordParser(zemberek.morphology.parser.WordParser)

Example 2 with TurkishSuffixes

Use of zemberek.morphology.lexicon.tr.TurkishSuffixes in the project zemberek-nlp by ahmetaa.

From the class ZemberekNlpScripts, method generateSuffixNames:

/**
 * Utility script (not a real test, hence {@code @Ignore}) that writes suffix form ids
 * to the file {@code suffix-list}, one id per line.
 *
 * <p>All forms reported by a fresh {@link TurkishSuffixes} are collected, skipping
 * instances of {@link NullSuffixForm}; the remainder is sorted by
 * {@link SuffixForm#getId()} before the ids are written out.
 *
 * @throws IOException if writing the output file fails
 */
@Test
@Ignore("Not a Test.")
public void generateSuffixNames() throws IOException {
    TurkishSuffixes turkishSuffixes = new TurkishSuffixes();

    // Keep every form except the NullSuffixForm instances.
    List<SuffixForm> keptForms = new ArrayList<>();
    for (SuffixForm candidate : turkishSuffixes.getAllForms()) {
        if (!(candidate instanceof NullSuffixForm)) {
            keptForms.add(candidate);
        }
    }

    Comparator<SuffixForm> byId = Comparator.comparing(SuffixForm::getId);
    keptForms.sort(byId);

    // Project the sorted forms onto their ids, preserving the sorted order.
    List<String> idLines = new ArrayList<>();
    for (SuffixForm sortedForm : keptForms) {
        idLines.add(sortedForm.id);
    }

    Files.write(Paths.get("suffix-list"), idLines);
}
Also used : TurkishLetterSequence(zemberek.core.turkish.TurkishLetterSequence) TurkishSentenceAnalyzer(zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer) TextUtil(zemberek.core.text.TextUtil) Stopwatch(com.google.common.base.Stopwatch) Token(org.antlr.v4.runtime.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) Multimap(com.google.common.collect.Multimap) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) ArrayList(java.util.ArrayList) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) HashMultimap(com.google.common.collect.HashMultimap) Lists(com.google.common.collect.Lists) Locale(java.util.Locale) PrimaryPos(zemberek.core.turkish.PrimaryPos) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) TurkishSuffixes(zemberek.morphology.lexicon.tr.TurkishSuffixes) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) Splitter(com.google.common.base.Splitter) Path(java.nio.file.Path) LinkedHashMultimap(com.google.common.collect.LinkedHashMultimap) LinkedHashSet(java.util.LinkedHashSet) Collator(java.text.Collator) Histogram(zemberek.core.collections.Histogram) SecondaryPos(zemberek.core.turkish.SecondaryPos) PrintWriter(java.io.PrintWriter) SuffixForm(zemberek.morphology.lexicon.SuffixForm) Files(java.nio.file.Files) Collection(java.util.Collection) Z3MarkovModelDisambiguator(zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator) Set(java.util.Set) TurkishLexer(zemberek.tokenization.antlr.TurkishLexer) IOException(java.io.IOException) Test(org.junit.Test) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) OflazerAnalyzerRunner(zemberek.morphology.external.OflazerAnalyzerRunner) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) Turkish(zemberek.morphology.structure.Turkish) Ignore(org.junit.Ignore) Paths(java.nio.file.Paths) 
TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) Comparator(java.util.Comparator) NullSuffixForm(zemberek.morphology.lexicon.NullSuffixForm) TurkishSuffixes(zemberek.morphology.lexicon.tr.TurkishSuffixes) SuffixForm(zemberek.morphology.lexicon.SuffixForm) NullSuffixForm(zemberek.morphology.lexicon.NullSuffixForm) ArrayList(java.util.ArrayList) NullSuffixForm(zemberek.morphology.lexicon.NullSuffixForm) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

ArrayList (java.util.ArrayList)2 Splitter (com.google.common.base.Splitter)1 Stopwatch (com.google.common.base.Stopwatch)1 HashMultimap (com.google.common.collect.HashMultimap)1 LinkedHashMultimap (com.google.common.collect.LinkedHashMultimap)1 Lists (com.google.common.collect.Lists)1 Multimap (com.google.common.collect.Multimap)1 IOException (java.io.IOException)1 PrintWriter (java.io.PrintWriter)1 StandardCharsets (java.nio.charset.StandardCharsets)1 Files (java.nio.file.Files)1 Path (java.nio.file.Path)1 Paths (java.nio.file.Paths)1 Collator (java.text.Collator)1 Collection (java.util.Collection)1 Comparator (java.util.Comparator)1 LinkedHashSet (java.util.LinkedHashSet)1 List (java.util.List)1 Locale (java.util.Locale)1 Set (java.util.Set)1