
Example 1 with RootLexicon

Use of zemberek.morphology.lexicon.RootLexicon in project lucene-solr-analysis-turkish by iorixxx.

From the class Zemberek3StemFilterFactory, method inform:

@Override
public void inform(ResourceLoader loader) throws IOException {
    if (dictionaryFiles == null || dictionaryFiles.trim().isEmpty()) {
        // No custom dictionaries configured; use the defaults shipped with Zemberek3.
        this.parser = TurkishWordParserGenerator.createWithDefaults().getParser();
        return;
    }
    // Read every configured dictionary file into a single list of lexicon lines.
    List<String> lines = new ArrayList<>();
    List<String> files = splitFileNames(dictionaryFiles);
    for (String file : files) {
        lines.addAll(getLines(loader, file.trim()));
    }
    if (lines.isEmpty()) {
        // Configured files contained no entries; fall back to the default dictionaries.
        this.parser = TurkishWordParserGenerator.createWithDefaults().getParser();
        return;
    }
    SuffixProvider suffixProvider = new TurkishSuffixes();
    RootLexicon lexicon = new TurkishDictionaryLoader(suffixProvider).load(lines);
    DynamicLexiconGraph graph = new DynamicLexiconGraph(suffixProvider);
    graph.addDictionaryItems(lexicon);
    parser = new WordParser(graph);
}
Also used: SuffixProvider (zemberek.morphology.lexicon.SuffixProvider), TurkishDictionaryLoader (zemberek.morphology.lexicon.tr.TurkishDictionaryLoader), TurkishSuffixes (zemberek.morphology.lexicon.tr.TurkishSuffixes), ArrayList (java.util.ArrayList), RootLexicon (zemberek.morphology.lexicon.RootLexicon), DynamicLexiconGraph (zemberek.morphology.lexicon.graph.DynamicLexiconGraph), WordParser (zemberek.morphology.parser.WordParser)
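
As a rough illustration of the custom-dictionary branch above, the sketch below feeds a couple of in-memory lexicon lines through the same loader chain. The sample lemmas are placeholders, not entries from the project's dictionary files; imports are the ones listed above plus java.util.Arrays.

// Hypothetical stand-alone use of the custom-dictionary path shown in inform().
// The two lemmas are illustrative placeholders only.
List<String> lines = Arrays.asList("kitap", "okumak");
SuffixProvider suffixProvider = new TurkishSuffixes();
// Parse the raw dictionary lines into a RootLexicon.
RootLexicon lexicon = new TurkishDictionaryLoader(suffixProvider).load(lines);
// Build a lexicon graph from the loaded items and wrap it in a WordParser.
DynamicLexiconGraph graph = new DynamicLexiconGraph(suffixProvider);
graph.addDictionaryItems(lexicon);
WordParser parser = new WordParser(graph);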

Example 2 with RootLexicon

Use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.

From the class _MorphologicalAmbiguityResolverExperiment, method collect:

private List<SingleAnalysisSentence> collect(Path p, int maxAnalysisCount) throws IOException {
    List<String> sentences = getSentences(p);
    RootLexicon lexicon = TurkishDictionaryLoader.loadDefaultDictionaries();
    InterpretingAnalyzer analyzer = new InterpretingAnalyzer(lexicon);
    int tokenCount = 0;
    int sentenceCount = 0;
    List<SingleAnalysisSentence> result = new ArrayList<>();
    for (String sentence : sentences) {
        // Normalize whitespace and non-breaking spaces, drop soft hyphens, expand ellipsis characters.
        sentence = sentence.replaceAll("\\s+|\\u00a0", " ");
        sentence = sentence.replaceAll("[\\u00ad]", "");
        sentence = sentence.replaceAll("[…]", "...");
        List<Single> singleAnalysisWords = new ArrayList<>();
        List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
        boolean failed = false;
        int i = 0;
        for (Token token : tokens) {
            tokenCount++;
            String rawWord = token.getText();
            String word = Character.isUpperCase(rawWord.charAt(0)) ? Turkish.capitalize(rawWord) : rawWord.toLowerCase(Turkish.LOCALE);
            List<_SingleAnalysis> results;
            if (cache.containsKey(word)) {
                results = cache.get(word);
            } else {
                results = analyzer.analyze(word);
                cache.put(word, results);
            }
            if (results.size() == 0) {
                // Record words the analyzer cannot parse, skipping number-like tokens.
                if (Strings.containsNone(word, "0123456789-.")) {
                    failedWords.add(word);
                }
            }
            // Discard the whole sentence when a word has no analysis or too many analyses.
            if (results.isEmpty() || results.size() > maxAnalysisCount) {
                failed = true;
                break;
            } else {
                // Drop ProperNoun analyses when the original surface form is lower case.
                results = results.stream()
                        .filter(s -> !(s.getItem().secondaryPos == SecondaryPos.ProperNoun
                                && Character.isLowerCase(rawWord.charAt(0))))
                        .collect(Collectors.toList());
                if (results.isEmpty()) {
                    failed = true;
                    break;
                }
                singleAnalysisWords.add(new Single(word, i, results));
                i++;
            }
        }
        if (!failed) {
            result.add(new SingleAnalysisSentence(sentence, singleAnalysisWords));
        }
        sentenceCount++;
        if (sentenceCount % 2000 == 0) {
            Log.info("%d sentences %d tokens analyzed. %d found", sentenceCount, tokenCount, result.size());
        }
    }
    return result;
}
Also used: ArrayList (java.util.ArrayList), InterpretingAnalyzer (zemberek.morphology._analyzer.InterpretingAnalyzer), Token (org.antlr.v4.runtime.Token), zemberek.morphology._analyzer._SingleAnalysis (zemberek.morphology._analyzer._SingleAnalysis), RootLexicon (zemberek.morphology.lexicon.RootLexicon)
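
A side note on the cache lookup above: assuming cache is a Map<String, List<_SingleAnalysis>>, as the surrounding calls imply, the containsKey/get/put sequence can be collapsed into a single computeIfAbsent call with the same behavior.

// Equivalent cache lookup (assumes cache is a Map<String, List<_SingleAnalysis>>).
List<_SingleAnalysis> results = cache.computeIfAbsent(word, analyzer::analyze);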

Example 3 with RootLexicon

Use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.

From the class DictionaryOperations, method saveProperNouns:

public static void saveProperNouns() throws IOException {
    // TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    RootLexicon lexicon = TurkishDictionaryLoader.loadDefaultDictionaries();
    Set<String> set = new HashSet<>();
    for (DictionaryItem item : lexicon) {
        String lemma = item.lemma;
        // Skip synthetic (Dummy) entries and anything that is not a proper noun.
        if (item.attributes.contains(RootAttribute.Dummy)) {
            continue;
        }
        if (item.secondaryPos != SecondaryPos.ProperNoun) {
            continue;
        }
        set.add(lemma);
    }
    List<String> list = new ArrayList<>(set);
    list.sort(Turkish.STRING_COMPARATOR_ASC);
    Files.write(Paths.get("zemberek.proper.vocab"), list);
}
Also used: DictionaryItem (zemberek.morphology.lexicon.DictionaryItem), ArrayList (java.util.ArrayList), RootLexicon (zemberek.morphology.lexicon.RootLexicon), HashSet (java.util.HashSet)
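
Since RootLexicon is iterable over DictionaryItem (as the for-each loop above shows), the same extraction can also be written as a stream pipeline. A minimal sketch, assuming java.util.stream imports; distinct() and sorted() replace the HashSet and the explicit sort, so the output is identical.

// Stream-based equivalent of the loop above: proper-noun lemmas, no dummy entries.
List<String> list = StreamSupport.stream(lexicon.spliterator(), false)
        .filter(item -> !item.attributes.contains(RootAttribute.Dummy))
        .filter(item -> item.secondaryPos == SecondaryPos.ProperNoun)
        .map(item -> item.lemma)
        .distinct()
        .sorted(Turkish.STRING_COMPARATOR_ASC)
        .collect(Collectors.toList());
Files.write(Paths.get("zemberek.proper.vocab"), list);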

Example 4 with RootLexicon

Use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.

From the class CoverageTest, method checkCoverage:

private void checkCoverage(ArrayDeque<String> lines) throws IOException, InterruptedException, java.util.concurrent.ExecutionException {
    RootLexicon lexicon = TurkishDictionaryLoader.loadDefaultDictionaries();
    InterpretingAnalyzer analyzer = new InterpretingAnalyzer(lexicon);
    int threadCount = Runtime.getRuntime().availableProcessors() / 2;
    Log.info("Thread count = %d", threadCount);
    ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
    CompletionService<Result> service = new ExecutorCompletionService<>(executorService);
    Result allResult = new Result(new ArrayList<>(100000), new ArrayList<>(1000000), lines.size());
    Stopwatch sw = Stopwatch.createStarted();
    int batchCount = 0;
    int batchSize = 20_000;
    // Split the input into batches of batchSize lines and submit each batch for analysis.
    while (!lines.isEmpty()) {
        List<String> batch = new ArrayList<>(batchSize);
        int j = 0;
        while (j < batchSize && !lines.isEmpty()) {
            batch.add(lines.poll());
            j++;
        }
        if (!batch.isEmpty()) {
            service.submit(() -> {
                List<String> failed = new ArrayList<>(batchSize / 2);
                List<String> passed = new ArrayList<>(batchSize);
                for (String s : batch) {
                    String c = s.toLowerCase(Turkish.LOCALE).replaceAll("[']", "");
                    List<_SingleAnalysis> results = analyzer.analyze(c);
                    if (results.size() == 0) {
                        failed.add(s);
                    } else {
                    // passed.add(s);
                    }
                }
                return new Result(failed, passed, batch.size());
            });
            batchCount++;
        }
    }
    // Collect the results of all submitted batches as they complete.
    int i = 0;
    int total = 0;
    while (i < batchCount) {
        Result r = service.take().get();
        allResult.failedWords.addAll(r.failedWords);
        allResult.passedWords.addAll(r.passedWords);
        total += r.wordCount;
        if (total % (batchSize * 10) == 0) {
            logResult(allResult.failedWords, total, sw);
        }
        i++;
    }
    // All batch results collected; release the worker threads.
    executorService.shutdown();
    logResult(allResult.failedWords, total, sw);
    allResult.failedWords.sort(Turkish.STRING_COMPARATOR_ASC);
    allResult.passedWords.sort(Turkish.STRING_COMPARATOR_ASC);
    Files.write(Paths.get("../data/zemberek-oflazer/new-analyzer-failed.txt"), allResult.failedWords, StandardCharsets.UTF_8);
    Files.write(Paths.get("../data/zemberek-oflazer/new-analyzer-passed.txt"), allResult.passedWords, StandardCharsets.UTF_8);
}
Also used: Stopwatch (com.google.common.base.Stopwatch), ArrayList (java.util.ArrayList), ExecutorCompletionService (java.util.concurrent.ExecutorCompletionService), ExecutorService (java.util.concurrent.ExecutorService), RootLexicon (zemberek.morphology.lexicon.RootLexicon)
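
The Result holder referenced above is not part of the snippet. Inferring its shape from the constructor calls and field accesses in checkCoverage, it presumably looks roughly like the sketch below; the project's actual class may differ.

// Hypothetical reconstruction of the Result holder used by checkCoverage.
static class Result {
    final List<String> failedWords;
    final List<String> passedWords;
    final int wordCount;

    Result(List<String> failedWords, List<String> passedWords, int wordCount) {
        this.failedWords = failedWords;
        this.passedWords = passedWords;
        this.wordCount = wordCount;
    }
}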

Example 5 with RootLexicon

Use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.

From the class SpeedTest, method testNewsCorpusNoCache:

@Test
@Ignore(value = "Speed Test.")
public void testNewsCorpusNoCache() throws IOException {
    Path p = Paths.get("src/main/resources/corpora/cnn-turk-10k");
    List<String> sentences = getSentences(p);
    RootLexicon lexicon = TurkishDictionaryLoader.loadDefaultDictionaries();
    InterpretingAnalyzer analyzer = new InterpretingAnalyzer(lexicon);
    Stopwatch sw = Stopwatch.createStarted();
    int tokenCount = 0;
    int noAnalysis = 0;
    int sentenceCount = 0;
    Histogram<String> failedWords = new Histogram<>(100000);
    for (String sentence : sentences) {
        List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
        for (Token token : tokens) {
            // Skip punctuation; only word tokens are counted and analyzed.
            if (token.getType() == TurkishLexer.Punctuation) {
                continue;
            }
            tokenCount++;
            List<_SingleAnalysis> results = analyzer.analyze(token.getText());
            if (results.size() == 0) {
                noAnalysis++;
                failedWords.add(token.getText());
            }
        }
        sentenceCount++;
        if (sentenceCount % 2000 == 0) {
            Log.info("%d tokens analyzed.", tokenCount);
        }
    }
    double seconds = sw.stop().elapsed(TimeUnit.MILLISECONDS) / 1000d;
    double speed = tokenCount / seconds;
    double parseRatio = 100 - (noAnalysis * 100d / tokenCount);
    Log.info("%nElapsed = %.2f seconds", seconds);
    Log.info("%nToken Count (No Punc) = %d %nParse Ratio = %.4f%nSpeed = %.2f tokens/sec%n", tokenCount, parseRatio, speed);
    Log.info("Saving Unknown Tokens");
    failedWords.saveSortedByCounts(Paths.get("unk.freq"), " ");
    failedWords.saveSortedByKeys(Paths.get("unk"), " ", Turkish.STRING_COMPARATOR_ASC);
}
Also used: Path (java.nio.file.Path), Histogram (zemberek.core.collections.Histogram), Stopwatch (com.google.common.base.Stopwatch), Token (org.antlr.v4.runtime.Token), RootLexicon (zemberek.morphology.lexicon.RootLexicon), Ignore (org.junit.Ignore), Test (org.junit.Test)
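
To make the final arithmetic concrete with made-up numbers: if a run ends with tokenCount = 100,000, noAnalysis = 1,800 and an elapsed time of 20 seconds, then parseRatio = 100 - (1,800 * 100.0 / 100,000) = 98.2% and speed = 100,000 / 20 = 5,000 tokens/sec.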

Aggregations

RootLexicon (zemberek.morphology.lexicon.RootLexicon): 9 usages
ArrayList (java.util.ArrayList): 6 usages
TurkishDictionaryLoader (zemberek.morphology.lexicon.tr.TurkishDictionaryLoader): 4 usages
DynamicLexiconGraph (zemberek.morphology.lexicon.graph.DynamicLexiconGraph): 3 usages
Stopwatch (com.google.common.base.Stopwatch): 2 usages
File (java.io.File): 2 usages
HashSet (java.util.HashSet): 2 usages
Token (org.antlr.v4.runtime.Token): 2 usages
DictionaryItem (zemberek.morphology.lexicon.DictionaryItem): 2 usages
SuffixProvider (zemberek.morphology.lexicon.SuffixProvider): 2 usages
Path (java.nio.file.Path): 1 usage
LinkedHashSet (java.util.LinkedHashSet): 1 usage
ExecutorCompletionService (java.util.concurrent.ExecutorCompletionService): 1 usage
ExecutorService (java.util.concurrent.ExecutorService): 1 usage
Ignore (org.junit.Ignore): 1 usage
Test (org.junit.Test): 1 usage
Histogram (zemberek.core.collections.Histogram): 1 usage
PrimaryPos (zemberek.core.turkish.PrimaryPos): 1 usage
InterpretingAnalyzer (zemberek.morphology._analyzer.InterpretingAnalyzer): 1 usage
zemberek.morphology._analyzer._SingleAnalysis (zemberek.morphology._analyzer._SingleAnalysis): 1 usage