Search in sources:

Example 16 with RootLexicon

use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.

In the class CoverageTest, method checkCoverage.

/**
 * Analyzes every word in {@code lines} with a {@link RuleBasedAnalyzer} built from the default
 * dictionaries and records the words for which the analyzer returns no result.
 *
 * <p>The input deque is drained into batches of 20,000 words; each batch is analyzed on a
 * fixed-size thread pool. Failed words are sorted and written to
 * {@code ../data/zemberek-oflazer/new-analyzer-failed.txt}. The "passed" file is written as well,
 * but collection of passed words is currently disabled, so that list stays empty.
 *
 * @param lines words to analyze; this deque is emptied by the call
 * @throws IOException if the dictionaries cannot be loaded or the result files cannot be written
 * @throws InterruptedException if the calling thread is interrupted while waiting for a batch
 * @throws java.util.concurrent.ExecutionException if analysis of a batch throws
 */
private void checkCoverage(ArrayDeque<String> lines) throws IOException, InterruptedException, java.util.concurrent.ExecutionException {
    RootLexicon lexicon = TurkishDictionaryLoader.loadDefaultDictionaries();
    TurkishMorphotactics morphotactics = new TurkishMorphotactics(lexicon);
    RuleBasedAnalyzer analyzer = RuleBasedAnalyzer.instance(morphotactics);
    // availableProcessors() / 2 is 0 on a single-core machine and newFixedThreadPool(0) throws
    // IllegalArgumentException, so always use at least one worker.
    int threadCount = Math.max(1, Runtime.getRuntime().availableProcessors() / 2);
    Log.info("Thread count = %d", threadCount);
    ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
    Result allResult = new Result(new ArrayList<>(100000), new ArrayList<>(1000000), lines.size());
    Stopwatch sw = Stopwatch.createStarted();
    final int batchSize = 20_000;
    int total = 0;
    try {
        CompletionService<Result> service = new ExecutorCompletionService<>(executorService);
        int batchCount = 0;
        while (!lines.isEmpty()) {
            List<String> batch = new ArrayList<>(batchSize);
            while (batch.size() < batchSize && !lines.isEmpty()) {
                batch.add(lines.poll());
            }
            service.submit(() -> {
                List<String> failed = new ArrayList<>(batchSize / 2);
                // Passed words are intentionally not collected at the moment; the list stays empty.
                List<String> passed = new ArrayList<>(batchSize);
                for (String s : batch) {
                    // Lower-case with the Turkish locale and strip apostrophes before analysis.
                    String c = s.toLowerCase(Turkish.LOCALE).replace("'", "");
                    List<SingleAnalysis> results = analyzer.analyze(c);
                    if (results.isEmpty()) {
                        failed.add(s);
                    }
                }
                return new Result(failed, passed, batch.size());
            });
            batchCount++;
        }
        // Collect batch results in completion order.
        for (int i = 0; i < batchCount; i++) {
            Result r = service.take().get();
            allResult.failedWords.addAll(r.failedWords);
            allResult.passedWords.addAll(r.passedWords);
            total += r.wordCount;
            // Periodic progress report, roughly every ten full batches.
            if (total % (batchSize * 10) == 0) {
                logResult(allResult.failedWords, total, sw);
            }
        }
    } finally {
        // Release the worker threads even when a batch fails or the wait is interrupted;
        // otherwise the pool's non-daemon threads outlive this method.
        executorService.shutdownNow();
    }
    logResult(allResult.failedWords, total, sw);
    allResult.failedWords.sort(Turkish.STRING_COMPARATOR_ASC);
    allResult.passedWords.sort(Turkish.STRING_COMPARATOR_ASC);
    Files.write(Paths.get("../data/zemberek-oflazer/new-analyzer-failed.txt"), allResult.failedWords, StandardCharsets.UTF_8);
    Files.write(Paths.get("../data/zemberek-oflazer/new-analyzer-passed.txt"), allResult.passedWords, StandardCharsets.UTF_8);
}
Also used : Stopwatch(com.google.common.base.Stopwatch) ArrayList(java.util.ArrayList) ExecutorCompletionService(java.util.concurrent.ExecutorCompletionService) TurkishMorphotactics(zemberek.morphology.morphotactics.TurkishMorphotactics) ExecutorService(java.util.concurrent.ExecutorService) RootLexicon(zemberek.morphology.lexicon.RootLexicon)

Example 17 with RootLexicon

use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.

In the class TurkishStopWords, method generateFromDictionary.

/**
 * Builds a stop word list from the default dictionary.
 *
 * <p>Every dictionary item whose primary part of speech is one of Adverb, Conjunction,
 * Determiner, Interjection, PostPositive, Numeral, Pronoun or Question contributes its lemma.
 * The distinct lemmas are sorted with {@code Turkish.STRING_COMPARATOR_ASC} and handed to
 * {@link TurkishStopWords} in that order.
 *
 * @return stop words derived from the default lexicon
 * @throws IOException if the default morphology cannot be created
 */
static TurkishStopWords generateFromDictionary() throws IOException {
    Set<PrimaryPos> stopWordCategories = Sets.newHashSet(
        PrimaryPos.Adverb,
        PrimaryPos.Conjunction,
        PrimaryPos.Determiner,
        PrimaryPos.Interjection,
        PrimaryPos.PostPositive,
        PrimaryPos.Numeral,
        PrimaryPos.Pronoun,
        PrimaryPos.Question);
    RootLexicon dictionary = TurkishMorphology.createWithDefaults().getLexicon();

    // Collect into a set first so that repeated lemmas are kept only once.
    Set<String> lemmas = new HashSet<>();
    for (DictionaryItem entry : dictionary) {
        if (!stopWordCategories.contains(entry.primaryPos)) {
            continue;
        }
        lemmas.add(entry.lemma);
    }

    List<String> ordered = new ArrayList<>(lemmas);
    ordered.sort(Turkish.STRING_COMPARATOR_ASC);
    // LinkedHashSet preserves the sorted iteration order.
    return new TurkishStopWords(new LinkedHashSet<>(ordered));
}
Also used : DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) PrimaryPos(zemberek.core.turkish.PrimaryPos) ArrayList(java.util.ArrayList) RootLexicon(zemberek.morphology.lexicon.RootLexicon) TurkishMorphology(zemberek.morphology.TurkishMorphology) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Example 18 with RootLexicon

use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.

In the class NormalizationVocabularyGenerator, method getTurkishMorphology.

/**
 * Creates a {@link TurkishMorphology} instance backed by six bundled dictionary resources,
 * with the unidentified token analyzer disabled and a dynamic analysis cache configured via
 * {@code dynamicCacheSize(200_000, 400_000)}.
 *
 * @param asciiTolerant when {@code true}, the builder is additionally told to ignore
 *     diacritics during analysis
 * @return the configured morphology instance
 * @throws IOException if the dictionary resources cannot be loaded
 */
static TurkishMorphology getTurkishMorphology(boolean asciiTolerant) throws IOException {
    AnalysisCache analysisCache = AnalysisCache.builder()
        .dynamicCacheSize(200_000, 400_000)
        .build();
    RootLexicon dictionaries = TurkishDictionaryLoader.loadFromResources(
        "tr/master-dictionary.dict",
        "tr/non-tdk.dict",
        "tr/proper.dict",
        "tr/proper-from-corpus.dict",
        "tr/abbreviations.dict",
        "tr/person-names.dict");
    TurkishMorphology.Builder morphologyBuilder = TurkishMorphology.builder()
        .setLexicon(dictionaries)
        .disableUnidentifiedTokenAnalyzer()
        .setCache(analysisCache);
    if (!asciiTolerant) {
        return morphologyBuilder.build();
    }
    morphologyBuilder.ignoreDiacriticsInAnalysis();
    return morphologyBuilder.build();
}
Also used : AnalysisCache(zemberek.morphology.analysis.AnalysisCache) RootLexicon(zemberek.morphology.lexicon.RootLexicon) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Example 19 with RootLexicon

use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.

In the class TurkishDictionaryLoaderTest, method saveFullAttributes.

/**
 * Dumps the string form of every item in the default dictionaries to
 * {@code dictionary-all-attributes.txt} in the working directory, one item per line.
 * Utility script, not a real unit test (hence {@code @Ignore}).
 *
 * @throws IOException if the dictionaries cannot be loaded or the output file cannot be opened
 */
@Test
@Ignore("Not a unit test")
public void saveFullAttributes() throws IOException {
    RootLexicon items = TurkishDictionaryLoader.loadDefaultDictionaries();
    // try-with-resources closes (and therefore flushes) the writer; without it buffered
    // output may never reach the file and the file handle leaks.
    try (PrintWriter p = new PrintWriter(new File("dictionary-all-attributes.txt"), "utf-8")) {
        for (DictionaryItem item : items) {
            p.println(item.toString());
        }
    }
}
Also used : DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) RootLexicon(zemberek.morphology.lexicon.RootLexicon) File(java.io.File) PrintWriter(java.io.PrintWriter) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 20 with RootLexicon

use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.

In the class ZemberekNlpScripts, method dictionaryObsoleteCircumflexWordsCheck.

/**
 * For each word listed in {@code ../data/vocabulary/words-with-circumflex-obsolete.txt}, looks up
 * the dictionary items matching the word itself and its circumflex-normalized form. When exactly
 * one item is found in total, its string form is stripped of a fixed set of attribute markers
 * and written to {@code ../data/vocabulary/words-with-circumflex-obsolete-single.txt}.
 * Utility script, not a real test (hence {@code @Ignore}).
 *
 * @throws IOException if the input cannot be read, the morphology cannot be created, or the
 *     output cannot be written
 */
@Test
@Ignore("Not a Test.")
public void dictionaryObsoleteCircumflexWordsCheck() throws IOException {
    Path path = Paths.get("../data/vocabulary/words-with-circumflex-obsolete.txt");
    List<String> obsolete = Files.readAllLines(path, StandardCharsets.UTF_8);
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    RootLexicon lexicon = morphology.getLexicon();
    // Attribute markers removed from the item text, applied in this order.
    String[] markers = {
        "[P:Noun]",
        "[P:Noun, Prop]",
        "P:Noun; ",
        "P:Noun, Prop; ",
        "P:Verb; ",
        "[A:Voicing]"
    };
    List<String> single = new ArrayList<>();
    for (String s : obsolete) {
        // Copy before appending: the list returned by getMatchingItems is owned by the lexicon
        // (it may be an internal or unmodifiable list), so it must not be mutated in place.
        List<DictionaryItem> items = new ArrayList<>(lexicon.getMatchingItems(s));
        items.addAll(lexicon.getMatchingItems(TurkishAlphabet.INSTANCE.normalizeCircumflex(s)));
        Log.info("%s = %s", s, items);
        if (items.size() == 1) {
            String line = items.get(0).toString();
            for (String marker : markers) {
                line = line.replace(marker, "").trim();
            }
            // Collapse whitespace runs left behind by the removals.
            single.add(line.replaceAll("\\s+", " ").trim());
        }
    }
    Path pathSingle = Paths.get("../data/vocabulary/words-with-circumflex-obsolete-single.txt");
    Files.write(pathSingle, single, StandardCharsets.UTF_8);
}
Also used : Path(java.nio.file.Path) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) ArrayList(java.util.ArrayList) RootLexicon(zemberek.morphology.lexicon.RootLexicon) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

RootLexicon (zemberek.morphology.lexicon.RootLexicon)21 DictionaryItem (zemberek.morphology.lexicon.DictionaryItem)12 Test (org.junit.Test)10 ArrayList (java.util.ArrayList)9 TurkishDictionaryLoader (zemberek.morphology.lexicon.tr.TurkishDictionaryLoader)6 File (java.io.File)4 LinkedHashSet (java.util.LinkedHashSet)4 Stopwatch (com.google.common.base.Stopwatch)3 HashSet (java.util.HashSet)3 Ignore (org.junit.Ignore)3 TurkishMorphology (zemberek.morphology.TurkishMorphology)3 DynamicLexiconGraph (zemberek.morphology.lexicon.graph.DynamicLexiconGraph)3 TurkishMorphotactics (zemberek.morphology.morphotactics.TurkishMorphotactics)3 Path (java.nio.file.Path)2 List (java.util.List)2 Set (java.util.Set)2 ExecutorCompletionService (java.util.concurrent.ExecutorCompletionService)2 ExecutorService (java.util.concurrent.ExecutorService)2 Collectors (java.util.stream.Collectors)2 Assert (org.junit.Assert)2