Search in sources:

Example 11 with RootLexicon

use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.

the class Syllables method getTwoConsonantStartWords.

/**
 * Collects lemmas of the default dictionaries whose first two characters are both
 * non-vowels, groups them by that two-character prefix and writes three report files
 * (relative paths).
 *
 * <p>Items are skipped when they carry {@code RootAttribute.Dummy}, when their secondary
 * POS is {@code SecondaryPos.Abbreviation}, or when the lemma has fewer than 4 characters
 * or fewer than 2 vowels.
 *
 * <ul>
 *   <li>{@code two-consonant-words}: the prefixes, ordered by descending lemma count.</li>
 *   <li>{@code two-consonant-words.all}: one line per prefix, the prefix followed by a
 *       space and its lemmas joined with commas.</li>
 *   <li>{@code accepted-syllable-prefixes}: lower-cased, de-duplicated prefixes sorted
 *       with {@code Turkish.STRING_COMPARATOR_ASC}. A prefix that starts with an upper
 *       case letter is accepted only when it has more than 3 lemmas.</li>
 * </ul>
 *
 * @throws IOException declared for dictionary loading and for writing the report files
 */
public static void getTwoConsonantStartWords() throws IOException {
    RootLexicon lexicon = TurkishDictionaryLoader.loadDefaultDictionaries();
    HashMultimap<String, String> lemmasByPrefix = HashMultimap.create();

    for (DictionaryItem item : lexicon) {
        String lemma = item.lemma;
        boolean excluded = item.attributes.contains(RootAttribute.Dummy)
            || item.secondaryPos == SecondaryPos.Abbreviation
            || lemma.length() < 4
            || TurkishAlphabet.INSTANCE.vowelCount(lemma) < 2;
        if (excluded) {
            continue;
        }
        // Only lemmas that open with two consecutive non-vowels are of interest.
        if (TurkishAlphabet.INSTANCE.isVowel(lemma.charAt(0))
            || TurkishAlphabet.INSTANCE.isVowel(lemma.charAt(1))) {
            continue;
        }
        lemmasByPrefix.put(lemma.substring(0, 2), lemma);
    }

    // Most populated prefixes first.
    List<String> prefixes = new ArrayList<>(lemmasByPrefix.keySet());
    prefixes.sort((a, b) ->
        Integer.compare(lemmasByPrefix.get(b).size(), lemmasByPrefix.get(a).size()));

    List<String> detailLines = new ArrayList<>();
    // Keys are always exactly two characters long, so they can be lower-cased directly.
    LinkedHashSet<String> uniqueAccepted = new LinkedHashSet<>();
    for (String prefix : prefixes) {
        detailLines.add(prefix + " " + String.join(",", lemmasByPrefix.get(prefix)));
        boolean capitalized = Character.isUpperCase(prefix.charAt(0));
        // Capitalized prefixes need more than 3 lemmas to be accepted.
        if (!capitalized || lemmasByPrefix.get(prefix).size() > 3) {
            uniqueAccepted.add(prefix.toLowerCase(Turkish.LOCALE));
        }
    }

    List<String> acceptedPrefixes = new ArrayList<>(uniqueAccepted);
    acceptedPrefixes.sort(Turkish.STRING_COMPARATOR_ASC);

    Files.write(Paths.get("two-consonant-words"), prefixes);
    Files.write(Paths.get("two-consonant-words.all"), detailLines);
    Files.write(Paths.get("accepted-syllable-prefixes"), acceptedPrefixes);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) ArrayList(java.util.ArrayList) RootLexicon(zemberek.morphology.lexicon.RootLexicon)

Example 12 with RootLexicon

use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.

the class SpeedTest method testNewsCorpusNoCache.

/**
 * Speed measurement (ignored by default): analyzes every non-punctuation token of the
 * {@code cnn-turk-10k} corpus with an {@code InterpretingAnalyzer} built on the default
 * dictionaries, then logs elapsed time, parse ratio and throughput.
 *
 * <p>Tokens for which the analyzer returns no result are counted in a histogram that is
 * saved to {@code unk.freq} (sorted by count) and {@code unk} (sorted by key).
 *
 * @throws IOException declared for corpus/dictionary loading and for saving the histograms
 */
@Test
@Ignore(value = "Speed Test.")
public void testNewsCorpusNoCache() throws IOException {
    List<String> sentences = getSentences(Paths.get("src/main/resources/corpora/cnn-turk-10k"));
    InterpretingAnalyzer analyzer =
        new InterpretingAnalyzer(TurkishDictionaryLoader.loadDefaultDictionaries());
    Stopwatch stopwatch = Stopwatch.createStarted();

    int analyzedTokens = 0;
    int unparsedTokens = 0;
    int processedSentences = 0;
    Histogram<String> unknownWords = new Histogram<>(100000);

    for (String sentence : sentences) {
        for (Token token : TurkishTokenizer.DEFAULT.tokenize(sentence)) {
            // Punctuation is neither counted nor analyzed.
            if (token.getType() != TurkishLexer.Punctuation) {
                analyzedTokens++;
                String word = token.getText();
                List<_SingleAnalysis> analyses = analyzer.analyze(word);
                if (analyses.isEmpty()) {
                    unparsedTokens++;
                    unknownWords.add(word);
                }
            }
        }
        processedSentences++;
        // Progress report every 2000 sentences.
        if (processedSentences % 2000 == 0) {
            Log.info("%d tokens analyzed.", analyzedTokens);
        }
    }

    double seconds = stopwatch.stop().elapsed(TimeUnit.MILLISECONDS) / 1000d;
    double tokensPerSecond = analyzedTokens / seconds;
    double parseRatio = 100 - (unparsedTokens * 100d / analyzedTokens);

    Log.info("%nElapsed = %.2f seconds", seconds);
    Log.info("%nToken Count (No Punc) = %d %nParse Ratio = %.4f%nSpeed = %.2f tokens/sec%n", analyzedTokens, parseRatio, tokensPerSecond);
    Log.info("Saving Unknown Tokens");
    unknownWords.saveSortedByCounts(Paths.get("unk.freq"), " ");
    unknownWords.saveSortedByKeys(Paths.get("unk"), " ", Turkish.STRING_COMPARATOR_ASC);
}
Also used : Path(java.nio.file.Path) Histogram(zemberek.core.collections.Histogram) Stopwatch(com.google.common.base.Stopwatch) Token(org.antlr.v4.runtime.Token) RootLexicon(zemberek.morphology.lexicon.RootLexicon) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 13 with RootLexicon

use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.

the class ParseConsole method addTextDictionaryResources.

/**
 * Builds a {@code RootLexicon} from the given text dictionary resources.
 *
 * <p>Each name is resolved with {@code Resources.getResource} and read as UTF-8. The
 * lines of all resources are concatenated in argument order and handed to a single
 * {@code TurkishDictionaryLoader.load} call.
 *
 * @param resources names of the dictionary resources to read
 * @return a new lexicon holding the items loaded from all resources
 * @throws IOException if a resource cannot be read
 */
public static RootLexicon addTextDictionaryResources(String... resources) throws IOException {
    RootLexicon result = new RootLexicon();
    Log.info("Dictionaries :%s", String.join(", ", resources));

    List<String> dictionaryLines = new ArrayList<>();
    for (int i = 0; i < resources.length; i++) {
        dictionaryLines.addAll(
            Resources.readLines(Resources.getResource(resources[i]), Charsets.UTF_8));
    }

    TurkishDictionaryLoader loader = new TurkishDictionaryLoader();
    result.addAll(loader.load(dictionaryLines));
    Log.info("Lexicon Generated.");
    return result;
}
Also used : TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) ArrayList(java.util.ArrayList) RootLexicon(zemberek.morphology.lexicon.RootLexicon)

Example 14 with RootLexicon

use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.

the class WordAnalyzerFunctionalTest method getLexiconGraph.

/**
 * Loads every given dictionary file into one {@code RootLexicon} and builds a
 * {@code DynamicLexiconGraph} from it, using the {@code suffixes} provider of this class.
 *
 * @param dictionaries dictionary files to load, in order
 * @return a graph containing the dictionary items of all files
 * @throws IOException declared for dictionary loading
 */
private DynamicLexiconGraph getLexiconGraph(File... dictionaries) throws IOException {
    RootLexicon combined = new RootLexicon();
    for (int i = 0; i < dictionaries.length; i++) {
        // A fresh loader is used for each file, as in the original flow.
        new TurkishDictionaryLoader().loadInto(combined, dictionaries[i]);
    }
    SuffixProvider provider = suffixes;
    DynamicLexiconGraph lexiconGraph = new DynamicLexiconGraph(provider);
    lexiconGraph.addDictionaryItems(combined);
    return lexiconGraph;
}
Also used : SuffixProvider(zemberek.morphology.lexicon.SuffixProvider) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) RootLexicon(zemberek.morphology.lexicon.RootLexicon) File(java.io.File) DynamicLexiconGraph(zemberek.morphology.lexicon.graph.DynamicLexiconGraph)

Example 15 with RootLexicon

use of zemberek.morphology.lexicon.RootLexicon in project zemberek-nlp by ahmetaa.

the class SimpleGeneratorTest method getLexicon.

/**
 * Builds a lexicon graph from the {@code dev-lexicon.txt} resource.
 *
 * <p>The resource URL is turned into a {@link File} through its URI instead of
 * {@code URL.getFile()}. {@code getFile()} returns the path still percent-encoded, so a
 * resource located under a directory containing spaces or non-ASCII characters would
 * resolve to a file that does not exist.
 *
 * @return a graph created with {@code suffixProvider} and filled with the loaded items
 * @throws IOException if the resource URL cannot be converted to a URI, or as declared by
 *     the dictionary loader
 */
private DynamicLexiconGraph getLexicon() throws IOException {
    File lexiconFile;
    try {
        lexiconFile = new File(Resources.getResource("dev-lexicon.txt").toURI());
    } catch (java.net.URISyntaxException e) {
        // Keep the method's checked-exception contract; preserve the cause.
        throw new IOException("Cannot resolve dev-lexicon.txt to a file", e);
    }
    RootLexicon items = new TurkishDictionaryLoader().load(lexiconFile);
    DynamicLexiconGraph graph = new DynamicLexiconGraph(suffixProvider);
    graph.addDictionaryItems(items);
    return graph;
}
Also used : TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) RootLexicon(zemberek.morphology.lexicon.RootLexicon) File(java.io.File) DynamicLexiconGraph(zemberek.morphology.lexicon.graph.DynamicLexiconGraph)

Aggregations

- RootLexicon (zemberek.morphology.lexicon.RootLexicon): 21
- DictionaryItem (zemberek.morphology.lexicon.DictionaryItem): 12
- Test (org.junit.Test): 10
- ArrayList (java.util.ArrayList): 9
- TurkishDictionaryLoader (zemberek.morphology.lexicon.tr.TurkishDictionaryLoader): 6
- File (java.io.File): 4
- LinkedHashSet (java.util.LinkedHashSet): 4
- Stopwatch (com.google.common.base.Stopwatch): 3
- HashSet (java.util.HashSet): 3
- Ignore (org.junit.Ignore): 3
- TurkishMorphology (zemberek.morphology.TurkishMorphology): 3
- DynamicLexiconGraph (zemberek.morphology.lexicon.graph.DynamicLexiconGraph): 3
- TurkishMorphotactics (zemberek.morphology.morphotactics.TurkishMorphotactics): 3
- Path (java.nio.file.Path): 2
- List (java.util.List): 2
- Set (java.util.Set): 2
- ExecutorCompletionService (java.util.concurrent.ExecutorCompletionService): 2
- ExecutorService (java.util.concurrent.ExecutorService): 2
- Collectors (java.util.stream.Collectors): 2
- Assert (org.junit.Assert): 2