Search in sources:

Example 1 with TurkishDictionaryLoader

use of zemberek.morphology.lexicon.tr.TurkishDictionaryLoader in project lucene-solr-analysis-turkish by iorixxx.

the class Zemberek3StemFilterFactory method inform.

/**
 * Initializes {@code parser} once resources become available.
 *
 * <p>When {@code dictionaryFiles} is null or blank, or when the listed files yield no
 * lines at all, the parser produced by
 * {@code TurkishWordParserGenerator.createWithDefaults()} is used. Otherwise the lines
 * collected from every listed file are loaded into a {@link RootLexicon} and a
 * {@link WordParser} is built on top of a {@link DynamicLexiconGraph} holding them.
 *
 * @param loader resource loader passed to {@code getLines} for reading each dictionary file
 * @throws IOException declared for failures while reading the dictionary files
 */
@Override
public void inform(ResourceLoader loader) throws IOException {
    final boolean filesConfigured = dictionaryFiles != null && !dictionaryFiles.trim().isEmpty();

    // Gather the content of every configured dictionary file into one list.
    final List<String> entries = new ArrayList<>();
    if (filesConfigured) {
        for (String fileName : splitFileNames(dictionaryFiles)) {
            entries.addAll(getLines(loader, fileName.trim()));
        }
    }

    if (entries.isEmpty()) {
        // Nothing configured or nothing read: use default dictionaries shipped with Zemberek3.
        this.parser = TurkishWordParserGenerator.createWithDefaults().getParser();
        return;
    }

    // Build a parser backed only by the user-supplied dictionary lines.
    final SuffixProvider suffixes = new TurkishSuffixes();
    final RootLexicon customLexicon = new TurkishDictionaryLoader(suffixes).load(entries);
    final DynamicLexiconGraph lexiconGraph = new DynamicLexiconGraph(suffixes);
    lexiconGraph.addDictionaryItems(customLexicon);
    this.parser = new WordParser(lexiconGraph);
}
Also used : SuffixProvider(zemberek.morphology.lexicon.SuffixProvider) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) TurkishSuffixes(zemberek.morphology.lexicon.tr.TurkishSuffixes) ArrayList(java.util.ArrayList) RootLexicon(zemberek.morphology.lexicon.RootLexicon) DynamicLexiconGraph(zemberek.morphology.lexicon.graph.DynamicLexiconGraph) WordParser(zemberek.morphology.parser.WordParser)

Example 2 with TurkishDictionaryLoader

use of zemberek.morphology.lexicon.tr.TurkishDictionaryLoader in project zemberek-nlp by ahmetaa.

the class TurkishDictionaryLoaderTest method masterDictionaryLoadTest.

/**
 * Manual check (ignored by default) that loads the master dictionary and cross-checks
 * some of its adjective lines against HTML files under a hard-coded local directory.
 *
 * <p>For every non-comment dictionary line that contains "Adj" (but not "Compound" or
 * "PropNoun"), whose word has more than one vowel, ends in a stop consonant, carries no
 * "Vo"/"VowDrop" marker and is not among the lemmas flagged {@code NoVoicing}, the
 * matching HTML file is read. The line is printed when that file does not contain
 * {@code "color=DarkBlue>-"} followed by the mapped counterpart of the word's last
 * letter (k→ğ, p→b, ç→c, t→d). Finally the lexicon size is printed.
 *
 * @throws IOException declared for failures while reading the dictionary or HTML files
 */
@Test
@Ignore("Not a unit Test. Only loads the master dictionary.")
public void masterDictionaryLoadTest() throws IOException {
    TurkishDictionaryLoader dictionaryLoader = new TurkishDictionaryLoader();
    File lexiconFile = new File(Resources.getResource("tr/master-dictionary.dict").getFile());
    RootLexicon lexicon = dictionaryLoader.load(lexiconFile);
    TurkishAlphabet turkishAlphabet = TurkishAlphabet.INSTANCE;

    // Lemmas of all items whose attributes contain NoVoicing.
    Set<String> noVoicingLemmas = new HashSet<>();
    for (DictionaryItem entry : lexicon) {
        if (entry.attributes.contains(NoVoicing)) {
            noVoicingLemmas.add(entry.lemma);
        }
    }

    Locale turkish = new Locale("tr");
    File rawFile = new File(Resources.getResource("tr/master-dictionary.dict").getFile());
    List<String> rawLines = SimpleTextReader.trimmingUTF8Reader(rawFile).asStringList();
    final String htmlDir = "/home/afsina/data/tdk/html";

    for (String raw : rawLines) {
        if (raw.startsWith("#")) {
            continue;
        }
        // First space-delimited token, lower-cased with the Turkish locale, minus '-' and '\''.
        String firstToken = Strings.subStringUntilFirst(raw.trim(), " ");
        String word = firstToken.toLowerCase(turkish).replaceAll("[\\-']", "");

        boolean plainAdjective = raw.contains("Adj") && !raw.contains("Compound") && !raw.contains("PropNoun");
        if (!plainAdjective) {
            continue;
        }

        TurkishLetterSequence letters = new TurkishLetterSequence(word, turkishAlphabet);
        boolean candidate = letters.vowelCount() > 1
                && letters.lastLetter().isStopConsonant()
                && !raw.contains("Vo")
                && !raw.contains("VowDrop");
        if (!candidate || noVoicingLemmas.contains(word)) {
            continue;
        }

        File page = new File(htmlDir, word + ".html");
        if (!page.exists()) {
            // Retry with the circumflexed letters replaced by their plain forms.
            String simplified = word.replaceAll("â", "a").replaceAll("\\u00ee", "i");
            page = new File(htmlDir, simplified + ".html");
        }
        if (!page.exists()) {
            System.out.println("Cannot find:" + raw);
            continue;
        }

        // Map the final consonant to its counterpart; unmapped letters are reported and kept.
        char last = word.charAt(word.length() - 1);
        char counterpart;
        if (last == 'k') {
            counterpart = 'ğ';
        } else if (last == 'p') {
            counterpart = 'b';
        } else if (last == 'ç') {
            counterpart = 'c';
        } else if (last == 't') {
            counterpart = 'd';
        } else {
            counterpart = last;
            System.out.println("crap:" + raw);
        }

        String html = SimpleTextReader.trimmingUTF8Reader(page).asString();
        boolean mentionsCounterpart = html.contains("color=DarkBlue>-" + counterpart);
        if (!mentionsCounterpart) {
            System.out.println(raw);
        }
    }

    for (DictionaryItem entry : lexicon) {
        boolean nounOrAdjective = entry.primaryPos == Noun || entry.primaryPos == PrimaryPos.Adjective;
        if (nounOrAdjective && entry.secondaryPos != SecondaryPos.ProperNoun && entry.hasAttribute(RootAttribute.Voicing)) {
            // No action is taken for matching items.
        }
    }
    System.out.println(lexicon.size());
}
Also used : Locale(java.util.Locale) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) TurkishLetterSequence(zemberek.core.turkish.TurkishLetterSequence) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) File(java.io.File) HashSet(java.util.HashSet) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 3 with TurkishDictionaryLoader

use of zemberek.morphology.lexicon.tr.TurkishDictionaryLoader in project zemberek-nlp by ahmetaa.

the class TurkishDictionaryLoaderTest method loadNounsFromFileTest.

/**
 * Loads the {@code test-lexicon-nouns.txt} resource and verifies that the resulting
 * lexicon is non-empty and that every item in it has {@code Noun} as its primary POS.
 *
 * @throws IOException declared for failures while reading the lexicon file
 */
@Test
public void loadNounsFromFileTest() throws IOException {
    TurkishDictionaryLoader loader = new TurkishDictionaryLoader();
    File nounLexiconFile = new File(Resources.getResource("test-lexicon-nouns.txt").getFile());
    RootLexicon items = loader.load(nounLexiconFile);
    Assert.assertFalse(items.isEmpty());
    for (DictionaryItem item : items) {
        // assertEquals reports expected vs. actual on failure, unlike assertTrue(a == b).
        Assert.assertEquals(Noun, item.primaryPos);
    }
}
Also used : TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) File(java.io.File) Test(org.junit.Test)

Example 4 with TurkishDictionaryLoader

use of zemberek.morphology.lexicon.tr.TurkishDictionaryLoader in project zemberek-nlp by ahmetaa.

the class TurkishDictionaryLoaderTest method properNounsShouldNotHaveVoicingAutomaticallyTest.

/**
 * Verifies that capitalized inputs ("Tokat", "Dink") are loaded with a lower-cased root,
 * {@code Noun} primary POS and {@code ProperNoun} secondary POS, and that they do not
 * carry the {@code Voicing} root attribute.
 */
@Test
public void properNounsShouldNotHaveVoicingAutomaticallyTest() {
    TurkishDictionaryLoader dictionaryLoader = new TurkishDictionaryLoader();
    // Each row: { input line, expected root }.
    String[][] cases = {
        { "Tokat", "tokat" },
        { "Dink", "dink" },
    };
    for (String[] testCase : cases) {
        DictionaryItem loaded = dictionaryLoader.loadFromString(testCase[0]);
        Assert.assertEquals(testCase[1], loaded.root);
        Assert.assertEquals(Noun, loaded.primaryPos);
        Assert.assertEquals(SecondaryPos.ProperNoun, loaded.secondaryPos);
        Assert.assertFalse(loaded.hasAttribute(RootAttribute.Voicing));
    }
}
Also used : TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) Test(org.junit.Test)

Example 5 with TurkishDictionaryLoader

use of zemberek.morphology.lexicon.tr.TurkishDictionaryLoader in project zemberek-nlp by ahmetaa.

the class WordAnalyzerTest method getItems.

/**
 * Converts dictionary lines into dictionary items, one item per line, in input order.
 *
 * @param lines dictionary lines, each passed to {@code TurkishDictionaryLoader.loadFromString}
 * @return a new list holding the item loaded from each line
 */
private List<DictionaryItem> getItems(String[] lines) {
    TurkishDictionaryLoader dictionaryLoader = new TurkishDictionaryLoader();
    List<DictionaryItem> loadedItems = new ArrayList<>();
    for (int i = 0; i < lines.length; i++) {
        DictionaryItem loaded = dictionaryLoader.loadFromString(lines[i]);
        loadedItems.add(loaded);
    }
    return loadedItems;
}
Also used : DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) ArrayList(java.util.ArrayList)

Aggregations

TurkishDictionaryLoader (zemberek.morphology.lexicon.tr.TurkishDictionaryLoader): 17, Test (org.junit.Test): 9, DynamicLexiconGraph (zemberek.morphology.lexicon.graph.DynamicLexiconGraph): 5, File (java.io.File): 4, ArrayList (java.util.ArrayList): 4, RootLexicon (zemberek.morphology.lexicon.RootLexicon): 4, Ignore (org.junit.Ignore): 2, DictionaryItem (zemberek.morphology.lexicon.DictionaryItem): 2, SuffixProvider (zemberek.morphology.lexicon.SuffixProvider): 2, PrintWriter (java.io.PrintWriter): 1, Path (java.nio.file.Path): 1, HashSet (java.util.HashSet): 1, Locale (java.util.Locale): 1, TurkishAlphabet (zemberek.core.turkish.TurkishAlphabet): 1, TurkishLetterSequence (zemberek.core.turkish.TurkishLetterSequence): 1, TurkishMorphology (zemberek.morphology.TurkishMorphology): 1, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 1, WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 1, TurkishSuffixes (zemberek.morphology.lexicon.tr.TurkishSuffixes): 1, WordParser (zemberek.morphology.parser.WordParser): 1