Search in sources :

Example 16 with TurkishDictionaryLoader

use of zemberek.morphology.lexicon.tr.TurkishDictionaryLoader in project zemberek-nlp by ahmetaa.

the class TurkishDictionaryLoaderTest method getLastItem.

/**
 * Loads the given dictionary lines and returns the first item that matches
 * the key taken from the last line.
 *
 * <p>The key is whatever {@code Strings.subStringUntilFirst} yields for the
 * last line and a single space (presumably the text before the first space —
 * confirm against that helper).
 *
 * @param itemStr dictionary lines; the last element supplies the lookup key
 * @return the first entry of the matching items for that key
 */
public DictionaryItem getLastItem(String... itemStr) {
    TurkishDictionaryLoader dictionaryLoader = new TurkishDictionaryLoader();
    int lastIndex = itemStr.length - 1;
    String lookupKey = Strings.subStringUntilFirst(itemStr[lastIndex], " ");
    return dictionaryLoader
        .load(itemStr)
        .getMatchingItems(lookupKey)
        .get(0);
}
Also used : TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader)

Example 17 with TurkishDictionaryLoader

use of zemberek.morphology.lexicon.tr.TurkishDictionaryLoader in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method guessRootsWithHeuristics.

/**
 * Utility script (not a real test) that guesses root candidates for words
 * read from {@code out/no-parse-zemberek-freq.txt}.
 *
 * <p>For every word of at least four characters that contains a vowel, each
 * prefix of length three or more is tried as a root: the prefix is temporarily
 * added to the analyzer as a default item, as a verb, and (when the next letter
 * is a vowel and the prefix ends in b/c/ğ/d) as its p/ç/k/t variant. If the
 * word then has a non-unknown analysis, the (prefix, word) pair is recorded.
 *
 * <p>Roots that explain at least two words are written to
 * {@code out/root-candidates-words} (root plus its sorted words) and
 * {@code out/root-candidates-vocabulary} (root only).
 *
 * @throws IOException if the input cannot be read or the outputs cannot be written
 */
@Test
@Ignore("Not a Test.")
public void guessRootsWithHeuristics() throws IOException {
    Path wordFreqFile = DATA_PATH.resolve("out/no-parse-zemberek-freq.txt");
    Log.info("Loading histogram.");
    List<String> words = Files.readAllLines(wordFreqFile);
    TurkishMorphology morphology = TurkishMorphology.builder().setLexicon("elma").disableCache().build();
    Multimap<String, String> res = HashMultimap.create(100000, 3);
    int c = 0;
    for (String s : words) {
        if (s.length() < 4) {
            continue;
        }
        if (!TurkishAlphabet.INSTANCE.containsVowel(s)) {
            continue;
        }
        for (int i = 2; i < s.length(); i++) {
            String candidateRoot = s.substring(0, i + 1);
            if (!TurkishAlphabet.INSTANCE.containsVowel(candidateRoot)) {
                continue;
            }
            List<DictionaryItem> items = new ArrayList<>(3);
            // No explicit POS tag: loader default (the original author assumed noun).
            items.add(TurkishDictionaryLoader.loadFromString(candidateRoot));
            // Same root, explicitly tagged as a verb.
            items.add(TurkishDictionaryLoader.loadFromString(candidateRoot + " [P:Verb]"));
            char last = candidateRoot.charAt(candidateRoot.length() - 1);
            if (i < s.length() - 1) {
                char next = s.charAt(candidateRoot.length());
                if (Turkish.Alphabet.isVowel(next)) {
                    // The root is followed by a vowel: also try the variant whose
                    // final b/c/ğ/d is replaced by p/ç/k/t.
                    String stem = candidateRoot.substring(0, candidateRoot.length() - 1);
                    String f = "";
                    if (last == 'b') {
                        f = stem + 'p';
                    } else if (last == 'c') {
                        f = stem + 'ç';
                    } else if (last == 'ğ') {
                        f = stem + 'k';
                    } else if (last == 'd') {
                        f = stem + 't';
                    }
                    if (f.length() > 0) {
                        items.add(TurkishDictionaryLoader.loadFromString(f));
                    }
                }
            }
            for (DictionaryItem item : items) {
                // Add the candidate only for the duration of this analysis.
                morphology.getMorphotactics().getStemTransitions().addDictionaryItem(item);
                WordAnalysis analyze = morphology.analyze(s);
                for (SingleAnalysis wordAnalysis : analyze) {
                    if (!wordAnalysis.isUnknown()) {
                        res.put(candidateRoot, s);
                    }
                }
                morphology.getMorphotactics().getStemTransitions().removeDictionaryItem(item);
            }
        }
        if (++c % 10000 == 0) {
            Log.info(c);
        }
        // Hard cap on the number of processed words.
        if (c == 100000) {
            break;
        }
    }
    Log.info("Writing.");
    // Write UTF-8 explicitly: the input is read as UTF-8 by Files.readAllLines and
    // contains Turkish letters that a platform default charset may not encode.
    try (PrintWriter pw1 = new PrintWriter(DATA_PATH.resolve("out/root-candidates-words").toFile(), "UTF-8");
        PrintWriter pw2 = new PrintWriter(DATA_PATH.resolve("out/root-candidates-vocabulary").toFile(), "UTF-8")) {
        for (String root : res.keySet()) {
            Collection<String> vals = res.get(root);
            // A root supported by a single word is not reported.
            if (vals.size() < 2) {
                continue;
            }
            List<String> wl = new ArrayList<>(vals);
            wl.sort(turkishCollator::compare);
            // Print the sorted copy; the multimap's own value order is unspecified.
            pw1.println(root + " : " + String.join(", ", wl));
            pw2.println(root);
        }
    }
}
Also used : Path(java.nio.file.Path) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) PrintWriter(java.io.PrintWriter) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

TurkishDictionaryLoader (zemberek.morphology.lexicon.tr.TurkishDictionaryLoader): 17, Test (org.junit.Test): 9, DynamicLexiconGraph (zemberek.morphology.lexicon.graph.DynamicLexiconGraph): 5, File (java.io.File): 4, ArrayList (java.util.ArrayList): 4, RootLexicon (zemberek.morphology.lexicon.RootLexicon): 4, Ignore (org.junit.Ignore): 2, DictionaryItem (zemberek.morphology.lexicon.DictionaryItem): 2, SuffixProvider (zemberek.morphology.lexicon.SuffixProvider): 2, PrintWriter (java.io.PrintWriter): 1, Path (java.nio.file.Path): 1, HashSet (java.util.HashSet): 1, Locale (java.util.Locale): 1, TurkishAlphabet (zemberek.core.turkish.TurkishAlphabet): 1, TurkishLetterSequence (zemberek.core.turkish.TurkishLetterSequence): 1, TurkishMorphology (zemberek.morphology.TurkishMorphology): 1, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 1, WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 1, TurkishSuffixes (zemberek.morphology.lexicon.tr.TurkishSuffixes): 1, WordParser (zemberek.morphology.parser.WordParser): 1