Search in sources :

Example 1 with TurkishAlphabet

use of zemberek.core.turkish.TurkishAlphabet in project zemberek-nlp by ahmetaa.

In class TurkishDictionaryLoaderTest, method masterDictionaryLoadTest:

/**
 * Manual consistency check over the master dictionary; not a real unit test (hence
 * {@code @Ignore}).
 *
 * <p>Loads {@code tr/master-dictionary.dict} into a lexicon and collects the lemmas flagged
 * {@code NoVoicing}. It then re-reads the same file line by line and, for every adjective line
 * (not compound, not proper noun) whose word has more than one vowel, ends in a stop consonant,
 * carries no "Vo..." marker and is not in the {@code NoVoicing} set, looks up an HTML page named
 * after the word in a hard-coded local directory. Lines whose page does not contain the voiced
 * counterpart of the final consonant are printed. Finally the lexicon size is printed.
 *
 * @throws IOException if the dictionary or one of the HTML pages cannot be read
 */
@Test
@Ignore("Not a unit Test. Only loads the master dictionary.")
public void masterDictionaryLoadTest() throws IOException {
    // Resolve the resource once: the same file is parsed by the loader and re-read as raw lines.
    File masterDictionary = new File(Resources.getResource("tr/master-dictionary.dict").getFile());
    RootLexicon items = new TurkishDictionaryLoader().load(masterDictionary);
    TurkishAlphabet alphabet = TurkishAlphabet.INSTANCE;

    Set<String> masterVoicing = new HashSet<>();
    for (DictionaryItem item : items) {
        if (item.attributes.contains(NoVoicing)) {
            masterVoicing.add(item.lemma);
        }
    }

    // NOTE(review): machine-specific directory; the check only works where these pages exist.
    String htmlDir = "/home/afsina/data/tdk/html";
    Locale tr = new Locale("tr");
    List<String> allZ2 = SimpleTextReader.trimmingUTF8Reader(masterDictionary).asStringList();
    for (String s : allZ2) {
        if (s.startsWith("#")) {
            continue;
        }
        // First token of the line, lower-cased with Turkish rules, without '-' and apostrophes.
        String clean = Strings.subStringUntilFirst(s.trim(), " ").toLowerCase(tr).replaceAll("[\\-']", "");
        if (!s.contains("Adj") || s.contains("Compound") || s.contains("PropNoun")) {
            continue;
        }
        TurkishLetterSequence seq = new TurkishLetterSequence(clean, alphabet);
        // "Vo" also matches "VowDrop", so a single substring test covers both markers.
        if (seq.vowelCount() <= 1 || !seq.lastLetter().isStopConsonant() || s.contains("Vo")) {
            continue;
        }
        if (masterVoicing.contains(clean)) {
            continue;
        }

        File f = new File(htmlDir, clean + ".html");
        if (!f.exists()) {
            // Retry with the circumflexed vowels replaced by their plain forms.
            f = new File(htmlDir, clean.replaceAll("â", "a").replaceAll("\\u00ee", "i") + ".html");
        }
        if (!f.exists()) {
            System.out.println("Cannot find:" + s);
            continue;
        }

        // Map the final consonant to its voiced counterpart; any other letter is reported
        // and then checked unchanged.
        char c = clean.charAt(clean.length() - 1);
        char vv = c;
        switch (c) {
            case 'k':
                vv = 'ğ';
                break;
            case 'p':
                vv = 'b';
                break;
            case 'ç':
                vv = 'c';
                break;
            case 't':
                vv = 'd';
                break;
            default:
                System.out.println("crap:" + s);
        }
        String content = SimpleTextReader.trimmingUTF8Reader(f).asString();
        if (!content.contains("color=DarkBlue>-" + vv)) {
            System.out.println(s);
        }
    }
    System.out.println(items.size());
}
Also used : Locale(java.util.Locale) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) TurkishLetterSequence(zemberek.core.turkish.TurkishLetterSequence) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) File(java.io.File) HashSet(java.util.HashSet) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 2 with TurkishAlphabet

use of zemberek.core.turkish.TurkishAlphabet in project zemberek-nlp by ahmetaa.

In class UnidentifiedTokenAnalyzer, method tryWithoutApostrophe:

/**
 * Analyzes {@code word} after normalizing it, temporarily registering a noun dictionary item
 * for the normalized form when the lexicon does not already contain one.
 *
 * <p>Foreign diacritics are converted to Turkish letters first (when present), then the result
 * is normalized by the alphabet. If the guessed pronunciation contains no vowel, a single dummy
 * analysis for the original word is returned and the analyzer is not consulted.
 *
 * @param word the surface form to analyze
 * @param secondaryPos secondary POS assigned to the generated item; {@code ProperNoun} and
 *     {@code Abbreviation} cause the item's lemma to be capitalized
 * @return the analyses of the normalized word, or a one-element list holding a dummy analysis
 */
private List<SingleAnalysis> tryWithoutApostrophe(String word, SecondaryPos secondaryPos) {
    TurkishAlphabet alphabet = TurkishAlphabet.INSTANCE;
    String withoutDiacritics = alphabet.containsForeignDiacritics(word)
        ? alphabet.foreignDiacriticsToTurkish(word)
        : null;
    String normalized = alphabet.normalize(withoutDiacritics == null ? word : withoutDiacritics);
    boolean capitalize = secondaryPos == SecondaryPos.ProperNoun || secondaryPos == SecondaryPos.Abbreviation;
    // TODO: should we remove dots with normalization?
    String pronunciation = guessPronunciation(normalized.replaceAll("[.]", ""));
    DictionaryItem item = new DictionaryItem(capitalize ? Turkish.capitalize(normalized) : normalized, normalized, pronunciation, PrimaryPos.Noun, secondaryPos);
    if (!alphabet.containsVowel(pronunciation)) {
        List<SingleAnalysis> result = new ArrayList<>(1);
        result.add(SingleAnalysis.dummy(word, item));
        return result;
    }
    if (lexicon.containsItem(item)) {
        return analyzer.analyze(normalized);
    }
    // The item is only registered for the duration of this analysis.
    item.attributes.add(RootAttribute.Runtime);
    analyzer.getStemTransitions().addDictionaryItem(item);
    try {
        return analyzer.analyze(normalized);
    } finally {
        // Remove the temporary item even when analysis throws, so it cannot leak
        // into the stem transitions and affect later analyses.
        analyzer.getStemTransitions().removeDictionaryItem(item);
    }
}
Also used : DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) ArrayList(java.util.ArrayList)

Example 3 with TurkishAlphabet

use of zemberek.core.turkish.TurkishAlphabet in project zemberek-nlp by ahmetaa.

In class OflazerAnalyzerRunner, method extractRootsFromParse:

/**
 * Collects the distinct root words found in a tab-separated parse list and writes them,
 * sorted with {@code ctr}, to {@code output} one per line.
 *
 * <p>The second tab-separated column of every input line is used. When it contains
 * {@code '+'}, the text before the first {@code '+'} is the root and the next segment is its
 * part of speech; verb roots get the infinitive suffix "mek" or "mak" depending on whether
 * their last vowel is frontal.
 *
 * @param input parse list to read
 * @param output file receiving the sorted roots
 * @throws IOException if reading or writing fails
 */
public static void extractRootsFromParse(File input, File output) throws IOException {
    System.out.println("Extracting root words from parse list");
    TurkishAlphabet alphabet = TurkishAlphabet.INSTANCE;
    List<String> lines = SimpleTextReader.trimmingUTF8Reader(input).asStringList();
    Set<String> uniqueRoots = Sets.newHashSet();
    for (String line : lines) {
        String analysis = Lists.newArrayList(Splitter.on("\t").split(line)).get(1);
        if (!analysis.contains("+")) {
            // No morphological tags: the column itself is the root.
            uniqueRoots.add(analysis);
            continue;
        }
        // A '+' is present, so the split always yields at least two segments.
        List<String> segments = Lists.newArrayList(Splitter.on("+").split(analysis));
        String stem = segments.get(0);
        boolean isVerb = segments.get(1).equals("Verb");
        if (isVerb) {
            stem = stem + (alphabet.getLastVowel(stem).isFrontal() ? "mek" : "mak");
        }
        uniqueRoots.add(stem);
    }
    List<String> ordered = Lists.newArrayList(uniqueRoots);
    ordered.sort(ctr);
    SimpleTextWriter.oneShotUTF8Writer(output).writeLines(ordered);
}
Also used : TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet)

Example 4 with TurkishAlphabet

use of zemberek.core.turkish.TurkishAlphabet in project zemberek-nlp by ahmetaa.

In class OflazerAnalyzerRunner, method extractDictItems:

/**
 * Builds dictionary entry lines from a tab-separated parse list and writes the distinct
 * entries, sorted with {@code ctr}, to {@code output}.
 *
 * <p>Every occurrence of {@code ^DB} is stripped from a line before its second tab-separated
 * column is taken. A column without {@code '+'} becomes {@code "<column> [A:Ext]"}. Otherwise
 * the first segment is the root, the second the part of speech, and the third (if present and
 * contained in {@code secondaryPosSet}) the secondary part of speech. Verb roots get "mek" or
 * "mak" appended depending on whether their last vowel is frontal, and the bracketed metadata
 * is assembled from the part-of-speech information.
 *
 * @param input parse list to read
 * @param output file receiving the sorted entries
 * @throws IOException if reading or writing fails
 */
public static void extractDictItems(File input, File output) throws IOException {
    System.out.println("Extracting dict items from parse list");
    TurkishAlphabet alphabet = TurkishAlphabet.INSTANCE;
    List<String> lines = SimpleTextReader.trimmingUTF8Reader(input).asStringList();
    Set<String> entries = Sets.newHashSet();
    for (String line : lines) {
        String stripped = line.replace("^DB", "");
        String analysis = Lists.newArrayList(Splitter.on("\t").split(stripped)).get(1);
        if (!analysis.contains("+")) {
            entries.add(analysis + " [A:Ext]");
            continue;
        }

        // A '+' is present, so the split always yields at least two segments.
        List<String> segments = Lists.newArrayList(Splitter.on("+").split(analysis));
        String stem = segments.get(0);
        String pos = segments.get(1);
        String secPos = "";
        if (segments.size() > 2 && secondaryPosSet.contains(segments.get(2))) {
            secPos = segments.get(2);
        }
        if (pos.equals("Verb")) {
            stem = stem + (alphabet.getLastVowel(stem).isFrontal() ? "mek" : "mak");
        }

        StringBuilder entry = new StringBuilder(stem);
        boolean nounOrVerb = pos.equals("Noun") || pos.equals("Verb");
        if (nounOrVerb) {
            // Nouns and verbs carry no primary POS tag; only a secondary POS is written, if any.
            if (secPos.isEmpty()) {
                entry.append(" [A:Ext]");
            } else {
                entry.append(" [P:").append(secPos).append("; A:Ext]");
            }
        } else {
            // "Adverb" is written in its abbreviated form.
            entry.append(" [P:").append(pos.equals("Adverb") ? "Adv" : pos);
            if (secPos.length() > 1) {
                entry.append(" ,").append(secPos);
            }
            entry.append("; A:Ext]");
        }
        entries.add(entry.toString());
    }
    List<String> ordered = Lists.newArrayList(entries);
    ordered.sort(ctr);
    SimpleTextWriter.oneShotUTF8Writer(output).writeLines(ordered);
}
Also used : TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet)

Example 5 with TurkishAlphabet

use of zemberek.core.turkish.TurkishAlphabet in project zemberek-nlp by ahmetaa.

In class DictionaryOperations, method saveRegular:

/**
 * Writes the lemmas of the default morphology's lexicon to the file {@code zemberek.vocab} in
 * the working directory, one per line, sorted with {@code Turkish.STRING_COMPARATOR_ASC}.
 *
 * <p>Items flagged {@code Dummy} and punctuation items are left out; proper nouns and
 * abbreviations are not filtered. For a lemma containing a circumflex, the
 * circumflex-normalized spelling is written as well.
 *
 * @throws IOException if the vocabulary file cannot be written
 */
public static void saveRegular() throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    TurkishAlphabet alphabet = TurkishAlphabet.INSTANCE;
    Set<String> vocabulary = new HashSet<>();
    for (DictionaryItem entry : morphology.getLexicon()) {
        boolean excluded = entry.attributes.contains(RootAttribute.Dummy)
            || entry.primaryPos == PrimaryPos.Punctuation;
        if (excluded) {
            continue;
        }
        String word = entry.lemma;
        vocabulary.add(word);
        if (alphabet.containsCircumflex(word)) {
            vocabulary.add(alphabet.normalizeCircumflex(word));
        }
    }
    List<String> ordered = new ArrayList<>(vocabulary);
    ordered.sort(Turkish.STRING_COMPARATOR_ASC);
    Files.write(Paths.get("zemberek.vocab"), ordered);
}
Also used : DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Aggregations

TurkishAlphabet (zemberek.core.turkish.TurkishAlphabet): 6, ArrayList (java.util.ArrayList): 3, HashSet (java.util.HashSet): 3, DictionaryItem (zemberek.morphology.lexicon.DictionaryItem): 3, LinkedHashSet (java.util.LinkedHashSet): 2, File (java.io.File): 1, Locale (java.util.Locale): 1, Ignore (org.junit.Ignore): 1, Test (org.junit.Test): 1, TurkishLetterSequence (zemberek.core.turkish.TurkishLetterSequence): 1, TurkishMorphology (zemberek.morphology.TurkishMorphology): 1, RootLexicon (zemberek.morphology.lexicon.RootLexicon): 1, TurkishDictionaryLoader (zemberek.morphology.lexicon.tr.TurkishDictionaryLoader): 1