Search in sources :

Example 1 with TurkishLetterSequence

use of zemberek.core.turkish.TurkishLetterSequence in project zemberek-nlp by ahmetaa.

In the class TurkishDictionaryLoaderTest, method masterDictionaryLoadTest.

/**
 * Manual consistency check over the master dictionary (ignored in normal test runs).
 *
 * <p>Loads the lexicon, collects the lemmas flagged {@code NoVoicing}, then re-reads the raw
 * dictionary lines. For each adjective line (not compound, not proper noun) whose cleaned head
 * word has more than one vowel, ends in a stop consonant, carries no "Vo"/"VowDrop" marker and
 * is not in the NoVoicing set, it looks up a locally stored HTML page for that word and prints
 * the dictionary line when the page lacks the expected voiced-letter marker. Finally prints the
 * lexicon size.
 *
 * @throws IOException if the dictionary or one of the HTML pages cannot be read.
 */
@Test
@Ignore("Not a unit Test. Only loads the master dictionary.")
public void masterDictionaryLoadTest() throws IOException {
    TurkishDictionaryLoader dictionaryLoader = new TurkishDictionaryLoader();
    File lexiconFile = new File(Resources.getResource("tr/master-dictionary.dict").getFile());
    RootLexicon lexicon = dictionaryLoader.load(lexiconFile);
    TurkishAlphabet trAlphabet = TurkishAlphabet.INSTANCE;

    // Lemmas explicitly marked as not undergoing voicing.
    Set<String> noVoicingLemmas = new HashSet<>();
    for (DictionaryItem entry : lexicon) {
        if (entry.attributes.contains(NoVoicing)) {
            noVoicingLemmas.add(entry.lemma);
        }
    }

    Locale turkish = new Locale("tr");
    File rawDictionary = new File(Resources.getResource("tr/master-dictionary.dict").getFile());
    List<String> lines = SimpleTextReader.trimmingUTF8Reader(rawDictionary).asStringList();
    String htmlDir = "/home/afsina/data/tdk/html";

    for (String line : lines) {
        // Comment lines are skipped.
        if (line.startsWith("#")) {
            continue;
        }
        // Head word: text up to the first space, lower-cased, with '-' and '\'' removed.
        String headWord = Strings.subStringUntilFirst(line.trim(), " ");
        String clean = headWord.toLowerCase(turkish).replaceAll("[\\-']", "");

        boolean plainAdjective = line.contains("Adj") && !line.contains("Compound") && !line.contains("PropNoun");
        if (!plainAdjective) {
            continue;
        }

        TurkishLetterSequence letters = new TurkishLetterSequence(clean, trAlphabet);
        boolean voicingCandidate = letters.vowelCount() > 1
                && letters.lastLetter().isStopConsonant()
                && !line.contains("Vo")
                && !line.contains("VowDrop");
        if (!voicingCandidate || noVoicingLemmas.contains(clean)) {
            continue;
        }

        // Try the exact name first, then a variant with the circumflex letters replaced.
        File page = new File(htmlDir, clean + ".html");
        if (!page.exists()) {
            page = new File(htmlDir, clean.replaceAll("â", "a").replaceAll("\\u00ee", "i") + ".html");
        }
        if (!page.exists()) {
            System.out.println("Cannot find:" + line);
            continue;
        }

        // Map the final consonant to its voiced counterpart; unknown letters are reported and kept.
        char lastChar = clean.charAt(clean.length() - 1);
        char voiced;
        if (lastChar == 'k') {
            voiced = 'ğ';
        } else if (lastChar == 'p') {
            voiced = 'b';
        } else if (lastChar == 'ç') {
            voiced = 'c';
        } else if (lastChar == 't') {
            voiced = 'd';
        } else {
            System.out.println("crap:" + line);
            voiced = lastChar;
        }

        String html = SimpleTextReader.trimmingUTF8Reader(page).asString();
        String marker = "color=DarkBlue>-" + voiced;
        if (!html.contains(marker)) {
            System.out.println(line);
        }
    }

    // NOTE(review): this loop evaluates its condition but has an empty body.
    for (DictionaryItem entry : lexicon) {
        boolean nounOrAdjective = entry.primaryPos == Noun || entry.primaryPos == PrimaryPos.Adjective;
        if (nounOrAdjective && entry.secondaryPos != SecondaryPos.ProperNoun && entry.hasAttribute(RootAttribute.Voicing)) {
        }
    }
    System.out.println(lexicon.size());
}
Also used : Locale(java.util.Locale) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) TurkishLetterSequence(zemberek.core.turkish.TurkishLetterSequence) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) File(java.io.File) HashSet(java.util.HashSet) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 2 with TurkishLetterSequence

use of zemberek.core.turkish.TurkishLetterSequence in project zemberek-nlp by ahmetaa.

In the class SuffixSurfaceNodeGenerator, method generate.

/**
 * Produces the surface node(s) for a suffix form by walking the tokens of
 * {@code suffixForm.generation} and appending one letter per token to a growing sequence.
 *
 * <p>A form is emitted only when the last token is processed. Normally that yields a single
 * node; a trailing VOICE_LAST token yields two (the unvoiced form expecting a consonant start
 * and the voiced, non-terminal form expecting a vowel start). An empty token list yields one
 * node with an empty surface.
 *
 * @param attrs phonetic attributes of the preceding surface.
 * @param expectations expectations copied into the node for the empty-token case.
 * @param suffixData suffix data attached to the empty-token node and the VOICE_LAST nodes.
 * @param suffixForm the suffix form whose generation string is tokenized.
 * @return the generated surface nodes.
 * @throws IllegalArgumentException if an A or I vowel cannot be resolved from the attributes.
 */
public List<SuffixSurfaceNode> generate(EnumSet<PhoneticAttribute> attrs, EnumSet<PhoneticExpectation> expectations, SuffixData suffixData, SuffixForm suffixForm) {
    List<SuffixToken> tokens = Lists.newArrayList(new SuffixStringTokenizer(suffixForm.generation));
    // No tokens: a single node with an empty surface.
    if (tokens.isEmpty()) {
        return Lists.newArrayList(new SuffixSurfaceNode(suffixForm, "", attrs.clone(), expectations.clone(), suffixData, suffixForm.terminationType));
    }
    List<SuffixSurfaceNode> result = new ArrayList<SuffixSurfaceNode>(1);
    // Usually one form comes out; cases like cI~k produce two.
    TurkishLetterSequence surface = new TurkishLetterSequence();
    final int lastPosition = tokens.size() - 1;
    for (int position = 0; position <= lastPosition; position++) {
        SuffixToken token = tokens.get(position);
        // Attributes of what has been generated so far, computed before this token is applied.
        EnumSet<PhoneticAttribute> current = defineMorphemicAttributes(surface, attrs);
        final boolean atStart = position == 0;
        final boolean atEnd = position == lastPosition;
        switch (token.type) {
            case LETTER: {
                surface.append(token.letter);
                if (atEnd) {
                    result.add(new SuffixSurfaceNode(suffixForm, surface.toString(), defineMorphemicAttributes(surface, attrs), suffixForm.terminationType));
                }
                break;
            }
            case A_WOVEL: {
                // A leading vowel is dropped after a vowel-final predecessor.
                if (atStart && attrs.contains(LastLetterVowel)) {
                    break;
                }
                final TurkicLetter aVowel;
                if (current.contains(LastVowelBack)) {
                    aVowel = L_a;
                } else if (current.contains(LastVowelFrontal)) {
                    aVowel = L_e;
                } else {
                    throw new IllegalArgumentException("Cannot generate A form!");
                }
                surface.append(aVowel);
                if (atEnd) {
                    result.add(new SuffixSurfaceNode(suffixForm, surface.toString(), defineMorphemicAttributes(surface, attrs), suffixForm.terminationType));
                }
                break;
            }
            case I_WOVEL: {
                // A leading vowel is dropped after a vowel-final predecessor.
                if (atStart && attrs.contains(LastLetterVowel)) {
                    break;
                }
                final boolean back = current.contains(LastVowelBack);
                final boolean frontal = current.contains(LastVowelFrontal);
                final boolean rounded = current.contains(LastVowelRounded);
                final boolean unrounded = current.contains(LastVowelUnrounded);
                final TurkicLetter iVowel;
                if (back && rounded) {
                    iVowel = L_u;
                } else if (back && unrounded) {
                    iVowel = L_ii;
                } else if (frontal && rounded) {
                    iVowel = L_uu;
                } else if (frontal && unrounded) {
                    iVowel = L_i;
                } else {
                    throw new IllegalArgumentException("Cannot generate I form!");
                }
                surface.append(iVowel);
                if (atEnd) {
                    result.add(new SuffixSurfaceNode(suffixForm, surface.toString(), defineMorphemicAttributes(surface, attrs), suffixForm.terminationType));
                }
                break;
            }
            case APPEND: {
                // The letter is inserted only when the text so far ends in a vowel.
                if (current.contains(LastLetterVowel)) {
                    surface.append(token.letter);
                }
                if (atEnd) {
                    result.add(new SuffixSurfaceNode(suffixForm, surface.toString(), defineMorphemicAttributes(surface, attrs), suffixForm.terminationType));
                }
                break;
            }
            case DEVOICE_FIRST: {
                TurkicLetter chosen = current.contains(LastLetterVoiceless)
                        ? Turkish.Alphabet.devoice(token.letter)
                        : token.letter;
                surface.append(chosen);
                if (atEnd) {
                    result.add(new SuffixSurfaceNode(suffixForm, surface.toString(), defineMorphemicAttributes(surface, attrs), suffixForm.terminationType));
                }
                break;
            }
            case VOICE_LAST: {
                surface.append(token.letter);
                if (atEnd) {
                    // Unvoiced variant first, then the same sequence with its last letter voiced.
                    result.add(new SuffixSurfaceNode(suffixForm, surface.toString(), defineMorphemicAttributes(surface, attrs), EnumSet.of(PhoneticExpectation.ConsonantStart), suffixData, suffixForm.terminationType));
                    surface.changeLast(Turkish.Alphabet.voice(token.letter));
                    result.add(new SuffixSurfaceNode(suffixForm, surface.toString(), defineMorphemicAttributes(surface, attrs), EnumSet.of(PhoneticExpectation.VowelStart), suffixData, TerminationType.NON_TERMINAL));
                }
                break;
            }
        }
    }
    return result;
}
Also used : TurkicLetter(zemberek.core.turkish.TurkicLetter) TurkishLetterSequence(zemberek.core.turkish.TurkishLetterSequence) ArrayList(java.util.ArrayList) PhoneticAttribute(zemberek.core.turkish.PhoneticAttribute) SuffixSurfaceNode(zemberek.morphology.lexicon.graph.SuffixSurfaceNode)

Example 3 with TurkishLetterSequence

use of zemberek.core.turkish.TurkishLetterSequence in project zemberek-nlp by ahmetaa.

In the class StemNodeGenerator, method generateModifiedRootNodes.

/**
 * Builds the stem nodes for a dictionary item whose root changes shape at the suffix boundary.
 *
 * <p>Items flagged {@code Special} are delegated to {@code handleSpecialStems}. Otherwise each
 * root attribute is applied in turn to a working copy of the pronunciation and to the
 * attribute/expectation sets of the original and modified stems. If the two resulting nodes
 * compare equal only the original is returned; otherwise both are returned, with the modified
 * node marked non-terminal (and the original too for {@code CompoundP3sgRoot} items).
 *
 * @param dicItem the dictionary item to expand.
 * @return one or two stem nodes (or whatever {@code handleSpecialStems} yields).
 * @throws LexiconException if a Voicing item's last letter has no voiced counterpart.
 */
private StemNode[] generateModifiedRootNodes(DictionaryItem dicItem) {
    if (dicItem.hasAttribute(Special)) {
        return handleSpecialStems(dicItem);
    }

    TurkishLetterSequence altered = new TurkishLetterSequence(dicItem.pronunciation, alphabet);
    EnumSet<PhoneticAttribute> baseAttrs = calculateAttributes(dicItem.pronunciation);
    EnumSet<PhoneticAttribute> alteredAttrs = baseAttrs.clone();
    EnumSet<PhoneticExpectation> baseExpectations = EnumSet.noneOf(PhoneticExpectation.class);
    EnumSet<PhoneticExpectation> alteredExpectations = EnumSet.noneOf(PhoneticExpectation.class);

    // Apply every root attribute to the boundary attributes and the modified root.
    for (RootAttribute rootAttribute : dicItem.attributes) {
        switch (rootAttribute) {
            case Voicing: {
                TurkicLetter voiced = alphabet.voice(altered.lastLetter());
                if (voiced == null) {
                    throw new LexiconException("Voicing letter is not proper in:" + dicItem);
                }
                // Lemmas ending in "nk" take L_g instead of the alphabet's voiced letter.
                TurkicLetter replacement = dicItem.lemma.endsWith("nk") ? TurkishAlphabet.L_g : voiced;
                altered.changeLetter(altered.length() - 1, replacement);
                alteredAttrs.remove(PhoneticAttribute.LastLetterVoicelessStop);
                baseExpectations.add(PhoneticExpectation.ConsonantStart);
                alteredExpectations.add(PhoneticExpectation.VowelStart);
                break;
            }
            case Doubling: {
                // Repeat the final letter on the modified root.
                altered.append(altered.lastLetter());
                baseExpectations.add(PhoneticExpectation.ConsonantStart);
                alteredExpectations.add(PhoneticExpectation.VowelStart);
                break;
            }
            case LastVowelDrop: {
                if (altered.lastLetter().isVowel()) {
                    altered.delete(altered.length() - 1);
                    alteredExpectations.add(PhoneticExpectation.ConsonantStart);
                } else {
                    // Remove the letter before the final consonant.
                    altered.delete(altered.length() - 2);
                    if (!dicItem.primaryPos.equals(PrimaryPos.Verb)) {
                        baseExpectations.add(PhoneticExpectation.ConsonantStart);
                    }
                    alteredExpectations.add(PhoneticExpectation.VowelStart);
                }
                break;
            }
            case InverseHarmony: {
                // Both stems are treated as having a frontal last vowel.
                baseAttrs.add(PhoneticAttribute.LastVowelFrontal);
                baseAttrs.remove(PhoneticAttribute.LastVowelBack);
                alteredAttrs.add(PhoneticAttribute.LastVowelFrontal);
                alteredAttrs.remove(PhoneticAttribute.LastVowelBack);
                break;
            }
            case ProgressiveVowelDrop: {
                altered.delete(altered.length() - 1);
                // Attributes are recomputed only if a vowel remains.
                if (altered.hasVowel()) {
                    alteredAttrs = calculateAttributes(altered);
                }
                break;
            }
            default:
                break;
        }
    }

    StemNode originalNode = new StemNode(dicItem.root, dicItem, baseAttrs, baseExpectations);
    StemNode modifiedNode = new StemNode(altered.toString(), dicItem, alteredAttrs, alteredExpectations);
    SuffixData[] successors = suffixProvider.defineSuccessorSuffixes(dicItem);
    originalNode.exclusiveSuffixData = successors[0];
    modifiedNode.exclusiveSuffixData = successors[1];

    if (originalNode.equals(modifiedNode)) {
        return new StemNode[] { originalNode };
    }

    modifiedNode.setTermination(TerminationType.NON_TERMINAL);
    if (dicItem.hasAttribute(RootAttribute.CompoundP3sgRoot)) {
        originalNode.setTermination(TerminationType.NON_TERMINAL);
    }
    return new StemNode[] { originalNode, modifiedNode };
}
Also used : RootAttribute(zemberek.core.turkish.RootAttribute) TurkicLetter(zemberek.core.turkish.TurkicLetter) PhoneticExpectation(zemberek.core.turkish.PhoneticExpectation) TurkishLetterSequence(zemberek.core.turkish.TurkishLetterSequence) LexiconException(zemberek.morphology.lexicon.LexiconException) StemNode(zemberek.morphology.lexicon.graph.StemNode) PhoneticAttribute(zemberek.core.turkish.PhoneticAttribute) SuffixData(zemberek.morphology.lexicon.graph.SuffixData)

Example 4 with TurkishLetterSequence

use of zemberek.core.turkish.TurkishLetterSequence in project zemberek-nlp by ahmetaa.

In the class StrictTurkishSyllableParser, method parse.

/**
 * Returns the syllables of the input as a list of strings. If the word cannot be parsed, an
 * empty list is returned.
 *
 * <p>Examples:
 * <ul>
 *   <li><code>("merhaba") -&gt; ["mer","ha","ba"]</code></li>
 *   <li><code>("mr") -&gt; []</code></li>
 *   <li><code>("al") -&gt; ["al"]</code></li>
 * </ul>
 *
 * <p>Syllables are peeled off from the end of the word one at a time and the collected list
 * is reversed before returning.
 *
 * @param input input string.
 * @return syllables as a string list; an empty list if there are no syllables or the word
 *     cannot be parsed.
 */
public List<String> parse(String input) {
    TurkishLetterSequence sequence = new TurkishLetterSequence(input, alphabet);
    List<String> list = new ArrayList<>();
    // Loop on the shrinking sequence, not on `input`: `input` is never modified, so testing
    // its length would never let the loop finish normally for a non-empty word.
    while (sequence.length() > 0) {
        int index = letterCountForLastSyllable(sequence);
        if (index < 0) {
            // The remaining letters do not end in a valid syllable.
            return Collections.emptyList();
        }
        // Start position of the last syllable within the remaining sequence.
        int basla = sequence.length() - index;
        list.add(sequence.toString(basla));
        sequence.clip(basla);
    }
    // Syllables were collected last-to-first.
    Collections.reverse(list);
    return list;
}
Also used : TurkishLetterSequence(zemberek.core.turkish.TurkishLetterSequence) ArrayList(java.util.ArrayList)

Example 5 with TurkishLetterSequence

use of zemberek.core.turkish.TurkishLetterSequence in project zemberek-nlp by ahmetaa.

In the class SuffixSurfaceNodeGeneratorTest, method surfaceFormFunctionalTest.

/**
 * Table-driven check of surface generation: for each (predecessor, generation word, expected
 * surface) triple, the first node generated after the predecessor must have the expected
 * surface form.
 */
@Test
public void surfaceFormFunctionalTest() {
    Triple[] cases = {
        // A vowel harmony
        new Triple("kalem", "lAr", "ler"),
        new Triple("kalem", "lArA", "lere"),
        new Triple("kan", "lAr", "lar"),
        new Triple("kan", "lArAt", "larat"),
        new Triple("kan", "Ar", "ar"),
        new Triple("kaba", "lAr", "lar"),
        new Triple("kaba", "Ar", "r"),
        new Triple("kedi", "lAr", "ler"),
        new Triple("kedi", "lArA", "lere"),
        new Triple("kart", "lAr", "lar"),
        new Triple("a", "lAr", "lar"),
        new Triple("ee", "lAr", "ler"),
        // I vowel harmony
        new Triple("kalem", "lIk", "lik"),
        new Triple("kedi", "lIk", "lik"),
        new Triple("kabak", "lIk", "lık"),
        new Triple("kuzu", "lIk", "luk"),
        new Triple("göz", "lIk", "lük"),
        new Triple("gö", "lIk", "lük"),
        new Triple("ö", "lIk", "lük"),
        new Triple("kalem", "lArI", "leri"),
        new Triple("arı", "lArI", "ları"),
        new Triple("odun", "lArI", "ları"),
        new Triple("odun", "lIrA", "lura"),
        // '+' tokens and leading vowels
        new Triple("kale", "+yA", "ye"),
        new Triple("kale", "+nA", "ne"),
        new Triple("kalem", "+yA", "e"),
        new Triple("kale", "+yI", "yi"),
        new Triple("kalem", "+yI", "i"),
        new Triple("kale", "+yIr", "yir"),
        new Triple("kale", "+yAr", "yer"),
        new Triple("kale", "+In", "n"),
        new Triple("kale", "+An", "n"),
        new Triple("kalem", "InA", "ine"),
        new Triple("kale", "InI", "ni"),
        // '>' tokens
        new Triple("kitap", ">cA", "ça"),
        new Triple("sarraf", ">cA", "ça"),
        new Triple("makas", ">cA", "ça"),
        new Triple("tokat", ">cA", "ça"),
        new Triple("kaş", ">cA", "ça"),
        new Triple("fok", ">cA", "ça"),
        new Triple("gitar", ">cA", "ca"),
        new Triple("kalem", ">cA", "ce"),
        new Triple("kale", ">cA", "ce"),
        new Triple("kitap", ">dAn", "tan"),
        new Triple("gitar", ">dIn", "dın"),
        new Triple("kalem", ">dA", "de"),
        new Triple("kale", ">dArI", "deri"),
        new Triple("kale", "+y>cI", "yci"),
        new Triple("kitap", "+y>cI", "çı")
    };
    SuffixSurfaceNodeGenerator generator = new SuffixSurfaceNodeGenerator();
    for (int i = 0; i < cases.length; i++) {
        Triple current = cases[i];
        TurkishLetterSequence predecessor = new TurkishLetterSequence(current.predecessor, alphabet);
        SuffixSurfaceNode firstNode = getFirstNodeNoExpectatios(generator, generator.defineMorphemicAttributes(predecessor), current.generationWord);
        Assert.assertEquals("Error in:" + current, current.expectedSurface, firstNode.surfaceForm);
    }
}
Also used : TurkishLetterSequence(zemberek.core.turkish.TurkishLetterSequence) SuffixSurfaceNode(zemberek.morphology.lexicon.graph.SuffixSurfaceNode) Test(org.junit.Test)

Aggregations

TurkishLetterSequence (zemberek.core.turkish.TurkishLetterSequence): 5, ArrayList (java.util.ArrayList): 2, Test (org.junit.Test): 2, PhoneticAttribute (zemberek.core.turkish.PhoneticAttribute): 2, TurkicLetter (zemberek.core.turkish.TurkicLetter): 2, SuffixSurfaceNode (zemberek.morphology.lexicon.graph.SuffixSurfaceNode): 2, File (java.io.File): 1, HashSet (java.util.HashSet): 1, Locale (java.util.Locale): 1, Ignore (org.junit.Ignore): 1, PhoneticExpectation (zemberek.core.turkish.PhoneticExpectation): 1, RootAttribute (zemberek.core.turkish.RootAttribute): 1, TurkishAlphabet (zemberek.core.turkish.TurkishAlphabet): 1, LexiconException (zemberek.morphology.lexicon.LexiconException): 1, StemNode (zemberek.morphology.lexicon.graph.StemNode): 1, SuffixData (zemberek.morphology.lexicon.graph.SuffixData): 1, TurkishDictionaryLoader (zemberek.morphology.lexicon.tr.TurkishDictionaryLoader): 1