Search in sources :

Example 1 with HasTailSequence

Use of zemberek.morphology._morphotactics.Conditions.HasTailSequence in project zemberek-nlp by ahmetaa.

Class TurkishMorphotactics, method connectNounStates.

/**
 * Registers the transitions between the noun-related morpheme states: number (A3sg / A3pl),
 * possession (Pnon, P1sg ... P3pl), case (Nom, Dat, Abl, ...) and the derivations that start from
 * those states.
 *
 * <p>Turkish nouns always have Noun-Person-Possession-Case morphemes, even when there are no
 * suffix characters. elma -> Noun:elma - A3sg:ε - Pnon:ε - Nom:ε (Third person singular, No
 * possession, Nominal Case)
 *
 * <p>Each {@code add(...)} call below passes a target state, a surface template and optionally a
 * condition; each {@code addEmpty(...)} call passes no template (written as ε in the comments).
 * The example words in the comments show the surface forms the transitions are meant to cover.
 *
 * <p>The method takes no arguments and returns nothing. Besides the state fields it also looks up
 * several dictionary items in {@code lexicon} by id (see the "dünkü, anki, yarınki" section).
 */
public void connectNounStates() {
    // ev-ε-?-?
    noun_S.addEmpty(a3sg_S, notHave(RootAttribute.ImplicitPlural));
    // ev-ler-?-?.
    noun_S.add(a3pl_S, "lAr", notHave(RootAttribute.ImplicitPlural).and(notHave(RootAttribute.CompoundP3sg)));
    // Allow only implicit plural `hayvanat`.
    noun_S.addEmpty(a3pl_S, has(RootAttribute.ImplicitPlural));
    // --- Compound Handling ---------
    // for compound roots like "zeytinyağ-" generate two transitions
    // NounCompound--(ε)--> a3sgCompound --(ε)--> pNonCompound_S --> Nom_S
    nounCompoundRoot_S.addEmpty(a3sgCompound_S, has(RootAttribute.CompoundP3sgRoot));
    a3sgCompound_S.addEmpty(pnonCompound_S);
    // a3sgCompound --(lArI)--> p3pl_S
    a3sgCompound_S.add(p3pl_S, "lArI");
    // ---- Proper noun handling -------
    // TODO: consider adding single quote after an overhaul.
    // nounProper_S.add(puncProperSeparator_S, "'");
    nounProper_S.addEmpty(a3sg_S);
    nounProper_S.add(a3pl_S, "lAr");
    puncProperSeparator_S.addEmpty(a3sg_S);
    puncProperSeparator_S.add(a3pl_S, "lAr");
    // ---- For compound derivations -----------------
    pnonCompound_S.addEmpty(nom_S);
    nom_S.add(become_S, "lAş");
    nom_S.add(acquire_S, "lAn");
    // for "zeytinyağlı"
    nom_S.add(with_S, "lI", new ContainsMorpheme(with, without).not());
    // for "zeytinyağsız"
    nom_S.add(without_S, "sIz", new ContainsMorpheme(with, without).not());
    // for "zeytinyağlık"
    nom_S.add(ness_S, "lI~k", not(new ContainsMorpheme(ness)));
    nom_S.add(ness_S, "lI!ğ", not(new ContainsMorpheme(ness)));
    // for "zeytinyağcı"
    nom_S.add(agt_S, ">cI", not(new ContainsMorpheme(agt)));
    // for "zeytinyağsı"
    nom_S.add(justLike_S, "+msI", not(new ContainsMorpheme(justLike)));
    // for "zeytinyağcık"
    nom_S.add(dim_S, ">cI~k", Conditions.HAS_NO_SURFACE.andNot(new ContainsMorpheme(dim)));
    nom_S.add(dim_S, ">cI!ğ", Conditions.HAS_NO_SURFACE.andNot(new ContainsMorpheme(dim)));
    // "zeytinyağcağız"
    nom_S.add(dim_S, "cAğIz", Conditions.HAS_NO_SURFACE);
    // for compound roots like "zeytinyağ-lar-ı" generate two transitions
    // NounCompound--(lAr)--> a3plCompound ---> p3sg_S, P1sg etc.
    nounCompoundRoot_S.add(a3plCompound_S, "lAr", has(RootAttribute.CompoundP3sgRoot));
    // but for pnon connection, we use lArI
    nounCompoundRoot_S.add(a3plCompound2_S, "lArI", has(RootAttribute.CompoundP3sgRoot));
    a3plCompound_S.add(p3sg_S, "I").add(p2sg_S, "In").add(p1sg_S, "Im").add(p1pl_S, "ImIz").add(p2pl_S, "InIz").add(p3pl_S, "I");
    // this path is used for plural analysis (A3pl+Pnon+Nom) of compound words.
    a3plCompound2_S.addEmpty(pnonCompound2_S);
    pnonCompound2_S.addEmpty(nom_ST);
    // ------
    // Shared condition: the root does not carry the FamilyMember attribute.
    Condition noFamily = notHave(RootAttribute.FamilyMember);
    // ev-ε-ε-? Reject "annemler" etc.
    a3sg_S.addEmpty(pnon_S, // ev
    noFamily).add(p1sg_S, "Im", // evim
    noFamily).add(p2sg_S, "In", // evin
    noFamily).add(p3sg_S, "+sI", // evi, odası
    noFamily).addEmpty(p3sg_S, // "zeytinyağı" has two analyses. Pnon and P3sg.
    has(RootAttribute.CompoundP3sg)).add(p1pl_S, "ImIz", // evimiz
    noFamily).add(p2pl_S, "InIz", // eviniz
    noFamily).add(p3pl_S, "lArI", // evleri
    noFamily);
    // ev-ler-ε-?
    a3pl_S.addEmpty(pnon_S, noFamily);
    // ev-ler-im-?
    a3pl_S.add(p1sg_S, "Im", noFamily).add(p2sg_S, "In", noFamily).addEmpty(p1sg_S, // for words like "annemler"
    has(RootAttribute.ImplicitP1sg)).addEmpty(p2sg_S, // for words like "annenler"
    has(RootAttribute.ImplicitP2sg)).add(p3sg_S, "I", noFamily).add(p1pl_S, "ImIz", noFamily).add(p2pl_S, "InIz", noFamily).add(p3pl_S, "I", noFamily);
    // --- handle su - akarsu roots. ----
    // These roots get their own A3sg state whose possessive templates start with "y" (su-yum, su-yun ...).
    nounSuRoot_S.addEmpty(a3sgSu_S);
    nounSuRoot_S.add(a3pl_S, "lar");
    a3sgSu_S.addEmpty(pnon_S).add(p1sg_S, "yum").add(p2sg_S, "yun").add(p3sg_S, "yu").add(p1pl_S, "yumuz").add(p2pl_S, "yunuz").add(p3pl_S, "lArI");
    // ev-?-ε-ε (ev, evler).
    pnon_S.addEmpty(nom_ST, notHave(RootAttribute.FamilyMember));
    // Condition reused by the Equ (cA) transitions below: a negated
    // ContainsMorpheme(adj, futPart, presPart, narrPart, pastPart), OR'ed with
    // ContainsMorphemeSequence(able, verb, pastPart).
    Condition equCond = new Conditions.ContainsMorpheme(adj, futPart, presPart, narrPart, pastPart).not().or(new Conditions.ContainsMorphemeSequence(able, verb, // allow `yapabildiğince`
    pastPart));
    // Do not allow "zeytinyağı-ya" etc.
    pnon_S.add(dat_ST, "+yA", // ev-e
    notHave(RootAttribute.CompoundP3sg)).add(abl_ST, ">dAn", // ev-den
    notHave(RootAttribute.CompoundP3sg)).add(loc_ST, ">dA", // evde
    notHave(RootAttribute.CompoundP3sg)).add(acc_ST, "+yI", // evi
    notHave(RootAttribute.CompoundP3sg)).add(gen_ST, "+nIn", // evin, zeytinyağının
    previousStateIsNot(a3sgSu_S)).add(gen_ST, "yIn", // suyun
    previousStateIs(a3sgSu_S)).add(equ_ST, ">cA", // evce
    notHave(RootAttribute.CompoundP3sg).and(equCond)).add(ins_ST, // evle, zeytinyağıyla
    "+ylA");
    // zeytinyağı-na
    pnon_S.add(dat_ST, "+nA", has(RootAttribute.CompoundP3sg)).add(abl_ST, "+ndAn", // zeytinyağı-ndan
    has(RootAttribute.CompoundP3sg)).add(loc_ST, "+ndA", // zeytinyağı-nda
    has(RootAttribute.CompoundP3sg)).add(equ_ST, "+ncA", // zeytinyağı-nca
    has(RootAttribute.CompoundP3sg).and(equCond)).add(acc_ST, "+nI", // zeytinyağı-nı
    has(RootAttribute.CompoundP3sg));
    // This transition is for words like "içeri" or "dışarı".
    // Those words implicitly contain the Dative suffix.
    // But it is also possible to add the dative suffix +yA to those words, as in "içeri-ye".
    pnon_S.addEmpty(dat_ST, has(RootAttribute.ImplicitDative));
    // --- Case transitions after each possessive state ---
    // P1sg
    p1sg_S.addEmpty(// evim
    nom_ST).add(dat_ST, // evime
    "A").add(loc_ST, // evimde
    "dA").add(abl_ST, // evimden
    "dAn").add(ins_ST, // evimle
    "lA").add(gen_ST, // evimin
    "In").add(equ_ST, "cA", // evimce
    equCond.or(new Conditions.ContainsMorpheme(pastPart))).add(acc_ST, // evimi
    "I");
    // P2sg
    p2sg_S.addEmpty(// evin
    nom_ST).add(dat_ST, // evine
    "A").add(loc_ST, // evinde
    "dA").add(abl_ST, // evinden
    "dAn").add(ins_ST, // evinle
    "lA").add(gen_ST, // evinin
    "In").add(equ_ST, "cA", // evince
    equCond.or(new Conditions.ContainsMorpheme(pastPart))).add(acc_ST, // evini
    "I");
    // P3sg: the case templates start with "n" ("y" for the instrumental).
    p3sg_S.addEmpty(// evi
    nom_ST).add(dat_ST, // evine
    "nA").add(loc_ST, // evinde
    "ndA").add(abl_ST, // evinden
    "ndAn").add(ins_ST, // eviyle
    "ylA").add(gen_ST, // evinin
    "nIn").add(equ_ST, "ncA", // evince
    equCond.or(new Conditions.ContainsMorpheme(pastPart))).add(acc_ST, // evini
    "nI");
    // P1pl
    p1pl_S.addEmpty(// evimiz
    nom_ST).add(dat_ST, // evimize
    "A").add(loc_ST, // evimizde
    "dA").add(abl_ST, // evimizden
    "dAn").add(ins_ST, // evimizle
    "lA").add(gen_ST, // evimizin
    "In").add(equ_ST, "cA", // evimizce
    equCond.or(new Conditions.ContainsMorpheme(pastPart))).add(acc_ST, // evimizi
    "I");
    // P2pl
    p2pl_S.addEmpty(// eviniz
    nom_ST).add(dat_ST, // evinize
    "A").add(loc_ST, // evinizde
    "dA").add(abl_ST, // evinizden
    "dAn").add(ins_ST, // evinizle
    "lA").add(gen_ST, // evinizin
    "In").add(equ_ST, "cA", // evinizce
    equCond.or(new Conditions.ContainsMorpheme(pastPart))).add(acc_ST, // evinizi
    "I");
    // P3pl
    // NOTE(review): unlike the other possessive states, the Equ transition here has no condition
    // attached and its template is "+ncA" rather than "ncA" - confirm this is intentional.
    p3pl_S.addEmpty(// evleri
    nom_ST).add(dat_ST, // evlerine
    "nA").add(loc_ST, // evlerinde
    "ndA").add(abl_ST, // evlerinden
    "ndAn").add(ins_ST, // evleriyle
    "ylA").add(gen_ST, // evlerinin
    "nIn").add(equ_ST, // evlerince.
    "+ncA").add(acc_ST, // evlerini
    "nI");
    // ev-ε-ε-ε-cik (evcik). Disallow this path if visitor contains any non empty surface suffix.
    // There are two almost identical suffix transitions with templates ">cI~k" and ">cI!ğ"
    // This was necessary for some simplification during analysis. This way there will be only one
    // surface form generated for each transition.
    nom_ST.add(dim_S, ">cI~k", Conditions.HAS_NO_SURFACE);
    nom_ST.add(dim_S, ">cI!ğ", Conditions.HAS_NO_SURFACE);
    // ev-ε-ε-ε-ceğiz (evceğiz)
    nom_ST.add(dim_S, "cAğIz", Conditions.HAS_NO_SURFACE);
    // connect dim to the noun root.
    dim_S.addEmpty(noun_S);
    // Ness, again as a "lI~k" / "lI!ğ" template pair.
    nom_ST.add(ness_S, "lI~k", Conditions.CURRENT_GROUP_EMPTY.andNot(new ContainsMorpheme(ness)));
    nom_ST.add(ness_S, "lI!ğ", Conditions.CURRENT_GROUP_EMPTY.andNot(new ContainsMorpheme(ness)));
    // connect `ness` to the noun root.
    ness_S.addEmpty(noun_S);
    nom_ST.add(agt_S, ">cI", Conditions.CURRENT_GROUP_EMPTY.andNot(new ContainsMorpheme(adj, agt)));
    // connect `agt` to the noun root.
    agt_S.addEmpty(noun_S);
    // We do not allow an adjective to pass through here.
    // such as, adj->zero->noun->ε-ε-ε->zero->Verb is not acceptable because there is already a
    // adj->zero->Verb path.
    Condition noun2VerbZeroDerivationCondition = Conditions.HAS_TAIL.andNot(Conditions.CURRENT_GROUP_EMPTY.and(new Conditions.LastDerivationIs(adjZeroDeriv_S)));
    nom_ST.addEmpty(nounZeroDeriv_S, noun2VerbZeroDerivationCondition);
    // elma-ya-yım elma-ya-ydı
    dat_ST.addEmpty(nounZeroDeriv_S, noun2VerbZeroDerivationCondition);
    // elma-dan-ım elma-dan-dı
    abl_ST.addEmpty(nounZeroDeriv_S, noun2VerbZeroDerivationCondition);
    // elma-da-yım elma-da-ydı
    loc_ST.addEmpty(nounZeroDeriv_S, noun2VerbZeroDerivationCondition);
    // elma-yla-yım elma-yla-ydı
    ins_ST.addEmpty(nounZeroDeriv_S, noun2VerbZeroDerivationCondition);
    // elma-nın-ım elma-nın-dı
    gen_ST.addEmpty(nounZeroDeriv_S, noun2VerbZeroDerivationCondition);
    nounZeroDeriv_S.addEmpty(nVerb_S);
    // meyve-li
    Condition noSurfaceAfterDerivation = new NoSurfaceAfterDerivation();
    nom_ST.add(with_S, "lI", noSurfaceAfterDerivation.andNot(new ContainsMorpheme(with, without)));
    nom_ST.add(without_S, "sIz", noSurfaceAfterDerivation.andNot(new ContainsMorpheme(with, without, inf1)));
    nom_ST.add(justLike_S, "+msI", noSurfaceAfterDerivation.andNot(new ContainsMorpheme(justLike, futPart, pastPart, presPart, adj)));
    nom_ST.add(justLike_S, "ImsI", notHave(PhoneticAttribute.LastLetterVowel).and(noSurfaceAfterDerivation).andNot(new ContainsMorpheme(justLike, futPart, pastPart, presPart, adj)));
    nom_ST.add(related_S, "sAl", noSurfaceAfterDerivation.andNot(new ContainsMorpheme(with, without, related)));
    // connect With, Without, Related and JustLike to the adjective state (adj_ST).
    with_S.addEmpty(adj_ST);
    without_S.addEmpty(adj_ST);
    related_S.addEmpty(adj_ST);
    justLike_S.addEmpty(adj_ST);
    // meyve-de-ki
    Condition notRelRepetition = new HasTailSequence(rel, adj, zero, noun, a3sg, pnon, loc).not();
    loc_ST.add(rel_S, "ki", notRelRepetition.andNot(new Conditions.SecondaryRootIs(SecondaryPos.Time)));
    rel_S.addEmpty(adj_ST);
    // for covering dünkü, anki, yarınki etc.
    // TODO: Use a more general grouping, not using Secondary Pos
    Condition time = Conditions.CURRENT_GROUP_EMPTY.and(new Conditions.SecondaryRootIs(SecondaryPos.Time));
    // Dictionary items looked up by id; they feed the rootIsAny(...) conditions below.
    DictionaryItem dun = lexicon.getItemById("dün_Noun_Time");
    DictionaryItem gun = lexicon.getItemById("gün_Noun_Time");
    DictionaryItem bugun = lexicon.getItemById("bugün_Noun_Time");
    DictionaryItem ileri = lexicon.getItemById("ileri_Noun");
    DictionaryItem geri = lexicon.getItemById("geri_Noun");
    DictionaryItem ote = lexicon.getItemById("öte_Noun");
    DictionaryItem beri = lexicon.getItemById("beri_Noun");
    // dün, gün and bugün take "kü"; the remaining `time` roots take "ki".
    Condition time2 = Conditions.rootIsAny(dun, gun, bugun);
    nom_ST.add(rel_S, "ki", time.andNot(time2));
    nom_ST.add(rel_S, "ki", Conditions.rootIsAny(ileri, geri, ote, beri));
    nom_ST.add(rel_S, "kü", time2.and(time));
    // After Genitive suffix, Rel suffix makes a Pronoun derivation.
    gen_ST.add(relToPron_S, "ki");
    relToPron_S.addEmpty(pronAfterRel_S);
    // Become (lAş) and Acquire (lAn) lead to the verb root; both share the same conditions.
    ContainsMorpheme verbDeriv = new ContainsMorpheme(inf1, inf2, inf3, pastPart, futPart);
    nom_ST.add(become_S, "lAş", noSurfaceAfterDerivation.andNot(new ContainsMorpheme(adj)).andNot(verbDeriv));
    become_S.addEmpty(verbRoot_S);
    nom_ST.add(acquire_S, "lAn", noSurfaceAfterDerivation.andNot(new ContainsMorpheme(adj)).andNot(verbDeriv));
    acquire_S.addEmpty(verbRoot_S);
    // Inf1 mak makes noun derivation. However, it cannot get any possessive or plural suffix.
    // Also cannot be followed by Dat, Gen, Acc case suffixes.
    // So we create a path only for it.
    nounInf1Root_S.addEmpty(a3sgInf1_S);
    a3sgInf1_S.addEmpty(pnonInf1_S);
    pnonInf1_S.addEmpty(nom_ST);
    pnonInf1_S.add(abl_ST, "tAn");
    pnonInf1_S.add(loc_ST, "tA");
    pnonInf1_S.add(ins_ST, "lA");
    // ActOf roots: an empty A3sg and a "lar" A3pl transition, both continuing to pnonActOf and
    // from there to nom_ST.
    nounActOfRoot_S.addEmpty(a3sgActOf_S);
    nounActOfRoot_S.add(a3plActOf_S, "lar");
    a3sgActOf_S.addEmpty(pnonActOf);
    a3plActOf_S.addEmpty(pnonActOf);
    pnonActOf.addEmpty(nom_ST);
}
Also used : NoSurfaceAfterDerivation(zemberek.morphology._morphotactics.Conditions.NoSurfaceAfterDerivation) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) ContainsMorpheme(zemberek.morphology._morphotactics.Conditions.ContainsMorpheme) ContainsMorpheme(zemberek.morphology._morphotactics.Conditions.ContainsMorpheme) HasTailSequence(zemberek.morphology._morphotactics.Conditions.HasTailSequence)

Aggregations

ContainsMorpheme (zemberek.morphology._morphotactics.Conditions.ContainsMorpheme)1 HasTailSequence (zemberek.morphology._morphotactics.Conditions.HasTailSequence)1 NoSurfaceAfterDerivation (zemberek.morphology._morphotactics.Conditions.NoSurfaceAfterDerivation)1 DictionaryItem (zemberek.morphology.lexicon.DictionaryItem)1