Search in sources :

Example 1 with NullSuffixForm

use of zemberek.morphology.lexicon.NullSuffixForm in project zemberek-nlp by ahmetaa.

the class DynamicSuffixProviderTest method testRegister.

/**
 * Verifies how DynamicSuffixProvider registers forms and templates: templates alone add no
 * forms, registering a form also registers the null forms generated from the templates it
 * connects to, and re-registering an equal form leaves the form count unchanged.
 */
@Test
public void testRegister() {
    DynamicSuffixProvider provider = new TestSuffixProvider();
    // NOTE(review): sf5 reuses the name "sf4" and sf7..sf9 reuse the name "sf6" - confirm the duplicate names are intentional.
    Suffix sf1 = new Suffix("sf1");
    Suffix sf2 = new Suffix("sf2");
    Suffix sf3 = new Suffix("sf3");
    Suffix sf4 = new Suffix("sf4");
    Suffix sf5 = new Suffix("sf4");
    Suffix sf6 = new Suffix("sf6");
    Suffix sf7 = new Suffix("sf6");
    Suffix sf8 = new Suffix("sf6");
    Suffix sf9 = new Suffix("sf6");
    // Plain forms (frm*) and templates (tmp*) used to build the test graph.
    SuffixForm frm1 = provider.getForm("fs1", sf1, "abc");
    SuffixForm frm2_1 = provider.getForm("fs2-1", sf2, "ali");
    SuffixForm frm2_2 = provider.getForm("fs2-2", sf2, "kaan");
    SuffixForm frm4 = provider.getForm("fs4", sf4, "akin");
    SuffixForm frm5 = provider.getForm("frm5", sf6, "dadada");
    SuffixForm frm6 = provider.getForm("frm6", sf9, "aguagu");
    SuffixFormTemplate tmp1 = provider.getTemplate("tmp1", sf3, TerminationType.TRANSFER);
    SuffixFormTemplate tmp2 = provider.getTemplate("tmp2", sf5, TerminationType.TRANSFER);
    SuffixFormTemplate tmp3 = provider.getTemplate("tmp3", sf7, TerminationType.TRANSFER);
    SuffixFormTemplate tmp4 = provider.getTemplate("tmp4", sf8, TerminationType.TRANSFER);
    //
    // Sketch of the graph wired up below. The original alignment of this drawing has been lost,
    // so treat it as approximate; the connections/indirectConnections statements that follow
    // are the authoritative description.
    //
    // /--frm5.............
    // /         \         \
    // -->frm4- -.........->frm2_2
    // /          \ \          /
    // tmp2-->frm1--->tmp1------
    // \      \      /        \
    // \      .....C......-> frm2_1        --- Direct link  ... indirect link.
    // \........ /........../
    // \
    // \---- tmp3----tmp4--- frm6
    // \............./......./
    //
    tmp2.connections.add(frm4, frm1, frm5, tmp3);
    tmp2.indirectConnections.add(tmp1, frm2_2, frm2_1, tmp4, frm6);
    frm1.connections.add(tmp1);
    frm1.indirectConnections.add(frm2_2);
    frm4.connections.add(tmp1);
    frm4.indirectConnections.add(frm2_1);
    frm5.connections.add(tmp1);
    frm5.indirectConnections.add(frm2_2);
    tmp1.connections.add(frm2_1, frm2_2);
    tmp3.connections.add(tmp4);
    tmp3.indirectConnections.add(frm6);
    tmp4.connections.add(frm6);
    // Register tmp2. Registering a template should not affect the form count.
    provider.registerForm(tmp2);
    Assert.assertEquals(0, provider.getFormCount());
    // After registering frm1, there should be 2 forms registered internally: frm1 and a null form from tmp1.
    provider.registerForm(frm1);
    Assert.assertEquals(2, provider.getFormCount());
    // After registering frm4, there should be 4 forms registered internally: frm4 and another null form from tmp1 with different connections.
    provider.registerForm(frm4);
    Assert.assertEquals(4, provider.getFormCount());
    // Re-registering the same form should not affect the graph.
    provider.registerForm(frm4);
    Assert.assertEquals(4, provider.getFormCount());
    // Registering frm5 adds only frm5 itself; no new null form is generated from tmp1 because an equal one already exists.
    // NOTE(review): the original comment said that null form was created when frm4 was registered, but frm5's
    // connections (tmp1, indirect frm2_2) match frm1's rather than frm4's - presumably the frm1 one is reused; confirm.
    provider.registerForm(frm5);
    Assert.assertEquals(5, provider.getFormCount());
    // Registering the two remaining plain forms raises the count from 5 to 7.
    provider.registerForms(frm2_1, frm2_2);
    Assert.assertEquals(7, provider.getFormCount());
    // Now we generate a null form from tmp2 and register it. The constraint excludes frm1, so frm1 must not be among its connections.
    SuffixData constraint = new SuffixData(tmp2.connections).remove(frm1).add(tmp2.indirectConnections);
    NullSuffixForm null2_1 = provider.generateNullFormFromTemplate(tmp2, constraint);
    Assert.assertFalse(null2_1.connections.contains(frm1));
    Assert.assertTrue(null2_1.connections.contains(frm4));
    Assert.assertTrue(null2_1.connections.contains(frm5));
    provider.registerForm(null2_1.copy());
    // Null forms for tmp3 and tmp4 are also registered internally, so the count goes from 7 to 10.
    Assert.assertEquals(10, provider.getFormCount());
    // Repeating the same generation with an equal constraint should not affect the graph.
    SuffixData constraint2 = new SuffixData(tmp2.connections).remove(frm1).add(tmp2.indirectConnections);
    NullSuffixForm null2_2 = provider.generateNullFormFromTemplate(tmp2, constraint2);
    Assert.assertEquals(null2_1, null2_2);
    provider.registerForm(null2_2.copy());
    Assert.assertEquals(10, provider.getFormCount());
    // With a constraint built from all of tmp2's connections (frm1 included), a different null form is expected,
    // and registering it adds exactly one more form (10 -> 11).
    SuffixData constraint3 = new SuffixData(tmp2.allConnections());
    NullSuffixForm null2_3 = provider.generateNullFormFromTemplate(tmp2, constraint3);
    provider.registerForm(null2_3.copy());
    // assertNotSame compares object references only, not equals().
    Assert.assertNotSame(null2_3, null2_2);
    Assert.assertEquals(11, provider.getFormCount());
}
Also used : Suffix(zemberek.morphology.lexicon.Suffix) SuffixForm(zemberek.morphology.lexicon.SuffixForm) NullSuffixForm(zemberek.morphology.lexicon.NullSuffixForm) NullSuffixForm(zemberek.morphology.lexicon.NullSuffixForm) SuffixFormTemplate(zemberek.morphology.lexicon.SuffixFormTemplate) Test(org.junit.Test)

Example 2 with NullSuffixForm

use of zemberek.morphology.lexicon.NullSuffixForm in project zemberek-nlp by ahmetaa.

the class DynamicSuffixProvider method generateNullFormFromTemplate.

/**
 * Produces the null form that stands in for {@code templateForm} once its connections are
 * restricted to {@code constraints}.
 *
 * <p>A provisional form (index -1, empty id) is built first and used as a lookup key in
 * {@code nullFormsUnprocessed}. If a matching entry is already cached it is returned as is;
 * otherwise the provisional form is given a fresh index and an id derived from the template's
 * id, cached, and returned.
 *
 * @param templateForm template whose direct and indirect connections are copied and filtered
 * @param constraints  connection data passed to {@code retain} on both copied connection sets
 * @return the cached null form for this template/constraint combination, or a newly cached one
 */
protected NullSuffixForm generateNullFormFromTemplate(SuffixFormTemplate templateForm, SuffixData constraints) {
    NullSuffixForm candidate = new NullSuffixForm(-1, "", templateForm);
    candidate.connections = new SuffixData(templateForm.connections).retain(constraints);
    candidate.indirectConnections = new SuffixData(templateForm.indirectConnections).retain(constraints);

    // Only non-null values are ever stored by this method, so a null result means "not cached yet".
    NullSuffixForm cached = nullFormsUnprocessed.get(candidate);
    if (cached != null) {
        return cached;
    }

    // Index and id are assigned before caching, in the same order as before.
    candidate.index = getNewIndex();
    candidate.id = idMaker.get(templateForm.id);
    nullFormsUnprocessed.put(candidate, candidate);
    return candidate;
}
Also used : NullSuffixForm(zemberek.morphology.lexicon.NullSuffixForm)

Example 3 with NullSuffixForm

use of zemberek.morphology.lexicon.NullSuffixForm in project zemberek-nlp by ahmetaa.

the class DynamicSuffixProvider method registerForm.

/**
 * Registers {@code formSet} with this provider.
 *
 * <p>Templates are only recorded in the by-name lookup. For any other form that is not yet
 * registered, every directly connected template is replaced by a copy of the null form
 * generated from it (constrained by the form's full connection data), the form's indirect
 * connections are cleared, the form is stored, and the generated null forms are registered
 * recursively.
 *
 * @param formSet the form or template to register; its connection data is modified in place
 */
protected void registerForm(SuffixForm formSet) {
    // Templates never enter suffixForms; they are only reachable by name.
    if (formSet instanceof SuffixFormTemplate) {
        formLookupByName.put(formSet.getId(), formSet);
        return;
    }
    // An equal form is already registered: nothing to do.
    if (suffixForms.containsKey(formSet)) {
        return;
    }

    // Captured before the form's connections are rewritten below.
    final SuffixData constraintSource = formSet.allConnections();

    final List<SuffixForm> replacedTemplates = new ArrayList<>();
    final List<SuffixForm> generatedNullForms = new ArrayList<>();
    for (SuffixForm linked : formSet.connections) {
        if (!(linked instanceof SuffixFormTemplate)) {
            continue;
        }
        SuffixFormTemplate linkedTemplate = (SuffixFormTemplate) linked;
        NullSuffixForm generated =
                generateNullFormFromTemplate(linkedTemplate, new SuffixData(constraintSource));
        generatedNullForms.add(generated.copy());
        replacedTemplates.add(linked);
    }

    // Swap each template connection for its generated null form.
    formSet.connections.remove(replacedTemplates);
    // Indirect connection data is not needed once the null forms exist.
    formSet.indirectConnections.clear();
    formSet.connections.add(generatedNullForms);

    // NOTE(review): a new index is assigned only when one is already set (index != -1);
    // confirm this is intended rather than "== -1".
    if (formSet.index != -1) {
        formSet.index = getNewIndex();
    }

    suffixForms.put(formSet, formSet);
    formLookupByName.put(formSet.getId(), formSet);

    // The generated null forms may themselves connect to templates, so register them too.
    generatedNullForms.forEach(this::registerForm);
}
Also used : SuffixForm(zemberek.morphology.lexicon.SuffixForm) NullSuffixForm(zemberek.morphology.lexicon.NullSuffixForm) ArrayList(java.util.ArrayList) NullSuffixForm(zemberek.morphology.lexicon.NullSuffixForm) SuffixFormTemplate(zemberek.morphology.lexicon.SuffixFormTemplate)

Example 4 with NullSuffixForm

use of zemberek.morphology.lexicon.NullSuffixForm in project zemberek-nlp by ahmetaa.

the class TurkishSuffixes method getRootSet.

/**
 * Selects the root suffix form for a dictionary item.
 *
 * <p>With an empty {@code suffixConstraint} the predefined default form for the item's primary
 * (and, for nouns and pronouns, secondary) part of speech is returned. Otherwise the template
 * for the item's part of speech is restricted to {@code suffixConstraint}: a null form is
 * generated from it, a copy of that null form is registered and returned.
 *
 * @param item             dictionary item whose part-of-speech data and attributes pick the form
 * @param suffixConstraint connection constraints; when empty the default forms are used
 * @return a default form, or a registered copy of the constrained null form
 */
@Override
public SuffixForm getRootSet(DictionaryItem item, SuffixData suffixConstraint) {
    if (suffixConstraint.isEmpty()) {
        switch(item.primaryPos) {
            case Noun:
                // Compound attributes take precedence over the secondary part of speech.
                if (item.hasAttribute(RootAttribute.CompoundP3sg)) {
                    return Noun_Comp_P3sg;
                }
                if (item.hasAttribute(RootAttribute.CompoundP3sgRoot)) {
                    return Noun_Comp_P3sg_Root;
                }
                switch(item.secondaryPos) {
                    case ProperNoun:
                        return ProperNoun_Default;
                    case Time:
                        return Noun_Time_Default;
                    default:
                        return Noun_Default;
                }
            case Adjective:
                return Adj_Default;
            case Verb:
                return Verb_Default;
            case Adverb:
                return Adv_Default;
            case Numeral:
                return Numeral_Default;
            case Interjection:
                return Interj_Default;
            case Question:
                return Ques_Default;
            case Conjunction:
                return Conj_Default;
            case PostPositive:
                return Postp_Default;
            case Punctuation:
                return Punc_Default;
            case Determiner:
                return Det_Default;
            case Duplicator:
                return Dup_Default;
            case Pronoun:
                switch(item.secondaryPos) {
                    case DemonstrativePron:
                        return DemonsPron_Default;
                    case QuantitivePron:
                        return QuantPron_Default;
                    case QuestionPron:
                        return QuesPron_Default;
                    case ReflexivePron:
                        return ReflexPron_Default;
                    default:
                        return PersPron_Default;
                }
            default:
                // Any other part of speech is treated as a noun.
                return Noun_Default;
        }
    } else {
        SuffixFormTemplate template;
        switch(item.primaryPos) {
            case Noun:
                template = Noun_TEMPLATE;
                break;
            case Adjective:
                template = Adj_TEMPLATE;
                break;
            case Verb:
                template = Verb_TEMPLATE;
                break;
            case PostPositive:
                template = Postp_Template;
                break;
            case Pronoun:
                // NOTE(review): unlike the unconstrained branch, reflexive pronouns have no
                // dedicated case here and fall through to the personal pronoun template - confirm.
                if (item.secondaryPos == SecondaryPos.DemonstrativePron) {
                    template = DemonsPron_TEMPLATE;
                } else if (item.secondaryPos == SecondaryPos.QuantitivePron) {
                    template = QuantPron_TEMPLATE;
                } else if (item.secondaryPos == SecondaryPos.QuestionPron) {
                    template = QuesPron_TEMPLATE;
                } else {
                    // Previously this returned the raw template directly, which skipped the
                    // constraint and the null-form registration applied on every other path.
                    template = PersPron_TEMPLATE;
                }
                break;
            default:
                // Parts of speech without a dedicated template use the noun template.
                template = Noun_TEMPLATE;
        }
        // Register a copy so the instance cached by generateNullFormFromTemplate is not the one
        // handed to registerForm.
        NullSuffixForm copy = generateNullFormFromTemplate(template, suffixConstraint).copy();
        registerForm(copy);
        return copy;
    }
}
Also used : NullSuffixForm(zemberek.morphology.lexicon.NullSuffixForm) SuffixFormTemplate(zemberek.morphology.lexicon.SuffixFormTemplate)

Example 5 with NullSuffixForm

use of zemberek.morphology.lexicon.NullSuffixForm in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method generateSuffixNames.

/**
 * Utility (not a real test): writes the ids of all non-null suffix forms of
 * {@link TurkishSuffixes}, ordered by {@code getId()}, to the file {@code suffix-list}.
 *
 * @throws IOException if the output file cannot be written
 */
@Test
@Ignore("Not a Test.")
public void generateSuffixNames() throws IOException {
    TurkishSuffixes suffixes = new TurkishSuffixes();

    // Collect every form except the generated null forms.
    List<SuffixForm> namedForms = new ArrayList<>();
    for (SuffixForm candidate : suffixes.getAllForms()) {
        if (!(candidate instanceof NullSuffixForm)) {
            namedForms.add(candidate);
        }
    }

    // Order by getId() and emit the id field, one per line.
    List<String> ids = namedForms.stream()
            .sorted(Comparator.comparing(SuffixForm::getId))
            .map(form -> form.id)
            .collect(Collectors.toList());

    Files.write(Paths.get("suffix-list"), ids);
}
Also used : TurkishLetterSequence(zemberek.core.turkish.TurkishLetterSequence) TurkishSentenceAnalyzer(zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer) TextUtil(zemberek.core.text.TextUtil) Stopwatch(com.google.common.base.Stopwatch) Token(org.antlr.v4.runtime.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) Multimap(com.google.common.collect.Multimap) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) ArrayList(java.util.ArrayList) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) HashMultimap(com.google.common.collect.HashMultimap) Lists(com.google.common.collect.Lists) Locale(java.util.Locale) PrimaryPos(zemberek.core.turkish.PrimaryPos) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) TurkishSuffixes(zemberek.morphology.lexicon.tr.TurkishSuffixes) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) Splitter(com.google.common.base.Splitter) Path(java.nio.file.Path) LinkedHashMultimap(com.google.common.collect.LinkedHashMultimap) LinkedHashSet(java.util.LinkedHashSet) Collator(java.text.Collator) Histogram(zemberek.core.collections.Histogram) SecondaryPos(zemberek.core.turkish.SecondaryPos) PrintWriter(java.io.PrintWriter) SuffixForm(zemberek.morphology.lexicon.SuffixForm) Files(java.nio.file.Files) Collection(java.util.Collection) Z3MarkovModelDisambiguator(zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator) Set(java.util.Set) TurkishLexer(zemberek.tokenization.antlr.TurkishLexer) IOException(java.io.IOException) Test(org.junit.Test) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) OflazerAnalyzerRunner(zemberek.morphology.external.OflazerAnalyzerRunner) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) Turkish(zemberek.morphology.structure.Turkish) Ignore(org.junit.Ignore) Paths(java.nio.file.Paths) 
TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) Comparator(java.util.Comparator) NullSuffixForm(zemberek.morphology.lexicon.NullSuffixForm) TurkishSuffixes(zemberek.morphology.lexicon.tr.TurkishSuffixes) SuffixForm(zemberek.morphology.lexicon.SuffixForm) NullSuffixForm(zemberek.morphology.lexicon.NullSuffixForm) ArrayList(java.util.ArrayList) NullSuffixForm(zemberek.morphology.lexicon.NullSuffixForm) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

NullSuffixForm (zemberek.morphology.lexicon.NullSuffixForm)6 SuffixForm (zemberek.morphology.lexicon.SuffixForm)4 SuffixFormTemplate (zemberek.morphology.lexicon.SuffixFormTemplate)4 Test (org.junit.Test)3 ArrayList (java.util.ArrayList)2 Splitter (com.google.common.base.Splitter)1 Stopwatch (com.google.common.base.Stopwatch)1 HashMultimap (com.google.common.collect.HashMultimap)1 LinkedHashMultimap (com.google.common.collect.LinkedHashMultimap)1 Lists (com.google.common.collect.Lists)1 Multimap (com.google.common.collect.Multimap)1 IOException (java.io.IOException)1 PrintWriter (java.io.PrintWriter)1 StandardCharsets (java.nio.charset.StandardCharsets)1 Files (java.nio.file.Files)1 Path (java.nio.file.Path)1 Paths (java.nio.file.Paths)1 Collator (java.text.Collator)1 Collection (java.util.Collection)1 Comparator (java.util.Comparator)1