Search in sources :

Example 1 with Result

use of zemberek.morphology.generator.WordGenerator.Result in project zemberek-nlp by ahmetaa.

the class GenerateWords method generateVerbs.

/**
 * Prints the generated surface forms of the stem "oku" for every combination of
 * polarity ("" or "Neg"), tense/mood id and person id listed below.
 *
 * <p>For each combination the generated surfaces are printed on one line, separated by
 * spaces and followed by the morpheme id sequence that produced them. When the generator
 * returns nothing for a combination, a "Cannot generate" line is printed instead.
 */
private static void generateVerbs() {
    System.out.println("Generating Verbs.");
    String[] positiveNegatives = { "", "Neg" };
    String[] times = { "Imp", "Aor", "Past", "Prog1", "Prog2", "Narr", "Fut" };
    String[] persons = { "A1sg", "A2sg", "A3sg", "A1pl", "A2pl", "A3pl" };
    // The stem is the same for every combination, so it is declared once outside the loops.
    String stem = "oku";
    TurkishMorphology morphology = TurkishMorphology.builder().setLexicon("okumak").disableCache().build();
    for (String posNeg : positiveNegatives) {
        for (String time : times) {
            for (String person : persons) {
                // The empty polarity entry is dropped here, so it adds no morpheme id to the sequence.
                List<String> seq = Stream.of(posNeg, time, person).filter(s -> !s.isEmpty()).collect(Collectors.toList());
                List<Result> results = morphology.getWordGenerator().generate(stem, seq);
                if (results.isEmpty()) {
                    System.out.println("Cannot generate Stem = [" + stem + "] Morphemes = " + seq);
                    continue;
                }
                System.out.println(results.stream().map(s -> s.surface).collect(Collectors.joining(" ")) + " " + seq);
            }
        }
    }
}
Also used : List(java.util.List) Stream(java.util.stream.Stream) TurkishMorphology(zemberek.morphology.TurkishMorphology) Collectors(java.util.stream.Collectors) Result(zemberek.morphology.generator.WordGenerator.Result) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem)

Example 2 with Result

use of zemberek.morphology.generator.WordGenerator.Result in project zemberek-nlp by ahmetaa.

the class GenerateWords method generateNouns.

/**
 * Prints, one per line, every surface form generated for the dictionary item "armut"
 * across all combinations of the number, possessive and case ids listed below.
 */
private static void generateNouns() {
    System.out.println("Generating Nouns.");
    String[] numberIds = { "A3sg", "A3pl" };
    String[] possessiveIds = { "P1sg", "P2sg", "P3sg" };
    String[] caseIds = { "Dat", "Loc", "Abl" };
    TurkishMorphology morphology = TurkishMorphology.builder().setLexicon("armut").disableCache().build();
    // The first lexicon match for "armut" is used as the generation root.
    DictionaryItem armut = morphology.getLexicon().getMatchingItems("armut").get(0);
    for (String numberId : numberIds) {
        for (String possessiveId : possessiveIds) {
            for (String caseId : caseIds) {
                List<Result> generated = morphology.getWordGenerator().generate(armut, numberId, possessiveId, caseId);
                for (Result word : generated) {
                    System.out.println(word.surface);
                }
            }
        }
    }
}
Also used : DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) TurkishMorphology(zemberek.morphology.TurkishMorphology) Result(zemberek.morphology.generator.WordGenerator.Result)

Example 3 with Result

use of zemberek.morphology.generator.WordGenerator.Result in project zemberek-nlp by ahmetaa.

the class ChangeStem method main.

/**
 * Analyzes the word "simidime" and, for each analysis, regenerates the word with the
 * same morphemes on the dictionary item found for "poğaça", logging the input
 * analysis, the new surface form and the new analysis for every generated result.
 *
 * @param args command-line arguments; not read by this method
 */
public static void main(String[] args) {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    // The first lexicon match for "poğaça" becomes the replacement stem.
    DictionaryItem replacementStem = morphology.getLexicon().getMatchingItems("poğaça").get(0);
    String input = "simidime";
    Log.info("Input Word = " + input);
    WordAnalysis analyses = morphology.analyze(input);
    for (SingleAnalysis analysis : analyses) {
        List<Result> regenerated = morphology.getWordGenerator().generate(replacementStem, analysis.getMorphemes());
        regenerated.forEach(r -> {
            Log.info("Input analysis: " + analysis.formatLong());
            Log.info("After stem change, word = " + r.surface);
            Log.info("After stem change, Analysis = " + r.analysis.formatLong());
        });
    }
}
Also used : DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) Result(zemberek.morphology.generator.WordGenerator.Result)

Example 4 with Result

use of zemberek.morphology.generator.WordGenerator.Result in project zemberek-nlp by ahmetaa.

the class TurkishSentenceNormalizer method normalize.

/**
 * Normalizes a sentence: collects replacement candidates for every token and lets
 * {@code decode} choose among them, joining its output with single spaces.
 *
 * <p>Candidates for a token are gathered, in this order, from the manual lookup, the
 * graph lookup, the ASCII lookup, the informal/ASCII-tolerant analyses, and (only when
 * the token has no analysis and is longer than three characters) at most three spelling
 * suggestions. The token itself is added when nothing was found or when the regular
 * morphology reports it as correct.
 *
 * @param sentence the text to normalize
 * @return {@code sentence} itself when it is blank after trimming, otherwise the decoded
 *     tokens joined by a single space
 */
public String normalize(String sentence) {
    // Blank input is handed back untouched.
    if (sentence.trim().isEmpty()) {
        return sentence;
    }
    List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(preProcess(sentence));
    List<Candidates> perTokenCandidates = new ArrayList<>();
    for (int idx = 0; idx < tokens.size(); idx++) {
        Token token = tokens.get(idx);
        String word = token.getText();

        // Neighbouring token texts, null at the sentence boundaries.
        String following = null;
        if (idx != tokens.size() - 1) {
            following = tokens.get(idx + 1).getText();
        }
        String preceding = null;
        if (idx != 0) {
            preceding = tokens.get(idx - 1).getText();
        }

        // Insertion-ordered set: the lookup sources are added first to last and duplicates collapse.
        LinkedHashSet<String> options = new LinkedHashSet<>(2);
        // manual lookup matches
        options.addAll(lookupManual.get(word));
        // random walk (graph) matches
        options.addAll(lookupFromGraph.get(word));
        // ASCII-equivalent matches
        // TODO: this may decrease accuracy. Also, this can be eliminated with ascii tolerant analyzer.
        options.addAll(lookupFromAscii.get(word));

        // Surfaces derived from the informal / ASCII-tolerant analyses.
        WordAnalysis wordAnalysis = informalAsciiTolerantMorphology.analyze(word);
        for (SingleAnalysis single : wordAnalysis) {
            if (!single.containsInformalMorpheme()) {
                // No informal morpheme: regenerate the surface(s) from the analysis as is.
                List<WordGenerator.Result> regenerated = morphology.getWordGenerator().generate(single.getDictionaryItem(), single.getMorphemes());
                for (Result generated : regenerated) {
                    options.add(generated.surface);
                }
                continue;
            }
            // Informal analysis: use the converter's result when it produces one.
            WordGenerator.Result converted = analysisConverter.convert(word, single);
            if (converted != null) {
                options.add(converted.surface);
            }
        }

        // Spelling suggestions only for unanalyzable words longer than three characters;
        // at most the first three suggestions are kept.
        if (wordAnalysis.analysisCount() == 0 && word.length() > 3) {
            List<String> suggestions = spellChecker.suggestForWord(word, preceding, following, lm);
            if (suggestions.size() > 3) {
                suggestions = new ArrayList<>(suggestions.subList(0, 3));
            }
            options.addAll(suggestions);
        }

        // Keep the word itself when nothing matched or when it is already correct.
        if (options.isEmpty() || morphology.analyze(word).isCorrect()) {
            options.add(word);
        }

        String tokenText = token.getText();
        List<Candidate> wrapped = options.stream().map(Candidate::new).collect(Collectors.toList());
        perTokenCandidates.add(new Candidates(tokenText, wrapped));
    }
    // Viterbi decoding picks the final sequence.
    return String.join(" ", decode(perTokenCandidates));
}
Also used : LinkedHashSet(java.util.LinkedHashSet) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Result(zemberek.morphology.generator.WordGenerator.Result) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token) WordGenerator(zemberek.morphology.generator.WordGenerator)

Example 5 with Result

use of zemberek.morphology.generator.WordGenerator.Result in project zemberek-nlp by ahmetaa.

the class WordGeneratorTest method testGeneration4.

/**
 * Generates from the "elma_Noun" lexicon item with the morpheme ids Noun, A3pl and P1pl
 * and expects at least one result whose first surface form is "elmalarımız".
 */
@Test
public void testGeneration4() {
    TurkishMorphotactics morphotactics = getMorphotactics("elma");
    WordGenerator generator = new WordGenerator(morphotactics);
    List<String> morphemeIds = Lists.newArrayList("Noun", "A3pl", "P1pl");
    List<Result> generated = generator.generate(
            morphotactics.getRootLexicon().getItemById("elma_Noun"),
            TurkishMorphotactics.getMorphemes(morphemeIds));
    // Fail before indexing if nothing was generated.
    Assert.assertFalse(generated.isEmpty());
    Assert.assertEquals("elmalarımız", generated.get(0).surface);
}
Also used : TurkishMorphotactics(zemberek.morphology.morphotactics.TurkishMorphotactics) Result(zemberek.morphology.generator.WordGenerator.Result) Test(org.junit.Test)

Aggregations

Result (zemberek.morphology.generator.WordGenerator.Result) 6, DictionaryItem (zemberek.morphology.lexicon.DictionaryItem) 4, TurkishMorphology (zemberek.morphology.TurkishMorphology) 3, Test (org.junit.Test) 2, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis) 2, WordAnalysis (zemberek.morphology.analysis.WordAnalysis) 2, TurkishMorphotactics (zemberek.morphology.morphotactics.TurkishMorphotactics) 2, ArrayList (java.util.ArrayList) 1, LinkedHashSet (java.util.LinkedHashSet) 1, List (java.util.List) 1, Collectors (java.util.stream.Collectors) 1, Stream (java.util.stream.Stream) 1, WordGenerator (zemberek.morphology.generator.WordGenerator) 1, Token (zemberek.tokenization.Token) 1