Search in sources :

Example 21 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

the class Z3ModelA method getAmbiguousSequence.

/**
 * Converts an analyzed sentence into a sequence of {@code Ambiguous} words holding
 * vocabulary indexes.
 *
 * <p>The returned array starts with two {@code startWord} entries, followed by one
 * {@code Ambiguous} per sentence entry, and is terminated by a single {@code endWord}.
 * For every parse of an entry, the lemma of its dictionary item is looked up in the root
 * model vocabulary and every inflectional group (formatted without surface form) is
 * looked up in the inflectional-group model vocabulary.
 *
 * @param sentence analyzed sentence; assumed to yield exactly {@code sentence.size()}
 *     entries when iterated (the array is sized from that value)
 * @return array of length {@code sentence.size() + 3} as described above
 */
public Ambiguous[] getAmbiguousSequence(SentenceAnalysis sentence) {
    // Two leading start markers + one slot per entry + one trailing end marker.
    Ambiguous[] awords = new Ambiguous[sentence.size() + 3];
    awords[0] = startWord;
    awords[1] = startWord;
    int i = 2;
    for (SentenceAnalysis.Entry entry : sentence) {
        int[] roots = new int[entry.parses.size()];
        int[][] igs = new int[entry.parses.size()][];
        int j = 0;
        for (WordAnalysis parse : entry.parses) {
            String rootPart = parse.dictionaryItem.lemma;
            roots[j] = rootLm.getVocabulary().indexOf(rootPart);
            int igCount = parse.inflectionalGroups.size();
            igs[j] = new int[igCount];
            // The bound must be checked against k (the group index), not j (the parse
            // index); testing j either skipped the loop or ran k past the end.
            for (int k = 0; k < igCount; k++) {
                igs[j][k] = igLm.getVocabulary().indexOf(parse.inflectionalGroups.get(k).formatNoSurface());
            }
            j++;
        }
        awords[i] = new Ambiguous(roots, igs);
        i++;
    }
    awords[i] = endWord;
    return awords;
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis)

Example 22 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

the class TurkishSentenceAnalyzer method analyze.

/**
 * Morphologically analyzes every word of a sentence.
 *
 * <p>The sentence is first passed through {@code preProcess}, then split on single
 * spaces; each piece is trimmed and empty pieces are dropped. Every remaining word is
 * analyzed and stored together with its parses.
 *
 * @param sentence raw input sentence
 * @return analysis holding one entry (word plus its parses) per word
 */
public SentenceAnalysis analyze(String sentence) {
    SentenceAnalysis result = new SentenceAnalysis();
    String cleaned = preProcess(sentence);
    Splitter wordSplitter = Splitter.on(" ").omitEmptyStrings().trimResults();
    Iterable<String> words = wordSplitter.split(cleaned);
    for (String word : words) {
        result.addParse(word, turkishMorphology.analyze(word));
    }
    return result;
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis)

Example 23 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

the class TurkishSentenceAnalyzer method bestParse.

/**
 * Analyzes and disambiguates a sentence, then collects the first parse of each word.
 *
 * @param sentence sentence to analyze
 * @return list containing, for every entry of the disambiguated analysis, the parse at
 *     index 0, in sentence order
 */
public List<WordAnalysis> bestParse(String sentence) {
    SentenceAnalysis analysis = analyze(sentence);
    disambiguate(analysis);
    // Capacity hint only; the list is filled entry by entry below.
    List<WordAnalysis> firstParses = Lists.newArrayListWithCapacity(analysis.size());
    for (SentenceAnalysis.Entry wordEntry : analysis) {
        WordAnalysis top = wordEntry.parses.get(0);
        firstParses.add(top);
    }
    return firstParses;
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis)

Example 24 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

the class DisambiguateSentences method analyzeAndDisambiguate.

/**
 * Prints the analysis of a sentence to standard output twice: once as produced by the
 * analyzer and once after disambiguation has been applied to the same analysis object.
 *
 * @param sentence sentence to analyze and print
 */
void analyzeAndDisambiguate(String sentence) {
    System.out.println("Sentence  = " + sentence);
    SentenceAnalysis analysis = sentenceAnalyzer.analyze(sentence);

    // First report: the analysis exactly as returned by the analyzer.
    System.out.println("Before disambiguation.");
    writeParseResult(analysis);

    // Second report: the same object after disambiguate() has been called on it.
    System.out.println("\nAfter disambiguation.");
    sentenceAnalyzer.disambiguate(analysis);
    writeParseResult(analysis);
}
Also used : SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis)

Example 25 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

the class AutomaticLabelingExperiment method processContent.

/**
 * Normalizes a piece of text into a lower-cased, space-separated word string.
 *
 * <p>The content is tokenized; tokens typed as percent numeral, number, punctuation,
 * roman numeral, time, unknown word or unknown are dropped and the remaining token texts
 * are joined with single spaces. When {@code useRoots} is set, the joined text is
 * analyzed and disambiguated, and each word is replaced by the last lemma of its first
 * parse; words whose first parse is unknown keep their input form, and words whose first
 * parse has no lemmas are dropped. Finally apostrophes are removed and the text is
 * lower-cased with {@code Turkish.LOCALE}.
 *
 * @param analyzer analyzer used for analysis and disambiguation when {@code useRoots} is true
 * @param content text to process
 * @param useRoots whether to replace words with lemmas
 * @return the normalized text
 */
public String processContent(TurkishSentenceAnalyzer analyzer, String content, boolean useRoots) {
    List<Token> docTokens = lexer.tokenize(content);
    List<String> kept = new ArrayList<>(docTokens.size());
    for (Token token : docTokens) {
        int type = token.getType();
        boolean ignored = type == TurkishLexer.PercentNumeral
                || type == TurkishLexer.Number
                || type == TurkishLexer.Punctuation
                || type == TurkishLexer.RomanNumeral
                || type == TurkishLexer.Time
                || type == TurkishLexer.UnknownWord
                || type == TurkishLexer.Unknown;
        if (!ignored) {
            kept.add(token.getText());
        }
    }
    String text = String.join(" ", kept);
    if (useRoots) {
        SentenceAnalysis analysis = analyzer.analyze(text);
        analyzer.disambiguate(analysis);
        List<String> words = new ArrayList<>();
        for (SentenceAnalysis.Entry entry : analysis) {
            WordAnalysis first = entry.parses.get(0);
            if (first.isUnknown()) {
                // No usable parse: keep the word as it appeared in the input.
                words.add(entry.input);
            } else {
                List<String> lemmas = first.getLemmas();
                if (lemmas.size() != 0) {
                    words.add(lemmas.get(lemmas.size() - 1));
                }
            }
        }
        text = String.join(" ", words);
    }
    String withoutApostrophes = text.replaceAll("[']", "");
    return withoutApostrophes.toLowerCase(Turkish.LOCALE);
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) Token(org.antlr.v4.runtime.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis)

Aggregations

SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis) 35, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis) 22, SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis) 19, WordAnalysis (zemberek.morphology.analysis.WordAnalysis) 19, ArrayList (java.util.ArrayList) 13, TurkishMorphology (zemberek.morphology.TurkishMorphology) 10, PrintWriter (java.io.PrintWriter) 5, Histogram (zemberek.core.collections.Histogram) 5, Test (org.junit.Test) 4, Token (zemberek.tokenization.Token) 4, Stopwatch (com.google.common.base.Stopwatch) 3, IOException (java.io.IOException) 3, Path (java.nio.file.Path) 3, Ignore (org.junit.Ignore) 3, Log (zemberek.core.logging.Log) 3, Lists (com.google.common.collect.Lists) 2, Paths (java.nio.file.Paths) 2, Collections (java.util.Collections) 2, LinkedHashSet (java.util.LinkedHashSet) 2, List (java.util.List) 2