Search in sources :

Example 51 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class SubjectVerbAgreementRule method getPluralMatchOrNull.

/**
 * Builds a rule match suggesting the singular form when {@code tokenStr} is one of the
 * known plural verb forms and the token before it looks like a singular nominative subject.
 *
 * @param tokens   the tokens of the sentence being checked
 * @param i        index of {@code token} within {@code tokens}; {@code tokens[i - 1]} is read,
 *                 so callers are expected to pass an index of at least 1
 * @param token    the token at which the match is reported (its start/end positions are used)
 * @param tokenStr the string looked up in {@code plural} and passed to {@code getSingularFor}
 * @return a match carrying a singular suggestion, or {@code null} if any condition fails
 */
@Nullable
private RuleMatch getPluralMatchOrNull(AnalyzedTokenReadings[] tokens, int i, AnalyzedTokenReadings token, String tokenStr) {
    if (!plural.contains(tokenStr)) {
        return null;
    }
    AnalyzedTokenReadings previous = tokens[i - 1];
    List<ChunkTag> previousChunks = previous.getChunkTags();
    // the preceding token must carry the NPS chunk tag, and neither NPP nor PP
    if (!previousChunks.contains(NPS) || previousChunks.contains(NPP) || previousChunks.contains(PP)) {
        return null;
    }
    if (isCurrency(previous) || !prevChunkIsNominative(tokens, i - 1)) {
        return null;
    }
    // unknown words on either side make the analysis unreliable
    if (hasUnknownTokenToTheLeft(tokens, i) || hasUnknownTokenToTheRight(tokens, i + 1)) {
        return null;
    }
    // e.g. "Die Zielgruppe sind Männer." - both are nominative, but 'Männer' is the subject
    if (isFollowedByNominativePlural(tokens, i + 1)) {
        return null;
    }
    String message = "Bitte prüfen, ob hier <suggestion>" + getSingularFor(tokenStr) + "</suggestion> stehen sollte.";
    return new RuleMatch(this, token.getStartPos(), token.getEndPos(), message);
}
Also used : ChunkTag(org.languagetool.chunking.ChunkTag) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) Nullable(org.jetbrains.annotations.Nullable)

Example 52 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class UppercaseNounReadingFilter method acceptRuleMatch.

/**
 * Accepts the given match only if the filter's {@code token} argument, with its first
 * character uppercased, has at least one reading tagged with a part-of-speech tag
 * containing {@code "SUB:"} and none containing {@code "ADJ"}.
 *
 * @param match         the candidate match, returned unchanged when accepted
 * @param arguments     filter arguments; must contain the key {@code "token"}
 * @param patternTokens not read by this filter
 * @return {@code match} if such a reading exists, otherwise {@code null}
 * @throws RuntimeException if the {@code "token"} argument is missing, or wrapping an
 *                          {@link IOException} raised while tagging
 */
@Nullable
@Override
public RuleMatch acceptRuleMatch(RuleMatch match, Map<String, String> arguments, AnalyzedTokenReadings[] patternTokens) {
    String token = arguments.get("token");
    if (token == null) {
        throw new RuntimeException("Set 'token' for filter " + UppercaseNounReadingFilter.class.getName() + " in rule " + match.getRule().getId());
    }
    try {
        List<String> capitalized = Collections.singletonList(StringTools.uppercaseFirstChar(token));
        for (AnalyzedTokenReadings reading : tagger.tag(capitalized)) {
            boolean nounOnly = reading.hasPartialPosTag("SUB:") && !reading.hasPartialPosTag("ADJ");
            if (nounOnly) {
                // first qualifying reading is enough to accept the match
                return match;
            }
        }
        return null;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Also used : IOException(java.io.IOException) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) Nullable(org.jetbrains.annotations.Nullable)

Example 53 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class VerbAgreementRule method match.

/**
 * Looks for disagreement between the personal pronouns "ich", "du", "er", "wir" and
 * the verbs of a sentence, and returns one rule match per problem found.
 *
 * <p>The method first scans all non-immunized tokens and records the index of each of the
 * four pronouns and of verbs with specific person/number tags. Each index is overwritten
 * on every hit, so only the last occurrence in the sentence is remembered. It then runs
 * one check per pronoun: a verb that is unambiguously of a given person/number without
 * the matching pronoun anywhere in the sentence, or a pronoun with no matching verb
 * "near" it (as decided by {@code isNear}).
 *
 * @param sentence the sentence to check; its tokens are taken from
 *                 {@code getSentenceWithImmunization(sentence)} without whitespace tokens
 * @param pos      passed through unchanged to {@code ruleMatchWrongVerb} and
 *                 {@code ruleMatchWrongVerbSubject}; presumably a text offset for the
 *                 sentence - TODO confirm against those helpers
 * @return the matches found; empty if there are fewer than four tokens or nothing to report
 */
private List<RuleMatch> match(AnalyzedSentence sentence, int pos) {
    // NOTE(review): finiteVerb is never reassigned in this method, so null is always
    // what gets passed to verbDoesMatchPersonAndNumber below.
    AnalyzedTokenReadings finiteVerb = null;
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace();
    if (tokens.length < 4) {
        // ignore one-word sentences (3 tokens: SENT_START, one word, SENT_END)
        return ruleMatches;
    }
    // position of the pronouns:
    int posIch = -1;
    int posDu = -1;
    int posEr = -1;
    int posWir = -1;
    // positions of verbs which do match in person and number, and do not match any other person nor number:
    int posVer1Sin = -1;
    int posVer2Sin = -1;
    int posVer1Plu = -1;
    /*int posVer2Plu = -1;*/
    // positions of verbs which do match in person and number:
    int posPossibleVer1Sin = -1;
    int posPossibleVer2Sin = -1;
    int posPossibleVer3Sin = -1;
    int posPossibleVer1Plu = -1;
    // start at 1: index 0 is skipped (the early-return comment above names it SENT_START)
    for (int i = 1; i < tokens.length; ++i) {
        if (tokens[i].isImmunized()) {
            continue;
        }
        // NOTE(review): toLowerCase() without an explicit Locale depends on the JVM default locale
        String strToken = tokens[i].getToken().toLowerCase();
        // drop any '‚' characters from the token before comparing it to the pronouns
        strToken = strToken.replace("‚", "");
        switch(strToken) {
            case "ich":
                posIch = i;
                break;
            case "du":
                posDu = i;
                break;
            case "er":
                posEr = i;
                break;
            case "wir":
                posWir = i;
                break;
        }
        // only consider verb readings that are written in lowercase, or that are the first word
        if (tokens[i].hasPartialPosTag("VER") && (Character.isLowerCase(tokens[i].getToken().charAt(0)) || i == 1)) {
            // "bin" is not counted when the previous token is in BIN_IGNORE or the next token starts with "Laden"
            if (hasUnambiguouslyPersonAndNumber(tokens[i], "1", "SIN") && !(strToken.equals("bin") && (BIN_IGNORE.contains(tokens[i - 1].getToken()) || (tokens.length != i + 1 && tokens[i + 1].getToken().startsWith("Laden"))))) {
                posVer1Sin = i;
            } else if (hasUnambiguouslyPersonAndNumber(tokens[i], "2", "SIN") && !"Probst".equals(tokens[i].getToken())) {
                posVer2Sin = i;
            } else if (hasUnambiguouslyPersonAndNumber(tokens[i], "1", "PLU")) {
                posVer1Plu = i;
            //      } else if (hasUnambiguouslyPersonAndNumber(tokens[i], "2", "PLU")) {
            //        posVer2Plu = i;
            }
            // independent of the chain above: remember every verb that has the reading at all
            if (tokens[i].hasPartialPosTag(":1:SIN")) {
                posPossibleVer1Sin = i;
            }
            if (tokens[i].hasPartialPosTag(":2:SIN")) {
                posPossibleVer2Sin = i;
            }
            if (tokens[i].hasPartialPosTag(":3:SIN")) {
                posPossibleVer3Sin = i;
            }
            if (tokens[i].hasPartialPosTag(":1:PLU")) {
                posPossibleVer1Plu = i;
            }
        //      if (tokens[i].hasPartialPosTag(":2:PLU"))
        //        posPossibleVer2Plu = i;
        }
    }
    if (posVer1Sin != -1 && posIch == -1 && !isQuotationMark(tokens[posVer1Sin - 1])) {
        // 1st pers sg verb but no "ich"
        ruleMatches.add(ruleMatchWrongVerb(tokens[posVer1Sin], pos));
    } else if (// check whether verb next to "ich" is 1st pers sg
    posIch > 0 && !isNear(posPossibleVer1Sin, posIch) && // ignore "lyrisches Ich" etc.
    (tokens[posIch].getToken().equals("ich") || tokens[posIch].getStartPos() == 0) && !isQuotationMark(tokens[posIch - 1])) {
        // avoid indexing past the end of the array: if "ich" is the last token, use "ich" itself as the following token
        int plus1 = ((posIch + 1) == tokens.length) ? 0 : +1;
        BooleanAndFiniteVerb check = verbDoesMatchPersonAndNumber(tokens[posIch - 1], tokens[posIch + plus1], "1", "SIN", finiteVerb);
        if (!check.verbDoesMatchPersonAndNumber) {
            // NOTE(review): check.finiteVerb is dereferenced here without a null check - confirm it cannot be null
            if (!nextButOneIsModal(tokens, posIch) && !"äußerst".equals(check.finiteVerb.getToken())) {
                ruleMatches.add(ruleMatchWrongVerbSubject(tokens[posIch], check.finiteVerb, "1:SIN", pos));
            }
        }
    }
    // same two-way check for 2nd person singular / "du"
    if (posVer2Sin != -1 && posDu == -1 && !isQuotationMark(tokens[posVer2Sin - 1])) {
        ruleMatches.add(ruleMatchWrongVerb(tokens[posVer2Sin], pos));
    } else if (posDu > 0 && !isNear(posPossibleVer2Sin, posDu) && !isQuotationMark(tokens[posDu - 1])) {
        int plus1 = ((posDu + 1) == tokens.length) ? 0 : +1;
        BooleanAndFiniteVerb check = verbDoesMatchPersonAndNumber(tokens[posDu - 1], tokens[posDu + plus1], "2", "SIN", finiteVerb);
        if (!check.verbDoesMatchPersonAndNumber && // "Wenn ich du wäre"
        !tokens[posDu + plus1].hasPartialPosTag("VER:1:SIN:KJ2") && // "dass du  billige Klamotten..."
        !tokens[posDu + plus1].hasPartialPosTag("ADJ:") && !tokens[posDu - 1].hasPartialPosTag("VER:1:SIN:KJ2")) {
            if (!nextButOneIsModal(tokens, posDu)) {
                ruleMatches.add(ruleMatchWrongVerbSubject(tokens[posDu], check.finiteVerb, "2:SIN", pos));
            }
        }
    }
    // 3rd person singular / "er": only the pronoun-side check exists, there is no "verb without er" branch
    if (posEr > 0 && !isNear(posPossibleVer3Sin, posEr) && !isQuotationMark(tokens[posEr - 1])) {
        int plus1 = ((posEr + 1) == tokens.length) ? 0 : +1;
        BooleanAndFiniteVerb check = verbDoesMatchPersonAndNumber(tokens[posEr - 1], tokens[posEr + plus1], "3", "SIN", finiteVerb);
        if (!check.verbDoesMatchPersonAndNumber && !nextButOneIsModal(tokens, posEr) && !"äußerst".equals(check.finiteVerb.getToken()) && !"regen".equals(check.finiteVerb.getToken())) {
            // "wo er regen Anteil nahm"
            ruleMatches.add(ruleMatchWrongVerbSubject(tokens[posEr], check.finiteVerb, "3:SIN", pos));
        }
    }
    // same two-way check for 1st person plural / "wir"
    if (posVer1Plu != -1 && posWir == -1 && !isQuotationMark(tokens[posVer1Plu - 1])) {
        ruleMatches.add(ruleMatchWrongVerb(tokens[posVer1Plu], pos));
    } else if (posWir > 0 && !isNear(posPossibleVer1Plu, posWir) && !isQuotationMark(tokens[posWir - 1])) {
        int plus1 = ((posWir + 1) == tokens.length) ? 0 : +1;
        BooleanAndFiniteVerb check = verbDoesMatchPersonAndNumber(tokens[posWir - 1], tokens[posWir + plus1], "1", "PLU", finiteVerb);
        if (!check.verbDoesMatchPersonAndNumber && !nextButOneIsModal(tokens, posWir)) {
            ruleMatches.add(ruleMatchWrongVerbSubject(tokens[posWir], check.finiteVerb, "1:PLU", pos));
        }
    }
    return ruleMatches;
}
Also used : ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 54 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class GermanTagger method tag.

/**
 * Tags each word of a tokenized sentence and returns one {@code AnalyzedTokenReadings}
 * per input word, in input order.
 *
 * <p>Words the word tagger does not know are split with {@code compoundTokenizer}; if that
 * yields more than one part, the last part is tagged instead. Words that still cannot be
 * tagged get the token returned by {@code getNoInfoToken}.
 *
 * @param sentenceTokens the words to tag
 * @param ignoreCase     if {@code true}, the lowercased form is also looked up for words at
 *                       the start of the sentence
 * @return the readings for each word, each created with the running sum of the lengths of
 *         the preceding words as its position
 * @throws IOException declared by the method; presumably raised by initialization or
 *                     the tagger lookups - TODO confirm
 */
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens, boolean ignoreCase) throws IOException {
    initializeIfRequired();
    List<AnalyzedTokenReadings> result = new ArrayList<>();
    boolean atSentenceStart = true;
    int offset = 0;
    for (String word : sentenceTokens) {
        List<TaggedWord> tagged = getWordTagger().tag(word);
        if (atSentenceStart && tagged.isEmpty() && ignoreCase) {
            // e.g. "Das" -> "das" at start of sentence
            tagged = getWordTagger().tag(word.toLowerCase());
            // stay in "sentence start" mode only while the word is empty or a single non-word character
            atSentenceStart = word.matches("^\\W?$");
        } else if (offset == 0 && ignoreCase) {
            // "Haben", "Sollen", "Können", "Gerade" etc. at start of sentence
            tagged.addAll(getWordTagger().tag(word.toLowerCase()));
        }
        List<AnalyzedToken> readings = new ArrayList<>();
        if (!tagged.isEmpty()) {
            readings.addAll(getAnalyzedTokens(tagged, word));
        } else if (StringTools.isEmpty(word.trim())) {
            readings.add(getNoInfoToken(word));
        } else {
            // word not known, try to decompose it and use the last part for POS tagging:
            List<String> parts = compoundTokenizer.tokenize(word);
            if (parts.size() > 1) {
                // last part governs a word's POS:
                String head = parts.get(parts.size() - 1);
                if (StringTools.startsWithUppercase(word)) {
                    head = StringTools.uppercaseFirstChar(head);
                }
                List<TaggedWord> headTagged = getWordTagger().tag(head);
                if (headTagged.isEmpty()) {
                    readings.add(getNoInfoToken(word));
                } else {
                    readings.addAll(getAnalyzedTokens(headTagged, word, parts));
                }
            } else {
                readings.add(getNoInfoToken(word));
            }
        }
        AnalyzedToken[] readingArray = readings.toArray(new AnalyzedToken[readings.size()]);
        result.add(new AnalyzedTokenReadings(readingArray, offset));
        offset += word.length();
    }
    return result;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) TaggedWord(org.languagetool.tagging.TaggedWord) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 55 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class GermanChunkerTest method assertBasicChunks.

/**
 * Analyzes the plain-text form of {@code input}, runs the chunker's basic chunking on the
 * resulting non-whitespace tokens and hands the result to {@code assertChunks} together
 * with the chunks expected for {@code input}.
 *
 * @param input the test input; {@code getPlainInput} and {@code getExpectedChunks} derive
 *              the plain sentence and the expected chunks from it
 * @throws Exception declared by the method; propagated from the helpers it calls
 */
private void assertBasicChunks(String input) throws Exception {
    String plain = getPlainInput(input);
    AnalyzedTokenReadings[] tokens = lt.getAnalyzedSentence(plain).getTokensWithoutWhitespace();
    List<ChunkTaggedToken> actualChunks = chunker.getBasicChunks(Arrays.asList(tokens));
    assertChunks(input, plain, actualChunks, getExpectedChunks(input));
}
Also used : AnalyzedSentence(org.languagetool.AnalyzedSentence) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Aggregations

AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings): 116, AnalyzedToken (org.languagetool.AnalyzedToken): 48, ArrayList (java.util.ArrayList): 47, AnalyzedSentence (org.languagetool.AnalyzedSentence): 21, Test (org.junit.Test): 16, RuleMatch (org.languagetool.rules.RuleMatch): 14, Matcher (java.util.regex.Matcher): 13, IOException (java.io.IOException): 7, Nullable (org.jetbrains.annotations.Nullable): 6, JLanguageTool (org.languagetool.JLanguageTool): 6, Pattern (java.util.regex.Pattern): 5, ChunkTag (org.languagetool.chunking.ChunkTag): 5, English (org.languagetool.language.English): 3, TaggedWord (org.languagetool.tagging.TaggedWord): 3, InputStream (java.io.InputStream): 2, HashMap (java.util.HashMap): 2, List (java.util.List): 2, Scanner (java.util.Scanner): 2, TreeSet (java.util.TreeSet): 2, ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 2