Search in sources :

Example 1 with ChunkTag

use of org.languagetool.chunking.ChunkTag in project languagetool by languagetool-org.

The following example is taken from the class SubjectVerbAgreementRule, method getPluralMatchOrNull.

@Nullable
private RuleMatch getPluralMatchOrNull(AnalyzedTokenReadings[] tokens, int i, AnalyzedTokenReadings token, String tokenStr) {
    if (plural.contains(tokenStr)) {
        AnalyzedTokenReadings prevToken = tokens[i - 1];
        List<ChunkTag> prevChunkTags = prevToken.getChunkTags();
        boolean match = prevChunkTags.contains(NPS) && !prevChunkTags.contains(NPP) && !prevChunkTags.contains(PP) && !isCurrency(prevToken) && prevChunkIsNominative(tokens, i - 1) && !hasUnknownTokenToTheLeft(tokens, i) && !hasUnknownTokenToTheRight(tokens, i + 1) && // z.B. "Die Zielgruppe sind Männer." - beides Nominativ, aber 'Männer' ist das Subjekt
        !isFollowedByNominativePlural(tokens, i + 1);
        if (match) {
            String message = "Bitte prüfen, ob hier <suggestion>" + getSingularFor(tokenStr) + "</suggestion> stehen sollte.";
            return new RuleMatch(this, token.getStartPos(), token.getEndPos(), message);
        }
    }
    return null;
}
Also used : ChunkTag(org.languagetool.chunking.ChunkTag) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) Nullable(org.jetbrains.annotations.Nullable)

Example 2 with ChunkTag

use of org.languagetool.chunking.ChunkTag in project languagetool by languagetool-org.

The following example is taken from the class SubjectVerbAgreementRule, method getSingularMatchOrNull.

@Nullable
private RuleMatch getSingularMatchOrNull(AnalyzedTokenReadings[] tokens, int i, AnalyzedTokenReadings token, String tokenStr) throws IOException {
    if (singular.contains(tokenStr)) {
        AnalyzedTokenReadings prevToken = tokens[i - 1];
        AnalyzedTokenReadings nextToken = i + 1 < tokens.length ? tokens[i + 1] : null;
        List<ChunkTag> prevChunkTags = prevToken.getChunkTags();
        boolean match = prevChunkTags.contains(NPP) && !prevChunkTags.contains(PP) && // 'um 18 Uhr ist Feierabend'
        !prevToken.getToken().equals("Uhr") && !isCurrency(prevToken) && // 'zehn Jahre ist es her'
        !(nextToken != null && nextToken.getToken().equals("es")) && prevChunkIsNominative(tokens, i - 1) && !hasUnknownTokenToTheLeft(tokens, i) && !hasQuestionPronounToTheLeft(tokens, i - 1) && !containsRegexToTheLeft("wer", tokens, i - 1) && !containsRegexToTheLeft("(?i)alle[nr]?", tokens, i - 1) && !containsRegexToTheLeft("(?i)jede[rs]?", tokens, i - 1) && !containsRegexToTheLeft("(?i)manche[nrs]?", tokens, i - 1) && !containsOnlyInfinitivesToTheLeft(tokens, i - 1);
        if (match) {
            String message = "Bitte prüfen, ob hier <suggestion>" + getPluralFor(tokenStr) + "</suggestion> stehen sollte.";
            return new RuleMatch(this, token.getStartPos(), token.getEndPos(), message);
        }
    }
    return null;
}
Also used : ChunkTag(org.languagetool.chunking.ChunkTag) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) Nullable(org.jetbrains.annotations.Nullable)

Example 3 with ChunkTag

use of org.languagetool.chunking.ChunkTag in project languagetool by languagetool-org.

The following example is taken from the class DisambiguationPatternRuleReplacer, method executeAction.

private AnalyzedTokenReadings[] executeAction(AnalyzedSentence sentence, AnalyzedTokenReadings[] whiteTokens, AnalyzedTokenReadings[] unifiedTokens, int firstMatchToken, int lastMatchToken, int matchingTokens, int[] tokenPositions) {
    AnalyzedTokenReadings[] whTokens = whiteTokens.clone();
    DisambiguationPatternRule rule = (DisambiguationPatternRule) this.rule;
    int correctedStPos = 0;
    int startPositionCorrection = rule.getStartPositionCorrection();
    int endPositionCorrection = rule.getEndPositionCorrection();
    int matchingTokensWithCorrection = matchingTokens;
    List<Integer> tokenPositionList = new ArrayList<>();
    for (int i : tokenPositions) {
        tokenPositionList.add(i);
    }
    if (startPositionCorrection > 0) {
        //token positions are shifted by 1
        correctedStPos--;
        for (int j = 0; j < pTokensMatched.size(); j++) {
            if (!pTokensMatched.get(j)) {
                // add zero-length token corresponding to the non-matching pattern element so that position count is fine
                tokenPositionList.add(j, 0);
            }
        }
        for (int l = 0; l <= startPositionCorrection && tokenPositionList.size() > l; l++) {
            correctedStPos += tokenPositionList.get(l);
        }
        // adjust to make sure the token count is fine as it's checked later
        int w = startPositionCorrection;
        for (int j = 0; j <= w; j++) {
            if (j < pTokensMatched.size() && !pTokensMatched.get(j)) {
                startPositionCorrection--;
            }
        }
    }
    if (endPositionCorrection < 0) {
        // adjust the end position correction if one of the elements has not been matched
        for (int d = startPositionCorrection; d < pTokensMatched.size(); d++) {
            if (!pTokensMatched.get(d)) {
                endPositionCorrection++;
            }
        }
    }
    if (lastMatchToken != -1) {
        int maxPosCorrection = Math.max((lastMatchToken + 1 - (firstMatchToken + correctedStPos)) - matchingTokens, 0);
        matchingTokensWithCorrection += maxPosCorrection;
    }
    int fromPos = sentence.getOriginalPosition(firstMatchToken + correctedStPos);
    boolean spaceBefore = whTokens[fromPos].isWhitespaceBefore();
    DisambiguationPatternRule.DisambiguatorAction disAction = rule.getAction();
    AnalyzedToken[] newTokenReadings = rule.getNewTokenReadings();
    Match matchElement = rule.getMatchElement();
    String disambiguatedPOS = rule.getDisambiguatedPOS();
    switch(disAction) {
        case UNIFY:
            if (unifiedTokens != null) {
                //TODO: unifiedTokens.length is larger > matchingTokensWithCorrection in cases where there are no markers...
                if (unifiedTokens.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
                    if (whTokens[sentence.getOriginalPosition(firstMatchToken + correctedStPos + unifiedTokens.length - 1)].isSentenceEnd()) {
                        unifiedTokens[unifiedTokens.length - 1].setSentEnd();
                    }
                    for (int i = 0; i < unifiedTokens.length; i++) {
                        int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
                        unifiedTokens[i].setStartPos(whTokens[position].getStartPos());
                        String prevValue = whTokens[position].toString();
                        String prevAnot = whTokens[position].getHistoricalAnnotations();
                        List<ChunkTag> chTags = whTokens[position].getChunkTags();
                        whTokens[position] = unifiedTokens[i];
                        whTokens[position].setChunkTags(chTags);
                        annotateChange(whTokens[position], prevValue, prevAnot);
                    }
                }
            }
            break;
        case REMOVE:
            if (newTokenReadings != null && newTokenReadings.length > 0) {
                if (newTokenReadings.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
                    for (int i = 0; i < newTokenReadings.length; i++) {
                        int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
                        String prevValue = whTokens[position].toString();
                        String prevAnot = whTokens[position].getHistoricalAnnotations();
                        whTokens[position].removeReading(newTokenReadings[i]);
                        annotateChange(whTokens[position], prevValue, prevAnot);
                    }
                }
            } else if (!StringTools.isEmpty(disambiguatedPOS)) {
                // negative filtering
                Pattern p = Pattern.compile(disambiguatedPOS);
                AnalyzedTokenReadings tmp = new AnalyzedTokenReadings(whTokens[fromPos].getReadings(), whTokens[fromPos].getStartPos());
                for (AnalyzedToken analyzedToken : tmp) {
                    if (analyzedToken.getPOSTag() != null) {
                        Matcher mPos = p.matcher(analyzedToken.getPOSTag());
                        if (mPos.matches()) {
                            int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos);
                            String prevValue = whTokens[position].toString();
                            String prevAnot = whTokens[position].getHistoricalAnnotations();
                            whTokens[position].removeReading(analyzedToken);
                            annotateChange(whTokens[position], prevValue, prevAnot);
                        }
                    }
                }
            }
            break;
        case ADD:
            if (newTokenReadings != null) {
                if (newTokenReadings.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
                    for (int i = 0; i < newTokenReadings.length; i++) {
                        String token;
                        int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
                        if (newTokenReadings[i].getToken().isEmpty()) {
                            token = whTokens[position].getToken();
                        } else {
                            token = newTokenReadings[i].getToken();
                        }
                        String lemma;
                        if (newTokenReadings[i].getLemma() == null) {
                            lemma = token;
                        } else {
                            lemma = newTokenReadings[i].getLemma();
                        }
                        AnalyzedToken newTok = new AnalyzedToken(token, newTokenReadings[i].getPOSTag(), lemma);
                        String prevValue = whTokens[position].toString();
                        String prevAnot = whTokens[position].getHistoricalAnnotations();
                        whTokens[position].addReading(newTok);
                        annotateChange(whTokens[position], prevValue, prevAnot);
                    }
                }
            }
            break;
        case FILTERALL:
            for (int i = 0; i < matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection; i++) {
                int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
                PatternToken pToken;
                if (pTokensMatched.get(i + startPositionCorrection)) {
                    pToken = rule.getPatternTokens().get(i + startPositionCorrection);
                } else {
                    int k = 1;
                    while (i + startPositionCorrection + k < rule.getPatternTokens().size() + endPositionCorrection && !pTokensMatched.get(i + startPositionCorrection + k)) {
                        k++;
                    }
                    pToken = rule.getPatternTokens().get(i + k + startPositionCorrection);
                }
                Match tmpMatchToken = new Match(pToken.getPOStag(), null, true, pToken.getPOStag(), null, Match.CaseConversion.NONE, false, false, Match.IncludeRange.NONE);
                MatchState matchState = tmpMatchToken.createState(rule.getLanguage().getSynthesizer(), whTokens[position]);
                String prevValue = whTokens[position].toString();
                String prevAnot = whTokens[position].getHistoricalAnnotations();
                whTokens[position] = matchState.filterReadings();
                annotateChange(whTokens[position], prevValue, prevAnot);
            }
            break;
        case IMMUNIZE:
            for (int i = 0; i < matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection; i++) {
                whTokens[sentence.getOriginalPosition(firstMatchToken + correctedStPos + i)].immunize();
            }
            break;
        case IGNORE_SPELLING:
            for (int i = 0; i < matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection; i++) {
                whTokens[sentence.getOriginalPosition(firstMatchToken + correctedStPos + i)].ignoreSpelling();
            }
            break;
        case FILTER:
            if (matchElement == null) {
                // same as REPLACE if using <match>
                Match tmpMatchToken = new Match(disambiguatedPOS, null, true, disambiguatedPOS, null, Match.CaseConversion.NONE, false, false, Match.IncludeRange.NONE);
                boolean newPOSmatches = false;
                // only apply filter rule when it matches previous tags:
                for (int i = 0; i < whTokens[fromPos].getReadingsLength(); i++) {
                    if (!whTokens[fromPos].getAnalyzedToken(i).hasNoTag() && whTokens[fromPos].getAnalyzedToken(i).getPOSTag() != null && whTokens[fromPos].getAnalyzedToken(i).getPOSTag().matches(disambiguatedPOS)) {
                        newPOSmatches = true;
                        break;
                    }
                }
                if (newPOSmatches) {
                    MatchState matchState = tmpMatchToken.createState(rule.getLanguage().getSynthesizer(), whTokens[fromPos]);
                    String prevValue = whTokens[fromPos].toString();
                    String prevAnot = whTokens[fromPos].getHistoricalAnnotations();
                    whTokens[fromPos] = matchState.filterReadings();
                    annotateChange(whTokens[fromPos], prevValue, prevAnot);
                }
                break;
            }
        //fallthrough
        case REPLACE:
        default:
            if (newTokenReadings != null && newTokenReadings.length > 0) {
                if (newTokenReadings.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
                    for (int i = 0; i < newTokenReadings.length; i++) {
                        String token;
                        int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
                        if ("".equals(newTokenReadings[i].getToken())) {
                            // empty token
                            token = whTokens[position].getToken();
                        } else {
                            token = newTokenReadings[i].getToken();
                        }
                        String lemma;
                        if (newTokenReadings[i].getLemma() == null) {
                            // empty lemma
                            lemma = token;
                        } else {
                            lemma = newTokenReadings[i].getLemma();
                        }
                        AnalyzedToken analyzedToken = new AnalyzedToken(token, newTokenReadings[i].getPOSTag(), lemma);
                        AnalyzedTokenReadings toReplace = new AnalyzedTokenReadings(analyzedToken, whTokens[fromPos].getStartPos());
                        whTokens[position] = replaceTokens(whTokens[position], toReplace);
                    }
                }
            } else if (matchElement == null) {
                String lemma = "";
                for (AnalyzedToken analyzedToken : whTokens[fromPos]) {
                    if (analyzedToken.getPOSTag() != null && analyzedToken.getPOSTag().equals(disambiguatedPOS) && analyzedToken.getLemma() != null) {
                        lemma = analyzedToken.getLemma();
                    }
                }
                if (StringTools.isEmpty(lemma)) {
                    lemma = whTokens[fromPos].getAnalyzedToken(0).getLemma();
                }
                AnalyzedToken analyzedToken = new AnalyzedToken(whTokens[fromPos].getToken(), disambiguatedPOS, lemma);
                AnalyzedTokenReadings toReplace = new AnalyzedTokenReadings(analyzedToken, whTokens[fromPos].getStartPos());
                whTokens[fromPos] = replaceTokens(whTokens[fromPos], toReplace);
            } else {
                // using the match element
                MatchState matchElementState = matchElement.createState(rule.getLanguage().getSynthesizer(), whTokens[fromPos]);
                String prevValue = whTokens[fromPos].toString();
                String prevAnot = whTokens[fromPos].getHistoricalAnnotations();
                whTokens[fromPos] = matchElementState.filterReadings();
                whTokens[fromPos].setWhitespaceBefore(spaceBefore);
                annotateChange(whTokens[fromPos], prevValue, prevAnot);
            }
    }
    return whTokens;
}
Also used : Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) ChunkTag(org.languagetool.chunking.ChunkTag) AnalyzedToken(org.languagetool.AnalyzedToken)

Example 4 with ChunkTag

use of org.languagetool.chunking.ChunkTag in project languagetool by languagetool-org.

The following example is taken from the class CatalanTagger, method tag.

@Override
public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) throws IOException {
    final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
    int pos = 0;
    final IStemmer dictLookup = new DictionaryLookup(getDictionary());
    for (String word : sentenceTokens) {
        // This hack allows all rules and dictionary entries to work with
        // typewriter apostrophe
        boolean containsTypewriterApostrophe = false;
        if (word.length() > 1) {
            if (word.contains("'")) {
                containsTypewriterApostrophe = true;
            }
            word = word.replace("’", "'");
        }
        final List<AnalyzedToken> l = new ArrayList<>();
        final String lowerWord = word.toLowerCase(conversionLocale);
        final boolean isLowercase = word.equals(lowerWord);
        final boolean isMixedCase = StringTools.isMixedCase(word);
        List<AnalyzedToken> taggerTokens = asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(word));
        // normal case:
        addTokens(taggerTokens, l);
        // word with lowercase word tags:
        if (!isLowercase && !isMixedCase) {
            List<AnalyzedToken> lowerTaggerTokens = asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(lowerWord));
            addTokens(lowerTaggerTokens, l);
        }
        // additional tagging with prefixes
        if (l.isEmpty() && !isMixedCase) {
            addTokens(additionalTags(word, dictLookup), l);
        }
        if (l.isEmpty()) {
            l.add(new AnalyzedToken(word, null, null));
        }
        AnalyzedTokenReadings atr = new AnalyzedTokenReadings(l, pos);
        if (containsTypewriterApostrophe) {
            List<ChunkTag> listChunkTags = new ArrayList<>();
            listChunkTags.add(new ChunkTag("containsTypewriterApostrophe"));
            atr.setChunkTags(listChunkTags);
        }
        tokenReadings.add(atr);
        pos += word.length();
    }
    return tokenReadings;
}
Also used : ChunkTag(org.languagetool.chunking.ChunkTag) AnalyzedToken(org.languagetool.AnalyzedToken) IStemmer(morfologik.stemming.IStemmer) ArrayList(java.util.ArrayList) DictionaryLookup(morfologik.stemming.DictionaryLookup) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 5 with ChunkTag

use of org.languagetool.chunking.ChunkTag in project languagetool by languagetool-org.

The following example is taken from the class XMLRuleHandler, method setToken.

protected void setToken(Attributes attrs) {
    inToken = true;
    if (lastPhrase) {
        patternTokens.clear();
    }
    lastPhrase = false;
    tokenNegated = YES.equals(attrs.getValue(NEGATE));
    tokenInflected = YES.equals(attrs.getValue(INFLECTED));
    if (attrs.getValue(SKIP) != null) {
        skipPos = Integer.parseInt(attrs.getValue(SKIP));
    }
    if (attrs.getValue(MIN) != null) {
        minOccurrence = Integer.parseInt(attrs.getValue(MIN));
    }
    if (attrs.getValue(MAX) != null) {
        maxOccurrence = Integer.parseInt(attrs.getValue(MAX));
    }
    elements = new StringBuilder();
    // POSElement creation
    if (attrs.getValue(POSTAG) != null) {
        posToken = attrs.getValue(POSTAG);
        posRegExp = YES.equals(attrs.getValue(POSTAG_REGEXP));
        posNegation = YES.equals(attrs.getValue(NEGATE_POS));
    }
    if (attrs.getValue(CHUNKTAG) != null) {
        chunkTag = new ChunkTag(attrs.getValue(CHUNKTAG));
    }
    regExpression = YES.equals(attrs.getValue(REGEXP));
    if (attrs.getValue(SPACEBEFORE) != null) {
        tokenSpaceBefore = YES.equals(attrs.getValue(SPACEBEFORE));
        tokenSpaceBeforeSet = !IGNORE.equals(attrs.getValue(SPACEBEFORE));
    }
    if (!inAndGroup && !inOrGroup) {
        tokenCounter++;
    }
    if (attrs.getValue(CASE_SENSITIVE) != null) {
        tokenLevelCaseSet = true;
        tokenLevelCaseSensitive = YES.equals(attrs.getValue(CASE_SENSITIVE));
    } else {
        tokenLevelCaseSensitive = false;
        tokenLevelCaseSet = false;
    }
}
Also used : ChunkTag(org.languagetool.chunking.ChunkTag)

Aggregations

ChunkTag (org.languagetool.chunking.ChunkTag)8 AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)5 ArrayList (java.util.ArrayList)2 Nullable (org.jetbrains.annotations.Nullable)2 Test (org.junit.Test)2 AnalyzedToken (org.languagetool.AnalyzedToken)2 Matcher (java.util.regex.Matcher)1 Pattern (java.util.regex.Pattern)1 DictionaryLookup (morfologik.stemming.DictionaryLookup)1 IStemmer (morfologik.stemming.IStemmer)1 Demo (org.languagetool.language.Demo)1 IncorrectExample (org.languagetool.rules.IncorrectExample)1 Rule (org.languagetool.rules.Rule)1