Example 31 with AnalyzedToken

Use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

From the class AbstractCompoundRule, the method match():

@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace();
    RuleMatch prevRuleMatch = null;
    Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<>(MAX_TERMS);
    for (int i = 0; i < tokens.length + MAX_TERMS - 1; i++) {
        AnalyzedTokenReadings token;
        // we need to extend the token list so we find matches at the end of the original list:
        if (i >= tokens.length) {
            token = new AnalyzedTokenReadings(new AnalyzedToken("", "", null), prevTokens.peek().getStartPos());
        } else {
            token = tokens[i];
        }
        if (i == 0) {
            addToQueue(token, prevTokens);
            continue;
        }
        if (token.isImmunized()) {
            continue;
        }
        AnalyzedTokenReadings firstMatchToken = prevTokens.peek();
        List<String> stringsToCheck = new ArrayList<>();
        // original upper/lowercase spelling
        List<String> origStringsToCheck = new ArrayList<>();
        Map<String, AnalyzedTokenReadings> stringToToken = getStringToTokenMap(prevTokens, stringsToCheck, origStringsToCheck);
        // iterate backwards so that longer strings are matched first:
        for (int k = stringsToCheck.size() - 1; k >= 0; k--) {
            String stringToCheck = stringsToCheck.get(k);
            String origStringToCheck = origStringsToCheck.get(k);
            if (getCompoundRuleData().getIncorrectCompounds().contains(stringToCheck)) {
                AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
                String msg = null;
                List<String> replacement = new ArrayList<>();
                if (!getCompoundRuleData().getNoDashSuggestion().contains(stringToCheck)) {
                    replacement.add(origStringToCheck.replace(' ', '-'));
                    msg = withHyphenMessage;
                }
                if (isNotAllUppercase(origStringToCheck) && !getCompoundRuleData().getOnlyDashSuggestion().contains(stringToCheck)) {
                    replacement.add(mergeCompound(origStringToCheck));
                    msg = withoutHyphenMessage;
                }
                String[] parts = stringToCheck.split(" ");
                if (parts.length > 0 && parts[0].length() == 1) {
                    replacement.clear();
                    replacement.add(origStringToCheck.replace(' ', '-'));
                    msg = withHyphenMessage;
                } else if (replacement.isEmpty() || replacement.size() == 2) {
                    // isEmpty shouldn't happen
                    msg = withOrWithoutHyphenMessage;
                }
                RuleMatch ruleMatch = new RuleMatch(this, firstMatchToken.getStartPos(), atr.getEndPos(), msg, shortDesc);
                ruleMatch.setSuggestedReplacements(replacement);
                // avoid duplicate matches:
                if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) {
                    prevRuleMatch = ruleMatch;
                    break;
                }
                prevRuleMatch = ruleMatch;
                ruleMatches.add(ruleMatch);
                break;
            }
        }
        addToQueue(token, prevTokens);
    }
    return toRuleMatchArray(ruleMatches);
}
Also used: AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings), AnalyzedToken (org.languagetool.AnalyzedToken), ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue)
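
As a hedged usage sketch (not part of the project sources), the following shows how matches produced by an AbstractCompoundRule subclass surface through the public JLanguageTool API. The GermanyGerman language choice and the sample text are illustrative assumptions; whether this particular text triggers a compound match depends on the active rules and compound data.

import java.util.List;

import org.languagetool.JLanguageTool;
import org.languagetool.language.GermanyGerman;
import org.languagetool.rules.RuleMatch;

public class CompoundRuleDemo {
    public static void main(String[] args) throws Exception {
        // check a text; compound rules such as the one above contribute RuleMatch objects
        JLanguageTool langTool = new JLanguageTool(new GermanyGerman());
        List<RuleMatch> matches = langTool.check("Die E Mail ist angekommen.");
        for (RuleMatch match : matches) {
            // each match carries the offending span, a message and suggested replacements,
            // i.e. the fields filled in by AbstractCompoundRule.match() above
            System.out.println(match.getFromPos() + "-" + match.getToPos()
                    + ": " + match.getSuggestedReplacements());
        }
    }
}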

Example 32 with AnalyzedToken

Use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

From the class AbstractSimpleReplaceRule, the method match():

@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    for (AnalyzedTokenReadings tokenReadings : tokens) {
        // skip the artificial sentence-start token (SENT_START)
        if (JLanguageTool.SENTENCE_START_TAGNAME.equals(tokenReadings.getAnalyzedToken(0).getPOSTag())) {
            continue;
        }
        // skip immunized tokens and tokens the spell checker is told to ignore
        if (tokenReadings.isImmunized() || tokenReadings.isIgnoredBySpeller()) {
            continue;
        }
        String originalTokenStr = tokenReadings.getToken();
        if (ignoreTaggedWords && isTagged(tokenReadings)) {
            continue;
        }
        String tokenString = cleanup(originalTokenStr);
        // try first with the original word, then with the all-lowercase version
        List<String> possibleReplacements = getWrongWords().get(originalTokenStr);
        if (possibleReplacements == null) {
            possibleReplacements = getWrongWords().get(tokenString);
        }
        if (possibleReplacements == null && checkLemmas) {
            possibleReplacements = new ArrayList<>();
            List<String> lemmas = new ArrayList<>();
            for (AnalyzedToken analyzedToken : tokenReadings.getReadings()) {
                String lemma = analyzedToken.getLemma();
                if (lemma != null && getWrongWords().containsKey(lemma) && !lemmas.contains(lemma)) {
                    lemmas.add(cleanup(lemma));
                }
            }
            for (String lemma : lemmas) {
                List<String> replacements = getWrongWords().get(lemma);
                if (replacements != null) {
                    possibleReplacements.addAll(replacements);
                }
            }
            possibleReplacements = possibleReplacements.stream().distinct().collect(Collectors.toList());
        }
        if (possibleReplacements != null && possibleReplacements.size() > 0) {
            List<String> replacements = new ArrayList<>();
            replacements.addAll(possibleReplacements);
            if (replacements.contains(originalTokenStr)) {
                replacements.remove(originalTokenStr);
            }
            if (replacements.size() > 0) {
                RuleMatch potentialRuleMatch = createRuleMatch(tokenReadings, replacements);
                ruleMatches.add(potentialRuleMatch);
            }
        }
    }
    return toRuleMatchArray(ruleMatches);
}
Also used: AnalyzedToken (org.languagetool.AnalyzedToken), ArrayList (java.util.ArrayList), AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)
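
A minimal sketch, assuming only that getWrongWords() behaves like a Map from an incorrect form to its replacements: it mirrors the lookup cascade above (original token first, then the cleaned-up token), using a plain HashMap and a lowercase stand-in for cleanup(). The names and map entries are illustrative, not project data.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ReplaceLookupDemo {
    public static void main(String[] args) {
        // stand-in for getWrongWords(): wrong form -> suggested replacements
        Map<String, List<String>> wrongWords = new HashMap<>();
        wrongWords.put("alot", Arrays.asList("a lot"));

        String originalTokenStr = "Alot";
        String tokenString = originalTokenStr.toLowerCase();  // stand-in for cleanup()

        // try the original spelling first, then the cleaned-up version
        List<String> possibleReplacements = wrongWords.get(originalTokenStr);
        if (possibleReplacements == null) {
            possibleReplacements = wrongWords.get(tokenString);
        }
        System.out.println(possibleReplacements);  // prints [a lot]
    }
}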

Example 33 with AnalyzedToken

Use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

From the class AbstractWordCoherencyRule, the method match():

@Override
public RuleMatch[] match(List<AnalyzedSentence> sentences) {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    // e.g. aufwändig -> RuleMatch of aufwendig
    Map<String, RuleMatch> shouldNotAppearWord = new HashMap<>();
    int pos = 0;
    for (AnalyzedSentence sentence : sentences) {
        AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
        for (AnalyzedTokenReadings tmpToken : tokens) {
            String token = tmpToken.getToken();
            List<AnalyzedToken> readings = tmpToken.getReadings();
            // TODO: in theory we need to care about the other readings, too (affects e.g. German "Schenke" as a noun):
            if (readings.size() > 0) {
                String baseform = readings.get(0).getLemma();
                if (baseform != null) {
                    token = baseform;
                }
            }
            if (shouldNotAppearWord.containsKey(token)) {
                RuleMatch otherMatch = shouldNotAppearWord.get(token);
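                // the message stored with the remembered match is the spelling seen earlier (set in the branch below)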
                String otherSpelling = otherMatch.getMessage();
                String msg = getMessage(token, otherSpelling);
                RuleMatch ruleMatch = new RuleMatch(this, pos + tmpToken.getStartPos(), pos + tmpToken.getEndPos(), msg);
                ruleMatch.setSuggestedReplacement(otherSpelling);
                ruleMatches.add(ruleMatch);
            } else if (getWordMap().containsKey(token)) {
                String shouldNotAppear = getWordMap().get(token);
                RuleMatch potentialRuleMatch = new RuleMatch(this, pos + tmpToken.getStartPos(), pos + tmpToken.getEndPos(), token);
                shouldNotAppearWord.put(shouldNotAppear, potentialRuleMatch);
            }
        }
        pos += sentence.getText().length();
    }
    return toRuleMatchArray(ruleMatches);
}
Also used: AnalyzedSentence (org.languagetool.AnalyzedSentence), AnalyzedToken (org.languagetool.AnalyzedToken), AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)
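
The coherency bookkeeping above can be illustrated with a self-contained sketch; the variant pair and the plain HashMap stand in for getWordMap() and are assumptions for illustration only.

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class CoherencyDemo {
    public static void main(String[] args) {
        // stand-in for getWordMap(): once the key is seen, the value should not appear later
        Map<String, String> wordMap = new HashMap<>();
        wordMap.put("aufwendig", "aufwändig");
        wordMap.put("aufwändig", "aufwendig");

        Set<String> shouldNotAppear = new HashSet<>();
        for (String token : Arrays.asList("aufwendig", "und", "aufwändig")) {
            if (shouldNotAppear.contains(token)) {
                // the other spelling was used before, so this occurrence is inconsistent
                System.out.println("Inconsistent spelling: " + token);
            } else if (wordMap.containsKey(token)) {
                shouldNotAppear.add(wordMap.get(token));
            }
        }
    }
}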

Example 34 with AnalyzedToken

Use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

From the class AgreementRule, the method getAgreementCategories():

/** Return Kasus, Numerus, Genus of those forms with a determiner. */
private Set<String> getAgreementCategories(AnalyzedTokenReadings aToken, Set<GrammarCategory> omit, boolean skipSol) {
    Set<String> set = new HashSet<>();
    List<AnalyzedToken> readings = aToken.getReadings();
    for (AnalyzedToken tmpReading : readings) {
        if (skipSol && tmpReading.getPOSTag() != null && tmpReading.getPOSTag().endsWith(":SOL")) {
            // SOL = alleinstehend - needs to be skipped so we find errors like "An der roter Ampel."
            continue;
        }
        AnalyzedGermanToken reading = new AnalyzedGermanToken(tmpReading);
        if (reading.getCasus() == null && reading.getNumerus() == null && reading.getGenus() == null) {
            continue;
        }
        // STV: stellvertretend (!= begleitend)
        if (reading.getGenus() == GermanToken.Genus.ALLGEMEIN && tmpReading.getPOSTag() != null
                && !tmpReading.getPOSTag().endsWith(":STV") && !possessiveSpecialCase(aToken, tmpReading)) {
            // e.g. "Ich Arbeiter" doesn't get flagged as incorrect:
            if (reading.getDetermination() == null) {
                // Nouns don't have the determination property (definite/indefinite), and as we don't want to
                // introduce a special case for that, we just pretend they always fulfill both properties:
                set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.MASKULINUM, GermanToken.Determination.DEFINITE, omit));
                set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.MASKULINUM, GermanToken.Determination.INDEFINITE, omit));
                set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.FEMININUM, GermanToken.Determination.DEFINITE, omit));
                set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.FEMININUM, GermanToken.Determination.INDEFINITE, omit));
                set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.NEUTRUM, GermanToken.Determination.DEFINITE, omit));
                set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.NEUTRUM, GermanToken.Determination.INDEFINITE, omit));
            } else {
                set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.MASKULINUM, reading.getDetermination(), omit));
                set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.FEMININUM, reading.getDetermination(), omit));
                set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.NEUTRUM, reading.getDetermination(), omit));
            }
        } else {
            if (reading.getDetermination() == null || "jed".equals(tmpReading.getLemma()) || "manch".equals(tmpReading.getLemma())) {
                // "jeder" etc. needs a special case to avoid false alarm
                set.add(makeString(reading.getCasus(), reading.getNumerus(), reading.getGenus(), GermanToken.Determination.DEFINITE, omit));
                set.add(makeString(reading.getCasus(), reading.getNumerus(), reading.getGenus(), GermanToken.Determination.INDEFINITE, omit));
            } else {
                set.add(makeString(reading.getCasus(), reading.getNumerus(), reading.getGenus(), reading.getDetermination(), omit));
            }
        }
    }
    return set;
}
Also used: AnalyzedToken (org.languagetool.AnalyzedToken), AnalyzedGermanToken (org.languagetool.tagging.de.AnalyzedGermanToken)
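
The category strings built by getAgreementCategories() are typically intersected for the determiner and the noun phrase: if no reading is shared, the phrase likely disagrees. The sketch below assumes an illustrative "Kasus/Numerus/Genus/Determination" string format, which is not necessarily the exact output of makeString().

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class AgreementDemo {
    public static void main(String[] args) {
        // category sets as they might be returned for a determiner and for an adjective/noun group
        Set<String> determinerCategories = new HashSet<>(Arrays.asList("NOM/SIN/MAS/DEF", "DAT/SIN/FEM/DEF"));
        Set<String> nounPhraseCategories = new HashSet<>(Arrays.asList("NOM/SIN/MAS/IND", "AKK/SIN/MAS/IND"));

        // intersect the two sets; an empty intersection suggests an agreement error
        Set<String> common = new HashSet<>(determinerCategories);
        common.retainAll(nounPhraseCategories);
        System.out.println(common.isEmpty() ? "possible agreement error" : "agreement ok: " + common);
    }
}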

Example 35 with AnalyzedToken

Use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

From the class AgreementSuggestor, the method getSuggestions():

List<String> getSuggestions() {
    Set<String> suggestionSet = new HashSet<>();
    try {
        for (AnalyzedToken token2Reading : nounToken.getReadings()) {
            String nounCase = GermanHelper.getNounCase(token2Reading.getPOSTag());
            String nounNumber = GermanHelper.getNounNumber(token2Reading.getPOSTag());
            String nounGender = GermanHelper.getNounGender(token2Reading.getPOSTag());
            for (AnalyzedToken token1Reading : determinerToken.getReadings()) {
                List<String> articleSuggestions = getArticleSuggestions(nounCase, nounNumber, nounGender, token1Reading);
                suggestionSet.addAll(articleSuggestions);
                List<String> pronounSuggestions = getPronounSuggestions(nounCase, nounNumber, nounGender, token1Reading);
                suggestionSet.addAll(pronounSuggestions);
                List<String> nounSuggestions = getNounSuggestions(token2Reading, token1Reading);
                suggestionSet.addAll(nounSuggestions);
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    List<String> suggestions = new ArrayList<>(suggestionSet);
    Collections.sort(suggestions);
    return suggestions;
}
Also used: AnalyzedToken (org.languagetool.AnalyzedToken), IOException (java.io.IOException)
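
As a small sketch (an assumption, not project code), this is how the determiner and noun readings consumed by getSuggestions() can be built with the AnalyzedToken and AnalyzedTokenReadings constructors already used in the examples above. The POS tags are illustrative, and the AgreementSuggestor constructor itself is left out because its exact signature is not shown in this example.

import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;

public class SuggestorInputDemo {
    public static void main(String[] args) {
        // AnalyzedToken(token, posTag, lemma); the tags follow the German tagset style but are illustrative
        AnalyzedTokenReadings determinerToken = new AnalyzedTokenReadings(
                new AnalyzedToken("der", "ART:DEF:NOM:SIN:MAS", "der"), 0);
        AnalyzedTokenReadings nounToken = new AnalyzedTokenReadings(
                new AnalyzedToken("Haus", "SUB:NOM:SIN:NEU", "Haus"), 4);
        // getSuggestions() iterates the noun readings, extracts case/number/gender via GermanHelper,
        // and collects matching article, pronoun and noun forms for each determiner reading
        System.out.println(determinerToken.getToken() + " " + nounToken.getToken());
    }
}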

Aggregations

AnalyzedToken (org.languagetool.AnalyzedToken): 89
AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings): 48
ArrayList (java.util.ArrayList): 43
Matcher (java.util.regex.Matcher): 16
Test (org.junit.Test): 16
IOException (java.io.IOException): 9
Pattern (java.util.regex.Pattern): 7
Nullable (org.jetbrains.annotations.Nullable): 6
TaggedWord (org.languagetool.tagging.TaggedWord): 6
RuleMatch (org.languagetool.rules.RuleMatch): 4
Synthesizer (org.languagetool.synthesis.Synthesizer): 4
InputStream (java.io.InputStream): 2
HashMap (java.util.HashMap): 2
LinkedHashSet (java.util.LinkedHashSet): 2
Scanner (java.util.Scanner): 2
TreeSet (java.util.TreeSet): 2
DictionaryLookup (morfologik.stemming.DictionaryLookup): 2
IStemmer (morfologik.stemming.IStemmer): 2
AnalyzedSentence (org.languagetool.AnalyzedSentence): 2
ChunkTag (org.languagetool.chunking.ChunkTag): 2