Search in sources :

Example 11 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

Source: class LanguageToolFilter, method incrementToken.

/**
 * Emits the next token of the stream (Lucene {@code TokenStream} contract).
 *
 * <p>For each word of an analyzed sentence this filter first emits the word itself and
 * queues its POS tags and lemmas on {@code posStack}; subsequent calls pop them and emit
 * them at the same position (position increment 0, token type "pos").
 *
 * @return {@code true} if a token was produced, {@code false} at end of stream
 * @throws IOException if reading from the underlying input fails
 */
@Override
public boolean incrementToken() throws IOException {
    // Drain POS/lemma tokens queued for the word emitted by a previous call.
    if (posStack.size() > 0) {
        String pop = posStack.pop();
        // Restore the attribute state captured when the word token was emitted,
        // so offsets etc. stay aligned with that word.
        restoreState(current);
        termAtt.append(pop);
        // Position increment 0: this token occupies the same position as its word.
        posIncrAtt.setPositionIncrement(0);
        typeAtt.setType("pos");
        return true;
    }
    if (tokenIter == null || !tokenIter.hasNext()) {
        // there are no remaining tokens from the current sentence... are there more sentences?
        if (input.incrementToken()) {
            // a new sentence is available: process it.
            String sentenceStr = termAtt.toString();
            collectedInput.append(sentenceStr);
            if (sentenceStr.length() >= 255) {
                // Long chunk: keep buffering in collectedInput and defer the LanguageTool
                // analysis until a shorter chunk arrives.
                // See https://github.com/languagetool-org/languagetool/issues/364
                // NOTE(review): this emits the raw chunk with whatever attributes the
                // upstream tokenizer set — confirm that is intended.
                return true;
            } else {
                sentenceStr = collectedInput.toString();
                collectedInput.setLength(0);
            }
            AnalyzedSentence sentence = languageTool.getAnalyzedSentence(sentenceStr);
            List<AnalyzedTokenReadings> tokenBuffer = Arrays.asList(sentence.getTokens());
            tokenIter = tokenBuffer.iterator();
            /*
             * It should not be possible to have a sentence with 0 words, check just in case.
             * Returning EOS isn't the best either, but it's the behavior of the original code.
             */
            if (!tokenIter.hasNext()) {
                return false;
            }
        } else {
            // no more sentences, end of stream!
            return false;
        }
    }
    // It must clear attributes, as it is creating new tokens.
    clearAttributes();
    AnalyzedTokenReadings tr = tokenIter.next();
    // Emit a POS-type token for the artificial sentence-start marker.
    if (tr.isSentenceStart()) {
        // TODO: would be needed so negated tokens can match on something (see testNegatedMatchAtSentenceStart())
        // but breaks other cases:
        //termAtt.append("SENT_START");
        typeAtt.setType("pos");
        String posTag = tr.getAnalyzedToken(0).getPOSTag();
        String lemma = tr.getAnalyzedToken(0).getLemma();
        if (toLowerCase) {
            termAtt.append(POS_PREFIX.toLowerCase()).append(posTag.toLowerCase());
            if (lemma != null) {
                termAtt.append(LEMMA_PREFIX.toLowerCase()).append(lemma.toLowerCase());
            }
        } else {
            termAtt.append(POS_PREFIX).append(posTag);
            if (lemma != null) {
                termAtt.append(LEMMA_PREFIX).append(lemma);
            }
        }
        return true;
    }
    // Skip whitespace tokens by recursing to the next real token.
    if (tr.isWhitespace()) {
        return this.incrementToken();
    }
    offsetAtt.setOffset(tr.getStartPos(), tr.getEndPos());
    // Queue every POS tag and lemma of this word; they are emitted one per call
    // by the posStack branch at the top of this method.
    for (AnalyzedToken token : tr) {
        if (token.getPOSTag() != null) {
            if (toLowerCase) {
                posStack.push(POS_PREFIX.toLowerCase() + token.getPOSTag().toLowerCase());
            } else {
                posStack.push(POS_PREFIX + token.getPOSTag());
            }
        }
        if (token.getLemma() != null) {
            if (toLowerCase) {
                posStack.push(LEMMA_PREFIX.toLowerCase() + token.getLemma().toLowerCase());
            } else {
                // chances are good this is the same for all loop iterations, store it anyway...
                posStack.push(LEMMA_PREFIX + token.getLemma());
            }
        }
    }
    // Capture the attribute state so the queued pos/lemma tokens can restore it later.
    current = captureState();
    if (toLowerCase) {
        termAtt.append(tr.getAnalyzedToken(0).getToken().toLowerCase());
    } else {
        termAtt.append(tr.getAnalyzedToken(0).getToken());
    }
    return true;
}
Also used : AnalyzedSentence(org.languagetool.AnalyzedSentence) AnalyzedToken(org.languagetool.AnalyzedToken) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 12 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

Source: class AbstractCompoundRule, method match.

/**
 * Checks the sentence for known incorrect compounds by sliding a window of up to
 * {@code MAX_TERMS} tokens over the sentence and looking up the joined window text
 * in the compound rule data.
 *
 * @param sentence the analyzed sentence to check
 * @return the matches found, possibly empty
 */
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace();
    RuleMatch prevRuleMatch = null;
    // Sliding window holding the last up-to-MAX_TERMS tokens.
    Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<>(MAX_TERMS);
    for (int i = 0; i < tokens.length + MAX_TERMS - 1; i++) {
        AnalyzedTokenReadings token;
        // we need to extend the token list so we find matches at the end of the original list:
        if (i >= tokens.length) {
            // Empty padding token; reuses the start position of the oldest queued token.
            token = new AnalyzedTokenReadings(new AnalyzedToken("", "", null), prevTokens.peek().getStartPos());
        } else {
            token = tokens[i];
        }
        if (i == 0) {
            addToQueue(token, prevTokens);
            continue;
        }
        if (token.isImmunized()) {
            continue;
        }
        AnalyzedTokenReadings firstMatchToken = prevTokens.peek();
        List<String> stringsToCheck = new ArrayList<>();
        // original upper/lowercase spelling
        List<String> origStringsToCheck = new ArrayList<>();
        Map<String, AnalyzedTokenReadings> stringToToken = getStringToTokenMap(prevTokens, stringsToCheck, origStringsToCheck);
        // Iterate from the longest candidate down, to make sure we match longer strings first:
        for (int k = stringsToCheck.size() - 1; k >= 0; k--) {
            String stringToCheck = stringsToCheck.get(k);
            String origStringToCheck = origStringsToCheck.get(k);
            if (getCompoundRuleData().getIncorrectCompounds().contains(stringToCheck)) {
                AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
                String msg = null;
                List<String> replacement = new ArrayList<>();
                // Suggest the hyphenated variant unless the data forbids it for this compound.
                if (!getCompoundRuleData().getNoDashSuggestion().contains(stringToCheck)) {
                    replacement.add(origStringToCheck.replace(' ', '-'));
                    msg = withHyphenMessage;
                }
                // Suggest the merged (one-word) variant unless all-uppercase or dash-only.
                if (isNotAllUppercase(origStringToCheck) && !getCompoundRuleData().getOnlyDashSuggestion().contains(stringToCheck)) {
                    replacement.add(mergeCompound(origStringToCheck));
                    msg = withoutHyphenMessage;
                }
                String[] parts = stringToCheck.split(" ");
                // Single-letter first part (e.g. "E Mail"): only the hyphenated form makes sense.
                if (parts.length > 0 && parts[0].length() == 1) {
                    replacement.clear();
                    replacement.add(origStringToCheck.replace(' ', '-'));
                    msg = withHyphenMessage;
                } else if (replacement.isEmpty() || replacement.size() == 2) {
                    // isEmpty shouldn't happen
                    msg = withOrWithoutHyphenMessage;
                }
                RuleMatch ruleMatch = new RuleMatch(this, firstMatchToken.getStartPos(), atr.getEndPos(), msg, shortDesc);
                ruleMatch.setSuggestedReplacements(replacement);
                // Avoid duplicate matches: a match starting at the same position as the
                // previous one is not added again (only the first is reported).
                if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) {
                    prevRuleMatch = ruleMatch;
                    break;
                }
                prevRuleMatch = ruleMatch;
                ruleMatches.add(ruleMatch);
                break;
            }
        }
        addToQueue(token, prevTokens);
    }
    return toRuleMatchArray(ruleMatches);
}
Also used : AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) AnalyzedToken(org.languagetool.AnalyzedToken) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue)

Example 13 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

Source: class AbstractSimpleReplaceRule, method match.

/**
 * Checks each token of the sentence against the wrong-words map and creates a match
 * with replacement suggestions for every hit.
 *
 * <p>Lookup order per token: original spelling, then the cleaned-up form (see
 * {@code cleanup()}), then — if {@code checkLemmas} is set — the lemmas of the
 * token's readings.
 *
 * @param sentence the analyzed sentence to check
 * @return the matches found, possibly empty
 */
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    for (AnalyzedTokenReadings tokenReadings : tokens) {
        // Skip the artificial sentence-start token (tagged SENT_START).
        if (JLanguageTool.SENTENCE_START_TAGNAME.equals(tokenReadings.getAnalyzedToken(0).getPOSTag())) {
            continue;
        }
        // Skip tokens immunized by other rules and tokens the speller is told to ignore.
        if (tokenReadings.isImmunized() || tokenReadings.isIgnoredBySpeller()) {
            continue;
        }
        String originalTokenStr = tokenReadings.getToken();
        if (ignoreTaggedWords && isTagged(tokenReadings)) {
            continue;
        }
        String tokenString = cleanup(originalTokenStr);
        // Try first with the original word, then with the cleaned-up version.
        List<String> possibleReplacements = getWrongWords().get(originalTokenStr);
        if (possibleReplacements == null) {
            possibleReplacements = getWrongWords().get(tokenString);
        }
        if (possibleReplacements == null && checkLemmas) {
            possibleReplacements = getReplacementsFromLemmas(tokenReadings);
        }
        if (possibleReplacements != null && !possibleReplacements.isEmpty()) {
            List<String> replacements = new ArrayList<>(possibleReplacements);
            // Never suggest the token itself as its own replacement.
            replacements.remove(originalTokenStr);
            if (!replacements.isEmpty()) {
                ruleMatches.add(createRuleMatch(tokenReadings, replacements));
            }
        }
    }
    return toRuleMatchArray(ruleMatches);
}

/**
 * Looks up replacement suggestions via the lemmas of the token's readings.
 *
 * <p>Fix: the previous inline implementation tested {@code containsKey()} with the raw
 * lemma but stored and later queried the cleaned-up lemma, so a hit on the raw form
 * could silently yield no suggestions whenever {@code cleanup()} changed the lemma.
 * Raw and cleaned forms are now handled consistently (raw form preferred, mirroring
 * the token lookup in {@code match()}).
 *
 * @param tokenReadings the readings whose lemmas are checked
 * @return distinct suggestions collected over all matching lemmas, possibly empty
 */
private List<String> getReplacementsFromLemmas(AnalyzedTokenReadings tokenReadings) {
    List<String> lemmas = new ArrayList<>();
    for (AnalyzedToken analyzedToken : tokenReadings.getReadings()) {
        String lemma = analyzedToken.getLemma();
        if (lemma == null) {
            continue;
        }
        // Prefer the raw lemma as the map key; fall back to the cleaned-up form.
        String key = null;
        if (getWrongWords().containsKey(lemma)) {
            key = lemma;
        } else {
            String cleanLemma = cleanup(lemma);
            if (getWrongWords().containsKey(cleanLemma)) {
                key = cleanLemma;
            }
        }
        if (key != null && !lemmas.contains(key)) {
            lemmas.add(key);
        }
    }
    List<String> result = new ArrayList<>();
    for (String lemma : lemmas) {
        List<String> replacements = getWrongWords().get(lemma);
        if (replacements != null) {
            result.addAll(replacements);
        }
    }
    return result.stream().distinct().collect(Collectors.toList());
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 14 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

Source: class AbstractWordCoherencyRule, method match.

/**
 * Flags inconsistent use of spelling variants across the given sentences: once one
 * variant of a word pair has been seen, later occurrences of the other variant are
 * reported with the first variant as the suggested replacement.
 *
 * @param sentences the analyzed sentences of the text, in order
 * @return the matches found, possibly empty
 */
@Override
public RuleMatch[] match(List<AnalyzedSentence> sentences) {
    List<RuleMatch> matches = new ArrayList<>();
    // Maps a spelling that must no longer appear to the match recorded for its
    // counterpart, e.g. aufwändig -> RuleMatch of aufwendig.
    Map<String, RuleMatch> forbiddenSpellings = new HashMap<>();
    int offset = 0;
    for (AnalyzedSentence sentence : sentences) {
        for (AnalyzedTokenReadings tokenReadings : sentence.getTokensWithoutWhitespace()) {
            String word = tokenReadings.getToken();
            List<AnalyzedToken> readings = tokenReadings.getReadings();
            // TODO: in theory we need to care about the other readings, too (affects e.g. German "Schenke" as a noun):
            if (readings.size() > 0) {
                String lemma = readings.get(0).getLemma();
                if (lemma != null) {
                    word = lemma;
                }
            }
            RuleMatch earlierMatch = forbiddenSpellings.get(word);
            if (earlierMatch != null) {
                // The other variant was seen earlier; its spelling was stashed as the message.
                String otherSpelling = earlierMatch.getMessage();
                RuleMatch ruleMatch = new RuleMatch(this, offset + tokenReadings.getStartPos(), offset + tokenReadings.getEndPos(), getMessage(word, otherSpelling));
                ruleMatch.setSuggestedReplacement(otherSpelling);
                matches.add(ruleMatch);
            } else if (getWordMap().containsKey(word)) {
                // First sighting of a variant: remember that its counterpart is now forbidden.
                String shouldNotAppear = getWordMap().get(word);
                RuleMatch potentialRuleMatch = new RuleMatch(this, offset + tokenReadings.getStartPos(), offset + tokenReadings.getEndPos(), word);
                forbiddenSpellings.put(shouldNotAppear, potentialRuleMatch);
            }
        }
        // Token positions are sentence-relative; track the running text offset.
        offset += sentence.getText().length();
    }
    return toRuleMatchArray(matches);
}
Also used : AnalyzedSentence(org.languagetool.AnalyzedSentence) AnalyzedToken(org.languagetool.AnalyzedToken) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 15 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

Source: class AgreementRule, method getAgreementCategories.

/**
 * Returns the agreement categories (Kasus, Numerus, Genus, encoded as strings via
 * {@code makeString}) of those readings of the token that can occur with a determiner.
 *
 * @param aToken the token whose readings are examined
 * @param omit grammar categories to leave out of the generated strings
 * @param skipSol if {@code true}, skip readings whose POS tag ends in ":SOL"
 * @return the set of category strings, possibly empty
 */
private Set<String> getAgreementCategories(AnalyzedTokenReadings aToken, Set<GrammarCategory> omit, boolean skipSol) {
    Set<String> set = new HashSet<>();
    List<AnalyzedToken> readings = aToken.getReadings();
    for (AnalyzedToken tmpReading : readings) {
        if (skipSol && tmpReading.getPOSTag() != null && tmpReading.getPOSTag().endsWith(":SOL")) {
            // SOL = standalone form ("alleinstehend") - needs to be skipped so we find errors like "An der roter Ampel."
            continue;
        }
        AnalyzedGermanToken reading = new AnalyzedGermanToken(tmpReading);
        // Readings with no agreement information at all cannot contribute.
        if (reading.getCasus() == null && reading.getNumerus() == null && reading.getGenus() == null) {
            continue;
        }
        // Genus ALLGEMEIN (unspecified): expand to all three genders below.
        if (reading.getGenus() == GermanToken.Genus.ALLGEMEIN && tmpReading.getPOSTag() != null && // STV = representative ("stellvertretend", as opposed to accompanying)
        !tmpReading.getPOSTag().endsWith(":STV") && !possessiveSpecialCase(aToken, tmpReading)) {
            // e.g. "Ich Arbeiter" doesn't get flagged as incorrect:
            if (reading.getDetermination() == null) {
                // Nouns don't have the determination property (definite/indefinite), and as we don't want to
                // introduce a special case for that, we just pretend they always fulfill both properties:
                set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.MASKULINUM, GermanToken.Determination.DEFINITE, omit));
                set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.MASKULINUM, GermanToken.Determination.INDEFINITE, omit));
                set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.FEMININUM, GermanToken.Determination.DEFINITE, omit));
                set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.FEMININUM, GermanToken.Determination.INDEFINITE, omit));
                set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.NEUTRUM, GermanToken.Determination.DEFINITE, omit));
                set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.NEUTRUM, GermanToken.Determination.INDEFINITE, omit));
            } else {
                set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.MASKULINUM, reading.getDetermination(), omit));
                set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.FEMININUM, reading.getDetermination(), omit));
                set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.NEUTRUM, reading.getDetermination(), omit));
            }
        } else {
            if (reading.getDetermination() == null || "jed".equals(tmpReading.getLemma()) || "manch".equals(tmpReading.getLemma())) {
                // "jeder" etc. needs a special case to avoid false alarm
                set.add(makeString(reading.getCasus(), reading.getNumerus(), reading.getGenus(), GermanToken.Determination.DEFINITE, omit));
                set.add(makeString(reading.getCasus(), reading.getNumerus(), reading.getGenus(), GermanToken.Determination.INDEFINITE, omit));
            } else {
                set.add(makeString(reading.getCasus(), reading.getNumerus(), reading.getGenus(), reading.getDetermination(), omit));
            }
        }
    }
    return set;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) AnalyzedGermanToken(org.languagetool.tagging.de.AnalyzedGermanToken)

Aggregations

AnalyzedToken (org.languagetool.AnalyzedToken)89 AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)48 ArrayList (java.util.ArrayList)43 Matcher (java.util.regex.Matcher)16 Test (org.junit.Test)16 IOException (java.io.IOException)9 Pattern (java.util.regex.Pattern)7 Nullable (org.jetbrains.annotations.Nullable)6 TaggedWord (org.languagetool.tagging.TaggedWord)6 RuleMatch (org.languagetool.rules.RuleMatch)4 Synthesizer (org.languagetool.synthesis.Synthesizer)4 InputStream (java.io.InputStream)2 HashMap (java.util.HashMap)2 LinkedHashSet (java.util.LinkedHashSet)2 Scanner (java.util.Scanner)2 TreeSet (java.util.TreeSet)2 DictionaryLookup (morfologik.stemming.DictionaryLookup)2 IStemmer (morfologik.stemming.IStemmer)2 AnalyzedSentence (org.languagetool.AnalyzedSentence)2 ChunkTag (org.languagetool.chunking.ChunkTag)2