Example 11 with AnalyzedTokenReadings

Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

From class GermanHelperTest, method testHasReadingOfType.

@Test
public void testHasReadingOfType() throws Exception {
    AnalyzedTokenReadings readings = new AnalyzedTokenReadings(new AnalyzedToken("der", "ART:DEF:DAT:SIN:FEM", null), 0);
    assertTrue(GermanHelper.hasReadingOfType(readings, GermanToken.POSType.DETERMINER));
    assertFalse(GermanHelper.hasReadingOfType(readings, GermanToken.POSType.NOMEN));
}
Also used: AnalyzedToken (org.languagetool.AnalyzedToken), AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings), Test (org.junit.Test)
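
A note on ambiguous tokens: an AnalyzedTokenReadings can hold several readings for one surface form, and hasReadingOfType returns true as soon as any reading matches. The sketch below is not from the project's test suite; it assumes the List-based AnalyzedTokenReadings constructor (alongside the single-token one used above, plus java.util.Arrays and java.util.List) and that Morphy-style SUB/VER tags map to NOMEN/VERB as in the project's German tagger. The tags and the method name are illustrative.

@Test
public void testHasReadingOfTypeWithAmbiguousToken() throws Exception {
    // "Laufen" can be read as a noun ("the running") or as a verb infinitive:
    List<AnalyzedToken> readingList = Arrays.asList(
            new AnalyzedToken("Laufen", "SUB:NOM:SIN:NEU", "Laufen"),
            new AnalyzedToken("Laufen", "VER:INF:NON", "laufen"));
    AnalyzedTokenReadings readings = new AnalyzedTokenReadings(readingList, 0);
    // true if any single reading matches the requested type:
    assertTrue(GermanHelper.hasReadingOfType(readings, GermanToken.POSType.NOMEN));
    assertTrue(GermanHelper.hasReadingOfType(readings, GermanToken.POSType.VERB));
    assertFalse(GermanHelper.hasReadingOfType(readings, GermanToken.POSType.DETERMINER));
}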

Example 12 with AnalyzedTokenReadings

Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

From class LanguageToolFilter, method incrementToken.

@Override
public boolean incrementToken() throws IOException {
    if (posStack.size() > 0) {
        String pop = posStack.pop();
        restoreState(current);
        termAtt.append(pop);
        posIncrAtt.setPositionIncrement(0);
        typeAtt.setType("pos");
        return true;
    }
    if (tokenIter == null || !tokenIter.hasNext()) {
        // there are no remaining tokens from the current sentence... are there more sentences?
        if (input.incrementToken()) {
            // a new sentence is available: process it.
            String sentenceStr = termAtt.toString();
            collectedInput.append(sentenceStr);
            if (sentenceStr.length() >= 255) {
                // long sentences are split by the tokenizer, so keep collecting their parts and analyze them
                // later. See https://github.com/languagetool-org/languagetool/issues/364
                return true;
            } else {
                sentenceStr = collectedInput.toString();
                collectedInput.setLength(0);
            }
            AnalyzedSentence sentence = languageTool.getAnalyzedSentence(sentenceStr);
            List<AnalyzedTokenReadings> tokenBuffer = Arrays.asList(sentence.getTokens());
            tokenIter = tokenBuffer.iterator();
            /*
             * It should not be possible to have a sentence with 0 words, check just in case. Returning
             * EOS isn't the best either, but it's the behavior of the original code.
             */
            if (!tokenIter.hasNext()) {
                return false;
            }
        } else {
            // no more sentences, end of stream!
            return false;
        }
    }
    // Clear the attributes, as we are creating new tokens.
    clearAttributes();
    AnalyzedTokenReadings tr = tokenIter.next();
    // add POS tag for sentence start.
    if (tr.isSentenceStart()) {
        // TODO: would be needed so negated tokens can match on something (see testNegatedMatchAtSentenceStart())
        // but breaks other cases:
        //termAtt.append("SENT_START");
        typeAtt.setType("pos");
        String posTag = tr.getAnalyzedToken(0).getPOSTag();
        String lemma = tr.getAnalyzedToken(0).getLemma();
        if (toLowerCase) {
            termAtt.append(POS_PREFIX.toLowerCase()).append(posTag.toLowerCase());
            if (lemma != null) {
                termAtt.append(LEMMA_PREFIX.toLowerCase()).append(lemma.toLowerCase());
            }
        } else {
            termAtt.append(POS_PREFIX).append(posTag);
            if (lemma != null) {
                termAtt.append(LEMMA_PREFIX).append(lemma);
            }
        }
        return true;
    }
    // skip whitespace tokens.
    if (tr.isWhitespace()) {
        return this.incrementToken();
    }
    offsetAtt.setOffset(tr.getStartPos(), tr.getEndPos());
    for (AnalyzedToken token : tr) {
        if (token.getPOSTag() != null) {
            if (toLowerCase) {
                posStack.push(POS_PREFIX.toLowerCase() + token.getPOSTag().toLowerCase());
            } else {
                posStack.push(POS_PREFIX + token.getPOSTag());
            }
        }
        if (token.getLemma() != null) {
            if (toLowerCase) {
                posStack.push(LEMMA_PREFIX.toLowerCase() + token.getLemma().toLowerCase());
            } else {
                // chances are good this is the same for all loop iterations, store it anyway...
                posStack.push(LEMMA_PREFIX + token.getLemma());
            }
        }
    }
    current = captureState();
    if (toLowerCase) {
        termAtt.append(tr.getAnalyzedToken(0).getToken().toLowerCase());
    } else {
        termAtt.append(tr.getAnalyzedToken(0).getToken());
    }
    return true;
}
Also used: AnalyzedSentence (org.languagetool.AnalyzedSentence), AnalyzedToken (org.languagetool.AnalyzedToken), AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)
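
To see what the filter emits, it can be driven directly with the standard Lucene attribute API. A minimal consumption sketch, assuming the constructor signature LanguageToolFilter(TokenStream, JLanguageTool, boolean toLowerCase) implied by the fields used above, plus org.apache.lucene.analysis.core.KeywordTokenizer, which hands the whole input to the filter as a single "sentence":

public static void printAnnotatedTokens(String text) throws IOException {
    JLanguageTool lt = new JLanguageTool(new English());
    try (Tokenizer tokenizer = new KeywordTokenizer()) {
        tokenizer.setReader(new StringReader(text));
        try (TokenStream stream = new LanguageToolFilter(tokenizer, lt, false)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            TypeAttribute type = stream.addAttribute(TypeAttribute.class);
            PositionIncrementAttribute posInc = stream.addAttribute(PositionIncrementAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // entries with position increment 0 are the POS/lemma annotations
                // stacked on the preceding word token:
                System.out.printf("%s (type=%s, posInc=%d)%n",
                        term, type.type(), posInc.getPositionIncrement());
            }
            stream.end();
        }
    }
}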

Example 13 with AnalyzedTokenReadings

Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

From class AbstractCompoundRule, method match.

@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace();
    RuleMatch prevRuleMatch = null;
    Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<>(MAX_TERMS);
    for (int i = 0; i < tokens.length + MAX_TERMS - 1; i++) {
        AnalyzedTokenReadings token;
        // we need to extend the token list so we find matches at the end of the original list:
        if (i >= tokens.length) {
            token = new AnalyzedTokenReadings(new AnalyzedToken("", "", null), prevTokens.peek().getStartPos());
        } else {
            token = tokens[i];
        }
        if (i == 0) {
            addToQueue(token, prevTokens);
            continue;
        }
        if (token.isImmunized()) {
            continue;
        }
        AnalyzedTokenReadings firstMatchToken = prevTokens.peek();
        List<String> stringsToCheck = new ArrayList<>();
        // original upper/lowercase spelling
        List<String> origStringsToCheck = new ArrayList<>();
        Map<String, AnalyzedTokenReadings> stringToToken = getStringToTokenMap(prevTokens, stringsToCheck, origStringsToCheck);
        // iterate from the longest candidate to the shortest, to make sure we match longer strings first:
        for (int k = stringsToCheck.size() - 1; k >= 0; k--) {
            String stringToCheck = stringsToCheck.get(k);
            String origStringToCheck = origStringsToCheck.get(k);
            if (getCompoundRuleData().getIncorrectCompounds().contains(stringToCheck)) {
                AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
                String msg = null;
                List<String> replacement = new ArrayList<>();
                if (!getCompoundRuleData().getNoDashSuggestion().contains(stringToCheck)) {
                    replacement.add(origStringToCheck.replace(' ', '-'));
                    msg = withHyphenMessage;
                }
                if (isNotAllUppercase(origStringToCheck) && !getCompoundRuleData().getOnlyDashSuggestion().contains(stringToCheck)) {
                    replacement.add(mergeCompound(origStringToCheck));
                    msg = withoutHyphenMessage;
                }
                String[] parts = stringToCheck.split(" ");
                if (parts.length > 0 && parts[0].length() == 1) {
                    replacement.clear();
                    replacement.add(origStringToCheck.replace(' ', '-'));
                    msg = withHyphenMessage;
                } else if (replacement.isEmpty() || replacement.size() == 2) {
                    // isEmpty shouldn't happen
                    msg = withOrWithoutHyphenMessage;
                }
                RuleMatch ruleMatch = new RuleMatch(this, firstMatchToken.getStartPos(), atr.getEndPos(), msg, shortDesc);
                ruleMatch.setSuggestedReplacements(replacement);
                // avoid duplicate matches:
                if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) {
                    prevRuleMatch = ruleMatch;
                    break;
                }
                prevRuleMatch = ruleMatch;
                ruleMatches.add(ruleMatch);
                break;
            }
        }
        addToQueue(token, prevTokens);
    }
    return toRuleMatchArray(ruleMatches);
}
Also used: AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings), AnalyzedToken (org.languagetool.AnalyzedToken), ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue)
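
The match body above builds up to two suggestions for a matched span: a hyphenated variant and a merged variant. A standalone sketch of just that suggestion-building step, using plain string operations (the merge here is only a naive stand-in for the project's mergeCompound):

public static List<String> suggestionsFor(String origSpan) {
    List<String> replacement = new ArrayList<>();
    // hyphenated variant, e.g. "E Mail" -> "E-Mail":
    replacement.add(origSpan.replace(' ', '-'));
    // merged variant, e.g. "E Mail" -> "EMail" (naive stand-in for mergeCompound):
    replacement.add(origSpan.replace(" ", ""));
    return replacement;
}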

Example 14 with AnalyzedTokenReadings

Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

From class AbstractCompoundRule, method getStringToTokenMap.

private Map<String, AnalyzedTokenReadings> getStringToTokenMap(Queue<AnalyzedTokenReadings> prevTokens, List<String> stringsToCheck, List<String> origStringsToCheck) {
    StringBuilder sb = new StringBuilder();
    Map<String, AnalyzedTokenReadings> stringToToken = new HashMap<>();
    int j = 0;
    for (AnalyzedTokenReadings atr : prevTokens) {
        sb.append(' ');
        sb.append(atr.getToken());
        if (j >= 1) {
            String stringToCheck = normalize(sb.toString());
            stringsToCheck.add(stringToCheck);
            origStringsToCheck.add(sb.toString().trim());
            if (!stringToToken.containsKey(stringToCheck)) {
                stringToToken.put(stringToCheck, atr);
            }
        }
        j++;
    }
    return stringToToken;
}
Also used: AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)
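
Because phrases are only collected once j >= 1, a queue holding the tokens "nach", "wie", "vor" yields the candidates "nach wie" and "nach wie vor", each anchored at the first token. A standalone sketch of that accumulation, with trim() as a trivial stand-in for the project's normalize():

public static List<String> candidatePhrases(List<String> tokens) {
    List<String> phrases = new ArrayList<>();
    StringBuilder sb = new StringBuilder();
    int j = 0;
    for (String token : tokens) {
        sb.append(' ').append(token);
        if (j >= 1) {
            // only phrases of two or more tokens become candidates:
            phrases.add(sb.toString().trim());  // trim() stands in for normalize()
        }
        j++;
    }
    return phrases;  // ["nach wie", "nach wie vor"] for the example above
}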

Example 15 with AnalyzedTokenReadings

Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

From class AbstractSimpleReplaceRule, method match.

@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    for (AnalyzedTokenReadings tokenReadings : tokens) {
        // skip the sentence-start marker (SENT_START)
        if (JLanguageTool.SENTENCE_START_TAGNAME.equals(tokenReadings.getAnalyzedToken(0).getPOSTag()))
            continue;
        // skip tokens immunized by other rules or ignored by the speller
        if (tokenReadings.isImmunized() || tokenReadings.isIgnoredBySpeller()) {
            continue;
        }
        String originalTokenStr = tokenReadings.getToken();
        if (ignoreTaggedWords && isTagged(tokenReadings)) {
            continue;
        }
        String tokenString = cleanup(originalTokenStr);
        // try first with the original word, then with the all lower-case version
        List<String> possibleReplacements = getWrongWords().get(originalTokenStr);
        if (possibleReplacements == null) {
            possibleReplacements = getWrongWords().get(tokenString);
        }
        if (possibleReplacements == null && checkLemmas) {
            possibleReplacements = new ArrayList<>();
            List<String> lemmas = new ArrayList<>();
            for (AnalyzedToken analyzedToken : tokenReadings.getReadings()) {
                String lemma = analyzedToken.getLemma();
                if (lemma != null && getWrongWords().containsKey(lemma) && !lemmas.contains(lemma)) {
                    lemmas.add(cleanup(lemma));
                }
            }
            for (String lemma : lemmas) {
                List<String> replacements = getWrongWords().get(lemma);
                if (replacements != null) {
                    possibleReplacements.addAll(replacements);
                }
            }
            possibleReplacements = possibleReplacements.stream().distinct().collect(Collectors.toList());
        }
        if (possibleReplacements != null && possibleReplacements.size() > 0) {
            List<String> replacements = new ArrayList<>();
            replacements.addAll(possibleReplacements);
            if (replacements.contains(originalTokenStr)) {
                replacements.remove(originalTokenStr);
            }
            if (replacements.size() > 0) {
                RuleMatch potentialRuleMatch = createRuleMatch(tokenReadings, replacements);
                ruleMatches.add(potentialRuleMatch);
            }
        }
    }
    return toRuleMatchArray(ruleMatches);
}
Also used: AnalyzedToken (org.languagetool.AnalyzedToken), ArrayList (java.util.ArrayList), AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)
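
The replacement lookup above falls back in a fixed order: the original spelling first, then the cleaned-up form, then (when checkLemmas is set) the lemmas of each reading. A standalone sketch of the first two steps of that chain against a hypothetical wrong-word map, with lower-casing as a stand-in for the project's cleanup():

public static List<String> lookupReplacements(Map<String, List<String>> wrongWords, String token) {
    // 1. try the original spelling:
    List<String> replacements = wrongWords.get(token);
    // 2. fall back to the cleaned-up form (lower-casing stands in for cleanup()):
    if (replacements == null) {
        replacements = wrongWords.get(token.toLowerCase());
    }
    return replacements != null ? replacements : Collections.emptyList();
}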

Aggregations

AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings): 116
AnalyzedToken (org.languagetool.AnalyzedToken): 48
ArrayList (java.util.ArrayList): 47
AnalyzedSentence (org.languagetool.AnalyzedSentence): 21
Test (org.junit.Test): 16
RuleMatch (org.languagetool.rules.RuleMatch): 14
Matcher (java.util.regex.Matcher): 13
IOException (java.io.IOException): 7
Nullable (org.jetbrains.annotations.Nullable): 6
JLanguageTool (org.languagetool.JLanguageTool): 6
Pattern (java.util.regex.Pattern): 5
ChunkTag (org.languagetool.chunking.ChunkTag): 5
English (org.languagetool.language.English): 3
TaggedWord (org.languagetool.tagging.TaggedWord): 3
InputStream (java.io.InputStream): 2
HashMap (java.util.HashMap): 2
List (java.util.List): 2
Scanner (java.util.Scanner): 2
TreeSet (java.util.TreeSet): 2
ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 2