
Example 81 with AnalyzedTokenReadings

Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

From class MatchState, method filterReadings.

public final AnalyzedTokenReadings filterReadings() {
    List<AnalyzedToken> l = new ArrayList<>();
    if (formattedToken != null) {
        if (match.isStaticLemma()) {
            matchedToken.leaveReading(new AnalyzedToken(matchedToken.getToken(), match.getPosTag(), formattedToken.getToken()));
            formattedToken = matchedToken;
        }
        String token = formattedToken.getToken();
        Pattern regexMatch = match.getRegexMatch();
        String regexReplace = match.getRegexReplace();
        if (regexMatch != null && regexReplace != null) {
            /* only replace if there is something to replace */
            token = regexMatch.matcher(token).replaceAll(regexReplace);
        }
        token = convertCase(token, token, null);
        String posTag = match.getPosTag();
        if (posTag != null) {
            int numRead = formattedToken.getReadingsLength();
            if (match.isPostagRegexp()) {
                Pattern pPosRegexMatch = match.getPosRegexMatch();
                String posTagReplace = match.getPosTagReplace();
                String targetPosTag;
                for (int i = 0; i < numRead; i++) {
                    String testTag = formattedToken.getAnalyzedToken(i).getPOSTag();
                    if (testTag != null && pPosRegexMatch.matcher(testTag).matches()) {
                        targetPosTag = testTag;
                        if (posTagReplace != null) {
                            targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll(posTagReplace);
                        }
                        l.add(new AnalyzedToken(token, targetPosTag, formattedToken.getAnalyzedToken(i).getLemma()));
                        l.get(l.size() - 1).setWhitespaceBefore(formattedToken.isWhitespaceBefore());
                    }
                }
                if (l.isEmpty()) {
                    l.addAll(getNewToken(numRead, token));
                }
            } else {
                l.addAll(getNewToken(numRead, token));
            }
            String lemma = formattedToken.getAnalyzedToken(0).getLemma();
            if (formattedToken.isSentenceEnd()) {
                l.add(new AnalyzedToken(formattedToken.getToken(), SENTENCE_END_TAGNAME, lemma));
            }
            if (formattedToken.isParagraphEnd()) {
                l.add(new AnalyzedToken(formattedToken.getToken(), PARAGRAPH_END_TAGNAME, lemma));
            }
        }
    }
    if (l.isEmpty()) {
        return formattedToken;
    }
    final AnalyzedTokenReadings anTkRead = new AnalyzedTokenReadings(l.toArray(new AnalyzedToken[l.size()]), formattedToken.getStartPos());
    anTkRead.setWhitespaceBefore(formattedToken.isWhitespaceBefore());
    if (!formattedToken.getChunkTags().isEmpty()) {
        anTkRead.setChunkTags(formattedToken.getChunkTags());
    }
    if (formattedToken.isImmunized()) {
        anTkRead.immunize();
    }
    return anTkRead;
}
Also used : Pattern(java.util.regex.Pattern) AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)
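
To make the construction at the end of filterReadings() easier to follow, here is a minimal, self-contained sketch of the same pattern: building an AnalyzedTokenReadings from a list of AnalyzedToken readings and iterating over the result. Only constructors and accessors that appear in the snippet above are used; the class name, token, POS tags and lemma are illustrative values, not taken from the method above.

import java.util.ArrayList;
import java.util.List;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;

public class ReadingsSketch {
    public static void main(String[] args) {
        // Illustrative readings; in filterReadings() these are derived from formattedToken.
        List<AnalyzedToken> readings = new ArrayList<>();
        readings.add(new AnalyzedToken("walks", "VBZ", "walk")); // token, POS tag, lemma
        readings.add(new AnalyzedToken("walks", "NNS", "walk"));
        // Same construction as at the end of filterReadings():
        AnalyzedTokenReadings result =
                new AnalyzedTokenReadings(readings.toArray(new AnalyzedToken[readings.size()]), 0);
        result.setWhitespaceBefore(true);
        // AnalyzedTokenReadings is iterable over its readings:
        for (AnalyzedToken reading : result) {
            System.out.println(reading.getPOSTag() + " / " + reading.getLemma());
        }
    }
}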

Example 82 with AnalyzedTokenReadings

Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

From class AdvancedWordRepeatRule, method match.

/*
 * Tests if any word form is repeated in the sentence.
 */
@Override
public final RuleMatch[] match(AnalyzedSentence sentence) {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    boolean repetition = false;
    Set<String> inflectedWords = new TreeSet<>();
    String prevLemma;
    int curToken = 0;
    // start from real token, 0 = SENT_START
    for (int i = 1; i < tokens.length; i++) {
        String token = tokens[i].getToken();
        // avoid matching "..." and similar non-words:
        boolean isWord = true;
        boolean hasLemma = true;
        if (token.length() < 2) {
            isWord = false;
        }
        for (AnalyzedToken analyzedToken : tokens[i]) {
            String posTag = analyzedToken.getPOSTag();
            if (posTag != null) {
                if (StringTools.isEmpty(posTag)) {
                    isWord = false;
                    break;
                }
                String lemma = analyzedToken.getLemma();
                if (lemma == null) {
                    hasLemma = false;
                    break;
                }
                if (getExcludedWordsPattern().contains(lemma)) {
                    isWord = false;
                    break;
                }
                Matcher m2 = getExcludedPos().matcher(posTag);
                if (m2.matches()) {
                    isWord = false;
                    break;
                }
            } else {
                hasLemma = false;
            }
        }
        Matcher m1 = getExcludedNonWordsPattern().matcher(tokens[i].getToken());
        if (isWord && m1.matches()) {
            isWord = false;
        }
        prevLemma = "";
        if (isWord) {
            boolean notSentEnd = false;
            for (AnalyzedToken analyzedToken : tokens[i]) {
                String pos = analyzedToken.getPOSTag();
                if (pos != null) {
                    notSentEnd |= JLanguageTool.SENTENCE_END_TAGNAME.equals(pos);
                }
                if (hasLemma) {
                    String curLemma = analyzedToken.getLemma();
                    if (!prevLemma.equals(curLemma) && !notSentEnd) {
                        if (inflectedWords.contains(curLemma) && curToken != i) {
                            repetition = true;
                        } else {
                            inflectedWords.add(analyzedToken.getLemma());
                            curToken = i;
                        }
                    }
                    prevLemma = curLemma;
                } else {
                    if (inflectedWords.contains(tokens[i].getToken()) && !notSentEnd) {
                        repetition = true;
                    } else {
                        inflectedWords.add(tokens[i].getToken());
                    }
                }
            }
        }
        if (repetition) {
            int pos = tokens[i].getStartPos();
            RuleMatch ruleMatch = new RuleMatch(this, pos, pos + token.length(), getMessage(), getShortMessage());
            ruleMatches.add(ruleMatch);
            repetition = false;
        }
    }
    return toRuleMatchArray(ruleMatches);
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) Matcher(java.util.regex.Matcher) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)
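
A rule like this is normally invoked by JLanguageTool itself, but for ad-hoc testing match() can be called directly on an analyzed sentence. A minimal sketch, assuming an English JLanguageTool instance; since AdvancedWordRepeatRule is abstract in LanguageTool, the Rule parameter stands in for a concrete, language-specific subclass, and the class name and text are illustrative:

import java.io.IOException;
import org.languagetool.AnalyzedSentence;
import org.languagetool.JLanguageTool;
import org.languagetool.language.English;
import org.languagetool.rules.Rule;
import org.languagetool.rules.RuleMatch;

public class RepeatRuleDriver {
    // wordRepeatRule: a concrete subclass of AdvancedWordRepeatRule (hypothetical here)
    static void printMatches(Rule wordRepeatRule, String text) throws IOException {
        JLanguageTool lt = new JLanguageTool(new English());
        AnalyzedSentence analyzed = lt.getAnalyzedSentence(text);
        for (RuleMatch m : wordRepeatRule.match(analyzed)) {
            System.out.println(m.getFromPos() + "-" + m.getToPos() + ": " + m);
        }
    }
}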

Example 83 with AnalyzedTokenReadings

Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

From class DemoRule, method match.

// This is the method with the error detection logic that you need to implement:
@Override
public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    // Let's get all the tokens (i.e. words) of this sentence, but not the spaces:
    AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    // Note that the first token will always be a special token that indicates the start of a sentence:
    for (AnalyzedTokenReadings token : tokens) {
        // getToken() returns the original word from the input text:
        System.out.println("Token: " + token.getToken());
        // a token can have more than one reading (POS tag and lemma), so we iterate over the readings:
        for (AnalyzedToken analyzedToken : token.getReadings()) {
            System.out.println("  Lemma: " + analyzedToken.getLemma());
            System.out.println("  POS: " + analyzedToken.getPOSTag());
        }
        // if the token is "demo", create a rule match that LanguageTool will then show to the user:
        if (token.getToken().equals("demo")) {
            RuleMatch ruleMatch = new RuleMatch(this, token.getStartPos(), token.getEndPos(), "The demo rule thinks this looks wrong");
            // the user will see this as a suggested correction
            ruleMatch.setSuggestedReplacement("blablah");
            ruleMatches.add(ruleMatch);
        }
    }
    return toRuleMatchArray(ruleMatches);
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)
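
A minimal sketch of how the demo rule might be wired into a LanguageTool instance and exercised. The English language choice, the text and the driver class name are illustrative; addRule() and check() are the usual JLanguageTool entry points, and DemoRule is the class shown above (assumed to be on the classpath):

import java.io.IOException;
import java.util.List;
import org.languagetool.JLanguageTool;
import org.languagetool.language.English;
import org.languagetool.rules.RuleMatch;

public class DemoRuleDriver {
    public static void main(String[] args) throws IOException {
        JLanguageTool lt = new JLanguageTool(new English());
        lt.addRule(new DemoRule()); // the custom rule from the snippet above
        // note: check() also runs the default English rules, so other matches may appear too
        List<RuleMatch> matches = lt.check("This sentence contains a demo word.");
        for (RuleMatch m : matches) {
            System.out.println(m.getFromPos() + "-" + m.getToPos()
                    + " suggestion(s): " + m.getSuggestedReplacements());
        }
    }
}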

Example 84 with AnalyzedTokenReadings

Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

From class PatternRuleTest, method testBadSentences.

private void testBadSentences(JLanguageTool languageTool, JLanguageTool allRulesLanguageTool, Language lang, Map<String, AbstractPatternRule> complexRules, AbstractPatternRule rule) throws IOException {
    List<IncorrectExample> badSentences = rule.getIncorrectExamples();
    if (badSentences.size() == 0) {
        fail("No incorrect examples found for rule " + rule.getFullId());
    }
    // necessary for XML Pattern rules containing <or>
    List<AbstractPatternRule> rules = allRulesLanguageTool.getPatternRulesByIdAndSubId(rule.getId(), rule.getSubId());
    for (IncorrectExample origBadExample : badSentences) {
        // strip newlines and tabs so that indented XML examples can be used
        String origBadSentence = origBadExample.getExample().replaceAll("[\\n\\t]+", "");
        List<String> expectedCorrections = origBadExample.getCorrections();
        int expectedMatchStart = origBadSentence.indexOf("<marker>");
        int expectedMatchEnd = origBadSentence.indexOf("</marker>") - "<marker>".length();
        if (expectedMatchStart == -1 || expectedMatchEnd == -1) {
            fail(lang + ": No error position markup ('<marker>...</marker>') in bad example in rule " + rule.getFullId());
        }
        String badSentence = cleanXML(origBadSentence);
        assertTrue(badSentence.trim().length() > 0);
        // necessary for XML Pattern rules containing <or>
        List<RuleMatch> matches = new ArrayList<>();
        for (Rule auxRule : rules) {
            matches.addAll(getMatches(auxRule, badSentence, languageTool));
        }
        if (rule instanceof RegexPatternRule || rule instanceof PatternRule && !((PatternRule) rule).isWithComplexPhrase()) {
            if (matches.size() != 1) {
                AnalyzedSentence analyzedSentence = languageTool.getAnalyzedSentence(badSentence);
                StringBuilder sb = new StringBuilder("Analyzed token readings:");
                for (AnalyzedTokenReadings atr : analyzedSentence.getTokens()) {
                    sb.append(" ").append(atr);
                }
                String info = "";
                if (rule instanceof RegexPatternRule) {
                    info = "\nRegexp: " + ((RegexPatternRule) rule).getPattern().toString();
                }
                fail(lang + " rule " + rule.getFullId() + ":\n\"" + badSentence + "\"\n" + "Errors expected: 1\n" + "Errors found   : " + matches.size() + "\n" + "Message: " + rule.getMessage() + "\n" + sb + "\nMatches: " + matches + info);
            }
            assertEquals(lang + ": Incorrect match position markup (start) for rule " + rule.getFullId() + ", sentence: " + badSentence, expectedMatchStart, matches.get(0).getFromPos());
            assertEquals(lang + ": Incorrect match position markup (end) for rule " + rule.getFullId() + ", sentence: " + badSentence, expectedMatchEnd, matches.get(0).getToPos());
            // make sure suggestion is what we expect it to be
            assertSuggestions(badSentence, lang, expectedCorrections, rule, matches);
            // make sure the suggested correction doesn't produce an error:
            if (matches.get(0).getSuggestedReplacements().size() > 0) {
                int fromPos = matches.get(0).getFromPos();
                int toPos = matches.get(0).getToPos();
                for (String replacement : matches.get(0).getSuggestedReplacements()) {
                    String fixedSentence = badSentence.substring(0, fromPos) + replacement + badSentence.substring(toPos);
                    matches = getMatches(rule, fixedSentence, languageTool);
                    if (matches.size() > 0) {
                        fail("Incorrect input:\n" + "  " + badSentence + "\nCorrected sentence:\n" + "  " + fixedSentence + "\nBy Rule:\n" + "  " + rule.getFullId() + "\nThe correction triggered an error itself:\n" + "  " + matches.get(0) + "\n");
                    }
                }
            }
        } else {
            // for multiple rules created with complex phrases
            matches = getMatches(rule, badSentence, languageTool);
            if (matches.size() == 0 && !complexRules.containsKey(rule.getId() + badSentence)) {
                complexRules.put(rule.getId() + badSentence, rule);
            }
            if (matches.size() != 0) {
                complexRules.put(rule.getId() + badSentence, null);
                assertTrue(lang + ": Did expect one error in: \"" + badSentence + "\" (Rule: " + rule.getFullId() + "), got " + matches.size(), matches.size() == 1);
                assertEquals(lang + ": Incorrect match position markup (start) for rule " + rule.getFullId(), expectedMatchStart, matches.get(0).getFromPos());
                assertEquals(lang + ": Incorrect match position markup (end) for rule " + rule.getFullId(), expectedMatchEnd, matches.get(0).getToPos());
                assertSuggestions(badSentence, lang, expectedCorrections, rule, matches);
                assertSuggestionsDoNotCreateErrors(badSentence, languageTool, rule, matches);
            }
        }
    // check for overlapping rules
    /*matches = getMatches(rule, badSentence, languageTool);
      List<RuleMatch> matchesAllRules = allRulesLanguageTool.check(badSentence);
      for (RuleMatch match : matchesAllRules) {
        if (!match.getRule().getId().equals(rule.getId()) && !matches.isEmpty()
            && rangeIsOverlapping(matches.get(0).getFromPos(), matches.get(0).getToPos(), match.getFromPos(), match.getToPos()))
          System.err.println("WARN: " + lang.getShortCode() + ": '" + badSentence + "' in "
                  + rule.getId() + " also matched " + match.getRule().getId());
      }*/
    }
}
Also used : DisambiguationPatternRule(org.languagetool.tagging.disambiguation.rules.DisambiguationPatternRule) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) RuleMatch(org.languagetool.rules.RuleMatch) AnalyzedSentence(org.languagetool.AnalyzedSentence) IncorrectExample(org.languagetool.rules.IncorrectExample) SpellingCheckRule(org.languagetool.rules.spelling.SpellingCheckRule) Rule(org.languagetool.rules.Rule)
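
The expected error position is computed from the <marker>...</marker> markup before cleanXML() strips it. A small worked example of that offset arithmetic, with an illustrative sentence and class name (the markup removal here uses plain replace() just for the sketch):

public class MarkerOffsetSketch {
    public static void main(String[] args) {
        String origBadSentence = "This is <marker>teh</marker> test.";
        int expectedMatchStart = origBadSentence.indexOf("<marker>");                       // 8
        int expectedMatchEnd = origBadSentence.indexOf("</marker>") - "<marker>".length();  // 19 - 8 = 11
        // After the markup is removed ("This is teh test."), the expected match
        // covers characters 8..11, i.e. "teh":
        String cleaned = origBadSentence.replace("<marker>", "").replace("</marker>", "");
        System.out.println(cleaned.substring(expectedMatchStart, expectedMatchEnd));  // prints "teh"
    }
}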

Example 85 with AnalyzedTokenReadings

Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

From class ContextBuilder, method getContext.

public List<String> getContext(AnalyzedTokenReadings[] tokens, int pos, int contextSize) {
    List<String> l = new ArrayList<>();
    int i = 0;
    for (AnalyzedTokenReadings token : tokens) {
        if (i == pos) {
            l.addAll(getLeftContext(tokens, pos, contextSize));
            l.add(token.getToken());
            l.addAll(getRightContext(tokens, pos, contextSize));
            break;
        }
        i++;
    }
    return l;
}
Also used : ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)
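
A minimal sketch of calling getContext() on the tokens of an analyzed sentence. The sentence, the position (4) and the context size (2) are illustrative, the no-argument ContextBuilder constructor is an assumption, and the ContextBuilder import is omitted because its package is not shown in the snippet:

import java.io.IOException;
import java.util.List;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.language.English;

public class ContextDemo {
    public static void main(String[] args) throws IOException {
        JLanguageTool lt = new JLanguageTool(new English());
        AnalyzedSentence sentence = lt.getAnalyzedSentence("The quick brown fox jumps over the lazy dog.");
        AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
        // context of the token at position 4 ("fox"; position 0 is the SENT_START token), two tokens to each side
        List<String> context = new ContextBuilder().getContext(tokens, 4, 2);  // constructor assumed
        System.out.println(context);
    }
}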

Aggregations

AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings): 116 uses
AnalyzedToken (org.languagetool.AnalyzedToken): 48 uses
ArrayList (java.util.ArrayList): 47 uses
AnalyzedSentence (org.languagetool.AnalyzedSentence): 21 uses
Test (org.junit.Test): 16 uses
RuleMatch (org.languagetool.rules.RuleMatch): 14 uses
Matcher (java.util.regex.Matcher): 13 uses
IOException (java.io.IOException): 7 uses
Nullable (org.jetbrains.annotations.Nullable): 6 uses
JLanguageTool (org.languagetool.JLanguageTool): 6 uses
Pattern (java.util.regex.Pattern): 5 uses
ChunkTag (org.languagetool.chunking.ChunkTag): 5 uses
English (org.languagetool.language.English): 3 uses
TaggedWord (org.languagetool.tagging.TaggedWord): 3 uses
InputStream (java.io.InputStream): 2 uses
HashMap (java.util.HashMap): 2 uses
List (java.util.List): 2 uses
Scanner (java.util.Scanner): 2 uses
TreeSet (java.util.TreeSet): 2 uses
ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 2 uses