Use of org.languagetool.AnalyzedTokenReadings in the languagetool project (by languagetool-org):
the filterReadings method of the MatchState class.
/**
 * Builds the token readings that result from applying this match's
 * transformations to the matched token: optional static-lemma substitution,
 * regex replacement on the token string, case conversion, and POS-tag-based
 * filtering of the readings.
 *
 * @return the transformed {@link AnalyzedTokenReadings}, or the (possibly
 *         reassigned) {@code formattedToken} unchanged when no new readings
 *         were produced
 */
public final AnalyzedTokenReadings filterReadings() {
  // Readings collected for the resulting token; stays empty unless the match
  // defines a POS tag.
  List<AnalyzedToken> l = new ArrayList<>();
  if (formattedToken != null) {
    if (match.isStaticLemma()) {
      // NOTE(review): leaveReading presumably reduces matchedToken to the
      // single given reading — confirm against the AnalyzedTokenReadings API.
      matchedToken.leaveReading(new AnalyzedToken(matchedToken.getToken(), match.getPosTag(), formattedToken.getToken()));
      // from here on, all transformations operate on the matched token
      formattedToken = matchedToken;
    }
    String token = formattedToken.getToken();
    Pattern regexMatch = match.getRegexMatch();
    String regexReplace = match.getRegexReplace();
    if (regexMatch != null && regexReplace != null) {
      /* only replace if it is something to replace */
      token = regexMatch.matcher(token).replaceAll(regexReplace);
    }
    // apply the match's case conversion; exact semantics of the (token, token,
    // null) argument combination are defined in convertCase — not visible here
    token = convertCase(token, token, null);
    String posTag = match.getPosTag();
    if (posTag != null) {
      int numRead = formattedToken.getReadingsLength();
      if (match.isPostagRegexp()) {
        // keep only readings whose POS tag matches the regexp, optionally
        // rewriting the tag via posTagReplace
        Pattern pPosRegexMatch = match.getPosRegexMatch();
        String posTagReplace = match.getPosTagReplace();
        String targetPosTag;
        for (int i = 0; i < numRead; i++) {
          String testTag = formattedToken.getAnalyzedToken(i).getPOSTag();
          if (testTag != null && pPosRegexMatch.matcher(testTag).matches()) {
            targetPosTag = testTag;
            if (posTagReplace != null) {
              targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll(posTagReplace);
            }
            l.add(new AnalyzedToken(token, targetPosTag, formattedToken.getAnalyzedToken(i).getLemma()));
            // propagate the whitespace flag to the reading just added
            l.get(l.size() - 1).setWhitespaceBefore(formattedToken.isWhitespaceBefore());
          }
        }
        if (l.isEmpty()) {
          // no reading matched the regexp — fall back to getNewToken
          // (presumably builds readings with the configured POS tag; confirm)
          l.addAll(getNewToken(numRead, token));
        }
      } else {
        l.addAll(getNewToken(numRead, token));
      }
      // preserve sentence/paragraph boundary markers of the original token
      String lemma = formattedToken.getAnalyzedToken(0).getLemma();
      if (formattedToken.isSentenceEnd()) {
        l.add(new AnalyzedToken(formattedToken.getToken(), SENTENCE_END_TAGNAME, lemma));
      }
      if (formattedToken.isParagraphEnd()) {
        l.add(new AnalyzedToken(formattedToken.getToken(), PARAGRAPH_END_TAGNAME, lemma));
      }
    }
  }
  if (l.isEmpty()) {
    // nothing was transformed — return the token as-is
    return formattedToken;
  }
  // carry over whitespace, chunk tags and immunization from the source token
  final AnalyzedTokenReadings anTkRead = new AnalyzedTokenReadings(l.toArray(new AnalyzedToken[l.size()]), formattedToken.getStartPos());
  anTkRead.setWhitespaceBefore(formattedToken.isWhitespaceBefore());
  if (!formattedToken.getChunkTags().isEmpty()) {
    anTkRead.setChunkTags(formattedToken.getChunkTags());
  }
  if (formattedToken.isImmunized()) {
    anTkRead.immunize();
  }
  return anTkRead;
}
Use of org.languagetool.AnalyzedTokenReadings in the languagetool project (by languagetool-org):
the match method of the AdvancedWordRepeatRule class.
/**
 * Tests if any word form is repeated in the sentence, i.e. whether two
 * different tokens share a lemma (or, for lemma-less tokens, the literal
 * token string). Short tokens, excluded words/POS tags and non-word patterns
 * are skipped; a rule match is created at the position of the repeated token.
 *
 * @param sentence the analyzed sentence to check
 * @return matches for every detected repetition, possibly empty
 */
@Override
public final RuleMatch[] match(AnalyzedSentence sentence) {
  List<RuleMatch> ruleMatches = new ArrayList<>();
  AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
  boolean repetition = false;
  // word forms (lemmas or raw tokens) already seen in this sentence
  Set<String> inflectedWords = new TreeSet<>();
  String prevLemma;
  // index of the token that last contributed a lemma, so a token does not
  // count as a repetition of itself
  int curToken = 0;
  // start from real token, 0 = SENT_START
  for (int i = 1; i < tokens.length; i++) {
    String token = tokens[i].getToken();
    // avoid "..." etc. to be matched:
    boolean isWord = true;
    boolean hasLemma = true;
    if (token.length() < 2) {
      isWord = false;
    }
    // inspect all readings: a token is excluded if any reading has an empty
    // POS tag, an excluded lemma, or an excluded POS tag
    for (AnalyzedToken analyzedToken : tokens[i]) {
      String posTag = analyzedToken.getPOSTag();
      if (posTag != null) {
        if (StringTools.isEmpty(posTag)) {
          isWord = false;
          break;
        }
        String lemma = analyzedToken.getLemma();
        if (lemma == null) {
          hasLemma = false;
          break;
        }
        if (getExcludedWordsPattern().contains(lemma)) {
          isWord = false;
          break;
        }
        Matcher m2 = getExcludedPos().matcher(posTag);
        if (m2.matches()) {
          isWord = false;
          break;
        }
      } else {
        hasLemma = false;
      }
    }
    Matcher m1 = getExcludedNonWordsPattern().matcher(tokens[i].getToken());
    if (isWord && m1.matches()) {
      isWord = false;
    }
    prevLemma = "";
    if (isWord) {
      // FIX: this flag was named 'notSentEnd' although it is set to TRUE when
      // a SENT_END reading IS found — renamed to 'isSentEnd' to match its
      // actual meaning (behavior is unchanged).
      boolean isSentEnd = false;
      for (AnalyzedToken analyzedToken : tokens[i]) {
        String pos = analyzedToken.getPOSTag();
        if (pos != null) {
          isSentEnd |= JLanguageTool.SENTENCE_END_TAGNAME.equals(pos);
        }
        if (hasLemma) {
          String curLemma = analyzedToken.getLemma();
          // consider each distinct lemma of this token only once in a row
          if (!prevLemma.equals(curLemma) && !isSentEnd) {
            if (inflectedWords.contains(curLemma) && curToken != i) {
              repetition = true;
            } else {
              inflectedWords.add(curLemma);
              curToken = i;
            }
          }
          prevLemma = curLemma;
        } else {
          // no lemma available — fall back to comparing raw token strings
          if (inflectedWords.contains(tokens[i].getToken()) && !isSentEnd) {
            repetition = true;
          } else {
            inflectedWords.add(tokens[i].getToken());
          }
        }
      }
    }
    if (repetition) {
      int pos = tokens[i].getStartPos();
      RuleMatch ruleMatch = new RuleMatch(this, pos, pos + token.length(), getMessage(), getShortMessage());
      ruleMatches.add(ruleMatch);
      repetition = false;
    }
  }
  return toRuleMatchArray(ruleMatches);
}
Use of org.languagetool.AnalyzedTokenReadings in the languagetool project (by languagetool-org):
the match method of the DemoRule class.
// This is the method with the error detection logic that you need to implement:
/**
 * Error-detection logic of the demo rule: prints every token with all its
 * readings (lemma and POS tag), and flags each literal occurrence of the
 * word "demo" with a suggested replacement.
 *
 * @param sentence the analyzed sentence to check
 * @return one match per flagged token, possibly empty
 * @throws IOException declared for interface compatibility
 */
@Override
public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
  List<RuleMatch> foundMatches = new ArrayList<>();
  // iterate over the words of the sentence, skipping whitespace tokens;
  // note that the first element is a special sentence-start token
  for (AnalyzedTokenReadings tokenReadings : sentence.getTokensWithoutWhitespace()) {
    // the surface form exactly as it appeared in the input text
    System.out.println("Token: " + tokenReadings.getToken());
    // a token can be ambiguous, so dump every reading it carries:
    for (AnalyzedToken reading : tokenReadings.getReadings()) {
      System.out.println(" Lemma: " + reading.getLemma());
      System.out.println(" POS: " + reading.getPOSTag());
    }
    // create an error match for the word we want to flag:
    if ("demo".equals(tokenReadings.getToken())) {
      RuleMatch ruleMatch = new RuleMatch(this, tokenReadings.getStartPos(), tokenReadings.getEndPos(), "The demo rule thinks this looks wrong");
      // shown to the user as the suggested correction
      ruleMatch.setSuggestedReplacement("blablah");
      foundMatches.add(ruleMatch);
    }
  }
  return toRuleMatchArray(foundMatches);
}
Use of org.languagetool.AnalyzedTokenReadings in the languagetool project (by languagetool-org):
the testBadSentences method of the PatternRuleTest class.
/**
 * Verifies the "incorrect" example sentences attached to a pattern rule:
 * each bad example must trigger exactly one match at the position marked
 * with {@code <marker>...</marker>}, the suggestions must match the expected
 * corrections, and applying a suggestion must not trigger the rule again.
 *
 * @param languageTool checker used to run the rule under test
 * @param allRulesLanguageTool checker with all pattern rules enabled, needed
 *        to resolve sub-rules generated from XML {@code <or>} elements
 * @param lang language under test, used in failure messages
 * @param complexRules accumulator for rules built from complex phrases; an
 *        entry keyed by (ruleId + sentence) is stored when no match was found
 *        and nulled out once a match is seen
 * @param rule the pattern rule whose incorrect examples are checked
 * @throws IOException if sentence analysis fails
 */
private void testBadSentences(JLanguageTool languageTool, JLanguageTool allRulesLanguageTool, Language lang, Map<String, AbstractPatternRule> complexRules, AbstractPatternRule rule) throws IOException {
  List<IncorrectExample> badSentences = rule.getIncorrectExamples();
  if (badSentences.isEmpty()) {
    fail("No incorrect examples found for rule " + rule.getFullId());
  }
  // necessary for XML Pattern rules containing <or>
  List<AbstractPatternRule> rules = allRulesLanguageTool.getPatternRulesByIdAndSubId(rule.getId(), rule.getSubId());
  for (IncorrectExample origBadExample : badSentences) {
    // strip newlines/tabs so indented XML examples compare cleanly
    String origBadSentence = origBadExample.getExample().replaceAll("[\\n\\t]+", "");
    List<String> expectedCorrections = origBadExample.getCorrections();
    int expectedMatchStart = origBadSentence.indexOf("<marker>");
    int expectedMatchEnd = origBadSentence.indexOf("</marker>") - "<marker>".length();
    // FIX: when "</marker>" is missing, indexOf() returns -1, so
    // expectedMatchEnd becomes -1 - 8 = -9 and the old "== -1" check never
    // fired; "< 0" catches the missing close marker as well.
    if (expectedMatchStart == -1 || expectedMatchEnd < 0) {
      fail(lang + ": No error position markup ('<marker>...</marker>') in bad example in rule " + rule.getFullId());
    }
    String badSentence = cleanXML(origBadSentence);
    assertTrue(badSentence.trim().length() > 0);
    // collect matches from every sub-rule sharing this id/subId (<or> expansion)
    List<RuleMatch> matches = new ArrayList<>();
    for (Rule auxRule : rules) {
      matches.addAll(getMatches(auxRule, badSentence, languageTool));
    }
    if (rule instanceof RegexPatternRule || rule instanceof PatternRule && !((PatternRule) rule).isWithComplexPhrase()) {
      if (matches.size() != 1) {
        // build a diagnostic dump of the analyzed sentence before failing
        AnalyzedSentence analyzedSentence = languageTool.getAnalyzedSentence(badSentence);
        StringBuilder sb = new StringBuilder("Analyzed token readings:");
        for (AnalyzedTokenReadings atr : analyzedSentence.getTokens()) {
          sb.append(" ").append(atr);
        }
        String info = "";
        if (rule instanceof RegexPatternRule) {
          info = "\nRegexp: " + ((RegexPatternRule) rule).getPattern().toString();
        }
        fail(lang + " rule " + rule.getFullId() + ":\n\"" + badSentence + "\"\n" + "Errors expected: 1\n" + "Errors found : " + matches.size() + "\n" + "Message: " + rule.getMessage() + "\n" + sb + "\nMatches: " + matches + info);
      }
      assertEquals(lang + ": Incorrect match position markup (start) for rule " + rule.getFullId() + ", sentence: " + badSentence, expectedMatchStart, matches.get(0).getFromPos());
      assertEquals(lang + ": Incorrect match position markup (end) for rule " + rule.getFullId() + ", sentence: " + badSentence, expectedMatchEnd, matches.get(0).getToPos());
      // make sure suggestion is what we expect it to be
      assertSuggestions(badSentence, lang, expectedCorrections, rule, matches);
      // make sure the suggested correction doesn't produce an error:
      if (!matches.get(0).getSuggestedReplacements().isEmpty()) {
        int fromPos = matches.get(0).getFromPos();
        int toPos = matches.get(0).getToPos();
        // the for-each iterable is evaluated once, so reassigning 'matches'
        // inside the loop is safe
        for (String replacement : matches.get(0).getSuggestedReplacements()) {
          String fixedSentence = badSentence.substring(0, fromPos) + replacement + badSentence.substring(toPos);
          matches = getMatches(rule, fixedSentence, languageTool);
          if (!matches.isEmpty()) {
            fail("Incorrect input:\n" + " " + badSentence + "\nCorrected sentence:\n" + " " + fixedSentence + "\nBy Rule:\n" + " " + rule.getFullId() + "\nThe correction triggered an error itself:\n" + " " + matches.get(0) + "\n");
          }
        }
      }
    } else {
      // for multiple rules created with complex phrases
      matches = getMatches(rule, badSentence, languageTool);
      if (matches.isEmpty() && !complexRules.containsKey(rule.getId() + badSentence)) {
        complexRules.put(rule.getId() + badSentence, rule);
      }
      if (!matches.isEmpty()) {
        complexRules.put(rule.getId() + badSentence, null);
        assertTrue(lang + ": Did expect one error in: \"" + badSentence + "\" (Rule: " + rule.getFullId() + "), got " + matches.size(), matches.size() == 1);
        assertEquals(lang + ": Incorrect match position markup (start) for rule " + rule.getFullId(), expectedMatchStart, matches.get(0).getFromPos());
        assertEquals(lang + ": Incorrect match position markup (end) for rule " + rule.getFullId(), expectedMatchEnd, matches.get(0).getToPos());
        assertSuggestions(badSentence, lang, expectedCorrections, rule, matches);
        assertSuggestionsDoNotCreateErrors(badSentence, languageTool, rule, matches);
      }
    }
    // disabled overlapping-rule check, kept for reference:
    /*matches = getMatches(rule, badSentence, languageTool);
    List<RuleMatch> matchesAllRules = allRulesLanguageTool.check(badSentence);
    for (RuleMatch match : matchesAllRules) {
      if (!match.getRule().getId().equals(rule.getId()) && !matches.isEmpty()
        && rangeIsOverlapping(matches.get(0).getFromPos(), matches.get(0).getToPos(), match.getFromPos(), match.getToPos()))
        System.err.println("WARN: " + lang.getShortCode() + ": '" + badSentence + "' in "
          + rule.getId() + " also matched " + match.getRule().getId());
    }*/
  }
}
Use of org.languagetool.AnalyzedTokenReadings in the languagetool project (by languagetool-org):
the getContext method of the ContextBuilder class.
/**
 * Returns the token at {@code pos} surrounded by its left and right context
 * (as produced by {@code getLeftContext}/{@code getRightContext}).
 *
 * @param tokens the tokens of the analyzed sentence
 * @param pos index of the token of interest
 * @param contextSize how much context to collect on each side
 * @return left context + token + right context, or an empty list when
 *         {@code pos} is out of range
 */
public List<String> getContext(AnalyzedTokenReadings[] tokens, int pos, int contextSize) {
  List<String> result = new ArrayList<>();
  // FIX: the original walked the array with a manual counter just to verify
  // that 'pos' is a valid index; a direct bounds check is equivalent
  // (negative or too-large 'pos' yields an empty list, exactly as before).
  if (pos >= 0 && pos < tokens.length) {
    result.addAll(getLeftContext(tokens, pos, contextSize));
    result.add(tokens[pos].getToken());
    result.addAll(getRightContext(tokens, pos, contextSize));
  }
  return result;
}
Aggregations