Example usage of org.languagetool.AnalyzedToken in the languagetool project (languagetool-org):
the match method of the AbstractCompoundRule class.
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
List<RuleMatch> ruleMatches = new ArrayList<>();
AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace();
RuleMatch prevRuleMatch = null;
Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<>(MAX_TERMS);
for (int i = 0; i < tokens.length + MAX_TERMS - 1; i++) {
AnalyzedTokenReadings token;
// we need to extend the token list so we find matches at the end of the original list:
if (i >= tokens.length) {
token = new AnalyzedTokenReadings(new AnalyzedToken("", "", null), prevTokens.peek().getStartPos());
} else {
token = tokens[i];
}
if (i == 0) {
addToQueue(token, prevTokens);
continue;
}
if (token.isImmunized()) {
continue;
}
AnalyzedTokenReadings firstMatchToken = prevTokens.peek();
List<String> stringsToCheck = new ArrayList<>();
// original upper/lowercase spelling
List<String> origStringsToCheck = new ArrayList<>();
Map<String, AnalyzedTokenReadings> stringToToken = getStringToTokenMap(prevTokens, stringsToCheck, origStringsToCheck);
// sure we match longer strings first:
for (int k = stringsToCheck.size() - 1; k >= 0; k--) {
String stringToCheck = stringsToCheck.get(k);
String origStringToCheck = origStringsToCheck.get(k);
if (getCompoundRuleData().getIncorrectCompounds().contains(stringToCheck)) {
AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
String msg = null;
List<String> replacement = new ArrayList<>();
if (!getCompoundRuleData().getNoDashSuggestion().contains(stringToCheck)) {
replacement.add(origStringToCheck.replace(' ', '-'));
msg = withHyphenMessage;
}
if (isNotAllUppercase(origStringToCheck) && !getCompoundRuleData().getOnlyDashSuggestion().contains(stringToCheck)) {
replacement.add(mergeCompound(origStringToCheck));
msg = withoutHyphenMessage;
}
String[] parts = stringToCheck.split(" ");
if (parts.length > 0 && parts[0].length() == 1) {
replacement.clear();
replacement.add(origStringToCheck.replace(' ', '-'));
msg = withHyphenMessage;
} else if (replacement.isEmpty() || replacement.size() == 2) {
// isEmpty shouldn't happen
msg = withOrWithoutHyphenMessage;
}
RuleMatch ruleMatch = new RuleMatch(this, firstMatchToken.getStartPos(), atr.getEndPos(), msg, shortDesc);
ruleMatch.setSuggestedReplacements(replacement);
// avoid duplicate matches:
if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) {
prevRuleMatch = ruleMatch;
break;
}
prevRuleMatch = ruleMatch;
ruleMatches.add(ruleMatch);
break;
}
}
addToQueue(token, prevTokens);
}
return toRuleMatchArray(ruleMatches);
}
Example usage of org.languagetool.AnalyzedToken in the languagetool project (languagetool-org):
the match method of the AbstractSimpleReplaceRule class.
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
List<RuleMatch> ruleMatches = new ArrayList<>();
AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
for (AnalyzedTokenReadings tokenReadings : tokens) {
// short for SENT_START
if (JLanguageTool.SENTENCE_START_TAGNAME.equals(tokenReadings.getAnalyzedToken(0).getPOSTag()))
continue;
// and speller-ignorable rules
if (tokenReadings.isImmunized() || tokenReadings.isIgnoredBySpeller()) {
continue;
}
String originalTokenStr = tokenReadings.getToken();
if (ignoreTaggedWords && isTagged(tokenReadings)) {
continue;
}
String tokenString = cleanup(originalTokenStr);
// try first with the original word, then with the all lower-case version
List<String> possibleReplacements = getWrongWords().get(originalTokenStr);
if (possibleReplacements == null) {
possibleReplacements = getWrongWords().get(tokenString);
}
if (possibleReplacements == null && checkLemmas) {
possibleReplacements = new ArrayList<>();
List<String> lemmas = new ArrayList<>();
for (AnalyzedToken analyzedToken : tokenReadings.getReadings()) {
String lemma = analyzedToken.getLemma();
if (lemma != null && getWrongWords().containsKey(lemma) && !lemmas.contains(lemma)) {
lemmas.add(cleanup(lemma));
}
}
for (String lemma : lemmas) {
List<String> replacements = getWrongWords().get(lemma);
if (replacements != null) {
possibleReplacements.addAll(replacements);
}
}
possibleReplacements = possibleReplacements.stream().distinct().collect(Collectors.toList());
}
if (possibleReplacements != null && possibleReplacements.size() > 0) {
List<String> replacements = new ArrayList<>();
replacements.addAll(possibleReplacements);
if (replacements.contains(originalTokenStr)) {
replacements.remove(originalTokenStr);
}
if (replacements.size() > 0) {
RuleMatch potentialRuleMatch = createRuleMatch(tokenReadings, replacements);
ruleMatches.add(potentialRuleMatch);
}
}
}
return toRuleMatchArray(ruleMatches);
}
Example usage of org.languagetool.AnalyzedToken in the languagetool project (languagetool-org):
the match method of the AbstractWordCoherencyRule class.
@Override
public RuleMatch[] match(List<AnalyzedSentence> sentences) {
List<RuleMatch> ruleMatches = new ArrayList<>();
// e.g. aufwändig -> RuleMatch of aufwendig
Map<String, RuleMatch> shouldNotAppearWord = new HashMap<>();
int pos = 0;
for (AnalyzedSentence sentence : sentences) {
AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
for (AnalyzedTokenReadings tmpToken : tokens) {
String token = tmpToken.getToken();
List<AnalyzedToken> readings = tmpToken.getReadings();
// TODO: in theory we need to care about the other readings, too (affects e.g. German "Schenke" as a noun):
if (readings.size() > 0) {
String baseform = readings.get(0).getLemma();
if (baseform != null) {
token = baseform;
}
}
if (shouldNotAppearWord.containsKey(token)) {
RuleMatch otherMatch = shouldNotAppearWord.get(token);
String otherSpelling = otherMatch.getMessage();
String msg = getMessage(token, otherSpelling);
RuleMatch ruleMatch = new RuleMatch(this, pos + tmpToken.getStartPos(), pos + tmpToken.getEndPos(), msg);
ruleMatch.setSuggestedReplacement(otherSpelling);
ruleMatches.add(ruleMatch);
} else if (getWordMap().containsKey(token)) {
String shouldNotAppear = getWordMap().get(token);
RuleMatch potentialRuleMatch = new RuleMatch(this, pos + tmpToken.getStartPos(), pos + tmpToken.getEndPos(), token);
shouldNotAppearWord.put(shouldNotAppear, potentialRuleMatch);
}
}
pos += sentence.getText().length();
}
return toRuleMatchArray(ruleMatches);
}
Example usage of org.languagetool.AnalyzedToken in the languagetool project (languagetool-org):
the getAgreementCategories method of the AgreementRule class.
/** Return Kasus, Numerus, Genus of those forms with a determiner. */
private Set<String> getAgreementCategories(AnalyzedTokenReadings aToken, Set<GrammarCategory> omit, boolean skipSol) {
Set<String> set = new HashSet<>();
List<AnalyzedToken> readings = aToken.getReadings();
for (AnalyzedToken tmpReading : readings) {
if (skipSol && tmpReading.getPOSTag() != null && tmpReading.getPOSTag().endsWith(":SOL")) {
// SOL = alleinstehend - needs to be skipped so we find errors like "An der roter Ampel."
continue;
}
AnalyzedGermanToken reading = new AnalyzedGermanToken(tmpReading);
if (reading.getCasus() == null && reading.getNumerus() == null && reading.getGenus() == null) {
continue;
}
if (reading.getGenus() == GermanToken.Genus.ALLGEMEIN && tmpReading.getPOSTag() != null && // STV: stellvertretend (!= begleitend)
!tmpReading.getPOSTag().endsWith(":STV") && !possessiveSpecialCase(aToken, tmpReading)) {
// e.g. "Ich Arbeiter" doesn't get flagged as incorrect:
if (reading.getDetermination() == null) {
// Nouns don't have the determination property (definite/indefinite), and as we don't want to
// introduce a special case for that, we just pretend they always fulfill both properties:
set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.MASKULINUM, GermanToken.Determination.DEFINITE, omit));
set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.MASKULINUM, GermanToken.Determination.INDEFINITE, omit));
set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.FEMININUM, GermanToken.Determination.DEFINITE, omit));
set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.FEMININUM, GermanToken.Determination.INDEFINITE, omit));
set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.NEUTRUM, GermanToken.Determination.DEFINITE, omit));
set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.NEUTRUM, GermanToken.Determination.INDEFINITE, omit));
} else {
set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.MASKULINUM, reading.getDetermination(), omit));
set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.FEMININUM, reading.getDetermination(), omit));
set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.NEUTRUM, reading.getDetermination(), omit));
}
} else {
if (reading.getDetermination() == null || "jed".equals(tmpReading.getLemma()) || "manch".equals(tmpReading.getLemma())) {
// "jeder" etc. needs a special case to avoid false alarm
set.add(makeString(reading.getCasus(), reading.getNumerus(), reading.getGenus(), GermanToken.Determination.DEFINITE, omit));
set.add(makeString(reading.getCasus(), reading.getNumerus(), reading.getGenus(), GermanToken.Determination.INDEFINITE, omit));
} else {
set.add(makeString(reading.getCasus(), reading.getNumerus(), reading.getGenus(), reading.getDetermination(), omit));
}
}
}
return set;
}
Example usage of org.languagetool.AnalyzedToken in the languagetool project (languagetool-org):
the getSuggestions method of the AgreementSuggestor class.
List<String> getSuggestions() {
Set<String> suggestionSet = new HashSet<>();
try {
for (AnalyzedToken token2Reading : nounToken.getReadings()) {
String nounCase = GermanHelper.getNounCase(token2Reading.getPOSTag());
String nounNumber = GermanHelper.getNounNumber(token2Reading.getPOSTag());
String nounGender = GermanHelper.getNounGender(token2Reading.getPOSTag());
for (AnalyzedToken token1Reading : determinerToken.getReadings()) {
List<String> articleSuggestions = getArticleSuggestions(nounCase, nounNumber, nounGender, token1Reading);
suggestionSet.addAll(articleSuggestions);
List<String> pronounSuggestions = getPronounSuggestions(nounCase, nounNumber, nounGender, token1Reading);
suggestionSet.addAll(pronounSuggestions);
List<String> nounSuggestions = getNounSuggestions(token2Reading, token1Reading);
suggestionSet.addAll(nounSuggestions);
}
}
} catch (IOException e) {
throw new RuntimeException(e);
}
List<String> suggestions = new ArrayList<>(suggestionSet);
Collections.sort(suggestions);
return suggestions;
}
Aggregations