use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class MissingVerbRule method match.
/**
 * Reports a sentence in which no verb could be detected.
 *
 * <p>Nothing is reported when the sentence is not a "real" sentence (see
 * {@code isRealSentence}), when {@code isSpecialCase} accepts it, or when it has
 * fewer than {@code MIN_TOKENS_FOR_ERROR} non-whitespace tokens.
 *
 * @param sentence the analyzed sentence to check
 * @return an array with a single match spanning from offset 0 to the end of the last
 *         token if no verb was found, otherwise an empty array
 * @throws IOException declared by the overridden method
 */
@Override
public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
  if (!isRealSentence(sentence) || isSpecialCase(sentence)) {
    return new RuleMatch[0];
  }
  AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
  for (int pos = 0; pos < tokens.length; pos++) {
    AnalyzedTokenReadings current = tokens[pos];
    // A reading whose POS tag contains "VER" counts as a verb.
    if (current.hasPartialPosTag("VER")) {
      return new RuleMatch[0];
    }
    // Ignore unknown words to avoid false alarms: an untagged word that is not
    // capitalized is treated as if it were a verb.
    if (!current.isTagged() && !StringTools.isCapitalizedWord(current.getToken())) {
      return new RuleMatch[0];
    }
    // The token at index 1 gets an additional sentence-start verb check.
    if (pos == 1 && verbAtSentenceStart(current)) {
      return new RuleMatch[0];
    }
  }
  if (tokens.length == 0 || tokens.length < MIN_TOKENS_FOR_ERROR) {
    return new RuleMatch[0];
  }
  AnalyzedTokenReadings finalToken = tokens[tokens.length - 1];
  int matchEnd = finalToken.getStartPos() + finalToken.getToken().length();
  RuleMatch noVerbMatch = new RuleMatch(this, 0, matchEnd, "Dieser Satz scheint kein Verb zu enthalten");
  return new RuleMatch[] { noVerbMatch };
}
use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class MissingVerbRule method isRealSentence.
/**
 * Tells whether the sentence ends with one of the punctuation tokens
 * {@code "."}, {@code "?"} or {@code "!"}.
 *
 * <p>We want to ignore headlines, and these usually don't end with [.?!].
 *
 * @param sentence the analyzed sentence to inspect
 * @return {@code true} if the last non-whitespace token is ".", "?" or "!";
 *         {@code false} otherwise, including when there are no tokens
 */
private boolean isRealSentence(AnalyzedSentence sentence) {
  AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
  if (tokens.length == 0) {
    return false;
  }
  String finalTokenText = tokens[tokens.length - 1].getToken();
  switch (finalTokenText) {
    case ".":
    case "?":
    case "!":
      return true;
    default:
      return false;
  }
}
use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class SubjectVerbAgreementRule method getSingularMatchOrNull.
/**
 * Builds a match suggesting the plural form when {@code tokenStr} is listed in
 * {@code singular} and the token to its left passes all of the checks below.
 *
 * @param tokens   the sentence tokens; {@code tokens[i - 1]} is read whenever
 *                 {@code tokenStr} is contained in {@code singular}
 * @param i        index of {@code token} within {@code tokens}
 * @param token    the token whose start and end positions delimit the match
 * @param tokenStr the text looked up in {@code singular} and passed to {@code getPluralFor}
 * @return a match covering {@code token}, or {@code null} if any check fails
 * @throws IOException declared for the helper calls made here
 */
@Nullable
private RuleMatch getSingularMatchOrNull(AnalyzedTokenReadings[] tokens, int i, AnalyzedTokenReadings token, String tokenStr) throws IOException {
  if (!singular.contains(tokenStr)) {
    return null;
  }
  AnalyzedTokenReadings previous = tokens[i - 1];
  AnalyzedTokenReadings following = i + 1 < tokens.length ? tokens[i + 1] : null;
  List<ChunkTag> previousChunks = previous.getChunkTags();
  // The previous token must carry the NPP chunk tag and must not carry PP.
  if (!previousChunks.contains(NPP) || previousChunks.contains(PP)) {
    return null;
  }
  // 'um 18 Uhr ist Feierabend'
  if (previous.getToken().equals("Uhr")) {
    return null;
  }
  if (isCurrency(previous)) {
    return null;
  }
  // 'zehn Jahre ist es her'
  if (following != null && following.getToken().equals("es")) {
    return null;
  }
  if (!prevChunkIsNominative(tokens, i - 1)) {
    return null;
  }
  if (hasUnknownTokenToTheLeft(tokens, i)) {
    return null;
  }
  if (hasQuestionPronounToTheLeft(tokens, i - 1)) {
    return null;
  }
  // Any of these words to the left suppresses the match; checked in this order.
  String[] blockingPatterns = { "wer", "(?i)alle[nr]?", "(?i)jede[rs]?", "(?i)manche[nrs]?" };
  for (String pattern : blockingPatterns) {
    if (containsRegexToTheLeft(pattern, tokens, i - 1)) {
      return null;
    }
  }
  if (containsOnlyInfinitivesToTheLeft(tokens, i - 1)) {
    return null;
  }
  String message = "Bitte prüfen, ob hier <suggestion>" + getPluralFor(tokenStr) + "</suggestion> stehen sollte.";
  return new RuleMatch(this, token.getStartPos(), token.getEndPos(), message);
}
use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class Main method appendTagsWithDisambigLog.
/**
 * Appends one HTML table row per token of {@code sentence} to {@code sb}.
 *
 * <p>Each row has two cells: the first holds the token text followed by its readings,
 * chunk tags and an immunization marker; the second holds the token's historical
 * annotations. Whitespace tokens are skipped unless they are the sentence start.
 * Row background colors alternate, driven by {@code odd}.
 *
 * <p>Token text and readings are HTML-escaped before being appended, like the
 * sentence markers and the historical annotations, so that characters such as
 * {@code <} or {@code &} in the checked text cannot break the generated markup.
 *
 * @param sb       the builder receiving the HTML rows
 * @param sentence the analyzed sentence whose tokens are rendered
 * @param odd      the alternation state before the first row; it is flipped once per row
 * @return the alternation state after the last row, to be passed to the next call
 */
private boolean appendTagsWithDisambigLog(StringBuilder sb, AnalyzedSentence sentence, boolean odd) {
  for (AnalyzedTokenReadings t : sentence.getTokens()) {
    if (t.isWhitespace() && !t.isSentenceStart()) {
      continue;
    }
    odd = !odd;
    // Both cells of a row share the same background color.
    String rowColor = odd ? "#ffffff" : "#f1f1f1";
    sb.append("<tr>");
    sb.append("<td bgcolor=\"").append(rowColor).append("\">");
    if (!t.isWhitespace()) {
      // Escape the raw token text: it comes from the checked document.
      sb.append(StringTools.escapeHTML(t.getToken()));
      sb.append("<font color='");
      sb.append(TAG_COLOR);
      sb.append("'>[");
    }
    Iterator<AnalyzedToken> iterator = t.iterator();
    while (iterator.hasNext()) {
      AnalyzedToken token = iterator.next();
      String posTag = token.getPOSTag();
      if (t.isSentenceStart()) {
        sb.append(StringTools.escapeHTML("<S>"));
      } else if (JLanguageTool.SENTENCE_END_TAGNAME.equals(posTag)) {
        sb.append(StringTools.escapeHTML("</S>"));
      } else if (JLanguageTool.PARAGRAPH_END_TAGNAME.equals(posTag)) {
        sb.append(StringTools.escapeHTML("<P/>"));
      } else if (!t.isWhitespace()) {
        // The reading's string form may contain markup characters, so escape it too.
        sb.append(StringTools.escapeHTML(token.toString()));
        if (iterator.hasNext()) {
          sb.append(", ");
        }
      }
    }
    if (!t.isWhitespace()) {
      if (!t.getChunkTags().isEmpty()) {
        sb.append(',');
        sb.append(StringUtils.join(t.getChunkTags(), "|"));
      }
      if (t.isImmunized()) {
        sb.append("{!}");
      }
      sb.append("]</font>");
    } else {
      sb.append(' ');
    }
    sb.append("</td>");
    sb.append("<td bgcolor=\"").append(rowColor).append("\">");
    if (!"".equals(t.getHistoricalAnnotations())) {
      sb.append(StringTools.escapeHTML(t.getHistoricalAnnotations()).trim().replace("\n", "<br>"));
    }
    sb.append("</td>");
    sb.append("</tr>");
  }
  return odd;
}
use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class TokenInflectionAgreementRule method match.
/**
 * Looks for an adjective immediately followed by a noun whose inflections
 * (as computed by {@code InflectionHelper}) have nothing in common, and reports
 * each such pair together with synthesized replacement suggestions.
 *
 * <p>The scan starts at index 1 of the non-whitespace tokens. A token starts a
 * candidate pair only if all of its tagged readings are adjectives and the next
 * token has a noun reading; several tag-based and lemma-based exceptions skip a
 * candidate. The following token completes the pair only if its readings are
 * nouns (sentence/paragraph end readings are tolerated).
 *
 * <p>Changes from the earlier revision: (1) the lemma-based exception now skips only
 * the current token ({@code continue}) instead of ending the scan of the whole
 * sentence, matching the neighbouring skip conditions; (2) the "Found" debug message
 * now prints its second argument.
 *
 * @param text the analyzed sentence to check
 * @return the matches found, converted with {@code toRuleMatchArray}
 */
@Override
public final RuleMatch[] match(AnalyzedSentence text) {
  List<RuleMatch> ruleMatches = new ArrayList<>();
  AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
  // Adjective readings of the pending candidate; empty means "no adjective pending".
  List<AnalyzedToken> adjTokenReadings = new ArrayList<>();
  AnalyzedTokenReadings adjAnalyzedTokenReadings = null;
  for (int i = 1; i < tokens.length; i++) {
    AnalyzedTokenReadings tokenReadings = tokens[i];
    String posTag0 = tokenReadings.getAnalyzedToken(0).getPOSTag();
    if (posTag0 == null) {
      // || posTag0.equals(JLanguageTool.SENTENCE_START_TAGNAME) ){
      adjTokenReadings.clear();
      continue;
    }
    if (adjTokenReadings.isEmpty()) {
      // no need to start checking on last token or if no noun
      if (i == tokens.length - 1) {
        continue;
      }
      //TODO: nv still can be wrong if :np/:ns is present to it's not much gain for lots of work
      if (PosTagHelper.hasPosTagPart(tokens[i], ":nv") || //TODO: turn back on when we can handle pron
      PosTagHelper.hasPosTagPart(tokens[i], "&pron") || PosTagHelper.hasPosTagPart(tokens[i], "<")) {
        continue;
      }
      if (!PosTagHelper.hasPosTagPart(tokens[i + 1], "noun:") || PosTagHelper.hasPosTagPart(tokens[i + 1], ":nv") || PosTagHelper.hasPosTagPart(tokens[i + 1], "&pron") || PosTagHelper.hasPosTagPart(tokens[i + 1], "<")) {
        continue;
      }
      // Lemma-based exceptions: skip this token as an adjective candidate but keep
      // scanning the rest of the sentence (previously a 'break' ended the whole scan).
      if (LemmaHelper.hasLemma(tokens[i], Arrays.asList("червоний", "правий", "місцевий", "найсильніший", "найкращі"), ":p:") || LemmaHelper.hasLemma(tokens[i], Arrays.asList("новенький", "головний", "вибраний", "більший", "побачений", "подібний"), ":n:") || LemmaHelper.hasLemma(tokens[i], Arrays.asList("державний"), ":f:")) {
        adjTokenReadings.clear();
        continue;
      }
      // Collect adjective readings; any tagged non-adjective reading cancels the candidate.
      for (AnalyzedToken token : tokenReadings) {
        String adjPosTag = token.getPOSTag();
        if (adjPosTag == null) {
          // untagged reading; per the original note this can happen for words
          // containing an accent mark or similar characters
          continue;
        }
        if (adjPosTag.startsWith("adj")) {
          adjTokenReadings.add(token);
          adjAnalyzedTokenReadings = tokenReadings;
        } else {
          adjTokenReadings.clear();
          break;
        }
      }
      continue;
    }
    // An adjective is pending: collect the noun readings of the current token.
    List<AnalyzedToken> slaveTokenReadings = new ArrayList<>();
    for (AnalyzedToken token : tokenReadings) {
      String nounPosTag = token.getPOSTag();
      if (nounPosTag == null) {
        // untagged reading; per the original note this can happen for words
        // containing an accent mark or similar characters
        continue;
      }
      if (nounPosTag.startsWith("noun") && !nounPosTag.contains(NO_VIDMINOK_SUBSTR)) {
        slaveTokenReadings.add(token);
      } else if (nounPosTag.equals(JLanguageTool.SENTENCE_END_TAGNAME) || nounPosTag.equals(JLanguageTool.PARAGRAPH_END_TAGNAME)) {
        continue;
      } else {
        slaveTokenReadings.clear();
        break;
      }
    }
    if (slaveTokenReadings.isEmpty()) {
      adjTokenReadings.clear();
      continue;
    }
    if (DEBUG) {
      System.err.println(MessageFormat.format("=== Checking:\n\t{0}\n\t{1}", adjTokenReadings, slaveTokenReadings));
    }
    // perform the check
    List<InflectionHelper.Inflection> masterInflections = InflectionHelper.getAdjInflections(adjTokenReadings);
    List<InflectionHelper.Inflection> slaveInflections = InflectionHelper.getNounInflections(slaveTokenReadings);
    if (Collections.disjoint(masterInflections, slaveInflections)) {
      if (TokenInflectionExceptionHelper.isException(tokens, i, masterInflections, slaveInflections, adjTokenReadings, slaveTokenReadings)) {
        adjTokenReadings.clear();
        continue;
      }
      if (DEBUG) {
        // The pattern needs {1}; without it the noun argument was silently dropped.
        System.err.println(MessageFormat.format("=== Found:\n\t{0}\n\t{1}", adjAnalyzedTokenReadings.getToken() + ": " + masterInflections + " // " + adjAnalyzedTokenReadings, slaveTokenReadings.get(0).getToken() + ": " + slaveInflections + " // " + slaveTokenReadings));
      }
      String msg = String.format("Потенційна помилка: прикметник не узгоджений з іменником: \"%s\": [%s] і \"%s\": [%s]", adjTokenReadings.get(0).getToken(), formatInflections(masterInflections, true), slaveTokenReadings.get(0).getToken(), formatInflections(slaveInflections, false));
      if (PosTagHelper.hasPosTagPart(adjTokenReadings, ":m:v_rod") && tokens[i].getToken().matches(".*[ую]") && PosTagHelper.hasPosTag(slaveTokenReadings, "noun.*:m:v_dav.*")) {
        msg += ". Можливо вжито невнормований родовий відмінок ч.р. з закінченням -у/-ю замість -а/-я (така тенденція є в сучасній мові)?";
      }
      // The match spans from the start of the adjective to the end of the noun.
      RuleMatch potentialRuleMatch = new RuleMatch(this, adjAnalyzedTokenReadings.getStartPos(), tokenReadings.getEndPos(), msg, getShort());
      Synthesizer ukrainianSynthesizer = ukrainian.getSynthesizer();
      List<String> suggestions = new ArrayList<>();
      try {
        // First: keep the adjective as written and re-inflect the noun to agree with it.
        for (Inflection adjInflection : masterInflections) {
          String genderTag = ":" + adjInflection.gender + ":";
          String vidmTag = adjInflection._case;
          if (!adjInflection._case.equals("v_kly") && (adjInflection.gender.equals("p") || PosTagHelper.hasPosTagPart(slaveTokenReadings, genderTag))) {
            for (AnalyzedToken nounToken : slaveTokenReadings) {
              if (adjInflection.animMatters()) {
                if (!nounToken.getPOSTag().contains(":" + adjInflection.animTag)) {
                  continue;
                }
              }
              String newNounPosTag = nounToken.getPOSTag().replaceFirst(":.:v_...", genderTag + vidmTag);
              String[] synthesized = ukrainianSynthesizer.synthesize(nounToken, newNounPosTag, false);
              for (String s : synthesized) {
                String suggestion = adjAnalyzedTokenReadings.getToken() + " " + s;
                if (!suggestions.contains(suggestion)) {
                  suggestions.add(suggestion);
                }
              }
            }
          }
        }
        // Then: keep the noun as written and re-inflect the adjective to agree with it.
        for (Inflection nounInflection : slaveInflections) {
          String genderTag = ":" + nounInflection.gender + ":";
          String vidmTag = nounInflection._case;
          if (nounInflection.animMatters()) {
            vidmTag += ":r" + nounInflection.animTag;
          }
          for (AnalyzedToken adjToken : adjTokenReadings) {
            String newAdjTag = adjToken.getPOSTag().replaceFirst(":.:v_...(:r(in)?anim)?", genderTag + vidmTag);
            String[] synthesized = ukrainianSynthesizer.synthesize(adjToken, newAdjTag, false);
            for (String s : synthesized) {
              String suggestion = s + " " + tokenReadings.getToken();
              if (!suggestions.contains(suggestion)) {
                suggestions.add(suggestion);
              }
            }
          }
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      if (!suggestions.isEmpty()) {
        potentialRuleMatch.setSuggestedReplacements(suggestions);
      }
      ruleMatches.add(potentialRuleMatch);
    }
    // The pair has been handled; the next token starts a fresh candidate search.
    adjTokenReadings.clear();
  }
  return toRuleMatchArray(ruleMatches);
}
Aggregations