Search in sources :

Example 61 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class MissingVerbRule method match.

/**
 * Reports a sentence that appears to contain no verb.
 *
 * <p>Sentences rejected by {@code isRealSentence} or accepted by {@code isSpecialCase} are never
 * reported. A sentence counts as having a verb as soon as one token carries a POS tag containing
 * {@code "VER"}, is untagged and not capitalized, or is the token at index 1 and
 * {@code verbAtSentenceStart} accepts it.
 *
 * @param sentence the analyzed sentence to check
 * @return a single match spanning from offset 0 to the end of the last token, or an empty array
 * @throws IOException declared by the overridden method
 */
@Override
public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
    if (!isRealSentence(sentence) || isSpecialCase(sentence)) {
        return new RuleMatch[0];
    }
    AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    AnalyzedTokenReadings previous = null;
    for (int idx = 0; idx < tokens.length; idx++) {
        AnalyzedTokenReadings current = tokens[idx];
        // untagged, non-capitalized words are treated like verbs: ignoring unknown words avoids false alarms
        boolean countsAsVerb = current.hasPartialPosTag("VER")
                || !(current.isTagged() || StringTools.isCapitalizedWord(current.getToken()));
        if (countsAsVerb || (idx == 1 && verbAtSentenceStart(current))) {
            // a verb (or something we accept as one) was found, so there is nothing to report
            return new RuleMatch[0];
        }
        previous = current;
    }
    if (previous == null || tokens.length < MIN_TOKENS_FOR_ERROR) {
        return new RuleMatch[0];
    }
    int endPos = previous.getStartPos() + previous.getToken().length();
    RuleMatch ruleMatch = new RuleMatch(this, 0, endPos, "Dieser Satz scheint kein Verb zu enthalten");
    return new RuleMatch[] { ruleMatch };
}
Also used : AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 62 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class MissingVerbRule method isRealSentence.

/**
 * Tells whether the sentence ends with one of the punctuation marks {@code .}, {@code ?} or
 * {@code !}. Headlines should be ignored, and these usually don't end with such a mark.
 *
 * @param sentence the analyzed sentence to inspect
 * @return {@code true} if the last non-whitespace token is ".", "?" or "!"; {@code false}
 *         otherwise, including when there are no tokens
 */
private boolean isRealSentence(AnalyzedSentence sentence) {
    AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    if (tokens.length == 0) {
        return false;
    }
    String finalToken = tokens[tokens.length - 1].getToken();
    switch (finalToken) {
        case ".":
        case "?":
        case "!":
            return true;
        default:
            return false;
    }
}
Also used : AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 63 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class SubjectVerbAgreementRule method getSingularMatchOrNull.

/**
 * Builds a match suggesting the plural verb form when {@code tokenStr} is one of the known
 * singular forms and the token to its left passes a series of chunk and context checks.
 *
 * @param tokens   the sentence tokens
 * @param i        index of {@code token} in {@code tokens}; the token at {@code i - 1} is read
 * @param token    the token whose start and end positions delimit the match
 * @param tokenStr the text looked up in {@code singular} and passed to {@code getPluralFor}
 * @return the match, or {@code null} if any check fails
 * @throws IOException declared for the helper calls made here
 */
@Nullable
private RuleMatch getSingularMatchOrNull(AnalyzedTokenReadings[] tokens, int i, AnalyzedTokenReadings token, String tokenStr) throws IOException {
    if (!singular.contains(tokenStr)) {
        return null;
    }
    int leftIdx = i - 1;
    AnalyzedTokenReadings before = tokens[leftIdx];
    AnalyzedTokenReadings after = i + 1 < tokens.length ? tokens[i + 1] : null;
    List<ChunkTag> chunkTagsBefore = before.getChunkTags();
    if (!chunkTagsBefore.contains(NPP) || chunkTagsBefore.contains(PP)) {
        return null;
    }
    // 'um 18 Uhr ist Feierabend'
    if (before.getToken().equals("Uhr") || isCurrency(before)) {
        return null;
    }
    // 'zehn Jahre ist es her'
    if (after != null && after.getToken().equals("es")) {
        return null;
    }
    if (!prevChunkIsNominative(tokens, leftIdx)) {
        return null;
    }
    if (hasUnknownTokenToTheLeft(tokens, i) || hasQuestionPronounToTheLeft(tokens, leftIdx)) {
        return null;
    }
    // words to the left that rule out a match, checked in this order
    String[] blockingPatterns = { "wer", "(?i)alle[nr]?", "(?i)jede[rs]?", "(?i)manche[nrs]?" };
    for (String pattern : blockingPatterns) {
        if (containsRegexToTheLeft(pattern, tokens, leftIdx)) {
            return null;
        }
    }
    if (containsOnlyInfinitivesToTheLeft(tokens, leftIdx)) {
        return null;
    }
    String message = "Bitte prüfen, ob hier <suggestion>" + getPluralFor(tokenStr) + "</suggestion> stehen sollte.";
    return new RuleMatch(this, token.getStartPos(), token.getEndPos(), message);
}
Also used : ChunkTag(org.languagetool.chunking.ChunkTag) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) Nullable(org.jetbrains.annotations.Nullable)

Example 64 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class Main method appendTagsWithDisambigLog.

/**
 * Appends one HTML table row per token of {@code sentence} to {@code sb}.
 *
 * <p>The first cell holds the token text followed by its readings, chunk tags and a
 * <code>{!}</code> marker for immunized tokens; the second cell holds the token's historical
 * annotations. Whitespace tokens are skipped unless they mark the sentence start. Row background
 * colors alternate between two values.
 *
 * <p>Token text and reading strings are HTML-escaped so that input characters such as
 * {@code <} or {@code &} cannot corrupt the generated markup.
 *
 * @param sb       buffer the HTML is appended to
 * @param sentence sentence whose tokens are rendered
 * @param odd      alternation state to continue from; it is flipped before each emitted row
 * @return the alternation state after the last emitted row
 */
private boolean appendTagsWithDisambigLog(StringBuilder sb, AnalyzedSentence sentence, boolean odd) {
    for (AnalyzedTokenReadings t : sentence.getTokens()) {
        if (t.isWhitespace() && !t.isSentenceStart()) {
            continue;
        }
        odd = !odd;
        // both cells of a row share the same background color
        String rowColor = odd ? "#ffffff" : "#f1f1f1";
        sb.append("<tr>");
        sb.append("<td bgcolor=\"").append(rowColor).append("\">");
        if (!t.isWhitespace()) {
            // the token comes from the checked text, so it must be escaped before going into HTML
            sb.append(StringTools.escapeHTML(t.getToken()));
            sb.append("<font color='");
            sb.append(TAG_COLOR);
            sb.append("'>[");
        }
        Iterator<AnalyzedToken> iterator = t.iterator();
        while (iterator.hasNext()) {
            AnalyzedToken token = iterator.next();
            String posTag = token.getPOSTag();
            if (t.isSentenceStart()) {
                sb.append(StringTools.escapeHTML("<S>"));
            } else if (JLanguageTool.SENTENCE_END_TAGNAME.equals(posTag)) {
                sb.append(StringTools.escapeHTML("</S>"));
            } else if (JLanguageTool.PARAGRAPH_END_TAGNAME.equals(posTag)) {
                sb.append(StringTools.escapeHTML("<P/>"));
            } else {
                if (!t.isWhitespace()) {
                    // the reading's string form is derived from the input as well
                    sb.append(StringTools.escapeHTML(token.toString()));
                    if (iterator.hasNext()) {
                        sb.append(", ");
                    }
                }
            }
        }
        if (!t.isWhitespace()) {
            if (t.getChunkTags().size() > 0) {
                sb.append(',');
                sb.append(StringUtils.join(t.getChunkTags(), "|"));
            }
            if (t.isImmunized()) {
                sb.append("{!}");
            }
            sb.append("]</font>");
        } else {
            sb.append(' ');
        }
        sb.append("</td>");
        sb.append("<td bgcolor=\"").append(rowColor).append("\">");
        if (!"".equals(t.getHistoricalAnnotations())) {
            sb.append(StringTools.escapeHTML(t.getHistoricalAnnotations()).trim().replace("\n", "<br>"));
        }
        sb.append("</td>");
        sb.append("</tr>");
    }
    return odd;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 65 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class TokenInflectionAgreementRule method match.

/**
 * Looks for a token whose readings are all adjectives followed directly by a token whose
 * readings are nouns, and reports the pair when the adjective inflections and the noun
 * inflections have nothing in common.
 *
 * <p>The loop starts at index 1. {@code adjTokenReadings} holds the adjective readings collected
 * from the previous token; while it is empty the current token is examined as an adjective
 * candidate, otherwise the current token is examined as the noun that should agree with it.
 * For each reported pair, suggestions are synthesized in both directions (noun adapted to the
 * adjective, adjective adapted to the noun).
 *
 * @param text the analyzed sentence to check
 * @return the matches found, possibly none
 */
@Override
public final RuleMatch[] match(AnalyzedSentence text) {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
    List<AnalyzedToken> adjTokenReadings = new ArrayList<>();
    AnalyzedTokenReadings adjAnalyzedTokenReadings = null;
    for (int i = 1; i < tokens.length; i++) {
        AnalyzedTokenReadings tokenReadings = tokens[i];
        String posTag0 = tokenReadings.getAnalyzedToken(0).getPOSTag();
        if (posTag0 == null) {
            //          || posTag0.equals(JLanguageTool.SENTENCE_START_TAGNAME) ){
            adjTokenReadings.clear();
            continue;
        }
        // Phase 1: no adjective pending, so see whether this token can start an adjective+noun pair
        if (adjTokenReadings.isEmpty()) {
            // no need to start checking on last token or if no noun
            if (i == tokens.length - 1)
                continue;
            //TODO: nv still can be wrong if :np/:ns is present to it's not much gain for lots of work
            if (PosTagHelper.hasPosTagPart(tokens[i], ":nv") || //TODO: turn back on when we can handle pron
            PosTagHelper.hasPosTagPart(tokens[i], "&pron") || PosTagHelper.hasPosTagPart(tokens[i], "<"))
                continue;
            if (!PosTagHelper.hasPosTagPart(tokens[i + 1], "noun:") || PosTagHelper.hasPosTagPart(tokens[i + 1], ":nv") || PosTagHelper.hasPosTagPart(tokens[i + 1], "&pron") || PosTagHelper.hasPosTagPart(tokens[i + 1], "<"))
                continue;
            if (LemmaHelper.hasLemma(tokens[i], Arrays.asList("червоний", "правий", "місцевий", "найсильніший", "найкращі"), ":p:") || LemmaHelper.hasLemma(tokens[i], Arrays.asList("новенький", "головний", "вибраний", "більший", "побачений", "подібний"), ":n:") || LemmaHelper.hasLemma(tokens[i], Arrays.asList("державний"), ":f:")) {
                adjTokenReadings.clear();
                // NOTE(review): this break leaves the outer token loop, so the rest of the sentence
                // is not checked once one of these lemmas is seen - confirm that 'continue' was not intended
                break;
            }
            // collect the readings only if every tagged reading is an adjective
            for (AnalyzedToken token : tokenReadings) {
                String adjPosTag = token.getPOSTag();
                if (adjPosTag == null) {
                    // can happen for words with an accent mark or a soft hyphen
                    continue;
                }
                if (adjPosTag.startsWith("adj")) {
                    adjTokenReadings.add(token);
                    adjAnalyzedTokenReadings = tokenReadings;
                } else {
                    adjTokenReadings.clear();
                    break;
                }
            }
            continue;
        }
        // Phase 2: an adjective is pending, so collect the noun readings of the current token
        List<AnalyzedToken> slaveTokenReadings = new ArrayList<>();
        for (AnalyzedToken token : tokenReadings) {
            String nounPosTag = token.getPOSTag();
            if (nounPosTag == null) {
                // can happen for words with an accent mark or a soft hyphen
                continue;
            }
            if (nounPosTag.startsWith("noun") && !nounPosTag.contains(NO_VIDMINOK_SUBSTR)) {
                slaveTokenReadings.add(token);
            } else if (nounPosTag.equals(JLanguageTool.SENTENCE_END_TAGNAME) || nounPosTag.equals(JLanguageTool.PARAGRAPH_END_TAGNAME)) {
                continue;
            } else {
                // any other reading disqualifies the token as a noun candidate
                slaveTokenReadings.clear();
                break;
            }
        }
        if (slaveTokenReadings.isEmpty()) {
            adjTokenReadings.clear();
            continue;
        }
        if (DEBUG) {
            System.err.println(MessageFormat.format("=== Checking:\n\t{0}\n\t{1}", adjTokenReadings, slaveTokenReadings));
        }
        // perform the check
        List<InflectionHelper.Inflection> masterInflections = InflectionHelper.getAdjInflections(adjTokenReadings);
        List<InflectionHelper.Inflection> slaveInflections = InflectionHelper.getNounInflections(slaveTokenReadings);
        if (Collections.disjoint(masterInflections, slaveInflections)) {
            if (TokenInflectionExceptionHelper.isException(tokens, i, masterInflections, slaveInflections, adjTokenReadings, slaveTokenReadings)) {
                adjTokenReadings.clear();
                continue;
            }
            if (DEBUG) {
                // the pattern needs {1}; without it MessageFormat silently drops the noun-side argument
                System.err.println(MessageFormat.format("=== Found:\n\t{0}\n\t{1}", adjAnalyzedTokenReadings.getToken() + ": " + masterInflections + " // " + adjAnalyzedTokenReadings, slaveTokenReadings.get(0).getToken() + ": " + slaveInflections + " // " + slaveTokenReadings));
            }
            String msg = String.format("Потенційна помилка: прикметник не узгоджений з іменником: \"%s\": [%s] і \"%s\": [%s]", adjTokenReadings.get(0).getToken(), formatInflections(masterInflections, true), slaveTokenReadings.get(0).getToken(), formatInflections(slaveInflections, false));
            if (PosTagHelper.hasPosTagPart(adjTokenReadings, ":m:v_rod") && tokens[i].getToken().matches(".*[ую]") && PosTagHelper.hasPosTag(slaveTokenReadings, "noun.*:m:v_dav.*")) {
                msg += ". Можливо вжито невнормований родовий відмінок ч.р. з закінченням -у/-ю замість -а/-я (така тенденція є в сучасній мові)?";
            }
            RuleMatch potentialRuleMatch = new RuleMatch(this, adjAnalyzedTokenReadings.getStartPos(), tokenReadings.getEndPos(), msg, getShort());
            Synthesizer ukrainianSynthesizer = ukrainian.getSynthesizer();
            List<String> suggestions = new ArrayList<>();
            try {
                // suggestions that keep the adjective and re-inflect the noun
                for (Inflection adjInflection : masterInflections) {
                    String genderTag = ":" + adjInflection.gender + ":";
                    String vidmTag = adjInflection._case;
                    if (!adjInflection._case.equals("v_kly") && (adjInflection.gender.equals("p") || PosTagHelper.hasPosTagPart(slaveTokenReadings, genderTag))) {
                        for (AnalyzedToken nounToken : slaveTokenReadings) {
                            if (adjInflection.animMatters()) {
                                if (!nounToken.getPOSTag().contains(":" + adjInflection.animTag))
                                    continue;
                            }
                            String newNounPosTag = nounToken.getPOSTag().replaceFirst(":.:v_...", genderTag + vidmTag);
                            String[] synthesized = ukrainianSynthesizer.synthesize(nounToken, newNounPosTag, false);
                            for (String s : synthesized) {
                                String suggestion = adjAnalyzedTokenReadings.getToken() + " " + s;
                                if (!suggestions.contains(suggestion)) {
                                    suggestions.add(suggestion);
                                }
                            }
                        }
                    }
                }
                // suggestions that keep the noun and re-inflect the adjective
                for (Inflection nounInflection : slaveInflections) {
                    String genderTag = ":" + nounInflection.gender + ":";
                    String vidmTag = nounInflection._case;
                    if (nounInflection.animMatters()) {
                        vidmTag += ":r" + nounInflection.animTag;
                    }
                    for (AnalyzedToken adjToken : adjTokenReadings) {
                        String newAdjTag = adjToken.getPOSTag().replaceFirst(":.:v_...(:r(in)?anim)?", genderTag + vidmTag);
                        String[] synthesized = ukrainianSynthesizer.synthesize(adjToken, newAdjTag, false);
                        for (String s : synthesized) {
                            String suggestion = s + " " + tokenReadings.getToken();
                            if (!suggestions.contains(suggestion)) {
                                suggestions.add(suggestion);
                            }
                        }
                    }
                }
            } catch (IOException e) {
                // match() declares no checked exceptions, so rethrow unchecked with the cause preserved
                throw new RuntimeException(e);
            }
            if (suggestions.size() > 0) {
                potentialRuleMatch.setSuggestedReplacements(suggestions);
            }
            ruleMatches.add(potentialRuleMatch);
        }
        // the pair has been handled; the next token starts a fresh adjective search
        adjTokenReadings.clear();
    }
    return toRuleMatchArray(ruleMatches);
}
Also used : ArrayList(java.util.ArrayList) Inflection(org.languagetool.rules.uk.InflectionHelper.Inflection) IOException(java.io.IOException) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) RuleMatch(org.languagetool.rules.RuleMatch) AnalyzedToken(org.languagetool.AnalyzedToken) Synthesizer(org.languagetool.synthesis.Synthesizer)

Aggregations

AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)116 AnalyzedToken (org.languagetool.AnalyzedToken)48 ArrayList (java.util.ArrayList)47 AnalyzedSentence (org.languagetool.AnalyzedSentence)21 Test (org.junit.Test)16 RuleMatch (org.languagetool.rules.RuleMatch)14 Matcher (java.util.regex.Matcher)13 IOException (java.io.IOException)7 Nullable (org.jetbrains.annotations.Nullable)6 JLanguageTool (org.languagetool.JLanguageTool)6 Pattern (java.util.regex.Pattern)5 ChunkTag (org.languagetool.chunking.ChunkTag)5 English (org.languagetool.language.English)3 TaggedWord (org.languagetool.tagging.TaggedWord)3 InputStream (java.io.InputStream)2 HashMap (java.util.HashMap)2 List (java.util.List)2 Scanner (java.util.Scanner)2 TreeSet (java.util.TreeSet)2 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)2