Search in sources :

Example 61 with RuleMatch

use of org.languagetool.rules.RuleMatch in project languagetool by languagetool-org.

the class Example method main.

/**
 * Smoke test: runs a fixed sample sentence through a {@code JLanguageTool}
 * instance for every language returned by {@code Languages.get()} and prints
 * each resulting rule match to standard output.
 *
 * @param args command line arguments (not used)
 * @throws IOException if checking the sample text fails
 */
public static void main(String[] args) throws IOException {
    final String sampleText = "And the the";
    final List<Language> allLanguages = Languages.get();
    System.out.println("This example will test a short string with all languages known to LanguageTool.");
    System.out.println("It's just a test to make sure there's at least no crash.");
    System.out.println("Using LanguageTool " + JLanguageTool.VERSION + " (" + JLanguageTool.BUILD_DATE + ")");
    System.out.println("Supported languages: " + allLanguages.size());
    for (Language currentLanguage : allLanguages) {
        // a fresh checker per language; the check runs before the header line is printed
        final JLanguageTool checker = new JLanguageTool(currentLanguage);
        final List<RuleMatch> matches = checker.check(sampleText);
        System.out.println("Checking '" + sampleText + "' with " + currentLanguage + ":");
        for (RuleMatch match : matches) {
            System.out.println("    " + match);
        }
    }
}
Also used : RuleMatch(org.languagetool.rules.RuleMatch) Language(org.languagetool.Language) JLanguageTool(org.languagetool.JLanguageTool)

Example 62 with RuleMatch

use of org.languagetool.rules.RuleMatch in project languagetool by languagetool-org.

the class CommandLineTools method checkText.

/**
   * Runs the checker over the given text and writes the outcome to System.out,
   * either as XML, as JSON, or via {@code printMatches}.
   *
   * @param contents the text to check (may be more than one sentence)
   * @param lt initialized LanguageTool instance used for checking
   * @param isXmlFormat whether to print the result in XML format (takes precedence over JSON)
   * @param isJsonFormat whether to print the result in JSON format
   * @param contextSize error text context size; -1 selects {@code DEFAULT_CONTEXT_SIZE}
   * @param lineOffset offset added to the start and end line of every match
   * @param prevMatches number of previously matched rules, forwarded to {@code printMatches}
   * @param apiMode mode of xml/json printout for simple xml/json output
   * @param listUnknownWords in XML format with {@code NORMAL_API}, whether to take the
   *                         unknown words from {@code lt} instead of {@code unknownWords}
   * @param unknownWords unknown words handed to the XML serializer unless replaced as
   *                     described for {@code listUnknownWords}
   * @return number of rule matches found in the input text
   */
public static int checkText(String contents, JLanguageTool lt, boolean isXmlFormat, boolean isJsonFormat, int contextSize, int lineOffset, int prevMatches, StringTools.ApiPrintMode apiMode, boolean listUnknownWords, List<String> unknownWords) throws IOException {
    final int effectiveContextSize = (contextSize == -1) ? DEFAULT_CONTEXT_SIZE : contextSize;
    final long startTime = System.currentTimeMillis();
    final List<RuleMatch> ruleMatches = lt.check(contents);
    // shift the reported line numbers by the caller-supplied offset
    for (RuleMatch match : ruleMatches) {
        match.setLine(match.getLine() + lineOffset);
        match.setEndLine(match.getEndLine() + lineOffset);
    }
    final boolean normalApi = apiMode == StringTools.ApiPrintMode.NORMAL_API;
    if (!isXmlFormat && !isJsonFormat) {
        printMatches(ruleMatches, prevMatches, contents, effectiveContextSize);
    } else {
        final String serialized;
        if (isXmlFormat) {
            final List<String> wordsForXml = (listUnknownWords && normalApi) ? lt.getUnknownWords() : unknownWords;
            final RuleMatchAsXmlSerializer xmlSerializer = new RuleMatchAsXmlSerializer();
            serialized = xmlSerializer.ruleMatchesToXml(ruleMatches, contents, effectiveContextSize, apiMode, lt.getLanguage(), wordsForXml);
        } else {
            final RuleMatchesAsJsonSerializer jsonSerializer = new RuleMatchesAsJsonSerializer();
            serialized = jsonSerializer.ruleMatchesToJson(ruleMatches, contents, effectiveContextSize, lt.getLanguage());
        }
        // wrap System.out so the serialized result is written as UTF-8
        final PrintStream utf8Out = new PrintStream(System.out, true, "UTF-8");
        utf8Out.print(serialized);
    }
    // display stats if it's not in a buffered mode
    if (normalApi && !isJsonFormat) {
        final SentenceTokenizer tokenizer = lt.getLanguage().getSentenceTokenizer();
        final int sentenceCount = tokenizer.tokenize(contents).size();
        displayTimeStats(startTime, sentenceCount, isXmlFormat);
    }
    return ruleMatches.size();
}
Also used : RuleMatchesAsJsonSerializer(org.languagetool.tools.RuleMatchesAsJsonSerializer) PrintStream(java.io.PrintStream) RuleMatch(org.languagetool.rules.RuleMatch) SentenceTokenizer(org.languagetool.tokenizers.SentenceTokenizer) RuleMatchAsXmlSerializer(org.languagetool.tools.RuleMatchAsXmlSerializer)

Example 63 with RuleMatch

use of org.languagetool.rules.RuleMatch in project languagetool by languagetool-org.

the class CommandLineTools method correctBitext.

/**
   * Applies rule suggestions to each target sentence of a bilingual text and
   * prints the resulting line to System.out. Target sentences without any
   * match are printed unchanged.
   * Note: if a match offers more than one suggestion, the first one is
   * always applied and the others are silently ignored.
   *
   * @param reader a bitext file reader
   * @param sourceLt Initialized source JLanguageTool object
   * @param targetLt Initialized target JLanguageTool object
   * @param bRules  List of all BitextRules to use
   */
public static void correctBitext(BitextReader reader, JLanguageTool sourceLt, JLanguageTool targetLt, List<BitextRule> bRules) throws IOException {
    for (StringPair sentencePair : reader) {
        final List<RuleMatch> rawMatches = Tools.checkBitext(sentencePair.getSource(), sentencePair.getTarget(), sourceLt, targetLt, bRules);
        final List<RuleMatch> adjustedMatches = new ArrayList<>();
        for (RuleMatch rawMatch : rawMatches) {
            // don't need to adjust at all, we have zero offset related to trg sentence
            final RuleMatch adjusted = targetLt.adjustRuleMatchPos(rawMatch, 0, reader.getTargetColumnCount(), reader.getLineCount(), reader.getCurrentLine(), null);
            adjustedMatches.add(adjusted);
        }
        if (adjustedMatches.isEmpty()) {
            System.out.println(sentencePair.getTarget());
        } else {
            System.out.println(correctTextFromMatches(sentencePair.getTarget(), adjustedMatches));
        }
    }
}
Also used : RuleMatch(org.languagetool.rules.RuleMatch) StringPair(org.languagetool.bitext.StringPair) ArrayList(java.util.ArrayList)

Example 64 with RuleMatch

use of org.languagetool.rules.RuleMatch in project languagetool by languagetool-org.

the class JLanguageToolTest method testCleanOverlappingWithGerman.

/**
 * Checks a German sample sentence and expects exactly three matches, i.e.
 * the juxtaposed errors in "TRGS - Technische" must both be reported.
 */
@Test
public void testCleanOverlappingWithGerman() throws IOException {
    final String sampleText = "TRGS - Technische Regeln für Gefahrstoffe";
    final JLanguageTool germanTool = new JLanguageTool(new GermanyGerman());
    // Juxtaposed errors in "TRGS - Technische" should not be removed.
    final List<RuleMatch> foundMatches = germanTool.check(sampleText);
    final int matchCount = foundMatches.size();
    assertEquals(3, matchCount);
}
Also used : RuleMatch(org.languagetool.rules.RuleMatch) GermanyGerman(org.languagetool.language.GermanyGerman) Test(org.junit.Test)

Example 65 with RuleMatch

use of org.languagetool.rules.RuleMatch in project languagetool by languagetool-org.

the class AccentuationCheckRule method match.

/**
 * Looks for words that are looked up in {@code relevantWords} (candidate nouns) or
 * {@code relevantWords2} (candidate adjectives) and, when one of the context checks
 * below holds, reports a match suggesting the token stored for that word in the map.
 *
 * <p>Token 0 (SENT_START) is skipped. The first real token is lower-cased before the
 * lookup; all other tokens are looked up as written.
 *
 * @param sentence the analyzed sentence; its non-whitespace tokens are inspected
 * @return the collected matches, converted with {@code toRuleMatchArray}
 */
@Override
public RuleMatch[] match(final AnalyzedSentence sentence) {
    final List<RuleMatch> ruleMatches = new ArrayList<>();
    final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    for (int i = 1; i < tokens.length; i++) {
        // ignoring token 0, i.e. SENT_START
        final String token;
        if (i == 1) {
            // first word of the sentence: lower-case it so the map lookups below can hit
            token = tokens[i].getToken().toLowerCase();
        } else {
            token = tokens[i].getToken();
        }
        // Surrounding tokens; the empty string stands in for positions that are not read.
        final String prevToken = tokens[i - 1].getToken();
        String prevPrevToken = "";
        if (i > 2) {
            prevPrevToken = tokens[i - 2].getToken();
        }
        String nextToken = "";
        if (i < tokens.length - 1) {
            nextToken = tokens[i + 1].getToken();
        }
        String nextNextToken = "";
        if (i < tokens.length - 2) {
            nextNextToken = tokens[i + 2].getToken();
        }
        boolean isRelevantWord = false;
        boolean isRelevantWord2 = false;
        if (StringTools.isEmpty(token)) {
            continue;
        }
        if (relevantWords.containsKey(token)) {
            isRelevantWord = true;
        }
        if (relevantWords2.containsKey(token)) {
            isRelevantWord2 = true;
        }
        // nothing to do for words that are in neither map
        if (!isRelevantWord && !isRelevantWord2) {
            continue;
        }
        // verb preceded by a weak pronoun ("pronom feble"): skip, unless the previous
        // token starts with an apostrophe or a hyphen
        if (matchPostagRegexp(tokens[i - 1], PRONOM_FEBLE) && !prevToken.startsWith("'") && !prevToken.startsWith("-")) {
            continue;
        }
        String replacement = null;
        // Matchers over the neighbouring tokens, shared by several checks below.
        final Matcher mPreposicioDE = PREPOSICIO_DE.matcher(nextToken);
        final Matcher mExcepcionsDE = EXCEPCIONS_DARRERE_DE.matcher(nextNextToken);
        final Matcher mArticleELMS = ARTICLE_EL_MS.matcher(prevToken);
        final Matcher mArticleELFS = ARTICLE_EL_FS.matcher(prevToken);
        final Matcher mArticleELMP = ARTICLE_EL_MP.matcher(prevToken);
        final Matcher mArticleELFP = ARTICLE_EL_FP.matcher(prevToken);
        // VERB WITHOUT ACCENT -> NOUN WITH ACCENT
        // Every branch in this section assigns the same value: the token stored in relevantWords.
        if (isRelevantWord && !matchPostagRegexp(tokens[i], GN) && !matchPostagRegexp(tokens[i], LOCUCIONS)) {
            // example: "amb renuncies" (previous token tagged SPS00)
            if (tokens[i - 1].hasPosTag("SPS00") && !tokens[i - 1].hasPosTag("RG") && !matchPostagRegexp(tokens[i - 1], DETERMINANT) && !matchPostagRegexp(tokens[i], INFINITIU)) {
                replacement = relevantWords.get(token).getToken();
            } else if (i > 2 && tokens[i - 2].hasPosTag("SPS00") && !tokens[i - 2].hasPosTag("RG") && !matchPostagRegexp(tokens[i - 2], DETERMINANT) && (matchPostagRegexp(tokens[i - 1], DETERMINANT) || mArticleELMS.matches() || mArticleELFS.matches() || mArticleELMP.matches() || mArticleELFP.matches()) && !matchPostagRegexp(tokens[i], INFINITIU)) {
                replacement = relevantWords.get(token).getToken();
            } else // example: "aquestes renuncies" (previous token is a determiner agreeing with the noun reading; some words are excluded)
            if (((matchPostagRegexp(tokens[i - 1], DETERMINANT_MS) && matchPostagRegexp(relevantWords.get(token), NOM_MS) && !token.equals("cantar")) || (matchPostagRegexp(tokens[i - 1], DETERMINANT_MP) && matchPostagRegexp(relevantWords.get(token), NOM_MP)) || (matchPostagRegexp(tokens[i - 1], DETERMINANT_FS) && matchPostagRegexp(relevantWords.get(token), NOM_FS) && !token.equals("venia") && !token.equals("tenia") && !token.equals("continua") && !token.equals("genera") && !token.equals("faria")) || (matchPostagRegexp(tokens[i - 1], DETERMINANT_FP) && matchPostagRegexp(relevantWords.get(token), NOM_FP)))) {
                replacement = relevantWords.get(token).getToken();
            } else // example: "fumaré una faria" (correct: fària) - VERB_CONJUGAT two tokens back, then an agreeing determiner
            if (i > 2 && matchPostagRegexp(tokens[i - 2], VERB_CONJUGAT) && ((matchPostagRegexp(tokens[i - 1], DETERMINANT_MS) && matchPostagRegexp(relevantWords.get(token), NOM_MS)) || (matchPostagRegexp(tokens[i - 1], DETERMINANT_MP) && matchPostagRegexp(relevantWords.get(token), NOM_MP)) || (matchPostagRegexp(tokens[i - 1], DETERMINANT_FS) && matchPostagRegexp(relevantWords.get(token), NOM_FS)) || (matchPostagRegexp(tokens[i - 1], DETERMINANT_FP) && matchPostagRegexp(relevantWords.get(token), NOM_FP)))) {
                replacement = relevantWords.get(token).getToken();
            } else // example: "fem la copia" (correct: còpia) - VERB_CONJUGAT two tokens back, then an agreeing article
            if (i > 2 && matchPostagRegexp(tokens[i - 2], VERB_CONJUGAT) && ((mArticleELMS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MS)) || (mArticleELMP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MP)) || (mArticleELFS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FS)) || (mArticleELFP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FP)))) {
                replacement = relevantWords.get(token).getToken();
            } else // example: "circumstancies d'una altra classe" (next token must match PREPOSICIO_DE)
            // NOTE(review): tokens[i + 1] is read before the (i < tokens.length - 2) bounds check; this is only
            // safe if PREPOSICIO_DE never matches the empty nextToken used for the last token - confirm.
            if (!matchPostagRegexp(tokens[i], PARTICIPI_MS) && !token.equals("venia") && !token.equals("venies") && !token.equals("tenia") && !token.equals("tenies") && !token.equals("faria") && !token.equals("faries") && !token.equals("espero") && !token.equals("continua") && !token.equals("continues") && !token.equals("cantar") && !prevToken.equals("que") && !prevToken.equals("qui") && !prevToken.equals("què") && mPreposicioDE.matches() && !matchPostagRegexp(tokens[i - 1], NOT_IN_PREV_TOKEN) && !matchPostagRegexp(tokens[i + 1], LOCUCIONS) && (i < tokens.length - 2) && !matchPostagRegexp(tokens[i + 2], INFINITIU) && !mExcepcionsDE.matches() && !tokens[i - 1].hasPosTag("RG")) {
                replacement = relevantWords.get(token).getToken();
            } else // example: "la renuncia del president." (agreeing article before, PREPOSICIO_DE match after)
            if (!token.equals("venia") && !token.equals("venies") && !token.equals("tenia") && !token.equals("tenies") && !token.equals("faria") && !token.equals("faries") && !token.equals("continua") && !token.equals("continues") && !token.equals("cantar") && !token.equals("diferencia") && !token.equals("diferencies") && !token.equals("distancia") && !token.equals("distancies") && ((mArticleELMS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MS)) || (mArticleELFS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FS)) || (mArticleELMP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MP)) || (mArticleELFP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FP))) && mPreposicioDE.matches()) {
                replacement = relevantWords.get(token).getToken();
            } else // example: "circumstancies extraordinàries" (next token is an agreeing adjective)
            if (!token.equals("pronuncia") && !token.equals("espero") && !token.equals("pronuncies") && !token.equals("venia") && !token.equals("venies") && !token.equals("tenia") && !token.equals("tenies") && !token.equals("continua") && !token.equals("continues") && !token.equals("faria") && !token.equals("faries") && !token.equals("genera") && !token.equals("figuri") && (i < tokens.length - 1) && ((matchPostagRegexp(relevantWords.get(token), NOM_MS) && matchPostagRegexp(tokens[i + 1], ADJECTIU_MS)) || (matchPostagRegexp(relevantWords.get(token), NOM_FS) && matchPostagRegexp(tokens[i + 1], ADJECTIU_FS)) || (matchPostagRegexp(relevantWords.get(token), NOM_MP) && matchPostagRegexp(tokens[i + 1], ADJECTIU_MP)) || (matchPostagRegexp(relevantWords.get(token), NOM_FP) && matchPostagRegexp(tokens[i + 1], ADJECTIU_FP)))) {
                replacement = relevantWords.get(token).getToken();
            } else // example: "les seves contraries" (previous token is an agreeing adjective)
            if ((matchPostagRegexp(relevantWords.get(token), NOM_MS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MS) && !matchPostagRegexp(tokens[i], VERB_3S) && !matchPostagRegexp(tokens[i], GRUP_VERBAL)) || (matchPostagRegexp(relevantWords.get(token), NOM_FS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FS) && !matchPostagRegexp(tokens[i], VERB_3S)) || (matchPostagRegexp(relevantWords.get(token), NOM_MP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MP)) || (matchPostagRegexp(relevantWords.get(token), NOM_FP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FP))) {
                replacement = relevantWords.get(token).getToken();
            } else // example: "una nova formula que" (correct: fórmula) - determiner + adjective before, "que" after
            if (nextToken.equals("que") && i > 2 && ((matchPostagRegexp(relevantWords.get(token), NOM_MS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MS) && matchPostagRegexp(tokens[i - 2], DETERMINANT_MS)) || (matchPostagRegexp(relevantWords.get(token), NOM_FS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FS) && matchPostagRegexp(tokens[i - 2], DETERMINANT_FS)) || (matchPostagRegexp(relevantWords.get(token), NOM_MP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MP) && matchPostagRegexp(tokens[i - 2], DETERMINANT_MP)) || (matchPostagRegexp(relevantWords.get(token), NOM_FP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FP) && matchPostagRegexp(tokens[i - 2], DETERMINANT_FP)))) {
                replacement = relevantWords.get(token).getToken();
            } else // example: "les circumstancies que ens envolten" (agreeing article before, "que" after)
            if (nextToken.equals("que") && ((mArticleELMS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MS)) || (mArticleELFS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FS)) || (mArticleELMP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MP)) || (mArticleELFP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FP)))) {
                replacement = relevantWords.get(token).getToken();
            }
            // example: "de positiva influencia" (SPS00 two tokens back, agreeing adjective before)
            // NOTE(review): this is a standalone if, not part of the else-chain above, so it is
            // evaluated even when an earlier branch already set the replacement (to the same value).
            if (!token.equals("pronuncia") && !token.equals("espero") && !token.equals("pronuncies") && !token.equals("venia") && !token.equals("venies") && !token.equals("tenia") && !token.equals("tenies") && !token.equals("continua") && !token.equals("continues") && !token.equals("faria") && !token.equals("faries") && !token.equals("genera") && !token.equals("figuri") && i > 2 && tokens[i - 2].hasPosTag("SPS00") && !tokens[i - 2].hasPosTag("RG") && ((matchPostagRegexp(relevantWords.get(token), NOM_MS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MS)) || (matchPostagRegexp(relevantWords.get(token), NOM_FS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FS)) || (matchPostagRegexp(relevantWords.get(token), NOM_MP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MP)) || (matchPostagRegexp(relevantWords.get(token), NOM_FP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FP)))) {
                replacement = relevantWords.get(token).getToken();
            }
        }
        // VERB WITHOUT ACCENT -> ADJECTIVE WITH ACCENT
        // A hit in this section overwrites any replacement chosen in the noun section above.
        if (isRelevantWord2 && !matchPostagRegexp(tokens[i], GN) && !matchPostagRegexp(tokens[i], LOCUCIONS)) {
            // examples: "de manera obvia", "circumstàncies extraordinaries" (agreeing noun before, or "de manera"/"de forma" before)
            if ((matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MS) && matchPostagRegexp(tokens[i - 1], NOM_MS) && !tokens[i - 1].hasPosTag("_GN_FS") && matchPostagRegexp(tokens[i], VERB_CONJUGAT) && !matchPostagRegexp(tokens[i], VERB_3S)) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FS) && prevPrevToken.equalsIgnoreCase("de") && (prevToken.equals("manera") || prevToken.equals("forma"))) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MP) && matchPostagRegexp(tokens[i - 1], NOM_MP)) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FP) && matchPostagRegexp(tokens[i - 1], NOM_FP))) {
                replacement = relevantWords2.get(token).getToken();
            } else // example: "de continua disputa" (agreeing noun after, BEFORE_ADJECTIVE_* match before)
            if ((i < tokens.length - 1) && !prevToken.equals("que") && !matchPostagRegexp(tokens[i - 1], NOT_IN_PREV_TOKEN) && ((matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MS) && matchPostagRegexp(tokens[i + 1], NOM_MS) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_MS)) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FS) && matchPostagRegexp(tokens[i + 1], NOM_FS) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_FS)) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MP) && matchPostagRegexp(tokens[i + 1], NOM_MP) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_MP)) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FP) && matchPostagRegexp(tokens[i + 1], NOM_FP) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_FP)))) {
                replacement = relevantWords2.get(token).getToken();
            } else // example: "la magnifica conservació" (agreeing article before, agreeing noun after)
            if ((i < tokens.length - 1) && ((matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MS) && matchPostagRegexp(tokens[i + 1], NOM_MS) && mArticleELMS.matches()) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FS) && matchPostagRegexp(tokens[i + 1], NOM_FS) && mArticleELFS.matches()) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MP) && matchPostagRegexp(tokens[i + 1], NOM_MP) && mArticleELMP.matches()) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FP) && matchPostagRegexp(tokens[i + 1], NOM_FP) && mArticleELFP.matches()))) {
                replacement = relevantWords2.get(token).getToken();
            }
        }
        if (replacement != null) {
            // User-facing Catalan texts, roughly: "If it is a noun or an adjective, it must
            // carry an accent." and "An accent is missing".
            final String msg = "Si és un nom o un adjectiu, ha de portar accent.";
            final RuleMatch ruleMatch = new RuleMatch(this, tokens[i].getStartPos(), tokens[i].getEndPos(), msg, "Falta un accent");
            ruleMatch.setSuggestedReplacement(replacement);
            ruleMatches.add(ruleMatch);
        }
    }
    return toRuleMatchArray(ruleMatches);
}
Also used : RuleMatch(org.languagetool.rules.RuleMatch) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Aggregations

RuleMatch (org.languagetool.rules.RuleMatch)144 Test (org.junit.Test)64 JLanguageTool (org.languagetool.JLanguageTool)54 ArrayList (java.util.ArrayList)30 AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)14 Rule (org.languagetool.rules.Rule)14 Language (org.languagetool.Language)10 PatternRule (org.languagetool.rules.patterns.PatternRule)10 AnalyzedSentence (org.languagetool.AnalyzedSentence)8 Ukrainian (org.languagetool.language.Ukrainian)8 AbstractPatternRule (org.languagetool.rules.patterns.AbstractPatternRule)8 Matcher (java.util.regex.Matcher)7 English (org.languagetool.language.English)7 IOException (java.io.IOException)6 Catalan (org.languagetool.language.Catalan)6 Polish (org.languagetool.language.Polish)6 GermanyGerman (org.languagetool.language.GermanyGerman)5 AnnotatedText (org.languagetool.markup.AnnotatedText)5 PatternToken (org.languagetool.rules.patterns.PatternToken)5 AnalyzedToken (org.languagetool.AnalyzedToken)4