Search in sources :

Example 76 with RuleMatch

use of org.languagetool.rules.RuleMatch in project languagetool by languagetool-org.

the class CommandLineTools method checkBitext.

/**
 * Checks the bilingual input (bitext) and displays the output (considering the target
 * language) in API format or in the simple text format.
 *
 * NOTE: the positions returned by the rule matches are adjusted
 * according to the data returned by the reader.
 *
 * @param reader   Reader of bitext strings.
 * @param srcLt Source JLanguageTool (used to analyze the text).
 * @param trgLt Target JLanguageTool (used to analyze the text).
 * @param bRules  Bilingual rules used in addition to target standard rules.
 * @param isXmlFormat if {@code true}, matches are printed as XML snippets wrapped in the
 *                    serializer's XML start/end; otherwise they are printed via
 *                    {@code printMatches}.
 * @return The number of rules matched on the bitext.
 * @throws IOException if the UTF-8 output stream cannot be created, or if one is
 *                     propagated from the underlying check calls.
 * @since 1.0.1
 */
public static int checkBitext(BitextReader reader, JLanguageTool srcLt, JLanguageTool trgLt, List<BitextRule> bRules, boolean isXmlFormat) throws IOException {
    long startTime = System.currentTimeMillis();
    int contextSize = DEFAULT_CONTEXT_SIZE;
    // Only the total is needed for the return value, so keep a counter instead of
    // retaining every match of the whole input in memory.
    int matchCount = 0;
    int sentCount = 0;
    RuleMatchAsXmlSerializer serializer = new RuleMatchAsXmlSerializer();
    // Wraps System.out, so it is deliberately not closed here.
    PrintStream out = new PrintStream(System.out, true, "UTF-8");
    if (isXmlFormat) {
        out.print(serializer.getXmlStart(null, null));
    }
    for (StringPair srcAndTrg : reader) {
        List<RuleMatch> curMatches = Tools.checkBitext(srcAndTrg.getSource(), srcAndTrg.getTarget(), srcLt, trgLt, bRules);
        List<RuleMatch> fixedMatches = new ArrayList<>(curMatches.size());
        for (RuleMatch thisMatch : curMatches) {
            // Shift the match position using the reader's current position data.
            fixedMatches.add(trgLt.adjustRuleMatchPos(thisMatch, reader.getSentencePosition(), reader.getColumnCount(), reader.getLineCount(), reader.getCurrentLine(), null));
        }
        if (!fixedMatches.isEmpty()) {
            if (isXmlFormat) {
                String xml = serializer.ruleMatchesToXmlSnippet(fixedMatches, reader.getCurrentLine(), contextSize);
                out.print(xml);
            } else {
                // matchCount is passed before being increased: it is the number of
                // matches already printed for earlier sentences.
                printMatches(fixedMatches, matchCount, reader.getCurrentLine(), contextSize);
            }
            matchCount += fixedMatches.size();
        }
        sentCount++;
    }
    displayTimeStats(startTime, sentCount, isXmlFormat);
    if (isXmlFormat) {
        out.print(serializer.getXmlEnd());
    }
    return matchCount;
}
Also used : PrintStream(java.io.PrintStream) RuleMatch(org.languagetool.rules.RuleMatch) StringPair(org.languagetool.bitext.StringPair) ArrayList(java.util.ArrayList) RuleMatchAsXmlSerializer(org.languagetool.tools.RuleMatchAsXmlSerializer)

Example 77 with RuleMatch

use of org.languagetool.rules.RuleMatch in project languagetool by languagetool-org.

the class CommandLineTools method correctTextFromMatches.

/**
 * Applies the first suggested replacement of each match to the given text.
 *
 * Matches without suggestions are ignored. A replacement is only applied if the
 * text at the (shifted) match position still equals the text that was originally
 * there, i.e. the error has not already been changed by an earlier replacement.
 *
 * @param contents the text to correct
 * @param matches  the rule matches whose positions refer to {@code contents}
 * @return the text with the applicable first suggestions applied
 */
private static String correctTextFromMatches(String contents, List<RuleMatch> matches) {
    StringBuilder corrected = new StringBuilder(contents);
    // Snapshot the erroneous text of every correctable match before anything is modified.
    List<String> originalErrors = new ArrayList<>();
    for (RuleMatch match : matches) {
        List<String> suggestions = match.getSuggestedReplacements();
        if (suggestions.isEmpty()) {
            continue;
        }
        originalErrors.add(corrected.substring(match.getFromPos(), match.getToPos()));
    }
    // Number of characters the text has shrunk so far (negative if it has grown).
    int shift = 0;
    int errorIndex = 0;
    for (RuleMatch match : matches) {
        List<String> suggestions = match.getSuggestedReplacements();
        if (suggestions.isEmpty()) {
            continue;
        }
        String expected = originalErrors.get(errorIndex);
        int start = match.getFromPos() - shift;
        int end = match.getToPos() - shift;
        // Make sure the error hasn't been already corrected.
        if (expected.equals(corrected.substring(start, end))) {
            String firstSuggestion = suggestions.get(0);
            corrected.replace(start, end, firstSuggestion);
            shift += (match.getToPos() - match.getFromPos()) - firstSuggestion.length();
        }
        errorIndex++;
    }
    return corrected.toString();
}
Also used : RuleMatch(org.languagetool.rules.RuleMatch) ArrayList(java.util.ArrayList)

Example 78 with RuleMatch

use of org.languagetool.rules.RuleMatch in project languagetool by languagetool-org.

the class DatabaseHandler method handleResult.

/**
 * Adds one row per rule match to the insert statement's batch and executes the
 * batch once {@code batchSize} rows have been collected. Matches whose context is
 * longer than {@code MAX_CONTEXT_LENGTH} are skipped.
 *
 * Limit exceptions are rethrown unchanged; any other exception is wrapped in a
 * {@link RuntimeException} that names the sentence title.
 */
@Override
protected void handleResult(Sentence sentence, List<RuleMatch> ruleMatches, Language language) {
    try {
        java.sql.Date today = new java.sql.Date(new Date().getTime());
        for (RuleMatch ruleMatch : ruleMatches) {
            String shortContext = smallContextTools.getContext(ruleMatch.getFromPos(), ruleMatch.getToPos(), sentence.getText());
            insertSt.setString(1, language.getShortCode());
            Rule matchedRule = ruleMatch.getRule();
            insertSt.setString(2, matchedRule.getId());
            insertSt.setString(3, matchedRule.getCategory().getName());
            // Only pattern rules carry a sub id; store NULL for everything else.
            if (!(matchedRule instanceof AbstractPatternRule)) {
                insertSt.setNull(4, Types.VARCHAR);
            } else {
                insertSt.setString(4, ((AbstractPatternRule) matchedRule).getSubId());
            }
            insertSt.setString(5, matchedRule.getDescription());
            insertSt.setString(6, StringUtils.abbreviate(ruleMatch.getMessage(), 255));
            String fullContext = contextTools.getContext(ruleMatch.getFromPos(), ruleMatch.getToPos(), sentence.getText());
            if (fullContext.length() > MAX_CONTEXT_LENGTH) {
                // let's skip these strange cases, as shortening the text might leave us behind with invalid markup etc
                continue;
            }
            insertSt.setString(7, fullContext);
            insertSt.setString(8, StringUtils.abbreviate(shortContext, 255));
            // should actually be the dump's date, but isn't really used anyway...
            insertSt.setDate(9, today);
            insertSt.setDate(10, today);
            insertSt.setString(11, sentence.getUrl());
            insertSt.setString(12, sentence.getSource());
            insertSt.addBatch();
            batchCount++;
            if (batchCount >= batchSize) {
                executeBatch();
                batchCount = 0;
            }
            errorCount++;
            checkMaxErrors(errorCount);
            // Progress output for every 100th stored error.
            if (errorCount % 100 == 0) {
                System.out.println("Storing error #" + errorCount + " for text:");
                System.out.println("  " + sentence.getText());
            }
        }
        sentenceCount++;
        checkMaxSentences(sentenceCount);
    } catch (DocumentLimitReachedException | ErrorLimitReachedException e) {
        // Limit signals must reach the caller unchanged.
        throw e;
    } catch (Exception e) {
        throw new RuntimeException("Error storing matches for '" + sentence.getTitle() + "'", e);
    }
}
Also used : Date(java.util.Date) IOException(java.io.IOException) java.sql(java.sql) RuleMatch(org.languagetool.rules.RuleMatch) Rule(org.languagetool.rules.Rule) AbstractPatternRule(org.languagetool.rules.patterns.AbstractPatternRule)

Example 79 with RuleMatch

use of org.languagetool.rules.RuleMatch in project languagetool by languagetool-org.

the class SentenceSourceChecker method run.

/**
 * Checks all sentences from the given files with a language tool configured for
 * {@code langCode} and passes each sentence's matches to a result handler.
 *
 * @param propFile              if non-null, results go to a {@code DatabaseHandler} created
 *                              from this file; otherwise to a {@code StdoutHandler}
 * @param disabledRules         rules to deactivate; only applied when {@code ruleIds} is null
 * @param langCode              short code of the language to check
 * @param fileNames             the input files providing the sentences
 * @param ruleIds               if non-null, only these rules are enabled
 * @param additionalCategoryIds categories to activate in addition
 * @param maxSentences          sentence limit, a value &lt;= 0 is reported as "no limit"
 * @param maxErrors             error limit, a value &lt;= 0 is reported as "no limit"
 * @param languageModelDir      if non-null, language model rules are activated from here
 * @param filter                if non-null, passed to the sentence source to restrict
 *                              which sentences are checked
 * @throws IOException declared by the method; presumably raised by the handler or
 *                     sentence source setup (not visible here)
 */
private void run(File propFile, Set<String> disabledRules, String langCode, List<String> fileNames, String[] ruleIds, String[] additionalCategoryIds, int maxSentences, int maxErrors, File languageModelDir, Pattern filter) throws IOException {
    Language lang = Languages.getLanguageForShortCode(langCode);
    MultiThreadedJLanguageTool languageTool = new MultiThreadedJLanguageTool(lang);
    languageTool.setCleanOverlappingMatches(false);
    if (languageModelDir != null) {
        languageTool.activateLanguageModelRules(languageModelDir);
    }
    if (ruleIds != null) {
        enableOnlySpecifiedRules(ruleIds, languageTool);
    } else {
        applyRuleDeactivation(languageTool, disabledRules);
    }
    if (filter != null) {
        System.out.println("*** NOTE: only sentences that match regular expression '" + filter + "' will be checked");
    }
    activateAdditionalCategories(additionalCategoryIds, languageTool);
    disableSpellingRules(languageTool);
    System.out.println("Working on: " + StringUtils.join(fileNames, ", "));
    System.out.println("Sentence limit: " + (maxSentences > 0 ? maxSentences : "no limit"));
    System.out.println("Error limit: " + (maxErrors > 0 ? maxErrors : "no limit"));
    ResultHandler resultHandler = null;
    int ruleMatchCount = 0;
    int sentenceCount = 0;
    try {
        if (propFile != null) {
            resultHandler = new DatabaseHandler(propFile, maxSentences, maxErrors);
        } else {
            resultHandler = new StdoutHandler(maxSentences, maxErrors);
        }
        MixingSentenceSource mixingSource = MixingSentenceSource.create(fileNames, lang, filter);
        while (mixingSource.hasNext()) {
            Sentence sentence = mixingSource.next();
            try {
                List<RuleMatch> matches = languageTool.check(sentence.getText());
                resultHandler.handleResult(sentence, matches, lang);
                sentenceCount++;
                if (sentenceCount % 5000 == 0) {
                    System.err.printf("%s sentences checked...\n", NumberFormat.getNumberInstance(Locale.US).format(sentenceCount));
                }
                ruleMatchCount += matches.size();
            } catch (DocumentLimitReachedException | ErrorLimitReachedException e) {
                // Limit signals end the run; handled by the outer catch.
                throw e;
            } catch (Exception e) {
                throw new RuntimeException("Check failed on sentence: " + StringUtils.abbreviate(sentence.getText(), 250), e);
            }
        }
    } catch (DocumentLimitReachedException | ErrorLimitReachedException e) {
        // Reaching a configured limit is a regular way to stop, so just report it.
        System.out.println(getClass().getSimpleName() + ": " + e);
    } finally {
        languageTool.shutdown();
        if (resultHandler != null) {
            // Avoid 0/0 (NaN) when no sentence was checked at all.
            float matchesPerSentence = sentenceCount > 0 ? (float) ruleMatchCount / sentenceCount : 0f;
            // The language is passed as an argument rather than concatenated into the
            // format string, so a '%' in its string form cannot break the formatting.
            System.out.printf("%s: %d total matches\n", lang, ruleMatchCount);
            // \u00f8 is the average sign; the escape keeps it independent of the source file encoding.
            System.out.printf("%s: \u00f8%.2f rule matches per sentence\n", lang, matchesPerSentence);
            try {
                resultHandler.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
Also used : MultiThreadedJLanguageTool(org.languagetool.MultiThreadedJLanguageTool) IOException(java.io.IOException) RuleMatch(org.languagetool.rules.RuleMatch) Language(org.languagetool.Language)

Example 80 with RuleMatch

use of org.languagetool.rules.RuleMatch in project languagetool by languagetool-org.

the class StdoutHandler method handleResult.

/**
 * Prints the matches of a sentence to standard output: the sentence title, then for
 * each match its number, line, column, rule id (with sub id for pattern rules),
 * message, suggestions (if any) and plain text context. Afterwards the error and
 * sentence limits are checked.
 */
@Override
protected void handleResult(Sentence sentence, List<RuleMatch> ruleMatches, Language language) {
    if (!ruleMatches.isEmpty()) {
        System.out.println("\nTitle: " + sentence.getTitle());
        int number = 1;
        for (RuleMatch ruleMatch : ruleMatches) {
            StringBuilder header = new StringBuilder();
            header.append(number).append(".) Line ").append(ruleMatch.getLine() + 1);
            header.append(", column ").append(ruleMatch.getColumn());
            header.append(", Rule ID: ").append(ruleMatch.getRule().getId());
            if (ruleMatch.getRule() instanceof AbstractPatternRule) {
                String subId = ((AbstractPatternRule) ruleMatch.getRule()).getSubId();
                header.append("[").append(subId).append("]");
            }
            System.out.println(header.toString());
            // Show the suggestion markup as plain quotes.
            String message = ruleMatch.getMessage()
                    .replace("<suggestion>", "'")
                    .replace("</suggestion>", "'");
            System.out.println("Message: " + message);
            List<String> suggestions = ruleMatch.getSuggestedReplacements();
            if (!suggestions.isEmpty()) {
                System.out.println("Suggestion: " + String.join("; ", suggestions));
            }
            String plainContext = contextTools.getPlainTextContext(ruleMatch.getFromPos(), ruleMatch.getToPos(), sentence.getText());
            System.out.println(plainContext);
            number++;
            errorCount++;
            checkMaxErrors(errorCount);
        }
    }
    sentenceCount++;
    checkMaxSentences(sentenceCount);
}
Also used : RuleMatch(org.languagetool.rules.RuleMatch) AbstractPatternRule(org.languagetool.rules.patterns.AbstractPatternRule)

Aggregations

RuleMatch (org.languagetool.rules.RuleMatch)144 Test (org.junit.Test)64 JLanguageTool (org.languagetool.JLanguageTool)54 ArrayList (java.util.ArrayList)30 AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)14 Rule (org.languagetool.rules.Rule)14 Language (org.languagetool.Language)10 PatternRule (org.languagetool.rules.patterns.PatternRule)10 AnalyzedSentence (org.languagetool.AnalyzedSentence)8 Ukrainian (org.languagetool.language.Ukrainian)8 AbstractPatternRule (org.languagetool.rules.patterns.AbstractPatternRule)8 Matcher (java.util.regex.Matcher)7 English (org.languagetool.language.English)7 IOException (java.io.IOException)6 Catalan (org.languagetool.language.Catalan)6 Polish (org.languagetool.language.Polish)6 GermanyGerman (org.languagetool.language.GermanyGerman)5 AnnotatedText (org.languagetool.markup.AnnotatedText)5 PatternToken (org.languagetool.rules.patterns.PatternToken)5 AnalyzedToken (org.languagetool.AnalyzedToken)4