Search in sources :

Example 36 with Rule

use of org.languagetool.rules.Rule in project languagetool by languagetool-org.

the class CommandLineTools method profileRulesOnText.

/**
 * Simple rule profiler - used to run LT on a corpus to see which
 * rule takes most time. Prints results to System.out.
 *
 * <p>Every active rule that is not a {@code TextLevelRule} is run several times
 * over all sentences of the text. For each rule one tab-separated line is printed:
 * rule id, the time in milliseconds as returned by {@code median(workTime)},
 * the number of sentences, the number of matches found in one pass over the
 * text, and the resulting sentences per second.
 *
 * @param contents text to check
 * @param lt instance of LanguageTool
 * @throws IOException propagated from the LanguageTool calls used to analyze and check the sentences
 */
public static void profileRulesOnText(String contents, JLanguageTool lt) throws IOException {
    // one slot per timing repetition
    long[] workTime = new long[10];
    List<Rule> rules = lt.getAllActiveRules();
    int ruleCount = rules.size();
    System.out.printf("Testing %d rules%n", ruleCount);
    System.out.println("Rule ID\tTime\tSentences\tMatches\tSentences per sec.");
    List<String> sentences = lt.sentenceTokenize(contents);
    for (Rule rule : rules) {
        if (rule instanceof TextLevelRule) {
            // profile rules for sentences only
            continue;
        }
        int matchCount = 0;
        for (int k = 0; k < workTime.length; k++) {
            int matchesInRun = 0;
            long startTime = System.currentTimeMillis();
            for (String sentence : sentences) {
                // note: the sentence analysis is part of the timed section
                matchesInRun += rule.match(lt.getAnalyzedSentence(sentence)).length;
            }
            long endTime = System.currentTimeMillis();
            workTime[k] = endTime - startTime;
            // every repetition checks the same sentences, so report the matches of a single
            // pass instead of the sum over all repetitions
            matchCount = matchesInRun;
        }
        long time = median(workTime);
        float timeInSeconds = time / 1000.0f;
        // float division: a measured time of 0 ms yields Infinity (or NaN without sentences), not an exception
        float sentencesPerSecond = sentences.size() / timeInSeconds;
        System.out.printf(Locale.ENGLISH, "%s\t%d\t%d\t%d\t%.1f", rule.getId(), time, sentences.size(), matchCount, sentencesPerSecond);
        System.out.println();
    }
}
Also used : BitextRule(org.languagetool.rules.bitext.BitextRule) TextLevelRule(org.languagetool.rules.TextLevelRule) Rule(org.languagetool.rules.Rule) AbstractPatternRule(org.languagetool.rules.patterns.AbstractPatternRule)

Example 37 with Rule

use of org.languagetool.rules.Rule in project languagetool by languagetool-org.

the class Main method runOnFileLineByLine.

/**
 * Reads the input line by line, collects the lines in a buffer and hands the
 * buffer to {@code handleLine} whenever {@code isBreakPoint(line)} is true for
 * the line just read, and once more at the end of the input.
 *
 * <p>Without profiling the input is read once. With {@code profileRules} set,
 * the input is read once per active rule, with {@code currentRule} set to the
 * rule of the current pass.
 *
 * @param filename input to read; passed to {@code isStdIn} and {@code getInputStreamReader}
 * @param encoding passed to {@code getInputStreamReader}
 * @throws IOException if reading the input fails
 * @throws IllegalArgumentException if profiling is requested while the input is STDIN
 */
private void runOnFileLineByLine(String filename, String encoding) throws IOException {
    System.err.println("Warning: running in line by line mode. Cross-paragraph checks will not work.\n");
    if (options.isVerbose()) {
        lt.setOutput(System.err);
    }
    // the progress message is suppressed for XML output and when suggestions are applied
    if (!options.isXmlFormat() && !options.isApplySuggestions()) {
        if (isStdIn(filename)) {
            System.err.println("Working on STDIN...");
        } else {
            System.err.println("Working on " + filename + "...");
        }
    }
    if (profileRules && isStdIn(filename)) {
        throw new IllegalArgumentException("Profiling mode cannot be used with input from STDIN");
    }
    // a single pass by default; one pass per active rule when profiling
    int runCount = 1;
    List<Rule> rules = lt.getAllActiveRules();
    if (profileRules) {
        System.out.printf("Testing %d rules\n", rules.size());
        System.out.println("Rule ID\tTime\tSentences\tMatches\tSentences per sec.");
        runCount = rules.size();
    }
    // lineOffset: value of tmpLineOffset at the last break point; tmpLineOffset: lines read so far.
    // NOTE(review): both counters are declared outside the pass loop and are not reset between passes.
    int lineOffset = 0;
    int tmpLineOffset = 0;
    handleLine(ApiPrintMode.START_API, 0, new StringBuilder());
    StringBuilder sb = new StringBuilder();
    // the isEmpty() check keeps rules.get(0) from failing when there are no active rules
    for (int ruleIndex = 0; !rules.isEmpty() && ruleIndex < runCount; ruleIndex++) {
        currentRule = rules.get(ruleIndex);
        try (InputStreamReader isr = getInputStreamReader(filename, encoding);
            BufferedReader br = new BufferedReader(isr)) {
            String line;
            int lineCount = 0;
            while ((line = br.readLine()) != null) {
                sb.append(line);
                lineCount++;
                // to detect language from the first input line
                if (lineCount == 1 && options.isAutoDetect()) {
                    Language language = detectLanguageOfString(line);
                    if (language == null) {
                        System.err.println("Could not detect language well enough, using American English");
                        language = new AmericanEnglish();
                    }
                    System.err.println("Language used is: " + language.getName());
                    language.getSentenceTokenizer().setSingleLineBreaksMarksParagraph(options.isSingleLineBreakMarksParagraph());
                    changeLanguage(language, options.getMotherTongue(), options.getDisabledRules(), options.getEnabledRules());
                }
                // readLine() strips the line terminator, so it is added back to the buffer
                sb.append('\n');
                tmpLineOffset++;
                if (isBreakPoint(line)) {
                    handleLine(ApiPrintMode.CONTINUE_API, lineOffset, sb);
                    if (profileRules) {
                        // NOTE(review): the sentence count is computed but its value is discarded - confirm whether it was meant to be accumulated
                        lt.sentenceTokenize(sb.toString()).size();
                    }
                    // start a fresh buffer for the next chunk
                    sb = new StringBuilder();
                    lineOffset = tmpLineOffset;
                }
            }
        } finally {
            // runs at the end of every pass, also when an exception is propagating
            if (sb.length() > 0) {
                if (profileRules) {
                    // NOTE(review): result discarded here as well
                    lt.sentenceTokenize(sb.toString()).size();
                }
            }
            // NOTE(review): sb is not reset after this call, so text left in the buffer is still present when the next pass starts - confirm this is intended
            handleLine(ApiPrintMode.END_API, tmpLineOffset - 1, sb);
        }
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) Language(org.languagetool.Language) AmericanEnglish(org.languagetool.language.AmericanEnglish) BufferedReader(java.io.BufferedReader) BitextRule(org.languagetool.rules.bitext.BitextRule) Rule(org.languagetool.rules.Rule) AbstractPatternRule(org.languagetool.rules.patterns.AbstractPatternRule) StringTools.readerToString(org.languagetool.tools.StringTools.readerToString)

Example 38 with Rule

use of org.languagetool.rules.Rule in project languagetool by languagetool-org.

the class Main method main.

/**
 * Entry point of the command line tool that checks plain text files.
 *
 * <p>Parses the arguments, handles the informational options (usage, version,
 * language list), configures a {@code Main} instance from the remaining options
 * and then runs it in recursive, line-by-line or whole-file mode.
 *
 * @param args command line arguments, handed to {@code CommandLineParser.parseOptions}
 * @throws RuntimeException if no rule is active after configuration
 * @throws IllegalArgumentException if bitext mode is requested without a mother tongue
 */
public static void main(String[] args) throws IOException, ParserConfigurationException, SAXException {
    JnaTools.setBugWorkaroundProperty();
    CommandLineParser parser = new CommandLineParser();
    CommandLineOptions options = null;
    try {
        options = parser.parseOptions(args);
    } catch (WrongParameterNumberException e) {
        parser.printUsage();
        System.exit(1);
    } catch (IllegalArgumentException e) {
        System.err.println(e.toString());
        System.exit(1);
    } catch (UnknownParameterException e) {
        // prefer the plain message, fall back to the exception's string form
        String message = e.getMessage();
        System.err.println(message != null ? message : e.toString());
        parser.printUsage(System.err);
        System.exit(1);
    }

    // informational options terminate the program right away
    if (options.isPrintUsage()) {
        parser.printUsage();
        System.exit(1);
    }
    if (options.isPrintVersion()) {
        System.out.println("LanguageTool version " + JLanguageTool.VERSION + " (" + JLanguageTool.BUILD_DATE + ")");
        System.exit(0);
    }
    if (options.isPrintLanguages()) {
        printLanguages();
        System.exit(0);
    }

    // "-" is the file name used when none was given
    if (options.getFilename() == null) {
        options.setFilename("-");
    }

    String languageHint = null;
    if (options.getLanguage() != null) {
        if (!(options.isXmlFormat() || options.isApplySuggestions())) {
            languageHint = "Expected text language: " + options.getLanguage().getName();
        }
    } else {
        if (!(options.isXmlFormat() || options.isAutoDetect())) {
            System.err.println("No language specified, using English (no spell checking active, " + "specify a language variant like 'en-GB' if available)");
        }
        options.setLanguage(new English());
    }
    options.getLanguage().getSentenceTokenizer().setSingleLineBreaksMarksParagraph(options.isSingleLineBreakMarksParagraph());

    Main app = new Main(options);
    if (options.getFalseFriendFile() != null) {
        for (AbstractPatternRule falseFriendRule : app.lt.loadFalseFriendRules(options.getFalseFriendFile())) {
            app.lt.addRule(falseFriendRule);
        }
    }
    if (app.lt.getAllActiveRules().isEmpty()) {
        List<String> categoryIds = options.getEnabledCategories().stream().map(Object::toString).collect(Collectors.toList());
        throw new RuntimeException("No rules are active. Please make sure your rule ids " + "(" + options.getEnabledRules() + ") and " + "category ids (" + categoryIds + ") are correct");
    }
    if (languageHint != null) {
        String spellHint = "";
        if (!app.isSpellCheckingActive()) {
            spellHint = " (no spell checking active, specify a language variant like 'en-GB' if available)";
        }
        System.err.println(languageHint + spellHint);
    }

    app.setListUnknownWords(options.isListUnknown());
    if (options.isProfile()) {
        app.setProfilingMode();
    }
    if (options.isBitext()) {
        if (options.getMotherTongue() == null) {
            throw new IllegalArgumentException("You have to set the source language (as mother tongue) in bitext mode");
        }
        File bitextRuleFile = null;
        if (options.getBitextRuleFile() != null) {
            bitextRuleFile = new File(options.getBitextRuleFile());
        }
        app.setBitextMode(options.getMotherTongue(), options.getDisabledRules(), options.getEnabledRules(), bitextRuleFile);
    }

    // exactly one of the three run modes is used
    if (options.isRecursive()) {
        app.runRecursive(options.getFilename(), options.getEncoding(), options.isXmlFiltering());
    } else if (options.isLineByLine()) {
        app.runOnFileLineByLine(options.getFilename(), options.getEncoding());
    } else {
        app.runOnFile(options.getFilename(), options.getEncoding(), options.isXmlFiltering());
    }
    app.cleanUp();
}
Also used : BufferedInputStream(java.io.BufferedInputStream) JLanguageTool(org.languagetool.JLanguageTool) StringTools.readerToString(org.languagetool.tools.StringTools.readerToString) ArrayList(java.util.ArrayList) MultiThreadedJLanguageTool(org.languagetool.MultiThreadedJLanguageTool) HashSet(java.util.HashSet) Charset(java.nio.charset.Charset) BOMInputStream(org.apache.commons.io.input.BOMInputStream) LanguageIdentifier(org.languagetool.language.LanguageIdentifier) TabBitextReader(org.languagetool.bitext.TabBitextReader) ApiPrintMode(org.languagetool.tools.StringTools.ApiPrintMode) AmericanEnglish(org.languagetool.language.AmericanEnglish) BitextRule(org.languagetool.rules.bitext.BitextRule) IOException(java.io.IOException) ByteOrderMark(org.apache.commons.io.ByteOrderMark) FileInputStream(java.io.FileInputStream) InputStreamReader(java.io.InputStreamReader) Collectors(java.util.stream.Collectors) File(java.io.File) Rule(org.languagetool.rules.Rule) English(org.languagetool.language.English) List(java.util.List) Tools(org.languagetool.tools.Tools) AbstractPatternRule(org.languagetool.rules.patterns.AbstractPatternRule) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) StringTools.filterXML(org.languagetool.tools.StringTools.filterXML) SAXException(org.xml.sax.SAXException) Language(org.languagetool.Language) PatternRuleLoader(org.languagetool.rules.patterns.PatternRuleLoader) Languages(org.languagetool.Languages) JnaTools(org.languagetool.tools.JnaTools) BufferedReader(java.io.BufferedReader) Collections(java.util.Collections) InputStream(java.io.InputStream) StringTools.readerToString(org.languagetool.tools.StringTools.readerToString) AmericanEnglish(org.languagetool.language.AmericanEnglish) English(org.languagetool.language.English) File(java.io.File) AbstractPatternRule(org.languagetool.rules.patterns.AbstractPatternRule)

Example 39 with Rule

use of org.languagetool.rules.Rule in project languagetool by languagetool-org.

the class DatabaseHandler method handleResult.

/**
 * Adds one row per rule match of the given sentence to the insert statement's
 * batch and executes the batch whenever {@code batchSize} rows have been queued.
 * Matches whose context is longer than {@code MAX_CONTEXT_LENGTH} are skipped.
 *
 * @param sentence the checked sentence; its text, title, URL and source are read
 * @param ruleMatches the matches found for the sentence
 * @param language language whose short code is stored with every row
 * @throws RuntimeException wrapping any other exception raised while storing the matches;
 *         {@code DocumentLimitReachedException} and {@code ErrorLimitReachedException} are rethrown unchanged
 */
@Override
protected void handleResult(Sentence sentence, List<RuleMatch> ruleMatches, Language language) {
    try {
        java.sql.Date today = new java.sql.Date(System.currentTimeMillis());
        for (RuleMatch ruleMatch : ruleMatches) {
            String shortContext = smallContextTools.getContext(ruleMatch.getFromPos(), ruleMatch.getToPos(), sentence.getText());
            insertSt.setString(1, language.getShortCode());
            Rule matchedRule = ruleMatch.getRule();
            insertSt.setString(2, matchedRule.getId());
            insertSt.setString(3, matchedRule.getCategory().getName());
            // only pattern rules carry a sub id; all other rules store NULL
            if (!(matchedRule instanceof AbstractPatternRule)) {
                insertSt.setNull(4, Types.VARCHAR);
            } else {
                insertSt.setString(4, ((AbstractPatternRule) matchedRule).getSubId());
            }
            insertSt.setString(5, matchedRule.getDescription());
            String shortMessage = StringUtils.abbreviate(ruleMatch.getMessage(), 255);
            insertSt.setString(6, shortMessage);
            String fullContext = contextTools.getContext(ruleMatch.getFromPos(), ruleMatch.getToPos(), sentence.getText());
            boolean contextTooLong = fullContext.length() > MAX_CONTEXT_LENGTH;
            if (contextTooLong) {
                // let's skip these strange cases, as shortening the text might leave us behind with invalid markup etc
                continue;
            }
            insertSt.setString(7, fullContext);
            insertSt.setString(8, StringUtils.abbreviate(shortContext, 255));
            // should actually be the dump's date, but isn't really used anyway...
            insertSt.setDate(9, today);
            insertSt.setDate(10, today);
            insertSt.setString(11, sentence.getUrl());
            insertSt.setString(12, sentence.getSource());
            insertSt.addBatch();
            batchCount++;
            if (batchCount >= batchSize) {
                executeBatch();
                batchCount = 0;
            }
            errorCount++;
            checkMaxErrors(errorCount);
            // progress output for every 100th stored error
            if (errorCount % 100 == 0) {
                System.out.println("Storing error #" + errorCount + " for text:");
                System.out.println("  " + sentence.getText());
            }
        }
        sentenceCount++;
        checkMaxSentences(sentenceCount);
    } catch (DocumentLimitReachedException | ErrorLimitReachedException e) {
        // limit signals must reach the caller unwrapped
        throw e;
    } catch (Exception e) {
        throw new RuntimeException("Error storing matches for '" + sentence.getTitle() + "'", e);
    }
}
Also used : Date(java.util.Date) IOException(java.io.IOException) java.sql(java.sql) RuleMatch(org.languagetool.rules.RuleMatch) Rule(org.languagetool.rules.Rule) AbstractPatternRule(org.languagetool.rules.patterns.AbstractPatternRule)

Example 40 with Rule

use of org.languagetool.rules.Rule in project languagetool by languagetool-org.

the class SentenceSourceChecker method enableOnlySpecifiedRules.

/**
 * Disables every rule known to the given LanguageTool instance and then enables
 * only the rules with the given ids. Afterwards {@code warnOnNonExistingRuleIds}
 * is called with the ids and the enabled ids are printed to System.out.
 *
 * @param ruleIds ids of the rules to enable
 * @param languageTool the instance whose rule set is changed
 */
private void enableOnlySpecifiedRules(String[] ruleIds, JLanguageTool languageTool) {
    // switch everything off first ...
    languageTool.getAllRules().forEach(rule -> languageTool.disableRule(rule.getId()));
    // ... then switch the requested rules back on
    Arrays.stream(ruleIds).forEach(id -> languageTool.enableRule(id));
    warnOnNonExistingRuleIds(ruleIds, languageTool);
    System.out.println("Only these rules are enabled: " + Arrays.toString(ruleIds));
}
Also used : Rule(org.languagetool.rules.Rule)

Aggregations

Rule (org.languagetool.rules.Rule)67 JLanguageTool (org.languagetool.JLanguageTool)16 PatternRule (org.languagetool.rules.patterns.PatternRule)15 ArrayList (java.util.ArrayList)14 RuleMatch (org.languagetool.rules.RuleMatch)14 AbstractPatternRule (org.languagetool.rules.patterns.AbstractPatternRule)12 Test (org.junit.Test)11 Language (org.languagetool.Language)11 IncorrectExample (org.languagetool.rules.IncorrectExample)8 English (org.languagetool.language.English)7 SpellingCheckRule (org.languagetool.rules.spelling.SpellingCheckRule)7 File (java.io.File)6 PatternToken (org.languagetool.rules.patterns.PatternToken)6 AnalyzedSentence (org.languagetool.AnalyzedSentence)5 AmericanEnglish (org.languagetool.language.AmericanEnglish)5 BitextRule (org.languagetool.rules.bitext.BitextRule)5 GermanSpellerRule (org.languagetool.rules.de.GermanSpellerRule)4 DisambiguationPatternRule (org.languagetool.tagging.disambiguation.rules.DisambiguationPatternRule)4 FileReader (java.io.FileReader)3 IOException (java.io.IOException)3