Use of org.languagetool.rules.Rule in project languagetool by languagetool-org.
Class CommandLineTools, method profileRulesOnText.
/**
 * Simple rule profiler - used to run LT on a corpus to see which
 * rule takes most time. Prints results to System.out.
 *
 * <p>Every active rule that is not a {@code TextLevelRule} is run over all
 * sentences of the text several times; the median duration of those runs is
 * reported together with the resulting throughput. The timed region includes
 * the call to {@code lt.getAnalyzedSentence(...)}, not only {@code rule.match(...)}.
 * The "Matches" column is the sum of the matches over all repetitions.
 *
 * @param contents text to check
 * @param lt instance of LanguageTool
 * @throws IOException declared by this method; see the calls into {@code lt} and the rules
 */
public static void profileRulesOnText(String contents, JLanguageTool lt) throws IOException {
  final long nanosPerMilli = 1000000L;
  final float nanosPerSecond = 1.0e9f;
  // one slot per repetition; the array length defines the number of repetitions
  long[] workTime = new long[10];
  List<Rule> rules = lt.getAllActiveRules();
  int ruleCount = rules.size();
  System.out.printf("Testing %d rules%n", ruleCount);
  System.out.println("Rule ID\tTime\tSentences\tMatches\tSentences per sec.");
  List<String> sentences = lt.sentenceTokenize(contents);
  for (Rule rule : rules) {
    if (rule instanceof TextLevelRule) {
      // profile rules for sentences only
      continue;
    }
    int matchCount = 0;
    for (int k = 0; k < workTime.length; k++) {
      // nanoTime() is monotonic and meant for measuring elapsed time,
      // unlike the wall-clock currentTimeMillis()
      long startNanos = System.nanoTime();
      for (String sentence : sentences) {
        matchCount += rule.match(lt.getAnalyzedSentence(sentence)).length;
      }
      workTime[k] = System.nanoTime() - startNanos;
    }
    long medianNanos = median(workTime);
    // the "Time" column stays in milliseconds
    long time = medianNanos / nanosPerMilli;
    // throughput is derived from the nanosecond value so that runs shorter than
    // one millisecond do not end up as a division by zero
    float timeInSeconds = medianNanos / nanosPerSecond;
    float sentencesPerSecond = sentences.size() / timeInSeconds;
    System.out.printf(Locale.ENGLISH, "%s\t%d\t%d\t%d\t%.1f", rule.getId(), time, sentences.size(), matchCount, sentencesPerSecond);
    System.out.println();
  }
}
Use of org.languagetool.rules.Rule in project languagetool by languagetool-org.
Class Main, method runOnFileLineByLine.
/**
 * Reads the input line by line, collects the lines in a buffer and hands the
 * buffer to {@code handleLine} each time {@code isBreakPoint(line)} reports a
 * break point; whatever is left in the buffer is handed over with
 * {@code ApiPrintMode.END_API} once reading has finished.
 *
 * <p>In profiling mode the input is read once per active rule, with
 * {@code currentRule} set to the rule of the current pass; otherwise there is
 * a single pass. With language auto-detection enabled, the language is
 * detected from the first line of each pass.
 *
 * @param filename name of the input file (or the STDIN marker accepted by {@code isStdIn})
 * @param encoding encoding passed on to {@code getInputStreamReader}
 * @throws IOException if reading the input fails
 * @throws IllegalArgumentException if profiling mode is combined with input from STDIN
 */
private void runOnFileLineByLine(String filename, String encoding) throws IOException {
  System.err.println("Warning: running in line by line mode. Cross-paragraph checks will not work.\n");
  if (options.isVerbose()) {
    lt.setOutput(System.err);
  }
  if (!options.isXmlFormat() && !options.isApplySuggestions()) {
    System.err.println(isStdIn(filename) ? "Working on STDIN..." : "Working on " + filename + "...");
  }
  if (profileRules && isStdIn(filename)) {
    throw new IllegalArgumentException("Profiling mode cannot be used with input from STDIN");
  }
  List<Rule> rules = lt.getAllActiveRules();
  // one pass normally, one pass per active rule when profiling
  final int passes;
  if (profileRules) {
    System.out.printf("Testing %d rules\n", rules.size());
    System.out.println("Rule ID\tTime\tSentences\tMatches\tSentences per sec.");
    passes = rules.size();
  } else {
    passes = 1;
  }
  // offset of the first line of the chunk currently being collected
  int chunkStartOffset = 0;
  // number of lines read so far (not reset between passes)
  int linesRead = 0;
  handleLine(ApiPrintMode.START_API, 0, new StringBuilder());
  StringBuilder chunk = new StringBuilder();
  for (int pass = 0; !rules.isEmpty() && pass < passes; pass++) {
    currentRule = rules.get(pass);
    try (InputStreamReader streamReader = getInputStreamReader(filename, encoding);
         BufferedReader reader = new BufferedReader(streamReader)) {
      int lineNumber = 0;
      for (String line = reader.readLine(); line != null; line = reader.readLine()) {
        chunk.append(line);
        lineNumber++;
        // to detect language from the first input line
        if (lineNumber == 1 && options.isAutoDetect()) {
          Language detected = detectLanguageOfString(line);
          if (detected == null) {
            System.err.println("Could not detect language well enough, using American English");
            detected = new AmericanEnglish();
          }
          System.err.println("Language used is: " + detected.getName());
          detected.getSentenceTokenizer().setSingleLineBreaksMarksParagraph(options.isSingleLineBreakMarksParagraph());
          changeLanguage(detected, options.getMotherTongue(), options.getDisabledRules(), options.getEnabledRules());
        }
        chunk.append('\n');
        linesRead++;
        if (isBreakPoint(line)) {
          handleLine(ApiPrintMode.CONTINUE_API, chunkStartOffset, chunk);
          if (profileRules) {
            // the sentence count is computed but not used here
            lt.sentenceTokenize(chunk.toString()).size();
          }
          chunk = new StringBuilder();
          chunkStartOffset = linesRead;
        }
      }
    } finally {
      if (profileRules && chunk.length() > 0) {
        // the sentence count is computed but not used here
        lt.sentenceTokenize(chunk.toString()).size();
      }
      handleLine(ApiPrintMode.END_API, linesRead - 1, chunk);
    }
  }
}
Use of org.languagetool.rules.Rule in project languagetool by languagetool-org.
Class Main, method main.
/**
 * Command line entry point for checking plain text files.
 *
 * <p>Parses the arguments (printing usage or an error and exiting with status 1
 * when they cannot be parsed), handles the "print usage/version/languages"
 * options, falls back to {@code "-"} as filename and to {@code English} as
 * language when none is given, and then runs the check in recursive,
 * line-by-line or whole-file mode.
 *
 * @param args the command line arguments
 * @throws RuntimeException if no rule is active after setup
 * @throws IllegalArgumentException if bitext mode is requested without a mother tongue
 */
public static void main(String[] args) throws IOException, ParserConfigurationException, SAXException {
  JnaTools.setBugWorkaroundProperty();
  CommandLineParser parser = new CommandLineParser();
  CommandLineOptions options = null;
  try {
    options = parser.parseOptions(args);
  } catch (WrongParameterNumberException wrongNumber) {
    parser.printUsage();
    System.exit(1);
  } catch (IllegalArgumentException illegalArgument) {
    System.err.println(illegalArgument.toString());
    System.exit(1);
  } catch (UnknownParameterException unknownParameter) {
    String message = unknownParameter.getMessage();
    System.err.println(message != null ? message : unknownParameter.toString());
    parser.printUsage(System.err);
    System.exit(1);
  }

  // informational options: print and exit
  if (options.isPrintUsage()) {
    parser.printUsage();
    System.exit(1);
  }
  if (options.isPrintVersion()) {
    System.out.println("LanguageTool version " + JLanguageTool.VERSION + " (" + JLanguageTool.BUILD_DATE + ")");
    System.exit(0);
  }
  if (options.isPrintLanguages()) {
    printLanguages();
    System.exit(0);
  }

  if (options.getFilename() == null) {
    options.setFilename("-");
  }

  String languageHint = null;
  if (options.getLanguage() != null) {
    if (!(options.isXmlFormat() || options.isApplySuggestions())) {
      languageHint = "Expected text language: " + options.getLanguage().getName();
    }
  } else {
    if (!(options.isXmlFormat() || options.isAutoDetect())) {
      System.err.println("No language specified, using English (no spell checking active, " + "specify a language variant like 'en-GB' if available)");
    }
    options.setLanguage(new English());
  }
  options.getLanguage().getSentenceTokenizer().setSingleLineBreaksMarksParagraph(options.isSingleLineBreakMarksParagraph());

  Main prg = new Main(options);
  if (options.getFalseFriendFile() != null) {
    for (AbstractPatternRule falseFriendRule : prg.lt.loadFalseFriendRules(options.getFalseFriendFile())) {
      prg.lt.addRule(falseFriendRule);
    }
  }
  if (prg.lt.getAllActiveRules().isEmpty()) {
    List<String> catIds = options.getEnabledCategories().stream().map(Object::toString).collect(Collectors.toList());
    throw new RuntimeException("No rules are active. Please make sure your rule ids " + "(" + options.getEnabledRules() + ") and " + "category ids (" + catIds + ") are correct");
  }
  if (languageHint != null) {
    String spellHint;
    if (prg.isSpellCheckingActive()) {
      spellHint = "";
    } else {
      spellHint = " (no spell checking active, specify a language variant like 'en-GB' if available)";
    }
    System.err.println(languageHint + spellHint);
  }

  prg.setListUnknownWords(options.isListUnknown());
  if (options.isProfile()) {
    prg.setProfilingMode();
  }
  if (options.isBitext()) {
    if (options.getMotherTongue() == null) {
      throw new IllegalArgumentException("You have to set the source language (as mother tongue) in bitext mode");
    }
    File bitextRuleFile = null;
    if (options.getBitextRuleFile() != null) {
      bitextRuleFile = new File(options.getBitextRuleFile());
    }
    prg.setBitextMode(options.getMotherTongue(), options.getDisabledRules(), options.getEnabledRules(), bitextRuleFile);
  }

  // pick the run mode: recursive takes precedence over line-by-line
  if (options.isRecursive()) {
    prg.runRecursive(options.getFilename(), options.getEncoding(), options.isXmlFiltering());
  } else if (options.isLineByLine()) {
    prg.runOnFileLineByLine(options.getFilename(), options.getEncoding());
  } else {
    prg.runOnFile(options.getFilename(), options.getEncoding(), options.isXmlFiltering());
  }
  prg.cleanUp();
}
Use of org.languagetool.rules.Rule in project languagetool by languagetool-org.
Class DatabaseHandler, method handleResult.
/**
 * Adds one row per rule match to the batched insert statement.
 *
 * <p>Matches whose context is longer than {@code MAX_CONTEXT_LENGTH} are
 * skipped. The batch is executed whenever {@code batchSize} rows have been
 * added. The error and sentence counters are passed to {@code checkMaxErrors}
 * and {@code checkMaxSentences}; the limit exceptions are rethrown unchanged,
 * any other exception is wrapped in a {@code RuntimeException}.
 *
 * @param sentence the checked sentence
 * @param ruleMatches the matches found in the sentence
 * @param language the language whose short code is stored with each match
 */
@Override
protected void handleResult(Sentence sentence, List<RuleMatch> ruleMatches, Language language) {
  try {
    long nowMillis = new Date().getTime();
    java.sql.Date today = new java.sql.Date(nowMillis);
    for (RuleMatch ruleMatch : ruleMatches) {
      String shortContext = smallContextTools.getContext(ruleMatch.getFromPos(), ruleMatch.getToPos(), sentence.getText());
      insertSt.setString(1, language.getShortCode());
      Rule matchedRule = ruleMatch.getRule();
      insertSt.setString(2, matchedRule.getId());
      insertSt.setString(3, matchedRule.getCategory().getName());
      // only pattern rules have a sub id
      if (!(matchedRule instanceof AbstractPatternRule)) {
        insertSt.setNull(4, Types.VARCHAR);
      } else {
        insertSt.setString(4, ((AbstractPatternRule) matchedRule).getSubId());
      }
      insertSt.setString(5, matchedRule.getDescription());
      insertSt.setString(6, StringUtils.abbreviate(ruleMatch.getMessage(), 255));
      String fullContext = contextTools.getContext(ruleMatch.getFromPos(), ruleMatch.getToPos(), sentence.getText());
      if (fullContext.length() > MAX_CONTEXT_LENGTH) {
        // let's skip these strange cases, as shortening the text might leave us behind with invalid markup etc
        continue;
      }
      insertSt.setString(7, fullContext);
      insertSt.setString(8, StringUtils.abbreviate(shortContext, 255));
      // should actually be the dump's date, but isn't really used anyway...
      insertSt.setDate(9, today);
      insertSt.setDate(10, today);
      insertSt.setString(11, sentence.getUrl());
      insertSt.setString(12, sentence.getSource());
      insertSt.addBatch();
      batchCount++;
      if (batchCount >= batchSize) {
        executeBatch();
        batchCount = 0;
      }
      errorCount++;
      checkMaxErrors(errorCount);
      // progress output for every 100th stored error
      if (errorCount % 100 == 0) {
        System.out.println("Storing error #" + errorCount + " for text:");
        System.out.println(" " + sentence.getText());
      }
    }
    sentenceCount++;
    checkMaxSentences(sentenceCount);
  } catch (DocumentLimitReachedException | ErrorLimitReachedException limitReached) {
    throw limitReached;
  } catch (Exception cause) {
    throw new RuntimeException("Error storing matches for '" + sentence.getTitle() + "'", cause);
  }
}
Use of org.languagetool.rules.Rule in project languagetool by languagetool-org.
Class SentenceSourceChecker, method enableOnlySpecifiedRules.
/**
 * Disables every rule returned by {@code languageTool.getAllRules()}, then
 * enables the rules with the given ids, lets {@code warnOnNonExistingRuleIds}
 * inspect the ids and prints the list of enabled ids to System.out.
 *
 * @param ruleIds ids of the rules to enable
 * @param languageTool the instance whose rules are reconfigured
 */
private void enableOnlySpecifiedRules(String[] ruleIds, JLanguageTool languageTool) {
  // start from a clean slate: switch off everything
  for (Rule existingRule : languageTool.getAllRules()) {
    String existingId = existingRule.getId();
    languageTool.disableRule(existingId);
  }
  // then switch on exactly the requested rules
  for (int i = 0; i < ruleIds.length; i++) {
    languageTool.enableRule(ruleIds[i]);
  }
  warnOnNonExistingRuleIds(ruleIds, languageTool);
  String enabledIds = Arrays.toString(ruleIds);
  System.out.println("Only these rules are enabled: " + enabledIds);
}
Aggregations