Use of org.languagetool.rules.RuleMatch in the project languagetool by languagetool-org.
From the class CommandLineTools, method checkBitext.
/**
 * Checks the bilingual input (bitext) and displays the output (considering the target
 * language) in API format or in the simple text format.
 *
 * NOTE: the positions returned by the rule matches are adjusted
 * according to the data returned by the reader.
 *
 * @param reader Reader of bitext strings.
 * @param srcLt Source JLanguageTool (used to analyze the text).
 * @param trgLt Target JLanguageTool (used to analyze the text).
 * @param bRules Bilingual rules used in addition to target standard rules.
 * @param isXmlFormat If {@code true}, the matches are printed as XML snippets enclosed
 *        in the serializer's XML start and end; otherwise they are printed with
 *        {@code printMatches}.
 * @return The number of rules matched on the bitext.
 * @throws IOException propagated from the calls made here (for example when the
 *         UTF-8 output stream cannot be created).
 * @since 1.0.1
 */
public static int checkBitext(BitextReader reader, JLanguageTool srcLt, JLanguageTool trgLt, List<BitextRule> bRules, boolean isXmlFormat) throws IOException {
  long startTime = System.currentTimeMillis();
  int contextSize = DEFAULT_CONTEXT_SIZE;
  // Only the total number of matches is returned, so a running counter is kept
  // instead of holding every match of the whole bitext in memory.
  int matchCount = 0;
  int sentCount = 0;
  RuleMatchAsXmlSerializer serializer = new RuleMatchAsXmlSerializer();
  PrintStream out = new PrintStream(System.out, true, "UTF-8");
  if (isXmlFormat) {
    out.print(serializer.getXmlStart(null, null));
  }
  for (StringPair srcAndTrg : reader) {
    List<RuleMatch> curMatches = Tools.checkBitext(srcAndTrg.getSource(), srcAndTrg.getTarget(), srcLt, trgLt, bRules);
    // Re-base every match on the position data reported by the reader.
    List<RuleMatch> fixedMatches = new ArrayList<>();
    for (RuleMatch thisMatch : curMatches) {
      fixedMatches.add(trgLt.adjustRuleMatchPos(thisMatch, reader.getSentencePosition(), reader.getColumnCount(), reader.getLineCount(), reader.getCurrentLine(), null));
    }
    if (!fixedMatches.isEmpty()) {
      if (isXmlFormat) {
        out.print(serializer.ruleMatchesToXmlSnippet(fixedMatches, reader.getCurrentLine(), contextSize));
      } else {
        // The count of matches from earlier sentence pairs is handed to printMatches.
        printMatches(fixedMatches, matchCount, reader.getCurrentLine(), contextSize);
      }
      matchCount += fixedMatches.size();
    }
    sentCount++;
  }
  displayTimeStats(startTime, sentCount, isXmlFormat);
  if (isXmlFormat) {
    out.print(serializer.getXmlEnd());
  }
  return matchCount;
}
Use of org.languagetool.rules.RuleMatch in the project languagetool by languagetool-org.
From the class CommandLineTools, method correctTextFromMatches.
/**
 * Returns a copy of {@code contents} in which the span of every match that has at
 * least one suggested replacement is replaced by its first suggestion.
 *
 * Match positions refer to the unmodified {@code contents}; they are shifted by the
 * length difference of the replacements applied so far. A match is skipped when its
 * shifted span lies outside the current text or no longer contains the original
 * error text (for instance because an earlier, overlapping match was already
 * corrected). NOTE(review): the shifting presumably expects the matches in ascending
 * position order - confirm against the callers.
 *
 * @param contents the text that was checked
 * @param matches the matches found in {@code contents}
 * @return the corrected text
 */
private static String correctTextFromMatches(String contents, List<RuleMatch> matches) {
  StringBuilder sb = new StringBuilder(contents);
  // Net number of characters removed so far (negative if the text has grown).
  int offset = 0;
  for (RuleMatch rm : matches) {
    List<String> replacements = rm.getSuggestedReplacements();
    if (replacements.isEmpty()) {
      continue;
    }
    // The error text as it appears in the unmodified input.
    String originalError = contents.substring(rm.getFromPos(), rm.getToPos());
    int start = rm.getFromPos() - offset;
    int end = rm.getToPos() - offset;
    // An overlapping earlier replacement can push the shifted span out of range;
    // without this guard substring() would throw instead of skipping the match.
    boolean inRange = start >= 0 && end <= sb.length();
    //make sure the error hasn't been already corrected:
    if (inRange && originalError.equals(sb.substring(start, end))) {
      String replacement = replacements.get(0);
      sb.replace(start, end, replacement);
      offset += (end - start) - replacement.length();
    }
  }
  return sb.toString();
}
Use of org.languagetool.rules.RuleMatch in the project languagetool by languagetool-org.
From the class DatabaseHandler, method handleResult.
/**
 * Adds one batch entry to {@code insertSt} per rule match of the given sentence and
 * executes the batch whenever {@code batchSize} entries have been collected.
 * Matches whose context is longer than {@code MAX_CONTEXT_LENGTH} are not stored.
 * The error and sentence limits are checked via {@code checkMaxErrors} and
 * {@code checkMaxSentences}; their limit exceptions are passed on unchanged, any
 * other exception is wrapped in a {@code RuntimeException}.
 *
 * @param sentence the checked sentence (text, title, URL and source are read)
 * @param ruleMatches the matches found in the sentence
 * @param language the language whose short code is stored with each match
 */
@Override
protected void handleResult(Sentence sentence, List<RuleMatch> ruleMatches, Language language) {
  try {
    java.sql.Date today = new java.sql.Date(new Date().getTime());
    for (RuleMatch ruleMatch : ruleMatches) {
      String shortContext = smallContextTools.getContext(ruleMatch.getFromPos(), ruleMatch.getToPos(), sentence.getText());
      insertSt.setString(1, language.getShortCode());
      Rule matchedRule = ruleMatch.getRule();
      insertSt.setString(2, matchedRule.getId());
      insertSt.setString(3, matchedRule.getCategory().getName());
      // Only pattern rules carry a sub-id; everything else stores NULL.
      if (!(matchedRule instanceof AbstractPatternRule)) {
        insertSt.setNull(4, Types.VARCHAR);
      } else {
        insertSt.setString(4, ((AbstractPatternRule) matchedRule).getSubId());
      }
      insertSt.setString(5, matchedRule.getDescription());
      insertSt.setString(6, StringUtils.abbreviate(ruleMatch.getMessage(), 255));
      String fullContext = contextTools.getContext(ruleMatch.getFromPos(), ruleMatch.getToPos(), sentence.getText());
      // let's skip overly long contexts, as shortening the text might leave us behind with invalid markup etc
      if (fullContext.length() <= MAX_CONTEXT_LENGTH) {
        insertSt.setString(7, fullContext);
        insertSt.setString(8, StringUtils.abbreviate(shortContext, 255));
        // should actually be the dump's date, but isn't really used anyway...
        insertSt.setDate(9, today);
        insertSt.setDate(10, today);
        insertSt.setString(11, sentence.getUrl());
        insertSt.setString(12, sentence.getSource());
        insertSt.addBatch();
        batchCount++;
        if (batchCount >= batchSize) {
          executeBatch();
          batchCount = 0;
        }
        errorCount++;
        checkMaxErrors(errorCount);
        // Progress output for every 100th stored error.
        if (errorCount % 100 == 0) {
          System.out.println("Storing error #" + errorCount + " for text:");
          System.out.println(" " + sentence.getText());
        }
      }
    }
    sentenceCount++;
    checkMaxSentences(sentenceCount);
  } catch (DocumentLimitReachedException | ErrorLimitReachedException limitReached) {
    throw limitReached;
  } catch (Exception failure) {
    throw new RuntimeException("Error storing matches for '" + sentence.getTitle() + "'", failure);
  }
}
Use of org.languagetool.rules.RuleMatch in the project languagetool by languagetool-org.
From the class SentenceSourceChecker, method run.
/**
 * Checks all sentences delivered by a {@code MixingSentenceSource} over the given
 * files and passes the matches of each sentence to a result handler. Spelling rules
 * are disabled and the cleaning of overlapping matches is switched off. A summary
 * (total matches and average matches per sentence) is printed at the end, also when
 * a sentence or error limit stopped the run.
 *
 * @param propFile if not null, results go to a {@code DatabaseHandler} created from
 *        this file; otherwise a {@code StdoutHandler} is used
 * @param disabledRules passed to {@code applyRuleDeactivation} when {@code ruleIds} is null
 * @param langCode short code used to look up the language
 * @param fileNames the input files handed to the sentence source
 * @param ruleIds if not null, passed to {@code enableOnlySpecifiedRules}
 * @param additionalCategoryIds passed to {@code activateAdditionalCategories}
 * @param maxSentences sentence limit handed to the result handler; values not greater
 *        than 0 are reported as "no limit"
 * @param maxErrors error limit handed to the result handler; values not greater
 *        than 0 are reported as "no limit"
 * @param languageModelDir if not null, language model rules are activated from this directory
 * @param filter regular expression handed to the sentence source, may be null
 * @throws IOException propagated from the calls made here
 */
private void run(File propFile, Set<String> disabledRules, String langCode, List<String> fileNames, String[] ruleIds, String[] additionalCategoryIds, int maxSentences, int maxErrors, File languageModelDir, Pattern filter) throws IOException {
  Language lang = Languages.getLanguageForShortCode(langCode);
  MultiThreadedJLanguageTool languageTool = new MultiThreadedJLanguageTool(lang);
  languageTool.setCleanOverlappingMatches(false);
  if (languageModelDir != null) {
    languageTool.activateLanguageModelRules(languageModelDir);
  }
  if (ruleIds != null) {
    enableOnlySpecifiedRules(ruleIds, languageTool);
  } else {
    applyRuleDeactivation(languageTool, disabledRules);
  }
  if (filter != null) {
    System.out.println("*** NOTE: only sentences that match regular expression '" + filter + "' will be checked");
  }
  activateAdditionalCategories(additionalCategoryIds, languageTool);
  disableSpellingRules(languageTool);
  System.out.println("Working on: " + StringUtils.join(fileNames, ", "));
  System.out.println("Sentence limit: " + (maxSentences > 0 ? maxSentences : "no limit"));
  System.out.println("Error limit: " + (maxErrors > 0 ? maxErrors : "no limit"));
  ResultHandler resultHandler = null;
  int ruleMatchCount = 0;
  int sentenceCount = 0;
  try {
    if (propFile != null) {
      resultHandler = new DatabaseHandler(propFile, maxSentences, maxErrors);
    } else {
      //resultHandler = new CompactStdoutHandler(maxSentences, maxErrors);
      resultHandler = new StdoutHandler(maxSentences, maxErrors);
    }
    MixingSentenceSource mixingSource = MixingSentenceSource.create(fileNames, lang, filter);
    while (mixingSource.hasNext()) {
      Sentence sentence = mixingSource.next();
      try {
        List<RuleMatch> matches = languageTool.check(sentence.getText());
        resultHandler.handleResult(sentence, matches, lang);
        sentenceCount++;
        if (sentenceCount % 5000 == 0) {
          System.err.printf("%s sentences checked...\n", NumberFormat.getNumberInstance(Locale.US).format(sentenceCount));
        }
        ruleMatchCount += matches.size();
      } catch (DocumentLimitReachedException | ErrorLimitReachedException e) {
        // Limits are handled by the outer catch, so they must not be wrapped.
        throw e;
      } catch (Exception e) {
        throw new RuntimeException("Check failed on sentence: " + StringUtils.abbreviate(sentence.getText(), 250), e);
      }
    }
  } catch (DocumentLimitReachedException | ErrorLimitReachedException e) {
    // Reaching a limit ends the run normally; only report it.
    System.out.println(getClass().getSimpleName() + ": " + e);
  } finally {
    languageTool.shutdown();
    if (resultHandler != null) {
      // Guard against 0/0, which would print NaN when no sentence was checked.
      float matchesPerSentence = sentenceCount > 0 ? (float) ruleMatchCount / sentenceCount : 0f;
      // The language is passed as an argument instead of being concatenated into
      // the format string, so a '%' in its text cannot break the formatting.
      System.out.printf("%s: %d total matches\n", lang, ruleMatchCount);
      // The escape is the average sign (o with stroke), kept encoding-independent.
      System.out.printf("%s: \u00f8%.2f rule matches per sentence\n", lang, matchesPerSentence);
      try {
        resultHandler.close();
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }
}
Use of org.languagetool.rules.RuleMatch in the project languagetool by languagetool-org.
From the class StdoutHandler, method handleResult.
/**
 * Prints the matches of one sentence to standard output: the sentence title first,
 * then for every match a numbered header (line, column, rule id and, for pattern
 * rules, the sub-id), the message, the suggestions if any, and the plain text
 * context. Nothing is printed for a sentence without matches. The error limit is
 * checked after every match and the sentence limit once per call.
 *
 * @param sentence the checked sentence (title and text are read)
 * @param ruleMatches the matches found in the sentence
 * @param language not used by this handler
 */
@Override
protected void handleResult(Sentence sentence, List<RuleMatch> ruleMatches, Language language) {
  if (!ruleMatches.isEmpty()) {
    System.out.println("\nTitle: " + sentence.getTitle());
    int number = 0;
    for (RuleMatch ruleMatch : ruleMatches) {
      number++;
      StringBuilder header = new StringBuilder();
      header.append(number).append(".) Line ").append(ruleMatch.getLine() + 1);
      header.append(", column ").append(ruleMatch.getColumn());
      header.append(", Rule ID: ").append(ruleMatch.getRule().getId());
      if (ruleMatch.getRule() instanceof AbstractPatternRule) {
        AbstractPatternRule patternRule = (AbstractPatternRule) ruleMatch.getRule();
        header.append("[").append(patternRule.getSubId()).append("]");
      }
      System.out.println(header.toString());
      // Show the suggestion markup of the message as plain quotes.
      String message = ruleMatch.getMessage()
          .replace("<suggestion>", "'")
          .replace("</suggestion>", "'");
      System.out.println("Message: " + message);
      List<String> suggestions = ruleMatch.getSuggestedReplacements();
      if (!suggestions.isEmpty()) {
        System.out.println("Suggestion: " + String.join("; ", suggestions));
      }
      System.out.println(contextTools.getPlainTextContext(ruleMatch.getFromPos(), ruleMatch.getToPos(), sentence.getText()));
      errorCount++;
      checkMaxErrors(errorCount);
    }
  }
  sentenceCount++;
  checkMaxSentences(sentenceCount);
}
Aggregations