use of org.languagetool.rules.RuleMatch in project languagetool by languagetool-org.
the class Example method main.
/**
 * Smoke test: runs one short sample sentence through every language known to
 * LanguageTool and prints whatever rule matches come back.
 *
 * @param args ignored
 * @throws IOException declared for the LanguageTool check calls made below
 */
public static void main(String[] args) throws IOException {
  // The sample text is the same for every language, so define it once.
  final String input = "And the the";
  final List<Language> realLanguages = Languages.get();

  System.out.println("This example will test a short string with all languages known to LanguageTool.");
  System.out.println("It's just a test to make sure there's at least no crash.");
  System.out.println("Using LanguageTool " + JLanguageTool.VERSION + " (" + JLanguageTool.BUILD_DATE + ")");
  System.out.println("Supported languages: " + realLanguages.size());

  for (Language language : realLanguages) {
    // A fresh checker per language; the check runs before the header line is printed.
    final List<RuleMatch> result = new JLanguageTool(language).check(input);
    System.out.println("Checking '" + input + "' with " + language + ":");
    for (RuleMatch found : result) {
      System.out.println(" " + found);
    }
  }
}
use of org.languagetool.rules.RuleMatch in project languagetool by languagetool-org.
the class CommandLineTools method checkText.
/**
 * Checks the given text with LanguageTool and writes the outcome to System.out,
 * either as XML, as JSON, or in the plain format produced by {@code printMatches}.
 *
 * @param contents the text to check (may be more than one sentence)
 * @param lt an initialized LanguageTool instance
 * @param isXmlFormat whether to print the result in XML format (takes precedence over JSON)
 * @param isJsonFormat whether to print the result in JSON format
 * @param contextSize error text context size; -1 selects {@code DEFAULT_CONTEXT_SIZE}
 * @param lineOffset offset added to the start and end line of every match
 * @param prevMatches number of previously matched rules, forwarded to the plain-text printer
 * @param apiMode mode of xml/json printout for simple xml/json output
 * @param listUnknownWords in XML format with {@code NORMAL_API} mode, whether to report the
 *                         words returned by {@code lt.getUnknownWords()} instead of {@code unknownWords}
 * @param unknownWords unknown words handed to the XML serializer unless replaced as described above
 * @return number of rule matches found in the input text
 * @throws IOException declared for the check and output calls made below
 */
public static int checkText(String contents, JLanguageTool lt, boolean isXmlFormat, boolean isJsonFormat, int contextSize, int lineOffset, int prevMatches, StringTools.ApiPrintMode apiMode, boolean listUnknownWords, List<String> unknownWords) throws IOException {
  final int effectiveContextSize = (contextSize == -1) ? DEFAULT_CONTEXT_SIZE : contextSize;
  final long startTime = System.currentTimeMillis();
  final List<RuleMatch> ruleMatches = lt.check(contents);

  // Shift the reported line numbers by the caller-supplied offset.
  for (RuleMatch match : ruleMatches) {
    match.setLine(match.getLine() + lineOffset);
    match.setEndLine(match.getEndLine() + lineOffset);
  }

  final boolean normalApi = apiMode == StringTools.ApiPrintMode.NORMAL_API;

  if (isXmlFormat) {
    List<String> wordsToReport = unknownWords;
    if (listUnknownWords && normalApi) {
      wordsToReport = lt.getUnknownWords();
    }
    final String xml = new RuleMatchAsXmlSerializer()
        .ruleMatchesToXml(ruleMatches, contents, effectiveContextSize, apiMode, lt.getLanguage(), wordsToReport);
    new PrintStream(System.out, true, "UTF-8").print(xml);
  } else if (isJsonFormat) {
    final String json = new RuleMatchesAsJsonSerializer()
        .ruleMatchesToJson(ruleMatches, contents, effectiveContextSize, lt.getLanguage());
    new PrintStream(System.out, true, "UTF-8").print(json);
  } else {
    printMatches(ruleMatches, prevMatches, contents, effectiveContextSize);
  }

  // Timing statistics are only shown in the normal (non-buffered) API mode and never for JSON.
  if (normalApi && !isJsonFormat) {
    final SentenceTokenizer tokenizer = lt.getLanguage().getSentenceTokenizer();
    final int sentenceCount = tokenizer.tokenize(contents).size();
    displayTimeStats(startTime, sentenceCount, isXmlFormat);
  }
  return ruleMatches.size();
}
use of org.languagetool.rules.RuleMatch in project languagetool by languagetool-org.
the class CommandLineTools method correctBitext.
/**
 * Applies rule suggestions to the target side of a bilingual text and prints
 * each resulting target sentence to System.out.
 * Note: when a match offers several suggestions, only the first one is
 * applied; the remaining ones are silently ignored.
 *
 * @param reader a bitext file reader
 * @param sourceLt initialized source JLanguageTool object
 * @param targetLt initialized target JLanguageTool object
 * @param bRules list of all BitextRules to use
 * @throws IOException declared for the bitext check performed below
 */
public static void correctBitext(BitextReader reader, JLanguageTool sourceLt, JLanguageTool targetLt, List<BitextRule> bRules) throws IOException {
  for (StringPair pair : reader) {
    final List<RuleMatch> rawMatches = Tools.checkBitext(pair.getSource(), pair.getTarget(), sourceLt, targetLt, bRules);
    final List<RuleMatch> adjusted = new ArrayList<>(rawMatches.size());
    for (RuleMatch raw : rawMatches) {
      // Character offset is 0: positions are already relative to the target sentence.
      final RuleMatch positioned = targetLt.adjustRuleMatchPos(raw, 0,
          reader.getTargetColumnCount(), reader.getLineCount(), reader.getCurrentLine(), null);
      adjusted.add(positioned);
    }
    if (adjusted.isEmpty()) {
      // Nothing to fix: echo the target sentence unchanged.
      System.out.println(pair.getTarget());
    } else {
      System.out.println(correctTextFromMatches(pair.getTarget(), adjusted));
    }
  }
}
use of org.languagetool.rules.RuleMatch in project languagetool by languagetool-org.
the class JLanguageToolTest method testCleanOverlappingWithGerman.
@Test
public void testCleanOverlappingWithGerman() throws IOException {
  // The adjacent errors in "TRGS - Technische" must survive the overlap cleanup,
  // so all three matches are expected.
  final String text = "TRGS - Technische Regeln für Gefahrstoffe";
  final JLanguageTool germanTool = new JLanguageTool(new GermanyGerman());
  final List<RuleMatch> found = germanTool.check(text);
  assertEquals(3, found.size());
}
use of org.languagetool.rules.RuleMatch in project languagetool by languagetool-org.
the class AccentuationCheckRule method match.
/**
 * Looks for tokens that are keys of {@code relevantWords} (verb form without accent
 * vs. noun with accent) or {@code relevantWords2} (verb form without accent vs.
 * adjective with accent) and, when one of the context conditions below holds,
 * reports a match whose suggested replacement is the token stored in the map entry.
 * The user-facing message is in Catalan ("if it is a noun or an adjective, it
 * must carry an accent").
 *
 * @param sentence the analyzed sentence; only its non-whitespace tokens are inspected
 * @return the collected matches, converted with {@code toRuleMatchArray}
 */
@Override
public RuleMatch[] match(final AnalyzedSentence sentence) {
final List<RuleMatch> ruleMatches = new ArrayList<>();
final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
for (int i = 1; i < tokens.length; i++) {
// ignoring token 0, i.e. SENT_START
final String token;
// The first real token is lower-cased before the map lookups (presumably so that a
// capitalized sentence-initial word still matches the map keys - TODO confirm).
// NOTE(review): toLowerCase() without a Locale argument uses the JVM default locale.
if (i == 1) {
token = tokens[i].getToken().toLowerCase();
} else {
token = tokens[i].getToken();
}
// Surface forms of the neighbouring tokens; the empty string stands for "no such neighbour".
final String prevToken = tokens[i - 1].getToken();
String prevPrevToken = "";
// i > 2 (not i > 1) so that SENT_START at index 0 is never used as the token two positions back
if (i > 2) {
prevPrevToken = tokens[i - 2].getToken();
}
String nextToken = "";
if (i < tokens.length - 1) {
nextToken = tokens[i + 1].getToken();
}
String nextNextToken = "";
if (i < tokens.length - 2) {
nextNextToken = tokens[i + 2].getToken();
}
boolean isRelevantWord = false;
boolean isRelevantWord2 = false;
if (StringTools.isEmpty(token)) {
continue;
}
if (relevantWords.containsKey(token)) {
isRelevantWord = true;
}
if (relevantWords2.containsKey(token)) {
isRelevantWord2 = true;
}
// Skip tokens that appear in neither word list.
if (!isRelevantWord && !isRelevantWord2) {
continue;
}
// verb preceded by a weak pronoun ("pronom feble") that does not start with an apostrophe or hyphen
if (matchPostagRegexp(tokens[i - 1], PRONOM_FEBLE) && !prevToken.startsWith("'") && !prevToken.startsWith("-")) {
continue;
}
String replacement = null;
// Matchers over the neighbouring surface forms, reused by several conditions below.
final Matcher mPreposicioDE = PREPOSICIO_DE.matcher(nextToken);
final Matcher mExcepcionsDE = EXCEPCIONS_DARRERE_DE.matcher(nextNextToken);
final Matcher mArticleELMS = ARTICLE_EL_MS.matcher(prevToken);
final Matcher mArticleELFS = ARTICLE_EL_FS.matcher(prevToken);
final Matcher mArticleELMP = ARTICLE_EL_MP.matcher(prevToken);
final Matcher mArticleELFP = ARTICLE_EL_FP.matcher(prevToken);
// VERB WITHOUT ACCENT -> NOUN WITH ACCENT
if (isRelevantWord && !matchPostagRegexp(tokens[i], GN) && !matchPostagRegexp(tokens[i], LOCUCIONS)) {
// e.g. "amb renuncies"
if (tokens[i - 1].hasPosTag("SPS00") && !tokens[i - 1].hasPosTag("RG") && !matchPostagRegexp(tokens[i - 1], DETERMINANT) && !matchPostagRegexp(tokens[i], INFINITIU)) {
replacement = relevantWords.get(token).getToken();
} else if (i > 2 && tokens[i - 2].hasPosTag("SPS00") && !tokens[i - 2].hasPosTag("RG") && !matchPostagRegexp(tokens[i - 2], DETERMINANT) && (matchPostagRegexp(tokens[i - 1], DETERMINANT) || mArticleELMS.matches() || mArticleELFS.matches() || mArticleELMP.matches() || mArticleELFP.matches()) && !matchPostagRegexp(tokens[i], INFINITIU)) {
replacement = relevantWords.get(token).getToken();
} else // e.g. "aquestes renuncies"
if (((matchPostagRegexp(tokens[i - 1], DETERMINANT_MS) && matchPostagRegexp(relevantWords.get(token), NOM_MS) && !token.equals("cantar")) || (matchPostagRegexp(tokens[i - 1], DETERMINANT_MP) && matchPostagRegexp(relevantWords.get(token), NOM_MP)) || (matchPostagRegexp(tokens[i - 1], DETERMINANT_FS) && matchPostagRegexp(relevantWords.get(token), NOM_FS) && !token.equals("venia") && !token.equals("tenia") && !token.equals("continua") && !token.equals("genera") && !token.equals("faria")) || (matchPostagRegexp(tokens[i - 1], DETERMINANT_FP) && matchPostagRegexp(relevantWords.get(token), NOM_FP)))) {
replacement = relevantWords.get(token).getToken();
} else // e.g. "fumaré una faria" (correct: fària)
if (i > 2 && matchPostagRegexp(tokens[i - 2], VERB_CONJUGAT) && ((matchPostagRegexp(tokens[i - 1], DETERMINANT_MS) && matchPostagRegexp(relevantWords.get(token), NOM_MS)) || (matchPostagRegexp(tokens[i - 1], DETERMINANT_MP) && matchPostagRegexp(relevantWords.get(token), NOM_MP)) || (matchPostagRegexp(tokens[i - 1], DETERMINANT_FS) && matchPostagRegexp(relevantWords.get(token), NOM_FS)) || (matchPostagRegexp(tokens[i - 1], DETERMINANT_FP) && matchPostagRegexp(relevantWords.get(token), NOM_FP)))) {
replacement = relevantWords.get(token).getToken();
} else // e.g. "fem la copia" (correct: còpia)
if (i > 2 && matchPostagRegexp(tokens[i - 2], VERB_CONJUGAT) && ((mArticleELMS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MS)) || (mArticleELMP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MP)) || (mArticleELFS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FS)) || (mArticleELFP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FP)))) {
replacement = relevantWords.get(token).getToken();
} else // e.g. "circumstancies d'una altra classe"
// NOTE(review): tokens[i + 1] is read before the (i < tokens.length - 2) bounds check; this is
// only safe if PREPOSICIO_DE never matches the empty nextToken used for the last token - confirm.
if (!matchPostagRegexp(tokens[i], PARTICIPI_MS) && !token.equals("venia") && !token.equals("venies") && !token.equals("tenia") && !token.equals("tenies") && !token.equals("faria") && !token.equals("faries") && !token.equals("espero") && !token.equals("continua") && !token.equals("continues") && !token.equals("cantar") && !prevToken.equals("que") && !prevToken.equals("qui") && !prevToken.equals("què") && mPreposicioDE.matches() && !matchPostagRegexp(tokens[i - 1], NOT_IN_PREV_TOKEN) && !matchPostagRegexp(tokens[i + 1], LOCUCIONS) && (i < tokens.length - 2) && !matchPostagRegexp(tokens[i + 2], INFINITIU) && !mExcepcionsDE.matches() && !tokens[i - 1].hasPosTag("RG")) {
replacement = relevantWords.get(token).getToken();
} else // e.g. "la renuncia del president."
if (!token.equals("venia") && !token.equals("venies") && !token.equals("tenia") && !token.equals("tenies") && !token.equals("faria") && !token.equals("faries") && !token.equals("continua") && !token.equals("continues") && !token.equals("cantar") && !token.equals("diferencia") && !token.equals("diferencies") && !token.equals("distancia") && !token.equals("distancies") && ((mArticleELMS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MS)) || (mArticleELFS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FS)) || (mArticleELMP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MP)) || (mArticleELFP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FP))) && mPreposicioDE.matches()) {
replacement = relevantWords.get(token).getToken();
} else // e.g. "circumstancies extraordinàries"
if (!token.equals("pronuncia") && !token.equals("espero") && !token.equals("pronuncies") && !token.equals("venia") && !token.equals("venies") && !token.equals("tenia") && !token.equals("tenies") && !token.equals("continua") && !token.equals("continues") && !token.equals("faria") && !token.equals("faries") && !token.equals("genera") && !token.equals("figuri") && (i < tokens.length - 1) && ((matchPostagRegexp(relevantWords.get(token), NOM_MS) && matchPostagRegexp(tokens[i + 1], ADJECTIU_MS)) || (matchPostagRegexp(relevantWords.get(token), NOM_FS) && matchPostagRegexp(tokens[i + 1], ADJECTIU_FS)) || (matchPostagRegexp(relevantWords.get(token), NOM_MP) && matchPostagRegexp(tokens[i + 1], ADJECTIU_MP)) || (matchPostagRegexp(relevantWords.get(token), NOM_FP) && matchPostagRegexp(tokens[i + 1], ADJECTIU_FP)))) {
replacement = relevantWords.get(token).getToken();
} else // e.g. "les seves contraries"
if ((matchPostagRegexp(relevantWords.get(token), NOM_MS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MS) && !matchPostagRegexp(tokens[i], VERB_3S) && !matchPostagRegexp(tokens[i], GRUP_VERBAL)) || (matchPostagRegexp(relevantWords.get(token), NOM_FS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FS) && !matchPostagRegexp(tokens[i], VERB_3S)) || (matchPostagRegexp(relevantWords.get(token), NOM_MP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MP)) || (matchPostagRegexp(relevantWords.get(token), NOM_FP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FP))) {
replacement = relevantWords.get(token).getToken();
} else // e.g. "una nova formula que" (correct: fórmula)
if (nextToken.equals("que") && i > 2 && ((matchPostagRegexp(relevantWords.get(token), NOM_MS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MS) && matchPostagRegexp(tokens[i - 2], DETERMINANT_MS)) || (matchPostagRegexp(relevantWords.get(token), NOM_FS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FS) && matchPostagRegexp(tokens[i - 2], DETERMINANT_FS)) || (matchPostagRegexp(relevantWords.get(token), NOM_MP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MP) && matchPostagRegexp(tokens[i - 2], DETERMINANT_MP)) || (matchPostagRegexp(relevantWords.get(token), NOM_FP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FP) && matchPostagRegexp(tokens[i - 2], DETERMINANT_FP)))) {
replacement = relevantWords.get(token).getToken();
} else // e.g. "les circumstancies que ens envolten"
if (nextToken.equals("que") && ((mArticleELMS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MS)) || (mArticleELFS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FS)) || (mArticleELMP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MP)) || (mArticleELFP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FP)))) {
replacement = relevantWords.get(token).getToken();
}
// e.g. "de positiva influencia"
// This is a stand-alone check, not part of the else-chain above: it is evaluated even when
// an earlier branch has already set the replacement.
if (!token.equals("pronuncia") && !token.equals("espero") && !token.equals("pronuncies") && !token.equals("venia") && !token.equals("venies") && !token.equals("tenia") && !token.equals("tenies") && !token.equals("continua") && !token.equals("continues") && !token.equals("faria") && !token.equals("faries") && !token.equals("genera") && !token.equals("figuri") && i > 2 && tokens[i - 2].hasPosTag("SPS00") && !tokens[i - 2].hasPosTag("RG") && ((matchPostagRegexp(relevantWords.get(token), NOM_MS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MS)) || (matchPostagRegexp(relevantWords.get(token), NOM_FS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FS)) || (matchPostagRegexp(relevantWords.get(token), NOM_MP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MP)) || (matchPostagRegexp(relevantWords.get(token), NOM_FP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FP)))) {
replacement = relevantWords.get(token).getToken();
}
}
// VERB WITHOUT ACCENT -> ADJECTIVE WITH ACCENT
// Runs after the noun section, so a replacement found here overrides one found above.
if (isRelevantWord2 && !matchPostagRegexp(tokens[i], GN) && !matchPostagRegexp(tokens[i], LOCUCIONS)) {
// e.g. "de manera obvia", "circumstàncies extraordinaries"
if ((matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MS) && matchPostagRegexp(tokens[i - 1], NOM_MS) && !tokens[i - 1].hasPosTag("_GN_FS") && matchPostagRegexp(tokens[i], VERB_CONJUGAT) && !matchPostagRegexp(tokens[i], VERB_3S)) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FS) && prevPrevToken.equalsIgnoreCase("de") && (prevToken.equals("manera") || prevToken.equals("forma"))) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MP) && matchPostagRegexp(tokens[i - 1], NOM_MP)) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FP) && matchPostagRegexp(tokens[i - 1], NOM_FP))) {
replacement = relevantWords2.get(token).getToken();
} else // e.g. "de continua disputa"
if ((i < tokens.length - 1) && !prevToken.equals("que") && !matchPostagRegexp(tokens[i - 1], NOT_IN_PREV_TOKEN) && ((matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MS) && matchPostagRegexp(tokens[i + 1], NOM_MS) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_MS)) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FS) && matchPostagRegexp(tokens[i + 1], NOM_FS) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_FS)) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MP) && matchPostagRegexp(tokens[i + 1], NOM_MP) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_MP)) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FP) && matchPostagRegexp(tokens[i + 1], NOM_FP) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_FP)))) {
replacement = relevantWords2.get(token).getToken();
} else // e.g. "la magnifica conservació"
if ((i < tokens.length - 1) && ((matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MS) && matchPostagRegexp(tokens[i + 1], NOM_MS) && mArticleELMS.matches()) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FS) && matchPostagRegexp(tokens[i + 1], NOM_FS) && mArticleELFS.matches()) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MP) && matchPostagRegexp(tokens[i + 1], NOM_MP) && mArticleELMP.matches()) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FP) && matchPostagRegexp(tokens[i + 1], NOM_FP) && mArticleELFP.matches()))) {
replacement = relevantWords2.get(token).getToken();
}
}
// Report the token when any of the conditions above produced a replacement.
if (replacement != null) {
final String msg = "Si és un nom o un adjectiu, ha de portar accent.";
final RuleMatch ruleMatch = new RuleMatch(this, tokens[i].getStartPos(), tokens[i].getEndPos(), msg, "Falta un accent");
ruleMatch.setSuggestedReplacement(replacement);
ruleMatches.add(ruleMatch);
}
}
return toRuleMatchArray(ruleMatches);
}
Aggregations