use of org.languagetool.bitext.StringPair in project languagetool by languagetool-org.
the class BitextPatternRuleTest method testBitextRule.
/**
 * Verifies the bitext examples attached to a rule.
 * <p>
 * Every correct example must have non-empty source and target text (after
 * {@code cleanSentence}) and must not be matched by the rule. Every incorrect
 * example has its {@code <marker>} positions validated via {@code testMarker}
 * and is then passed to {@code testBadSentence}, once for the source side and
 * once for the target side.
 *
 * @param rule the bitext rule whose examples are checked
 * @param lang language used in failure messages and passed to the helper checks
 * @param languageTool tool used for the target side of each example
 * @throws IOException declared for the helper checks invoked here
 */
private void testBitextRule(BitextPatternRule rule, Language lang, JLanguageTool languageTool) throws IOException {
  JLanguageTool srcTool = new JLanguageTool(rule.getSourceLanguage());

  List<StringPair> correctPairs = rule.getCorrectBitextExamples();
  for (StringPair correctPair : correctPairs) {
    String goodSource = correctPair.getSource();
    assertTrue("Got good sentence: '" + goodSource + "'", !cleanSentence(goodSource).trim().isEmpty());
    String goodTarget = correctPair.getTarget();
    assertTrue("Got good sentence: '" + goodTarget + "'", !cleanSentence(goodTarget).trim().isEmpty());
    assertFalse(lang + ": Did not expect error in: " + correctPair + " (Rule: " + rule + ")",
        match(rule, goodSource, goodTarget, srcTool, languageTool));
  }

  // Length of the opening tag; it is subtracted from the closing tag's index below.
  int openTagLength = "<marker>".length();

  List<IncorrectBitextExample> incorrectExamples = rule.getIncorrectBitextExamples();
  for (IncorrectBitextExample incorrect : incorrectExamples) {
    StringPair pair = incorrect.getExample();
    // Newlines and tabs are dropped so that examples can be written with indentation.
    String badSource = pair.getSource().replaceAll("[\\n\\t]+", "");
    String badTarget = pair.getTarget().replaceAll("[\\n\\t]+", "");
    List<String> corrections = incorrect.getCorrections();

    int srcStart = badSource.indexOf("<marker>");
    int srcEnd = badSource.indexOf("</marker>") - openTagLength;
    testMarker(srcStart, srcEnd, rule, lang);

    int trgStart = badTarget.indexOf("<marker>");
    int trgEnd = badTarget.indexOf("</marker>") - openTagLength;
    testMarker(trgStart, trgEnd, rule, lang);

    testBadSentence(badSource, corrections, srcStart, srcEnd, rule.getSrcRule(), lang, srcTool);
    testBadSentence(badTarget, corrections, trgStart, trgEnd, rule.getTrgRule(), lang, languageTool);
  }
}
use of org.languagetool.bitext.StringPair in project languagetool by languagetool-org.
the class CommandLineTools method correctBitext.
/**
 * Applies suggestions to the target side of a bilingual text and prints one
 * line per sentence pair to {@code System.out}.
 * <p>
 * Each pair from {@code reader} is checked with {@code Tools.checkBitext}. If
 * there are matches, the target text is printed after being rewritten by
 * {@code correctTextFromMatches}; otherwise the target text is printed as is.
 * Note: if there is more than one suggestion, always the first one is applied,
 * and the others are ignored silently.
 *
 * @param reader a bitext file reader
 * @param sourceLt initialized source JLanguageTool object
 * @param targetLt initialized target JLanguageTool object
 * @param bRules list of all BitextRules to use
 * @throws IOException declared for the bitext check performed per sentence pair
 */
public static void correctBitext(BitextReader reader, JLanguageTool sourceLt, JLanguageTool targetLt, List<BitextRule> bRules) throws IOException {
  for (StringPair pair : reader) {
    String source = pair.getSource();
    String target = pair.getTarget();
    List<RuleMatch> rawMatches = Tools.checkBitext(source, target, sourceLt, targetLt, bRules);

    List<RuleMatch> adjusted = new ArrayList<>();
    for (RuleMatch raw : rawMatches) {
      // Zero character offset: positions are already relative to the target sentence.
      adjusted.add(targetLt.adjustRuleMatchPos(raw, 0,
          reader.getTargetColumnCount(), reader.getLineCount(), reader.getCurrentLine(), null));
    }

    if (adjusted.isEmpty()) {
      // Nothing to correct: echo the target sentence unchanged.
      System.out.println(target);
      continue;
    }
    System.out.println(correctTextFromMatches(target, adjusted));
  }
}
use of org.languagetool.bitext.StringPair in project languagetool by languagetool-org.
the class BitextPatternRuleHandler method endElement.
/**
 * Handles the closing tag of an element of the bitext rule file.
 * <p>
 * Closing {@code SOURCE}/{@code TARGET} stores the finalized rule for that
 * side; closing {@code SRC_EXAMPLE}/{@code TRG_EXAMPLE} stores the example for
 * that side; closing {@code EXAMPLE} records the source/target pair as a
 * correct or incorrect example and resets the example flags; closing
 * {@code RULE} builds a {@code BitextPatternRule} from the collected parts and
 * adds it to {@code rules}. Any other element is delegated to the superclass.
 *
 * @param namespaceURI passed through to the superclass for unhandled elements
 * @param sName passed through to the superclass for unhandled elements
 * @param qName qualified element name that selects the action
 * @throws SAXException declared by the overridden handler method
 */
@Override
public void endElement(String namespaceURI, String sName, String qName) throws SAXException {
  switch (qName) {
    case SOURCE: {
      srcRule = finalizeRule();
      break;
    }
    case TARGET: {
      trgRule = finalizeRule();
      break;
    }
    case SRC_EXAMPLE: {
      srcExample = setExample();
      break;
    }
    case TRG_EXAMPLE: {
      trgExample = setExample();
      break;
    }
    case EXAMPLE: {
      if (inCorrectExample) {
        correctExamples.add(new StringPair(srcExample.getExample(), trgExample.getExample()));
      } else if (inIncorrectExample) {
        StringPair pair = new StringPair(srcExample.getExample(), trgExample.getExample());
        List<String> fixes = trgExample.getCorrections();
        // Corrections are only attached when the target example carries some.
        if (fixes.isEmpty()) {
          incorrectExamples.add(new IncorrectBitextExample(pair));
        } else {
          incorrectExamples.add(new IncorrectBitextExample(pair, fixes));
        }
      }
      // The example element is finished, whichever kind it was.
      inCorrectExample = false;
      inIncorrectExample = false;
      inErrorTriggerExample = false;
      break;
    }
    case RULE: {
      // The message and suggestion matches are attached to the target rule.
      trgRule.setMessage(message.toString());
      for (Match suggestion : suggestionMatches) {
        trgRule.addSuggestionMatch(suggestion);
      }
      if (phrasePatternTokens.size() <= 1) {
        suggestionMatches.clear();
      }
      BitextPatternRule bitextRule = new BitextPatternRule(srcRule, trgRule);
      bitextRule.setCorrectBitextExamples(correctExamples);
      bitextRule.setIncorrectBitextExamples(incorrectExamples);
      bitextRule.setSourceLanguage(srcLang);
      rules.add(bitextRule);
      break;
    }
    default: {
      super.endElement(namespaceURI, sName, qName);
      break;
    }
  }
}
use of org.languagetool.bitext.StringPair in project languagetool by languagetool-org.
the class CommandLineTools method checkBitext.
/**
 * Checks the bilingual input (bitext) and displays the output (considering the target
 * language) in API (XML) format or in the simple text format.
 *
 * NOTE: the positions returned by the rule matches are adjusted
 * according to the data returned by the reader.
 *
 * @param reader Reader of bitext strings.
 * @param srcLt Source JLanguageTool (used to analyze the text).
 * @param trgLt Target JLanguageTool (used to analyze the text).
 * @param bRules Bilingual rules used in addition to target standard rules.
 * @param isXmlFormat If true, matches are serialized as XML snippets wrapped in the
 *        serializer's XML start and end; otherwise they are printed via {@code printMatches}.
 * @return The number of rules matched on the bitext.
 * @throws IOException if checking a sentence pair fails or the UTF-8 output stream
 *         cannot be created
 * @since 1.0.1
 */
public static int checkBitext(BitextReader reader, JLanguageTool srcLt, JLanguageTool trgLt, List<BitextRule> bRules, boolean isXmlFormat) throws IOException {
  long startTime = System.currentTimeMillis();
  int contextSize = DEFAULT_CONTEXT_SIZE;
  // Running total of matches. Only the count is needed, so the matches themselves
  // are not retained across sentence pairs (keeps memory flat on large inputs).
  int matchCount = 0;
  int sentCount = 0;
  RuleMatchAsXmlSerializer serializer = new RuleMatchAsXmlSerializer();
  PrintStream out = new PrintStream(System.out, true, "UTF-8");
  if (isXmlFormat) {
    out.print(serializer.getXmlStart(null, null));
  }
  for (StringPair srcAndTrg : reader) {
    List<RuleMatch> curMatches = Tools.checkBitext(srcAndTrg.getSource(), srcAndTrg.getTarget(), srcLt, trgLt, bRules);
    List<RuleMatch> fixedMatches = new ArrayList<>();
    for (RuleMatch thisMatch : curMatches) {
      fixedMatches.add(trgLt.adjustRuleMatchPos(thisMatch, reader.getSentencePosition(), reader.getColumnCount(), reader.getLineCount(), reader.getCurrentLine(), null));
    }
    if (!fixedMatches.isEmpty()) {
      if (isXmlFormat) {
        String xml = serializer.ruleMatchesToXmlSnippet(fixedMatches, reader.getCurrentLine(), contextSize);
        out.print(xml);
      } else {
        // printMatches receives the number of matches reported before this sentence.
        printMatches(fixedMatches, matchCount, reader.getCurrentLine(), contextSize);
      }
      matchCount += fixedMatches.size();
    }
    sentCount++;
  }
  // Statistics are emitted before the XML end, as in the original ordering.
  displayTimeStats(startTime, sentCount, isXmlFormat);
  if (isXmlFormat) {
    out.print(serializer.getXmlEnd());
  }
  return matchCount;
}
Aggregations