use of org.languagetool.synthesis.Synthesizer in project languagetool by languagetool-org.
the class NgramProbabilityRule method getBetterAlternatives.
/**
 * Synthesizes alternative word forms for {@code token} and keeps those whose trigram
 * (previous token, alternative, next token) is at least as probable as {@code p}.
 *
 * @param replacement supplies the POS tag regex used to pick a reading of {@code token} and
 *                    the tag the alternative forms are synthesized for
 * @param prevToken   token preceding {@code token}
 * @param token       token for which alternatives are looked up
 * @param next        token following {@code token}
 * @param p           probability an alternative has to reach (presumably that of the original
 *                    trigram - confirm against the caller)
 * @return an empty {@code Optional} if no reading of {@code token} matches the replacement's
 *         tag regex or the language has no synthesizer; otherwise the (possibly empty) list of
 *         alternatives that are at least as probable as {@code p}
 * @throws IOException if synthesizing the alternative forms fails
 */
private Optional<List<Alternative>> getBetterAlternatives(Replacement replacement, GoogleToken prevToken, GoogleToken token, GoogleToken next, Probability p) throws IOException {
  Optional<AnalyzedToken> reading = getByPosTag(token.getPosTags(), replacement.tagRegex);
  if (!reading.isPresent()) {
    return Optional.empty();
  }
  Synthesizer synthesizer = language.getSynthesizer();
  if (synthesizer == null) {
    return Optional.empty();
  }
  List<Alternative> betterAlternatives = new ArrayList<>();
  // The original trigram is only needed for the debug output and is the same for every candidate.
  List<String> ngram = Arrays.asList(prevToken.token, token.token, next.token);
  String[] forms = synthesizer.synthesize(new AnalyzedToken(token.token, "not_used", reading.get().getLemma()), replacement.alternativeTag);
  for (String alternativeToken : forms) {
    // Compare with the token's surface string: String.equals() against the GoogleToken object
    // itself is always false, which would let the original form count as its own alternative.
    if (alternativeToken.equals(token.token)) {
      continue;
    }
    List<String> alternativeNgram = Arrays.asList(prevToken.token, alternativeToken, next.token);
    Probability alternativeProbability = lm.getPseudoProbability(alternativeNgram);
    if (alternativeProbability.getProb() >= p.getProb()) {
      // TODO: consider a factor?
      debug("More probable alternative to '%s': %s\n", ngram, alternativeNgram);
      betterAlternatives.add(new Alternative(alternativeToken, alternativeProbability));
    } else {
      debug("Less probable alternative to '%s': %s\n", ngram, alternativeNgram);
    }
  }
  return Optional.of(betterAlternatives);
}
use of org.languagetool.synthesis.Synthesizer in project languagetool by languagetool-org.
the class PatternRuleQueryBuilder method getTermQueryOrNull.
/**
 * Builds the mandatory ({@code MUST}) Lucene clause for the token/term part of a pattern token.
 *
 * @param patternToken the pattern token the clause is built for
 * @param termStr      the term string of the pattern token, may be {@code null}
 * @return the clause, or {@code null} when {@code termStr} is null/empty, when the token is
 *         negated or optional (minimum occurrence 0), or when an inflected non-regex token
 *         cannot be expanded because the language has no synthesizer
 * @throws UnsupportedPatternRuleException propagated from the query-building helpers
 */
@Nullable
private BooleanClause getTermQueryOrNull(PatternToken patternToken, String termStr) throws UnsupportedPatternRuleException {
  if (termStr == null || termStr.isEmpty()) {
    return null;
  }
  Term plainTerm = getTermQueryTerm(patternToken, termStr);
  if (patternToken.getNegation() || patternToken.getMinOccurrence() == 0) {
    // we need to ignore this - negation, if any, must happen at the same position
    return null;
  }

  if (patternToken.isInflected()) {
    if (patternToken.isRegularExpression()) {
      // inflected + regex: match the regex against the lemma field
      Term lemmaTerm = getQueryTerm(patternToken, LEMMA_PREFIX + "(", simplifyRegex(termStr), ")");
      return new BooleanClause(getRegexQuery(lemmaTerm, termStr, patternToken), BooleanClause.Occur.MUST);
    }
    // inflected + literal: a plain TermQuery on the lemma field would be simpler, but leads to
    // problems with e.g. German rules ZEITLICH_SYNCHRON and GEWISSEN_SUBST, so all synthesized
    // forms of the term are searched for instead.
    Synthesizer synthesizer = language.getSynthesizer();
    if (synthesizer == null) {
      return null;
    }
    try {
      String[] forms = synthesizer.synthesize(new AnalyzedToken(termStr, null, termStr), ".*", true);
      Query formsQuery = forms.length == 0
          ? new TermQuery(plainTerm)
          : new RegexpQuery(getTermQueryTerm(patternToken, StringUtils.join(forms, "|")));
      return new BooleanClause(formsQuery, BooleanClause.Occur.MUST);
    } catch (IOException e) {
      throw new RuntimeException("Could not build Lucene query for '" + patternToken + "' and '" + termStr + "'", e);
    }
  }

  // not inflected: regex or exact term on the token itself
  Query tokenQuery = patternToken.isRegularExpression()
      ? getRegexQuery(plainTerm, termStr, patternToken)
      : new TermQuery(plainTerm);
  return new BooleanClause(tokenQuery, BooleanClause.Occur.MUST);
}
use of org.languagetool.synthesis.Synthesizer in project languagetool by languagetool-org.
the class TokenInflectionAgreementRule method match.
/**
 * Reports adjective + noun pairs whose inflection sets, as computed by
 * {@code InflectionHelper}, have no element in common.
 *
 * <p>The sentence is scanned token by token (whitespace excluded, index 0 skipped). A token
 * whose readings are all adjectives and that is followed by a noun-tagged token is remembered;
 * on the next iteration the noun readings are collected and both inflection sets are compared.
 * Pairs accepted by {@code TokenInflectionExceptionHelper} are not reported. Each reported match
 * carries synthesized replacement suggestions where any could be generated.
 *
 * @param text the analyzed sentence to check
 * @return the matches found, converted with {@code toRuleMatchArray}
 * @throws RuntimeException wrapping an {@code IOException} raised while synthesizing suggestions
 */
@Override
public final RuleMatch[] match(AnalyzedSentence text) {
  List<RuleMatch> ruleMatches = new ArrayList<>();
  AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
  // Adjective readings of the previous token; empty while no adjective is pending.
  List<AnalyzedToken> adjTokenReadings = new ArrayList<>();
  AnalyzedTokenReadings adjAnalyzedTokenReadings = null;

  for (int i = 1; i < tokens.length; i++) {
    AnalyzedTokenReadings tokenReadings = tokens[i];
    String posTag0 = tokenReadings.getAnalyzedToken(0).getPOSTag();
    if (posTag0 == null) {
      adjTokenReadings.clear();
      continue;
    }

    if (adjTokenReadings.isEmpty()) {
      // --- no adjective pending: decide whether this token starts an adjective + noun pair ---

      // no need to start checking on last token or if no noun
      if (i == tokens.length - 1) {
        continue;
      }
      // TODO: nv still can be wrong if :np/:ns is present so it's not much gain for lots of work
      // TODO: turn "&pron" back on when we can handle pron
      if (PosTagHelper.hasPosTagPart(tokens[i], ":nv")
          || PosTagHelper.hasPosTagPart(tokens[i], "&pron")
          || PosTagHelper.hasPosTagPart(tokens[i], "<")) {
        continue;
      }
      if (!PosTagHelper.hasPosTagPart(tokens[i + 1], "noun:")
          || PosTagHelper.hasPosTagPart(tokens[i + 1], ":nv")
          || PosTagHelper.hasPosTagPart(tokens[i + 1], "&pron")
          || PosTagHelper.hasPosTagPart(tokens[i + 1], "<")) {
        continue;
      }
      if (LemmaHelper.hasLemma(tokens[i], Arrays.asList("червоний", "правий", "місцевий", "найсильніший", "найкращі"), ":p:")
          || LemmaHelper.hasLemma(tokens[i], Arrays.asList("новенький", "головний", "вибраний", "більший", "побачений", "подібний"), ":n:")
          || LemmaHelper.hasLemma(tokens[i], Arrays.asList("державний"), ":f:")) {
        adjTokenReadings.clear();
        // NOTE(review): this break leaves the outer token loop, so nothing after such an
        // adjective is checked in this sentence - confirm this is intended and not a `continue`.
        break;
      }

      // Remember the token only if every tagged reading is an adjective.
      for (AnalyzedToken token : tokenReadings) {
        String adjPosTag = token.getPOSTag();
        if (adjPosTag == null) {
          // untagged readings can occur (original note: words with special characters such as
          // a combining accent)
          continue;
        }
        if (adjPosTag.startsWith("adj")) {
          adjTokenReadings.add(token);
          adjAnalyzedTokenReadings = tokenReadings;
        } else {
          adjTokenReadings.clear();
          break;
        }
      }
      continue;
    }

    // --- adjective pending: collect the noun readings of the current token ---
    List<AnalyzedToken> slaveTokenReadings = new ArrayList<>();
    for (AnalyzedToken token : tokenReadings) {
      String nounPosTag = token.getPOSTag();
      if (nounPosTag == null) {
        // untagged readings can occur, see above
        continue;
      }
      if (nounPosTag.startsWith("noun") && !nounPosTag.contains(NO_VIDMINOK_SUBSTR)) {
        slaveTokenReadings.add(token);
      } else if (nounPosTag.equals(JLanguageTool.SENTENCE_END_TAGNAME) || nounPosTag.equals(JLanguageTool.PARAGRAPH_END_TAGNAME)) {
        continue;
      } else {
        // any other reading disqualifies the token as a noun candidate
        slaveTokenReadings.clear();
        break;
      }
    }
    if (slaveTokenReadings.isEmpty()) {
      adjTokenReadings.clear();
      continue;
    }

    if (DEBUG) {
      System.err.println(MessageFormat.format("=== Checking:\n\t{0}\n\t{1}", adjTokenReadings, slaveTokenReadings));
    }

    // perform the check
    List<InflectionHelper.Inflection> masterInflections = InflectionHelper.getAdjInflections(adjTokenReadings);
    List<InflectionHelper.Inflection> slaveInflections = InflectionHelper.getNounInflections(slaveTokenReadings);
    if (Collections.disjoint(masterInflections, slaveInflections)) {
      if (TokenInflectionExceptionHelper.isException(tokens, i, masterInflections, slaveInflections, adjTokenReadings, slaveTokenReadings)) {
        adjTokenReadings.clear();
        continue;
      }
      if (DEBUG) {
        // {1} was missing from the pattern, so the noun side was never printed
        System.err.println(MessageFormat.format("=== Found:\n\t{0}\n\t{1}", adjAnalyzedTokenReadings.getToken() + ": " + masterInflections + " // " + adjAnalyzedTokenReadings, slaveTokenReadings.get(0).getToken() + ": " + slaveInflections + " // " + slaveTokenReadings));
      }

      String msg = String.format("Потенційна помилка: прикметник не узгоджений з іменником: \"%s\": [%s] і \"%s\": [%s]", adjTokenReadings.get(0).getToken(), formatInflections(masterInflections, true), slaveTokenReadings.get(0).getToken(), formatInflections(slaveInflections, false));
      if (PosTagHelper.hasPosTagPart(adjTokenReadings, ":m:v_rod") && tokens[i].getToken().matches(".*[ую]") && PosTagHelper.hasPosTag(slaveTokenReadings, "noun.*:m:v_dav.*")) {
        msg += ". Можливо вжито невнормований родовий відмінок ч.р. з закінченням -у/-ю замість -а/-я (така тенденція є в сучасній мові)?";
      }

      RuleMatch potentialRuleMatch = new RuleMatch(this, adjAnalyzedTokenReadings.getStartPos(), tokenReadings.getEndPos(), msg, getShort());
      List<String> suggestions = synthesizeAgreementSuggestions(adjAnalyzedTokenReadings, tokenReadings, adjTokenReadings, slaveTokenReadings, masterInflections, slaveInflections);
      if (!suggestions.isEmpty()) {
        potentialRuleMatch.setSuggestedReplacements(suggestions);
      }
      ruleMatches.add(potentialRuleMatch);
    }
    adjTokenReadings.clear();
  }
  return toRuleMatchArray(ruleMatches);
}

/**
 * Builds the "adjective noun" replacement suggestions for a non-agreeing pair: first the noun
 * re-inflected to each adjective inflection, then the adjective re-inflected to each noun
 * inflection. Duplicates are dropped; first-seen order is kept.
 */
private List<String> synthesizeAgreementSuggestions(AnalyzedTokenReadings adjAnalyzedTokenReadings, AnalyzedTokenReadings nounAnalyzedTokenReadings, List<AnalyzedToken> adjTokenReadings, List<AnalyzedToken> slaveTokenReadings, List<InflectionHelper.Inflection> masterInflections, List<InflectionHelper.Inflection> slaveInflections) {
  Synthesizer ukrainianSynthesizer = ukrainian.getSynthesizer();
  List<String> suggestions = new ArrayList<>();
  try {
    // keep the adjective as written, re-inflect the noun
    for (InflectionHelper.Inflection adjInflection : masterInflections) {
      String genderTag = ":" + adjInflection.gender + ":";
      String vidmTag = adjInflection._case;
      // skip the "v_kly" case; only use genders the noun has a reading for (or gender "p")
      if (!adjInflection._case.equals("v_kly") && (adjInflection.gender.equals("p") || PosTagHelper.hasPosTagPart(slaveTokenReadings, genderTag))) {
        for (AnalyzedToken nounToken : slaveTokenReadings) {
          if (adjInflection.animMatters() && !nounToken.getPOSTag().contains(":" + adjInflection.animTag)) {
            continue;
          }
          String newNounPosTag = nounToken.getPOSTag().replaceFirst(":.:v_...", genderTag + vidmTag);
          String[] synthesized = ukrainianSynthesizer.synthesize(nounToken, newNounPosTag, false);
          for (String s : synthesized) {
            String suggestion = adjAnalyzedTokenReadings.getToken() + " " + s;
            if (!suggestions.contains(suggestion)) {
              suggestions.add(suggestion);
            }
          }
        }
      }
    }
    // keep the noun as written, re-inflect the adjective
    for (InflectionHelper.Inflection nounInflection : slaveInflections) {
      String genderTag = ":" + nounInflection.gender + ":";
      String vidmTag = nounInflection._case;
      if (nounInflection.animMatters()) {
        vidmTag += ":r" + nounInflection.animTag;
      }
      for (AnalyzedToken adjToken : adjTokenReadings) {
        String newAdjTag = adjToken.getPOSTag().replaceFirst(":.:v_...(:r(in)?anim)?", genderTag + vidmTag);
        String[] synthesized = ukrainianSynthesizer.synthesize(adjToken, newAdjTag, false);
        for (String s : synthesized) {
          String suggestion = s + " " + nounAnalyzedTokenReadings.getToken();
          if (!suggestions.contains(suggestion)) {
            suggestions.add(suggestion);
          }
        }
      }
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  return suggestions;
}
use of org.languagetool.synthesis.Synthesizer in project languagetool by languagetool-org.
the class TokenAgreementRule method createRuleMatch.
/**
 * Creates the rule match for a token whose case does not fit the case(s) required by the
 * preceding governing token (the message calls it a preposition).
 *
 * <p>Suggestions are synthesized by replacing the case part of each reading's POS tag with the
 * required cases. Two special situations extend the message and the suggestions: the token
 * "їх" (forms of the possessive "їхній" are offered) and the governing token "о" followed by
 * an animate noun in nominative (the vocative form is offered).
 *
 * @param tokenReadings    readings of the token with the unexpected case; the match covers it
 * @param reqTokenReadings readings of the token that requires the case; only its text is used
 * @param posTagsToFind    case tags that would have been accepted
 * @return the rule match with message and suggested replacements set
 * @throws RuntimeException wrapping an {@code IOException} raised by the synthesizer
 */
private RuleMatch createRuleMatch(AnalyzedTokenReadings tokenReadings, AnalyzedTokenReadings reqTokenReadings, List<String> posTagsToFind) {
  String tokenString = tokenReadings.getToken();
  Synthesizer ukrainianSynthesizer = ukrainian.getSynthesizer();
  List<String> suggestions = new ArrayList<>();
  String requiredPostTagsRegEx = ":(" + String.join("|", posTagsToFind) + ")";

  // Re-inflect every tagged reading into the required case(s).
  for (AnalyzedToken analyzedToken : tokenReadings.getReadings()) {
    String oldPosTag = analyzedToken.getPOSTag();
    if (oldPosTag == null) {
      continue;
    }
    String requiredPostTagsRegExToApply = requiredPostTagsRegEx;
    Matcher matcher = REQ_ANIM_INANIM_PATTERN.matcher(oldPosTag);
    if (matcher.find()) {
      // keep the animacy marker the reading already has
      requiredPostTagsRegExToApply += matcher.group(0);
    } else {
      requiredPostTagsRegExToApply += "(?:" + reqAnimInanimRegex + ")?";
    }
    String posTag = oldPosTag.replaceFirst(":v_[a-z]+", requiredPostTagsRegExToApply);
    try {
      String[] synthesized = ukrainianSynthesizer.synthesize(analyzedToken, posTag, true);
      suggestions.addAll(Arrays.asList(synthesized));
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
  if (!suggestions.isEmpty()) {
    // remove duplicates, keeping first-seen order
    suggestions = new ArrayList<>(new LinkedHashSet<>(suggestions));
  }

  // Human-readable names of the required cases ...
  List<String> reqVidminkyNames = new ArrayList<>();
  for (String vidm : posTagsToFind) {
    reqVidminkyNames.add(PosTagHelper.VIDMINKY_MAP.get(vidm));
  }
  // ... and of the cases actually found on the token.
  List<String> foundVidminkyNames = new ArrayList<>();
  for (AnalyzedToken token : tokenReadings) {
    String posTag2 = token.getPOSTag();
    if (posTag2 != null && posTag2.contains(VIDMINOK_SUBSTR)) {
      String vidmName = PosTagHelper.VIDMINKY_MAP.get(posTag2.replaceFirst("^.*" + VIDMINOK_REGEX + ".*$", "$1"));
      if (foundVidminkyNames.contains(vidmName)) {
        if (posTag2.contains(":p:")) {
          // same case seen again on a ":p:" reading: list it separately with the " (мн.)" suffix
          vidmName = vidmName + " (мн.)";
          foundVidminkyNames.add(vidmName);
        }
        // else skip dup
      } else {
        foundVidminkyNames.add(vidmName);
      }
    }
  }

  String msg = MessageFormat.format("Прийменник «{0}» вимагає іншого відмінка: {1}, а знайдено: {2}", reqTokenReadings.getToken(), String.join(", ", reqVidminkyNames), String.join(", ", foundVidminkyNames));

  // requiredPostTagsRegEx is built by concatenation above and can never be null, so only the
  // token text needs checking here.
  if (tokenString.equals("їх")) {
    msg += ". Можливо тут потрібно присвійний займенник «їхній»?";
    try {
      String newYihPostag = "adj:p" + requiredPostTagsRegEx + ".*";
      String[] synthesized = ukrainianSynthesizer.synthesize(new AnalyzedToken("їхній", "adj:m:v_naz:&pron:pos", "їхній"), newYihPostag, true);
      suggestions.addAll(Arrays.asList(synthesized));
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  } else if (reqTokenReadings.getToken().equalsIgnoreCase("о")) {
    for (AnalyzedToken token : tokenReadings.getReadings()) {
      String posTag2 = token.getPOSTag();
      // Readings may carry no POS tag (the loops above guard for that too); skip them instead
      // of failing with a NullPointerException.
      if (posTag2 != null && posTag2.matches("noun:anim.*:v_naz.*")) {
        msg += ". Можливо тут «о» — це вигук і потрібно кличний відмінок?";
        try {
          String newPostag = posTag2.replace("v_naz", "v_kly");
          String[] synthesized = ukrainianSynthesizer.synthesize(token, newPostag, false);
          for (String string : synthesized) {
            if (!string.equals(token.getToken()) && !suggestions.contains(string)) {
              suggestions.add(string);
            }
          }
          // only the first matching reading is used
          break;
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      }
    }
  }

  RuleMatch potentialRuleMatch = new RuleMatch(this, tokenReadings.getStartPos(), tokenReadings.getEndPos(), msg, getShort());
  potentialRuleMatch.setSuggestedReplacements(suggestions);
  return potentialRuleMatch;
}
Aggregations