use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class ReplaceOperationNamesRule method match.
/**
 * Looks for tokens whose lower-cased form (minus a trailing "s" when the word is longer
 * than three characters) is a key of {@code wrongWords} and suggests the mapped replacements.
 * A candidate is dropped when one of the context exceptions on the previous, current or
 * next token applies. For tokens ending in "s" the replacement lemmas are synthesized
 * with the tag pattern {@code NC.P.*}; otherwise the lemmas are suggested unchanged.
 *
 * @param sentence the analyzed sentence to check
 * @return the rule matches found, converted with {@code toRuleMatchArray}
 * @throws RuntimeException if the synthesizer fails with an {@link IOException}
 */
@Override
public final RuleMatch[] match(final AnalyzedSentence sentence) {
  final List<RuleMatch> found = new ArrayList<>();
  final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
  // Start at index 1: every candidate is inspected together with its preceding token.
  for (int i = 1; i < tokens.length; i++) {
    final AnalyzedTokenReadings current = tokens[i];
    final AnalyzedTokenReadings previous = tokens[i - 1];
    final boolean hasNext = i + 1 < tokens.length;

    final String lowered = current.getToken().toLowerCase();
    final boolean endsWithS = lowered.endsWith("s");
    // Lookup key: drop the final "s" only for words longer than three characters.
    final String key = (lowered.length() > 3 && endsWithS)
        ? lowered.substring(0, lowered.length() - 1)
        : lowered;
    if (!wrongWords.containsKey(key)) {
      continue;
    }
    final List<String> lemmas = wrongWords.get(key);

    // exception: "per duplicat"
    if (key.equals("duplicat") && previous.getToken().equalsIgnoreCase("per")) {
      continue;
    }
    // exception: punctuation before and determiner after, e.g. "Assecat el braç del riu"
    if (hasNext && matchPostagRegexp(previous, PUNTUACIO) && matchPostagRegexp(tokens[i + 1], DETERMINANT)) {
      continue;
    }
    // the candidate itself carries the "_GV_" tag
    if (current.hasPosTag("_GV_")) {
      continue;
    }
    // the following token is one of the excluded lemmas or has an excluded POS tag
    if (hasNext) {
      final AnalyzedTokenReadings next = tokens[i + 1];
      if (next.hasLemma("per") || next.hasLemma("com") || next.hasLemma("des") || next.hasLemma("amb")
          || matchPostagRegexp(next, NextToken_POS_Excep)) {
        continue;
      }
    }
    // the preceding token must match PrevToken_POS and must not match PrevToken_POS_Excep
    final boolean previousAccepted =
        matchPostagRegexp(previous, PrevToken_POS) && !matchPostagRegexp(previous, PrevToken_POS_Excep);
    if (!previousAccepted) {
      continue;
    }
    if (lemmas == null) {
      continue;
    }

    final List<String> suggestions = new ArrayList<>();
    if (endsWithS) {
      // synthesize plural
      for (String lemma : lemmas) {
        String[] pluralForms;
        try {
          pluralForms = synth.synthesize(new AnalyzedToken(lemma, "NCMS000", lemma), "NC.P.*");
        } catch (IOException e) {
          throw new RuntimeException("Could not synthesize: " + lemma + " with tag NC.P.*.", e);
        }
        for (String form : pluralForms) {
          suggestions.add(form);
        }
      }
    } else {
      suggestions.addAll(lemmas);
    }
    if (!suggestions.isEmpty()) {
      found.add(createRuleMatch(current, suggestions));
    }
  }
  return toRuleMatchArray(found);
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class SimpleReplaceVerbsRule method match.
/**
 * Detects inflected forms of verbs listed in {@code wrongWords} and suggests the
 * corresponding forms of the replacement verbs.
 * <p>
 * Each token is lower-cased and tried against {@code desinencies_1conj_0} and then
 * {@code desinencies_1conj_1}. From a match an infinitive in "-ar" is rebuilt (undoing the
 * stem spelling changes applied before e/é/i/ï endings). When that infinitive is a key of
 * {@code wrongWords}, the string "cant" plus the ending is tagged and the resulting POS
 * tags are used to synthesize the same forms of every replacement verb.
 *
 * @param sentence the analyzed sentence to check
 * @return the rule matches found, converted with {@code toRuleMatchArray}
 * @throws RuntimeException if tagging or synthesizing fails with an {@link IOException}
 */
@Override
public final RuleMatch[] match(final AnalyzedSentence sentence) {
  final List<RuleMatch> found = new ArrayList<>();
  for (AnalyzedTokenReadings tokenReadings : sentence.getTokensWithoutWhitespace()) {
    if (ignoreTaggedWords && tokenReadings.isTagged()) {
      continue;
    }
    final String lowered = tokenReadings.getToken().toLowerCase(getLocale());

    AnalyzedTokenReadings modelReadings = null;
    String infinitive = null;
    // Two attempts at most: first pattern, then second; stop once a model form was tagged.
    for (int attempt = 0; attempt < 2 && modelReadings == null; attempt++) {
      final Matcher m = (attempt == 0 ? desinencies_1conj_0 : desinencies_1conj_1).matcher(lowered);
      if (!m.matches()) {
        continue;
      }
      String stem = m.group(1);
      String ending = m.group(2);
      final boolean frontVowelEnding = ending.startsWith("e") || ending.startsWith("é")
          || ending.startsWith("i") || ending.startsWith("ï");
      if (frontVowelEnding) {
        // Revert the stem spelling to the one used in the infinitive.
        if (stem.endsWith("c")) {
          stem = stem.substring(0, stem.length() - 1).concat("ç");
        } else if (stem.endsWith("qu")) {
          stem = stem.substring(0, stem.length() - 2).concat("c");
        } else if (stem.endsWith("g")) {
          stem = stem.substring(0, stem.length() - 1).concat("j");
        } else if (stem.endsWith("gü")) {
          stem = stem.substring(0, stem.length() - 2).concat("gu");
        } else if (stem.endsWith("gu")) {
          stem = stem.substring(0, stem.length() - 2).concat("g");
        }
      }
      if (ending.startsWith("ï")) {
        ending = "i" + ending.substring(1);
      }
      infinitive = stem.concat("ar");
      if (!wrongWords.containsKey(infinitive)) {
        continue;
      }
      // Tag "cant" + ending to obtain the POS tags that correspond to this ending.
      final List<String> wordAsArray = Arrays.asList("cant".concat(ending));
      List<AnalyzedTokenReadings> tagged;
      try {
        tagged = tagger.tag(wordAsArray);
      } catch (IOException e) {
        throw new RuntimeException("Could not tag sentence: " + wordAsArray, e);
      }
      if (tagged != null) {
        modelReadings = tagged.get(0);
      }
    }
    if (modelReadings == null) {
      continue;
    }

    // synthesize replacements
    final List<String> suggestions = new ArrayList<>();
    for (String replacement : wrongWords.get(infinitive)) {
      if (replacement.startsWith("(")) {
        // Entries starting with "(" are suggested verbatim.
        suggestions.add(replacement);
        continue;
      }
      // The first part is the verb; any remaining parts are appended to every form.
      final String[] parts = replacement.split(" ");
      final AnalyzedToken verb = new AnalyzedToken(parts[0], "V.*", parts[0]);
      String complement = "";
      for (int j = 1; j < parts.length; j++) {
        complement = complement.concat(" ").concat(parts[j]);
      }
      for (AnalyzedToken reading : modelReadings) {
        String[] forms;
        try {
          forms = synth.synthesize(verb, reading.getPOSTag());
        } catch (IOException e) {
          throw new RuntimeException("Could not synthesize: " + verb + " with tag " + reading.getPOSTag(), e);
        }
        for (String form : forms) {
          final String candidate = parts.length > 1 ? form.concat(complement) : form;
          if (!suggestions.contains(candidate)) {
            suggestions.add(candidate);
          }
        }
      }
    }
    if (!suggestions.isEmpty()) {
      found.add(createRuleMatch(tokenReadings, suggestions));
    }
  }
  return toRuleMatchArray(found);
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class TokenPredicate method apply.
/**
 * Evaluates the expression returned by {@code getDescription()} against the given token.
 * <p>
 * The description is either a bare value (treated as type {@code string}) or
 * {@code type=value}. A value wrapped in single quotes has the quotes removed.
 * Supported types:
 * <ul>
 *   <li>{@code string} - token text equals the value (case handling per {@code caseSensitive})</li>
 *   <li>{@code regex} - token text fully matches the regex (case handling per {@code caseSensitive})</li>
 *   <li>{@code regexCS} - token text fully matches the regex, always case-sensitive</li>
 *   <li>{@code chunk} - any chunk tag of the token fully matches the regex</li>
 *   <li>{@code pos} - any reading has a POS tag containing the value as a substring</li>
 *   <li>{@code posre} / {@code posregex} - any reading has a POS tag fully matching the regex</li>
 * </ul>
 *
 * @param analyzedToken the token to test
 * @return {@code true} if the token satisfies the expression
 * @throws RuntimeException if the description contains more than one {@code =} separated
 *         part pair or names an unsupported expression type
 */
@Override
public boolean apply(ChunkTaggedToken analyzedToken) {
  String[] parts = getDescription().split("=");
  String exprType;
  String exprValue;
  if (parts.length == 1) {
    exprType = "string";
    exprValue = parts[0];
  } else if (parts.length == 2) {
    exprType = parts[0];
    exprValue = parts[1];
  } else {
    throw new RuntimeException("Could not parse expression: " + getDescription());
  }
  // Strip surrounding single quotes. The length check matters: a value consisting of a
  // single "'" both starts and ends with a quote, and substring(1, 0) would throw.
  if (exprValue.length() >= 2 && exprValue.startsWith("'") && exprValue.endsWith("'")) {
    exprValue = exprValue.substring(1, exprValue.length() - 1);
  }
  switch (exprType) {
    case "string":
      if (caseSensitive) {
        return analyzedToken.getToken().equals(exprValue);
      } else {
        return analyzedToken.getToken().equalsIgnoreCase(exprValue);
      }
    case "regex":
      Pattern p1 = caseSensitive ? Pattern.compile(exprValue) : Pattern.compile(exprValue, Pattern.CASE_INSENSITIVE);
      return p1.matcher(analyzedToken.getToken()).matches();
    case "regexCS":
      // always case sensitive, regardless of the caseSensitive flag
      Pattern p2 = Pattern.compile(exprValue);
      return p2.matcher(analyzedToken.getToken()).matches();
    case "chunk":
      Pattern chunkPattern = Pattern.compile(exprValue);
      for (ChunkTag chunkTag : analyzedToken.getChunkTags()) {
        if (chunkPattern.matcher(chunkTag.getChunkTag()).matches()) {
          return true;
        }
      }
      return false;
    case "pos":
      AnalyzedTokenReadings readings = analyzedToken.getReadings();
      if (readings != null) {
        for (AnalyzedToken token : readings) {
          // substring match, not a regex
          if (token.getPOSTag() != null && token.getPOSTag().contains(exprValue)) {
            return true;
          }
        }
      }
      return false;
    case "posre":
    case "posregex":
      Pattern posPattern = Pattern.compile(exprValue);
      AnalyzedTokenReadings readings2 = analyzedToken.getReadings();
      if (readings2 != null) {
        for (AnalyzedToken token : readings2) {
          if (token.getPOSTag() != null && posPattern.matcher(token.getPOSTag()).matches()) {
            return true;
          }
        }
      }
      return false;
    default:
      throw new RuntimeException("Expression type not supported: '" + exprType + "'");
  }
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class MatchState method getTargetPosTag.
/**
 * Format POS tag using parameters already defined in the class.
 * <p>
 * Starts from {@code match.getPosTag()}. When the match has a POS regex, readings whose
 * POS tag fully matches it are collected (from {@code matchedToken} for a static lemma,
 * otherwise from {@code formattedToken}). If a POS tag replacement is also defined, the
 * regex replacement is applied: to the last matching tag for a static lemma, otherwise to
 * every matching tag, joined with {@code |}.
 * <p>
 * When the match has no POS regex, the configured POS tag is returned unchanged.
 *
 * @return Formatted POS tag as String.
 */
// FIXME: gets only the first POS tag that matches, this can be wrong
// on the other hand, many POS tags = too many suggestions?
public final String getTargetPosTag() {
  String targetPosTag = match.getPosTag();
  Pattern pPosRegexMatch = match.getPosRegexMatch();
  String posTagReplace = match.getPosTagReplace();
  // Without a regex nothing can be matched or replaced. Returning here avoids
  // dereferencing a null pattern in the loops below.
  if (pPosRegexMatch == null) {
    return targetPosTag;
  }
  if (match.isStaticLemma()) {
    for (AnalyzedToken analyzedToken : matchedToken) {
      String tst = analyzedToken.getPOSTag();
      if (tst != null && pPosRegexMatch.matcher(tst).matches()) {
        // the last matching reading wins
        targetPosTag = tst;
      }
    }
    if (posTagReplace != null) {
      targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll(posTagReplace);
    }
    return targetPosTag;
  }
  List<String> posTags = new ArrayList<>();
  for (AnalyzedToken analyzedToken : formattedToken) {
    String tst = analyzedToken.getPOSTag();
    if (tst != null && pPosRegexMatch.matcher(tst).matches()) {
      targetPosTag = tst;
      posTags.add(tst);
    }
  }
  if (posTagReplace != null) {
    if (posTags.isEmpty()) {
      // no reading matched: fall back to the configured POS tag
      posTags.add(targetPosTag);
    }
    StringBuilder sb = new StringBuilder();
    int posTagLen = posTags.size();
    int l = 0;
    for (String lPosTag : posTags) {
      l++;
      lPosTag = pPosRegexMatch.matcher(lPosTag).replaceAll(posTagReplace);
      if (match.setsPos()) {
        lPosTag = synthesizer.getPosTagCorrection(lPosTag);
      }
      sb.append(lPosTag);
      if (l < posTagLen) {
        sb.append('|');
      }
    }
    targetPosTag = sb.toString();
  }
  return targetPosTag;
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class MatchState method getNewToken.
/**
 * Builds one new {@link AnalyzedToken} for each of the first {@code numRead} readings of
 * {@code formattedToken} that has a POS tag. Every new token carries the given text and
 * the POS tag from {@code match.getPosTag()}.
 * <p>
 * The lemma is taken from a reading whose POS tag equals the match's POS tag (when that
 * reading has a lemma) and is then kept for later readings. While no lemma has been
 * found, the lemma of the first reading is used.
 *
 * @param numRead number of readings of {@code formattedToken} to inspect
 * @param token text of the tokens to create
 * @return the created tokens, possibly empty
 */
private List<AnalyzedToken> getNewToken(int numRead, String token) {
  final String wantedPosTag = match.getPosTag();
  final List<AnalyzedToken> created = new ArrayList<>();
  String lemma = "";
  for (int idx = 0; idx < numRead; idx++) {
    final AnalyzedToken reading = formattedToken.getAnalyzedToken(idx);
    final String readingPosTag = reading.getPOSTag();
    if (readingPosTag == null) {
      // readings without a POS tag produce no token
      continue;
    }
    if (readingPosTag.equals(wantedPosTag)) {
      final String readingLemma = reading.getLemma();
      if (readingLemma != null) {
        lemma = readingLemma;
      }
    }
    if (StringTools.isEmpty(lemma)) {
      lemma = formattedToken.getAnalyzedToken(0).getLemma();
    }
    final AnalyzedToken fresh = new AnalyzedToken(token, wantedPosTag, lemma);
    fresh.setWhitespaceBefore(formattedToken.isWhitespaceBefore());
    created.add(fresh);
  }
  return created;
}
Aggregations