Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
Example: the tag method of the CatalanTagger class.
@Override
public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) throws IOException {
  final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
  int pos = 0;
  final IStemmer dictLookup = new DictionaryLookup(getDictionary());
  for (String word : sentenceTokens) {
    // This hack allows all rules and dictionary entries to work with
    // the typewriter apostrophe
    boolean containsTypewriterApostrophe = false;
    if (word.length() > 1) {
      if (word.contains("'")) {
        containsTypewriterApostrophe = true;
      }
      word = word.replace("’", "'");
    }
    final List<AnalyzedToken> l = new ArrayList<>();
    final String lowerWord = word.toLowerCase(conversionLocale);
    final boolean isLowercase = word.equals(lowerWord);
    final boolean isMixedCase = StringTools.isMixedCase(word);
    List<AnalyzedToken> taggerTokens = asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(word));
    // normal case:
    addTokens(taggerTokens, l);
    // word with lowercase word tags:
    if (!isLowercase && !isMixedCase) {
      List<AnalyzedToken> lowerTaggerTokens = asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(lowerWord));
      addTokens(lowerTaggerTokens, l);
    }
    // additional tagging with prefixes
    if (l.isEmpty() && !isMixedCase) {
      addTokens(additionalTags(word, dictLookup), l);
    }
    if (l.isEmpty()) {
      l.add(new AnalyzedToken(word, null, null));
    }
    AnalyzedTokenReadings atr = new AnalyzedTokenReadings(l, pos);
    if (containsTypewriterApostrophe) {
      List<ChunkTag> listChunkTags = new ArrayList<>();
      listChunkTags.add(new ChunkTag("containsTypewriterApostrophe"));
      atr.setChunkTags(listChunkTags);
    }
    tokenReadings.add(atr);
    pos += word.length();
  }
  return tokenReadings;
}
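For context, a minimal sketch of driving this tagger directly; the no-argument CatalanTagger constructor is an assumption (some releases take a Language argument or expose a shared instance), and the hand-built token list stands in for the output of a sentence tokenizer. It shows how the containsTypewriterApostrophe chunk tag set above can be read back:

import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.chunking.ChunkTag;
import org.languagetool.tagging.ca.CatalanTagger;

public class CatalanTaggerDemo {
  public static void main(String[] args) throws IOException {
    // Assumption: a no-argument constructor; newer releases may require a
    // Language argument or expose a shared static instance instead.
    CatalanTagger tagger = new CatalanTagger();
    // "l'home" carries the typewriter apostrophe, "cantarà" does not.
    List<AnalyzedTokenReadings> readings = tagger.tag(Arrays.asList("l'home", "cantarà"));
    for (AnalyzedTokenReadings atr : readings) {
      boolean typewriter = false;
      if (atr.getChunkTags() != null) {
        for (ChunkTag tag : atr.getChunkTags()) {
          typewriter |= "containsTypewriterApostrophe".equals(tag.getChunkTag());
        }
      }
      System.out.println(atr.getToken() + " typewriterApostrophe=" + typewriter);
    }
  }
}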
Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
Example: the match method of the ReplaceOperationNamesRule class.
@Override
public final RuleMatch[] match(final AnalyzedSentence sentence) {
  List<RuleMatch> ruleMatches = new ArrayList<>();
  AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
  loop: for (int i = 1; i < tokens.length; i++) {
    List<String> replacementLemmas = null;
    String token = tokens[i].getToken().toLowerCase();
    if (token.length() > 3 && token.endsWith("s")) {
      token = token.substring(0, token.length() - 1);
    }
    if (wrongWords.containsKey(token)) {
      replacementLemmas = wrongWords.get(token);
    } else {
      continue loop;
    }
    // exceptions
    if (token.equals("duplicat") && tokens[i - 1].getToken().equalsIgnoreCase("per")) {
      continue loop;
    }
    // e.g. "Assecat el braç del riu"
    if (i + 1 < tokens.length && matchPostagRegexp(tokens[i - 1], PUNTUACIO) && matchPostagRegexp(tokens[i + 1], DETERMINANT)) {
      continue loop;
    }
    // relevant token
    if (tokens[i].hasPosTag("_GV_")) {
      continue loop;
    }
    // next token
    if (i + 1 < tokens.length && (tokens[i + 1].hasLemma("per") || tokens[i + 1].hasLemma("com") || tokens[i + 1].hasLemma("des") || tokens[i + 1].hasLemma("amb") || matchPostagRegexp(tokens[i + 1], NextToken_POS_Excep))) {
      continue loop;
    }
    // prev token
    if (!matchPostagRegexp(tokens[i - 1], PrevToken_POS) || matchPostagRegexp(tokens[i - 1], PrevToken_POS_Excep)) {
      continue loop;
    }
    if (replacementLemmas != null) {
      List<String> possibleReplacements = new ArrayList<>();
      String[] synthesized = null;
      if (!tokens[i].getToken().toLowerCase().endsWith("s")) {
        possibleReplacements.addAll(replacementLemmas);
      } else {
        // synthesize plural
        for (String replacementLemma : replacementLemmas) {
          try {
            synthesized = synth.synthesize(new AnalyzedToken(replacementLemma, "NCMS000", replacementLemma), "NC.P.*");
          } catch (IOException e) {
            throw new RuntimeException("Could not synthesize: " + replacementLemma + " with tag NC.P.*.", e);
          }
          possibleReplacements.addAll(Arrays.asList(synthesized));
        }
      }
      if (possibleReplacements.size() > 0) {
        RuleMatch potentialRuleMatch = createRuleMatch(tokens[i], possibleReplacements);
        ruleMatches.add(potentialRuleMatch);
      }
    }
  }
  return toRuleMatchArray(ruleMatches);
}
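The plural-synthesis step can be exercised on its own. A minimal sketch mirroring the exact synthesize call the rule makes; the no-argument CatalanSynthesizer constructor is an assumption (newer releases expose a static instance or take a Language argument), and "duplicat" is only an illustrative lemma:

import java.io.IOException;
import java.util.Arrays;

import org.languagetool.AnalyzedToken;
import org.languagetool.synthesis.ca.CatalanSynthesizer;

public class PluralSynthesisDemo {
  public static void main(String[] args) throws IOException {
    // Assumption: a no-argument constructor; newer releases expose a
    // static instance or take a Language argument instead.
    CatalanSynthesizer synth = new CatalanSynthesizer();
    // The same call the rule makes: the replacement lemma is tagged as a
    // masculine singular noun and expanded to the plural noun forms.
    String[] plurals = synth.synthesize(
        new AnalyzedToken("duplicat", "NCMS000", "duplicat"), "NC.P.*");
    System.out.println(Arrays.toString(plurals));  // e.g. [duplicats]
  }
}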
Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
Example: the match method of the SimpleReplaceVerbsRule class.
@Override
public final RuleMatch[] match(final AnalyzedSentence sentence) {
  List<RuleMatch> ruleMatches = new ArrayList<>();
  AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
  for (AnalyzedTokenReadings tokenReadings : tokens) {
    String originalTokenStr = tokenReadings.getToken();
    if (ignoreTaggedWords && tokenReadings.isTagged()) {
      continue;
    }
    String tokenString = originalTokenStr.toLowerCase(getLocale());
    AnalyzedTokenReadings analyzedTokenReadings = null;
    String infinitive = null;
    int i = 0;
    while (i < 2 && analyzedTokenReadings == null) {
      Matcher m;
      if (i == 0) {
        m = desinencies_1conj_0.matcher(tokenString);
      } else {
        m = desinencies_1conj_1.matcher(tokenString);
      }
      if (m.matches()) {
        String lexeme = m.group(1);
        String desinence = m.group(2);
        if (desinence.startsWith("e") || desinence.startsWith("é") || desinence.startsWith("i") || desinence.startsWith("ï")) {
          if (lexeme.endsWith("c")) {
            lexeme = lexeme.substring(0, lexeme.length() - 1).concat("ç");
          } else if (lexeme.endsWith("qu")) {
            lexeme = lexeme.substring(0, lexeme.length() - 2).concat("c");
          } else if (lexeme.endsWith("g")) {
            lexeme = lexeme.substring(0, lexeme.length() - 1).concat("j");
          } else if (lexeme.endsWith("gü")) {
            lexeme = lexeme.substring(0, lexeme.length() - 2).concat("gu");
          } else if (lexeme.endsWith("gu")) {
            lexeme = lexeme.substring(0, lexeme.length() - 2).concat("g");
          }
        }
        if (desinence.startsWith("ï")) {
          desinence = "i" + desinence.substring(1);
        }
        infinitive = lexeme.concat("ar");
        if (wrongWords.containsKey(infinitive)) {
          List<String> wordAsArray = Arrays.asList("cant".concat(desinence));
          List<AnalyzedTokenReadings> analyzedTokenReadingsList = null;
          try {
            analyzedTokenReadingsList = tagger.tag(wordAsArray);
          } catch (IOException e) {
            throw new RuntimeException("Could not tag sentence: " + wordAsArray, e);
          }
          if (analyzedTokenReadingsList != null) {
            analyzedTokenReadings = analyzedTokenReadingsList.get(0);
          }
        }
      }
      i++;
    }
    // synthesize replacements
    if (analyzedTokenReadings != null) {
      List<String> possibleReplacements = new ArrayList<>();
      String[] synthesized = null;
      List<String> replacementInfinitives = wrongWords.get(infinitive);
      for (String replacementInfinitive : replacementInfinitives) {
        if (replacementInfinitive.startsWith("(")) {
          possibleReplacements.add(replacementInfinitive);
        } else {
          // the first part is the verb
          String[] parts = replacementInfinitive.split(" ");
          AnalyzedToken infinitiveAsAnTkn = new AnalyzedToken(parts[0], "V.*", parts[0]);
          for (AnalyzedToken analyzedToken : analyzedTokenReadings) {
            try {
              synthesized = synth.synthesize(infinitiveAsAnTkn, analyzedToken.getPOSTag());
            } catch (IOException e) {
              throw new RuntimeException("Could not synthesize: " + infinitiveAsAnTkn + " with tag " + analyzedToken.getPOSTag(), e);
            }
            for (String s : synthesized) {
              for (int j = 1; j < parts.length; j++) {
                s = s.concat(" ").concat(parts[j]);
              }
              if (!possibleReplacements.contains(s)) {
                possibleReplacements.add(s);
              }
            }
          }
        }
      }
      if (possibleReplacements.size() > 0) {
        RuleMatch potentialRuleMatch = createRuleMatch(tokenReadings, possibleReplacements);
        ruleMatches.add(potentialRuleMatch);
      }
    }
  }
  return toRuleMatchArray(ruleMatches);
}
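The spelling adjustments applied to the lexeme are self-contained and easy to verify in isolation. A minimal sketch with a hypothetical helper name, replaying the same alternations (c→ç, qu→c, g→j, gü→gu, gu→g) that keep the recovered infinitive orthographically correct:

// A hypothetical stand-alone replay of the rule's stem adjustments, applied
// when a desinence starting with e/é/i/ï follows the lexeme.
public class CatalanStemDemo {

  static String adjustLexeme(String lexeme, String desinence) {
    if (desinence.startsWith("e") || desinence.startsWith("é")
        || desinence.startsWith("i") || desinence.startsWith("ï")) {
      if (lexeme.endsWith("c")) {
        return lexeme.substring(0, lexeme.length() - 1) + "ç";
      } else if (lexeme.endsWith("qu")) {
        return lexeme.substring(0, lexeme.length() - 2) + "c";
      } else if (lexeme.endsWith("g")) {
        return lexeme.substring(0, lexeme.length() - 1) + "j";
      } else if (lexeme.endsWith("gü")) {
        return lexeme.substring(0, lexeme.length() - 2) + "gu";
      } else if (lexeme.endsWith("gu")) {
        return lexeme.substring(0, lexeme.length() - 2) + "g";
      }
    }
    return lexeme;
  }

  public static void main(String[] args) {
    // "comencem" splits into lexeme "comenc" + desinence "em"; c becomes ç
    // before e, so the recovered infinitive is "començar".
    System.out.println(adjustLexeme("comenc", "em") + "ar");  // començar
  }
}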
Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
Example: the apply method of the TokenPredicate class.
@Override
public boolean apply(ChunkTaggedToken analyzedToken) {
  String[] parts = getDescription().split("=");
  String exprType;
  String exprValue;
  if (parts.length == 1) {
    exprType = "string";
    exprValue = parts[0];
  } else if (parts.length == 2) {
    exprType = parts[0];
    exprValue = parts[1];
  } else {
    throw new RuntimeException("Could not parse expression: " + getDescription());
  }
  if (exprValue.startsWith("'") && exprValue.endsWith("'")) {
    exprValue = exprValue.substring(1, exprValue.length() - 1);
  }
  switch (exprType) {
    case "string":
      if (caseSensitive) {
        return analyzedToken.getToken().equals(exprValue);
      } else {
        return analyzedToken.getToken().equalsIgnoreCase(exprValue);
      }
    case "regex":
      Pattern p1 = caseSensitive ? Pattern.compile(exprValue) : Pattern.compile(exprValue, Pattern.CASE_INSENSITIVE);
      return p1.matcher(analyzedToken.getToken()).matches();
    case "regexCS":  // case sensitive
      Pattern p2 = Pattern.compile(exprValue);
      return p2.matcher(analyzedToken.getToken()).matches();
    case "chunk":
      Pattern chunkPattern = Pattern.compile(exprValue);
      for (ChunkTag chunkTag : analyzedToken.getChunkTags()) {
        if (chunkPattern.matcher(chunkTag.getChunkTag()).matches()) {
          return true;
        }
      }
      return false;
    case "pos":
      AnalyzedTokenReadings readings = analyzedToken.getReadings();
      if (readings != null) {
        for (AnalyzedToken token : readings) {
          if (token.getPOSTag() != null && token.getPOSTag().contains(exprValue)) {
            return true;
          }
        }
      }
      return false;
    case "posre":
    case "posregex":
      Pattern posPattern = Pattern.compile(exprValue);
      AnalyzedTokenReadings readings2 = analyzedToken.getReadings();
      if (readings2 != null) {
        for (AnalyzedToken token : readings2) {
          if (token.getPOSTag() != null && posPattern.matcher(token.getPOSTag()).matches()) {
            return true;
          }
        }
      }
      return false;
    default:
      throw new RuntimeException("Expression type not supported: '" + exprType + "'");
  }
}
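The predicate accepts a compact expression grammar: a bare token string, or type=value where type is one of string, regex, regexCS, chunk, pos, posre/posregex, with the value optionally wrapped in single quotes. A minimal sketch of just that parsing step, using a hypothetical class with no LanguageTool dependencies:

// A hypothetical stand-alone replay of TokenPredicate's description parsing;
// the matching itself additionally needs a ChunkTaggedToken.
public class TokenExprDemo {

  static String[] parse(String description) {
    String[] parts = description.split("=");
    String exprType;
    String exprValue;
    if (parts.length == 1) {         // bare token string, e.g. "walks"
      exprType = "string";
      exprValue = parts[0];
    } else if (parts.length == 2) {  // typed, e.g. "pos=VBZ"
      exprType = parts[0];
      exprValue = parts[1];
    } else {
      throw new RuntimeException("Could not parse expression: " + description);
    }
    if (exprValue.startsWith("'") && exprValue.endsWith("'")) {
      exprValue = exprValue.substring(1, exprValue.length() - 1);  // strip quotes
    }
    return new String[] { exprType, exprValue };
  }

  public static void main(String[] args) {
    System.out.println(String.join("|", parse("walks")));          // string|walks
    System.out.println(String.join("|", parse("pos=VBZ")));        // pos|VBZ
    System.out.println(String.join("|", parse("regex='wal.*'")));  // regex|wal.*
  }
}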
Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
Example: the match method of the AgreementRule class.
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
  List<RuleMatch> ruleMatches = new ArrayList<>();
  AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace();
  for (int i = 0; i < tokens.length; i++) {
    // defaulting to the first reading
    // TODO: check for all readings
    String posToken = tokens[i].getAnalyzedToken(0).getPOSTag();
    if (posToken != null && posToken.equals(JLanguageTool.SENTENCE_START_TAGNAME)) {
      continue;
    }
    if (tokens[i].isImmunized()) {
      continue;
    }
    AnalyzedTokenReadings tokenReadings = tokens[i];
    boolean relevantPronoun = isRelevantPronoun(tokens, i);
    boolean ignore = couldBeRelativeClause(tokens, i);
    if (i > 0) {
      String prevToken = tokens[i - 1].getToken().toLowerCase();
      if ((tokens[i].getToken().equals("eine") || tokens[i].getToken().equals("einen")) && (prevToken.equals("der") || prevToken.equals("die") || prevToken.equals("das") || prevToken.equals("des") || prevToken.equals("dieses"))) {
        // TODO: do not ignore "der eine Polizist", but check "der Polizist" instead; "auf der einen Seite"
        ignore = true;
      }
    }
    // avoid false alarm on "nichts Gutes" and "alles Gute"
    if (tokenReadings.getToken().equals("nichts") || tokenReadings.getToken().equals("alles") || tokenReadings.getToken().equals("dies")) {
      ignore = true;
    }
    // avoid false alarm on "Art. 1" and "bisherigen Art. 1" (Art. = Artikel):
    boolean detAbbrev = i < tokens.length - 2 && tokens[i + 1].getToken().equals("Art") && tokens[i + 2].getToken().equals(".");
    boolean detAdjAbbrev = i < tokens.length - 3 && tokens[i + 2].getToken().equals("Art") && tokens[i + 3].getToken().equals(".");
    // "einen Hochwasser führenden Fluss", "die Gott zugeschriebenen Eigenschaften":
    boolean followingParticiple = i < tokens.length - 3 && (tokens[i + 2].hasPartialPosTag("PA1") || tokens[i + 2].getToken().matches("zugeschriebenen?|genannten?"));
    if (detAbbrev || detAdjAbbrev || followingParticiple) {
      ignore = true;
    }
    if ((GermanHelper.hasReadingOfType(tokenReadings, POSType.DETERMINER) || relevantPronoun) && !ignore) {
      int tokenPos = i + 1;
      if (tokenPos >= tokens.length) {
        break;
      }
      AnalyzedTokenReadings nextToken = tokens[tokenPos];
      if (isNonPredicativeAdjective(nextToken) || isParticiple(nextToken)) {
        tokenPos = i + 2;
        if (tokenPos >= tokens.length) {
          break;
        }
        if (GermanHelper.hasReadingOfType(tokens[tokenPos], POSType.NOMEN)) {
          // e.g. "deren komisches Geschenke" isn't yet detected as incorrect
          if (i >= 2 && GermanHelper.hasReadingOfType(tokens[i - 2], POSType.ADJEKTIV) && "als".equals(tokens[i - 1].getToken()) && "das".equals(tokens[i].getToken())) {
            // avoid false alarm for e.g. "weniger farbenprächtig als das anderer Papageien"
            continue;
          }
          RuleMatch ruleMatch = checkDetAdjNounAgreement(tokens[i], nextToken, tokens[i + 2]);
          if (ruleMatch != null) {
            ruleMatches.add(ruleMatch);
          }
        }
      } else if (GermanHelper.hasReadingOfType(nextToken, POSType.NOMEN) && !"Herr".equals(nextToken.getToken())) {
        RuleMatch ruleMatch = checkDetNounAgreement(tokens[i], tokens[i + 1]);
        if (ruleMatch != null) {
          ruleMatches.add(ruleMatch);
        }
      }
    }
  }  // for each token
  return toRuleMatchArray(ruleMatches);
}
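This rule is normally not invoked directly; it runs as part of a German JLanguageTool check (its rule id is DE_AGREEMENT). A minimal sketch under that assumption; it requires the German language module on the classpath, and the language-lookup helper name varies across releases:

import java.io.IOException;
import java.util.List;

import org.languagetool.JLanguageTool;
import org.languagetool.Languages;
import org.languagetool.rules.RuleMatch;

public class AgreementDemo {
  public static void main(String[] args) throws IOException {
    // Assumption: older releases use Languages.getLanguageForShortName
    // instead of getLanguageForShortCode.
    JLanguageTool lt = new JLanguageTool(Languages.getLanguageForShortCode("de-DE"));
    // "Der Haus" violates determiner-noun agreement (it should be "Das Haus").
    List<RuleMatch> matches = lt.check("Der Haus ist schön.");
    for (RuleMatch match : matches) {
      System.out.println(match.getRule().getId() + ": "
          + match.getSuggestedReplacements());
    }
  }
}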