Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
The class AccentuationCheckRule, method match.
@Override
public RuleMatch[] match(final AnalyzedSentence sentence) {
  final List<RuleMatch> ruleMatches = new ArrayList<>();
  final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
  for (int i = 1; i < tokens.length; i++) {
    // ignoring token 0, i.e. SENT_START
    final String token;
    if (i == 1) {
      token = tokens[i].getToken().toLowerCase();
    } else {
      token = tokens[i].getToken();
    }
    final String prevToken = tokens[i - 1].getToken();
    String prevPrevToken = "";
    if (i > 2) {
      prevPrevToken = tokens[i - 2].getToken();
    }
    String nextToken = "";
    if (i < tokens.length - 1) {
      nextToken = tokens[i + 1].getToken();
    }
    String nextNextToken = "";
    if (i < tokens.length - 2) {
      nextNextToken = tokens[i + 2].getToken();
    }
    boolean isRelevantWord = false;
    boolean isRelevantWord2 = false;
    if (StringTools.isEmpty(token)) {
      continue;
    }
    if (relevantWords.containsKey(token)) {
      isRelevantWord = true;
    }
    if (relevantWords2.containsKey(token)) {
      isRelevantWord2 = true;
    }
    if (!isRelevantWord && !isRelevantWord2) {
      continue;
    }
    // verb preceded by a weak (clitic) pronoun ("verb amb pronom feble davant")
    if (matchPostagRegexp(tokens[i - 1], PRONOM_FEBLE) && !prevToken.startsWith("'") && !prevToken.startsWith("-")) {
      continue;
    }
    String replacement = null;
    final Matcher mPreposicioDE = PREPOSICIO_DE.matcher(nextToken);
    final Matcher mExcepcionsDE = EXCEPCIONS_DARRERE_DE.matcher(nextNextToken);
    final Matcher mArticleELMS = ARTICLE_EL_MS.matcher(prevToken);
    final Matcher mArticleELFS = ARTICLE_EL_FS.matcher(prevToken);
    final Matcher mArticleELMP = ARTICLE_EL_MP.matcher(prevToken);
    final Matcher mArticleELFP = ARTICLE_EL_FP.matcher(prevToken);
    // VERB WITHOUT ACCENT -> NOUN WITH ACCENT
    if (isRelevantWord && !matchPostagRegexp(tokens[i], GN) && !matchPostagRegexp(tokens[i], LOCUCIONS)) {
      // e.g. "amb renuncies" (correct: renúncies): preposition + noun
      if (tokens[i - 1].hasPosTag("SPS00") && !tokens[i - 1].hasPosTag("RG")
          && !matchPostagRegexp(tokens[i - 1], DETERMINANT)
          && !matchPostagRegexp(tokens[i], INFINITIU)) {
        replacement = relevantWords.get(token).getToken();
      } else if (i > 2 && tokens[i - 2].hasPosTag("SPS00") && !tokens[i - 2].hasPosTag("RG")
          && !matchPostagRegexp(tokens[i - 2], DETERMINANT)
          && (matchPostagRegexp(tokens[i - 1], DETERMINANT) || mArticleELMS.matches()
              || mArticleELFS.matches() || mArticleELMP.matches() || mArticleELFP.matches())
          && !matchPostagRegexp(tokens[i], INFINITIU)) {
        replacement = relevantWords.get(token).getToken();
      } else if ((matchPostagRegexp(tokens[i - 1], DETERMINANT_MS) && matchPostagRegexp(relevantWords.get(token), NOM_MS) && !token.equals("cantar"))
          || (matchPostagRegexp(tokens[i - 1], DETERMINANT_MP) && matchPostagRegexp(relevantWords.get(token), NOM_MP))
          || (matchPostagRegexp(tokens[i - 1], DETERMINANT_FS) && matchPostagRegexp(relevantWords.get(token), NOM_FS)
              && !token.equals("venia") && !token.equals("tenia") && !token.equals("continua") && !token.equals("genera") && !token.equals("faria"))
          || (matchPostagRegexp(tokens[i - 1], DETERMINANT_FP) && matchPostagRegexp(relevantWords.get(token), NOM_FP))) {
        // e.g. "aquestes renuncies" (correct: renúncies): determiner + noun
        replacement = relevantWords.get(token).getToken();
      } else if (i > 2 && matchPostagRegexp(tokens[i - 2], VERB_CONJUGAT)
          && ((matchPostagRegexp(tokens[i - 1], DETERMINANT_MS) && matchPostagRegexp(relevantWords.get(token), NOM_MS))
              || (matchPostagRegexp(tokens[i - 1], DETERMINANT_MP) && matchPostagRegexp(relevantWords.get(token), NOM_MP))
              || (matchPostagRegexp(tokens[i - 1], DETERMINANT_FS) && matchPostagRegexp(relevantWords.get(token), NOM_FS))
              || (matchPostagRegexp(tokens[i - 1], DETERMINANT_FP) && matchPostagRegexp(relevantWords.get(token), NOM_FP)))) {
        // e.g. "fumaré una faria" (correct: fària): conjugated verb + determiner + noun
        replacement = relevantWords.get(token).getToken();
      } else if (i > 2 && matchPostagRegexp(tokens[i - 2], VERB_CONJUGAT)
          && ((mArticleELMS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MS))
              || (mArticleELMP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MP))
              || (mArticleELFS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FS))
              || (mArticleELFP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FP)))) {
        // e.g. "fem la copia" (correct: còpia): conjugated verb + article + noun
        replacement = relevantWords.get(token).getToken();
      } else if (!matchPostagRegexp(tokens[i], PARTICIPI_MS)
          && !token.equals("venia") && !token.equals("venies") && !token.equals("tenia") && !token.equals("tenies")
          && !token.equals("faria") && !token.equals("faries") && !token.equals("espero")
          && !token.equals("continua") && !token.equals("continues") && !token.equals("cantar")
          && !prevToken.equals("que") && !prevToken.equals("qui") && !prevToken.equals("què")
          && mPreposicioDE.matches() && !matchPostagRegexp(tokens[i - 1], NOT_IN_PREV_TOKEN)
          && !matchPostagRegexp(tokens[i + 1], LOCUCIONS) && (i < tokens.length - 2)
          && !matchPostagRegexp(tokens[i + 2], INFINITIU) && !mExcepcionsDE.matches()
          && !tokens[i - 1].hasPosTag("RG")) {
        // e.g. "circumstancies d'una altra classe" (correct: circumstàncies): noun followed by "de"
        replacement = relevantWords.get(token).getToken();
      } else if (!token.equals("venia") && !token.equals("venies") && !token.equals("tenia") && !token.equals("tenies")
          && !token.equals("faria") && !token.equals("faries") && !token.equals("continua") && !token.equals("continues")
          && !token.equals("cantar") && !token.equals("diferencia") && !token.equals("diferencies")
          && !token.equals("distancia") && !token.equals("distancies")
          && ((mArticleELMS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MS))
              || (mArticleELFS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FS))
              || (mArticleELMP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MP))
              || (mArticleELFP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FP)))
          && mPreposicioDE.matches()) {
        // e.g. "la renuncia del president" (correct: renúncia): article + noun + "de"
        replacement = relevantWords.get(token).getToken();
      } else if (!token.equals("pronuncia") && !token.equals("espero") && !token.equals("pronuncies")
          && !token.equals("venia") && !token.equals("venies") && !token.equals("tenia") && !token.equals("tenies")
          && !token.equals("continua") && !token.equals("continues") && !token.equals("faria") && !token.equals("faries")
          && !token.equals("genera") && !token.equals("figuri") && (i < tokens.length - 1)
          && ((matchPostagRegexp(relevantWords.get(token), NOM_MS) && matchPostagRegexp(tokens[i + 1], ADJECTIU_MS))
              || (matchPostagRegexp(relevantWords.get(token), NOM_FS) && matchPostagRegexp(tokens[i + 1], ADJECTIU_FS))
              || (matchPostagRegexp(relevantWords.get(token), NOM_MP) && matchPostagRegexp(tokens[i + 1], ADJECTIU_MP))
              || (matchPostagRegexp(relevantWords.get(token), NOM_FP) && matchPostagRegexp(tokens[i + 1], ADJECTIU_FP)))) {
        // e.g. "circumstancies extraordinàries" (correct: circumstàncies): noun + adjective
        replacement = relevantWords.get(token).getToken();
      } else if ((matchPostagRegexp(relevantWords.get(token), NOM_MS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MS)
              && !matchPostagRegexp(tokens[i], VERB_3S) && !matchPostagRegexp(tokens[i], GRUP_VERBAL))
          || (matchPostagRegexp(relevantWords.get(token), NOM_FS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FS)
              && !matchPostagRegexp(tokens[i], VERB_3S))
          || (matchPostagRegexp(relevantWords.get(token), NOM_MP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MP))
          || (matchPostagRegexp(relevantWords.get(token), NOM_FP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FP))) {
        // e.g. "les seves contraries" (correct: contràries): adjective + noun
        replacement = relevantWords.get(token).getToken();
      } else if (nextToken.equals("que") && i > 2
          && ((matchPostagRegexp(relevantWords.get(token), NOM_MS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MS) && matchPostagRegexp(tokens[i - 2], DETERMINANT_MS))
              || (matchPostagRegexp(relevantWords.get(token), NOM_FS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FS) && matchPostagRegexp(tokens[i - 2], DETERMINANT_FS))
              || (matchPostagRegexp(relevantWords.get(token), NOM_MP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MP) && matchPostagRegexp(tokens[i - 2], DETERMINANT_MP))
              || (matchPostagRegexp(relevantWords.get(token), NOM_FP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FP) && matchPostagRegexp(tokens[i - 2], DETERMINANT_FP)))) {
        // e.g. "una nova formula que" (correct: fórmula): determiner + adjective + noun + "que"
        replacement = relevantWords.get(token).getToken();
      } else if (nextToken.equals("que")
          && ((mArticleELMS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MS))
              || (mArticleELFS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FS))
              || (mArticleELMP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MP))
              || (mArticleELFP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FP)))) {
        // e.g. "les circumstancies que ens envolten" (correct: circumstàncies): article + noun + "que"
        replacement = relevantWords.get(token).getToken();
      }
      // e.g. "de positiva influencia" (correct: influència): preposition + adjective + noun
      if (!token.equals("pronuncia") && !token.equals("espero") && !token.equals("pronuncies")
          && !token.equals("venia") && !token.equals("venies") && !token.equals("tenia") && !token.equals("tenies")
          && !token.equals("continua") && !token.equals("continues") && !token.equals("faria") && !token.equals("faries")
          && !token.equals("genera") && !token.equals("figuri") && i > 2
          && tokens[i - 2].hasPosTag("SPS00") && !tokens[i - 2].hasPosTag("RG")
          && ((matchPostagRegexp(relevantWords.get(token), NOM_MS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MS))
              || (matchPostagRegexp(relevantWords.get(token), NOM_FS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FS))
              || (matchPostagRegexp(relevantWords.get(token), NOM_MP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MP))
              || (matchPostagRegexp(relevantWords.get(token), NOM_FP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FP)))) {
        replacement = relevantWords.get(token).getToken();
      }
    }
    // VERB WITHOUT ACCENT -> ADJECTIVE WITH ACCENT
    if (isRelevantWord2 && !matchPostagRegexp(tokens[i], GN) && !matchPostagRegexp(tokens[i], LOCUCIONS)) {
      // e.g. "de manera obvia" (correct: òbvia), "circumstàncies extraordinaries" (correct: extraordinàries)
      if ((matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MS) && matchPostagRegexp(tokens[i - 1], NOM_MS)
              && !tokens[i - 1].hasPosTag("_GN_FS") && matchPostagRegexp(tokens[i], VERB_CONJUGAT)
              && !matchPostagRegexp(tokens[i], VERB_3S))
          || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FS) && prevPrevToken.equalsIgnoreCase("de")
              && (prevToken.equals("manera") || prevToken.equals("forma")))
          || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MP) && matchPostagRegexp(tokens[i - 1], NOM_MP))
          || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FP) && matchPostagRegexp(tokens[i - 1], NOM_FP))) {
        replacement = relevantWords2.get(token).getToken();
      } else if ((i < tokens.length - 1) && !prevToken.equals("que") && !matchPostagRegexp(tokens[i - 1], NOT_IN_PREV_TOKEN)
          && ((matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MS) && matchPostagRegexp(tokens[i + 1], NOM_MS) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_MS))
              || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FS) && matchPostagRegexp(tokens[i + 1], NOM_FS) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_FS))
              || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MP) && matchPostagRegexp(tokens[i + 1], NOM_MP) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_MP))
              || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FP) && matchPostagRegexp(tokens[i + 1], NOM_FP) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_FP)))) {
        // e.g. "de continua disputa" (correct: contínua): adjective + noun
        replacement = relevantWords2.get(token).getToken();
      } else if ((i < tokens.length - 1)
          && ((matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MS) && matchPostagRegexp(tokens[i + 1], NOM_MS) && mArticleELMS.matches())
              || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FS) && matchPostagRegexp(tokens[i + 1], NOM_FS) && mArticleELFS.matches())
              || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MP) && matchPostagRegexp(tokens[i + 1], NOM_MP) && mArticleELMP.matches())
              || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FP) && matchPostagRegexp(tokens[i + 1], NOM_FP) && mArticleELFP.matches()))) {
        // e.g. "la magnifica conservació" (correct: magnífica): article + adjective + noun
        replacement = relevantWords2.get(token).getToken();
      }
    }
    if (replacement != null) {
      // "If it is a noun or an adjective, it must carry an accent." / short message: "An accent is missing"
      final String msg = "Si és un nom o un adjectiu, ha de portar accent.";
      final RuleMatch ruleMatch = new RuleMatch(this, tokens[i].getStartPos(), tokens[i].getEndPos(), msg, "Falta un accent");
      ruleMatch.setSuggestedReplacement(replacement);
      ruleMatches.add(ruleMatch);
    }
  }
  return toRuleMatchArray(ruleMatches);
}
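All of the branches above funnel through a matchPostagRegexp helper that this snippet does not show. As a minimal sketch, assuming the helper simply tests each reading's POS tag against a regex Pattern (the actual implementation in AccentuationCheckRule may differ):

// Sketch only: true if any reading's POS tag matches the given pattern.
private boolean matchPostagRegexp(AnalyzedTokenReadings readings, Pattern pattern) {
  // AnalyzedTokenReadings is iterable over its AnalyzedToken readings.
  for (AnalyzedToken analyzedToken : readings) {
    final String posTag = analyzedToken.getPOSTag();
    if (posTag != null && pattern.matcher(posTag).matches()) {
      return true;
    }
  }
  return false;
}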
Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
The class AccentuationDataLoader, method loadWords.
Map<String, AnalyzedTokenReadings> loadWords(String path) {
  final Map<String, AnalyzedTokenReadings> map = new HashMap<>();
  final InputStream inputStream = JLanguageTool.getDataBroker().getFromRulesDirAsStream(path);
  try (Scanner scanner = new Scanner(inputStream, FILE_ENCODING)) {
    while (scanner.hasNextLine()) {
      final String line = scanner.nextLine().trim();
      if (line.isEmpty() || line.charAt(0) == '#') {
        // ignore empty lines and comment lines
        continue;
      }
      final String[] parts = line.split(";");
      if (parts.length != 3) {
        throw new RuntimeException("Format error in file " + path + ", line: " + line + ", expected 3 semicolon-separated parts, got " + parts.length);
      }
      final AnalyzedToken analyzedToken = new AnalyzedToken(parts[1], parts[2], null);
      map.put(parts[0], new AnalyzedTokenReadings(analyzedToken, 0));
    }
  }
  return map;
}
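Each data line thus carries three semicolon-separated fields: the unaccented surface form, the accented replacement, and the replacement's POS tag. A small usage sketch; the file path and the sample line are hypothetical:

// Hypothetical data line:  renuncia;renúncia;NCFS000
final Map<String, AnalyzedTokenReadings> relevantWords = loadWords("/ca/accentuation-nouns.txt"); // path is an assumption
final AnalyzedTokenReadings suggestion = relevantWords.get("renuncia");
// suggestion.getToken() returns the accented form "renúncia"; its single reading
// carries the POS tag "NCFS000", which match() uses for the gender/number checks.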
Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
The class GermanChunkerTest, method assertFullChunks.
private void assertFullChunks(String input) throws Exception {
  String plainInput = getPlainInput(input);
  AnalyzedSentence analyzedSentence = lt.getAnalyzedSentence(plainInput);
  AnalyzedTokenReadings[] result = analyzedSentence.getTokensWithoutWhitespace();
  chunker.addChunkTags(Arrays.asList(result));
  List<String> expectedChunks = getExpectedChunks(input);
  List<ChunkTaggedToken> result2 = new ArrayList<>();
  int i = 0;
  for (AnalyzedTokenReadings readings : result) {
    if (i > 0) {
      // skip token 0, i.e. SENT_START
      ChunkTaggedToken chunkTaggedToken = new ChunkTaggedToken(readings.getToken(), readings.getChunkTags(), readings);
      result2.add(chunkTaggedToken);
    }
    i++;
  }
  assertChunks(input, plainInput, result2, expectedChunks);
}
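getPlainInput and getExpectedChunks (not shown) split the test's inline markup into plain text and the expected chunk tags. Stripped of that plumbing, the chunker API exercised here reduces to a few calls; a minimal usage sketch using only calls that appear in the snippet (the sentence is arbitrary, and the printed tags depend on the chunker model):

AnalyzedSentence sentence = lt.getAnalyzedSentence("Das ist ein Haus.");
AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
chunker.addChunkTags(Arrays.asList(tokens)); // mutates the readings in place, attaching chunk tags
for (AnalyzedTokenReadings readings : tokens) {
  System.out.println(readings.getToken() + " -> " + readings.getChunkTags());
}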
Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
The class TokenPredicateTest, method test.
@Test
public void test() {
  List<ChunkTag> chunkTags = Arrays.asList(new ChunkTag("CHUNK1"), new ChunkTag("CHUNK2"));
  AnalyzedTokenReadings readings = new AnalyzedTokenReadings(new AnalyzedToken("mytoken", "MYPOS", "mylemma"), 0);
  ChunkTaggedToken chunkTaggedToken = new ChunkTaggedToken("mytoken", chunkTags, readings);
  assertMatch("mytoken", chunkTaggedToken);
  assertNoMatch("mytoken2", chunkTaggedToken);
  assertMatch("string=mytoken", chunkTaggedToken);
  assertNoMatch("string=mytoken2", chunkTaggedToken);
  assertMatch("regex=my[abct]oken", chunkTaggedToken);
  assertNoMatch("regex=my[abc]oken", chunkTaggedToken);
  assertMatch("chunk=CHUNK1", chunkTaggedToken);
  assertMatch("chunk=CHUNK2", chunkTaggedToken);
  assertNoMatch("chunk=OTHERCHUNK", chunkTaggedToken);
  assertMatch("pos=MYPOS", chunkTaggedToken);
  assertNoMatch("pos=OTHER", chunkTaggedToken);
  assertMatch("posre=M.POS", chunkTaggedToken);
  assertNoMatch("posre=O.HER", chunkTaggedToken);
  try {
    assertNoMatch("invalid=token", chunkTaggedToken);
    fail();
  } catch (RuntimeException expected) {
    // expected
  }
}
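assertMatch and assertNoMatch are not shown. One plausible wiring, assuming TokenPredicate takes the predicate expression in its constructor and exposes a Guava-style apply(ChunkTaggedToken) (both signatures are assumptions, not the confirmed API):

private void assertMatch(String expression, ChunkTaggedToken token) {
  TokenPredicate predicate = new TokenPredicate(expression, true); // second argument (case sensitivity) is an assumption
  assertTrue("Expected '" + expression + "' to match " + token, predicate.apply(token));
}

private void assertNoMatch(String expression, ChunkTaggedToken token) {
  TokenPredicate predicate = new TokenPredicate(expression, true);
  assertFalse("Expected '" + expression + "' not to match " + token, predicate.apply(token));
}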
Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
The class CaseRule, method isSpecialCase.
private boolean isSpecialCase(int i, AnalyzedTokenReadings[] tokens) {
  String prevToken = i > 1 ? tokens[i - 1].getToken() : "";
  String token = tokens[i].getToken();
  AnalyzedTokenReadings nextReadings = i < tokens.length - 1 ? tokens[i + 1] : null;
  // ignore "im Allgemeinen gilt" but not "im Allgemeinen Fall":
  return "im".equalsIgnoreCase(prevToken) && "Allgemeinen".equals(token) && !hasNounReading(nextReadings);
}
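hasNounReading is not part of this snippet. A minimal sketch of what it could check, assuming the German tagset marks noun readings with POS tags starting with "SUB" (the real CaseRule helper handles more cases):

// Sketch only: true if any reading of the token is tagged as a noun.
private boolean hasNounReading(AnalyzedTokenReadings readings) {
  if (readings == null) {
    return false;
  }
  for (AnalyzedToken reading : readings) {
    final String posTag = reading.getPOSTag();
    // e.g. "SUB:NOM:SIN:MAS"; the "SUB" prefix is an assumption about the tagset
    if (posTag != null && posTag.startsWith("SUB")) {
      return true;
    }
  }
  return false;
}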