use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class UkrainianHybridDisambiguator method removeIanimVKly.
private void removeIanimVKly(AnalyzedSentence input) {
  // Strips inanimate vocative-case readings from ambiguous tokens: when a token
  // also carries non-vocative readings, its inanimate vocative readings are
  // treated as spurious — unless the context looks like a genuine vocative
  // phrase (a vocative adjective before it and ",", "!" or "»" right after it).
  AnalyzedTokenReadings[] tokens = input.getTokensWithoutWhitespace();
  for (int i = 1; i < tokens.length; i++) {
    if (i < tokens.length - 1
        && Arrays.asList(",", "!", "»").contains(tokens[i + 1].getToken())
        && PosTagHelper.hasPosTag(tokens[i - 1], "adj.*v_kly.*")) {
      // looks like a real vocative construction — keep all readings
      continue;
    }
    List<AnalyzedToken> readings = tokens[i].getReadings();
    List<AnalyzedToken> inanimVklyReadings = new ArrayList<>();
    boolean hasOtherReading = false;
    for (AnalyzedToken reading : readings) {
      String posTag = reading.getPOSTag();
      if (posTag == null) {
        // untagged reading — stop inspecting this token
        break;
      }
      if (posTag.equals(JLanguageTool.SENTENCE_END_TAGNAME)) {
        continue;
      }
      if (INANIM_VKLY.matcher(posTag).matches()) {
        inanimVklyReadings.add(reading);
      } else {
        hasOtherReading = true;
      }
    }
    // Remove vocative readings only when an alternative reading survives,
    // so the token is never left without any reading at all.
    if (!inanimVklyReadings.isEmpty() && hasOtherReading) {
      for (AnalyzedToken reading : inanimVklyReadings) {
        tokens[i].removeReading(reading);
      }
    }
  }
}
use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class HiddenCharacterRule method match.
@Override
public final RuleMatch[] match(AnalyzedSentence sentence) {
  // Reports every token whose surface form contains the hidden character.
  List<RuleMatch> matches = new ArrayList<>();
  for (AnalyzedTokenReadings readings : sentence.getTokensWithoutWhitespace()) {
    String surface = readings.getToken();
    if (surface.indexOf(HIDDEN_CHAR) >= 0) {
      matches.add(createRuleMatch(readings));
    }
  }
  return toRuleMatchArray(matches);
}
use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class AbstractRomanianTaggerTest method assertHasLemmaAndPos.
/**
 * Asserts that tagging <code>inflected</code> yields at least one reading
 * with the given lemma and POS tag.
 *
 * @param inflected input word, inflected form
 * @param lemma expected lemma, or {@code null} to accept any lemma
 * @param posTag expected POS tag, or {@code null} to accept any tag
 */
protected void assertHasLemmaAndPos(String inflected, String lemma, String posTag) throws IOException {
  final StringBuilder seenTags = new StringBuilder();
  boolean matched = false;
  search:
  for (AnalyzedTokenReadings readings : tagger.tag(Arrays.asList(inflected))) {
    for (AnalyzedToken reading : readings) {
      final String actualLemma = reading.getLemma();
      final String actualPos = reading.getPOSTag();
      // collect every reading seen, for the failure message
      seenTags.append(String.format("[%s/%s]", actualLemma, actualPos));
      if ((lemma == null || lemma.equals(actualLemma)) && (posTag == null || posTag.equals(actualPos))) {
        matched = true;
        break search;
      }
    }
  }
  assertTrue(String.format("Lemma and POS not found for word [%s]! " + "Expected [%s/%s]. Actual: %s", inflected, lemma, posTag, seenTags.toString()), matched);
}
use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class PortugueseAccentuationCheckRule method match.
@Override
public RuleMatch[] match(final AnalyzedSentence sentence) {
  // Flags unaccented verb forms (e.g. "formula") that, in context, are really
  // nouns or adjectives requiring an accent (e.g. "fórmula") and suggests the
  // accented spelling. relevantWords maps the unaccented form to its accented
  // NOUN reading; relevantWords2 maps it to its accented ADJECTIVE reading.
  // NOTE(review): several comments below contain Catalan example phrases —
  // apparently inherited from the Catalan accentuation rule this was adapted
  // from; confirm against that rule's history.
  final List<RuleMatch> ruleMatches = new ArrayList<>();
  final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
  for (int i = 1; i < tokens.length; i++) {
    // ignoring token 0, i.e. SENT_START
    final String token;
    if (i == 1) {
      // sentence-initial word: lowercase so capitalization does not hide a match
      token = tokens[i].getToken().toLowerCase();
    } else {
      token = tokens[i].getToken();
    }
    // Context window (empty string when out of range). NOTE(review): the
    // i > 2 guard means prevPrevToken stays "" at i == 2; tokens[0] is
    // SENT_START whose surface is empty anyway, so this looks equivalent.
    final String prevToken = tokens[i - 1].getToken();
    String prevPrevToken = "";
    if (i > 2) {
      prevPrevToken = tokens[i - 2].getToken();
    }
    String nextToken = "";
    if (i < tokens.length - 1) {
      nextToken = tokens[i + 1].getToken();
    }
    String nextNextToken = "";
    if (i < tokens.length - 2) {
      nextNextToken = tokens[i + 2].getToken();
    }
    boolean isRelevantWord = false;
    boolean isRelevantWord2 = false;
    if (StringTools.isEmpty(token)) {
      continue;
    }
    if (relevantWords.containsKey(token)) {
      isRelevantWord = true;   // candidate for noun-with-accent suggestion
    }
    if (relevantWords2.containsKey(token)) {
      isRelevantWord2 = true;  // candidate for adjective-with-accent suggestion
    }
    if (!isRelevantWord && !isRelevantWord2) {
      continue;  // not a known accent homograph — nothing to check
    }
    // verb preceded by a reflexive pronoun: clearly verbal usage, skip
    if (matchPostagRegexp(tokens[i - 1], PRONOME_PESSOAL) && !prevToken.startsWith("-")) {
      continue;
    }
    String replacement = null;
    // Pre-built matchers over the context tokens: preposition "de" ahead,
    // exceptions after "de", and definite articles (by gender/number) behind.
    final Matcher mPreposicaoDE = PREPOSICAO_DE.matcher(nextToken);
    final Matcher mExcepcoesDE = EXCEPCOES_ANTES_DE.matcher(nextNextToken);
    final Matcher mArtigoOMS = ARTIGO_O_MS.matcher(prevToken);
    final Matcher mArtigoOFS = ARTIGO_O_FS.matcher(prevToken);
    final Matcher mArtigoOMP = ARTIGO_O_MP.matcher(prevToken);
    final Matcher mArtigoOFP = ARTIGO_O_FP.matcher(prevToken);
    // VERB WITHOUT ACCENT -> NOUN WITH ACCENT
    if (isRelevantWord && !matchPostagRegexp(tokens[i], GN)) /* && !matchPostagRegexp(tokens[i], LOCUCOES)*/
    {
      // preposition (not adverb/determiner) before a non-infinitive: nominal context
      // amb renuncies
      if (tokens[i - 1].hasPosTag("SPS00") && !tokens[i - 1].hasPosTag("RG") && !matchPostagRegexp(tokens[i - 1], DETERMINANTE) && !matchPostagRegexp(tokens[i], INFINITIVO)) {
        replacement = relevantWords.get(token).getToken();
      } else // determiner agreeing in gender/number with the noun reading
      // aquestes renuncies
      if (((matchPostagRegexp(tokens[i - 1], DETERMINANTE_MS) && matchPostagRegexp(relevantWords.get(token), NOME_MS)) || (matchPostagRegexp(tokens[i - 1], DETERMINANTE_MP) && matchPostagRegexp(relevantWords.get(token), NOME_MP)) || (matchPostagRegexp(tokens[i - 1], DETERMINANTE_FS) && matchPostagRegexp(relevantWords.get(token), NOME_FS)) || (matchPostagRegexp(tokens[i - 1], DETERMINANTE_FP) && matchPostagRegexp(relevantWords.get(token), NOME_FP)))) {
        replacement = relevantWords.get(token).getToken();
      } else // conjugated verb + agreeing determiner before the candidate
      // fumaré una faria (correct: fària)
      if (i > 2 && matchPostagRegexp(tokens[i - 2], VERBO_CONJUGADO) && ((matchPostagRegexp(tokens[i - 1], DETERMINANTE_MS) && matchPostagRegexp(relevantWords.get(token), NOME_MS)) || (matchPostagRegexp(tokens[i - 1], DETERMINANTE_MP) && matchPostagRegexp(relevantWords.get(token), NOME_MP)) || (matchPostagRegexp(tokens[i - 1], DETERMINANTE_FS) && matchPostagRegexp(relevantWords.get(token), NOME_FS)) || (matchPostagRegexp(tokens[i - 1], DETERMINANTE_FP) && matchPostagRegexp(relevantWords.get(token), NOME_FP)))) {
        replacement = relevantWords.get(token).getToken();
      } else // conjugated verb + agreeing article before the candidate
      // fem la copia (correct: còpia)
      if (i > 2 && matchPostagRegexp(tokens[i - 2], VERBO_CONJUGADO) && ((mArtigoOMS.matches() && matchPostagRegexp(relevantWords.get(token), NOME_MS)) || (mArtigoOMP.matches() && matchPostagRegexp(relevantWords.get(token), NOME_MP)) || (mArtigoOFS.matches() && matchPostagRegexp(relevantWords.get(token), NOME_FS)) || (mArtigoOFP.matches() && matchPostagRegexp(relevantWords.get(token), NOME_FP)))) {
        replacement = relevantWords.get(token).getToken();
      } else // non-participle, permissive fallback with several guards
      // circumstancies d'una altra classe
      if (!matchPostagRegexp(tokens[i], PARTICIPIO_MS) && /*
      && !token.equals("venia") && !token.equals("venies")
      && !token.equals("tenia") && !token.equals("tenies")
      && !token.equals("faria") && !token.equals("faries")
      && !token.equals("espero") && !token.equals("continua")
      && !token.equals("continues") && !token.equals("cantar")
      && !prevToken.equals("que") && !prevToken.equals("qui")
      && !prevToken.equals("què") && mPreposicaoDE.matches() */
      !matchPostagRegexp(tokens[i - 1], NOT_IN_PREV_TOKEN) && /* && !matchPostagRegexp(tokens[i + 1], LOCUCOES) */
      (i < tokens.length - 2) && !matchPostagRegexp(tokens[i + 2], INFINITIVO) && !mExcepcoesDE.matches() && !tokens[i - 1].hasPosTag("RG")) {
        replacement = relevantWords.get(token).getToken();
      } else // agreeing article before + "de" after: "the <noun> of ..."
      // la renuncia del president.
      if (/* !token.equals("venia")
      && !token.equals("venies") && !token.equals("tenia")
      && !token.equals("tenies") && !token.equals("faria")
      && !token.equals("faries") && !token.equals("continua")
      && !token.equals("continues") && !token.equals("cantar")
      && !token.equals("diferencia") && !token.equals("diferencies")
      && !token.equals("distancia") && !token.equals("distancies")
      && */
      ((mArtigoOMS.matches() && matchPostagRegexp(relevantWords.get(token), NOME_MS)) || (mArtigoOFS.matches() && matchPostagRegexp(relevantWords.get(token), NOME_FS)) || (mArtigoOMP.matches() && matchPostagRegexp(relevantWords.get(token), NOME_MP)) || (mArtigoOFP.matches() && matchPostagRegexp(relevantWords.get(token), NOME_FP))) && mPreposicaoDE.matches()) {
        replacement = relevantWords.get(token).getToken();
      } else // agreeing adjective right after the candidate noun
      // circunstancias extraordináries
      if (/*!token.equals("pronuncia")
      && !token.equals("espero") && !token.equals("pronuncies")
      && !token.equals("venia") && !token.equals("venies")
      && !token.equals("tenia") && !token.equals("tenies")
      && !token.equals("continua") && !token.equals("continues")
      && !token.equals("faria") && !token.equals("faries")
      && !token.equals("genera") && !token.equals("figuri")
      && */
      (i < tokens.length - 1) && ((matchPostagRegexp(relevantWords.get(token), NOME_MS) && matchPostagRegexp(tokens[i + 1], ADJETIVO_MS)) || (matchPostagRegexp(relevantWords.get(token), NOME_FS) && matchPostagRegexp(tokens[i + 1], ADJETIVO_FS)) || (matchPostagRegexp(relevantWords.get(token), NOME_MP) && matchPostagRegexp(tokens[i + 1], ADJETIVO_MP)) || (matchPostagRegexp(relevantWords.get(token), NOME_FP) && matchPostagRegexp(tokens[i + 1], ADJETIVO_FP)))) {
        replacement = relevantWords.get(token).getToken();
      } else // agreeing adjective right before the candidate noun
      // les seves contraries
      if ((matchPostagRegexp(relevantWords.get(token), NOME_MS) && matchPostagRegexp(tokens[i - 1], ADJETIVO_MS) && !matchPostagRegexp(tokens[i], VERBO_3S) && !matchPostagRegexp(tokens[i], GRUPO_VERBAL)) || (matchPostagRegexp(relevantWords.get(token), NOME_FS) && matchPostagRegexp(tokens[i - 1], ADJETIVO_FS) && !matchPostagRegexp(tokens[i], VERBO_3S)) || (matchPostagRegexp(relevantWords.get(token), NOME_MP) && matchPostagRegexp(tokens[i - 1], ADJETIVO_MP)) || (matchPostagRegexp(relevantWords.get(token), NOME_FP) && matchPostagRegexp(tokens[i - 1], ADJETIVO_FP))) {
        replacement = relevantWords.get(token).getToken();
      } else // determiner + adjective + candidate noun + "que"
      //uma nova formula que (fórmula)
      if (nextToken.equals("que") && i > 2 && ((matchPostagRegexp(relevantWords.get(token), NOME_MS) && matchPostagRegexp(tokens[i - 1], ADJETIVO_MS) && matchPostagRegexp(tokens[i - 2], DETERMINANTE_MS)) || (matchPostagRegexp(relevantWords.get(token), NOME_FS) && matchPostagRegexp(tokens[i - 1], ADJETIVO_FS) && matchPostagRegexp(tokens[i - 2], DETERMINANTE_FS)) || (matchPostagRegexp(relevantWords.get(token), NOME_MP) && matchPostagRegexp(tokens[i - 1], ADJETIVO_MP) && matchPostagRegexp(tokens[i - 2], DETERMINANTE_MP)) || (matchPostagRegexp(relevantWords.get(token), NOME_FP) && matchPostagRegexp(tokens[i - 1], ADJETIVO_FP) && matchPostagRegexp(tokens[i - 2], DETERMINANTE_FP)))) {
        replacement = relevantWords.get(token).getToken();
      } else // agreeing article + candidate noun + "que"
      // les circumstancies que ens envolten
      if (nextToken.equals("que") && ((mArtigoOMS.matches() && matchPostagRegexp(relevantWords.get(token), NOME_MS)) || (mArtigoOFS.matches() && matchPostagRegexp(relevantWords.get(token), NOME_FS)) || (mArtigoOMP.matches() && matchPostagRegexp(relevantWords.get(token), NOME_MP)) || (mArtigoOFP.matches() && matchPostagRegexp(relevantWords.get(token), NOME_FP)))) {
        replacement = relevantWords.get(token).getToken();
      }
      // preposition two back + agreeing adjective right before
      // de positiva influencia
      if (/*!token.equals("pronuncia") && !token.equals("espero") && !token.equals("pronuncies")
      && !token.equals("venia") && !token.equals("venies") && !token.equals("tenia")
      && !token.equals("tenies") && !token.equals("continua") && !token.equals("continues")
      && !token.equals("faria") && !token.equals("faries") && !token.equals("genera")
      && !token.equals("figuri")
      && */
      i > 2 && tokens[i - 2].hasPosTag("SPS00") && !tokens[i - 2].hasPosTag("RG") && ((matchPostagRegexp(relevantWords.get(token), NOME_MS) && matchPostagRegexp(tokens[i - 1], ADJETIVO_MS)) || (matchPostagRegexp(relevantWords.get(token), NOME_FS) && matchPostagRegexp(tokens[i - 1], ADJETIVO_FS)) || (matchPostagRegexp(relevantWords.get(token), NOME_MP) && matchPostagRegexp(tokens[i - 1], ADJETIVO_MP)) || (matchPostagRegexp(relevantWords.get(token), NOME_FP) && matchPostagRegexp(tokens[i - 1], ADJETIVO_FP)))) {
        replacement = relevantWords.get(token).getToken();
      }
    }
    // VERB WITHOUT ACCENT -> ADJECTIVE WITH ACCENT
    if (isRelevantWord2 && !matchPostagRegexp(tokens[i], GN)) /* && !matchPostagRegexp(tokens[i], LOCUCOES) */
    {
      // noun before + agreeing adjective reading, or "de maneira/forma X"
      // de maneira obvia, circumstancias extraordinarias.
      if ((matchPostagRegexp(relevantWords2.get(token), ADJETIVO_MS) && matchPostagRegexp(tokens[i - 1], NOME_MS) && !tokens[i - 1].hasPosTag("_GN_FS") && matchPostagRegexp(tokens[i], VERBO_CONJUGADO) && !matchPostagRegexp(tokens[i], VERBO_3S)) || (matchPostagRegexp(relevantWords2.get(token), ADJETIVO_FS) && prevPrevToken.equalsIgnoreCase("de") && (prevToken.equals("maneira") || prevToken.equals("forma"))) || (matchPostagRegexp(relevantWords2.get(token), ADJETIVO_MP) && matchPostagRegexp(tokens[i - 1], NOME_MP)) || (matchPostagRegexp(relevantWords2.get(token), ADJETIVO_FP) && matchPostagRegexp(tokens[i - 1], NOME_FP))) {
        replacement = relevantWords2.get(token).getToken();
      } else // pre-nominal adjective position: <before-adj word> <candidate> <noun>
      // de continua disputa
      if ((i < tokens.length - 1) && !prevToken.equals("que") && !matchPostagRegexp(tokens[i - 1], NOT_IN_PREV_TOKEN) && ((matchPostagRegexp(relevantWords2.get(token), ADJETIVO_MS) && matchPostagRegexp(tokens[i + 1], NOME_MS) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_MS)) || (matchPostagRegexp(relevantWords2.get(token), ADJETIVO_FS) && matchPostagRegexp(tokens[i + 1], NOME_FS) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_FS)) || (matchPostagRegexp(relevantWords2.get(token), ADJETIVO_MP) && matchPostagRegexp(tokens[i + 1], NOME_MP) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_MP)) || (matchPostagRegexp(relevantWords2.get(token), ADJETIVO_FP) && matchPostagRegexp(tokens[i + 1], NOME_FP) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_FP)))) {
        replacement = relevantWords2.get(token).getToken();
      } else // article + candidate adjective + agreeing noun
      // a magnifica conservação
      if ((i < tokens.length - 1) && ((matchPostagRegexp(relevantWords2.get(token), ADJETIVO_MS) && matchPostagRegexp(tokens[i + 1], NOME_MS) && mArtigoOMS.matches()) || (matchPostagRegexp(relevantWords2.get(token), ADJETIVO_FS) && matchPostagRegexp(tokens[i + 1], NOME_FS) && mArtigoOFS.matches()) || (matchPostagRegexp(relevantWords2.get(token), ADJETIVO_MP) && matchPostagRegexp(tokens[i + 1], NOME_MP) && mArtigoOMP.matches()) || (matchPostagRegexp(relevantWords2.get(token), ADJETIVO_FP) && matchPostagRegexp(tokens[i + 1], NOME_FP) && mArtigoOFP.matches()))) {
        replacement = relevantWords2.get(token).getToken();
      }
    }
    // Any branch above that set a replacement produces one match per token.
    if (replacement != null) {
      final String msg = "Se é um nome ou um adjectivo, tem acento.";
      final RuleMatch ruleMatch = new RuleMatch(this, tokens[i].getStartPos(), tokens[i].getEndPos(), msg, "Falta um acento");
      ruleMatch.setSuggestedReplacement(replacement);
      ruleMatches.add(ruleMatch);
    }
  }
  return toRuleMatchArray(ruleMatches);
}
use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
the class BretonTagger method tag.
// Almost identical to the 'tag' method in BaseTagger, except that when a
// dictionary probe fails, the lookup is retried after stripping the
// demonstrative suffixes -mañ, -se, -hont (so "xxx-mañ" is probed as "xxx").
@Override
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) throws IOException {
  List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
  int pos = 0;
  for (String word : sentenceTokens) {
    String probeWord = word;
    // Loops only when a suffix is stripped and the dictionary is re-probed,
    // which happens rarely.
    while (true) {
      List<AnalyzedToken> readings = new ArrayList<>();
      String lowerWord = probeWord.toLowerCase(conversionLocale);
      boolean isLowercase = probeWord.equals(lowerWord);
      // Probe with the word as written (normal case).
      List<AnalyzedToken> taggerTokens = asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(probeWord));
      addTokens(taggerTokens, readings);
      // Probe with the lowercased form, if it differs.
      List<AnalyzedToken> lowerTaggerTokens = asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(lowerWord));
      if (!isLowercase) {
        addTokens(lowerTaggerTokens, readings);
      }
      if (taggerTokens.isEmpty() && lowerTaggerTokens.isEmpty()) {
        // Last resort for a lowercase word: probe with the first letter uppercased.
        if (isLowercase) {
          List<AnalyzedToken> upperTaggerTokens = asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(StringTools.uppercaseFirstChar(probeWord)));
          if (!upperTaggerTokens.isEmpty()) {
            addTokens(upperTaggerTokens, readings);
          }
        }
        if (readings.isEmpty()) {
          Matcher suffixMatcher = patternSuffix.matcher(probeWord);
          if (suffixMatcher.find()) {
            // Remove the suffix and probe the dictionary again: given a word
            // such as "xxx-mañ", retry the lookup with "xxx" this time.
            probeWord = suffixMatcher.group(1);
            continue;
          }
          // Nothing matched at all — record the word as untagged.
          readings.add(new AnalyzedToken(word, null, null));
        }
      }
      tokenReadings.add(new AnalyzedTokenReadings(readings, pos));
      pos += word.length();
      break;
    }
  }
  return tokenReadings;
}
Aggregations