Search in sources :

Example 66 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class UkrainianHybridDisambiguator method removeIanimVKly.

/**
 * For every token after index 0, removes the readings whose POS tag matches
 * {@code INANIM_VKLY}, provided the token also carries at least one other
 * (non sentence-end) reading.
 *
 * <p>A token is left untouched when the next token is one of {@code ","},
 * {@code "!"} or {@code "»"} and the previous token has a POS tag matching
 * {@code "adj.*v_kly.*"} according to {@code PosTagHelper.hasPosTag}.
 * Scanning of a token's readings stops at the first reading without a POS tag.
 *
 * @param input the sentence whose tokens are modified in place
 */
private void removeIanimVKly(AnalyzedSentence input) {
    final AnalyzedTokenReadings[] tokens = input.getTokensWithoutWhitespace();
    final int lastIndex = tokens.length - 1;
    for (int idx = 1; idx <= lastIndex; idx++) {
        final AnalyzedTokenReadings current = tokens[idx];
        final List<AnalyzedToken> readings = current.getReadings();

        // Exception: "<adj ... v_kly> <token> <, ! or »>" is kept as is.
        final boolean followedByDelimiter = idx < lastIndex
                && Arrays.asList(",", "!", "»").contains(tokens[idx + 1].getToken());
        if (followedByDelimiter && PosTagHelper.hasPosTag(tokens[idx - 1], "adj.*v_kly.*")) {
            continue;
        }

        // Split the readings into "matches INANIM_VKLY" and "anything else".
        final List<AnalyzedToken> matchingReadings = new ArrayList<>();
        boolean hasOtherReading = false;
        for (AnalyzedToken reading : readings) {
            final String tag = reading.getPOSTag();
            if (tag == null) {
                // An untagged reading ends the scan for this token.
                break;
            }
            if (JLanguageTool.SENTENCE_END_TAGNAME.equals(tag)) {
                // The sentence-end marker counts for neither group.
                continue;
            }
            if (INANIM_VKLY.matcher(tag).matches()) {
                matchingReadings.add(reading);
            } else {
                hasOtherReading = true;
            }
        }

        // Only prune when something else remains on the token afterwards.
        if (matchingReadings.isEmpty() || !hasOtherReading) {
            continue;
        }
        for (AnalyzedToken obsolete : matchingReadings) {
            current.removeReading(obsolete);
        }
    }
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 67 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class HiddenCharacterRule method match.

/**
 * Creates one rule match (via {@code createRuleMatch}) for every token,
 * as returned by {@code getTokensWithoutWhitespace()}, whose text contains
 * {@code HIDDEN_CHAR}.
 *
 * @param sentence the analyzed sentence to inspect
 * @return the collected matches, converted with {@code toRuleMatchArray}
 */
@Override
public final RuleMatch[] match(AnalyzedSentence sentence) {
    final List<RuleMatch> found = new ArrayList<>();
    for (AnalyzedTokenReadings candidate : sentence.getTokensWithoutWhitespace()) {
        final boolean containsHiddenChar = candidate.getToken().indexOf(HIDDEN_CHAR) >= 0;
        if (!containsHiddenChar) {
            continue;
        }
        found.add(createRuleMatch(candidate));
    }
    return toRuleMatchArray(found);
}
Also used : RuleMatch(org.languagetool.rules.RuleMatch) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 68 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class AbstractRomanianTaggerTest method assertHasLemmaAndPos.

/**
 * Asserts that tagging <code>inflected</code> produces at least one reading
 * with the given lemma and POS tag. A <code>null</code> lemma or
 * <code>null</code> posTag matches any value. On failure the assertion
 * message lists every reading inspected, as <code>[lemma/pos]</code> pairs.
 *
 * @param inflected input word, inflected form
 * @param lemma expected lemma, or <code>null</code> to accept any lemma
 * @param posTag expected tag for lemma, or <code>null</code> to accept any tag
 * @throws IOException declared by the signature; presumably propagated from the tagger
 */
protected void assertHasLemmaAndPos(String inflected, String lemma, String posTag) throws IOException {
    final StringBuilder seenReadings = new StringBuilder();
    boolean matched = false;

    search:
    for (AnalyzedTokenReadings readings : tagger.tag(Arrays.asList(inflected))) {
        for (AnalyzedToken reading : readings) {
            final String readingLemma = reading.getLemma();
            final String readingPos = reading.getPOSTag();
            // Record the reading before testing it, so the match itself shows up in the log too.
            seenReadings.append("[").append(readingLemma).append("/").append(readingPos).append("]");

            final boolean lemmaOk = lemma == null || lemma.equals(readingLemma);
            final boolean posOk = posTag == null || posTag.equals(readingPos);
            if (lemmaOk && posOk) {
                matched = true;
                break search;
            }
        }
    }

    final String failureMessage = String.format(
            "Lemma and POS not found for word [%s]! Expected [%s/%s]. Actual: %s",
            inflected, lemma, posTag, seenReadings);
    assertTrue(failureMessage, matched);
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 69 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class PortugueseAccentuationCheckRule method match.

/**
 * Looks for tokens that are keys of {@code relevantWords} or
 * {@code relevantWords2} and, when one of the context checks below succeeds,
 * reports a match suggesting the token stored in the corresponding map entry.
 *
 * <p>Token 0 is skipped. For each remaining token the surrounding tokens
 * (up to two before and two after) are inspected through
 * {@code matchPostagRegexp}, {@code hasPosTag} and a few regex matchers on the
 * neighbouring token strings. At most one {@link RuleMatch} is created per
 * token.
 *
 * <p>The quoted phrases in the comments below are example inputs kept from
 * the original comments; several of them are not Portuguese.
 *
 * @param sentence the analyzed sentence to check
 * @return the collected matches, converted with {@code toRuleMatchArray}
 */
@Override
public RuleMatch[] match(final AnalyzedSentence sentence) {
    final List<RuleMatch> ruleMatches = new ArrayList<>();
    final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    for (int i = 1; i < tokens.length; i++) {
        // ignoring token 0, i.e. SENT_START
        final String token;
        if (i == 1) {
            // The first token after SENT_START is looked up in lower case.
            // NOTE(review): toLowerCase() without a Locale uses the JVM default locale.
            token = tokens[i].getToken().toLowerCase();
        } else {
            token = tokens[i].getToken();
        }
        final String prevToken = tokens[i - 1].getToken();
        // Neighbouring token strings default to "" when out of range.
        // With i > 2, i - 2 >= 1, so tokens[0] is never used as prevPrevToken.
        String prevPrevToken = "";
        if (i > 2) {
            prevPrevToken = tokens[i - 2].getToken();
        }
        String nextToken = "";
        if (i < tokens.length - 1) {
            nextToken = tokens[i + 1].getToken();
        }
        String nextNextToken = "";
        if (i < tokens.length - 2) {
            nextNextToken = tokens[i + 2].getToken();
        }
        boolean isRelevantWord = false;
        boolean isRelevantWord2 = false;
        if (StringTools.isEmpty(token)) {
            continue;
        }
        if (relevantWords.containsKey(token)) {
            isRelevantWord = true;
        }
        if (relevantWords2.containsKey(token)) {
            isRelevantWord2 = true;
        }
        // Tokens found in neither map are of no interest to this rule.
        if (!isRelevantWord && !isRelevantWord2) {
            continue;
        }
        // verb preceded by a reflexive pronoun: skip, unless the previous token starts with "-"
        if (matchPostagRegexp(tokens[i - 1], PRONOME_PESSOAL) && !prevToken.startsWith("-")) {
            continue;
        }
        String replacement = null;
        // Matchers over neighbouring token strings, evaluated lazily in the checks below.
        final Matcher mPreposicaoDE = PREPOSICAO_DE.matcher(nextToken);
        final Matcher mExcepcoesDE = EXCEPCOES_ANTES_DE.matcher(nextNextToken);
        final Matcher mArtigoOMS = ARTIGO_O_MS.matcher(prevToken);
        final Matcher mArtigoOFS = ARTIGO_O_FS.matcher(prevToken);
        final Matcher mArtigoOMP = ARTIGO_O_MP.matcher(prevToken);
        final Matcher mArtigoOFP = ARTIGO_O_FP.matcher(prevToken);
        // VERB WITHOUT ACCENT -> NOUN WITH ACCENT
        if (isRelevantWord && !matchPostagRegexp(tokens[i], GN)) /* && !matchPostagRegexp(tokens[i], LOCUCOES)*/
        {
            // example: "amb renuncies" (previous token tagged SPS00, presumably a preposition)
            if (tokens[i - 1].hasPosTag("SPS00") && !tokens[i - 1].hasPosTag("RG") && !matchPostagRegexp(tokens[i - 1], DETERMINANTE) && !matchPostagRegexp(tokens[i], INFINITIVO)) {
                replacement = relevantWords.get(token).getToken();
            } else // example: "aquestes renuncies" (previous token matches DETERMINANTE_x, map entry matches NOME_x)
            if (((matchPostagRegexp(tokens[i - 1], DETERMINANTE_MS) && matchPostagRegexp(relevantWords.get(token), NOME_MS)) || (matchPostagRegexp(tokens[i - 1], DETERMINANTE_MP) && matchPostagRegexp(relevantWords.get(token), NOME_MP)) || (matchPostagRegexp(tokens[i - 1], DETERMINANTE_FS) && matchPostagRegexp(relevantWords.get(token), NOME_FS)) || (matchPostagRegexp(tokens[i - 1], DETERMINANTE_FP) && matchPostagRegexp(relevantWords.get(token), NOME_FP)))) {
                replacement = relevantWords.get(token).getToken();
            } else // example: "fumaré una faria" (correct: "fària"). NOTE(review): same test as the previous branch plus extra conditions, so this branch looks unreachable - confirm.
            if (i > 2 && matchPostagRegexp(tokens[i - 2], VERBO_CONJUGADO) && ((matchPostagRegexp(tokens[i - 1], DETERMINANTE_MS) && matchPostagRegexp(relevantWords.get(token), NOME_MS)) || (matchPostagRegexp(tokens[i - 1], DETERMINANTE_MP) && matchPostagRegexp(relevantWords.get(token), NOME_MP)) || (matchPostagRegexp(tokens[i - 1], DETERMINANTE_FS) && matchPostagRegexp(relevantWords.get(token), NOME_FS)) || (matchPostagRegexp(tokens[i - 1], DETERMINANTE_FP) && matchPostagRegexp(relevantWords.get(token), NOME_FP)))) {
                replacement = relevantWords.get(token).getToken();
            } else // example: "fem la copia" (correct: "còpia"); token two back matches VERBO_CONJUGADO, previous token string matches ARTIGO_O_x
            if (i > 2 && matchPostagRegexp(tokens[i - 2], VERBO_CONJUGADO) && ((mArtigoOMS.matches() && matchPostagRegexp(relevantWords.get(token), NOME_MS)) || (mArtigoOMP.matches() && matchPostagRegexp(relevantWords.get(token), NOME_MP)) || (mArtigoOFS.matches() && matchPostagRegexp(relevantWords.get(token), NOME_FS)) || (mArtigoOFP.matches() && matchPostagRegexp(relevantWords.get(token), NOME_FP)))) {
                replacement = relevantWords.get(token).getToken();
            } else // example: "circumstancies d'una altra classe"; the (i < tokens.length - 2) test guards the tokens[i + 2] access
            if (!matchPostagRegexp(tokens[i], PARTICIPIO_MS) && /*
            && !token.equals("venia") && !token.equals("venies")
            && !token.equals("tenia") && !token.equals("tenies")
            && !token.equals("faria") && !token.equals("faries")
            && !token.equals("espero") && !token.equals("continua")
            && !token.equals("continues") && !token.equals("cantar")
            && !prevToken.equals("que") && !prevToken.equals("qui")
            && !prevToken.equals("què") && mPreposicaoDE.matches() */
            !matchPostagRegexp(tokens[i - 1], NOT_IN_PREV_TOKEN) && /* && !matchPostagRegexp(tokens[i + 1], LOCUCOES) */
            (i < tokens.length - 2) && !matchPostagRegexp(tokens[i + 2], INFINITIVO) && !mExcepcoesDE.matches() && !tokens[i - 1].hasPosTag("RG")) {
                replacement = relevantWords.get(token).getToken();
            } else // example: "la renuncia del president."; previous token string matches ARTIGO_O_x and next token string matches PREPOSICAO_DE
            if (/* !token.equals("venia")
            && !token.equals("venies") && !token.equals("tenia")
            && !token.equals("tenies") && !token.equals("faria")
            && !token.equals("faries") && !token.equals("continua")
            && !token.equals("continues") && !token.equals("cantar")
            && !token.equals("diferencia") && !token.equals("diferencies")
            && !token.equals("distancia")  && !token.equals("distancies") 
            && */
            ((mArtigoOMS.matches() && matchPostagRegexp(relevantWords.get(token), NOME_MS)) || (mArtigoOFS.matches() && matchPostagRegexp(relevantWords.get(token), NOME_FS)) || (mArtigoOMP.matches() && matchPostagRegexp(relevantWords.get(token), NOME_MP)) || (mArtigoOFP.matches() && matchPostagRegexp(relevantWords.get(token), NOME_FP))) && mPreposicaoDE.matches()) {
                replacement = relevantWords.get(token).getToken();
            } else // example: "circunstancias extraordináries"; map entry matches NOME_x and the next token matches ADJETIVO_x
            if (/*!token.equals("pronuncia") 
            && !token.equals("espero") && !token.equals("pronuncies")
            && !token.equals("venia")  && !token.equals("venies") 
            && !token.equals("tenia")  && !token.equals("tenies") 
            && !token.equals("continua") && !token.equals("continues")
            && !token.equals("faria") && !token.equals("faries") 
            && !token.equals("genera") && !token.equals("figuri")
            && */
            (i < tokens.length - 1) && ((matchPostagRegexp(relevantWords.get(token), NOME_MS) && matchPostagRegexp(tokens[i + 1], ADJETIVO_MS)) || (matchPostagRegexp(relevantWords.get(token), NOME_FS) && matchPostagRegexp(tokens[i + 1], ADJETIVO_FS)) || (matchPostagRegexp(relevantWords.get(token), NOME_MP) && matchPostagRegexp(tokens[i + 1], ADJETIVO_MP)) || (matchPostagRegexp(relevantWords.get(token), NOME_FP) && matchPostagRegexp(tokens[i + 1], ADJETIVO_FP)))) {
                replacement = relevantWords.get(token).getToken();
            } else // example: "les seves contraries"; map entry matches NOME_x and the previous token matches ADJETIVO_x
            if ((matchPostagRegexp(relevantWords.get(token), NOME_MS) && matchPostagRegexp(tokens[i - 1], ADJETIVO_MS) && !matchPostagRegexp(tokens[i], VERBO_3S) && !matchPostagRegexp(tokens[i], GRUPO_VERBAL)) || (matchPostagRegexp(relevantWords.get(token), NOME_FS) && matchPostagRegexp(tokens[i - 1], ADJETIVO_FS) && !matchPostagRegexp(tokens[i], VERBO_3S)) || (matchPostagRegexp(relevantWords.get(token), NOME_MP) && matchPostagRegexp(tokens[i - 1], ADJETIVO_MP)) || (matchPostagRegexp(relevantWords.get(token), NOME_FP) && matchPostagRegexp(tokens[i - 1], ADJETIVO_FP))) {
                replacement = relevantWords.get(token).getToken();
            } else // example: "uma nova formula que" (correct: "fórmula"); DETERMINANTE_x + ADJETIVO_x before the word, "que" after it
            if (nextToken.equals("que") && i > 2 && ((matchPostagRegexp(relevantWords.get(token), NOME_MS) && matchPostagRegexp(tokens[i - 1], ADJETIVO_MS) && matchPostagRegexp(tokens[i - 2], DETERMINANTE_MS)) || (matchPostagRegexp(relevantWords.get(token), NOME_FS) && matchPostagRegexp(tokens[i - 1], ADJETIVO_FS) && matchPostagRegexp(tokens[i - 2], DETERMINANTE_FS)) || (matchPostagRegexp(relevantWords.get(token), NOME_MP) && matchPostagRegexp(tokens[i - 1], ADJETIVO_MP) && matchPostagRegexp(tokens[i - 2], DETERMINANTE_MP)) || (matchPostagRegexp(relevantWords.get(token), NOME_FP) && matchPostagRegexp(tokens[i - 1], ADJETIVO_FP) && matchPostagRegexp(tokens[i - 2], DETERMINANTE_FP)))) {
                replacement = relevantWords.get(token).getToken();
            } else // example: "les circumstancies que ens envolten"; previous token string matches ARTIGO_O_x, "que" follows
            if (nextToken.equals("que") && ((mArtigoOMS.matches() && matchPostagRegexp(relevantWords.get(token), NOME_MS)) || (mArtigoOFS.matches() && matchPostagRegexp(relevantWords.get(token), NOME_FS)) || (mArtigoOMP.matches() && matchPostagRegexp(relevantWords.get(token), NOME_MP)) || (mArtigoOFP.matches() && matchPostagRegexp(relevantWords.get(token), NOME_FP)))) {
                replacement = relevantWords.get(token).getToken();
            }
            // example: "de positiva influencia"
            // This is a stand-alone 'if', not part of the else-if chain above, so it is
            // evaluated even when an earlier branch already set the replacement.
            if (/*!token.equals("pronuncia") && !token.equals("espero") && !token.equals("pronuncies")
                && !token.equals("venia") && !token.equals("venies") && !token.equals("tenia")
                && !token.equals("tenies") && !token.equals("continua") && !token.equals("continues")
                && !token.equals("faria") && !token.equals("faries") && !token.equals("genera")
                && !token.equals("figuri") 
            && */
            i > 2 && tokens[i - 2].hasPosTag("SPS00") && !tokens[i - 2].hasPosTag("RG") && ((matchPostagRegexp(relevantWords.get(token), NOME_MS) && matchPostagRegexp(tokens[i - 1], ADJETIVO_MS)) || (matchPostagRegexp(relevantWords.get(token), NOME_FS) && matchPostagRegexp(tokens[i - 1], ADJETIVO_FS)) || (matchPostagRegexp(relevantWords.get(token), NOME_MP) && matchPostagRegexp(tokens[i - 1], ADJETIVO_MP)) || (matchPostagRegexp(relevantWords.get(token), NOME_FP) && matchPostagRegexp(tokens[i - 1], ADJETIVO_FP)))) {
                replacement = relevantWords.get(token).getToken();
            }
        }
        // VERB WITHOUT ACCENT -> ADJECTIVE WITH ACCENT
        // Runs after the noun section; a replacement chosen here overwrites one chosen above.
        if (isRelevantWord2 && !matchPostagRegexp(tokens[i], GN)) /* && !matchPostagRegexp(tokens[i], LOCUCOES) */
        {
            // examples: "de maneira obvia", "circumstancias extraordinarias"
            if ((matchPostagRegexp(relevantWords2.get(token), ADJETIVO_MS) && matchPostagRegexp(tokens[i - 1], NOME_MS) && !tokens[i - 1].hasPosTag("_GN_FS") && matchPostagRegexp(tokens[i], VERBO_CONJUGADO) && !matchPostagRegexp(tokens[i], VERBO_3S)) || (matchPostagRegexp(relevantWords2.get(token), ADJETIVO_FS) && prevPrevToken.equalsIgnoreCase("de") && (prevToken.equals("maneira") || prevToken.equals("forma"))) || (matchPostagRegexp(relevantWords2.get(token), ADJETIVO_MP) && matchPostagRegexp(tokens[i - 1], NOME_MP)) || (matchPostagRegexp(relevantWords2.get(token), ADJETIVO_FP) && matchPostagRegexp(tokens[i - 1], NOME_FP))) {
                replacement = relevantWords2.get(token).getToken();
            } else // example: "de continua disputa"; map entry matches ADJETIVO_x, next token matches NOME_x, previous token matches BEFORE_ADJECTIVE_x
            if ((i < tokens.length - 1) && !prevToken.equals("que") && !matchPostagRegexp(tokens[i - 1], NOT_IN_PREV_TOKEN) && ((matchPostagRegexp(relevantWords2.get(token), ADJETIVO_MS) && matchPostagRegexp(tokens[i + 1], NOME_MS) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_MS)) || (matchPostagRegexp(relevantWords2.get(token), ADJETIVO_FS) && matchPostagRegexp(tokens[i + 1], NOME_FS) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_FS)) || (matchPostagRegexp(relevantWords2.get(token), ADJETIVO_MP) && matchPostagRegexp(tokens[i + 1], NOME_MP) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_MP)) || (matchPostagRegexp(relevantWords2.get(token), ADJETIVO_FP) && matchPostagRegexp(tokens[i + 1], NOME_FP) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_FP)))) {
                replacement = relevantWords2.get(token).getToken();
            } else // example: "a magnifica conservação"; previous token string matches ARTIGO_O_x, next token matches NOME_x
            if ((i < tokens.length - 1) && ((matchPostagRegexp(relevantWords2.get(token), ADJETIVO_MS) && matchPostagRegexp(tokens[i + 1], NOME_MS) && mArtigoOMS.matches()) || (matchPostagRegexp(relevantWords2.get(token), ADJETIVO_FS) && matchPostagRegexp(tokens[i + 1], NOME_FS) && mArtigoOFS.matches()) || (matchPostagRegexp(relevantWords2.get(token), ADJETIVO_MP) && matchPostagRegexp(tokens[i + 1], NOME_MP) && mArtigoOMP.matches()) || (matchPostagRegexp(relevantWords2.get(token), ADJETIVO_FP) && matchPostagRegexp(tokens[i + 1], NOME_FP) && mArtigoOFP.matches()))) {
                replacement = relevantWords2.get(token).getToken();
            }
        }
        // Report the token's span with the chosen accented form as the only suggestion.
        if (replacement != null) {
            final String msg = "Se é um nome ou um adjectivo, tem acento.";
            final RuleMatch ruleMatch = new RuleMatch(this, tokens[i].getStartPos(), tokens[i].getEndPos(), msg, "Falta um acento");
            ruleMatch.setSuggestedReplacement(replacement);
            ruleMatches.add(ruleMatch);
        }
    }
    return toRuleMatchArray(ruleMatches);
}
Also used : Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 70 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

the class BretonTagger method tag.

// Nearly identical to the 'tag' method of the BaseTagger class. The one
// difference: when a word cannot be found in the dictionary, the lookup
// is retried without the suffixes -mañ, -se, -hont.
@Override
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) throws IOException {
    final List<AnalyzedTokenReadings> result = new ArrayList<>();
    int offset = 0;
    for (String word : sentenceTokens) {
        // Form currently being looked up; shortened each time a suffix is stripped.
        String candidate = word;
        List<AnalyzedToken> readings = null;
        // Normally runs once; repeats only when a lookup failed and a suffix could be removed.
        while (readings == null) {
            final List<AnalyzedToken> collected = new ArrayList<>();
            final String lowerCandidate = candidate.toLowerCase(conversionLocale);
            // Readings are always created for the original word, whatever form was probed.
            final List<AnalyzedToken> exactHits =
                    asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(candidate));
            final List<AnalyzedToken> lowerHits =
                    asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(lowerCandidate));
            final boolean alreadyLowercase = candidate.equals(lowerCandidate);

            // Form as written.
            addTokens(exactHits, collected);
            // Lowercased form, only if it differs from the form as written.
            if (!alreadyLowercase) {
                addTokens(lowerHits, collected);
            }

            if (lowerHits.isEmpty() && exactHits.isEmpty()) {
                // Neither lookup produced anything: try the capitalized form.
                if (alreadyLowercase) {
                    final List<AnalyzedToken> upperHits = asAnalyzedTokenListForTaggedWords(
                            word, getWordTagger().tag(StringTools.uppercaseFirstChar(candidate)));
                    if (!upperHits.isEmpty()) {
                        addTokens(upperHits, collected);
                    }
                }
                if (collected.isEmpty()) {
                    final Matcher suffixMatcher = patternSuffix.matcher(candidate);
                    if (suffixMatcher.find()) {
                        // E.g. for "xxx-mañ", probe the dictionary again with "xxx".
                        candidate = suffixMatcher.group(1);
                        continue;
                    }
                    // Nothing left to strip: emit an untagged reading for the word.
                    collected.add(new AnalyzedToken(word, null, null));
                }
            }
            readings = collected;
        }
        result.add(new AnalyzedTokenReadings(readings, offset));
        offset += word.length();
    }
    return result;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Aggregations

AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)116 AnalyzedToken (org.languagetool.AnalyzedToken)48 ArrayList (java.util.ArrayList)47 AnalyzedSentence (org.languagetool.AnalyzedSentence)21 Test (org.junit.Test)16 RuleMatch (org.languagetool.rules.RuleMatch)14 Matcher (java.util.regex.Matcher)13 IOException (java.io.IOException)7 Nullable (org.jetbrains.annotations.Nullable)6 JLanguageTool (org.languagetool.JLanguageTool)6 Pattern (java.util.regex.Pattern)5 ChunkTag (org.languagetool.chunking.ChunkTag)5 English (org.languagetool.language.English)3 TaggedWord (org.languagetool.tagging.TaggedWord)3 InputStream (java.io.InputStream)2 HashMap (java.util.HashMap)2 List (java.util.List)2 Scanner (java.util.Scanner)2 TreeSet (java.util.TreeSet)2 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)2