Search in sources :

Example 56 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

The following snippet is the method match of the class AccentuationCheckRule.

/**
 * Accentuation check: flags words spelled like an unaccented verb form that, in
 * the given context, should instead be an accented noun (looked up in
 * {@code relevantWords}) or an accented adjective ({@code relevantWords2}).
 * The comments and example phrases are Catalan -- e.g. "renuncia" (verb form)
 * vs. "renúncia" (noun) -- so this presumably is the Catalan rule; confirm
 * against the enclosing class.
 *
 * @param sentence the analyzed sentence to check
 * @return one match per word that appears to be missing an accent (possibly empty)
 */
@Override
public RuleMatch[] match(final AnalyzedSentence sentence) {
    final List<RuleMatch> ruleMatches = new ArrayList<>();
    final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    for (int i = 1; i < tokens.length; i++) {
        // ignoring token 0, i.e. SENT_START
        final String token;
        // Lowercase only the sentence-initial word so it can be found in the
        // (lowercase-keyed) word maps; later words are looked up as written.
        if (i == 1) {
            token = tokens[i].getToken().toLowerCase();
        } else {
            token = tokens[i].getToken();
        }
        // Context window: empty string when the neighbor is out of range
        // (prevPrevToken additionally stays "" at i == 2, i.e. when it would be SENT_START).
        final String prevToken = tokens[i - 1].getToken();
        String prevPrevToken = "";
        if (i > 2) {
            prevPrevToken = tokens[i - 2].getToken();
        }
        String nextToken = "";
        if (i < tokens.length - 1) {
            nextToken = tokens[i + 1].getToken();
        }
        String nextNextToken = "";
        if (i < tokens.length - 2) {
            nextNextToken = tokens[i + 2].getToken();
        }
        boolean isRelevantWord = false;
        boolean isRelevantWord2 = false;
        if (StringTools.isEmpty(token)) {
            continue;
        }
        // In the noun word list (unaccented verb form -> accented noun)?
        if (relevantWords.containsKey(token)) {
            isRelevantWord = true;
        }
        // In the adjective word list (unaccented verb form -> accented adjective)?
        if (relevantWords2.containsKey(token)) {
            isRelevantWord2 = true;
        }
        if (!isRelevantWord && !isRelevantWord2) {
            continue;
        }
        // A detached weak pronoun ("pronom feble") right before the word signals a
        // verb use, so no accent is needed; apostrophe-/hyphen-attached forms don't count.
        if (matchPostagRegexp(tokens[i - 1], PRONOM_FEBLE) && !prevToken.startsWith("'") && !prevToken.startsWith("-")) {
            continue;
        }
        String replacement = null;
        // Pre-built matchers over the context tokens, reused by several branches below.
        final Matcher mPreposicioDE = PREPOSICIO_DE.matcher(nextToken);
        final Matcher mExcepcionsDE = EXCEPCIONS_DARRERE_DE.matcher(nextNextToken);
        final Matcher mArticleELMS = ARTICLE_EL_MS.matcher(prevToken);
        final Matcher mArticleELFS = ARTICLE_EL_FS.matcher(prevToken);
        final Matcher mArticleELMP = ARTICLE_EL_MP.matcher(prevToken);
        final Matcher mArticleELFP = ARTICLE_EL_FP.matcher(prevToken);
        // VERB WITHOUT ACCENT -> NOUN WITH ACCENT
        if (isRelevantWord && !matchPostagRegexp(tokens[i], GN) && !matchPostagRegexp(tokens[i], LOCUCIONS)) {
            // e.g. "amb renuncies": a preposition directly before the word suggests a noun
            if (tokens[i - 1].hasPosTag("SPS00") && !tokens[i - 1].hasPosTag("RG") && !matchPostagRegexp(tokens[i - 1], DETERMINANT) && !matchPostagRegexp(tokens[i], INFINITIU)) {
                replacement = relevantWords.get(token).getToken();
            } else if (i > 2 && tokens[i - 2].hasPosTag("SPS00") && !tokens[i - 2].hasPosTag("RG") && !matchPostagRegexp(tokens[i - 2], DETERMINANT) && (matchPostagRegexp(tokens[i - 1], DETERMINANT) || mArticleELMS.matches() || mArticleELFS.matches() || mArticleELMP.matches() || mArticleELFP.matches()) && !matchPostagRegexp(tokens[i], INFINITIU)) {
                // preposition + determiner/article + word, e.g. "amb la renuncia"
                replacement = relevantWords.get(token).getToken();
            } else // e.g. "aquestes renuncies": determiner agreeing in gender/number with the noun reading
            if (((matchPostagRegexp(tokens[i - 1], DETERMINANT_MS) && matchPostagRegexp(relevantWords.get(token), NOM_MS) && !token.equals("cantar")) || (matchPostagRegexp(tokens[i - 1], DETERMINANT_MP) && matchPostagRegexp(relevantWords.get(token), NOM_MP)) || (matchPostagRegexp(tokens[i - 1], DETERMINANT_FS) && matchPostagRegexp(relevantWords.get(token), NOM_FS) && !token.equals("venia") && !token.equals("tenia") && !token.equals("continua") && !token.equals("genera") && !token.equals("faria")) || (matchPostagRegexp(tokens[i - 1], DETERMINANT_FP) && matchPostagRegexp(relevantWords.get(token), NOM_FP)))) {
                // NOTE: the hard-coded word exclusions ("venia", "tenia", ...) are
                // ambiguous forms where the verb reading is common; kept as-is.
                replacement = relevantWords.get(token).getToken();
            } else // e.g. "fumaré una faria" (correct: fària): conjugated verb + agreeing determiner + word
            if (i > 2 && matchPostagRegexp(tokens[i - 2], VERB_CONJUGAT) && ((matchPostagRegexp(tokens[i - 1], DETERMINANT_MS) && matchPostagRegexp(relevantWords.get(token), NOM_MS)) || (matchPostagRegexp(tokens[i - 1], DETERMINANT_MP) && matchPostagRegexp(relevantWords.get(token), NOM_MP)) || (matchPostagRegexp(tokens[i - 1], DETERMINANT_FS) && matchPostagRegexp(relevantWords.get(token), NOM_FS)) || (matchPostagRegexp(tokens[i - 1], DETERMINANT_FP) && matchPostagRegexp(relevantWords.get(token), NOM_FP)))) {
                replacement = relevantWords.get(token).getToken();
            } else // e.g. "fem la copia" (correct: còpia): conjugated verb + agreeing article + word
            if (i > 2 && matchPostagRegexp(tokens[i - 2], VERB_CONJUGAT) && ((mArticleELMS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MS)) || (mArticleELMP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MP)) || (mArticleELFS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FS)) || (mArticleELFP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FP)))) {
                replacement = relevantWords.get(token).getToken();
            } else // e.g. "circumstancies d'una altra classe": word followed by the preposition "de"
            if (!matchPostagRegexp(tokens[i], PARTICIPI_MS) && !token.equals("venia") && !token.equals("venies") && !token.equals("tenia") && !token.equals("tenies") && !token.equals("faria") && !token.equals("faries") && !token.equals("espero") && !token.equals("continua") && !token.equals("continues") && !token.equals("cantar") && !prevToken.equals("que") && !prevToken.equals("qui") && !prevToken.equals("què") && mPreposicioDE.matches() && !matchPostagRegexp(tokens[i - 1], NOT_IN_PREV_TOKEN) && !matchPostagRegexp(tokens[i + 1], LOCUCIONS) && (i < tokens.length - 2) && !matchPostagRegexp(tokens[i + 2], INFINITIU) && !mExcepcionsDE.matches() && !tokens[i - 1].hasPosTag("RG")) {
                // tokens[i + 1] is safe here: mPreposicioDE only matches a non-empty nextToken
                replacement = relevantWords.get(token).getToken();
            } else // e.g. "la renuncia del president": agreeing article + word + "de"
            if (!token.equals("venia") && !token.equals("venies") && !token.equals("tenia") && !token.equals("tenies") && !token.equals("faria") && !token.equals("faries") && !token.equals("continua") && !token.equals("continues") && !token.equals("cantar") && !token.equals("diferencia") && !token.equals("diferencies") && !token.equals("distancia") && !token.equals("distancies") && ((mArticleELMS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MS)) || (mArticleELFS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FS)) || (mArticleELMP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MP)) || (mArticleELFP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FP))) && mPreposicioDE.matches()) {
                replacement = relevantWords.get(token).getToken();
            } else // e.g. "circumstancies extraordinàries": noun reading followed by an agreeing adjective
            if (!token.equals("pronuncia") && !token.equals("espero") && !token.equals("pronuncies") && !token.equals("venia") && !token.equals("venies") && !token.equals("tenia") && !token.equals("tenies") && !token.equals("continua") && !token.equals("continues") && !token.equals("faria") && !token.equals("faries") && !token.equals("genera") && !token.equals("figuri") && (i < tokens.length - 1) && ((matchPostagRegexp(relevantWords.get(token), NOM_MS) && matchPostagRegexp(tokens[i + 1], ADJECTIU_MS)) || (matchPostagRegexp(relevantWords.get(token), NOM_FS) && matchPostagRegexp(tokens[i + 1], ADJECTIU_FS)) || (matchPostagRegexp(relevantWords.get(token), NOM_MP) && matchPostagRegexp(tokens[i + 1], ADJECTIU_MP)) || (matchPostagRegexp(relevantWords.get(token), NOM_FP) && matchPostagRegexp(tokens[i + 1], ADJECTIU_FP)))) {
                replacement = relevantWords.get(token).getToken();
            } else // e.g. "les seves contraries": agreeing adjective before the noun reading
            if ((matchPostagRegexp(relevantWords.get(token), NOM_MS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MS) && !matchPostagRegexp(tokens[i], VERB_3S) && !matchPostagRegexp(tokens[i], GRUP_VERBAL)) || (matchPostagRegexp(relevantWords.get(token), NOM_FS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FS) && !matchPostagRegexp(tokens[i], VERB_3S)) || (matchPostagRegexp(relevantWords.get(token), NOM_MP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MP)) || (matchPostagRegexp(relevantWords.get(token), NOM_FP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FP))) {
                replacement = relevantWords.get(token).getToken();
            } else // e.g. "una nova formula que" (correct: fórmula): determiner + adjective + word + "que"
            if (nextToken.equals("que") && i > 2 && ((matchPostagRegexp(relevantWords.get(token), NOM_MS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MS) && matchPostagRegexp(tokens[i - 2], DETERMINANT_MS)) || (matchPostagRegexp(relevantWords.get(token), NOM_FS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FS) && matchPostagRegexp(tokens[i - 2], DETERMINANT_FS)) || (matchPostagRegexp(relevantWords.get(token), NOM_MP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MP) && matchPostagRegexp(tokens[i - 2], DETERMINANT_MP)) || (matchPostagRegexp(relevantWords.get(token), NOM_FP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FP) && matchPostagRegexp(tokens[i - 2], DETERMINANT_FP)))) {
                replacement = relevantWords.get(token).getToken();
            } else // e.g. "les circumstancies que ens envolten": agreeing article + word + "que"
            if (nextToken.equals("que") && ((mArticleELMS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MS)) || (mArticleELFS.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FS)) || (mArticleELMP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_MP)) || (mArticleELFP.matches() && matchPostagRegexp(relevantWords.get(token), NOM_FP)))) {
                replacement = relevantWords.get(token).getToken();
            }
            // e.g. "de positiva influencia": preposition + agreeing adjective + word.
            // NOTE(review): this is a separate if, not part of the else-chain above, so it
            // can also fire after a branch already set replacement (same value either way).
            if (!token.equals("pronuncia") && !token.equals("espero") && !token.equals("pronuncies") && !token.equals("venia") && !token.equals("venies") && !token.equals("tenia") && !token.equals("tenies") && !token.equals("continua") && !token.equals("continues") && !token.equals("faria") && !token.equals("faries") && !token.equals("genera") && !token.equals("figuri") && i > 2 && tokens[i - 2].hasPosTag("SPS00") && !tokens[i - 2].hasPosTag("RG") && ((matchPostagRegexp(relevantWords.get(token), NOM_MS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MS)) || (matchPostagRegexp(relevantWords.get(token), NOM_FS) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FS)) || (matchPostagRegexp(relevantWords.get(token), NOM_MP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_MP)) || (matchPostagRegexp(relevantWords.get(token), NOM_FP) && matchPostagRegexp(tokens[i - 1], ADJECTIU_FP)))) {
                replacement = relevantWords.get(token).getToken();
            }
        }
        // VERB WITHOUT ACCENT -> ADJECTIVE WITH ACCENT
        if (isRelevantWord2 && !matchPostagRegexp(tokens[i], GN) && !matchPostagRegexp(tokens[i], LOCUCIONS)) {
            // e.g. "de manera obvia" (correct: òbvia), "circumstàncies extraordinaries":
            // adjective reading agreeing with a preceding noun, or after "de manera/forma"
            if ((matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MS) && matchPostagRegexp(tokens[i - 1], NOM_MS) && !tokens[i - 1].hasPosTag("_GN_FS") && matchPostagRegexp(tokens[i], VERB_CONJUGAT) && !matchPostagRegexp(tokens[i], VERB_3S)) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FS) && prevPrevToken.equalsIgnoreCase("de") && (prevToken.equals("manera") || prevToken.equals("forma"))) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MP) && matchPostagRegexp(tokens[i - 1], NOM_MP)) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FP) && matchPostagRegexp(tokens[i - 1], NOM_FP))) {
                replacement = relevantWords2.get(token).getToken();
            } else // e.g. "de continua disputa" (correct: contínua): adjective before an agreeing noun
            if ((i < tokens.length - 1) && !prevToken.equals("que") && !matchPostagRegexp(tokens[i - 1], NOT_IN_PREV_TOKEN) && ((matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MS) && matchPostagRegexp(tokens[i + 1], NOM_MS) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_MS)) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FS) && matchPostagRegexp(tokens[i + 1], NOM_FS) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_FS)) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MP) && matchPostagRegexp(tokens[i + 1], NOM_MP) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_MP)) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FP) && matchPostagRegexp(tokens[i + 1], NOM_FP) && matchPostagRegexp(tokens[i - 1], BEFORE_ADJECTIVE_FP)))) {
                replacement = relevantWords2.get(token).getToken();
            } else // e.g. "la magnifica conservació" (correct: magnífica): article + adjective + agreeing noun
            if ((i < tokens.length - 1) && ((matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MS) && matchPostagRegexp(tokens[i + 1], NOM_MS) && mArticleELMS.matches()) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FS) && matchPostagRegexp(tokens[i + 1], NOM_FS) && mArticleELFS.matches()) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_MP) && matchPostagRegexp(tokens[i + 1], NOM_MP) && mArticleELMP.matches()) || (matchPostagRegexp(relevantWords2.get(token), ADJECTIU_FP) && matchPostagRegexp(tokens[i + 1], NOM_FP) && mArticleELFP.matches()))) {
                replacement = relevantWords2.get(token).getToken();
            }
        }
        // Any branch that fired supplies the accented form as the suggestion.
        if (replacement != null) {
            final String msg = "Si és un nom o un adjectiu, ha de portar accent.";
            final RuleMatch ruleMatch = new RuleMatch(this, tokens[i].getStartPos(), tokens[i].getEndPos(), msg, "Falta un accent");
            ruleMatch.setSuggestedReplacement(replacement);
            ruleMatches.add(ruleMatch);
        }
    }
    return toRuleMatchArray(ruleMatches);
}
Also used : RuleMatch(org.languagetool.rules.RuleMatch) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 57 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

The following snippet is the method loadWords of the class AccentuationDataLoader.

/**
 * Reads an accentuation word list from the rules directory and builds a map from
 * surface form to an analyzed reading.
 *
 * <p>Each meaningful line must consist of exactly three semicolon-separated
 * fields; blank lines and lines starting with {@code #} are skipped. The first
 * field is the map key; the second and third feed the {@code AnalyzedToken}
 * constructor (presumably token text and POS tag -- confirm against the data
 * files), with a {@code null} lemma.
 *
 * @param path path of the word file, relative to the rules directory
 * @return map from surface form to its {@code AnalyzedTokenReadings}
 * @throws RuntimeException if a line does not have exactly 3 fields
 */
Map<String, AnalyzedTokenReadings> loadWords(String path) {
    final Map<String, AnalyzedTokenReadings> wordMap = new HashMap<>();
    final InputStream stream = JLanguageTool.getDataBroker().getFromRulesDirAsStream(path);
    try (Scanner lineScanner = new Scanner(stream, FILE_ENCODING)) {
        while (lineScanner.hasNextLine()) {
            final String entry = lineScanner.nextLine().trim();
            // Only process non-blank, non-comment lines.
            if (!entry.isEmpty() && entry.charAt(0) != '#') {
                final String[] fields = entry.split(";");
                if (fields.length != 3) {
                    throw new RuntimeException("Format error in file " + path + ", line: " + entry + ", " + "expected 3 semicolon-separated parts, got " + fields.length);
                }
                final AnalyzedToken reading = new AnalyzedToken(fields[1], fields[2], null);
                wordMap.put(fields[0], new AnalyzedTokenReadings(reading, 0));
            }
        }
    }
    return wordMap;
}
Also used : Scanner(java.util.Scanner) AnalyzedToken(org.languagetool.AnalyzedToken) HashMap(java.util.HashMap) InputStream(java.io.InputStream) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 58 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

The following snippet is the method assertFullChunks of the class GermanChunkerTest.

/**
 * Analyzes the plain text extracted from {@code input}, runs the chunker over
 * the tokens, and asserts that the resulting chunk tags equal the expectations
 * encoded in {@code input}.
 *
 * @param input annotated test sentence carrying the expected chunks
 * @throws Exception if sentence analysis fails
 */
private void assertFullChunks(String input) throws Exception {
    String plainInput = getPlainInput(input);
    AnalyzedSentence analyzedSentence = lt.getAnalyzedSentence(plainInput);
    AnalyzedTokenReadings[] tokenReadings = analyzedSentence.getTokensWithoutWhitespace();
    chunker.addChunkTags(Arrays.asList(tokenReadings));
    List<String> expectedChunks = getExpectedChunks(input);
    List<ChunkTaggedToken> taggedTokens = new ArrayList<>();
    // Start at 1: position 0 is the artificial SENT_START token, which the
    // original logic skipped via an if (i > 0) guard.
    for (int pos = 1; pos < tokenReadings.length; pos++) {
        AnalyzedTokenReadings readings = tokenReadings[pos];
        taggedTokens.add(new ChunkTaggedToken(readings.getToken(), readings.getChunkTags(), readings));
    }
    assertChunks(input, plainInput, taggedTokens, expectedChunks);
}
Also used : AnalyzedSentence(org.languagetool.AnalyzedSentence) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 59 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

The following snippet is the method test of the class TokenPredicateTest.

@Test
public void test() {
    // One token carrying two chunk tags, a POS tag, and a lemma; every predicate
    // syntax supported by the matcher is exercised against it below.
    List<ChunkTag> tags = Arrays.asList(new ChunkTag("CHUNK1"), new ChunkTag("CHUNK2"));
    AnalyzedTokenReadings tokenReadings = new AnalyzedTokenReadings(new AnalyzedToken("mytoken", "MYPOS", "mylemma"), 0);
    ChunkTaggedToken taggedToken = new ChunkTaggedToken("mytoken", tags, tokenReadings);
    // Bare string and explicit string= match on the token text.
    assertMatch("mytoken", taggedToken);
    assertNoMatch("mytoken2", taggedToken);
    assertMatch("string=mytoken", taggedToken);
    assertNoMatch("string=mytoken2", taggedToken);
    // regex= matches the token text against a regular expression.
    assertMatch("regex=my[abct]oken", taggedToken);
    assertNoMatch("regex=my[abc]oken", taggedToken);
    // chunk= matches any of the token's chunk tags.
    assertMatch("chunk=CHUNK1", taggedToken);
    assertMatch("chunk=CHUNK2", taggedToken);
    assertNoMatch("chunk=OTHERCHUNK", taggedToken);
    // pos= matches the POS tag exactly; posre= matches it by regex.
    assertMatch("pos=MYPOS", taggedToken);
    assertNoMatch("pos=OTHER", taggedToken);
    assertMatch("posre=M.POS", taggedToken);
    assertNoMatch("posre=O.HER", taggedToken);
    // An unknown predicate name must be rejected with a RuntimeException.
    try {
        assertNoMatch("invalid=token", taggedToken);
        fail();
    } catch (RuntimeException expected) {
        // expected: unknown predicate names are an error
    }
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) Test(org.junit.Test)

Example 60 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

The following snippet is the method isSpecialCase of the class CaseRule.

/**
 * Special case for German casing: "im Allgemeinen gilt" (adverbial, fine) vs.
 * "im Allgemeinen Fall" -- the case only applies when the following token has
 * no noun reading.
 *
 * @param i index of the current token in {@code tokens}
 * @param tokens the sentence tokens without whitespace
 * @return true if tokens[i] is "Allgemeinen" preceded by "im" (case-insensitive)
 *         and not followed by a token with a noun reading
 */
private boolean isSpecialCase(int i, AnalyzedTokenReadings[] tokens) {
    if (!"Allgemeinen".equals(tokens[i].getToken())) {
        return false;
    }
    // i > 1 skips the artificial SENT_START position at index 0.
    String previous = i > 1 ? tokens[i - 1].getToken() : "";
    if (!"im".equalsIgnoreCase(previous)) {
        return false;
    }
    AnalyzedTokenReadings following = i < tokens.length - 1 ? tokens[i + 1] : null;
    return !hasNounReading(following);
}
Also used : AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Aggregations

AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)116 AnalyzedToken (org.languagetool.AnalyzedToken)48 ArrayList (java.util.ArrayList)47 AnalyzedSentence (org.languagetool.AnalyzedSentence)21 Test (org.junit.Test)16 RuleMatch (org.languagetool.rules.RuleMatch)14 Matcher (java.util.regex.Matcher)13 IOException (java.io.IOException)7 Nullable (org.jetbrains.annotations.Nullable)6 JLanguageTool (org.languagetool.JLanguageTool)6 Pattern (java.util.regex.Pattern)5 ChunkTag (org.languagetool.chunking.ChunkTag)5 English (org.languagetool.language.English)3 TaggedWord (org.languagetool.tagging.TaggedWord)3 InputStream (java.io.InputStream)2 HashMap (java.util.HashMap)2 List (java.util.List)2 Scanner (java.util.Scanner)2 TreeSet (java.util.TreeSet)2 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)2