Search in sources :

Example 56 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class ReplaceOperationNamesRule method match.

/**
 * Walks the non-whitespace tokens of the sentence, starting at index 1, and creates a
 * rule match for each token whose lower-cased form (minus one trailing "s" when the form
 * is longer than three characters) is a key of {@code wrongWords} and that is not
 * excluded by one of the context filters below.
 *
 * @param sentence the analyzed sentence to check
 * @return the collected matches, converted with {@code toRuleMatchArray}
 */
@Override
public final RuleMatch[] match(final AnalyzedSentence sentence) {
    final List<RuleMatch> matches = new ArrayList<>();
    final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    for (int idx = 1; idx < tokens.length; idx++) {
        final AnalyzedTokenReadings current = tokens[idx];
        final AnalyzedTokenReadings previous = tokens[idx - 1];
        final boolean hasNext = idx + 1 < tokens.length;

        final String lowered = current.getToken().toLowerCase();
        final boolean endsWithS = lowered.endsWith("s");
        // The lookup key drops a trailing "s" when the word is long enough.
        final String key = (lowered.length() > 3 && endsWithS)
                ? lowered.substring(0, lowered.length() - 1)
                : lowered;
        if (!wrongWords.containsKey(key)) {
            continue;
        }
        final List<String> lemmas = wrongWords.get(key);

        // Exception: "per duplicat"
        if (key.equals("duplicat") && previous.getToken().equalsIgnoreCase("per")) {
            continue;
        }
        // Exception: "Assecat el braç del riu"
        if (hasNext && matchPostagRegexp(previous, PUNTUACIO) && matchPostagRegexp(tokens[idx + 1], DETERMINANT)) {
            continue;
        }
        // The token itself carries the _GV_ tag.
        if (current.hasPosTag("_GV_")) {
            continue;
        }
        // Filter on the following token.
        if (hasNext) {
            final AnalyzedTokenReadings next = tokens[idx + 1];
            if (next.hasLemma("per") || next.hasLemma("com") || next.hasLemma("des") || next.hasLemma("amb")
                    || matchPostagRegexp(next, NextToken_POS_Excep)) {
                continue;
            }
        }
        // Filter on the preceding token: it must match PrevToken_POS and must not match the exception pattern.
        final boolean previousAccepted =
                matchPostagRegexp(previous, PrevToken_POS) && !matchPostagRegexp(previous, PrevToken_POS_Excep);
        if (!previousAccepted) {
            continue;
        }
        if (lemmas == null) {
            continue;
        }

        final List<String> suggestions = new ArrayList<>();
        if (endsWithS) {
            // The original token ends in "s": synthesize forms with the NC.P.* tag for every lemma.
            for (String lemma : lemmas) {
                final String[] plurals;
                try {
                    plurals = synth.synthesize(new AnalyzedToken(lemma, "NCMS000", lemma), "NC.P.*");
                } catch (IOException e) {
                    throw new RuntimeException("Could not synthesize: " + lemma + " with tag NC.P.*.", e);
                }
                suggestions.addAll(Arrays.asList(plurals));
            }
        } else {
            suggestions.addAll(lemmas);
        }
        if (!suggestions.isEmpty()) {
            matches.add(createRuleMatch(current, suggestions));
        }
    }
    return toRuleMatchArray(matches);
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList) List(java.util.List) IOException(java.io.IOException) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 57 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class SimpleReplaceVerbsRule method match.

/**
 * For each non-whitespace token, tries to split its lower-cased form into a lexeme and a
 * desinence using the two {@code desinencies_1conj_*} patterns, rebuilds an "-ar"
 * infinitive from the lexeme and, when that infinitive is a key of {@code wrongWords},
 * synthesizes the listed replacement verbs with the POS tags obtained by tagging
 * "cant" + desinence.
 *
 * @param sentence the analyzed sentence to check
 * @return the collected matches, converted with {@code toRuleMatchArray}
 */
@Override
public final RuleMatch[] match(final AnalyzedSentence sentence) {
    final List<RuleMatch> matches = new ArrayList<>();
    for (AnalyzedTokenReadings current : sentence.getTokensWithoutWhitespace()) {
        final String surface = current.getToken();
        if (ignoreTaggedWords && current.isTagged()) {
            continue;
        }
        final String lowered = surface.toLowerCase(getLocale());

        AnalyzedTokenReadings modelReadings = null;
        String infinitive = null;
        // Try the first pattern, then the second; stop as soon as one yields tagged readings.
        for (int attempt = 0; attempt < 2 && modelReadings == null; attempt++) {
            final Matcher m = (attempt == 0 ? desinencies_1conj_0 : desinencies_1conj_1).matcher(lowered);
            if (!m.matches()) {
                continue;
            }
            String lexeme = m.group(1);
            String desinence = m.group(2);

            final boolean startsWithEOrI = desinence.startsWith("e") || desinence.startsWith("é")
                    || desinence.startsWith("i") || desinence.startsWith("ï");
            if (startsWithEOrI) {
                // Undo the spelling change of the lexeme's final consonant before e/i desinences.
                final int len = lexeme.length();
                if (lexeme.endsWith("c")) {
                    lexeme = lexeme.substring(0, len - 1) + "ç";
                } else if (lexeme.endsWith("qu")) {
                    lexeme = lexeme.substring(0, len - 2) + "c";
                } else if (lexeme.endsWith("g")) {
                    lexeme = lexeme.substring(0, len - 1) + "j";
                } else if (lexeme.endsWith("gü")) {
                    lexeme = lexeme.substring(0, len - 2) + "gu";
                } else if (lexeme.endsWith("gu")) {
                    lexeme = lexeme.substring(0, len - 2) + "g";
                }
            }
            if (desinence.startsWith("ï")) {
                desinence = "i" + desinence.substring(1);
            }

            infinitive = lexeme + "ar";
            if (!wrongWords.containsKey(infinitive)) {
                continue;
            }
            // Tag "cant" + desinence to obtain the POS tags used for synthesis below.
            final List<String> modelWord = Arrays.asList("cant" + desinence);
            final List<AnalyzedTokenReadings> tagged;
            try {
                tagged = tagger.tag(modelWord);
            } catch (IOException e) {
                throw new RuntimeException("Could not tag sentence: " + modelWord, e);
            }
            if (tagged != null) {
                modelReadings = tagged.get(0);
            }
        }

        if (modelReadings == null) {
            continue;
        }

        // Synthesize the replacements.
        final List<String> suggestions = new ArrayList<>();
        for (String candidate : wrongWords.get(infinitive)) {
            if (candidate.startsWith("(")) {
                // Entries starting with "(" are added as they are.
                suggestions.add(candidate);
                continue;
            }
            // The first word is the verb; any further words are appended unchanged.
            final String[] words = candidate.split(" ");
            final AnalyzedToken verb = new AnalyzedToken(words[0], "V.*", words[0]);
            for (AnalyzedToken reading : modelReadings) {
                final String[] forms;
                try {
                    forms = synth.synthesize(verb, reading.getPOSTag());
                } catch (IOException e) {
                    throw new RuntimeException("Could not synthesize: " + verb + " with tag " + reading.getPOSTag(), e);
                }
                for (String form : forms) {
                    String suggestion = form;
                    for (int k = 1; k < words.length; k++) {
                        suggestion = suggestion.concat(" ").concat(words[k]);
                    }
                    if (!suggestions.contains(suggestion)) {
                        suggestions.add(suggestion);
                    }
                }
            }
        }
        if (!suggestions.isEmpty()) {
            matches.add(createRuleMatch(current, suggestions));
        }
    }
    return toRuleMatchArray(matches);
}
Also used : Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) IOException(java.io.IOException) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) AnalyzedToken(org.languagetool.AnalyzedToken)

Example 58 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class TokenPredicate method apply.

/**
 * Evaluates this predicate's expression, as returned by {@code getDescription()},
 * against the given token.
 *
 * <p>The expression is split on {@code '='}. A single part is treated as a
 * {@code string} expression; two parts are read as {@code type=value}. One pair of
 * enclosing single quotes around the value is removed. Supported types are
 * {@code string}, {@code regex}, {@code regexCS}, {@code chunk}, {@code pos},
 * {@code posre} and {@code posregex}.
 *
 * @param analyzedToken the token to test
 * @return {@code true} if the token satisfies the expression
 * @throws RuntimeException if splitting yields more than two parts or the expression
 *         type is not supported
 */
@Override
public boolean apply(ChunkTaggedToken analyzedToken) {
    String[] parts = getDescription().split("=");
    String exprType;
    String exprValue;
    if (parts.length == 1) {
        exprType = "string";
        exprValue = parts[0];
    } else if (parts.length == 2) {
        exprType = parts[0];
        exprValue = parts[1];
    } else {
        throw new RuntimeException("Could not parse expression: " + getDescription());
    }
    // Strip one pair of enclosing single quotes. The length check keeps a value that is
    // just "'" (which both starts and ends with a quote) from reaching substring(1, 0),
    // which would throw StringIndexOutOfBoundsException.
    if (exprValue.length() >= 2 && exprValue.startsWith("'") && exprValue.endsWith("'")) {
        exprValue = exprValue.substring(1, exprValue.length() - 1);
    }
    switch(exprType) {
        case "string":
            // Literal comparison with the token text; case handling follows caseSensitive.
            if (caseSensitive) {
                return analyzedToken.getToken().equals(exprValue);
            } else {
                return analyzedToken.getToken().equalsIgnoreCase(exprValue);
            }
        case "regex":
            // Full-match regex on the token text; case handling follows caseSensitive.
            Pattern p1 = caseSensitive ? Pattern.compile(exprValue) : Pattern.compile(exprValue, Pattern.CASE_INSENSITIVE);
            return p1.matcher(analyzedToken.getToken()).matches();
        case "regexCS":
            // Full-match regex on the token text, always case sensitive.
            Pattern p2 = Pattern.compile(exprValue);
            return p2.matcher(analyzedToken.getToken()).matches();
        case "chunk":
            // True if any chunk tag of the token fully matches the regex.
            Pattern chunkPattern = Pattern.compile(exprValue);
            for (ChunkTag chunkTag : analyzedToken.getChunkTags()) {
                if (chunkPattern.matcher(chunkTag.getChunkTag()).matches()) {
                    return true;
                }
            }
            return false;
        case "pos":
            // True if any reading's POS tag contains the value as a substring.
            AnalyzedTokenReadings readings = analyzedToken.getReadings();
            if (readings != null) {
                for (AnalyzedToken token : readings) {
                    if (token.getPOSTag() != null && token.getPOSTag().contains(exprValue)) {
                        return true;
                    }
                }
            }
            return false;
        case "posre":
        case "posregex":
            // True if any reading's POS tag fully matches the regex.
            Pattern posPattern = Pattern.compile(exprValue);
            AnalyzedTokenReadings readings2 = analyzedToken.getReadings();
            if (readings2 != null) {
                for (AnalyzedToken token : readings2) {
                    if (token.getPOSTag() != null && posPattern.matcher(token.getPOSTag()).matches()) {
                        return true;
                    }
                }
            }
            return false;
        default:
            throw new RuntimeException("Expression type not supported: '" + exprType + "'");
    }
}
Also used : Pattern(java.util.regex.Pattern) AnalyzedToken(org.languagetool.AnalyzedToken) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 59 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class MatchState method getTargetPosTag.

/**
 * Format POS tag using parameters already defined in the class.
 *
 * <p>Starts from {@code match.getPosTag()}. For a static lemma, the readings of
 * {@code matchedToken} are scanned and the last POS tag that fully matches the POS regex
 * becomes the target; the regex replacement is then applied when both the regex and the
 * replacement string are set. Otherwise the readings of {@code formattedToken} are
 * scanned, and when both the regex and the replacement are set, every matching tag (or
 * the current target tag if none matched) is rewritten and the results are joined with
 * {@code '|'}.
 *
 * <p>A {@code null} POS regex is treated as "no reading matches", so the scan is skipped
 * instead of failing with a {@code NullPointerException}.
 *
 * @return Formatted POS tag as String.
 */
// FIXME: gets only the first POS tag that matches, this can be wrong
// on the other hand, many POS tags = too many suggestions?
public final String getTargetPosTag() {
    String targetPosTag = match.getPosTag();
    Pattern pPosRegexMatch = match.getPosRegexMatch();
    String posTagReplace = match.getPosTagReplace();
    if (match.isStaticLemma()) {
        for (AnalyzedToken analyzedToken : matchedToken) {
            String tst = analyzedToken.getPOSTag();
            // Null guard on the pattern: the code after the loop already allows for it being null.
            if (tst != null && pPosRegexMatch != null && pPosRegexMatch.matcher(tst).matches()) {
                targetPosTag = tst;
            }
        }
        if (pPosRegexMatch != null && posTagReplace != null) {
            targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll(posTagReplace);
        }
    } else {
        List<String> posTags = new ArrayList<>();
        for (AnalyzedToken analyzedToken : formattedToken) {
            String tst = analyzedToken.getPOSTag();
            if (tst != null && pPosRegexMatch != null && pPosRegexMatch.matcher(tst).matches()) {
                targetPosTag = tst;
                posTags.add(targetPosTag);
            }
        }
        if (pPosRegexMatch != null && posTagReplace != null) {
            if (posTags.isEmpty()) {
                // Nothing matched: rewrite the current target tag on its own.
                posTags.add(targetPosTag);
            }
            StringBuilder sb = new StringBuilder();
            int posTagLen = posTags.size();
            int l = 0;
            for (String lPosTag : posTags) {
                l++;
                lPosTag = pPosRegexMatch.matcher(lPosTag).replaceAll(posTagReplace);
                if (match.setsPos()) {
                    lPosTag = synthesizer.getPosTagCorrection(lPosTag);
                }
                sb.append(lPosTag);
                // '|' goes between entries only, not after the last one.
                if (l < posTagLen) {
                    sb.append('|');
                }
            }
            targetPosTag = sb.toString();
        }
    }
    return targetPosTag;
}
Also used : Pattern(java.util.regex.Pattern) AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList)

Example 60 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class MatchState method getNewToken.

/**
 * Builds one new {@code AnalyzedToken} for each of the first {@code numRead} readings of
 * {@code formattedToken} that has a non-null POS tag. Every new token uses the given
 * token text and the POS tag from {@code match.getPosTag()}, and copies the
 * whitespace-before flag of {@code formattedToken}.
 *
 * <p>The lemma comes from the most recent reading whose POS tag equals the match's POS
 * tag and whose lemma is non-null. While no such lemma has been found, the lemma of the
 * first reading is used. The chosen lemma carries over to later iterations.
 *
 * @param numRead number of readings of {@code formattedToken} to inspect
 * @param token token text for the new tokens
 * @return the newly created tokens, possibly empty
 */
private List<AnalyzedToken> getNewToken(int numRead, String token) {
    final String wantedPosTag = match.getPosTag();
    final List<AnalyzedToken> result = new ArrayList<>();
    String lemma = "";
    for (int idx = 0; idx < numRead; idx++) {
        final AnalyzedToken reading = formattedToken.getAnalyzedToken(idx);
        final String readingPosTag = reading.getPOSTag();
        if (readingPosTag == null) {
            continue;
        }
        if (readingPosTag.equals(wantedPosTag) && reading.getLemma() != null) {
            lemma = reading.getLemma();
        }
        if (StringTools.isEmpty(lemma)) {
            // No usable lemma yet: fall back to the first reading's lemma.
            lemma = formattedToken.getAnalyzedToken(0).getLemma();
        }
        final AnalyzedToken created = new AnalyzedToken(token, wantedPosTag, lemma);
        created.setWhitespaceBefore(formattedToken.isWhitespaceBefore());
        result.add(created);
    }
    return result;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList)

Aggregations

AnalyzedToken (org.languagetool.AnalyzedToken): 89, AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings): 48, ArrayList (java.util.ArrayList): 43, Matcher (java.util.regex.Matcher): 16, Test (org.junit.Test): 16, IOException (java.io.IOException): 9, Pattern (java.util.regex.Pattern): 7, Nullable (org.jetbrains.annotations.Nullable): 6, TaggedWord (org.languagetool.tagging.TaggedWord): 6, RuleMatch (org.languagetool.rules.RuleMatch): 4, Synthesizer (org.languagetool.synthesis.Synthesizer): 4, InputStream (java.io.InputStream): 2, HashMap (java.util.HashMap): 2, LinkedHashSet (java.util.LinkedHashSet): 2, Scanner (java.util.Scanner): 2, TreeSet (java.util.TreeSet): 2, DictionaryLookup (morfologik.stemming.DictionaryLookup): 2, IStemmer (morfologik.stemming.IStemmer): 2, AnalyzedSentence (org.languagetool.AnalyzedSentence): 2, ChunkTag (org.languagetool.chunking.ChunkTag): 2