Search in sources:

Example 71 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

Source: the `tag` method of the `CatalanTagger` class.

/**
 * Tags each token of a sentence with its analyzed readings.
 *
 * @param sentenceTokens the tokens of one sentence, in order
 * @return one {@code AnalyzedTokenReadings} per input token
 * @throws IOException if the dictionary cannot be read
 */
@Override
public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) throws IOException {
    final IStemmer stemmer = new DictionaryLookup(getDictionary());
    final List<AnalyzedTokenReadings> result = new ArrayList<>();
    int startPos = 0;
    for (String word : sentenceTokens) {
        // Hack: normalize the typographic apostrophe (’) to the typewriter
        // one (') so all rules and dictionary entries keep working; remember
        // whether the word already used the typewriter apostrophe.
        boolean hadTypewriterApostrophe = false;
        if (word.length() > 1) {
            hadTypewriterApostrophe = word.contains("'");
            word = word.replace("’", "'");
        }
        final List<AnalyzedToken> readings = new ArrayList<>();
        final String lowercased = word.toLowerCase(conversionLocale);
        final boolean wordIsLowercase = word.equals(lowercased);
        final boolean wordIsMixedCase = StringTools.isMixedCase(word);
        // normal case:
        addTokens(asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(word)), readings);
        // word with lowercase word tags:
        if (!wordIsLowercase && !wordIsMixedCase) {
            addTokens(asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(lowercased)), readings);
        }
        // additional tagging with prefixes
        if (readings.isEmpty() && !wordIsMixedCase) {
            addTokens(additionalTags(word, stemmer), readings);
        }
        // unknown word: keep it with a null reading
        if (readings.isEmpty()) {
            readings.add(new AnalyzedToken(word, null, null));
        }
        final AnalyzedTokenReadings atr = new AnalyzedTokenReadings(readings, startPos);
        if (hadTypewriterApostrophe) {
            // mark the token so rules can react to the apostrophe style
            final List<ChunkTag> chunkTags = new ArrayList<>();
            chunkTags.add(new ChunkTag("containsTypewriterApostrophe"));
            atr.setChunkTags(chunkTags);
        }
        result.add(atr);
        startPos += word.length();
    }
    return result;
}
Also used : ChunkTag(org.languagetool.chunking.ChunkTag) AnalyzedToken(org.languagetool.AnalyzedToken) IStemmer(morfologik.stemming.IStemmer) ArrayList(java.util.ArrayList) DictionaryLookup(morfologik.stemming.DictionaryLookup) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 72 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

Source: the `match` method of the `ReplaceOperationNamesRule` class.

/**
 * Flags tokens listed in {@code wrongWords} (looked up on the singular form)
 * and suggests the configured replacement lemmas, synthesizing plural forms
 * when the original token ends in "s".
 *
 * @param sentence the analyzed sentence to check
 * @return the rule matches found, possibly empty
 */
@Override
public final RuleMatch[] match(final AnalyzedSentence sentence) {
    final List<RuleMatch> ruleMatches = new ArrayList<>();
    final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    for (int i = 1; i < tokens.length; i++) {
        String token = tokens[i].getToken().toLowerCase();
        // strip a plural -s so the lookup is done on the singular form
        if (token.length() > 3 && token.endsWith("s")) {
            token = token.substring(0, token.length() - 1);
        }
        final List<String> replacementLemmas = wrongWords.get(token);
        if (replacementLemmas == null) {
            continue;
        }
        // exceptions
        if (token.equals("duplicat") && tokens[i - 1].getToken().equalsIgnoreCase("per")) {
            continue;
        }
        // Assecat el braç del riu
        if (i + 1 < tokens.length
                && matchPostagRegexp(tokens[i - 1], PUNTUACIO)
                && matchPostagRegexp(tokens[i + 1], DETERMINANT)) {
            continue;
        }
        // relevant token: skip tokens inside a verb group
        if (tokens[i].hasPosTag("_GV_")) {
            continue;
        }
        // next token
        if (i + 1 < tokens.length
                && (tokens[i + 1].hasLemma("per") || tokens[i + 1].hasLemma("com")
                        || tokens[i + 1].hasLemma("des") || tokens[i + 1].hasLemma("amb")
                        || matchPostagRegexp(tokens[i + 1], NextToken_POS_Excep))) {
            continue;
        }
        // prev token
        if (!matchPostagRegexp(tokens[i - 1], PrevToken_POS)
                || matchPostagRegexp(tokens[i - 1], PrevToken_POS_Excep)) {
            continue;
        }
        final List<String> possibleReplacements = new ArrayList<>();
        if (!tokens[i].getToken().toLowerCase().endsWith("s")) {
            possibleReplacements.addAll(replacementLemmas);
        } else {
            //synthesize plural
            for (String replacementLemma : replacementLemmas) {
                final String[] synthesized;
                try {
                    synthesized = synth.synthesize(new AnalyzedToken(replacementLemma, "NCMS000", replacementLemma), "NC.P.*");
                } catch (IOException e) {
                    throw new RuntimeException("Could not synthesize: " + replacementLemma + " with tag NC.P.*.", e);
                }
                possibleReplacements.addAll(Arrays.asList(synthesized));
            }
        }
        if (!possibleReplacements.isEmpty()) {
            ruleMatches.add(createRuleMatch(tokens[i], possibleReplacements));
        }
    }
    return toRuleMatchArray(ruleMatches);
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList) ArrayList(java.util.ArrayList) List(java.util.List) IOException(java.io.IOException) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 73 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

Source: the `match` method of the `SimpleReplaceVerbsRule` class.

/**
 * Detects incorrect first-conjugation Catalan verb forms (infinitive in -ar
 * listed in {@code wrongWords}) and suggests corrections synthesized from the
 * configured replacement infinitives in the same person/tense as the token.
 *
 * @param sentence the analyzed sentence to check
 * @return the rule matches found, possibly empty
 */
@Override
public final RuleMatch[] match(final AnalyzedSentence sentence) {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    for (AnalyzedTokenReadings tokenReadings : tokens) {
        String originalTokenStr = tokenReadings.getToken();
        // optionally skip words the tagger already recognizes
        if (ignoreTaggedWords && tokenReadings.isTagged()) {
            continue;
        }
        String tokenString = originalTokenStr.toLowerCase(getLocale());
        AnalyzedTokenReadings analyzedTokenReadings = null;
        String infinitive = null;
        // Try both first-conjugation ending patterns until one both matches
        // and resolves to an infinitive listed in wrongWords.
        int i = 0;
        while (i < 2 && analyzedTokenReadings == null) {
            Matcher m;
            if (i == 0) {
                m = desinencies_1conj_0.matcher(tokenString);
            } else {
                m = desinencies_1conj_1.matcher(tokenString);
            }
            if (m.matches()) {
                // group 1 = stem (lexeme), group 2 = ending (desinence)
                String lexeme = m.group(1);
                String desinence = m.group(2);
                // Undo the orthographic stem alternations that occur before
                // endings starting with a front vowel (e/é/i/ï):
                // c->ç, qu->c, g->j, gü->gu, gu->g — so that stem + "ar"
                // reconstructs the infinitive spelling.
                if (desinence.startsWith("e") || desinence.startsWith("é") || desinence.startsWith("i") || desinence.startsWith("ï")) {
                    if (lexeme.endsWith("c")) {
                        lexeme = lexeme.substring(0, lexeme.length() - 1).concat("ç");
                    } else if (lexeme.endsWith("qu")) {
                        lexeme = lexeme.substring(0, lexeme.length() - 2).concat("c");
                    } else if (lexeme.endsWith("g")) {
                        lexeme = lexeme.substring(0, lexeme.length() - 1).concat("j");
                    } else if (lexeme.endsWith("gü")) {
                        lexeme = lexeme.substring(0, lexeme.length() - 2).concat("gu");
                    } else if (lexeme.endsWith("gu")) {
                        lexeme = lexeme.substring(0, lexeme.length() - 2).concat("g");
                    }
                }
                // normalize the diaeresis so the model form below is taggable
                if (desinence.startsWith("ï")) {
                    desinence = "i" + desinence.substring(1, desinence.length());
                }
                infinitive = lexeme.concat("ar");
                if (wrongWords.containsKey(infinitive)) {
                    // Tag a regular model verb ("cant" + same ending) to obtain
                    // the POS readings of this form for synthesis below.
                    List<String> wordAsArray = Arrays.asList("cant".concat(desinence));
                    List<AnalyzedTokenReadings> analyzedTokenReadingsList = null;
                    try {
                        analyzedTokenReadingsList = tagger.tag(wordAsArray);
                    } catch (IOException e) {
                        throw new RuntimeException("Could not tag sentence: " + wordAsArray, e);
                    }
                    if (analyzedTokenReadingsList != null) {
                        analyzedTokenReadings = analyzedTokenReadingsList.get(0);
                    }
                }
            }
            i++;
        }
        // synthesize replacements
        if (analyzedTokenReadings != null) {
            List<String> possibleReplacements = new ArrayList<>();
            String[] synthesized = null;
            List<String> replacementInfinitives = wrongWords.get(infinitive);
            for (String replacementInfinitive : replacementInfinitives) {
                // entries in parentheses are used verbatim as suggestions
                if (replacementInfinitive.startsWith("(")) {
                    possibleReplacements.add(replacementInfinitive);
                } else {
                    // the first part
                    String[] parts = replacementInfinitive.split(" ");
                    // is the verb
                    AnalyzedToken infinitiveAsAnTkn = new AnalyzedToken(parts[0], "V.*", parts[0]);
                    // synthesize the replacement verb in every form the model
                    // token has, then re-append any trailing particles
                    for (AnalyzedToken analyzedToken : analyzedTokenReadings) {
                        try {
                            synthesized = synth.synthesize(infinitiveAsAnTkn, analyzedToken.getPOSTag());
                        } catch (IOException e) {
                            throw new RuntimeException("Could not synthesize: " + infinitiveAsAnTkn + " with tag " + analyzedToken.getPOSTag(), e);
                        }
                        for (String s : synthesized) {
                            for (int j = 1; j < parts.length; j++) {
                                s = s.concat(" ").concat(parts[j]);
                            }
                            if (!possibleReplacements.contains(s)) {
                                possibleReplacements.add(s);
                            }
                        }
                    }
                }
            }
            if (possibleReplacements.size() > 0) {
                RuleMatch potentialRuleMatch = createRuleMatch(tokenReadings, possibleReplacements);
                ruleMatches.add(potentialRuleMatch);
            }
        }
    }
    return toRuleMatchArray(ruleMatches);
}
Also used : Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) IOException(java.io.IOException) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) AnalyzedToken(org.languagetool.AnalyzedToken)

Example 74 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

Source: the `apply` method of the `TokenPredicate` class.

/**
 * Evaluates this predicate's expression against one chunk-tagged token.
 * Expressions have the form {@code type=value} (e.g. {@code pos=NN},
 * {@code regex='a.*'}); a bare value with no {@code =} is shorthand for a
 * string comparison. The value may optionally be wrapped in single quotes.
 *
 * @param analyzedToken the token to test
 * @return whether the token satisfies the expression
 * @throws RuntimeException if the expression type is not supported
 */
@Override
public boolean apply(ChunkTaggedToken analyzedToken) {
    // Split on the FIRST "=" only, so values that themselves contain "="
    // (e.g. regex='a=b') parse correctly instead of being rejected.
    String[] parts = getDescription().split("=", 2);
    String exprType;
    String exprValue;
    if (parts.length == 1) {
        // no "=": treat the whole description as a literal string match
        exprType = "string";
        exprValue = parts[0];
    } else {
        exprType = parts[0];
        exprValue = parts[1];
    }
    // strip optional single quotes around the value
    if (exprValue.startsWith("'") && exprValue.endsWith("'")) {
        exprValue = exprValue.substring(1, exprValue.length() - 1);
    }
    switch(exprType) {
        case "string":
            if (caseSensitive) {
                return analyzedToken.getToken().equals(exprValue);
            } else {
                return analyzedToken.getToken().equalsIgnoreCase(exprValue);
            }
        case "regex":
            // case sensitivity follows the predicate's caseSensitive flag
            Pattern p1 = caseSensitive ? Pattern.compile(exprValue) : Pattern.compile(exprValue, Pattern.CASE_INSENSITIVE);
            return p1.matcher(analyzedToken.getToken()).matches();
        case // case sensitive
        "regexCS":
            Pattern p2 = Pattern.compile(exprValue);
            return p2.matcher(analyzedToken.getToken()).matches();
        case "chunk":
            // true if any chunk tag of the token matches the pattern
            Pattern chunkPattern = Pattern.compile(exprValue);
            for (ChunkTag chunkTag : analyzedToken.getChunkTags()) {
                if (chunkPattern.matcher(chunkTag.getChunkTag()).matches()) {
                    return true;
                }
            }
            return false;
        case "pos":
            // substring match against any POS tag of the token's readings
            AnalyzedTokenReadings readings = analyzedToken.getReadings();
            if (readings != null) {
                for (AnalyzedToken token : readings) {
                    if (token.getPOSTag() != null && token.getPOSTag().contains(exprValue)) {
                        return true;
                    }
                }
            }
            return false;
        case "posre":
        case "posregex":
            // full regex match against any POS tag of the token's readings
            Pattern posPattern = Pattern.compile(exprValue);
            AnalyzedTokenReadings readings2 = analyzedToken.getReadings();
            if (readings2 != null) {
                for (AnalyzedToken token : readings2) {
                    if (token.getPOSTag() != null && posPattern.matcher(token.getPOSTag()).matches()) {
                        return true;
                    }
                }
            }
            return false;
        default:
            throw new RuntimeException("Expression type not supported: '" + exprType + "'");
    }
}
Also used : Pattern(java.util.regex.Pattern) AnalyzedToken(org.languagetool.AnalyzedToken) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 75 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

Source: the `match` method of the `AgreementRule` class.

/**
 * Checks German determiner–(adjective–)noun agreement. For each determiner or
 * relevant pronoun, compares it with the following noun — optionally with one
 * adjective/participle in between — and reports a mismatch. Several
 * heuristics below suppress known false-alarm patterns.
 *
 * @param sentence the analyzed sentence to check
 * @return the rule matches found, possibly empty
 */
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace();
    for (int i = 0; i < tokens.length; i++) {
        //defaulting to the first reading
        //TODO: check for all readings
        String posToken = tokens[i].getAnalyzedToken(0).getPOSTag();
        // skip the artificial sentence-start token
        if (posToken != null && posToken.equals(JLanguageTool.SENTENCE_START_TAGNAME)) {
            continue;
        }
        // immunized tokens (e.g. matched by an antipattern) are never flagged
        if (tokens[i].isImmunized()) {
            continue;
        }
        AnalyzedTokenReadings tokenReadings = tokens[i];
        boolean relevantPronoun = isRelevantPronoun(tokens, i);
        // ignore constructions that could be relative clauses
        boolean ignore = couldBeRelativeClause(tokens, i);
        if (i > 0) {
            String prevToken = tokens[i - 1].getToken().toLowerCase();
            // "der eine"/"die einen" etc.: "eine(n)" after a definite article
            // is not a determiner here
            if ((tokens[i].getToken().equals("eine") || tokens[i].getToken().equals("einen")) && (prevToken.equals("der") || prevToken.equals("die") || prevToken.equals("das") || prevToken.equals("des") || prevToken.equals("dieses"))) {
                // TODO: "der eine Polizist" -> nicht ignorieren, sondern "der polizist" checken; "auf der einen Seite"
                ignore = true;
            }
        }
        // avoid false alarm on "nichts Gutes" and "alles Gute"
        if (tokenReadings.getToken().equals("nichts") || tokenReadings.getToken().equals("alles") || tokenReadings.getToken().equals("dies")) {
            ignore = true;
        }
        // avoid false alarm on "Art. 1" and "bisherigen Art. 1" (Art. = Artikel):
        boolean detAbbrev = i < tokens.length - 2 && tokens[i + 1].getToken().equals("Art") && tokens[i + 2].getToken().equals(".");
        boolean detAdjAbbrev = i < tokens.length - 3 && tokens[i + 2].getToken().equals("Art") && tokens[i + 3].getToken().equals(".");
        // "einen Hochwasser führenden Fluss", "die Gott zugeschriebenen Eigenschaften":
        boolean followingParticiple = i < tokens.length - 3 && (tokens[i + 2].hasPartialPosTag("PA1") || tokens[i + 2].getToken().matches("zugeschriebenen?|genannten?"));
        if (detAbbrev || detAdjAbbrev || followingParticiple) {
            ignore = true;
        }
        if ((GermanHelper.hasReadingOfType(tokenReadings, POSType.DETERMINER) || relevantPronoun) && !ignore) {
            int tokenPos = i + 1;
            if (tokenPos >= tokens.length) {
                break;
            }
            AnalyzedTokenReadings nextToken = tokens[tokenPos];
            if (isNonPredicativeAdjective(nextToken) || isParticiple(nextToken)) {
                // determiner + adjective/participle + noun case
                tokenPos = i + 2;
                if (tokenPos >= tokens.length) {
                    break;
                }
                if (GermanHelper.hasReadingOfType(tokens[tokenPos], POSType.NOMEN)) {
                    // e.g. "deren komisches Geschenke" isn't yet detected as incorrect
                    if (i >= 2 && GermanHelper.hasReadingOfType(tokens[i - 2], POSType.ADJEKTIV) && "als".equals(tokens[i - 1].getToken()) && "das".equals(tokens[i].getToken())) {
                        // avoid false alarm for e.g. "weniger farbenprächtig als das anderer Papageien"
                        continue;
                    }
                    RuleMatch ruleMatch = checkDetAdjNounAgreement(tokens[i], nextToken, tokens[i + 2]);
                    if (ruleMatch != null) {
                        ruleMatches.add(ruleMatch);
                    }
                }
            } else if (GermanHelper.hasReadingOfType(nextToken, POSType.NOMEN) && !"Herr".equals(nextToken.getToken())) {
                // determiner + noun case ("Herr" excluded: "Herr Müller" etc.)
                RuleMatch ruleMatch = checkDetNounAgreement(tokens[i], tokens[i + 1]);
                if (ruleMatch != null) {
                    ruleMatches.add(ruleMatch);
                }
            }
        }
    }
    // for each token
    return toRuleMatchArray(ruleMatches);
}
Also used : AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Aggregations

AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)116 AnalyzedToken (org.languagetool.AnalyzedToken)48 ArrayList (java.util.ArrayList)47 AnalyzedSentence (org.languagetool.AnalyzedSentence)21 Test (org.junit.Test)16 RuleMatch (org.languagetool.rules.RuleMatch)14 Matcher (java.util.regex.Matcher)13 IOException (java.io.IOException)7 Nullable (org.jetbrains.annotations.Nullable)6 JLanguageTool (org.languagetool.JLanguageTool)6 Pattern (java.util.regex.Pattern)5 ChunkTag (org.languagetool.chunking.ChunkTag)5 English (org.languagetool.language.English)3 TaggedWord (org.languagetool.tagging.TaggedWord)3 InputStream (java.io.InputStream)2 HashMap (java.util.HashMap)2 List (java.util.List)2 Scanner (java.util.Scanner)2 TreeSet (java.util.TreeSet)2 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)2