Search in sources :

Example 16 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

The class Unifier, method checkNext.

/**
 * Checks whether {@code aToken} unifies with the equivalences already matched for
 * the collected tokens, once feature collection is complete ({@code allFeatsIn}).
 * On success, the token is appended to {@code tokSequence} (or added as another
 * reading of the current position) and the matched equivalences are recorded.
 *
 * @param aToken    the token (a single reading) to test against the stored equivalences
 * @param uFeatures map of feature name to its types; an empty/null type list means
 *                  "all types registered for that feature"
 * @return true if at least one stored token has all requested features unified with {@code aToken}
 */
private boolean checkNext(AnalyzedToken aToken, Map<String, List<String>> uFeatures) {
    boolean anyFeatUnified = false;
    // Work on a copy so tmpFeaturesFound is only replaced if unification succeeds.
    List<Boolean> tokenFeaturesFound = new ArrayList<>(tmpFeaturesFound);
    // Equivalences matched for this particular reading only (as opposed to equivalencesToBeKept).
    Map<String, Set<String>> equivalencesMatchedHere = new ConcurrentHashMap<>();
    if (allFeatsIn) {
        // Test aToken against the equivalences recorded for every previously seen token.
        for (int i = 0; i < tokCnt; i++) {
            boolean allFeatsUnified = true;
            for (Map.Entry<String, List<String>> feat : uFeatures.entrySet()) {
                boolean featUnified = false;
                List<String> types = feat.getValue();
                if (types == null || types.isEmpty()) {
                    // No explicit types requested: fall back to all types registered for this feature.
                    types = equivalenceFeatures.get(feat.getKey());
                }
                for (String typeName : types) {
                    // Only consider types that token i already matched during collection.
                    if (equivalencesMatched.get(i).containsKey(feat.getKey()) && equivalencesMatched.get(i).get(feat.getKey()).contains(typeName)) {
                        PatternToken testElem = equivalenceTypes.get(new EquivalenceTypeLocator(feat.getKey(), typeName));
                        boolean matched = testElem.isMatched(aToken);
                        // A feature is unified if any of its types matches.
                        featUnified = featUnified || matched;
                        // Store equivalences to be kept (globally and for this reading).
                        if (matched) {
                            if (!equivalencesToBeKept.containsKey(feat.getKey())) {
                                Set<String> typeSet = new HashSet<>();
                                typeSet.add(typeName);
                                equivalencesToBeKept.put(feat.getKey(), typeSet);
                            } else {
                                equivalencesToBeKept.get(feat.getKey()).add(typeName);
                            }
                            if (!equivalencesMatchedHere.containsKey(feat.getKey())) {
                                // just for this reading
                                Set<String> typeSet = new HashSet<>();
                                typeSet.add(typeName);
                                equivalencesMatchedHere.put(feat.getKey(), typeSet);
                            } else {
                                equivalencesMatchedHere.get(feat.getKey()).add(typeName);
                            }
                        }
                    }
                }
                // All requested features must unify for token i to count.
                allFeatsUnified &= featUnified;
            }
            tokenFeaturesFound.set(i, tokenFeaturesFound.get(i) || allFeatsUnified);
            anyFeatUnified = anyFeatUnified || allFeatsUnified;
        }
        if (anyFeatUnified) {
            if (tokSequence.size() == readingsCounter) {
                // First reading at this sequence position: start a new entry.
                tokSequence.add(new AnalyzedTokenReadings(aToken, 0));
                List<Map<String, Set<String>>> equivList = new ArrayList<>();
                equivList.add(equivalencesMatchedHere);
                tokSequenceEquivalences.add(equivList);
            } else {
                if (readingsCounter < tokSequence.size()) {
                    // Additional reading of an existing position.
                    tokSequence.get(readingsCounter).addReading(aToken);
                    tokSequenceEquivalences.get(readingsCounter).add(equivalencesMatchedHere);
                } else {
                    // readingsCounter points past the sequence — treat as failure.
                    anyFeatUnified = false;
                }
            }
            // Commit the updated per-token flags only on success.
            tmpFeaturesFound = tokenFeaturesFound;
        }
    }
    return anyFeatUnified;
}
Also used : AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap)

Example 17 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

The class Unifier, method isSatisfied.

/**
   * Tests if a token has shared features with other tokens.
   * 
   * @param aToken token to be tested
   * @param uFeatures features to be tested
   * @return true if the token shares this type of feature with other tokens
   */
/**
 * Tests if a token has shared features with other tokens.
 * <p>
 * In the collection phase (before {@code allFeatsIn} is set) this records which
 * equivalence types {@code aToken} matches and appends it to the token sequence;
 * afterwards it delegates to {@code checkNext} to verify unification.
 *
 * @param aToken token to be tested
 * @param uFeatures features to be tested; must not be null
 * @return true if the token shares this type of feature with other tokens
 * @throws RuntimeException if {@code uFeatures} is null
 */
protected final boolean isSatisfied(AnalyzedToken aToken, Map<String, List<String>> uFeatures) {
    // Collection finished but nothing was matched: no unification possible.
    if (allFeatsIn && equivalencesMatched.isEmpty()) {
        return false;
    }
    if (uFeatures == null) {
        throw new RuntimeException("isSatisfied called without features being set");
    }
    unificationFeats = uFeatures;
    boolean unified = true;
    if (allFeatsIn) {
        // All features collected: just check the new token against them.
        unified = checkNext(aToken, uFeatures);
    } else {
        // Ensure a per-token map exists up to index tokCnt.
        while (equivalencesMatched.size() <= tokCnt) {
            equivalencesMatched.add(new ConcurrentHashMap<>());
        }
        for (Map.Entry<String, List<String>> feat : uFeatures.entrySet()) {
            List<String> types = feat.getValue();
            if (types == null || types.isEmpty()) {
                // No explicit types requested: use all types registered for this feature.
                types = equivalenceFeatures.get(feat.getKey());
            }
            for (String typeName : types) {
                PatternToken testElem = equivalenceTypes.get(new EquivalenceTypeLocator(feat.getKey(), typeName));
                if (testElem == null) {
                    // Unknown feature/type combination — cannot unify.
                    return false;
                }
                if (testElem.isMatched(aToken)) {
                    // Record this matched type for the current token.
                    if (!equivalencesMatched.get(tokCnt).containsKey(feat.getKey())) {
                        Set<String> typeSet = new HashSet<>();
                        typeSet.add(typeName);
                        equivalencesMatched.get(tokCnt).put(feat.getKey(), typeSet);
                    } else {
                        equivalencesMatched.get(tokCnt).get(feat.getKey()).add(typeName);
                    }
                }
            }
            // The token must match at least one type of every requested feature.
            unified = equivalencesMatched.get(tokCnt).containsKey(feat.getKey());
            if (!unified) {
                // Discard the partial record for this token.
                equivalencesMatched.remove(tokCnt);
                break;
            }
        }
        if (unified) {
            if (tokCnt == 0 || tokSequence.isEmpty()) {
                // First token: start the sequence.
                tokSequence.add(new AnalyzedTokenReadings(aToken, 0));
                List<Map<String, Set<String>>> equivList = new ArrayList<>();
                equivList.add(equivalencesMatched.get(tokCnt));
                tokSequenceEquivalences.add(equivList);
            } else {
                // Subsequent readings are attached to the first sequence entry.
                tokSequence.get(0).addReading(aToken);
                tokSequenceEquivalences.get(0).add(equivalencesMatched.get(tokCnt));
            }
            tokCnt++;
        }
    }
    return unified;
}
Also used : AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap)

Example 18 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

The class MultiWordChunker, method setAndAnnotate.

/**
 * Creates a copy of {@code oldReading} with {@code newReading} added as an extra
 * reading, carrying over whitespace flag and chunk tags, and records the change
 * in the historical annotations.
 *
 * @param oldReading the reading set to copy from
 * @param newReading the additional reading to attach
 * @return the new annotated {@link AnalyzedTokenReadings}
 */
private AnalyzedTokenReadings setAndAnnotate(AnalyzedTokenReadings oldReading, AnalyzedToken newReading) {
    // Capture the textual form and annotation history before building the copy.
    String previousForm = oldReading.toString();
    String previousAnnotations = oldReading.getHistoricalAnnotations();
    AnalyzedTokenReadings result = new AnalyzedTokenReadings(oldReading.getReadings(), oldReading.getStartPos());
    result.setWhitespaceBefore(oldReading.isWhitespaceBefore());
    result.addReading(newReading);
    // Log the old -> new transition so disambiguation can be traced.
    result.setHistoricalAnnotations(annotateToken(previousAnnotations, previousForm, result.toString()));
    result.setChunkTags(oldReading.getChunkTags());
    return result;
}
Also used : AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 19 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

The class DisambiguationPatternRuleReplacer, method executeAction.

/**
 * Applies the disambiguation action of the current rule to the matched token span
 * and returns a (cloned, possibly modified) copy of the token array.
 *
 * @param sentence        the analyzed sentence (used to map match positions to original positions)
 * @param whiteTokens     the sentence tokens including whitespace; cloned, never modified in place
 * @param unifiedTokens   tokens produced by unification, used by the UNIFY action (may be null)
 * @param firstMatchToken index of the first matched token
 * @param lastMatchToken  index of the last matched token, or -1 if none
 * @param matchingTokens  number of tokens the pattern matched
 * @param tokenPositions  per-pattern-element token counts of the match
 * @return the token array with the rule's action applied
 */
private AnalyzedTokenReadings[] executeAction(AnalyzedSentence sentence, AnalyzedTokenReadings[] whiteTokens, AnalyzedTokenReadings[] unifiedTokens, int firstMatchToken, int lastMatchToken, int matchingTokens, int[] tokenPositions) {
    AnalyzedTokenReadings[] whTokens = whiteTokens.clone();
    DisambiguationPatternRule rule = (DisambiguationPatternRule) this.rule;
    int correctedStPos = 0;
    int startPositionCorrection = rule.getStartPositionCorrection();
    int endPositionCorrection = rule.getEndPositionCorrection();
    int matchingTokensWithCorrection = matchingTokens;
    List<Integer> tokenPositionList = new ArrayList<>();
    for (int i : tokenPositions) {
        tokenPositionList.add(i);
    }
    // Shift the start of the affected span according to the rule's <marker> offset.
    if (startPositionCorrection > 0) {
        //token positions are shifted by 1
        correctedStPos--;
        for (int j = 0; j < pTokensMatched.size(); j++) {
            if (!pTokensMatched.get(j)) {
                // add zero-length token corresponding to the non-matching pattern element so that position count is fine
                tokenPositionList.add(j, 0);
            }
        }
        for (int l = 0; l <= startPositionCorrection && tokenPositionList.size() > l; l++) {
            correctedStPos += tokenPositionList.get(l);
        }
        // adjust to make sure the token count is fine as it's checked later
        int w = startPositionCorrection;
        for (int j = 0; j <= w; j++) {
            if (j < pTokensMatched.size() && !pTokensMatched.get(j)) {
                startPositionCorrection--;
            }
        }
    }
    if (endPositionCorrection < 0) {
        // adjust the end position correction if one of the elements has not been matched
        for (int d = startPositionCorrection; d < pTokensMatched.size(); d++) {
            if (!pTokensMatched.get(d)) {
                endPositionCorrection++;
            }
        }
    }
    if (lastMatchToken != -1) {
        // Widen the correction if the actual match span is longer than matchingTokens suggests.
        int maxPosCorrection = Math.max((lastMatchToken + 1 - (firstMatchToken + correctedStPos)) - matchingTokens, 0);
        matchingTokensWithCorrection += maxPosCorrection;
    }
    int fromPos = sentence.getOriginalPosition(firstMatchToken + correctedStPos);
    boolean spaceBefore = whTokens[fromPos].isWhitespaceBefore();
    DisambiguationPatternRule.DisambiguatorAction disAction = rule.getAction();
    AnalyzedToken[] newTokenReadings = rule.getNewTokenReadings();
    Match matchElement = rule.getMatchElement();
    String disambiguatedPOS = rule.getDisambiguatedPOS();
    switch(disAction) {
        case UNIFY:
            // Replace the matched tokens by their unified counterparts.
            if (unifiedTokens != null) {
                //TODO: unifiedTokens.length is larger > matchingTokensWithCorrection in cases where there are no markers...
                if (unifiedTokens.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
                    // Preserve the sentence-end flag if the last replaced token carried it.
                    if (whTokens[sentence.getOriginalPosition(firstMatchToken + correctedStPos + unifiedTokens.length - 1)].isSentenceEnd()) {
                        unifiedTokens[unifiedTokens.length - 1].setSentEnd();
                    }
                    for (int i = 0; i < unifiedTokens.length; i++) {
                        int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
                        unifiedTokens[i].setStartPos(whTokens[position].getStartPos());
                        String prevValue = whTokens[position].toString();
                        String prevAnot = whTokens[position].getHistoricalAnnotations();
                        // Keep the original chunk tags on the replacement token.
                        List<ChunkTag> chTags = whTokens[position].getChunkTags();
                        whTokens[position] = unifiedTokens[i];
                        whTokens[position].setChunkTags(chTags);
                        annotateChange(whTokens[position], prevValue, prevAnot);
                    }
                }
            }
            break;
        case REMOVE:
            // Remove explicit readings, or readings whose POS matches disambiguatedPOS.
            if (newTokenReadings != null && newTokenReadings.length > 0) {
                if (newTokenReadings.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
                    for (int i = 0; i < newTokenReadings.length; i++) {
                        int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
                        String prevValue = whTokens[position].toString();
                        String prevAnot = whTokens[position].getHistoricalAnnotations();
                        whTokens[position].removeReading(newTokenReadings[i]);
                        annotateChange(whTokens[position], prevValue, prevAnot);
                    }
                }
            } else if (!StringTools.isEmpty(disambiguatedPOS)) {
                // negative filtering
                Pattern p = Pattern.compile(disambiguatedPOS);
                AnalyzedTokenReadings tmp = new AnalyzedTokenReadings(whTokens[fromPos].getReadings(), whTokens[fromPos].getStartPos());
                // Iterate over a copy (tmp) because readings are removed while iterating.
                for (AnalyzedToken analyzedToken : tmp) {
                    if (analyzedToken.getPOSTag() != null) {
                        Matcher mPos = p.matcher(analyzedToken.getPOSTag());
                        if (mPos.matches()) {
                            int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos);
                            String prevValue = whTokens[position].toString();
                            String prevAnot = whTokens[position].getHistoricalAnnotations();
                            whTokens[position].removeReading(analyzedToken);
                            annotateChange(whTokens[position], prevValue, prevAnot);
                        }
                    }
                }
            }
            break;
        case ADD:
            // Add the rule's readings to the matched tokens; empty token/lemma fall back to existing values.
            if (newTokenReadings != null) {
                if (newTokenReadings.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
                    for (int i = 0; i < newTokenReadings.length; i++) {
                        String token;
                        int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
                        if (newTokenReadings[i].getToken().isEmpty()) {
                            token = whTokens[position].getToken();
                        } else {
                            token = newTokenReadings[i].getToken();
                        }
                        String lemma;
                        if (newTokenReadings[i].getLemma() == null) {
                            lemma = token;
                        } else {
                            lemma = newTokenReadings[i].getLemma();
                        }
                        AnalyzedToken newTok = new AnalyzedToken(token, newTokenReadings[i].getPOSTag(), lemma);
                        String prevValue = whTokens[position].toString();
                        String prevAnot = whTokens[position].getHistoricalAnnotations();
                        whTokens[position].addReading(newTok);
                        annotateChange(whTokens[position], prevValue, prevAnot);
                    }
                }
            }
            break;
        case FILTERALL:
            // Filter every matched token by the POS tag of its corresponding pattern element.
            for (int i = 0; i < matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection; i++) {
                int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
                PatternToken pToken;
                if (pTokensMatched.get(i + startPositionCorrection)) {
                    pToken = rule.getPatternTokens().get(i + startPositionCorrection);
                } else {
                    // Skip over pattern elements that did not match to find the next matched one.
                    int k = 1;
                    while (i + startPositionCorrection + k < rule.getPatternTokens().size() + endPositionCorrection && !pTokensMatched.get(i + startPositionCorrection + k)) {
                        k++;
                    }
                    pToken = rule.getPatternTokens().get(i + k + startPositionCorrection);
                }
                Match tmpMatchToken = new Match(pToken.getPOStag(), null, true, pToken.getPOStag(), null, Match.CaseConversion.NONE, false, false, Match.IncludeRange.NONE);
                MatchState matchState = tmpMatchToken.createState(rule.getLanguage().getSynthesizer(), whTokens[position]);
                String prevValue = whTokens[position].toString();
                String prevAnot = whTokens[position].getHistoricalAnnotations();
                whTokens[position] = matchState.filterReadings();
                annotateChange(whTokens[position], prevValue, prevAnot);
            }
            break;
        case IMMUNIZE:
            // Mark the matched tokens so other rules will not touch them.
            for (int i = 0; i < matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection; i++) {
                whTokens[sentence.getOriginalPosition(firstMatchToken + correctedStPos + i)].immunize();
            }
            break;
        case IGNORE_SPELLING:
            // Exempt the matched tokens from spell checking.
            for (int i = 0; i < matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection; i++) {
                whTokens[sentence.getOriginalPosition(firstMatchToken + correctedStPos + i)].ignoreSpelling();
            }
            break;
        case FILTER:
            if (matchElement == null) {
                // same as REPLACE if using <match>
                Match tmpMatchToken = new Match(disambiguatedPOS, null, true, disambiguatedPOS, null, Match.CaseConversion.NONE, false, false, Match.IncludeRange.NONE);
                boolean newPOSmatches = false;
                // only apply filter rule when it matches previous tags:
                for (int i = 0; i < whTokens[fromPos].getReadingsLength(); i++) {
                    if (!whTokens[fromPos].getAnalyzedToken(i).hasNoTag() && whTokens[fromPos].getAnalyzedToken(i).getPOSTag() != null && whTokens[fromPos].getAnalyzedToken(i).getPOSTag().matches(disambiguatedPOS)) {
                        newPOSmatches = true;
                        break;
                    }
                }
                if (newPOSmatches) {
                    MatchState matchState = tmpMatchToken.createState(rule.getLanguage().getSynthesizer(), whTokens[fromPos]);
                    String prevValue = whTokens[fromPos].toString();
                    String prevAnot = whTokens[fromPos].getHistoricalAnnotations();
                    whTokens[fromPos] = matchState.filterReadings();
                    annotateChange(whTokens[fromPos], prevValue, prevAnot);
                }
                break;
            }
        //fallthrough — FILTER with a <match> element behaves like REPLACE
        case REPLACE:
        default:
            if (newTokenReadings != null && newTokenReadings.length > 0) {
                // Replace each matched token with the rule's explicit readings.
                if (newTokenReadings.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
                    for (int i = 0; i < newTokenReadings.length; i++) {
                        String token;
                        int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
                        if ("".equals(newTokenReadings[i].getToken())) {
                            // empty token
                            token = whTokens[position].getToken();
                        } else {
                            token = newTokenReadings[i].getToken();
                        }
                        String lemma;
                        if (newTokenReadings[i].getLemma() == null) {
                            // empty lemma
                            lemma = token;
                        } else {
                            lemma = newTokenReadings[i].getLemma();
                        }
                        AnalyzedToken analyzedToken = new AnalyzedToken(token, newTokenReadings[i].getPOSTag(), lemma);
                        AnalyzedTokenReadings toReplace = new AnalyzedTokenReadings(analyzedToken, whTokens[fromPos].getStartPos());
                        whTokens[position] = replaceTokens(whTokens[position], toReplace);
                    }
                }
            } else if (matchElement == null) {
                // Replace with a single reading built from disambiguatedPOS; reuse a lemma
                // from an existing reading with that POS when available.
                String lemma = "";
                for (AnalyzedToken analyzedToken : whTokens[fromPos]) {
                    if (analyzedToken.getPOSTag() != null && analyzedToken.getPOSTag().equals(disambiguatedPOS) && analyzedToken.getLemma() != null) {
                        lemma = analyzedToken.getLemma();
                    }
                }
                if (StringTools.isEmpty(lemma)) {
                    lemma = whTokens[fromPos].getAnalyzedToken(0).getLemma();
                }
                AnalyzedToken analyzedToken = new AnalyzedToken(whTokens[fromPos].getToken(), disambiguatedPOS, lemma);
                AnalyzedTokenReadings toReplace = new AnalyzedTokenReadings(analyzedToken, whTokens[fromPos].getStartPos());
                whTokens[fromPos] = replaceTokens(whTokens[fromPos], toReplace);
            } else {
                // using the match element
                MatchState matchElementState = matchElement.createState(rule.getLanguage().getSynthesizer(), whTokens[fromPos]);
                String prevValue = whTokens[fromPos].toString();
                String prevAnot = whTokens[fromPos].getHistoricalAnnotations();
                whTokens[fromPos] = matchElementState.filterReadings();
                whTokens[fromPos].setWhitespaceBefore(spaceBefore);
                annotateChange(whTokens[fromPos], prevValue, prevAnot);
            }
    }
    return whTokens;
}
Also used : Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) ChunkTag(org.languagetool.chunking.ChunkTag) AnalyzedToken(org.languagetool.AnalyzedToken)

Example 20 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

The class GenericUnpairedBracketsRule, method match.

/**
 * Scans all sentences for pairing symbols (brackets, quotes, ...) and reports
 * every symbol left unpaired at the end of the text.
 *
 * @param sentences the analyzed sentences of the whole text
 * @return rule matches for each unpaired symbol found
 */
@Override
public final RuleMatch[] match(List<AnalyzedSentence> sentences) {
    // Stack of symbols still waiting for their closing counterpart.
    UnsyncStack<SymbolLocator> openSymbols = new UnsyncStack<>();
    UnsyncStack<SymbolLocator> matchStack = new UnsyncStack<>();
    List<RuleMatch> matches = new ArrayList<>();
    int offset = 0;
    for (AnalyzedSentence sentence : sentences) {
        AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
        // Token 0 is the sentence-start marker, hence the loop starts at 1.
        for (int tokenIdx = 1; tokenIdx < tokens.length; tokenIdx++) {
            for (int symbolIdx = 0; symbolIdx < startSymbols.length; symbolIdx++) {
                // Stop trying further symbols once one has been pushed/popped for this token.
                if (fillSymbolStack(offset, tokens, tokenIdx, symbolIdx, openSymbols)) {
                    break;
                }
            }
        }
        // Advance the character offset past this sentence (all tokens, incl. whitespace).
        for (AnalyzedTokenReadings reading : sentence.getTokens()) {
            offset += reading.getToken().length();
        }
    }
    // Whatever is still on the stack never found its counterpart.
    for (SymbolLocator locator : openSymbols) {
        RuleMatch unpairedMatch = createMatch(matches, matchStack, locator.getStartPos(), locator.getSymbol());
        if (unpairedMatch != null) {
            matches.add(unpairedMatch);
        }
    }
    return toRuleMatchArray(matches);
}
Also used : AnalyzedSentence(org.languagetool.AnalyzedSentence) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Aggregations

AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)116 AnalyzedToken (org.languagetool.AnalyzedToken)48 ArrayList (java.util.ArrayList)47 AnalyzedSentence (org.languagetool.AnalyzedSentence)21 Test (org.junit.Test)16 RuleMatch (org.languagetool.rules.RuleMatch)14 Matcher (java.util.regex.Matcher)13 IOException (java.io.IOException)7 Nullable (org.jetbrains.annotations.Nullable)6 JLanguageTool (org.languagetool.JLanguageTool)6 Pattern (java.util.regex.Pattern)5 ChunkTag (org.languagetool.chunking.ChunkTag)5 English (org.languagetool.language.English)3 TaggedWord (org.languagetool.tagging.TaggedWord)3 InputStream (java.io.InputStream)2 HashMap (java.util.HashMap)2 List (java.util.List)2 Scanner (java.util.Scanner)2 TreeSet (java.util.TreeSet)2 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)2