Search in sources :

Example 16 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

From the class MatchState, method toFinalString.

/**
   * Gets all strings formatted using the match element.
   * Applies, in order: an optional regex rewrite of the token text, POS-tag-driven
   * synthesis of word forms, case conversion mirroring the original token, optional
   * inclusion of skipped tokens, and an optional tagger-based spelling filter.
   */
public final String[] toFinalString(Language lang) throws IOException {
    // Default result: a single slot holding the (possibly rewritten) token text.
    String[] formattedString = new String[1];
    if (formattedToken != null) {
        int readingCount = formattedToken.getReadingsLength();
        formattedString[0] = formattedToken.getToken();
        Pattern pRegexMatch = match.getRegexMatch();
        String regexReplace = match.getRegexReplace();
        // Optional regex rewrite of the token text before any synthesis.
        if (pRegexMatch != null) {
            formattedString[0] = pRegexMatch.matcher(formattedString[0]).replaceAll(regexReplace);
        }
        String posTag = match.getPosTag();
        if (posTag != null) {
            if (synthesizer == null) {
                // No synthesizer for this language: keep the plain token text
                // (note this discards any regex rewrite applied above).
                formattedString[0] = formattedToken.getToken();
            } else if (match.isPostagRegexp()) {
                // POS tag is a regular expression: collect all matching word forms.
                TreeSet<String> wordForms = new TreeSet<>();
                // oneForm == true means the surface token itself is the only form to
                // use (lemma-less readings with no POS tag or only sentence/paragraph
                // boundary tags), so synthesis is skipped.
                boolean oneForm = false;
                for (int k = 0; k < readingCount; k++) {
                    if (formattedToken.getAnalyzedToken(k).getLemma() == null) {
                        String posUnique = formattedToken.getAnalyzedToken(k).getPOSTag();
                        if (posUnique == null) {
                            // Untagged, lemma-less reading: keep the surface token.
                            wordForms.add(formattedToken.getToken());
                            oneForm = true;
                        } else {
                            if (SENTENCE_START_TAGNAME.equals(posUnique) || SENTENCE_END_TAGNAME.equals(posUnique) || PARAGRAPH_END_TAGNAME.equals(posUnique)) {
                                if (!oneForm) {
                                    wordForms.add(formattedToken.getToken());
                                }
                                oneForm = true;
                            } else {
                                // A real POS tag exists, so synthesis should run.
                                // NOTE(review): this resets oneForm even if an earlier
                                // reading set it — the last lemma-less reading wins;
                                // confirm that is intended.
                                oneForm = false;
                            }
                        }
                    }
                }
                String targetPosTag = getTargetPosTag();
                if (!oneForm) {
                    for (int i = 0; i < readingCount; i++) {
                        // 'true' = interpret targetPosTag as a regular expression.
                        String[] possibleWordForms = synthesizer.synthesize(formattedToken.getAnalyzedToken(i), targetPosTag, true);
                        if (possibleWordForms != null) {
                            wordForms.addAll(Arrays.asList(possibleWordForms));
                        }
                    }
                }
                if (wordForms.isEmpty()) {
                    // Nothing synthesized: suppress the suggestion in spell-checking
                    // mode, otherwise mark the unknown form with parentheses.
                    if (match.checksSpelling()) {
                        formattedString[0] = "";
                    } else {
                        formattedString[0] = "(" + formattedToken.getToken() + ")";
                    }
                } else {
                    formattedString = wordForms.toArray(new String[wordForms.size()]);
                }
            } else {
                // Plain (non-regexp) POS tag: synthesize forms for every reading.
                TreeSet<String> wordForms = new TreeSet<>();
                for (int i = 0; i < readingCount; i++) {
                    String[] possibleWordForms = synthesizer.synthesize(formattedToken.getAnalyzedToken(i), posTag);
                    if (possibleWordForms != null) {
                        wordForms.addAll(Arrays.asList(possibleWordForms));
                    }
                }
                formattedString = wordForms.toArray(new String[wordForms.size()]);
            }
        }
    }
    // Pick the token whose casing should be mirrored onto every result string.
    String original;
    if (match.isStaticLemma()) {
        original = matchedToken != null ? matchedToken.getToken() : "";
    } else {
        original = formattedToken != null ? formattedToken.getToken() : "";
    }
    for (int i = 0; i < formattedString.length; i++) {
        formattedString[i] = convertCase(formattedString[i], original, lang);
    }
    // TODO should case conversion happen before or after including skipped tokens?
    IncludeRange includeSkipped = match.getIncludeSkipped();
    if (includeSkipped != IncludeRange.NONE && skippedTokens != null && !skippedTokens.isEmpty()) {
        String[] helper = new String[formattedString.length];
        for (int i = 0; i < formattedString.length; i++) {
            if (formattedString[i] == null) {
                formattedString[i] = "";
            }
            // Append the skipped-token text to every candidate form.
            helper[i] = formattedString[i] + skippedTokens;
        }
        formattedString = helper;
    }
    if (match.checksSpelling() && lang != null) {
        List<String> formattedStringElements = Arrays.asList(formattedString);
        // tagger-based speller: blank out forms the tagger does not recognize at all
        // (no lemma and no tag on the first reading)
        List<AnalyzedTokenReadings> analyzed = lang.getTagger().tag(formattedStringElements);
        for (int i = 0; i < formattedString.length; i++) {
            AnalyzedToken analyzedToken = analyzed.get(i).getAnalyzedToken(0);
            if (analyzedToken.getLemma() == null && analyzedToken.hasNoTag()) {
                formattedString[i] = "";
            }
        }
    }
    return formattedString;
}
Also used : IncludeRange(org.languagetool.rules.patterns.Match.IncludeRange) Pattern(java.util.regex.Pattern) AnalyzedToken(org.languagetool.AnalyzedToken) TreeSet(java.util.TreeSet) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 17 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

From the class DisambiguationPatternRuleReplacer, method executeAction.

/**
 * Applies this disambiguation rule's action (unify/remove/add/filter/replace/
 * immunize/ignore-spelling) to the matched token span and returns the updated
 * token array.
 */
private AnalyzedTokenReadings[] executeAction(AnalyzedSentence sentence, AnalyzedTokenReadings[] whiteTokens, AnalyzedTokenReadings[] unifiedTokens, int firstMatchToken, int lastMatchToken, int matchingTokens, int[] tokenPositions) {
    // Shallow copy: the array is fresh, but the AnalyzedTokenReadings elements are
    // shared with whiteTokens, so in-place reading edits below are visible through both.
    AnalyzedTokenReadings[] whTokens = whiteTokens.clone();
    DisambiguationPatternRule rule = (DisambiguationPatternRule) this.rule;
    int correctedStPos = 0;
    int startPositionCorrection = rule.getStartPositionCorrection();
    int endPositionCorrection = rule.getEndPositionCorrection();
    int matchingTokensWithCorrection = matchingTokens;
    List<Integer> tokenPositionList = new ArrayList<>();
    for (int i : tokenPositions) {
        tokenPositionList.add(i);
    }
    if (startPositionCorrection > 0) {
        //token positions are shifted by 1
        correctedStPos--;
        for (int j = 0; j < pTokensMatched.size(); j++) {
            if (!pTokensMatched.get(j)) {
                // add zero-length token corresponding to the non-matching pattern element so that position count is fine
                tokenPositionList.add(j, 0);
            }
        }
        // Accumulate the widths of the first startPositionCorrection+1 pattern slots.
        for (int l = 0; l <= startPositionCorrection && tokenPositionList.size() > l; l++) {
            correctedStPos += tokenPositionList.get(l);
        }
        // adjust to make sure the token count is fine as it's checked later
        int w = startPositionCorrection;
        for (int j = 0; j <= w; j++) {
            if (j < pTokensMatched.size() && !pTokensMatched.get(j)) {
                startPositionCorrection--;
            }
        }
    }
    if (endPositionCorrection < 0) {
        // adjust the end position correction if one of the elements has not been matched
        for (int d = startPositionCorrection; d < pTokensMatched.size(); d++) {
            if (!pTokensMatched.get(d)) {
                endPositionCorrection++;
            }
        }
    }
    if (lastMatchToken != -1) {
        // Widen the corrected token count if the raw match span is larger than
        // matchingTokens (never shrink: Math.max with 0).
        int maxPosCorrection = Math.max((lastMatchToken + 1 - (firstMatchToken + correctedStPos)) - matchingTokens, 0);
        matchingTokensWithCorrection += maxPosCorrection;
    }
    // Index (in original-sentence positions) of the first token the action applies to.
    int fromPos = sentence.getOriginalPosition(firstMatchToken + correctedStPos);
    boolean spaceBefore = whTokens[fromPos].isWhitespaceBefore();
    DisambiguationPatternRule.DisambiguatorAction disAction = rule.getAction();
    AnalyzedToken[] newTokenReadings = rule.getNewTokenReadings();
    Match matchElement = rule.getMatchElement();
    String disambiguatedPOS = rule.getDisambiguatedPOS();
    switch(disAction) {
        case UNIFY:
            // Replace each matched token with its unified counterpart, preserving
            // chunk tags and the sentence-end marker, and record the change.
            if (unifiedTokens != null) {
                //TODO: unifiedTokens.length is larger > matchingTokensWithCorrection in cases where there are no markers...
                if (unifiedTokens.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
                    if (whTokens[sentence.getOriginalPosition(firstMatchToken + correctedStPos + unifiedTokens.length - 1)].isSentenceEnd()) {
                        unifiedTokens[unifiedTokens.length - 1].setSentEnd();
                    }
                    for (int i = 0; i < unifiedTokens.length; i++) {
                        int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
                        unifiedTokens[i].setStartPos(whTokens[position].getStartPos());
                        String prevValue = whTokens[position].toString();
                        String prevAnot = whTokens[position].getHistoricalAnnotations();
                        List<ChunkTag> chTags = whTokens[position].getChunkTags();
                        whTokens[position] = unifiedTokens[i];
                        whTokens[position].setChunkTags(chTags);
                        annotateChange(whTokens[position], prevValue, prevAnot);
                    }
                }
            }
            break;
        case REMOVE:
            // Either remove the explicitly listed readings from each matched token,
            // or (when only a POS regex is given) strip every reading whose tag matches it.
            if (newTokenReadings != null && newTokenReadings.length > 0) {
                if (newTokenReadings.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
                    for (int i = 0; i < newTokenReadings.length; i++) {
                        int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
                        String prevValue = whTokens[position].toString();
                        String prevAnot = whTokens[position].getHistoricalAnnotations();
                        whTokens[position].removeReading(newTokenReadings[i]);
                        annotateChange(whTokens[position], prevValue, prevAnot);
                    }
                }
            } else if (!StringTools.isEmpty(disambiguatedPOS)) {
                // negative filtering
                Pattern p = Pattern.compile(disambiguatedPOS);
                // Iterate over a snapshot so removal from the live token is safe.
                AnalyzedTokenReadings tmp = new AnalyzedTokenReadings(whTokens[fromPos].getReadings(), whTokens[fromPos].getStartPos());
                for (AnalyzedToken analyzedToken : tmp) {
                    if (analyzedToken.getPOSTag() != null) {
                        Matcher mPos = p.matcher(analyzedToken.getPOSTag());
                        if (mPos.matches()) {
                            int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos);
                            String prevValue = whTokens[position].toString();
                            String prevAnot = whTokens[position].getHistoricalAnnotations();
                            whTokens[position].removeReading(analyzedToken);
                            annotateChange(whTokens[position], prevValue, prevAnot);
                        }
                    }
                }
            }
            break;
        case ADD:
            // Append the listed readings to each matched token; an empty token or a
            // null lemma in the rule means "reuse the existing token text".
            if (newTokenReadings != null) {
                if (newTokenReadings.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
                    for (int i = 0; i < newTokenReadings.length; i++) {
                        String token;
                        int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
                        if (newTokenReadings[i].getToken().isEmpty()) {
                            token = whTokens[position].getToken();
                        } else {
                            token = newTokenReadings[i].getToken();
                        }
                        String lemma;
                        if (newTokenReadings[i].getLemma() == null) {
                            lemma = token;
                        } else {
                            lemma = newTokenReadings[i].getLemma();
                        }
                        AnalyzedToken newTok = new AnalyzedToken(token, newTokenReadings[i].getPOSTag(), lemma);
                        String prevValue = whTokens[position].toString();
                        String prevAnot = whTokens[position].getHistoricalAnnotations();
                        whTokens[position].addReading(newTok);
                        annotateChange(whTokens[position], prevValue, prevAnot);
                    }
                }
            }
            break;
        case FILTERALL:
            // Filter every matched token by the POS tag of its corresponding pattern
            // element; for unmatched elements, use the next matched one.
            for (int i = 0; i < matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection; i++) {
                int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
                PatternToken pToken;
                if (pTokensMatched.get(i + startPositionCorrection)) {
                    pToken = rule.getPatternTokens().get(i + startPositionCorrection);
                } else {
                    // Skip forward to the next matched pattern element.
                    int k = 1;
                    while (i + startPositionCorrection + k < rule.getPatternTokens().size() + endPositionCorrection && !pTokensMatched.get(i + startPositionCorrection + k)) {
                        k++;
                    }
                    pToken = rule.getPatternTokens().get(i + k + startPositionCorrection);
                }
                Match tmpMatchToken = new Match(pToken.getPOStag(), null, true, pToken.getPOStag(), null, Match.CaseConversion.NONE, false, false, Match.IncludeRange.NONE);
                MatchState matchState = tmpMatchToken.createState(rule.getLanguage().getSynthesizer(), whTokens[position]);
                String prevValue = whTokens[position].toString();
                String prevAnot = whTokens[position].getHistoricalAnnotations();
                whTokens[position] = matchState.filterReadings();
                annotateChange(whTokens[position], prevValue, prevAnot);
            }
            break;
        case IMMUNIZE:
            // Mark the matched tokens as immune to further rule matches.
            for (int i = 0; i < matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection; i++) {
                whTokens[sentence.getOriginalPosition(firstMatchToken + correctedStPos + i)].immunize();
            }
            break;
        case IGNORE_SPELLING:
            // Exempt the matched tokens from spell checking.
            for (int i = 0; i < matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection; i++) {
                whTokens[sentence.getOriginalPosition(firstMatchToken + correctedStPos + i)].ignoreSpelling();
            }
            break;
        case FILTER:
            // Keep only readings whose POS tag matches disambiguatedPOS — but only
            // if at least one current reading already matches, so a filter never
            // empties a token. With a <match> element, fall through to REPLACE.
            if (matchElement == null) {
                // same as REPLACE if using <match>
                Match tmpMatchToken = new Match(disambiguatedPOS, null, true, disambiguatedPOS, null, Match.CaseConversion.NONE, false, false, Match.IncludeRange.NONE);
                boolean newPOSmatches = false;
                // only apply filter rule when it matches previous tags:
                for (int i = 0; i < whTokens[fromPos].getReadingsLength(); i++) {
                    if (!whTokens[fromPos].getAnalyzedToken(i).hasNoTag() && whTokens[fromPos].getAnalyzedToken(i).getPOSTag() != null && whTokens[fromPos].getAnalyzedToken(i).getPOSTag().matches(disambiguatedPOS)) {
                        newPOSmatches = true;
                        break;
                    }
                }
                if (newPOSmatches) {
                    MatchState matchState = tmpMatchToken.createState(rule.getLanguage().getSynthesizer(), whTokens[fromPos]);
                    String prevValue = whTokens[fromPos].toString();
                    String prevAnot = whTokens[fromPos].getHistoricalAnnotations();
                    whTokens[fromPos] = matchState.filterReadings();
                    annotateChange(whTokens[fromPos], prevValue, prevAnot);
                }
                break;
            }
        //fallthrough
        case REPLACE:
        default:
            // Three replacement modes: explicit new readings, a bare POS tag, or a
            // <match> element that synthesizes the replacement.
            if (newTokenReadings != null && newTokenReadings.length > 0) {
                if (newTokenReadings.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
                    for (int i = 0; i < newTokenReadings.length; i++) {
                        String token;
                        int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
                        if ("".equals(newTokenReadings[i].getToken())) {
                            // empty token
                            token = whTokens[position].getToken();
                        } else {
                            token = newTokenReadings[i].getToken();
                        }
                        String lemma;
                        if (newTokenReadings[i].getLemma() == null) {
                            // empty lemma
                            lemma = token;
                        } else {
                            lemma = newTokenReadings[i].getLemma();
                        }
                        AnalyzedToken analyzedToken = new AnalyzedToken(token, newTokenReadings[i].getPOSTag(), lemma);
                        AnalyzedTokenReadings toReplace = new AnalyzedTokenReadings(analyzedToken, whTokens[fromPos].getStartPos());
                        whTokens[position] = replaceTokens(whTokens[position], toReplace);
                    }
                }
            } else if (matchElement == null) {
                // Bare POS replacement: reuse the lemma of an existing reading with
                // the same tag, else fall back to the first reading's lemma.
                String lemma = "";
                for (AnalyzedToken analyzedToken : whTokens[fromPos]) {
                    if (analyzedToken.getPOSTag() != null && analyzedToken.getPOSTag().equals(disambiguatedPOS) && analyzedToken.getLemma() != null) {
                        lemma = analyzedToken.getLemma();
                    }
                }
                if (StringTools.isEmpty(lemma)) {
                    lemma = whTokens[fromPos].getAnalyzedToken(0).getLemma();
                }
                AnalyzedToken analyzedToken = new AnalyzedToken(whTokens[fromPos].getToken(), disambiguatedPOS, lemma);
                AnalyzedTokenReadings toReplace = new AnalyzedTokenReadings(analyzedToken, whTokens[fromPos].getStartPos());
                whTokens[fromPos] = replaceTokens(whTokens[fromPos], toReplace);
            } else {
                // using the match element
                MatchState matchElementState = matchElement.createState(rule.getLanguage().getSynthesizer(), whTokens[fromPos]);
                String prevValue = whTokens[fromPos].toString();
                String prevAnot = whTokens[fromPos].getHistoricalAnnotations();
                whTokens[fromPos] = matchElementState.filterReadings();
                whTokens[fromPos].setWhitespaceBefore(spaceBefore);
                annotateChange(whTokens[fromPos], prevValue, prevAnot);
            }
    }
    return whTokens;
}
Also used : Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) ChunkTag(org.languagetool.chunking.ChunkTag) AnalyzedToken(org.languagetool.AnalyzedToken)

Example 18 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

From the class PortugueseAccentuationDataLoader, method loadWords.

/**
 * Loads a semicolon-separated data file from the rules directory into a map
 * from surface form to a single-reading AnalyzedTokenReadings.
 */
Map<String, AnalyzedTokenReadings> loadWords(String path) {
    final Map<String, AnalyzedTokenReadings> entries = new HashMap<>();
    final InputStream stream = JLanguageTool.getDataBroker().getFromRulesDirAsStream(path);
    // Closing the Scanner also closes the underlying stream.
    try (Scanner scanner = new Scanner(stream, FILE_ENCODING)) {
        while (scanner.hasNextLine()) {
            final String row = scanner.nextLine().trim();
            // Skip blank lines and '#'-prefixed comments.
            if (row.isEmpty() || row.charAt(0) == '#') {
                continue;
            }
            final String[] fields = row.split(";");
            if (fields.length != 3) {
                throw new RuntimeException("Format error in file " + path + ", line: " + row + ", " + "expected 3 semicolon-separated parts, got " + fields.length);
            }
            // Columns: key ; token ; POS tag (lemma left unset).
            final AnalyzedToken reading = new AnalyzedToken(fields[1], fields[2], null);
            entries.put(fields[0], new AnalyzedTokenReadings(reading, 0));
        }
    }
    return entries;
}
Also used : Scanner(java.util.Scanner) AnalyzedToken(org.languagetool.AnalyzedToken) HashMap(java.util.HashMap) InputStream(java.io.InputStream) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 19 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

From the class RussianTagger, method tag.

@Override
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) throws IOException {
    // Stress/accent marks (combining acute and grave, plus the precomposed ѐ/ѝ)
    // are normalized away so accented spellings are tagged like plain ones; the
    // final pair maps the apostrophe-like modifier letter to a hard sign.
    final String[][] accentMap = {
        { "о́", "о" }, { "а́", "а" }, { "е́", "е" }, { "у́", "у" }, { "и́", "и" },
        { "ы́", "ы" }, { "э́", "э" }, { "ю́", "ю" }, { "я́", "я" },
        { "о̀", "о" }, { "а̀", "а" }, { "ѐ", "е" }, { "у̀", "у" }, { "ѝ", "и" },
        { "ы̀", "ы" }, { "э̀", "э" }, { "ю̀", "ю" }, { "я̀", "я" },
        { "ʼ", "ъ" }
    };
    final List<AnalyzedTokenReadings> result = new ArrayList<>();
    int offset = 0;
    for (String word : sentenceTokens) {
        // Single-character tokens (punctuation etc.) are left untouched.
        if (word.length() > 1) {
            for (String[] pair : accentMap) {
                word = pair[1].isEmpty() ? word : word.replace(pair[0], pair[1]);
            }
        }
        final List<AnalyzedToken> readings = getAnalyzedTokens(word);
        result.add(new AnalyzedTokenReadings(readings, offset));
        // Advance by the normalized word's length, matching the original behavior.
        offset += word.length();
    }
    return result;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 20 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

From the class RuleFilterEvaluatorTest, method testGetResolvedArguments.

@Test
public void testGetResolvedArguments() throws Exception {
    // Two fake readings to resolve the back-references \1 and \2 against.
    AnalyzedTokenReadings first = new AnalyzedTokenReadings(new AnalyzedToken("fake1", "pos", null), 0);
    AnalyzedTokenReadings second = new AnalyzedTokenReadings(new AnalyzedToken("fake2", "pos", null), 0);
    AnalyzedTokenReadings[] readings = { first, second };
    Map<String, String> resolved = eval.getResolvedArguments("year:\\1 month:\\2", readings, Arrays.asList(1, 1));
    assertThat(resolved.size(), is(2));
    assertThat(resolved.get("year"), is("fake1"));
    assertThat(resolved.get("month"), is("fake2"));
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) Test(org.junit.Test)

Aggregations

AnalyzedToken (org.languagetool.AnalyzedToken)89 AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)48 ArrayList (java.util.ArrayList)43 Matcher (java.util.regex.Matcher)16 Test (org.junit.Test)16 IOException (java.io.IOException)9 Pattern (java.util.regex.Pattern)7 Nullable (org.jetbrains.annotations.Nullable)6 TaggedWord (org.languagetool.tagging.TaggedWord)6 RuleMatch (org.languagetool.rules.RuleMatch)4 Synthesizer (org.languagetool.synthesis.Synthesizer)4 InputStream (java.io.InputStream)2 HashMap (java.util.HashMap)2 LinkedHashSet (java.util.LinkedHashSet)2 Scanner (java.util.Scanner)2 TreeSet (java.util.TreeSet)2 DictionaryLookup (morfologik.stemming.DictionaryLookup)2 IStemmer (morfologik.stemming.IStemmer)2 AnalyzedSentence (org.languagetool.AnalyzedSentence)2 ChunkTag (org.languagetool.chunking.ChunkTag)2