Search in sources :

Example 61 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class MatchState method filterReadings.

/**
 * Produces the readings obtained by applying this match's token regex, case
 * conversion and POS-tag settings to the current formatted token.
 *
 * <p>If the match uses a static lemma, the matched token is first reduced via
 * {@code leaveReading} and becomes the formatted token (both fields are mutated).
 *
 * @return the formatted token itself when it is {@code null}, when the match has
 *         no POS tag, or when no reading was produced; otherwise a new
 *         {@code AnalyzedTokenReadings} that carries over start position,
 *         whitespace flag, chunk tags and immunization from the formatted token
 */
public final AnalyzedTokenReadings filterReadings() {
    if (formattedToken == null) {
        return formattedToken;
    }
    if (match.isStaticLemma()) {
        final AnalyzedToken staticReading =
                new AnalyzedToken(matchedToken.getToken(), match.getPosTag(), formattedToken.getToken());
        matchedToken.leaveReading(staticReading);
        formattedToken = matchedToken;
    }

    // Derive the surface form: optional regex replacement, then case conversion.
    String surface = formattedToken.getToken();
    final Pattern tokenPattern = match.getRegexMatch();
    final String tokenReplacement = match.getRegexReplace();
    if (tokenPattern != null && tokenReplacement != null) {
        // only replace if there is something to replace
        surface = tokenPattern.matcher(surface).replaceAll(tokenReplacement);
    }
    surface = convertCase(surface, surface, null);

    final String requestedTag = match.getPosTag();
    if (requestedTag == null) {
        // Without a POS tag no new readings are built.
        return formattedToken;
    }

    final List<AnalyzedToken> readings = new ArrayList<>();
    final int readingCount = formattedToken.getReadingsLength();
    if (match.isPostagRegexp()) {
        final Pattern tagPattern = match.getPosRegexMatch();
        final String tagReplacement = match.getPosTagReplace();
        for (int idx = 0; idx < readingCount; idx++) {
            final AnalyzedToken source = formattedToken.getAnalyzedToken(idx);
            final String sourceTag = source.getPOSTag();
            if (sourceTag == null || !tagPattern.matcher(sourceTag).matches()) {
                continue;
            }
            final String newTag = tagReplacement == null
                    ? sourceTag
                    : tagPattern.matcher(sourceTag).replaceAll(tagReplacement);
            final AnalyzedToken rewritten = new AnalyzedToken(surface, newTag, source.getLemma());
            rewritten.setWhitespaceBefore(formattedToken.isWhitespaceBefore());
            readings.add(rewritten);
        }
    }
    // Nothing can be in the list unless the regexp loop added it, so this covers
    // both "no regexp requested" and "regexp matched no reading".
    if (readings.isEmpty()) {
        readings.addAll(getNewToken(readingCount, surface));
    }

    // Re-attach sentence/paragraph end markers using the first reading's lemma.
    final String firstLemma = formattedToken.getAnalyzedToken(0).getLemma();
    if (formattedToken.isSentenceEnd()) {
        readings.add(new AnalyzedToken(formattedToken.getToken(), SENTENCE_END_TAGNAME, firstLemma));
    }
    if (formattedToken.isParagraphEnd()) {
        readings.add(new AnalyzedToken(formattedToken.getToken(), PARAGRAPH_END_TAGNAME, firstLemma));
    }

    if (readings.isEmpty()) {
        return formattedToken;
    }
    final AnalyzedToken[] readingArray = readings.toArray(new AnalyzedToken[readings.size()]);
    final AnalyzedTokenReadings result = new AnalyzedTokenReadings(readingArray, formattedToken.getStartPos());
    result.setWhitespaceBefore(formattedToken.isWhitespaceBefore());
    if (!formattedToken.getChunkTags().isEmpty()) {
        result.setChunkTags(formattedToken.getChunkTags());
    }
    if (formattedToken.isImmunized()) {
        result.immunize();
    }
    return result;
}
Also used : Pattern(java.util.regex.Pattern) AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 62 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class AdvancedWordRepeatRule method match.

/**
 * Tests if any word form is repeated in the sentence.
 *
 * <p>Each non-whitespace token after index 0 is checked; tokens shorter than two
 * characters, tokens with an empty POS tag, an excluded lemma, an excluded POS
 * tag, or matching the excluded non-word pattern are skipped. Remaining tokens
 * are compared by lemma when every reading has one, otherwise by surface form.
 *
 * @param sentence the analyzed sentence to inspect
 * @return one rule match per token found to repeat an earlier form
 */
@Override
public final RuleMatch[] match(AnalyzedSentence sentence) {
    final List<RuleMatch> matches = new ArrayList<>();
    final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    final Set<String> seenForms = new TreeSet<>();
    // index of the token that most recently contributed a lemma to seenForms
    int lastAddedIdx = 0;
    // start from real token, 0 = SENT_START
    for (int idx = 1; idx < tokens.length; idx++) {
        final AnalyzedTokenReadings readings = tokens[idx];
        final String surface = readings.getToken();
        // avoid "..." etc. to be matched:
        boolean candidate = surface.length() >= 2;
        boolean lemmatized = true;
        for (AnalyzedToken reading : readings) {
            final String tag = reading.getPOSTag();
            if (tag == null) {
                lemmatized = false;
                continue;
            }
            if (StringTools.isEmpty(tag)) {
                candidate = false;
                break;
            }
            final String lemma = reading.getLemma();
            if (lemma == null) {
                lemmatized = false;
                break;
            }
            if (getExcludedWordsPattern().contains(lemma) || getExcludedPos().matcher(tag).matches()) {
                candidate = false;
                break;
            }
        }
        final Matcher nonWordMatcher = getExcludedNonWordsPattern().matcher(surface);
        candidate = candidate && !nonWordMatcher.matches();

        boolean repeated = false;
        if (candidate) {
            String previousLemma = "";
            // becomes (and stays) true once a sentence-end reading has been seen
            boolean sentenceEndSeen = false;
            for (AnalyzedToken reading : readings) {
                if (JLanguageTool.SENTENCE_END_TAGNAME.equals(reading.getPOSTag())) {
                    sentenceEndSeen = true;
                }
                if (lemmatized) {
                    final String lemma = reading.getLemma();
                    if (!previousLemma.equals(lemma) && !sentenceEndSeen) {
                        if (seenForms.contains(lemma) && lastAddedIdx != idx) {
                            repeated = true;
                        } else {
                            seenForms.add(lemma);
                            lastAddedIdx = idx;
                        }
                    }
                    previousLemma = lemma;
                } else if (seenForms.contains(surface) && !sentenceEndSeen) {
                    repeated = true;
                } else {
                    seenForms.add(surface);
                }
            }
        }
        if (repeated) {
            final int start = readings.getStartPos();
            matches.add(new RuleMatch(this, start, start + surface.length(), getMessage(), getShortMessage()));
        }
    }
    return toRuleMatchArray(matches);
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) Matcher(java.util.regex.Matcher) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 63 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class DemoRule method match.

/**
 * Demo error-detection logic: prints every token with the lemma and POS tag of
 * each of its readings, and reports a match for each token whose text is
 * exactly "demo".
 *
 * @param sentence the analyzed sentence to check
 * @return the matches found, one per "demo" token
 * @throws IOException declared by the signature; nothing in this body throws it directly
 */
@Override
public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
    final List<RuleMatch> found = new ArrayList<>();
    // All tokens (i.e. words) of the sentence, without the whitespace tokens.
    final AnalyzedTokenReadings[] words = sentence.getTokensWithoutWhitespace();
    for (int idx = 0; idx < words.length; idx++) {
        final AnalyzedTokenReadings word = words[idx];
        // the original word from the input text
        final String text = word.getToken();
        System.out.println("Token: " + text);
        // A token can carry several readings, so print each of them.
        for (AnalyzedToken reading : word.getReadings()) {
            System.out.println("  Lemma: " + reading.getLemma());
            System.out.println("  POS: " + reading.getPOSTag());
        }
        // Flag the token and attach the message that is shown to the user.
        if (text.equals("demo")) {
            final RuleMatch hit = new RuleMatch(this, word.getStartPos(), word.getEndPos(),
                    "The demo rule thinks this looks wrong");
            // the user will see this as a suggested correction
            hit.setSuggestedReplacement("blablah");
            found.add(hit);
        }
    }
    return toRuleMatchArray(found);
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) ArrayList(java.util.ArrayList) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 64 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class PatternTokenTest method testUnknownTag.

/**
 * Checks how {@code PatternToken.isMatched} treats POS patterns built from
 * {@code UNKNOWN_TAG} against tokens whose POS tag is null, the sentence-end
 * tag, the paragraph-end tag, or an ordinary tag, both before and after
 * {@code setNoPOSTag(false)} is called on the token.
 *
 * NOTE(review): judging from the expectations below, the two boolean arguments
 * of {@code PosToken} look like (regexp, negation) - confirm against
 * {@code PatternToken.PosToken}.
 */
@Test
public void testUnknownTag() {
    // Pattern tokens 1-4 have an empty token string and differ only in their PosToken flags.
    PatternToken patternToken = new PatternToken("", false, false, false);
    patternToken.setPosToken(new PatternToken.PosToken(UNKNOWN_TAG, false, false));
    PatternToken patternToken2 = new PatternToken("", false, false, false);
    patternToken2.setPosToken(new PatternToken.PosToken(UNKNOWN_TAG, false, true));
    PatternToken patternToken3 = new PatternToken("", false, false, false);
    patternToken3.setPosToken(new PatternToken.PosToken(UNKNOWN_TAG + "|VBG", true, false));
    PatternToken patternToken4 = new PatternToken("", false, false, false);
    patternToken4.setPosToken(new PatternToken.PosToken(UNKNOWN_TAG + "|VBG", true, true));
    // Pattern token 5 additionally carries the token string "\\p{Ll}+".
    PatternToken patternToken5 = new PatternToken("\\p{Ll}+", false, true, false);
    patternToken5.setPosToken(new PatternToken.PosToken(UNKNOWN_TAG, false, false));
    // Case 1: token with a null POS tag and null lemma.
    AnalyzedToken an = new AnalyzedToken("schword", null, null);
    assertTrue(patternToken.isMatched(an));
    assertFalse(patternToken2.isMatched(an));
    assertTrue(patternToken3.isMatched(an));
    assertFalse(patternToken4.isMatched(an));
    assertTrue(patternToken5.isMatched(an));
    // Same token after setNoPOSTag(false), i.e. if the AnalyzedToken is in the
    // set of readings that have non-null tags: every expectation flips.
    an.setNoPOSTag(false);
    assertFalse(patternToken.isMatched(an));
    assertTrue(patternToken2.isMatched(an));
    assertFalse(patternToken3.isMatched(an));
    assertTrue(patternToken4.isMatched(an));
    assertFalse(patternToken5.isMatched(an));
    // Case 2: token tagged only with the sentence-end tag gives the same results as the null-tag token.
    AnalyzedToken anSentEnd = new AnalyzedToken("schword", SENTENCE_END_TAGNAME, null);
    assertTrue(patternToken.isMatched(anSentEnd));
    assertFalse(patternToken2.isMatched(anSentEnd));
    assertTrue(patternToken3.isMatched(anSentEnd));
    assertFalse(patternToken4.isMatched(anSentEnd));
    assertTrue(patternToken5.isMatched(anSentEnd));
    // Patterns that name the sentence-end tag explicitly (plain, then as one alternative) also match it.
    PatternToken patternToken6 = new PatternToken("\\p{Ll}+", false, true, false);
    patternToken6.setPosToken(new PatternToken.PosToken(SENTENCE_END_TAGNAME, false, false));
    assertTrue(patternToken6.isMatched(anSentEnd));
    PatternToken patternToken7 = new PatternToken("\\p{Ll}+", false, true, false);
    patternToken7.setPosToken(new PatternToken.PosToken(SENTENCE_END_TAGNAME + "|BLABLA", true, false));
    assertTrue(patternToken7.isMatched(anSentEnd));
    // Sentence-end token after setNoPOSTag(false), i.e. if the AnalyzedToken is
    // in the set of readings that have non-null tags: expectations flip.
    anSentEnd.setNoPOSTag(false);
    assertFalse(patternToken.isMatched(anSentEnd));
    assertTrue(patternToken2.isMatched(anSentEnd));
    assertFalse(patternToken3.isMatched(anSentEnd));
    assertTrue(patternToken4.isMatched(anSentEnd));
    assertFalse(patternToken5.isMatched(anSentEnd));
    // Case 3: token tagged only with the paragraph-end tag gives the same results as the sentence-end token.
    AnalyzedToken anParaEnd = new AnalyzedToken("schword", PARAGRAPH_END_TAGNAME, null);
    assertTrue(patternToken.isMatched(anParaEnd));
    assertFalse(patternToken2.isMatched(anParaEnd));
    assertTrue(patternToken3.isMatched(anParaEnd));
    assertFalse(patternToken4.isMatched(anParaEnd));
    assertTrue(patternToken5.isMatched(anParaEnd));
    // Paragraph-end token after setNoPOSTag(false), i.e. if the AnalyzedToken is
    // in the set of readings that have non-null tags: expectations flip.
    anParaEnd.setNoPOSTag(false);
    assertFalse(patternToken.isMatched(anParaEnd));
    assertTrue(patternToken2.isMatched(anParaEnd));
    assertFalse(patternToken3.isMatched(anParaEnd));
    assertTrue(patternToken4.isMatched(anParaEnd));
    assertFalse(patternToken5.isMatched(anParaEnd));
    // Case 4: token with an ordinary POS tag gives the flipped results without any setNoPOSTag call.
    AnalyzedToken anWithPOS = new AnalyzedToken("schword", "POS", null);
    assertFalse(patternToken.isMatched(anWithPOS));
    assertTrue(patternToken2.isMatched(anWithPOS));
    assertFalse(patternToken3.isMatched(anWithPOS));
    assertTrue(patternToken4.isMatched(anWithPOS));
    assertFalse(patternToken5.isMatched(anWithPOS));
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) Test(org.junit.Test)

Example 65 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class GermanReflexiveVerbGuesser method run.

/**
 * Reads lemmas (one per line) from {@code lemmaListFile} and, for each, prints a
 * line with the average percentage, the lemma count, both individual
 * percentages and the lemma, based on counts from the language model in
 * {@code indexTopDir}.
 *
 * <p>The first percentage uses {@code count1} minus the "für" and "vor" counter
 * examples; the second uses {@code count2}. Both are relative to
 * {@code lm.getCount(lemma)}.
 *
 * @param indexTopDir directory passed to the {@code LuceneLanguageModel} constructor
 * @param lemmaListFile file whose lines are the lemmas to evaluate
 * @throws IOException if reading the lemma file fails (may also originate from the model)
 */
private void run(File indexTopDir, File lemmaListFile) throws IOException {
    final List<String> lemmaLines = Files.readAllLines(lemmaListFile.toPath());
    System.out.println("Durchschnitt Prozent | Anzahl Lemma | mich/uns/euch ... | ... mich/uns/euch | Lemma");
    try (LuceneLanguageModel lm = new LuceneLanguageModel(indexTopDir)) {
        for (String lemma : lemmaLines) {
            // Synthesize first and third person singular forms from the infinitive.
            final String[] firstPersonForms =
                    synthesizer.synthesize(new AnalyzedToken(lemma, "VER:INF:NON", lemma), "VER:1:SIN:PRÄ.*", true);
            final String[] thirdPersonForms =
                    synthesizer.synthesize(new AnalyzedToken(lemma, "VER:INF:NON", lemma), "VER:3:SIN:PRÄ.*", true);
            String firstPsSin = null;
            if (firstPersonForms.length > 0) {
                firstPsSin = firstPersonForms[0];
            }
            String thirdPsSin = null;
            if (thirdPersonForms.length > 0) {
                thirdPsSin = thirdPersonForms[0];
            }

            // Pattern 1, with the "für"/"vor" counter examples subtracted.
            final long rawCount1 = count1(lm, lemma, firstPsSin, thirdPsSin);
            final long fuerCount = counterExamples("für", lm, lemma, firstPsSin, thirdPsSin);
            final long vorCount = counterExamples("vor", lm, lemma, firstPsSin, thirdPsSin);
            final long reflexiveCount1 = rawCount1 - fuerCount - vorCount;
            // Pattern 2.
            final long reflexiveCount2 = count2(lm, lemma, firstPsSin, thirdPsSin);

            final long lemmaCount = lm.getCount(lemma);
            final float percent1 = ((float) reflexiveCount1 / lemmaCount) * 100.0f;
            final float percent2 = ((float) reflexiveCount2 / lemmaCount) * 100.0f;
            final float avgPercent = (percent1 + percent2) / 2;
            System.out.printf("%.2f %d %.2f%% %.2f%% %s\n", avgPercent, lemmaCount, percent1, percent2, lemma);
        }
    }
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) LuceneLanguageModel(org.languagetool.languagemodel.LuceneLanguageModel)

Aggregations

AnalyzedToken (org.languagetool.AnalyzedToken)89 AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)48 ArrayList (java.util.ArrayList)43 Matcher (java.util.regex.Matcher)16 Test (org.junit.Test)16 IOException (java.io.IOException)9 Pattern (java.util.regex.Pattern)7 Nullable (org.jetbrains.annotations.Nullable)6 TaggedWord (org.languagetool.tagging.TaggedWord)6 RuleMatch (org.languagetool.rules.RuleMatch)4 Synthesizer (org.languagetool.synthesis.Synthesizer)4 InputStream (java.io.InputStream)2 HashMap (java.util.HashMap)2 LinkedHashSet (java.util.LinkedHashSet)2 Scanner (java.util.Scanner)2 TreeSet (java.util.TreeSet)2 DictionaryLookup (morfologik.stemming.DictionaryLookup)2 IStemmer (morfologik.stemming.IStemmer)2 AnalyzedSentence (org.languagetool.AnalyzedSentence)2 ChunkTag (org.languagetool.chunking.ChunkTag)2