Search in sources :

Example 41 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class TokenPredicateTest method test.

/**
 * Exercises predicate matching against a single chunk-tagged token with the
 * surface form "mytoken", POS tag "MYPOS", lemma "mylemma" and the chunk tags
 * CHUNK1 and CHUNK2. Covers the bare-string form as well as the
 * {@code string=}, {@code regex=}, {@code chunk=}, {@code pos=} and
 * {@code posre=} forms, and checks that an unknown key ({@code invalid=})
 * is rejected with a {@link RuntimeException}.
 */
@Test
public void test() {
    AnalyzedToken analyzedToken = new AnalyzedToken("mytoken", "MYPOS", "mylemma");
    List<ChunkTag> tags = Arrays.asList(new ChunkTag("CHUNK1"), new ChunkTag("CHUNK2"));
    AnalyzedTokenReadings tokenReadings = new AnalyzedTokenReadings(analyzedToken, 0);
    ChunkTaggedToken token = new ChunkTaggedToken("mytoken", tags, tokenReadings);

    // bare string and explicit string= form
    assertMatch("mytoken", token);
    assertNoMatch("mytoken2", token);
    assertMatch("string=mytoken", token);
    assertNoMatch("string=mytoken2", token);

    // regular expression on the surface form
    assertMatch("regex=my[abct]oken", token);
    assertNoMatch("regex=my[abc]oken", token);

    // chunk tags
    assertMatch("chunk=CHUNK1", token);
    assertMatch("chunk=CHUNK2", token);
    assertNoMatch("chunk=OTHERCHUNK", token);

    // POS tag, literal and as regular expression
    assertMatch("pos=MYPOS", token);
    assertNoMatch("pos=OTHER", token);
    assertMatch("posre=M.POS", token);
    assertNoMatch("posre=O.HER", token);

    // an unknown key must be rejected with a RuntimeException
    boolean rejected = false;
    try {
        assertNoMatch("invalid=token", token);
    } catch (RuntimeException expected) {
        rejected = true;
    }
    if (!rejected) {
        fail();
    }
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) Test(org.junit.Test)

Example 42 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class Main method appendTagsWithDisambigLog.

/**
 * Appends one two-cell HTML table row per token of {@code sentence} to {@code sb}.
 * The first cell holds the token followed by its readings, chunk tags and an
 * immunization marker in square brackets; the second cell holds the token's
 * historical annotations. Whitespace tokens are skipped unless they are the
 * sentence start.
 *
 * <p>All text taken from the analyzed sentence (token, readings, chunk tags,
 * annotations) is HTML-escaped so that characters such as {@code <} or {@code &}
 * in the input cannot break the generated markup.
 *
 * @param sb       buffer the rows are appended to
 * @param sentence sentence whose tokens are rendered
 * @param odd      row-striping state before the first row of this sentence
 * @return the row-striping state after the last emitted row, to be passed to the next call
 */
private boolean appendTagsWithDisambigLog(StringBuilder sb, AnalyzedSentence sentence, boolean odd) {
    for (AnalyzedTokenReadings t : sentence.getTokens()) {
        if (t.isWhitespace() && !t.isSentenceStart()) {
            continue;
        }
        // alternate the background colour for every emitted row
        odd = !odd;
        String rowColor = odd ? "#ffffff" : "#f1f1f1";
        boolean visible = !t.isWhitespace();

        sb.append("<tr>");
        sb.append("<td bgcolor=\"").append(rowColor).append("\">");
        if (visible) {
            // the token is user text and must not be interpreted as markup
            sb.append(StringTools.escapeHTML(t.getToken()));
            sb.append("<font color='");
            sb.append(TAG_COLOR);
            sb.append("'>[");
        }
        Iterator<AnalyzedToken> iterator = t.iterator();
        while (iterator.hasNext()) {
            AnalyzedToken token = iterator.next();
            String posTag = token.getPOSTag();
            if (t.isSentenceStart()) {
                sb.append(StringTools.escapeHTML("<S>"));
            } else if (JLanguageTool.SENTENCE_END_TAGNAME.equals(posTag)) {
                sb.append(StringTools.escapeHTML("</S>"));
            } else if (JLanguageTool.PARAGRAPH_END_TAGNAME.equals(posTag)) {
                sb.append(StringTools.escapeHTML("<P/>"));
            } else if (visible) {
                sb.append(StringTools.escapeHTML(token.toString()));
                if (iterator.hasNext()) {
                    sb.append(", ");
                }
            }
        }
        if (visible) {
            if (!t.getChunkTags().isEmpty()) {
                sb.append(',');
                sb.append(StringTools.escapeHTML(StringUtils.join(t.getChunkTags(), "|")));
            }
            if (t.isImmunized()) {
                sb.append("{!}");
            }
            sb.append("]</font>");
        } else {
            sb.append(' ');
        }
        sb.append("</td>");

        sb.append("<td bgcolor=\"").append(rowColor).append("\">");
        if (!"".equals(t.getHistoricalAnnotations())) {
            sb.append(StringTools.escapeHTML(t.getHistoricalAnnotations()).trim().replace("\n", "<br>"));
        }
        sb.append("</td>");
        sb.append("</tr>");
    }
    return odd;
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 43 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class PatternRuleQueryBuilder method getTermQueryOrNull.

/**
 * Builds the mandatory ({@code MUST}) Lucene clause for one pattern token and
 * its term string, or returns {@code null} when no clause is produced.
 *
 * <p>No clause is produced when the term is {@code null} or empty, when the
 * token is negated or optional (minimum occurrence 0), or when the token is
 * inflected without being a regular expression and the language has no synthesizer.
 *
 * @param patternToken the pattern token the query is derived from
 * @param termStr      the term text (token or regular expression) to search for
 * @return the clause to add to the query, or {@code null}
 * @throws UnsupportedPatternRuleException propagated from the term/query helpers
 */
@Nullable
private BooleanClause getTermQueryOrNull(PatternToken patternToken, String termStr) throws UnsupportedPatternRuleException {
    if (termStr == null || termStr.isEmpty()) {
        return null;
    }
    Term plainTerm = getTermQueryTerm(patternToken, termStr);
    if (patternToken.getNegation() || patternToken.getMinOccurrence() == 0) {
        // we need to ignore this - negation, if any, must happen at the same position
        return null;
    }

    if (patternToken.isInflected()) {
        if (patternToken.isRegularExpression()) {
            Term lemmaTerm = getQueryTerm(patternToken, LEMMA_PREFIX + "(", simplifyRegex(termStr), ")");
            return new BooleanClause(getRegexQuery(lemmaTerm, termStr, patternToken), BooleanClause.Occur.MUST);
        }
        // Inflected, non-regex token: search for all synthesized forms instead of the lemma.
        // A plain lemma TermQuery (LEMMA_PREFIX + termStr) would be simpler, but leads to
        // problems with e.g. the German rules ZEITLICH_SYNCHRON and GEWISSEN_SUBST.
        Synthesizer synthesizer = language.getSynthesizer();
        if (synthesizer == null) {
            return null;
        }
        try {
            String[] forms = synthesizer.synthesize(new AnalyzedToken(termStr, null, termStr), ".*", true);
            Query formsQuery = forms.length == 0
                ? new TermQuery(plainTerm)
                : new RegexpQuery(getTermQueryTerm(patternToken, StringUtils.join(forms, "|")));
            return new BooleanClause(formsQuery, BooleanClause.Occur.MUST);
        } catch (IOException e) {
            throw new RuntimeException("Could not build Lucene query for '" + patternToken + "' and '" + termStr + "'", e);
        }
    }

    Query query = patternToken.isRegularExpression()
        ? getRegexQuery(plainTerm, termStr, patternToken)
        : new TermQuery(plainTerm);
    return new BooleanClause(query, BooleanClause.Occur.MUST);
}
Also used : SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) AnalyzedToken(org.languagetool.AnalyzedToken) SpanNearQuery(org.apache.lucene.search.spans.SpanNearQuery) SpanQuery(org.apache.lucene.search.spans.SpanQuery) Term(org.apache.lucene.index.Term) IOException(java.io.IOException) Synthesizer(org.languagetool.synthesis.Synthesizer) Nullable(org.jetbrains.annotations.Nullable)

Example 44 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class CompoundTagger method tagMatch.

/**
 * Derives readings for {@code word} by pairing every reading of the left part
 * with every reading of the right part and keeping the pairs whose POS tags
 * are compatible. Each resulting reading uses {@code word} as its token and
 * "leftLemma-rightLemma" as its lemma.
 *
 * <p>Pairs that only agree through {@code tryAnimInanim} are collected
 * separately and are used only when no other reading was produced. When the
 * result is empty and an animate/inanimate pair could not be tagged, the word
 * is reported through {@code debug_compound_unknown_write}.
 *
 * @param word                the word being tagged (presumably the hyphenated compound; confirm against the caller)
 * @param leftAnalyzedTokens  readings of the left part
 * @param rightAnalyzedTokens readings of the right part
 * @return the de-duplicated readings in insertion order, or {@code null} if there are none
 */
@Nullable
private List<AnalyzedToken> tagMatch(String word, List<AnalyzedToken> leftAnalyzedTokens, List<AnalyzedToken> rightAnalyzedTokens) {
    List<AnalyzedToken> newAnalyzedTokens = new ArrayList<>();
    // fallback readings produced by tryAnimInanim(); used only if newAnalyzedTokens stays empty
    List<AnalyzedToken> newAnalyzedTokensAnimInanim = new ArrayList<>();
    // set when a noun pair with differing animacy could not be agreed; used for the debug output at the end
    String animInanimNotTagged = null;
    for (AnalyzedToken leftAnalyzedToken : leftAnalyzedTokens) {
        String leftPosTag = leftAnalyzedToken.getPOSTag();
        // skip untagged readings and abbreviations
        if (leftPosTag == null || IPOSTag.contains(leftPosTag, IPOSTag.abbr.getText()))
            continue;
        // we don't want to mess with v_kly, e.g. no v_kly in "рибо-полювання"
        if (leftPosTag.startsWith("noun") && leftPosTag.contains("v_kly"))
            continue;
        String leftPosTagExtra = "";
        boolean leftNv = false;
        // remember the NV_TAG marker and strip it so that the tags can be compared
        if (leftPosTag.contains(NV_TAG)) {
            leftNv = true;
            leftPosTag = leftPosTag.replace(NV_TAG, "");
        }
        // tags matched by EXTRA_TAGS_DROP are removed and not restored
        Matcher matcher = EXTRA_TAGS_DROP.matcher(leftPosTag);
        if (matcher.find()) {
            leftPosTag = matcher.replaceAll("");
        }
        // the first EXTRA_TAGS match is removed for comparison and re-appended to every generated tag
        matcher = EXTRA_TAGS.matcher(leftPosTag);
        if (matcher.find()) {
            leftPosTagExtra += matcher.group();
            leftPosTag = matcher.replaceAll("");
        }
        for (AnalyzedToken rightAnalyzedToken : rightAnalyzedTokens) {
            String rightPosTag = rightAnalyzedToken.getPOSTag();
            if (rightPosTag == null || IPOSTag.contains(rightPosTag, IPOSTag.abbr.getText()))
                continue;
            String extraNvTag = "";
            boolean rightNv = false;
            // NV_TAG is re-added to the result only when both parts carry it.
            // NOTE(review): unlike for the left tag, NV_TAG is not stripped from rightPosTag here - confirm this is intended
            if (rightPosTag.contains(NV_TAG)) {
                rightNv = true;
                if (leftNv) {
                    extraNvTag += NV_TAG;
                }
            }
            // extra tags of the right part are dropped entirely
            Matcher matcherR = EXTRA_TAGS_DROP.matcher(rightPosTag);
            if (matcherR.find()) {
                rightPosTag = matcherR.replaceAll("");
            }
            matcherR = EXTRA_TAGS.matcher(rightPosTag);
            if (matcherR.find()) {
                rightPosTag = matcherR.replaceAll("");
            }
            // identical tags: accepted for numr/adv/adj/verb, and for intj only when both lemmas are equal (ignoring case)
            if (leftPosTag.equals(rightPosTag) && (IPOSTag.startsWith(leftPosTag, IPOSTag.numr, IPOSTag.adv, IPOSTag.adj, IPOSTag.verb) || (IPOSTag.startsWith(leftPosTag, IPOSTag.intj) && leftAnalyzedToken.getLemma().equalsIgnoreCase(rightAnalyzedToken.getLemma())))) {
                newAnalyzedTokens.add(new AnalyzedToken(word, leftPosTag + extraNvTag + leftPosTagExtra, leftAnalyzedToken.getLemma() + "-" + rightAnalyzedToken.getLemma()));
            } else // noun-noun
            if (leftPosTag.startsWith(IPOSTag.noun.getText()) && rightPosTag.startsWith(IPOSTag.noun.getText())) {
                // discard a repeated word such as "чорний-чorний" as noun:anim
                if (leftAnalyzedToken.getToken().equalsIgnoreCase(rightAnalyzedToken.getToken()) && leftPosTag.contains(TAG_ANIM) && rightPosTag.contains(TAG_ANIM))
                    continue;
                String agreedPosTag = getAgreedPosTag(leftPosTag, rightPosTag, leftNv, word);
                // no agreement, but the right part is a masculine inanimate nominative noun accepted by isMinMax(): keep the left tag
                if (agreedPosTag == null && rightPosTag.startsWith("noun:inanim:m:v_naz") && isMinMax(rightAnalyzedToken.getToken())) {
                    agreedPosTag = leftPosTag;
                }
                // still no agreement and the animacy differs: try the anim/inanim combination
                if (agreedPosTag == null && !isSameAnimStatus(leftPosTag, rightPosTag)) {
                    agreedPosTag = tryAnimInanim(leftPosTag, rightPosTag, leftAnalyzedToken.getLemma(), rightAnalyzedToken.getLemma(), leftNv, rightNv, word);
                    if (agreedPosTag == null) {
                        animInanimNotTagged = leftPosTag.contains(":anim") ? "anim-inanim" : "inanim-anim";
                    } else {
                        // kept in the fallback list only
                        newAnalyzedTokensAnimInanim.add(new AnalyzedToken(word, agreedPosTag + extraNvTag + leftPosTagExtra, leftAnalyzedToken.getLemma() + "-" + rightAnalyzedToken.getLemma()));
                        continue;
                    }
                }
                if (agreedPosTag != null) {
                    newAnalyzedTokens.add(new AnalyzedToken(word, agreedPosTag + extraNvTag + leftPosTagExtra, leftAnalyzedToken.getLemma() + "-" + rightAnalyzedToken.getLemma()));
                }
            } else // numr-numr, e.g. "один-два" (one-two)
            if (leftPosTag.startsWith(IPOSTag.numr.getText()) && rightPosTag.startsWith(IPOSTag.numr.getText())) {
                String agreedPosTag = getNumAgreedPosTag(leftPosTag, rightPosTag, leftNv);
                if (agreedPosTag != null) {
                    newAnalyzedTokens.add(new AnalyzedToken(word, agreedPosTag + extraNvTag + leftPosTagExtra, leftAnalyzedToken.getLemma() + "-" + rightAnalyzedToken.getLemma()));
                }
            } else // noun-numr match
            if (IPOSTag.startsWith(leftPosTag, IPOSTag.noun) && IPOSTag.startsWith(rightPosTag, IPOSTag.numr)) {
                // gender tags match
                String leftGenderConj = PosTagHelper.getGenderConj(leftPosTag);
                if (leftGenderConj != null && leftGenderConj.equals(PosTagHelper.getGenderConj(rightPosTag))) {
                    newAnalyzedTokens.add(new AnalyzedToken(word, leftPosTag + extraNvTag + leftPosTagExtra, leftAnalyzedToken.getLemma() + "-" + rightAnalyzedToken.getLemma()));
                    // "година-півтори" (an hour or an hour and a half) can be singular as well as plural:
                    // "минула година-півтори", "минулі година-півтори" - so a plural reading is added too
                    if (!leftPosTag.contains(":p:")) {
                        newAnalyzedTokens.add(new AnalyzedToken(word, leftPosTag.replaceAll(":[mfn]:", ":p:") + extraNvTag + leftPosTagExtra, leftAnalyzedToken.getLemma() + "-" + rightAnalyzedToken.getLemma()));
                    }
                } else {
                    // (with different gender tags): "сотні" (:p:) - "дві" (:f:)
                    String agreedPosTag = getNumAgreedPosTag(leftPosTag, rightPosTag, leftNv);
                    if (agreedPosTag != null) {
                        newAnalyzedTokens.add(new AnalyzedToken(word, agreedPosTag + extraNvTag + leftPosTagExtra, leftAnalyzedToken.getLemma() + "-" + rightAnalyzedToken.getLemma()));
                        // "рік-два" (a year or two) can be singular as well as plural:
                        // "минулий рік-два", "минулі рік-два" - so a plural reading is added too
                        if (!agreedPosTag.contains(":p:")) {
                            newAnalyzedTokens.add(new AnalyzedToken(word, agreedPosTag.replaceAll(":[mfn]:", ":p:") + extraNvTag + leftPosTagExtra, leftAnalyzedToken.getLemma() + "-" + rightAnalyzedToken.getLemma()));
                        }
                    }
                }
            } else // we do not handle cases like "братів-православних": generic noun-adj matching gives too many false positives
            // NOTE(review): && binds tighter than ||, so this condition reads as
            // (noun && numr) || (adj && isJuniorSenior). The noun-numr pair is already consumed by the
            // previous branch, and the adj alternative does not require the left part to be a noun -
            // confirm the intended grouping.
            if (leftPosTag.startsWith(IPOSTag.noun.getText()) && IPOSTag.startsWith(rightPosTag, IPOSTag.numr) || (IPOSTag.startsWith(rightPosTag, IPOSTag.adj) && isJuniorSenior(leftAnalyzedToken, rightAnalyzedToken))) {
                // (disabled code kept from the original)
                //          if( ! leftPosTag.contains(":prop")
                //              || isJuniorSenior(leftAnalyzedToken, rightAnalyzedToken) ) {
                // discard a repeated word such as "чорний-чорний" as noun:anim
                //          	if( leftAnalyzedToken.getToken().equalsIgnoreCase(rightAnalyzedToken.getToken()) )
                //          		continue;
                // accepted only when gender/case of both parts match
                String leftGenderConj = PosTagHelper.getGenderConj(leftPosTag);
                if (leftGenderConj != null && leftGenderConj.equals(PosTagHelper.getGenderConj(rightPosTag))) {
                    newAnalyzedTokens.add(new AnalyzedToken(word, leftPosTag + extraNvTag + leftPosTagExtra, leftAnalyzedToken.getLemma() + "-" + rightAnalyzedToken.getLemma()));
                }
            //        }
            }
        }
    }
    // remove duplicates
    newAnalyzedTokens = new ArrayList<>(new LinkedHashSet<>(newAnalyzedTokens));
    // fall back to the anim/inanim readings only when nothing else was found
    if (newAnalyzedTokens.isEmpty()) {
        newAnalyzedTokens = newAnalyzedTokensAnimInanim;
    }
    // report compounds whose anim/inanim combination could not be tagged at all
    if (animInanimNotTagged != null && newAnalyzedTokens.isEmpty()) {
        debug_compound_unknown_write(word + " " + animInanimNotTagged);
    }
    return newAnalyzedTokens.isEmpty() ? null : newAnalyzedTokens;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) AnalyzedToken(org.languagetool.AnalyzedToken) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) Nullable(org.jetbrains.annotations.Nullable)

Example 45 with AnalyzedToken

use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.

the class PosTagHelper method getGenders.

/**
 * Collects the genders of all readings whose POS tag fully matches
 * {@code posTagRegex}. Each distinct value returned by {@code getGender}
 * is appended once, in the order the readings are iterated.
 *
 * <p>Readings without a POS tag are skipped instead of causing a
 * {@link NullPointerException} in the regex matcher.
 *
 * @param tokenReadings the readings to inspect
 * @param posTagRegex   regular expression a POS tag must match in full
 * @return the concatenated genders, or an empty string if no reading qualifies
 */
public static String getGenders(AnalyzedTokenReadings tokenReadings, String posTagRegex) {
    Pattern posTagPattern = Pattern.compile(posTagRegex);
    StringBuilder sb = new StringBuilder(4);
    for (AnalyzedToken tokenReading : tokenReadings) {
        String posTag = tokenReading.getPOSTag();
        // untagged readings have a null POS tag; Pattern.matcher(null) would throw
        if (posTag == null || !posTagPattern.matcher(posTag).matches()) {
            continue;
        }
        String gender = getGender(posTag);
        // defensive: getGender is not visible here; skip a null result rather than fail in indexOf
        if (gender != null && sb.indexOf(gender) == -1) {
            sb.append(gender);
        }
    }
    return sb.toString();
}
Also used : Pattern(java.util.regex.Pattern) AnalyzedToken(org.languagetool.AnalyzedToken)

Aggregations

AnalyzedToken (org.languagetool.AnalyzedToken)89 AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)48 ArrayList (java.util.ArrayList)43 Matcher (java.util.regex.Matcher)16 Test (org.junit.Test)16 IOException (java.io.IOException)9 Pattern (java.util.regex.Pattern)7 Nullable (org.jetbrains.annotations.Nullable)6 TaggedWord (org.languagetool.tagging.TaggedWord)6 RuleMatch (org.languagetool.rules.RuleMatch)4 Synthesizer (org.languagetool.synthesis.Synthesizer)4 InputStream (java.io.InputStream)2 HashMap (java.util.HashMap)2 LinkedHashSet (java.util.LinkedHashSet)2 Scanner (java.util.Scanner)2 TreeSet (java.util.TreeSet)2 DictionaryLookup (morfologik.stemming.DictionaryLookup)2 IStemmer (morfologik.stemming.IStemmer)2 AnalyzedSentence (org.languagetool.AnalyzedSentence)2 ChunkTag (org.languagetool.chunking.ChunkTag)2