use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class TokenPredicateTest method test.
/**
 * Checks that string, regex, chunk, POS and POS-regex predicates are evaluated correctly
 * against a single chunk-tagged token, and that an unknown predicate key is rejected.
 */
@Test
public void test() {
  // fixture: token "mytoken" with POS tag MYPOS, lemma "mylemma" and two chunk tags
  AnalyzedToken analyzedToken = new AnalyzedToken("mytoken", "MYPOS", "mylemma");
  AnalyzedTokenReadings tokenReadings = new AnalyzedTokenReadings(analyzedToken, 0);
  ChunkTag firstChunk = new ChunkTag("CHUNK1");
  ChunkTag secondChunk = new ChunkTag("CHUNK2");
  List<ChunkTag> tags = Arrays.asList(firstChunk, secondChunk);
  ChunkTaggedToken token = new ChunkTaggedToken("mytoken", tags, tokenReadings);

  // plain token text, with and without the explicit "string=" key
  assertMatch("mytoken", token);
  assertNoMatch("mytoken2", token);
  assertMatch("string=mytoken", token);
  assertNoMatch("string=mytoken2", token);

  // regular expression on the token text
  assertMatch("regex=my[abct]oken", token);
  assertNoMatch("regex=my[abc]oken", token);

  // chunk tags: either of the two attached tags matches
  assertMatch("chunk=CHUNK1", token);
  assertMatch("chunk=CHUNK2", token);
  assertNoMatch("chunk=OTHERCHUNK", token);

  // POS tag, literal and as regular expression
  assertMatch("pos=MYPOS", token);
  assertNoMatch("pos=OTHER", token);
  assertMatch("posre=M.POS", token);
  assertNoMatch("posre=O.HER", token);

  // an unknown key must be rejected with a RuntimeException
  boolean rejected = false;
  try {
    assertNoMatch("invalid=token", token);
  } catch (RuntimeException expected) {
    rejected = true;
  }
  if (!rejected) {
    fail();
  }
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class Main method appendTagsWithDisambigLog.
/**
 * Appends one HTML table row per token of {@code sentence} to {@code sb}.
 * <p>
 * The first cell of a row holds the token text followed, in {@code TAG_COLOR}, by its readings,
 * its chunk tags and the {@code {!}} marker for immunized tokens. The second cell holds the
 * token's historical annotations with newlines rendered as {@code <br>}. Whitespace tokens are
 * skipped unless they are the sentence start. The background colour alternates per emitted row.
 * <p>
 * Token text and readings come from the analyzed text, so they are HTML-escaped before being
 * embedded in the markup (a token such as {@code <} or {@code &} would otherwise corrupt it).
 *
 * @param sb buffer the rows are appended to
 * @param sentence sentence whose tokens are rendered
 * @param odd row-colour state left by the previously emitted row
 * @return the row-colour state after the last emitted row, so the caller can continue the
 *         alternation with the next sentence
 */
private boolean appendTagsWithDisambigLog(StringBuilder sb, AnalyzedSentence sentence, boolean odd) {
  for (AnalyzedTokenReadings t : sentence.getTokens()) {
    boolean whitespace = t.isWhitespace();
    if (whitespace && !t.isSentenceStart()) {
      continue;
    }
    odd = !odd;
    // both cells of a row share the same background colour
    String bgColor = odd ? "#ffffff" : "#f1f1f1";
    sb.append("<tr>");
    sb.append("<td bgcolor=\"").append(bgColor).append("\">");
    if (!whitespace) {
      // escape: the token is raw user text
      sb.append(StringTools.escapeHTML(t.getToken()));
      sb.append("<font color='");
      sb.append(TAG_COLOR);
      sb.append("'>[");
    }
    Iterator<AnalyzedToken> iterator = t.iterator();
    while (iterator.hasNext()) {
      AnalyzedToken token = iterator.next();
      String posTag = token.getPOSTag();
      if (t.isSentenceStart()) {
        sb.append(StringTools.escapeHTML("<S>"));
      } else if (JLanguageTool.SENTENCE_END_TAGNAME.equals(posTag)) {
        sb.append(StringTools.escapeHTML("</S>"));
      } else if (JLanguageTool.PARAGRAPH_END_TAGNAME.equals(posTag)) {
        sb.append(StringTools.escapeHTML("<P/>"));
      } else if (!whitespace) {
        // escape: the reading's string form is derived from the analyzed text
        sb.append(StringTools.escapeHTML(String.valueOf(token)));
        if (iterator.hasNext()) {
          sb.append(", ");
        }
      }
    }
    if (!whitespace) {
      if (!t.getChunkTags().isEmpty()) {
        sb.append(',');
        sb.append(StringUtils.join(t.getChunkTags(), "|"));
      }
      if (t.isImmunized()) {
        sb.append("{!}");
      }
      sb.append("]</font>");
    } else {
      sb.append(' ');
    }
    sb.append("</td>");
    sb.append("<td bgcolor=\"").append(bgColor).append("\">");
    if (!"".equals(t.getHistoricalAnnotations())) {
      sb.append(StringTools.escapeHTML(t.getHistoricalAnnotations()).trim().replace("\n", "<br>"));
    }
    sb.append("</td>");
    sb.append("</tr>");
  }
  return odd;
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class PatternRuleQueryBuilder method getTermQueryOrNull.
/**
 * Builds the mandatory ({@code MUST}) Lucene clause for one pattern token and term string.
 *
 * @param patternToken pattern token the clause is derived from
 * @param termStr term text; {@code null} or empty yields no clause
 * @return the clause, or {@code null} when the term is empty, the token is negated or optional
 *         (minimum occurrence 0), or the token is inflected but the language has no synthesizer
 * @throws UnsupportedPatternRuleException propagated from the term/query helpers
 */
@Nullable
private BooleanClause getTermQueryOrNull(PatternToken patternToken, String termStr) throws UnsupportedPatternRuleException {
  if (termStr == null || termStr.isEmpty()) {
    return null;
  }
  // built before the negation check, as in the original statement order
  Term plainTerm = getTermQueryTerm(patternToken, termStr);
  if (patternToken.getNegation() || patternToken.getMinOccurrence() == 0) {
    // we need to ignore this - negation, if any, must happen at the same position
    return null;
  }

  boolean inflected = patternToken.isInflected();
  boolean regex = patternToken.isRegularExpression();

  if (inflected && regex) {
    Term lemmaTerm = getQueryTerm(patternToken, LEMMA_PREFIX + "(", simplifyRegex(termStr), ")");
    return new BooleanClause(getRegexQuery(lemmaTerm, termStr, patternToken), BooleanClause.Occur.MUST);
  }

  if (inflected) {
    // A plain TermQuery on LEMMA_PREFIX + termStr would be simpler, but it leads to problems
    // with e.g. the German rules ZEITLICH_SYNCHRON and GEWISSEN_SUBST, so the inflected forms
    // are synthesized and matched as alternatives instead.
    Synthesizer synthesizer = language.getSynthesizer();
    if (synthesizer == null) {
      return null;
    }
    try {
      String[] forms = synthesizer.synthesize(new AnalyzedToken(termStr, null, termStr), ".*", true);
      Query formsQuery = forms.length == 0
          ? new TermQuery(plainTerm)
          : new RegexpQuery(getTermQueryTerm(patternToken, StringUtils.join(forms, "|")));
      return new BooleanClause(formsQuery, BooleanClause.Occur.MUST);
    } catch (IOException e) {
      throw new RuntimeException("Could not build Lucene query for '" + patternToken + "' and '" + termStr + "'", e);
    }
  }

  Query termQuery = regex
      ? getRegexQuery(plainTerm, termStr, patternToken)
      : new TermQuery(plainTerm);
  return new BooleanClause(termQuery, BooleanClause.Occur.MUST);
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class CompoundTagger method tagMatch.
/**
 * Builds readings for the compound {@code word} by pairing every reading of its left part with
 * every reading of its right part and keeping the pairs whose POS tags are compatible.
 * <p>
 * For each pair both tags are first normalized (see inline comments), then the following cases
 * are tried in order: identical tags for numr/adv/adj/verb (or intj with equal lemmas),
 * noun-noun, numr-numr, noun-numr, and a last branch guarded by {@code isJuniorSenior}.
 * Every accepted pair yields an {@link AnalyzedToken} for {@code word} whose lemma is
 * {@code leftLemma + "-" + rightLemma}.
 *
 * @param word the full compound token the new readings are created for
 * @param leftAnalyzedTokens readings of the left part
 * @param rightAnalyzedTokens readings of the right part
 * @return the de-duplicated readings in insertion order; if there are none, the readings
 *         collected by the anim/inanim fallback; {@code null} if both are empty
 */
@Nullable
private List<AnalyzedToken> tagMatch(String word, List<AnalyzedToken> leftAnalyzedTokens, List<AnalyzedToken> rightAnalyzedTokens) {
List<AnalyzedToken> newAnalyzedTokens = new ArrayList<>();
// readings produced by tryAnimInanim; only used when newAnalyzedTokens ends up empty
List<AnalyzedToken> newAnalyzedTokensAnimInanim = new ArrayList<>();
// set when a noun-noun pair with differing animacy could not be tagged; used for debug output only
String animInanimNotTagged = null;
for (AnalyzedToken leftAnalyzedToken : leftAnalyzedTokens) {
String leftPosTag = leftAnalyzedToken.getPOSTag();
// skip left readings that have no POS tag or carry the abbr tag
if (leftPosTag == null || IPOSTag.contains(leftPosTag, IPOSTag.abbr.getText()))
continue;
// we don't want to mess with v_kly, e.g. no v_kly in рибо-полювання
if (leftPosTag.startsWith("noun") && leftPosTag.contains("v_kly"))
continue;
String leftPosTagExtra = "";
boolean leftNv = false;
// remember NV_TAG on the left side and strip it from the working tag
if (leftPosTag.contains(NV_TAG)) {
leftNv = true;
leftPosTag = leftPosTag.replace(NV_TAG, "");
}
// drop every EXTRA_TAGS_DROP match from the working tag
Matcher matcher = EXTRA_TAGS_DROP.matcher(leftPosTag);
if (matcher.find()) {
leftPosTag = matcher.replaceAll("");
}
// keep the first EXTRA_TAGS match so it can be re-appended to the resulting tag,
// then remove all EXTRA_TAGS matches from the working tag
matcher = EXTRA_TAGS.matcher(leftPosTag);
if (matcher.find()) {
leftPosTagExtra += matcher.group();
leftPosTag = matcher.replaceAll("");
}
for (AnalyzedToken rightAnalyzedToken : rightAnalyzedTokens) {
String rightPosTag = rightAnalyzedToken.getPOSTag();
// skip right readings that have no POS tag or carry the abbr tag
if (rightPosTag == null || IPOSTag.contains(rightPosTag, IPOSTag.abbr.getText()))
continue;
String extraNvTag = "";
boolean rightNv = false;
// NV_TAG is re-added to the result only when both sides carry it.
// NOTE(review): unlike on the left side, NV_TAG is not stripped from rightPosTag here —
// confirm whether EXTRA_TAGS/EXTRA_TAGS_DROP are expected to remove it.
if (rightPosTag.contains(NV_TAG)) {
rightNv = true;
if (leftNv) {
extraNvTag += NV_TAG;
}
}
// same normalization as for the left tag, but the removed extra tags are not kept
Matcher matcherR = EXTRA_TAGS_DROP.matcher(rightPosTag);
if (matcherR.find()) {
rightPosTag = matcherR.replaceAll("");
}
matcherR = EXTRA_TAGS.matcher(rightPosTag);
if (matcherR.find()) {
rightPosTag = matcherR.replaceAll("");
}
// identical normalized tags: accepted for numr/adv/adj/verb, and for intj only if the lemmas are equal (ignoring case)
// NOTE(review): the intj case dereferences leftAnalyzedToken.getLemma() — presumably never null here; confirm
if (leftPosTag.equals(rightPosTag) && (IPOSTag.startsWith(leftPosTag, IPOSTag.numr, IPOSTag.adv, IPOSTag.adj, IPOSTag.verb) || (IPOSTag.startsWith(leftPosTag, IPOSTag.intj) && leftAnalyzedToken.getLemma().equalsIgnoreCase(rightAnalyzedToken.getLemma())))) {
newAnalyzedTokens.add(new AnalyzedToken(word, leftPosTag + extraNvTag + leftPosTagExtra, leftAnalyzedToken.getLemma() + "-" + rightAnalyzedToken.getLemma()));
} else // noun-noun
if (leftPosTag.startsWith(IPOSTag.noun.getText()) && rightPosTag.startsWith(IPOSTag.noun.getText())) {
// discard чорний-чорний as noun:anim
if (leftAnalyzedToken.getToken().equalsIgnoreCase(rightAnalyzedToken.getToken()) && leftPosTag.contains(TAG_ANIM) && rightPosTag.contains(TAG_ANIM))
continue;
String agreedPosTag = getAgreedPosTag(leftPosTag, rightPosTag, leftNv, word);
// no agreement, but the right part is a masculine inanimate nominative accepted by isMinMax: use the left tag
if (agreedPosTag == null && rightPosTag.startsWith("noun:inanim:m:v_naz") && isMinMax(rightAnalyzedToken.getToken())) {
agreedPosTag = leftPosTag;
}
// still no agreement and the animacy differs: try the dedicated anim/inanim matching
if (agreedPosTag == null && !isSameAnimStatus(leftPosTag, rightPosTag)) {
agreedPosTag = tryAnimInanim(leftPosTag, rightPosTag, leftAnalyzedToken.getLemma(), rightAnalyzedToken.getLemma(), leftNv, rightNv, word);
if (agreedPosTag == null) {
animInanimNotTagged = leftPosTag.contains(":anim") ? "anim-inanim" : "inanim-anim";
} else {
// anim/inanim results go to the fallback list, not to the primary result
newAnalyzedTokensAnimInanim.add(new AnalyzedToken(word, agreedPosTag + extraNvTag + leftPosTagExtra, leftAnalyzedToken.getLemma() + "-" + rightAnalyzedToken.getLemma()));
continue;
}
}
if (agreedPosTag != null) {
newAnalyzedTokens.add(new AnalyzedToken(word, agreedPosTag + extraNvTag + leftPosTagExtra, leftAnalyzedToken.getLemma() + "-" + rightAnalyzedToken.getLemma()));
}
} else // numr-numr: один-два
if (leftPosTag.startsWith(IPOSTag.numr.getText()) && rightPosTag.startsWith(IPOSTag.numr.getText())) {
String agreedPosTag = getNumAgreedPosTag(leftPosTag, rightPosTag, leftNv);
if (agreedPosTag != null) {
newAnalyzedTokens.add(new AnalyzedToken(word, agreedPosTag + extraNvTag + leftPosTagExtra, leftAnalyzedToken.getLemma() + "-" + rightAnalyzedToken.getLemma()));
}
} else // noun-numr match
if (IPOSTag.startsWith(leftPosTag, IPOSTag.noun) && IPOSTag.startsWith(rightPosTag, IPOSTag.numr)) {
// gender tags match
String leftGenderConj = PosTagHelper.getGenderConj(leftPosTag);
if (leftGenderConj != null && leftGenderConj.equals(PosTagHelper.getGenderConj(rightPosTag))) {
newAnalyzedTokens.add(new AnalyzedToken(word, leftPosTag + extraNvTag + leftPosTagExtra, leftAnalyzedToken.getLemma() + "-" + rightAnalyzedToken.getLemma()));
// година-півтори can be either singular or plural: минула година-півтори, минулі година-півтори
if (!leftPosTag.contains(":p:")) {
newAnalyzedTokens.add(new AnalyzedToken(word, leftPosTag.replaceAll(":[mfn]:", ":p:") + extraNvTag + leftPosTagExtra, leftAnalyzedToken.getLemma() + "-" + rightAnalyzedToken.getLemma()));
}
} else {
// (with different gender tags): сотні (:p:) - дві (:f:)
String agreedPosTag = getNumAgreedPosTag(leftPosTag, rightPosTag, leftNv);
if (agreedPosTag != null) {
newAnalyzedTokens.add(new AnalyzedToken(word, agreedPosTag + extraNvTag + leftPosTagExtra, leftAnalyzedToken.getLemma() + "-" + rightAnalyzedToken.getLemma()));
// рік-два can be either singular or plural: минулий рік-два, минулі рік-два
if (!agreedPosTag.contains(":p:")) {
newAnalyzedTokens.add(new AnalyzedToken(word, agreedPosTag.replaceAll(":[mfn]:", ":p:") + extraNvTag + leftPosTagExtra, leftAnalyzedToken.getLemma() + "-" + rightAnalyzedToken.getLemma()));
}
}
}
} else // we don't handle братів-православних — general noun-adj matching gives too many false positives
// NOTE(review): && binds tighter than ||, so this condition reads
// (left is noun && right is numr) || (right is adj && isJuniorSenior(...)).
// The noun+numr combination appears to be consumed by the branch above already, which would
// leave only the adj/isJuniorSenior part effective, without requiring the left tag to be a
// noun — confirm whether noun && (numr || (adj && isJuniorSenior)) was intended.
if (leftPosTag.startsWith(IPOSTag.noun.getText()) && IPOSTag.startsWith(rightPosTag, IPOSTag.numr) || (IPOSTag.startsWith(rightPosTag, IPOSTag.adj) && isJuniorSenior(leftAnalyzedToken, rightAnalyzedToken))) {
// if( ! leftPosTag.contains(":prop")
// || isJuniorSenior(leftAnalyzedToken, rightAnalyzedToken) ) {
// discard чорний-чорний as noun:anim
// if( leftAnalyzedToken.getToken().equalsIgnoreCase(rightAnalyzedToken.getToken()) )
// continue;
String leftGenderConj = PosTagHelper.getGenderConj(leftPosTag);
if (leftGenderConj != null && leftGenderConj.equals(PosTagHelper.getGenderConj(rightPosTag))) {
newAnalyzedTokens.add(new AnalyzedToken(word, leftPosTag + extraNvTag + leftPosTagExtra, leftAnalyzedToken.getLemma() + "-" + rightAnalyzedToken.getLemma()));
}
// }
}
}
}
// remove duplicates
newAnalyzedTokens = new ArrayList<>(new LinkedHashSet<>(newAnalyzedTokens));
// fall back to the anim/inanim readings only when nothing else matched
if (newAnalyzedTokens.isEmpty()) {
newAnalyzedTokens = newAnalyzedTokensAnimInanim;
}
// report compounds that failed solely because of an untaggable anim/inanim pair
if (animInanimNotTagged != null && newAnalyzedTokens.isEmpty()) {
debug_compound_unknown_write(word + " " + animInanimNotTagged);
}
return newAnalyzedTokens.isEmpty() ? null : newAnalyzedTokens;
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class PosTagHelper method getGenders.
/**
 * Collects the distinct gender markers of all readings whose POS tag fully matches
 * {@code posTagRegex}, concatenated in order of first occurrence.
 * <p>
 * Readings without a POS tag are skipped; passing a {@code null} tag to the regex matcher would
 * otherwise throw a {@link NullPointerException}. A reading for which {@code getGender} yields
 * no gender is skipped as well (defensive: {@code getGender} is not visible from this method, so
 * whether it can return {@code null} should be confirmed).
 *
 * @param tokenReadings readings to inspect
 * @param posTagRegex regular expression a POS tag must match completely to be considered
 * @return the concatenated gender markers without repetitions; empty if nothing matched
 */
public static String getGenders(AnalyzedTokenReadings tokenReadings, String posTagRegex) {
  Pattern posTagPattern = Pattern.compile(posTagRegex);
  StringBuilder genders = new StringBuilder(4);
  for (AnalyzedToken tokenReading : tokenReadings) {
    String posTag = tokenReading.getPOSTag();
    if (posTag == null || !posTagPattern.matcher(posTag).matches()) {
      continue;
    }
    String gender = getGender(posTag);
    // append each gender marker only once
    if (gender != null && genders.indexOf(gender) == -1) {
      genders.append(gender);
    }
  }
  return genders.toString();
}
Aggregations