Use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
The class MatchState, method filterReadings:
public final AnalyzedTokenReadings filterReadings() {
  List<AnalyzedToken> l = new ArrayList<>();
  if (formattedToken != null) {
    if (match.isStaticLemma()) {
      matchedToken.leaveReading(new AnalyzedToken(matchedToken.getToken(), match.getPosTag(), formattedToken.getToken()));
      formattedToken = matchedToken;
    }
    String token = formattedToken.getToken();
    Pattern regexMatch = match.getRegexMatch();
    String regexReplace = match.getRegexReplace();
    if (regexMatch != null && regexReplace != null) {
      /* only replace if there is something to replace */
      token = regexMatch.matcher(token).replaceAll(regexReplace);
    }
    token = convertCase(token, token, null);
    String posTag = match.getPosTag();
    if (posTag != null) {
      int numRead = formattedToken.getReadingsLength();
      if (match.isPostagRegexp()) {
        Pattern pPosRegexMatch = match.getPosRegexMatch();
        String posTagReplace = match.getPosTagReplace();
        String targetPosTag;
        for (int i = 0; i < numRead; i++) {
          String testTag = formattedToken.getAnalyzedToken(i).getPOSTag();
          if (testTag != null && pPosRegexMatch.matcher(testTag).matches()) {
            targetPosTag = testTag;
            if (posTagReplace != null) {
              targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll(posTagReplace);
            }
            l.add(new AnalyzedToken(token, targetPosTag, formattedToken.getAnalyzedToken(i).getLemma()));
            l.get(l.size() - 1).setWhitespaceBefore(formattedToken.isWhitespaceBefore());
          }
        }
        if (l.isEmpty()) {
          l.addAll(getNewToken(numRead, token));
        }
      } else {
        l.addAll(getNewToken(numRead, token));
      }
      String lemma = formattedToken.getAnalyzedToken(0).getLemma();
      if (formattedToken.isSentenceEnd()) {
        l.add(new AnalyzedToken(formattedToken.getToken(), SENTENCE_END_TAGNAME, lemma));
      }
      if (formattedToken.isParagraphEnd()) {
        l.add(new AnalyzedToken(formattedToken.getToken(), PARAGRAPH_END_TAGNAME, lemma));
      }
    }
  }
  if (l.isEmpty()) {
    return formattedToken;
  }
  final AnalyzedTokenReadings anTkRead = new AnalyzedTokenReadings(l.toArray(new AnalyzedToken[l.size()]), formattedToken.getStartPos());
  anTkRead.setWhitespaceBefore(formattedToken.isWhitespaceBefore());
  if (!formattedToken.getChunkTags().isEmpty()) {
    anTkRead.setChunkTags(formattedToken.getChunkTags());
  }
  if (formattedToken.isImmunized()) {
    anTkRead.immunize();
  }
  return anTkRead;
}
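The method above ultimately wraps the filtered list of AnalyzedToken readings in a new AnalyzedTokenReadings object and copies over whitespace and chunk information. As a minimal sketch of that relationship, here is how a readings object can be built by hand, using only the constructors and accessors that appear in the snippet; the class name, token text, POS tags, and lemma are made up for illustration:

import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;

public class ReadingsSketch {
  public static void main(String[] args) {
    // two hypothetical readings for the surface form "walks"; the POS tags
    // and lemma are illustrative values, not the output of a real tagger
    AnalyzedToken nounReading = new AnalyzedToken("walks", "NNS", "walk");
    AnalyzedToken verbReading = new AnalyzedToken("walks", "VBZ", "walk");
    AnalyzedTokenReadings readings =
        new AnalyzedTokenReadings(new AnalyzedToken[] { nounReading, verbReading }, 0);
    readings.setWhitespaceBefore(true);
    System.out.println(readings.getToken() + " has " + readings.getReadingsLength() + " readings");
  }
}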
Use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
The class AdvancedWordRepeatRule, method match:
/*
 * Tests if any word form is repeated in the sentence.
 */
@Override
public final RuleMatch[] match(AnalyzedSentence sentence) {
  List<RuleMatch> ruleMatches = new ArrayList<>();
  AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
  boolean repetition = false;
  Set<String> inflectedWords = new TreeSet<>();
  String prevLemma;
  int curToken = 0;
  // start from the first real token; token 0 is SENT_START
  for (int i = 1; i < tokens.length; i++) {
    String token = tokens[i].getToken();
    // avoid matching "..." etc.:
    boolean isWord = true;
    boolean hasLemma = true;
    if (token.length() < 2) {
      isWord = false;
    }
    for (AnalyzedToken analyzedToken : tokens[i]) {
      String posTag = analyzedToken.getPOSTag();
      if (posTag != null) {
        if (StringTools.isEmpty(posTag)) {
          isWord = false;
          break;
        }
        String lemma = analyzedToken.getLemma();
        if (lemma == null) {
          hasLemma = false;
          break;
        }
        if (getExcludedWordsPattern().contains(lemma)) {
          isWord = false;
          break;
        }
        Matcher m2 = getExcludedPos().matcher(posTag);
        if (m2.matches()) {
          isWord = false;
          break;
        }
      } else {
        hasLemma = false;
      }
    }
    Matcher m1 = getExcludedNonWordsPattern().matcher(tokens[i].getToken());
    if (isWord && m1.matches()) {
      isWord = false;
    }
    prevLemma = "";
    if (isWord) {
      boolean notSentEnd = false;
      for (AnalyzedToken analyzedToken : tokens[i]) {
        String pos = analyzedToken.getPOSTag();
        if (pos != null) {
          notSentEnd |= JLanguageTool.SENTENCE_END_TAGNAME.equals(pos);
        }
        if (hasLemma) {
          String curLemma = analyzedToken.getLemma();
          if (!prevLemma.equals(curLemma) && !notSentEnd) {
            if (inflectedWords.contains(curLemma) && curToken != i) {
              repetition = true;
            } else {
              inflectedWords.add(analyzedToken.getLemma());
              curToken = i;
            }
          }
          prevLemma = curLemma;
        } else {
          if (inflectedWords.contains(tokens[i].getToken()) && !notSentEnd) {
            repetition = true;
          } else {
            inflectedWords.add(tokens[i].getToken());
          }
        }
      }
    }
    if (repetition) {
      int pos = tokens[i].getStartPos();
      RuleMatch ruleMatch = new RuleMatch(this, pos, pos + token.length(), getMessage(), getShortMessage());
      ruleMatches.add(ruleMatch);
      repetition = false;
    }
  }
  return toRuleMatchArray(ruleMatches);
}
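The repetition check above works entirely on the lemmas carried by each token's readings. A minimal sketch of inspecting those readings for an analyzed sentence is shown below; the language class (AmericanEnglish) and the example sentence are placeholder choices, not part of the rule:

import java.io.IOException;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.language.AmericanEnglish;

public class LemmaSketch {
  public static void main(String[] args) throws IOException {
    JLanguageTool lt = new JLanguageTool(new AmericanEnglish());
    // analyze a sentence and print each token's lemmas, the same data the
    // repetition rule compares against its set of already-seen lemmas
    AnalyzedSentence sentence = lt.getAnalyzedSentence("He walks and she walked.");
    for (AnalyzedTokenReadings token : sentence.getTokensWithoutWhitespace()) {
      for (AnalyzedToken reading : token) {
        System.out.println(token.getToken() + " -> lemma: " + reading.getLemma()
            + ", POS: " + reading.getPOSTag());
      }
    }
  }
}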
Use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
The class DemoRule, method match:
// This is the method with the error detection logic that you need to implement:
@Override
public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
  List<RuleMatch> ruleMatches = new ArrayList<>();
  // Let's get all the tokens (i.e. words) of this sentence, but not the spaces:
  AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
  // Iterate over those tokens; note that the first token is always
  // a special token that indicates the start of a sentence:
  for (AnalyzedTokenReadings token : tokens) {
    // print the token, i.e. the original word from the input text:
    System.out.println("Token: " + token.getToken());
    // a token can have more than one reading (lemma and POS tag),
    // so we iterate over the readings:
    for (AnalyzedToken analyzedToken : token.getReadings()) {
      System.out.println(" Lemma: " + analyzedToken.getLemma());
      System.out.println(" POS: " + analyzedToken.getPOSTag());
    }
    // if the token is "demo", create a rule match that is later shown to the user:
    if (token.getToken().equals("demo")) {
      RuleMatch ruleMatch = new RuleMatch(this, token.getStartPos(), token.getEndPos(), "The demo rule thinks this looks wrong");
      // the user will see this as a suggested correction:
      ruleMatch.setSuggestedReplacement("blablah");
      ruleMatches.add(ruleMatch);
    }
  }
  return toRuleMatchArray(ruleMatches);
}
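To see this rule in action it has to be registered with a JLanguageTool instance and run over some text. A minimal sketch of that wiring, assuming the DemoRule class above is on the classpath and using AmericanEnglish and an arbitrary test sentence as placeholder choices:

import java.io.IOException;
import java.util.List;
import org.languagetool.JLanguageTool;
import org.languagetool.language.AmericanEnglish;
import org.languagetool.rules.RuleMatch;

public class DemoRuleRunner {
  public static void main(String[] args) throws IOException {
    JLanguageTool lt = new JLanguageTool(new AmericanEnglish());
    lt.addRule(new DemoRule());  // register the custom rule shown above
    List<RuleMatch> matches = lt.check("This is a demo sentence.");
    for (RuleMatch match : matches) {
      System.out.println(match.getFromPos() + "-" + match.getToPos() + ": "
          + match.getMessage() + ", suggestions: " + match.getSuggestedReplacements());
    }
  }
}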
Use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
The class PatternTokenTest, method testUnknownTag:
@Test
public void testUnknownTag() {
  PatternToken patternToken = new PatternToken("", false, false, false);
  patternToken.setPosToken(new PatternToken.PosToken(UNKNOWN_TAG, false, false));
  PatternToken patternToken2 = new PatternToken("", false, false, false);
  patternToken2.setPosToken(new PatternToken.PosToken(UNKNOWN_TAG, false, true));
  PatternToken patternToken3 = new PatternToken("", false, false, false);
  patternToken3.setPosToken(new PatternToken.PosToken(UNKNOWN_TAG + "|VBG", true, false));
  PatternToken patternToken4 = new PatternToken("", false, false, false);
  patternToken4.setPosToken(new PatternToken.PosToken(UNKNOWN_TAG + "|VBG", true, true));
  PatternToken patternToken5 = new PatternToken("\\p{Ll}+", false, true, false);
  patternToken5.setPosToken(new PatternToken.PosToken(UNKNOWN_TAG, false, false));
  AnalyzedToken an = new AnalyzedToken("schword", null, null);
  assertTrue(patternToken.isMatched(an));
  assertFalse(patternToken2.isMatched(an));
  assertTrue(patternToken3.isMatched(an));
  assertFalse(patternToken4.isMatched(an));
  assertTrue(patternToken5.isMatched(an));
  // if the AnalyzedToken is in the set of readings that have non-null tags...
  an.setNoPOSTag(false);
  assertFalse(patternToken.isMatched(an));
  assertTrue(patternToken2.isMatched(an));
  assertFalse(patternToken3.isMatched(an));
  assertTrue(patternToken4.isMatched(an));
  assertFalse(patternToken5.isMatched(an));
  AnalyzedToken anSentEnd = new AnalyzedToken("schword", SENTENCE_END_TAGNAME, null);
  assertTrue(patternToken.isMatched(anSentEnd));
  assertFalse(patternToken2.isMatched(anSentEnd));
  assertTrue(patternToken3.isMatched(anSentEnd));
  assertFalse(patternToken4.isMatched(anSentEnd));
  assertTrue(patternToken5.isMatched(anSentEnd));
  PatternToken patternToken6 = new PatternToken("\\p{Ll}+", false, true, false);
  patternToken6.setPosToken(new PatternToken.PosToken(SENTENCE_END_TAGNAME, false, false));
  assertTrue(patternToken6.isMatched(anSentEnd));
  PatternToken patternToken7 = new PatternToken("\\p{Ll}+", false, true, false);
  patternToken7.setPosToken(new PatternToken.PosToken(SENTENCE_END_TAGNAME + "|BLABLA", true, false));
  assertTrue(patternToken7.isMatched(anSentEnd));
  // if the AnalyzedToken is in the set of readings that have non-null tags...
  anSentEnd.setNoPOSTag(false);
  assertFalse(patternToken.isMatched(anSentEnd));
  assertTrue(patternToken2.isMatched(anSentEnd));
  assertFalse(patternToken3.isMatched(anSentEnd));
  assertTrue(patternToken4.isMatched(anSentEnd));
  assertFalse(patternToken5.isMatched(anSentEnd));
  AnalyzedToken anParaEnd = new AnalyzedToken("schword", PARAGRAPH_END_TAGNAME, null);
  assertTrue(patternToken.isMatched(anParaEnd));
  assertFalse(patternToken2.isMatched(anParaEnd));
  assertTrue(patternToken3.isMatched(anParaEnd));
  assertFalse(patternToken4.isMatched(anParaEnd));
  assertTrue(patternToken5.isMatched(anParaEnd));
  // if the AnalyzedToken is in the set of readings that have non-null tags...
  anParaEnd.setNoPOSTag(false);
  assertFalse(patternToken.isMatched(anParaEnd));
  assertTrue(patternToken2.isMatched(anParaEnd));
  assertFalse(patternToken3.isMatched(anParaEnd));
  assertTrue(patternToken4.isMatched(anParaEnd));
  assertFalse(patternToken5.isMatched(anParaEnd));
  AnalyzedToken anWithPOS = new AnalyzedToken("schword", "POS", null);
  assertFalse(patternToken.isMatched(anWithPOS));
  assertTrue(patternToken2.isMatched(anWithPOS));
  assertFalse(patternToken3.isMatched(anWithPOS));
  assertTrue(patternToken4.isMatched(anWithPOS));
  assertFalse(patternToken5.isMatched(anWithPOS));
}
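The test pairs a surface-form condition with a POS condition on each PatternToken. A minimal sketch of the same pairing outside the test, assuming the constructor arguments keep the meaning suggested above (token string, case sensitivity, regexp flag, inflected flag, and for PosToken: POS tag, regexp flag, negation flag); the word, tag, and lemma here are illustrative:

import org.languagetool.AnalyzedToken;
import org.languagetool.rules.patterns.PatternToken;

public class PatternTokenSketch {
  public static void main(String[] args) {
    // a pattern token for any lowercase word (regexp on the surface form)
    // whose reading carries the POS tag "VBG"; the tag is an illustrative value
    PatternToken gerund = new PatternToken("\\p{Ll}+", false, true, false);
    gerund.setPosToken(new PatternToken.PosToken("VBG", false, false));
    AnalyzedToken reading = new AnalyzedToken("running", "VBG", "run");
    System.out.println(gerund.isMatched(reading));  // expected: true
  }
}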
Use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
The class GermanReflexiveVerbGuesser, method run:
private void run(File indexTopDir, File lemmaListFile) throws IOException {
  List<String> lemmas = Files.readAllLines(lemmaListFile.toPath());
  // column header (German): "average percent | lemma count | mich/uns/euch ... | ... mich/uns/euch | lemma"
  System.out.println("Durchschnitt Prozent | Anzahl Lemma | mich/uns/euch ... | ... mich/uns/euch | Lemma");
  try (LuceneLanguageModel lm = new LuceneLanguageModel(indexTopDir)) {
    for (String lemma : lemmas) {
      //if (!lemma.equals("reklamieren")) { continue; }
      //if (!lemma.equals("hertreiben")) { continue; }
      String[] firstPsSinArray = synthesizer.synthesize(new AnalyzedToken(lemma, "VER:INF:NON", lemma), "VER:1:SIN:PRÄ.*", true);
      String[] thirdPsSinArray = synthesizer.synthesize(new AnalyzedToken(lemma, "VER:INF:NON", lemma), "VER:3:SIN:PRÄ.*", true);
      String firstPsSin = firstPsSinArray.length > 0 ? firstPsSinArray[0] : null;
      String thirdPsSin = thirdPsSinArray.length > 0 ? thirdPsSinArray[0] : null;
      long reflexiveCount1 = count1(lm, lemma, firstPsSin, thirdPsSin) - counterExamples("für", lm, lemma, firstPsSin, thirdPsSin) - counterExamples("vor", lm, lemma, firstPsSin, thirdPsSin);
      long reflexiveCount2 = count2(lm, lemma, firstPsSin, thirdPsSin);
      long lemmaCount = lm.getCount(lemma);
      float factor1 = ((float) reflexiveCount1 / lemmaCount) * 100.0f;
      float factor2 = ((float) reflexiveCount2 / lemmaCount) * 100.0f;
      float avgFactor = (factor1 + factor2) / 2;
      //System.out.printf("%.2f%% %.2f%% " + reflexiveCount1 + " " + reflexiveCount2 + " " + lemmaCount + " " + lemma + "\n", factor1, factor2);
      //System.out.printf("%.2f%% %.2f%% " + lemmaCount + " " + lemma + "\n", factor1, factor2);
      System.out.printf("%.2f %d %.2f%% %.2f%% %s\n", avgFactor, lemmaCount, factor1, factor2, lemma);
    }
  }
}
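The key call here is synthesizer.synthesize(...), which generates inflected forms for a lemma from a POS tag pattern. A minimal sketch of that call in isolation, assuming a German synthesizer can be obtained from the language object via getSynthesizer(); the verb "gehen" is a placeholder, and the tag strings mirror the ones used above:

import org.languagetool.AnalyzedToken;
import org.languagetool.language.German;
import org.languagetool.synthesis.Synthesizer;

public class SynthesizeSketch {
  public static void main(String[] args) throws Exception {
    Synthesizer synthesizer = new German().getSynthesizer();
    // generate first person singular present forms for the lemma "gehen";
    // the tag pattern follows the German tagset used in the snippet above
    String[] forms = synthesizer.synthesize(
        new AnalyzedToken("gehen", "VER:INF:NON", "gehen"), "VER:1:SIN:PRÄ.*", true);
    for (String form : forms) {
      System.out.println(form);
    }
  }
}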