Use of org.languagetool.chunking.ChunkTag in the languagetool project (languagetool-org): class SubjectVerbAgreementRule, method getPluralMatchOrNull.
@Nullable
private RuleMatch getPluralMatchOrNull(AnalyzedTokenReadings[] tokens, int i, AnalyzedTokenReadings token, String tokenStr) {
  // Only verbs from the configured 'plural' set can trigger this check.
  if (!plural.contains(tokenStr)) {
    return null;
  }
  AnalyzedTokenReadings precedingToken = tokens[i - 1];
  List<ChunkTag> precedingChunks = precedingToken.getChunkTags();
  // The token before the verb must belong to a singular noun phrase and
  // neither to a plural noun phrase nor a prepositional phrase.
  boolean singularNounPhraseBefore = precedingChunks.contains(NPS)
      && !precedingChunks.contains(NPP)
      && !precedingChunks.contains(PP);
  if (singularNounPhraseBefore
      && !isCurrency(precedingToken)
      && prevChunkIsNominative(tokens, i - 1)
      && !hasUnknownTokenToTheLeft(tokens, i)
      && !hasUnknownTokenToTheRight(tokens, i + 1)
      // e.g. "Die Zielgruppe sind Männer." - both nominative, but 'Männer' is the subject
      && !isFollowedByNominativePlural(tokens, i + 1)) {
    String message = "Bitte prüfen, ob hier <suggestion>" + getSingularFor(tokenStr) + "</suggestion> stehen sollte.";
    return new RuleMatch(this, token.getStartPos(), token.getEndPos(), message);
  }
  return null;
}
Use of org.languagetool.chunking.ChunkTag in the languagetool project (languagetool-org): class SubjectVerbAgreementRule, method getSingularMatchOrNull.
@Nullable
private RuleMatch getSingularMatchOrNull(AnalyzedTokenReadings[] tokens, int i, AnalyzedTokenReadings token, String tokenStr) throws IOException {
  // Only verbs from the configured 'singular' set can trigger this check.
  if (!singular.contains(tokenStr)) {
    return null;
  }
  AnalyzedTokenReadings precedingToken = tokens[i - 1];
  AnalyzedTokenReadings followingToken = i + 1 < tokens.length ? tokens[i + 1] : null;
  List<ChunkTag> precedingChunks = precedingToken.getChunkTags();
  // The token before the verb must be part of a plural noun phrase that is
  // not inside a prepositional phrase.
  boolean pluralNounPhraseBefore = precedingChunks.contains(NPP) && !precedingChunks.contains(PP);
  boolean match = pluralNounPhraseBefore
      // 'um 18 Uhr ist Feierabend'
      && !precedingToken.getToken().equals("Uhr")
      && !isCurrency(precedingToken)
      // 'zehn Jahre ist es her'
      && !(followingToken != null && followingToken.getToken().equals("es"))
      && prevChunkIsNominative(tokens, i - 1)
      && !hasUnknownTokenToTheLeft(tokens, i)
      && !hasQuestionPronounToTheLeft(tokens, i - 1)
      && !containsRegexToTheLeft("wer", tokens, i - 1)
      && !containsRegexToTheLeft("(?i)alle[nr]?", tokens, i - 1)
      && !containsRegexToTheLeft("(?i)jede[rs]?", tokens, i - 1)
      && !containsRegexToTheLeft("(?i)manche[nrs]?", tokens, i - 1)
      && !containsOnlyInfinitivesToTheLeft(tokens, i - 1);
  if (match) {
    String message = "Bitte prüfen, ob hier <suggestion>" + getPluralFor(tokenStr) + "</suggestion> stehen sollte.";
    return new RuleMatch(this, token.getStartPos(), token.getEndPos(), message);
  }
  return null;
}
Use of org.languagetool.chunking.ChunkTag in the languagetool project (languagetool-org): class DisambiguationPatternRuleReplacer, method executeAction.
/**
 * Applies this disambiguation rule's action (UNIFY, REMOVE, ADD, FILTERALL,
 * IMMUNIZE, IGNORE_SPELLING, FILTER, or REPLACE) to the span of tokens the
 * rule matched, and returns a modified copy of the sentence's token array.
 *
 * @param sentence the analyzed sentence the match occurred in
 * @param whiteTokens sentence tokens including whitespace; cloned, never mutated
 * @param unifiedTokens readings produced by unification, used by the UNIFY action (may be null)
 * @param firstMatchToken index of the first matched token
 * @param lastMatchToken index of the last matched token, or -1
 * @param matchingTokens number of tokens matched by the pattern
 * @param tokenPositions per-pattern-element token counts used to correct positions
 * @return a copy of {@code whiteTokens} with the action applied
 */
private AnalyzedTokenReadings[] executeAction(AnalyzedSentence sentence, AnalyzedTokenReadings[] whiteTokens, AnalyzedTokenReadings[] unifiedTokens, int firstMatchToken, int lastMatchToken, int matchingTokens, int[] tokenPositions) {
// Work on a copy so the caller's token array is left untouched.
AnalyzedTokenReadings[] whTokens = whiteTokens.clone();
DisambiguationPatternRule rule = (DisambiguationPatternRule) this.rule;
int correctedStPos = 0;
int startPositionCorrection = rule.getStartPositionCorrection();
int endPositionCorrection = rule.getEndPositionCorrection();
int matchingTokensWithCorrection = matchingTokens;
List<Integer> tokenPositionList = new ArrayList<>();
for (int i : tokenPositions) {
tokenPositionList.add(i);
}
// When the rule's <marker> starts after the first pattern element, shift the
// start position by the widths of the skipped elements, inserting zero-width
// entries for optional pattern elements that did not match.
if (startPositionCorrection > 0) {
//token positions are shifted by 1
correctedStPos--;
for (int j = 0; j < pTokensMatched.size(); j++) {
if (!pTokensMatched.get(j)) {
// add zero-length token corresponding to the non-matching pattern element so that position count is fine
tokenPositionList.add(j, 0);
}
}
for (int l = 0; l <= startPositionCorrection && tokenPositionList.size() > l; l++) {
correctedStPos += tokenPositionList.get(l);
}
// adjust to make sure the token count is fine as it's checked later
int w = startPositionCorrection;
for (int j = 0; j <= w; j++) {
if (j < pTokensMatched.size() && !pTokensMatched.get(j)) {
startPositionCorrection--;
}
}
}
if (endPositionCorrection < 0) {
// adjust the end position correction if one of the elements has not been matched
for (int d = startPositionCorrection; d < pTokensMatched.size(); d++) {
if (!pTokensMatched.get(d)) {
endPositionCorrection++;
}
}
}
// Widen the matched-token count if the corrected span is larger than the
// raw match (never shrink it: Math.max against 0).
if (lastMatchToken != -1) {
int maxPosCorrection = Math.max((lastMatchToken + 1 - (firstMatchToken + correctedStPos)) - matchingTokens, 0);
matchingTokensWithCorrection += maxPosCorrection;
}
// fromPos is the sentence position of the first token the action applies to.
int fromPos = sentence.getOriginalPosition(firstMatchToken + correctedStPos);
boolean spaceBefore = whTokens[fromPos].isWhitespaceBefore();
DisambiguationPatternRule.DisambiguatorAction disAction = rule.getAction();
AnalyzedToken[] newTokenReadings = rule.getNewTokenReadings();
Match matchElement = rule.getMatchElement();
String disambiguatedPOS = rule.getDisambiguatedPOS();
switch(disAction) {
case UNIFY:
// Replace the matched tokens with the unified readings, preserving each
// token's chunk tags and sentence-end flag.
if (unifiedTokens != null) {
//TODO: unifiedTokens.length is larger > matchingTokensWithCorrection in cases where there are no markers...
if (unifiedTokens.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
if (whTokens[sentence.getOriginalPosition(firstMatchToken + correctedStPos + unifiedTokens.length - 1)].isSentenceEnd()) {
unifiedTokens[unifiedTokens.length - 1].setSentEnd();
}
for (int i = 0; i < unifiedTokens.length; i++) {
int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
unifiedTokens[i].setStartPos(whTokens[position].getStartPos());
String prevValue = whTokens[position].toString();
String prevAnot = whTokens[position].getHistoricalAnnotations();
// keep the original chunk tags on the replacement token
List<ChunkTag> chTags = whTokens[position].getChunkTags();
whTokens[position] = unifiedTokens[i];
whTokens[position].setChunkTags(chTags);
annotateChange(whTokens[position], prevValue, prevAnot);
}
}
}
break;
case REMOVE:
// Either remove the explicitly listed readings, or - if only a POS regex
// is given - remove every reading whose POS tag matches it.
if (newTokenReadings != null && newTokenReadings.length > 0) {
if (newTokenReadings.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
for (int i = 0; i < newTokenReadings.length; i++) {
int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
String prevValue = whTokens[position].toString();
String prevAnot = whTokens[position].getHistoricalAnnotations();
whTokens[position].removeReading(newTokenReadings[i]);
annotateChange(whTokens[position], prevValue, prevAnot);
}
}
} else if (!StringTools.isEmpty(disambiguatedPOS)) {
// negative filtering
Pattern p = Pattern.compile(disambiguatedPOS);
// iterate over a snapshot of the readings so removal is safe
AnalyzedTokenReadings tmp = new AnalyzedTokenReadings(whTokens[fromPos].getReadings(), whTokens[fromPos].getStartPos());
for (AnalyzedToken analyzedToken : tmp) {
if (analyzedToken.getPOSTag() != null) {
Matcher mPos = p.matcher(analyzedToken.getPOSTag());
if (mPos.matches()) {
int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos);
String prevValue = whTokens[position].toString();
String prevAnot = whTokens[position].getHistoricalAnnotations();
whTokens[position].removeReading(analyzedToken);
annotateChange(whTokens[position], prevValue, prevAnot);
}
}
}
}
break;
case ADD:
// Add the listed readings to the matched tokens; an empty token or lemma
// in a new reading falls back to the existing token text.
if (newTokenReadings != null) {
if (newTokenReadings.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
for (int i = 0; i < newTokenReadings.length; i++) {
String token;
int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
if (newTokenReadings[i].getToken().isEmpty()) {
token = whTokens[position].getToken();
} else {
token = newTokenReadings[i].getToken();
}
String lemma;
if (newTokenReadings[i].getLemma() == null) {
lemma = token;
} else {
lemma = newTokenReadings[i].getLemma();
}
AnalyzedToken newTok = new AnalyzedToken(token, newTokenReadings[i].getPOSTag(), lemma);
String prevValue = whTokens[position].toString();
String prevAnot = whTokens[position].getHistoricalAnnotations();
whTokens[position].addReading(newTok);
annotateChange(whTokens[position], prevValue, prevAnot);
}
}
}
break;
case FILTERALL:
// Filter every matched token by the POS tag of its corresponding pattern
// element; for unmatched (optional) elements, use the next matched one.
for (int i = 0; i < matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection; i++) {
int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
PatternToken pToken;
if (pTokensMatched.get(i + startPositionCorrection)) {
pToken = rule.getPatternTokens().get(i + startPositionCorrection);
} else {
// skip forward past pattern elements that did not match
int k = 1;
while (i + startPositionCorrection + k < rule.getPatternTokens().size() + endPositionCorrection && !pTokensMatched.get(i + startPositionCorrection + k)) {
k++;
}
pToken = rule.getPatternTokens().get(i + k + startPositionCorrection);
}
Match tmpMatchToken = new Match(pToken.getPOStag(), null, true, pToken.getPOStag(), null, Match.CaseConversion.NONE, false, false, Match.IncludeRange.NONE);
MatchState matchState = tmpMatchToken.createState(rule.getLanguage().getSynthesizer(), whTokens[position]);
String prevValue = whTokens[position].toString();
String prevAnot = whTokens[position].getHistoricalAnnotations();
whTokens[position] = matchState.filterReadings();
annotateChange(whTokens[position], prevValue, prevAnot);
}
break;
case IMMUNIZE:
// Mark the matched tokens so later rules will not flag them.
for (int i = 0; i < matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection; i++) {
whTokens[sentence.getOriginalPosition(firstMatchToken + correctedStPos + i)].immunize();
}
break;
case IGNORE_SPELLING:
// Mark the matched tokens to be skipped by the spell checker.
for (int i = 0; i < matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection; i++) {
whTokens[sentence.getOriginalPosition(firstMatchToken + correctedStPos + i)].ignoreSpelling();
}
break;
case FILTER:
// Without a <match> element: keep only readings whose POS tag matches
// disambiguatedPOS, but only if at least one reading already matches.
if (matchElement == null) {
// same as REPLACE if using <match>
Match tmpMatchToken = new Match(disambiguatedPOS, null, true, disambiguatedPOS, null, Match.CaseConversion.NONE, false, false, Match.IncludeRange.NONE);
boolean newPOSmatches = false;
// only apply filter rule when it matches previous tags:
for (int i = 0; i < whTokens[fromPos].getReadingsLength(); i++) {
if (!whTokens[fromPos].getAnalyzedToken(i).hasNoTag() && whTokens[fromPos].getAnalyzedToken(i).getPOSTag() != null && whTokens[fromPos].getAnalyzedToken(i).getPOSTag().matches(disambiguatedPOS)) {
newPOSmatches = true;
break;
}
}
if (newPOSmatches) {
MatchState matchState = tmpMatchToken.createState(rule.getLanguage().getSynthesizer(), whTokens[fromPos]);
String prevValue = whTokens[fromPos].toString();
String prevAnot = whTokens[fromPos].getHistoricalAnnotations();
whTokens[fromPos] = matchState.filterReadings();
annotateChange(whTokens[fromPos], prevValue, prevAnot);
}
break;
}
//fallthrough
case REPLACE:
default:
// REPLACE (and FILTER with a <match> element): three sub-cases below -
// explicit new readings, a plain disambiguatedPOS, or a <match> element.
if (newTokenReadings != null && newTokenReadings.length > 0) {
if (newTokenReadings.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
for (int i = 0; i < newTokenReadings.length; i++) {
String token;
int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
if ("".equals(newTokenReadings[i].getToken())) {
// empty token
token = whTokens[position].getToken();
} else {
token = newTokenReadings[i].getToken();
}
String lemma;
if (newTokenReadings[i].getLemma() == null) {
// empty lemma
lemma = token;
} else {
lemma = newTokenReadings[i].getLemma();
}
AnalyzedToken analyzedToken = new AnalyzedToken(token, newTokenReadings[i].getPOSTag(), lemma);
AnalyzedTokenReadings toReplace = new AnalyzedTokenReadings(analyzedToken, whTokens[fromPos].getStartPos());
whTokens[position] = replaceTokens(whTokens[position], toReplace);
}
}
} else if (matchElement == null) {
// Replace with a single reading carrying disambiguatedPOS; reuse the
// lemma of an existing reading with that POS tag if available.
String lemma = "";
for (AnalyzedToken analyzedToken : whTokens[fromPos]) {
if (analyzedToken.getPOSTag() != null && analyzedToken.getPOSTag().equals(disambiguatedPOS) && analyzedToken.getLemma() != null) {
lemma = analyzedToken.getLemma();
}
}
if (StringTools.isEmpty(lemma)) {
lemma = whTokens[fromPos].getAnalyzedToken(0).getLemma();
}
AnalyzedToken analyzedToken = new AnalyzedToken(whTokens[fromPos].getToken(), disambiguatedPOS, lemma);
AnalyzedTokenReadings toReplace = new AnalyzedTokenReadings(analyzedToken, whTokens[fromPos].getStartPos());
whTokens[fromPos] = replaceTokens(whTokens[fromPos], toReplace);
} else {
// using the match element
MatchState matchElementState = matchElement.createState(rule.getLanguage().getSynthesizer(), whTokens[fromPos]);
String prevValue = whTokens[fromPos].toString();
String prevAnot = whTokens[fromPos].getHistoricalAnnotations();
whTokens[fromPos] = matchElementState.filterReadings();
// restore the whitespace flag captured before the replacement
whTokens[fromPos].setWhitespaceBefore(spaceBefore);
annotateChange(whTokens[fromPos], prevValue, prevAnot);
}
}
return whTokens;
}
Use of org.languagetool.chunking.ChunkTag in the languagetool project (languagetool-org): class CatalanTagger, method tag.
@Override
public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) throws IOException {
  final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
  final IStemmer dictLookup = new DictionaryLookup(getDictionary());
  int pos = 0;
  for (String word : sentenceTokens) {
    // Hack: note whether the word already contains a typewriter apostrophe,
    // then normalize the typographic apostrophe so that all rules and
    // dictionary entries can work with the typewriter form.
    boolean hasTypewriterApostrophe = false;
    if (word.length() > 1) {
      hasTypewriterApostrophe = word.contains("'");
      word = word.replace("’", "'");
    }
    final String lowerWord = word.toLowerCase(conversionLocale);
    final boolean isLowercase = word.equals(lowerWord);
    final boolean isMixedCase = StringTools.isMixedCase(word);
    final List<AnalyzedToken> readings = new ArrayList<>();
    // normal case:
    addTokens(asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(word)), readings);
    // word with lowercase word tags:
    if (!isLowercase && !isMixedCase) {
      addTokens(asAnalyzedTokenListForTaggedWords(word, getWordTagger().tag(lowerWord)), readings);
    }
    // additional tagging with prefixes
    if (readings.isEmpty() && !isMixedCase) {
      addTokens(additionalTags(word, dictLookup), readings);
    }
    // fall back to an untagged reading so every word yields a result
    if (readings.isEmpty()) {
      readings.add(new AnalyzedToken(word, null, null));
    }
    AnalyzedTokenReadings result = new AnalyzedTokenReadings(readings, pos);
    if (hasTypewriterApostrophe) {
      // flag the token via a pseudo chunk tag so rules can detect it
      List<ChunkTag> chunkTags = new ArrayList<>();
      chunkTags.add(new ChunkTag("containsTypewriterApostrophe"));
      result.setChunkTags(chunkTags);
    }
    tokenReadings.add(result);
    pos += word.length();
  }
  return tokenReadings;
}
Use of org.languagetool.chunking.ChunkTag in the languagetool project (languagetool-org): class XMLRuleHandler, method setToken.
/**
 * SAX callback for a {@code <token>} element: resets per-token parser state
 * and copies the element's attributes into the handler's fields.
 */
protected void setToken(Attributes attrs) {
  inToken = true;
  // a preceding phrase ends the previous pattern token list
  if (lastPhrase) {
    patternTokens.clear();
  }
  lastPhrase = false;
  tokenNegated = YES.equals(attrs.getValue(NEGATE));
  tokenInflected = YES.equals(attrs.getValue(INFLECTED));
  String skipValue = attrs.getValue(SKIP);
  if (skipValue != null) {
    skipPos = Integer.parseInt(skipValue);
  }
  String minValue = attrs.getValue(MIN);
  if (minValue != null) {
    minOccurrence = Integer.parseInt(minValue);
  }
  String maxValue = attrs.getValue(MAX);
  if (maxValue != null) {
    maxOccurrence = Integer.parseInt(maxValue);
  }
  elements = new StringBuilder();
  // POSElement creation
  String posValue = attrs.getValue(POSTAG);
  if (posValue != null) {
    posToken = posValue;
    posRegExp = YES.equals(attrs.getValue(POSTAG_REGEXP));
    posNegation = YES.equals(attrs.getValue(NEGATE_POS));
  }
  String chunkValue = attrs.getValue(CHUNKTAG);
  if (chunkValue != null) {
    chunkTag = new ChunkTag(chunkValue);
  }
  regExpression = YES.equals(attrs.getValue(REGEXP));
  String spaceBeforeValue = attrs.getValue(SPACEBEFORE);
  if (spaceBeforeValue != null) {
    tokenSpaceBefore = YES.equals(spaceBeforeValue);
    tokenSpaceBeforeSet = !IGNORE.equals(spaceBeforeValue);
  }
  // tokens inside and/or groups do not count individually
  if (!inAndGroup && !inOrGroup) {
    tokenCounter++;
  }
  String caseSensitiveValue = attrs.getValue(CASE_SENSITIVE);
  if (caseSensitiveValue != null) {
    tokenLevelCaseSet = true;
    tokenLevelCaseSensitive = YES.equals(caseSensitiveValue);
  } else {
    tokenLevelCaseSensitive = false;
    tokenLevelCaseSet = false;
  }
}
Aggregations