Use of org.languagetool.AnalyzedTokenReadings in the LanguageTool project (languagetool-org).
Example: the Unifier class, method checkNext.
/**
 * Tests whether {@code aToken} unifies with the equivalence types already
 * matched for the previously collected tokens. Called once all features have
 * been read in (i.e. when {@code allFeatsIn} is true; otherwise this is a
 * no-op returning false).
 *
 * @param aToken token reading to be tested against the stored equivalences
 * @param uFeatures unification features to test; a feature with a null or
 *   empty type list falls back to all types registered in
 *   {@code equivalenceFeatures} for that feature
 * @return true if at least one previously matched token position shares all
 *   of the requested features with {@code aToken}
 */
private boolean checkNext(AnalyzedToken aToken, Map<String, List<String>> uFeatures) {
boolean anyFeatUnified = false;
// Work on a copy of the per-token results; the shared field is only
// replaced at the end, when unification succeeded.
List<Boolean> tokenFeaturesFound = new ArrayList<>(tmpFeaturesFound);
Map<String, Set<String>> equivalencesMatchedHere = new ConcurrentHashMap<>();
if (allFeatsIn) {
// Check this reading against every previously collected token position.
for (int i = 0; i < tokCnt; i++) {
boolean allFeatsUnified = true;
for (Map.Entry<String, List<String>> feat : uFeatures.entrySet()) {
boolean featUnified = false;
List<String> types = feat.getValue();
if (types == null || types.isEmpty()) {
// No explicit types requested: test all registered types for this feature.
types = equivalenceFeatures.get(feat.getKey());
}
for (String typeName : types) {
// Only re-test equivalence types that already matched for token i.
if (equivalencesMatched.get(i).containsKey(feat.getKey()) && equivalencesMatched.get(i).get(feat.getKey()).contains(typeName)) {
PatternToken testElem = equivalenceTypes.get(new EquivalenceTypeLocator(feat.getKey(), typeName));
boolean matched = testElem.isMatched(aToken);
featUnified = featUnified || matched;
//Stores equivalences to be kept
if (matched) {
if (!equivalencesToBeKept.containsKey(feat.getKey())) {
Set<String> typeSet = new HashSet<>();
typeSet.add(typeName);
equivalencesToBeKept.put(feat.getKey(), typeSet);
} else {
equivalencesToBeKept.get(feat.getKey()).add(typeName);
}
if (!equivalencesMatchedHere.containsKey(feat.getKey())) {
// just for this reading
Set<String> typeSet = new HashSet<>();
typeSet.add(typeName);
equivalencesMatchedHere.put(feat.getKey(), typeSet);
} else {
equivalencesMatchedHere.get(feat.getKey()).add(typeName);
}
}
}
}
// Every requested feature must unify for this token position to count.
allFeatsUnified &= featUnified;
}
tokenFeaturesFound.set(i, tokenFeaturesFound.get(i) || allFeatsUnified);
anyFeatUnified = anyFeatUnified || allFeatsUnified;
}
if (anyFeatUnified) {
if (tokSequence.size() == readingsCounter) {
// First reading at this sequence position: start a new entry.
tokSequence.add(new AnalyzedTokenReadings(aToken, 0));
List<Map<String, Set<String>>> equivList = new ArrayList<>();
equivList.add(equivalencesMatchedHere);
tokSequenceEquivalences.add(equivList);
} else {
if (readingsCounter < tokSequence.size()) {
// Additional reading of an already-collected token.
tokSequence.get(readingsCounter).addReading(aToken);
tokSequenceEquivalences.get(readingsCounter).add(equivalencesMatchedHere);
} else {
// NOTE(review): position is past the collected sequence — treated as failure.
anyFeatUnified = false;
}
}
// Commit the per-token feature results only on success.
tmpFeaturesFound = tokenFeaturesFound;
}
}
return anyFeatUnified;
}
Use of org.languagetool.AnalyzedTokenReadings in the LanguageTool project (languagetool-org).
Example: the Unifier class, method isSatisfied.
/**
 * Tests if a token has shared features with other tokens.
 * While features are still being read in ({@code allFeatsIn} false), the
 * token's matching equivalence types are recorded per token position; once
 * all features are in, the check is delegated to {@code checkNext}.
 *
 * @param aToken token to be tested
 * @param uFeatures features to be tested; must not be null
 * @return true if the token shares this type of feature with other tokens
 * @throws RuntimeException if {@code uFeatures} is null
 */
protected final boolean isSatisfied(AnalyzedToken aToken, Map<String, List<String>> uFeatures) {
// Nothing matched so far and the feature set is closed: cannot unify.
if (allFeatsIn && equivalencesMatched.isEmpty()) {
return false;
}
if (uFeatures == null) {
throw new RuntimeException("isSatisfied called without features being set");
}
unificationFeats = uFeatures;
boolean unified = true;
if (allFeatsIn) {
// Feature set is complete: test against previously matched tokens.
unified = checkNext(aToken, uFeatures);
} else {
// Make sure there is a per-token map for the current token index.
while (equivalencesMatched.size() <= tokCnt) {
equivalencesMatched.add(new ConcurrentHashMap<>());
}
for (Map.Entry<String, List<String>> feat : uFeatures.entrySet()) {
List<String> types = feat.getValue();
if (types == null || types.isEmpty()) {
// No explicit types requested: use all registered types for this feature.
types = equivalenceFeatures.get(feat.getKey());
}
for (String typeName : types) {
PatternToken testElem = equivalenceTypes.get(new EquivalenceTypeLocator(feat.getKey(), typeName));
if (testElem == null) {
// Unknown feature/type combination: fail outright.
return false;
}
if (testElem.isMatched(aToken)) {
// Record this matching equivalence type for the current token.
if (!equivalencesMatched.get(tokCnt).containsKey(feat.getKey())) {
Set<String> typeSet = new HashSet<>();
typeSet.add(typeName);
equivalencesMatched.get(tokCnt).put(feat.getKey(), typeSet);
} else {
equivalencesMatched.get(tokCnt).get(feat.getKey()).add(typeName);
}
}
}
// The token must match at least one type of every requested feature.
unified = equivalencesMatched.get(tokCnt).containsKey(feat.getKey());
if (!unified) {
// Discard the partial record for this token and stop checking.
equivalencesMatched.remove(tokCnt);
break;
}
}
if (unified) {
if (tokCnt == 0 || tokSequence.isEmpty()) {
// First token of the sequence: start new entries.
tokSequence.add(new AnalyzedTokenReadings(aToken, 0));
List<Map<String, Set<String>>> equivList = new ArrayList<>();
equivList.add(equivalencesMatched.get(tokCnt));
tokSequenceEquivalences.add(equivList);
} else {
// Additional reading of the first token in the sequence.
tokSequence.get(0).addReading(aToken);
tokSequenceEquivalences.get(0).add(equivalencesMatched.get(tokCnt));
}
tokCnt++;
}
}
return unified;
}
Use of org.languagetool.AnalyzedTokenReadings in the LanguageTool project (languagetool-org).
Example: the MultiWordChunker class, method setAndAnnotate.
/**
 * Builds a copy of {@code oldReading} with {@code newReading} appended and
 * records the change in the historical annotations.
 *
 * @param oldReading the reading to copy (not modified)
 * @param newReading the additional reading to attach to the copy
 * @return the annotated copy carrying the extra reading
 */
private AnalyzedTokenReadings setAndAnnotate(AnalyzedTokenReadings oldReading, AnalyzedToken newReading) {
  // Capture the pre-change state for the annotation trail.
  String before = oldReading.toString();
  String previousAnnotations = oldReading.getHistoricalAnnotations();
  AnalyzedTokenReadings annotated = new AnalyzedTokenReadings(oldReading.getReadings(), oldReading.getStartPos());
  annotated.setWhitespaceBefore(oldReading.isWhitespaceBefore());
  annotated.addReading(newReading);
  // Record old -> new token text in the history, then carry over chunk tags.
  annotated.setHistoricalAnnotations(annotateToken(previousAnnotations, before, annotated.toString()));
  annotated.setChunkTags(oldReading.getChunkTags());
  return annotated;
}
Use of org.languagetool.AnalyzedTokenReadings in the LanguageTool project (languagetool-org).
Example: the DisambiguationPatternRuleReplacer class, method executeAction.
/**
 * Applies the disambiguation action of the matched rule to a copy of the
 * token array and returns that modified copy.
 *
 * @param sentence analyzed sentence in which the match occurred
 * @param whiteTokens sentence tokens including whitespace; cloned, never mutated
 * @param unifiedTokens tokens produced by unification, used only by the UNIFY action (may be null)
 * @param firstMatchToken index of the first matched token
 * @param lastMatchToken index of the last matched token, or -1 if none
 * @param matchingTokens number of tokens matched by the pattern
 * @param tokenPositions per-pattern-element token counts, used for position correction
 * @return a copy of {@code whiteTokens} with the rule's action applied
 */
private AnalyzedTokenReadings[] executeAction(AnalyzedSentence sentence, AnalyzedTokenReadings[] whiteTokens, AnalyzedTokenReadings[] unifiedTokens, int firstMatchToken, int lastMatchToken, int matchingTokens, int[] tokenPositions) {
// Work on a clone so the caller's array is left untouched.
AnalyzedTokenReadings[] whTokens = whiteTokens.clone();
DisambiguationPatternRule rule = (DisambiguationPatternRule) this.rule;
int correctedStPos = 0;
int startPositionCorrection = rule.getStartPositionCorrection();
int endPositionCorrection = rule.getEndPositionCorrection();
int matchingTokensWithCorrection = matchingTokens;
List<Integer> tokenPositionList = new ArrayList<>();
for (int i : tokenPositions) {
tokenPositionList.add(i);
}
if (startPositionCorrection > 0) {
//token positions are shifted by 1
correctedStPos--;
for (int j = 0; j < pTokensMatched.size(); j++) {
if (!pTokensMatched.get(j)) {
// add zero-length token corresponding to the non-matching pattern element so that position count is fine
tokenPositionList.add(j, 0);
}
}
// Sum the token counts of the skipped leading pattern elements.
for (int l = 0; l <= startPositionCorrection && tokenPositionList.size() > l; l++) {
correctedStPos += tokenPositionList.get(l);
}
// adjust to make sure the token count is fine as it's checked later
int w = startPositionCorrection;
for (int j = 0; j <= w; j++) {
if (j < pTokensMatched.size() && !pTokensMatched.get(j)) {
startPositionCorrection--;
}
}
}
if (endPositionCorrection < 0) {
// adjust the end position correction if one of the elements has not been matched
for (int d = startPositionCorrection; d < pTokensMatched.size(); d++) {
if (!pTokensMatched.get(d)) {
endPositionCorrection++;
}
}
}
if (lastMatchToken != -1) {
// Account for tokens covered by the match beyond the raw matching count.
int maxPosCorrection = Math.max((lastMatchToken + 1 - (firstMatchToken + correctedStPos)) - matchingTokens, 0);
matchingTokensWithCorrection += maxPosCorrection;
}
int fromPos = sentence.getOriginalPosition(firstMatchToken + correctedStPos);
boolean spaceBefore = whTokens[fromPos].isWhitespaceBefore();
DisambiguationPatternRule.DisambiguatorAction disAction = rule.getAction();
AnalyzedToken[] newTokenReadings = rule.getNewTokenReadings();
Match matchElement = rule.getMatchElement();
String disambiguatedPOS = rule.getDisambiguatedPOS();
switch(disAction) {
case UNIFY:
// Replace the matched tokens with their unified readings.
if (unifiedTokens != null) {
//TODO: unifiedTokens.length is larger > matchingTokensWithCorrection in cases where there are no markers...
if (unifiedTokens.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
// Preserve the sentence-end flag if the last replaced token carried it.
if (whTokens[sentence.getOriginalPosition(firstMatchToken + correctedStPos + unifiedTokens.length - 1)].isSentenceEnd()) {
unifiedTokens[unifiedTokens.length - 1].setSentEnd();
}
for (int i = 0; i < unifiedTokens.length; i++) {
int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
unifiedTokens[i].setStartPos(whTokens[position].getStartPos());
String prevValue = whTokens[position].toString();
String prevAnot = whTokens[position].getHistoricalAnnotations();
// Keep the original chunk tags on the replacement token.
List<ChunkTag> chTags = whTokens[position].getChunkTags();
whTokens[position] = unifiedTokens[i];
whTokens[position].setChunkTags(chTags);
annotateChange(whTokens[position], prevValue, prevAnot);
}
}
}
break;
case REMOVE:
// Remove the specified readings, or filter by POS regex when none given.
if (newTokenReadings != null && newTokenReadings.length > 0) {
if (newTokenReadings.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
for (int i = 0; i < newTokenReadings.length; i++) {
int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
String prevValue = whTokens[position].toString();
String prevAnot = whTokens[position].getHistoricalAnnotations();
whTokens[position].removeReading(newTokenReadings[i]);
annotateChange(whTokens[position], prevValue, prevAnot);
}
}
} else if (!StringTools.isEmpty(disambiguatedPOS)) {
// negative filtering
Pattern p = Pattern.compile(disambiguatedPOS);
// Iterate over a snapshot so removal doesn't disturb the iteration.
AnalyzedTokenReadings tmp = new AnalyzedTokenReadings(whTokens[fromPos].getReadings(), whTokens[fromPos].getStartPos());
for (AnalyzedToken analyzedToken : tmp) {
if (analyzedToken.getPOSTag() != null) {
Matcher mPos = p.matcher(analyzedToken.getPOSTag());
if (mPos.matches()) {
int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos);
String prevValue = whTokens[position].toString();
String prevAnot = whTokens[position].getHistoricalAnnotations();
whTokens[position].removeReading(analyzedToken);
annotateChange(whTokens[position], prevValue, prevAnot);
}
}
}
}
break;
case ADD:
// Add the new readings to the matched tokens, one reading per token.
if (newTokenReadings != null) {
if (newTokenReadings.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
for (int i = 0; i < newTokenReadings.length; i++) {
String token;
int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
// An empty token in the rule means "keep the existing token text".
if (newTokenReadings[i].getToken().isEmpty()) {
token = whTokens[position].getToken();
} else {
token = newTokenReadings[i].getToken();
}
String lemma;
// A missing lemma defaults to the token text itself.
if (newTokenReadings[i].getLemma() == null) {
lemma = token;
} else {
lemma = newTokenReadings[i].getLemma();
}
AnalyzedToken newTok = new AnalyzedToken(token, newTokenReadings[i].getPOSTag(), lemma);
String prevValue = whTokens[position].toString();
String prevAnot = whTokens[position].getHistoricalAnnotations();
whTokens[position].addReading(newTok);
annotateChange(whTokens[position], prevValue, prevAnot);
}
}
}
break;
case FILTERALL:
// Filter every matched token by the POS tag of its pattern element.
for (int i = 0; i < matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection; i++) {
int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
PatternToken pToken;
if (pTokensMatched.get(i + startPositionCorrection)) {
pToken = rule.getPatternTokens().get(i + startPositionCorrection);
} else {
// Pattern element didn't match directly: find the next one that did.
int k = 1;
while (i + startPositionCorrection + k < rule.getPatternTokens().size() + endPositionCorrection && !pTokensMatched.get(i + startPositionCorrection + k)) {
k++;
}
pToken = rule.getPatternTokens().get(i + k + startPositionCorrection);
}
Match tmpMatchToken = new Match(pToken.getPOStag(), null, true, pToken.getPOStag(), null, Match.CaseConversion.NONE, false, false, Match.IncludeRange.NONE);
MatchState matchState = tmpMatchToken.createState(rule.getLanguage().getSynthesizer(), whTokens[position]);
String prevValue = whTokens[position].toString();
String prevAnot = whTokens[position].getHistoricalAnnotations();
whTokens[position] = matchState.filterReadings();
annotateChange(whTokens[position], prevValue, prevAnot);
}
break;
case IMMUNIZE:
// Mark the matched tokens as immune to further rule matches.
for (int i = 0; i < matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection; i++) {
whTokens[sentence.getOriginalPosition(firstMatchToken + correctedStPos + i)].immunize();
}
break;
case IGNORE_SPELLING:
// Exclude the matched tokens from spell checking.
for (int i = 0; i < matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection; i++) {
whTokens[sentence.getOriginalPosition(firstMatchToken + correctedStPos + i)].ignoreSpelling();
}
break;
case FILTER:
if (matchElement == null) {
// same as REPLACE if using <match>
Match tmpMatchToken = new Match(disambiguatedPOS, null, true, disambiguatedPOS, null, Match.CaseConversion.NONE, false, false, Match.IncludeRange.NONE);
boolean newPOSmatches = false;
// only apply filter rule when it matches previous tags:
for (int i = 0; i < whTokens[fromPos].getReadingsLength(); i++) {
if (!whTokens[fromPos].getAnalyzedToken(i).hasNoTag() && whTokens[fromPos].getAnalyzedToken(i).getPOSTag() != null && whTokens[fromPos].getAnalyzedToken(i).getPOSTag().matches(disambiguatedPOS)) {
newPOSmatches = true;
break;
}
}
if (newPOSmatches) {
MatchState matchState = tmpMatchToken.createState(rule.getLanguage().getSynthesizer(), whTokens[fromPos]);
String prevValue = whTokens[fromPos].toString();
String prevAnot = whTokens[fromPos].getHistoricalAnnotations();
whTokens[fromPos] = matchState.filterReadings();
annotateChange(whTokens[fromPos], prevValue, prevAnot);
}
break;
}
//fallthrough
case REPLACE:
default:
if (newTokenReadings != null && newTokenReadings.length > 0) {
// Replace each matched token with the corresponding new reading.
if (newTokenReadings.length == matchingTokensWithCorrection - startPositionCorrection + endPositionCorrection) {
for (int i = 0; i < newTokenReadings.length; i++) {
String token;
int position = sentence.getOriginalPosition(firstMatchToken + correctedStPos + i);
if ("".equals(newTokenReadings[i].getToken())) {
// empty token
token = whTokens[position].getToken();
} else {
token = newTokenReadings[i].getToken();
}
String lemma;
if (newTokenReadings[i].getLemma() == null) {
// empty lemma
lemma = token;
} else {
lemma = newTokenReadings[i].getLemma();
}
AnalyzedToken analyzedToken = new AnalyzedToken(token, newTokenReadings[i].getPOSTag(), lemma);
AnalyzedTokenReadings toReplace = new AnalyzedTokenReadings(analyzedToken, whTokens[fromPos].getStartPos());
whTokens[position] = replaceTokens(whTokens[position], toReplace);
}
}
} else if (matchElement == null) {
// No explicit readings: replace using the disambiguated POS tag,
// preferring a lemma from a reading that already carries that tag.
String lemma = "";
for (AnalyzedToken analyzedToken : whTokens[fromPos]) {
if (analyzedToken.getPOSTag() != null && analyzedToken.getPOSTag().equals(disambiguatedPOS) && analyzedToken.getLemma() != null) {
lemma = analyzedToken.getLemma();
}
}
if (StringTools.isEmpty(lemma)) {
lemma = whTokens[fromPos].getAnalyzedToken(0).getLemma();
}
AnalyzedToken analyzedToken = new AnalyzedToken(whTokens[fromPos].getToken(), disambiguatedPOS, lemma);
AnalyzedTokenReadings toReplace = new AnalyzedTokenReadings(analyzedToken, whTokens[fromPos].getStartPos());
whTokens[fromPos] = replaceTokens(whTokens[fromPos], toReplace);
} else {
// using the match element
MatchState matchElementState = matchElement.createState(rule.getLanguage().getSynthesizer(), whTokens[fromPos]);
String prevValue = whTokens[fromPos].toString();
String prevAnot = whTokens[fromPos].getHistoricalAnnotations();
whTokens[fromPos] = matchElementState.filterReadings();
whTokens[fromPos].setWhitespaceBefore(spaceBefore);
annotateChange(whTokens[fromPos], prevValue, prevAnot);
}
}
return whTokens;
}
Use of org.languagetool.AnalyzedTokenReadings in the LanguageTool project (languagetool-org).
Example: the GenericUnpairedBracketsRule class, method match.
/**
 * Scans all sentences for paired symbols and reports every opening symbol
 * that is left unmatched on the stack as a rule match.
 *
 * @param sentences the analyzed sentences of the text
 * @return the rule matches for all unpaired symbols found
 */
@Override
public final RuleMatch[] match(List<AnalyzedSentence> sentences) {
  // Stack of symbols that have not been paired up yet.
  UnsyncStack<SymbolLocator> pendingSymbols = new UnsyncStack<>();
  UnsyncStack<SymbolLocator> matchStack = new UnsyncStack<>();
  List<RuleMatch> matches = new ArrayList<>();
  int sentenceOffset = 0;
  for (AnalyzedSentence sentence : sentences) {
    AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    // Index 0 is skipped (presumably the sentence-start token — TODO confirm).
    for (int tokenIdx = 1; tokenIdx < tokens.length; tokenIdx++) {
      for (int symbolIdx = 0; symbolIdx < startSymbols.length; symbolIdx++) {
        // Stop at the first symbol definition that consumes this token.
        if (fillSymbolStack(sentenceOffset, tokens, tokenIdx, symbolIdx, pendingSymbols)) {
          break;
        }
      }
    }
    // Advance the character offset by the full length of this sentence.
    for (AnalyzedTokenReadings reading : sentence.getTokens()) {
      sentenceOffset += reading.getToken().length();
    }
  }
  // Anything still on the stack never found its partner.
  for (SymbolLocator locator : pendingSymbols) {
    RuleMatch unpaired = createMatch(matches, matchStack, locator.getStartPos(), locator.getSymbol());
    if (unpaired != null) {
      matches.add(unpaired);
    }
  }
  return toRuleMatchArray(matches);
}
End of aggregated usage examples.