Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
The class GermanHelperTest, method testHasReadingOfType.
@Test
public void testHasReadingOfType() throws Exception {
  AnalyzedTokenReadings readings = new AnalyzedTokenReadings(new AnalyzedToken("der", "ART:DEF:DAT:SIN:FEM", null), 0);
  assertTrue(GermanHelper.hasReadingOfType(readings, GermanToken.POSType.DETERMINER));
  assertFalse(GermanHelper.hasReadingOfType(readings, GermanToken.POSType.NOMEN));
}
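For reference, a minimal sketch of what a check like GermanHelper.hasReadingOfType can look like: it scans all readings of the token for a POS tag whose prefix identifies the requested type. This is an illustrative re-implementation, not LanguageTool's actual code; the prefix mapping (e.g. "ART" marking determiners in the German tagset) is an assumption.

import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;

// Illustration only: true if any reading of the token has a POS tag
// starting with the given prefix (e.g. "ART" in "ART:DEF:DAT:SIN:FEM").
final class PosTypeChecks {
  static boolean hasReadingWithTagPrefix(AnalyzedTokenReadings readings, String tagPrefix) {
    for (AnalyzedToken reading : readings) {  // AnalyzedTokenReadings is iterable over its readings
      String posTag = reading.getPOSTag();
      if (posTag != null && posTag.startsWith(tagPrefix)) {
        return true;
      }
    }
    return false;
  }
}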
Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
The class LanguageToolFilter, method incrementToken.
@Override
public boolean incrementToken() throws IOException {
  if (posStack.size() > 0) {
    String pop = posStack.pop();
    restoreState(current);
    termAtt.append(pop);
    posIncrAtt.setPositionIncrement(0);
    typeAtt.setType("pos");
    return true;
  }
  if (tokenIter == null || !tokenIter.hasNext()) {
    // there are no remaining tokens from the current sentence... are there more sentences?
    if (input.incrementToken()) {
      // a new sentence is available: process it.
      String sentenceStr = termAtt.toString();
      collectedInput.append(sentenceStr);
      if (sentenceStr.length() >= 255) {
        // the tokenizer has cut off the sentence, so collect the parts and analyze them
        // later. See https://github.com/languagetool-org/languagetool/issues/364
        return true;
      } else {
        sentenceStr = collectedInput.toString();
        collectedInput.setLength(0);
      }
      AnalyzedSentence sentence = languageTool.getAnalyzedSentence(sentenceStr);
      List<AnalyzedTokenReadings> tokenBuffer = Arrays.asList(sentence.getTokens());
      tokenIter = tokenBuffer.iterator();
      /*
       * it should not be possible to have a sentence with 0 words, check just in case. returning
       * EOS isn't the best either, but it's the behavior of the original code.
       */
      if (!tokenIter.hasNext()) {
        return false;
      }
    } else {
      // no more sentences, end of stream!
      return false;
    }
  }
  // clear the attributes, as we are creating new tokens:
  clearAttributes();
  AnalyzedTokenReadings tr = tokenIter.next();
  // add POS tag for sentence start:
  if (tr.isSentenceStart()) {
    // TODO: would be needed so negated tokens can match on something (see testNegatedMatchAtSentenceStart())
    // but breaks other cases:
    //termAtt.append("SENT_START");
    typeAtt.setType("pos");
    String posTag = tr.getAnalyzedToken(0).getPOSTag();
    String lemma = tr.getAnalyzedToken(0).getLemma();
    if (toLowerCase) {
      termAtt.append(POS_PREFIX.toLowerCase()).append(posTag.toLowerCase());
      if (lemma != null) {
        termAtt.append(LEMMA_PREFIX.toLowerCase()).append(lemma.toLowerCase());
      }
    } else {
      termAtt.append(POS_PREFIX).append(posTag);
      if (lemma != null) {
        termAtt.append(LEMMA_PREFIX).append(lemma);
      }
    }
    return true;
  }
  // skip whitespace tokens:
  if (tr.isWhitespace()) {
    return this.incrementToken();
  }
  offsetAtt.setOffset(tr.getStartPos(), tr.getEndPos());
  for (AnalyzedToken token : tr) {
    if (token.getPOSTag() != null) {
      if (toLowerCase) {
        posStack.push(POS_PREFIX.toLowerCase() + token.getPOSTag().toLowerCase());
      } else {
        posStack.push(POS_PREFIX + token.getPOSTag());
      }
    }
    if (token.getLemma() != null) {
      if (toLowerCase) {
        posStack.push(LEMMA_PREFIX.toLowerCase() + token.getLemma().toLowerCase());
      } else {
        // chances are good this is the same for all loop iterations, store it anyway...
        posStack.push(LEMMA_PREFIX + token.getLemma());
      }
    }
  }
  current = captureState();
  if (toLowerCase) {
    termAtt.append(tr.getAnalyzedToken(0).getToken().toLowerCase());
  } else {
    termAtt.append(tr.getAnalyzedToken(0).getToken());
  }
  return true;
}
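The filter first drains posStack, emitting one extra token per POS tag and lemma at the same position as the word (position increment 0), before advancing to the next real token. A hedged usage sketch follows, assuming the constructor LanguageToolFilter(TokenStream, JLanguageTool, boolean) implied by the fields used above; KeywordTokenizer stands in for a sentence tokenizer here, since it emits the whole input as a single token.

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.languagetool.JLanguageTool;
import org.languagetool.language.English;

public class LanguageToolFilterDemo {
  public static void main(String[] args) throws Exception {
    Tokenizer sentences = new KeywordTokenizer();  // emits the whole input as one "sentence"
    sentences.setReader(new StringReader("This is a test."));
    // LanguageToolFilter is assumed to be on the classpath; the constructor
    // signature is inferred from the fields used in incrementToken() above.
    try (TokenStream stream = new LanguageToolFilter(sentences, new JLanguageTool(new English()), false)) {
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      TypeAttribute type = stream.addAttribute(TypeAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        // prints each term with its type ("word" for tokens, "pos" for tag/lemma tokens)
        System.out.println(type.type() + ": " + term.toString());
      }
      stream.end();
    }
  }
}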
Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
The class AbstractCompoundRule, method match.
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
  List<RuleMatch> ruleMatches = new ArrayList<>();
  AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace();
  RuleMatch prevRuleMatch = null;
  Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<>(MAX_TERMS);
  for (int i = 0; i < tokens.length + MAX_TERMS - 1; i++) {
    AnalyzedTokenReadings token;
    // we need to extend the token list so we find matches at the end of the original list:
    if (i >= tokens.length) {
      token = new AnalyzedTokenReadings(new AnalyzedToken("", "", null), prevTokens.peek().getStartPos());
    } else {
      token = tokens[i];
    }
    if (i == 0) {
      addToQueue(token, prevTokens);
      continue;
    }
    if (token.isImmunized()) {
      continue;
    }
    AnalyzedTokenReadings firstMatchToken = prevTokens.peek();
    List<String> stringsToCheck = new ArrayList<>();
    // original upper/lowercase spelling
    List<String> origStringsToCheck = new ArrayList<>();
    Map<String, AnalyzedTokenReadings> stringToToken = getStringToTokenMap(prevTokens, stringsToCheck, origStringsToCheck);
    // make sure we match longer strings first:
    for (int k = stringsToCheck.size() - 1; k >= 0; k--) {
      String stringToCheck = stringsToCheck.get(k);
      String origStringToCheck = origStringsToCheck.get(k);
      if (getCompoundRuleData().getIncorrectCompounds().contains(stringToCheck)) {
        AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
        String msg = null;
        List<String> replacement = new ArrayList<>();
        if (!getCompoundRuleData().getNoDashSuggestion().contains(stringToCheck)) {
          replacement.add(origStringToCheck.replace(' ', '-'));
          msg = withHyphenMessage;
        }
        if (isNotAllUppercase(origStringToCheck) && !getCompoundRuleData().getOnlyDashSuggestion().contains(stringToCheck)) {
          replacement.add(mergeCompound(origStringToCheck));
          msg = withoutHyphenMessage;
        }
        String[] parts = stringToCheck.split(" ");
        if (parts.length > 0 && parts[0].length() == 1) {
          replacement.clear();
          replacement.add(origStringToCheck.replace(' ', '-'));
          msg = withHyphenMessage;
        } else if (replacement.isEmpty() || replacement.size() == 2) {
          // isEmpty shouldn't happen
          msg = withOrWithoutHyphenMessage;
        }
        RuleMatch ruleMatch = new RuleMatch(this, firstMatchToken.getStartPos(), atr.getEndPos(), msg, shortDesc);
        ruleMatch.setSuggestedReplacements(replacement);
        // avoid duplicate matches:
        if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) {
          prevRuleMatch = ruleMatch;
          break;
        }
        prevRuleMatch = ruleMatch;
        ruleMatches.add(ruleMatch);
        break;
      }
    }
    addToQueue(token, prevTokens);
  }
  return toRuleMatchArray(ruleMatches);
}
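Here prevTokens acts as a sliding window over the last MAX_TERMS tokens. The addToQueue helper is not shown above; a minimal sketch of the behavior the loop relies on (an assumption, not the verbatim implementation) would be:

// Sketch: keep at most MAX_TERMS tokens in the bounded queue; when it is
// full, evict the oldest token before inserting the new one.
private void addToQueue(AnalyzedTokenReadings token, Queue<AnalyzedTokenReadings> prevTokens) {
  boolean inserted = prevTokens.offer(token);  // returns false when the queue is at capacity
  if (!inserted) {
    prevTokens.poll();
    prevTokens.offer(token);
  }
}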
Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
The class AbstractCompoundRule, method getStringToTokenMap.
private Map<String, AnalyzedTokenReadings> getStringToTokenMap(Queue<AnalyzedTokenReadings> prevTokens, List<String> stringsToCheck, List<String> origStringsToCheck) {
  StringBuilder sb = new StringBuilder();
  Map<String, AnalyzedTokenReadings> stringToToken = new HashMap<>();
  int j = 0;
  for (AnalyzedTokenReadings atr : prevTokens) {
    sb.append(' ');
    sb.append(atr.getToken());
    if (j >= 1) {
      String stringToCheck = normalize(sb.toString());
      stringsToCheck.add(stringToCheck);
      origStringsToCheck.add(sb.toString().trim());
      if (!stringToToken.containsKey(stringToCheck)) {
        stringToToken.put(stringToCheck, atr);
      }
    }
    j++;
  }
  return stringToToken;
}
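In other words, for the current window of tokens this builds every prefix phrase of two or more tokens and maps each normalized phrase to the token where it ends. A self-contained illustration of that string-building logic (normalize() here is a trim-only stand-in for the rule's real normalization):

import java.util.Arrays;
import java.util.List;

public class CompoundCandidatesDemo {
  static String normalize(String s) {
    return s.trim();  // stand-in for AbstractCompoundRule.normalize()
  }

  public static void main(String[] args) {
    // for the token window ["nach", "wie", "vor"] this prints the multi-word
    // candidates the compound rule would look up: "nach wie" and "nach wie vor"
    List<String> prevTokens = Arrays.asList("nach", "wie", "vor");
    StringBuilder sb = new StringBuilder();
    int j = 0;
    for (String token : prevTokens) {
      sb.append(' ').append(token);
      if (j >= 1) {  // only phrases of two or more tokens are candidates
        System.out.println(normalize(sb.toString()));
      }
      j++;
    }
  }
}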
Use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.
The class AbstractSimpleReplaceRule, method match.
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
  List<RuleMatch> ruleMatches = new ArrayList<>();
  AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
  for (AnalyzedTokenReadings tokenReadings : tokens) {
    // skip the artificial sentence-start token (SENT_START)
    if (JLanguageTool.SENTENCE_START_TAGNAME.equals(tokenReadings.getAnalyzedToken(0).getPOSTag())) {
      continue;
    }
    // skip immunized tokens and tokens the spell checker is told to ignore
    if (tokenReadings.isImmunized() || tokenReadings.isIgnoredBySpeller()) {
      continue;
    }
    String originalTokenStr = tokenReadings.getToken();
    if (ignoreTaggedWords && isTagged(tokenReadings)) {
      continue;
    }
    String tokenString = cleanup(originalTokenStr);
    // try first with the original word, then with the all-lowercase version
    List<String> possibleReplacements = getWrongWords().get(originalTokenStr);
    if (possibleReplacements == null) {
      possibleReplacements = getWrongWords().get(tokenString);
    }
    if (possibleReplacements == null && checkLemmas) {
      possibleReplacements = new ArrayList<>();
      List<String> lemmas = new ArrayList<>();
      for (AnalyzedToken analyzedToken : tokenReadings.getReadings()) {
        String lemma = analyzedToken.getLemma();
        if (lemma != null && getWrongWords().containsKey(lemma) && !lemmas.contains(lemma)) {
          lemmas.add(cleanup(lemma));
        }
      }
      for (String lemma : lemmas) {
        List<String> replacements = getWrongWords().get(lemma);
        if (replacements != null) {
          possibleReplacements.addAll(replacements);
        }
      }
      possibleReplacements = possibleReplacements.stream().distinct().collect(Collectors.toList());
    }
    if (possibleReplacements != null && possibleReplacements.size() > 0) {
      List<String> replacements = new ArrayList<>();
      replacements.addAll(possibleReplacements);
      if (replacements.contains(originalTokenStr)) {
        replacements.remove(originalTokenStr);
      }
      if (replacements.size() > 0) {
        RuleMatch potentialRuleMatch = createRuleMatch(tokenReadings, replacements);
        ruleMatches.add(potentialRuleMatch);
      }
    }
  }
  return toRuleMatchArray(ruleMatches);
}
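From the calls above, a concrete subclass mainly has to supply getWrongWords(), mapping each wrong word (or lemma) to its suggested replacements. A hedged sketch of such a subclass follows; the constructor and the exact set of required overrides are assumptions about the real base class, not copied from it.

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.ResourceBundle;

// Hypothetical minimal rule: flags "alot" and suggests "a lot".
public class MySimpleReplaceRule extends AbstractSimpleReplaceRule {

  private static final Map<String, List<String>> WRONG_WORDS =
      Collections.singletonMap("alot", Arrays.asList("a lot"));

  public MySimpleReplaceRule(ResourceBundle messages) throws IOException {
    super(messages);  // assumed base-class constructor
  }

  @Override
  public Map<String, List<String>> getWrongWords() {
    return WRONG_WORDS;
  }

  @Override
  public String getId() {
    return "MY_SIMPLE_REPLACE";
  }

  @Override
  public String getDescription() {
    return "Checks for words from a simple wrong-word list";
  }
}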