Search in sources:

Example 41 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

From the class EnglishChunkerTest, the method testAddChunkTagsSingular:

@Test
public void testAddChunkTagsSingular() throws Exception {
    JLanguageTool langTool = new JLanguageTool(new English());
    List<AnalyzedSentence> analyzed = langTool.analyzeText("The abacus shows how numbers can be stored");
    List<AnalyzedTokenReadings> tokens = Arrays.asList(analyzed.get(0).getTokens());
    // Chunk the analyzed tokens in place.
    EnglishChunker chunker = new EnglishChunker();
    chunker.addChunkTags(tokens);
    // Singular noun phrase "The abacus" (token 0 is the sentence-start marker):
    assertThat(tokens.get(1).getChunkTags().toString(), is("[B-NP-singular]"));
    assertThat(tokens.get(3).getChunkTags().toString(), is("[E-NP-singular]"));
    // Plural noun phrase "numbers":
    assertThat(tokens.get(9).getChunkTags().toString(), is("[B-NP-plural, E-NP-plural]"));
}
Also used : English(org.languagetool.language.English) AnalyzedSentence(org.languagetool.AnalyzedSentence) JLanguageTool(org.languagetool.JLanguageTool) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) Test(org.junit.Test)

Example 42 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

From the class GermanHelperTest, the method testHasReadingOfType:

@Test
public void testHasReadingOfType() throws Exception {
    // "der" tagged as a definite article (dative singular feminine).
    AnalyzedToken articleReading = new AnalyzedToken("der", "ART:DEF:DAT:SIN:FEM", null);
    AnalyzedTokenReadings tokenReadings = new AnalyzedTokenReadings(articleReading, 0);
    // An article is a determiner but not a noun:
    assertTrue(GermanHelper.hasReadingOfType(tokenReadings, GermanToken.POSType.DETERMINER));
    assertFalse(GermanHelper.hasReadingOfType(tokenReadings, GermanToken.POSType.NOMEN));
}
Also used : AnalyzedToken(org.languagetool.AnalyzedToken) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) Test(org.junit.Test)

Example 43 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

From the class LanguageToolFilter, the method incrementToken:

/**
 * Lucene TokenStream contract: advances to the next token, returning false at end of stream.
 * For each word this filter first emits the word itself, then — on subsequent calls — one
 * synthetic token per POS tag and lemma of that word, all at position increment 0 and with
 * type "pos" so they overlay the word's position in the index.
 */
@Override
public boolean incrementToken() throws IOException {
    // Drain synthetic POS/lemma tokens queued for the previously emitted word first.
    if (posStack.size() > 0) {
        String pop = posStack.pop();
        // Restore the word's captured attribute state so offsets etc. match the original token.
        restoreState(current);
        termAtt.append(pop);
        // Increment 0: this synthetic token occupies the same position as the word itself.
        posIncrAtt.setPositionIncrement(0);
        typeAtt.setType("pos");
        return true;
    }
    if (tokenIter == null || !tokenIter.hasNext()) {
        // there are no remaining tokens from the current sentence... are there more sentences?
        if (input.incrementToken()) {
            // a new sentence is available: process it.
            String sentenceStr = termAtt.toString();
            collectedInput.append(sentenceStr);
            if (sentenceStr.length() >= 255) {
                // NOTE(review): a full-length (>=255 char) chunk presumably means the sentence was
                // truncated upstream, so keep buffering into collectedInput and analyze it
                // later. See https://github.com/languagetool-org/languagetool/issues/364
                return true;
            } else {
                sentenceStr = collectedInput.toString();
                collectedInput.setLength(0);
            }
            AnalyzedSentence sentence = languageTool.getAnalyzedSentence(sentenceStr);
            List<AnalyzedTokenReadings> tokenBuffer = Arrays.asList(sentence.getTokens());
            tokenIter = tokenBuffer.iterator();
            /*
         * it should not be possible to have a sentence with 0 words, check just in case. returning
         * EOS isn't the best either, but it's the behavior of the original code.
         */
            if (!tokenIter.hasNext()) {
                return false;
            }
        } else {
            // no more sentences, end of stream!
            return false;
        }
    }
    // It must clear attributes, as it is creating new tokens.
    clearAttributes();
    AnalyzedTokenReadings tr = tokenIter.next();
    // add POS tag for sentence start.
    if (tr.isSentenceStart()) {
        // TODO: would be needed so negated tokens can match on something (see testNegatedMatchAtSentenceStart())
        // but breaks other cases:
        //termAtt.append("SENT_START");
        typeAtt.setType("pos");
        String posTag = tr.getAnalyzedToken(0).getPOSTag();
        String lemma = tr.getAnalyzedToken(0).getLemma();
        // Emit the sentence-start reading as a prefixed POS (and optionally lemma) term.
        if (toLowerCase) {
            termAtt.append(POS_PREFIX.toLowerCase()).append(posTag.toLowerCase());
            if (lemma != null) {
                termAtt.append(LEMMA_PREFIX.toLowerCase()).append(lemma.toLowerCase());
            }
        } else {
            termAtt.append(POS_PREFIX).append(posTag);
            if (lemma != null) {
                termAtt.append(LEMMA_PREFIX).append(lemma);
            }
        }
        return true;
    }
    // by pass the white spaces.
    if (tr.isWhitespace()) {
        return this.incrementToken();
    }
    offsetAtt.setOffset(tr.getStartPos(), tr.getEndPos());
    // Queue every POS tag and lemma of this word; they are emitted on later calls (see top).
    for (AnalyzedToken token : tr) {
        if (token.getPOSTag() != null) {
            if (toLowerCase) {
                posStack.push(POS_PREFIX.toLowerCase() + token.getPOSTag().toLowerCase());
            } else {
                posStack.push(POS_PREFIX + token.getPOSTag());
            }
        }
        if (token.getLemma() != null) {
            if (toLowerCase) {
                posStack.push(LEMMA_PREFIX.toLowerCase() + token.getLemma().toLowerCase());
            } else {
                // chances are good this is the same for all loop iterations, store it anyway...
                posStack.push(LEMMA_PREFIX + token.getLemma());
            }
        }
    }
    // Capture this word's attribute state so the queued synthetic tokens can restore it.
    current = captureState();
    if (toLowerCase) {
        termAtt.append(tr.getAnalyzedToken(0).getToken().toLowerCase());
    } else {
        termAtt.append(tr.getAnalyzedToken(0).getToken());
    }
    return true;
}
Also used : AnalyzedSentence(org.languagetool.AnalyzedSentence) AnalyzedToken(org.languagetool.AnalyzedToken) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 44 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

From the class AbstractCompoundRule, the method match:

/**
 * Scans the sentence for known incorrect compound spellings (word sequences that should be
 * hyphenated and/or written as one word) and creates a match with the suggested correction(s)
 * for each occurrence.
 *
 * @param sentence the pre-analyzed sentence to check
 * @return the rule matches found, possibly empty
 */
@Override
public RuleMatch[] match(AnalyzedSentence sentence) {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace();
    RuleMatch prevRuleMatch = null;
    // Sliding window over the last MAX_TERMS tokens; compounds may span up to MAX_TERMS words.
    Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<>(MAX_TERMS);
    for (int i = 0; i < tokens.length + MAX_TERMS - 1; i++) {
        AnalyzedTokenReadings token;
        // we need to extend the token list so we find matches at the end of the original list:
        if (i >= tokens.length) {
            token = new AnalyzedTokenReadings(new AnalyzedToken("", "", null), prevTokens.peek().getStartPos());
        } else {
            token = tokens[i];
        }
        if (i == 0) {
            addToQueue(token, prevTokens);
            continue;
        }
        // Immunized tokens were marked elsewhere as exempt from matching.
        if (token.isImmunized()) {
            continue;
        }
        AnalyzedTokenReadings firstMatchToken = prevTokens.peek();
        List<String> stringsToCheck = new ArrayList<>();
        // original upper/lowercase spelling
        List<String> origStringsToCheck = new ArrayList<>();
        Map<String, AnalyzedTokenReadings> stringToToken = getStringToTokenMap(prevTokens, stringsToCheck, origStringsToCheck);
        // iterate in reverse so we match longer strings first (longest match wins):
        for (int k = stringsToCheck.size() - 1; k >= 0; k--) {
            String stringToCheck = stringsToCheck.get(k);
            String origStringToCheck = origStringsToCheck.get(k);
            if (getCompoundRuleData().getIncorrectCompounds().contains(stringToCheck)) {
                AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
                String msg = null;
                List<String> replacement = new ArrayList<>();
                // Suggest the hyphenated form unless this compound is marked no-dash:
                if (!getCompoundRuleData().getNoDashSuggestion().contains(stringToCheck)) {
                    replacement.add(origStringToCheck.replace(' ', '-'));
                    msg = withHyphenMessage;
                }
                // Suggest the merged (one-word) form unless all-uppercase or marked dash-only:
                if (isNotAllUppercase(origStringToCheck) && !getCompoundRuleData().getOnlyDashSuggestion().contains(stringToCheck)) {
                    replacement.add(mergeCompound(origStringToCheck));
                    msg = withoutHyphenMessage;
                }
                String[] parts = stringToCheck.split(" ");
                // A single-letter first part forces the hyphenated suggestion only:
                if (parts.length > 0 && parts[0].length() == 1) {
                    replacement.clear();
                    replacement.add(origStringToCheck.replace(' ', '-'));
                    msg = withHyphenMessage;
                } else if (replacement.isEmpty() || replacement.size() == 2) {
                    // isEmpty shouldn't happen
                    msg = withOrWithoutHyphenMessage;
                }
                RuleMatch ruleMatch = new RuleMatch(this, firstMatchToken.getStartPos(), atr.getEndPos(), msg, shortDesc);
                ruleMatch.setSuggestedReplacements(replacement);
                // avoid duplicate matches:
                if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) {
                    prevRuleMatch = ruleMatch;
                    break;
                }
                prevRuleMatch = ruleMatch;
                ruleMatches.add(ruleMatch);
                break;
            }
        }
        addToQueue(token, prevTokens);
    }
    return toRuleMatchArray(ruleMatches);
}
Also used : AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) AnalyzedToken(org.languagetool.AnalyzedToken) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue)

Example 45 with AnalyzedTokenReadings

use of org.languagetool.AnalyzedTokenReadings in project languagetool by languagetool-org.

From the class AbstractCompoundRule, the method getStringToTokenMap:

/**
 * Builds every multi-word (two or more tokens) prefix phrase of the queued tokens.
 * Normalized phrases are appended to {@code stringsToCheck}, the original-case phrases to
 * {@code origStringsToCheck}, and each normalized phrase is mapped to the last token it ends
 * with (first mapping wins on duplicates).
 */
private Map<String, AnalyzedTokenReadings> getStringToTokenMap(Queue<AnalyzedTokenReadings> prevTokens, List<String> stringsToCheck, List<String> origStringsToCheck) {
    Map<String, AnalyzedTokenReadings> phraseToLastToken = new HashMap<>();
    StringBuilder phrase = new StringBuilder();
    int tokenCount = 0;
    for (AnalyzedTokenReadings readings : prevTokens) {
        phrase.append(' ').append(readings.getToken());
        // Only phrases of at least two tokens are candidate compounds.
        if (tokenCount >= 1) {
            String normalized = normalize(phrase.toString());
            stringsToCheck.add(normalized);
            origStringsToCheck.add(phrase.toString().trim());
            // Keep the first token we saw for a given normalized phrase.
            if (!phraseToLastToken.containsKey(normalized)) {
                phraseToLastToken.put(normalized, readings);
            }
        }
        tokenCount++;
    }
    return phraseToLastToken;
}
Also used : AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Aggregations

AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)116 AnalyzedToken (org.languagetool.AnalyzedToken)48 ArrayList (java.util.ArrayList)47 AnalyzedSentence (org.languagetool.AnalyzedSentence)21 Test (org.junit.Test)16 RuleMatch (org.languagetool.rules.RuleMatch)14 Matcher (java.util.regex.Matcher)13 IOException (java.io.IOException)7 Nullable (org.jetbrains.annotations.Nullable)6 JLanguageTool (org.languagetool.JLanguageTool)6 Pattern (java.util.regex.Pattern)5 ChunkTag (org.languagetool.chunking.ChunkTag)5 English (org.languagetool.language.English)3 TaggedWord (org.languagetool.tagging.TaggedWord)3 InputStream (java.io.InputStream)2 HashMap (java.util.HashMap)2 List (java.util.List)2 Scanner (java.util.Scanner)2 TreeSet (java.util.TreeSet)2 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)2