Search in sources:

Example 6 with AnalyzedSentence

use of org.languagetool.AnalyzedSentence in project languagetool by languagetool-org.

From the class EnglishChunkerTest, method testContractions.

@Test
public void testContractions() throws Exception {
    JLanguageTool tool = new JLanguageTool(new English());
    AnalyzedSentence sentence = tool.getAnalyzedSentence("I'll be there");
    AnalyzedTokenReadings[] readings = sentence.getTokens();
    // "I" opens a singular noun phrase.
    assertThat(readings[1].getChunkTags().get(0), is(new ChunkTag("B-NP-singular")));
    // The apostrophe gets no chunk tag because our tokenization differs from the chunker's.
    assertThat(readings[2].getChunkTags().size(), is(0));
    // Likewise "ll" cannot be mapped back to a chunk.
    assertThat(readings[3].getChunkTags().size(), is(0));
    // The token at index 5 (presumably "be") is inside the verb phrase.
    assertThat(readings[5].getChunkTags().get(0), is(new ChunkTag("I-VP")));
}
Also used : English(org.languagetool.language.English) AnalyzedSentence(org.languagetool.AnalyzedSentence) JLanguageTool(org.languagetool.JLanguageTool) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) Test(org.junit.Test)

Example 7 with AnalyzedSentence

use of org.languagetool.AnalyzedSentence in project languagetool by languagetool-org.

From the class EnglishChunkerTest, method testAddChunkTagsSingular.

@Test
public void testAddChunkTagsSingular() throws Exception {
    EnglishChunker englishChunker = new EnglishChunker();
    JLanguageTool tool = new JLanguageTool(new English());
    List<AnalyzedSentence> analyzed = tool.analyzeText("The abacus shows how numbers can be stored");
    List<AnalyzedTokenReadings> tokenReadings = Arrays.asList(analyzed.get(0).getTokens());
    englishChunker.addChunkTags(tokenReadings);
    // "The" begins and "abacus" ends the singular noun phrase "The abacus":
    assertThat(tokenReadings.get(1).getChunkTags().toString(), is("[B-NP-singular]"));
    assertThat(tokenReadings.get(3).getChunkTags().toString(), is("[E-NP-singular]"));
    // "numbers" carries both the begin and end tag of a plural noun phrase:
    assertThat(tokenReadings.get(9).getChunkTags().toString(), is("[B-NP-plural, E-NP-plural]"));
}
Also used : English(org.languagetool.language.English) AnalyzedSentence(org.languagetool.AnalyzedSentence) JLanguageTool(org.languagetool.JLanguageTool) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings) Test(org.junit.Test)

Example 8 with AnalyzedSentence

use of org.languagetool.AnalyzedSentence in project languagetool by languagetool-org.

From the class LanguageToolFilter, method incrementToken.

@Override
public boolean incrementToken() throws IOException {
    // Emits one token per call. Each analyzed word is emitted first as its surface form;
    // subsequent calls drain the POS/lemma tokens stacked for that word, all at the same
    // stream position (position increment 0), before the next word is processed.
    if (posStack.size() > 0) {
        String pop = posStack.pop();
        // Restore the attribute state captured when the word token was emitted, so this
        // POS/lemma token shares its offsets and position.
        restoreState(current);
        termAtt.append(pop);
        posIncrAtt.setPositionIncrement(0);
        typeAtt.setType("pos");
        return true;
    }
    if (tokenIter == null || !tokenIter.hasNext()) {
        // there are no remaining tokens from the current sentence... are there more sentences?
        if (input.incrementToken()) {
            // a new sentence is available: process it.
            String sentenceStr = termAtt.toString();
            collectedInput.append(sentenceStr);
            if (sentenceStr.length() >= 255) {
                // Over-long input chunk: keep buffering in collectedInput and analyze it
                // later. See https://github.com/languagetool-org/languagetool/issues/364
                return true;
            } else {
                // Short enough: analyze everything buffered so far, then reset the buffer.
                sentenceStr = collectedInput.toString();
                collectedInput.setLength(0);
            }
            AnalyzedSentence sentence = languageTool.getAnalyzedSentence(sentenceStr);
            List<AnalyzedTokenReadings> tokenBuffer = Arrays.asList(sentence.getTokens());
            tokenIter = tokenBuffer.iterator();
            /*
         * it should not be possible to have a sentence with 0 words, check just in case. returning
         * EOS isn't the best either, but it's the behavior of the original code.
         */
            if (!tokenIter.hasNext()) {
                return false;
            }
        } else {
            // no more sentences, end of stream!
            return false;
        }
    }
    // It must clear attributes, as it is creating new tokens.
    clearAttributes();
    AnalyzedTokenReadings tr = tokenIter.next();
    // add POS tag for sentence start.
    if (tr.isSentenceStart()) {
        // TODO: would be needed so negated tokens can match on something (see testNegatedMatchAtSentenceStart())
        // but breaks other cases:
        //termAtt.append("SENT_START");
        typeAtt.setType("pos");
        // Emit the sentence-start marker as a POS (and, if available, lemma) token.
        String posTag = tr.getAnalyzedToken(0).getPOSTag();
        String lemma = tr.getAnalyzedToken(0).getLemma();
        if (toLowerCase) {
            termAtt.append(POS_PREFIX.toLowerCase()).append(posTag.toLowerCase());
            if (lemma != null) {
                termAtt.append(LEMMA_PREFIX.toLowerCase()).append(lemma.toLowerCase());
            }
        } else {
            termAtt.append(POS_PREFIX).append(posTag);
            if (lemma != null) {
                termAtt.append(LEMMA_PREFIX).append(lemma);
            }
        }
        return true;
    }
    // by pass the white spaces.
    if (tr.isWhitespace()) {
        // NOTE(review): recursion depth equals the run length of consecutive whitespace
        // tokens; presumably always short — confirm if very long whitespace runs can occur.
        return this.incrementToken();
    }
    offsetAtt.setOffset(tr.getStartPos(), tr.getEndPos());
    // Queue every POS tag and lemma of this word; they are emitted on the next calls,
    // stacked at this word's position (see the posStack branch at the top).
    for (AnalyzedToken token : tr) {
        if (token.getPOSTag() != null) {
            if (toLowerCase) {
                posStack.push(POS_PREFIX.toLowerCase() + token.getPOSTag().toLowerCase());
            } else {
                posStack.push(POS_PREFIX + token.getPOSTag());
            }
        }
        if (token.getLemma() != null) {
            if (toLowerCase) {
                posStack.push(LEMMA_PREFIX.toLowerCase() + token.getLemma().toLowerCase());
            } else {
                // chances are good this is the same for all loop iterations, store it anyway...
                posStack.push(LEMMA_PREFIX + token.getLemma());
            }
        }
    }
    // Capture the attribute state so the queued POS/lemma tokens can be restored onto it.
    current = captureState();
    if (toLowerCase) {
        termAtt.append(tr.getAnalyzedToken(0).getToken().toLowerCase());
    } else {
        termAtt.append(tr.getAnalyzedToken(0).getToken());
    }
    return true;
}
Also used : AnalyzedSentence(org.languagetool.AnalyzedSentence) AnalyzedToken(org.languagetool.AnalyzedToken) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 9 with AnalyzedSentence

use of org.languagetool.AnalyzedSentence in project languagetool by languagetool-org.

From the class AbstractWordCoherencyRule, method match.

@Override
public RuleMatch[] match(List<AnalyzedSentence> sentences) {
    List<RuleMatch> matches = new ArrayList<>();
    // Maps a spelling variant that must no longer appear to the match recorded for its
    // counterpart, e.g. aufwändig -> RuleMatch of aufwendig.
    Map<String, RuleMatch> forbiddenWords = new HashMap<>();
    int offset = 0;
    for (AnalyzedSentence sentence : sentences) {
        for (AnalyzedTokenReadings reading : sentence.getTokensWithoutWhitespace()) {
            String word = reading.getToken();
            List<AnalyzedToken> analyses = reading.getReadings();
            // TODO: in theory we need to care about the other readings, too (affects e.g. German "Schenke" as a noun):
            if (analyses.size() > 0) {
                String lemma = analyses.get(0).getLemma();
                if (lemma != null) {
                    word = lemma;
                }
            }
            if (forbiddenWords.containsKey(word)) {
                // A conflicting spelling was seen earlier: report this occurrence and
                // suggest the spelling used before (stored as the earlier match's message).
                RuleMatch earlier = forbiddenWords.get(word);
                String preferredSpelling = earlier.getMessage();
                RuleMatch match = new RuleMatch(this, offset + reading.getStartPos(),
                        offset + reading.getEndPos(), getMessage(word, preferredSpelling));
                match.setSuggestedReplacement(preferredSpelling);
                matches.add(match);
            } else if (getWordMap().containsKey(word)) {
                // First sighting of a variant: remember that its counterpart must not
                // appear from now on. The match's message carries this word's spelling.
                String counterpart = getWordMap().get(word);
                RuleMatch pending = new RuleMatch(this, offset + reading.getStartPos(),
                        offset + reading.getEndPos(), word);
                forbiddenWords.put(counterpart, pending);
            }
        }
        // Positions above are sentence-relative; advance the document-level offset.
        offset += sentence.getText().length();
    }
    return toRuleMatchArray(matches);
}
Also used : AnalyzedSentence(org.languagetool.AnalyzedSentence) AnalyzedToken(org.languagetool.AnalyzedToken) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Example 10 with AnalyzedSentence

use of org.languagetool.AnalyzedSentence in project languagetool by languagetool-org.

From the class GermanChunkerTest, method assertBasicChunks.

private void assertBasicChunks(String input) throws Exception {
    // Strip the expected-chunk markup from the input to get the raw sentence text.
    String plainText = getPlainInput(input);
    AnalyzedSentence sentence = lt.getAnalyzedSentence(plainText);
    List<ChunkTaggedToken> actualChunks =
            chunker.getBasicChunks(Arrays.asList(sentence.getTokensWithoutWhitespace()));
    List<String> expectedChunks = getExpectedChunks(input);
    assertChunks(input, plainText, actualChunks, expectedChunks);
}
Also used : AnalyzedSentence(org.languagetool.AnalyzedSentence) AnalyzedTokenReadings(org.languagetool.AnalyzedTokenReadings)

Aggregations

AnalyzedSentence (org.languagetool.AnalyzedSentence)40 AnalyzedTokenReadings (org.languagetool.AnalyzedTokenReadings)21 ArrayList (java.util.ArrayList)8 Test (org.junit.Test)8 JLanguageTool (org.languagetool.JLanguageTool)8 RuleMatch (org.languagetool.rules.RuleMatch)8 Rule (org.languagetool.rules.Rule)5 IOException (java.io.IOException)4 DisambiguationPatternRule (org.languagetool.tagging.disambiguation.rules.DisambiguationPatternRule)4 English (org.languagetool.language.English)3 SpellingCheckRule (org.languagetool.rules.spelling.SpellingCheckRule)3 AnalyzedToken (org.languagetool.AnalyzedToken)2 Ukrainian (org.languagetool.language.Ukrainian)2 InputStream (java.io.InputStream)1 Document (org.apache.lucene.document.Document)1 ConfusionSet (org.languagetool.rules.ConfusionSet)1 CorrectExample (org.languagetool.rules.CorrectExample)1 IncorrectExample (org.languagetool.rules.IncorrectExample)1 BitextRule (org.languagetool.rules.bitext.BitextRule)1 ConfusionProbabilityRule (org.languagetool.rules.ngrams.ConfusionProbabilityRule)1