use of org.languagetool.AnalyzedSentence in project languagetool by languagetool-org.
the class EnglishChunkerTest method testContractions.
/**
 * Checks the chunk tags assigned around the contraction in "I'll be there".
 * The pieces of the contraction ("'" and "ll") are expected to carry no chunk
 * tags, while the tokens before and after it are tagged.
 */
@Test
public void testContractions() throws Exception {
  AnalyzedTokenReadings[] readings =
      new JLanguageTool(new English()).getAnalyzedSentence("I'll be there").getTokens();

  // Token before the contraction opens a singular noun phrase.
  assertThat(readings[1].getChunkTags().get(0), is(new ChunkTag("B-NP-singular")));

  // "'" (index 2) and "ll" (index 3) cannot be mapped as we tokenize differently.
  for (int unmapped : new int[] {2, 3}) {
    assertThat(readings[unmapped].getChunkTags().size(), is(0));
  }

  // Token at index 5 (presumably "be") continues the verb phrase.
  assertThat(readings[5].getChunkTags().get(0), is(new ChunkTag("I-VP")));
}
use of org.languagetool.AnalyzedSentence in project languagetool by languagetool-org.
the class EnglishChunkerTest method testAddChunkTagsSingular.
/**
 * Runs the English chunker over the tokens of a single analyzed sentence and
 * checks the singular/plural noun phrase tags it attaches.
 */
@Test
public void testAddChunkTagsSingular() throws Exception {
  EnglishChunker englishChunker = new EnglishChunker();
  JLanguageTool languageTool = new JLanguageTool(new English());

  // Only the first analyzed sentence is of interest here.
  AnalyzedTokenReadings[] firstSentenceTokens =
      languageTool.analyzeText("The abacus shows how numbers can be stored").get(0).getTokens();
  List<AnalyzedTokenReadings> tokens = Arrays.asList(firstSentenceTokens);
  englishChunker.addChunkTags(tokens);

  // "The abacus": noun phrase begins at index 1 and ends at index 3.
  String nounPhraseStart = tokens.get(1).getChunkTags().toString();
  String nounPhraseEnd = tokens.get(3).getChunkTags().toString();
  assertThat(nounPhraseStart, is("[B-NP-singular]"));
  assertThat(nounPhraseEnd, is("[E-NP-singular]"));

  // "numbers": a one-token plural noun phrase, so it both begins and ends it.
  String pluralPhrase = tokens.get(9).getChunkTags().toString();
  assertThat(pluralPhrase, is("[B-NP-plural, E-NP-plural]"));
}
use of org.languagetool.AnalyzedSentence in project languagetool by languagetool-org.
the class LanguageToolFilter method incrementToken.
/**
 * Advances this filter to its next output token.
 *
 * <p>Each call does one of the following, in this order:
 * <ol>
 *   <li>If {@code posStack} still holds entries, pops one, restores the state
 *       captured for the last word token and emits the popped string as a
 *       token of type {@code "pos"} with position increment 0.</li>
 *   <li>If the current sentence's tokens are used up, pulls the next sentence
 *       from {@code input} and analyzes it with {@code languageTool}.</li>
 *   <li>Emits the sentence-start token as a {@code "pos"} token built from its
 *       first reading's POS tag (and lemma, if present).</li>
 *   <li>Skips whitespace tokens by recursing into this method.</li>
 *   <li>For any other token, sets the offsets, queues the POS tag and lemma of
 *       every reading on {@code posStack} and emits the token text of the
 *       first reading.</li>
 * </ol>
 * When {@code toLowerCase} is set, all emitted strings are lower-cased.
 *
 * @return {@code true} if a token was produced; {@code false} if {@code input}
 *         has no more sentences or an analyzed sentence yielded no tokens
 * @throws IOException declared by the overridden method; presumably propagated
 *         from {@code input.incrementToken()}
 */
@Override
public boolean incrementToken() throws IOException {
// Pending POS/lemma entries for the previously emitted word token come first.
if (posStack.size() > 0) {
String pop = posStack.pop();
// 'current' was captured before the word's own text was appended (see below),
// so only the popped string ends up appended here.
restoreState(current);
termAtt.append(pop);
// Position increment 0: the entry is stacked on the same position as the word.
posIncrAtt.setPositionIncrement(0);
typeAtt.setType("pos");
return true;
}
if (tokenIter == null || !tokenIter.hasNext()) {
// there are no remaining tokens from the current sentence... are there more sentences?
if (input.incrementToken()) {
// a new sentence is available: process it.
String sentenceStr = termAtt.toString();
collectedInput.append(sentenceStr);
if (sentenceStr.length() >= 255) {
// A chunk of 255+ characters is only collected here; analysis is deferred until
// a shorter chunk arrives (presumably long sentences reach us split into pieces).
// See https://github.com/languagetool-org/languagetool/issues/364
// NOTE(review): this returns true without touching termAtt, so the raw chunk
// read above is what this call exposes as the token - confirm this is intended.
return true;
} else {
// Analyze everything collected so far as one sentence and reset the buffer.
sentenceStr = collectedInput.toString();
collectedInput.setLength(0);
}
AnalyzedSentence sentence = languageTool.getAnalyzedSentence(sentenceStr);
List<AnalyzedTokenReadings> tokenBuffer = Arrays.asList(sentence.getTokens());
tokenIter = tokenBuffer.iterator();
/*
* it should not be possible to have a sentence with 0 words, check just in case. returning
* EOS isn't the best either, but it's the behavior of the original code.
*/
if (!tokenIter.hasNext()) {
return false;
}
} else {
// no more sentences, end of stream!
return false;
}
}
// It must clear attributes, as it is creating new tokens.
clearAttributes();
AnalyzedTokenReadings tr = tokenIter.next();
// add POS tag for sentence start.
if (tr.isSentenceStart()) {
// TODO: would be needed so negated tokens can match on something (see testNegatedMatchAtSentenceStart())
// but breaks other cases:
//termAtt.append("SENT_START");
typeAtt.setType("pos");
// Only the first reading of the sentence-start token is used.
String posTag = tr.getAnalyzedToken(0).getPOSTag();
String lemma = tr.getAnalyzedToken(0).getLemma();
if (toLowerCase) {
termAtt.append(POS_PREFIX.toLowerCase()).append(posTag.toLowerCase());
if (lemma != null) {
termAtt.append(LEMMA_PREFIX.toLowerCase()).append(lemma.toLowerCase());
}
} else {
termAtt.append(POS_PREFIX).append(posTag);
if (lemma != null) {
termAtt.append(LEMMA_PREFIX).append(lemma);
}
}
return true;
}
// by pass the white spaces.
if (tr.isWhitespace()) {
// Recurses once per skipped whitespace token.
return this.incrementToken();
}
offsetAtt.setOffset(tr.getStartPos(), tr.getEndPos());
// Queue a POS entry and a lemma entry for every reading of this token; they are
// emitted by the posStack branch at the top on the following calls.
for (AnalyzedToken token : tr) {
if (token.getPOSTag() != null) {
if (toLowerCase) {
posStack.push(POS_PREFIX.toLowerCase() + token.getPOSTag().toLowerCase());
} else {
posStack.push(POS_PREFIX + token.getPOSTag());
}
}
if (token.getLemma() != null) {
if (toLowerCase) {
posStack.push(LEMMA_PREFIX.toLowerCase() + token.getLemma().toLowerCase());
} else {
// chances are good this is the same for all loop iterations, store it anyway...
posStack.push(LEMMA_PREFIX + token.getLemma());
}
}
}
// Capture the state (offsets set, term text not yet appended) for the stacked entries.
current = captureState();
// The emitted term is the token text of the first reading.
if (toLowerCase) {
termAtt.append(tr.getAnalyzedToken(0).getToken().toLowerCase());
} else {
termAtt.append(tr.getAnalyzedToken(0).getToken());
}
return true;
}
use of org.languagetool.AnalyzedSentence in project languagetool by languagetool-org.
the class AbstractWordCoherencyRule method match.
/**
 * Scans the sentences in order and reports every word whose alternative
 * spelling (according to {@code getWordMap()}) has already been seen earlier
 * in the text.
 *
 * <p>Words are compared by the lemma of their first reading when that lemma is
 * available, otherwise by their surface form. Match positions are offset by
 * the accumulated text length of the preceding sentences.
 *
 * @param sentences the analyzed sentences of the text, in document order
 * @return the matches found, converted via {@code toRuleMatchArray}
 */
@Override
public RuleMatch[] match(List<AnalyzedSentence> sentences) {
  List<RuleMatch> found = new ArrayList<>();
  // Maps a spelling that must not appear any more to the match of the spelling
  // seen first, e.g. aufwändig -> RuleMatch of aufwendig
  Map<String, RuleMatch> forbiddenSpellings = new HashMap<>();
  int sentenceOffset = 0;
  for (AnalyzedSentence sentence : sentences) {
    for (AnalyzedTokenReadings tokenReadings : sentence.getTokensWithoutWhitespace()) {
      String word = tokenReadings.getToken();
      List<AnalyzedToken> readings = tokenReadings.getReadings();
      // TODO: in theory we need to care about the other readings, too (affects e.g. German "Schenke" as a noun):
      if (!readings.isEmpty()) {
        String lemma = readings.get(0).getLemma();
        if (lemma != null) {
          word = lemma;
        }
      }

      // Only non-null matches are ever stored, so a null lookup means "not present".
      RuleMatch firstSeen = forbiddenSpellings.get(word);
      if (firstSeen != null) {
        // The earlier match carries the previously seen spelling as its message.
        String preferredSpelling = firstSeen.getMessage();
        String message = getMessage(word, preferredSpelling);
        RuleMatch incoherent = new RuleMatch(this,
            sentenceOffset + tokenReadings.getStartPos(),
            sentenceOffset + tokenReadings.getEndPos(),
            message);
        incoherent.setSuggestedReplacement(preferredSpelling);
        found.add(incoherent);
        continue;
      }

      if (getWordMap().containsKey(word)) {
        // Remember that the alternative spelling of this word must not show up later.
        forbiddenSpellings.put(
            getWordMap().get(word),
            new RuleMatch(this,
                sentenceOffset + tokenReadings.getStartPos(),
                sentenceOffset + tokenReadings.getEndPos(),
                word));
      }
    }
    sentenceOffset += sentence.getText().length();
  }
  return toRuleMatchArray(found);
}
use of org.languagetool.AnalyzedSentence in project languagetool by languagetool-org.
the class GermanChunkerTest method assertBasicChunks.
/**
 * Analyzes the plain form of {@code input}, runs the chunker's basic chunking
 * over its non-whitespace tokens and compares the result with the chunks
 * expected for {@code input}.
 *
 * @param input the test input; its plain text and expected chunks are derived
 *              via {@code getPlainInput} and {@code getExpectedChunks}
 */
private void assertBasicChunks(String input) throws Exception {
  String plainInput = getPlainInput(input);
  List<AnalyzedTokenReadings> tokens =
      Arrays.asList(lt.getAnalyzedSentence(plainInput).getTokensWithoutWhitespace());
  List<ChunkTaggedToken> actualChunks = chunker.getBasicChunks(tokens);
  assertChunks(input, plainInput, actualChunks, getExpectedChunks(input));
}
Aggregations