use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
the class LowercaseKnownFirstWordFilter method apply.
/**
 * Rewrites the text of tokens found in a "first word" position, provided the token begins with
 * an upper-case character and the session's diacriticizer proposes at least one replacement.
 * <p>
 * A token is in a first-word position when it is the first non-empty token considered, or when
 * the previous non-empty token matched the tokeniser's separator pattern without matching
 * {@code noUppercasePunctuation}. Tokens with empty text are ignored entirely.
 *
 * @param tokenSequence the sequence whose tokens are updated in place
 */
@Override
public void apply(TokenSequence tokenSequence) {
  // When the sequence reports having a root, the token at position 0 is left untouched.
  final int firstEligible = tokenSequence.isWithRoot() ? 1 : 0;
  boolean candidatePosition = true;
  int position = 0;
  for (Token token : tokenSequence) {
    final int current = position++;
    if (current < firstEligible || token.getText().length() == 0) {
      continue;
    }
    if (candidatePosition) {
      // Exactly one token is examined per candidate position, whether or not it is rewritten.
      candidatePosition = false;
      if (Character.isUpperCase(token.getText().charAt(0))) {
        Set<String> replacements = TalismaneSession.get(sessionId).getDiacriticizer().diacriticize(token.getText());
        if (!replacements.isEmpty()) {
          // NOTE(review): presumably the first candidate is the preferred known form - confirm
          // against the Diacriticizer contract.
          token.setText(replacements.iterator().next());
        }
      }
    }
    // The text is read again here on purpose: it may just have been replaced above.
    boolean separator = Tokeniser.getTokenSeparators(sessionId).matcher(token.getText()).matches();
    if (separator && !noUppercasePunctuation.matcher(token.getText()).matches()) {
      // The token following this separator is the next candidate.
      candidatePosition = true;
    }
  }
}
use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
the class TokenEvaluationCorpusWriter method onNextTokenSequence.
/**
 * Writes one evaluated sentence to the corpus writer.
 * <p>
 * Output layout: the sentence text on its own line, then one line per token of the real
 * sequence containing the token's original text, a tab, {@code true}/{@code false} telling
 * whether every guessed decision falling inside that token equals the real decision, and a
 * tab-separated list of the authorities behind those guesses (authorities whose name starts
 * with {@code "_"} are omitted). A blank line closes the sentence.
 *
 * @param realSequence the reference token sequence
 * @param guessedAtomicSequences the guessed sequences; only the first element is evaluated
 * @throws IOException if the corpus writer fails
 */
@Override
public void onNextTokenSequence(TokenSequence realSequence, List<TokenisedAtomicTokenSequence> guessedAtomicSequences) throws IOException {
  // Copy the splits into a hash set once: they are looked up for every atomic token below, and
  // a List.contains per lookup would make that loop quadratic in the sentence length.
  Set<Integer> realSplits = new java.util.HashSet<Integer>(realSequence.getTokenSplits());
  TokenisedAtomicTokenSequence tokenisedAtomicTokenSequence = guessedAtomicSequences.get(0);
  Map<Integer, TokeniserOutcome> realOutcomes = new HashMap<Integer, TokeniserOutcome>();
  Map<Integer, TokeniserOutcome> guessedOutcomes = new HashMap<Integer, TokeniserOutcome>();
  Map<Integer, List<String>> guessedAuthorities = new HashMap<Integer, List<String>>();
  // Start indexes of the atomic tokens, in the order the guessed sequence yields them.
  List<Integer> indexes = new ArrayList<Integer>();
  corpusWriter.write(realSequence.getSentence().getText() + "\n");

  // Pass 1: record, per atomic-token start index, the real decision, the guessed decision and
  // the authorities that produced the guess.
  for (TaggedToken<TokeniserOutcome> guessTag : tokenisedAtomicTokenSequence) {
    TokeniserOutcome guessDecision = guessTag.getTag();
    int startIndex = guessTag.getToken().getStartIndex();
    // A start index listed among the real splits means the real tokenisation separates here.
    TokeniserOutcome realDecision = realSplits.contains(startIndex) ? TokeniserOutcome.SEPARATE : TokeniserOutcome.JOIN;
    indexes.add(startIndex);
    realOutcomes.put(startIndex, realDecision);
    guessedOutcomes.put(startIndex, guessDecision);
    guessedAuthorities.put(startIndex, guessTag.getDecision().getAuthorities());
  }

  // Pass 2: one output line per real token, aggregating the atomic tokens whose start index
  // lies between the previous token's end (inclusive) and this token's end (exclusive).
  int prevEndIndex = 0;
  for (Token token : realSequence) {
    corpusWriter.write(token.getOriginalText());
    // TreeSet: authorities are written de-duplicated and in sorted order.
    Set<String> authorities = new TreeSet<String>();
    boolean correct = true;
    for (int index : indexes) {
      if (prevEndIndex <= index && index < token.getEndIndex()) {
        correct = correct && realOutcomes.get(index) == guessedOutcomes.get(index);
        authorities.addAll(guessedAuthorities.get(index));
      }
    }
    corpusWriter.write("\t" + correct);
    for (String authority : authorities) {
      if (!authority.startsWith("_")) {
        corpusWriter.write("\t" + authority);
      }
    }
    corpusWriter.write("\n");
    corpusWriter.flush();
    prevEndIndex = token.getEndIndex();
  }
  corpusWriter.write("\n");
}
use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
the class PossibleSentenceBoundary method getTokenIndexWithWhitespace.
/**
 * Index of this boundary's token, including whitespace.
 * <p>
 * The value is looked up the first time it is needed (while {@code tokenIndex} is negative) and
 * stored in {@code tokenIndex}; later calls return the stored value. The lookup is a binary
 * search for the token whose {@code [startIndex, endIndex)} span contains this boundary's
 * {@code index}.
 *
 * @return the whitespace-inclusive index of the token containing this boundary
 * @throws RuntimeException if no token in the whitespace-inclusive list contains the index
 */
public int getTokenIndexWithWhitespace() {
  if (tokenIndex < 0) {
    // NOTE(review): the search assumes the list is ordered by start index with
    // non-overlapping spans - confirm against TokenSequence.listWithWhiteSpace().
    List<Token> tokens = this.getTokenSequence().listWithWhiteSpace();
    int low = 0;
    int high = tokens.size() - 1;
    int current = -1;
    // Bounded low/high search: the window shrinks on every probe, so the loop always
    // terminates, even when the index lies in a gap between tokens or the list is empty.
    while (low <= high) {
      current = (low + high) >>> 1;
      Token token = tokens.get(current);
      if (token.getStartIndex() <= index) {
        if (index < token.getEndIndex()) {
          tokenIndex = token.getIndexWithWhiteSpace();
          return tokenIndex;
        }
        // This token starts at or before the index but ends before it: look to the right.
        low = current + 1;
      } else {
        // This token starts after the index: look to the left.
        high = current - 1;
      }
    }
    throw new RuntimeException("Binary search failed. Current = " + current + ", Size = " + tokens.size());
  }
  return tokenIndex;
}
use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
the class LexiconPosTagFeature method checkInternal.
/**
 * Tells whether the token's possible pos-tags include at least one of the pos-tags named by
 * this feature's {@code posTagFeatures}.
 * <p>
 * Each pos-tag feature is evaluated in turn; features yielding no result are skipped, and
 * evaluation stops at the first one whose pos-tag is among the token's possible pos-tags.
 *
 * @param tokenWrapper the wrapper handed to {@code getToken} to resolve the token to test
 * @param env the runtime environment passed on to the nested features
 * @return {@code null} when no token could be resolved, otherwise a result wrapping
 *         {@code true} or {@code false}
 * @throws TalismaneException declared by the signature; raised by the nested calls
 */
@Override
public FeatureResult<Boolean> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
  final TokenWrapper resolved = this.getToken(tokenWrapper, env);
  if (resolved == null) {
    return null;
  }
  final Token token = resolved.getToken();

  boolean found = false;
  for (StringFeature<TokenWrapper> candidateFeature : posTagFeatures) {
    final FeatureResult<String> codeResult = candidateFeature.check(resolved, env);
    if (codeResult == null) {
      // This feature produced no pos-tag code: nothing to compare.
      continue;
    }
    final PosTag candidate = TalismaneSession.get(sessionId).getPosTagSet().getPosTag(codeResult.getOutcome());
    if (token.getPossiblePosTags().contains(candidate)) {
      found = true;
      break;
    }
  }
  return this.generateResult(found);
}
use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
the class LexiconPosTagsFeature method checkInternal.
/**
 * Lists the codes of all pos-tags the token may take, each with a weight of {@code 1.0}.
 *
 * @param tokenWrapper the wrapper handed to {@code getToken} to resolve the token to inspect
 * @param env the runtime environment passed on to {@code getToken}
 * @return {@code null} when no token could be resolved or the token has no possible pos-tags,
 *         otherwise a result wrapping one weighted outcome per possible pos-tag
 * @throws TalismaneException declared by the signature; raised by the nested calls
 */
@Override
public FeatureResult<List<WeightedOutcome<String>>> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
  final TokenWrapper resolved = this.getToken(tokenWrapper, env);
  if (resolved == null) {
    return null;
  }
  final Token token = resolved.getToken();

  final List<WeightedOutcome<String>> weightedCodes = new ArrayList<WeightedOutcome<String>>();
  for (PosTag possibleTag : token.getPossiblePosTags()) {
    weightedCodes.add(new WeightedOutcome<String>(possibleTag.getCode(), 1.0));
  }

  // An empty list is reported as "no result" rather than as an empty outcome.
  if (weightedCodes.isEmpty()) {
    return null;
  }
  return this.generateResult(weightedCodes);
}
Aggregations