use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
the class InitialsFeature method checkInternal.
/**
 * Tests whether a possible sentence boundary directly follows an initial,
 * i.e. a token whose original text is exactly one character long.
 * <p>
 * Only boundaries whose boundary string is "." are considered. When
 * {@code languageHasUppercaseLetters} is set, that single character must
 * also be upper-case.
 *
 * @param context the possible sentence boundary being examined
 * @param env     the runtime environment (not read by this method)
 * @return a result wrapping {@code true} when the preceding token is an
 *         initial, otherwise {@code null}
 */
@Override
public FeatureResult<Boolean> checkInternal(PossibleSentenceBoundary context, RuntimeEnvironment env) {
    if (!context.getBoundaryString().equals(".")) {
        return null;
    }

    int tokenIndex = context.getTokenIndexWithWhitespace();
    if (tokenIndex <= 0) {
        // no token precedes the boundary
        return null;
    }

    Token previousToken = context.getTokenSequence().listWithWhiteSpace().get(tokenIndex - 1);
    if (previousToken == null) {
        return null;
    }

    // Check the length before looking at the first character: calling
    // charAt(0) first would throw on a token whose original text is empty.
    String originalText = previousToken.getOriginalText();
    if (originalText.length() != 1) {
        return null;
    }
    if (languageHasUppercaseLetters && !Character.isUpperCase(originalText.charAt(0))) {
        return null;
    }

    return this.generateResult(true);
}
use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
the class SurroundingsFeature method checkInternal.
/**
 * Builds a string describing the categories of the tokens surrounding a
 * possible sentence boundary.
 * <p>
 * The window runs from {@code n} positions before the boundary token to
 * {@code n} positions after it, where {@code n} is the outcome of
 * {@code nFeature}. Positions inside the token sequence contribute
 * {@code getCategoryString(token)}; positions outside it contribute padding
 * made of " ", "word" and "Word".
 *
 * @param context the possible sentence boundary being examined
 * @param env     the runtime environment, passed on to {@code nFeature}
 * @return a result wrapping the concatenated category string, or
 *         {@code null} when {@code nFeature} yields no result
 * @throws TalismaneException declared by the overridden method
 */
@Override
public FeatureResult<String> checkInternal(PossibleSentenceBoundary context, RuntimeEnvironment env) throws TalismaneException {
    FeatureResult<Integer> windowResult = nFeature.check(context, env);
    if (windowResult == null) {
        return null;
    }

    int n = windowResult.getOutcome();
    int centre = context.getTokenIndexWithWhitespace();
    int tokenCount = context.getTokenSequence().listWithWhiteSpace().size();

    StringBuilder surroundings = new StringBuilder();
    for (int position = centre - n; position <= centre + n; position++) {
        String category;
        if (position >= 0 && position < tokenCount) {
            // a real token: use its category
            Token token = context.getTokenSequence().listWithWhiteSpace().get(position);
            category = this.getCategoryString(token);
        } else if (position == -1 || position == tokenCount) {
            // immediately before the first / after the last token
            category = " ";
        } else if (position == tokenCount + 1) {
            // second position after the end: always a capitalised word
            category = "Word";
        } else if ((0 - position) % 2 == 0 || (tokenCount - position) % 2 == 1) {
            // NOTE(review): Java's % keeps the sign of the dividend, so the
            // second test can never be true for positions past the end, and
            // the two tests are not restricted to their own side of the
            // sequence. Kept exactly as is, since changing it would change
            // the generated feature strings - confirm the intended padding.
            category = languageHasUppercaseLetters ? "word" : "Word";
        } else {
            category = " ";
        }
        surroundings.append(category);
    }

    return this.generateResult(surroundings.toString());
}
use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
the class ForwardStatisticalPosTagger method tagSentence.
/**
 * Pos-tags a sentence using a beam search over partial pos-tag sequences.
 * <p>
 * Partial sequences (histories) are kept in heaps stored in a map keyed by
 * a "heap index" derived from the end index of the last token tagged. The
 * heap with the lowest key is always processed next; at most
 * {@code beamWidth} histories are extended per heap. For each history the
 * next token receives its candidate decisions from, in order of priority:
 * a pos-tag attribute already present on the token, the first positive rule
 * that applies, or the decision maker fed with the pos-tagger features (then
 * filtered by the negative rules and by {@code MIN_PROB_TO_STORE}).
 *
 * @param input the candidate token sequences for the sentence; only the
 *              first one is used unless {@code propagateTokeniserBeam} is set
 * @return at most {@code getBeamWidth()} pos-tag sequences, cloned, in the
 *         order in which they are polled from the final heap
 * @throws TalismaneException declared here; raised by the collaborators this
 *                            method calls
 * @throws IOException        declared here; raised by the collaborators this
 *                            method calls
 */
@Override
public List<PosTagSequence> tagSentence(List<TokenSequence> input) throws TalismaneException, IOException {
// either keep every tokeniser hypothesis, or only the first one
List<TokenSequence> tokenSequences = null;
if (this.propagateTokeniserBeam) {
tokenSequences = input;
} else {
tokenSequences = new ArrayList<>(1);
tokenSequences.add(input.get(0));
}
// length in characters of the sentence text; used as the key of the final heap
int sentenceLength = tokenSequences.get(0).getSentence().getText().length();
// heaps sorted by key, so that pollFirstEntry() always yields the lowest heap index
TreeMap<Double, PriorityQueue<PosTagSequence>> heaps = new TreeMap<Double, PriorityQueue<PosTagSequence>>();
PriorityQueue<PosTagSequence> heap0 = new PriorityQueue<PosTagSequence>();
for (TokenSequence tokenSequence : tokenSequences) {
// add an empty PosTagSequence for each token sequence
PosTagSequence emptySequence = new PosTagSequence(tokenSequence);
emptySequence.setScoringStrategy(decisionMaker.getDefaultScoringStrategy());
heap0.add(emptySequence);
}
heaps.put(0.0, heap0);
PriorityQueue<PosTagSequence> finalHeap = null;
while (heaps.size() > 0) {
Entry<Double, PriorityQueue<PosTagSequence>> heapEntry = heaps.pollFirstEntry();
if (LOG.isTraceEnabled()) {
LOG.trace("heap key: " + heapEntry.getKey() + ", sentence length: " + sentenceLength);
}
// the heap whose key equals the sentence length holds the complete sequences
if (heapEntry.getKey() == sentenceLength) {
finalHeap = heapEntry.getValue();
break;
}
PriorityQueue<PosTagSequence> previousHeap = heapEntry.getValue();
// limit the breadth to K
int maxSequences = previousHeap.size() > this.beamWidth ? this.beamWidth : previousHeap.size();
for (int j = 0; j < maxSequences; j++) {
// take the next history off the heap and the token to be tagged after it
PosTagSequence history = previousHeap.poll();
Token token = history.getNextToken();
if (LOG.isTraceEnabled()) {
LOG.trace("#### Next history ( " + heapEntry.getKey() + "): " + history.toString());
LOG.trace("Prob: " + df.format(history.getScore()));
LOG.trace("Token: " + token.getText());
// trace the whole token sequence, with the current token in square brackets
StringBuilder sb = new StringBuilder();
for (Token oneToken : history.getTokenSequence().listWithWhiteSpace()) {
if (oneToken.equals(token))
sb.append("[" + oneToken + "]");
else
sb.append(oneToken);
}
LOG.trace(sb.toString());
}
PosTaggerContext context = new PosTaggerContextImpl(token, history);
List<Decision> decisions = new ArrayList<Decision>();
boolean ruleApplied = false;
// has a pos-tag already been assigned to this token via a token attribute?
if (token.getAttributes().containsKey(PosTagger.POS_TAG_ATTRIBUTE)) {
StringAttribute posTagCodeAttribute = (StringAttribute) token.getAttributes().get(PosTagger.POS_TAG_ATTRIBUTE);
String posTagCode = posTagCodeAttribute.getValue();
Decision positiveRuleDecision = new Decision(posTagCode);
decisions.add(positiveRuleDecision);
positiveRuleDecision.addAuthority("tokenAttribute");
ruleApplied = true;
if (LOG.isTraceEnabled()) {
LOG.trace("Token has attribute \"" + PosTagger.POS_TAG_ATTRIBUTE + "\". Setting posTag to: " + posTagCode);
}
}
// test the positive rules on the current token
if (!ruleApplied) {
if (posTaggerPositiveRules != null) {
for (PosTaggerRule rule : posTaggerPositiveRules) {
if (LOG.isTraceEnabled()) {
LOG.trace("Checking rule: " + rule.getCondition().getName());
}
RuntimeEnvironment env = new RuntimeEnvironment();
FeatureResult<Boolean> ruleResult = rule.getCondition().check(context, env);
if (ruleResult != null && ruleResult.getOutcome()) {
Decision positiveRuleDecision = new Decision(rule.getTag().getCode());
decisions.add(positiveRuleDecision);
positiveRuleDecision.addAuthority(rule.getCondition().getName());
ruleApplied = true;
if (LOG.isTraceEnabled()) {
LOG.trace("Rule applies. Setting posTag to: " + rule.getTag().getCode());
}
// the first positive rule that applies wins
break;
}
}
}
}
if (!ruleApplied) {
// test the features on the current token
List<FeatureResult<?>> featureResults = new ArrayList<FeatureResult<?>>();
for (PosTaggerFeature<?> posTaggerFeature : posTaggerFeatures) {
RuntimeEnvironment env = new RuntimeEnvironment();
FeatureResult<?> featureResult = posTaggerFeature.check(context, env);
if (featureResult != null)
featureResults.add(featureResult);
}
if (LOG.isTraceEnabled()) {
// trace the feature results in sorted order
SortedSet<String> featureResultSet = featureResults.stream().map(f -> f.toString()).collect(Collectors.toCollection(() -> new TreeSet<String>()));
for (String featureResultString : featureResultSet) {
LOG.trace(featureResultString);
}
}
// evaluate the feature results using the maxent model
decisions = this.decisionMaker.decide(featureResults);
for (ClassificationObserver observer : this.observers) {
observer.onAnalyse(token, featureResults, decisions);
}
// apply the negative rules
Set<String> eliminatedPosTags = new TreeSet<String>();
if (posTaggerNegativeRules != null) {
for (PosTaggerRule rule : posTaggerNegativeRules) {
if (LOG.isTraceEnabled()) {
LOG.trace("Checking negative rule: " + rule.getCondition().getName());
}
RuntimeEnvironment env = new RuntimeEnvironment();
FeatureResult<Boolean> ruleResult = rule.getCondition().check(context, env);
if (ruleResult != null && ruleResult.getOutcome()) {
eliminatedPosTags.add(rule.getTag().getCode());
if (LOG.isTraceEnabled()) {
LOG.trace("Rule applies. Eliminating posTag: " + rule.getTag().getCode());
}
}
}
if (eliminatedPosTags.size() > 0) {
// keep only the decisions whose outcome was not eliminated by a negative rule
List<Decision> decisionShortList = new ArrayList<Decision>();
for (Decision decision : decisions) {
if (!eliminatedPosTags.contains(decision.getOutcome())) {
decisionShortList.add(decision);
} else {
LOG.trace("Eliminating decision: " + decision.toString());
}
}
if (decisionShortList.size() > 0) {
decisions = decisionShortList;
} else {
// never leave the token without any decision: keep the unfiltered list
LOG.debug("All decisions eliminated! Restoring original decisions.");
}
}
}
// trace the pos-tags listed as possible for this token
// (original note: is this a known word in the lexicon?)
if (LOG.isTraceEnabled()) {
String posTags = "";
for (PosTag onePosTag : token.getPossiblePosTags()) {
posTags += onePosTag.getCode() + ",";
}
LOG.trace("Token: " + token.getText() + ". PosTags: " + posTags);
}
// drop decisions below MIN_PROB_TO_STORE, unless that would leave none
List<Decision> decisionShortList = new ArrayList<Decision>();
for (Decision decision : decisions) {
if (decision.getProbability() >= MIN_PROB_TO_STORE) {
decisionShortList.add(decision);
}
}
if (decisionShortList.size() > 0) {
decisions = decisionShortList;
}
}
// extend the history with each retained decision (attribute, rule or MaxEnt outcome)
for (Decision decision : decisions) {
if (LOG.isTraceEnabled())
LOG.trace("Outcome: " + decision.getOutcome() + ", " + decision.getProbability());
PosTaggedToken posTaggedToken = new PosTaggedToken(token, decision, this.sessionId);
PosTagSequence sequence = new PosTagSequence(history);
sequence.addPosTaggedToken(posTaggedToken);
if (decision.isStatistical())
sequence.addDecision(decision);
// the new sequence goes onto the heap keyed by the end index of the token just tagged
double heapIndex = token.getEndIndex();
// add another half for an empty token (start index == end index),
// to differentiate it from regular ones
if (token.getStartIndex() == token.getEndIndex())
heapIndex += 0.5;
// if it's the last token, make sure we end
if (token.getIndex() == sequence.getTokenSequence().size() - 1)
heapIndex = sentenceLength;
if (LOG.isTraceEnabled())
LOG.trace("Heap index: " + heapIndex);
PriorityQueue<PosTagSequence> heap = heaps.get(heapIndex);
if (heap == null) {
heap = new PriorityQueue<PosTagSequence>();
heaps.put(heapIndex, heap);
}
heap.add(sequence);
}
// next outcome for this token
}
// next history
}
// next atomic index
// return the best sequence on the heap
// NOTE(review): finalHeap is still null if the loop above runs out of heaps
// without meeting a key equal to sentenceLength; the dereference below would
// then throw a NullPointerException - confirm this cannot happen.
List<PosTagSequence> sequences = new ArrayList<PosTagSequence>();
int i = 0;
while (!finalHeap.isEmpty()) {
// clone the pos tag sequences to ensure they don't share any underlying
// data (e.g. token sequences)
sequences.add(finalHeap.poll().clonePosTagSequence());
i++;
if (i >= this.getBeamWidth())
break;
}
// log the final sequences (debug level only); nothing is modified here
if (LOG.isDebugEnabled()) {
LOG.debug("####Final postag sequences:");
int j = 1;
for (PosTagSequence sequence : sequences) {
if (LOG.isDebugEnabled()) {
LOG.debug("Sequence " + (j++) + ", score=" + df.format(sequence.getScore()));
LOG.debug("Sequence: " + sequence);
}
}
}
return sequences;
}
use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
the class PosTagSequence method removeRoot.
/**
 * Remove a previously pre-pended root.
 * <p>
 * Does nothing unless the first pos-tagged token of this sequence carries
 * {@code PosTag.ROOT_POS_TAG}. Otherwise the root's empty token is removed
 * from the underlying token sequence, the root is removed from this
 * sequence, and the token sequence is flagged as root-less and re-indexed.
 *
 * @throws RuntimeException wrapping the {@code TalismaneException} if the
 *                          empty token cannot be removed
 */
public void removeRoot() {
    if (this.size() == 0) {
        return;
    }
    PosTaggedToken rootToken = this.get(0);
    if (!rootToken.getTag().equals(PosTag.ROOT_POS_TAG)) {
        // the sequence does not start with a root: nothing to remove
        return;
    }

    Token emptyToken = rootToken.getToken();
    try {
        tokenSequence.removeEmptyToken(emptyToken);
    } catch (TalismaneException e) {
        // should never happen
        LOG.error(e.getMessage(), e);
        throw new RuntimeException(e);
    }
    this.remove(0);
    tokenSequence.setWithRoot(false);
    tokenSequence.reindex();

    // prependRoot() clears this field after changing the sequence; do the
    // same here so it cannot keep a value computed while the root was still
    // present (presumably a cached string form - verify against toString()).
    this.string = null;
}
use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.
the class PosTagSequence method prependRoot.
/**
 * Makes sure this PosTagSequence starts with a root and returns that root.
 * <p>
 * If the first pos-tagged token already carries
 * {@code PosTag.ROOT_POS_TAG} it is returned unchanged. Otherwise an empty
 * token with the text "[ROOT]" is added at position 0 of the underlying
 * token sequence, tagged with the root pos-tag and inserted at the head of
 * this sequence. In both cases the {@code string} field is reset to
 * {@code null} before returning.
 *
 * @return the existing or newly pre-pended root
 * @throws RuntimeException wrapping the {@code UnknownPosTagException} if
 *                          the root pos-tagged token cannot be built
 */
public PosTaggedToken prependRoot() {
    boolean alreadyRooted = this.size() > 0 && this.get(0).getTag().equals(PosTag.ROOT_POS_TAG);

    PosTaggedToken root;
    if (alreadyRooted) {
        root = this.get(0);
    } else {
        Token placeholder = tokenSequence.addEmptyToken(0);
        placeholder.setText("[ROOT]");
        tokenSequence.setWithRoot(true);

        Decision decision = new Decision(PosTag.ROOT_POS_TAG.getCode());
        try {
            root = new PosTaggedToken(placeholder, decision, this.sessionId);
        } catch (UnknownPosTagException e) {
            // should never happen
            LOG.error(e.getMessage(), e);
            throw new RuntimeException(e);
        }
        this.add(0, root);
    }

    this.string = null;
    return root;
}
Aggregations