use of com.joliciel.talismane.tokeniser.StringAttribute in project talismane by joliciel-informatique.
the class RawTextAnnotatorFactory method getAnnotator.
/**
 * @param descriptor
 * @param blockSize
 * @return
 * @throws TalismaneException
 *           if a descriptor is incorrectly configured
 */
public RawTextAnnotator getAnnotator(String descriptor, int blockSize) throws TalismaneException {
  RawTextAnnotator filter = null;
  List<Class<? extends RawTextAnnotator>> classes = new ArrayListNoNulls<Class<? extends RawTextAnnotator>>();
  classes.add(DuplicateWhiteSpaceFilter.class);
  classes.add(NewlineEndOfSentenceMarker.class);
  classes.add(NewlineSpaceMarker.class);
  String[] parts = descriptor.split("\t");
  String filterName = parts[0];
  // accept the legacy name "RegexMarkerFilter" for historical reasons
  if (filterName.equals("RegexMarkerFilter") || filterName.equals(RawTextRegexAnnotator.class.getSimpleName())) {
    String[] filterTypeStrings = parts[1].split(",");
    List<RawTextMarkType> filterTypes = new ArrayListNoNulls<RawTextMarkType>();
    for (String filterTypeString : filterTypeStrings) {
      filterTypes.add(RawTextMarkType.valueOf(filterTypeString));
    }
    boolean needsReplacement = false;
    boolean needsTag = false;
    int minParams = 3;
    if (filterTypes.contains(RawTextMarkType.REPLACE)) {
      needsReplacement = true;
      minParams = 4;
    } else if (filterTypes.contains(RawTextMarkType.TAG)) {
      needsTag = true;
      minParams = 4;
    }
    if (parts.length == minParams + 1) {
      filter = new RawTextRegexAnnotator(filterTypes, parts[2], Integer.parseInt(parts[3]), blockSize);
      if (needsReplacement)
        filter.setReplacement(parts[4]);
      if (needsTag) {
        if (parts[4].indexOf('=') >= 0) {
          String attribute = parts[4].substring(0, parts[4].indexOf('='));
          String value = parts[4].substring(parts[4].indexOf('=') + 1);
          filter.setAttribute(new StringAttribute(attribute, value));
        } else {
          filter.setAttribute(new StringAttribute(parts[4], ""));
        }
      }
    } else if (parts.length == minParams) {
      filter = new RawTextRegexAnnotator(filterTypes, parts[2], 0, blockSize);
      if (needsReplacement)
        filter.setReplacement(parts[3]);
      if (needsTag) {
        if (parts[3].indexOf('=') >= 0) {
          String attribute = parts[3].substring(0, parts[3].indexOf('='));
          String value = parts[3].substring(parts[3].indexOf('=') + 1);
          filter.setAttribute(new StringAttribute(attribute, value));
        } else {
          filter.setAttribute(new StringAttribute(parts[3], ""));
        }
      }
    } else {
      throw new TalismaneException("Wrong number of arguments for " + RawTextRegexAnnotator.class.getSimpleName() + ". Expected " + minParams + " or " + (minParams + 1) + ", but was " + parts.length);
    }
  } else {
    for (Class<? extends RawTextAnnotator> clazz : classes) {
      if (filterName.equals(clazz.getSimpleName())) {
        try {
          Constructor<? extends RawTextAnnotator> constructor = clazz.getConstructor(Integer.class);
          filter = constructor.newInstance(blockSize);
        } catch (ReflectiveOperationException e) {
          throw new TalismaneException("Problem building class: " + filterName, e);
        }
      }
    }
    if (filter == null)
      throw new TalismaneException("Unknown text filter class: " + filterName);
  }
  return filter;
}
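For orientation, here is a minimal sketch of a descriptor handled by the TAG branch above. The regex, the attribute name "section" and the block size of 1000 are invented for the example, and it assumes a RawTextAnnotatorFactory instance named factory is already available:

// five tab-separated parts: filter name, mark types, regex, group index, attribute=value
String descriptor = "RegexMarkerFilter\tTAG\t<header>.*?</header>\t0\tsection=header";
RawTextAnnotator annotator = factory.getAnnotator(descriptor, 1000);

With five parts and the TAG mark type, minParams is 4, so the parts.length == minParams + 1 branch is taken: the group index is parsed from parts[3], and parts[4] is split on '=' to build new StringAttribute("section", "header").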
use of com.joliciel.talismane.tokeniser.StringAttribute in project talismane by joliciel-informatique.
the class ForwardStatisticalPosTagger method tagSentence.
@Override
public List<PosTagSequence> tagSentence(List<TokenSequence> input) throws TalismaneException, IOException {
  List<TokenSequence> tokenSequences = null;
  if (this.propagateTokeniserBeam) {
    tokenSequences = input;
  } else {
    tokenSequences = new ArrayList<>(1);
    tokenSequences.add(input.get(0));
  }
  int sentenceLength = tokenSequences.get(0).getSentence().getText().length();
  TreeMap<Double, PriorityQueue<PosTagSequence>> heaps = new TreeMap<Double, PriorityQueue<PosTagSequence>>();
  PriorityQueue<PosTagSequence> heap0 = new PriorityQueue<PosTagSequence>();
  for (TokenSequence tokenSequence : tokenSequences) {
    // add an empty PosTagSequence for each token sequence
    PosTagSequence emptySequence = new PosTagSequence(tokenSequence);
    emptySequence.setScoringStrategy(decisionMaker.getDefaultScoringStrategy());
    heap0.add(emptySequence);
  }
  heaps.put(0.0, heap0);
  PriorityQueue<PosTagSequence> finalHeap = null;
  while (heaps.size() > 0) {
    Entry<Double, PriorityQueue<PosTagSequence>> heapEntry = heaps.pollFirstEntry();
    if (LOG.isTraceEnabled()) {
      LOG.trace("heap key: " + heapEntry.getKey() + ", sentence length: " + sentenceLength);
    }
    if (heapEntry.getKey() == sentenceLength) {
      finalHeap = heapEntry.getValue();
      break;
    }
    PriorityQueue<PosTagSequence> previousHeap = heapEntry.getValue();
    // limit the breadth to K
    int maxSequences = previousHeap.size() > this.beamWidth ? this.beamWidth : previousHeap.size();
    for (int j = 0; j < maxSequences; j++) {
      PosTagSequence history = previousHeap.poll();
      Token token = history.getNextToken();
      if (LOG.isTraceEnabled()) {
        LOG.trace("#### Next history ( " + heapEntry.getKey() + "): " + history.toString());
        LOG.trace("Prob: " + df.format(history.getScore()));
        LOG.trace("Token: " + token.getText());
        StringBuilder sb = new StringBuilder();
        for (Token oneToken : history.getTokenSequence().listWithWhiteSpace()) {
          if (oneToken.equals(token))
            sb.append("[" + oneToken + "]");
          else
            sb.append(oneToken);
        }
        LOG.trace(sb.toString());
      }
      PosTaggerContext context = new PosTaggerContextImpl(token, history);
      List<Decision> decisions = new ArrayList<Decision>();
      boolean ruleApplied = false;
      // has a pos-tag already been assigned to this token via a token attribute?
      if (token.getAttributes().containsKey(PosTagger.POS_TAG_ATTRIBUTE)) {
        StringAttribute posTagCodeAttribute = (StringAttribute) token.getAttributes().get(PosTagger.POS_TAG_ATTRIBUTE);
        String posTagCode = posTagCodeAttribute.getValue();
        Decision positiveRuleDecision = new Decision(posTagCode);
        decisions.add(positiveRuleDecision);
        positiveRuleDecision.addAuthority("tokenAttribute");
        ruleApplied = true;
        if (LOG.isTraceEnabled()) {
          LOG.trace("Token has attribute \"" + PosTagger.POS_TAG_ATTRIBUTE + "\". Setting posTag to: " + posTagCode);
        }
      }
      // test the positive rules on the current token
      if (!ruleApplied) {
        if (posTaggerPositiveRules != null) {
          for (PosTaggerRule rule : posTaggerPositiveRules) {
            if (LOG.isTraceEnabled()) {
              LOG.trace("Checking rule: " + rule.getCondition().getName());
            }
            RuntimeEnvironment env = new RuntimeEnvironment();
            FeatureResult<Boolean> ruleResult = rule.getCondition().check(context, env);
            if (ruleResult != null && ruleResult.getOutcome()) {
              Decision positiveRuleDecision = new Decision(rule.getTag().getCode());
              decisions.add(positiveRuleDecision);
              positiveRuleDecision.addAuthority(rule.getCondition().getName());
              ruleApplied = true;
              if (LOG.isTraceEnabled()) {
                LOG.trace("Rule applies. Setting posTag to: " + rule.getTag().getCode());
              }
              break;
            }
          }
        }
      }
      if (!ruleApplied) {
        // test the features on the current token
        List<FeatureResult<?>> featureResults = new ArrayList<FeatureResult<?>>();
        for (PosTaggerFeature<?> posTaggerFeature : posTaggerFeatures) {
          RuntimeEnvironment env = new RuntimeEnvironment();
          FeatureResult<?> featureResult = posTaggerFeature.check(context, env);
          if (featureResult != null)
            featureResults.add(featureResult);
        }
        if (LOG.isTraceEnabled()) {
          SortedSet<String> featureResultSet = featureResults.stream().map(f -> f.toString()).collect(Collectors.toCollection(() -> new TreeSet<String>()));
          for (String featureResultString : featureResultSet) {
            LOG.trace(featureResultString);
          }
        }
        // evaluate the feature results using the maxent model
        decisions = this.decisionMaker.decide(featureResults);
        for (ClassificationObserver observer : this.observers) {
          observer.onAnalyse(token, featureResults, decisions);
        }
        // apply the negative rules
        Set<String> eliminatedPosTags = new TreeSet<String>();
        if (posTaggerNegativeRules != null) {
          for (PosTaggerRule rule : posTaggerNegativeRules) {
            if (LOG.isTraceEnabled()) {
              LOG.trace("Checking negative rule: " + rule.getCondition().getName());
            }
            RuntimeEnvironment env = new RuntimeEnvironment();
            FeatureResult<Boolean> ruleResult = rule.getCondition().check(context, env);
            if (ruleResult != null && ruleResult.getOutcome()) {
              eliminatedPosTags.add(rule.getTag().getCode());
              if (LOG.isTraceEnabled()) {
                LOG.trace("Rule applies. Eliminating posTag: " + rule.getTag().getCode());
              }
            }
          }
          if (eliminatedPosTags.size() > 0) {
            List<Decision> decisionShortList = new ArrayList<Decision>();
            for (Decision decision : decisions) {
              if (!eliminatedPosTags.contains(decision.getOutcome())) {
                decisionShortList.add(decision);
              } else {
                LOG.trace("Eliminating decision: " + decision.toString());
              }
            }
            if (decisionShortList.size() > 0) {
              decisions = decisionShortList;
            } else {
              LOG.debug("All decisions eliminated! Restoring original decisions.");
            }
          }
        }
        // is this a known word in the lexicon?
        if (LOG.isTraceEnabled()) {
          String posTags = "";
          for (PosTag onePosTag : token.getPossiblePosTags()) {
            posTags += onePosTag.getCode() + ",";
          }
          LOG.trace("Token: " + token.getText() + ". PosTags: " + posTags);
        }
        List<Decision> decisionShortList = new ArrayList<Decision>();
        for (Decision decision : decisions) {
          if (decision.getProbability() >= MIN_PROB_TO_STORE) {
            decisionShortList.add(decision);
          }
        }
        if (decisionShortList.size() > 0) {
          decisions = decisionShortList;
        }
      }
      // add one pos-tag sequence to the heap for each outcome provided by the decision maker (or by a rule)
      for (Decision decision : decisions) {
        if (LOG.isTraceEnabled())
          LOG.trace("Outcome: " + decision.getOutcome() + ", " + decision.getProbability());
        PosTaggedToken posTaggedToken = new PosTaggedToken(token, decision, this.sessionId);
        PosTagSequence sequence = new PosTagSequence(history);
        sequence.addPosTaggedToken(posTaggedToken);
        if (decision.isStatistical())
          sequence.addDecision(decision);
        double heapIndex = token.getEndIndex();
        // add 0.5 to the heap index for an empty token, to distinguish it from regular ones
        if (token.getStartIndex() == token.getEndIndex())
          heapIndex += 0.5;
        // if it's the last token, make sure we end on the sentence length
        if (token.getIndex() == sequence.getTokenSequence().size() - 1)
          heapIndex = sentenceLength;
        if (LOG.isTraceEnabled())
          LOG.trace("Heap index: " + heapIndex);
        PriorityQueue<PosTagSequence> heap = heaps.get(heapIndex);
        if (heap == null) {
          heap = new PriorityQueue<PosTagSequence>();
          heaps.put(heapIndex, heap);
        }
        heap.add(sequence);
      } // next outcome for this token
    } // next history
  } // next atomic index
  // return the best sequence on the heap
  List<PosTagSequence> sequences = new ArrayList<PosTagSequence>();
  int i = 0;
  while (!finalHeap.isEmpty()) {
    // clone the pos tag sequences to ensure they don't share any underlying
    // data (e.g. token sequences)
    sequences.add(finalHeap.poll().clonePosTagSequence());
    i++;
    if (i >= this.getBeamWidth())
      break;
  }
  // apply post-processing filters
  if (LOG.isDebugEnabled()) {
    LOG.debug("####Final postag sequences:");
    int j = 1;
    for (PosTagSequence sequence : sequences) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Sequence " + (j++) + ", score=" + df.format(sequence.getScore()));
        LOG.debug("Sequence: " + sequence);
      }
    }
  }
  return sequences;
}
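The PosTagger.POS_TAG_ATTRIBUTE check at the top of the loop lets an upstream annotator force a tag before the statistical model is consulted. A hypothetical sketch reusing the RegexTokenAnnotator API shown in the tests further down; the regex, the tag code "NPP" and the assumption that the sentence-level annotation reaches token.getAttributes() during tokenisation are illustrative, not taken from the source:

// force the pos-tag "NPP" onto tokens matching the pattern (sentence is an existing Sentence);
// tagSentence() will then produce a single decision with authority "tokenAttribute" for them
RegexTokenAnnotator forcedTag = new RegexTokenAnnotator("\\bM\\. [A-Z][a-z]+", null, null, sessionId);
forcedTag.addAttribute(PosTagger.POS_TAG_ATTRIBUTE, new StringAttribute(PosTagger.POS_TAG_ATTRIBUTE, "NPP"));
forcedTag.annotate(sentence);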
use of com.joliciel.talismane.tokeniser.StringAttribute in project talismane by joliciel-informatique.
the class PosTaggedToken method getLemma.
/**
 * This pos-tagged token's lemma, or null if no lemma found.<br>
 * If there are multiple lexical entries, the first one's lemma is returned.<br>
 * If all possible lemmas are required, they need to be retrieved from
 * {@link #getLexicalEntries()}.
 */
public String getLemma() {
  if (!this.lemmaFetched) {
    String lemmaType = null;
    StringAttribute lemmaTypeAttribute = (StringAttribute) this.getToken().getAttributes().get(PosTagger.LEMMA_TYPE_ATTRIBUTE);
    if (lemmaTypeAttribute != null)
      lemmaType = lemmaTypeAttribute.getValue();
    String explicitLemma = null;
    StringAttribute explicitLemmaAttribute = (StringAttribute) this.getToken().getAttributes().get(PosTagger.LEMMA_ATTRIBUTE);
    if (explicitLemmaAttribute != null)
      explicitLemma = explicitLemmaAttribute.getValue();
    if (explicitLemma != null) {
      this.lemma = explicitLemma;
    } else if (lemmaType != null && lemmaType.equals("originalLower")) {
      this.lemma = this.getToken().getOriginalText().toLowerCase(TalismaneSession.get(sessionId).getLocale());
    } else if (this.getLexicalEntries().size() > 0) {
      this.lemma = this.getLexicalEntries().get(0).getLemma();
    }
    this.lemmaFetched = true;
  }
  return this.lemma;
}
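An explicit lemma can be forced through token attributes in the same way; a hypothetical sketch (the regex and the lemma value are invented, and how the attribute reaches the token depends on the annotators configured upstream):

// attach an explicit lemma, so that getLemma() returns "madame" without consulting the lexicon
RegexTokenAnnotator lemmaFilter = new RegexTokenAnnotator("\\bMmes\\b", null, null, sessionId);
lemmaFilter.addAttribute(PosTagger.LEMMA_ATTRIBUTE, new StringAttribute(PosTagger.LEMMA_ATTRIBUTE, "madame"));

Setting PosTagger.LEMMA_TYPE_ATTRIBUTE to "originalLower" instead would make getLemma() lower-case the original token text using the session locale.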
use of com.joliciel.talismane.tokeniser.StringAttribute in project talismane by joliciel-informatique.
the class RegexTokenAnnotatorTest method testPuctuation.
@Test
public void testPuctuation() throws Exception {
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  final Config config = ConfigFactory.load();
  final String sessionId = "test";
  String regex = "[\\p{IsPunctuation}&&[^%$#@§¶‰‱]]+";
  String replacement = null;
  RegexTokenAnnotator filter = new RegexTokenAnnotator(regex, replacement, null, sessionId);
  filter.addAttribute("featureType", new StringAttribute("featureType", "punctuation"));
  Sentence text = new Sentence("Bonjour. Comment ça va?", sessionId);
  filter.annotate(text);
  @SuppressWarnings("rawtypes")
  List<Annotation<TokenAttribute>> annotations = text.getAnnotations(TokenAttribute.class);
  LOG.debug(annotations.toString());
  assertEquals(2, annotations.size());
  @SuppressWarnings("rawtypes")
  Annotation<TokenAttribute> placeholder = annotations.get(0);
  assertEquals("Bonjour".length(), placeholder.getStart());
  assertEquals("Bonjour.".length(), placeholder.getEnd());
  assertEquals("featureType", placeholder.getData().getKey());
  assertEquals("punctuation", placeholder.getData().getValue());
}
use of com.joliciel.talismane.tokeniser.StringAttribute in project talismane by joliciel-informatique.
the class RegexTokenAnnotatorTest method testStartOfInput.
@Test
public void testStartOfInput() throws Exception {
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  final Config config = ConfigFactory.load();
  final String sessionId = "test";
  String regex = "^Résumé\\.";
  String replacement = null;
  RegexTokenAnnotator filter = new RegexTokenAnnotator(regex, replacement, null, sessionId);
  filter.addAttribute("TAG", new StringAttribute("TAG", "skip"));
  Sentence text = new Sentence("Résumé. Résumé des attaques", sessionId);
  filter.annotate(text);
  @SuppressWarnings("rawtypes")
  List<Annotation<TokenAttribute>> annotations = text.getAnnotations(TokenAttribute.class);
  LOG.debug(annotations.toString());
  assertEquals(1, annotations.size());
  @SuppressWarnings("rawtypes")
  Annotation<TokenAttribute> placeholder = annotations.get(0);
  assertEquals(0, placeholder.getStart());
  assertEquals(7, placeholder.getEnd());
  assertEquals("TAG", placeholder.getData().getKey());
}