use of com.joliciel.talismane.posTagger.PosTaggedToken in project talismane by joliciel-informatique.
the class HistorySearchFeature method checkInternal.
/**
 * Scans the pos-tagging history for the first token satisfying {@code criterion}.
 * <p>
 * The scan starts at the index given by {@code startIndexFeature} (by default
 * the index of the current token minus one) and ends at the index given by
 * {@code endIndexFeature} (by default 0). Both bounds are clamped to the
 * history's valid index range. The scan runs forwards when the end index is
 * greater than the start index, and backwards otherwise. It is abandoned as
 * soon as a non-matching token satisfies {@code stopCriterion}, if one is set.
 *
 * @param context the current pos-tagger context, providing the token and history
 * @param env the runtime environment passed on to the nested features
 * @return a result wrapping the first matching token, or {@code null} if an
 *         index feature yields no result or no token matches
 * @throws TalismaneException declared by the method signature; nested feature
 *         checks presumably raise it
 */
@Override
public FeatureResult<PosTaggedTokenWrapper> checkInternal(PosTaggerContext context, RuntimeEnvironment env) throws TalismaneException {
  int from = context.getToken().getIndex() - 1;
  int to = 0;

  if (startIndexFeature != null) {
    FeatureResult<Integer> fromResult = startIndexFeature.check(context, env);
    if (fromResult == null) {
      return null;
    }
    from = fromResult.getOutcome();
  }

  if (endIndexFeature != null) {
    FeatureResult<Integer> toResult = endIndexFeature.check(context, env);
    if (toResult == null) {
      return null;
    }
    to = toResult.getOutcome();
  }

  // Clamp both bounds to [0, size - 1]; for an empty history both become -1,
  // which the loop conditions below reject.
  int historySize = context.getHistory().size();
  from = Math.min(Math.max(from, 0), historySize - 1);
  to = Math.min(Math.max(to, 0), historySize - 1);

  boolean forward = to > from;
  int increment = forward ? 1 : -1;

  for (int position = from; forward ? (position < historySize && position <= to) : (position >= 0 && position >= to); position += increment) {
    PosTaggedToken candidate = context.getHistory().get(position);

    FeatureResult<Boolean> matchResult = this.criterion.check(candidate, env);
    if (matchResult != null && matchResult.getOutcome()) {
      return this.generateResult(candidate);
    }

    if (stopCriterion != null) {
      FeatureResult<Boolean> stopResult = this.stopCriterion.check(candidate, env);
      if (stopResult != null && stopResult.getOutcome()) {
        // stop condition reached before any match
        return null;
      }
    }
  }

  return null;
}
use of com.joliciel.talismane.posTagger.PosTaggedToken in project talismane by joliciel-informatique.
the class LemmaFeature method checkInternal.
/**
 * Returns the lemma of the first lexical entry attached to the pos-tagged
 * token addressed by this feature.
 *
 * @param context the context from which the token is resolved
 * @param env the runtime environment used when resolving the token
 * @return a result holding the lemma, or {@code null} when no token can be
 *         resolved or the token has no lexical entries
 * @throws TalismaneException declared by the method signature; token
 *         resolution presumably raises it
 */
@Override
protected FeatureResult<String> checkInternal(T context, RuntimeEnvironment env) throws TalismaneException {
  PosTaggedTokenWrapper wrapper = this.getToken(context, env);
  if (wrapper == null) {
    return null;
  }

  PosTaggedToken taggedToken = wrapper.getPosTaggedToken();
  if (taggedToken == null) {
    return null;
  }

  List<LexicalEntry> entries = taggedToken.getLexicalEntries();
  if (entries.isEmpty()) {
    return null;
  }

  // only the first lexical entry is consulted
  return this.generateResult(entries.get(0).getLemma());
}
use of com.joliciel.talismane.posTagger.PosTaggedToken in project talismane by joliciel-informatique.
the class ParseFeatureTester method onNextParseConfiguration.
/**
 * Replays the transitions of a parse configuration one at a time and, for
 * each intermediate configuration, records which feature results were
 * produced together with the transition that was applied next.
 * <p>
 * Each step appends a textual rendering of the sentence to
 * {@code featureResultMap}, keyed first by the feature result's string form
 * and then by the transition code. In that rendering the tokens at the head
 * of the stack and of the buffer are wrapped in {@code #[ ]#}.
 *
 * @param parseConfiguration the configuration whose transitions are replayed
 * @throws TalismaneException declared by the method signature; feature checks
 *         and transition application presumably raise it
 */
@Override
public void onNextParseConfiguration(ParseConfiguration parseConfiguration) throws TalismaneException {
  ParseConfiguration replayConfiguration = new ParseConfiguration(parseConfiguration.getPosTagSequence());

  for (Transition transition : parseConfiguration.getTransitions()) {
    // Render the sentence, highlighting the top of the stack and the head of the buffer.
    StringBuilder rendering = new StringBuilder();
    for (PosTaggedToken taggedToken : replayConfiguration.getPosTagSequence()) {
      boolean highlighted = taggedToken.equals(replayConfiguration.getStack().getFirst())
          || taggedToken.equals(replayConfiguration.getBuffer().getFirst());
      String tokenAndTag = taggedToken.getToken().getOriginalText().replace(' ', '_') + "/" + taggedToken.getTag().toString();
      if (highlighted) {
        rendering.append(" #[" + tokenAndTag + "]#");
      } else {
        rendering.append(" " + tokenAndTag);
      }
    }
    rendering.append(" ## Line: " + parseConfiguration.getSentence().getStartLineNumber());
    String sentenceRendering = rendering.toString();

    if (LOG.isTraceEnabled()) {
      LOG.trace(sentenceRendering);
    }

    // Evaluate every feature against the current configuration, keeping the non-null results.
    List<FeatureResult<?>> results = new ArrayList<FeatureResult<?>>();
    for (ParseConfigurationFeature<?> parseFeature : parseFeatures) {
      RuntimeEnvironment env = new RuntimeEnvironment();
      FeatureResult<?> result = parseFeature.check(replayConfiguration, env);
      if (result == null) {
        continue;
      }
      results.add(result);
      if (LOG.isTraceEnabled()) {
        LOG.trace(result.toString());
      }
    }

    // File the rendering under each feature result and the transition's code.
    String classification = transition.getCode();
    for (FeatureResult<?> result : results) {
      String resultKey = result.toString();
      Map<String, List<String>> byClassification = featureResultMap.get(resultKey);
      if (byClassification == null) {
        byClassification = new TreeMap<String, List<String>>();
        featureResultMap.put(resultKey, byClassification);
      }
      List<String> renderings = byClassification.get(classification);
      if (renderings == null) {
        renderings = new ArrayList<String>();
        byClassification.put(classification, renderings);
      }
      renderings.add(sentenceRendering);
    }

    // apply the transition to a fresh copy, which becomes the configuration for the next step
    replayConfiguration = new ParseConfiguration(replayConfiguration);
    transition.apply(replayConfiguration);
  }
}
use of com.joliciel.talismane.posTagger.PosTaggedToken in project talismane by joliciel-informatique.
the class ParseOutputRewriter method getCorpusLines.
/**
 * Converts a parse configuration into corpus lines, one per non-root token,
 * and then applies the split actions of any matching rewrite rules.
 * <p>
 * A rewrite rule matches a line when all of its conditions match; if several
 * rules with a split action match the same line, the last one is used. When at
 * least one line is split, all indexes (token, governor and non-projective
 * governor) are renumbered to account for the inserted lines.
 * <p>
 * Within a split, conditional actions are organised in groups, each ended by a
 * default action. Only the first matching action of a group is applied, and
 * the default is applied when no other action in the group matched. A
 * conditional action whose relative index points outside the sentence, or
 * whose tested element has no value, is treated as not matching.
 *
 * @param parseConfiguration the configuration to convert
 * @return the corpus lines, after splitting if any split action applied
 * @throws TalismaneException if a rewrite rule has a condition on a corpus
 *         element other than POSTAG, TOKEN, LEMMA or LABEL
 */
List<CorpusLine> getCorpusLines(ParseConfiguration parseConfiguration) throws TalismaneException {
  // first convert the parse configuration to a list of corpus lines
  List<CorpusLine> corpusLines = new ArrayList<>();
  for (PosTaggedToken posTaggedToken : parseConfiguration.getPosTagSequence()) {
    if (posTaggedToken.isRoot())
      continue;

    DependencyArc arc = parseConfiguration.getGoverningDependency(posTaggedToken);
    DependencyArc nonProjArc = parseConfiguration.getGoverningDependency(posTaggedToken, false);
    String line = posTaggedToken.getIndex() + "\t" + posTaggedToken.getToken().getOriginalText() + "\t" + posTaggedToken.getLemmaForCoNLL() + "\t" + posTaggedToken.getTag().getCode() + "\t" + posTaggedToken.getTag().getCode() + "\t" + posTaggedToken.getMorphologyForCoNLL() + "\t" + (arc != null ? arc.getHead().getIndex() : 0) + "\t" + (arc != null ? arc.getLabel() : "_");
    CorpusLine corpusLine = new CorpusLine(line, posTaggedToken.getToken().getLineNumber());
    corpusLine.setIndex(posTaggedToken.getIndex());
    corpusLine.setToken(posTaggedToken.getToken().getOriginalText());
    corpusLine.setLemma(posTaggedToken.getLemmaForCoNLL());
    corpusLine.setPosTag(posTaggedToken.getTag().getCode());
    String morphology = posTaggedToken.getMorphologyForCoNLL();
    corpusLine.setMorphology(morphology.length() == 0 ? "_" : morphology);
    // a token without a governing arc gets governor index 0 and label "_"
    corpusLine.setGovernorIndex(arc != null ? arc.getHead().getIndex() : 0);
    corpusLine.setLabel(arc != null ? arc.getLabel() : "_");
    corpusLine.setNonProjGovernorIndex(nonProjArc != null ? nonProjArc.getHead().getIndex() : 0);
    corpusLine.setNonProjLabel(nonProjArc != null ? nonProjArc.getLabel() : "_");
    if (posTaggedToken.getToken().getPrecedingRawOutput() != null)
      corpusLine.setElement(CorpusElement.PRECEDING_RAW_OUTPUT, posTaggedToken.getToken().getPrecedingRawOutput());
    if (posTaggedToken.getToken().getTrailingRawOutput() != null)
      corpusLine.setElement(CorpusElement.TRAILING_RAW_OUTPUT, posTaggedToken.getToken().getTrailingRawOutput());
    corpusLine.setTokenProbability(posTaggedToken.getToken().getProbability());
    corpusLine.setPosTagProbability(posTaggedToken.getProbability());
    if (arc != null)
      corpusLine.setParseProbability(arc.getProbability());
    corpusLines.add(corpusLine);
  }

  // find the lines to be split; when several rules match a line, the last one wins
  Map<CorpusLine, SplitAction> splitActions = new HashMap<>();
  for (CorpusLine corpusLine : corpusLines) {
    if (LOG.isDebugEnabled())
      LOG.debug(corpusLine.toString());
    for (RewriteRule rewriteRule : rewriteRules) {
      if (this.matchesRewriteRule(rewriteRule, corpusLine) && rewriteRule.action instanceof SplitAction) {
        splitActions.put(corpusLine, (SplitAction) rewriteRule.action);
      }
    }
  }

  if (splitActions.isEmpty())
    return corpusLines;

  // map each old index to its new index, leaving room for the lines inserted by splits;
  // index 0 (no governor) is never shifted
  Map<Integer, Integer> oldToNewIndexMap = new HashMap<>();
  oldToNewIndexMap.put(0, 0);
  int currentIndex = 1;
  for (int i = 0; i < corpusLines.size(); i++) {
    oldToNewIndexMap.put(i + 1, currentIndex);
    SplitAction splitAction = splitActions.get(corpusLines.get(i));
    currentIndex += splitAction != null ? splitAction.elementValues.size() : 1;
  }

  List<CorpusLine> newCorpusLines = new ArrayList<>();
  for (int i = 0; i < corpusLines.size(); i++) {
    CorpusLine corpusLine = corpusLines.get(i);
    CorpusLine newCorpusLine = corpusLine.cloneCorpusLine();
    newCorpusLine.setIndex(oldToNewIndexMap.get(corpusLine.getIndex()));
    newCorpusLine.setGovernorIndex(oldToNewIndexMap.get(corpusLine.getGovernorIndex()));
    newCorpusLine.setNonProjGovernorIndex(oldToNewIndexMap.get(corpusLine.getNonProjGovernorIndex()));

    SplitAction splitAction = splitActions.get(corpusLine);
    if (splitAction == null) {
      // line is not split: keep the renumbered copy
      newCorpusLines.add(newCorpusLine);
      continue;
    }

    for (int j = 0; j < splitAction.elementValues.size(); j++) {
      CorpusLine splitCorpusLine = new CorpusLine(corpusLine.getLine(), corpusLine.getLineNumber());
      splitCorpusLine.setIndex(oldToNewIndexMap.get(corpusLine.getIndex()) + j);
      Map<CorpusElement, String> elementValues = splitAction.elementValues.get(j);
      this.setElementValues(elementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);

      // The first matching element in each group will be applied.
      // The default element marks the end of each group, and will be
      // applied if no other match has applied.
      List<ConditionalAction> conditionalActions = splitAction.conditionalValues.get(j);
      boolean groupHasMatch = false;
      for (ConditionalAction conditionalAction : conditionalActions) {
        if (conditionalAction.isDefault) {
          if (!groupHasMatch) {
            this.setElementValues(conditionalAction.elementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
          }
          // The default action marks the end of each matching group.
          groupHasMatch = false;
        } else if (!groupHasMatch) {
          // Only look up the base line for non-default actions, and only if it
          // exists: a condition on a line outside the sentence cannot match.
          int baseIndex = i + conditionalAction.relativeIndex;
          boolean inSentence = baseIndex >= 0 && baseIndex < corpusLines.size();
          if (inSentence && this.matchesConditionalAction(conditionalAction, corpusLines.get(baseIndex))) {
            this.setElementValues(conditionalAction.elementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
            groupHasMatch = true;
          }
        }
      }
      newCorpusLines.add(splitCorpusLine);
    }
  }
  return newCorpusLines;
}

/**
 * Returns true if every condition of the rewrite rule fully matches the
 * corresponding element of the corpus line.
 *
 * @throws TalismaneException if a condition concerns an element other than
 *         POSTAG, TOKEN, LEMMA or LABEL
 */
private boolean matchesRewriteRule(RewriteRule rewriteRule, CorpusLine corpusLine) throws TalismaneException {
  for (Map.Entry<CorpusElement, Pattern> condition : rewriteRule.conditions.entrySet()) {
    CorpusElement corpusElement = condition.getKey();
    Pattern pattern = condition.getValue();
    if (LOG.isTraceEnabled())
      LOG.trace("For " + corpusElement.name() + ", matching " + pattern.pattern());

    String value;
    switch (corpusElement) {
    case POSTAG:
      value = corpusLine.getPosTag();
      break;
    case TOKEN:
      value = corpusLine.getToken();
      break;
    case LEMMA:
      value = corpusLine.getLemma();
      break;
    case LABEL:
      value = corpusLine.getLabel();
      break;
    default:
      throw new TalismaneException(ParseOutputRewriter.class.getSimpleName() + " cannot match on " + corpusElement.name());
    }

    if (!pattern.matcher(value).matches()) {
      if (LOG.isTraceEnabled())
        LOG.trace("Match failed for " + value);
      return false;
    }
  }
  return true;
}

/**
 * Returns true if every condition of the conditional action fully matches the
 * corresponding element of the base line; an element without a value never matches.
 */
private boolean matchesConditionalAction(ConditionalAction conditionalAction, CorpusLine baseLine) throws TalismaneException {
  for (Map.Entry<CorpusElement, Pattern> condition : conditionalAction.conditions.entrySet()) {
    String origValue = baseLine.getElement(condition.getKey());
    if (origValue == null || !condition.getValue().matcher(origValue).matches()) {
      return false;
    }
  }
  return true;
}
use of com.joliciel.talismane.posTagger.PosTaggedToken in project talismane by joliciel-informatique.
the class TransitionLogWriter method onParseEnd.
/**
 * Decides whether the first guessed configuration should be logged, and if so
 * hands it to {@link #onNextParseConfiguration}.
 * <p>
 * When no error labels are configured, the configuration is always logged.
 * Otherwise it is logged only if at least one reference arc carrying one of
 * the error labels was guessed wrongly, that is, the guessed arc is missing,
 * has a different label or has a different head. In that case a blank line
 * and two summary lines (reference, then guess) are written first. In those
 * lines wrongly guessed tokens are wrapped in {@code #...#} with their
 * governor and label, and their governors are wrapped in {@code #...#}.
 *
 * @param refConfiguration the reference configuration
 * @param guessedConfigurations the guessed configurations; only the first is used
 * @throws TalismaneException declared by the method signature; the
 *         configuration lookups and {@code onNextParseConfiguration} presumably raise it
 * @throws IOException if writing to the writer fails
 */
@Override
public void onParseEnd(ParseConfiguration refConfiguration, List<ParseConfiguration> guessedConfigurations) throws TalismaneException, IOException {
  boolean includeMe = true;
  if (errorLabels != null && !errorLabels.isEmpty()) {
    includeMe = false;
    ParseConfiguration guessConfiguration = guessedConfigurations.get(0);
    Set<PosTaggedToken> refTokensToExplain = new HashSet<PosTaggedToken>();
    Set<PosTaggedToken> guessTokensToExplain = new HashSet<PosTaggedToken>();
    Set<PosTaggedToken> refTokensToHighlight = new HashSet<PosTaggedToken>();
    Set<PosTaggedToken> guessTokensToHighlight = new HashSet<PosTaggedToken>();

    int i = 0;
    for (PosTaggedToken refToken : refConfiguration.getPosTagSequence()) {
      // the first token of the sequence is never examined
      if (i != 0) {
        DependencyArc refArc = refConfiguration.getGoverningDependency(refToken);
        if (refArc != null && errorLabels.contains(refArc.getLabel())) {
          PosTaggedToken guessToken = guessConfiguration.getPosTagSequence().get(i);
          DependencyArc guessArc = guessConfiguration.getGoverningDependency(guessToken);
          if (guessArc == null || !refArc.getLabel().equals(guessArc.getLabel()) || !sameHead(refArc.getHead(), guessArc.getHead())) {
            refTokensToExplain.add(refToken);
            if (refArc.getHead() != null)
              refTokensToHighlight.add(refArc.getHead());
            guessTokensToExplain.add(guessToken);
            if (guessArc != null && guessArc.getHead() != null)
              guessTokensToHighlight.add(guessArc.getHead());
            includeMe = true;
          }
        }
      }
      i++;
    }

    if (includeMe) {
      // build both lines before writing, so nothing is written if building fails
      String refLine = this.describeSequenceForLog(refConfiguration, refTokensToExplain, refTokensToHighlight);
      String guessLine = this.describeSequenceForLog(guessConfiguration, guessTokensToExplain, guessTokensToHighlight);
      writer.write("\n");
      writer.write(refLine + "\n");
      writer.write(guessLine + "\n");
    }
  }
  if (includeMe)
    this.onNextParseConfiguration(guessedConfigurations.get(0));
}

/**
 * Returns true if both heads are absent, or both are present with the same index.
 */
private static boolean sameHead(PosTaggedToken refHead, PosTaggedToken guessHead) {
  if (refHead == null || guessHead == null)
    return refHead == null && guessHead == null;
  return refHead.getIndex() == guessHead.getIndex();
}

/**
 * Renders one summary line for a configuration: explained tokens with their
 * governor and label, highlighted tokens wrapped in {@code #...#}, others plain.
 */
private String describeSequenceForLog(ParseConfiguration configuration, Set<PosTaggedToken> tokensToExplain, Set<PosTaggedToken> tokensToHighlight) throws TalismaneException {
  StringBuilder builder = new StringBuilder();
  for (PosTaggedToken token : configuration.getPosTagSequence()) {
    String base = token.getToken().getOriginalText().replace(' ', '_') + "|" + token.getTag().getCode() + "|" + token.getIndex();
    if (tokensToExplain.contains(token)) {
      DependencyArc arc = configuration.getGoverningDependency(token);
      if (arc == null)
        builder.append("#" + base + "|Gov0|null# ");
      else
        builder.append("#" + base + "|Gov" + (arc.getHead() == null ? 0 : arc.getHead().getIndex()) + "|" + arc.getLabel() + "# ");
    } else if (tokensToHighlight.contains(token)) {
      builder.append("#" + base + "# ");
    } else {
      builder.append(base + " ");
    }
  }
  return builder.toString();
}
Aggregations