use of com.joliciel.talismane.corpus.CorpusLine.CorpusElement in project talismane by joliciel-informatique.
the class CorpusRule method apply.
/**
 * Tests this rule against a corpus line and, when it matches, records the
 * rule's actions in the supplied map. The rule matches only if the line has
 * every element named in the criteria and each of those values fully matches
 * the corresponding pattern. An element that already has an entry in the map
 * is never overwritten, so when several rules are applied in turn, the first
 * matching rule to update a given element decides its value.
 *
 * @param corpusLine
 *          the line to test against this rule's criteria
 * @param values
 *          accumulates the pending updates, keyed by the element to update
 */
public void apply(CorpusLine corpusLine, Map<CorpusElement, String> values) {
  // Bail out on the first criterion that is absent from the line or fails to match.
  for (Map.Entry<CorpusElement, Pattern> criterion : criteria.entrySet()) {
    CorpusElement element = criterion.getKey();
    if (!corpusLine.hasElement(element)) {
      return;
    }
    String current = corpusLine.getElement(element);
    if (!criterion.getValue().matcher(current).matches()) {
      return;
    }
  }

  if (LOG.isTraceEnabled()) {
    LOG.trace("Rule " + this.toString() + " matched line " + corpusLine);
  }

  // Only fill in elements nobody has claimed yet.
  for (Map.Entry<CorpusElement, String> action : actions.entrySet()) {
    CorpusElement target = action.getKey();
    if (!values.containsKey(target)) {
      values.put(target, action.getValue());
    }
  }
}
use of com.joliciel.talismane.corpus.CorpusLine.CorpusElement in project talismane by joliciel-informatique.
the class ParseOutputRewriter method getCorpusLines.
/**
 * Converts a parse configuration into corpus lines, one per non-root token,
 * and then applies the split actions of any matching rewrite rules.
 * <p>
 * When at least one line is split, all indexes are renumbered: a split line
 * is replaced by as many lines as its split action has element-value maps,
 * and governor references are remapped to the first line produced from the
 * governor's original line. When no line is split, the unmodified lines are
 * returned.
 *
 * @param parseConfiguration
 *          the parse to convert
 * @return the corpus lines, after any splits have been applied
 * @throws TalismaneException
 *           if a rewrite rule has a condition on an element other than
 *           POSTAG, TOKEN, LEMMA or LABEL, or if an element value cannot be
 *           applied
 */
List<CorpusLine> getCorpusLines(ParseConfiguration parseConfiguration) throws TalismaneException {
  // first convert the parse configuration to a list of corpus lines
  List<CorpusLine> corpusLines = new ArrayList<>();
  for (PosTaggedToken posTaggedToken : parseConfiguration.getPosTagSequence()) {
    if (!posTaggedToken.isRoot()) {
      corpusLines.add(this.buildCorpusLine(parseConfiguration, posTaggedToken));
    }
  }

  // find the lines which need to be split; if several rules match the same
  // line, the last matching split action is retained
  Map<CorpusLine, SplitAction> splitActions = new HashMap<>();
  for (CorpusLine corpusLine : corpusLines) {
    if (LOG.isDebugEnabled())
      LOG.debug(corpusLine.toString());
    for (RewriteRule rewriteRule : rewriteRules) {
      if (this.rewriteRuleMatches(rewriteRule, corpusLine) && rewriteRule.action instanceof SplitAction) {
        splitActions.put(corpusLine, (SplitAction) rewriteRule.action);
      }
    }
  }

  if (splitActions.isEmpty()) {
    return corpusLines;
  }

  // map each old index to the first new index it will occupy;
  // index 0 (the root) always stays at 0
  Map<Integer, Integer> oldToNewIndexMap = new HashMap<>();
  oldToNewIndexMap.put(0, 0);
  int currentIndex = 1;
  for (int i = 0; i < corpusLines.size(); i++) {
    oldToNewIndexMap.put(i + 1, currentIndex);
    SplitAction splitAction = splitActions.get(corpusLines.get(i));
    if (splitAction != null) {
      currentIndex += splitAction.elementValues.size();
    } else {
      currentIndex++;
    }
  }

  List<CorpusLine> newCorpusLines = new ArrayList<>();
  for (int i = 0; i < corpusLines.size(); i++) {
    CorpusLine corpusLine = corpusLines.get(i);

    // copy of the original line with all indexes translated to the new numbering
    CorpusLine newCorpusLine = corpusLine.cloneCorpusLine();
    newCorpusLine.setIndex(oldToNewIndexMap.get(corpusLine.getIndex()));
    newCorpusLine.setGovernorIndex(oldToNewIndexMap.get(corpusLine.getGovernorIndex()));
    newCorpusLine.setNonProjGovernorIndex(oldToNewIndexMap.get(corpusLine.getNonProjGovernorIndex()));

    SplitAction splitAction = splitActions.get(corpusLine);
    if (splitAction == null) {
      // line is not split
      newCorpusLines.add(newCorpusLine);
      continue;
    }

    for (int j = 0; j < splitAction.elementValues.size(); j++) {
      CorpusLine splitCorpusLine = new CorpusLine(corpusLine.getLine(), corpusLine.getLineNumber());
      splitCorpusLine.setIndex(oldToNewIndexMap.get(corpusLine.getIndex()) + j);
      this.setElementValues(splitAction.elementValues.get(j), oldToNewIndexMap, newCorpusLine, splitCorpusLine);

      // The first matching element in each group will be applied.
      // The default element marks the end of each group, and will be
      // applied if no other match has applied.
      boolean groupHasMatch = false;
      for (ConditionalAction conditionalAction : splitAction.conditionalValues.get(j)) {
        if (conditionalAction.isDefault) {
          if (!groupHasMatch) {
            this.setElementValues(conditionalAction.elementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
          }
          // The default action marks the end of each matching group.
          groupHasMatch = false;
        } else if (!groupHasMatch && this.conditionalActionMatches(conditionalAction, corpusLines, i)) {
          // only the first match in a group is applied; later actions in the
          // same group are skipped until the default closes the group
          this.setElementValues(conditionalAction.elementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
          groupHasMatch = true;
        }
      }
      newCorpusLines.add(splitCorpusLine);
    }
  }
  return newCorpusLines;
}

/**
 * Builds the corpus line describing a single non-root token of the parse.
 */
private CorpusLine buildCorpusLine(ParseConfiguration parseConfiguration, PosTaggedToken posTaggedToken) throws TalismaneException {
  DependencyArc arc = parseConfiguration.getGoverningDependency(posTaggedToken);
  DependencyArc nonProjArc = parseConfiguration.getGoverningDependency(posTaggedToken, false);
  String line = posTaggedToken.getIndex() + "\t" + posTaggedToken.getToken().getOriginalText() + "\t" + posTaggedToken.getLemmaForCoNLL() + "\t" + posTaggedToken.getTag().getCode() + "\t" + posTaggedToken.getTag().getCode() + "\t" + posTaggedToken.getMorphologyForCoNLL() + "\t" + (arc != null ? arc.getHead().getIndex() : 0) + "\t" + (arc != null ? arc.getLabel() : "_");

  CorpusLine corpusLine = new CorpusLine(line, posTaggedToken.getToken().getLineNumber());
  corpusLine.setIndex(posTaggedToken.getIndex());
  corpusLine.setToken(posTaggedToken.getToken().getOriginalText());
  corpusLine.setLemma(posTaggedToken.getLemmaForCoNLL());
  corpusLine.setPosTag(posTaggedToken.getTag().getCode());
  String morphology = posTaggedToken.getMorphologyForCoNLL();
  corpusLine.setMorphology(morphology.length() == 0 ? "_" : morphology);

  // a token without a governing arc is attached to index 0 with label "_"
  corpusLine.setGovernorIndex(arc != null ? arc.getHead().getIndex() : 0);
  corpusLine.setLabel(arc != null ? arc.getLabel() : "_");
  corpusLine.setNonProjGovernorIndex(nonProjArc != null ? nonProjArc.getHead().getIndex() : 0);
  corpusLine.setNonProjLabel(nonProjArc != null ? nonProjArc.getLabel() : "_");

  if (posTaggedToken.getToken().getPrecedingRawOutput() != null) {
    corpusLine.setElement(CorpusElement.PRECEDING_RAW_OUTPUT, posTaggedToken.getToken().getPrecedingRawOutput());
  }
  if (posTaggedToken.getToken().getTrailingRawOutput() != null) {
    corpusLine.setElement(CorpusElement.TRAILING_RAW_OUTPUT, posTaggedToken.getToken().getTrailingRawOutput());
  }

  corpusLine.setTokenProbability(posTaggedToken.getToken().getProbability());
  corpusLine.setPosTagProbability(posTaggedToken.getProbability());
  if (arc != null) {
    corpusLine.setParseProbability(arc.getProbability());
  }
  return corpusLine;
}

/**
 * Returns true if every condition of the rewrite rule fully matches the
 * corresponding value of the corpus line.
 */
private boolean rewriteRuleMatches(RewriteRule rewriteRule, CorpusLine corpusLine) throws TalismaneException {
  for (Map.Entry<CorpusElement, Pattern> condition : rewriteRule.conditions.entrySet()) {
    CorpusElement corpusElement = condition.getKey();
    Pattern pattern = condition.getValue();
    if (LOG.isTraceEnabled())
      LOG.trace("For " + corpusElement.name() + ", matching " + pattern.pattern());

    String value;
    switch (corpusElement) {
    case POSTAG:
      value = corpusLine.getPosTag();
      break;
    case TOKEN:
      value = corpusLine.getToken();
      break;
    case LEMMA:
      value = corpusLine.getLemma();
      break;
    case LABEL:
      value = corpusLine.getLabel();
      break;
    default:
      throw new TalismaneException(ParseOutputRewriter.class.getSimpleName() + " cannot match on " + corpusElement.name());
    }

    if (!pattern.matcher(value).matches()) {
      if (LOG.isTraceEnabled())
        LOG.trace("Match failed for " + value);
      return false;
    }
  }
  return true;
}

/**
 * Returns true if every condition of the conditional action fully matches
 * the line found at the action's relative index from the line being split.
 * A relative index pointing outside the list of lines, or a condition on an
 * element for which the base line has no value, counts as a non-match
 * rather than raising an exception.
 */
private boolean conditionalActionMatches(ConditionalAction conditionalAction, List<CorpusLine> corpusLines, int splitLineIndex) throws TalismaneException {
  int baseIndex = splitLineIndex + conditionalAction.relativeIndex;
  if (baseIndex < 0 || baseIndex >= corpusLines.size()) {
    return false;
  }
  CorpusLine baseLine = corpusLines.get(baseIndex);
  for (Map.Entry<CorpusElement, Pattern> condition : conditionalAction.conditions.entrySet()) {
    String origValue = baseLine.getElement(condition.getKey());
    if (origValue == null || !condition.getValue().matcher(origValue).matches()) {
      return false;
    }
  }
  return true;
}
use of com.joliciel.talismane.corpus.CorpusLine.CorpusElement in project talismane by joliciel-informatique.
the class ParseOutputRewriter method setElementValues.
/**
 * Copies a set of configured element values onto a split corpus line. Each
 * value is interpreted in one of three ways:
 * <ul>
 * <li>the literal <code>${orig}</code> copies the same element from
 * <code>origLine</code>;</li>
 * <li>a value fully matching <code>linePattern</code> is a relative line
 * reference: its first group is parsed as an integer <i>n</i> and the
 * element is set to index <code>origLine.getIndex() + n - 1</code>, which is
 * only supported for GOVERNOR and NON_PROJ_GOVERNOR;</li>
 * <li>any other value is assigned as is.</li>
 * </ul>
 *
 * @param elementValues
 *          the configured values, keyed by the element to set
 * @param oldToNewIndexMap
 *          not consulted by this method
 * @param origLine
 *          the line supplying <code>${orig}</code> values and the base index
 *          for relative line references
 * @param splitCorpusLine
 *          the line to update
 * @throws TalismaneException
 *           if a relative line reference is given for an element other than
 *           GOVERNOR or NON_PROJ_GOVERNOR
 */
private void setElementValues(Map<CorpusElement, String> elementValues, Map<Integer, Integer> oldToNewIndexMap, CorpusLine origLine, CorpusLine splitCorpusLine) throws TalismaneException {
  for (Map.Entry<CorpusElement, String> entry : elementValues.entrySet()) {
    CorpusElement key = entry.getKey();
    String elementValue = entry.getValue();

    // copy straight from the original line
    if (elementValue.equals("${orig}")) {
      splitCorpusLine.setElement(key, origLine.getElement(key));
      continue;
    }

    // plain literal value
    Matcher matcher = linePattern.matcher(elementValue);
    if (!matcher.matches()) {
      splitCorpusLine.setElement(key, elementValue);
      continue;
    }

    // relative line reference: resolve against the original line's index
    int lineNumber = Integer.parseInt(matcher.group(1));
    int equivalentIndex = origLine.getIndex() + lineNumber - 1;
    switch (key) {
    case GOVERNOR:
      splitCorpusLine.setGovernorIndex(equivalentIndex);
      break;
    case NON_PROJ_GOVERNOR:
      splitCorpusLine.setNonProjGovernorIndex(equivalentIndex);
      break;
    default:
      throw new TalismaneException("element value '" + elementValue + "' not supported for corpus element " + key.name());
    }
  }
}
use of com.joliciel.talismane.corpus.CorpusLine.CorpusElement in project talismane by joliciel-informatique.
the class CorpusLineReader method read.
/**
 * Parses a single corpus line into a {@link CorpusLine}.
 * <p>
 * The line must fully match this reader's pattern. For every corpus element
 * that has a placeholder index, the corresponding group is extracted: TOKEN
 * and LEMMA values are converted by the session's CoNLL formatter, while for
 * any other element a value of "_" is read as the empty string. If a lexical
 * entry reader is present, a lexical entry is read from the raw line and
 * attached. Finally the corpus rules are applied in order and the resulting
 * updates are written to the line.
 *
 * @param line
 *          the line to read
 * @param lineNumber
 *          the line number we reached, starting at 1.
 * @return the corpus line built from the input
 * @throws TalismaneException
 *           if the regex wasn't matched on a given line
 */
public CorpusLine read(String line, int lineNumber) throws TalismaneException {
  Matcher matcher = this.pattern.matcher(line);
  if (!matcher.matches()) {
    throw new TalismaneException("Didn't match pattern \"" + regex + "\". Compiled to: \"" + this.pattern.pattern() + "\". On line " + lineNumber + ": " + line);
  }

  CorpusLine corpusLine = new CorpusLine(line, lineNumber);
  for (CorpusElement elementType : CorpusElement.values()) {
    if (!placeholderIndexMap.containsKey(elementType)) {
      continue;
    }
    String value = matcher.group(placeholderIndexMap.get(elementType));
    if (elementType == CorpusElement.TOKEN || elementType == CorpusElement.LEMMA) {
      value = TalismaneSession.get(sessionId).getCoNLLFormatter().fromCoNLL(value);
    } else if ("_".equals(value)) {
      value = "";
    }
    corpusLine.setElement(elementType, value);
  }

  if (this.lexicalEntryReader != null) {
    WritableLexicalEntry lexicalEntry = new CompactLexicalEntry(lexicalEntrySupport);
    this.lexicalEntryReader.readEntry(line, lexicalEntry);
    corpusLine.setLexicalEntry(lexicalEntry);
  }

  // Collect the updates from all rules first, then write them to the line.
  Map<CorpusElement, String> updateValues = new HashMap<>();
  for (CorpusRule corpusRule : corpusRules) {
    corpusRule.apply(corpusLine, updateValues);
  }
  for (Map.Entry<CorpusElement, String> update : updateValues.entrySet()) {
    CorpusElement element = update.getKey();
    String value = update.getValue();
    if (LOG.isTraceEnabled()) {
      LOG.trace("On line " + lineNumber + ", updating " + element.name() + " from '" + corpusLine.getElement(element) + "' to '" + value + "'");
    }
    corpusLine.setElement(element, value);
  }
  return corpusLine;
}
Aggregations