use of com.joliciel.talismane.corpus.CorpusLine in project talismane by joliciel-informatique.
the class ParserRegexBasedCorpusReader method processSentence.
/**
 * Builds the parse configuration for one sentence from its corpus lines.
 * <p>
 * After the superclass has processed the sentence, a root token is prepended to
 * {@code posTagSequence} and registered under index 0 in {@code idTokenMap}. One
 * dependency arc is then created per corpus line (governor index 0 when the line
 * has no {@code GOVERNOR} element), and the arcs are either handed to the transition
 * system to predict transitions, or added directly to the new configuration.
 * If the corpus line reader has a {@code NON_PROJ_GOVERNOR} placeholder, a second
 * pass adds one manual non-projective dependency per corpus line.
 *
 * @param sentence the sentence being read
 * @param corpusLines the corpus lines belonging to this sentence
 * @throws TalismaneException in particular an {@link UnknownDependencyLabelException}
 *         when a non-empty label or non-projective label is not among the transition
 *         system's dependency labels; {@code clearSentence()} is called before any
 *         {@code TalismaneException} is rethrown
 * @throws IOException propagated from the superclass or called methods
 */
@Override
protected void processSentence(Sentence sentence, List<CorpusLine> corpusLines) throws TalismaneException, IOException {
  try {
    super.processSentence(sentence, corpusLines);

    // Index 0 is reserved for the prepended root, which doubles as the default governor.
    PosTaggedToken root = posTagSequence.prependRoot();
    idTokenMap.put(0, root);

    TransitionSystem transitionSystem = TalismaneSession.get(sessionId).getTransitionSystem();

    Set<DependencyArc> goldArcs = new TreeSet<>();
    for (CorpusLine line : corpusLines) {
      int governorIndex = line.hasElement(CorpusElement.GOVERNOR) ? Integer.parseInt(line.getElement(CorpusElement.GOVERNOR)) : 0;
      PosTaggedToken governor = idTokenMap.get(governorIndex);
      PosTaggedToken dependent = idTokenMap.get(line.getIndex());
      String label = line.getElement(CorpusElement.LABEL);

      // Labels are only validated when the transition system declares more than one label.
      if (transitionSystem.getDependencyLabels().size() > 1) {
        boolean labelUnknown = label.length() > 0 && !transitionSystem.getDependencyLabels().contains(label);
        if (labelUnknown) {
          String filePath = this.getCurrentFile() == null ? "" : this.getCurrentFile().getPath();
          throw new UnknownDependencyLabelException(filePath, line.getLineNumber(), label);
        }

        String nonProjLabel = line.getElement(CorpusElement.NON_PROJ_LABEL);
        boolean nonProjLabelUnknown = nonProjLabel != null && nonProjLabel.length() > 0
            && !transitionSystem.getDependencyLabels().contains(nonProjLabel);
        if (nonProjLabelUnknown) {
          String filePath = this.getCurrentFile() == null ? "" : this.getCurrentFile().getPath();
          throw new UnknownDependencyLabelException(filePath, line.getLineNumber(), nonProjLabel);
        }
      }

      DependencyArc goldArc = new DependencyArc(governor, dependent, label);
      if (LOG.isTraceEnabled()) {
        LOG.trace(goldArc.toString());
      }
      goldArcs.add(goldArc);
      if (line.hasElement(CorpusElement.DEP_COMMENT)) {
        goldArc.setComment(line.getElement(CorpusElement.DEP_COMMENT));
      }
    }

    configuration = new ParseConfiguration(posTagSequence);
    if (!this.predictTransitions) {
      // No transition prediction requested: copy the arcs straight into the configuration.
      for (DependencyArc goldArc : goldArcs) {
        configuration.addDependency(goldArc.getHead(), goldArc.getDependent(), goldArc.getLabel(), null);
      }
    } else {
      transitionSystem.predictTransitions(configuration, goldArcs);
    }

    // Non-projective dependencies are only read when the line pattern has a placeholder for them.
    boolean readsNonProjGovernor = this.getCorpusLineReader().hasPlaceholder(CorpusElement.NON_PROJ_GOVERNOR);
    if (readsNonProjGovernor) {
      Set<DependencyArc> nonProjArcs = new TreeSet<>();
      if (LOG.isTraceEnabled()) {
        LOG.trace("Non projective dependencies: ");
      }
      for (CorpusLine line : corpusLines) {
        int governorIndex = line.hasElement(CorpusElement.NON_PROJ_GOVERNOR)
            ? Integer.parseInt(line.getElement(CorpusElement.NON_PROJ_GOVERNOR))
            : 0;
        PosTaggedToken governor = idTokenMap.get(governorIndex);
        PosTaggedToken dependent = idTokenMap.get(line.getIndex());
        DependencyArc arc = new DependencyArc(governor, dependent, line.getElement(CorpusElement.NON_PROJ_LABEL));
        if (LOG.isTraceEnabled()) {
          LOG.trace(arc.toString());
        }
        nonProjArcs.add(arc);
        if (line.hasElement(CorpusElement.DEP_COMMENT)) {
          arc.setComment(line.getElement(CorpusElement.DEP_COMMENT));
        }
      }
      for (DependencyArc arc : nonProjArcs) {
        configuration.addManualNonProjectiveDependency(arc.getHead(), arc.getDependent(), arc.getLabel());
      }
    }
  } catch (TalismaneException failure) {
    // Discard the partially read sentence before letting the caller see the failure.
    this.clearSentence();
    throw failure;
  }
}
use of com.joliciel.talismane.corpus.CorpusLine in project talismane by joliciel-informatique.
the class ParseOutputRewriter method getCorpusLines.
/**
 * Converts a parse configuration into corpus lines and applies the split actions of
 * any matching rewrite rules.
 * <p>
 * One corpus line is built per non-root token of the configuration's pos-tag sequence.
 * Each line is then tested against every rule in {@code rewriteRules}; when all of a
 * rule's conditions match and its action is a {@code SplitAction}, that action is
 * recorded for the line (if several rules match, the last one wins). A line with a
 * recorded split action is replaced by one line per entry in the action's
 * {@code elementValues}, and the index, governor index and non-projective governor
 * index of every line are renumbered accordingly.
 * <p>
 * A conditional action whose {@code relativeIndex} points before the first or after
 * the last corpus line, or which tests an element that has no value on the base line,
 * is treated as not matching (instead of failing with an
 * {@code IndexOutOfBoundsException} or {@code NullPointerException}), so that the
 * group's default action can apply.
 *
 * @param parseConfiguration the parse configuration to convert
 * @return the corpus lines, after splitting if any split action matched
 * @throws TalismaneException if a rewrite rule has a condition on an element other
 *         than POSTAG, TOKEN, LEMMA or LABEL
 */
List<CorpusLine> getCorpusLines(ParseConfiguration parseConfiguration) throws TalismaneException {
  // first convert the parse configuration to a list of corpus lines
  List<CorpusLine> corpusLines = new ArrayList<>();
  for (PosTaggedToken posTaggedToken : parseConfiguration.getPosTagSequence()) {
    if (!posTaggedToken.isRoot()) {
      DependencyArc arc = parseConfiguration.getGoverningDependency(posTaggedToken);
      DependencyArc nonProjArc = parseConfiguration.getGoverningDependency(posTaggedToken, false);

      String line = posTaggedToken.getIndex() + "\t" + posTaggedToken.getToken().getOriginalText() + "\t" + posTaggedToken.getLemmaForCoNLL() + "\t"
          + posTaggedToken.getTag().getCode() + "\t" + posTaggedToken.getTag().getCode() + "\t" + posTaggedToken.getMorphologyForCoNLL() + "\t"
          + (arc != null ? arc.getHead().getIndex() : 0) + "\t" + (arc != null ? arc.getLabel() : "_");

      CorpusLine corpusLine = new CorpusLine(line, posTaggedToken.getToken().getLineNumber());
      corpusLine.setIndex(posTaggedToken.getIndex());
      corpusLine.setToken(posTaggedToken.getToken().getOriginalText());
      corpusLine.setLemma(posTaggedToken.getLemmaForCoNLL());
      corpusLine.setPosTag(posTaggedToken.getTag().getCode());
      String morphology = posTaggedToken.getMorphologyForCoNLL();
      corpusLine.setMorphology(morphology.length() == 0 ? "_" : morphology);
      // A token without a governing arc is attached to index 0 with the placeholder label "_".
      corpusLine.setGovernorIndex(arc != null ? arc.getHead().getIndex() : 0);
      corpusLine.setLabel(arc != null ? arc.getLabel() : "_");
      corpusLine.setNonProjGovernorIndex(nonProjArc != null ? nonProjArc.getHead().getIndex() : 0);
      corpusLine.setNonProjLabel(nonProjArc != null ? nonProjArc.getLabel() : "_");
      if (posTaggedToken.getToken().getPrecedingRawOutput() != null)
        corpusLine.setElement(CorpusElement.PRECEDING_RAW_OUTPUT, posTaggedToken.getToken().getPrecedingRawOutput());
      if (posTaggedToken.getToken().getTrailingRawOutput() != null)
        corpusLine.setElement(CorpusElement.TRAILING_RAW_OUTPUT, posTaggedToken.getToken().getTrailingRawOutput());
      corpusLine.setTokenProbability(posTaggedToken.getToken().getProbability());
      corpusLine.setPosTagProbability(posTaggedToken.getProbability());
      if (arc != null)
        corpusLine.setParseProbability(arc.getProbability());
      corpusLines.add(corpusLine);
    }
  }

  // next, find out which corpus lines are matched by a rule carrying a split action
  Map<CorpusLine, SplitAction> splitActions = new HashMap<>();
  for (CorpusLine corpusLine : corpusLines) {
    if (LOG.isDebugEnabled())
      LOG.debug(corpusLine.toString());
    for (RewriteRule rewriteRule : rewriteRules) {
      boolean matches = true;
      conditionLoop: for (CorpusElement corpusElement : rewriteRule.conditions.keySet()) {
        Pattern pattern = rewriteRule.conditions.get(corpusElement);
        if (LOG.isTraceEnabled())
          LOG.trace("For " + corpusElement.name() + ", matching " + pattern.pattern());
        switch (corpusElement) {
        case POSTAG:
          if (!pattern.matcher(corpusLine.getPosTag()).matches()) {
            if (LOG.isTraceEnabled())
              LOG.trace("Match failed for " + corpusLine.getPosTag());
            matches = false;
            break conditionLoop;
          }
          break;
        case TOKEN:
          if (!pattern.matcher(corpusLine.getToken()).matches()) {
            matches = false;
            break conditionLoop;
          }
          break;
        case LEMMA:
          if (!pattern.matcher(corpusLine.getLemma()).matches()) {
            matches = false;
            break conditionLoop;
          }
          break;
        case LABEL:
          if (!pattern.matcher(corpusLine.getLabel()).matches()) {
            matches = false;
            break conditionLoop;
          }
          break;
        default:
          throw new TalismaneException(ParseOutputRewriter.class.getSimpleName() + " cannot match on " + corpusElement.name());
        }
      }
      if (matches) {
        if (rewriteRule.action instanceof SplitAction) {
          SplitAction splitAction = (SplitAction) rewriteRule.action;
          // A later matching rule overwrites an earlier one for the same line.
          splitActions.put(corpusLine, splitAction);
        }
      }
    }
  }

  if (!splitActions.isEmpty()) {
    List<CorpusLine> newCorpusLines = new ArrayList<>();

    // First pass: compute the new index of each old line, given that a split line
    // occupies as many positions as its split action has element-value entries.
    // Index 0 maps to itself.
    Map<Integer, Integer> oldToNewIndexMap = new HashMap<>();
    oldToNewIndexMap.put(0, 0);
    int currentIndex = 1;
    for (int i = 0; i < corpusLines.size(); i++) {
      CorpusLine corpusLine = corpusLines.get(i);
      // NOTE(review): the key is the list position + 1, while lookups below use
      // getIndex(); this assumes line indexes are 1..n in list order - confirm.
      oldToNewIndexMap.put(i + 1, currentIndex);
      SplitAction splitAction = splitActions.get(corpusLine);
      if (splitAction != null) {
        currentIndex += splitAction.elementValues.size();
      } else {
        currentIndex++;
      }
    }

    // Second pass: emit the renumbered lines, replacing each split line by its parts.
    for (int i = 0; i < corpusLines.size(); i++) {
      CorpusLine corpusLine = corpusLines.get(i);

      CorpusLine newCorpusLine = corpusLine.cloneCorpusLine();
      newCorpusLine.setIndex(oldToNewIndexMap.get(corpusLine.getIndex()));
      newCorpusLine.setGovernorIndex(oldToNewIndexMap.get(corpusLine.getGovernorIndex()));
      newCorpusLine.setNonProjGovernorIndex(oldToNewIndexMap.get(corpusLine.getNonProjGovernorIndex()));

      SplitAction splitAction = splitActions.get(corpusLine);
      if (splitAction != null) {
        for (int j = 0; j < splitAction.elementValues.size(); j++) {
          CorpusLine splitCorpusLine = new CorpusLine(corpusLine.getLine(), corpusLine.getLineNumber());
          splitCorpusLine.setIndex(oldToNewIndexMap.get(corpusLine.getIndex()) + j);

          Map<CorpusElement, String> elementValues = splitAction.elementValues.get(j);
          this.setElementValues(elementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);

          // The default element marks the end of each group, and will be
          // applied if no other action in the group has matched.
          // NOTE(review): every matching non-default action in a group is applied,
          // not only the first one - confirm this is intended.
          List<ConditionalAction> conditionalActions = splitAction.conditionalValues.get(j);
          boolean groupHasMatch = false;
          for (ConditionalAction conditionalAction : conditionalActions) {
            if (conditionalAction.isDefault) {
              if (!groupHasMatch) {
                this.setElementValues(conditionalAction.elementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
              }
              // The default action marks the end of each matching group.
              groupHasMatch = false;
              continue;
            }

            // A condition on a neighbour outside the sentence can never be satisfied.
            int baseIndex = i + conditionalAction.relativeIndex;
            boolean match = baseIndex >= 0 && baseIndex < corpusLines.size();
            if (match) {
              CorpusLine baseLine = corpusLines.get(baseIndex);
              for (CorpusElement corpusElement : conditionalAction.conditions.keySet()) {
                String origValue = baseLine.getElement(corpusElement);
                Pattern pattern = conditionalAction.conditions.get(corpusElement);
                // An element with no value on the base line cannot match any pattern.
                if (origValue == null || !pattern.matcher(origValue).matches()) {
                  match = false;
                  break;
                }
              }
            }

            if (match) {
              this.setElementValues(conditionalAction.elementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
              groupHasMatch = true;
            }
          } // next conditional action
          newCorpusLines.add(splitCorpusLine);
        } // next split
      } else {
        newCorpusLines.add(newCorpusLine);
      } // should line be split?
    } // next corpus line
    corpusLines = newCorpusLines;
  }
  return corpusLines;
}
use of com.joliciel.talismane.corpus.CorpusLine in project talismane by joliciel-informatique.
the class ParseOutputRewriterTest method testGetCorpusLines.
/**
 * Reads a ten-token French sentence, runs it through
 * {@code ParseOutputRewriter.getCorpusLines} using the rules configured in
 * {@code testWithOutputRules.conf}, and checks the thirteen resulting corpus lines:
 * the three compound tokens ("Au", "du", "auquel") are each expected to be split in
 * two, with indexes and governor indexes renumbered.
 */
@Test
public void testGetCorpusLines() throws Exception {
  TalismaneSession.clearSessions();
  System.setProperty("config.file", "src/test/resources/testWithOutputRules.conf");
  ConfigFactory.invalidateCaches();
  final Config config = ConfigFactory.load();
  final String sessionId = "test";

  // Columns: index, token, lemma, pos-tag, governor, label.
  StringBuilder input = new StringBuilder();
  input.append("1\tAu\tau\tADP+DET\t0\troot\n");
  input.append("2\tsein\tsein\tNOUN\t1\tfixed\n");
  input.append("3\tmême\tmême\tADV\t1\tadvmod\n");
  input.append("4\tdu\tdu\tADP+DET\t5\tcase\n");
  input.append("5\tParti\tParti\tPROPN\t1\tnmod\n");
  input.append("6\tsocialiste\tsocialiste\tADJ\t5\tfixed\n");
  input.append("7\tauquel\tauquel\tADP+PRON\t8\tobl\n");
  input.append("8\tappartient\tappartenir\tVERB\t5\tacl:relcl\n");
  input.append("9\tM.\tmonsieur\tNOUN\t8\tnsubj\n");
  input.append("10\tDupont\tDupont\tPROPN\t9\tflat:name\n");

  StringReader stringReader = new StringReader(input.toString());
  ParserRegexBasedCorpusReader reader = new ParserRegexBasedCorpusReader(stringReader, config.getConfig("talismane.core.test.parser.input"), sessionId);
  ParseConfiguration parseConfiguration = reader.nextConfiguration();

  final StringWriter writer = new StringWriter();
  try (ParseOutputRewriter rewriter = new ParseOutputRewriter(writer, sessionId)) {
    List<CorpusLine> corpusLines = rewriter.getCorpusLines(parseConfiguration);

    int position = 1;
    for (CorpusLine corpusLine : corpusLines) {
      LOG.debug("line " + corpusLine.getIndex() + ": " + corpusLine.getElements());
      switch (position) {
      case 1:
        assertEquals(1, corpusLine.getIndex());
        assertEquals("à", corpusLine.getToken());
        assertEquals("à", corpusLine.getLemma());
        assertEquals("ADP", corpusLine.getPosTag());
        assertEquals(0, corpusLine.getGovernorIndex());
        assertEquals("root", corpusLine.getLabel());
        break;
      case 2:
        assertEquals(2, corpusLine.getIndex());
        assertEquals("le", corpusLine.getToken());
        assertEquals("le", corpusLine.getLemma());
        assertEquals("DET", corpusLine.getPosTag());
        assertEquals(1, corpusLine.getGovernorIndex());
        assertEquals("fixed", corpusLine.getLabel());
        break;
      case 3:
        assertEquals(3, corpusLine.getIndex());
        assertEquals("sein", corpusLine.getToken());
        assertEquals(1, corpusLine.getGovernorIndex());
        assertEquals("fixed", corpusLine.getLabel());
        break;
      case 4:
        assertEquals(4, corpusLine.getIndex());
        assertEquals("même", corpusLine.getToken());
        assertEquals(1, corpusLine.getGovernorIndex());
        assertEquals("advmod", corpusLine.getLabel());
        break;
      case 5:
        assertEquals(5, corpusLine.getIndex());
        assertEquals("de", corpusLine.getToken());
        assertEquals("de", corpusLine.getLemma());
        assertEquals("ADP", corpusLine.getPosTag());
        assertEquals(7, corpusLine.getGovernorIndex());
        assertEquals("case", corpusLine.getLabel());
        break;
      case 6:
        assertEquals(6, corpusLine.getIndex());
        assertEquals("le", corpusLine.getToken());
        assertEquals("le", corpusLine.getLemma());
        assertEquals("DET", corpusLine.getPosTag());
        assertEquals(7, corpusLine.getGovernorIndex());
        assertEquals("det", corpusLine.getLabel());
        break;
      case 7:
        assertEquals(7, corpusLine.getIndex());
        assertEquals("Parti", corpusLine.getToken());
        assertEquals(1, corpusLine.getGovernorIndex());
        assertEquals("nmod", corpusLine.getLabel());
        break;
      case 8:
        assertEquals(8, corpusLine.getIndex());
        assertEquals("socialiste", corpusLine.getToken());
        assertEquals(7, corpusLine.getGovernorIndex());
        assertEquals("fixed", corpusLine.getLabel());
        break;
      case 9:
        assertEquals(9, corpusLine.getIndex());
        assertEquals("à", corpusLine.getToken());
        assertEquals("à", corpusLine.getLemma());
        assertEquals("ADP", corpusLine.getPosTag());
        assertEquals(10, corpusLine.getGovernorIndex());
        assertEquals("case", corpusLine.getLabel());
        break;
      case 10:
        assertEquals(10, corpusLine.getIndex());
        assertEquals("lequel", corpusLine.getToken());
        assertEquals("lequel", corpusLine.getLemma());
        assertEquals("PRON", corpusLine.getPosTag());
        assertEquals(11, corpusLine.getGovernorIndex());
        assertEquals("obl", corpusLine.getLabel());
        break;
      case 11:
        assertEquals(11, corpusLine.getIndex());
        assertEquals("appartient", corpusLine.getToken());
        assertEquals("VERB", corpusLine.getPosTag());
        assertEquals(7, corpusLine.getGovernorIndex());
        assertEquals("acl:relcl", corpusLine.getLabel());
        break;
      case 12:
        assertEquals(12, corpusLine.getIndex());
        assertEquals("M.", corpusLine.getToken());
        assertEquals("NOUN", corpusLine.getPosTag());
        assertEquals(11, corpusLine.getGovernorIndex());
        assertEquals("nsubj", corpusLine.getLabel());
        break;
      case 13:
        assertEquals(13, corpusLine.getIndex());
        assertEquals("Dupont", corpusLine.getToken());
        assertEquals("PROPN", corpusLine.getPosTag());
        assertEquals(12, corpusLine.getGovernorIndex());
        assertEquals("flat:name", corpusLine.getLabel());
        break;
      default:
        // Extra lines are not inspected individually; the size check below catches them.
        break;
      }
      position++;
    }
    assertEquals(13, corpusLines.size());
  }
}
use of com.joliciel.talismane.corpus.CorpusLine in project talismane by joliciel-informatique.
the class PosTagRegexBasedCorpusReader method processSentence.
/**
 * Builds the pos-tag sequence for one sentence from its corpus lines.
 * <p>
 * After the superclass has processed the sentence, a new {@code PosTagSequence} is
 * created over {@code tokenSequence}. Each corpus line is converted to a pos-tagged
 * token (passing the line's zero-based position in the list) and registered in
 * {@code idTokenMap} under the corpus line's own index.
 *
 * @param sentence the sentence being read
 * @param corpusLines the corpus lines belonging to this sentence
 * @throws TalismaneException rethrown after {@code clearSentence()} has been called
 * @throws IOException propagated from the superclass or called methods
 */
@Override
protected void processSentence(Sentence sentence, List<CorpusLine> corpusLines) throws TalismaneException, IOException {
  try {
    super.processSentence(sentence, corpusLines);

    posTagSequence = new PosTagSequence(tokenSequence);

    int position = 0;
    for (CorpusLine line : corpusLines) {
      PosTaggedToken taggedToken = this.convertToPosTaggedToken(line, posTagSequence, position, this.getCurrentFile());
      this.idTokenMap.put(line.getIndex(), taggedToken);
      position++;
    }
  } catch (TalismaneException failure) {
    // Discard the partially read sentence before letting the caller see the failure.
    this.clearSentence();
    throw failure;
  }
}
use of com.joliciel.talismane.corpus.CorpusLine in project talismane by joliciel-informatique.
the class TokenRegexBasedCorpusReader method processSentence.
/**
 * Builds the token sequence for one sentence from its corpus lines.
 * <p>
 * After the superclass has processed the sentence, a new {@code PretokenisedSequence}
 * is created for it, every corpus line is converted into a token of that sequence,
 * each configured token filter is applied in turn, and finally
 * {@code cleanSlate()} is called on the sequence.
 *
 * @param sentence the sentence being read
 * @param corpusLines the corpus lines belonging to this sentence
 * @throws TalismaneException rethrown after {@code clearSentence()} has been called
 * @throws IOException propagated from the superclass or called methods
 */
@Override
protected void processSentence(Sentence sentence, List<CorpusLine> corpusLines) throws TalismaneException, IOException {
  try {
    super.processSentence(sentence, corpusLines);

    tokenSequence = new PretokenisedSequence(sentence, sessionId);
    for (CorpusLine line : corpusLines) {
      this.convertToToken(tokenSequence, line);
    }

    // Filters run only once all tokens are in place.
    for (TokenFilter tokenFilter : filters) {
      tokenFilter.apply(tokenSequence);
    }

    tokenSequence.cleanSlate();
  } catch (TalismaneException failure) {
    // Discard the partially read sentence before letting the caller see the failure.
    this.clearSentence();
    throw failure;
  }
}
Aggregations