use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class PosTaggerFeatureParser method getRules.
/**
 * Builds pos-tagger rules from a list of textual rule descriptors.
 * <p>
 * Empty descriptors and descriptors starting with {@code #} are skipped. Every
 * other descriptor is split on tab characters and must have one of these
 * layouts:
 * <ul>
 * <li>{@code posTagCode <TAB> descriptor}</li>
 * <li>{@code posTagCode <TAB> name <TAB> descriptor}</li>
 * </ul>
 * A pos-tag code prefixed with {@code !} produces a negative rule. When the
 * pos-tag code is empty, the descriptor is still parsed (under the given name)
 * but no rule is produced for it.
 *
 * @param ruleDescriptors
 *          the rule descriptors to parse, one per element
 * @return the rules built from the descriptors, in the order in which they were
 *         encountered
 * @throws TalismaneException
 *           if a rule is incorrectly configured: it has fewer than two
 *           tab-separated columns, it has no pos-tag and no name, or it is
 *           attached to a pos-tag but is not based on a boolean feature
 */
public List<PosTaggerRule> getRules(List<String> ruleDescriptors) throws TalismaneException {
  List<PosTaggerRule> rules = new ArrayList<PosTaggerRule>();
  FunctionDescriptorParser descriptorParser = new FunctionDescriptorParser();

  for (String ruleDescriptor : ruleDescriptors) {
    LOG.debug(ruleDescriptor);

    // Skip blank lines and comments.
    if (ruleDescriptor.length() == 0 || ruleDescriptor.startsWith("#")) {
      continue;
    }

    String[] ruleParts = ruleDescriptor.split("\t");
    // String.split drops trailing empty strings, so a line without any tab
    // yields one part and a line made only of tabs yields none. Report both
    // as a configuration error rather than failing on the array access below.
    if (ruleParts.length < 2) {
      throw new TalismaneException("Rule must contain at least a PosTag column and a descriptor, separated by a tab: " + ruleDescriptor);
    }

    String posTagCode = ruleParts[0];
    String descriptorName = null;
    String descriptor;
    if (ruleParts.length > 2) {
      descriptorName = ruleParts[1];
      descriptor = ruleParts[2];
    } else {
      descriptor = ruleParts[1];
    }

    PosTag posTag = null;
    boolean negative = false;
    if (posTagCode.length() == 0) {
      // No pos-tag: this is only a named descriptor, so the name is mandatory.
      if (descriptorName == null) {
        throw new TalismaneException("Rule without PosTag must have a name.");
      }
    } else {
      if (posTagCode.startsWith("!")) {
        negative = true;
        posTagCode = posTagCode.substring(1);
      }
      posTag = TalismaneSession.get(sessionId).getPosTagSet().getPosTag(posTagCode);
    }

    FunctionDescriptor functionDescriptor = descriptorParser.parseDescriptor(descriptor);
    if (descriptorName != null) {
      functionDescriptor.setDescriptorName(descriptorName);
    }

    // The descriptor is always parsed, even when no rule will be built from it.
    List<PosTaggerFeature<?>> myFeatures = this.parseDescriptor(functionDescriptor);
    if (posTag == null) {
      // Just a descriptor, not a rule.
      continue;
    }

    for (PosTaggerFeature<?> feature : myFeatures) {
      if (!(feature instanceof BooleanFeature)) {
        throw new TalismaneException("Rule must be based on a boolean feature.");
      }
      @SuppressWarnings("unchecked")
      BooleanFeature<PosTaggerContext> condition = (BooleanFeature<PosTaggerContext>) feature;
      PosTaggerRule rule = new PosTaggerRule(condition, posTag);
      rule.setNegative(negative);
      rules.add(rule);
    }
  }
  return rules;
}
use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class PosTaggerHistoryAddressFunction method checkInternal.
/**
 * Looks up the pos-tagged token located a given number of positions before the
 * current token in the context's history.
 * <p>
 * The offset is obtained from {@code offsetFeature} and must be strictly
 * negative; its absolute value is the distance to look back.
 *
 * @param context
 *          the pos-tagger context providing the current token and the history
 * @param env
 *          the runtime environment passed on to the offset feature
 * @return a result wrapping the earlier token, or {@code null} if the offset
 *         feature gave no result, if the current token's index is smaller than
 *         the distance, or if the history holds {@code null} at that position
 * @throws TalismaneException
 *           if the offset is greater than or equal to zero
 */
@Override
protected FeatureResult<PosTaggedTokenWrapper> checkInternal(PosTaggerContext context, RuntimeEnvironment env) throws TalismaneException {
  FeatureResult<Integer> offsetResult = offsetFeature.check(context, env);
  if (offsetResult == null) {
    return null;
  }

  int offset = offsetResult.getOutcome();
  if (offset >= 0) {
    throw new TalismaneException("Cannot call PosTaggerHistoryFeature with an offset >= 0");
  }

  // Turn the negative offset into a positive look-back distance.
  int distance = 0 - offset;
  int tokenIndex = context.getToken().getIndex();
  if (tokenIndex < distance) {
    // Not enough tokens before the current one.
    return null;
  }

  PosTaggedToken earlierToken = context.getHistory().get(tokenIndex - distance);
  if (earlierToken == null) {
    return null;
  }
  return this.generateResult(earlierToken);
}
use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class PosTagSequenceProcessor method getProcessors.
/**
 * Instantiates the processors listed under the configuration key
 * talismane.core.[sessionId].pos-tagger.output.processors.<br>
 * <br>
 * Each listed class must implement this interface. For each class, the
 * constructors are tried in the following order, and the first one found is
 * used:<br>
 * - ( {@link Writer} writer, {@link String} sessionId) - only tried while the
 * supplied writer has not yet been handed to a processor<br>
 * - ( {@link File} outputDir, {@link String} sessionId)<br>
 * - ( {@link String} sessionId)<br>
 * <br>
 * The writer, if provided, is therefore given to the first processor in the
 * list which has a writer constructor, and to no other.
 *
 * @param writer
 *          if specified, will be used for the first processor in the list
 *          with a writer in the constructor
 * @param outDir
 *          directory in which to write the various outputs; if not null, it
 *          is created (with any missing parents) before the processors are
 *          built
 * @param sessionId
 *          the session id, used both to locate the configuration and as a
 *          constructor argument for each processor
 * @return the processors, in the order in which they are listed in the
 *         configuration
 * @throws IOException
 *           declared as part of the method signature
 * @throws ReflectiveOperationException
 *           if a selected constructor cannot be invoked or itself fails
 * @throws ClassNotFoundException
 *           if a configured class name cannot be loaded
 * @throws TalismaneException
 *           if a processor does not implement this interface, or if no
 *           constructor is found with the correct signature
 */
public static List<PosTagSequenceProcessor> getProcessors(Writer writer, File outDir, String sessionId) throws IOException, ReflectiveOperationException, ClassNotFoundException, TalismaneException {
  Config posTaggerConfig = ConfigFactory.load().getConfig("talismane.core." + sessionId + ".pos-tagger");
  List<PosTagSequenceProcessor> processors = new ArrayList<>();
  List<String> classNames = posTaggerConfig.getStringList("output.processors");

  if (outDir != null) {
    outDir.mkdirs();
  }

  // The writer is handed out at most once; null means "already used or absent".
  Writer availableWriter = writer;

  for (String className : classNames) {
    Class<?> loadedClass = Class.forName(className);
    if (!PosTagSequenceProcessor.class.isAssignableFrom(loadedClass)) {
      throw new TalismaneException("Class " + className + " does not implement interface " + PosTagSequenceProcessor.class.getSimpleName());
    }
    Class<? extends PosTagSequenceProcessor> processorClass = loadedClass.asSubclass(PosTagSequenceProcessor.class);

    // Choose the constructor and its arguments first, then instantiate once.
    Constructor<? extends PosTagSequenceProcessor> constructor = null;
    Object[] arguments = null;

    if (availableWriter != null) {
      try {
        constructor = processorClass.getConstructor(Writer.class, String.class);
        arguments = new Object[] { availableWriter, sessionId };
        availableWriter = null;
      } catch (NoSuchMethodException e) {
        // no writer constructor: fall through to the next signature
      }
    }

    if (constructor == null) {
      try {
        constructor = processorClass.getConstructor(File.class, String.class);
        arguments = new Object[] { outDir, sessionId };
      } catch (NoSuchMethodException e) {
        // no output directory constructor: fall through to the next signature
      }
    }

    if (constructor == null) {
      try {
        constructor = processorClass.getConstructor(String.class);
        arguments = new Object[] { sessionId };
      } catch (NoSuchMethodException e) {
        // no session-only constructor either: reported just below
      }
    }

    if (constructor == null) {
      throw new TalismaneException("No constructor found with correct signature for: " + className);
    }

    processors.add(constructor.newInstance(arguments));
  }
  return processors;
}
use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class ParseConfigurationProcessor method getProcessors.
/**
 * Instantiates the processors listed under the configuration key
 * talismane.core.[sessionId].parser.output.processors.<br>
 * <br>
 * Each listed class must implement this interface. For each class, the
 * constructors are tried in the following order, and the first one found is
 * used:<br>
 * - ( {@link Writer} writer, {@link String} sessionId) - only tried while the
 * supplied writer has not yet been handed to a processor<br>
 * - ( {@link File} outputDir, {@link String} sessionId)<br>
 * - ( {@link String} sessionId)<br>
 * <br>
 * The writer, if provided, is therefore given to the first processor in the
 * list which has a writer constructor, and to no other.
 *
 * @param writer
 *          if specified, will be used for the first processor in the list
 *          with a writer in the constructor
 * @param outDir
 *          directory in which to write the various outputs; if not null, it
 *          is created (with any missing parents) before the processors are
 *          built
 * @param sessionId
 *          the session id, used both to locate the configuration and as a
 *          constructor argument for each processor
 * @return the processors, in the order in which they are listed in the
 *         configuration
 * @throws IOException
 *           declared as part of the method signature
 * @throws ReflectiveOperationException
 *           if a selected constructor cannot be invoked or itself fails
 * @throws ClassNotFoundException
 *           if a configured class name cannot be loaded
 * @throws TalismaneException
 *           if a processor does not implement this interface, or if no
 *           constructor is found with the correct signature
 */
public static List<ParseConfigurationProcessor> getProcessors(Writer writer, File outDir, String sessionId) throws IOException, ReflectiveOperationException, ClassNotFoundException, TalismaneException {
  Config parserSettings = ConfigFactory.load().getConfig("talismane.core." + sessionId + ".parser");
  List<ParseConfigurationProcessor> processors = new ArrayList<>();
  List<String> classNames = parserSettings.getStringList("output.processors");

  if (outDir != null) {
    outDir.mkdirs();
  }

  // The writer is handed out at most once; null means "already used or absent".
  Writer availableWriter = writer;

  for (String className : classNames) {
    Class<?> loadedClass = Class.forName(className);
    if (!ParseConfigurationProcessor.class.isAssignableFrom(loadedClass)) {
      throw new TalismaneException("Class " + className + " does not implement interface " + ParseConfigurationProcessor.class.getSimpleName());
    }
    Class<? extends ParseConfigurationProcessor> processorClass = loadedClass.asSubclass(ParseConfigurationProcessor.class);

    // Choose the constructor and its arguments first, then instantiate once.
    Constructor<? extends ParseConfigurationProcessor> constructor = null;
    Object[] arguments = null;

    if (availableWriter != null) {
      try {
        constructor = processorClass.getConstructor(Writer.class, String.class);
        arguments = new Object[] { availableWriter, sessionId };
        availableWriter = null;
      } catch (NoSuchMethodException e) {
        // no writer constructor: fall through to the next signature
      }
    }

    if (constructor == null) {
      try {
        constructor = processorClass.getConstructor(File.class, String.class);
        arguments = new Object[] { outDir, sessionId };
      } catch (NoSuchMethodException e) {
        // no output directory constructor: fall through to the next signature
      }
    }

    if (constructor == null) {
      try {
        constructor = processorClass.getConstructor(String.class);
        arguments = new Object[] { sessionId };
      } catch (NoSuchMethodException e) {
        // no session-only constructor either: reported just below
      }
    }

    if (constructor == null) {
      throw new TalismaneException("No constructor found with correct signature for: " + className);
    }

    processors.add(constructor.newInstance(arguments));
  }
  return processors;
}
use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class ParseOutputRewriter method getCorpusLines.
/**
 * Converts a parse configuration into a list of corpus lines, then applies any
 * matching split actions from {@code rewriteRules}.
 * <p>
 * The method works in three phases:
 * <ol>
 * <li>One {@link CorpusLine} is built for each non-root pos-tagged token.</li>
 * <li>Each line is tested against each rewrite rule. When all of a rule's
 * conditions match and its action is a {@code SplitAction}, that action is
 * recorded for the line.</li>
 * <li>If at least one split action was recorded, a new list is built in which
 * each such line is replaced by one line per entry in the action's
 * {@code elementValues}, and the index and governor indexes are renumbered
 * accordingly.</li>
 * </ol>
 *
 * @param parseConfiguration
 *          the parse configuration to convert
 * @return the corpus lines as built in phase 1 if no split action applied,
 *         otherwise the rewritten list
 * @throws TalismaneException
 *           if a rewrite rule has a condition on an element other than POSTAG,
 *           TOKEN, LEMMA or LABEL (possibly also raised by
 *           {@code setElementValues}, which is not visible here)
 */
List<CorpusLine> getCorpusLines(ParseConfiguration parseConfiguration) throws TalismaneException {
// first convert the parse configuration to a list of corpus lines
List<CorpusLine> corpusLines = new ArrayList<>();
for (PosTaggedToken posTaggedToken : parseConfiguration.getPosTagSequence()) {
if (!posTaggedToken.isRoot()) {
// Two governing arcs are looked up: the default one, and the one returned
// when passing false, which feeds the "non-proj" governor and label below.
DependencyArc arc = parseConfiguration.getGoverningDependency(posTaggedToken);
DependencyArc nonProjArc = parseConfiguration.getGoverningDependency(posTaggedToken, false);
// Tab-separated text for the line: index, original text, lemma, tag code
// (twice), morphology, governor index (0 if no arc), label ("_" if no arc).
String line = posTaggedToken.getIndex() + "\t" + posTaggedToken.getToken().getOriginalText() + "\t" + posTaggedToken.getLemmaForCoNLL() + "\t" + posTaggedToken.getTag().getCode() + "\t" + posTaggedToken.getTag().getCode() + "\t" + posTaggedToken.getMorphologyForCoNLL() + "\t" + (arc != null ? arc.getHead().getIndex() : 0) + "\t" + (arc != null ? arc.getLabel() : "_");
CorpusLine corpusLine = new CorpusLine(line, posTaggedToken.getToken().getLineNumber());
corpusLine.setIndex(posTaggedToken.getIndex());
corpusLine.setToken(posTaggedToken.getToken().getOriginalText());
corpusLine.setLemma(posTaggedToken.getLemmaForCoNLL());
corpusLine.setPosTag(posTaggedToken.getTag().getCode());
String morphology = posTaggedToken.getMorphologyForCoNLL();
// An empty morphology is stored as "_".
corpusLine.setMorphology(morphology.length() == 0 ? "_" : morphology);
// A missing arc is stored as governor index 0 and label "_".
corpusLine.setGovernorIndex(arc != null ? arc.getHead().getIndex() : 0);
corpusLine.setLabel(arc != null ? arc.getLabel() : "_");
corpusLine.setNonProjGovernorIndex(nonProjArc != null ? nonProjArc.getHead().getIndex() : 0);
corpusLine.setNonProjLabel(nonProjArc != null ? nonProjArc.getLabel() : "_");
// Raw output surrounding the token is only copied over when present.
if (posTaggedToken.getToken().getPrecedingRawOutput() != null)
corpusLine.setElement(CorpusElement.PRECEDING_RAW_OUTPUT, posTaggedToken.getToken().getPrecedingRawOutput());
if (posTaggedToken.getToken().getTrailingRawOutput() != null)
corpusLine.setElement(CorpusElement.TRAILING_RAW_OUTPUT, posTaggedToken.getToken().getTrailingRawOutput());
corpusLine.setTokenProbability(posTaggedToken.getToken().getProbability());
corpusLine.setPosTagProbability(posTaggedToken.getProbability());
if (arc != null)
corpusLine.setParseProbability(arc.getProbability());
corpusLines.add(corpusLine);
}
}
// second, find out which lines are matched by a rule carrying a split action
// NOTE(review): the map is keyed on CorpusLine, so lookups depend on how
// CorpusLine defines equals/hashCode - not visible here.
Map<CorpusLine, SplitAction> splitActions = new HashMap<>();
for (CorpusLine corpusLine : corpusLines) {
if (LOG.isDebugEnabled())
LOG.debug(corpusLine.toString());
for (RewriteRule rewriteRule : rewriteRules) {
// A rule matches only if every one of its conditions matches; the labeled
// loop is abandoned as soon as one condition fails.
boolean matches = true;
conditionLoop: for (CorpusElement corpusElement : rewriteRule.conditions.keySet()) {
Pattern pattern = rewriteRule.conditions.get(corpusElement);
if (LOG.isTraceEnabled())
LOG.trace("For " + corpusElement.name() + ", matching " + pattern.pattern());
// Patterns must match the whole value (Matcher.matches()).
switch(corpusElement) {
case POSTAG:
if (!pattern.matcher(corpusLine.getPosTag()).matches()) {
if (LOG.isTraceEnabled())
LOG.trace("Match failed for " + corpusLine.getPosTag());
matches = false;
break conditionLoop;
}
break;
case TOKEN:
if (!pattern.matcher(corpusLine.getToken()).matches()) {
matches = false;
break conditionLoop;
}
break;
case LEMMA:
if (!pattern.matcher(corpusLine.getLemma()).matches()) {
matches = false;
break conditionLoop;
}
break;
case LABEL:
if (!pattern.matcher(corpusLine.getLabel()).matches()) {
matches = false;
break conditionLoop;
}
break;
default:
// Conditions on any other element are not supported.
throw new TalismaneException(ParseOutputRewriter.class.getSimpleName() + " cannot match on " + corpusElement.name());
}
}
if (matches) {
// Only split actions are handled here. If several matching rules carry a
// split action, put() keeps the one from the last such rule.
if (rewriteRule.action instanceof SplitAction) {
SplitAction splitAction = (SplitAction) rewriteRule.action;
splitActions.put(corpusLine, splitAction);
}
}
}
}
// third, if any line has to be split, build the rewritten list
if (splitActions.size() > 0) {
List<CorpusLine> newCorpusLines = new ArrayList<>();
// Maps each original index to the index it gets after the splits.
// Index 0 (used above for "no governor") maps to itself.
Map<Integer, Integer> oldToNewIndexMap = new HashMap<>();
oldToNewIndexMap.put(0, 0);
int currentIndex = 1;
// First pass: compute the new index of each line.
// NOTE(review): the map key is the list position + 1, while lookups below use
// getIndex()/getGovernorIndex() - this assumes the line at position i has
// index i + 1; confirm. An unmapped index would fail when unboxing null.
for (int i = 0; i < corpusLines.size(); i++) {
CorpusLine corpusLine = corpusLines.get(i);
oldToNewIndexMap.put(i + 1, currentIndex);
if (splitActions.containsKey(corpusLine)) {
// A split line takes up as many indexes as it has split parts.
SplitAction splitAction = splitActions.get(corpusLine);
currentIndex += splitAction.elementValues.size();
} else {
currentIndex++;
}
}
// Second pass: emit the renumbered lines, replacing split lines by their parts.
for (int i = 0; i < corpusLines.size(); i++) {
CorpusLine corpusLine = corpusLines.get(i);
// Renumbered copy of the original line. For a split line it is not added to
// the result itself, but is passed to setElementValues for each part.
CorpusLine newCorpusLine = corpusLine.cloneCorpusLine();
newCorpusLine.setIndex(oldToNewIndexMap.get(corpusLine.getIndex()));
newCorpusLine.setGovernorIndex(oldToNewIndexMap.get(corpusLine.getGovernorIndex()));
newCorpusLine.setNonProjGovernorIndex(oldToNewIndexMap.get(corpusLine.getNonProjGovernorIndex()));
if (splitActions.containsKey(corpusLine)) {
SplitAction splitAction = splitActions.get(corpusLine);
for (int j = 0; j < splitAction.elementValues.size(); j++) {
// Each part starts from the original line's text and line number, and is
// numbered consecutively from the split line's new index.
CorpusLine splitCorpusLine = new CorpusLine(corpusLine.getLine(), corpusLine.getLineNumber());
splitCorpusLine.setIndex(oldToNewIndexMap.get(corpusLine.getIndex()) + j);
// Unconditional values for part j are applied first.
Map<CorpusElement, String> elementValues = splitAction.elementValues.get(j);
this.setElementValues(elementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
// The first matching element in each group will be applied
// The default element marks the end of each group, and will be
// applied if no other match has applied.
// NOTE(review): as written, every matching non-default action in a group is
// applied, not only the first - groupHasMatch only suppresses the default.
// Confirm which behaviour is intended.
List<ConditionalAction> conditionalActions = splitAction.conditionalValues.get(j);
boolean groupHasMatch = false;
for (ConditionalAction conditionalAction : conditionalActions) {
// Conditions are tested against the original line at a relative offset from
// the line being split.
// NOTE(review): no bounds check on i + relativeIndex is visible here.
CorpusLine baseLine = corpusLines.get(i + conditionalAction.relativeIndex);
if (conditionalAction.isDefault) {
if (!groupHasMatch) {
Map<CorpusElement, String> conditionalElementValues = conditionalAction.elementValues;
this.setElementValues(conditionalElementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
}
// The default action marks the end of each matching group.
groupHasMatch = false;
} else {
// A conditional action matches only if all of its conditions match.
boolean match = true;
for (CorpusElement corpusElement : conditionalAction.conditions.keySet()) {
String origValue = baseLine.getElement(corpusElement);
Pattern pattern = conditionalAction.conditions.get(corpusElement);
if (!pattern.matcher(origValue).matches()) {
match = false;
break;
}
}
if (match) {
Map<CorpusElement, String> conditionalElementValues = conditionalAction.elementValues;
this.setElementValues(conditionalElementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
groupHasMatch = true;
}
// did this action match?
}
// default action?
}
// next conditional action
newCorpusLines.add(splitCorpusLine);
}
// next split
} else {
// Lines without a split action are kept, with renumbered indexes.
newCorpusLines.add(newCorpusLine);
}
// should line be split?
}
// next corpus line
corpusLines = newCorpusLines;
}
return corpusLines;
}
Aggregations