use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class TokenSequenceProcessor method getProcessors.
/**
 * Builds the token sequence processors listed under the configuration key
 * talismane.core.[sessionId].tokeniser.output.processors.<br>
 * <br>
 * Every configured class must implement this interface. For each class the
 * following public constructors are looked up in this order, and the first
 * one found is used:<br>
 * - ( {@link Writer} writer, {@link String} sessionId) - only looked up
 * while the supplied writer is non-null and has not yet been handed to a
 * processor<br>
 * - ( {@link File} outputDir, {@link String} sessionId)<br>
 * - ( {@link String} sessionId)<br>
 *
 * @param writer
 *          if non-null, given to the first configured processor which
 *          declares a (Writer, String) constructor
 * @param outDir
 *          directory in which to write the various outputs; created when
 *          non-null and passed unchanged to (File, String) constructors
 * @param sessionId
 *          used to build the configuration path and passed to every
 *          processor constructor
 * @return the instantiated processors, in the order they are configured
 * @throws IOException
 * @throws ReflectiveOperationException
 *           if a configured class cannot be loaded or instantiated
 * @throws TalismaneException
 *           if a processor does not implement this interface, or if no
 *           constructor is found with the correct signature
 */
public static List<TokenSequenceProcessor> getProcessors(Writer writer, File outDir, String sessionId) throws IOException, ReflectiveOperationException, ClassNotFoundException, TalismaneException {
  Config tokeniserConfig = ConfigFactory.load().getConfig("talismane.core." + sessionId + ".tokeniser");
  List<TokenSequenceProcessor> result = new ArrayList<>();
  List<String> classNames = tokeniserConfig.getStringList("output.processors");
  if (outDir != null) {
    outDir.mkdirs();
  }

  // The writer is handed out at most once: null means it was never
  // supplied, or has already been claimed by an earlier processor.
  Writer unclaimedWriter = writer;
  for (String className : classNames) {
    Class<?> loaded = Class.forName(className);
    if (!TokenSequenceProcessor.class.isAssignableFrom(loaded)) {
      throw new TalismaneException("Class " + className + " does not implement interface " + TokenSequenceProcessor.class.getSimpleName());
    }
    Class<? extends TokenSequenceProcessor> type = loaded.asSubclass(TokenSequenceProcessor.class);

    // First choice: (Writer, String), but only while a writer is available.
    Constructor<? extends TokenSequenceProcessor> writerCons = null;
    if (unclaimedWriter != null) {
      try {
        writerCons = type.getConstructor(Writer.class, String.class);
      } catch (NoSuchMethodException ignored) {
        // no such constructor: try the next signature
      }
    }
    if (writerCons != null) {
      result.add(writerCons.newInstance(unclaimedWriter, sessionId));
      unclaimedWriter = null;
      continue;
    }

    // Second choice: (File, String).
    Constructor<? extends TokenSequenceProcessor> dirCons = null;
    try {
      dirCons = type.getConstructor(File.class, String.class);
    } catch (NoSuchMethodException ignored) {
      // no such constructor: try the next signature
    }
    if (dirCons != null) {
      result.add(dirCons.newInstance(outDir, sessionId));
      continue;
    }

    // Last choice: (String). Nothing left to fall back on after this.
    Constructor<? extends TokenSequenceProcessor> sessionCons;
    try {
      sessionCons = type.getConstructor(String.class);
    } catch (NoSuchMethodException ignored) {
      throw new TalismaneException("No constructor found with correct signature for: " + className);
    }
    result.add(sessionCons.newInstance(sessionId));
  }
  return result;
}
use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class PatternEventStream method next.
/**
 * Builds the classification event for the pattern match at the current
 * index, then moves on to the following one.
 *
 * @return the event for the current pattern match, or null when
 *         {@code hasNext()} returns false
 */
@Override
public ClassificationEvent next() throws TalismaneException, IOException {
  if (!this.hasNext()) {
    return null;
  }

  TokenPatternMatch match = currentPatternMatches.get(currentIndex);
  TokeniserOutcome outcome = currentOutcomes.get(currentIndex);
  String classification = outcome.name();
  LOG.debug("next event, pattern match: " + match.toString() + ", outcome:" + classification);

  // Run every feature against the match, keeping only non-null results.
  List<FeatureResult<?>> results = new ArrayList<FeatureResult<?>>();
  for (TokenPatternMatchFeature<?> feature : tokenPatternMatchFeatures) {
    FeatureResult<?> result = feature.check(match, new RuntimeEnvironment());
    if (result != null) {
      results.add(result);
    }
  }

  if (LOG.isTraceEnabled()) {
    // Trace the results as sorted, de-duplicated strings.
    SortedSet<String> sortedResults = new TreeSet<String>();
    for (FeatureResult<?> result : results) {
      sortedResults.add(result.toString());
    }
    for (String resultString : sortedResults) {
      LOG.trace(resultString);
    }
  }

  ClassificationEvent event = new ClassificationEvent(results, classification);
  currentIndex++;
  if (currentIndex == currentPatternMatches.size()) {
    // current batch exhausted: drop it
    currentPatternMatches = null;
  }
  return event;
}
use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class TokenPattern method parsePattern.
/**
* Break the regexp up into chunks, where each chunk will match one token.
* <br>
* The regexp is scanned one character at a time. Chunk boundaries are
* recorded by calling {@code addPattern} with the regexp, a start index, an
* end index, the list being built, and the current "in exception" flag
* (set on '{', cleared on '}').<br>
* NOTE(review): addPattern is not visible here; it presumably adds
* regexp.substring(start, end) to the list and maintains indexesToTest -
* confirm against its implementation.
*
* @param regexp
* the token pattern expression to split
* @return the list of patterns that was passed to addPattern during the scan
* @throws TalismaneException
* if a separator character is found inside a [...] grouping which
* already contains a non-separator character
* @throws InvalidTokenPatternException
* if indexesToTest is empty once the scan is finished
*/
List<Pattern> parsePattern(String regexp) throws TalismaneException {
// true for exactly one character after an unescaped backslash
boolean inLiteral = false;
// true between '{' and '}'
boolean inException = false;
// true between '[' and ']'
boolean inGrouping = false;
// true once the current grouping has seen a plain non-separator character
boolean groupingHasLetters = false;
int groupingStart = 0;
List<Pattern> parsedPattern = new ArrayList<Pattern>();
// [currentStart, currentEnd) is the chunk accumulated so far
int currentStart = 0;
int currentEnd = 0;
for (int i = 0; i < regexp.length(); i++) {
char c = regexp.charAt(i);
if (!inLiteral && c == '\\') {
// start of an escape: the escaped character is handled on the next
// iteration
inLiteral = true;
} else if (inLiteral) {
if (c == 'd' || c == 'D' || c == 'z') {
// digit or non-digit = not a separator
// \z is included here because we're only expecting it
// inside negative lookahead
currentEnd = i + 1;
} else if (inGrouping) {
// escaped character inside [...]: just extend the current chunk
currentEnd = i + 1;
} else {
// always a separator
// either an actual separator, or the patterns \p (all
// separators) or \s (whitespace)
// or \b (whitespace/sentence start/sentence end)
this.addPattern(regexp, currentStart, currentEnd, parsedPattern, inException);
// i - 1 so that the chunk includes the backslash itself
this.addPattern(regexp, i - 1, i + 1, parsedPattern, inException);
currentStart = i + 1;
currentEnd = i + 1;
}
inLiteral = false;
} else if (c == '[') {
inGrouping = true;
groupingHasLetters = false;
groupingStart = i;
currentEnd = i + 1;
} else if (c == ']') {
if (!groupingHasLetters) {
// no plain non-separator character was seen in the grouping: emit
// whatever precedes it, then the grouping as a chunk of its own
if (groupingStart > 0) {
this.addPattern(regexp, currentStart, groupingStart, parsedPattern, inException);
}
this.addPattern(regexp, groupingStart, i + 1, parsedPattern, inException);
currentStart = i + 1;
currentEnd = i + 1;
} else {
// grouping with letters stays part of the current chunk
currentEnd = i + 1;
}
inGrouping = false;
} else if (c == '{') {
// flush the current chunk (with the old flag), then turn the exception
// flag on; the brace itself is excluded from any chunk
this.addPattern(regexp, currentStart, currentEnd, parsedPattern, inException);
inException = true;
currentStart = i + 1;
currentEnd = i + 1;
} else if (c == '}') {
// flush the chunk accumulated inside the braces, then turn the flag off
this.addPattern(regexp, currentStart, currentEnd, parsedPattern, inException);
inException = false;
currentStart = i + 1;
currentEnd = i + 1;
} else if (c == '.' || c == '+' || c == '(' || c == '|' || c == ')' || c == '^' || c == '?' || c == '!') {
// special meaning characters, not separators
currentEnd = i + 1;
} else if (c == '-') {
// either the dash separator, or a character range (e.g. A-Z)
if (inGrouping) {
// do nothing
// we don't know if it's a separator grouping or a character
// range
} else {
// a separator
this.addPattern(regexp, currentStart, currentEnd, parsedPattern, inException);
this.addPattern(regexp, i, i + 1, parsedPattern, inException);
currentStart = i + 1;
currentEnd = i + 1;
}
} else if (separatorPattern.matcher("" + c).find()) {
if (inGrouping) {
// a separator inside a grouping is only accepted while the grouping
// has no letters
if (groupingHasLetters) {
throw new TalismaneException("Cannot mix separators and non-separators in same grouping");
}
} else {
// a separator
this.addPattern(regexp, currentStart, currentEnd, parsedPattern, inException);
this.addPattern(regexp, i, i + 1, parsedPattern, inException);
currentStart = i + 1;
currentEnd = i + 1;
}
} else {
// any other non-separating character
if (inGrouping) {
groupingHasLetters = true;
}
currentEnd = i + 1;
}
}
// flush whatever is left after the last character
this.addPattern(regexp, currentStart, currentEnd, parsedPattern, inException);
if (LOG.isTraceEnabled()) {
// i is only the position of each pattern in the list, used to look it up
// in indexesToTest
int i = 0;
LOG.trace("Parsed " + regexp);
for (Pattern pattern : parsedPattern) {
boolean test = indexesToTest.contains(i);
LOG.trace("Added " + pattern.pattern() + " Test? " + test);
i++;
}
}
if (indexesToTest.size() == 0) {
throw new InvalidTokenPatternException("No indexes to test in pattern: " + this.getName());
}
return parsedPattern;
}
use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class TokenEvaluationObserver method getTokenEvaluationObservers.
/**
 * Collect the token evaluation observers for a session.<br>
 * <br>
 * The returned list starts with one {@link TokenSequenceProcessorWrapper}
 * per processor returned by
 * {@code TokenSequenceProcessor.getProcessors(null, outDir, sessionId)},
 * followed by the observers specified in the configuration key
 * talismane.core.[sessionId].tokeniser.evaluate.observers.<br>
 * <br>
 * Each observer must implement this interface and must have a constructor
 * matching one of the following signatures, tried in this order:<br>
 * - ( {@link File} outputDir, {@link String} sessionId)<br>
 * - ( {@link String} sessionId)<br>
 *
 * @param outDir
 *          directory in which to write the various outputs; created when
 *          non-null and passed unchanged to (File, String) constructors
 * @param sessionId
 *          used to build the configuration path and passed to every
 *          observer constructor
 * @return the wrapped processors followed by the configured observers
 * @throws IOException
 * @throws ReflectiveOperationException
 *           if a configured class cannot be loaded or instantiated
 * @throws TalismaneException
 *           if an observer does not implement this interface, or if no
 *           constructor is found with the correct signature
 */
public static List<TokenEvaluationObserver> getTokenEvaluationObservers(File outDir, String sessionId) throws IOException, TalismaneException, ReflectiveOperationException {
  // Create the output directory once, before anything is instantiated.
  if (outDir != null) {
    outDir.mkdirs();
  }
  Config config = ConfigFactory.load();
  Config tokeniserConfig = config.getConfig("talismane.core." + sessionId + ".tokeniser");
  Config evalConfig = tokeniserConfig.getConfig("evaluate");

  List<TokenEvaluationObserver> observers = new ArrayList<>();

  // Output processors take part in evaluation through a wrapper; no writer
  // is supplied to them here.
  for (TokenSequenceProcessor processor : TokenSequenceProcessor.getProcessors(null, outDir, sessionId)) {
    observers.add(new TokenSequenceProcessorWrapper(processor));
  }

  List<String> classes = evalConfig.getStringList("observers");
  for (String className : classes) {
    Class<?> loaded = Class.forName(className);
    if (!TokenEvaluationObserver.class.isAssignableFrom(loaded)) {
      throw new TalismaneException("Class " + className + " does not implement interface " + TokenEvaluationObserver.class.getSimpleName());
    }
    // Safe after the isAssignableFrom check above.
    Class<? extends TokenEvaluationObserver> clazz = loaded.asSubclass(TokenEvaluationObserver.class);

    // Preferred signature: (File, String).
    Constructor<? extends TokenEvaluationObserver> cons = null;
    try {
      cons = clazz.getConstructor(File.class, String.class);
    } catch (NoSuchMethodException ignored) {
      // deliberate: fall back to the (String) signature below
    }

    TokenEvaluationObserver observer;
    if (cons != null) {
      observer = cons.newInstance(outDir, sessionId);
    } else {
      // Fallback signature: (String). Nothing else is supported.
      try {
        cons = clazz.getConstructor(String.class);
      } catch (NoSuchMethodException ignored) {
        throw new TalismaneException("No constructor found with correct signature for: " + className);
      }
      observer = cons.newInstance(sessionId);
    }
    observers.add(observer);
  }
  return observers;
}
use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
the class TokenRegexBasedCorpusReader method processSentence.
/**
 * Runs the superclass processing, then builds a new
 * {@link PretokenisedSequence} for the sentence: one
 * {@code convertToToken} call per corpus line, followed by every token
 * filter, followed by {@code cleanSlate()}.
 *
 * @param sentence
 *          the sentence the token sequence is built for
 * @param corpusLines
 *          the corpus lines converted into tokens, in order
 * @throws TalismaneException
 *           rethrown unchanged after {@code clearSentence()} has been called
 * @throws IOException
 */
@Override
protected void processSentence(Sentence sentence, List<CorpusLine> corpusLines) throws TalismaneException, IOException {
  try {
    super.processSentence(sentence, corpusLines);

    // Start from a fresh sequence and fill it from the corpus lines.
    tokenSequence = new PretokenisedSequence(sentence, sessionId);
    for (CorpusLine line : corpusLines) {
      this.convertToToken(tokenSequence, line);
    }

    // Filters run only once every token has been added.
    for (TokenFilter tokenFilter : filters) {
      tokenFilter.apply(tokenSequence);
    }
    tokenSequence.cleanSlate();
  } catch (TalismaneException failure) {
    // Discard the partially processed sentence before propagating.
    this.clearSentence();
    throw failure;
  }
}
Aggregations