Use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
The class ParseEventStream, method next().
@Override
public ClassificationEvent next() throws TalismaneException, IOException {
  ClassificationEvent event = null;
  if (this.hasNext()) {
    eventCount++;
    LOG.debug("Event " + eventCount + ": " + currentConfiguration.toString());
    List<FeatureResult<?>> parseFeatureResults = new ArrayList<FeatureResult<?>>();
    for (ParseConfigurationFeature<?> parseFeature : parseFeatures) {
      RuntimeEnvironment env = new RuntimeEnvironment();
      FeatureResult<?> featureResult = parseFeature.check(currentConfiguration, env);
      if (featureResult != null) {
        parseFeatureResults.add(featureResult);
      }
    }
    if (LOG.isTraceEnabled()) {
      SortedSet<String> featureResultSet = parseFeatureResults.stream().map(f -> f.toString())
          .collect(Collectors.toCollection(() -> new TreeSet<String>()));
      for (String featureResultString : featureResultSet) {
        LOG.trace(featureResultString);
      }
    }
    Transition transition = targetConfiguration.getTransitions().get(currentIndex);
    String classification = transition.getCode();
    event = new ClassificationEvent(parseFeatureResults, classification);
    // apply the transition and up the index
    currentConfiguration = new ParseConfiguration(currentConfiguration);
    transition.apply(currentConfiguration);
    currentIndex++;
    if (currentIndex == targetConfiguration.getTransitions().size()) {
      targetConfiguration = null;
    }
  }
  return event;
}
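As a usage note, here is a minimal sketch of how a caller might drain an event stream implementing the hasNext()/next() contract shown above, for example to gather training events. The ClassificationEventStream import path and the collecting helper are assumptions, not part of the original snippet.
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import com.joliciel.talismane.TalismaneException;
import com.joliciel.talismane.machineLearning.ClassificationEvent;
import com.joliciel.talismane.machineLearning.ClassificationEventStream;

public class EventStreamConsumerSketch {
  // Collects every event produced by the stream, propagating the checked
  // exceptions declared by next() so the caller decides how to handle them.
  public static List<ClassificationEvent> collectEvents(ClassificationEventStream stream)
      throws TalismaneException, IOException {
    List<ClassificationEvent> events = new ArrayList<>();
    while (stream.hasNext()) {
      events.add(stream.next());
    }
    return events;
  }
}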
Use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
The class ParserRegexBasedCorpusReader, method processSentence().
@Override
protected void processSentence(Sentence sentence, List<CorpusLine> corpusLines) throws TalismaneException, IOException {
  try {
    super.processSentence(sentence, corpusLines);
    PosTaggedToken rootToken = posTagSequence.prependRoot();
    idTokenMap.put(0, rootToken);
    TransitionSystem transitionSystem = TalismaneSession.get(sessionId).getTransitionSystem();
    Set<DependencyArc> dependencies = new TreeSet<>();
    for (CorpusLine dataLine : corpusLines) {
      int headIndex = 0;
      if (dataLine.hasElement(CorpusElement.GOVERNOR))
        headIndex = Integer.parseInt(dataLine.getElement(CorpusElement.GOVERNOR));
      PosTaggedToken head = idTokenMap.get(headIndex);
      PosTaggedToken dependent = idTokenMap.get(dataLine.getIndex());
      String dependencyLabel = dataLine.getElement(CorpusElement.LABEL);
      if (transitionSystem.getDependencyLabels().size() > 1) {
        if (dependencyLabel.length() > 0 && !transitionSystem.getDependencyLabels().contains(dependencyLabel)) {
          throw new UnknownDependencyLabelException((this.getCurrentFile() == null ? "" : this.getCurrentFile().getPath()), dataLine.getLineNumber(), dependencyLabel);
        }
        String nonProjectiveLabel = dataLine.getElement(CorpusElement.NON_PROJ_LABEL);
        if (nonProjectiveLabel != null && nonProjectiveLabel.length() > 0 && !transitionSystem.getDependencyLabels().contains(nonProjectiveLabel)) {
          throw new UnknownDependencyLabelException((this.getCurrentFile() == null ? "" : this.getCurrentFile().getPath()), dataLine.getLineNumber(), nonProjectiveLabel);
        }
      }
      DependencyArc arc = new DependencyArc(head, dependent, dependencyLabel);
      if (LOG.isTraceEnabled())
        LOG.trace(arc.toString());
      dependencies.add(arc);
      if (dataLine.hasElement(CorpusElement.DEP_COMMENT))
        arc.setComment(dataLine.getElement(CorpusElement.DEP_COMMENT));
    }
    configuration = new ParseConfiguration(posTagSequence);
    if (this.predictTransitions) {
      transitionSystem.predictTransitions(configuration, dependencies);
    } else {
      for (DependencyArc arc : dependencies) {
        configuration.addDependency(arc.getHead(), arc.getDependent(), arc.getLabel(), null);
      }
    }
    // add manual non-projective dependencies, if there are any
    if (this.getCorpusLineReader().hasPlaceholder(CorpusElement.NON_PROJ_GOVERNOR)) {
      Set<DependencyArc> nonProjDeps = new TreeSet<>();
      if (LOG.isTraceEnabled())
        LOG.trace("Non projective dependencies: ");
      for (CorpusLine dataLine : corpusLines) {
        int headIndex = 0;
        if (dataLine.hasElement(CorpusElement.NON_PROJ_GOVERNOR))
          headIndex = Integer.parseInt(dataLine.getElement(CorpusElement.NON_PROJ_GOVERNOR));
        PosTaggedToken head = idTokenMap.get(headIndex);
        PosTaggedToken dependent = idTokenMap.get(dataLine.getIndex());
        DependencyArc nonProjArc = new DependencyArc(head, dependent, dataLine.getElement(CorpusElement.NON_PROJ_LABEL));
        if (LOG.isTraceEnabled())
          LOG.trace(nonProjArc.toString());
        nonProjDeps.add(nonProjArc);
        if (dataLine.hasElement(CorpusElement.DEP_COMMENT))
          nonProjArc.setComment(dataLine.getElement(CorpusElement.DEP_COMMENT));
      }
      for (DependencyArc nonProjArc : nonProjDeps) {
        configuration.addManualNonProjectiveDependency(nonProjArc.getHead(), nonProjArc.getDependent(), nonProjArc.getLabel());
      }
    }
  } catch (TalismaneException e) {
    this.clearSentence();
    throw e;
  }
}
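To make the governor-index convention above concrete without depending on PosTaggedToken or DependencyArc, here is a purely illustrative sketch: governor index 0 stands for the artificial root prepended to the sentence, and every other governor index refers to a token by its position. All names below are hypothetical.
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class GovernorIndexSketch {

  // A simplified stand-in for one corpus line: token index, surface form,
  // governor index and dependency label.
  record Line(int index, String form, int governor, String label) {}

  static List<String> describeArcs(List<Line> lines) {
    Map<Integer, String> idTokenMap = new HashMap<>();
    idTokenMap.put(0, "ROOT"); // the prepended artificial root
    for (Line line : lines)
      idTokenMap.put(line.index(), line.form());

    List<String> arcs = new ArrayList<>();
    for (Line line : lines)
      arcs.add(idTokenMap.get(line.governor()) + " -" + line.label() + "-> " + line.form());
    return arcs;
  }

  public static void main(String[] args) {
    List<Line> lines = List.of(
        new Line(1, "the", 2, "det"),
        new Line(2, "cat", 3, "nsubj"),
        new Line(3, "sat", 0, "root"));
    // Prints: cat -det-> the, sat -nsubj-> cat, ROOT -root-> sat
    describeArcs(lines).forEach(System.out::println);
  }
}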
Use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
The class SentenceDetector, method detectSentences().
/**
* Detect sentences within an annotated text. Sentences are added in the form
* of an Annotation around a {@link SentenceBoundary}, with the start position
* (relative to the start of the annotated text) at the start of the sentence
* and the end position immediately after the end of the sentence. <br>
* <br>
* Sentence boundaries will not be detected within any annotation of type
* {@link RawTextNoSentenceBreakMarker}, nor will they be detected before or
* after the {@link AnnotatedText#getAnalysisStart()} and
* {@link AnnotatedText#getAnalysisEnd()} respectively. <br>
* <br>
* If the text contained existing {@link SentenceBoundary} annotations before
* analysis start, the first sentence will begin where the last existing
* annotation ended. Otherwise, the first boundary will begin at position 0.
* <br>
* <br>
* If the text's analysis end is equal to the text length, it is assumed that
* the text end is a sentence boundary. In this case, an additional sentence
* is added starting at the final detected boundary and ending at text end.
*
* @param text
* the annotated text in which we need to detect sentences.
* @return in addition to the annotations added, we return a List of integers
* marking the end position of each sentence boundary.
*/
public List<Integer> detectSentences(AnnotatedText text, String... labels) throws TalismaneException {
  LOG.debug("detectSentences");
  List<Annotation<RawTextNoSentenceBreakMarker>> noSentenceBreakMarkers = text.getAnnotations(RawTextNoSentenceBreakMarker.class);
  Matcher matcher = possibleBoundaryPattern.matcher(text.getText());
  List<Integer> possibleBoundaries = new ArrayList<>();
  while (matcher.find()) {
    if (matcher.start() >= text.getAnalysisStart() && matcher.start() < text.getAnalysisEnd()) {
      boolean noSentences = false;
      int position = matcher.start();
      for (Annotation<RawTextNoSentenceBreakMarker> noSentenceBreakMarker : noSentenceBreakMarkers) {
        if (noSentenceBreakMarker.getStart() <= position && position < noSentenceBreakMarker.getEnd()) {
          noSentences = true;
          break;
        }
      }
      if (!noSentences)
        possibleBoundaries.add(position);
    }
  }
  // collect all deterministic sentence boundaries
  List<Annotation<RawTextSentenceBreakMarker>> sentenceBreakMarkers = text.getAnnotations(RawTextSentenceBreakMarker.class);
  Set<Integer> guessedBoundaries = new TreeSet<>(
      sentenceBreakMarkers.stream().filter(f -> f.getEnd() >= text.getAnalysisStart()).map(f -> f.getEnd()).collect(Collectors.toList()));
  // Share one token sequence for all possible boundaries, to avoid tokenising multiple times
  Sentence sentence = new Sentence(text.getText(), sessionId);
  TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
  List<PossibleSentenceBoundary> boundaries = new ArrayList<>();
  for (int possibleBoundary : possibleBoundaries) {
    PossibleSentenceBoundary boundary = new PossibleSentenceBoundary(tokenSequence, possibleBoundary);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Testing boundary: " + boundary);
      LOG.trace(" at position: " + possibleBoundary);
    }
    List<FeatureResult<?>> featureResults = new ArrayList<>();
    for (SentenceDetectorFeature<?> feature : features) {
      RuntimeEnvironment env = new RuntimeEnvironment();
      FeatureResult<?> featureResult = feature.check(boundary, env);
      if (featureResult != null)
        featureResults.add(featureResult);
    }
    if (LOG.isTraceEnabled()) {
      SortedSet<String> featureResultSet = featureResults.stream().map(f -> f.toString())
          .collect(Collectors.toCollection(() -> new TreeSet<String>()));
      for (String featureResultString : featureResultSet) {
        LOG.trace(featureResultString);
      }
    }
    List<Decision> decisions = this.decisionMaker.decide(featureResults);
    if (LOG.isTraceEnabled()) {
      for (Decision decision : decisions) {
        LOG.trace(decision.getOutcome() + ": " + decision.getProbability());
      }
    }
    if (decisions.get(0).getOutcome().equals(SentenceDetectorOutcome.IS_BOUNDARY.name())) {
      if (LOG.isTraceEnabled()) {
        LOG.trace("Adding boundary: " + (possibleBoundary + 1));
      }
      guessedBoundaries.add(possibleBoundary + 1);
      boundaries.add(boundary);
    }
  }
  if (LOG.isTraceEnabled()) {
    LOG.trace("context: " + text.getText().toString().replace('\n', '¶').replace('\r', '¶'));
    for (PossibleSentenceBoundary boundary : boundaries) {
      LOG.trace("boundary: " + boundary.toString());
    }
  }
  if (LOG.isDebugEnabled())
    LOG.debug("guessedBoundaries : " + guessedBoundaries.toString());
  List<Annotation<SentenceBoundary>> newBoundaries = new ArrayList<>();
  int lastBoundary = 0;
  List<Annotation<SentenceBoundary>> existingBoundaries = text.getAnnotations(SentenceBoundary.class);
  if (existingBoundaries.size() > 0) {
    lastBoundary = existingBoundaries.get(existingBoundaries.size() - 1).getEnd();
  }
  // advance the boundary start until a non-whitespace character is encountered
  while (lastBoundary < text.getAnalysisEnd() && Character.isWhitespace(text.getText().charAt(lastBoundary))) {
    lastBoundary++;
  }
  for (int guessedBoundary : guessedBoundaries) {
    if (guessedBoundary > lastBoundary) {
      Annotation<SentenceBoundary> sentenceBoundary = new Annotation<>(lastBoundary, guessedBoundary, new SentenceBoundary(), labels);
      newBoundaries.add(sentenceBoundary);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Added boundary: " + sentenceBoundary);
      }
      lastBoundary = guessedBoundary;
    }
  }
  if (text.getAnalysisEnd() == text.getText().length()) {
    if (text.getAnalysisEnd() > lastBoundary) {
      Annotation<SentenceBoundary> sentenceBoundary = new Annotation<>(lastBoundary, text.getAnalysisEnd(), new SentenceBoundary(), labels);
      newBoundaries.add(sentenceBoundary);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Added final boundary: " + sentenceBoundary);
      }
    }
  }
  text.addAnnotations(newBoundaries);
  return new ArrayList<>(guessedBoundaries);
}
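A minimal usage sketch for the method above, turning the returned boundary end positions into sentence strings. Only the getText(), getAnnotations() and detectSentences() calls shown in the snippet are relied on; the import paths and the way the SentenceDetector and AnnotatedText instances are obtained are assumptions (they are simply passed in here).
import java.util.ArrayList;
import java.util.List;

// Import paths assumed, not confirmed by the snippet above.
import com.joliciel.talismane.AnnotatedText;
import com.joliciel.talismane.TalismaneException;
import com.joliciel.talismane.sentenceDetector.SentenceDetector;

public class SentenceSplitSketch {
  // Splits the annotated text into sentence strings using the boundary end
  // positions returned by detectSentences (positions are relative to the
  // start of the annotated text).
  static List<String> split(SentenceDetector detector, AnnotatedText text) throws TalismaneException {
    String rawText = text.getText().toString();
    List<Integer> boundaries = detector.detectSentences(text);
    List<String> sentences = new ArrayList<>();
    int start = 0;
    for (int boundary : boundaries) {
      sentences.add(rawText.substring(start, boundary).trim());
      start = boundary;
    }
    return sentences;
  }
}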
Use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
The class SentenceDetectorEventStream, method next().
@Override
public ClassificationEvent next() throws TalismaneException, IOException {
  ClassificationEvent event = null;
  if (this.hasNext()) {
    int possibleBoundary = possibleBoundaries.get(currentIndex++);
    String moreText = "";
    int sentenceIndex = 0;
    while (moreText.length() < minCharactersAfterBoundary) {
      String nextSentence = "";
      if (sentenceIndex < sentences.size()) {
        nextSentence = sentences.get(sentenceIndex);
      } else if (corpusReader.hasNextSentence()) {
        nextSentence = corpusReader.nextSentence().getText().toString();
        sentences.add(nextSentence);
      } else {
        break;
      }
      if (nextSentence.startsWith(" ") || nextSentence.startsWith("\n"))
        moreText += sentences.get(sentenceIndex);
      else
        moreText += " " + sentences.get(sentenceIndex);
      sentenceIndex++;
    }
    String text = previousSentence + currentSentence + moreText;
    PossibleSentenceBoundary boundary = new PossibleSentenceBoundary(text, possibleBoundary, sessionId);
    LOG.debug("next event, boundary: " + boundary);
    List<FeatureResult<?>> featureResults = new ArrayList<FeatureResult<?>>();
    for (SentenceDetectorFeature<?> feature : features) {
      RuntimeEnvironment env = new RuntimeEnvironment();
      FeatureResult<?> featureResult = feature.check(boundary, env);
      if (featureResult != null)
        featureResults.add(featureResult);
    }
    if (LOG.isTraceEnabled()) {
      SortedSet<String> featureResultSet = featureResults.stream().map(f -> f.toString())
          .collect(Collectors.toCollection(() -> new TreeSet<String>()));
      for (String featureResultString : featureResultSet) {
        LOG.trace(featureResultString);
      }
    }
    String classification = SentenceDetectorOutcome.IS_NOT_BOUNDARY.name();
    if (possibleBoundary == realBoundary)
      classification = SentenceDetectorOutcome.IS_BOUNDARY.name();
    event = new ClassificationEvent(featureResults, classification);
    if (currentIndex == possibleBoundaries.size()) {
      if (currentSentence.endsWith(" "))
        previousSentence = currentSentence;
      else
        previousSentence = currentSentence + " ";
      currentSentence = null;
    }
  }
  return event;
}
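The windowing rule in the loop above (keep appending following sentences until at least minCharactersAfterBoundary characters of context are available after the candidate boundary) can be isolated as follows; this is an illustrative sketch with hypothetical names, not part of the original class.
import java.util.Iterator;

public class ContextWindowSketch {
  // Accumulates context after a candidate boundary, preserving the spacing rule
  // used above: a sentence that already starts with whitespace is appended
  // as-is, otherwise a single space is inserted before it.
  static String contextAfterBoundary(Iterator<String> followingSentences, int minChars) {
    StringBuilder moreText = new StringBuilder();
    while (moreText.length() < minChars && followingSentences.hasNext()) {
      String next = followingSentences.next();
      if (next.startsWith(" ") || next.startsWith("\n"))
        moreText.append(next);
      else
        moreText.append(' ').append(next);
    }
    return moreText.toString();
  }
}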
Use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.
The class RawTextAnnotatorFactory, method getAnnotator().
/**
 * @param descriptor
 *          a tab-separated descriptor, starting with the annotator's class name
 * @param blockSize
 *          the block size to assign to the constructed annotator
 * @return the corresponding RawTextAnnotator
 * @throws TalismaneException
 *           if a descriptor is incorrectly configured
 */
public RawTextAnnotator getAnnotator(String descriptor, int blockSize) throws TalismaneException {
  RawTextAnnotator filter = null;
  List<Class<? extends RawTextAnnotator>> classes = new ArrayListNoNulls<Class<? extends RawTextAnnotator>>();
  classes.add(DuplicateWhiteSpaceFilter.class);
  classes.add(NewlineEndOfSentenceMarker.class);
  classes.add(NewlineSpaceMarker.class);
  String[] parts = descriptor.split("\t");
  String filterName = parts[0];
  // also accept the name "RegexMarkerFilter" for historical reasons
  if (filterName.equals("RegexMarkerFilter") || filterName.equals(RawTextRegexAnnotator.class.getSimpleName())) {
    String[] filterTypeStrings = parts[1].split(",");
    List<RawTextMarkType> filterTypes = new ArrayListNoNulls<RawTextMarkType>();
    for (String filterTypeString : filterTypeStrings) {
      filterTypes.add(RawTextMarkType.valueOf(filterTypeString));
    }
    boolean needsReplacement = false;
    boolean needsTag = false;
    int minParams = 3;
    if (filterTypes.contains(RawTextMarkType.REPLACE)) {
      needsReplacement = true;
      minParams = 4;
    } else if (filterTypes.contains(RawTextMarkType.TAG)) {
      needsTag = true;
      minParams = 4;
    }
    if (parts.length == minParams + 1) {
      filter = new RawTextRegexAnnotator(filterTypes, parts[2], Integer.parseInt(parts[3]), blockSize);
      if (needsReplacement)
        filter.setReplacement(parts[4]);
      if (needsTag) {
        if (parts[4].indexOf('=') >= 0) {
          String attribute = parts[4].substring(0, parts[4].indexOf('='));
          String value = parts[4].substring(parts[4].indexOf('=') + 1);
          filter.setAttribute(new StringAttribute(attribute, value));
        } else {
          filter.setAttribute(new StringAttribute(parts[4], ""));
        }
      }
    } else if (parts.length == minParams) {
      filter = new RawTextRegexAnnotator(filterTypes, parts[2], 0, blockSize);
      if (needsReplacement)
        filter.setReplacement(parts[3]);
      if (needsTag) {
        if (parts[3].indexOf('=') >= 0) {
          String attribute = parts[3].substring(0, parts[3].indexOf('='));
          String value = parts[3].substring(parts[3].indexOf('=') + 1);
          filter.setAttribute(new StringAttribute(attribute, value));
        } else {
          filter.setAttribute(new StringAttribute(parts[3], ""));
        }
      }
    } else {
      throw new TalismaneException("Wrong number of arguments for " + RawTextRegexAnnotator.class.getSimpleName() + ". Expected " + minParams + " or "
          + (minParams + 1) + ", but was " + parts.length);
    }
  } else {
    for (Class<? extends RawTextAnnotator> clazz : classes) {
      if (filterName.equals(clazz.getSimpleName())) {
        try {
          Constructor<? extends RawTextAnnotator> constructor = clazz.getConstructor(Integer.class);
          filter = constructor.newInstance(blockSize);
        } catch (ReflectiveOperationException e) {
          throw new TalismaneException("Problem building class: " + filterName, e);
        }
      }
    }
    if (filter == null)
      throw new TalismaneException("Unknown text filter class: " + filterName);
  }
  return filter;
}
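Reading the parsing logic above back into example inputs: a descriptor is a tab-separated string whose first field is the filter's simple class name ("RegexMarkerFilter" is still accepted for the regex annotator), followed in the regex case by the comma-separated mark types, the regex, an optional capturing-group index, and, for REPLACE or TAG, a final replacement or attribute=value field. The sketch below only uses descriptor strings consistent with that parsing; how a factory instance is obtained, and the exact import paths, are assumptions.
// Import paths assumed, not confirmed by the snippet above.
import com.joliciel.talismane.TalismaneException;
import com.joliciel.talismane.rawText.RawTextAnnotator;
import com.joliciel.talismane.rawText.RawTextAnnotatorFactory;

public class DescriptorSketch {
  // Builds a few annotators from descriptor strings matching the parsing above;
  // the factory instance is simply passed in.
  static void buildExamples(RawTextAnnotatorFactory factory, int blockSize) throws TalismaneException {
    // 4 fields: name, mark type, regex, replacement (capturing group defaults to 0)
    RawTextAnnotator replace = factory.getAnnotator("RawTextRegexAnnotator\tREPLACE\t&amp;\t&", blockSize);
    // 5 fields: name, mark type, regex, capturing-group index, attribute=value
    RawTextAnnotator tag = factory.getAnnotator("RawTextRegexAnnotator\tTAG\t<b>(.*?)</b>\t1\tstyle=bold", blockSize);
    // A parameterless filter is selected by its simple class name alone
    RawTextAnnotator newlines = factory.getAnnotator("NewlineEndOfSentenceMarker", blockSize);
  }
}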