use of com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker in project talismane by joliciel-informatique.
the class SentenceDetector method detectSentences.
/**
 * Detect sentences within an annotated text. Sentences are added in the form
 * of an Annotation around a {@link SentenceBoundary}, with the start position
 * (relative to the start of the annotated text) at the start of the sentence
 * and the end position immediately after the end of the sentence. <br>
 * <br>
 * Sentence boundaries will not be detected within any annotation of type
 * {@link RawTextNoSentenceBreakMarker}, nor will they be detected before or
 * after the {@link AnnotatedText#getAnalysisStart()} and
 * {@link AnnotatedText#getAnalysisEnd()} respectively. <br>
 * <br>
 * If the text contained existing {@link SentenceBoundary} annotations before
 * analysis start, the first sentence will begin where the last existing
 * annotation ended. Otherwise, the first boundary will begin at position 0.
 * In both cases the start is then advanced past any leading whitespace. <br>
 * <br>
 * If the text's analysis end is equal to the text length, it is assumed that
 * the text end is a sentence boundary. In this case, an additional sentence
 * is added starting at the final detected boundary and ending at text end.
 *
 * @param text
 *          the annotated text in which we need to detect sentences.
 * @param labels
 *          labels handed to every {@link SentenceBoundary} annotation created
 *          by this call.
 * @return in addition to the annotations added, we return a List of integers
 *         marking the end position of each sentence boundary, in ascending
 *         order. The list holds the ends of the
 *         {@link RawTextSentenceBreakMarker} annotations ending at or after
 *         analysis start, plus one position past each candidate accepted by
 *         the decision maker. The extra sentence closed at text end is
 *         <b>not</b> included in this list.
 * @throws TalismaneException
 *           declared by the signature; presumably propagated from the feature
 *           checks or the decision maker (to be confirmed against those
 *           APIs).
 */
public List<Integer> detectSentences(AnnotatedText text, String... labels) throws TalismaneException {
  LOG.debug("detectSentences");

  // Collect candidate positions: every pattern match that starts inside the
  // analysis window and is not covered by a no-sentence-break marker.
  List<Annotation<RawTextNoSentenceBreakMarker>> noSentenceBreakMarkers = text.getAnnotations(RawTextNoSentenceBreakMarker.class);
  Matcher matcher = possibleBoundaryPattern.matcher(text.getText());
  List<Integer> possibleBoundaries = new ArrayList<>();
  while (matcher.find()) {
    if (matcher.start() >= text.getAnalysisStart() && matcher.start() < text.getAnalysisEnd()) {
      boolean noSentences = false;
      int position = matcher.start();
      for (Annotation<RawTextNoSentenceBreakMarker> noSentenceBreakMarker : noSentenceBreakMarkers) {
        // marker span is treated as half-open: [start, end)
        if (noSentenceBreakMarker.getStart() <= position && position < noSentenceBreakMarker.getEnd()) {
          noSentences = true;
          break;
        }
      }
      if (!noSentences)
        possibleBoundaries.add(position);
    }
  }

  // collect all deterministic sentence boundaries
  List<Annotation<RawTextSentenceBreakMarker>> sentenceBreakMarkers = text.getAnnotations(RawTextSentenceBreakMarker.class);
  // TreeSet: keeps boundaries sorted and removes duplicates between
  // deterministic and guessed positions
  Set<Integer> guessedBoundaries = new TreeSet<>(
      sentenceBreakMarkers.stream().filter(f -> f.getEnd() >= text.getAnalysisStart()).map(f -> f.getEnd()).collect(Collectors.toList()));

  // Share one token sequence for all possible boundaries, to avoid tokenising
  // multiple times
  Sentence sentence = new Sentence(text.getText(), sessionId);
  TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);

  List<PossibleSentenceBoundary> boundaries = new ArrayList<>();
  for (int possibleBoundary : possibleBoundaries) {
    PossibleSentenceBoundary boundary = new PossibleSentenceBoundary(tokenSequence, possibleBoundary);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Testing boundary: " + boundary);
      LOG.trace(" at position: " + possibleBoundary);
    }

    List<FeatureResult<?>> featureResults = new ArrayList<>();
    for (SentenceDetectorFeature<?> feature : features) {
      RuntimeEnvironment env = new RuntimeEnvironment();
      FeatureResult<?> featureResult = feature.check(boundary, env);
      if (featureResult != null)
        featureResults.add(featureResult);
    }
    if (LOG.isTraceEnabled()) {
      // sorted so that the trace output is stable from run to run
      SortedSet<String> featureResultSet = featureResults.stream().map(f -> f.toString()).collect(Collectors.toCollection(() -> new TreeSet<String>()));
      for (String featureResultString : featureResultSet) {
        LOG.trace(featureResultString);
      }
    }

    List<Decision> decisions = this.decisionMaker.decide(featureResults);
    if (LOG.isTraceEnabled()) {
      for (Decision decision : decisions) {
        LOG.trace(decision.getOutcome() + ": " + decision.getProbability());
      }
    }

    // only the first decision returned by the decision maker is consulted
    if (decisions.get(0).getOutcome().equals(SentenceDetectorOutcome.IS_BOUNDARY.name())) {
      // the sentence ends immediately AFTER the boundary character
      int boundaryEnd = possibleBoundary + 1;
      if (LOG.isTraceEnabled()) {
        // log the numeric sum; without the pre-computed value, string
        // concatenation would print the position followed by a literal "1"
        LOG.trace("Adding boundary: " + boundaryEnd);
      }
      guessedBoundaries.add(boundaryEnd);
      boundaries.add(boundary);
    }
  }

  if (LOG.isTraceEnabled()) {
    LOG.trace("context: " + text.getText().toString().replace('\n', '¶').replace('\r', '¶'));
    for (PossibleSentenceBoundary boundary : boundaries)
      LOG.trace("boundary: " + boundary.toString());
  }
  if (LOG.isDebugEnabled())
    LOG.debug("guessedBoundaries : " + guessedBoundaries.toString());

  // Turn the sorted boundary positions into [start, end) sentence annotations.
  List<Annotation<SentenceBoundary>> newBoundaries = new ArrayList<>();
  int lastBoundary = 0;
  List<Annotation<SentenceBoundary>> existingBoundaries = text.getAnnotations(SentenceBoundary.class);
  if (existingBoundaries.size() > 0) {
    // continue from where the last pre-existing sentence ended
    lastBoundary = existingBoundaries.get(existingBoundaries.size() - 1).getEnd();
  }

  // advance boundary start until a non space character is encountered
  while (lastBoundary < text.getAnalysisEnd() && Character.isWhitespace(text.getText().charAt(lastBoundary))) {
    lastBoundary++;
  }

  for (int guessedBoundary : guessedBoundaries) {
    // boundaries at or before the current sentence start are skipped
    if (guessedBoundary > lastBoundary) {
      Annotation<SentenceBoundary> sentenceBoundary = new Annotation<>(lastBoundary, guessedBoundary, new SentenceBoundary(), labels);
      newBoundaries.add(sentenceBoundary);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Added boundary: " + sentenceBoundary);
      }
      lastBoundary = guessedBoundary;
    }
  }

  // when the analysis window reaches the end of the text, the text end itself
  // closes the last sentence
  if (text.getAnalysisEnd() == text.getText().length()) {
    if (text.getAnalysisEnd() > lastBoundary) {
      Annotation<SentenceBoundary> sentenceBoundary = new Annotation<>(lastBoundary, text.getAnalysisEnd(), new SentenceBoundary(), labels);
      newBoundaries.add(sentenceBoundary);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Added final boundary: " + sentenceBoundary);
      }
    }
  }

  text.addAnnotations(newBoundaries);
  return new ArrayList<>(guessedBoundaries);
}
use of com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker in project talismane by joliciel-informatique.
the class RawTextTest method testNoSentenceAnnotationLocation.
/**
 * Adds two no-sentence-break markers and one skip marker to a raw text, then
 * checks that the processed text no longer contains the skipped span and that
 * both no-sentence-break markers are reported at the expected processed-text
 * offsets.
 */
@Test
public void testNoSentenceAnnotationLocation() throws Exception {
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  final Config config = ConfigFactory.load();
  final String sessionId = "test";
  final String[] labels = new String[0];

  // offsets inside the raw text (skip tag still present)
  final int rawFirstStart = "".length();
  final int rawFirstEnd = "Mr.".length();
  final int rawSkipStart = "Mr. Jones and ".length();
  final int rawSkipEnd = "Mr. Jones and <skip/>".length();
  final int rawSecondEnd = "Mr. Jones and <skip/>Mrs.".length();

  // offsets expected inside the processed text (skip tag gone)
  final int processedSecondStart = "Mr. Jones and ".length();
  final int processedSecondEnd = "Mr. Jones and Mrs.".length();

  String rawContent = "Mr. Jones and <skip/>Mrs. Smith.";
  RawText textBlock = new RawText(rawContent, true, sessionId);

  List<Annotation<RawTextNoSentenceBreakMarker>> noBreakAnnotations = new ArrayList<>();
  System.out.println("we add no sentence break annotations (as if they were added by a filter)");
  noBreakAnnotations.add(new Annotation<>(rawFirstStart, rawFirstEnd, new RawTextNoSentenceBreakMarker("me"), labels));
  // the second marker starts right after the skipped tag
  noBreakAnnotations.add(new Annotation<>(rawSkipEnd, rawSecondEnd, new RawTextNoSentenceBreakMarker("me"), labels));
  textBlock.addAnnotations(noBreakAnnotations);

  List<Annotation<RawTextSkipMarker>> skipAnnotations = new ArrayList<>();
  skipAnnotations.add(new Annotation<>(rawSkipStart, rawSkipEnd, new RawTextSkipMarker("me"), labels));
  textBlock.addAnnotations(skipAnnotations);

  System.out.println("textBlock text: " + textBlock.getText());
  System.out.println("textBlock annotations: " + textBlock.getAnnotations().toString());

  AnnotatedText processed = textBlock.getProcessedText();
  assertEquals("Mr. Jones and Mrs. Smith.", processed.getText());

  // ensure that the no sentence break text got added at the right place
  // in the processed text
  noBreakAnnotations = processed.getAnnotations(RawTextNoSentenceBreakMarker.class);
  System.out.println("Processed annotations: " + noBreakAnnotations);
  assertEquals(2, noBreakAnnotations.size());

  Annotation<RawTextNoSentenceBreakMarker> first = noBreakAnnotations.get(0);
  assertEquals(rawFirstStart, first.getStart());
  assertEquals(rawFirstEnd, first.getEnd());

  Annotation<RawTextNoSentenceBreakMarker> second = noBreakAnnotations.get(1);
  assertEquals(processedSecondStart, second.getStart());
  assertEquals(processedSecondEnd, second.getEnd());
}
use of com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker in project talismane by joliciel-informatique.
the class RollingTextBlockTest method testNoSentenceAnnotationLocation.
/**
 * Feeds a text piece by piece into a {@link RollingTextBlock}, adding
 * no-sentence-break and skip annotations to the raw text block between rolls,
 * and checks after selected rolls that the processed text and the offsets of
 * the no-sentence-break annotations in it match the expected values.
 */
@Test
public void testNoSentenceAnnotationLocation() throws Exception {
System.setProperty("config.file", "src/test/resources/test.conf");
ConfigFactory.invalidateCaches();
final Config config = ConfigFactory.load();
final String sessionId = "test";
String[] labels = new String[0];
// The full text, fed below in six pieces:
// "I see Mr. Jones and <skip/>Mrs. Smith."
RollingTextBlock textBlock = new RollingTextBlock(true, null, sessionId);
// each roll returns a new block, which replaces the previous reference
textBlock = textBlock.roll("I see ");
textBlock = textBlock.roll("Mr. Jones ");
textBlock = textBlock.roll("and <sk");
AnnotatedText rawText = textBlock.getRawTextBlock();
System.out.println("rawText text: " + rawText.getText());
List<Annotation<RawTextNoSentenceBreakMarker>> noSentenceBreaks = new ArrayList<>();
System.out.println("we add no sentence break annotations (as if they were added by a filter)");
// offsets are relative to the raw text block; presumably that block starts
// at "Mr." at this point (the marker covers 0.."Mr.".length()) - TODO confirm
noSentenceBreaks.add(new Annotation<>("".length(), "Mr.".length(), new RawTextNoSentenceBreakMarker("me"), labels));
rawText.addAnnotations(noSentenceBreaks);
System.out.println("textBlock text: " + textBlock.getText());
System.out.println("textBlock annotations: " + textBlock.getAnnotations().toString());
// the skip tag "<skip/>" is split across two rolls ("<sk" + "ip/>")
textBlock = textBlock.roll("ip/>Mrs.");
// the raw text block has to be fetched again from the newly rolled block
rawText = textBlock.getRawTextBlock();
System.out.println("rawText text: " + rawText.getText());
List<Annotation<RawTextSkipMarker>> skips = new ArrayList<>();
// skip marker covers the whole "<skip/>" tag; offsets suggest the raw text
// block now starts at "and " - TODO confirm
skips.add(new Annotation<>("and ".length(), "and <skip/>".length(), new RawTextSkipMarker("me"), labels));
rawText.addAnnotations(skips);
AnnotatedText processedTextBlock = textBlock.getProcessedText();
// expected processed text at this stage ends just before the skipped tag
assertEquals("I see Mr. Jones and ", processedTextBlock.getText());
// ensure that the no sentence break text got added at the right place
// in the processed text
noSentenceBreaks = processedTextBlock.getAnnotations(RawTextNoSentenceBreakMarker.class);
System.out.println("Processed annotations: " + noSentenceBreaks);
assertEquals(1, noSentenceBreaks.size());
// the marker added at raw offsets 0..3 is expected at the position of "Mr."
// within the processed text
assertEquals("I see ".length(), noSentenceBreaks.get(0).getStart());
assertEquals("I see Mr.".length(), noSentenceBreaks.get(0).getEnd());
textBlock = textBlock.roll(" Smith.");
rawText = textBlock.getRawTextBlock();
System.out.println("rawText text: " + rawText.getText());
// second marker, around "Mrs.", again expressed in raw text block offsets
// (here apparently relative to a block starting at "ip/>" - TODO confirm)
noSentenceBreaks = new ArrayList<>();
noSentenceBreaks.add(new Annotation<>("ip/>".length(), "ip/>Mrs.".length(), new RawTextNoSentenceBreakMarker("me"), labels));
rawText.addAnnotations(noSentenceBreaks);
System.out.println("textBlock text: " + textBlock.getText());
System.out.println("textBlock annotations: " + textBlock.getAnnotations().toString());
// roll once more with empty input before reading the processed text again
textBlock = textBlock.roll("");
System.out.println("textBlock text: " + textBlock.getText());
System.out.println("textBlock annotations: " + textBlock.getAnnotations().toString());
processedTextBlock = textBlock.getProcessedText();
// the processed text is expected to contain no trace of the "<skip/>" tag
assertEquals("and Mrs. Smith.", processedTextBlock.getText());
// ensure that the no sentence break text got added at the right place
// in the processed text
noSentenceBreaks = processedTextBlock.getAnnotations(RawTextNoSentenceBreakMarker.class);
System.out.println("Processed annotations: " + noSentenceBreaks);
assertEquals(1, noSentenceBreaks.size());
// the "Mrs." marker is expected directly after "and " in the processed text
assertEquals("and ".length(), noSentenceBreaks.get(0).getStart());
assertEquals("and Mrs.".length(), noSentenceBreaks.get(0).getEnd());
}
use of com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker in project talismane by joliciel-informatique.
the class RawTextRegexAnnotator method annotate.
/**
 * Runs this annotator's pattern over the text block and, for every match whose
 * selected span starts inside the analysis window
 * ({@code analysisStart <= start < analysisEnd}), creates one annotation per
 * configured filter type. {@code TAG} filter types yield token attribute
 * annotations; all other types yield raw text marker annotations. The
 * collected annotations are added to the text block once matching is complete.
 * <br>
 * The selected span is the whole match when the group index is 0, otherwise
 * the span of the capturing group with that index.
 *
 * @param textBlock
 *          the text to search and to which annotations are added.
 * @param labels
 *          labels handed to every annotation created.
 * @throws MatchTooLargeException
 *           if the block size is positive and a selected span is longer than
 *           it; this is checked for every match, including those starting
 *           outside the analysis window.
 */
@Override
public void annotate(AnnotatedText textBlock, String... labels) throws MatchTooLargeException {
  if (LOG.isTraceEnabled()) {
    LOG.trace("Matching " + regex.replace('\n', '¶').replace('\r', '¶'));
  }

  List<Annotation<RawTextMarker>> markerAnnotations = new ArrayList<>();
  List<Annotation<TokenAttribute<?>>> attributeAnnotations = new ArrayList<>();

  Matcher matcher = pattern.matcher(textBlock.getText());
  while (matcher.find()) {
    // span to annotate: whole match, or the requested capturing group
    final int spanStart = groupIndex == 0 ? matcher.start() : matcher.start(groupIndex);
    final int spanEnd = groupIndex == 0 ? matcher.end() : matcher.end(groupIndex);

    CharSequence matchText = textBlock.getText().subSequence(matcher.start(), matcher.end());
    if (LOG.isTraceEnabled()) {
      LOG.trace("Next match: " + matchText.toString().replace('\n', '¶').replace('\r', '¶'));
      if (matcher.start() != spanStart || matcher.end() != spanEnd) {
        LOG.trace("But matching group: " + textBlock.getText().subSequence(spanStart, spanEnd).toString().replace('\n', '¶').replace('\r', '¶'));
      }
      LOG.trace("matcher.start()=" + matcher.start() + ", matcher.end()=" + matcher.end() + ", matcherStart=" + spanStart + ", matcherEnd=" + spanEnd
          + ", analysisStart=" + textBlock.getAnalysisStart() + ", analysisEnd=" + textBlock.getAnalysisEnd());
    }

    if (blockSize > 0 && spanEnd - spanStart > blockSize) {
      throw new MatchTooLargeException("Match size (" + (spanEnd - spanStart) + ") bigger than block size (" + blockSize + "). "
          + "Increase blockSize or change filter. "
          + "Maybe you need to change a greedy quantifier (e.g. .*) to a reluctant quantifier (e.g. .*?)? "
          + "Regex: " + regex + ". Text: " + matchText);
    }

    // guard clause: matches starting outside the analysis window are only traced
    if (spanStart < textBlock.getAnalysisStart() || spanStart >= textBlock.getAnalysisEnd()) {
      if (LOG.isTraceEnabled()) {
        LOG.trace("Start out of range: analysisStart " + textBlock.getAnalysisStart() + ">= matcherStart [[" + spanStart + "]] < analysisEnd "
            + textBlock.getAnalysisEnd());
      }
      continue;
    }

    if (LOG.isTraceEnabled()) {
      LOG.trace("Start in range: analysisStart " + textBlock.getAnalysisStart() + ">= matcherStart [[" + spanStart + "]] < analysisEnd "
          + textBlock.getAnalysisEnd());
    }

    for (RawTextMarkType filterType : filterTypes) {
      // every type except TAG resolves to a marker, annotated after the switch
      RawTextMarker marker;
      switch (filterType) {
      case REPLACE: {
        String insertionText = RegexUtils.getReplacement(replacement, textBlock.getText(), matcher);
        if (LOG.isTraceEnabled()) {
          LOG.trace("Setting replacement to: " + insertionText);
        }
        marker = new RawTextReplaceMarker(this.toString(), insertionText);
        break;
      }
      case SENTENCE_BREAK:
        marker = new RawTextSentenceBreakMarker(this.toString());
        break;
      case NO_SENTENCE_BREAK:
        marker = new RawTextNoSentenceBreakMarker(this.toString());
        break;
      case SKIP:
        marker = new RawTextSkipMarker(this.toString());
        break;
      case TAG: {
        // token attributes are kept in their own list; no marker is created
        Annotation<TokenAttribute<?>> attributeAnnotation = new Annotation<TokenAttribute<?>>(spanStart, spanEnd, this.attribute, labels);
        attributeAnnotations.add(attributeAnnotation);
        continue;
      }
      default:
        marker = new RawTextMarker(filterType, this.toString());
        break;
      }

      Annotation<RawTextMarker> markerAnnotation = new Annotation<>(spanStart, spanEnd, marker, labels);
      markerAnnotations.add(markerAnnotation);
    }
  }

  if (!markerAnnotations.isEmpty()) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("For regex: " + this.regex.replace('\n', '¶').replace('\r', '¶'));
      LOG.debug("Added annotations: " + markerAnnotations);
    }
    textBlock.addAnnotations(markerAnnotations);
  }
  if (!attributeAnnotations.isEmpty()) {
    textBlock.addAnnotations(attributeAnnotations);
  }
}
use of com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker in project talismane by joliciel-informatique.
the class SentenceDetectorTest method testDetectSentences2.
/**
 * Runs the sentence detector with a decision maker that always answers
 * {@code IS_BOUNDARY} and no features, on a text that already carries one
 * sentence boundary before analysis start, two no-sentence-break markers
 * around "Mr." and one sentence-break marker on the newline. Checks both the
 * returned boundary positions and the resulting {@link SentenceBoundary}
 * annotations.
 */
@Test
public void testDetectSentences2() throws Exception {
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  final Config config = ConfigFactory.load();
  final String sessionId = "test";
  final String[] labels = new String[0];

  // stub which accepts every candidate boundary with probability 1
  DecisionMaker alwaysBoundary = new DecisionMaker() {
    @Override
    public ScoringStrategy<ClassificationSolution> getDefaultScoringStrategy() {
      return new GeometricMeanScoringStrategy();
    }

    @Override
    public List<Decision> decide(List<FeatureResult<?>> featureResults) {
      List<Decision> result = new ArrayList<>();
      result.add(new Decision(SentenceDetectorOutcome.IS_BOUNDARY.name(), 1.0));
      return result;
    }
  };

  Set<SentenceDetectorFeature<?>> features = new HashSet<>();
  SentenceDetector sentenceDetector = new SentenceDetector(alwaysBoundary, features, sessionId);

  String text = "Before analysis. Hello Mr. Jones\nHow are you, Mr. Jones? After";

  // offsets of interest within the text
  final int existingStart = "".length();
  final int existingEnd = "Before analysis.".length();
  final int analysisStart = "Before analysis. ".length();
  final int firstMrStart = "Before analysis. Hello ".length();
  final int firstMrEnd = "Before analysis. Hello Mr.".length();
  final int newlineStart = "Before analysis. Hello Mr. Jones".length();
  final int newlineEnd = "Before analysis. Hello Mr. Jones\n".length();
  final int secondMrStart = "Before analysis. Hello Mr. Jones\nHow are you, ".length();
  final int secondMrEnd = "Before analysis. Hello Mr. Jones\nHow are you, Mr.".length();
  final int questionEnd = "Before analysis. Hello Mr. Jones\nHow are you, Mr. Jones?".length();
  final int textEnd = "Before analysis. Hello Mr. Jones\nHow are you, Mr. Jones? After".length();

  AnnotatedText annotatedText = new AnnotatedText(text, analysisStart, text.length());

  List<Annotation<RawTextNoSentenceBreakMarker>> noBreakMarkers = new ArrayList<>();
  noBreakMarkers.add(new Annotation<>(firstMrStart, firstMrEnd, new RawTextNoSentenceBreakMarker("me"), labels));
  noBreakMarkers.add(new Annotation<>(secondMrStart, secondMrEnd, new RawTextNoSentenceBreakMarker("me"), labels));
  annotatedText.addAnnotations(noBreakMarkers);

  List<Annotation<SentenceBoundary>> priorBoundaries = new ArrayList<>();
  priorBoundaries.add(new Annotation<>(existingStart, existingEnd, new SentenceBoundary(), labels));
  annotatedText.addAnnotations(priorBoundaries);

  List<Annotation<RawTextSentenceBreakMarker>> forcedBreaks = new ArrayList<>();
  forcedBreaks.add(new Annotation<>(newlineStart, newlineEnd, new RawTextSentenceBreakMarker("me"), labels));
  annotatedText.addAnnotations(forcedBreaks);

  List<Integer> guessedBoundaries = sentenceDetector.detectSentences(annotatedText);
  assertEquals(2, guessedBoundaries.size());
  assertEquals(newlineEnd, guessedBoundaries.get(0).intValue());
  assertEquals(questionEnd, guessedBoundaries.get(1).intValue());

  List<Annotation<SentenceBoundary>> sentenceBoundaries = annotatedText.getAnnotations(SentenceBoundary.class);
  System.out.println(sentenceBoundaries.toString());
  assertEquals(4, sentenceBoundaries.size());

  // pre-existing sentence, untouched
  assertEquals(existingStart, sentenceBoundaries.get(0).getStart());
  assertEquals(existingEnd, sentenceBoundaries.get(0).getEnd());
  // first detected sentence, closed by the sentence break marker
  assertEquals(analysisStart, sentenceBoundaries.get(1).getStart());
  assertEquals(newlineEnd, sentenceBoundaries.get(1).getEnd());
  // second detected sentence, closed after the question mark
  assertEquals(newlineEnd, sentenceBoundaries.get(2).getStart());
  assertEquals(questionEnd, sentenceBoundaries.get(2).getEnd());
  // final sentence, closed by the end of the text
  assertEquals(questionEnd, sentenceBoundaries.get(3).getStart());
  assertEquals(textEnd, sentenceBoundaries.get(3).getEnd());
}
Aggregations