Use of com.joliciel.talismane.sentenceDetector.SentenceBoundary in project talismane by joliciel-informatique.
In the class RawTextProcessor, method getDetectedSentences.
/**
 * Returns the sentences detected so far in the current sentence holder.
 * <p>
 * Only complete sentences are returned: if the last detected sentence is
 * incomplete, it is removed from the result and stored as the leftover, which
 * is handed back to the holder on the next call.
 *
 * @return the detected sentences, without any trailing incomplete sentence
 */
public final List<Sentence> getDetectedSentences() {
  final SentenceHolder previous = this.getPreviousSentenceHolder();
  final SentenceHolder current = this.getCurrentSentenceHolder();

  // Each recorded boundary is shifted by the length of the previous holder's
  // processed text before being handed to the current holder.
  for (Annotation<SentenceBoundary> boundary : sentenceBoundaries) {
    int localStart = boundary.getStart() - previous.getProcessedText().length();
    current.addSentenceBoundary(localStart);
    int localEnd = boundary.getEnd() - previous.getProcessedText().length();
    current.addSentenceBoundary(localEnd);
  }

  final List<Sentence> detected = current.getDetectedSentences(leftover);
  leftover = null;

  // An incomplete trailing sentence is withheld and becomes the new leftover.
  if (!detected.isEmpty()) {
    final int lastIndex = detected.size() - 1;
    final Sentence tail = detected.get(lastIndex);
    if (!tail.isComplete()) {
      leftover = tail;
      if (LOG.isTraceEnabled()) {
        LOG.trace("Set leftover to: " + leftover.toString());
      }
      detected.remove(lastIndex);
    }
  }

  // Ensure that annotations added to a returned sentence also get added to
  // the raw text, at the matching position.
  for (final Sentence sentence : detected) {
    sentence.addObserver(new AnnotationObserver() {
      // start index of this processor at the time the observer is created
      int origin = RawTextProcessor.this.originalStartIndex;

      @Override
      public <T extends Serializable> void beforeAddAnnotations(AnnotatedText subject, List<Annotation<T>> annotations) {
        final List<Annotation<T>> shifted = new ArrayList<>();
        for (Annotation<T> annotation : annotations) {
          final int start = sentence.getOriginalIndex(annotation.getStart());
          final int end = sentence.getOriginalIndex(annotation.getEnd());
          shifted.add(annotation.getAnnotation(start - origin, end - origin));
        }
        RawTextProcessor.this.addAnnotations(shifted);
      }

      @Override
      public <T extends Serializable> void afterAddAnnotations(AnnotatedText subject) {
        // nothing to do once the annotations have been added
      }
    });
  }

  // Nothing more to do unless the current holder carries original text segments.
  if (!(current.getOriginalTextSegments().size() > 0)) {
    return detected;
  }

  // The segments are appended to the leftover's original text, creating an
  // empty leftover sentence first if there is none. The output divider is
  // inserted before them only when the leftover already has such text.
  if (leftover == null) {
    leftover = new Sentence("", currentFile, sessionId);
  }
  final StringBuilder pending = new StringBuilder();
  if (leftover.getLeftoverOriginalText().length() > 0) {
    pending.append(TalismaneSession.get(sessionId).getOutputDivider());
  }
  for (String segment : current.getOriginalTextSegments().values()) {
    pending.append(segment);
  }
  leftover.setLeftoverOriginalText(leftover.getLeftoverOriginalText() + pending.toString());

  return detected;
}
Use of com.joliciel.talismane.sentenceDetector.SentenceBoundary in project talismane by joliciel-informatique.
In the class RawTextProcessor, method getProcessedText.
/**
* Return processed text ready for sentence detection.
*
* The returned text is the concatenation of the processed text of the
* previous, current and next sentence holders. RawTextMarker annotations of
* the present RawTextProcessor whose start or end lies between
* getTextProcessingStart() (inclusive) and getTextProcessingEnd() (exclusive)
* are copied onto it, with their positions translated into the returned text.
*
* An observer is attached to the returned text: annotations subsequently
* added to it are translated back and added to the present RawTextProcessor,
* and those carrying a SentenceBoundary are also recorded in
* sentenceBoundaries.
*
* @return the annotated processed text; the two extra constructor arguments
*         mark the span of the current holder's processed text within it
*/
public final AnnotatedText getProcessedText() {
LOG.trace("getProcessedTextBlock");
// bounds used below to select which of my marker annotations to carry over
int textStartPos = this.getTextProcessingStart();
int textEndPos = this.getTextProcessingEnd();
SentenceHolder prevHolder = this.getPreviousSentenceHolder();
SentenceHolder currentHolder = this.getCurrentSentenceHolder();
SentenceHolder nextHolder = this.getNextSentenceHolder();
// the processed text is previous + current + next, in that order
StringBuilder sb = new StringBuilder();
String processedText1 = prevHolder.getProcessedText();
String processedText2 = currentHolder.getProcessedText();
String processedText3 = nextHolder.getProcessedText();
sb.append(processedText1);
sb.append(processedText2);
sb.append(processedText3);
String processedText = sb.toString();
// "my" annotations live on this processor, "his" on the processed text
List<Annotation<RawTextMarker>> myAnnotations = this.getAnnotations(RawTextMarker.class);
List<Annotation<RawTextMarker>> hisAnnotations = new ArrayList<>();
int prevHolderOriginalIndex = prevHolder.getOriginalStartIndex();
for (Annotation<RawTextMarker> myAnnotation : myAnnotations) {
// keep only markers whose start or end falls in [textStartPos, textEndPos)
if ((myAnnotation.getStart() >= textStartPos && myAnnotation.getStart() < textEndPos) || ((myAnnotation.getEnd() >= textStartPos && myAnnotation.getEnd() < textEndPos))) {
// shift my positions by the previous holder's original start index
int originalStart = prevHolderOriginalIndex + myAnnotation.getStart();
int originalEnd = prevHolderOriginalIndex + myAnnotation.getEnd();
// a marker starting before the current holder is clamped to the point
// where the current holder's processed text begins
int localStart = processedText1.length();
if (originalStart >= currentHolder.getOriginalStartIndex())
localStart += currentHolder.getIndex(originalStart);
// the end is always mapped through the current holder
int localEnd = processedText1.length() + currentHolder.getIndex(originalEnd);
Annotation<RawTextMarker> hisAnnotation = myAnnotation.getAnnotation(localStart, localEnd);
hisAnnotations.add(hisAnnotation);
}
}
if (LOG.isTraceEnabled()) {
LOG.trace("raw annotations: " + myAnnotations);
LOG.trace("processed annotations: " + hisAnnotations);
}
AnnotatedText processedTextBlock = new AnnotatedText(processedText, processedText1.length(), processedText1.length() + processedText2.length());
processedTextBlock.addAnnotations(hisAnnotations);
processedTextBlock.addObserver(new AnnotationObserver() {
// an observer which adds any annotations added to the
// processedTextBlock back to myself, at the correct position
@Override
public <T extends Serializable> void beforeAddAnnotations(AnnotatedText subject, List<Annotation<T>> annotations) {
int offset = textStartPos;
// holder text lengths are re-read at callback time, not captured from
// the enclosing call
int length1 = prevHolder.getProcessedText().length();
int length2 = currentHolder.getProcessedText().length();
int sentence2HolderStart = currentHolder.getOriginalStartIndex();
List<Annotation<T>> newAnnotations = new ArrayList<>();
for (Annotation<T> annotation : annotations) {
// map the start through whichever holder contains it; an annotation
// starting in the next holder keeps -1 and is skipped
int originalStart = -1;
if (annotation.getStart() < length1)
originalStart = prevHolder.getOriginalIndex(annotation.getStart());
else if (annotation.getStart() < length1 + length2)
originalStart = currentHolder.getOriginalIndex(annotation.getStart() - length1);
if (originalStart >= 0) {
// map the end through the current holder, or the next holder when it
// lies beyond the current holder's text
// NOTE(review): if the end lies inside the previous holder
// (getEnd() < length1) the index passed to currentHolder is negative -
// confirm getOriginalIndex copes with that
int originalEnd = -1;
if (annotation.getEnd() <= length1 + length2)
originalEnd = currentHolder.getOriginalIndex(annotation.getEnd() - length1);
else
originalEnd = nextHolder.getOriginalIndex(annotation.getEnd() - (length1 + length2));
if (originalEnd >= 0) {
// re-base onto my own coordinates: relative to the current holder's
// original start, then shifted by textStartPos
Annotation<T> newAnnotation = annotation.getAnnotation(originalStart - sentence2HolderStart + offset, originalEnd - sentence2HolderStart + offset);
newAnnotations.add(newAnnotation);
// sentence boundaries are also remembered, using the untranslated
// annotation (processed-text positions)
if (annotation.getData() instanceof SentenceBoundary) {
@SuppressWarnings("unchecked") Annotation<SentenceBoundary> sentenceBoundary = (Annotation<SentenceBoundary>) annotation;
sentenceBoundaries.add(sentenceBoundary);
}
}
}
}
RawTextProcessor.this.addAnnotations(newAnnotations);
if (LOG.isTraceEnabled()) {
LOG.trace("ProcessedTextBlock Annotations received: " + annotations);
LOG.trace("ProcessedTextBlock Annotations added: " + newAnnotations);
}
}
@Override
public <T extends Serializable> void afterAddAnnotations(AnnotatedText subject) {
// nothing to do after the annotations have been added
}
});
return processedTextBlock;
}
Use of com.joliciel.talismane.sentenceDetector.SentenceBoundary in project talismane by joliciel-informatique.
In the class RawTextTest, method testGetDetectedSentences.
/**
 * Round trip on a RawText: markers added to the raw text, sentence boundaries
 * added to the processed text, and annotations added to a detected sentence
 * must all end up at the expected positions in the raw text.
 */
@Test
public void testGetDetectedSentences() throws Exception {
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  // the returned Config is not used in this test; only the load call is kept
  ConfigFactory.load();

  final String sessionId = "test";
  final String[] labels = new String[0];

  final String rawContent = "Sentence 1<sent/>Sentence 2. Sentence 3. Sentence 4.";
  final String processedContent = "Sentence 1 Sentence 2. Sentence 3. Sentence 4.";

  // offsets in the raw text
  final int rawTagStart = "Sentence 1".length();
  final int rawTagEnd = "Sentence 1<sent/>".length();
  final int rawSentence2End = "Sentence 1<sent/>Sentence 2.".length();
  final int rawSentence3Start = "Sentence 1<sent/>Sentence 2. ".length();
  final int rawSentence3End = "Sentence 1<sent/>Sentence 2. Sentence 3.".length();
  final int rawSentence4Start = "Sentence 1<sent/>Sentence 2. Sentence 3. ".length();

  // offsets in the processed text
  final int processedSentence1End = "Sentence 1".length();
  final int processedSentence2End = "Sentence 1 Sentence 2.".length();
  final int processedSentence3End = "Sentence 1 Sentence 2. Sentence 3.".length();

  RawText rawText = new RawText(rawContent, true, sessionId);

  // mark the <sent/> tag as a sentence break and as text to skip, as if a
  // filter had added these annotations
  System.out.println("we add a sentence break annotation (as if it was added by a filter)");
  List<Annotation<RawTextSentenceBreakMarker>> breakMarkers = new ArrayList<>();
  breakMarkers.add(new Annotation<>(rawTagStart, rawTagEnd, new RawTextSentenceBreakMarker("me"), labels));
  rawText.addAnnotations(breakMarkers);

  List<Annotation<RawTextSkipMarker>> skipMarkers = new ArrayList<>();
  skipMarkers.add(new Annotation<>(rawTagStart, rawTagEnd, new RawTextSkipMarker("me"), labels));
  rawText.addAnnotations(skipMarkers);

  System.out.println("textBlock text: " + rawText.getText());
  System.out.println("textBlock annotations: " + rawText.getAnnotations().toString());

  AnnotatedText processed = rawText.getProcessedText();
  assertEquals(processedContent, processed.getText());

  // add sentence boundaries to the processed text, as if a sentence detector
  // had found them
  System.out.println("add sentence boundaries to the processed text (as if they were added by a sentence detector)");
  List<Annotation<SentenceBoundary>> detectorBoundaries = new ArrayList<>();
  detectorBoundaries.add(new Annotation<>(processedSentence1End, processedSentence2End, new SentenceBoundary(), labels));
  detectorBoundaries.add(new Annotation<>(processedSentence2End, processedSentence3End, new SentenceBoundary(), labels));
  processed.addAnnotations(detectorBoundaries);
  assertEquals(processedContent, processed.getText());

  System.out.println("textBlock text: " + rawText.getText());
  System.out.println("textBlock annotations: " + rawText.getAnnotations().toString());

  // the boundaries must have been copied to the right place in the raw text
  assertEquals(rawContent, rawText.getText());
  List<Annotation<SentenceBoundary>> rawBoundaries = rawText.getAnnotations(SentenceBoundary.class);
  assertEquals(2, rawBoundaries.size());
  assertEquals(rawTagEnd, rawBoundaries.get(0).getStart());
  assertEquals(rawSentence2End, rawBoundaries.get(0).getEnd());
  assertEquals(rawSentence2End, rawBoundaries.get(1).getStart());
  assertEquals(rawSentence3End, rawBoundaries.get(1).getEnd());

  List<Sentence> sentences = rawText.getDetectedSentences();
  System.out.println("sentences: " + sentences.toString());
  assertEquals(4, sentences.size());

  // each sentence's text, and where its first character sits in the raw text
  final String[] expectedTexts = { "Sentence 1", "Sentence 2.", "Sentence 3.", "Sentence 4." };
  final int[] expectedOrigins = { "".length(), rawTagEnd, rawSentence3Start, rawSentence4Start };
  for (int i = 0; i < expectedTexts.length; i++) {
    assertEquals(expectedTexts[i], sentences.get(i).getText());
    assertEquals(expectedOrigins[i], sentences.get(i).getOriginalIndex(0));
  }

  // an annotation added to a sentence must also be added to the raw text
  Sentence fourth = sentences.get(3);
  List<Annotation<String>> sentenceAnnotations = new ArrayList<>();
  sentenceAnnotations.add(new Annotation<String>("Sentence ".length(), "Sentence 4".length(), "four", labels));
  fourth.addAnnotations(sentenceAnnotations);

  List<Annotation<String>> rawAnnotations = rawText.getAnnotations(String.class);
  assertEquals(1, rawAnnotations.size());
  assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3. Sentence ".length(), rawAnnotations.get(0).getStart());
  assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3. Sentence 4".length(), rawAnnotations.get(0).getEnd());
}
Use of com.joliciel.talismane.sentenceDetector.SentenceBoundary in project talismane by joliciel-informatique.
In the class RollingTextBlockTest, method testGetDetectedSentences.
/**
* Feeds text into a RollingTextBlock piece by piece and checks, after each
* roll, which sentences are returned by getDetectedSentences() and where
* sentence boundary and sentence annotations end up in the block's text.
*/
@Test
public void testGetDetectedSentences() throws Exception {
System.setProperty("config.file", "src/test/resources/test.conf");
ConfigFactory.invalidateCaches();
// config is not referenced again in this method; presumably the load call is
// kept for its side effect - TODO confirm
final Config config = ConfigFactory.load();
final String sessionId = "test";
String[] labels = new String[0];
// each roll(...) result is reassigned, so textBlock always refers to the
// block returned by the latest roll
RollingTextBlock textBlock = new RollingTextBlock(true, null, sessionId);
textBlock = textBlock.roll("Sentence 1<sent/>Sentence 2. Sentence");
textBlock = textBlock.roll(" 3.");
// the rawTextBlock always contains the last two added sub-blocks
// so annotations are relative to these sub-blocks
AnnotatedText rawTextBlock = textBlock.getRawTextBlock();
List<Annotation<RawTextSentenceBreakMarker>> sentenceBreaks = new ArrayList<>();
// we add a sentence break annotation (as if it was added by a filter)
System.out.println("we add a sentence break annotation (as if it was added by a filter)");
sentenceBreaks.add(new Annotation<>("".length(), "Sentence 1<sent/>".length(), new RawTextSentenceBreakMarker("me"), labels));
rawTextBlock.addAnnotations(sentenceBreaks);
// the <sent/> tag itself is marked to be skipped
List<Annotation<RawTextSkipMarker>> skips = new ArrayList<>();
skips.add(new Annotation<>("Sentence 1".length(), "Sentence 1<sent/>".length(), new RawTextSkipMarker("me"), labels));
rawTextBlock.addAnnotations(skips);
System.out.println("textBlock text: " + textBlock.getText());
System.out.println("textBlock annotations: " + textBlock.getAnnotations().toString());
// after this roll the processed text is expected to cover everything up to
// "Sentence 3." but not yet " Sentence 4."
textBlock = textBlock.roll(" Sentence 4.");
AnnotatedText processedTextBlock = textBlock.getProcessedText();
assertEquals("Sentence 1 Sentence 2. Sentence 3.", processedTextBlock.getText());
// add sentence boundaries to the processed text (as if they were added
// by a sentence detector)
System.out.println("add sentence boundaries to the processed text (as if they were added by a sentence detector)");
List<Annotation<SentenceBoundary>> sentenceBoundaries = new ArrayList<>();
sentenceBoundaries.add(new Annotation<>("Sentence 1".length(), "Sentence 1 Sentence 2.".length(), new SentenceBoundary(), labels));
processedTextBlock.addAnnotations(sentenceBoundaries);
// only the first two sentences are expected at this stage
List<Sentence> sentences = textBlock.getDetectedSentences();
System.out.println("sentences: " + sentences.toString());
assertEquals(2, sentences.size());
assertEquals("Sentence 1", sentences.get(0).getText());
assertEquals("".length(), sentences.get(0).getOriginalIndex(0));
assertEquals("Sentence 2.", sentences.get(1).getText());
assertEquals("Sentence 1<sent/>".length(), sentences.get(1).getOriginalIndex(0));
System.out.println("textBlock text: " + textBlock.getText());
System.out.println("textBlock annotations: " + textBlock.getAnnotations().toString());
textBlock = textBlock.roll("");
// we have now rolled all text up until sentence 4 into the processed
// area
processedTextBlock = textBlock.getProcessedText();
assertEquals("Sentence 1 Sentence 2. Sentence 3. Sentence 4.", processedTextBlock.getText());
// add a sentence boundary for "Sentence 3"
System.out.println("add a sentence boundary for \"Sentence 3\", this time inside the analysis range");
sentenceBoundaries = new ArrayList<>();
sentenceBoundaries.add(new Annotation<>("Sentence 1 Sentence 2.".length(), "Sentence 1 Sentence 2. Sentence 3.".length(), new SentenceBoundary(), labels));
processedTextBlock.addAnnotations(sentenceBoundaries);
// this round is expected to return only "Sentence 3."
sentences = textBlock.getDetectedSentences();
System.out.println("sentences: " + sentences.toString());
assertEquals(1, sentences.size());
assertEquals("Sentence 3.", sentences.get(0).getText());
assertEquals("Sentence 1<sent/>Sentence 2. ".length(), sentences.get(0).getOriginalIndex(0));
System.out.println("textBlock text: " + textBlock.getText());
System.out.println("textBlock annotations: " + textBlock.getAnnotations().toString());
// ensure that the sentence boundary annotations in the original text
// are in the right place
assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3. Sentence 4.", textBlock.getText());
sentenceBoundaries = textBlock.getAnnotations(SentenceBoundary.class);
System.out.println(sentenceBoundaries.toString());
assertEquals(2, sentenceBoundaries.size());
assertEquals("Sentence 1<sent/>".length(), sentenceBoundaries.get(0).getStart());
assertEquals("Sentence 1<sent/>Sentence 2.".length(), sentenceBoundaries.get(0).getEnd());
assertEquals("Sentence 1<sent/>Sentence 2.".length(), sentenceBoundaries.get(1).getStart());
assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3.".length(), sentenceBoundaries.get(1).getEnd());
// roll in a final empty block - we now have an empty block at block 3,
// so that any leftover in block 2 should be marked as complete
// since sentences never overlap empty blocks.
textBlock = textBlock.roll("");
sentences = textBlock.getDetectedSentences();
System.out.println("sentences: " + sentences.toString());
assertEquals(1, sentences.size());
assertEquals("Sentence 4.", sentences.get(0).getText());
// original indexes are still counted from the very start of the input,
// even though the block's own text now starts at " 3."
assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3. ".length(), sentences.get(0).getOriginalIndex(0));
// note: at this point the initial two blocks have been rolled out
System.out.println("textBlock text: " + textBlock.getText());
System.out.println("textBlock annotations: " + textBlock.getAnnotations().toString());
// ensure that the sentence boundary annotations in the original text
// are in the right place
assertEquals(" 3. Sentence 4.", textBlock.getText());
sentenceBoundaries = textBlock.getAnnotations(SentenceBoundary.class);
assertEquals(1, sentenceBoundaries.size());
assertEquals("".length(), sentenceBoundaries.get(0).getStart());
assertEquals(" 3.".length(), sentenceBoundaries.get(0).getEnd());
// test that sentence annotations get added to the original raw text
Sentence sentence4 = sentences.get(0);
List<Annotation<String>> annotations = new ArrayList<>();
annotations.add(new Annotation<String>("Sentence ".length(), "Sentence 4".length(), "four", labels));
sentence4.addAnnotations(annotations);
System.out.println("textBlock text: " + textBlock.getText());
System.out.println("textBlock annotations: " + textBlock.getAnnotations().toString());
// positions are relative to the block's current text (" 3. Sentence 4.")
annotations = textBlock.getAnnotations(String.class);
assertEquals(1, annotations.size());
assertEquals(" 3. Sentence ".length(), annotations.get(0).getStart());
assertEquals(" 3. Sentence 4".length(), annotations.get(0).getEnd());
// no assertion on the result; presumably only checks that the call still
// succeeds at this point - TODO confirm
textBlock.getProcessedText();
}
Aggregations