Use of com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker in project talismane by joliciel-informatique.
The class RollingTextBlockTest, method testGetDetectedSentences.
/**
 * Feeds text into a {@link RollingTextBlock} one piece at a time and checks,
 * after each roll, the sentences returned by {@code getDetectedSentences()},
 * their indexes into the original text, and the positions of the annotations
 * reported by the text block itself.
 *
 * @throws Exception if any step of the scenario fails unexpectedly
 */
@Test
public void testGetDetectedSentences() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();

    final String sessionId = "test";
    final String[] labels = new String[0];

    // Offsets into the raw text of the first sub-block.
    final int sentence1End = "Sentence 1".length();
    final int markerEnd = "Sentence 1<sent/>".length();

    RollingTextBlock textBlock = new RollingTextBlock(true, null, sessionId)
            .roll("Sentence 1<sent/>Sentence 2. Sentence")
            .roll(" 3.");

    // The raw text block only ever covers the two most recently added
    // sub-blocks, so the annotation offsets used here are relative to them.
    final AnnotatedText rawText = textBlock.getRawTextBlock();

    // Pretend a filter flagged a sentence break ending at the <sent/> marker.
    final List<Annotation<RawTextSentenceBreakMarker>> breakMarkers = new ArrayList<>();
    System.out.println("we add a sentence break annotation (as if it was added by a filter)");
    breakMarkers.add(
            new Annotation<>("".length(), markerEnd, new RawTextSentenceBreakMarker("me"), labels));
    rawText.addAnnotations(breakMarkers);

    // The <sent/> marker itself is flagged as text to skip.
    final List<Annotation<RawTextSkipMarker>> skipMarkers = new ArrayList<>();
    skipMarkers.add(
            new Annotation<>(sentence1End, markerEnd, new RawTextSkipMarker("me"), labels));
    rawText.addAnnotations(skipMarkers);

    System.out.println("textBlock text: " + textBlock.getText());
    System.out.println("textBlock annotations: " + textBlock.getAnnotations().toString());

    textBlock = textBlock.roll(" Sentence 4.");
    final AnnotatedText processedUpTo3 = textBlock.getProcessedText();
    assertEquals("Sentence 1 Sentence 2. Sentence 3.", processedUpTo3.getText());

    // Pretend a sentence detector placed a boundary on the processed text.
    System.out.println("add sentence boundaries to the processed text (as if they were added by a sentence detector)");
    final List<Annotation<SentenceBoundary>> boundariesRound1 = new ArrayList<>();
    boundariesRound1.add(
            new Annotation<>(
                    "Sentence 1".length(),
                    "Sentence 1 Sentence 2.".length(),
                    new SentenceBoundary(),
                    labels));
    processedUpTo3.addAnnotations(boundariesRound1);

    final List<Sentence> firstBatch = textBlock.getDetectedSentences();
    System.out.println("sentences: " + firstBatch.toString());
    assertEquals(2, firstBatch.size());
    assertEquals("Sentence 1", firstBatch.get(0).getText());
    assertEquals("".length(), firstBatch.get(0).getOriginalIndex(0));
    assertEquals("Sentence 2.", firstBatch.get(1).getText());
    assertEquals("Sentence 1<sent/>".length(), firstBatch.get(1).getOriginalIndex(0));

    System.out.println("textBlock text: " + textBlock.getText());
    System.out.println("textBlock annotations: " + textBlock.getAnnotations().toString());

    // After this roll, everything up to and including sentence 4 sits in
    // the processed area.
    textBlock = textBlock.roll("");
    final AnnotatedText processedUpTo4 = textBlock.getProcessedText();
    assertEquals("Sentence 1 Sentence 2. Sentence 3. Sentence 4.", processedUpTo4.getText());

    // Boundary covering "Sentence 3".
    System.out.println("add a sentence boundary for \"Sentence 3\", this time inside the analysis range");
    final List<Annotation<SentenceBoundary>> boundariesRound2 = new ArrayList<>();
    boundariesRound2.add(
            new Annotation<>(
                    "Sentence 1 Sentence 2.".length(),
                    "Sentence 1 Sentence 2. Sentence 3.".length(),
                    new SentenceBoundary(),
                    labels));
    processedUpTo4.addAnnotations(boundariesRound2);

    final List<Sentence> secondBatch = textBlock.getDetectedSentences();
    System.out.println("sentences: " + secondBatch.toString());
    assertEquals(1, secondBatch.size());
    assertEquals("Sentence 3.", secondBatch.get(0).getText());
    assertEquals("Sentence 1<sent/>Sentence 2. ".length(), secondBatch.get(0).getOriginalIndex(0));

    System.out.println("textBlock text: " + textBlock.getText());
    System.out.println("textBlock annotations: " + textBlock.getAnnotations().toString());

    // The boundary annotations must be positioned correctly in the
    // original (unprocessed) text as well.
    assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3. Sentence 4.", textBlock.getText());
    final List<Annotation<SentenceBoundary>> originalBoundaries =
            textBlock.getAnnotations(SentenceBoundary.class);
    System.out.println(originalBoundaries.toString());
    assertEquals(2, originalBoundaries.size());
    assertEquals("Sentence 1<sent/>".length(), originalBoundaries.get(0).getStart());
    assertEquals("Sentence 1<sent/>Sentence 2.".length(), originalBoundaries.get(0).getEnd());
    assertEquals("Sentence 1<sent/>Sentence 2.".length(), originalBoundaries.get(1).getStart());
    assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3.".length(), originalBoundaries.get(1).getEnd());

    // One more empty roll puts an empty block in position 3. Whatever is
    // left over in block 2 is then expected to be treated as complete,
    // because a sentence never spans an empty block.
    textBlock = textBlock.roll("");
    final List<Sentence> lastBatch = textBlock.getDetectedSentences();
    System.out.println("sentences: " + lastBatch.toString());
    assertEquals(1, lastBatch.size());
    assertEquals("Sentence 4.", lastBatch.get(0).getText());
    assertEquals(
            "Sentence 1<sent/>Sentence 2. Sentence 3. ".length(),
            lastBatch.get(0).getOriginalIndex(0));

    // By now the first two blocks have been rolled out of the text block.
    System.out.println("textBlock text: " + textBlock.getText());
    System.out.println("textBlock annotations: " + textBlock.getAnnotations().toString());

    // Re-check boundary placement against the shortened original text.
    assertEquals(" 3. Sentence 4.", textBlock.getText());
    final List<Annotation<SentenceBoundary>> remainingBoundaries =
            textBlock.getAnnotations(SentenceBoundary.class);
    assertEquals(1, remainingBoundaries.size());
    assertEquals("".length(), remainingBoundaries.get(0).getStart());
    assertEquals(" 3.".length(), remainingBoundaries.get(0).getEnd());

    // Annotations added to a sentence must show up on the original raw
    // text, shifted to raw-text offsets.
    final Sentence sentence4 = lastBatch.get(0);
    final List<Annotation<String>> sentenceLevel = new ArrayList<>();
    sentenceLevel.add(
            new Annotation<String>("Sentence ".length(), "Sentence 4".length(), "four", labels));
    sentence4.addAnnotations(sentenceLevel);

    System.out.println("textBlock text: " + textBlock.getText());
    System.out.println("textBlock annotations: " + textBlock.getAnnotations().toString());

    final List<Annotation<String>> blockLevel = textBlock.getAnnotations(String.class);
    assertEquals(1, blockLevel.size());
    assertEquals(" 3. Sentence ".length(), blockLevel.get(0).getStart());
    assertEquals(" 3. Sentence 4".length(), blockLevel.get(0).getEnd());

    // The return value is not inspected here; the call only has to complete.
    textBlock.getProcessedText();
}
Aggregations