Search in sources :

Example 1 with SentenceBoundary

use of com.joliciel.talismane.sentenceDetector.SentenceBoundary in project talismane by joliciel-informatique.

the class RawTextProcessor method getDetectedSentences.

/**
 * Returns the complete sentences detected so far in the current sentence
 * holder.
 * <p>
 * Every recorded sentence boundary is first registered with the current
 * holder, its start and end shifted back by the length of the previous
 * holder's processed text. If the last sentence produced by the holder is
 * not complete, it is removed from the result and kept in {@code leftover},
 * which is handed to the holder the next time this method runs.
 * <p>
 * Each returned sentence receives an observer that copies any annotations
 * later added to the sentence onto this processor as well.
 *
 * @return the complete sentences, in the order produced by the current
 *         holder; may be empty
 */
public final List<Sentence> getDetectedSentences() {
    SentenceHolder previousHolder = this.getPreviousSentenceHolder();
    SentenceHolder activeHolder = this.getCurrentSentenceHolder();

    // Boundary positions include the previous holder's processed text, so
    // that length is subtracted before registering them on the current holder.
    for (Annotation<SentenceBoundary> boundary : sentenceBoundaries) {
        activeHolder.addSentenceBoundary(boundary.getStart() - previousHolder.getProcessedText().length());
        activeHolder.addSentenceBoundary(boundary.getEnd() - previousHolder.getProcessedText().length());
    }

    List<Sentence> detected = activeHolder.getDetectedSentences(leftover);
    leftover = null;

    // An incomplete trailing sentence is held back for the next round.
    if (!detected.isEmpty()) {
        int lastIndex = detected.size() - 1;
        Sentence trailing = detected.get(lastIndex);
        if (!trailing.isComplete()) {
            leftover = trailing;
            if (LOG.isTraceEnabled()) {
                LOG.trace("Set leftover to: " + leftover.toString());
            }
            detected.remove(lastIndex);
        }
    }

    // Mirror annotations added to a sentence onto the raw text as well.
    for (Sentence sentence : detected) {
        sentence.addObserver(new AnnotationObserver() {

            // original start index of the processor at the time this observer was created
            int originAtCreation = RawTextProcessor.this.originalStartIndex;

            @Override
            public <T extends Serializable> void beforeAddAnnotations(AnnotatedText subject, List<Annotation<T>> annotations) {
                List<Annotation<T>> shifted = new ArrayList<>();
                for (Annotation<T> annotation : annotations) {
                    // sentence position -> original position -> position relative to the captured origin
                    int rawStart = sentence.getOriginalIndex(annotation.getStart());
                    int rawEnd = sentence.getOriginalIndex(annotation.getEnd());
                    shifted.add(annotation.getAnnotation(rawStart - originAtCreation, rawEnd - originAtCreation));
                }
                RawTextProcessor.this.addAnnotations(shifted);
            }

            @Override
            public <T extends Serializable> void afterAddAnnotations(AnnotatedText subject) {
                // nothing to do once the annotations have been added
            }
        });
    }

    // Original text segments still held by the current holder are appended to
    // the leftover's original text (an empty leftover is created when there is
    // none). When the leftover already carries original text, the session's
    // output divider is inserted before the segments.
    if (!activeHolder.getOriginalTextSegments().isEmpty()) {
        if (leftover == null) {
            leftover = new Sentence("", currentFile, sessionId);
        }
        StringBuilder pendingSegments = new StringBuilder();
        if (leftover.getLeftoverOriginalText().length() > 0) {
            pendingSegments.append(TalismaneSession.get(sessionId).getOutputDivider());
        }
        for (String segment : activeHolder.getOriginalTextSegments().values()) {
            pendingSegments.append(segment);
        }
        leftover.setLeftoverOriginalText(leftover.getLeftoverOriginalText() + pendingSegments.toString());
    }
    return detected;
}
Also used : SentenceBoundary(com.joliciel.talismane.sentenceDetector.SentenceBoundary) AnnotatedText(com.joliciel.talismane.AnnotatedText) Annotation(com.joliciel.talismane.Annotation) AnnotationObserver(com.joliciel.talismane.AnnotationObserver) ArrayList(java.util.ArrayList) List(java.util.List)

Example 2 with SentenceBoundary

use of com.joliciel.talismane.sentenceDetector.SentenceBoundary in project talismane by joliciel-informatique.

the class RawTextProcessor method getProcessedText.

/**
 * Builds the processed text that is handed to sentence detection.
 * <p>
 * The text is the concatenation of the processed text of the previous,
 * current and next sentence holders. The returned {@link AnnotatedText} is
 * constructed with the bounds of the current holder's portion (presumably
 * its analysis range - confirm against {@code AnnotatedText}).
 * <p>
 * {@code RawTextMarker} annotations of this processor whose start or end
 * lies within the text processing range are copied onto the returned text
 * at their translated positions. Conversely, an observer on the returned
 * text translates any annotations added to it back to this processor's
 * coordinates and adds them here; {@code SentenceBoundary} annotations are
 * additionally recorded in {@code sentenceBoundaries}.
 *
 * @return the annotated processed text covering the three holders
 */
public final AnnotatedText getProcessedText() {
    LOG.trace("getProcessedTextBlock");
    final int rangeStart = this.getTextProcessingStart();
    final int rangeEnd = this.getTextProcessingEnd();
    final SentenceHolder previousHolder = this.getPreviousSentenceHolder();
    final SentenceHolder activeHolder = this.getCurrentSentenceHolder();
    final SentenceHolder followingHolder = this.getNextSentenceHolder();

    String previousText = previousHolder.getProcessedText();
    String activeText = activeHolder.getProcessedText();
    String followingText = followingHolder.getProcessedText();
    String combinedText = previousText + activeText + followingText;
    int previousLength = previousText.length();

    List<Annotation<RawTextMarker>> rawMarkers = this.getAnnotations(RawTextMarker.class);
    List<Annotation<RawTextMarker>> processedMarkers = new ArrayList<>();
    int previousOriginalStart = previousHolder.getOriginalStartIndex();
    for (Annotation<RawTextMarker> marker : rawMarkers) {
        int markerStart = marker.getStart();
        int markerEnd = marker.getEnd();
        boolean startInRange = markerStart >= rangeStart && markerStart < rangeEnd;
        boolean endInRange = markerEnd >= rangeStart && markerEnd < rangeEnd;
        if (!startInRange && !endInRange) {
            // marker does not touch the text processing range
            continue;
        }
        int originalStart = previousOriginalStart + markerStart;
        int originalEnd = previousOriginalStart + markerEnd;
        // a marker starting before the current holder is pinned to the start
        // of the current holder's portion of the combined text
        int localStart = previousLength;
        if (originalStart >= activeHolder.getOriginalStartIndex()) {
            localStart += activeHolder.getIndex(originalStart);
        }
        int localEnd = previousLength + activeHolder.getIndex(originalEnd);
        processedMarkers.add(marker.getAnnotation(localStart, localEnd));
    }

    if (LOG.isTraceEnabled()) {
        LOG.trace("raw annotations: " + rawMarkers);
        LOG.trace("processed annotations: " + processedMarkers);
    }

    AnnotatedText result = new AnnotatedText(combinedText, previousLength, previousLength + activeText.length());
    result.addAnnotations(processedMarkers);
    result.addObserver(new AnnotationObserver() {

        // Forwards annotations added to the processed text back to the
        // enclosing processor, translated to its own positions.
        @Override
        public <T extends Serializable> void beforeAddAnnotations(AnnotatedText subject, List<Annotation<T>> annotations) {
            // holder lengths are read at notification time, not captured from the enclosing call
            int lengthBefore = previousHolder.getProcessedText().length();
            int activeEnd = lengthBefore + activeHolder.getProcessedText().length();
            int activeOriginalStart = activeHolder.getOriginalStartIndex();
            List<Annotation<T>> translated = new ArrayList<>();
            for (Annotation<T> annotation : annotations) {
                int start = annotation.getStart();
                int originalStart;
                if (start < lengthBefore) {
                    originalStart = previousHolder.getOriginalIndex(start);
                } else if (start < activeEnd) {
                    originalStart = activeHolder.getOriginalIndex(start - lengthBefore);
                } else {
                    // annotation starts in the next holder's text: not forwarded
                    originalStart = -1;
                }
                if (originalStart < 0) {
                    continue;
                }
                int end = annotation.getEnd();
                int originalEnd = end <= activeEnd
                        ? activeHolder.getOriginalIndex(end - lengthBefore)
                        : followingHolder.getOriginalIndex(end - activeEnd);
                if (originalEnd < 0) {
                    continue;
                }
                translated.add(annotation.getAnnotation(originalStart - activeOriginalStart + rangeStart, originalEnd - activeOriginalStart + rangeStart));
                if (annotation.getData() instanceof SentenceBoundary) {
                    // remember the boundary, still in processed-text coordinates
                    @SuppressWarnings("unchecked")
                    Annotation<SentenceBoundary> boundary = (Annotation<SentenceBoundary>) annotation;
                    sentenceBoundaries.add(boundary);
                }
            }
            RawTextProcessor.this.addAnnotations(translated);
            if (LOG.isTraceEnabled()) {
                LOG.trace("ProcessedTextBlock Annotations received: " + annotations);
                LOG.trace("ProcessedTextBlock Annotations added: " + translated);
            }
        }

        @Override
        public <T extends Serializable> void afterAddAnnotations(AnnotatedText subject) {
            // nothing to do once the annotations have been added
        }
    });
    return result;
}
Also used : SentenceBoundary(com.joliciel.talismane.sentenceDetector.SentenceBoundary) AnnotatedText(com.joliciel.talismane.AnnotatedText) ArrayList(java.util.ArrayList) Annotation(com.joliciel.talismane.Annotation) AnnotationObserver(com.joliciel.talismane.AnnotationObserver) List(java.util.List)

Example 3 with SentenceBoundary

use of com.joliciel.talismane.sentenceDetector.SentenceBoundary in project talismane by joliciel-informatique.

the class RawTextTest method testGetDetectedSentences.

/**
 * Checks sentence detection on a {@code RawText}: a filter-style sentence
 * break and skip marker are added to the raw text, sentence boundaries are
 * added to the processed text, and the test verifies the boundary positions
 * on the raw text, the detected sentences with their original indexes, and
 * that annotations added to a sentence show up on the raw text.
 */
@Test
public void testGetDetectedSentences() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config testConfig = ConfigFactory.load();
    final String sessionId = "test";
    String[] noLabels = new String[0];

    String rawContent = "Sentence 1<sent/>Sentence 2. Sentence 3. Sentence 4.";
    RawText rawText = new RawText(rawContent, true, sessionId);

    // Mark the "<sent/>" tag both as a sentence break and as text to skip,
    // as a filter would.
    System.out.println("we add a sentence break annotation (as if it was added by a filter)");
    int tagStart = "Sentence 1".length();
    int tagEnd = "Sentence 1<sent/>".length();
    List<Annotation<RawTextSentenceBreakMarker>> breakMarkers = new ArrayList<>();
    breakMarkers.add(new Annotation<>(tagStart, tagEnd, new RawTextSentenceBreakMarker("me"), noLabels));
    rawText.addAnnotations(breakMarkers);
    List<Annotation<RawTextSkipMarker>> skipMarkers = new ArrayList<>();
    skipMarkers.add(new Annotation<>(tagStart, tagEnd, new RawTextSkipMarker("me"), noLabels));
    rawText.addAnnotations(skipMarkers);
    System.out.println("textBlock text: " + rawText.getText());
    System.out.println("textBlock annotations: " + rawText.getAnnotations().toString());

    AnnotatedText processed = rawText.getProcessedText();
    assertEquals("Sentence 1 Sentence 2. Sentence 3. Sentence 4.", processed.getText());

    // Add boundaries on the processed text, as a sentence detector would.
    System.out.println("add sentence boundaries to the processed text (as if they were added by a sentence detector)");
    List<Annotation<SentenceBoundary>> detectorBoundaries = new ArrayList<>();
    detectorBoundaries.add(new Annotation<>("Sentence 1".length(), "Sentence 1 Sentence 2.".length(), new SentenceBoundary(), noLabels));
    detectorBoundaries.add(new Annotation<>("Sentence 1 Sentence 2.".length(), "Sentence 1 Sentence 2. Sentence 3.".length(), new SentenceBoundary(), noLabels));
    processed.addAnnotations(detectorBoundaries);
    assertEquals("Sentence 1 Sentence 2. Sentence 3. Sentence 4.", processed.getText());
    System.out.println("textBlock text: " + rawText.getText());
    System.out.println("textBlock annotations: " + rawText.getAnnotations().toString());

    // The boundaries must have been propagated to the raw text at raw-text
    // positions (i.e. accounting for the skipped tag).
    assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3. Sentence 4.", rawText.getText());
    List<Annotation<SentenceBoundary>> rawBoundaries = rawText.getAnnotations(SentenceBoundary.class);
    assertEquals(2, rawBoundaries.size());
    Annotation<SentenceBoundary> firstBoundary = rawBoundaries.get(0);
    assertEquals("Sentence 1<sent/>".length(), firstBoundary.getStart());
    assertEquals("Sentence 1<sent/>Sentence 2.".length(), firstBoundary.getEnd());
    Annotation<SentenceBoundary> secondBoundary = rawBoundaries.get(1);
    assertEquals("Sentence 1<sent/>Sentence 2.".length(), secondBoundary.getStart());
    assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3.".length(), secondBoundary.getEnd());

    // Four sentences are expected, each mapped back to its raw-text start.
    List<Sentence> detected = rawText.getDetectedSentences();
    System.out.println("sentences: " + detected.toString());
    assertEquals(4, detected.size());
    assertEquals("Sentence 1", detected.get(0).getText());
    assertEquals("".length(), detected.get(0).getOriginalIndex(0));
    assertEquals("Sentence 2.", detected.get(1).getText());
    assertEquals("Sentence 1<sent/>".length(), detected.get(1).getOriginalIndex(0));
    assertEquals("Sentence 3.", detected.get(2).getText());
    assertEquals("Sentence 1<sent/>Sentence 2. ".length(), detected.get(2).getOriginalIndex(0));
    assertEquals("Sentence 4.", detected.get(3).getText());
    assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3. ".length(), detected.get(3).getOriginalIndex(0));

    // An annotation added to a sentence must appear on the raw text too.
    Sentence fourth = detected.get(3);
    List<Annotation<String>> sentenceAnnotations = new ArrayList<>();
    sentenceAnnotations.add(new Annotation<String>("Sentence ".length(), "Sentence 4".length(), "four", noLabels));
    fourth.addAnnotations(sentenceAnnotations);
    List<Annotation<String>> rawAnnotations = rawText.getAnnotations(String.class);
    assertEquals(1, rawAnnotations.size());
    assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3. Sentence ".length(), rawAnnotations.get(0).getStart());
    assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3. Sentence 4".length(), rawAnnotations.get(0).getEnd());
}
Also used : SentenceBoundary(com.joliciel.talismane.sentenceDetector.SentenceBoundary) AnnotatedText(com.joliciel.talismane.AnnotatedText) Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) Annotation(com.joliciel.talismane.Annotation) RawTextSkipMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker) RawTextSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 4 with SentenceBoundary

use of com.joliciel.talismane.sentenceDetector.SentenceBoundary in project talismane by joliciel-informatique.

the class RollingTextBlockTest method testGetDetectedSentences.

/**
 * Checks sentence detection on a {@code RollingTextBlock} as text is rolled
 * in piece by piece: sentences are detected in several rounds, and after
 * each round the test verifies the detected sentences, their original
 * indexes, and the sentence boundary annotations visible on the block.
 */
@Test
public void testGetDetectedSentences() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config testConfig = ConfigFactory.load();
    final String sessionId = "test";
    String[] noLabels = new String[0];

    RollingTextBlock rolling = new RollingTextBlock(true, null, sessionId);
    rolling = rolling.roll("Sentence 1<sent/>Sentence 2. Sentence");
    rolling = rolling.roll(" 3.");

    // The raw text block always holds the last two sub-blocks added, so the
    // annotations below are positioned relative to those sub-blocks.
    AnnotatedText rawBlock = rolling.getRawTextBlock();
    List<Annotation<RawTextSentenceBreakMarker>> breakMarkers = new ArrayList<>();
    // A sentence break annotation, as a filter would add it.
    System.out.println("we add a sentence break annotation (as if it was added by a filter)");
    breakMarkers.add(new Annotation<>("".length(), "Sentence 1<sent/>".length(), new RawTextSentenceBreakMarker("me"), noLabels));
    rawBlock.addAnnotations(breakMarkers);
    List<Annotation<RawTextSkipMarker>> skipMarkers = new ArrayList<>();
    skipMarkers.add(new Annotation<>("Sentence 1".length(), "Sentence 1<sent/>".length(), new RawTextSkipMarker("me"), noLabels));
    rawBlock.addAnnotations(skipMarkers);
    System.out.println("textBlock text: " + rolling.getText());
    System.out.println("textBlock annotations: " + rolling.getAnnotations().toString());

    rolling = rolling.roll(" Sentence 4.");
    AnnotatedText processed = rolling.getProcessedText();
    assertEquals("Sentence 1 Sentence 2. Sentence 3.", processed.getText());

    // First round: a boundary on the processed text, as a sentence detector
    // would add it.
    System.out.println("add sentence boundaries to the processed text (as if they were added by a sentence detector)");
    List<Annotation<SentenceBoundary>> firstRoundBoundaries = new ArrayList<>();
    firstRoundBoundaries.add(new Annotation<>("Sentence 1".length(), "Sentence 1 Sentence 2.".length(), new SentenceBoundary(), noLabels));
    processed.addAnnotations(firstRoundBoundaries);
    List<Sentence> firstRound = rolling.getDetectedSentences();
    System.out.println("sentences: " + firstRound.toString());
    assertEquals(2, firstRound.size());
    assertEquals("Sentence 1", firstRound.get(0).getText());
    assertEquals("".length(), firstRound.get(0).getOriginalIndex(0));
    assertEquals("Sentence 2.", firstRound.get(1).getText());
    assertEquals("Sentence 1<sent/>".length(), firstRound.get(1).getOriginalIndex(0));
    System.out.println("textBlock text: " + rolling.getText());
    System.out.println("textBlock annotations: " + rolling.getAnnotations().toString());

    rolling = rolling.roll("");
    // All text up to sentence 4 has now been rolled into the processed area.
    processed = rolling.getProcessedText();
    assertEquals("Sentence 1 Sentence 2. Sentence 3. Sentence 4.", processed.getText());

    // Second round: the boundary for "Sentence 3".
    System.out.println("add a sentence boundary for \"Sentence 3\", this time inside the analysis range");
    List<Annotation<SentenceBoundary>> secondRoundBoundaries = new ArrayList<>();
    secondRoundBoundaries.add(new Annotation<>("Sentence 1 Sentence 2.".length(), "Sentence 1 Sentence 2. Sentence 3.".length(), new SentenceBoundary(), noLabels));
    processed.addAnnotations(secondRoundBoundaries);
    List<Sentence> secondRound = rolling.getDetectedSentences();
    System.out.println("sentences: " + secondRound.toString());
    assertEquals(1, secondRound.size());
    assertEquals("Sentence 3.", secondRound.get(0).getText());
    assertEquals("Sentence 1<sent/>Sentence 2. ".length(), secondRound.get(0).getOriginalIndex(0));
    System.out.println("textBlock text: " + rolling.getText());
    System.out.println("textBlock annotations: " + rolling.getAnnotations().toString());

    // Both boundaries must sit at the right raw-text positions.
    assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3. Sentence 4.", rolling.getText());
    List<Annotation<SentenceBoundary>> blockBoundaries = rolling.getAnnotations(SentenceBoundary.class);
    System.out.println(blockBoundaries.toString());
    assertEquals(2, blockBoundaries.size());
    assertEquals("Sentence 1<sent/>".length(), blockBoundaries.get(0).getStart());
    assertEquals("Sentence 1<sent/>Sentence 2.".length(), blockBoundaries.get(0).getEnd());
    assertEquals("Sentence 1<sent/>Sentence 2.".length(), blockBoundaries.get(1).getStart());
    assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3.".length(), blockBoundaries.get(1).getEnd());

    // Final round: rolling in one more empty block leaves an empty block at
    // block 3, so any leftover in block 2 should be marked as complete, since
    // sentences never overlap empty blocks.
    rolling = rolling.roll("");
    List<Sentence> finalRound = rolling.getDetectedSentences();
    System.out.println("sentences: " + finalRound.toString());
    assertEquals(1, finalRound.size());
    assertEquals("Sentence 4.", finalRound.get(0).getText());
    assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3. ".length(), finalRound.get(0).getOriginalIndex(0));

    // Note: by now the first two blocks have been rolled out of the text block.
    System.out.println("textBlock text: " + rolling.getText());
    System.out.println("textBlock annotations: " + rolling.getAnnotations().toString());
    // The remaining boundary is expressed relative to the shortened text.
    assertEquals(" 3. Sentence 4.", rolling.getText());
    List<Annotation<SentenceBoundary>> remainingBoundaries = rolling.getAnnotations(SentenceBoundary.class);
    assertEquals(1, remainingBoundaries.size());
    assertEquals("".length(), remainingBoundaries.get(0).getStart());
    assertEquals(" 3.".length(), remainingBoundaries.get(0).getEnd());

    // An annotation added to a sentence must appear on the raw text too.
    Sentence fourth = finalRound.get(0);
    List<Annotation<String>> sentenceAnnotations = new ArrayList<>();
    sentenceAnnotations.add(new Annotation<String>("Sentence ".length(), "Sentence 4".length(), "four", noLabels));
    fourth.addAnnotations(sentenceAnnotations);
    System.out.println("textBlock text: " + rolling.getText());
    System.out.println("textBlock annotations: " + rolling.getAnnotations().toString());
    List<Annotation<String>> blockAnnotations = rolling.getAnnotations(String.class);
    assertEquals(1, blockAnnotations.size());
    assertEquals(" 3. Sentence ".length(), blockAnnotations.get(0).getStart());
    assertEquals(" 3. Sentence 4".length(), blockAnnotations.get(0).getEnd());
    // result not checked: the call is only exercised after the final roll
    rolling.getProcessedText();
}
Also used : SentenceBoundary(com.joliciel.talismane.sentenceDetector.SentenceBoundary) AnnotatedText(com.joliciel.talismane.AnnotatedText) Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) Annotation(com.joliciel.talismane.Annotation) RawTextSkipMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker) RawTextSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Aggregations

AnnotatedText (com.joliciel.talismane.AnnotatedText)4 Annotation (com.joliciel.talismane.Annotation)4 SentenceBoundary (com.joliciel.talismane.sentenceDetector.SentenceBoundary)4 ArrayList (java.util.ArrayList)4 AnnotationObserver (com.joliciel.talismane.AnnotationObserver)2 TalismaneTest (com.joliciel.talismane.TalismaneTest)2 RawTextSentenceBreakMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker)2 RawTextSkipMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker)2 Config (com.typesafe.config.Config)2 List (java.util.List)2 Test (org.junit.Test)2