Examples with RawTextSentenceBreakMarker - com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker

Example 1 with RawTextSentenceBreakMarker

use of com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker in project talismane by joliciel-informatique.

In the class SentenceDetector, method detectSentences:

/**
 * Detect sentences within an annotated text. Sentences are added in the form
 * of an Annotation around a {@link SentenceBoundary}, with the start position
 * (relative to the start of the annotated text) at the start of the sentence
 * and the end position immediately after the end of the sentence. <br>
 * <br>
 * Sentence boundaries will not be detected within any annotation of type
 * {@link RawTextNoSentenceBreakMarker}, nor will they be detected before or
 * after the {@link AnnotatedText#getAnalysisStart()} and
 * {@link AnnotatedText#getAnalysisEnd()} respectively. <br>
 * <br>
 * If the text contained existing {@link SentenceBoundary} annotations before
 * analysis start, the first sentence will begin where the last existing
 * annotation ended. Otherwise, the first boundary will begin at position 0.
 * In both cases the start is then advanced past any leading whitespace.
 * <br>
 * <br>
 * If the text's analysis end is equal to the text length, it is assumed that
 * the text end is a sentence boundary. In this case, an additional sentence
 * is added starting at the final detected boundary and ending at text end.
 *
 * @param text
 *          the annotated text in which we need to detect sentences.
 * @param labels
 *          labels passed on to each {@link SentenceBoundary} annotation
 *          created by this method.
 * @return in addition to the annotations added, we return a List of integers
 *         marking the end position of each sentence boundary, in ascending
 *         order. This includes both the boundaries guessed by the decision
 *         maker and the end positions of any
 *         {@link RawTextSentenceBreakMarker} annotations ending at or after
 *         analysis start.
 * @throws TalismaneException
 *           declared by the signature; presumably raised while checking
 *           features or making decisions - the throwing call is not
 *           identifiable from this method alone.
 */
public List<Integer> detectSentences(AnnotatedText text, String... labels) throws TalismaneException {
    LOG.debug("detectSentences");
    List<Annotation<RawTextNoSentenceBreakMarker>> noSentenceBreakMarkers = text.getAnnotations(RawTextNoSentenceBreakMarker.class);
    Matcher matcher = possibleBoundaryPattern.matcher(text.getText());
    List<Integer> possibleBoundaries = new ArrayList<>();
    // collect every candidate position inside the analysis range that is not
    // covered by a "no sentence break" marker
    while (matcher.find()) {
        if (matcher.start() >= text.getAnalysisStart() && matcher.start() < text.getAnalysisEnd()) {
            boolean noSentences = false;
            int position = matcher.start();
            for (Annotation<RawTextNoSentenceBreakMarker> noSentenceBreakMarker : noSentenceBreakMarkers) {
                // marker spans are treated as half-open: [start, end)
                if (noSentenceBreakMarker.getStart() <= position && position < noSentenceBreakMarker.getEnd()) {
                    noSentences = true;
                    break;
                }
            }
            if (!noSentences)
                possibleBoundaries.add(position);
        }
    }
    // collect all deterministic sentence boundaries
    List<Annotation<RawTextSentenceBreakMarker>> sentenceBreakMarkers = text.getAnnotations(RawTextSentenceBreakMarker.class);
    // a TreeSet keeps the boundaries sorted and removes duplicates
    Set<Integer> guessedBoundaries = new TreeSet<>(sentenceBreakMarkers.stream().filter(f -> f.getEnd() >= text.getAnalysisStart()).map(f -> f.getEnd()).collect(Collectors.toList()));
    // Share one token sequence for all possible boundaries, to avoid tokenising
    // multiple times
    Sentence sentence = new Sentence(text.getText(), sessionId);
    TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
    List<PossibleSentenceBoundary> boundaries = new ArrayList<>();
    for (int possibleBoundary : possibleBoundaries) {
        PossibleSentenceBoundary boundary = new PossibleSentenceBoundary(tokenSequence, possibleBoundary);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Testing boundary: " + boundary);
            LOG.trace(" at position: " + possibleBoundary);
        }
        List<FeatureResult<?>> featureResults = new ArrayList<>();
        for (SentenceDetectorFeature<?> feature : features) {
            RuntimeEnvironment env = new RuntimeEnvironment();
            FeatureResult<?> featureResult = feature.check(boundary, env);
            if (featureResult != null)
                featureResults.add(featureResult);
        }
        if (LOG.isTraceEnabled()) {
            // sorted so that the trace output is stable from run to run
            SortedSet<String> featureResultSet = featureResults.stream().map(f -> f.toString()).collect(Collectors.toCollection(() -> new TreeSet<String>()));
            for (String featureResultString : featureResultSet) {
                LOG.trace(featureResultString);
            }
        }
        List<Decision> decisions = this.decisionMaker.decide(featureResults);
        if (LOG.isTraceEnabled()) {
            for (Decision decision : decisions) {
                LOG.trace(decision.getOutcome() + ": " + decision.getProbability());
            }
        }
        // only the first decision in the returned list is consulted
        if (decisions.get(0).getOutcome().equals(SentenceDetectorOutcome.IS_BOUNDARY.name())) {
            if (LOG.isTraceEnabled()) {
                // parentheses are required: without them the position and the
                // literal 1 are concatenated as text (e.g. "51" instead of "6")
                LOG.trace("Adding boundary: " + (possibleBoundary + 1));
            }
            // the sentence ends immediately after the boundary character
            guessedBoundaries.add(possibleBoundary + 1);
            boundaries.add(boundary);
        }
    }
    if (LOG.isTraceEnabled()) {
        LOG.trace("context: " + text.getText().toString().replace('\n', '¶').replace('\r', '¶'));
        for (PossibleSentenceBoundary boundary : boundaries) LOG.trace("boundary: " + boundary.toString());
    }
    if (LOG.isDebugEnabled())
        LOG.debug("guessedBoundaries : " + guessedBoundaries.toString());
    List<Annotation<SentenceBoundary>> newBoundaries = new ArrayList<>();
    // the first new sentence starts where the last existing one ended, or at
    // position 0 if there is none
    int lastBoundary = 0;
    List<Annotation<SentenceBoundary>> existingBoundaries = text.getAnnotations(SentenceBoundary.class);
    if (existingBoundaries.size() > 0) {
        lastBoundary = existingBoundaries.get(existingBoundaries.size() - 1).getEnd();
    }
    // advance boundary start until a non space character is encountered
    while (lastBoundary < text.getAnalysisEnd() && Character.isWhitespace(text.getText().charAt(lastBoundary))) {
        lastBoundary++;
    }
    // guessedBoundaries iterates in ascending order, so each sentence runs
    // from the previous boundary to the next one
    for (int guessedBoundary : guessedBoundaries) {
        if (guessedBoundary > lastBoundary) {
            Annotation<SentenceBoundary> sentenceBoundary = new Annotation<>(lastBoundary, guessedBoundary, new SentenceBoundary(), labels);
            newBoundaries.add(sentenceBoundary);
            if (LOG.isTraceEnabled()) {
                LOG.trace("Added boundary: " + sentenceBoundary);
            }
            lastBoundary = guessedBoundary;
        }
    }
    // when the analysis reaches the end of the text, the remainder is treated
    // as a final sentence
    if (text.getAnalysisEnd() == text.getText().length()) {
        if (text.getAnalysisEnd() > lastBoundary) {
            Annotation<SentenceBoundary> sentenceBoundary = new Annotation<>(lastBoundary, text.getAnalysisEnd(), new SentenceBoundary(), labels);
            newBoundaries.add(sentenceBoundary);
            if (LOG.isTraceEnabled()) {
                LOG.trace("Added final boundary: " + sentenceBoundary);
            }
        }
    }
    text.addAnnotations(newBoundaries);
    return new ArrayList<>(guessedBoundaries);
}

Also used : ZipInputStream(java.util.zip.ZipInputStream) SortedSet(java.util.SortedSet) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) MachineLearningModelFactory(com.joliciel.talismane.machineLearning.MachineLearningModelFactory) TreeSet(java.util.TreeSet) TalismaneException(com.joliciel.talismane.TalismaneException) RawTextNoSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker) ArrayList(java.util.ArrayList) ClassificationModel(com.joliciel.talismane.machineLearning.ClassificationModel) HashSet(java.util.HashSet) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) SentenceDetectorFeatureParser(com.joliciel.talismane.sentenceDetector.features.SentenceDetectorFeatureParser) Matcher(java.util.regex.Matcher) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult) Map(java.util.Map) ConfigUtils(com.joliciel.talismane.utils.ConfigUtils) ConfigFactory(com.typesafe.config.ConfigFactory) ExternalResourceFinder(com.joliciel.talismane.machineLearning.ExternalResourceFinder) AnnotatedText(com.joliciel.talismane.AnnotatedText) ExternalResource(com.joliciel.talismane.machineLearning.ExternalResource) SentenceDetectorFeature(com.joliciel.talismane.sentenceDetector.features.SentenceDetectorFeature) DecisionMaker(com.joliciel.talismane.machineLearning.DecisionMaker) Logger(org.slf4j.Logger) Config(com.typesafe.config.Config) Collection(java.util.Collection) Set(java.util.Set) IOException(java.io.IOException) Decision(com.joliciel.talismane.machineLearning.Decision) Collectors(java.util.stream.Collectors) RawTextSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker) List(java.util.List) Annotation(com.joliciel.talismane.Annotation) Annotator(com.joliciel.talismane.Annotator) Pattern(java.util.regex.Pattern) Sentence(com.joliciel.talismane.rawText.Sentence) 
InputStream(java.io.InputStream) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) RawTextNoSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker) TreeSet(java.util.TreeSet) Sentence(com.joliciel.talismane.rawText.Sentence) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) Annotation(com.joliciel.talismane.Annotation) Decision(com.joliciel.talismane.machineLearning.Decision) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult)

Example 2 with RawTextSentenceBreakMarker

use of com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker in project talismane by joliciel-informatique.

In the class NewlineEndOfSentenceMarkerTest, method testApply:

/**
 * Checks that {@link NewlineEndOfSentenceMarker} annotates every line
 * terminator with one sentence-break marker and one skip marker covering the
 * same span, for each of the three terminator styles: CRLF, lone CR and lone
 * LF.
 */
@Test
public void testApply() throws Exception {
    NewlineEndOfSentenceMarker filter = new NewlineEndOfSentenceMarker(1000);
    // CRLF: each two-character terminator is covered by a single annotation
    assertLineBreaksMarked(filter, "1\r\n2\r\n", new int[][] { { 1, 3 }, { 4, 6 } });
    // lone CR
    assertLineBreaksMarked(filter, "1\r2\r", new int[][] { { 1, 2 }, { 3, 4 } });
    // lone LF - previously this scenario repeated the CR input, leaving LF
    // untested
    assertLineBreaksMarked(filter, "1\n2\n", new int[][] { { 1, 2 }, { 3, 4 } });
}

/**
 * Annotates {@code input} with {@code filter} and asserts that the resulting
 * sentence-break markers and skip markers each match {@code expectedSpans},
 * where every entry is a {start, end} pair.
 */
private void assertLineBreaksMarked(NewlineEndOfSentenceMarker filter, String input, int[][] expectedSpans) throws Exception {
    AnnotatedText text = new AnnotatedText(input);
    filter.annotate(text);
    LOG.debug(text.getAnnotations().toString());
    List<Annotation<RawTextSentenceBreakMarker>> sentenceBreaks = text.getAnnotations(RawTextSentenceBreakMarker.class);
    assertEquals(expectedSpans.length, sentenceBreaks.size());
    List<Annotation<RawTextSkipMarker>> skips = text.getAnnotations(RawTextSkipMarker.class);
    assertEquals(expectedSpans.length, skips.size());
    for (int i = 0; i < expectedSpans.length; i++) {
        int expectedStart = expectedSpans[i][0];
        int expectedEnd = expectedSpans[i][1];
        assertEquals(expectedStart, sentenceBreaks.get(i).getStart());
        assertEquals(expectedEnd, sentenceBreaks.get(i).getEnd());
        assertEquals(expectedStart, skips.get(i).getStart());
        assertEquals(expectedEnd, skips.get(i).getEnd());
    }
}

Also used : NewlineEndOfSentenceMarker(com.joliciel.talismane.rawText.NewlineEndOfSentenceMarker) AnnotatedText(com.joliciel.talismane.AnnotatedText) RawTextSkipMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker) Annotation(com.joliciel.talismane.Annotation) RawTextSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 3 with RawTextSentenceBreakMarker

use of com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker in project talismane by joliciel-informatique.

In the class RawTextRegexAnnotator, method annotate:

/**
 * Applies this annotator's regex to the text block. For every match whose
 * start lies inside the block's analysis range, one annotation is created
 * per configured filter type: a token attribute annotation for {@code TAG},
 * a raw text marker annotation for everything else. The collected
 * annotations are added to the text block once matching is complete.
 *
 * @param textBlock
 *          the text to search and annotate
 * @param labels
 *          labels passed on to every annotation created
 * @throws MatchTooLargeException
 *           if a positive block size is set and the matched span is longer
 *           than it
 */
@Override
public void annotate(AnnotatedText textBlock, String... labels) throws MatchTooLargeException {
    if (LOG.isTraceEnabled()) {
        LOG.trace("Matching " + regex.replace('\n', '¶').replace('\r', '¶'));
    }

    List<Annotation<RawTextMarker>> markerAnnotations = new ArrayList<>();
    List<Annotation<TokenAttribute<?>>> attributeAnnotations = new ArrayList<>();

    Matcher matcher = pattern.matcher(textBlock.getText());
    while (matcher.find()) {
        // group 0 means the whole match; otherwise only the selected group is annotated
        final int matcherStart = groupIndex == 0 ? matcher.start() : matcher.start(groupIndex);
        final int matcherEnd = groupIndex == 0 ? matcher.end() : matcher.end(groupIndex);

        CharSequence matchText = textBlock.getText().subSequence(matcher.start(), matcher.end());
        if (LOG.isTraceEnabled()) {
            LOG.trace("Next match: " + matchText.toString().replace('\n', '¶').replace('\r', '¶'));
            boolean groupDiffersFromMatch = matcher.start() != matcherStart || matcher.end() != matcherEnd;
            if (groupDiffersFromMatch) {
                CharSequence groupText = textBlock.getText().subSequence(matcherStart, matcherEnd);
                LOG.trace("But matching group: " + groupText.toString().replace('\n', '¶').replace('\r', '¶'));
            }
            LOG.trace("matcher.start()=" + matcher.start() + ", matcher.end()=" + matcher.end() + ", matcherStart=" + matcherStart + ", matcherEnd=" + matcherEnd + ", analysisStart=" + textBlock.getAnalysisStart() + ", analysisEnd=" + textBlock.getAnalysisEnd());
        }

        final int matchSize = matcherEnd - matcherStart;
        if (blockSize > 0 && matchSize > blockSize) {
            throw new MatchTooLargeException("Match size (" + matchSize + ") bigger than block size (" + blockSize + "). "
                + "Increase blockSize or change filter. "
                + "Maybe you need to change a greedy quantifier (e.g. .*) to a reluctant quantifier (e.g. .*?)? "
                + "Regex: " + regex + ". Text: " + matchText);
        }

        // matches starting outside the analysis range are logged and ignored
        boolean startInRange = matcherStart >= textBlock.getAnalysisStart() && matcherStart < textBlock.getAnalysisEnd();
        if (!startInRange) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("Start out of range: analysisStart " + textBlock.getAnalysisStart() + ">= matcherStart [[" + matcherStart + "]] < analysisEnd " + textBlock.getAnalysisEnd());
            }
            continue;
        }

        if (LOG.isTraceEnabled()) {
            LOG.trace("Start in range: analysisStart " + textBlock.getAnalysisStart() + ">= matcherStart [[" + matcherStart + "]] < analysisEnd " + textBlock.getAnalysisEnd());
        }

        for (RawTextMarkType filterType : filterTypes) {
            RawTextMarker marker;
            switch (filterType) {
                case TAG:
                    // the only type that yields a token attribute rather than a marker
                    attributeAnnotations.add(new Annotation<TokenAttribute<?>>(matcherStart, matcherEnd, this.attribute, labels));
                    continue;
                case REPLACE:
                    {
                        String insertionText = RegexUtils.getReplacement(replacement, textBlock.getText(), matcher);
                        if (LOG.isTraceEnabled()) {
                            LOG.trace("Setting replacement to: " + insertionText);
                        }
                        marker = new RawTextReplaceMarker(this.toString(), insertionText);
                        break;
                    }
                case SENTENCE_BREAK:
                    marker = new RawTextSentenceBreakMarker(this.toString());
                    break;
                case NO_SENTENCE_BREAK:
                    marker = new RawTextNoSentenceBreakMarker(this.toString());
                    break;
                case SKIP:
                    marker = new RawTextSkipMarker(this.toString());
                    break;
                default:
                    marker = new RawTextMarker(filterType, this.toString());
                    break;
            }
            Annotation<RawTextMarker> markerAnnotation = new Annotation<>(matcherStart, matcherEnd, marker, labels);
            markerAnnotations.add(markerAnnotation);
        }
    }

    if (!markerAnnotations.isEmpty()) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("For regex: " + this.regex.replace('\n', '¶').replace('\r', '¶'));
            LOG.debug("Added annotations: " + markerAnnotations);
        }
        textBlock.addAnnotations(markerAnnotations);
    }
    if (!attributeAnnotations.isEmpty()) {
        textBlock.addAnnotations(attributeAnnotations);
    }
}

Also used : RawTextReplaceMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextReplaceMarker) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) Annotation(com.joliciel.talismane.Annotation) RawTextNoSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker) RawTextSkipMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker) TokenAttribute(com.joliciel.talismane.tokeniser.TokenAttribute) RawTextSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker)

Example 4 with RawTextSentenceBreakMarker

use of com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker in project talismane by joliciel-informatique.

In the class RawTextTest, method testGetDetectedSentences:

/**
 * Exercises the round trip between a {@link RawText} and its processed
 * text: a sentence break marker and a skip marker placed on the raw text
 * shape the processed text, sentence boundaries added to the processed text
 * are visible at the corresponding raw offsets, and annotations added to a
 * detected sentence appear on the raw text.
 */
@Test
public void testGetDetectedSentences() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();
    final String sessionId = "test";
    final String[] labels = new String[0];

    final String rawInput = "Sentence 1<sent/>Sentence 2. Sentence 3. Sentence 4.";
    final String expectedProcessed = "Sentence 1 Sentence 2. Sentence 3. Sentence 4.";

    // offsets in the raw text
    final int rawTagStart = "Sentence 1".length();
    final int rawTagEnd = "Sentence 1<sent/>".length();
    final int rawSentence2End = "Sentence 1<sent/>Sentence 2.".length();
    final int rawSentence3Start = "Sentence 1<sent/>Sentence 2. ".length();
    final int rawSentence3End = "Sentence 1<sent/>Sentence 2. Sentence 3.".length();
    final int rawSentence4Start = "Sentence 1<sent/>Sentence 2. Sentence 3. ".length();
    // offsets in the processed text
    final int processedSentence1End = "Sentence 1".length();
    final int processedSentence2End = "Sentence 1 Sentence 2.".length();
    final int processedSentence3End = "Sentence 1 Sentence 2. Sentence 3.".length();

    RawText rawText = new RawText(rawInput, true, sessionId);

    // we add a sentence break annotation to the raw text (as if it was
    // added by a filter)
    System.out.println("we add a sentence break annotation (as if it was added by a filter)");
    List<Annotation<RawTextSentenceBreakMarker>> breakMarkers = new ArrayList<>();
    breakMarkers.add(new Annotation<>(rawTagStart, rawTagEnd, new RawTextSentenceBreakMarker("me"), labels));
    rawText.addAnnotations(breakMarkers);

    List<Annotation<RawTextSkipMarker>> skipMarkers = new ArrayList<>();
    skipMarkers.add(new Annotation<>(rawTagStart, rawTagEnd, new RawTextSkipMarker("me"), labels));
    rawText.addAnnotations(skipMarkers);

    System.out.println("textBlock text: " + rawText.getText());
    System.out.println("textBlock annotations: " + rawText.getAnnotations().toString());

    AnnotatedText processed = rawText.getProcessedText();
    assertEquals(expectedProcessed, processed.getText());

    // add sentence boundaries to the processed text (as if they were added
    // by a sentence detector)
    System.out.println("add sentence boundaries to the processed text (as if they were added by a sentence detector)");
    List<Annotation<SentenceBoundary>> boundaries = new ArrayList<>();
    boundaries.add(new Annotation<>(processedSentence1End, processedSentence2End, new SentenceBoundary(), labels));
    boundaries.add(new Annotation<>(processedSentence2End, processedSentence3End, new SentenceBoundary(), labels));
    processed.addAnnotations(boundaries);
    assertEquals(expectedProcessed, processed.getText());

    System.out.println("textBlock text: " + rawText.getText());
    System.out.println("textBlock annotations: " + rawText.getAnnotations().toString());

    // ensure that the sentence boundary annotations in the original text
    // are in the right place
    assertEquals(rawInput, rawText.getText());
    boundaries = rawText.getAnnotations(SentenceBoundary.class);
    assertEquals(2, boundaries.size());
    Annotation<SentenceBoundary> firstBoundary = boundaries.get(0);
    Annotation<SentenceBoundary> secondBoundary = boundaries.get(1);
    assertEquals(rawTagEnd, firstBoundary.getStart());
    assertEquals(rawSentence2End, firstBoundary.getEnd());
    assertEquals(rawSentence2End, secondBoundary.getStart());
    assertEquals(rawSentence3End, secondBoundary.getEnd());

    List<Sentence> detected = rawText.getDetectedSentences();
    System.out.println("sentences: " + detected.toString());
    assertEquals(4, detected.size());
    assertEquals("Sentence 1", detected.get(0).getText());
    assertEquals(0, detected.get(0).getOriginalIndex(0));
    assertEquals("Sentence 2.", detected.get(1).getText());
    assertEquals(rawTagEnd, detected.get(1).getOriginalIndex(0));
    assertEquals("Sentence 3.", detected.get(2).getText());
    assertEquals(rawSentence3Start, detected.get(2).getOriginalIndex(0));
    assertEquals("Sentence 4.", detected.get(3).getText());
    assertEquals(rawSentence4Start, detected.get(3).getOriginalIndex(0));

    // test that sentence annotations get added to the original raw text
    Sentence fourthSentence = detected.get(3);
    List<Annotation<String>> stringAnnotations = new ArrayList<>();
    stringAnnotations.add(new Annotation<String>("Sentence ".length(), "Sentence 4".length(), "four", labels));
    fourthSentence.addAnnotations(stringAnnotations);

    stringAnnotations = rawText.getAnnotations(String.class);
    assertEquals(1, stringAnnotations.size());
    Annotation<String> four = stringAnnotations.get(0);
    assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3. Sentence ".length(), four.getStart());
    assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3. Sentence 4".length(), four.getEnd());
}

Also used : SentenceBoundary(com.joliciel.talismane.sentenceDetector.SentenceBoundary) AnnotatedText(com.joliciel.talismane.AnnotatedText) Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) Annotation(com.joliciel.talismane.Annotation) RawTextSkipMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker) RawTextSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 5 with RawTextSentenceBreakMarker

use of com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker in project talismane by joliciel-informatique.

In the class RollingTextBlockTest, method testGetDetectedSentences:

/**
 * Walks a {@link RollingTextBlock} through several {@code roll} calls and
 * checks, after each step, the processed text, the sentences returned by
 * {@code getDetectedSentences()}, and the position of
 * {@link SentenceBoundary} annotations in the raw text. The assertions rely
 * on the exact order of the roll and annotation calls below.
 */
@Test
public void testGetDetectedSentences() throws Exception {
    // point the configuration at the test config and force a reload
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();
    final String sessionId = "test";
    String[] labels = new String[0];
    RollingTextBlock textBlock = new RollingTextBlock(true, null, sessionId);
    // roll in the first two sub-blocks; "Sentence 3." is split across them
    textBlock = textBlock.roll("Sentence 1<sent/>Sentence 2. Sentence");
    textBlock = textBlock.roll(" 3.");
    // the rawTextBlock always contains the last two added sub-blocks
    // so annotations are relative to these sub-blocks
    AnnotatedText rawTextBlock = textBlock.getRawTextBlock();
    List<Annotation<RawTextSentenceBreakMarker>> sentenceBreaks = new ArrayList<>();
    // we add a sentence break annotation (as if it was added by a filter)
    System.out.println("we add a sentence break annotation (as if it was added by a filter)");
    // NOTE(review): this marker starts at offset 0, whereas the skip marker
    // below starts after "Sentence 1" - confirm the start offset is intended
    sentenceBreaks.add(new Annotation<>("".length(), "Sentence 1<sent/>".length(), new RawTextSentenceBreakMarker("me"), labels));
    rawTextBlock.addAnnotations(sentenceBreaks);
    // the skip marker covers only the "<sent/>" tag
    List<Annotation<RawTextSkipMarker>> skips = new ArrayList<>();
    skips.add(new Annotation<>("Sentence 1".length(), "Sentence 1<sent/>".length(), new RawTextSkipMarker("me"), labels));
    rawTextBlock.addAnnotations(skips);
    System.out.println("textBlock text: " + textBlock.getText());
    System.out.println("textBlock annotations: " + textBlock.getAnnotations().toString());
    // roll in a third sub-block; the processed text asserted below does not
    // yet include it
    textBlock = textBlock.roll(" Sentence 4.");
    AnnotatedText processedTextBlock = textBlock.getProcessedText();
    assertEquals("Sentence 1 Sentence 2. Sentence 3.", processedTextBlock.getText());
    // add sentence boundaries to the processed text (as if they were added
    // by a sentence detector)
    System.out.println("add sentence boundaries to the processed text (as if they were added by a sentence detector)");
    List<Annotation<SentenceBoundary>> sentenceBoundaries = new ArrayList<>();
    sentenceBoundaries.add(new Annotation<>("Sentence 1".length(), "Sentence 1 Sentence 2.".length(), new SentenceBoundary(), labels));
    processedTextBlock.addAnnotations(sentenceBoundaries);
    // two sentences are expected at this point: "Sentence 1" and "Sentence 2."
    List<Sentence> sentences = textBlock.getDetectedSentences();
    System.out.println("sentences: " + sentences.toString());
    assertEquals(2, sentences.size());
    assertEquals("Sentence 1", sentences.get(0).getText());
    assertEquals("".length(), sentences.get(0).getOriginalIndex(0));
    assertEquals("Sentence 2.", sentences.get(1).getText());
    assertEquals("Sentence 1<sent/>".length(), sentences.get(1).getOriginalIndex(0));
    System.out.println("textBlock text: " + textBlock.getText());
    System.out.println("textBlock annotations: " + textBlock.getAnnotations().toString());
    textBlock = textBlock.roll("");
    // we have now rolled all text up until sentence 4 into the processed
    // area
    processedTextBlock = textBlock.getProcessedText();
    assertEquals("Sentence 1 Sentence 2. Sentence 3. Sentence 4.", processedTextBlock.getText());
    // add a sentence boundary for "Sentence 3"
    System.out.println("add a sentence boundary for \"Sentence 3\", this time inside the analysis range");
    sentenceBoundaries = new ArrayList<>();
    sentenceBoundaries.add(new Annotation<>("Sentence 1 Sentence 2.".length(), "Sentence 1 Sentence 2. Sentence 3.".length(), new SentenceBoundary(), labels));
    processedTextBlock.addAnnotations(sentenceBoundaries);
    // only the newly completed sentence is expected this time
    sentences = textBlock.getDetectedSentences();
    System.out.println("sentences: " + sentences.toString());
    assertEquals(1, sentences.size());
    assertEquals("Sentence 3.", sentences.get(0).getText());
    assertEquals("Sentence 1<sent/>Sentence 2. ".length(), sentences.get(0).getOriginalIndex(0));
    System.out.println("textBlock text: " + textBlock.getText());
    System.out.println("textBlock annotations: " + textBlock.getAnnotations().toString());
    // ensure that the sentence boundary annotations in the original text
    // are in the right place
    assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3. Sentence 4.", textBlock.getText());
    sentenceBoundaries = textBlock.getAnnotations(SentenceBoundary.class);
    System.out.println(sentenceBoundaries.toString());
    assertEquals(2, sentenceBoundaries.size());
    assertEquals("Sentence 1<sent/>".length(), sentenceBoundaries.get(0).getStart());
    assertEquals("Sentence 1<sent/>Sentence 2.".length(), sentenceBoundaries.get(0).getEnd());
    assertEquals("Sentence 1<sent/>Sentence 2.".length(), sentenceBoundaries.get(1).getStart());
    assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3.".length(), sentenceBoundaries.get(1).getEnd());
    // roll in a final empty block - we now have an empty block at block 3,
    // so that any leftover in block 2 should be marked as complete
    // since sentences never overlap empty blocks.
    textBlock = textBlock.roll("");
    sentences = textBlock.getDetectedSentences();
    System.out.println("sentences: " + sentences.toString());
    assertEquals(1, sentences.size());
    assertEquals("Sentence 4.", sentences.get(0).getText());
    // the original index is still counted from the start of the full text,
    // even though the text block itself now begins at " 3."
    assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3. ".length(), sentences.get(0).getOriginalIndex(0));
    // note: at this point the initial two blocks have been rolled out
    System.out.println("textBlock text: " + textBlock.getText());
    System.out.println("textBlock annotations: " + textBlock.getAnnotations().toString());
    // ensure that the sentence boundary annotations in the original text
    // are in the right place
    assertEquals(" 3. Sentence 4.", textBlock.getText());
    sentenceBoundaries = textBlock.getAnnotations(SentenceBoundary.class);
    assertEquals(1, sentenceBoundaries.size());
    assertEquals("".length(), sentenceBoundaries.get(0).getStart());
    assertEquals(" 3.".length(), sentenceBoundaries.get(0).getEnd());
    // test that sentence annotations get added to the original raw text
    Sentence sentence4 = sentences.get(0);
    List<Annotation<String>> annotations = new ArrayList<>();
    // offsets here are relative to the sentence text "Sentence 4."
    annotations.add(new Annotation<String>("Sentence ".length(), "Sentence 4".length(), "four", labels));
    sentence4.addAnnotations(annotations);
    System.out.println("textBlock text: " + textBlock.getText());
    System.out.println("textBlock annotations: " + textBlock.getAnnotations().toString());
    // the same annotation, read back from the text block, is expressed in
    // offsets relative to the current text block (" 3. Sentence 4.")
    annotations = textBlock.getAnnotations(String.class);
    assertEquals(1, annotations.size());
    assertEquals(" 3. Sentence ".length(), annotations.get(0).getStart());
    assertEquals(" 3. Sentence 4".length(), annotations.get(0).getEnd());
    // result is discarded; presumably this only checks that the call still
    // succeeds at this stage - confirm
    textBlock.getProcessedText();
}

Aggregations

Annotation (com.joliciel.talismane.Annotation)6 RawTextSentenceBreakMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker)6 AnnotatedText (com.joliciel.talismane.AnnotatedText)5 ArrayList (java.util.ArrayList)5 TalismaneTest (com.joliciel.talismane.TalismaneTest)4 RawTextSkipMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker)4 Config (com.typesafe.config.Config)4 Test (org.junit.Test)4 RawTextNoSentenceBreakMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker)3 Decision (com.joliciel.talismane.machineLearning.Decision)2 DecisionMaker (com.joliciel.talismane.machineLearning.DecisionMaker)2 SentenceBoundary (com.joliciel.talismane.sentenceDetector.SentenceBoundary)2 SentenceDetectorFeature (com.joliciel.talismane.sentenceDetector.features.SentenceDetectorFeature)2 HashSet (java.util.HashSet)2 List (java.util.List)2 Matcher (java.util.regex.Matcher)2 Annotator (com.joliciel.talismane.Annotator)1 TalismaneException (com.joliciel.talismane.TalismaneException)1 ClassificationModel (com.joliciel.talismane.machineLearning.ClassificationModel)1 ClassificationSolution (com.joliciel.talismane.machineLearning.ClassificationSolution)1