Search in sources :

Example 1 with AnnotatedText

use of com.joliciel.talismane.AnnotatedText in project talismane by joliciel-informatique.

the class SentenceDetector method detectSentences.

/**
 * Detect sentences within an annotated text. Sentences are added in the form
 * of an Annotation around a {@link SentenceBoundary}, with the start position
 * (relative to the start of the annotated text) at the start of the sentence
 * and the end position immediately after the end of the sentence. <br>
 * <br>
 * Sentence boundaries will not be detected within any annotation of type
 * {@link RawTextNoSentenceBreakMarker}, nor will they be detected before or
 * after the {@link AnnotatedText#getAnalysisStart()} and
 * {@link AnnotatedText#getAnalysisEnd()} respectively. <br>
 * <br>
 * If the text contained existing {@link SentenceBoundary} annotations before
 * analysis start, the first sentence will begin where the last existing
 * annotation ended. Otherwise, the first boundary will begin at position 0.
 * In both cases the start is then advanced past any leading whitespace.
 * <br>
 * <br>
 * If the text's analysis end is equal to the text length, it is assumed that
 * the text end is a sentence boundary. In this case, an additional sentence
 * is added starting at the final detected boundary and ending at text end.
 *
 * @param text
 *          the annotated text in which we need to detect sentences.
 * @param labels
 *          labels passed on to every {@link SentenceBoundary} annotation
 *          created by this call.
 * @return in addition to the annotations added, we return a sorted List of
 *         integers marking the end position of each sentence boundary: the
 *         ends of all {@link RawTextSentenceBreakMarker} annotations ending
 *         at or after analysis start, plus the position immediately after
 *         each candidate character that was classified as a boundary.
 * @throws TalismaneException
 *           declared for failures in the underlying analysis (presumably the
 *           feature checks or the decision maker - not visible from here).
 */
public List<Integer> detectSentences(AnnotatedText text, String... labels) throws TalismaneException {
    LOG.debug("detectSentences");
    List<Annotation<RawTextNoSentenceBreakMarker>> noSentenceBreakMarkers = text.getAnnotations(RawTextNoSentenceBreakMarker.class);

    // collect every candidate position inside the analysis window which is
    // not covered by a "no sentence break" marker
    Matcher matcher = possibleBoundaryPattern.matcher(text.getText());
    List<Integer> possibleBoundaries = new ArrayList<>();
    while (matcher.find()) {
        int position = matcher.start();
        if (position < text.getAnalysisStart() || position >= text.getAnalysisEnd()) {
            continue;
        }
        boolean noSentences = false;
        for (Annotation<RawTextNoSentenceBreakMarker> noSentenceBreakMarker : noSentenceBreakMarkers) {
            if (noSentenceBreakMarker.getStart() <= position && position < noSentenceBreakMarker.getEnd()) {
                noSentences = true;
                break;
            }
        }
        if (!noSentences) {
            possibleBoundaries.add(position);
        }
    }

    // collect all deterministic sentence boundaries; the TreeSet keeps the
    // boundaries sorted and free of duplicates
    List<Annotation<RawTextSentenceBreakMarker>> sentenceBreakMarkers = text.getAnnotations(RawTextSentenceBreakMarker.class);
    Set<Integer> guessedBoundaries = sentenceBreakMarkers.stream().filter(f -> f.getEnd() >= text.getAnalysisStart()).map(f -> f.getEnd())
            .collect(Collectors.toCollection(() -> new TreeSet<Integer>()));

    // Share one token sequence for all possible boundaries, to avoid tokenising
    // multiple times
    Sentence sentence = new Sentence(text.getText(), sessionId);
    TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
    List<PossibleSentenceBoundary> boundaries = new ArrayList<>();
    for (int possibleBoundary : possibleBoundaries) {
        PossibleSentenceBoundary boundary = new PossibleSentenceBoundary(tokenSequence, possibleBoundary);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Testing boundary: " + boundary);
            LOG.trace(" at position: " + possibleBoundary);
        }
        List<FeatureResult<?>> featureResults = new ArrayList<>();
        for (SentenceDetectorFeature<?> feature : features) {
            // a fresh environment per feature check, as in the original code
            RuntimeEnvironment env = new RuntimeEnvironment();
            FeatureResult<?> featureResult = feature.check(boundary, env);
            if (featureResult != null) {
                featureResults.add(featureResult);
            }
        }
        if (LOG.isTraceEnabled()) {
            // sorted so that the trace output is stable
            SortedSet<String> featureResultSet = featureResults.stream().map(f -> f.toString()).collect(Collectors.toCollection(() -> new TreeSet<String>()));
            for (String featureResultString : featureResultSet) {
                LOG.trace(featureResultString);
            }
        }
        List<Decision> decisions = this.decisionMaker.decide(featureResults);
        if (LOG.isTraceEnabled()) {
            for (Decision decision : decisions) {
                LOG.trace(decision.getOutcome() + ": " + decision.getProbability());
            }
        }
        // only the first decision in the returned list is consulted
        if (decisions.get(0).getOutcome().equals(SentenceDetectorOutcome.IS_BOUNDARY.name())) {
            // the sentence ends immediately after the boundary character
            int boundaryEnd = possibleBoundary + 1;
            if (LOG.isTraceEnabled()) {
                // parenthesised on purpose: without the parentheses the string
                // concatenation would append "1" to the position instead of
                // adding 1 to it
                LOG.trace("Adding boundary: " + boundaryEnd);
            }
            guessedBoundaries.add(boundaryEnd);
            boundaries.add(boundary);
        }
    }
    if (LOG.isTraceEnabled()) {
        LOG.trace("context: " + text.getText().toString().replace('\n', '¶').replace('\r', '¶'));
        for (PossibleSentenceBoundary boundary : boundaries) {
            LOG.trace("boundary: " + boundary.toString());
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("guessedBoundaries : " + guessedBoundaries.toString());
    }

    // the first new sentence starts where the last existing one ended, or at 0
    List<Annotation<SentenceBoundary>> newBoundaries = new ArrayList<>();
    int lastBoundary = 0;
    List<Annotation<SentenceBoundary>> existingBoundaries = text.getAnnotations(SentenceBoundary.class);
    if (!existingBoundaries.isEmpty()) {
        lastBoundary = existingBoundaries.get(existingBoundaries.size() - 1).getEnd();
    }
    // advance boundary start until a non space character is encountered
    while (lastBoundary < text.getAnalysisEnd() && Character.isWhitespace(text.getText().charAt(lastBoundary))) {
        lastBoundary++;
    }
    // guessedBoundaries iterates in ascending order, so each sentence starts
    // where the previous one ended; boundaries at or before the current start
    // are ignored
    for (int guessedBoundary : guessedBoundaries) {
        if (guessedBoundary > lastBoundary) {
            Annotation<SentenceBoundary> sentenceBoundary = new Annotation<>(lastBoundary, guessedBoundary, new SentenceBoundary(), labels);
            newBoundaries.add(sentenceBoundary);
            if (LOG.isTraceEnabled()) {
                LOG.trace("Added boundary: " + sentenceBoundary);
            }
            lastBoundary = guessedBoundary;
        }
    }
    // when the analysis window reaches the end of the text, the text end
    // closes a final sentence
    if (text.getAnalysisEnd() == text.getText().length() && text.getAnalysisEnd() > lastBoundary) {
        Annotation<SentenceBoundary> sentenceBoundary = new Annotation<>(lastBoundary, text.getAnalysisEnd(), new SentenceBoundary(), labels);
        newBoundaries.add(sentenceBoundary);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Added final boundary: " + sentenceBoundary);
        }
    }
    text.addAnnotations(newBoundaries);
    return new ArrayList<>(guessedBoundaries);
}
Also used : ZipInputStream(java.util.zip.ZipInputStream) SortedSet(java.util.SortedSet) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) MachineLearningModelFactory(com.joliciel.talismane.machineLearning.MachineLearningModelFactory) TreeSet(java.util.TreeSet) TalismaneException(com.joliciel.talismane.TalismaneException) RawTextNoSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker) ArrayList(java.util.ArrayList) ClassificationModel(com.joliciel.talismane.machineLearning.ClassificationModel) HashSet(java.util.HashSet) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) SentenceDetectorFeatureParser(com.joliciel.talismane.sentenceDetector.features.SentenceDetectorFeatureParser) Matcher(java.util.regex.Matcher) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult) Map(java.util.Map) ConfigUtils(com.joliciel.talismane.utils.ConfigUtils) ConfigFactory(com.typesafe.config.ConfigFactory) ExternalResourceFinder(com.joliciel.talismane.machineLearning.ExternalResourceFinder) AnnotatedText(com.joliciel.talismane.AnnotatedText) ExternalResource(com.joliciel.talismane.machineLearning.ExternalResource) SentenceDetectorFeature(com.joliciel.talismane.sentenceDetector.features.SentenceDetectorFeature) DecisionMaker(com.joliciel.talismane.machineLearning.DecisionMaker) Logger(org.slf4j.Logger) Config(com.typesafe.config.Config) Collection(java.util.Collection) Set(java.util.Set) IOException(java.io.IOException) Decision(com.joliciel.talismane.machineLearning.Decision) Collectors(java.util.stream.Collectors) RawTextSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker) List(java.util.List) Annotation(com.joliciel.talismane.Annotation) Annotator(com.joliciel.talismane.Annotator) Pattern(java.util.regex.Pattern) Sentence(com.joliciel.talismane.rawText.Sentence) 
InputStream(java.io.InputStream) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) RawTextNoSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker) TreeSet(java.util.TreeSet) Sentence(com.joliciel.talismane.rawText.Sentence) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) Annotation(com.joliciel.talismane.Annotation) Decision(com.joliciel.talismane.machineLearning.Decision) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult)

Example 2 with AnnotatedText

use of com.joliciel.talismane.AnnotatedText in project talismane by joliciel-informatique.

the class SentenceDetectorEvaluator method evaluate.

/**
 * Evaluate a given sentence detector. Each corpus sentence is analysed in a
 * context made of the previous sentence (or ". " for the first one) before it
 * and at least {@code minCharactersAfterBoundary} characters of following
 * sentences after it, when the corpus has that much text left. Every
 * candidate position inside the sentence, plus the real end of the sentence,
 * is scored as boundary / not boundary against the detector's guesses.
 * <br>
 * <br>
 * Scores are accumulated globally and per boundary character. If an error
 * writer is set, the per-character statistics and the list of mismatches are
 * written to it, and it is closed before returning. An empty corpus yields an
 * empty calculator.
 *
 * @return an f-score calculator for this sentence detector
 * @throws TalismaneException
 *           declared by the corpus reader / sentence detector calls
 * @throws IOException
 *           if reading the corpus or writing to the error writer fails
 */
public FScoreCalculator<SentenceDetectorOutcome> evaluate() throws TalismaneException, IOException {
    FScoreCalculator<SentenceDetectorOutcome> fScoreCalculator = new FScoreCalculator<SentenceDetectorOutcome>();
    // add f-score per tagger module, to see how we do for each boundary
    // character
    Map<String, FScoreCalculator<SentenceDetectorOutcome>> taggerFScoreCalculators = new TreeMap<String, FScoreCalculator<SentenceDetectorOutcome>>();
    Map<String, List<String>> errorMap = new TreeMap<String, List<String>>();
    // queue of sentences read ahead from the corpus but not yet evaluated
    LinkedList<String> sentences = new LinkedList<String>();
    String previousSentence = ". ";
    // Only seed the queue when the corpus really has a sentence: queueing a
    // null for an empty corpus would fail later on sentence.length().
    if (corpusReader.hasNextSentence()) {
        sentences.add(corpusReader.nextSentence().getText().toString());
    }
    while (!sentences.isEmpty()) {
        String sentence = sentences.poll();
        LOG.debug("Sentence: " + sentence);

        // gather enough following text to give the detector right-hand context
        String moreText = "";
        int sentenceIndex = 0;
        while (moreText.length() < minCharactersAfterBoundary) {
            String nextSentence = "";
            if (sentenceIndex < sentences.size()) {
                nextSentence = sentences.get(sentenceIndex);
            } else if (corpusReader.hasNextSentence()) {
                nextSentence = corpusReader.nextSentence().getText().toString();
                sentences.add(nextSentence);
            } else {
                break;
            }
            if (nextSentence.startsWith(" ") || nextSentence.startsWith("\n")) {
                moreText += nextSentence;
            } else {
                moreText += " " + nextSentence;
            }
            sentenceIndex++;
        }

        // the analysis window covers exactly the current sentence
        String text = previousSentence + sentence + moreText;
        AnnotatedText annotatedText = new AnnotatedText(text, previousSentence.length(), previousSentence.length() + sentence.length(), new ArrayList<>());
        Matcher matcher = sentenceDetector.getPossibleBoundaryPattern().matcher(text);
        List<Integer> possibleBoundaries = new ArrayList<Integer>();
        while (matcher.find()) {
            if (matcher.start() >= annotatedText.getAnalysisStart() && matcher.start() < annotatedText.getAnalysisEnd()) {
                possibleBoundaries.add(matcher.start());
            }
        }
        // the true boundary is always scored, even if the pattern missed it
        int realBoundary = previousSentence.length() + sentence.length();
        if (!possibleBoundaries.contains(realBoundary)) {
            possibleBoundaries.add(realBoundary);
        }
        List<Integer> guessedBoundaries = this.sentenceDetector.detectSentences(annotatedText);

        // NOTE(review): candidates are stored at matcher.start(), whereas
        // realBoundary is an end offset and the boundary character below is
        // read at possibleBoundary - 1 - confirm that both use the same
        // offset convention as the detector's returned positions.
        for (int possibleBoundary : possibleBoundaries) {
            SentenceDetectorOutcome expected = SentenceDetectorOutcome.IS_NOT_BOUNDARY;
            SentenceDetectorOutcome guessed = SentenceDetectorOutcome.IS_NOT_BOUNDARY;
            if (possibleBoundary == realBoundary) {
                expected = SentenceDetectorOutcome.IS_BOUNDARY;
            }
            if (guessedBoundaries.contains(possibleBoundary)) {
                guessed = SentenceDetectorOutcome.IS_BOUNDARY;
            }
            fScoreCalculator.increment(expected, guessed);

            // statistics are grouped by the character just before the position;
            // anything the boundary pattern does not match goes under "OTHER"
            String boundaryCharacter = "" + text.charAt(possibleBoundary - 1);
            Matcher boundaryMatcher = sentenceDetector.getPossibleBoundaryPattern().matcher(boundaryCharacter);
            if (!boundaryMatcher.matches()) {
                boundaryCharacter = "OTHER";
            }
            FScoreCalculator<SentenceDetectorOutcome> taggerFScoreCalculator = taggerFScoreCalculators.computeIfAbsent(boundaryCharacter,
                    key -> new FScoreCalculator<SentenceDetectorOutcome>());
            taggerFScoreCalculator.increment(expected, guessed);

            if (!expected.equals(guessed)) {
                // build a "left context[char]right context" excerpt of up to
                // NUM_CHARS characters on each side for the error report
                int start1 = possibleBoundary - NUM_CHARS;
                int end1 = possibleBoundary + NUM_CHARS;
                if (start1 < 0) {
                    start1 = 0;
                }
                String startString = text.substring(start1, possibleBoundary - 1);
                startString = StringUtils.padLeft(startString, NUM_CHARS);
                String middleString = "" + text.charAt(possibleBoundary - 1);
                if (end1 >= text.length()) {
                    end1 = text.length() - 1;
                }
                String endString = "";
                if (end1 >= 0 && possibleBoundary < text.length()) {
                    endString = text.substring(possibleBoundary, end1);
                }
                String testText = startString + "[" + middleString + "]" + endString;
                testText = testText.replace('\n', '¶');
                String error = "Guessed " + guessed + ", Expected " + expected + ". Text: " + testText;
                LOG.debug(error);
                errorMap.computeIfAbsent(boundaryCharacter, key -> new ArrayList<String>()).add(error);
            }
        }

        // the current sentence becomes the left context of the next one,
        // always separated from it by whitespace
        if (sentence.endsWith(" ")) {
            previousSentence = sentence;
        } else {
            previousSentence = sentence + " ";
        }
    }

    for (Map.Entry<String, FScoreCalculator<SentenceDetectorOutcome>> entry : taggerFScoreCalculators.entrySet()) {
        LOG.debug("###### Boundary " + entry.getKey());
        LOG.debug("###### Boundary " + entry.getKey() + ": f-score = " + entry.getValue().getTotalFScore());
    }

    if (errorWriter != null) {
        try {
            for (Map.Entry<String, FScoreCalculator<SentenceDetectorOutcome>> entry : taggerFScoreCalculators.entrySet()) {
                String boundaryCharacter = entry.getKey();
                FScoreCalculator<SentenceDetectorOutcome> taggerFScoreCalculator = entry.getValue();
                errorWriter.write("###### Tagger " + boundaryCharacter + ": f-score = " + taggerFScoreCalculator.getTotalFScore() + "\n");
                errorWriter.write("Total " + (taggerFScoreCalculator.getTotalTruePositiveCount() + taggerFScoreCalculator.getTotalFalseNegativeCount()) + "\n");
                errorWriter.write("True + " + taggerFScoreCalculator.getTotalTruePositiveCount() + "\n");
                errorWriter.write("False- " + taggerFScoreCalculator.getTotalFalseNegativeCount() + "\n");
                errorWriter.write("False+ " + taggerFScoreCalculator.getTotalFalsePositiveCount() + "\n");
                for (SentenceDetectorOutcome outcome : taggerFScoreCalculator.getOutcomeSet()) {
                    errorWriter.write(outcome + " total  " + (taggerFScoreCalculator.getTruePositiveCount(outcome) + taggerFScoreCalculator.getFalseNegativeCount(outcome)) + "\n");
                    errorWriter.write(outcome + " true + " + (taggerFScoreCalculator.getTruePositiveCount(outcome)) + "\n");
                    errorWriter.write(outcome + " false- " + (taggerFScoreCalculator.getFalseNegativeCount(outcome)) + "\n");
                    errorWriter.write(outcome + " false+ " + (taggerFScoreCalculator.getFalsePositiveCount(outcome)) + "\n");
                    errorWriter.write(outcome + " precis " + (taggerFScoreCalculator.getPrecision(outcome)) + "\n");
                    errorWriter.write(outcome + " recall " + (taggerFScoreCalculator.getRecall(outcome)) + "\n");
                    errorWriter.write(outcome + " fscore " + (taggerFScoreCalculator.getFScore(outcome)) + "\n");
                }
                List<String> errors = errorMap.get(boundaryCharacter);
                if (errors != null) {
                    for (String error : errors) {
                        errorWriter.write(error + "\n");
                    }
                }
                errorWriter.flush();
            }
        } finally {
            // close the writer even when one of the writes above fails
            errorWriter.close();
        }
    }
    return fScoreCalculator;
}
Also used : AnnotatedText(com.joliciel.talismane.AnnotatedText) Matcher(java.util.regex.Matcher) FScoreCalculator(com.joliciel.talismane.stats.FScoreCalculator) ArrayList(java.util.ArrayList) TreeMap(java.util.TreeMap) LinkedList(java.util.LinkedList) ArrayList(java.util.ArrayList) List(java.util.List) LinkedList(java.util.LinkedList)

Example 3 with AnnotatedText

use of com.joliciel.talismane.AnnotatedText in project talismane by joliciel-informatique.

the class RawTextProcessor method getDetectedSentences.

/**
 * Collect the sentences detected so far in the current sentence holder. Only
 * complete sentences are returned: when the last detected sentence is
 * incomplete it is removed from the result and held back as the leftover for
 * the next round.
 *
 * @return the complete sentences detected, each one observed so that
 *         annotations added to it are forwarded to this processor
 */
public final List<Sentence> getDetectedSentences() {
    SentenceHolder previous = this.getPreviousSentenceHolder();
    SentenceHolder current = this.getCurrentSentenceHolder();

    // boundary positions are shifted back by the length of the previous
    // holder's processed text before being handed to the current holder
    for (Annotation<SentenceBoundary> boundaryAnnotation : sentenceBoundaries) {
        int shiftedStart = boundaryAnnotation.getStart() - previous.getProcessedText().length();
        current.addSentenceBoundary(shiftedStart);
        int shiftedEnd = boundaryAnnotation.getEnd() - previous.getProcessedText().length();
        current.addSentenceBoundary(shiftedEnd);
    }

    List<Sentence> detected = current.getDetectedSentences(leftover);
    leftover = null;
    if (!detected.isEmpty()) {
        int lastIndex = detected.size() - 1;
        Sentence candidate = detected.get(lastIndex);
        if (!candidate.isComplete()) {
            // keep the incomplete tail for another round
            leftover = candidate;
            if (LOG.isTraceEnabled()) {
                LOG.trace("Set leftover to: " + leftover.toString());
            }
            detected.remove(lastIndex);
        }
    }

    // ensure that sentence annotations get added to the raw text as well
    for (Sentence detectedSentence : detected) {
        AnnotationObserver forwarder = new AnnotationObserver() {

            // start index of this processor at the time the observer is created
            int myOrigin = RawTextProcessor.this.originalStartIndex;

            @Override
            public <T extends Serializable> void beforeAddAnnotations(AnnotatedText subject, List<Annotation<T>> annotations) {
                // re-express each annotation in original-text positions,
                // relative to the origin captured above
                List<Annotation<T>> translated = new ArrayList<>();
                for (Annotation<T> incoming : annotations) {
                    int originalStart = detectedSentence.getOriginalIndex(incoming.getStart());
                    int originalEnd = detectedSentence.getOriginalIndex(incoming.getEnd());
                    translated.add(incoming.getAnnotation(originalStart - myOrigin, originalEnd - myOrigin));
                }
                RawTextProcessor.this.addAnnotations(translated);
            }

            @Override
            public <T extends Serializable> void afterAddAnnotations(AnnotatedText subject) {
                // nothing to do once the annotations have been added
            }
        };
        detectedSentence.addObserver(forwarder);
    }

    // without original text segments in the current holder there is nothing
    // more to attach to the leftover
    if (current.getOriginalTextSegments().size() == 0) {
        return detected;
    }

    // append the holder's original text segments to the leftover's original
    // text, creating an empty leftover sentence when there is none
    if (leftover == null) {
        leftover = new Sentence("", currentFile, sessionId);
    }
    StringBuilder segmentsToInsert = new StringBuilder();
    if (leftover.getLeftoverOriginalText().length() > 0) {
        // separate the existing leftover text from the new segments
        segmentsToInsert.append(TalismaneSession.get(sessionId).getOutputDivider());
    }
    for (String segment : current.getOriginalTextSegments().values()) {
        segmentsToInsert.append(segment);
    }
    leftover.setLeftoverOriginalText(leftover.getLeftoverOriginalText() + segmentsToInsert.toString());
    return detected;
}
Also used : SentenceBoundary(com.joliciel.talismane.sentenceDetector.SentenceBoundary) AnnotatedText(com.joliciel.talismane.AnnotatedText) Annotation(com.joliciel.talismane.Annotation) AnnotationObserver(com.joliciel.talismane.AnnotationObserver) ArrayList(java.util.ArrayList) List(java.util.List)

Example 4 with AnnotatedText

use of com.joliciel.talismane.AnnotatedText in project talismane by joliciel-informatique.

the class RollingTextBlock method getRawTextBlock.

/**
 * Build the raw text block which filters will annotate. The block's text is
 * block 3 followed by block 4 of the current RollingTextBlock, with analysis
 * starting at 0 and ending at the end of block 3. It is assumed that
 * annotations crossing block 2 and 3 were already added by a predecessor.
 * <br>
 * <br>
 * Annotations added to the returned block are copied onto this
 * RollingTextBlock, shifted by the combined length of blocks 1 and 2.
 *
 * @return a new annotated text over blocks 3 and 4
 */
public AnnotatedText getRawTextBlock() {
    AnnotatedText rawTextBlock = new AnnotatedText(this.block3 + this.block4, 0, this.block3.length());
    AnnotationObserver relay = new AnnotationObserver() {

        @Override
        public <T extends Serializable> void beforeAddAnnotations(AnnotatedText subject, List<Annotation<T>> annotations) {
            if (annotations.isEmpty()) {
                return;
            }
            // positions in the raw block start at block 3, so blocks 1 and 2
            // have to be added back to reach rolling-block positions
            int shift = RollingTextBlock.this.block1.length() + RollingTextBlock.this.block2.length();
            List<Annotation<T>> shifted = new ArrayList<>();
            for (Annotation<T> received : annotations) {
                shifted.add(received.getAnnotation(received.getStart() + shift, received.getEnd() + shift));
            }
            RollingTextBlock.this.addAnnotations(shifted);
            if (LOG.isTraceEnabled()) {
                LOG.trace("RawTextBlock Annotations received: " + annotations);
                LOG.trace("RawTextBlock Annotations added: " + shifted);
            }
        }

        @Override
        public <T extends Serializable> void afterAddAnnotations(AnnotatedText subject) {
            // nothing to do once the annotations have been added
        }
    };
    rawTextBlock.addObserver(relay);
    return rawTextBlock;
}
Also used : AnnotatedText(com.joliciel.talismane.AnnotatedText) List(java.util.List) ArrayList(java.util.ArrayList) Annotation(com.joliciel.talismane.Annotation) AnnotationObserver(com.joliciel.talismane.AnnotationObserver)

Example 5 with AnnotatedText

use of com.joliciel.talismane.AnnotatedText in project talismane by joliciel-informatique.

the class NewlineEndOfSentenceMarkerTest method testApply.

/**
 * Checks that {@link NewlineEndOfSentenceMarker} marks each line ending of a
 * two-line text with both a sentence break marker and a skip marker spanning
 * exactly the line-ending characters, for CR LF, lone CR and lone LF endings.
 */
@Test
public void testApply() throws Exception {
    NewlineEndOfSentenceMarker filter = new NewlineEndOfSentenceMarker(1000);
    // CR LF: the two characters form a single marker
    assertLineEndingMarkers(filter, "1\r\n2\r\n", 1, 3, 4, 6);
    // lone CR
    assertLineEndingMarkers(filter, "1\r2\r", 1, 2, 3, 4);
    // lone LF - this scenario used to repeat the lone CR input verbatim, so
    // bare line feeds were never exercised.
    // NOTE(review): expects a lone LF to be marked like a lone CR - confirm
    // against NewlineEndOfSentenceMarker.
    assertLineEndingMarkers(filter, "1\n2\n", 1, 2, 3, 4);
}

/**
 * Annotates {@code rawText} with {@code filter} and asserts that exactly two
 * sentence break markers and two skip markers were added, at the given spans.
 */
private void assertLineEndingMarkers(NewlineEndOfSentenceMarker filter, String rawText, int firstStart, int firstEnd, int secondStart, int secondEnd)
        throws Exception {
    AnnotatedText text = new AnnotatedText(rawText);
    filter.annotate(text);
    LOG.debug(text.getAnnotations().toString());
    List<Annotation<RawTextSentenceBreakMarker>> sentenceBreaks = text.getAnnotations(RawTextSentenceBreakMarker.class);
    assertEquals(2, sentenceBreaks.size());
    List<Annotation<RawTextSkipMarker>> skips = text.getAnnotations(RawTextSkipMarker.class);
    assertEquals(2, skips.size());
    assertEquals(firstStart, sentenceBreaks.get(0).getStart());
    assertEquals(firstEnd, sentenceBreaks.get(0).getEnd());
    assertEquals(firstStart, skips.get(0).getStart());
    assertEquals(firstEnd, skips.get(0).getEnd());
    assertEquals(secondStart, sentenceBreaks.get(1).getStart());
    assertEquals(secondEnd, sentenceBreaks.get(1).getEnd());
    assertEquals(secondStart, skips.get(1).getStart());
    assertEquals(secondEnd, skips.get(1).getEnd());
}
Also used : NewlineEndOfSentenceMarker(com.joliciel.talismane.rawText.NewlineEndOfSentenceMarker) AnnotatedText(com.joliciel.talismane.AnnotatedText) RawTextSkipMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker) Annotation(com.joliciel.talismane.Annotation) RawTextSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Aggregations

AnnotatedText (com.joliciel.talismane.AnnotatedText)22 Annotation (com.joliciel.talismane.Annotation)20 TalismaneTest (com.joliciel.talismane.TalismaneTest)16 Test (org.junit.Test)16 ArrayList (java.util.ArrayList)14 RawTextSkipMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker)10 Config (com.typesafe.config.Config)10 List (java.util.List)7 RawTextNoSentenceBreakMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker)5 RawTextSentenceBreakMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker)5 SentenceBoundary (com.joliciel.talismane.sentenceDetector.SentenceBoundary)4 AnnotationObserver (com.joliciel.talismane.AnnotationObserver)3 Decision (com.joliciel.talismane.machineLearning.Decision)3 DecisionMaker (com.joliciel.talismane.machineLearning.DecisionMaker)3 RawTextReplaceMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextReplaceMarker)3 SentenceDetectorFeature (com.joliciel.talismane.sentenceDetector.features.SentenceDetectorFeature)3 HashSet (java.util.HashSet)3 ClassificationSolution (com.joliciel.talismane.machineLearning.ClassificationSolution)2 GeometricMeanScoringStrategy (com.joliciel.talismane.machineLearning.GeometricMeanScoringStrategy)2 Sentence (com.joliciel.talismane.rawText.Sentence)2