Search in sources :

Example 1 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

the class SentenceDetector method detectSentences.

/**
 * Detect sentences within an annotated text. Sentences are added in the form
 * of an Annotation around a {@link SentenceBoundary}, with the start position
 * (relative to the start of the annotated text) at the start of the sentence
 * and the end position immediately after the end of the sentence. <br>
 * <br>
 * Sentence boundaries will not be detected within any annotation of type
 * {@link RawTextNoSentenceBreakMarker}, nor will they be detected before or
 * after the {@link AnnotatedText#getAnalysisStart()} and
 * {@link AnnotatedText#getAnalysisEnd()} respectively. <br>
 * <br>
 * If the text contained existing {@link SentenceBoundary} annotations before
 * analysis start, the first sentence will begin where the last existing
 * annotation ended. Otherwise, the first boundary will begin at position 0.
 * <br>
 * <br>
 * If the text's analysis end is equal to the text length, it is assumed that
 * the text end is a sentence boundary. In this case, an additional sentence
 * is added starting at the final detected boundary and ending at text end.
 *
 * @param text
 *          the annotated text in which we need to detect sentences.
 * @param labels
 *          labels handed to every {@link SentenceBoundary} annotation created
 *          by this call.
 * @return in addition to the annotations added, we return a List of integers
 *         marking the end position of each sentence boundary, in ascending
 *         order. This contains the end of every
 *         {@link RawTextSentenceBreakMarker} ending at or after analysis
 *         start, plus (position + 1) for every candidate position that the
 *         decision maker classified as a boundary.
 * @throws TalismaneException
 *           declared by this method; it is not thrown directly here, so it
 *           presumably propagates from the components invoked (token
 *           sequence, features) - to be confirmed against those classes.
 */
public List<Integer> detectSentences(AnnotatedText text, String... labels) throws TalismaneException {
    LOG.debug("detectSentences");
    List<Annotation<RawTextNoSentenceBreakMarker>> noSentenceBreakMarkers = text.getAnnotations(RawTextNoSentenceBreakMarker.class);
    // collect the candidate positions: pattern matches inside the analysis
    // window which are not covered by a no-sentence-break marker
    Matcher matcher = possibleBoundaryPattern.matcher(text.getText());
    List<Integer> possibleBoundaries = new ArrayList<>();
    while (matcher.find()) {
        if (matcher.start() >= text.getAnalysisStart() && matcher.start() < text.getAnalysisEnd()) {
            boolean noSentences = false;
            int position = matcher.start();
            for (Annotation<RawTextNoSentenceBreakMarker> noSentenceBreakMarker : noSentenceBreakMarkers) {
                if (noSentenceBreakMarker.getStart() <= position && position < noSentenceBreakMarker.getEnd()) {
                    noSentences = true;
                    break;
                }
            }
            if (!noSentences) {
                possibleBoundaries.add(position);
            }
        }
    }
    // collect all deterministic sentence boundaries
    List<Annotation<RawTextSentenceBreakMarker>> sentenceBreakMarkers = text.getAnnotations(RawTextSentenceBreakMarker.class);
    // a TreeSet keeps the boundaries unique and in ascending order
    Set<Integer> guessedBoundaries = new TreeSet<>(sentenceBreakMarkers.stream().filter(f -> f.getEnd() >= text.getAnalysisStart()).map(f -> f.getEnd()).collect(Collectors.toList()));
    // Share one token sequence for all possible boundaries, to avoid tokenising
    // multiple times
    Sentence sentence = new Sentence(text.getText(), sessionId);
    TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
    List<PossibleSentenceBoundary> boundaries = new ArrayList<>();
    for (int possibleBoundary : possibleBoundaries) {
        PossibleSentenceBoundary boundary = new PossibleSentenceBoundary(tokenSequence, possibleBoundary);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Testing boundary: " + boundary);
            LOG.trace(" at position: " + possibleBoundary);
        }
        // evaluate every feature for this candidate, keeping non-null results
        List<FeatureResult<?>> featureResults = new ArrayList<>();
        for (SentenceDetectorFeature<?> feature : features) {
            RuntimeEnvironment env = new RuntimeEnvironment();
            FeatureResult<?> featureResult = feature.check(boundary, env);
            if (featureResult != null) {
                featureResults.add(featureResult);
            }
        }
        if (LOG.isTraceEnabled()) {
            SortedSet<String> featureResultSet = featureResults.stream().map(f -> f.toString()).collect(Collectors.toCollection(() -> new TreeSet<String>()));
            for (String featureResultString : featureResultSet) {
                LOG.trace(featureResultString);
            }
        }
        List<Decision> decisions = this.decisionMaker.decide(featureResults);
        if (LOG.isTraceEnabled()) {
            for (Decision decision : decisions) {
                LOG.trace(decision.getOutcome() + ": " + decision.getProbability());
            }
        }
        // only the first decision returned is considered
        if (decisions.get(0).getOutcome().equals(SentenceDetectorOutcome.IS_BOUNDARY.name())) {
            // the recorded boundary is the position just after the candidate
            // character
            int boundaryEnd = possibleBoundary + 1;
            if (LOG.isTraceEnabled()) {
                // parenthesised arithmetic: without it the string concatenation
                // would log e.g. "51" for position 5 instead of 6
                LOG.trace("Adding boundary: " + boundaryEnd);
            }
            guessedBoundaries.add(boundaryEnd);
            boundaries.add(boundary);
        }
    }
    if (LOG.isTraceEnabled()) {
        LOG.trace("context: " + text.getText().toString().replace('\n', '¶').replace('\r', '¶'));
        for (PossibleSentenceBoundary boundary : boundaries) {
            LOG.trace("boundary: " + boundary.toString());
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("guessedBoundaries : " + guessedBoundaries.toString());
    }
    List<Annotation<SentenceBoundary>> newBoundaries = new ArrayList<>();
    // the first new sentence starts where the last existing one ended, or at 0
    int lastBoundary = 0;
    List<Annotation<SentenceBoundary>> existingBoundaries = text.getAnnotations(SentenceBoundary.class);
    if (existingBoundaries.size() > 0) {
        lastBoundary = existingBoundaries.get(existingBoundaries.size() - 1).getEnd();
    }
    // advance boundary start until a non space character is encountered
    while (lastBoundary < text.getAnalysisEnd() && Character.isWhitespace(text.getText().charAt(lastBoundary))) {
        lastBoundary++;
    }
    // each boundary beyond the current start closes a sentence and opens the
    // next one
    for (int guessedBoundary : guessedBoundaries) {
        if (guessedBoundary > lastBoundary) {
            Annotation<SentenceBoundary> sentenceBoundary = new Annotation<>(lastBoundary, guessedBoundary, new SentenceBoundary(), labels);
            newBoundaries.add(sentenceBoundary);
            if (LOG.isTraceEnabled()) {
                LOG.trace("Added boundary: " + sentenceBoundary);
            }
            lastBoundary = guessedBoundary;
        }
    }
    // when the analysis runs to the end of the text, the text end closes the
    // final sentence
    if (text.getAnalysisEnd() == text.getText().length()) {
        if (text.getAnalysisEnd() > lastBoundary) {
            Annotation<SentenceBoundary> sentenceBoundary = new Annotation<>(lastBoundary, text.getAnalysisEnd(), new SentenceBoundary(), labels);
            newBoundaries.add(sentenceBoundary);
            if (LOG.isTraceEnabled()) {
                LOG.trace("Added final boundary: " + sentenceBoundary);
            }
        }
    }
    text.addAnnotations(newBoundaries);
    return new ArrayList<>(guessedBoundaries);
}
Also used : ZipInputStream(java.util.zip.ZipInputStream) SortedSet(java.util.SortedSet) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) MachineLearningModelFactory(com.joliciel.talismane.machineLearning.MachineLearningModelFactory) TreeSet(java.util.TreeSet) TalismaneException(com.joliciel.talismane.TalismaneException) RawTextNoSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker) ArrayList(java.util.ArrayList) ClassificationModel(com.joliciel.talismane.machineLearning.ClassificationModel) HashSet(java.util.HashSet) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) SentenceDetectorFeatureParser(com.joliciel.talismane.sentenceDetector.features.SentenceDetectorFeatureParser) Matcher(java.util.regex.Matcher) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult) Map(java.util.Map) ConfigUtils(com.joliciel.talismane.utils.ConfigUtils) ConfigFactory(com.typesafe.config.ConfigFactory) ExternalResourceFinder(com.joliciel.talismane.machineLearning.ExternalResourceFinder) AnnotatedText(com.joliciel.talismane.AnnotatedText) ExternalResource(com.joliciel.talismane.machineLearning.ExternalResource) SentenceDetectorFeature(com.joliciel.talismane.sentenceDetector.features.SentenceDetectorFeature) DecisionMaker(com.joliciel.talismane.machineLearning.DecisionMaker) Logger(org.slf4j.Logger) Config(com.typesafe.config.Config) Collection(java.util.Collection) Set(java.util.Set) IOException(java.io.IOException) Decision(com.joliciel.talismane.machineLearning.Decision) Collectors(java.util.stream.Collectors) RawTextSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker) List(java.util.List) Annotation(com.joliciel.talismane.Annotation) Annotator(com.joliciel.talismane.Annotator) Pattern(java.util.regex.Pattern) Sentence(com.joliciel.talismane.rawText.Sentence) 
InputStream(java.io.InputStream) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) RawTextNoSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker) TreeSet(java.util.TreeSet) Sentence(com.joliciel.talismane.rawText.Sentence) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) Annotation(com.joliciel.talismane.Annotation) Decision(com.joliciel.talismane.machineLearning.Decision) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult)

Example 2 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

the class RawTextProcessor method processText.

/**
 * Processes the current text based on annotations added to block 3, and
 * returns a SentenceHolder. <br>
 * <br>
 * If a SentenceHolder has already been built by a previous call (the
 * <code>sentenceHolder</code> field is non-null), it is returned as is and
 * nothing is recomputed. <br>
 * <br>
 * Otherwise the {@link RawTextMarker} annotations whose start and/or end fall
 * inside the requested range are converted into {@link RawTextInstruction}s,
 * which are applied position by position to build the processed text, the
 * sentence boundaries, the output-only text segments and the mapping from
 * processed text indexes back to original text indexes.
 *
 * @param textStartPos
 *          lower bound (inclusive) for annotation starts and ends to be
 *          handled; also subtracted from annotation positions to obtain
 *          positions relative to <code>rawText</code>.
 * @param textEndPos
 *          upper bound (exclusive) for annotation starts and ends to be
 *          handled; an annotation end equal to this value is only handled
 *          when <code>finalBlock</code> is true and <code>rawText</code> is
 *          not empty.
 * @param rawText
 *          the raw text from which the processed text is extracted.
 * @param finalBlock
 *          whether this is the final block; passed on to the SentenceHolder.
 * @return SentenceHolder to retrieve the sentences.
 */
protected final SentenceHolder processText(int textStartPos, int textEndPos, CharSequence rawText, boolean finalBlock) {
    // the result is cached: only compute it once
    if (this.sentenceHolder != null)
        return this.sentenceHolder;
    LOG.debug("processText");
    List<Annotation<RawTextMarker>> annotations = this.getAnnotations(RawTextMarker.class);
    if (LOG.isTraceEnabled()) {
        LOG.trace("finalBlock? " + finalBlock);
        LOG.trace("annotations: " + annotations.toString());
    }
    // position -> marks at that position, in ascending position order; the
    // boolean is true for an annotation start and false for an annotation end
    Map<Integer, List<Pair<Boolean, Annotation<RawTextMarker>>>> markMap = new TreeMap<>();
    for (Annotation<RawTextMarker> annotation : annotations) {
        if (LOG.isTraceEnabled())
            LOG.trace("Annotation: " + annotation.toString());
        // START_MARK processing mark: an annotation start is only handled if
        // it is in range and has not yet been given this mark.
        if (annotation.getStart() >= textStartPos && annotation.getStart() < textEndPos && !annotation.hasProcessingMark(START_MARK)) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("Start in range: textStartPos " + textStartPos + ">= matcherStart [[" + annotation.getStart() + "]] < textEndPos " + textEndPos + ", start applied? " + annotation.hasProcessingMark(START_MARK));
            }
            annotation.addProcessingMark(START_MARK);
            List<Pair<Boolean, Annotation<RawTextMarker>>> startMarks = markMap.get(annotation.getStart());
            if (startMarks == null) {
                startMarks = new ArrayList<>();
                markMap.put(annotation.getStart(), startMarks);
            }
            startMarks.add(new ImmutablePair<Boolean, Annotation<RawTextMarker>>(true, annotation));
        } else {
            if (LOG.isTraceEnabled()) {
                LOG.trace("Start out of range: textStartPos " + textStartPos + ">= matcherStart [[" + annotation.getStart() + "]] < textEndPos " + textEndPos + ", start applied? " + annotation.hasProcessingMark(START_MARK));
            }
        }
        // END_MARK processing mark: same principle for the annotation end,
        // except that an end exactly at textEndPos is accepted for a non-empty
        // final block.
        if (annotation.getEnd() >= textStartPos && (annotation.getEnd() < textEndPos || (annotation.getEnd() == textEndPos && rawText.length() > 0 && finalBlock)) && !annotation.hasProcessingMark(END_MARK)) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("End in range: textStartPos " + textStartPos + ">= matcherEnd [[" + annotation.getEnd() + "]] < textEndPos " + textEndPos + ", finalBlock? " + finalBlock + ", end applied? " + annotation.hasProcessingMark(END_MARK));
            }
            annotation.addProcessingMark(END_MARK);
            List<Pair<Boolean, Annotation<RawTextMarker>>> endMarks = markMap.get(annotation.getEnd());
            if (endMarks == null) {
                endMarks = new ArrayList<>();
                markMap.put(annotation.getEnd(), endMarks);
            }
            endMarks.add(new ImmutablePair<Boolean, Annotation<RawTextMarker>>(false, annotation));
        } else {
            if (LOG.isTraceEnabled()) {
                LOG.trace("End out of range: textStartPos " + textStartPos + ">= matcherEnd [[" + annotation.getEnd() + "]] < textEndPos " + textEndPos + ", finalBlock? " + finalBlock + ", end applied? " + annotation.hasProcessingMark(END_MARK));
            }
        }
    }
    if (LOG.isTraceEnabled()) {
        LOG.trace("currentText: " + rawText.toString().replace('\n', '¶').replace('\r', '¶'));
        LOG.trace("marks: " + markMap.toString());
    }
    // NOTE: this local variable shadows the sentenceHolder field until it is
    // assigned to the field at the end of the method
    SentenceHolder sentenceHolder = new SentenceHolder(originalIndexProcessed, finalBlock, sessionId);
    // find any newlines
    // first re-register the newline position remembered in leftoverNewline,
    // then every newline match in the raw text, offset by the number of
    // original characters already processed
    sentenceHolder.addNewline(leftoverNewline, lineNumber - 1);
    Matcher matcher = newlinePattern.matcher(rawText);
    while (matcher.find()) {
        sentenceHolder.addNewline(originalIndexProcessed + matcher.end(), lineNumber++);
        leftoverNewline = originalIndexProcessed + matcher.end();
    }
    // index in the processed text -> index in rawText at which the processed
    // text was (re)synchronised with the raw text
    Map<Integer, Integer> insertionPoints = new TreeMap<Integer, Integer>();
    StringBuilder processedText = new StringBuilder();
    // position in rawText from which text has not yet been copied to
    // processedText
    int currentPos = 0;
    // position in rawText from which text has not yet been sent to output
    int outputPos = 0;
    for (int markPos : markMap.keySet()) {
        List<Pair<Boolean, Annotation<RawTextMarker>>> marks = markMap.get(markPos);
        // collect all instructions at a given position
        List<Triple<RawTextInstruction, Annotation<RawTextMarker>, Boolean>> instructions = new ArrayList<>();
        for (Pair<Boolean, Annotation<RawTextMarker>> mark : marks) {
            boolean isStart = mark.getLeft();
            Annotation<RawTextMarker> annotation = mark.getRight();
            RawTextMarker marker = annotation.getData();
            List<RawTextInstruction> actions = new ArrayList<>();
            if (isStart) {
                // start of annotation: translate the marker type into the
                // instructions to apply where the annotation begins
                switch(marker.getType()) {
                    case SKIP:
                        actions.add(RawTextInstruction.PUSH_SKIP);
                        break;
                    case SENTENCE_BREAK:
                        break;
                    case SPACE:
                        actions.add(RawTextInstruction.INSERT);
                        actions.add(RawTextInstruction.PUSH_SKIP);
                        break;
                    case REPLACE:
                        actions.add(RawTextInstruction.INSERT);
                        actions.add(RawTextInstruction.PUSH_SKIP);
                        break;
                    case OUTPUT:
                        actions.add(RawTextInstruction.PUSH_OUTPUT);
                        actions.add(RawTextInstruction.PUSH_SKIP);
                        break;
                    case INCLUDE:
                        actions.add(RawTextInstruction.PUSH_INCLUDE);
                        break;
                    case OUTPUT_START:
                        actions.add(RawTextInstruction.PUSH_OUTPUT);
                        break;
                    case STOP:
                        actions.add(RawTextInstruction.STOP);
                        break;
                    case NO_SENTENCE_BREAK:
                    case NONE:
                    case OUTPUT_STOP:
                    case START:
                    case TAG:
                        break;
                }
            } else {
                // end of annotation
                switch(marker.getType()) {
                    case SKIP:
                    case SPACE:
                    case REPLACE:
                        actions.add(RawTextInstruction.POP_SKIP);
                        break;
                    case SENTENCE_BREAK:
                        actions.add(RawTextInstruction.SENTENCE_BREAK);
                        break;
                    case OUTPUT:
                        actions.add(RawTextInstruction.STOP_OUTPUT);
                        actions.add(RawTextInstruction.POP_SKIP);
                        break;
                    case INCLUDE:
                        actions.add(RawTextInstruction.POP_INCLUDE);
                        break;
                    case START:
                        actions.add(RawTextInstruction.START);
                        break;
                    case OUTPUT_STOP:
                        actions.add(RawTextInstruction.STOP_OUTPUT);
                        break;
                    case NO_SENTENCE_BREAK:
                    case NONE:
                    case OUTPUT_START:
                    case STOP:
                    case TAG:
                        break;
                }
            }
            for (RawTextInstruction action : actions) {
                instructions.add(new ImmutableTriple<>(action, annotation, isStart));
            }
        }
        // sort the instructions to ensure they're applied in the correct
        // order (the natural ordering of RawTextInstruction, via compareTo)
        instructions = instructions.stream().sorted((i1, i2) -> i1.getLeft().compareTo(i2.getLeft())).collect(Collectors.toList());
        for (Triple<RawTextInstruction, Annotation<RawTextMarker>, Boolean> triple : instructions) {
            RawTextInstruction instruction = triple.getLeft();
            Annotation<RawTextMarker> annotation = triple.getMiddle();
            RawTextMarker marker = annotation.getData();
            boolean isStart = triple.getRight();
            int position = isStart ? annotation.getStart() : annotation.getEnd();
            // position of the mark inside rawText
            int relativePosition = position - textStartPos;
            if (LOG.isTraceEnabled()) {
                LOG.trace((isStart ? "Start " : "Stop ") + marker.getType() + " at " + position + ", relative pos: " + relativePosition);
                LOG.trace("instruction: " + instruction);
                LOG.trace("Stack before: " + shouldProcessStack);
                LOG.trace("Text before: " + processedText.toString());
                LOG.trace("Added by filter: " + marker.getSource());
                LOG.trace("Match text: " + this.getText().subSequence(annotation.getStart(), annotation.getEnd()).toString().replace('\n', '¶').replace('\r', '¶'));
            }
            // current state, as given by the top of each stack
            boolean shouldProcess = shouldProcessStack.peek();
            boolean shouldOutput = shouldOutputStack.peek();
            switch(instruction) {
                case PUSH_SKIP:
                    // flush the text processed so far, then stop processing
                    if (shouldProcess) {
                        insertionPoints.put(processedText.length(), currentPos);
                        processedText.append(rawText.subSequence(currentPos, relativePosition));
                        if (shouldOutput) {
                            outputPos = relativePosition;
                        }
                    }
                    shouldProcessStack.push(false);
                    break;
                case PUSH_OUTPUT:
                    if (!shouldOutput && !shouldProcess) {
                        outputPos = relativePosition;
                    }
                    shouldOutputStack.push(true);
                    break;
                case PUSH_INCLUDE:
                    // resume processing from this position; any pending
                    // output text is emitted first
                    if (!shouldProcess) {
                        currentPos = relativePosition;
                        if (shouldOutput) {
                            CharSequence outputText = rawText.subSequence(outputPos, relativePosition);
                            this.addOutputText(sentenceHolder, processedText.length(), outputText);
                            outputPos = relativePosition;
                        }
                    }
                    shouldProcessStack.push(true);
                    break;
                case SPACE:
                    // flush pending text and make sure it ends with a space
                    if (shouldProcess) {
                        insertionPoints.put(processedText.length(), currentPos);
                        CharSequence leftoverText = rawText.subSequence(currentPos, relativePosition);
                        processedText.append(leftoverText);
                        currentPos = relativePosition;
                        if (leftoverText.length() > 0 && leftoverText.charAt(leftoverText.length() - 1) != ' ') {
                            insertionPoints.put(processedText.length(), currentPos);
                            processedText.append(" ");
                        }
                    }
                    break;
                case INSERT:
                    // flush pending text, then append the marker's insertion
                    // text; every inserted character maps back to the same raw
                    // text position
                    if (shouldProcess) {
                        insertionPoints.put(processedText.length(), currentPos);
                        CharSequence leftoverText = rawText.subSequence(currentPos, relativePosition);
                        processedText.append(leftoverText);
                        currentPos = relativePosition;
                        for (int i = 0; i < marker.getInsertionText().length(); i++) {
                            insertionPoints.put(processedText.length() + i, currentPos);
                        }
                        if (LOG.isTraceEnabled())
                            LOG.trace("Inserting: " + marker.getInsertionText());
                        processedText.append(marker.getInsertionText());
                    }
                    break;
                case SENTENCE_BREAK:
                    {
                        if (shouldProcess) {
                            insertionPoints.put(processedText.length(), currentPos);
                            CharSequence leftoverText = rawText.subSequence(currentPos, relativePosition);
                            processedText.append(leftoverText);
                            currentPos = relativePosition;
                        }
                        // add the sentence boundary after the last character that
                        // was added
                        sentenceHolder.addSentenceBoundary(processedText.length());
                        if (LOG.isTraceEnabled()) {
                            // trace only: show up to NUM_CHARS characters of
                            // context before the boundary character
                            int boundary = processedText.length() - 1;
                            if (boundary >= 0) {
                                String string = null;
                                int start1 = boundary - NUM_CHARS;
                                if (start1 < 0)
                                    start1 = 0;
                                String startString = processedText.subSequence(start1, boundary).toString();
                                String middleString = "" + processedText.charAt(boundary);
                                string = startString + "[" + middleString + "]";
                                string = string.replace('\n', '¶');
                                LOG.trace("Adding sentence break at position " + boundary + ": " + string);
                            }
                        }
                        // separate the sentences with a space if there is none
                        if (shouldProcess) {
                            if (processedText.length() > 0 && processedText.charAt(processedText.length() - 1) != ' ') {
                                insertionPoints.put(processedText.length(), currentPos);
                                processedText.append(" ");
                            }
                        }
                        break;
                    }
                case POP_SKIP:
                case POP_INCLUDE:
                case STOP:
                case START:
                    {
                        // instructions which change the processing state:
                        // remember the state before, update the stack, then
                        // react to the transition
                        boolean wasProcessing = shouldProcess;
                        boolean wasOutputting = shouldOutput && !shouldProcess;
                        if (instruction == RawTextInstruction.POP_SKIP || instruction == RawTextInstruction.POP_INCLUDE) {
                            shouldProcessStack.pop();
                        } else if (instruction == RawTextInstruction.STOP) {
                            // replace the top of the stack rather than pushing
                            shouldProcessStack.pop();
                            shouldProcessStack.push(false);
                        } else if (instruction == RawTextInstruction.START) {
                            shouldProcessStack.pop();
                            shouldProcessStack.push(true);
                        }
                        shouldProcess = shouldProcessStack.peek();
                        shouldOutput = shouldOutput && !shouldProcess;
                        if (wasProcessing && !shouldProcess) {
                            // processing just stopped: flush the pending text
                            insertionPoints.put(processedText.length(), currentPos);
                            processedText.append(rawText.subSequence(currentPos, relativePosition));
                        } else if (!wasProcessing && shouldProcess) {
                            // processing just started: copy from here onwards
                            currentPos = relativePosition;
                        }
                        if (wasOutputting && (!shouldOutput || !shouldProcess)) {
                            CharSequence outputText = rawText.subSequence(outputPos, relativePosition);
                            this.addOutputText(sentenceHolder, processedText.length(), outputText);
                            outputPos = relativePosition;
                        } else if (!wasOutputting && (shouldOutput && !shouldProcess)) {
                            outputPos = relativePosition;
                        }
                        // shouldOutput?
                        break;
                    }
                case POP_OUTPUT:
                case STOP_OUTPUT:
                case START_OUTPUT:
                    {
                        // instructions which change the output state, handled
                        // like the processing state changes above
                        boolean wasOutputting = shouldOutput && !shouldProcess;
                        if (instruction == RawTextInstruction.POP_OUTPUT) {
                            shouldOutputStack.pop();
                        } else if (instruction == RawTextInstruction.STOP_OUTPUT) {
                            shouldOutputStack.pop();
                            shouldOutputStack.push(false);
                        } else if (instruction == RawTextInstruction.START_OUTPUT) {
                            shouldOutputStack.pop();
                            shouldOutputStack.push(true);
                        }
                        shouldOutput = shouldOutputStack.peek();
                        if (wasOutputting && (!shouldOutput || !shouldProcess)) {
                            CharSequence outputText = rawText.subSequence(outputPos, relativePosition);
                            this.addOutputText(sentenceHolder, processedText.length(), outputText);
                            outputPos = relativePosition;
                        } else if (!wasOutputting && (shouldOutput && !shouldProcess)) {
                            outputPos = relativePosition;
                        }
                        // shouldOutput?
                        break;
                    }
            }
            if (LOG.isTraceEnabled()) {
                LOG.trace("Stack after: " + shouldProcessStack);
                LOG.trace("Text after: " + processedText.toString());
            }
        }
    // next action
    }
    // next text marker
    // all marks handled: deal with the text remaining after the last mark
    boolean shouldProcess = shouldProcessStack.peek();
    boolean shouldOutput = shouldOutputStack.peek();
    if (shouldProcess) {
        insertionPoints.put(processedText.length(), currentPos);
        processedText.append(rawText.subSequence(currentPos, rawText.length()));
    }
    if (shouldOutput && !shouldProcess) {
        // output text still pending is accumulated in leftoverOutput
        leftoverOutput = leftoverOutput + rawText.subSequence(outputPos, rawText.length());
    }
    String finalProcessedText = processedText.toString();
    if (LOG.isTraceEnabled())
        LOG.trace("Text after processing: " + finalProcessedText);
    sentenceHolder.setProcessedText(finalProcessedText);
    // register one original index per processed text character: between two
    // insertion points, original indexes advance one by one from the raw text
    // position recorded at the earlier insertion point
    int lastIndex = 0;
    int lastOriginalIndex = 0;
    for (Entry<Integer, Integer> insertionPoint : insertionPoints.entrySet()) {
        int j = 0;
        for (int i = lastIndex; i < insertionPoint.getKey(); i++) {
            sentenceHolder.addOriginalIndex(originalIndexProcessed + lastOriginalIndex + j);
            j++;
        }
        lastIndex = insertionPoint.getKey();
        lastOriginalIndex = insertionPoint.getValue();
    }
    // same for the characters following the last insertion point
    if (lastIndex < sentenceHolder.getProcessedText().length()) {
        int j = 0;
        for (int i = lastIndex; i < sentenceHolder.getProcessedText().length(); i++) {
            sentenceHolder.addOriginalIndex(originalIndexProcessed + lastOriginalIndex + j);
            j++;
        }
    }
    // the whole raw text now counts as processed
    originalIndexProcessed += rawText.length();
    // cache the result for subsequent calls
    this.sentenceHolder = sentenceHolder;
    return sentenceHolder;
}
Also used : Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) ArrayList(java.util.ArrayList) List(java.util.List) ImmutablePair(org.apache.commons.lang3.tuple.ImmutablePair) Pair(org.apache.commons.lang3.tuple.Pair) TreeMap(java.util.TreeMap) Annotation(com.joliciel.talismane.Annotation) ImmutableTriple(org.apache.commons.lang3.tuple.ImmutableTriple) Triple(org.apache.commons.lang3.tuple.Triple)

Example 3 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

the class RawTextProcessor method getDetectedSentences.

/**
 * Returns the complete sentences currently detected. If the last detected
 * sentence is incomplete, it is removed from the result and stored as the
 * leftover, to be handed to the sentence holder on a later call.
 *
 * @return the complete sentences detected in the current sentence holder;
 *         never contains a trailing incomplete sentence.
 */
public final List<Sentence> getDetectedSentences() {
    SentenceHolder previous = this.getPreviousSentenceHolder();
    SentenceHolder current = this.getCurrentSentenceHolder();
    // register both ends of each boundary with the current holder, shifted
    // back by the length of the previous holder's processed text
    for (Annotation<SentenceBoundary> boundary : sentenceBoundaries) {
        int shiftedStart = boundary.getStart() - previous.getProcessedText().length();
        current.addSentenceBoundary(shiftedStart);
        int shiftedEnd = boundary.getEnd() - previous.getProcessedText().length();
        current.addSentenceBoundary(shiftedEnd);
    }

    List<Sentence> detected = current.getDetectedSentences(leftover);
    leftover = null;
    if (!detected.isEmpty()) {
        int trailingIndex = detected.size() - 1;
        Sentence trailing = detected.get(trailingIndex);
        if (!trailing.isComplete()) {
            // keep the incomplete sentence aside for the next round
            leftover = trailing;
            if (LOG.isTraceEnabled()) {
                LOG.trace("Set leftover to: " + leftover.toString());
            }
            detected.remove(trailingIndex);
        }
    }

    // ensure that sentence annotations get added to the raw text as well
    for (Sentence detectedSentence : detected) {
        // origin captured at the time the observer is created
        final int origin = this.originalStartIndex;
        AnnotationObserver relay = new AnnotationObserver() {

            @Override
            public <T extends Serializable> void beforeAddAnnotations(AnnotatedText subject, List<Annotation<T>> annotations) {
                List<Annotation<T>> translated = new ArrayList<>();
                for (Annotation<T> received : annotations) {
                    // map sentence positions to original indexes, then make
                    // them relative to the captured origin
                    int start = detectedSentence.getOriginalIndex(received.getStart()) - origin;
                    int end = detectedSentence.getOriginalIndex(received.getEnd()) - origin;
                    translated.add(received.getAnnotation(start, end));
                }
                RawTextProcessor.this.addAnnotations(translated);
            }

            @Override
            public <T extends Serializable> void afterAddAnnotations(AnnotatedText subject) {
                // nothing to do after the annotations have been added
            }
        };
        detectedSentence.addObserver(relay);
    }

    if (current.getOriginalTextSegments().size() == 0) {
        return detected;
    }

    // the current holder has original text segments: append them to the
    // leftover's original text, creating an empty leftover if there is none
    if (leftover == null) {
        leftover = new Sentence("", currentFile, sessionId);
    }
    StringBuilder pendingSegments = new StringBuilder();
    boolean hasPendingText = leftover.getLeftoverOriginalText().length() > 0;
    if (hasPendingText) {
        // the output divider goes between the existing text and the new segments
        pendingSegments.append(TalismaneSession.get(sessionId).getOutputDivider());
    }
    for (String segment : current.getOriginalTextSegments().values()) {
        pendingSegments.append(segment);
    }
    leftover.setLeftoverOriginalText(leftover.getLeftoverOriginalText() + pendingSegments.toString());
    return detected;
}
Also used : SentenceBoundary(com.joliciel.talismane.sentenceDetector.SentenceBoundary) AnnotatedText(com.joliciel.talismane.AnnotatedText) Annotation(com.joliciel.talismane.Annotation) AnnotationObserver(com.joliciel.talismane.AnnotationObserver) ArrayList(java.util.ArrayList) List(java.util.List)

Example 4 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

the class RollingTextBlock method getRawTextBlock.

/**
 * Builds an {@link AnnotatedText} over the concatenation of block 3 and
 * block 4 of this RollingTextBlock, for annotation by filters. The analysis
 * range passed to the new text is 0 to the length of block 3. As in the
 * original contract, it is assumed that annotations crossing blocks 2 and 3
 * were already added by a predecessor.
 * <br>
 * <br>
 * An observer is attached to the returned text: whenever a non-empty list of
 * annotations is about to be added to it, copies of these annotations,
 * shifted right by the combined length of blocks 1 and 2, are added to this
 * RollingTextBlock.
 *
 * @return the newly created annotated text covering blocks 3 and 4.
 */
public AnnotatedText getRawTextBlock() {
    String blockText = this.block3 + this.block4;
    AnnotatedText rawTextBlock = new AnnotatedText(blockText, 0, this.block3.length());

    AnnotationObserver shiftingObserver = new AnnotationObserver() {

        @Override
        public <T extends Serializable> void beforeAddAnnotations(AnnotatedText subject, List<Annotation<T>> annotations) {
            // nothing to forward
            if (annotations.isEmpty()) {
                return;
            }

            // positions in the raw text block are relative to the start of
            // block 3, so shift them by the length of blocks 1 and 2
            int shift = RollingTextBlock.this.block1.length() + RollingTextBlock.this.block2.length();
            List<Annotation<T>> shifted = new ArrayList<>();
            for (Annotation<T> received : annotations) {
                shifted.add(received.getAnnotation(received.getStart() + shift, received.getEnd() + shift));
            }
            RollingTextBlock.this.addAnnotations(shifted);

            if (!LOG.isTraceEnabled()) {
                return;
            }
            LOG.trace("RawTextBlock Annotations received: " + annotations);
            LOG.trace("RawTextBlock Annotations added: " + shifted);
        }

        @Override
        public <T extends Serializable> void afterAddAnnotations(AnnotatedText subject) {
            // intentionally empty: all forwarding is done before the add
        }
    };

    rawTextBlock.addObserver(shiftingObserver);
    return rawTextBlock;
}
Also used : AnnotatedText(com.joliciel.talismane.AnnotatedText) List(java.util.List) ArrayList(java.util.ArrayList) Annotation(com.joliciel.talismane.Annotation) AnnotationObserver(com.joliciel.talismane.AnnotationObserver)

Example 5 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

the class Tokeniser method tokeniseWithDecisions.

/**
 * Similar to {@link #tokeniseWithDecisions(String)}, but the text to be
 * tokenised is contained within a Sentence object. <br>
 * <br>
 * Each resulting atomic token sequence is converted to a token sequence and
 * passed through this tokeniser's token filters. For the first resulting
 * sequence only, one {@link TokenBoundary} annotation per token is added to
 * the sentence.
 *
 * @param sentence
 *          the sentence to tokenise
 * @param labels
 *          the labels to add to any annotations added.
 * @return the tokenised atomic token sequences produced for this sentence,
 *         in the order in which they were returned by the internal
 *         tokenisation step.
 * @throws TalismaneException
 *           if raised by one of the underlying tokenisation steps
 * @throws IOException
 *           if raised by one of the underlying tokenisation steps
 */
public List<TokenisedAtomicTokenSequence> tokeniseWithDecisions(Sentence sentence, String... labels) throws TalismaneException, IOException {
    // Start from the default tokens, found using the separators provided
    TokenSequence defaultSequence = new TokenSequence(sentence, this.sessionId);
    defaultSequence.findDefaultTokens();

    List<TokenisedAtomicTokenSequence> sequences = this.tokeniseInternal(defaultSequence, sentence);

    LOG.debug("####Final token sequences:");
    int sequenceNumber = 0;
    for (TokenisedAtomicTokenSequence atomicSequence : sequences) {
        sequenceNumber++;

        TokenSequence inferredSequence = atomicSequence.inferTokenSequence();
        for (TokenFilter tokenFilter : filters) {
            tokenFilter.apply(inferredSequence);
        }

        // only the very first token sequence contributes annotations
        boolean isFirstSequence = (sequenceNumber == 1);
        if (isFirstSequence) {
            List<Annotation<TokenBoundary>> boundaries = new ArrayList<>();
            for (Token token : inferredSequence) {
                boundaries.add(new Annotation<>(
                        token.getStartIndex(),
                        token.getEndIndex(),
                        new TokenBoundary(token.getText(), token.getAnalyisText(), token.getAttributes()),
                        labels));
            }
            sentence.addAnnotations(boundaries);
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug("Token sequence " + sequenceNumber);
            LOG.debug("Atomic sequence: " + atomicSequence);
            LOG.debug("Resulting sequence: " + inferredSequence);
        }
    }
    return sequences;
}
Also used : ArrayList(java.util.ArrayList) Annotation(com.joliciel.talismane.Annotation) TokenFilter(com.joliciel.talismane.tokeniser.filters.TokenFilter)

Aggregations

Annotation (com.joliciel.talismane.Annotation): 36, TalismaneTest (com.joliciel.talismane.TalismaneTest): 28, Test (org.junit.Test): 28, ArrayList (java.util.ArrayList): 23, Config (com.typesafe.config.Config): 22, AnnotatedText (com.joliciel.talismane.AnnotatedText): 20, Sentence (com.joliciel.talismane.rawText.Sentence): 12, RawTextSkipMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker): 11, List (java.util.List): 7, RawTextNoSentenceBreakMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker): 6, RawTextSentenceBreakMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker): 6, RawTextReplaceMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextReplaceMarker): 4, TokenPlaceholder (com.joliciel.talismane.sentenceAnnotators.TokenPlaceholder): 4, SentenceBoundary (com.joliciel.talismane.sentenceDetector.SentenceBoundary): 4, TokenAttribute (com.joliciel.talismane.tokeniser.TokenAttribute): 4, Matcher (java.util.regex.Matcher): 4, AnnotationObserver (com.joliciel.talismane.AnnotationObserver): 3, Decision (com.joliciel.talismane.machineLearning.Decision): 3, DecisionMaker (com.joliciel.talismane.machineLearning.DecisionMaker): 3, SentenceDetectorFeature (com.joliciel.talismane.sentenceDetector.features.SentenceDetectorFeature): 3