Search in sources :

Example 16 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

the class RegexTokenAnnotatorTest method testPuctuation.

/**
 * Annotates a short French sentence with a punctuation regex carrying a
 * "featureType" attribute, and checks that two token-attribute annotations
 * are produced, the first one covering the full stop right after "Bonjour".
 */
@Test
public void testPuctuation() throws Exception {
    // Point the configuration library at the test configuration and reload it.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config testConfig = ConfigFactory.load();
    final String sessionId = "test";

    // Any run of punctuation, excluding a handful of symbol-like characters.
    final String punctuationRegex = "[\\p{IsPunctuation}&&[^%$#@§¶‰‱]]+";
    // Kept as a typed local so the constructor overload is resolved as before.
    final String noReplacement = null;
    final RegexTokenAnnotator annotator = new RegexTokenAnnotator(punctuationRegex, noReplacement, null, sessionId);
    annotator.addAttribute("featureType", new StringAttribute("featureType", "punctuation"));

    final Sentence sentence = new Sentence("Bonjour. Comment ça va?", sessionId);
    annotator.annotate(sentence);

    @SuppressWarnings("rawtypes")
    final List<Annotation<TokenAttribute>> found = sentence.getAnnotations(TokenAttribute.class);
    LOG.debug(found.toString());
    assertEquals(2, found.size());

    // The first annotation must span exactly the "." that follows "Bonjour".
    @SuppressWarnings("rawtypes")
    final Annotation<TokenAttribute> firstMatch = found.get(0);
    assertEquals("Bonjour".length(), firstMatch.getStart());
    assertEquals("Bonjour.".length(), firstMatch.getEnd());
    assertEquals("featureType", firstMatch.getData().getKey());
    assertEquals("punctuation", firstMatch.getData().getValue());
}
Also used : Config(com.typesafe.config.Config) StringAttribute(com.joliciel.talismane.tokeniser.StringAttribute) TokenAttribute(com.joliciel.talismane.tokeniser.TokenAttribute) Sentence(com.joliciel.talismane.rawText.Sentence) Annotation(com.joliciel.talismane.Annotation) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 17 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

the class RegexTokenAnnotatorTest method testApplyWithConsecutiveDollars.

/**
 * Annotates a sentence containing an e-mail address with a replacement
 * pattern that has an escaped dollar followed by two adjacent group
 * references ({@code $2$1}), and checks the single resulting placeholder's
 * span and replacement text.
 */
@Test
public void testApplyWithConsecutiveDollars() throws Exception {
    // Point the configuration library at the test configuration and reload it.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config testConfig = ConfigFactory.load();
    final String sessionId = "test";

    // Group 1 is the local part, group 2 is "@domain".
    final String emailRegex = "\\b([\\w.%-]+)(@[-.\\w]+\\.[A-Za-z]{2,4})\\b";
    final String replacementPattern = "\\$Email$2$1";
    final RegexTokenAnnotator annotator = new RegexTokenAnnotator(emailRegex, replacementPattern, null, sessionId);

    final Sentence sentence = new Sentence("My address is joe.schmoe@test.com.", sessionId);
    annotator.annotate(sentence);

    final List<Annotation<TokenPlaceholder>> found = sentence.getAnnotations(TokenPlaceholder.class);
    LOG.debug(found.toString());
    assertEquals(1, found.size());

    // The placeholder covers the address only, not the trailing full stop.
    final Annotation<TokenPlaceholder> emailPlaceholder = found.get(0);
    assertEquals(14, emailPlaceholder.getStart());
    assertEquals(33, emailPlaceholder.getEnd());
    assertEquals("$Email@test.comjoe.schmoe", emailPlaceholder.getData().getReplacement());
}
Also used : Config(com.typesafe.config.Config) Sentence(com.joliciel.talismane.rawText.Sentence) Annotation(com.joliciel.talismane.Annotation) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 18 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

the class RegexTokenAnnotatorTest method testStartOfInput.

/**
 * Annotates a sentence in which "Résumé" occurs twice with a regex anchored
 * at the start of input, and checks that only one "TAG" annotation is
 * produced, spanning offsets 0 to 7.
 */
@Test
public void testStartOfInput() throws Exception {
    // Point the configuration library at the test configuration and reload it.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config testConfig = ConfigFactory.load();
    final String sessionId = "test";

    final String anchoredRegex = "^Résumé\\.";
    // Kept as a typed local so the constructor overload is resolved as before.
    final String noReplacement = null;
    final RegexTokenAnnotator annotator = new RegexTokenAnnotator(anchoredRegex, noReplacement, null, sessionId);
    annotator.addAttribute("TAG", new StringAttribute("TAG", "skip"));

    final Sentence sentence = new Sentence("Résumé. Résumé des attaques", sessionId);
    annotator.annotate(sentence);

    @SuppressWarnings("rawtypes")
    final List<Annotation<TokenAttribute>> found = sentence.getAnnotations(TokenAttribute.class);
    LOG.debug(found.toString());
    assertEquals(1, found.size());

    // Only the leading "Résumé." is expected to be annotated.
    @SuppressWarnings("rawtypes")
    final Annotation<TokenAttribute> leadingMatch = found.get(0);
    assertEquals(0, leadingMatch.getStart());
    assertEquals(7, leadingMatch.getEnd());
    assertEquals("TAG", leadingMatch.getData().getKey());
}
Also used : Config(com.typesafe.config.Config) StringAttribute(com.joliciel.talismane.tokeniser.StringAttribute) TokenAttribute(com.joliciel.talismane.tokeniser.TokenAttribute) Sentence(com.joliciel.talismane.rawText.Sentence) Annotation(com.joliciel.talismane.Annotation) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 19 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

the class RawTextRegexAnnotator method annotate.

/**
 * Runs this annotator's regex over {@code textBlock} and, for every match whose
 * (group) start lies inside the block's analysis range, adds one annotation per
 * configured filter type.
 *
 * <p>
 * {@code TAG} filter types produce a token-attribute annotation carrying
 * {@code this.attribute}; every other filter type produces a
 * {@link RawTextMarker} annotation. The annotated span is the capturing group
 * selected by {@code groupIndex} (group 0 being the whole match).
 *
 * @param textBlock
 *            the text to search; annotations are added to it once all matches
 *            have been collected
 * @param labels
 *            labels passed through to every annotation created
 * @throws MatchTooLargeException
 *             if {@code blockSize} is positive and a matched span is longer
 *             than {@code blockSize}
 */
@Override
public void annotate(AnnotatedText textBlock, String... labels) throws MatchTooLargeException {
    if (LOG.isTraceEnabled()) {
        LOG.trace("Matching " + regex.replace('\n', '¶').replace('\r', '¶'));
    }
    List<Annotation<RawTextMarker>> rawTextMarkers = new ArrayList<>();
    List<Annotation<TokenAttribute<?>>> tokenAttributes = new ArrayList<>();
    Matcher matcher = pattern.matcher(textBlock.getText());
    while (matcher.find()) {
        // Group 0 denotes the whole match, so start(0)/end(0) are the same as
        // start()/end() and no special case is needed for groupIndex == 0.
        int matcherStart = matcher.start(groupIndex);
        int matcherEnd = matcher.end(groupIndex);
        CharSequence matchText = textBlock.getText().subSequence(matcher.start(), matcher.end());
        if (LOG.isTraceEnabled()) {
            LOG.trace("Next match: " + matchText.toString().replace('\n', '¶').replace('\r', '¶'));
            if (matcher.start() != matcherStart || matcher.end() != matcherEnd) {
                // A group that did not participate in the match reports -1 for
                // both offsets; taking a sub-sequence with those would throw, so
                // tracing must not attempt it.
                if (matcherStart >= 0 && matcherEnd >= matcherStart) {
                    LOG.trace("But matching group: " + textBlock.getText().subSequence(matcherStart, matcherEnd).toString().replace('\n', '¶').replace('\r', '¶'));
                } else {
                    LOG.trace("But group " + groupIndex + " did not participate in this match");
                }
            }
            LOG.trace("matcher.start()=" + matcher.start() + ", matcher.end()=" + matcher.end() + ", matcherStart=" + matcherStart + ", matcherEnd=" + matcherEnd + ", analysisStart=" + textBlock.getAnalysisStart() + ", analysisEnd=" + textBlock.getAnalysisEnd());
        }
        if (blockSize > 0 && matcherEnd - matcherStart > blockSize) {
            String errorString = "Match size (" + (matcherEnd - matcherStart) + ") bigger than block size (" + blockSize + "). " + "Increase blockSize or change filter. " + "Maybe you need to change a greedy quantifier (e.g. .*) to a reluctant quantifier (e.g. .*?)? " + "Regex: " + regex + ". Text: " + matchText;
            throw new MatchTooLargeException(errorString);
        }
        // Only matches that START inside the analysis range are annotated; the
        // end of the match is allowed to run past analysisEnd.
        if (matcherStart >= textBlock.getAnalysisStart() && matcherStart < textBlock.getAnalysisEnd()) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("Start in range: analysisStart " + textBlock.getAnalysisStart() + ">= matcherStart [[" + matcherStart + "]] < analysisEnd " + textBlock.getAnalysisEnd());
            }
            for (RawTextMarkType filterType : filterTypes) {
                if (filterType == RawTextMarkType.TAG) {
                    // TAG is the only type that yields a token attribute rather
                    // than a raw text marker.
                    tokenAttributes.add(new Annotation<TokenAttribute<?>>(matcherStart, matcherEnd, this.attribute, labels));
                    continue;
                }
                RawTextMarker marker;
                switch(filterType) {
                    case REPLACE:
                        {
                            String insertionText = RegexUtils.getReplacement(replacement, textBlock.getText(), matcher);
                            if (LOG.isTraceEnabled()) {
                                LOG.trace("Setting replacement to: " + insertionText);
                            }
                            marker = new RawTextReplaceMarker(this.toString(), insertionText);
                            break;
                        }
                    case SENTENCE_BREAK:
                        marker = new RawTextSentenceBreakMarker(this.toString());
                        break;
                    case NO_SENTENCE_BREAK:
                        marker = new RawTextNoSentenceBreakMarker(this.toString());
                        break;
                    case SKIP:
                        marker = new RawTextSkipMarker(this.toString());
                        break;
                    default:
                        // Any other type gets a generic marker tagged with its type.
                        marker = new RawTextMarker(filterType, this.toString());
                        break;
                }
                Annotation<RawTextMarker> annotation = new Annotation<>(matcherStart, matcherEnd, marker, labels);
                rawTextMarkers.add(annotation);
            }
        } else {
            if (LOG.isTraceEnabled()) {
                LOG.trace("Start out of range: analysisStart " + textBlock.getAnalysisStart() + ">= matcherStart [[" + matcherStart + "]] < analysisEnd " + textBlock.getAnalysisEnd());
            }
        }
    }
    // Annotations are added in bulk after the scan, raw text markers first.
    if (!rawTextMarkers.isEmpty()) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("For regex: " + this.regex.replace('\n', '¶').replace('\r', '¶'));
            LOG.debug("Added annotations: " + rawTextMarkers);
        }
        textBlock.addAnnotations(rawTextMarkers);
    }
    if (!tokenAttributes.isEmpty()) {
        textBlock.addAnnotations(tokenAttributes);
    }
}
Also used : RawTextReplaceMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextReplaceMarker) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) Annotation(com.joliciel.talismane.Annotation) RawTextNoSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker) RawTextSkipMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker) TokenAttribute(com.joliciel.talismane.tokeniser.TokenAttribute) RawTextSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker)

Example 20 with Annotation

use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.

the class RawTextProcessor method getProcessedText.

/**
 * Return processed text ready for sentence detection.
 *
 * It has sentence break and non-sentence-break annotations inherited from the
 * present RawTextProcessor. Any sentence-break annotations added will
 * automatically get reflected in the current RollingTextBlock.
 *
 * The returned text is the concatenation of the processed text of the
 * previous, current and next sentence holders, in that order. An observer is
 * registered on the returned block so that annotations added to it are
 * translated back to this processor's coordinates and added to this processor
 * as well.
 *
 * @return a new AnnotatedText over the concatenated processed text, built with
 *         the length of the previous holder's processed text and that length
 *         plus the current holder's processed text length as its second and
 *         third constructor arguments (presumably the analysis start and end
 *         - confirm against AnnotatedText)
 */
public final AnnotatedText getProcessedText() {
    LOG.trace("getProcessedTextBlock");
    int textStartPos = this.getTextProcessingStart();
    int textEndPos = this.getTextProcessingEnd();
    SentenceHolder prevHolder = this.getPreviousSentenceHolder();
    SentenceHolder currentHolder = this.getCurrentSentenceHolder();
    SentenceHolder nextHolder = this.getNextSentenceHolder();
    // Build the processed text as previous + current + next holder text.
    StringBuilder sb = new StringBuilder();
    String processedText1 = prevHolder.getProcessedText();
    String processedText2 = currentHolder.getProcessedText();
    String processedText3 = nextHolder.getProcessedText();
    sb.append(processedText1);
    sb.append(processedText2);
    sb.append(processedText3);
    String processedText = sb.toString();
    // myAnnotations: markers held by this processor, in its own coordinates.
    // hisAnnotations: the same markers re-positioned for the returned block.
    List<Annotation<RawTextMarker>> myAnnotations = this.getAnnotations(RawTextMarker.class);
    List<Annotation<RawTextMarker>> hisAnnotations = new ArrayList<>();
    int prevHolderOriginalIndex = prevHolder.getOriginalStartIndex();
    for (Annotation<RawTextMarker> myAnnotation : myAnnotations) {
        // Keep only markers whose start or end falls in [textStartPos, textEndPos).
        if ((myAnnotation.getStart() >= textStartPos && myAnnotation.getStart() < textEndPos) || ((myAnnotation.getEnd() >= textStartPos && myAnnotation.getEnd() < textEndPos))) {
            int originalStart = prevHolderOriginalIndex + myAnnotation.getStart();
            int originalEnd = prevHolderOriginalIndex + myAnnotation.getEnd();
            // A marker starting before the current holder is pinned to the first
            // position after the previous holder's processed text.
            int localStart = processedText1.length();
            if (originalStart >= currentHolder.getOriginalStartIndex())
                localStart += currentHolder.getIndex(originalStart);
            int localEnd = processedText1.length() + currentHolder.getIndex(originalEnd);
            Annotation<RawTextMarker> hisAnnotation = myAnnotation.getAnnotation(localStart, localEnd);
            hisAnnotations.add(hisAnnotation);
        }
    }
    if (LOG.isTraceEnabled()) {
        LOG.trace("raw annotations: " + myAnnotations);
        LOG.trace("processed annotations: " + hisAnnotations);
    }
    AnnotatedText processedTextBlock = new AnnotatedText(processedText, processedText1.length(), processedText1.length() + processedText2.length());
    // The inherited markers are added before the observer is registered.
    // NOTE(review): presumably so they are not echoed back to this processor
    // - confirm against AnnotatedText's observer handling.
    processedTextBlock.addAnnotations(hisAnnotations);
    processedTextBlock.addObserver(new AnnotationObserver() {

        // an observer which adds any annotations added to the
        // processedTextBlock back to myself, at the correct position
        @Override
        public <T extends Serializable> void beforeAddAnnotations(AnnotatedText subject, List<Annotation<T>> annotations) {
            int offset = textStartPos;
            int length1 = prevHolder.getProcessedText().length();
            int length2 = currentHolder.getProcessedText().length();
            int sentence2HolderStart = currentHolder.getOriginalStartIndex();
            List<Annotation<T>> newAnnotations = new ArrayList<>();
            for (Annotation<T> annotation : annotations) {
                // Map the start back to an original index via whichever holder
                // contains it. A start inside the next holder's text is left at
                // -1, so the annotation is dropped.
                int originalStart = -1;
                if (annotation.getStart() < length1)
                    originalStart = prevHolder.getOriginalIndex(annotation.getStart());
                else if (annotation.getStart() < length1 + length2)
                    originalStart = currentHolder.getOriginalIndex(annotation.getStart() - length1);
                if (originalStart >= 0) {
                    // NOTE(review): an end that lies inside the previous holder's
                    // text (getEnd() < length1) passes a negative index to
                    // currentHolder.getOriginalIndex - confirm that is handled.
                    int originalEnd = -1;
                    if (annotation.getEnd() <= length1 + length2)
                        originalEnd = currentHolder.getOriginalIndex(annotation.getEnd() - length1);
                    else
                        originalEnd = nextHolder.getOriginalIndex(annotation.getEnd() - (length1 + length2));
                    if (originalEnd >= 0) {
                        // Re-express both ends relative to the current holder's
                        // original start, shifted by textStartPos.
                        Annotation<T> newAnnotation = annotation.getAnnotation(originalStart - sentence2HolderStart + offset, originalEnd - sentence2HolderStart + offset);
                        newAnnotations.add(newAnnotation);
                        // The annotation recorded here is the incoming one, with
                        // processed-text positions, not the re-positioned copy.
                        if (annotation.getData() instanceof SentenceBoundary) {
                            @SuppressWarnings("unchecked") Annotation<SentenceBoundary> sentenceBoundary = (Annotation<SentenceBoundary>) annotation;
                            sentenceBoundaries.add(sentenceBoundary);
                        }
                    }
                }
            }
            RawTextProcessor.this.addAnnotations(newAnnotations);
            if (LOG.isTraceEnabled()) {
                LOG.trace("ProcessedTextBlock Annotations received: " + annotations);
                LOG.trace("ProcessedTextBlock Annotations added: " + newAnnotations);
            }
        }

        // Nothing to do once the annotations have been added.
        @Override
        public <T extends Serializable> void afterAddAnnotations(AnnotatedText subject) {
        }
    });
    return processedTextBlock;
}
Also used : SentenceBoundary(com.joliciel.talismane.sentenceDetector.SentenceBoundary) AnnotatedText(com.joliciel.talismane.AnnotatedText) ArrayList(java.util.ArrayList) Annotation(com.joliciel.talismane.Annotation) AnnotationObserver(com.joliciel.talismane.AnnotationObserver) List(java.util.List)

Aggregations

Annotation (com.joliciel.talismane.Annotation)36 TalismaneTest (com.joliciel.talismane.TalismaneTest)28 Test (org.junit.Test)28 ArrayList (java.util.ArrayList)23 Config (com.typesafe.config.Config)22 AnnotatedText (com.joliciel.talismane.AnnotatedText)20 Sentence (com.joliciel.talismane.rawText.Sentence)12 RawTextSkipMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker)11 List (java.util.List)7 RawTextNoSentenceBreakMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker)6 RawTextSentenceBreakMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker)6 RawTextReplaceMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextReplaceMarker)4 TokenPlaceholder (com.joliciel.talismane.sentenceAnnotators.TokenPlaceholder)4 SentenceBoundary (com.joliciel.talismane.sentenceDetector.SentenceBoundary)4 TokenAttribute (com.joliciel.talismane.tokeniser.TokenAttribute)4 Matcher (java.util.regex.Matcher)4 AnnotationObserver (com.joliciel.talismane.AnnotationObserver)3 Decision (com.joliciel.talismane.machineLearning.Decision)3 DecisionMaker (com.joliciel.talismane.machineLearning.DecisionMaker)3 SentenceDetectorFeature (com.joliciel.talismane.sentenceDetector.features.SentenceDetectorFeature)3