Search in sources :

Example 1 with RawTextReplaceMarker

use of com.joliciel.talismane.rawText.RawTextMarker.RawTextReplaceMarker in project talismane by joliciel-informatique.

the class RawTextTest method testGetProcessedText.

/**
 * Annotates a raw text with two skip markers and one replace marker, then
 * checks that the processed text has the skipped spans removed and the
 * replaced span substituted.
 */
@Test
public void testGetProcessedText() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();
    final String sessionId = "test";
    final String[] labels = new String[0];

    final String text = "1 2 3<skip>skip</skip> 4<skip>skip</skip> five";
    final RawText rawText = new RawText(text, true, sessionId);

    // Successive prefixes of the raw text: their lengths are the annotation offsets.
    final String skipTag = "<skip>skip</skip>";
    final String upToFirstSkip = "1 2 3";
    final String pastFirstSkip = upToFirstSkip + skipTag;
    final String upToSecondSkip = pastFirstSkip + " 4";
    final String pastSecondSkip = upToSecondSkip + skipTag;
    final String upToReplace = pastSecondSkip + " ";
    final String pastReplace = upToReplace + "five";

    // Both <skip>...</skip> spans are marked to be skipped.
    final List<Annotation<RawTextSkipMarker>> skips = new ArrayList<>();
    skips.add(new Annotation<>(upToFirstSkip.length(), pastFirstSkip.length(), new RawTextSkipMarker("me"), labels));
    skips.add(new Annotation<>(upToSecondSkip.length(), pastSecondSkip.length(), new RawTextSkipMarker("me"), labels));
    rawText.addAnnotations(skips);

    // The trailing "five" is marked to be replaced by "5".
    final List<Annotation<RawTextReplaceMarker>> replaces = new ArrayList<>();
    replaces.add(new Annotation<>(upToReplace.length(), pastReplace.length(), new RawTextReplaceMarker("me", "5"), labels));
    rawText.addAnnotations(replaces);

    final AnnotatedText processedTextBlock = rawText.getProcessedText();
    assertEquals("1 2 3 4 5", processedTextBlock.getText());
}
Also used : RawTextReplaceMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextReplaceMarker) RawTextSkipMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker) AnnotatedText(com.joliciel.talismane.AnnotatedText) Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) Annotation(com.joliciel.talismane.Annotation) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 2 with RawTextReplaceMarker

use of com.joliciel.talismane.rawText.RawTextMarker.RawTextReplaceMarker in project talismane by joliciel-informatique.

the class RawTextRegexAnnotator method annotate.

/**
 * Runs this annotator's pattern over the text block and adds one annotation
 * per configured filter type for every match whose start offset lies in
 * {@code [analysisStart, analysisEnd)} of the block.
 * <p>
 * The annotated span is the whole match when {@code groupIndex} is 0,
 * otherwise the span of the capturing group {@code groupIndex}.
 * {@code TAG} filters produce token-attribute annotations; every other
 * filter type produces a raw-text-marker annotation.
 *
 * @param textBlock the text to search; receives the resulting annotations
 * @param labels labels passed through to every annotation created
 * @throws MatchTooLargeException if {@code blockSize} is positive and an
 *         annotated span is longer than {@code blockSize}; this check is
 *         made before the analysis-range check
 */
@Override
public void annotate(AnnotatedText textBlock, String... labels) throws MatchTooLargeException {
    if (LOG.isTraceEnabled()) {
        LOG.trace("Matching " + regex.replace('\n', '¶').replace('\r', '¶'));
    }

    final List<Annotation<RawTextMarker>> rawTextMarkers = new ArrayList<>();
    final List<Annotation<TokenAttribute<?>>> tokenAttributes = new ArrayList<>();

    final Matcher matcher = pattern.matcher(textBlock.getText());
    while (matcher.find()) {
        // Span to annotate: the full match, or only the requested group.
        final int matcherStart = groupIndex == 0 ? matcher.start() : matcher.start(groupIndex);
        final int matcherEnd = groupIndex == 0 ? matcher.end() : matcher.end(groupIndex);

        // The full match text is kept for tracing and for the error message.
        final CharSequence matchText = textBlock.getText().subSequence(matcher.start(), matcher.end());

        if (LOG.isTraceEnabled()) {
            LOG.trace("Next match: " + matchText.toString().replace('\n', '¶').replace('\r', '¶'));
            if (matcher.start() != matcherStart || matcher.end() != matcherEnd) {
                LOG.trace("But matching group: " + textBlock.getText().subSequence(matcherStart, matcherEnd).toString().replace('\n', '¶').replace('\r', '¶'));
            }
            LOG.trace("matcher.start()=" + matcher.start() + ", matcher.end()=" + matcher.end() + ", matcherStart=" + matcherStart + ", matcherEnd=" + matcherEnd + ", analysisStart=" + textBlock.getAnalysisStart() + ", analysisEnd=" + textBlock.getAnalysisEnd());
        }

        final int spanLength = matcherEnd - matcherStart;
        if (blockSize > 0 && spanLength > blockSize) {
            String errorString = "Match size (" + spanLength + ") bigger than block size (" + blockSize + "). " + "Increase blockSize or change filter. " + "Maybe you need to change a greedy quantifier (e.g. .*) to a reluctant quantifier (e.g. .*?)? " + "Regex: " + regex + ". Text: " + matchText;
            throw new MatchTooLargeException(errorString);
        }

        // Matches starting outside the analysis range are only traced.
        if (matcherStart < textBlock.getAnalysisStart() || matcherStart >= textBlock.getAnalysisEnd()) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("Start out of range: analysisStart " + textBlock.getAnalysisStart() + ">= matcherStart [[" + matcherStart + "]] < analysisEnd " + textBlock.getAnalysisEnd());
            }
            continue;
        }

        if (LOG.isTraceEnabled()) {
            LOG.trace("Start in range: analysisStart " + textBlock.getAnalysisStart() + ">= matcherStart [[" + matcherStart + "]] < analysisEnd " + textBlock.getAnalysisEnd());
        }

        for (RawTextMarkType filterType : filterTypes) {
            // TAG is the only type that does not yield a raw text marker.
            if (filterType == RawTextMarkType.TAG) {
                tokenAttributes.add(new Annotation<TokenAttribute<?>>(matcherStart, matcherEnd, this.attribute, labels));
                continue;
            }

            final RawTextMarker marker;
            switch (filterType) {
                case REPLACE:
                    String insertionText = RegexUtils.getReplacement(replacement, textBlock.getText(), matcher);
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Setting replacement to: " + insertionText);
                    }
                    marker = new RawTextReplaceMarker(this.toString(), insertionText);
                    break;
                case SENTENCE_BREAK:
                    marker = new RawTextSentenceBreakMarker(this.toString());
                    break;
                case NO_SENTENCE_BREAK:
                    marker = new RawTextNoSentenceBreakMarker(this.toString());
                    break;
                case SKIP:
                    marker = new RawTextSkipMarker(this.toString());
                    break;
                default:
                    marker = new RawTextMarker(filterType, this.toString());
                    break;
            }
            rawTextMarkers.add(new Annotation<>(matcherStart, matcherEnd, marker, labels));
        }
    }

    if (!rawTextMarkers.isEmpty()) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("For regex: " + this.regex.replace('\n', '¶').replace('\r', '¶'));
            LOG.debug("Added annotations: " + rawTextMarkers);
        }
        textBlock.addAnnotations(rawTextMarkers);
    }
    if (!tokenAttributes.isEmpty()) {
        textBlock.addAnnotations(tokenAttributes);
    }
}
Also used : RawTextReplaceMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextReplaceMarker) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) Annotation(com.joliciel.talismane.Annotation) RawTextNoSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker) RawTextSkipMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker) TokenAttribute(com.joliciel.talismane.tokeniser.TokenAttribute) RawTextSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker)

Example 3 with RawTextReplaceMarker

use of com.joliciel.talismane.rawText.RawTextMarker.RawTextReplaceMarker in project talismane by joliciel-informatique.

the class RegexMarkerFilterTest method testApplyWithReplacement.

/**
 * Applies a REPLACE annotator with the replacement template
 * {@code Skipped:$1} to a text containing two {@code <skip>} spans and checks
 * the offsets and insertion text of both resulting replace annotations.
 */
@Test
public void testApplyWithReplacement() throws Exception {
    RawTextRegexAnnotator filter = new RawTextRegexAnnotator(RawTextMarkType.REPLACE, "<skip>(.*?)</skip>", 0, 1000);
    filter.setReplacement("Skipped:$1");
    AnnotatedText text = new AnnotatedText("J'ai du <skip>skip me</skip>mal à le croire.<skip>skip this</skip>");
    filter.annotate(text);
    LOG.debug(text.getAnnotations().toString());
    List<Annotation<RawTextReplaceMarker>> replaces = text.getAnnotations(RawTextReplaceMarker.class);
    assertEquals(2, replaces.size());

    // First <skip> span.
    Annotation<RawTextReplaceMarker> first = replaces.get(0);
    assertEquals("J'ai du ".length(), first.getStart());
    assertEquals("J'ai du <skip>skip me</skip>".length(), first.getEnd());
    assertEquals("Skipped:skip me", first.getData().getInsertionText());

    // Second <skip> span. The previous loop compared its counter to 2, a value
    // it never reached with two elements, so these checks were never executed.
    Annotation<RawTextReplaceMarker> second = replaces.get(1);
    assertEquals("J'ai du <skip>skip me</skip>mal à le croire.".length(), second.getStart());
    assertEquals("J'ai du <skip>skip me</skip>mal à le croire.<skip>skip this</skip>".length(), second.getEnd());
    assertEquals("Skipped:skip this", second.getData().getInsertionText());
}
Also used : RawTextReplaceMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextReplaceMarker) AnnotatedText(com.joliciel.talismane.AnnotatedText) Annotation(com.joliciel.talismane.Annotation) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 4 with RawTextReplaceMarker

use of com.joliciel.talismane.rawText.RawTextMarker.RawTextReplaceMarker in project talismane by joliciel-informatique.

the class RollingTextBlockTest method testGetProcessedTextBlock.

/**
 * Rolls several sub-blocks through a {@code RollingTextBlock}, adds skip and
 * replace annotations to the raw text block along the way, and checks the
 * resulting processed text.
 */
@Test
public void testGetProcessedTextBlock() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();
    final String sessionId = "test";
    final String[] labels = new String[0];

    RollingTextBlock textBlock = new RollingTextBlock(true, null, sessionId);
    textBlock = textBlock.roll("1 ");
    textBlock = textBlock.roll("2 ");
    textBlock = textBlock.roll("3<skip>skip</skip> 4<sk");
    textBlock = textBlock.roll("ip>skip</skip> five");

    // the rawTextBlock always contains the last two added sub-blocks
    // so annotations are relative to these sub-blocks
    AnnotatedText rawTextBlock = textBlock.getRawTextBlock();

    // Successive prefixes of the current raw text: their lengths are the skip offsets.
    final String skipTag = "<skip>skip</skip>";
    final String upToFirstSkip = "3";
    final String pastFirstSkip = upToFirstSkip + skipTag;
    final String upToSecondSkip = pastFirstSkip + " 4";
    final String pastSecondSkip = upToSecondSkip + skipTag;

    final List<Annotation<RawTextSkipMarker>> skips = new ArrayList<>();
    skips.add(new Annotation<>(upToFirstSkip.length(), pastFirstSkip.length(), new RawTextSkipMarker("me"), labels));
    skips.add(new Annotation<>(upToSecondSkip.length(), pastSecondSkip.length(), new RawTextSkipMarker("me"), labels));
    rawTextBlock.addAnnotations(skips);

    // After one more roll, offsets are computed from the start of "ip>skip</skip> five".
    textBlock = textBlock.roll(" 6");
    rawTextBlock = textBlock.getRawTextBlock();

    final String upToReplace = "ip>skip</skip> ";
    final String pastReplace = upToReplace + "five";

    final List<Annotation<RawTextReplaceMarker>> replaces = new ArrayList<>();
    replaces.add(new Annotation<>(upToReplace.length(), pastReplace.length(), new RawTextReplaceMarker("me", "5"), labels));
    rawTextBlock.addAnnotations(replaces);

    final AnnotatedText processedTextBlock = textBlock.getProcessedText();
    // the processed text always concerns sub-blocks 1, 2 and 3
    // at this point, sub-block 1 has already been flushed
    assertEquals("2 3 4 5", processedTextBlock.getText());
}
Also used : RawTextReplaceMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextReplaceMarker) AnnotatedText(com.joliciel.talismane.AnnotatedText) RawTextSkipMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker) Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) Annotation(com.joliciel.talismane.Annotation) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Aggregations

Annotation (com.joliciel.talismane.Annotation)4 RawTextReplaceMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextReplaceMarker)4 AnnotatedText (com.joliciel.talismane.AnnotatedText)3 TalismaneTest (com.joliciel.talismane.TalismaneTest)3 RawTextSkipMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker)3 ArrayList (java.util.ArrayList)3 Test (org.junit.Test)3 Config (com.typesafe.config.Config)2 RawTextNoSentenceBreakMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker)1 RawTextSentenceBreakMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker)1 TokenAttribute (com.joliciel.talismane.tokeniser.TokenAttribute)1 Matcher (java.util.regex.Matcher)1