Search in sources:

Example 1 with TokenAttribute

use of com.joliciel.talismane.tokeniser.TokenAttribute in project talismane by joliciel-informatique.

From the class RegexTokenAnnotatorTest, method testPuctuation.

/**
 * Runs a punctuation regex with a "featureType" attribute over a short French sentence and
 * checks that two annotations are produced, the first one covering the full stop that follows
 * "Bonjour" and carrying the configured key/value pair.
 */
@Test
public void testPuctuation() throws Exception {
    // Load the test configuration afresh so that cached settings cannot leak in.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();

    final String sessionId = "test";
    // Any run of punctuation, except the symbols listed in the negated class.
    final String punctuationRegex = "[\\p{IsPunctuation}&&[^%$#@§¶‰‱]]+";
    // Kept as a typed variable so the same constructor overload is selected.
    final String noReplacement = null;

    RegexTokenAnnotator annotator = new RegexTokenAnnotator(punctuationRegex, noReplacement, null, sessionId);
    annotator.addAttribute("featureType", new StringAttribute("featureType", "punctuation"));

    Sentence sentence = new Sentence("Bonjour. Comment ça va?", sessionId);
    annotator.annotate(sentence);

    @SuppressWarnings("rawtypes")
    List<Annotation<TokenAttribute>> found = sentence.getAnnotations(TokenAttribute.class);
    LOG.debug(found.toString());
    assertEquals(2, found.size());

    // The first annotation must span exactly the "." after "Bonjour".
    @SuppressWarnings("rawtypes")
    Annotation<TokenAttribute> first = found.get(0);
    assertEquals("Bonjour".length(), first.getStart());
    assertEquals("Bonjour.".length(), first.getEnd());
    assertEquals("featureType", first.getData().getKey());
    assertEquals("punctuation", first.getData().getValue());
}
Also used : Config(com.typesafe.config.Config) StringAttribute(com.joliciel.talismane.tokeniser.StringAttribute) TokenAttribute(com.joliciel.talismane.tokeniser.TokenAttribute) Sentence(com.joliciel.talismane.rawText.Sentence) Annotation(com.joliciel.talismane.Annotation) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 2 with TokenAttribute

use of com.joliciel.talismane.tokeniser.TokenAttribute in project talismane by joliciel-informatique.

From the class RegexTokenAnnotatorTest, method testStartOfInput.

/**
 * Runs a regex anchored at the start of input ("^Résumé\.") over a sentence that contains
 * "Résumé" twice and checks that only one annotation is produced, spanning offsets 0 to 7 and
 * keyed "TAG".
 */
@Test
public void testStartOfInput() throws Exception {
    // Load the test configuration afresh so that cached settings cannot leak in.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();

    final String sessionId = "test";
    final String anchoredRegex = "^Résumé\\.";
    // Kept as a typed variable so the same constructor overload is selected.
    final String noReplacement = null;

    RegexTokenAnnotator annotator = new RegexTokenAnnotator(anchoredRegex, noReplacement, null, sessionId);
    annotator.addAttribute("TAG", new StringAttribute("TAG", "skip"));

    Sentence sentence = new Sentence("Résumé. Résumé des attaques", sessionId);
    annotator.annotate(sentence);

    @SuppressWarnings("rawtypes")
    List<Annotation<TokenAttribute>> found = sentence.getAnnotations(TokenAttribute.class);
    LOG.debug(found.toString());
    assertEquals(1, found.size());

    // The single annotation covers the leading "Résumé." only.
    @SuppressWarnings("rawtypes")
    Annotation<TokenAttribute> only = found.get(0);
    assertEquals(0, only.getStart());
    assertEquals(7, only.getEnd());
    assertEquals("TAG", only.getData().getKey());
}
Also used : Config(com.typesafe.config.Config) StringAttribute(com.joliciel.talismane.tokeniser.StringAttribute) TokenAttribute(com.joliciel.talismane.tokeniser.TokenAttribute) Sentence(com.joliciel.talismane.rawText.Sentence) Annotation(com.joliciel.talismane.Annotation) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 3 with TokenAttribute

use of com.joliciel.talismane.tokeniser.TokenAttribute in project talismane by joliciel-informatique.

From the class RawTextRegexAnnotator, method annotate.

/**
 * Applies this annotator's regex to the text block and records one annotation per match and per
 * configured filter type.
 * <p>
 * Only matches whose (group) start offset is at or after the block's analysis start and before
 * its analysis end are annotated. The {@code TAG} filter type produces a {@code TokenAttribute}
 * annotation carrying {@code this.attribute}; every other filter type produces a
 * {@code RawTextMarker} annotation. The collected annotations are added to the text block after
 * all matches have been scanned, and only if there are any.
 *
 * @param textBlock the text to search, and to which the annotations are added
 * @param labels labels passed through to every annotation created
 * @throws MatchTooLargeException if {@code blockSize} is positive and the matched span is
 *         longer than {@code blockSize}
 */
@Override
public void annotate(AnnotatedText textBlock, String... labels) throws MatchTooLargeException {
    if (LOG.isTraceEnabled()) {
        LOG.trace("Matching " + regex.replace('\n', '¶').replace('\r', '¶'));
    }
    List<Annotation<RawTextMarker>> rawTextMarkers = new ArrayList<>();
    List<Annotation<TokenAttribute<?>>> tokenAttributes = new ArrayList<>();
    Matcher matcher = pattern.matcher(textBlock.getText());
    while (matcher.find()) {
        // Group 0 denotes the whole match, so start(0)/end(0) equal start()/end() and no
        // special case is needed for groupIndex == 0.
        final int matcherStart = matcher.start(groupIndex);
        final int matcherEnd = matcher.end(groupIndex);
        CharSequence matchText = textBlock.getText().subSequence(matcher.start(), matcher.end());
        if (LOG.isTraceEnabled()) {
            LOG.trace("Next match: " + matchText.toString().replace('\n', '¶').replace('\r', '¶'));
            if (matcherStart < 0) {
                // Matcher reports -1 for a group that did not take part in a successful
                // match; taking a sub-sequence at -1 would throw, so only report the fact.
                LOG.trace("But group " + groupIndex + " did not participate in the match");
            } else if (matcher.start() != matcherStart || matcher.end() != matcherEnd) {
                LOG.trace("But matching group: " + textBlock.getText().subSequence(matcherStart, matcherEnd).toString().replace('\n', '¶').replace('\r', '¶'));
            }
            LOG.trace("matcher.start()=" + matcher.start() + ", matcher.end()=" + matcher.end() + ", matcherStart=" + matcherStart + ", matcherEnd=" + matcherEnd + ", analysisStart=" + textBlock.getAnalysisStart() + ", analysisEnd=" + textBlock.getAnalysisEnd());
        }
        if (blockSize > 0 && matcherEnd - matcherStart > blockSize) {
            String errorString = "Match size (" + (matcherEnd - matcherStart) + ") bigger than block size (" + blockSize + "). " + "Increase blockSize or change filter. " + "Maybe you need to change a greedy quantifier (e.g. .*) to a reluctant quantifier (e.g. .*?)? " + "Regex: " + regex + ". Text: " + matchText;
            throw new MatchTooLargeException(errorString);
        }
        if (matcherStart >= textBlock.getAnalysisStart() && matcherStart < textBlock.getAnalysisEnd()) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("Start in range: analysisStart " + textBlock.getAnalysisStart() + ">= matcherStart [[" + matcherStart + "]] < analysisEnd " + textBlock.getAnalysisEnd());
            }
            for (RawTextMarkType filterType : filterTypes) {
                // Stays null only for TAG, which yields a token attribute instead of a marker.
                RawTextMarker marker = null;
                switch (filterType) {
                    case REPLACE:
                        {
                            String insertionText = RegexUtils.getReplacement(replacement, textBlock.getText(), matcher);
                            if (LOG.isTraceEnabled()) {
                                LOG.trace("Setting replacement to: " + insertionText);
                            }
                            marker = new RawTextReplaceMarker(this.toString(), insertionText);
                            break;
                        }
                    case SENTENCE_BREAK:
                        marker = new RawTextSentenceBreakMarker(this.toString());
                        break;
                    case NO_SENTENCE_BREAK:
                        marker = new RawTextNoSentenceBreakMarker(this.toString());
                        break;
                    case SKIP:
                        marker = new RawTextSkipMarker(this.toString());
                        break;
                    case TAG:
                        {
                            Annotation<TokenAttribute<?>> annotation = new Annotation<TokenAttribute<?>>(matcherStart, matcherEnd, this.attribute, labels);
                            tokenAttributes.add(annotation);
                            break;
                        }
                    default:
                        marker = new RawTextMarker(filterType, this.toString());
                        break;
                }
                if (marker != null) {
                    Annotation<RawTextMarker> annotation = new Annotation<>(matcherStart, matcherEnd, marker, labels);
                    rawTextMarkers.add(annotation);
                }
            }
        } else {
            if (LOG.isTraceEnabled()) {
                LOG.trace("Start out of range: analysisStart " + textBlock.getAnalysisStart() + ">= matcherStart [[" + matcherStart + "]] < analysisEnd " + textBlock.getAnalysisEnd());
            }
        }
    }
    if (!rawTextMarkers.isEmpty()) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("For regex: " + this.regex.replace('\n', '¶').replace('\r', '¶'));
            LOG.debug("Added annotations: " + rawTextMarkers);
        }
        textBlock.addAnnotations(rawTextMarkers);
    }
    if (!tokenAttributes.isEmpty()) {
        textBlock.addAnnotations(tokenAttributes);
    }
}
Also used : RawTextReplaceMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextReplaceMarker) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) Annotation(com.joliciel.talismane.Annotation) RawTextNoSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker) RawTextSkipMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker) TokenAttribute(com.joliciel.talismane.tokeniser.TokenAttribute) RawTextSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker)

Example 4 with TokenAttribute

use of com.joliciel.talismane.tokeniser.TokenAttribute in project talismane by joliciel-informatique.

From the class AbstractRegexAnnotator, method annotate.

/**
 * Scans the sentence text with this annotator's pattern and, for each match whose group start
 * is greater than the group start of the previous match, records:
 * <ul>
 * <li>a {@code TokenPlaceholder} annotation, when {@code singleToken} is set;</li>
 * <li>one {@code TokenAttribute} annotation per entry in {@code attributes}.</li>
 * </ul>
 * Both lists are added to the sentence after the scan, even when empty.
 *
 * @param annotatedText the sentence to search, and to which the annotations are added
 * @param labels labels passed through to every annotation created
 */
@Override
public void annotate(Sentence annotatedText, String... labels) {
    List<Annotation<TokenPlaceholder>> placeholderAnnotations = new ArrayList<>();
    List<Annotation<TokenAttribute<?>>> attributeAnnotations = new ArrayList<>();
    Matcher matcher = this.getPattern().matcher(annotatedText.getText());

    int previousStart = -1;
    while (matcher.find()) {
        final int start = matcher.start(groupIndex);
        // Remember this start for the next iteration before deciding whether to skip.
        final boolean movedForward = start > previousStart;
        previousStart = start;
        if (!movedForward) {
            continue;
        }

        final int end = matcher.end(groupIndex);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Regex: " + this.regex);
            LOG.trace("Next match: " + annotatedText.getText().subSequence(matcher.start(), matcher.end()).toString().replace('\n', '¶').replace('\r', '¶'));
            boolean groupDiffersFromMatch = matcher.start() != start || matcher.end() != end;
            if (groupDiffersFromMatch) {
                LOG.trace("But matching group: " + annotatedText.getText().subSequence(start, end).toString().replace('\n', '¶').replace('\r', '¶'));
            }
        }

        if (this.singleToken) {
            String replacement = this.findReplacement(annotatedText.getText(), matcher);
            TokenPlaceholder placeholder = new TokenPlaceholder(replacement, regex);
            Annotation<TokenPlaceholder> wrapped = new Annotation<>(start, end, placeholder, labels);
            placeholderAnnotations.add(wrapped);
            if (LOG.isTraceEnabled()) {
                LOG.trace("Added placeholder: " + placeholder.toString());
            }
        }

        // Every configured attribute is attached to the same matched span.
        for (TokenAttribute<?> attribute : attributes.values()) {
            Annotation<TokenAttribute<?>> wrapped = new Annotation<>(start, end, attribute, labels);
            attributeAnnotations.add(wrapped);
            if (LOG.isTraceEnabled()) {
                LOG.trace("Added attribute: " + attribute.toString());
            }
        }
    }

    annotatedText.addAnnotations(placeholderAnnotations);
    annotatedText.addAnnotations(attributeAnnotations);
}
Also used : Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) TokenAttribute(com.joliciel.talismane.tokeniser.TokenAttribute) Annotation(com.joliciel.talismane.Annotation)

Aggregations

Annotation (com.joliciel.talismane.Annotation)4 TokenAttribute (com.joliciel.talismane.tokeniser.TokenAttribute)4 TalismaneTest (com.joliciel.talismane.TalismaneTest)2 Sentence (com.joliciel.talismane.rawText.Sentence)2 StringAttribute (com.joliciel.talismane.tokeniser.StringAttribute)2 Config (com.typesafe.config.Config)2 ArrayList (java.util.ArrayList)2 Matcher (java.util.regex.Matcher)2 Test (org.junit.Test)2 RawTextNoSentenceBreakMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker)1 RawTextReplaceMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextReplaceMarker)1 RawTextSentenceBreakMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker)1 RawTextSkipMarker (com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker)1