Use of com.joliciel.talismane.tokeniser.TokenAttribute in project talismane by joliciel-informatique.
From the class RegexTokenAnnotatorTest, method testPuctuation.
/**
 * Runs a punctuation regex carrying a {@code featureType=punctuation}
 * attribute over a short French sentence and checks the number of
 * {@link TokenAttribute} annotations produced, as well as the span, key and
 * value of the first one.
 */
@Test
public void testPuctuation() throws Exception {
    // Select the test configuration file and drop any cached configuration before loading.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();

    final String sessionId = "test";
    final String punctuationRegex = "[\\p{IsPunctuation}&&[^%$#@§¶‰‱]]+";
    // Kept as a typed variable so the same constructor overload is selected.
    final String noReplacement = null;

    RegexTokenAnnotator annotator = new RegexTokenAnnotator(punctuationRegex, noReplacement, null, sessionId);
    annotator.addAttribute("featureType", new StringAttribute("featureType", "punctuation"));

    Sentence sentence = new Sentence("Bonjour. Comment ça va?", sessionId);
    annotator.annotate(sentence);

    @SuppressWarnings("rawtypes")
    List<Annotation<TokenAttribute>> found = sentence.getAnnotations(TokenAttribute.class);
    LOG.debug(found.toString());

    // The sentence contains two punctuation marks: '.' and '?'.
    assertEquals(2, found.size());

    // The first annotation must cover exactly the full stop after "Bonjour".
    @SuppressWarnings("rawtypes")
    Annotation<TokenAttribute> firstMatch = found.get(0);
    final int fullStopStart = "Bonjour".length();
    final int fullStopEnd = "Bonjour.".length();
    assertEquals(fullStopStart, firstMatch.getStart());
    assertEquals(fullStopEnd, firstMatch.getEnd());
    assertEquals("featureType", firstMatch.getData().getKey());
    assertEquals("punctuation", firstMatch.getData().getValue());
}
Use of com.joliciel.talismane.tokeniser.TokenAttribute in project talismane by joliciel-informatique.
From the class RegexTokenAnnotatorTest, method testStartOfInput.
/**
 * Checks that a regex anchored with {@code ^} only annotates the occurrence
 * at the very start of the sentence, even though the same word appears a
 * second time later in the text.
 */
@Test
public void testStartOfInput() throws Exception {
    // Select the test configuration file and drop any cached configuration before loading.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();

    final String sessionId = "test";
    final String anchoredRegex = "^Résumé\\.";
    // Kept as a typed variable so the same constructor overload is selected.
    final String noReplacement = null;

    RegexTokenAnnotator annotator = new RegexTokenAnnotator(anchoredRegex, noReplacement, null, sessionId);
    annotator.addAttribute("TAG", new StringAttribute("TAG", "skip"));

    Sentence sentence = new Sentence("Résumé. Résumé des attaques", sessionId);
    annotator.annotate(sentence);

    @SuppressWarnings("rawtypes")
    List<Annotation<TokenAttribute>> found = sentence.getAnnotations(TokenAttribute.class);
    LOG.debug(found.toString());

    // Only the leading "Résumé." (characters 0 to 7) is expected to be annotated.
    assertEquals(1, found.size());

    @SuppressWarnings("rawtypes")
    Annotation<TokenAttribute> onlyMatch = found.get(0);
    assertEquals(0, onlyMatch.getStart());
    assertEquals(7, onlyMatch.getEnd());
    assertEquals("TAG", onlyMatch.getData().getKey());
}
Use of com.joliciel.talismane.tokeniser.TokenAttribute in project talismane by joliciel-informatique.
From the class RawTextRegexAnnotator, method annotate.
/**
 * Applies this annotator's regex to the text of {@code textBlock} and adds
 * one annotation per configured filter type for every match whose start
 * offset lies inside the block's analysis range
 * ({@code analysisStart <= start < analysisEnd}).
 * <p>
 * The annotated span is the capture group {@code groupIndex} (the whole
 * match when {@code groupIndex} is 0). Filter type {@code TAG} produces a
 * {@link TokenAttribute} annotation carrying {@code this.attribute}; every
 * other filter type produces a {@link RawTextMarker} annotation.
 *
 * @param textBlock the text to search; receives the resulting annotations
 * @param labels    labels passed on to every annotation that is created
 * @throws MatchTooLargeException if {@code blockSize} is positive and the
 *         matched span is longer than {@code blockSize}
 */
@Override
public void annotate(AnnotatedText textBlock, String... labels) throws MatchTooLargeException {
    if (LOG.isTraceEnabled()) {
        LOG.trace("Matching " + regex.replace('\n', '¶').replace('\r', '¶'));
    }

    List<Annotation<RawTextMarker>> rawTextMarkers = new ArrayList<>();
    List<Annotation<TokenAttribute<?>>> tokenAttributes = new ArrayList<>();

    Matcher matcher = pattern.matcher(textBlock.getText());
    while (matcher.find()) {
        // Group 0 is the whole match, so start(0)/end(0) equal start()/end().
        // Both are -1 when the requested group did not take part in this match.
        int matcherStart = matcher.start(groupIndex);
        int matcherEnd = matcher.end(groupIndex);

        CharSequence matchText = textBlock.getText().subSequence(matcher.start(), matcher.end());
        if (LOG.isTraceEnabled()) {
            LOG.trace("Next match: " + matchText.toString().replace('\n', '¶').replace('\r', '¶'));
            if (matcherStart < 0) {
                // Guard: subSequence(-1, -1) would throw, which previously made
                // the outcome of this method depend on the logging level.
                LOG.trace("But matching group " + groupIndex + " did not participate in the match");
            } else if (matcher.start() != matcherStart || matcher.end() != matcherEnd) {
                LOG.trace("But matching group: " + textBlock.getText().subSequence(matcherStart, matcherEnd).toString().replace('\n', '¶').replace('\r', '¶'));
            }
            LOG.trace("matcher.start()=" + matcher.start() + ", matcher.end()=" + matcher.end() + ", matcherStart=" + matcherStart + ", matcherEnd=" + matcherEnd + ", analysisStart=" + textBlock.getAnalysisStart() + ", analysisEnd=" + textBlock.getAnalysisEnd());
        }

        // The size check deliberately precedes the range check, as before:
        // an oversized match is reported even if it starts outside the analysis range.
        if (blockSize > 0 && matcherEnd - matcherStart > blockSize) {
            String errorString = "Match size (" + (matcherEnd - matcherStart) + ") bigger than block size (" + blockSize + "). " + "Increase blockSize or change filter. " + "Maybe you need to change a greedy quantifier (e.g. .*) to a reluctant quantifier (e.g. .*?)? " + "Regex: " + regex + ". Text: " + matchText;
            throw new MatchTooLargeException(errorString);
        }

        boolean startInRange = matcherStart >= textBlock.getAnalysisStart() && matcherStart < textBlock.getAnalysisEnd();
        if (!startInRange) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("Start out of range: analysisStart " + textBlock.getAnalysisStart() + ">= matcherStart [[" + matcherStart + "]] < analysisEnd " + textBlock.getAnalysisEnd());
            }
            continue;
        }

        if (LOG.isTraceEnabled()) {
            LOG.trace("Start in range: analysisStart " + textBlock.getAnalysisStart() + ">= matcherStart [[" + matcherStart + "]] < analysisEnd " + textBlock.getAnalysisEnd());
        }

        for (RawTextMarkType filterType : filterTypes) {
            final RawTextMarker marker;
            switch (filterType) {
            case REPLACE: {
                String insertionText = RegexUtils.getReplacement(replacement, textBlock.getText(), matcher);
                if (LOG.isTraceEnabled()) {
                    LOG.trace("Setting replacement to: " + insertionText);
                }
                marker = new RawTextReplaceMarker(this.toString(), insertionText);
                break;
            }
            case SENTENCE_BREAK:
                marker = new RawTextSentenceBreakMarker(this.toString());
                break;
            case NO_SENTENCE_BREAK:
                marker = new RawTextNoSentenceBreakMarker(this.toString());
                break;
            case SKIP:
                marker = new RawTextSkipMarker(this.toString());
                break;
            case TAG:
                // TAG is the only type that yields a token attribute instead of a raw text marker.
                tokenAttributes.add(new Annotation<TokenAttribute<?>>(matcherStart, matcherEnd, this.attribute, labels));
                continue;
            default:
                marker = new RawTextMarker(filterType, this.toString());
                break;
            }
            Annotation<RawTextMarker> annotation = new Annotation<>(matcherStart, matcherEnd, marker, labels);
            rawTextMarkers.add(annotation);
        }
    }

    if (!rawTextMarkers.isEmpty()) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("For regex: " + this.regex.replace('\n', '¶').replace('\r', '¶'));
            LOG.debug("Added annotations: " + rawTextMarkers);
        }
        textBlock.addAnnotations(rawTextMarkers);
    }
    if (!tokenAttributes.isEmpty()) {
        textBlock.addAnnotations(tokenAttributes);
    }
}
Use of com.joliciel.talismane.tokeniser.TokenAttribute in project talismane by joliciel-informatique.
From the class AbstractRegexAnnotator, method annotate.
/**
 * Searches the sentence text with this annotator's pattern and, for every
 * match whose group {@code groupIndex} starts later than the group start of
 * the previous match, records:
 * <ul>
 * <li>a {@link TokenPlaceholder} annotation, when {@code singleToken} is set;</li>
 * <li>one {@link TokenAttribute} annotation per entry in {@code attributes}.</li>
 * </ul>
 * All collected annotations are added to the sentence once the search is done.
 *
 * @param annotatedText the sentence to search and annotate
 * @param labels        labels passed on to every annotation that is created
 */
@Override
public void annotate(Sentence annotatedText, String... labels) {
    List<Annotation<TokenPlaceholder>> placeholderAnnotations = new ArrayList<>();
    List<Annotation<TokenAttribute<?>>> attributeAnnotations = new ArrayList<>();

    Matcher matcher = this.getPattern().matcher(annotatedText.getText());
    int lastStart = -1;
    while (matcher.find()) {
        final int start = matcher.start(groupIndex);

        // Always remember this start for the next iteration, but only annotate
        // when it lies strictly after the previous one.
        final boolean movedForward = start > lastStart;
        lastStart = start;
        if (!movedForward) {
            continue;
        }

        final int end = matcher.end(groupIndex);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Regex: " + this.regex);
            LOG.trace("Next match: " + annotatedText.getText().subSequence(matcher.start(), matcher.end()).toString().replace('\n', '¶').replace('\r', '¶'));
            boolean groupDiffersFromMatch = matcher.start() != start || matcher.end() != end;
            if (groupDiffersFromMatch) {
                LOG.trace("But matching group: " + annotatedText.getText().subSequence(start, end).toString().replace('\n', '¶').replace('\r', '¶'));
            }
        }

        if (this.singleToken) {
            String replacement = this.findReplacement(annotatedText.getText(), matcher);
            TokenPlaceholder placeholder = new TokenPlaceholder(replacement, regex);
            placeholderAnnotations.add(new Annotation<>(start, end, placeholder, labels));
            if (LOG.isTraceEnabled()) {
                LOG.trace("Added placeholder: " + placeholder.toString());
            }
        }

        for (TokenAttribute<?> attribute : attributes.values()) {
            Annotation<TokenAttribute<?>> attributeAnnotation = new Annotation<>(start, end, attribute, labels);
            attributeAnnotations.add(attributeAnnotation);
            if (LOG.isTraceEnabled()) {
                LOG.trace("Added attribute: " + attribute.toString());
            }
        }
    }

    annotatedText.addAnnotations(placeholderAnnotations);
    annotatedText.addAnnotations(attributeAnnotations);
}
Aggregations