use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.
the class RegexTokenAnnotatorTest method testPuctuation.
/**
 * Checks that a punctuation regex carrying a "featureType" attribute yields one
 * token-attribute annotation per punctuation run, and that the first annotation
 * covers the full stop directly after "Bonjour".
 */
@Test
public void testPuctuation() throws Exception {
  // Point the configuration library at the test configuration and reload it.
  // The returned Config is not used directly by this test.
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  ConfigFactory.load();

  final String sessionId = "test";
  final String firstWord = "Bonjour";

  RegexTokenAnnotator annotator = new RegexTokenAnnotator("[\\p{IsPunctuation}&&[^%$#@§¶‰‱]]+", null, null, sessionId);
  annotator.addAttribute("featureType", new StringAttribute("featureType", "punctuation"));

  Sentence sentence = new Sentence("Bonjour. Comment ça va?", sessionId);
  annotator.annotate(sentence);

  @SuppressWarnings("rawtypes")
  List<Annotation<TokenAttribute>> found = sentence.getAnnotations(TokenAttribute.class);
  LOG.debug(found.toString());

  // One annotation for "." and one for "?".
  assertEquals(2, found.size());

  @SuppressWarnings("rawtypes")
  Annotation<TokenAttribute> first = found.get(0);
  // The first punctuation mark is the single character right after the first word.
  assertEquals(firstWord.length(), first.getStart());
  assertEquals(firstWord.length() + 1, first.getEnd());
  assertEquals("featureType", first.getData().getKey());
  assertEquals("punctuation", first.getData().getValue());
}
use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.
the class RegexTokenAnnotatorTest method testApplyWithConsecutiveDollars.
/**
 * Checks a replacement string that contains an escaped dollar followed by two
 * adjacent group references ("$2$1"): the e-mail address must be replaced by a
 * single placeholder whose text is the literal "$Email", then group 2, then
 * group 1.
 */
@Test
public void testApplyWithConsecutiveDollars() throws Exception {
  // Point the configuration library at the test configuration and reload it.
  // The returned Config is not used directly by this test.
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  ConfigFactory.load();

  final String sessionId = "test";
  final String emailRegex = "\\b([\\w.%-]+)(@[-.\\w]+\\.[A-Za-z]{2,4})\\b";
  final String emailReplacement = "\\$Email$2$1";

  RegexTokenAnnotator annotator = new RegexTokenAnnotator(emailRegex, emailReplacement, null, sessionId);
  Sentence sentence = new Sentence("My address is joe.schmoe@test.com.", sessionId);
  annotator.annotate(sentence);

  List<Annotation<TokenPlaceholder>> found = sentence.getAnnotations(TokenPlaceholder.class);
  LOG.debug(found.toString());
  assertEquals(1, found.size());

  // The address spans offsets 14 (inclusive) to 33 (exclusive); the trailing
  // full stop is not part of the match.
  Annotation<TokenPlaceholder> emailPlaceholder = found.get(0);
  assertEquals(14, emailPlaceholder.getStart());
  assertEquals(33, emailPlaceholder.getEnd());
  assertEquals("$Email@test.comjoe.schmoe", emailPlaceholder.getData().getReplacement());
}
use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.
the class RegexTokenAnnotatorTest method testStartOfInput.
/**
 * Checks that a regex anchored with "^" only matches at the very start of the
 * sentence: the second "Résumé" in the text must not produce an annotation.
 */
@Test
public void testStartOfInput() throws Exception {
  // Point the configuration library at the test configuration and reload it.
  // The returned Config is not used directly by this test.
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  ConfigFactory.load();

  final String sessionId = "test";

  RegexTokenAnnotator annotator = new RegexTokenAnnotator("^Résumé\\.", null, null, sessionId);
  annotator.addAttribute("TAG", new StringAttribute("TAG", "skip"));

  Sentence sentence = new Sentence("Résumé. Résumé des attaques", sessionId);
  annotator.annotate(sentence);

  @SuppressWarnings("rawtypes")
  List<Annotation<TokenAttribute>> found = sentence.getAnnotations(TokenAttribute.class);
  LOG.debug(found.toString());
  assertEquals(1, found.size());

  // The only match is the leading "Résumé." (7 characters).
  @SuppressWarnings("rawtypes")
  Annotation<TokenAttribute> onlyMatch = found.get(0);
  assertEquals(0, onlyMatch.getStart());
  assertEquals(7, onlyMatch.getEnd());
  assertEquals("TAG", onlyMatch.getData().getKey());
}
use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.
the class RawTextRegexAnnotator method annotate.
/**
 * Runs this annotator's regex over the text block and, for every match whose
 * target group starts inside the block's analysis range, adds one annotation
 * per configured filter type.
 *
 * <p>
 * The annotated span is the capture group selected by {@code groupIndex}
 * (group 0 being the whole match). {@code TAG} filter types produce a
 * {@code TokenAttribute} annotation carrying this annotator's attribute; all
 * other filter types produce a {@code RawTextMarker} annotation.
 *
 * <p>
 * Matches in which the selected group did not participate are skipped.
 *
 * @param textBlock
 *          the text to search; annotations are added to it
 * @param labels
 *          labels passed on to every annotation created
 * @throws MatchTooLargeException
 *           if {@code blockSize} is positive and the selected group's span is
 *           longer than {@code blockSize}
 */
@Override
public void annotate(AnnotatedText textBlock, String... labels) throws MatchTooLargeException {
  if (LOG.isTraceEnabled()) {
    LOG.trace("Matching " + regex.replace('\n', '¶').replace('\r', '¶'));
  }
  List<Annotation<RawTextMarker>> rawTextMarkers = new ArrayList<>();
  List<Annotation<TokenAttribute<?>>> tokenAttributes = new ArrayList<>();
  Matcher matcher = pattern.matcher(textBlock.getText());
  while (matcher.find()) {
    // Group 0 denotes the entire match, so start(0)/end(0) are equivalent to
    // start()/end() and no special case is needed for groupIndex == 0.
    int matcherStart = matcher.start(groupIndex);
    int matcherEnd = matcher.end(groupIndex);

    if (matcherStart < 0) {
      // Matcher.start(int) returns -1 when the group took no part in this
      // match (e.g. an optional group). There is no span to annotate, and
      // using -1 as an index in the trace output below would throw.
      if (LOG.isTraceEnabled()) {
        LOG.trace("Group " + groupIndex + " did not participate in match, skipping.");
      }
      continue;
    }

    CharSequence matchText = textBlock.getText().subSequence(matcher.start(), matcher.end());
    if (LOG.isTraceEnabled()) {
      LOG.trace("Next match: " + matchText.toString().replace('\n', '¶').replace('\r', '¶'));
      if (matcher.start() != matcherStart || matcher.end() != matcherEnd) {
        LOG.trace("But matching group: " + textBlock.getText().subSequence(matcherStart, matcherEnd).toString().replace('\n', '¶').replace('\r', '¶'));
      }
      LOG.trace("matcher.start()=" + matcher.start() + ", matcher.end()=" + matcher.end() + ", matcherStart=" + matcherStart + ", matcherEnd=" + matcherEnd + ", analysisStart=" + textBlock.getAnalysisStart() + ", analysisEnd=" + textBlock.getAnalysisEnd());
    }

    if (blockSize > 0 && matcherEnd - matcherStart > blockSize) {
      String errorString = "Match size (" + (matcherEnd - matcherStart) + ") bigger than block size (" + blockSize + "). " + "Increase blockSize or change filter. " + "Maybe you need to change a greedy quantifier (e.g. .*) to a reluctant quantifier (e.g. .*?)? " + "Regex: " + regex + ". Text: " + matchText;
      throw new MatchTooLargeException(errorString);
    }

    // Only matches that START inside the analysis range are annotated.
    boolean startInRange = matcherStart >= textBlock.getAnalysisStart() && matcherStart < textBlock.getAnalysisEnd();
    if (!startInRange) {
      if (LOG.isTraceEnabled()) {
        LOG.trace("Start out of range: analysisStart " + textBlock.getAnalysisStart() + ">= matcherStart [[" + matcherStart + "]] < analysisEnd " + textBlock.getAnalysisEnd());
      }
      continue;
    }
    if (LOG.isTraceEnabled()) {
      LOG.trace("Start in range: analysisStart " + textBlock.getAnalysisStart() + ">= matcherStart [[" + matcherStart + "]] < analysisEnd " + textBlock.getAnalysisEnd());
    }

    for (RawTextMarkType filterType : filterTypes) {
      // Set for every filter type except TAG, which yields a token attribute
      // annotation instead of a raw text marker.
      RawTextMarker marker = null;
      switch (filterType) {
      case REPLACE: {
        String insertionText = RegexUtils.getReplacement(replacement, textBlock.getText(), matcher);
        if (LOG.isTraceEnabled()) {
          LOG.trace("Setting replacement to: " + insertionText);
        }
        marker = new RawTextReplaceMarker(this.toString(), insertionText);
        break;
      }
      case SENTENCE_BREAK:
        marker = new RawTextSentenceBreakMarker(this.toString());
        break;
      case NO_SENTENCE_BREAK:
        marker = new RawTextNoSentenceBreakMarker(this.toString());
        break;
      case SKIP:
        marker = new RawTextSkipMarker(this.toString());
        break;
      case TAG: {
        Annotation<TokenAttribute<?>> annotation = new Annotation<TokenAttribute<?>>(matcherStart, matcherEnd, this.attribute, labels);
        tokenAttributes.add(annotation);
        break;
      }
      default:
        // Any other filter type gets a generic marker recording the type.
        marker = new RawTextMarker(filterType, this.toString());
        break;
      }
      if (marker != null) {
        Annotation<RawTextMarker> annotation = new Annotation<>(matcherStart, matcherEnd, marker, labels);
        rawTextMarkers.add(annotation);
      }
    }
  }

  if (!rawTextMarkers.isEmpty()) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("For regex: " + this.regex.replace('\n', '¶').replace('\r', '¶'));
      LOG.debug("Added annotations: " + rawTextMarkers);
    }
    textBlock.addAnnotations(rawTextMarkers);
  }
  if (!tokenAttributes.isEmpty()) {
    textBlock.addAnnotations(tokenAttributes);
  }
}
use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.
the class RawTextProcessor method getProcessedText.
/**
 * Return processed text ready for sentence detection.
 *
 * <p>
 * The text is the processed text of the previous, current and next sentence
 * holders, joined in that order. It carries the {@code RawTextMarker}
 * annotations of the present RawTextProcessor whose start or end falls inside
 * the current text-processing window, re-indexed to positions in the joined
 * text.
 *
 * <p>
 * An observer is attached to the returned text: any annotations added to it
 * are translated back to this processor's coordinates and added to this
 * processor as well, so sentence-break annotations added during sentence
 * detection are automatically reflected here.
 *
 * @return the joined processed text, constructed with the offsets at which the
 *         current holder's processed text starts and ends, together with the
 *         inherited annotations
 */
public final AnnotatedText getProcessedText() {
  LOG.trace("getProcessedTextBlock");
  final int textStartPos = this.getTextProcessingStart();
  final int textEndPos = this.getTextProcessingEnd();

  final SentenceHolder prevHolder = this.getPreviousSentenceHolder();
  final SentenceHolder currentHolder = this.getCurrentSentenceHolder();
  final SentenceHolder nextHolder = this.getNextSentenceHolder();

  final String prevText = prevHolder.getProcessedText();
  final String currentText = currentHolder.getProcessedText();
  final String nextText = nextHolder.getProcessedText();
  final String joinedText = prevText + currentText + nextText;

  List<Annotation<RawTextMarker>> rawAnnotations = this.getAnnotations(RawTextMarker.class);
  List<Annotation<RawTextMarker>> processedAnnotations = new ArrayList<>();
  final int prevOriginalStart = prevHolder.getOriginalStartIndex();

  for (Annotation<RawTextMarker> rawAnnotation : rawAnnotations) {
    final int rawStart = rawAnnotation.getStart();
    final int rawEnd = rawAnnotation.getEnd();

    // Keep only annotations that start or end inside [textStartPos, textEndPos).
    final boolean startsInWindow = textStartPos <= rawStart && rawStart < textEndPos;
    final boolean endsInWindow = textStartPos <= rawEnd && rawEnd < textEndPos;
    if (!startsInWindow && !endsInWindow) {
      continue;
    }

    final int originalStart = prevOriginalStart + rawStart;
    final int originalEnd = prevOriginalStart + rawEnd;

    // An annotation starting before the current holder is clipped to the
    // position where the current holder's text begins in the joined text.
    int localStart = prevText.length();
    if (currentHolder.getOriginalStartIndex() <= originalStart) {
      localStart += currentHolder.getIndex(originalStart);
    }
    final int localEnd = prevText.length() + currentHolder.getIndex(originalEnd);

    processedAnnotations.add(rawAnnotation.getAnnotation(localStart, localEnd));
  }

  if (LOG.isTraceEnabled()) {
    LOG.trace("raw annotations: " + rawAnnotations);
    LOG.trace("processed annotations: " + processedAnnotations);
  }

  AnnotatedText processedTextBlock = new AnnotatedText(joinedText, prevText.length(), prevText.length() + currentText.length());
  processedTextBlock.addAnnotations(processedAnnotations);
  processedTextBlock.addObserver(new AnnotationObserver() {
    /**
     * Copies annotations added to the processed text back onto the enclosing
     * RawTextProcessor, at the corresponding position.
     */
    @Override
    public <T extends Serializable> void beforeAddAnnotations(AnnotatedText subject, List<Annotation<T>> annotations) {
      // Lengths are read from the holders at notification time.
      final int prevLength = prevHolder.getProcessedText().length();
      final int currentLength = currentHolder.getProcessedText().length();
      final int currentEnd = prevLength + currentLength;
      final int currentOriginalStart = currentHolder.getOriginalStartIndex();

      List<Annotation<T>> mappedAnnotations = new ArrayList<>();
      for (Annotation<T> annotation : annotations) {
        // Translate the start via whichever holder contains it; a start
        // inside the next holder's text is left unmapped.
        final int start = annotation.getStart();
        final int originalStart;
        if (start < prevLength) {
          originalStart = prevHolder.getOriginalIndex(start);
        } else if (start < currentEnd) {
          originalStart = currentHolder.getOriginalIndex(start - prevLength);
        } else {
          originalStart = -1;
        }
        if (originalStart < 0) {
          continue;
        }

        // Translate the end via the current holder, or via the next holder
        // when it lies beyond the current holder's text.
        final int end = annotation.getEnd();
        final int originalEnd = end <= currentEnd
            ? currentHolder.getOriginalIndex(end - prevLength)
            : nextHolder.getOriginalIndex(end - currentEnd);
        if (originalEnd < 0) {
          continue;
        }

        mappedAnnotations.add(annotation.getAnnotation(originalStart - currentOriginalStart + textStartPos, originalEnd - currentOriginalStart + textStartPos));

        // Sentence boundaries are additionally recorded as received, i.e.
        // the annotation passed in rather than the re-indexed copy.
        if (annotation.getData() instanceof SentenceBoundary) {
          @SuppressWarnings("unchecked")
          Annotation<SentenceBoundary> boundary = (Annotation<SentenceBoundary>) annotation;
          sentenceBoundaries.add(boundary);
        }
      }

      RawTextProcessor.this.addAnnotations(mappedAnnotations);
      if (LOG.isTraceEnabled()) {
        LOG.trace("ProcessedTextBlock Annotations received: " + annotations);
        LOG.trace("ProcessedTextBlock Annotations added: " + mappedAnnotations);
      }
    }

    @Override
    public <T extends Serializable> void afterAddAnnotations(AnnotatedText subject) {
      // nothing to do once the annotations have been added
    }
  });
  return processedTextBlock;
}
Aggregations