use of com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker in project talismane by joliciel-informatique.
the class RawTextRegexAnnotator method annotate.
/**
 * Runs this annotator's pattern over the text block and adds one annotation per configured
 * filter type for every match whose start offset lies in
 * {@code [analysisStart, analysisEnd)} of the block.
 *
 * <p>The annotated span is the capturing group {@code groupIndex} (group 0 being the whole
 * match). Marker annotations and token-attribute annotations are collected separately and
 * added to the text block once all matches have been processed.
 *
 * @param textBlock the text to search; receives the resulting annotations
 * @param labels labels passed through to every annotation created
 * @throws MatchTooLargeException if {@code blockSize} is positive and the annotated span is
 *     longer than {@code blockSize}
 */
@Override
public void annotate(AnnotatedText textBlock, String... labels) throws MatchTooLargeException {
  if (LOG.isTraceEnabled()) {
    LOG.trace("Matching " + regex.replace('\n', '¶').replace('\r', '¶'));
  }

  List<Annotation<RawTextMarker>> rawTextMarkers = new ArrayList<>();
  List<Annotation<TokenAttribute<?>>> tokenAttributes = new ArrayList<>();

  Matcher matcher = pattern.matcher(textBlock.getText());
  while (matcher.find()) {
    // Group 0 denotes the whole match, so start(0)/end(0) equal start()/end()
    // and no special case is needed for groupIndex == 0.
    int matcherStart = matcher.start(groupIndex);
    int matcherEnd = matcher.end(groupIndex);
    // Matcher reports -1 for a group that did not take part in an otherwise successful match.
    boolean groupMatched = matcherStart >= 0;

    CharSequence matchText = textBlock.getText().subSequence(matcher.start(), matcher.end());
    if (LOG.isTraceEnabled()) {
      LOG.trace("Next match: " + matchText.toString().replace('\n', '¶').replace('\r', '¶'));
      // Only slice the group when it actually matched: subSequence(-1, -1) would throw.
      if (groupMatched && (matcher.start() != matcherStart || matcher.end() != matcherEnd)) {
        LOG.trace("But matching group: " + textBlock.getText().subSequence(matcherStart, matcherEnd).toString().replace('\n', '¶').replace('\r', '¶'));
      }
      LOG.trace("matcher.start()=" + matcher.start() + ", matcher.end()=" + matcher.end() + ", matcherStart=" + matcherStart + ", matcherEnd=" + matcherEnd + ", analysisStart=" + textBlock.getAnalysisStart() + ", analysisEnd=" + textBlock.getAnalysisEnd());
    }

    if (!groupMatched) {
      // Nothing to annotate: the requested group has no span in this match.
      if (LOG.isTraceEnabled()) {
        LOG.trace("Group " + groupIndex + " did not participate in this match, skipping");
      }
      continue;
    }

    if (blockSize > 0 && matcherEnd - matcherStart > blockSize) {
      String errorString = "Match size (" + (matcherEnd - matcherStart) + ") bigger than block size (" + blockSize + "). " + "Increase blockSize or change filter. " + "Maybe you need to change a greedy quantifier (e.g. .*) to a reluctant quantifier (e.g. .*?)? " + "Regex: " + regex + ". Text: " + matchText;
      throw new MatchTooLargeException(errorString);
    }

    boolean startInRange = matcherStart >= textBlock.getAnalysisStart() && matcherStart < textBlock.getAnalysisEnd();
    if (!startInRange) {
      if (LOG.isTraceEnabled()) {
        LOG.trace("Start out of range: analysisStart " + textBlock.getAnalysisStart() + ">= matcherStart [[" + matcherStart + "]] < analysisEnd " + textBlock.getAnalysisEnd());
      }
      continue;
    }

    if (LOG.isTraceEnabled()) {
      LOG.trace("Start in range: analysisStart " + textBlock.getAnalysisStart() + ">= matcherStart [[" + matcherStart + "]] < analysisEnd " + textBlock.getAnalysisEnd());
    }

    for (RawTextMarkType filterType : filterTypes) {
      if (filterType == RawTextMarkType.TAG) {
        // TAG is the only type that yields a token attribute rather than a raw text marker.
        tokenAttributes.add(new Annotation<TokenAttribute<?>>(matcherStart, matcherEnd, this.attribute, labels));
        continue;
      }

      RawTextMarker marker;
      switch (filterType) {
        case REPLACE: {
          String insertionText = RegexUtils.getReplacement(replacement, textBlock.getText(), matcher);
          if (LOG.isTraceEnabled()) {
            LOG.trace("Setting replacement to: " + insertionText);
          }
          marker = new RawTextReplaceMarker(this.toString(), insertionText);
          break;
        }
        case SENTENCE_BREAK:
          marker = new RawTextSentenceBreakMarker(this.toString());
          break;
        case NO_SENTENCE_BREAK:
          marker = new RawTextNoSentenceBreakMarker(this.toString());
          break;
        case SKIP:
          marker = new RawTextSkipMarker(this.toString());
          break;
        default:
          marker = new RawTextMarker(filterType, this.toString());
          break;
      }
      rawTextMarkers.add(new Annotation<>(matcherStart, matcherEnd, marker, labels));
    }
  }

  if (!rawTextMarkers.isEmpty()) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("For regex: " + this.regex.replace('\n', '¶').replace('\r', '¶'));
      LOG.debug("Added annotations: " + rawTextMarkers);
    }
    textBlock.addAnnotations(rawTextMarkers);
  }
  if (!tokenAttributes.isEmpty()) {
    textBlock.addAnnotations(tokenAttributes);
  }
}
use of com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker in project talismane by joliciel-informatique.
the class RawTextTest method testGetDetectedSentences.
/**
 * Marks the {@code <sent/>} tag of a raw text as both a sentence break and a skip, adds
 * sentence boundaries to the processed text, and checks that boundaries, detected sentences
 * and sentence-level annotations all map back to the expected offsets in the raw text.
 */
@Test
public void testGetDetectedSentences() throws Exception {
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  final Config config = ConfigFactory.load();
  final String sessionId = "test";
  final String[] labels = {};

  final String rawContent = "Sentence 1<sent/>Sentence 2. Sentence 3. Sentence 4.";
  final String processedContent = "Sentence 1 Sentence 2. Sentence 3. Sentence 4.";
  final RawText rawText = new RawText(rawContent, true, sessionId);

  // Span of the "<sent/>" tag inside the raw text.
  final int tagStart = "Sentence 1".length();
  final int tagEnd = "Sentence 1<sent/>".length();

  // Annotate the tag as a sentence break, the way a filter would.
  System.out.println("we add a sentence break annotation (as if it was added by a filter)");
  List<Annotation<RawTextSentenceBreakMarker>> breakMarkers = new ArrayList<>();
  breakMarkers.add(new Annotation<>(tagStart, tagEnd, new RawTextSentenceBreakMarker("me"), labels));
  rawText.addAnnotations(breakMarkers);

  // The same span is also marked as text to skip.
  List<Annotation<RawTextSkipMarker>> skipMarkers = new ArrayList<>();
  skipMarkers.add(new Annotation<>(tagStart, tagEnd, new RawTextSkipMarker("me"), labels));
  rawText.addAnnotations(skipMarkers);

  System.out.println("textBlock text: " + rawText.getText());
  System.out.println("textBlock annotations: " + rawText.getAnnotations().toString());

  AnnotatedText processed = rawText.getProcessedText();
  assertEquals(processedContent, processed.getText());

  // Add sentence boundaries to the processed text, the way a sentence detector would.
  System.out.println("add sentence boundaries to the processed text (as if they were added by a sentence detector)");
  final int processedEndOf1 = "Sentence 1".length();
  final int processedEndOf2 = "Sentence 1 Sentence 2.".length();
  final int processedEndOf3 = "Sentence 1 Sentence 2. Sentence 3.".length();
  List<Annotation<SentenceBoundary>> processedBoundaries = new ArrayList<>();
  processedBoundaries.add(new Annotation<>(processedEndOf1, processedEndOf2, new SentenceBoundary(), labels));
  processedBoundaries.add(new Annotation<>(processedEndOf2, processedEndOf3, new SentenceBoundary(), labels));
  processed.addAnnotations(processedBoundaries);
  assertEquals(processedContent, processed.getText());

  System.out.println("textBlock text: " + rawText.getText());
  System.out.println("textBlock annotations: " + rawText.getAnnotations().toString());

  // The boundaries must show up in the raw text at raw-text offsets.
  assertEquals(rawContent, rawText.getText());
  List<Annotation<SentenceBoundary>> rawBoundaries = rawText.getAnnotations(SentenceBoundary.class);
  assertEquals(2, rawBoundaries.size());

  final int rawEndOf2 = "Sentence 1<sent/>Sentence 2.".length();
  final int rawEndOf3 = "Sentence 1<sent/>Sentence 2. Sentence 3.".length();
  Annotation<SentenceBoundary> firstBoundary = rawBoundaries.get(0);
  assertEquals(tagEnd, firstBoundary.getStart());
  assertEquals(rawEndOf2, firstBoundary.getEnd());
  Annotation<SentenceBoundary> secondBoundary = rawBoundaries.get(1);
  assertEquals(rawEndOf2, secondBoundary.getStart());
  assertEquals(rawEndOf3, secondBoundary.getEnd());

  List<Sentence> detected = rawText.getDetectedSentences();
  System.out.println("sentences: " + detected.toString());
  assertEquals(4, detected.size());

  Sentence sentence1 = detected.get(0);
  assertEquals("Sentence 1", sentence1.getText());
  assertEquals("".length(), sentence1.getOriginalIndex(0));

  Sentence sentence2 = detected.get(1);
  assertEquals("Sentence 2.", sentence2.getText());
  assertEquals(tagEnd, sentence2.getOriginalIndex(0));

  Sentence sentence3 = detected.get(2);
  assertEquals("Sentence 3.", sentence3.getText());
  assertEquals("Sentence 1<sent/>Sentence 2. ".length(), sentence3.getOriginalIndex(0));

  Sentence sentence4 = detected.get(3);
  assertEquals("Sentence 4.", sentence4.getText());
  assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3. ".length(), sentence4.getOriginalIndex(0));

  // An annotation added to a sentence must be added to the original raw text as well.
  List<Annotation<String>> sentenceAnnotations = new ArrayList<>();
  sentenceAnnotations.add(new Annotation<String>("Sentence ".length(), "Sentence 4".length(), "four", labels));
  sentence4.addAnnotations(sentenceAnnotations);

  List<Annotation<String>> rawAnnotations = rawText.getAnnotations(String.class);
  assertEquals(1, rawAnnotations.size());
  Annotation<String> four = rawAnnotations.get(0);
  assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3. Sentence ".length(), four.getStart());
  assertEquals("Sentence 1<sent/>Sentence 2. Sentence 3. Sentence 4".length(), four.getEnd());
}
use of com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker in project talismane by joliciel-informatique.
the class RegexMarkerFilterTest method testApply.
/**
 * Applies a SKIP regex annotator to a text containing two {@code <skip>...</skip>} spans and
 * checks that each span receives a skip annotation with the expected start and end offsets.
 */
@Test
public void testApply() throws Exception {
  RawTextRegexAnnotator filter = new RawTextRegexAnnotator(RawTextMarkType.SKIP, "<skip>.*?</skip>", 0, 1000);
  AnnotatedText text = new AnnotatedText("J'ai du <skip>skip me</skip>mal à le croire.<skip>skip me</skip>");
  filter.annotate(text);
  LOG.debug(text.getAnnotations().toString());

  List<Annotation<RawTextSkipMarker>> skips = text.getAnnotations(RawTextSkipMarker.class);
  assertEquals(2, skips.size());

  // Both annotations are checked by index. The previous counter loop compared the second
  // one against i == 2, a value the counter never reached, so its offsets went unverified.
  // NOTE(review): like the original loop, this assumes annotations are returned in text order.
  Annotation<RawTextSkipMarker> firstSkip = skips.get(0);
  assertEquals("J'ai du ".length(), firstSkip.getStart());
  assertEquals("J'ai du <skip>skip me</skip>".length(), firstSkip.getEnd());

  Annotation<RawTextSkipMarker> secondSkip = skips.get(1);
  assertEquals("J'ai du <skip>skip me</skip>mal à le croire.".length(), secondSkip.getStart());
  assertEquals("J'ai du <skip>skip me</skip>mal à le croire.<skip>skip me</skip>".length(), secondSkip.getEnd());
}
use of com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker in project talismane by joliciel-informatique.
the class RollingTextBlockTest method testGetRawTextBlock.
/**
 * Rolls four sub-blocks through a {@link RollingTextBlock}, adds two skip annotations to the
 * raw text block, and checks the offsets of the skip annotations read back from the rolling
 * block itself.
 */
@Test
public void testGetRawTextBlock() throws Exception {
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  final Config config = ConfigFactory.load();
  final String sessionId = "test";
  final String[] labels = {};

  RollingTextBlock rollingBlock = new RollingTextBlock(true, null, sessionId);
  final String[] subBlocks = { "1 ", "2 ", "3<skip>skip</skip> 4<sk", "ip>skip</skip> 5" };
  for (String subBlock : subBlocks) {
    rollingBlock = rollingBlock.roll(subBlock);
  }

  // The raw text block always contains the last two added sub-blocks,
  // so the offsets below are relative to those two sub-blocks.
  AnnotatedText rawBlock = rollingBlock.getRawTextBlock();
  final int firstSkipStart = "3".length();
  final int firstSkipEnd = "3<skip>skip</skip>".length();
  final int secondSkipStart = "3<skip>skip</skip> 4".length();
  final int secondSkipEnd = "3<skip>skip</skip> 4<skip>skip</skip>".length();

  List<Annotation<RawTextSkipMarker>> localSkips = new ArrayList<>();
  localSkips.add(new Annotation<>(firstSkipStart, firstSkipEnd, new RawTextSkipMarker("me"), labels));
  localSkips.add(new Annotation<>(secondSkipStart, secondSkipEnd, new RawTextSkipMarker("me"), labels));
  rawBlock.addAnnotations(localSkips);

  List<Annotation<RawTextSkipMarker>> sourceSkips = rollingBlock.getAnnotations(RawTextSkipMarker.class);
  System.out.println(sourceSkips.toString());
  assertEquals(2, sourceSkips.size());

  // Expected offsets additionally account for sub-blocks "1 " and "2 ".
  Annotation<RawTextSkipMarker> firstSkip = sourceSkips.get(0);
  assertEquals("1 2 3".length(), firstSkip.getStart());
  assertEquals("1 2 3<skip>skip</skip>".length(), firstSkip.getEnd());

  Annotation<RawTextSkipMarker> secondSkip = sourceSkips.get(1);
  assertEquals("1 2 3<skip>skip</skip> 4".length(), secondSkip.getStart());
  assertEquals("1 2 3<skip>skip</skip> 4<skip>skip</skip>".length(), secondSkip.getEnd());
}
use of com.joliciel.talismane.rawText.RawTextMarker.RawTextSkipMarker in project talismane by joliciel-informatique.
the class RollingTextBlockTest method testGetProcessedTextBlock.
/**
 * Rolls sub-blocks through a {@link RollingTextBlock}, marks two spans to skip and one span to
 * replace on successive raw text blocks, and checks the resulting processed text.
 */
@Test
public void testGetProcessedTextBlock() throws Exception {
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  final Config config = ConfigFactory.load();
  final String sessionId = "test";
  final String[] labels = {};

  RollingTextBlock rollingBlock = new RollingTextBlock(true, null, sessionId);
  final String[] initialSubBlocks = { "1 ", "2 ", "3<skip>skip</skip> 4<sk", "ip>skip</skip> five" };
  for (String subBlock : initialSubBlocks) {
    rollingBlock = rollingBlock.roll(subBlock);
  }

  // The raw text block always contains the last two added sub-blocks,
  // so the offsets below are relative to those two sub-blocks.
  AnnotatedText rawBlock = rollingBlock.getRawTextBlock();
  final int firstSkipStart = "3".length();
  final int firstSkipEnd = "3<skip>skip</skip>".length();
  final int secondSkipStart = "3<skip>skip</skip> 4".length();
  final int secondSkipEnd = "3<skip>skip</skip> 4<skip>skip</skip>".length();

  List<Annotation<RawTextSkipMarker>> skipMarkers = new ArrayList<>();
  skipMarkers.add(new Annotation<>(firstSkipStart, firstSkipEnd, new RawTextSkipMarker("me"), labels));
  skipMarkers.add(new Annotation<>(secondSkipStart, secondSkipEnd, new RawTextSkipMarker("me"), labels));
  rawBlock.addAnnotations(skipMarkers);

  // Roll once more, then replace "five" with "5" in the new raw text block.
  rollingBlock = rollingBlock.roll(" 6");
  rawBlock = rollingBlock.getRawTextBlock();
  final int replaceStart = "ip>skip</skip> ".length();
  final int replaceEnd = "ip>skip</skip> five".length();

  List<Annotation<RawTextReplaceMarker>> replaceMarkers = new ArrayList<>();
  replaceMarkers.add(new Annotation<>(replaceStart, replaceEnd, new RawTextReplaceMarker("me", "5"), labels));
  rawBlock.addAnnotations(replaceMarkers);

  // The processed text always concerns sub-blocks 1, 2 and 3;
  // at this point, sub-block 1 has already been flushed.
  AnnotatedText processed = rollingBlock.getProcessedText();
  assertEquals("2 3 4 5", processed.getText());
}
Aggregations