Use of com.joliciel.talismane.Annotation in the talismane project by joliciel-informatique.
Class RollingTextBlockTest, method testNoSentenceAnnotationLocation.
/**
 * Verifies where "no sentence break" markers end up in the processed text
 * when raw text is fed to a {@link RollingTextBlock} piece by piece.
 * <p>
 * The pieces rolled in spell out
 * {@code I see Mr. Jones and <skip/>Mrs. Smith.}; the test adds a skip marker
 * over {@code <skip/>} and asserts the offsets of the markers placed over
 * {@code Mr.} and {@code Mrs.} in the processed text.
 */
@Test
public void testNoSentenceAnnotationLocation() throws Exception {
    // Point the configuration at the test settings and reload it; the
    // returned Config object itself is not used in this test.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    ConfigFactory.load();

    final String sessionId = "test";
    final String[] noLabels = new String[0];

    RollingTextBlock block = new RollingTextBlock(true, null, sessionId);
    block = block.roll("I see ");
    block = block.roll("Mr. Jones ");
    block = block.roll("and <sk");

    AnnotatedText raw = block.getRawTextBlock();
    System.out.println("rawText text: " + raw.getText());

    // First marker: from offset 0 to the length of "Mr." in the raw text block.
    System.out.println("we add no sentence break annotations (as if they were added by a filter)");
    final List<Annotation<RawTextNoSentenceBreakMarker>> mrMarkers = new ArrayList<>();
    final int mrStart = "".length();
    final int mrEnd = "Mr.".length();
    mrMarkers.add(new Annotation<>(mrStart, mrEnd, new RawTextNoSentenceBreakMarker("me"), noLabels));
    raw.addAnnotations(mrMarkers);
    System.out.println("textBlock text: " + block.getText());
    System.out.println("textBlock annotations: " + block.getAnnotations().toString());

    // Roll in the remainder of the skip tag and mark the whole tag as skipped.
    block = block.roll("ip/>Mrs.");
    raw = block.getRawTextBlock();
    System.out.println("rawText text: " + raw.getText());
    final List<Annotation<RawTextSkipMarker>> skipMarkers = new ArrayList<>();
    final int skipStart = "and ".length();
    final int skipEnd = "and <skip/>".length();
    skipMarkers.add(new Annotation<>(skipStart, skipEnd, new RawTextSkipMarker("me"), noLabels));
    raw.addAnnotations(skipMarkers);

    AnnotatedText processed = block.getProcessedText();
    assertEquals("I see Mr. Jones and ", processed.getText());

    // The "Mr." marker must be located after "I see " in the processed text.
    List<Annotation<RawTextNoSentenceBreakMarker>> found = processed.getAnnotations(RawTextNoSentenceBreakMarker.class);
    System.out.println("Processed annotations: " + found);
    assertEquals(1, found.size());
    final Annotation<RawTextNoSentenceBreakMarker> mrFound = found.get(0);
    assertEquals("I see ".length(), mrFound.getStart());
    assertEquals("I see Mr.".length(), mrFound.getEnd());

    // Second marker: placed over "Mrs." in the raw text, after the tail of the skip tag.
    block = block.roll(" Smith.");
    raw = block.getRawTextBlock();
    System.out.println("rawText text: " + raw.getText());
    final List<Annotation<RawTextNoSentenceBreakMarker>> mrsMarkers = new ArrayList<>();
    final int mrsStart = "ip/>".length();
    final int mrsEnd = "ip/>Mrs.".length();
    mrsMarkers.add(new Annotation<>(mrsStart, mrsEnd, new RawTextNoSentenceBreakMarker("me"), noLabels));
    raw.addAnnotations(mrsMarkers);
    System.out.println("textBlock text: " + block.getText());
    System.out.println("textBlock annotations: " + block.getAnnotations().toString());

    block = block.roll("");
    System.out.println("textBlock text: " + block.getText());
    System.out.println("textBlock annotations: " + block.getAnnotations().toString());

    processed = block.getProcessedText();
    assertEquals("and Mrs. Smith.", processed.getText());

    // The "Mrs." marker must be located after "and " in the processed text.
    found = processed.getAnnotations(RawTextNoSentenceBreakMarker.class);
    System.out.println("Processed annotations: " + found);
    assertEquals(1, found.size());
    final Annotation<RawTextNoSentenceBreakMarker> mrsFound = found.get(0);
    assertEquals("and ".length(), mrsFound.getStart());
    assertEquals("and Mrs.".length(), mrsFound.getEnd());
}
Use of com.joliciel.talismane.Annotation in the talismane project by joliciel-informatique.
Class SimpleTokeniserTest, method testTokenise.
/**
 * Tokenises a sentence containing a URL that has been annotated with a
 * {@link TokenPlaceholder}, then asserts both the resulting token texts and
 * the {@link TokenBoundary} annotations found on the sentence afterwards.
 */
@Test
public void testTokenise() throws Exception {
    // Point the configuration at the test settings and reload it; the
    // returned Config object itself is not used in this test.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    ConfigFactory.load();

    final String sessionId = "test";
    final String[] noLabels = new String[0];
    final Sentence sentence = new Sentence("Click http://www.blah-di-blah.com now", sessionId);

    // Placeholder annotation spanning the URL, with replacement text "URL".
    final int urlStart = "Click ".length();
    final int urlEnd = "Click http://www.blah-di-blah.com".length();
    final List<Annotation<TokenPlaceholder>> placeholders = new ArrayList<>();
    placeholders.add(new Annotation<TokenPlaceholder>(urlStart, urlEnd, new TokenPlaceholder("URL", ""), noLabels));
    sentence.addAnnotations(placeholders);

    final SimpleTokeniser tokeniser = new SimpleTokeniser(sessionId);
    final TokenSequence tokens = tokeniser.tokeniseSentence(sentence);
    System.out.println(tokens.toString());

    final String[] expectedTexts = { "Click", "URL", "now" };
    assertEquals(3, tokens.size());
    for (int i = 0; i < expectedTexts.length; i++) {
        assertEquals(expectedTexts[i], tokens.get(i).getAnalyisText());
    }

    final List<Annotation<TokenBoundary>> boundaries = sentence.getAnnotations(TokenBoundary.class);
    assertEquals(3, boundaries.size());

    final Annotation<TokenBoundary> clickBoundary = boundaries.get(0);
    assertEquals("".length(), clickBoundary.getStart());
    assertEquals("Click".length(), clickBoundary.getEnd());
    assertEquals("Click", clickBoundary.getData().getAnalysisText());

    final Annotation<TokenBoundary> urlBoundary = boundaries.get(1);
    assertEquals(urlStart, urlBoundary.getStart());
    assertEquals("URL", urlBoundary.getData().getAnalysisText());
    assertEquals(urlEnd, urlBoundary.getEnd());

    final Annotation<TokenBoundary> nowBoundary = boundaries.get(2);
    assertEquals("Click http://www.blah-di-blah.com ".length(), nowBoundary.getStart());
    assertEquals("Click http://www.blah-di-blah.com now".length(), nowBoundary.getEnd());
    assertEquals("now", nowBoundary.getData().getAnalysisText());
}
Use of com.joliciel.talismane.Annotation in the talismane project by joliciel-informatique.
Class TokenSequenceTest, method testTokeniseSentenceWithPlaceholders.
/**
 * Builds a {@link TokenSequence} over a sentence whose e-mail address and URL
 * carry {@link TokenPlaceholder} annotations, runs
 * {@code findDefaultTokens()}, and asserts the text and separator flag of
 * every token, both in the list including white space and in the sequence
 * itself.
 */
@Test
public void testTokeniseSentenceWithPlaceholders() throws Exception {
    // Point the configuration at the test settings and reload it; the
    // returned Config object itself is not used in this test.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    ConfigFactory.load();

    final String sessionId = "test";
    final String[] noLabels = new String[0];
    final Sentence sentence = new Sentence("Write to me at joe.schome@test.com, otherwise go to http://test.com.", sessionId);

    // Placeholder over the e-mail address, replaced by "Email".
    final int emailStart = "Write to me at ".length();
    final int emailEnd = "Write to me at joe.schome@test.com".length();
    final Annotation<TokenPlaceholder> emailPlaceholder = new Annotation<>(emailStart, emailEnd, new TokenPlaceholder("Email", "blah"), noLabels);

    // Placeholder over the URL (excluding the final full stop), replaced by "URL".
    final int urlStart = "Write to me at joe.schome@test.com, otherwise go to ".length();
    final int urlEnd = "Write to me at joe.schome@test.com, otherwise go to http://test.com".length();
    final Annotation<TokenPlaceholder> urlPlaceholder = new Annotation<>(urlStart, urlEnd, new TokenPlaceholder("URL", "blah"), noLabels);

    final List<Annotation<TokenPlaceholder>> placeholders = new ArrayList<>();
    placeholders.add(emailPlaceholder);
    placeholders.add(urlPlaceholder);
    sentence.addAnnotations(placeholders);

    final TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
    tokenSequence.findDefaultTokens();

    // Expected tokens when white space is included, with their separator flags.
    final String[] textsWithWhiteSpace = {
        "Write", " ", "to", " ", "me", " ", "at", " ", "Email", ",", " ",
        "otherwise", " ", "go", " ", "to", " ", "URL", "."
    };
    final boolean[] separatorsWithWhiteSpace = {
        false, true, false, true, false, true, false, true, false, true, true,
        false, true, false, true, false, true, false, true
    };

    // Expected tokens of the sequence itself, with their separator flags.
    final String[] texts = {
        "Write", "to", "me", "at", "Email", ",", "otherwise", "go", "to", "URL", "."
    };
    final boolean[] separators = {
        false, false, false, false, false, true, false, false, false, false, true
    };

    assertEquals(19, tokenSequence.listWithWhiteSpace().size());
    assertEquals(11, tokenSequence.size());

    int position = 0;
    for (Token token : tokenSequence.listWithWhiteSpace()) {
        // Only positions covered by the table are checked.
        if (position < textsWithWhiteSpace.length) {
            assertEquals(textsWithWhiteSpace[position], token.getAnalyisText());
            assertEquals(separatorsWithWhiteSpace[position], token.isSeparator());
        }
        position++;
    }

    position = 0;
    for (Token token : tokenSequence) {
        // Only positions covered by the table are checked.
        if (position < texts.length) {
            assertEquals(texts[position], token.getAnalyisText());
            assertEquals(separators[position], token.isSeparator());
        }
        position++;
    }
}
Use of com.joliciel.talismane.Annotation in the talismane project by joliciel-informatique.
Class PatternTokeniserTest, method testTokenise.
/**
 * Runs a {@link PatternTokeniser} configured with two tokeniser patterns over
 * a French sentence whose URL carries a {@link TokenPlaceholder}, and asserts
 * the text of each token in the first returned sequence.
 */
@Test
public void testTokenise() throws Exception {
    // Point the configuration at the test settings and reload it; the
    // returned Config object itself is not used in this test.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    ConfigFactory.load();

    final String sessionId = "test";
    final String[] noLabels = new String[0];
    final Sentence sentence = new Sentence("Je n'ai pas l'ourang-outan sur www.google.com.", sessionId);

    // Placeholder over the URL (excluding the final full stop), replaced by "URL".
    final int urlStart = "Je n'ai pas l'ourang-outan sur ".length();
    final int urlEnd = "Je n'ai pas l'ourang-outan sur www.google.com".length();
    final List<Annotation<TokenPlaceholder>> placeholders = new ArrayList<>();
    placeholders.add(new Annotation<TokenPlaceholder>(urlStart, urlEnd, new TokenPlaceholder("URL", ""), noLabels));
    sentence.addAnnotations(placeholders);

    final List<String> patternLines = new ArrayList<String>();
    patternLines.add("IS_NOT_SEPARATOR -_");
    patternLines.add("IS_SEPARATOR_AFTER '");
    final TokeniserPatternManager patternManager = new TokeniserPatternManager(patternLines, sessionId);
    final PatternTokeniser tokeniser = new PatternTokeniser(null, patternManager, null, 1, sessionId);

    final List<TokenSequence> candidates = tokeniser.tokenise(sentence);
    final TokenSequence tokenSequence = candidates.get(0);
    LOG.debug(tokenSequence.toString());

    final String[] expectedTexts = {
        "Je", "n'", "ai", "pas", "l'", "ourang-outan", "sur", "URL", "."
    };
    assertEquals(9, tokenSequence.size());

    int position = 0;
    for (Token token : tokenSequence) {
        // Only positions covered by the table are checked.
        if (position < expectedTexts.length) {
            assertEquals(expectedTexts[position], token.getAnalyisText());
        }
        position++;
    }
}
Use of com.joliciel.talismane.Annotation in the talismane project by joliciel-informatique.
Class RegexTokenAnnotatorTest, method testApplyWithDollars.
/**
 * Applies a {@link RegexTokenAnnotator} whose replacement string contains an
 * escaped dollar sign plus group references, and asserts the span and the
 * replacement text of the single {@link TokenPlaceholder} it produces.
 */
@Test
public void testApplyWithDollars() throws Exception {
    // Point the configuration at the test settings and reload it; the
    // returned Config object itself is not used in this test.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    ConfigFactory.load();

    final String sessionId = "test";
    final String emailRegex = "\\b([\\w.%-]+)(@[-.\\w]+\\.[A-Za-z]{2,4})\\b";
    // "\\$" is an escaped dollar sign; $2 and $1 refer to the regex groups.
    final String replacement = "\\$Email$2:$1";
    final RegexTokenAnnotator annotator = new RegexTokenAnnotator(emailRegex, replacement, null, sessionId);

    final Sentence sentence = new Sentence("My address is joe.schmoe@test.com.", sessionId);
    annotator.annotate(sentence);

    final List<Annotation<TokenPlaceholder>> found = sentence.getAnnotations(TokenPlaceholder.class);
    LOG.debug(found.toString());
    assertEquals(1, found.size());

    // Expected span: the e-mail address, i.e. offsets 14 to 33.
    final int emailStart = "My address is ".length();
    final int emailEnd = "My address is joe.schmoe@test.com".length();
    final Annotation<TokenPlaceholder> emailPlaceholder = found.get(0);
    assertEquals(emailStart, emailPlaceholder.getStart());
    assertEquals(emailEnd, emailPlaceholder.getEnd());
    assertEquals("$Email@test.com:joe.schmoe", emailPlaceholder.getData().getReplacement());
}
Aggregations