use of com.joliciel.talismane.sentenceAnnotators.TokenPlaceholder in project talismane by joliciel-informatique.
the class TokenSequence method findDefaultTokens.
/**
 * Builds the default tokens for the underlying sentence, unless they have
 * already been built by a previous call.
 * <p>
 * The sentence text is split at every position where the pattern returned by
 * {@link Tokeniser#getTokenSeparators(String)} matches: each such position
 * becomes a one-character separator token, and the text between two
 * consecutive split points becomes an ordinary token. Wherever a
 * {@link TokenPlaceholder} annotation starts, a single token covering the
 * whole annotation span is added instead, and scanning resumes at the end of
 * the annotation. If the placeholder has a non-null replacement, it is used as
 * the token text; the placeholder token is flagged as a separator when its
 * text matches the separator pattern in full.
 */
public void findDefaultTokens() {
  if (defaultTokensFound) {
    return;
  }

  final CharSequence text = sentence.getText();
  final Pattern separatorPattern = Tokeniser.getTokenSeparators(sessionId);

  // Remember the start offset of every separator match in the sentence text.
  final Set<Integer> separatorStarts = new HashSet<>();
  final Matcher separatorMatcher = separatorPattern.matcher(text);
  while (separatorMatcher.find()) {
    separatorStarts.add(separatorMatcher.start());
  }

  // Start of the text that has been scanned but not yet turned into a token.
  int pendingStart = 0;
  int pos = 0;
  while (pos < text.length()) {
    if (placeholderMap.containsKey(pos)) {
      // Flush any ordinary text preceding the placeholder.
      if (pendingStart < pos) {
        this.addToken(pendingStart, pos);
      }
      final Annotation<TokenPlaceholder> placeholder = placeholderMap.get(pos);
      final Token placeholderToken = this.addToken(placeholder.getStart(), placeholder.getEnd());
      if (placeholder.getData().getReplacement() != null) {
        placeholderToken.setText(placeholder.getData().getReplacement());
      }
      if (separatorPattern.matcher(placeholderToken.getText()).matches()) {
        placeholderToken.setSeparator(true);
      }
      // Resume scanning immediately after the placeholder.
      pendingStart = placeholder.getEnd();
      pos = placeholder.getEnd();
      continue;
    }

    if (separatorStarts.contains(pos)) {
      // Flush any ordinary text preceding the separator.
      if (pendingStart < pos) {
        this.addToken(pendingStart, pos);
      }
      // Separators are added as single-character tokens.
      final Token separatorToken = this.addToken(pos, pos + 1);
      separatorToken.setSeparator(true);
      pendingStart = pos + 1;
    }
    pos++;
  }

  // Whatever remains after the last split point forms the final token.
  if (pendingStart < text.length()) {
    this.addToken(pendingStart, text.length());
  }
  this.defaultTokensFound = true;
}
use of com.joliciel.talismane.sentenceAnnotators.TokenPlaceholder in project talismane by joliciel-informatique.
the class SimpleTokeniserTest method testTokenise.
/**
 * Tokenises a short sentence containing a URL that is covered by a
 * {@link TokenPlaceholder}, and checks both the resulting token texts and the
 * {@code TokenBoundary} annotations found on the sentence afterwards.
 */
@Test
public void testTokenise() throws Exception {
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  final Config config = ConfigFactory.load();

  final String sessionId = "test";
  final String[] labels = new String[0];

  final Sentence sentence = new Sentence("Click http://www.blah-di-blah.com now", sessionId);

  // Character offsets of the three expected tokens within the sentence.
  final int clickStart = "".length();
  final int clickEnd = "Click".length();
  final int urlStart = "Click ".length();
  final int urlEnd = "Click http://www.blah-di-blah.com".length();
  final int nowStart = "Click http://www.blah-di-blah.com ".length();
  final int nowEnd = "Click http://www.blah-di-blah.com now".length();

  // Cover the URL with a placeholder.
  final List<Annotation<TokenPlaceholder>> placeholders = new ArrayList<>();
  final Annotation<TokenPlaceholder> urlPlaceholder = new Annotation<>(urlStart, urlEnd, new TokenPlaceholder("URL", ""), labels);
  placeholders.add(urlPlaceholder);
  sentence.addAnnotations(placeholders);

  final SimpleTokeniser tokeniser = new SimpleTokeniser(sessionId);
  final TokenSequence tokenSequence = tokeniser.tokeniseSentence(sentence);
  System.out.println(tokenSequence.toString());

  // Token texts, in order.
  assertEquals(3, tokenSequence.size());
  final String[] expectedTexts = { "Click", "URL", "now" };
  for (int index = 0; index < expectedTexts.length; index++) {
    assertEquals(expectedTexts[index], tokenSequence.get(index).getAnalyisText());
  }

  // Token boundary annotations on the sentence.
  final List<Annotation<TokenBoundary>> boundaries = sentence.getAnnotations(TokenBoundary.class);
  assertEquals(3, boundaries.size());

  final Annotation<TokenBoundary> clickBoundary = boundaries.get(0);
  assertEquals(clickStart, clickBoundary.getStart());
  assertEquals(clickEnd, clickBoundary.getEnd());
  assertEquals("Click", clickBoundary.getData().getAnalysisText());

  final Annotation<TokenBoundary> urlBoundary = boundaries.get(1);
  assertEquals(urlStart, urlBoundary.getStart());
  assertEquals("URL", urlBoundary.getData().getAnalysisText());
  assertEquals(urlEnd, urlBoundary.getEnd());

  final Annotation<TokenBoundary> nowBoundary = boundaries.get(2);
  assertEquals(nowStart, nowBoundary.getStart());
  assertEquals(nowEnd, nowBoundary.getEnd());
  assertEquals("now", nowBoundary.getData().getAnalysisText());
}
use of com.joliciel.talismane.sentenceAnnotators.TokenPlaceholder in project talismane by joliciel-informatique.
the class TokenSequenceTest method testTokeniseSentenceWithPlaceholders.
/**
 * Runs {@code findDefaultTokens()} on a sentence in which an e-mail address
 * and a URL are each covered by a {@link TokenPlaceholder}, then checks the
 * text and separator flag of every token, first for the list including
 * white space and then for the token sequence itself.
 */
@Test
public void testTokeniseSentenceWithPlaceholders() throws Exception {
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  final Config config = ConfigFactory.load();

  final String sessionId = "test";
  final String[] labels = new String[0];

  final Sentence sentence = new Sentence("Write to me at joe.schome@test.com, otherwise go to http://test.com.", sessionId);

  final int emailStart = "Write to me at ".length();
  final int emailEnd = "Write to me at joe.schome@test.com".length();
  final int urlStart = "Write to me at joe.schome@test.com, otherwise go to ".length();
  final int urlEnd = "Write to me at joe.schome@test.com, otherwise go to http://test.com".length();

  final List<Annotation<TokenPlaceholder>> placeholders = new ArrayList<>();
  final Annotation<TokenPlaceholder> emailPlaceholder = new Annotation<>(emailStart, emailEnd, new TokenPlaceholder("Email", "blah"), labels);
  placeholders.add(emailPlaceholder);
  final Annotation<TokenPlaceholder> urlPlaceholder = new Annotation<>(urlStart, urlEnd, new TokenPlaceholder("URL", "blah"), labels);
  placeholders.add(urlPlaceholder);
  sentence.addAnnotations(placeholders);

  final TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
  tokenSequence.findDefaultTokens();

  assertEquals(19, tokenSequence.listWithWhiteSpace().size());
  assertEquals(11, tokenSequence.size());

  // Expected tokens when white space is included.
  final String[] textsWithWhiteSpace = {
      "Write", " ", "to", " ", "me", " ", "at", " ", "Email", ",", " ",
      "otherwise", " ", "go", " ", "to", " ", "URL", "." };
  final boolean[] separatorsWithWhiteSpace = {
      false, true, false, true, false, true, false, true, false, true, true,
      false, true, false, true, false, true, false, true };

  int position = 0;
  for (Token token : tokenSequence.listWithWhiteSpace()) {
    if (position < textsWithWhiteSpace.length) {
      assertEquals(textsWithWhiteSpace[position], token.getAnalyisText());
      assertEquals(separatorsWithWhiteSpace[position], token.isSeparator());
    }
    position++;
  }

  // Expected tokens when iterating the sequence directly (no white space).
  final String[] texts = {
      "Write", "to", "me", "at", "Email", ",", "otherwise", "go", "to", "URL", "." };
  final boolean[] separators = {
      false, false, false, false, false, true, false, false, false, false, true };

  position = 0;
  for (Token token : tokenSequence) {
    if (position < texts.length) {
      assertEquals(texts[position], token.getAnalyisText());
      assertEquals(separators[position], token.isSeparator());
    }
    position++;
  }
}
use of com.joliciel.talismane.sentenceAnnotators.TokenPlaceholder in project talismane by joliciel-informatique.
the class PatternTokeniserTest method testTokenise.
/**
 * Tokenises a French sentence with a {@code PatternTokeniser} configured with
 * two tokeniser patterns and a {@link TokenPlaceholder} covering a URL, and
 * checks the text of each token in the first returned token sequence.
 */
@Test
public void testTokenise() throws Exception {
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  final Config config = ConfigFactory.load();

  final String sessionId = "test";
  final String[] labels = new String[0];

  final Sentence sentence = new Sentence("Je n'ai pas l'ourang-outan sur www.google.com.", sessionId);

  // Cover the URL with a placeholder.
  final int urlStart = "Je n'ai pas l'ourang-outan sur ".length();
  final int urlEnd = "Je n'ai pas l'ourang-outan sur www.google.com".length();
  final List<Annotation<TokenPlaceholder>> placeholders = new ArrayList<>();
  final Annotation<TokenPlaceholder> urlPlaceholder = new Annotation<>(urlStart, urlEnd, new TokenPlaceholder("URL", ""), labels);
  placeholders.add(urlPlaceholder);
  sentence.addAnnotations(placeholders);

  final List<String> patternDescriptors = new ArrayList<>();
  patternDescriptors.add("IS_NOT_SEPARATOR -_");
  patternDescriptors.add("IS_SEPARATOR_AFTER '");
  final TokeniserPatternManager patternManager = new TokeniserPatternManager(patternDescriptors, sessionId);
  final PatternTokeniser tokeniser = new PatternTokeniser(null, patternManager, null, 1, sessionId);

  final List<TokenSequence> tokenSequences = tokeniser.tokenise(sentence);
  final TokenSequence tokenSequence = tokenSequences.get(0);
  LOG.debug(tokenSequence.toString());

  final String[] expectedTexts = { "Je", "n'", "ai", "pas", "l'", "ourang-outan", "sur", "URL", "." };
  assertEquals(9, tokenSequence.size());

  int position = 0;
  for (Token token : tokenSequence) {
    if (position < expectedTexts.length) {
      assertEquals(expectedTexts[position], token.getAnalyisText());
    }
    position++;
  }
}
use of com.joliciel.talismane.sentenceAnnotators.TokenPlaceholder in project talismane by joliciel-informatique.
the class TokenSequenceTest method testTokeniseSentenceWithPlaceholdersNoSeparators.
/**
 * Runs {@code findDefaultTokens()} on a sentence carrying three
 * {@code StringAttribute} annotations and one {@link TokenPlaceholder} that
 * replaces a typographic apostrophe. Checks the text, separator flag and
 * attributes of each token in the list including white space, then the text
 * and separator flag of each token in the sequence itself.
 */
@Test
public void testTokeniseSentenceWithPlaceholdersNoSeparators() throws Exception {
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  final Config config = ConfigFactory.load();

  final String sessionId = "test";
  final String[] labels = new String[0];

  final Sentence sentence = new Sentence("Il t’aime.", sessionId);

  // Attribute annotations: two over "t’aime", one over "t’" only.
  final int verbStart = "Il ".length();
  final int verbEnd = "Il t’aime".length();
  final int objectEnd = "Il t’".length();
  final List<Annotation<StringAttribute>> attributeAnnotations = new ArrayList<>();
  final Annotation<StringAttribute> phraseAnnotation = new Annotation<>(verbStart, verbEnd, new StringAttribute("phrase", "verbal"), labels);
  attributeAnnotations.add(phraseAnnotation);
  final Annotation<StringAttribute> personAnnotation = new Annotation<>(verbStart, verbEnd, new StringAttribute("person", "3rd"), labels);
  attributeAnnotations.add(personAnnotation);
  final Annotation<StringAttribute> typeAnnotation = new Annotation<>(verbStart, objectEnd, new StringAttribute("type", "object"), labels);
  attributeAnnotations.add(typeAnnotation);

  // Placeholder over the typographic apostrophe.
  final List<Annotation<TokenPlaceholder>> placeholders = new ArrayList<>();
  final Annotation<TokenPlaceholder> apostrophePlaceholder = new Annotation<>("Il t".length(), "Il t’".length(), new TokenPlaceholder("'", "blah"), labels);
  placeholders.add(apostrophePlaceholder);

  sentence.addAnnotations(attributeAnnotations);
  sentence.addAnnotations(placeholders);

  final TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
  tokenSequence.findDefaultTokens();
  LOG.debug(tokenSequence.listWithWhiteSpace().toString());
  LOG.debug(tokenSequence.toString());

  assertEquals(6, tokenSequence.listWithWhiteSpace().size());
  assertEquals(5, tokenSequence.size());

  // First pass: tokens including white space, with their attributes.
  int position = 0;
  for (Token token : tokenSequence.listWithWhiteSpace()) {
    switch (position) {
    case 0:
      assertEquals("Il", token.getAnalyisText());
      assertEquals(false, token.isSeparator());
      assertEquals(0, token.getAttributes().size());
      break;
    case 1:
      assertEquals(" ", token.getAnalyisText());
      assertEquals(true, token.isSeparator());
      assertEquals(0, token.getAttributes().size());
      break;
    case 2:
      assertEquals("t", token.getAnalyisText());
      assertEquals(false, token.isSeparator());
      assertEquals(3, token.getAttributes().size());
      assertEquals("verbal", token.getAttributes().get("phrase").getValue());
      assertEquals("3rd", token.getAttributes().get("person").getValue());
      assertEquals("object", token.getAttributes().get("type").getValue());
      break;
    case 3:
      assertEquals("'", token.getAnalyisText());
      assertEquals(true, token.isSeparator());
      assertEquals(3, token.getAttributes().size());
      assertEquals("verbal", token.getAttributes().get("phrase").getValue());
      assertEquals("3rd", token.getAttributes().get("person").getValue());
      assertEquals("object", token.getAttributes().get("type").getValue());
      break;
    case 4:
      assertEquals("aime", token.getAnalyisText());
      assertEquals(false, token.isSeparator());
      assertEquals(2, token.getAttributes().size());
      assertEquals("verbal", token.getAttributes().get("phrase").getValue());
      assertEquals("3rd", token.getAttributes().get("person").getValue());
      break;
    case 5:
      assertEquals(".", token.getAnalyisText());
      assertEquals(true, token.isSeparator());
      assertEquals(0, token.getAttributes().size());
      break;
    default:
      break;
    }
    position++;
  }

  // Second pass: the sequence itself, which omits the white space token.
  final String[] expectedTexts = { "Il", "t", "'", "aime", "." };
  final boolean[] expectedSeparators = { false, false, true, false, true };

  position = 0;
  for (Token token : tokenSequence) {
    if (position < expectedTexts.length) {
      assertEquals(expectedTexts[position], token.getAnalyisText());
      assertEquals(expectedSeparators[position], token.isSeparator());
    }
    position++;
  }
}
Aggregations