Use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
From class TokenisedAtomicTokenSequenceTest, method testGetTokenSequence.
/**
 * Checks that {@code TokenisedAtomicTokenSequence.inferTokenSequence()} rebuilds the expected
 * tokens from one SEPARATE/JOIN decision per atomic token.
 *
 * <p>The sentence is first cut into default tokens (white space included); each of them is
 * tagged with the decision at the same position in the table below. The test expects tokens
 * tagged JOIN to end up merged with what precedes them, giving seven tokens:
 * {@code Je | n' | ai | pas encore | l' | ourang-outan | .}
 *
 * @throws Exception if the set-up or the tokenisation fails
 */
@Test
public void testGetTokenSequence() throws Exception {
    // Point the configuration library at the test configuration and force a reload.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The returned Config is not referenced by this test; the call itself is kept as part of
    // the set-up.
    ConfigFactory.load();

    final String sessionId = "test";
    final Sentence sentence = new Sentence("Je n'ai pas encore l'ourang-outan.", sessionId);

    // One decision per atomic token, in sentence order; "_" stands for a white space token.
    final TokeniserOutcome[] decisions = {
        TokeniserOutcome.SEPARATE, // Je
        TokeniserOutcome.SEPARATE, // _
        TokeniserOutcome.SEPARATE, // n
        TokeniserOutcome.JOIN, // '
        TokeniserOutcome.SEPARATE, // ai
        TokeniserOutcome.SEPARATE, // _
        TokeniserOutcome.SEPARATE, // pas
        TokeniserOutcome.JOIN, // _
        TokeniserOutcome.JOIN, // encore
        TokeniserOutcome.SEPARATE, // _
        TokeniserOutcome.SEPARATE, // l
        TokeniserOutcome.JOIN, // '
        TokeniserOutcome.SEPARATE, // ourang
        TokeniserOutcome.JOIN, // -
        TokeniserOutcome.JOIN, // outan
        TokeniserOutcome.SEPARATE // .
    };

    TokenisedAtomicTokenSequence atomicSequence = new TokenisedAtomicTokenSequence(sentence, sessionId);
    TokenSequence defaultTokens = new TokenSequence(sentence, sessionId);
    defaultTokens.findDefaultTokens();

    // Tag every atomic token with its decision.
    int position = 0;
    for (Token atomicToken : defaultTokens.listWithWhiteSpace()) {
        final TokeniserOutcome outcome = decisions[position];
        position++;
        Decision decision = new Decision(outcome.name());
        TaggedToken<TokeniserOutcome> tagged = new TaggedToken<>(atomicToken, decision,
                TokeniserOutcome.valueOf(decision.getOutcome()));
        atomicSequence.add(tagged);
    }

    TokenSequence inferred = atomicSequence.inferTokenSequence();
    LOG.debug(inferred.toString());

    // Compare the leading tokens against the expected texts; the total count is checked last.
    final String[] expectedTexts = { "Je", "n'", "ai", "pas encore", "l'", "ourang-outan", "." };
    int index = 0;
    for (Token token : inferred) {
        if (index < expectedTexts.length) {
            assertEquals(expectedTexts[index], token.getAnalyisText());
        }
        index++;
    }
    assertEquals(7, inferred.size());
}
Use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
From class PatternTokeniserTest, method testTokenise.
/**
 * Tokenises a sentence containing a URL with a {@code PatternTokeniser} and checks the tokens
 * of the first returned sequence.
 *
 * <p>Before tokenising, the span covering {@code www.google.com} is annotated with a
 * {@code TokenPlaceholder} built from {@code "URL"} and an empty string; the test expects that
 * span to appear as a single token whose analysis text is {@code "URL"}. Nine tokens are
 * expected in total: {@code Je | n' | ai | pas | l' | ourang-outan | sur | URL | .}
 *
 * @throws Exception if the set-up or the tokenisation fails
 */
@Test
public void testTokenise() throws Exception {
    // Point the configuration library at the test configuration and force a reload.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The returned Config is not referenced by this test; the call itself is kept as part of
    // the set-up.
    ConfigFactory.load();

    final String sessionId = "test";
    final String[] labels = new String[0];
    final Sentence sentence = new Sentence("Je n'ai pas l'ourang-outan sur www.google.com.", sessionId);

    // Character offsets of "www.google.com" inside the sentence (final full stop excluded).
    final int urlStart = "Je n'ai pas l'ourang-outan sur ".length();
    final int urlEnd = "Je n'ai pas l'ourang-outan sur www.google.com".length();
    Annotation<TokenPlaceholder> urlAnnotation = new Annotation<>(urlStart, urlEnd, new TokenPlaceholder("URL", ""), labels);

    List<Annotation<TokenPlaceholder>> placeholders = new ArrayList<>();
    placeholders.add(urlAnnotation);
    sentence.addAnnotations(placeholders);

    List<String> patternDescriptors = new ArrayList<String>();
    patternDescriptors.add("IS_NOT_SEPARATOR -_");
    patternDescriptors.add("IS_SEPARATOR_AFTER '");
    TokeniserPatternManager patternManager = new TokeniserPatternManager(patternDescriptors, sessionId);

    PatternTokeniser tokeniser = new PatternTokeniser(null, patternManager, null, 1, sessionId);
    List<TokenSequence> results = tokeniser.tokenise(sentence);
    TokenSequence firstResult = results.get(0);
    LOG.debug(firstResult.toString());

    assertEquals(9, firstResult.size());

    final String[] expectedTexts = { "Je", "n'", "ai", "pas", "l'", "ourang-outan", "sur", "URL", "." };
    int index = 0;
    for (Token token : firstResult) {
        if (index < expectedTexts.length) {
            assertEquals(expectedTexts[index], token.getAnalyisText());
        }
        index++;
    }
}
Use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
From class TokenPatternTest, method testMatch3.
/**
 * Matches a {@code TokenPattern} against the default tokens of {@code "Z'ensuite il aille..."}.
 *
 * <p>The regular expression uses a negative lookahead to reject a fixed list of prefixes
 * (the single letters c, d, j, l, m, n, s, t in either case, plus qu, jusqu, puisqu, lorsqu,
 * aujourd, prud, quelqu, quoiqu) before the apostrophe. The test expects exactly one match,
 * made of three tokens, the first of which has the original text {@code "Z"}.
 *
 * @throws Exception if the set-up or the matching fails
 */
@Test
public void testMatch3() throws Exception {
    // Point the configuration library at the test configuration and force a reload.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The returned Config is not referenced by this test; the call itself is kept as part of
    // the set-up.
    ConfigFactory.load();

    final String sessionId = "test";
    final Sentence sentence = new Sentence("Z'ensuite il aille...", sessionId);

    TokenSequence defaultTokens = new TokenSequence(sentence, sessionId);
    defaultTokens.findDefaultTokens();

    final String patternText = "{(?![cdjlmnstCDJLMNST]\\z|qu\\z|jusqu\\z|puisqu\\z|lorsqu\\z|aujourd\\z|prud\\z|quelqu\\z|quoiqu\\z).+'}.+";
    TokenPattern pattern = new TokenPattern(patternText, Tokeniser.getTokenSeparators(sessionId));

    List<TokenPatternMatchSequence> matches = pattern.match(defaultTokens);
    assertEquals(1, matches.size());

    TokenPatternMatchSequence onlyMatch = matches.get(0);
    assertEquals(3, onlyMatch.getTokenSequence().size());
    assertEquals("Z", onlyMatch.getTokenSequence().get(0).getOriginalText());
}
Use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
From class RegexTokenAnnotatorTest, method testApplyWithDollars.
/**
 * Checks how a {@code RegexTokenAnnotator} handles dollar signs in its replacement string.
 *
 * <p>The regex captures the local part of an e-mail address as group 1 and the
 * {@code @domain} part as group 2. The replacement {@code \$Email$2:$1} mixes an escaped
 * dollar sign with two group references; the test expects the resulting placeholder
 * replacement to be {@code $Email@test.com:joe.schmoe}, spanning offsets 14 to 33 of the
 * sentence (the address without the final full stop).
 *
 * @throws Exception if the set-up or the annotation fails
 */
@Test
public void testApplyWithDollars() throws Exception {
    // Point the configuration library at the test configuration and force a reload.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The returned Config is not referenced by this test; the call itself is kept as part of
    // the set-up.
    ConfigFactory.load();

    final String sessionId = "test";
    final String emailRegex = "\\b([\\w.%-]+)(@[-.\\w]+\\.[A-Za-z]{2,4})\\b";
    final String replacementTemplate = "\\$Email$2:$1";
    RegexTokenAnnotator annotator = new RegexTokenAnnotator(emailRegex, replacementTemplate, null, sessionId);

    Sentence sentence = new Sentence("My address is joe.schmoe@test.com.", sessionId);
    annotator.annotate(sentence);

    List<Annotation<TokenPlaceholder>> found = sentence.getAnnotations(TokenPlaceholder.class);
    LOG.debug(found.toString());
    assertEquals(1, found.size());

    Annotation<TokenPlaceholder> emailPlaceholder = found.get(0);
    assertEquals(14, emailPlaceholder.getStart());
    assertEquals(33, emailPlaceholder.getEnd());
    assertEquals("$Email@test.com:joe.schmoe", emailPlaceholder.getData().getReplacement());
}
Use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
From class RegexTokenAnnotatorTest, method testPuctuation.
/**
 * Checks that a {@code RegexTokenAnnotator} carrying a {@code featureType} attribute and no
 * replacement string annotates punctuation with that attribute.
 *
 * <p>The regex matches runs of Unicode punctuation other than {@code % $ # @ § ¶ ‰ ‱}. For
 * {@code "Bonjour. Comment ça va?"} the test expects two {@code TokenAttribute} annotations,
 * the first covering the full stop right after "Bonjour", with key {@code featureType} and
 * value {@code punctuation}.
 *
 * @throws Exception if the set-up or the annotation fails
 */
@Test
public void testPuctuation() throws Exception {
    // Point the configuration library at the test configuration and force a reload.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The returned Config is not referenced by this test; the call itself is kept as part of
    // the set-up.
    ConfigFactory.load();

    final String sessionId = "test";
    final String punctuationRegex = "[\\p{IsPunctuation}&&[^%$#@§¶‰‱]]+";
    // Kept as a String-typed variable so the same constructor is selected as with a real
    // replacement string.
    final String noReplacement = null;
    RegexTokenAnnotator annotator = new RegexTokenAnnotator(punctuationRegex, noReplacement, null, sessionId);
    annotator.addAttribute("featureType", new StringAttribute("featureType", "punctuation"));

    Sentence sentence = new Sentence("Bonjour. Comment ça va?", sessionId);
    annotator.annotate(sentence);

    @SuppressWarnings("rawtypes")
    List<Annotation<TokenAttribute>> attributes = sentence.getAnnotations(TokenAttribute.class);
    LOG.debug(attributes.toString());
    assertEquals(2, attributes.size());

    @SuppressWarnings("rawtypes")
    Annotation<TokenAttribute> firstAttribute = attributes.get(0);
    assertEquals("Bonjour".length(), firstAttribute.getStart());
    assertEquals("Bonjour.".length(), firstAttribute.getEnd());
    assertEquals("featureType", firstAttribute.getData().getKey());
    assertEquals("punctuation", firstAttribute.getData().getValue());
}
Aggregations