Use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
From the class RegexTokenAnnotatorTest, method testApply.
/**
 * Checks that a {@code RegexTokenAnnotator} built from an e-mail pattern adds
 * exactly one {@code TokenPlaceholder} annotation to a sentence containing one
 * address, with the expected span and replacement text.
 */
@Test
public void testApply() throws Exception {
  // Point the configuration at the test resources and discard any cached copy.
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  // The loaded configuration object itself is not read by this test.
  ConfigFactory.load();

  final String sessionId = "test";
  final String emailPattern = "\\b[\\w.%-]+@[-.\\w]+\\.[A-Za-z]{2,4}\\b";
  final RegexTokenAnnotator annotator = new RegexTokenAnnotator(emailPattern, "Email", null, sessionId);

  final Sentence sentence = new Sentence("My address is joe.schmoe@test.com.", sessionId);
  annotator.annotate(sentence);
  LOG.debug(sentence.getAnnotations().toString());

  final List<Annotation<TokenPlaceholder>> found = sentence.getAnnotations(TokenPlaceholder.class);
  assertEquals(1, found.size());

  // Expected span is "joe.schmoe@test.com": offsets 14 to 33, excluding the final period.
  final Annotation<TokenPlaceholder> email = found.get(0);
  assertEquals(14, email.getStart());
  assertEquals(33, email.getEnd());
  assertEquals("Email", email.getData().getReplacement());
}
Use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
From the class SerializationTest, method testSerialize.
/**
 * Builds a sentence, its token sequence, pos-tag sequence, parse configuration
 * and parse tree, writes all five to an in-memory object stream, reads them
 * back, and asserts that each deserialized object equals the original.
 */
@Test
public void testSerialize() throws Exception {
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  String sessionId = "test";
  Sentence sentence = new Sentence("Il aime les pommes", sessionId);
  TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
  // Token boundaries are written as prefix lengths so the offsets stay readable.
  tokenSequence.addToken("".length(), "Il".length());
  tokenSequence.addToken("Il ".length(), "Il aime".length());
  tokenSequence.addToken("Il aime ".length(), "Il aime les".length());
  tokenSequence.addToken("Il aime les ".length(), "Il aime les pommes".length());
  PosTagSequence posTagSequence = new PosTagSequence(tokenSequence);
  posTagSequence.addPosTaggedToken(new PosTaggedToken(posTagSequence.getNextToken(), new Decision("CLS", 0.90), sessionId));
  posTagSequence.addPosTaggedToken(new PosTaggedToken(posTagSequence.getNextToken(), new Decision("V", 0.70), sessionId));
  posTagSequence.addPosTaggedToken(new PosTaggedToken(posTagSequence.getNextToken(), new Decision("DET", 0.60), sessionId));
  posTagSequence.addPosTaggedToken(new PosTaggedToken(posTagSequence.getNextToken(), new Decision("NC", 0.80), sessionId));
  posTagSequence.prependRoot();
  ParseConfiguration configuration = new ParseConfiguration(posTagSequence);
  LOG.debug(configuration.toString());
  // ROOT ... il
  new ShiftTransition().apply(configuration);
  LOG.debug("Shift -> " + configuration.toString());
  // ROOT il <- aime
  new LeftArcEagerTransition("suj").apply(configuration);
  LOG.debug("Left -> " + configuration.toString());
  // ROOT -> aime
  new RightArcEagerTransition("root").apply(configuration);
  LOG.debug("Right -> " + configuration.toString());
  // ROOT aime ... les
  new ShiftTransition().apply(configuration);
  LOG.debug("Shift -> " + configuration.toString());
  // ROOT aime les <- pommes
  new LeftArcEagerTransition("det").apply(configuration);
  LOG.debug("Left -> " + configuration.toString());
  // ROOT aime -> pommes
  new RightArcEagerTransition("obj").apply(configuration);
  LOG.debug("Right -> " + configuration.toString());
  ParseTree parseTree = new ParseTree(configuration, true);
  LOG.debug(parseTree.toString());

  // Write everything, then flush before snapshotting the bytes so that nothing
  // is left in the object stream's internal buffer; the streams are closed
  // on exit from the try block.
  final byte[] bytes;
  try (ByteArrayOutputStream bos = new ByteArrayOutputStream(); ObjectOutputStream oos = new ObjectOutputStream(bos)) {
    oos.writeObject(sentence);
    oos.writeObject(tokenSequence);
    oos.writeObject(posTagSequence);
    oos.writeObject(configuration);
    oos.writeObject(parseTree);
    oos.flush();
    bytes = bos.toByteArray();
  }

  // Read the objects back in the same order they were written.
  try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bytes))) {
    Sentence sentence2 = (Sentence) ois.readObject();
    TokenSequence tokenSequence2 = (TokenSequence) ois.readObject();
    PosTagSequence posTagSequence2 = (PosTagSequence) ois.readObject();
    ParseConfiguration configuration2 = (ParseConfiguration) ois.readObject();
    ParseTree parseTree2 = (ParseTree) ois.readObject();
    assertEquals(sentence, sentence2);
    assertEquals(tokenSequence, tokenSequence2);
    assertEquals(posTagSequence, posTagSequence2);
    assertEquals(configuration, configuration2);
    assertEquals(parseTree, parseTree2);
  }
}
Use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
From the class TokenSequenceTest, method testTokeniseSentence.
/**
 * Runs default tokenisation on "Je n'ai pas l'ourang-outan." and checks the
 * text and separator flag of every token, first in the list that includes
 * white space and then in the token sequence itself.
 */
@Test
public void testTokeniseSentence() throws Exception {
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  // The loaded configuration object itself is not read by this test.
  ConfigFactory.load();

  final String sessionId = "test";
  final Sentence sentence = new Sentence("Je n'ai pas l'ourang-outan.", sessionId);
  final TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
  tokenSequence.findDefaultTokens();

  assertEquals(14, tokenSequence.listWithWhiteSpace().size());
  assertEquals(11, tokenSequence.size());

  // Expected tokens when white space is kept, in order.
  final String[] textsWithWhiteSpace = { "Je", " ", "n", "'", "ai", " ", "pas", " ", "l", "'", "ourang", "-", "outan", "." };
  final boolean[] separatorsWithWhiteSpace = { false, true, false, true, false, true, false, true, false, true, false, true, false, true };
  int position = 0;
  for (Token token : tokenSequence.listWithWhiteSpace()) {
    // Positions beyond the expectation table are not checked here.
    if (position < textsWithWhiteSpace.length) {
      assertEquals(textsWithWhiteSpace[position], token.getAnalyisText());
      assertEquals(separatorsWithWhiteSpace[position], token.isSeparator());
    }
    position++;
  }

  // Expected tokens of the sequence itself (the three " " tokens are absent).
  final String[] texts = { "Je", "n", "'", "ai", "pas", "l", "'", "ourang", "-", "outan", "." };
  final boolean[] separators = { false, false, true, false, false, false, true, false, true, false, true };
  position = 0;
  for (Token token : tokenSequence) {
    if (position < texts.length) {
      assertEquals(texts[position], token.getAnalyisText());
      assertEquals(separators[position], token.isSeparator());
    }
    position++;
  }
}
Use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
From the class TokenSequenceTest, method testTokeniseSentenceWithPlaceholdersNoSeparators.
/**
 * Tokenises "Il t’aime." after attaching three string-attribute annotations
 * and one token placeholder (replacing the typographic apostrophe with "'"),
 * then checks each token's text, separator flag and attributes.
 */
@Test
public void testTokeniseSentenceWithPlaceholdersNoSeparators() throws Exception {
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  // The loaded configuration object itself is not read by this test.
  ConfigFactory.load();

  final String sessionId = "test";
  final String[] labels = new String[0];
  final Sentence sentence = new Sentence("Il t’aime.", sessionId);

  // Offsets are written as prefix lengths of the sentence text.
  final int afterIl = "Il ".length();
  final int afterT = "Il t".length();
  final int afterApostrophe = "Il t’".length();
  final int afterAime = "Il t’aime".length();

  final List<Annotation<StringAttribute>> annotations = new ArrayList<>();
  annotations.add(new Annotation<StringAttribute>(afterIl, afterAime, new StringAttribute("phrase", "verbal"), labels));
  annotations.add(new Annotation<StringAttribute>(afterIl, afterAime, new StringAttribute("person", "3rd"), labels));
  annotations.add(new Annotation<StringAttribute>(afterIl, afterApostrophe, new StringAttribute("type", "object"), labels));

  final List<Annotation<TokenPlaceholder>> placeholders = new ArrayList<>();
  placeholders.add(new Annotation<TokenPlaceholder>(afterT, afterApostrophe, new TokenPlaceholder("'", "blah"), labels));

  sentence.addAnnotations(annotations);
  sentence.addAnnotations(placeholders);

  final TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
  tokenSequence.findDefaultTokens();
  LOG.debug(tokenSequence.listWithWhiteSpace().toString());
  LOG.debug(tokenSequence.toString());

  assertEquals(6, tokenSequence.listWithWhiteSpace().size());
  assertEquals(5, tokenSequence.size());

  // Expected attribute key/value pairs, per kind of token.
  final String[][] noAttributes = {};
  final String[][] verbalObject = { { "phrase", "verbal" }, { "person", "3rd" }, { "type", "object" } };
  final String[][] verbalOnly = { { "phrase", "verbal" }, { "person", "3rd" } };

  // Expectations for the list that keeps white space, in token order.
  final String[] textsWithWhiteSpace = { "Il", " ", "t", "'", "aime", "." };
  final boolean[] separatorsWithWhiteSpace = { false, true, false, true, false, true };
  final String[][][] attributesWithWhiteSpace = { noAttributes, noAttributes, verbalObject, verbalObject, verbalOnly, noAttributes };

  int position = 0;
  for (Token token : tokenSequence.listWithWhiteSpace()) {
    // Positions beyond the expectation table are not checked here.
    if (position < textsWithWhiteSpace.length) {
      assertEquals(textsWithWhiteSpace[position], token.getAnalyisText());
      assertEquals(separatorsWithWhiteSpace[position], token.isSeparator());
      final String[][] expected = attributesWithWhiteSpace[position];
      assertEquals(expected.length, token.getAttributes().size());
      for (String[] keyAndValue : expected) {
        assertEquals(keyAndValue[1], token.getAttributes().get(keyAndValue[0]).getValue());
      }
    }
    position++;
  }

  // Expectations for the token sequence itself (the " " token is absent).
  final String[] texts = { "Il", "t", "'", "aime", "." };
  final boolean[] separators = { false, false, true, false, true };
  position = 0;
  for (Token token : tokenSequence) {
    if (position < texts.length) {
      assertEquals(texts[position], token.getAnalyisText());
      assertEquals(separators[position], token.isSeparator());
    }
    position++;
  }
}
Use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
From the class TokenSequenceTest, method testOverlappingPlaceholders.
/**
 * Attaches overlapping string-attribute annotations to
 * "Pakistan International Airlines Company", tokenises the sentence, and
 * checks which attributes each of the four tokens ends up with.
 */
@Test
public void testOverlappingPlaceholders() throws Exception {
  System.setProperty("config.file", "src/test/resources/test.conf");
  ConfigFactory.invalidateCaches();
  // The loaded configuration object itself is not read by this test.
  ConfigFactory.load();

  final String sessionId = "test";
  final String[] labels = new String[0];
  final Sentence sentence = new Sentence("Pakistan International Airlines Company", sessionId);

  // Offsets are written as prefix lengths of the sentence text.
  final int start = "".length();
  final int afterPakistan = "Pakistan".length();
  final int beforeInternational = "Pakistan ".length();
  final int afterAirlines = "Pakistan International Airlines".length();
  final int beforeCompany = "Pakistan International Airlines ".length();
  final int end = "Pakistan International Airlines Company".length();

  final List<Annotation<StringAttribute>> annotations = new ArrayList<>();
  // "Pakistan" alone.
  annotations.add(new Annotation<StringAttribute>(start, afterPakistan, new StringAttribute("namedEntity", "place"), labels));
  annotations.add(new Annotation<StringAttribute>(start, afterPakistan, new StringAttribute("startsWithP", "true"), labels));
  // "Pakistan International Airlines".
  annotations.add(new Annotation<StringAttribute>(start, afterAirlines, new StringAttribute("namedEntity", "company"), labels));
  annotations.add(new Annotation<StringAttribute>(start, afterAirlines, new StringAttribute("asianCompany", "true"), labels));
  // "International Airlines Company".
  annotations.add(new Annotation<StringAttribute>(beforeInternational, end, new StringAttribute("namedEntity", "company"), labels));
  annotations.add(new Annotation<StringAttribute>(beforeInternational, end, new StringAttribute("asianCompany", "false"), labels));
  // "Company" alone.
  annotations.add(new Annotation<StringAttribute>(beforeCompany, end, new StringAttribute("startsWithC", "true"), labels));
  sentence.addAnnotations(annotations);

  final TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
  tokenSequence.findDefaultTokens();
  LOG.debug(tokenSequence.listWithWhiteSpace().toString());
  LOG.debug(tokenSequence.toString());

  assertEquals(4, tokenSequence.size());

  // Expected text and attribute key/value pairs for each token, in order.
  final String[] texts = { "Pakistan", "International", "Airlines", "Company" };
  final String[][][] attributes = {
      { { "namedEntity", "company" }, { "startsWithP", "true" }, { "asianCompany", "true" } },
      { { "namedEntity", "company" }, { "asianCompany", "true" } },
      { { "namedEntity", "company" }, { "asianCompany", "true" } },
      { { "startsWithC", "true" } } };

  int position = 0;
  for (Token token : tokenSequence) {
    LOG.debug(token.getAttributes().toString());
    // Positions beyond the expectation table are not checked here.
    if (position < texts.length) {
      assertEquals(texts[position], token.getAnalyisText());
      final String[][] expected = attributes[position];
      assertEquals(expected.length, token.getAttributes().size());
      for (String[] keyAndValue : expected) {
        assertEquals(keyAndValue[1], token.getAttributes().get(keyAndValue[0]).getValue());
      }
    }
    position++;
  }
}
Aggregations