Use of com.joliciel.talismane.rawText.Sentence in the project talismane by joliciel-informatique.
From the class TokenPatternTest, method testMatch2.
/**
 * Verifies that the apostrophe pattern below produces no match on the
 * default tokens of the sentence "Qu'ensuite il aille...".
 */
@Test
public void testMatch2() throws Exception {
    final String regex = "{(?![cdjlmnstCDJLMNST]\\z|qu\\z|jusqu\\z|puisqu\\z|lorsqu\\z|aujourd\\z|prud\\z|quelqu\\z|quoiqu\\z).+'}.+";

    // Point the configuration at the test resources and reload it.
    // The loaded Config object itself is not referenced again in this test.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    ConfigFactory.load();

    final String sessionId = "test";

    // Build the token sequence for the sentence under test.
    final Sentence input = new Sentence("Qu'ensuite il aille...", sessionId);
    final TokenSequence defaultTokens = new TokenSequence(input, sessionId);
    defaultTokens.findDefaultTokens();

    // Compile the pattern with the session's token separators and run it.
    final TokenPattern pattern = new TokenPattern(regex, Tokeniser.getTokenSeparators(sessionId));
    final List<TokenPatternMatchSequence> matches = pattern.match(defaultTokens);

    assertEquals(0, matches.size());
}
Use of com.joliciel.talismane.rawText.Sentence in the project talismane by joliciel-informatique.
From the class TalismaneAPIExamples, method example2.
/**
 * Similar to example1, but begins with filtering and sentence detection.
 * <p>
 * Runs a hard-coded text through raw-text annotation, sentence detection,
 * sentence annotation, tokenisation, pos-tagging and parsing. For every
 * detected sentence, the pos-tag sequence, the parse configuration and the
 * parse tree are printed to standard output.
 *
 * @param sessionId
 *            the id used to look up the session and all analysis components
 * @throws Exception
 *             if any stage of the analysis fails
 */
public static void example2(String sessionId) throws Exception {
    String text = "Les gens qui voient de travers pensent que les bancs verts qu'on voit sur les trottoirs "
            + "sont faits pour les impotents ou les ventripotents. "
            + "Mais c'est une absurdité, car, à la vérité, ils sont là, c'est notoire, "
            + "pour accueillir quelque temps les amours débutants.";
    RawText rawText = new RawText(text, true, sessionId);

    // run every text annotator configured for this session over the raw text
    for (RawTextAnnotator textAnnotator : TalismaneSession.get(sessionId).getTextAnnotators()) {
        textAnnotator.annotate(rawText);
    }

    // fetch the processed text once the annotators have run
    AnnotatedText processedText = rawText.getProcessedText();

    // sentence detection operates on the processed text
    SentenceDetector.getInstance(sessionId).detectSentences(processedText);

    // the detected sentences are read back from the raw text itself, so that
    // annotations made on the sentences get reflected in the raw text
    for (Sentence sentence : rawText.getDetectedSentences()) {
        // apply every sentence annotator configured for this session
        for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
            sentenceAnnotator.annotate(sentence);
        }

        // tokenise the sentence
        TokenSequence tokens = Tokeniser.getInstance(sessionId).tokeniseSentence(sentence);

        // pos-tag the token sequence
        PosTagSequence tagged = PosTaggers.getPosTagger(sessionId).tagSentence(tokens);
        System.out.println(tagged);

        // parse the pos-tag sequence
        ParseConfiguration parsed = Parsers.getParser(sessionId).parseSentence(tagged);
        System.out.println(parsed);

        // build and print the tree view of the parse
        System.out.println(new ParseTree(parsed, true));
    }
}
Use of com.joliciel.talismane.rawText.Sentence in the project talismane by joliciel-informatique.
From the class StandoffReader, method hasNextSentence.
/**
 * Reports whether a parse configuration is available, building one from the
 * next entry of {@code sentences} when none is pending.
 * <p>
 * Nothing is built when a positive maximum sentence count has been reached,
 * when a configuration is already pending, or when all sentences have been
 * consumed.
 *
 * @return {@code true} if {@code configuration} is non-null after this call
 * @throws TalismaneException
 *             if a relation refers to a head or dependent id that has no
 *             token in the current sentence
 * @throws IOException
 *             declared by the signature; not raised directly by the
 *             statements of this method
 * @throws RuntimeException
 *             if the session has no linguistic rules
 */
@Override
public boolean hasNextSentence() throws TalismaneException, IOException {
    final boolean limitReached = this.getMaxSentenceCount() > 0 && sentenceCount >= this.getMaxSentenceCount();

    if (!limitReached && configuration == null && sentenceIndex < sentences.size()) {
        List<StandoffToken> standoffTokens = sentences.get(sentenceIndex++);

        LinguisticRules linguisticRules = TalismaneSession.get(sessionId).getLinguisticRules();
        if (linguisticRules == null) {
            throw new RuntimeException("Linguistic rules have not been set.");
        }

        // Rebuild the sentence text from the token texts, letting the
        // linguistic rules decide where a space goes between two tokens.
        String sentenceText = "";
        for (StandoffToken current : standoffTokens) {
            String word = current.text;
            if (linguisticRules.shouldAddSpace(sentenceText, word)) {
                sentenceText = sentenceText + " " + word;
            } else {
                sentenceText = sentenceText + word;
            }
        }

        Sentence sentence = new Sentence(sentenceText, sessionId);
        for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
            sentenceAnnotator.annotate(sentence);
        }

        PretokenisedSequence tokenSequence = new PretokenisedSequence(sentence, sessionId);
        PosTagSequence posTagSequence = new PosTagSequence(tokenSequence);

        // First pass: create one pos-tagged token per standoff token and
        // index it by the standoff token id.
        Map<String, PosTaggedToken> tokensById = new HashMap<String, PosTaggedToken>();
        for (StandoffToken current : standoffTokens) {
            Token token = tokenSequence.addToken(current.text);
            Decision tagDecision = new Decision(current.posTag.getCode());
            PosTaggedToken taggedToken = new PosTaggedToken(token, tagDecision, sessionId);
            if (LOG.isTraceEnabled()) {
                LOG.trace(taggedToken.toString());
            }
            taggedToken.setComment(current.comment);
            posTagSequence.addPosTaggedToken(taggedToken);
            tokensById.put(current.id, taggedToken);
            LOG.debug("Found token " + current.id + ", " + taggedToken);
        }
        tokenSequence.setWithRoot(true);

        configuration = new ParseConfiguration(posTagSequence);

        // Second pass: add the dependency arcs.
        for (StandoffToken current : standoffTokens) {
            StandoffRelation relation = relationMap.get(current.id);

            if (relation != null) {
                // An explicit relation exists for this token id.
                PosTaggedToken head = tokensById.get(relation.fromToken);
                PosTaggedToken dependent = tokensById.get(relation.toToken);
                if (head == null) {
                    throw new TalismaneException("No token found for head id: " + relation.fromToken);
                }
                if (dependent == null) {
                    throw new TalismaneException("No token found for dependent id: " + relation.toToken);
                }
                DependencyArc arc = configuration.addDependency(head, dependent, relation.label, null);
                arc.setComment(relation.comment);
                continue;
            }

            // No explicit relation: only punctuation gets an arc, and only
            // when a punctuation dependency label has been configured.
            boolean isPunctuation = current.posTag.getOpenClassIndicator() == PosTagOpenClassIndicator.PUNCTUATION;
            if (!isPunctuation || punctuationDepLabel == null) {
                continue;
            }

            // Attach the punctuation to the nearest preceding token that is
            // not itself punctuation, if there is one.
            PosTaggedToken dependent = tokensById.get(current.id);
            PosTaggedToken nearestHead = null;
            for (int i = dependent.getIndex() - 1; i >= 0 && nearestHead == null; i--) {
                PosTaggedToken candidate = posTagSequence.get(i);
                if (candidate.getTag().getOpenClassIndicator() != PosTagOpenClassIndicator.PUNCTUATION) {
                    nearestHead = candidate;
                }
            }
            if (nearestHead != null) {
                configuration.addDependency(nearestHead, dependent, punctuationDepLabel, null);
            }
        }
    }

    return configuration != null;
}
Use of com.joliciel.talismane.rawText.Sentence in the project talismane by joliciel-informatique.
From the class PatternEventStream, method hasNext.
/**
 * Reports whether another pattern match is available, reading further
 * sentences from the corpus reader until one yields at least one match or
 * the reader has no more sentences.
 * <p>
 * For each sentence read, {@code currentPatternMatches} and
 * {@code currentOutcomes} are filled in parallel with one entry per matched
 * pattern sequence, and {@code currentIndex} is reset to 0.
 *
 * @return {@code true} if {@code currentPatternMatches} is non-null after
 *         this call
 * @throws TalismaneException
 *             declared by the signature; propagated from the calls made here
 * @throws IOException
 *             declared by the signature; propagated from the calls made here
 */
@Override
public boolean hasNext() throws TalismaneException, IOException {
    // Drop the current batch once every match in it has been consumed.
    if (currentPatternMatches != null && currentIndex == currentPatternMatches.size()) {
        currentPatternMatches = null;
    }

    while (currentPatternMatches == null && this.corpusReader.hasNextSentence()) {
        currentPatternMatches = new ArrayList<TokenPatternMatch>();
        currentOutcomes = new ArrayList<TokeniserOutcome>();
        currentIndex = 0;

        TokenSequence corpusSequence = corpusReader.nextTokenSequence();
        List<Integer> tokenSplits = corpusSequence.getTokenSplits();
        String text = corpusSequence.getSentence().getText().toString();
        LOG.debug("Sentence: " + text);

        // Re-tokenise the sentence text with the default tokens.
        Sentence sentence = new Sentence(text, sessionId);
        TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
        tokenSequence.findDefaultTokens();

        List<TokeniserOutcome> defaultOutcomes = this.tokeniserPatternManager.getDefaultOutcomes(tokenSequence);
        List<TaggedToken<TokeniserOutcome>> taggedSentence = this.getTaggedTokens(tokenSequence, tokenSplits);

        // check if anything matches each pattern
        for (TokenPattern pattern : this.tokeniserPatternManager.getParsedTestPatterns()) {
            for (TokenPatternMatchSequence matchSequence : pattern.match(tokenSequence)) {
                if (LOG.isTraceEnabled()) {
                    LOG.trace("Matched pattern: " + pattern + ": " + matchSequence.getTokenSequence());
                }

                // check if entire pattern is separated or joined
                TokeniserOutcome sharedOutcome = null;
                TokeniserOutcome fallbackOutcome = null;
                boolean mismatchFound = false;
                TokenPatternMatch representativeMatch = null;

                for (Token token : matchSequence.getTokensToCheck()) {
                    // Until one is found, look for the pattern match
                    // belonging to the token being checked.
                    if (representativeMatch == null) {
                        for (TokenPatternMatch candidate : matchSequence.getTokenPatternMatches()) {
                            if (candidate.getToken().equals(token)) {
                                representativeMatch = candidate;
                                break;
                            }
                        }
                    }

                    int position = token.getIndexWithWhiteSpace();
                    TaggedToken<TokeniserOutcome> taggedToken = taggedSentence.get(position);

                    if (sharedOutcome == null) {
                        // First outcome seen: becomes the reference for the
                        // remaining tokens of this sequence.
                        sharedOutcome = taggedToken.getTag();
                        fallbackOutcome = defaultOutcomes.get(token.getIndexWithWhiteSpace());
                    } else if (taggedToken.getTag() != sharedOutcome) {
                        // this should only happen when two patterns overlap:
                        // e.g. "aussi bien que" and "bien que", or
                        // "plutot que" and "plutot que de"
                        // AND the outer pattern is separated, while
                        // the inner pattern is joined
                        LOG.debug("Mismatch in pattern: " + representativeMatch + ", " + taggedToken);
                        mismatchFound = true;
                    }
                }

                currentPatternMatches.add(representativeMatch);
                // On a mismatch, fall back to the default outcome recorded
                // for the first checked token.
                currentOutcomes.add(mismatchFound ? fallbackOutcome : sharedOutcome);
            }
        }

        // No match in this sentence: reset and try the next one.
        if (currentPatternMatches.isEmpty()) {
            currentPatternMatches = null;
            currentOutcomes = null;
        }
    }

    return currentPatternMatches != null;
}
Use of com.joliciel.talismane.rawText.Sentence in the project talismane by joliciel-informatique.
From the class RegexTokenAnnotatorTest, method testApplyWithUnmatchingGroups.
/**
 * Annotates a sentence containing a two-digit and a one-digit number with a
 * regex whose second capturing group is optional, then checks the position
 * and replacement text of the two resulting placeholders.
 */
@Test
public void testApplyWithUnmatchingGroups() throws Exception {
    // Point the configuration at the test resources and reload it.
    // The loaded Config object itself is not referenced again in this test.
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    ConfigFactory.load();

    final String sessionId = "test";

    // The second group is optional, so it does not participate in the
    // match for the one-digit number.
    final String regex = "\\b(\\d)(\\d)?\\b";
    final String replacement = "Number$1$2";
    final RegexTokenAnnotator annotator = new RegexTokenAnnotator(regex, replacement, null, sessionId);

    final Sentence sentence = new Sentence("Two-digit number: 42. One-digit number: 7.", sessionId);
    annotator.annotate(sentence);

    final List<Annotation<TokenPlaceholder>> placeholders = sentence.getAnnotations(TokenPlaceholder.class);
    LOG.debug(placeholders.toString());
    assertEquals(2, placeholders.size());

    // First placeholder covers "42".
    final Annotation<TokenPlaceholder> twoDigit = placeholders.get(0);
    assertEquals("Two-digit number: ".length(), twoDigit.getStart());
    assertEquals("Two-digit number: 42".length(), twoDigit.getEnd());
    assertEquals("Number42", twoDigit.getData().getReplacement());

    // Second placeholder covers "7".
    final Annotation<TokenPlaceholder> oneDigit = placeholders.get(1);
    assertEquals("Two-digit number: 42. One-digit number: ".length(), oneDigit.getStart());
    assertEquals("Two-digit number: 42. One-digit number: 7".length(), oneDigit.getEnd());
    assertEquals("Number7", oneDigit.getData().getReplacement());
}
Aggregations