use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
the class Talismane method analyse.
/**
* Analyse the data provided by this reader, as specified by the
* configuration.
*
* @param reader
* the reader providing the data to analyse
* @throws IOException
* @throws ReflectiveOperationException
* @throws TalismaneException
* if it's impossible to read a sentence from an annotated corpus
*/
public void analyse(Reader reader) throws IOException, ReflectiveOperationException, TalismaneException {
long startTime = System.currentTimeMillis();
try {
TokeniserAnnotatedCorpusReader tokenCorpusReader = null;
PosTagAnnotatedCorpusReader posTagCorpusReader = null;
if (this.startModule.equals(Module.posTagger)) {
tokenCorpusReader = TokeniserAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".tokeniser.input"), sessionId);
}
if (this.startModule.equals(Module.parser)) {
posTagCorpusReader = PosTagAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".pos-tagger.input"), sessionId);
}
LinkedList<String> textSegments = new LinkedList<String>();
LinkedList<Sentence> sentences = new LinkedList<Sentence>();
TokenSequence tokenSequence = null;
PosTagSequence posTagSequence = null;
StringBuilder stringBuilder = new StringBuilder();
boolean finished = false;
int sentenceCount = 0;
CurrentFileProvider currentFileProvider = reader instanceof CurrentFileProvider ? (CurrentFileProvider) reader : null;
RollingTextBlock rollingTextBlock = new RollingTextBlock(this.processByDefault, currentFileProvider, sessionId);
int endBlockCharacterCount = 0;
URI currentURI = null;
File currentFile = null;
while (!finished) {
if (this.startModule.equals(Module.sentenceDetector) || this.startModule.equals(Module.tokeniser)) {
// Note: the SentenceDetector and Tokeniser start modules are treated
// identically, except that for SentenceDetector we apply a probabilistic
// sentence detector, whereas for Tokeniser we assume all sentence breaks
// are marked by filters.
// Read characters from the reader, one at a time.
char c;
int r = -1;
try {
r = reader.read();
} catch (IOException e) {
LogUtils.logError(LOG, e);
}
if (r == -1) {
finished = true;
c = '\n';
} else {
c = (char) r;
}
// Jump out if we have 3 consecutive end-block characters.
if (c == TalismaneSession.get(sessionId).getEndBlockCharacter()) {
endBlockCharacterCount++;
if (endBlockCharacterCount == 3) {
LOG.info("Three consecutive end-block characters. Exiting.");
finished = true;
}
} else {
endBlockCharacterCount = 0;
}
// cut a text segment when input ends, on an end-block character, or on
// whitespace once the block size is exceeded
if (finished || (Character.isWhitespace(c) && c != '\r' && c != '\n' && stringBuilder.length() > TalismaneSession.get(sessionId).getBlockSize()) || c == TalismaneSession.get(sessionId).getEndBlockCharacter()) {
if (c == TalismaneSession.get(sessionId).getEndBlockCharacter())
stringBuilder.append(c);
// is the current block > 0 characters?
if (stringBuilder.length() > 0) {
String textSegment = stringBuilder.toString();
stringBuilder = new StringBuilder();
textSegments.add(textSegment);
}
if (c == TalismaneSession.get(sessionId).getEndBlockCharacter()) {
textSegments.addLast("");
}
}
if (finished) {
if (stringBuilder.length() > 0) {
textSegments.addLast(stringBuilder.toString());
stringBuilder = new StringBuilder();
}
// add three final text segments to roll everything through processing
textSegments.addLast("");
textSegments.addLast("");
textSegments.addLast("");
}
if (c != TalismaneSession.get(sessionId).getEndBlockCharacter())
stringBuilder.append(c);
while (textSegments.size() > 0) {
// roll in a new block 4, and roll the other blocks leftwards
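// For illustration (not part of the original code): with blocks
// [B1|B2|B3|B4], roll(next) yields [B2|B3|B4|next]; the raw text filters
// then annotate what has become block 3 (the old B4), and sentence
// detection runs on the fully-annotated block 2 (the old B3).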
String nextText = textSegments.removeFirst();
rollingTextBlock = rollingTextBlock.roll(nextText);
// annotate block 3 with raw text filters
AnnotatedText rawTextBlock = rollingTextBlock.getRawTextBlock();
for (RawTextAnnotator textAnnotator : TalismaneSession.get(sessionId).getTextAnnotators()) {
textAnnotator.annotate(rawTextBlock);
}
// detect sentences in block 2 using the sentence detector
AnnotatedText processedText = rollingTextBlock.getProcessedText();
if (LOG.isTraceEnabled()) {
LOG.trace("processedText: " + processedText.getText().toString().replace('\n', '¶').replace('\r', '¶'));
}
if (this.startModule.equals(Module.sentenceDetector)) {
sentenceDetector.detectSentences(processedText);
}
// get the sentences detected in block 2
List<Sentence> theSentences = rollingTextBlock.getDetectedSentences();
for (Sentence sentence : theSentences) {
sentences.add(sentence);
sentenceCount++;
}
if (this.sentenceCount > 0 && sentenceCount >= this.sentenceCount) {
finished = true;
}
}
// we have at least one text segment to process
} else if (this.startModule.equals(Module.posTagger)) {
if (tokenCorpusReader.hasNextSentence()) {
tokenSequence = tokenCorpusReader.nextTokenSequence();
} else {
tokenSequence = null;
finished = true;
}
} else if (this.startModule.equals(Module.parser)) {
if (posTagCorpusReader.hasNextSentence()) {
posTagSequence = posTagCorpusReader.nextPosTagSequence();
} else {
posTagSequence = null;
finished = true;
}
}
// which start module?
boolean needToProcess = false;
if (this.startModule.equals(Module.sentenceDetector) || this.startModule.equals(Module.tokeniser))
needToProcess = !sentences.isEmpty();
else if (this.startModule.equals(Module.posTagger))
needToProcess = tokenSequence != null;
else if (this.startModule.equals(Module.parser))
needToProcess = posTagSequence != null;
while (needToProcess) {
Sentence sentence = null;
if (this.startModule.compareTo(Module.tokeniser) <= 0 && this.endModule.compareTo(Module.sentenceDetector) >= 0) {
sentence = sentences.poll();
LOG.debug("Sentence: " + sentence);
for (SentenceAnnotator annotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) annotator.annotate(sentence);
if (sentence.getFileURI() != null && !sentence.getFileURI().equals(currentURI)) {
currentURI = sentence.getFileURI();
currentFile = sentence.getFile();
LOG.debug("Setting current file to " + currentFile.getPath());
if (writer instanceof CurrentFileObserver)
((CurrentFileObserver) writer).onNextFile(currentFile);
for (SentenceProcessor processor : sentenceProcessors) if (processor instanceof CurrentFileObserver)
((CurrentFileObserver) processor).onNextFile(currentFile);
for (TokenSequenceProcessor processor : tokenSequenceProcessors) if (processor instanceof CurrentFileObserver)
((CurrentFileObserver) processor).onNextFile(currentFile);
for (PosTagSequenceProcessor processor : posTagSequenceProcessors) if (processor instanceof CurrentFileObserver)
((CurrentFileObserver) processor).onNextFile(currentFile);
for (ParseConfigurationProcessor processor : parseConfigurationProcessors) if (processor instanceof CurrentFileObserver)
((CurrentFileObserver) processor).onNextFile(currentFile);
}
if (sentence.getLeftoverOriginalText().length() > 0) {
writer.append(sentence.getLeftoverOriginalText() + "\n");
}
for (SentenceProcessor sentenceProcessor : sentenceProcessors) {
sentenceProcessor.onNextSentence(sentence);
}
}
// need to read next sentence
List<TokenSequence> tokenSequences = null;
if (this.needsTokeniser()) {
tokenSequences = tokeniser.tokenise(sentence);
tokenSequence = tokenSequences.get(0);
for (TokenSequenceProcessor tokenSequenceProcessor : tokenSequenceProcessors) {
tokenSequenceProcessor.onNextTokenSequence(tokenSequence);
}
}
// need to tokenise ?
List<PosTagSequence> posTagSequences = null;
if (this.needsPosTagger()) {
posTagSequence = null;
if (tokenSequences == null) {
tokenSequences = new ArrayListNoNulls<>();
tokenSequences.add(tokenSequence);
}
if (posTagger instanceof NonDeterministicPosTagger) {
NonDeterministicPosTagger nonDeterministicPosTagger = (NonDeterministicPosTagger) posTagger;
posTagSequences = nonDeterministicPosTagger.tagSentence(tokenSequences);
posTagSequence = posTagSequences.get(0);
} else {
posTagSequence = posTagger.tagSentence(tokenSequence);
}
for (PosTagSequenceProcessor posTagSequenceProcessor : this.posTagSequenceProcessors) {
posTagSequenceProcessor.onNextPosTagSequence(posTagSequence);
}
tokenSequence = null;
}
if (this.needsParser()) {
if (posTagSequences == null) {
posTagSequences = new ArrayListNoNulls<>();
posTagSequences.add(posTagSequence);
}
ParseConfiguration parseConfiguration = null;
List<ParseConfiguration> parseConfigurations = null;
try {
if (parser instanceof NonDeterministicParser) {
NonDeterministicParser nonDeterministicParser = (NonDeterministicParser) parser;
parseConfigurations = nonDeterministicParser.parseSentence(posTagSequences);
parseConfiguration = parseConfigurations.get(0);
} else {
parseConfiguration = parser.parseSentence(posTagSequence);
}
for (ParseConfigurationProcessor parseConfigurationProcessor : this.parseConfigurationProcessors) {
parseConfigurationProcessor.onNextParseConfiguration(parseConfiguration);
}
} catch (Exception e) {
LogUtils.logError(LOG, e);
if (stopOnError)
throw new RuntimeException(e);
}
posTagSequence = null;
}
if (this.startModule.equals(Module.sentenceDetector) || this.startModule.equals(Module.tokeniser))
needToProcess = !sentences.isEmpty();
else if (this.startModule.equals(Module.posTagger))
needToProcess = tokenSequence != null;
else if (this.startModule.equals(Module.parser))
needToProcess = posTagSequence != null;
}
// next sentence
}
// Output any leftover original text.
if (rollingTextBlock.getLeftoverOriginalText().length() > 0)
writer.append(rollingTextBlock.getLeftoverOriginalText());
} finally {
IOException exception = null;
try {
reader.close();
writer.flush();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
for (SentenceProcessor processor : this.sentenceProcessors) try {
processor.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
for (TokenSequenceProcessor processor : this.tokenSequenceProcessors) try {
processor.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
for (PosTagSequenceProcessor processor : this.posTagSequenceProcessors) {
try {
processor.onCompleteAnalysis();
processor.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
}
for (ParseConfigurationProcessor processor : this.parseConfigurationProcessors) {
try {
processor.onCompleteParse();
processor.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
}
long endTime = System.currentTimeMillis();
long totalTime = endTime - startTime;
LOG.debug("Total time for Talismane.process(): " + totalTime);
try {
writer.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
if (exception != null)
throw exception;
}
}
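As a usage note, here is a minimal sketch of driving this method, assuming a fully configured Talismane instance (construction depends on the version's configuration API and is omitted here); the input file name is a placeholder.

import com.joliciel.talismane.Talismane;

import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

public class AnalyseSketch {
    public static void run(Talismane talismane) throws Exception {
        // corpus.txt is a hypothetical input file
        Reader reader = Files.newBufferedReader(Paths.get("corpus.txt"), StandardCharsets.UTF_8);
        // analyse() runs the configured pipeline (sentence detection through
        // parsing) and closes both the reader and the configured writer and
        // processors in its finally block
        talismane.analyse(reader);
    }
}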
use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
the class PatternTokeniser method tokeniseInternal.
@Override
protected List<TokenisedAtomicTokenSequence> tokeniseInternal(TokenSequence initialSequence, Sentence sentence) throws TalismaneException, IOException {
List<TokenisedAtomicTokenSequence> sequences;
// Assign each separator its default value
List<TokeniserOutcome> defaultOutcomes = this.tokeniserPatternManager.getDefaultOutcomes(initialSequence);
List<Decision> defaultDecisions = new ArrayList<Decision>(defaultOutcomes.size());
for (TokeniserOutcome outcome : defaultOutcomes) {
Decision tokeniserDecision = new Decision(outcome.name());
tokeniserDecision.addAuthority("_" + this.getClass().getSimpleName());
tokeniserDecision.addAuthority("_" + "DefaultDecision");
defaultDecisions.add(tokeniserDecision);
}
// For each test pattern, see if anything in the sentence matches it
if (this.decisionMaker != null) {
List<TokenPatternMatchSequence> matchingSequences = new ArrayList<TokenPatternMatchSequence>();
Map<Token, Set<TokenPatternMatchSequence>> tokenMatchSequenceMap = new HashMap<Token, Set<TokenPatternMatchSequence>>();
Map<TokenPatternMatchSequence, TokenPatternMatch> primaryMatchMap = new HashMap<TokenPatternMatchSequence, TokenPatternMatch>();
Set<Token> matchedTokens = new HashSet<Token>();
for (TokenPattern parsedPattern : this.getTokeniserPatternManager().getParsedTestPatterns()) {
List<TokenPatternMatchSequence> matchesForThisPattern = parsedPattern.match(initialSequence);
for (TokenPatternMatchSequence matchSequence : matchesForThisPattern) {
if (matchSequence.getTokensToCheck().size() > 0) {
matchingSequences.add(matchSequence);
matchedTokens.addAll(matchSequence.getTokensToCheck());
TokenPatternMatch primaryMatch = null;
Token token = matchSequence.getTokensToCheck().get(0);
Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token);
if (matchSequences == null) {
matchSequences = new TreeSet<TokenPatternMatchSequence>();
tokenMatchSequenceMap.put(token, matchSequences);
}
matchSequences.add(matchSequence);
for (TokenPatternMatch patternMatch : matchSequence.getTokenPatternMatches()) {
if (patternMatch.getToken().equals(token)) {
primaryMatch = patternMatch;
break;
}
}
if (LOG.isTraceEnabled()) {
LOG.trace("Found match: " + primaryMatch);
}
primaryMatchMap.put(matchSequence, primaryMatch);
}
}
}
// we want to create the n most likely token sequences, where each
// sequence has to correspond to a token pattern
Map<TokenPatternMatchSequence, List<Decision>> matchSequenceDecisionMap = new HashMap<TokenPatternMatchSequence, List<Decision>>();
for (TokenPatternMatchSequence matchSequence : matchingSequences) {
TokenPatternMatch match = primaryMatchMap.get(matchSequence);
LOG.debug("next pattern match: " + match.toString());
List<FeatureResult<?>> tokenFeatureResults = new ArrayList<FeatureResult<?>>();
for (TokenPatternMatchFeature<?> feature : features) {
RuntimeEnvironment env = new RuntimeEnvironment();
FeatureResult<?> featureResult = feature.check(match, env);
if (featureResult != null) {
tokenFeatureResults.add(featureResult);
}
}
if (LOG.isTraceEnabled()) {
SortedSet<String> featureResultSet = tokenFeatureResults.stream().map(f -> f.toString()).collect(Collectors.toCollection(() -> new TreeSet<String>()));
for (String featureResultString : featureResultSet) {
LOG.trace(featureResultString);
}
}
List<Decision> decisions = this.decisionMaker.decide(tokenFeatureResults);
for (ClassificationObserver observer : this.observers) observer.onAnalyse(match.getToken(), tokenFeatureResults, decisions);
for (Decision decision : decisions) {
decision.addAuthority("_" + this.getClass().getSimpleName());
decision.addAuthority("_" + "Patterns");
decision.addAuthority(match.getPattern().getName());
}
matchSequenceDecisionMap.put(matchSequence, decisions);
}
// initially create a heap with a single, empty sequence
PriorityQueue<TokenisedAtomicTokenSequence> heap = new PriorityQueue<TokenisedAtomicTokenSequence>();
TokenisedAtomicTokenSequence emptySequence = new TokenisedAtomicTokenSequence(sentence, 0, this.getSessionId());
heap.add(emptySequence);
for (int i = 0; i < initialSequence.listWithWhiteSpace().size(); i++) {
Token token = initialSequence.listWithWhiteSpace().get(i);
if (LOG.isTraceEnabled()) {
LOG.trace("Token : \"" + token.getAnalyisText() + "\"");
}
// build a new heap for this iteration
PriorityQueue<TokenisedAtomicTokenSequence> previousHeap = heap;
heap = new PriorityQueue<TokenisedAtomicTokenSequence>();
if (i == 0) {
// first token is always "separate" from the outside world
Decision decision = new Decision(TokeniserOutcome.SEPARATE.name());
decision.addAuthority("_" + this.getClass().getSimpleName());
decision.addAuthority("_" + "DefaultDecision");
TaggedToken<TokeniserOutcome> taggedToken = new TaggedToken<>(token, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
TokenisedAtomicTokenSequence newSequence = new TokenisedAtomicTokenSequence(emptySequence);
newSequence.add(taggedToken);
heap.add(newSequence);
continue;
}
// limit the heap breadth to K
int maxSequences = previousHeap.size() > this.getBeamWidth() ? this.getBeamWidth() : previousHeap.size();
for (int j = 0; j < maxSequences; j++) {
TokenisedAtomicTokenSequence history = previousHeap.poll();
// Find the separating & non-separating decisions
if (history.size() > i) {
// token already added as part of a sequence introduced by another token
heap.add(history);
} else if (tokenMatchSequenceMap.containsKey(token)) {
// token begins one or more match sequences
// these are ordered from shortest to longest (via TreeSet)
List<TokenPatternMatchSequence> matchSequences = new ArrayList<TokenPatternMatchSequence>(tokenMatchSequenceMap.get(token));
// Since sequences P1..Pn contain each other, there are exactly
// matchSequences.size() + 1 consistent solutions.
// Assume the default is separate:
// 0: all separate
// 1: join P1, separate the rest
// 2: join P2, separate the rest
// ...
// n: join Pn
// We need to add each of these to the heap, taking the product of all
// probabilities consistent with each solution.
// The probabilities for each solution are (j = join, s = separate):
// All separate: s1 x s2 x ... x sn
// P1: j1 x s2 x ... x sn
// P2: j1 x j2 x ... x sn
// ...
// Pn: j1 x j2 x ... x jn
// Any solution of the form s1 x j2 would be inconsistent, and is not
// considered.
// If Pi and Pj start and end on the exact same token, then the solution
// for both is:
// Pi: j1 x ... x ji x jj x sj+1 x ... x sn
// Pj: j1 x ... x ji x jj x sj+1 x ... x sn
// Of course we're unlikely ever to have more than two patterns here,
// but we need a solution for the general case just to be safe.
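// Worked example (hypothetical numbers, not from the original code):
// for two nested patterns P1 and P2 (P1 contained in P2) with
// j1 = 0.8 / s1 = 0.2 and j2 = 0.6 / s2 = 0.4, the three consistent
// solutions score: all separate = 0.2 x 0.4 = 0.08; join P1 = 0.8 x 0.4
// = 0.32; join P2 = 0.8 x 0.6 = 0.48, which normalise below to roughly
// 0.09, 0.36 and 0.55.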
TokeniserOutcome defaultOutcome = TokeniserOutcome.valueOf(defaultDecisions.get(token.getIndexWithWhiteSpace()).getOutcome());
TokeniserOutcome otherOutcome = null;
if (defaultOutcome == TokeniserOutcome.SEPARATE)
otherOutcome = TokeniserOutcome.JOIN;
else
otherOutcome = TokeniserOutcome.SEPARATE;
double[] decisionProbs = new double[matchSequences.size() + 1];
for (int k = 0; k < decisionProbs.length; k++) decisionProbs[k] = 1;
// Note: k = 0 is the default decision (e.g. separate all);
// k = 1 and p = 1 both refer to the first pattern
int p = 1;
int prevEndIndex = -1;
for (TokenPatternMatchSequence matchSequence : matchSequences) {
int endIndex = matchSequence.getTokensToCheck().get(matchSequence.getTokensToCheck().size() - 1).getEndIndex();
List<Decision> decisions = matchSequenceDecisionMap.get(matchSequence);
for (Decision decision : decisions) {
for (int k = 0; k < decisionProbs.length; k++) {
if (decision.getOutcome().equals(defaultOutcome.name())) {
// e.g. separate in most cases
if (k < p && endIndex > prevEndIndex)
decisionProbs[k] *= decision.getProbability();
else if (k + 1 < p && endIndex <= prevEndIndex)
decisionProbs[k] *= decision.getProbability();
} else {
// e.g. join in most cases
if (k >= p && endIndex > prevEndIndex)
decisionProbs[k] *= decision.getProbability();
else if (k + 1 >= p && endIndex <= prevEndIndex)
decisionProbs[k] *= decision.getProbability();
}
}
// next k
}
// next decision (only 2 of these)
prevEndIndex = endIndex;
p++;
}
// transform to probability distribution
double sumProbs = 0;
for (int k = 0; k < decisionProbs.length; k++) sumProbs += decisionProbs[k];
if (sumProbs > 0)
for (int k = 0; k < decisionProbs.length; k++) decisionProbs[k] /= sumProbs;
// Apply the default decision.
// Since this is the default decision for all tokens in the sequence,
// we don't add the other tokens for now, so as to allow them to get
// examined one at a time, in case one of them starts its own separate
// sequence.
Decision defaultDecision = new Decision(defaultOutcome.name(), decisionProbs[0]);
defaultDecision.addAuthority("_" + this.getClass().getSimpleName());
defaultDecision.addAuthority("_" + "Patterns");
for (TokenPatternMatchSequence matchSequence : matchSequences) {
defaultDecision.addAuthority(matchSequence.getTokenPattern().getName());
}
TaggedToken<TokeniserOutcome> defaultTaggedToken = new TaggedToken<>(token, defaultDecision, TokeniserOutcome.valueOf(defaultDecision.getOutcome()));
TokenisedAtomicTokenSequence defaultSequence = new TokenisedAtomicTokenSequence(history);
defaultSequence.add(defaultTaggedToken);
defaultSequence.addDecision(defaultDecision);
heap.add(defaultSequence);
// Apply one non-default decision per match sequence
for (int k = 0; k < matchSequences.size(); k++) {
TokenPatternMatchSequence matchSequence = matchSequences.get(k);
double prob = decisionProbs[k + 1];
Decision decision = new Decision(otherOutcome.name(), prob);
decision.addAuthority("_" + this.getClass().getSimpleName());
decision.addAuthority("_" + "Patterns");
decision.addAuthority(matchSequence.getTokenPattern().getName());
TaggedToken<TokeniserOutcome> taggedToken = new TaggedToken<>(token, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
TokenisedAtomicTokenSequence newSequence = new TokenisedAtomicTokenSequence(history);
newSequence.add(taggedToken);
newSequence.addDecision(decision);
// add all the other tokens in this match sequence to the solution
for (Token tokenInSequence : matchSequence.getTokensToCheck()) {
if (tokenInSequence.equals(token)) {
continue;
}
Decision decisionInSequence = new Decision(decision.getOutcome());
decisionInSequence.addAuthority("_" + this.getClass().getSimpleName());
decisionInSequence.addAuthority("_" + "DecisionInSequence");
decisionInSequence.addAuthority("_" + "DecisionInSequence_non_default");
decisionInSequence.addAuthority("_" + "Patterns");
TaggedToken<TokeniserOutcome> taggedTokenInSequence = new TaggedToken<>(tokenInSequence, decisionInSequence, TokeniserOutcome.valueOf(decisionInSequence.getOutcome()));
newSequence.add(taggedTokenInSequence);
}
heap.add(newSequence);
}
// next sequence
} else {
// token doesn't start a match sequence, and hasn't already
// been added to the current sequence
Decision decision = defaultDecisions.get(i);
if (matchedTokens.contains(token)) {
decision = new Decision(decision.getOutcome());
decision.addAuthority("_" + this.getClass().getSimpleName());
decision.addAuthority("_" + "DecisionInSequence");
decision.addAuthority("_" + "DecisionInSequence_default");
decision.addAuthority("_" + "Patterns");
}
TaggedToken<TokeniserOutcome> taggedToken = new TaggedToken<>(token, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
TokenisedAtomicTokenSequence newSequence = new TokenisedAtomicTokenSequence(history);
newSequence.add(taggedToken);
heap.add(newSequence);
}
}
// next sequence in the old heap
}
// next token
sequences = new ArrayList<TokenisedAtomicTokenSequence>();
int k = 0;
while (!heap.isEmpty()) {
sequences.add(heap.poll());
k++;
if (k >= this.getBeamWidth())
break;
}
} else {
sequences = new ArrayList<TokenisedAtomicTokenSequence>();
TokenisedAtomicTokenSequence defaultSequence = new TokenisedAtomicTokenSequence(sentence, 0, this.getSessionId());
int i = 0;
for (Token token : initialSequence.listWithWhiteSpace()) {
Decision decision = defaultDecisions.get(i++);
TaggedToken<TokeniserOutcome> taggedToken = new TaggedToken<>(token, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
defaultSequence.add(taggedToken);
}
sequences.add(defaultSequence);
}
// have decision maker?
return sequences;
}
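The heap handling above implements a beam search. A self-contained sketch of the pruning idea in plain Java (not the Talismane API; the scores reuse the hypothetical worked example above):

import java.util.Comparator;
import java.util.PriorityQueue;

public class BeamSketch {
    public static void main(String[] args) {
        int beamWidth = 2; // hypothetical beam width
        // hypothetical scores of the candidate sequences built so far
        PriorityQueue<Double> previousHeap = new PriorityQueue<>(Comparator.reverseOrder());
        previousHeap.add(0.08);
        previousHeap.add(0.36);
        previousHeap.add(0.55);
        // limit the heap breadth to the beam width, as in tokeniseInternal
        int maxSequences = Math.min(previousHeap.size(), beamWidth);
        for (int j = 0; j < maxSequences; j++) {
            // only the best hypotheses are expanded; 0.08 falls off the beam
            System.out.println("expanding hypothesis with score " + previousHeap.poll());
        }
    }
}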
use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
the class CombinedLexicalAttributesTest method testCheckInternalMultipleEntries.
@Test
public void testCheckInternalMultipleEntries() throws Exception {
System.setProperty("config.file", "src/test/resources/testWithLex.conf");
ConfigFactory.invalidateCaches();
final Config config = ConfigFactory.load();
final String sessionId = "test";
Sentence sentence = new Sentence("je demande", sessionId);
TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
Token token = new Token("demande", tokenSequence, 1, "je ".length(), "je demande".length(), sessionId);
Decision decision = new Decision("V", 1.0);
final PosTaggedToken posTaggedToken = new PosTaggedToken(token, decision, sessionId);
PosTaggedTokenAddressFunction<PosTaggerContext> addressFunction = new AbstractPosTaggedTokenAddressFunction() {
@Override
protected FeatureResult<PosTaggedTokenWrapper> checkInternal(PosTaggerContext context, RuntimeEnvironment env) {
return this.generateResult(posTaggedToken);
}
};
StringLiteralFeature<PosTaggedTokenWrapper> person = new StringLiteralFeature<>(LexicalAttribute.Person.name());
CombinedLexicalAttributesFeature<PosTaggerContext> feature = new CombinedLexicalAttributesFeature<>(addressFunction, person);
PosTagSequence history = new PosTagSequence(tokenSequence);
PosTaggerContext context = new PosTaggerContextImpl(token, history);
RuntimeEnvironment env = new RuntimeEnvironment();
FeatureResult<String> featureResult = feature.checkInternal(context, env);
String outcome = featureResult.getOutcome();
System.out.println(outcome);
assertEquals("1;3", outcome);
}
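Here the test lexicon loaded via testWithLex.conf (an assumption about the test fixture) gives the verb "demande" two entries differing in person, and CombinedLexicalAttributesFeature concatenates the distinct Person values with ";", hence the expected "1;3".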
use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
the class CombinedLexicalAttributesTest method testCheckInternalMultipleAttributes.
@Test
public void testCheckInternalMultipleAttributes() throws Exception {
System.setProperty("config.file", "src/test/resources/testWithLex.conf");
ConfigFactory.invalidateCaches();
final Config config = ConfigFactory.load();
final String sessionId = "test";
Sentence sentence = new Sentence("blah", sessionId);
TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
Token token = new Token("blah", tokenSequence, 1, "".length(), "blah".length(), sessionId);
Decision decision = new Decision("V", 1.0);
final PosTaggedToken posTaggedToken = new PosTaggedToken(token, decision, sessionId);
PosTaggedTokenAddressFunction<PosTaggerContext> addressFunction = new AbstractPosTaggedTokenAddressFunction() {
@Override
protected FeatureResult<PosTaggedTokenWrapper> checkInternal(PosTaggerContext context, RuntimeEnvironment env) {
return this.generateResult(posTaggedToken);
}
};
StringLiteralFeature<PosTaggedTokenWrapper> person = new StringLiteralFeature<>(LexicalAttribute.Person.name());
StringLiteralFeature<PosTaggedTokenWrapper> number = new StringLiteralFeature<>(LexicalAttribute.Number.name());
CombinedLexicalAttributesFeature<PosTaggerContext> feature = new CombinedLexicalAttributesFeature<>(addressFunction, person, number);
PosTagSequence history = new PosTagSequence(tokenSequence);
PosTaggerContext context = new PosTaggerContextImpl(token, history);
RuntimeEnvironment env = new RuntimeEnvironment();
FeatureResult<String> featureResult = feature.checkInternal(context, env);
String outcome = featureResult.getOutcome();
System.out.println(outcome);
assertEquals("1;3|p;s", outcome);
}
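Judging by the expected outcome, with several attributes the combined feature first merges the values of each attribute across entries (Person "1;3", Number "p;s") and then joins the attributes with "|", giving "1;3|p;s".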
use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
the class LexicalAttributeFeatureTest method testCheckInternalMultipleAttributes.
@Test
public void testCheckInternalMultipleAttributes() throws Exception {
System.setProperty("config.file", "src/test/resources/testWithLex.conf");
ConfigFactory.invalidateCaches();
final Config config = ConfigFactory.load();
final String sessionId = "test";
Sentence sentence = new Sentence("blah", sessionId);
TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
Token token = new Token("blah", tokenSequence, 1, "".length(), "blah".length(), sessionId);
Decision decision = new Decision("V", 1.0);
final PosTaggedToken posTaggedToken = new PosTaggedToken(token, decision, sessionId);
PosTaggedTokenAddressFunction<PosTaggerContext> addressFunction = new AbstractPosTaggedTokenAddressFunction() {
@Override
protected FeatureResult<PosTaggedTokenWrapper> checkInternal(PosTaggerContext context, RuntimeEnvironment env) {
return this.generateResult(posTaggedToken);
}
};
StringLiteralFeature<PosTaggedTokenWrapper> person = new StringLiteralFeature<>(LexicalAttribute.Person.name());
StringLiteralFeature<PosTaggedTokenWrapper> number = new StringLiteralFeature<>(LexicalAttribute.Number.name());
LexicalAttributeFeature<PosTaggerContext> feature = new LexicalAttributeFeature<>(addressFunction, person, number);
PosTagSequence history = new PosTagSequence(tokenSequence);
PosTaggerContext context = new PosTaggerContextImpl(token, history);
RuntimeEnvironment env = new RuntimeEnvironment();
FeatureResult<List<WeightedOutcome<String>>> featureResult = feature.checkInternal(context, env);
List<WeightedOutcome<String>> outcomes = featureResult.getOutcome();
System.out.println(outcomes);
for (WeightedOutcome<String> outcome : outcomes) {
assertTrue("3|p".equals(outcome.getOutcome()) || "1|s".equals(outcome.getOutcome()) || "3|s".equals(outcome.getOutcome()));
}
assertEquals(3, outcomes.size());
}
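By contrast with CombinedLexicalAttributesFeature above, LexicalAttributeFeature returns one weighted outcome per lexical entry, each combining that entry's own attribute values with "|" ("3|p", "1|s", "3|s"), rather than merging values across entries.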