Use of com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator in project talismane by joliciel-informatique.
From the class TokenPerLineCorpusReader, method hasNextSentence():
@Override
public boolean hasNextSentence() throws TalismaneException, IOException {
if (this.getMaxSentenceCount() > 0 && sentenceCount >= this.getMaxSentenceCount()) {
// we've reached the end, do nothing
} else {
while (sentenceLines == null) {
List<UnprocessedLine> lines = new ArrayList<>();
int skippedLineCount = 0;
if (!this.hasNextLine())
break;
while ((this.hasNextLine() || lines.size() > 0) && sentenceLines == null) {
String line = "";
if (this.hasNextLine())
line = this.nextLine().replace("\r", "");
lineNumber++;
if (LOG.isTraceEnabled())
LOG.trace("Line " + lineNumber + ": " + line);
if (line.length() > 0) {
boolean skip = false;
for (Pattern skipLinePattern : skipLinePatterns) {
if (skipLinePattern.matcher(line).matches()) {
if (LOG.isTraceEnabled())
LOG.trace("Skipping by pattern: " + skipLinePattern.pattern());
skip = true;
skippedLineCount++;
break;
}
}
List<CorpusSentenceRule> myRules = new ArrayList<>();
List<Matcher> myMatchers = new ArrayList<>();
for (CorpusSentenceRule sentenceRule : sentenceRules) {
Matcher matcher = sentenceRule.getPattern().matcher(line);
if (matcher.matches()) {
if (LOG.isTraceEnabled())
LOG.trace("Matched rule: " + sentenceRule);
myRules.add(sentenceRule);
myMatchers.add(matcher);
}
}
UnprocessedLine unprocessedLine = new UnprocessedLine(line, lineNumber, skip, myRules, myMatchers);
lines.add(unprocessedLine);
} else {
if (lines.size() == 0 || lines.size() == skippedLineCount) {
lines = new ArrayList<>();
skippedLineCount = 0;
continue;
}
// end of sentence
boolean includeMe = true;
// check cross-validation
if (this.getCrossValidationSize() > 0) {
if (this.getIncludeIndex() >= 0) {
if (sentenceCount % this.getCrossValidationSize() != this.getIncludeIndex()) {
includeMe = false;
}
} else if (this.getExcludeIndex() >= 0) {
if (sentenceCount % this.getCrossValidationSize() == this.getExcludeIndex()) {
includeMe = false;
}
}
}
if (this.getStartSentence() > sentenceCount) {
includeMe = false;
}
sentenceCount++;
LOG.debug("sentenceCount: " + sentenceCount);
if (!includeMe) {
lines = new ArrayList<>();
skippedLineCount = 0;
continue;
}
sentenceLines = new ArrayList<>();
for (UnprocessedLine unprocessedLine : lines) {
if (!unprocessedLine.skip) {
CorpusLine corpusLine = corpusLineReader.read(unprocessedLine.line, unprocessedLine.lineNumber);
sentenceLines.add(corpusLine);
if (this.lexicalEntryReader != null) {
WritableLexicalEntry lexicalEntry = new CompactLexicalEntry(lexicalEntrySupport);
this.lexicalEntryReader.readEntry(unprocessedLine.line, lexicalEntry);
corpusLine.setLexicalEntry(lexicalEntry);
}
}
}
List<CorpusSentenceRule.MergeAction> mergeActions = new ArrayList<>();
for (UnprocessedLine unprocessedLine : lines) {
if (LOG.isTraceEnabled())
LOG.trace("Line " + unprocessedLine);
for (int i = 0; i < unprocessedLine.sentenceRules.size(); i++) {
CorpusSentenceRule sentenceRule = unprocessedLine.sentenceRules.get(i);
Matcher matcher = unprocessedLine.matchers.get(i);
if (LOG.isTraceEnabled())
LOG.trace("Testing rule " + sentenceRule);
CorpusSentenceRule.Action action = sentenceRule.apply(unprocessedLine.line, unprocessedLine.lineNumber, matcher, sentenceLines);
if (LOG.isTraceEnabled())
LOG.trace("Result: " + action);
if (action != null) {
if (action instanceof MergeAction)
mergeActions.add((MergeAction) action);
break;
}
}
}
if (mergeActions.size() > 0) {
List<CorpusLine> newSentenceLines = new ArrayList<>();
Map<Integer, MergeAction> indexesToMerge = new TreeMap<>();
for (CorpusSentenceRule.MergeAction mergeAction : mergeActions) {
for (CorpusLine lineToMerge : mergeAction.getLinesToMerge()) {
indexesToMerge.put(lineToMerge.getIndex(), mergeAction);
}
}
int i = 1;
Iterator<Integer> iIndexToMerge = indexesToMerge.keySet().iterator();
int nextIndexToMerge = iIndexToMerge.next();
int linesRemoved = 0;
Map<Integer, Integer> indexChangeMap = new HashMap<>();
indexChangeMap.put(0, 0);
for (CorpusLine corpusLine : sentenceLines) {
if (i == nextIndexToMerge) {
MergeAction mergeAction = indexesToMerge.get(i);
if (i == mergeAction.getFirstIndex()) {
newSentenceLines.add(mergeAction.getMergedLine());
linesRemoved -= 1;
}
linesRemoved += 1;
if (iIndexToMerge.hasNext())
nextIndexToMerge = iIndexToMerge.next();
else
nextIndexToMerge = -1;
} else {
newSentenceLines.add(corpusLine);
}
indexChangeMap.put(i, i - linesRemoved);
i++;
}
for (CorpusLine corpusLine : newSentenceLines) {
corpusLine.setElement(CorpusElement.INDEX, "" + indexChangeMap.get(corpusLine.getIndex()));
int governorIndex = corpusLine.getGovernorIndex();
if (governorIndex >= 0)
corpusLine.setElement(CorpusElement.GOVERNOR, "" + indexChangeMap.get(corpusLine.getGovernorIndex()));
int nonProjGovernorIndex = corpusLine.getNonProjGovernorIndex();
if (nonProjGovernorIndex >= 0)
corpusLine.setElement(CorpusElement.NON_PROJ_GOVERNOR, "" + indexChangeMap.get(corpusLine.getNonProjGovernorIndex()));
}
sentenceLines = newSentenceLines;
}
Sentence sentence = null;
if (sentenceReader != null && sentenceReader.hasNextSentence()) {
sentence = sentenceReader.nextSentence();
} else {
LinguisticRules rules = TalismaneSession.get(sessionId).getLinguisticRules();
if (rules == null)
throw new TalismaneException("Linguistic rules have not been set.");
String text = "";
for (CorpusLine corpusLine : sentenceLines) {
String word = corpusLine.getElement(CorpusElement.TOKEN);
if (rules.shouldAddSpace(text, word))
text += " ";
text += word;
}
sentence = new Sentence(text, currentFile, sessionId);
}
for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
sentenceAnnotator.annotate(sentence);
}
this.processSentence(sentence, sentenceLines);
}
}
}
}
return (sentenceLines != null);
}
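A minimal driver loop for a corpus reader of this kind could look like the sketch below. The factory call, configuration key and iteration methods are reused from the Talismane.analyse method further down; the file handling, the session id and the import paths are assumptions for illustration, and the TalismaneSession is assumed to have been configured elsewhere.
import java.io.Reader;
import java.nio.file.Files;
import java.nio.file.Paths;

import com.joliciel.talismane.tokeniser.TokenSequence;
import com.joliciel.talismane.tokeniser.TokeniserAnnotatedCorpusReader;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

public class CorpusReaderSketch {
    public static void main(String[] args) throws Exception {
        String sessionId = "mySession"; // hypothetical: must match a configured TalismaneSession
        Config config = ConfigFactory.load();
        try (Reader reader = Files.newBufferedReader(Paths.get(args[0]))) {
            // same factory call as in Talismane.analyse below
            TokeniserAnnotatedCorpusReader corpusReader = TokeniserAnnotatedCorpusReader.getCorpusReader(reader,
                config.getConfig("talismane.core." + sessionId + ".tokeniser.input"), sessionId);
            // hasNextSentence() buffers one sentence at a time, as in the method above
            while (corpusReader.hasNextSentence()) {
                TokenSequence tokenSequence = corpusReader.nextTokenSequence();
                System.out.println(tokenSequence);
            }
        }
    }
}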
Use of com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator in project talismane by joliciel-informatique.
From the class ParserEvaluator, method evaluate():
/**
* @throws TalismaneException
* if an attempt is made to evaluate with a tokeniser but no
* pos-tagger
* @throws IOException
*/
public void evaluate() throws TalismaneException, IOException {
while (corpusReader.hasNextSentence()) {
ParseConfiguration realConfiguration = corpusReader.nextConfiguration();
List<PosTagSequence> posTagSequences = null;
List<TokenSequence> tokenSequences = null;
if (tokeniser != null) {
if (posTagger == null)
throw new TalismaneException("Cannot evaluate with tokeniser but no pos-tagger");
Sentence sentence = realConfiguration.getPosTagSequence().getTokenSequence().getSentence();
// annotate the sentence for pre-token filters
for (SentenceAnnotator annotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
annotator.annotate(sentence);
if (LOG.isTraceEnabled()) {
LOG.trace("TokenFilter: " + annotator);
LOG.trace("annotations: " + sentence.getAnnotations());
}
}
tokenSequences = tokeniser.tokenise(sentence);
} else {
tokenSequences = new ArrayList<TokenSequence>();
PosTagSequence posTagSequence = realConfiguration.getPosTagSequence().clonePosTagSequence();
posTagSequence.removeRoot();
tokenSequences.add(posTagSequence.getTokenSequence());
}
if (posTagger != null) {
if (posTagger instanceof NonDeterministicPosTagger) {
NonDeterministicPosTagger nonDeterministicPosTagger = (NonDeterministicPosTagger) posTagger;
posTagSequences = nonDeterministicPosTagger.tagSentence(tokenSequences);
} else {
posTagSequences = new ArrayList<PosTagSequence>();
PosTagSequence posTagSequence = null;
posTagSequence = posTagger.tagSentence(tokenSequences.get(0));
posTagSequences.add(posTagSequence);
}
} else {
PosTagSequence posTagSequence = realConfiguration.getPosTagSequence();
posTagSequences = new ArrayList<PosTagSequence>();
posTagSequences.add(posTagSequence);
}
for (ParseEvaluationObserver observer : this.observers) {
observer.onParseStart(realConfiguration, posTagSequences);
}
List<ParseConfiguration> guessedConfigurations = null;
if (parser instanceof NonDeterministicParser) {
NonDeterministicParser nonDeterministicParser = (NonDeterministicParser) parser;
guessedConfigurations = nonDeterministicParser.parseSentence(posTagSequences);
} else {
ParseConfiguration bestGuess = parser.parseSentence(posTagSequences.get(0));
guessedConfigurations = new ArrayList<ParseConfiguration>();
guessedConfigurations.add(bestGuess);
}
for (ParseEvaluationObserver observer : this.observers) {
observer.onParseEnd(realConfiguration, guessedConfigurations);
}
}
for (ParseEvaluationObserver observer : this.observers) {
observer.onEvaluationComplete();
}
}
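The observers driven by this method only need the three callbacks invoked above. Below is a sketch of a trivial observer that counts evaluated sentences; the package of ParseEvaluationObserver is assumed, and if the real interface declares further methods they would also have to be implemented.
import java.util.List;

import com.joliciel.talismane.parser.ParseConfiguration;
import com.joliciel.talismane.parser.evaluate.ParseEvaluationObserver; // package assumed
import com.joliciel.talismane.posTagger.PosTagSequence;

public class SentenceCountObserver implements ParseEvaluationObserver {
    private int sentenceCount = 0;

    @Override
    public void onParseStart(ParseConfiguration realConfiguration, List<PosTagSequence> posTagSequences) {
        // called once per corpus sentence, before parsing starts
    }

    @Override
    public void onParseEnd(ParseConfiguration realConfiguration, List<ParseConfiguration> guessedConfigurations) {
        // called once per corpus sentence, with the parser's guessed configurations
        sentenceCount++;
    }

    @Override
    public void onEvaluationComplete() {
        System.out.println("Evaluated " + sentenceCount + " sentences");
    }
}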
Use of com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator in project talismane by joliciel-informatique.
From the class Talismane, method analyse():
/**
* Analyse the data provided by this reader, as specified by the
* configuration.
*
* @param reader
* @throws IOException
* @throws ReflectiveOperationException
* @throws TalismaneException
* if it's impossible to read a sentence from an annotated corpus
*/
public void analyse(Reader reader) throws IOException, ReflectiveOperationException, TalismaneException {
long startTime = System.currentTimeMillis();
try {
TokeniserAnnotatedCorpusReader tokenCorpusReader = null;
PosTagAnnotatedCorpusReader posTagCorpusReader = null;
if (this.startModule.equals(Module.posTagger)) {
tokenCorpusReader = TokeniserAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".tokeniser.input"), sessionId);
}
if (this.startModule.equals(Module.parser)) {
posTagCorpusReader = PosTagAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".pos-tagger.input"), sessionId);
}
LinkedList<String> textSegments = new LinkedList<String>();
LinkedList<Sentence> sentences = new LinkedList<Sentence>();
TokenSequence tokenSequence = null;
PosTagSequence posTagSequence = null;
StringBuilder stringBuilder = new StringBuilder();
boolean finished = false;
int sentenceCount = 0;
CurrentFileProvider currentFileProvider = reader instanceof CurrentFileProvider ? (CurrentFileProvider) reader : null;
RollingTextBlock rollingTextBlock = new RollingTextBlock(this.processByDefault, currentFileProvider, sessionId);
int endBlockCharacterCount = 0;
URI currentURI = null;
File currentFile = null;
while (!finished) {
if (this.startModule.equals(Module.sentenceDetector) || this.startModule.equals(Module.tokeniser)) {
// Note: the SentenceDetector and Tokeniser start modules are treated identically,
// except that for SentenceDetector we apply a probabilistic sentence detector,
// whereas for Tokeniser we assume all sentence breaks are marked by filters.
// Read characters from the reader, one at a time.
char c;
int r = -1;
try {
r = reader.read();
} catch (IOException e) {
LogUtils.logError(LOG, e);
}
if (r == -1) {
finished = true;
c = '\n';
} else {
c = (char) r;
}
// Jump out if we have 3 consecutive end-block characters.
if (c == TalismaneSession.get(sessionId).getEndBlockCharacter()) {
endBlockCharacterCount++;
if (endBlockCharacterCount == 3) {
LOG.info("Three consecutive end-block characters. Exiting.");
finished = true;
}
} else {
endBlockCharacterCount = 0;
}
// have sentence detector
if (finished || (Character.isWhitespace(c) && c != '\r' && c != '\n' && stringBuilder.length() > TalismaneSession.get(sessionId).getBlockSize()) || c == TalismaneSession.get(sessionId).getEndBlockCharacter()) {
if (c == TalismaneSession.get(sessionId).getEndBlockCharacter())
stringBuilder.append(c);
if (stringBuilder.length() > 0) {
String textSegment = stringBuilder.toString();
stringBuilder = new StringBuilder();
textSegments.add(textSegment);
}
// is the current block > 0 characters?
if (c == TalismaneSession.get(sessionId).getEndBlockCharacter()) {
textSegments.addLast("");
}
}
if (finished) {
if (stringBuilder.length() > 0) {
textSegments.addLast(stringBuilder.toString());
stringBuilder = new StringBuilder();
}
// add three final text segments to roll everything
// through processing
textSegments.addLast("");
textSegments.addLast("");
textSegments.addLast("");
}
if (c != TalismaneSession.get(sessionId).getEndBlockCharacter())
stringBuilder.append(c);
while (textSegments.size() > 0) {
// roll in a new block 4, and roll the other blocks
// leftwards
String nextText = textSegments.removeFirst();
rollingTextBlock = rollingTextBlock.roll(nextText);
// annotate block 3 with raw text filters
AnnotatedText rawTextBlock = rollingTextBlock.getRawTextBlock();
for (RawTextAnnotator textAnnotator : TalismaneSession.get(sessionId).getTextAnnotators()) {
textAnnotator.annotate(rawTextBlock);
}
// detect sentences in block 2 using the sentence
// detector
AnnotatedText processedText = rollingTextBlock.getProcessedText();
if (LOG.isTraceEnabled()) {
LOG.trace("processedText: " + processedText.getText().toString().replace('\n', '¶').replace('\r', '¶'));
}
if (this.startModule.equals(Module.sentenceDetector)) {
sentenceDetector.detectSentences(processedText);
}
// get the sentences detected in block 2
List<Sentence> theSentences = rollingTextBlock.getDetectedSentences();
for (Sentence sentence : theSentences) {
sentences.add(sentence);
sentenceCount++;
}
if (this.sentenceCount > 0 && sentenceCount >= this.sentenceCount) {
finished = true;
}
}
// we have at least one text segment to process
} else if (this.startModule.equals(Module.posTagger)) {
if (tokenCorpusReader.hasNextSentence()) {
tokenSequence = tokenCorpusReader.nextTokenSequence();
} else {
tokenSequence = null;
finished = true;
}
} else if (this.startModule.equals(Module.parser)) {
if (posTagCorpusReader.hasNextSentence()) {
posTagSequence = posTagCorpusReader.nextPosTagSequence();
} else {
posTagSequence = null;
finished = true;
}
}
// which start module?
boolean needToProcess = false;
if (this.startModule.equals(Module.sentenceDetector) || this.startModule.equals(Module.tokeniser))
needToProcess = !sentences.isEmpty();
else if (this.startModule.equals(Module.posTagger))
needToProcess = tokenSequence != null;
else if (this.startModule.equals(Module.parser))
needToProcess = posTagSequence != null;
while (needToProcess) {
Sentence sentence = null;
if (this.startModule.compareTo(Module.tokeniser) <= 0 && this.endModule.compareTo(Module.sentenceDetector) >= 0) {
sentence = sentences.poll();
LOG.debug("Sentence: " + sentence);
for (SentenceAnnotator annotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) annotator.annotate(sentence);
if (sentence.getFileURI() != null && !sentence.getFileURI().equals(currentURI)) {
currentURI = sentence.getFileURI();
currentFile = sentence.getFile();
LOG.debug("Setting current file to " + currentFile.getPath());
if (writer instanceof CurrentFileObserver)
((CurrentFileObserver) writer).onNextFile(currentFile);
for (SentenceProcessor processor : sentenceProcessors) if (processor instanceof CurrentFileObserver)
((CurrentFileObserver) processor).onNextFile(currentFile);
for (TokenSequenceProcessor processor : tokenSequenceProcessors) if (processor instanceof CurrentFileObserver)
((CurrentFileObserver) processor).onNextFile(currentFile);
for (PosTagSequenceProcessor processor : posTagSequenceProcessors) if (processor instanceof CurrentFileObserver)
((CurrentFileObserver) processor).onNextFile(currentFile);
for (ParseConfigurationProcessor processor : parseConfigurationProcessors) if (processor instanceof CurrentFileObserver)
((CurrentFileObserver) processor).onNextFile(currentFile);
}
if (sentence.getLeftoverOriginalText().length() > 0) {
writer.append(sentence.getLeftoverOriginalText() + "\n");
}
for (SentenceProcessor sentenceProcessor : sentenceProcessors) {
sentenceProcessor.onNextSentence(sentence);
}
}
// need to read next sentence
List<TokenSequence> tokenSequences = null;
if (this.needsTokeniser()) {
tokenSequences = tokeniser.tokenise(sentence);
tokenSequence = tokenSequences.get(0);
for (TokenSequenceProcessor tokenSequenceProcessor : tokenSequenceProcessors) {
tokenSequenceProcessor.onNextTokenSequence(tokenSequence);
}
}
// need to tokenise ?
List<PosTagSequence> posTagSequences = null;
if (this.needsPosTagger()) {
posTagSequence = null;
if (tokenSequences == null) {
tokenSequences = new ArrayListNoNulls<>();
tokenSequences.add(tokenSequence);
}
if (posTagger instanceof NonDeterministicPosTagger) {
NonDeterministicPosTagger nonDeterministicPosTagger = (NonDeterministicPosTagger) posTagger;
posTagSequences = nonDeterministicPosTagger.tagSentence(tokenSequences);
posTagSequence = posTagSequences.get(0);
} else {
posTagSequence = posTagger.tagSentence(tokenSequence);
}
for (PosTagSequenceProcessor posTagSequenceProcessor : this.posTagSequenceProcessors) {
posTagSequenceProcessor.onNextPosTagSequence(posTagSequence);
}
tokenSequence = null;
}
if (this.needsParser()) {
if (posTagSequences == null) {
posTagSequences = new ArrayListNoNulls<>();
posTagSequences.add(posTagSequence);
}
ParseConfiguration parseConfiguration = null;
List<ParseConfiguration> parseConfigurations = null;
try {
if (parser instanceof NonDeterministicParser) {
NonDeterministicParser nonDeterministicParser = (NonDeterministicParser) parser;
parseConfigurations = nonDeterministicParser.parseSentence(posTagSequences);
parseConfiguration = parseConfigurations.get(0);
} else {
parseConfiguration = parser.parseSentence(posTagSequence);
}
for (ParseConfigurationProcessor parseConfigurationProcessor : this.parseConfigurationProcessors) {
parseConfigurationProcessor.onNextParseConfiguration(parseConfiguration);
}
} catch (Exception e) {
LogUtils.logError(LOG, e);
if (stopOnError)
throw new RuntimeException(e);
}
posTagSequence = null;
}
if (this.startModule.equals(Module.sentenceDetector) || this.startModule.equals(Module.tokeniser))
needToProcess = !sentences.isEmpty();
else if (this.startModule.equals(Module.posTagger))
needToProcess = tokenSequence != null;
else if (this.startModule.equals(Module.parser))
needToProcess = posTagSequence != null;
}
// next sentence
}
// Check if there's any leftover original text to output
if (rollingTextBlock.getLeftoverOriginalText().length() > 0)
writer.append(rollingTextBlock.getLeftoverOriginalText());
} finally {
IOException exception = null;
try {
reader.close();
writer.flush();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
for (SentenceProcessor processor : this.sentenceProcessors) try {
processor.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
for (TokenSequenceProcessor processor : this.tokenSequenceProcessors) try {
processor.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
for (PosTagSequenceProcessor processor : this.posTagSequenceProcessors) {
try {
processor.onCompleteAnalysis();
processor.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
}
for (ParseConfigurationProcessor processor : this.parseConfigurationProcessors) {
try {
processor.onCompleteParse();
processor.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
}
long endTime = System.currentTimeMillis();
long totalTime = endTime - startTime;
LOG.debug("Total time for Talismane.process(): " + totalTime);
try {
writer.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
if (exception != null)
throw exception;
}
}
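Each sentence produced by this pipeline is handed to the registered SentenceProcessor instances via onNextSentence, and the processors are closed in the finally block. A sketch of a minimal processor that echoes each sentence follows; the import paths are assumptions, the sketch presumes that onNextSentence and close are the only methods the interface requires, and getText() is taken from the AnnotatedText usage above.
import java.io.IOException;

import com.joliciel.talismane.rawText.Sentence; // package assumed
import com.joliciel.talismane.sentenceDetector.SentenceProcessor; // package assumed

public class EchoSentenceProcessor implements SentenceProcessor {
    @Override
    public void onNextSentence(Sentence sentence) {
        // called by Talismane.analyse() for every detected sentence
        System.out.println(sentence.getText());
    }

    @Override
    public void close() throws IOException {
        // nothing to release; called from the finally block of analyse()
    }
}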
Use of com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator in project talismane by joliciel-informatique.
From the class TalismaneAPIExamples, method example2():
/**
* Similar to example1, but begins with filtering and sentence detection.
*/
public static void example2(String sessionId) throws Exception {
String text = "Les gens qui voient de travers pensent que les bancs verts qu'on voit sur les trottoirs " + "sont faits pour les impotents ou les ventripotents. " + "Mais c'est une absurdité, car, à la vérité, ils sont là, c'est notoire, " + "pour accueillir quelque temps les amours débutants.";
RawText rawText = new RawText(text, true, sessionId);
// apply the raw text annotators, e.g. to correct encoding or quotation issues
for (RawTextAnnotator filter : TalismaneSession.get(sessionId).getTextAnnotators()) {
filter.annotate(rawText);
}
// retrieve the processed text after filters have been applied
AnnotatedText processedText = rawText.getProcessedText();
// detect sentences
SentenceDetector sentenceDetector = SentenceDetector.getInstance(sessionId);
sentenceDetector.detectSentences(processedText);
// the detected sentences can be retrieved directly from the raw text
// this allows annotations made on the sentences to get reflected in the
// raw text
List<Sentence> sentences = rawText.getDetectedSentences();
for (Sentence sentence : sentences) {
// apply the sentence annotators (e.g. to force a particular pos-tag assignment for a given word)
for (SentenceAnnotator annotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
annotator.annotate(sentence);
}
// tokenise the text
Tokeniser tokeniser = Tokeniser.getInstance(sessionId);
TokenSequence tokenSequence = tokeniser.tokeniseSentence(sentence);
// pos-tag the token sequence
PosTagger posTagger = PosTaggers.getPosTagger(sessionId);
PosTagSequence posTagSequence = posTagger.tagSentence(tokenSequence);
System.out.println(posTagSequence);
// parse the pos-tag sequence
Parser parser = Parsers.getParser(sessionId);
ParseConfiguration parseConfiguration = parser.parseSentence(posTagSequence);
System.out.println(parseConfiguration);
ParseTree parseTree = new ParseTree(parseConfiguration, true);
System.out.println(parseTree);
}
}
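Running this example requires nothing more than a session id whose configuration (language pack, models, and so on) has been set up beforehand. The sketch below simply invokes it; the id value is an arbitrary placeholder, and TalismaneAPIExamples is assumed to be on the classpath.
public class Example2Runner {
    public static void main(String[] args) throws Exception {
        // "fr" is a placeholder: the id must refer to a fully configured Talismane session
        TalismaneAPIExamples.example2("fr");
    }
}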
Use of com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator in project talismane by joliciel-informatique.
From the class StandoffReader, method hasNextSentence():
@Override
public boolean hasNextSentence() throws TalismaneException, IOException {
if (this.getMaxSentenceCount() > 0 && sentenceCount >= this.getMaxSentenceCount()) {
// we've reached the end, do nothing
} else {
if (configuration == null && sentenceIndex < sentences.size()) {
List<StandoffToken> tokens = sentences.get(sentenceIndex++);
LinguisticRules rules = TalismaneSession.get(sessionId).getLinguisticRules();
if (rules == null)
throw new RuntimeException("Linguistic rules have not been set.");
String text = "";
for (StandoffToken standoffToken : tokens) {
String word = standoffToken.text;
if (rules.shouldAddSpace(text, word))
text += " ";
text += word;
}
Sentence sentence = new Sentence(text, sessionId);
for (SentenceAnnotator annotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
annotator.annotate(sentence);
}
PretokenisedSequence tokenSequence = new PretokenisedSequence(sentence, sessionId);
PosTagSequence posTagSequence = new PosTagSequence(tokenSequence);
Map<String, PosTaggedToken> idTokenMap = new HashMap<String, PosTaggedToken>();
for (StandoffToken standoffToken : tokens) {
Token token = tokenSequence.addToken(standoffToken.text);
Decision posTagDecision = new Decision(standoffToken.posTag.getCode());
PosTaggedToken posTaggedToken = new PosTaggedToken(token, posTagDecision, sessionId);
if (LOG.isTraceEnabled()) {
LOG.trace(posTaggedToken.toString());
}
posTaggedToken.setComment(standoffToken.comment);
posTagSequence.addPosTaggedToken(posTaggedToken);
idTokenMap.put(standoffToken.id, posTaggedToken);
LOG.debug("Found token " + standoffToken.id + ", " + posTaggedToken);
}
tokenSequence.setWithRoot(true);
configuration = new ParseConfiguration(posTagSequence);
for (StandoffToken standoffToken : tokens) {
StandoffRelation relation = relationMap.get(standoffToken.id);
if (relation != null) {
PosTaggedToken head = idTokenMap.get(relation.fromToken);
PosTaggedToken dependent = idTokenMap.get(relation.toToken);
if (head == null) {
throw new TalismaneException("No token found for head id: " + relation.fromToken);
}
if (dependent == null) {
throw new TalismaneException("No token found for dependent id: " + relation.toToken);
}
DependencyArc arc = configuration.addDependency(head, dependent, relation.label, null);
arc.setComment(relation.comment);
} else if (standoffToken.posTag.getOpenClassIndicator() == PosTagOpenClassIndicator.PUNCTUATION) {
if (punctuationDepLabel != null) {
PosTaggedToken dependent = idTokenMap.get(standoffToken.id);
for (int i = dependent.getIndex() - 1; i >= 0; i--) {
PosTaggedToken head = posTagSequence.get(i);
if (head.getTag().getOpenClassIndicator() == PosTagOpenClassIndicator.PUNCTUATION)
continue;
configuration.addDependency(head, dependent, punctuationDepLabel, null);
break;
}
}
}
}
}
}
return (configuration != null);
}
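Like the corpus-reader sketch earlier, a driver loop for this class can be kept very small. hasNextSentence() is taken from the method above; the constructor arguments, the nextConfiguration() accessor (modelled on the corpus reader used by ParserEvaluator) and the import paths are assumptions.
import java.io.Reader;
import java.nio.file.Files;
import java.nio.file.Paths;

import com.joliciel.talismane.parser.ParseConfiguration;
import com.joliciel.talismane.standoff.StandoffReader; // package assumed
import com.typesafe.config.ConfigFactory;

public class StandoffReaderSketch {
    public static void main(String[] args) throws Exception {
        String sessionId = "mySession"; // hypothetical: must match a configured TalismaneSession
        try (Reader reader = Files.newBufferedReader(Paths.get(args[0]))) {
            // constructor arguments are assumed; the real signature may differ
            StandoffReader corpusReader = new StandoffReader(reader,
                ConfigFactory.load().getConfig("talismane.core." + sessionId + ".parser.input"), sessionId);
            while (corpusReader.hasNextSentence()) {
                // nextConfiguration() is assumed, by analogy with ParserEvaluator's corpus reader
                ParseConfiguration configuration = corpusReader.nextConfiguration();
                System.out.println(configuration);
            }
        }
    }
}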