Use of com.joliciel.talismane.parser.ParseConfiguration in the project talismane by joliciel-informatique.
Class StackSearchFeature, method check.
/**
 * Walks the stack of the wrapped parse configuration and returns the first
 * token, at or after a starting position, for which the criterion feature
 * evaluates to true.
 * <p>
 * Positions are counted from 0 in the order the stack iterator returns the
 * tokens. The starting position is 1 unless an index feature is set, in
 * which case its outcome is used.
 *
 * @param wrapper
 *            gives access to the parse configuration whose stack is searched
 * @param env
 *            the runtime environment handed on to the nested features
 * @return a result wrapping the first matching token, or null if the index
 *         feature produced no result or no token satisfied the criterion
 * @throws TalismaneException
 *             if a nested feature check fails
 */
@Override
public FeatureResult<PosTaggedTokenWrapper> check(ParseConfigurationWrapper wrapper, RuntimeEnvironment env) throws TalismaneException {
    ParseConfiguration parseConfiguration = wrapper.getParseConfiguration();

    // Resolve the starting position: default 1, or the index feature's outcome.
    int startIndex = 1;
    if (indexFeature != null) {
        FeatureResult<Integer> startIndexResult = indexFeature.check(wrapper, env);
        if (startIndexResult == null) {
            return null;
        }
        startIndex = startIndexResult.getOutcome();
    }

    Iterator<PosTaggedToken> tokens = parseConfiguration.getStack().iterator();

    // A single address object is reused; only its token changes per candidate.
    ParseConfigurationAddress address = new ParseConfigurationAddress(env);
    address.setParseConfiguration(parseConfiguration);

    for (int position = 0; tokens.hasNext(); position++) {
        PosTaggedToken candidate = tokens.next();
        if (position < startIndex) {
            // Not yet at the starting position.
            continue;
        }
        address.setPosTaggedToken(candidate);
        FeatureResult<Boolean> verdict = criterionFeature.check(address, env);
        if (verdict != null && verdict.getOutcome()) {
            return this.generateResult(candidate);
        }
    }

    // Nothing on the stack satisfied the criterion.
    return null;
}
Use of com.joliciel.talismane.parser.ParseConfiguration in the project talismane by joliciel-informatique.
Class Talismane, method analyse.
/**
* Analyse the data provided by this reader, as specified by the
* configuration.
* <p>
* How the input is consumed depends on the start module:
* <ul>
* <li>sentence detector or tokeniser: the reader is read one character at a
* time, the text is cut into segments which are rolled through a
* {@link RollingTextBlock}, and the sentences found there are queued for
* processing;</li>
* <li>pos-tagger: token sequences are read from a
* {@link TokeniserAnnotatedCorpusReader};</li>
* <li>parser: pos-tag sequences are read from a
* {@link PosTagAnnotatedCorpusReader}.</li>
* </ul>
* Each queued unit then goes through the tokeniser, pos-tagger and parser
* stages for which the corresponding <code>needs...()</code> check returns
* true, and the registered processors are notified after each stage.
* <p>
* In all cases the reader, the writer and every registered processor are
* closed before this method returns.
*
* @param reader
*            the source of the raw text or annotated corpus; closed by this
*            method
* @throws IOException
*             if an I/O error occurs while analysing, or while flushing or
*             closing the reader, writer or processors (when several
*             clean-up steps fail, only the last failure is rethrown). An
*             error raised by reading a single character from the reader is
*             logged rather than thrown.
* @throws ReflectiveOperationException
*             presumably raised while obtaining a corpus reader - TODO
*             confirm against the corpus reader factories
* @throws TalismaneException
*             if it's impossible to read a sentence from an annotated corpus
*/
public void analyse(Reader reader) throws IOException, ReflectiveOperationException, TalismaneException {
long startTime = System.currentTimeMillis();
try {
// A corpus reader is only created for the start module that consumes
// pre-annotated input; otherwise both stay null.
TokeniserAnnotatedCorpusReader tokenCorpusReader = null;
PosTagAnnotatedCorpusReader posTagCorpusReader = null;
if (this.startModule.equals(Module.posTagger)) {
tokenCorpusReader = TokeniserAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".tokeniser.input"), sessionId);
}
if (this.startModule.equals(Module.parser)) {
posTagCorpusReader = PosTagAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".pos-tagger.input"), sessionId);
}
// Text segments waiting to be rolled into the rolling text block.
LinkedList<String> textSegments = new LinkedList<String>();
// Sentences detected so far and not yet processed.
LinkedList<Sentence> sentences = new LinkedList<Sentence>();
// Current unit read from a corpus reader (pos-tagger / parser start).
TokenSequence tokenSequence = null;
PosTagSequence posTagSequence = null;
// Accumulates the characters of the text segment being read.
StringBuilder stringBuilder = new StringBuilder();
boolean finished = false;
// Number of sentences detected so far; not to be confused with the field
// this.sentenceCount, which is used below as an upper limit.
int sentenceCount = 0;
CurrentFileProvider currentFileProvider = reader instanceof CurrentFileProvider ? (CurrentFileProvider) reader : null;
RollingTextBlock rollingTextBlock = new RollingTextBlock(this.processByDefault, currentFileProvider, sessionId);
int endBlockCharacterCount = 0;
// File of the most recently processed sentence, used to detect a change
// of file between sentences.
URI currentURI = null;
File currentFile = null;
// Each iteration reads one character (raw text input) or one annotated
// sequence (corpus input), then processes whatever became available.
while (!finished) {
if (this.startModule.equals(Module.sentenceDetector) || this.startModule.equals(Module.tokeniser)) {
// Note SentenceDetector and Tokeniser start modules treated
// identically,
// except that for SentenceDetector we apply a probabilistic
// sentence detector
// whereas for Tokeniser we assume all sentence breaks are
// marked by filters
// read characters from the reader, one at a time
char c;
int r = -1;
try {
r = reader.read();
} catch (IOException e) {
// NOTE(review): the read error is only logged; r stays -1, so the
// failure is handled exactly like a normal end of input.
LogUtils.logError(LOG, e);
}
if (r == -1) {
// End of input: stop after this iteration and treat the missing
// character as a newline.
finished = true;
c = '\n';
} else {
c = (char) r;
}
// Jump out if we have 3 consecutive end-block characters.
if (c == TalismaneSession.get(sessionId).getEndBlockCharacter()) {
endBlockCharacterCount++;
if (endBlockCharacterCount == 3) {
LOG.info("Three consecutive end-block characters. Exiting.");
finished = true;
}
} else {
endBlockCharacterCount = 0;
}
// Close the current text segment when input is finished, when we hit
// whitespace (other than CR/LF) and the buffer exceeds the block
// size, or when we hit an end-block character.
if (finished || (Character.isWhitespace(c) && c != '\r' && c != '\n' && stringBuilder.length() > TalismaneSession.get(sessionId).getBlockSize()) || c == TalismaneSession.get(sessionId).getEndBlockCharacter()) {
// The end-block character is kept at the end of the segment it
// terminates.
if (c == TalismaneSession.get(sessionId).getEndBlockCharacter())
stringBuilder.append(c);
// Only queue the segment if it contains at least one character.
if (stringBuilder.length() > 0) {
String textSegment = stringBuilder.toString();
stringBuilder = new StringBuilder();
textSegments.add(textSegment);
}
// An end-block character additionally queues an empty segment -
// presumably to push the terminated block one step further through
// the rolling window; TODO confirm.
if (c == TalismaneSession.get(sessionId).getEndBlockCharacter()) {
textSegments.addLast("");
}
}
if (finished) {
// The buffer has already been flushed by the branch above (its
// condition includes "finished"), so this is only a safety net.
if (stringBuilder.length() > 0) {
textSegments.addLast(stringBuilder.toString());
stringBuilder = new StringBuilder();
}
// add three final text segments to roll everything
// through processing
textSegments.addLast("");
textSegments.addLast("");
textSegments.addLast("");
}
// Buffer the character, unless it is an end-block character, which
// was already appended to the segment it closed.
if (c != TalismaneSession.get(sessionId).getEndBlockCharacter())
stringBuilder.append(c);
// Roll every queued text segment through the rolling text block.
while (textSegments.size() > 0) {
// roll in a new block 4, and roll the other blocks
// leftwards
String nextText = textSegments.removeFirst();
rollingTextBlock = rollingTextBlock.roll(nextText);
// annotate block 3 with raw text filters
AnnotatedText rawTextBlock = rollingTextBlock.getRawTextBlock();
for (RawTextAnnotator textAnnotator : TalismaneSession.get(sessionId).getTextAnnotators()) {
textAnnotator.annotate(rawTextBlock);
}
// detect sentences in block 2 using the sentence
// detector
AnnotatedText processedText = rollingTextBlock.getProcessedText();
if (LOG.isTraceEnabled()) {
LOG.trace("processedText: " + processedText.getText().toString().replace('\n', '¶').replace('\r', '¶'));
}
// The sentence detector is only applied when it is the start module;
// with a tokeniser start, sentence breaks must already be annotated.
if (this.startModule.equals(Module.sentenceDetector)) {
sentenceDetector.detectSentences(processedText);
}
// get the sentences detected in block 2
List<Sentence> theSentences = rollingTextBlock.getDetectedSentences();
for (Sentence sentence : theSentences) {
sentences.add(sentence);
sentenceCount++;
}
// A positive this.sentenceCount acts as a limit: stop reading once
// that many sentences have been detected.
if (this.sentenceCount > 0 && sentenceCount >= this.sentenceCount) {
finished = true;
}
}
} else if (this.startModule.equals(Module.posTagger)) {
// Pos-tagger start: read the next pre-tokenised sentence, if any.
if (tokenCorpusReader.hasNextSentence()) {
tokenSequence = tokenCorpusReader.nextTokenSequence();
} else {
tokenSequence = null;
finished = true;
}
} else if (this.startModule.equals(Module.parser)) {
// Parser start: read the next pre-tagged sentence, if any.
if (posTagCorpusReader.hasNextSentence()) {
posTagSequence = posTagCorpusReader.nextPosTagSequence();
} else {
posTagSequence = null;
finished = true;
}
}
// Is there anything to process? The answer depends on which input the
// start module produces.
boolean needToProcess = false;
if (this.startModule.equals(Module.sentenceDetector) || this.startModule.equals(Module.tokeniser))
needToProcess = !sentences.isEmpty();
else if (this.startModule.equals(Module.posTagger))
needToProcess = tokenSequence != null;
else if (this.startModule.equals(Module.parser))
needToProcess = posTagSequence != null;
// Process one unit per iteration until nothing is left.
while (needToProcess) {
Sentence sentence = null;
// Sentence-level handling; assumes Module is ordered in pipeline
// order, so that this means "starts at or before the tokeniser and
// ends at or after the sentence detector" - TODO confirm.
if (this.startModule.compareTo(Module.tokeniser) <= 0 && this.endModule.compareTo(Module.sentenceDetector) >= 0) {
sentence = sentences.poll();
LOG.debug("Sentence: " + sentence);
for (SentenceAnnotator annotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) annotator.annotate(sentence);
// When the sentence comes from a different file than the previous
// one, tell the writer and every processor that observes file
// changes.
if (sentence.getFileURI() != null && !sentence.getFileURI().equals(currentURI)) {
currentURI = sentence.getFileURI();
currentFile = sentence.getFile();
LOG.debug("Setting current file to " + currentFile.getPath());
if (writer instanceof CurrentFileObserver)
((CurrentFileObserver) writer).onNextFile(currentFile);
for (SentenceProcessor processor : sentenceProcessors) if (processor instanceof CurrentFileObserver)
((CurrentFileObserver) processor).onNextFile(currentFile);
for (TokenSequenceProcessor processor : tokenSequenceProcessors) if (processor instanceof CurrentFileObserver)
((CurrentFileObserver) processor).onNextFile(currentFile);
for (PosTagSequenceProcessor processor : posTagSequenceProcessors) if (processor instanceof CurrentFileObserver)
((CurrentFileObserver) processor).onNextFile(currentFile);
for (ParseConfigurationProcessor processor : parseConfigurationProcessors) if (processor instanceof CurrentFileObserver)
((CurrentFileObserver) processor).onNextFile(currentFile);
}
// Write out any leftover original text attached to the sentence,
// followed by a newline.
if (sentence.getLeftoverOriginalText().length() > 0) {
writer.append(sentence.getLeftoverOriginalText() + "\n");
}
for (SentenceProcessor sentenceProcessor : sentenceProcessors) {
sentenceProcessor.onNextSentence(sentence);
}
}
// Tokenise if required. Only the first token sequence returned by the
// tokeniser is passed to the processors.
List<TokenSequence> tokenSequences = null;
if (this.needsTokeniser()) {
tokenSequences = tokeniser.tokenise(sentence);
tokenSequence = tokenSequences.get(0);
for (TokenSequenceProcessor tokenSequenceProcessor : tokenSequenceProcessors) {
tokenSequenceProcessor.onNextTokenSequence(tokenSequence);
}
}
// Pos-tag if required.
List<PosTagSequence> posTagSequences = null;
if (this.needsPosTagger()) {
posTagSequence = null;
// If the tokeniser did not run, wrap the single token sequence we
// have in a list.
if (tokenSequences == null) {
tokenSequences = new ArrayListNoNulls<>();
tokenSequences.add(tokenSequence);
}
// A non-deterministic pos-tagger receives all token sequences and
// returns several pos-tag sequences, of which the first is kept; any
// other pos-tagger only receives the single token sequence.
if (posTagger instanceof NonDeterministicPosTagger) {
NonDeterministicPosTagger nonDeterministicPosTagger = (NonDeterministicPosTagger) posTagger;
posTagSequences = nonDeterministicPosTagger.tagSentence(tokenSequences);
posTagSequence = posTagSequences.get(0);
} else {
posTagSequence = posTagger.tagSentence(tokenSequence);
}
for (PosTagSequenceProcessor posTagSequenceProcessor : this.posTagSequenceProcessors) {
posTagSequenceProcessor.onNextPosTagSequence(posTagSequence);
}
// Reset, so that with a pos-tagger start module the loop condition
// re-evaluated below becomes false.
tokenSequence = null;
}
// Parse if required.
if (this.needsParser()) {
// If the pos-tagger did not produce a list, wrap the single pos-tag
// sequence we have in a list.
if (posTagSequences == null) {
posTagSequences = new ArrayListNoNulls<>();
posTagSequences.add(posTagSequence);
}
ParseConfiguration parseConfiguration = null;
List<ParseConfiguration> parseConfigurations = null;
try {
// Same pattern as for pos-tagging: a non-deterministic parser
// receives all pos-tag sequences and the first configuration it
// returns is kept.
if (parser instanceof NonDeterministicParser) {
NonDeterministicParser nonDeterministicParser = (NonDeterministicParser) parser;
parseConfigurations = nonDeterministicParser.parseSentence(posTagSequences);
parseConfiguration = parseConfigurations.get(0);
} else {
parseConfiguration = parser.parseSentence(posTagSequence);
}
for (ParseConfigurationProcessor parseConfigurationProcessor : this.parseConfigurationProcessors) {
parseConfigurationProcessor.onNextParseConfiguration(parseConfiguration);
}
} catch (Exception e) {
// A failure while parsing or in a parse processor is logged; the
// analysis continues with the next unit unless stopOnError is set,
// in which case the failure is rethrown unchecked.
LogUtils.logError(LOG, e);
if (stopOnError)
throw new RuntimeException(e);
}
// Reset, so that with a parser start module the loop condition
// re-evaluated below becomes false.
posTagSequence = null;
}
// Re-evaluate whether another unit is waiting to be processed.
if (this.startModule.equals(Module.sentenceDetector) || this.startModule.equals(Module.tokeniser))
needToProcess = !sentences.isEmpty();
else if (this.startModule.equals(Module.posTagger))
needToProcess = tokenSequence != null;
else if (this.startModule.equals(Module.parser))
needToProcess = posTagSequence != null;
}
// end of the processing loop for the units currently available
}
// Check if there's any leftover output to output!
if (rollingTextBlock.getLeftoverOriginalText().length() > 0)
writer.append(rollingTextBlock.getLeftoverOriginalText());
} finally {
// Clean-up: every step below is attempted even if an earlier one fails.
// The most recent IOException is remembered and rethrown at the very end.
IOException exception = null;
try {
// NOTE(review): if reader.close() throws, writer.flush() is skipped
// because both calls share this try block.
reader.close();
writer.flush();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
for (SentenceProcessor processor : this.sentenceProcessors) try {
processor.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
for (TokenSequenceProcessor processor : this.tokenSequenceProcessors) try {
processor.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
// Pos-tag and parse processors are told that analysis is complete
// before they are closed.
for (PosTagSequenceProcessor processor : this.posTagSequenceProcessors) {
try {
processor.onCompleteAnalysis();
processor.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
}
for (ParseConfigurationProcessor processor : this.parseConfigurationProcessors) {
try {
processor.onCompleteParse();
processor.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
}
long endTime = System.currentTimeMillis();
long totalTime = endTime - startTime;
// NOTE(review): the message names Talismane.process() although this
// method is analyse().
LOG.debug("Total time for Talismane.process(): " + totalTime);
try {
writer.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
// NOTE(review): throwing from this finally block replaces any exception
// that was already propagating out of the try block.
if (exception != null)
throw exception;
}
}
Use of com.joliciel.talismane.parser.ParseConfiguration in the project talismane by joliciel-informatique.
Class ParseOutputRewriterTest, method testGetCorpusLines.
/**
 * Reads a ten-token dependency-annotated sentence, runs it through a
 * {@link ParseOutputRewriter} configured by
 * <code>testWithOutputRules.conf</code>, and checks the thirteen corpus lines
 * that come out: the contracted tokens "Au", "du" and "auquel" are each
 * expected as two lines ("à" + "le", "de" + "le", "à" + "lequel"), with the
 * indexes and governor indexes of the other lines shifted accordingly.
 */
@Test
public void testGetCorpusLines() throws Exception {
    TalismaneSession.clearSessions();
    System.setProperty("config.file", "src/test/resources/testWithOutputRules.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();
    final String sessionId = "test";

    // One token per line: index, token, lemma, pos-tag, governor index, label.
    final String input = "1\tAu\tau\tADP+DET\t0\troot\n"
            + "2\tsein\tsein\tNOUN\t1\tfixed\n"
            + "3\tmême\tmême\tADV\t1\tadvmod\n"
            + "4\tdu\tdu\tADP+DET\t5\tcase\n"
            + "5\tParti\tParti\tPROPN\t1\tnmod\n"
            + "6\tsocialiste\tsocialiste\tADJ\t5\tfixed\n"
            + "7\tauquel\tauquel\tADP+PRON\t8\tobl\n"
            + "8\tappartient\tappartenir\tVERB\t5\tacl:relcl\n"
            + "9\tM.\tmonsieur\tNOUN\t8\tnsubj\n"
            + "10\tDupont\tDupont\tPROPN\t9\tflat:name\n";

    ParserRegexBasedCorpusReader corpusReader = new ParserRegexBasedCorpusReader(new StringReader(input),
            config.getConfig("talismane.core.test.parser.input"), sessionId);
    ParseConfiguration parsed = corpusReader.nextConfiguration();

    final StringWriter output = new StringWriter();
    try (ParseOutputRewriter rewriter = new ParseOutputRewriter(output, sessionId)) {
        List<CorpusLine> corpusLines = rewriter.getCorpusLines(parsed);

        int lineNumber = 0;
        for (CorpusLine corpusLine : corpusLines) {
            lineNumber++;
            LOG.debug("line " + corpusLine.getIndex() + ": " + corpusLine.getElements());

            switch (lineNumber) {
            case 1:
                assertEquals(1, corpusLine.getIndex());
                assertEquals("à", corpusLine.getToken());
                assertEquals("à", corpusLine.getLemma());
                assertEquals("ADP", corpusLine.getPosTag());
                assertEquals(0, corpusLine.getGovernorIndex());
                assertEquals("root", corpusLine.getLabel());
                break;
            case 2:
                assertEquals(2, corpusLine.getIndex());
                assertEquals("le", corpusLine.getToken());
                assertEquals("le", corpusLine.getLemma());
                assertEquals("DET", corpusLine.getPosTag());
                assertEquals(1, corpusLine.getGovernorIndex());
                assertEquals("fixed", corpusLine.getLabel());
                break;
            case 3:
                assertEquals(3, corpusLine.getIndex());
                assertEquals("sein", corpusLine.getToken());
                assertEquals(1, corpusLine.getGovernorIndex());
                assertEquals("fixed", corpusLine.getLabel());
                break;
            case 4:
                assertEquals(4, corpusLine.getIndex());
                assertEquals("même", corpusLine.getToken());
                assertEquals(1, corpusLine.getGovernorIndex());
                assertEquals("advmod", corpusLine.getLabel());
                break;
            case 5:
                assertEquals(5, corpusLine.getIndex());
                assertEquals("de", corpusLine.getToken());
                assertEquals("de", corpusLine.getLemma());
                assertEquals("ADP", corpusLine.getPosTag());
                assertEquals(7, corpusLine.getGovernorIndex());
                assertEquals("case", corpusLine.getLabel());
                break;
            case 6:
                assertEquals(6, corpusLine.getIndex());
                assertEquals("le", corpusLine.getToken());
                assertEquals("le", corpusLine.getLemma());
                assertEquals("DET", corpusLine.getPosTag());
                assertEquals(7, corpusLine.getGovernorIndex());
                assertEquals("det", corpusLine.getLabel());
                break;
            case 7:
                assertEquals(7, corpusLine.getIndex());
                assertEquals("Parti", corpusLine.getToken());
                assertEquals(1, corpusLine.getGovernorIndex());
                assertEquals("nmod", corpusLine.getLabel());
                break;
            case 8:
                assertEquals(8, corpusLine.getIndex());
                assertEquals("socialiste", corpusLine.getToken());
                assertEquals(7, corpusLine.getGovernorIndex());
                assertEquals("fixed", corpusLine.getLabel());
                break;
            case 9:
                assertEquals(9, corpusLine.getIndex());
                assertEquals("à", corpusLine.getToken());
                assertEquals("à", corpusLine.getLemma());
                assertEquals("ADP", corpusLine.getPosTag());
                assertEquals(10, corpusLine.getGovernorIndex());
                assertEquals("case", corpusLine.getLabel());
                break;
            case 10:
                assertEquals(10, corpusLine.getIndex());
                assertEquals("lequel", corpusLine.getToken());
                assertEquals("lequel", corpusLine.getLemma());
                assertEquals("PRON", corpusLine.getPosTag());
                assertEquals(11, corpusLine.getGovernorIndex());
                assertEquals("obl", corpusLine.getLabel());
                break;
            case 11:
                assertEquals(11, corpusLine.getIndex());
                assertEquals("appartient", corpusLine.getToken());
                assertEquals("VERB", corpusLine.getPosTag());
                assertEquals(7, corpusLine.getGovernorIndex());
                assertEquals("acl:relcl", corpusLine.getLabel());
                break;
            case 12:
                assertEquals(12, corpusLine.getIndex());
                assertEquals("M.", corpusLine.getToken());
                assertEquals("NOUN", corpusLine.getPosTag());
                assertEquals(11, corpusLine.getGovernorIndex());
                assertEquals("nsubj", corpusLine.getLabel());
                break;
            case 13:
                assertEquals(13, corpusLine.getIndex());
                assertEquals("Dupont", corpusLine.getToken());
                assertEquals("PROPN", corpusLine.getPosTag());
                assertEquals(12, corpusLine.getGovernorIndex());
                assertEquals("flat:name", corpusLine.getLabel());
                break;
            default:
                // Lines beyond the thirteenth are not inspected here; the size
                // check after the loop reports them.
                break;
            }
        }

        assertEquals(13, corpusLines.size());
    }
}
Use of com.joliciel.talismane.parser.ParseConfiguration in the project talismane by joliciel-informatique.
Class StandoffReader, method nextConfiguration.
/**
 * Hands out the pending parse configuration, if there is one, and clears it
 * so that it is not returned twice.
 *
 * @return the pending configuration when {@code hasNextSentence()} reports
 *         another sentence, otherwise null
 */
@Override
public ParseConfiguration nextConfiguration() throws TalismaneException, IOException {
    if (!this.hasNextSentence()) {
        return null;
    }
    // Take the pending configuration and reset the field.
    ParseConfiguration pending = configuration;
    configuration = null;
    return pending;
}
Use of com.joliciel.talismane.parser.ParseConfiguration in the project talismane by joliciel-informatique.
Class ValencyByLabelFeature, method check.
/**
 * Counts the dependents that the addressed token has under a given
 * dependency label in the wrapped parse configuration.
 *
 * @param wrapper
 *            gives access to the parse configuration being examined
 * @param env
 *            the runtime environment handed on to the nested features
 * @return a result holding the number of dependents, or null if either the
 *         address function or the dependency label feature yields no result
 * @throws TalismaneException
 *             if a nested feature check fails
 */
@Override
public FeatureResult<Integer> check(ParseConfigurationWrapper wrapper, RuntimeEnvironment env) throws TalismaneException {
    ParseConfiguration parseConfiguration = wrapper.getParseConfiguration();

    FeatureResult<PosTaggedTokenWrapper> addressed = addressFunction.check(wrapper, env);
    if (addressed == null) {
        return null;
    }

    // The label feature is only evaluated once a token has been resolved.
    FeatureResult<String> labelResult = dependencyLabelFeature.check(wrapper, env);
    if (labelResult == null) {
        return null;
    }

    PosTaggedToken governor = addressed.getOutcome().getPosTaggedToken();
    String dependencyLabel = labelResult.getOutcome();
    return this.generateResult(parseConfiguration.getDependents(governor, dependencyLabel).size());
}
Aggregations