Search in sources :

Example 1 with SentenceAnnotator

use of com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator in project talismane by joliciel-informatique.

the class TokenPerLineCorpusReader method hasNextSentence.

/**
 * Tries to make the next sentence available by reading the corpus line by
 * line.
 * <p>
 * A sentence is a run of non-empty lines terminated by an empty line or by
 * the end of the input. Once such a run has been read and is selected (see
 * below), its lines are converted to corpus lines, the sentence rules are
 * applied, a {@link Sentence} is obtained and annotated, and
 * {@code processSentence} is called. The method reads nothing when
 * {@code sentenceLines} is already non-null, or when the maximum sentence
 * count (if positive) has been reached.
 * <p>
 * A sentence is read but not selected when it is excluded by the
 * cross-validation settings or lies before the start sentence; it still
 * counts towards {@code sentenceCount}.
 *
 * @return true if {@code sentenceLines} is non-null when the method ends,
 *         false otherwise
 * @throws TalismaneException
 *           if no linguistic rules are available when the sentence text has
 *           to be rebuilt from its tokens
 * @throws IOException
 */
@Override
public boolean hasNextSentence() throws TalismaneException, IOException {
    if (this.getMaxSentenceCount() > 0 && sentenceCount >= this.getMaxSentenceCount()) {
    // we've reached the end, do nothing
    } else {
        // keep reading until a sentence has been assembled or the input is
        // exhausted
        while (sentenceLines == null) {
            List<UnprocessedLine> lines = new ArrayList<>();
            int skippedLineCount = 0;
            if (!this.hasNextLine())
                break;
            // "lines.size() > 0" keeps the loop alive after the last input
            // line, so that a final sentence without a trailing empty line is
            // still closed by the synthetic empty line below
            while ((this.hasNextLine() || lines.size() > 0) && sentenceLines == null) {
                // stays empty when the input is exhausted, which triggers the
                // end-of-sentence branch
                String line = "";
                if (this.hasNextLine())
                    line = this.nextLine().replace("\r", "");
                // note: also incremented for the synthetic empty line
                lineNumber++;
                if (LOG.isTraceEnabled())
                    LOG.trace("Line " + lineNumber + ": " + line);
                if (line.length() > 0) {
                    // a line is skipped if any skip pattern matches it entirely
                    boolean skip = false;
                    for (Pattern skipLinePattern : skipLinePatterns) {
                        if (skipLinePattern.matcher(line).matches()) {
                            if (LOG.isTraceEnabled())
                                LOG.trace("Skipping by pattern: " + skipLinePattern.pattern());
                            skip = true;
                            skippedLineCount++;
                            break;
                        }
                    }
                    // remember every sentence rule whose pattern matches the
                    // whole line, together with its matcher; the rules are only
                    // applied once the sentence is complete
                    List<CorpusSentenceRule> myRules = new ArrayList<>();
                    List<Matcher> myMatchers = new ArrayList<>();
                    for (CorpusSentenceRule sentenceRule : sentenceRules) {
                        Matcher matcher = sentenceRule.getPattern().matcher(line);
                        if (matcher.matches()) {
                            if (LOG.isTraceEnabled())
                                LOG.trace("Matched rule: " + sentenceRule);
                            myRules.add(sentenceRule);
                            myMatchers.add(matcher);
                        }
                    }
                    // skipped lines are stored as well, since rules are tested
                    // against them later
                    UnprocessedLine unprocessedLine = new UnprocessedLine(line, lineNumber, skip, myRules, myMatchers);
                    lines.add(unprocessedLine);
                } else {
                    // empty line with nothing collected, or only skipped lines
                    // collected: not a sentence, start over
                    if (lines.size() == 0 || lines.size() == skippedLineCount) {
                        lines = new ArrayList<>();
                        skippedLineCount = 0;
                        continue;
                    }
                    // end of sentence
                    boolean includeMe = true;
                    // check cross-validation: with an include index, keep only
                    // the sentences whose count modulo the cross-validation
                    // size equals that index; with an exclude index, drop
                    // exactly those sentences
                    if (this.getCrossValidationSize() > 0) {
                        if (this.getIncludeIndex() >= 0) {
                            if (sentenceCount % this.getCrossValidationSize() != this.getIncludeIndex()) {
                                includeMe = false;
                            }
                        } else if (this.getExcludeIndex() >= 0) {
                            if (sentenceCount % this.getCrossValidationSize() == this.getExcludeIndex()) {
                                includeMe = false;
                            }
                        }
                    }
                    // sentences located before the start sentence are dropped
                    if (this.getStartSentence() > sentenceCount) {
                        includeMe = false;
                    }
                    // counted whether or not the sentence is included
                    sentenceCount++;
                    LOG.debug("sentenceCount: " + sentenceCount);
                    if (!includeMe) {
                        lines = new ArrayList<>();
                        skippedLineCount = 0;
                        continue;
                    }
                    // convert every non-skipped line to a corpus line, attaching
                    // a lexical entry when a lexical entry reader is available
                    sentenceLines = new ArrayList<>();
                    for (UnprocessedLine unprocessedLine : lines) {
                        if (!unprocessedLine.skip) {
                            CorpusLine corpusLine = corpusLineReader.read(unprocessedLine.line, unprocessedLine.lineNumber);
                            sentenceLines.add(corpusLine);
                            if (this.lexicalEntryReader != null) {
                                WritableLexicalEntry lexicalEntry = new CompactLexicalEntry(lexicalEntrySupport);
                                this.lexicalEntryReader.readEntry(unprocessedLine.line, lexicalEntry);
                                corpusLine.setLexicalEntry(lexicalEntry);
                            }
                        }
                    }
                    // apply the matched rules line by line (skipped lines
                    // included); for each line, testing stops at the first rule
                    // returning a non-null action, and only merge actions are
                    // kept for the step below
                    List<CorpusSentenceRule.MergeAction> mergeActions = new ArrayList<>();
                    for (UnprocessedLine unprocessedLine : lines) {
                        if (LOG.isTraceEnabled())
                            LOG.trace("Line " + unprocessedLine);
                        for (int i = 0; i < unprocessedLine.sentenceRules.size(); i++) {
                            CorpusSentenceRule sentenceRule = unprocessedLine.sentenceRules.get(i);
                            Matcher matcher = unprocessedLine.matchers.get(i);
                            if (LOG.isTraceEnabled())
                                LOG.trace("Testing rule " + sentenceRule);
                            CorpusSentenceRule.Action action = sentenceRule.apply(unprocessedLine.line, unprocessedLine.lineNumber, matcher, sentenceLines);
                            if (LOG.isTraceEnabled())
                                LOG.trace("Result: " + action);
                            if (action != null) {
                                if (action instanceof MergeAction)
                                    mergeActions.add((MergeAction) action);
                                break;
                            }
                        }
                    }
                    if (mergeActions.size() > 0) {
                        List<CorpusLine> newSentenceLines = new ArrayList<>();
                        // index of each line to merge -> its merge action,
                        // sorted by index
                        Map<Integer, MergeAction> indexesToMerge = new TreeMap<>();
                        for (CorpusSentenceRule.MergeAction mergeAction : mergeActions) {
                            for (CorpusLine lineToMerge : mergeAction.getLinesToMerge()) {
                                indexesToMerge.put(lineToMerge.getIndex(), mergeAction);
                            }
                        }
                        // position of the current line, compared with corpus
                        // line indexes below - assumes these indexes are
                        // 1-based and contiguous, TODO confirm
                        int i = 1;
                        Iterator<Integer> iIndexToMerge = indexesToMerge.keySet().iterator();
                        // NOTE(review): next() is not guarded by hasNext(); it
                        // would throw if every merge action had no lines to
                        // merge - confirm that this cannot happen
                        int nextIndexToMerge = iIndexToMerge.next();
                        int linesRemoved = 0;
                        // old index -> new index after merging; 0 stays 0
                        // (presumably the root - verify)
                        Map<Integer, Integer> indexChangeMap = new HashMap<>();
                        indexChangeMap.put(0, 0);
                        for (CorpusLine corpusLine : sentenceLines) {
                            if (i == nextIndexToMerge) {
                                MergeAction mergeAction = indexesToMerge.get(i);
                                // the first line of a merge is replaced by the
                                // merged line (net change of zero); every other
                                // merged line is removed
                                if (i == mergeAction.getFirstIndex()) {
                                    newSentenceLines.add(mergeAction.getMergedLine());
                                    linesRemoved -= 1;
                                }
                                linesRemoved += 1;
                                if (iIndexToMerge.hasNext())
                                    nextIndexToMerge = iIndexToMerge.next();
                                else
                                    nextIndexToMerge = -1;
                            } else {
                                newSentenceLines.add(corpusLine);
                            }
                            indexChangeMap.put(i, i - linesRemoved);
                            i++;
                        }
                        // renumber the remaining lines and their governors;
                        // negative governor indexes are left untouched
                        for (CorpusLine corpusLine : newSentenceLines) {
                            corpusLine.setElement(CorpusElement.INDEX, "" + indexChangeMap.get(corpusLine.getIndex()));
                            int governorIndex = corpusLine.getGovernorIndex();
                            if (governorIndex >= 0)
                                corpusLine.setElement(CorpusElement.GOVERNOR, "" + indexChangeMap.get(corpusLine.getGovernorIndex()));
                            int nonProjGovernorIndex = corpusLine.getNonProjGovernorIndex();
                            if (nonProjGovernorIndex >= 0)
                                corpusLine.setElement(CorpusElement.NON_PROJ_GOVERNOR, "" + indexChangeMap.get(corpusLine.getNonProjGovernorIndex()));
                        }
                        sentenceLines = newSentenceLines;
                    }
                    // take the sentence from the sentence reader when it can
                    // supply one; otherwise rebuild the text from the tokens,
                    // letting the linguistic rules decide where spaces go
                    Sentence sentence = null;
                    if (sentenceReader != null && sentenceReader.hasNextSentence()) {
                        sentence = sentenceReader.nextSentence();
                    } else {
                        LinguisticRules rules = TalismaneSession.get(sessionId).getLinguisticRules();
                        if (rules == null)
                            throw new TalismaneException("Linguistic rules have not been set.");
                        String text = "";
                        for (CorpusLine corpusLine : sentenceLines) {
                            String word = corpusLine.getElement(CorpusElement.TOKEN);
                            if (rules.shouldAddSpace(text, word))
                                text += " ";
                            text += word;
                        }
                        sentence = new Sentence(text, currentFile, sessionId);
                    }
                    // run all of the session's sentence annotators
                    for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
                        sentenceAnnotator.annotate(sentence);
                    }
                    // hand the sentence and its corpus lines over for further
                    // processing (defined outside this method)
                    this.processSentence(sentence, sentenceLines);
                }
            }
        }
    }
    return (sentenceLines != null);
}
Also used : Matcher(java.util.regex.Matcher) HashMap(java.util.HashMap) TalismaneException(com.joliciel.talismane.TalismaneException) CompactLexicalEntry(com.joliciel.talismane.lexicon.CompactLexicalEntry) ArrayList(java.util.ArrayList) WritableLexicalEntry(com.joliciel.talismane.lexicon.WritableLexicalEntry) MergeAction(com.joliciel.talismane.corpus.CorpusSentenceRule.MergeAction) Sentence(com.joliciel.talismane.rawText.Sentence) Pattern(java.util.regex.Pattern) MergeAction(com.joliciel.talismane.corpus.CorpusSentenceRule.MergeAction) TreeMap(java.util.TreeMap) LinguisticRules(com.joliciel.talismane.LinguisticRules) SentenceAnnotator(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator)

Example 2 with SentenceAnnotator

use of com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator in project talismane by joliciel-informatique.

the class ParserEvaluator method evaluate.

/**
 * Evaluates the parser against every sentence supplied by the corpus reader.
 * <p>
 * For each reference configuration, the input to the parser is prepared as
 * follows: if a tokeniser is set, the reference sentence is annotated and
 * re-tokenised, otherwise the reference tokens (without the root) are used;
 * if a pos-tagger is set, the tokens are re-tagged, otherwise the reference
 * pos-tags are used. Observers are notified before and after each parse, and
 * once more when all sentences have been evaluated.
 *
 * @throws TalismaneException
 *           if an attempt is made to evaluate with a tokeniser but no
 *           pos-tagger
 * @throws IOException
 */
public void evaluate() throws TalismaneException, IOException {
    while (corpusReader.hasNextSentence()) {
        final ParseConfiguration reference = corpusReader.nextConfiguration();

        // --- tokenisation: reference tokens unless a tokeniser is set ---
        final List<TokenSequence> candidateTokenSequences;
        if (tokeniser == null) {
            // work on a copy so that removing the root leaves the reference
            // pos-tag sequence untouched
            PosTagSequence rootless = reference.getPosTagSequence().clonePosTagSequence();
            rootless.removeRoot();
            candidateTokenSequences = new ArrayList<TokenSequence>();
            candidateTokenSequences.add(rootless.getTokenSequence());
        } else {
            if (posTagger == null) {
                throw new TalismaneException("Cannot evaluate with tokeniser but no pos-tagger");
            }
            Sentence referenceSentence = reference.getPosTagSequence().getTokenSequence().getSentence();
            // the sentence has to be annotated before it is tokenised
            for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
                sentenceAnnotator.annotate(referenceSentence);
                if (LOG.isTraceEnabled()) {
                    LOG.trace("TokenFilter: " + sentenceAnnotator);
                    LOG.trace("annotations: " + referenceSentence.getAnnotations());
                }
            }
            candidateTokenSequences = tokeniser.tokenise(referenceSentence);
        }

        // --- pos-tagging: reference tags unless a pos-tagger is set ---
        final List<PosTagSequence> candidatePosTagSequences;
        if (posTagger == null) {
            candidatePosTagSequences = new ArrayList<PosTagSequence>();
            candidatePosTagSequences.add(reference.getPosTagSequence());
        } else if (posTagger instanceof NonDeterministicPosTagger) {
            // a non-deterministic tagger receives all of the token sequences
            candidatePosTagSequences = ((NonDeterministicPosTagger) posTagger).tagSentence(candidateTokenSequences);
        } else {
            // a deterministic tagger only tags the first token sequence
            candidatePosTagSequences = new ArrayList<PosTagSequence>();
            candidatePosTagSequences.add(posTagger.tagSentence(candidateTokenSequences.get(0)));
        }

        for (ParseEvaluationObserver listener : this.observers) {
            listener.onParseStart(reference, candidatePosTagSequences);
        }

        // --- parsing ---
        final List<ParseConfiguration> guesses;
        if (parser instanceof NonDeterministicParser) {
            guesses = ((NonDeterministicParser) parser).parseSentence(candidatePosTagSequences);
        } else {
            // a deterministic parser only parses the first pos-tag sequence
            ParseConfiguration singleGuess = parser.parseSentence(candidatePosTagSequences.get(0));
            guesses = new ArrayList<ParseConfiguration>();
            guesses.add(singleGuess);
        }

        for (ParseEvaluationObserver listener : this.observers) {
            listener.onParseEnd(reference, guesses);
        }
    }

    for (ParseEvaluationObserver listener : this.observers) {
        listener.onEvaluationComplete();
    }
}
Also used : TalismaneException(com.joliciel.talismane.TalismaneException) NonDeterministicParser(com.joliciel.talismane.parser.NonDeterministicParser) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration) NonDeterministicPosTagger(com.joliciel.talismane.posTagger.NonDeterministicPosTagger) SentenceAnnotator(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) Sentence(com.joliciel.talismane.rawText.Sentence)

Example 3 with SentenceAnnotator

use of com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator in project talismane by joliciel-informatique.

the class Talismane method analyse.

/**
 * Analyse the data provided by this reader, as specified by the
 * configuration.
 * <p>
 * Depending on the start module, the input is either read character by
 * character and split into sentences (sentence detector and tokeniser start
 * modules), or read as already annotated sentences through a corpus reader
 * (pos-tagger and parser start modules). Each sentence is then passed
 * through the tokeniser, pos-tagger and parser as required, and the
 * corresponding processors are notified after each stage. When the analysis
 * ends, normally or not, the reader, the writer and all processors are
 * closed.
 *
 * @param reader
 *          the source to analyse; it is closed by this method
 * @throws IOException
 *           in particular, the last I/O error met while flushing or closing
 *           the reader, writer or processors
 * @throws ReflectiveOperationException
 * @throws TalismaneException
 *           if it's impossible to read a sentence from an annotated corpus
 */
public void analyse(Reader reader) throws IOException, ReflectiveOperationException, TalismaneException {
    long startTime = System.currentTimeMillis();
    try {
        // annotated corpus readers are only needed when starting from the
        // pos-tagger or from the parser
        TokeniserAnnotatedCorpusReader tokenCorpusReader = null;
        PosTagAnnotatedCorpusReader posTagCorpusReader = null;
        if (this.startModule.equals(Module.posTagger)) {
            tokenCorpusReader = TokeniserAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".tokeniser.input"), sessionId);
        }
        if (this.startModule.equals(Module.parser)) {
            posTagCorpusReader = PosTagAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".pos-tagger.input"), sessionId);
        }
        // text segments waiting to be rolled into the rolling text block
        LinkedList<String> textSegments = new LinkedList<String>();
        // detected sentences waiting to be processed
        LinkedList<Sentence> sentences = new LinkedList<Sentence>();
        TokenSequence tokenSequence = null;
        PosTagSequence posTagSequence = null;
        // characters of the text segment currently being read
        StringBuilder stringBuilder = new StringBuilder();
        boolean finished = false;
        // number of sentences detected so far; shadows the field
        // this.sentenceCount, which is the limit tested further down
        int sentenceCount = 0;
        CurrentFileProvider currentFileProvider = reader instanceof CurrentFileProvider ? (CurrentFileProvider) reader : null;
        RollingTextBlock rollingTextBlock = new RollingTextBlock(this.processByDefault, currentFileProvider, sessionId);
        int endBlockCharacterCount = 0;
        URI currentURI = null;
        File currentFile = null;
        while (!finished) {
            // which start module?
            if (this.startModule.equals(Module.sentenceDetector) || this.startModule.equals(Module.tokeniser)) {
                // Note SentenceDetector and Tokeniser start modules treated
                // identically,
                // except that for SentenceDetector we apply a probabilistic
                // sentence detector
                // whereas for Tokeniser we assume all sentence breaks are
                // marked by filters
                // read characters from the reader, one at a time
                char c;
                int r = -1;
                try {
                    r = reader.read();
                } catch (IOException e) {
                    // NOTE(review): the error is only logged; r is still -1,
                    // so a read failure is then handled exactly like the end
                    // of the input
                    LogUtils.logError(LOG, e);
                }
                if (r == -1) {
                    // end of input: behave as if a final newline had been read
                    finished = true;
                    c = '\n';
                } else {
                    c = (char) r;
                }
                // Jump out if we have 3 consecutive end-block characters.
                if (c == TalismaneSession.get(sessionId).getEndBlockCharacter()) {
                    endBlockCharacterCount++;
                    if (endBlockCharacterCount == 3) {
                        LOG.info("Three consecutive end-block characters. Exiting.");
                        finished = true;
                    }
                } else {
                    endBlockCharacterCount = 0;
                }
                // close the current text segment when we're finished, when a
                // whitespace other than CR/LF is read after the block size
                // has been exceeded, or when an end-block character is read
                if (finished || (Character.isWhitespace(c) && c != '\r' && c != '\n' && stringBuilder.length() > TalismaneSession.get(sessionId).getBlockSize()) || c == TalismaneSession.get(sessionId).getEndBlockCharacter()) {
                    if (c == TalismaneSession.get(sessionId).getEndBlockCharacter())
                        stringBuilder.append(c);
                    // is the current block > 0 characters?
                    if (stringBuilder.length() > 0) {
                        String textSegment = stringBuilder.toString();
                        stringBuilder = new StringBuilder();
                        textSegments.add(textSegment);
                    }
                    // an end-block character additionally queues an empty
                    // text segment
                    if (c == TalismaneSession.get(sessionId).getEndBlockCharacter()) {
                        textSegments.addLast("");
                    }
                }
                if (finished) {
                    if (stringBuilder.length() > 0) {
                        textSegments.addLast(stringBuilder.toString());
                        stringBuilder = new StringBuilder();
                    }
                    // add three final text segments to roll everything
                    // through processing
                    textSegments.addLast("");
                    textSegments.addLast("");
                    textSegments.addLast("");
                }
                // an end-block character has already been appended above (to
                // the segment it closes); any other character, including the
                // one that triggered a block-size split, starts or continues
                // the current segment
                if (c != TalismaneSession.get(sessionId).getEndBlockCharacter())
                    stringBuilder.append(c);
                // we have at least one text segment to process
                while (textSegments.size() > 0) {
                    // roll in a new block 4, and roll the other blocks
                    // leftwards
                    String nextText = textSegments.removeFirst();
                    rollingTextBlock = rollingTextBlock.roll(nextText);
                    // annotate block 3 with raw text filters
                    AnnotatedText rawTextBlock = rollingTextBlock.getRawTextBlock();
                    for (RawTextAnnotator textAnnotator : TalismaneSession.get(sessionId).getTextAnnotators()) {
                        textAnnotator.annotate(rawTextBlock);
                    }
                    // detect sentences in block 2 using the sentence
                    // detector
                    AnnotatedText processedText = rollingTextBlock.getProcessedText();
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("processedText: " + processedText.getText().toString().replace('\n', '¶').replace('\r', '¶'));
                    }
                    // only when starting from the sentence detector; for the
                    // tokeniser, sentence breaks come from the filters
                    if (this.startModule.equals(Module.sentenceDetector)) {
                        sentenceDetector.detectSentences(processedText);
                    }
                    // get the sentences detected in block 2
                    List<Sentence> theSentences = rollingTextBlock.getDetectedSentences();
                    for (Sentence sentence : theSentences) {
                        sentences.add(sentence);
                        sentenceCount++;
                    }
                    // stop reading once the sentence limit (if positive) has
                    // been reached; queued segments are still rolled through
                    if (this.sentenceCount > 0 && sentenceCount >= this.sentenceCount) {
                        finished = true;
                    }
                }
            } else if (this.startModule.equals(Module.posTagger)) {
                // read the next tokenised sentence from the annotated corpus
                if (tokenCorpusReader.hasNextSentence()) {
                    tokenSequence = tokenCorpusReader.nextTokenSequence();
                } else {
                    tokenSequence = null;
                    finished = true;
                }
            } else if (this.startModule.equals(Module.parser)) {
                // read the next pos-tagged sentence from the annotated corpus
                if (posTagCorpusReader.hasNextSentence()) {
                    posTagSequence = posTagCorpusReader.nextPosTagSequence();
                } else {
                    posTagSequence = null;
                    finished = true;
                }
            }
            // is there anything to process for this start module?
            boolean needToProcess = false;
            if (this.startModule.equals(Module.sentenceDetector) || this.startModule.equals(Module.tokeniser))
                needToProcess = !sentences.isEmpty();
            else if (this.startModule.equals(Module.posTagger))
                needToProcess = tokenSequence != null;
            else if (this.startModule.equals(Module.parser))
                needToProcess = posTagSequence != null;
            while (needToProcess) {
                Sentence sentence = null;
                // need to read next sentence? (start module at or before the
                // tokeniser, end module at or after the sentence detector,
                // according to the ordering of Module)
                if (this.startModule.compareTo(Module.tokeniser) <= 0 && this.endModule.compareTo(Module.sentenceDetector) >= 0) {
                    sentence = sentences.poll();
                    LOG.debug("Sentence: " + sentence);
                    for (SentenceAnnotator annotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) annotator.annotate(sentence);
                    // when the sentence comes from a new file, tell the writer
                    // and every processor that observes the current file
                    if (sentence.getFileURI() != null && !sentence.getFileURI().equals(currentURI)) {
                        currentURI = sentence.getFileURI();
                        currentFile = sentence.getFile();
                        LOG.debug("Setting current file to " + currentFile.getPath());
                        if (writer instanceof CurrentFileObserver)
                            ((CurrentFileObserver) writer).onNextFile(currentFile);
                        for (SentenceProcessor processor : sentenceProcessors) if (processor instanceof CurrentFileObserver)
                            ((CurrentFileObserver) processor).onNextFile(currentFile);
                        for (TokenSequenceProcessor processor : tokenSequenceProcessors) if (processor instanceof CurrentFileObserver)
                            ((CurrentFileObserver) processor).onNextFile(currentFile);
                        for (PosTagSequenceProcessor processor : posTagSequenceProcessors) if (processor instanceof CurrentFileObserver)
                            ((CurrentFileObserver) processor).onNextFile(currentFile);
                        for (ParseConfigurationProcessor processor : parseConfigurationProcessors) if (processor instanceof CurrentFileObserver)
                            ((CurrentFileObserver) processor).onNextFile(currentFile);
                    }
                    // write out any leftover original text attached to the
                    // sentence
                    if (sentence.getLeftoverOriginalText().length() > 0) {
                        writer.append(sentence.getLeftoverOriginalText() + "\n");
                    }
                    for (SentenceProcessor sentenceProcessor : sentenceProcessors) {
                        sentenceProcessor.onNextSentence(sentence);
                    }
                }
                // need to tokenise ?
                List<TokenSequence> tokenSequences = null;
                if (this.needsTokeniser()) {
                    tokenSequences = tokeniser.tokenise(sentence);
                    // only the first token sequence is given to the processors
                    tokenSequence = tokenSequences.get(0);
                    for (TokenSequenceProcessor tokenSequenceProcessor : tokenSequenceProcessors) {
                        tokenSequenceProcessor.onNextTokenSequence(tokenSequence);
                    }
                }
                // need to pos-tag ?
                List<PosTagSequence> posTagSequences = null;
                if (this.needsPosTagger()) {
                    posTagSequence = null;
                    // no tokeniser was run: wrap the token sequence read from
                    // the corpus
                    if (tokenSequences == null) {
                        tokenSequences = new ArrayListNoNulls<>();
                        tokenSequences.add(tokenSequence);
                    }
                    if (posTagger instanceof NonDeterministicPosTagger) {
                        NonDeterministicPosTagger nonDeterministicPosTagger = (NonDeterministicPosTagger) posTagger;
                        posTagSequences = nonDeterministicPosTagger.tagSentence(tokenSequences);
                        // only the first pos-tag sequence is given to the
                        // processors
                        posTagSequence = posTagSequences.get(0);
                    } else {
                        posTagSequence = posTagger.tagSentence(tokenSequence);
                    }
                    for (PosTagSequenceProcessor posTagSequenceProcessor : this.posTagSequenceProcessors) {
                        posTagSequenceProcessor.onNextPosTagSequence(posTagSequence);
                    }
                    // reset, so that the loop condition below is false for
                    // the pos-tagger start module
                    tokenSequence = null;
                }
                // need to parse ?
                if (this.needsParser()) {
                    // no non-deterministic pos-tagger was run: wrap the single
                    // pos-tag sequence
                    if (posTagSequences == null) {
                        posTagSequences = new ArrayListNoNulls<>();
                        posTagSequences.add(posTagSequence);
                    }
                    ParseConfiguration parseConfiguration = null;
                    List<ParseConfiguration> parseConfigurations = null;
                    try {
                        if (parser instanceof NonDeterministicParser) {
                            NonDeterministicParser nonDeterministicParser = (NonDeterministicParser) parser;
                            parseConfigurations = nonDeterministicParser.parseSentence(posTagSequences);
                            // only the first configuration is given to the
                            // processors
                            parseConfiguration = parseConfigurations.get(0);
                        } else {
                            parseConfiguration = parser.parseSentence(posTagSequence);
                        }
                        for (ParseConfigurationProcessor parseConfigurationProcessor : this.parseConfigurationProcessors) {
                            parseConfigurationProcessor.onNextParseConfiguration(parseConfiguration);
                        }
                    } catch (Exception e) {
                        // a failure on one sentence is logged; the analysis
                        // only stops if stopOnError is set
                        LogUtils.logError(LOG, e);
                        if (stopOnError)
                            throw new RuntimeException(e);
                    }
                    // reset, so that the loop condition below is false for
                    // the parser start module
                    posTagSequence = null;
                }
                // anything left to process for this start module?
                if (this.startModule.equals(Module.sentenceDetector) || this.startModule.equals(Module.tokeniser))
                    needToProcess = !sentences.isEmpty();
                else if (this.startModule.equals(Module.posTagger))
                    needToProcess = tokenSequence != null;
                else if (this.startModule.equals(Module.parser))
                    needToProcess = posTagSequence != null;
            }
        // next sentence
        }
        // Check if there's any leftover output to output!
        if (rollingTextBlock.getLeftoverOriginalText().length() > 0)
            writer.append(rollingTextBlock.getLeftoverOriginalText());
    } finally {
        // Clean up everything, remembering the last I/O error so that it can
        // be rethrown once all resources have been handled.
        // NOTE(review): throwing from this finally block replaces any
        // exception already propagating out of the try block.
        IOException exception = null;
        try {
            // if close() fails, flush() is not attempted here
            reader.close();
            writer.flush();
        } catch (IOException e) {
            LogUtils.logError(LOG, e);
            exception = e;
        }
        for (SentenceProcessor processor : this.sentenceProcessors) try {
            processor.close();
        } catch (IOException e) {
            LogUtils.logError(LOG, e);
            exception = e;
        }
        for (TokenSequenceProcessor processor : this.tokenSequenceProcessors) try {
            processor.close();
        } catch (IOException e) {
            LogUtils.logError(LOG, e);
            exception = e;
        }
        for (PosTagSequenceProcessor processor : this.posTagSequenceProcessors) {
            try {
                processor.onCompleteAnalysis();
                processor.close();
            } catch (IOException e) {
                LogUtils.logError(LOG, e);
                exception = e;
            }
        }
        for (ParseConfigurationProcessor processor : this.parseConfigurationProcessors) {
            try {
                processor.onCompleteParse();
                processor.close();
            } catch (IOException e) {
                LogUtils.logError(LOG, e);
                exception = e;
            }
        }
        long endTime = System.currentTimeMillis();
        long totalTime = endTime - startTime;
        LOG.debug("Total time for Talismane.process(): " + totalTime);
        try {
            writer.close();
        } catch (IOException e) {
            LogUtils.logError(LOG, e);
            exception = e;
        }
        if (exception != null)
            throw exception;
    }
}
Also used : TokenSequenceProcessor(com.joliciel.talismane.tokeniser.output.TokenSequenceProcessor) SentenceProcessor(com.joliciel.talismane.sentenceDetector.SentenceProcessor) RawTextAnnotator(com.joliciel.talismane.rawText.RawTextAnnotator) URI(java.net.URI) NonDeterministicParser(com.joliciel.talismane.parser.NonDeterministicParser) NonDeterministicPosTagger(com.joliciel.talismane.posTagger.NonDeterministicPosTagger) Sentence(com.joliciel.talismane.rawText.Sentence) ParseConfigurationProcessor(com.joliciel.talismane.parser.output.ParseConfigurationProcessor) PosTagAnnotatedCorpusReader(com.joliciel.talismane.posTagger.PosTagAnnotatedCorpusReader) RollingTextBlock(com.joliciel.talismane.rawText.RollingTextBlock) IOException(java.io.IOException) PosTagSequenceProcessor(com.joliciel.talismane.posTagger.output.PosTagSequenceProcessor) LinkedList(java.util.LinkedList) IOException(java.io.IOException) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration) CurrentFileProvider(com.joliciel.talismane.utils.io.CurrentFileProvider) TokeniserAnnotatedCorpusReader(com.joliciel.talismane.tokeniser.TokeniserAnnotatedCorpusReader) SentenceAnnotator(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) CurrentFileObserver(com.joliciel.talismane.utils.io.CurrentFileObserver) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) File(java.io.File)

Example 4 with SentenceAnnotator

use of com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator in project talismane by joliciel-informatique.

the class TalismaneAPIExamples method example2.

/**
 * Similar to example1, but begins with filtering and sentence detection:
 * a hard-coded text is run through the session's raw-text annotators, split
 * into sentences, and each sentence is then annotated, tokenised, pos-tagged
 * and parsed. The pos-tag sequence, parse configuration and parse tree of
 * every sentence are printed to standard output.
 *
 * @param sessionId
 *            id used to look up the {@code TalismaneSession} and to obtain
 *            the analysis components
 * @throws Exception
 *             if any stage of the analysis fails
 */
public static void example2(String sessionId) throws Exception {
    String input = "Les gens qui voient de travers pensent que les bancs verts qu'on voit sur les trottoirs "
        + "sont faits pour les impotents ou les ventripotents. "
        + "Mais c'est une absurdité, car, à la vérité, ils sont là, c'est notoire, "
        + "pour accueillir quelque temps les amours débutants.";
    RawText rawText = new RawText(input, true, sessionId);

    // run every raw-text annotator registered on the session over the raw text
    for (RawTextAnnotator textAnnotator : TalismaneSession.get(sessionId).getTextAnnotators()) {
        textAnnotator.annotate(rawText);
    }

    // the processed text is what remains once the annotators have been applied
    AnnotatedText filteredText = rawText.getProcessedText();

    // sentence detection works on the processed text ...
    SentenceDetector detector = SentenceDetector.getInstance(sessionId);
    detector.detectSentences(filteredText);

    // ... but the resulting sentences are read back from the raw text, so
    // that annotations made on the sentences get reflected in the raw text
    for (Sentence detected : rawText.getDetectedSentences()) {
        // apply the session's sentence-level annotators to this sentence
        for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
            sentenceAnnotator.annotate(detected);
        }

        // stage 1: tokenise the sentence
        Tokeniser sentenceTokeniser = Tokeniser.getInstance(sessionId);
        TokenSequence tokens = sentenceTokeniser.tokeniseSentence(detected);

        // stage 2: pos-tag the token sequence and print the result
        PosTagger tagger = PosTaggers.getPosTagger(sessionId);
        PosTagSequence taggedTokens = tagger.tagSentence(tokens);
        System.out.println(taggedTokens);

        // stage 3: parse the pos-tag sequence and print the configuration
        Parser dependencyParser = Parsers.getParser(sessionId);
        ParseConfiguration parse = dependencyParser.parseSentence(taggedTokens);
        System.out.println(parse);

        // finally print the parse as a tree
        System.out.println(new ParseTree(parse, true));
    }
}
Also used : AnnotatedText(com.joliciel.talismane.AnnotatedText) RawTextAnnotator(com.joliciel.talismane.rawText.RawTextAnnotator) RawText(com.joliciel.talismane.rawText.RawText) OptionParser(joptsimple.OptionParser) SentenceDetector(com.joliciel.talismane.sentenceDetector.SentenceDetector) SentenceAnnotator(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) Tokeniser(com.joliciel.talismane.tokeniser.Tokeniser) Sentence(com.joliciel.talismane.rawText.Sentence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) PosTagger(com.joliciel.talismane.posTagger.PosTagger)

Example 5 with SentenceAnnotator

use of com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator in project talismane by joliciel-informatique.

the class StandoffReader method hasNextSentence.

/**
 * Indicates whether a parse configuration is available, building one from
 * the next standoff sentence when none is pending.
 * <p>
 * Nothing is built when a positive maximum sentence count has already been
 * reached, when a configuration is already pending, or when all sentences
 * have been consumed. Otherwise the next sentence's tokens are joined into
 * text (spacing decided by the session's linguistic rules), the sentence is
 * annotated, each token is pos-tagged from the standoff data, and the
 * dependency arcs are added from the relation map. Punctuation tokens with
 * no relation are attached to the nearest preceding non-punctuation token
 * when a punctuation dependency label is set.
 *
 * @return {@code true} if a configuration is pending after this call
 * @throws TalismaneException
 *             if a relation refers to a head or dependent id for which no
 *             token exists in the current sentence
 * @throws IOException
 *             declared by the overridden method; not thrown directly by the
 *             code visible here
 */
@Override
public boolean hasNextSentence() throws TalismaneException, IOException {
    boolean limitReached = this.getMaxSentenceCount() > 0 && sentenceCount >= this.getMaxSentenceCount();
    if (limitReached || configuration != null || sentenceIndex >= sentences.size()) {
        // nothing new to build: report whatever is currently pending
        return configuration != null;
    }

    List<StandoffToken> standoffTokens = sentences.get(sentenceIndex++);
    LinguisticRules linguisticRules = TalismaneSession.get(sessionId).getLinguisticRules();
    if (linguisticRules == null) {
        throw new RuntimeException("Linguistic rules have not been set.");
    }

    // rebuild the sentence text, letting the linguistic rules decide
    // whether a space goes before each word
    String sentenceText = "";
    for (StandoffToken standoffToken : standoffTokens) {
        String word = standoffToken.text;
        String separator = linguisticRules.shouldAddSpace(sentenceText, word) ? " " : "";
        sentenceText = sentenceText + separator + word;
    }

    Sentence sentence = new Sentence(sentenceText, sessionId);
    for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
        sentenceAnnotator.annotate(sentence);
    }

    // create one pos-tagged token per standoff token, indexed by standoff id
    PretokenisedSequence tokenSequence = new PretokenisedSequence(sentence, sessionId);
    PosTagSequence posTagSequence = new PosTagSequence(tokenSequence);
    Map<String, PosTaggedToken> tokensById = new HashMap<>();
    for (StandoffToken standoffToken : standoffTokens) {
        Token token = tokenSequence.addToken(standoffToken.text);
        PosTaggedToken taggedToken = new PosTaggedToken(token, new Decision(standoffToken.posTag.getCode()), sessionId);
        if (LOG.isTraceEnabled()) {
            LOG.trace(taggedToken.toString());
        }
        taggedToken.setComment(standoffToken.comment);
        posTagSequence.addPosTaggedToken(taggedToken);
        tokensById.put(standoffToken.id, taggedToken);
        LOG.debug("Found token " + standoffToken.id + ", " + taggedToken);
    }
    tokenSequence.setWithRoot(true);

    configuration = new ParseConfiguration(posTagSequence);
    for (StandoffToken standoffToken : standoffTokens) {
        StandoffRelation relation = relationMap.get(standoffToken.id);

        if (relation == null) {
            // no explicit relation: only punctuation gets a default attachment,
            // and only if a punctuation dependency label is available
            if (standoffToken.posTag.getOpenClassIndicator() == PosTagOpenClassIndicator.PUNCTUATION
                && punctuationDepLabel != null) {
                PosTaggedToken punctuation = tokensById.get(standoffToken.id);
                // scan leftwards for the closest token that is not punctuation
                int candidateIndex = punctuation.getIndex() - 1;
                while (candidateIndex >= 0) {
                    PosTaggedToken candidate = posTagSequence.get(candidateIndex);
                    if (candidate.getTag().getOpenClassIndicator() != PosTagOpenClassIndicator.PUNCTUATION) {
                        configuration.addDependency(candidate, punctuation, punctuationDepLabel, null);
                        break;
                    }
                    candidateIndex--;
                }
            }
            continue;
        }

        // explicit relation: both ends must resolve to tokens of this sentence
        PosTaggedToken head = tokensById.get(relation.fromToken);
        if (head == null) {
            throw new TalismaneException("No token found for head id: " + relation.fromToken);
        }
        PosTaggedToken dependent = tokensById.get(relation.toToken);
        if (dependent == null) {
            throw new TalismaneException("No token found for dependent id: " + relation.toToken);
        }
        DependencyArc arc = configuration.addDependency(head, dependent, relation.label, null);
        arc.setComment(relation.comment);
    }

    return configuration != null;
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) TalismaneException(com.joliciel.talismane.TalismaneException) PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) Token(com.joliciel.talismane.tokeniser.Token) Decision(com.joliciel.talismane.machineLearning.Decision) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration) PretokenisedSequence(com.joliciel.talismane.tokeniser.PretokenisedSequence) LinguisticRules(com.joliciel.talismane.LinguisticRules) SentenceAnnotator(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) DependencyArc(com.joliciel.talismane.parser.DependencyArc) Sentence(com.joliciel.talismane.rawText.Sentence)

Aggregations

Sentence (com.joliciel.talismane.rawText.Sentence)5 SentenceAnnotator (com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator)5 PosTagSequence (com.joliciel.talismane.posTagger.PosTagSequence)4 TalismaneException (com.joliciel.talismane.TalismaneException)3 ParseConfiguration (com.joliciel.talismane.parser.ParseConfiguration)3 TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence)3 LinguisticRules (com.joliciel.talismane.LinguisticRules)2 NonDeterministicParser (com.joliciel.talismane.parser.NonDeterministicParser)2 NonDeterministicPosTagger (com.joliciel.talismane.posTagger.NonDeterministicPosTagger)2 RawTextAnnotator (com.joliciel.talismane.rawText.RawTextAnnotator)2 HashMap (java.util.HashMap)2 AnnotatedText (com.joliciel.talismane.AnnotatedText)1 MergeAction (com.joliciel.talismane.corpus.CorpusSentenceRule.MergeAction)1 CompactLexicalEntry (com.joliciel.talismane.lexicon.CompactLexicalEntry)1 WritableLexicalEntry (com.joliciel.talismane.lexicon.WritableLexicalEntry)1 Decision (com.joliciel.talismane.machineLearning.Decision)1 DependencyArc (com.joliciel.talismane.parser.DependencyArc)1 ParseConfigurationProcessor (com.joliciel.talismane.parser.output.ParseConfigurationProcessor)1 PosTagAnnotatedCorpusReader (com.joliciel.talismane.posTagger.PosTagAnnotatedCorpusReader)1 PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken)1