use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
the class Talismane method analyse.
/**
* Analyse the data provided by this reader, as specified by the
* configuration.
*
* @param reader
* the reader providing the data to analyse
* @throws IOException
* @throws ReflectiveOperationException
* @throws TalismaneException
* if it's impossible to read a sentence from an annotated corpus
*/
public void analyse(Reader reader) throws IOException, ReflectiveOperationException, TalismaneException {
long startTime = System.currentTimeMillis();
try {
TokeniserAnnotatedCorpusReader tokenCorpusReader = null;
PosTagAnnotatedCorpusReader posTagCorpusReader = null;
if (this.startModule.equals(Module.posTagger)) {
tokenCorpusReader = TokeniserAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".tokeniser.input"), sessionId);
}
if (this.startModule.equals(Module.parser)) {
posTagCorpusReader = PosTagAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".pos-tagger.input"), sessionId);
}
LinkedList<String> textSegments = new LinkedList<String>();
LinkedList<Sentence> sentences = new LinkedList<Sentence>();
TokenSequence tokenSequence = null;
PosTagSequence posTagSequence = null;
StringBuilder stringBuilder = new StringBuilder();
boolean finished = false;
int sentenceCount = 0;
CurrentFileProvider currentFileProvider = reader instanceof CurrentFileProvider ? (CurrentFileProvider) reader : null;
RollingTextBlock rollingTextBlock = new RollingTextBlock(this.processByDefault, currentFileProvider, sessionId);
int endBlockCharacterCount = 0;
URI currentURI = null;
File currentFile = null;
while (!finished) {
if (this.startModule.equals(Module.sentenceDetector) || this.startModule.equals(Module.tokeniser)) {
// Note: the SentenceDetector and Tokeniser start modules are treated
// identically, except that for SentenceDetector we apply a probabilistic
// sentence detector, whereas for Tokeniser we assume all sentence breaks
// are marked by filters.
// Read characters from the reader, one at a time.
char c;
int r = -1;
try {
r = reader.read();
} catch (IOException e) {
LogUtils.logError(LOG, e);
}
if (r == -1) {
finished = true;
c = '\n';
} else {
c = (char) r;
}
// Jump out if we have 3 consecutive end-block characters.
if (c == TalismaneSession.get(sessionId).getEndBlockCharacter()) {
endBlockCharacterCount++;
if (endBlockCharacterCount == 3) {
LOG.info("Three consecutive end-block characters. Exiting.");
finished = true;
}
} else {
endBlockCharacterCount = 0;
}
// cut a text segment when input ends, on an end-block character, or on
// whitespace once the block size is exceeded
if (finished || (Character.isWhitespace(c) && c != '\r' && c != '\n' && stringBuilder.length() > TalismaneSession.get(sessionId).getBlockSize()) || c == TalismaneSession.get(sessionId).getEndBlockCharacter()) {
if (c == TalismaneSession.get(sessionId).getEndBlockCharacter())
stringBuilder.append(c);
// is the current block > 0 characters?
if (stringBuilder.length() > 0) {
String textSegment = stringBuilder.toString();
stringBuilder = new StringBuilder();
textSegments.add(textSegment);
}
if (c == TalismaneSession.get(sessionId).getEndBlockCharacter()) {
textSegments.addLast("");
}
}
if (finished) {
if (stringBuilder.length() > 0) {
textSegments.addLast(stringBuilder.toString());
stringBuilder = new StringBuilder();
}
// add three final text segments to roll everything through processing
textSegments.addLast("");
textSegments.addLast("");
textSegments.addLast("");
}
if (c != TalismaneSession.get(sessionId).getEndBlockCharacter())
stringBuilder.append(c);
while (textSegments.size() > 0) {
// roll in a new block 4, and roll the other blocks leftwards
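// For illustration (not part of the original code): with blocks
// [B1|B2|B3|B4], roll(next) yields [B2|B3|B4|next]; the raw text filters
// then annotate what has become block 3 (the old B4), and sentence
// detection runs on the fully-annotated block 2 (the old B3).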
String nextText = textSegments.removeFirst();
rollingTextBlock = rollingTextBlock.roll(nextText);
// annotate block 3 with raw text filters
AnnotatedText rawTextBlock = rollingTextBlock.getRawTextBlock();
for (RawTextAnnotator textAnnotator : TalismaneSession.get(sessionId).getTextAnnotators()) {
textAnnotator.annotate(rawTextBlock);
}
// detect sentences in block 2 using the sentence detector
AnnotatedText processedText = rollingTextBlock.getProcessedText();
if (LOG.isTraceEnabled()) {
LOG.trace("processedText: " + processedText.getText().toString().replace('\n', '¶').replace('\r', '¶'));
}
if (this.startModule.equals(Module.sentenceDetector)) {
sentenceDetector.detectSentences(processedText);
}
// get the sentences detected in block 2
List<Sentence> theSentences = rollingTextBlock.getDetectedSentences();
for (Sentence sentence : theSentences) {
sentences.add(sentence);
sentenceCount++;
}
if (this.sentenceCount > 0 && sentenceCount >= this.sentenceCount) {
finished = true;
}
}
// we have at least one text segment to process
} else if (this.startModule.equals(Module.posTagger)) {
if (tokenCorpusReader.hasNextSentence()) {
tokenSequence = tokenCorpusReader.nextTokenSequence();
} else {
tokenSequence = null;
finished = true;
}
} else if (this.startModule.equals(Module.parser)) {
if (posTagCorpusReader.hasNextSentence()) {
posTagSequence = posTagCorpusReader.nextPosTagSequence();
} else {
posTagSequence = null;
finished = true;
}
}
// which start module?
boolean needToProcess = false;
if (this.startModule.equals(Module.sentenceDetector) || this.startModule.equals(Module.tokeniser))
needToProcess = !sentences.isEmpty();
else if (this.startModule.equals(Module.posTagger))
needToProcess = tokenSequence != null;
else if (this.startModule.equals(Module.parser))
needToProcess = posTagSequence != null;
while (needToProcess) {
Sentence sentence = null;
if (this.startModule.compareTo(Module.tokeniser) <= 0 && this.endModule.compareTo(Module.sentenceDetector) >= 0) {
sentence = sentences.poll();
LOG.debug("Sentence: " + sentence);
for (SentenceAnnotator annotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) annotator.annotate(sentence);
if (sentence.getFileURI() != null && !sentence.getFileURI().equals(currentURI)) {
currentURI = sentence.getFileURI();
currentFile = sentence.getFile();
LOG.debug("Setting current file to " + currentFile.getPath());
if (writer instanceof CurrentFileObserver)
((CurrentFileObserver) writer).onNextFile(currentFile);
for (SentenceProcessor processor : sentenceProcessors) if (processor instanceof CurrentFileObserver)
((CurrentFileObserver) processor).onNextFile(currentFile);
for (TokenSequenceProcessor processor : tokenSequenceProcessors) if (processor instanceof CurrentFileObserver)
((CurrentFileObserver) processor).onNextFile(currentFile);
for (PosTagSequenceProcessor processor : posTagSequenceProcessors) if (processor instanceof CurrentFileObserver)
((CurrentFileObserver) processor).onNextFile(currentFile);
for (ParseConfigurationProcessor processor : parseConfigurationProcessors) if (processor instanceof CurrentFileObserver)
((CurrentFileObserver) processor).onNextFile(currentFile);
}
if (sentence.getLeftoverOriginalText().length() > 0) {
writer.append(sentence.getLeftoverOriginalText() + "\n");
}
for (SentenceProcessor sentenceProcessor : sentenceProcessors) {
sentenceProcessor.onNextSentence(sentence);
}
}
// need to read next sentence
List<TokenSequence> tokenSequences = null;
if (this.needsTokeniser()) {
tokenSequences = tokeniser.tokenise(sentence);
tokenSequence = tokenSequences.get(0);
for (TokenSequenceProcessor tokenSequenceProcessor : tokenSequenceProcessors) {
tokenSequenceProcessor.onNextTokenSequence(tokenSequence);
}
}
// need to tokenise ?
List<PosTagSequence> posTagSequences = null;
if (this.needsPosTagger()) {
posTagSequence = null;
if (tokenSequences == null) {
tokenSequences = new ArrayListNoNulls<>();
tokenSequences.add(tokenSequence);
}
if (posTagger instanceof NonDeterministicPosTagger) {
NonDeterministicPosTagger nonDeterministicPosTagger = (NonDeterministicPosTagger) posTagger;
posTagSequences = nonDeterministicPosTagger.tagSentence(tokenSequences);
posTagSequence = posTagSequences.get(0);
} else {
posTagSequence = posTagger.tagSentence(tokenSequence);
}
for (PosTagSequenceProcessor posTagSequenceProcessor : this.posTagSequenceProcessors) {
posTagSequenceProcessor.onNextPosTagSequence(posTagSequence);
}
tokenSequence = null;
}
if (this.needsParser()) {
if (posTagSequences == null) {
posTagSequences = new ArrayListNoNulls<>();
posTagSequences.add(posTagSequence);
}
ParseConfiguration parseConfiguration = null;
List<ParseConfiguration> parseConfigurations = null;
try {
if (parser instanceof NonDeterministicParser) {
NonDeterministicParser nonDeterministicParser = (NonDeterministicParser) parser;
parseConfigurations = nonDeterministicParser.parseSentence(posTagSequences);
parseConfiguration = parseConfigurations.get(0);
} else {
parseConfiguration = parser.parseSentence(posTagSequence);
}
for (ParseConfigurationProcessor parseConfigurationProcessor : this.parseConfigurationProcessors) {
parseConfigurationProcessor.onNextParseConfiguration(parseConfiguration);
}
} catch (Exception e) {
LogUtils.logError(LOG, e);
if (stopOnError)
throw new RuntimeException(e);
}
posTagSequence = null;
}
if (this.startModule.equals(Module.sentenceDetector) || this.startModule.equals(Module.tokeniser))
needToProcess = !sentences.isEmpty();
else if (this.startModule.equals(Module.posTagger))
needToProcess = tokenSequence != null;
else if (this.startModule.equals(Module.parser))
needToProcess = posTagSequence != null;
}
// next sentence
}
// Output any leftover original text.
if (rollingTextBlock.getLeftoverOriginalText().length() > 0)
writer.append(rollingTextBlock.getLeftoverOriginalText());
} finally {
IOException exception = null;
try {
reader.close();
writer.flush();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
for (SentenceProcessor processor : this.sentenceProcessors) try {
processor.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
for (TokenSequenceProcessor processor : this.tokenSequenceProcessors) try {
processor.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
for (PosTagSequenceProcessor processor : this.posTagSequenceProcessors) {
try {
processor.onCompleteAnalysis();
processor.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
}
for (ParseConfigurationProcessor processor : this.parseConfigurationProcessors) {
try {
processor.onCompleteParse();
processor.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
}
long endTime = System.currentTimeMillis();
long totalTime = endTime - startTime;
LOG.debug("Total time for Talismane.process(): " + totalTime);
try {
writer.close();
} catch (IOException e) {
LogUtils.logError(LOG, e);
exception = e;
}
if (exception != null)
throw exception;
}
}
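As a usage note, here is a minimal sketch of driving this method, assuming a fully configured Talismane instance (construction depends on the version's configuration API and is omitted here); the input file name is a placeholder.

import com.joliciel.talismane.Talismane;

import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

public class AnalyseSketch {
    public static void run(Talismane talismane) throws Exception {
        // corpus.txt is a hypothetical input file
        Reader reader = Files.newBufferedReader(Paths.get("corpus.txt"), StandardCharsets.UTF_8);
        // analyse() runs the configured pipeline (sentence detection through
        // parsing) and closes both the reader and the configured writer and
        // processors in its finally block
        talismane.analyse(reader);
    }
}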
use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
the class PatternTokeniser method tokeniseInternal.
@Override
protected List<TokenisedAtomicTokenSequence> tokeniseInternal(TokenSequence initialSequence, Sentence sentence) throws TalismaneException, IOException {
List<TokenisedAtomicTokenSequence> sequences;
// Assign each separator its default value
List<TokeniserOutcome> defaultOutcomes = this.tokeniserPatternManager.getDefaultOutcomes(initialSequence);
List<Decision> defaultDecisions = new ArrayList<Decision>(defaultOutcomes.size());
for (TokeniserOutcome outcome : defaultOutcomes) {
Decision tokeniserDecision = new Decision(outcome.name());
tokeniserDecision.addAuthority("_" + this.getClass().getSimpleName());
tokeniserDecision.addAuthority("_" + "DefaultDecision");
defaultDecisions.add(tokeniserDecision);
}
// For each test pattern, see if anything in the sentence matches it
if (this.decisionMaker != null) {
List<TokenPatternMatchSequence> matchingSequences = new ArrayList<TokenPatternMatchSequence>();
Map<Token, Set<TokenPatternMatchSequence>> tokenMatchSequenceMap = new HashMap<Token, Set<TokenPatternMatchSequence>>();
Map<TokenPatternMatchSequence, TokenPatternMatch> primaryMatchMap = new HashMap<TokenPatternMatchSequence, TokenPatternMatch>();
Set<Token> matchedTokens = new HashSet<Token>();
for (TokenPattern parsedPattern : this.getTokeniserPatternManager().getParsedTestPatterns()) {
List<TokenPatternMatchSequence> matchesForThisPattern = parsedPattern.match(initialSequence);
for (TokenPatternMatchSequence matchSequence : matchesForThisPattern) {
if (matchSequence.getTokensToCheck().size() > 0) {
matchingSequences.add(matchSequence);
matchedTokens.addAll(matchSequence.getTokensToCheck());
TokenPatternMatch primaryMatch = null;
Token token = matchSequence.getTokensToCheck().get(0);
Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token);
if (matchSequences == null) {
matchSequences = new TreeSet<TokenPatternMatchSequence>();
tokenMatchSequenceMap.put(token, matchSequences);
}
matchSequences.add(matchSequence);
for (TokenPatternMatch patternMatch : matchSequence.getTokenPatternMatches()) {
if (patternMatch.getToken().equals(token)) {
primaryMatch = patternMatch;
break;
}
}
if (LOG.isTraceEnabled()) {
LOG.trace("Found match: " + primaryMatch);
}
primaryMatchMap.put(matchSequence, primaryMatch);
}
}
}
// we want to create the n most likely token sequences, where each
// sequence has to correspond to a token pattern
Map<TokenPatternMatchSequence, List<Decision>> matchSequenceDecisionMap = new HashMap<TokenPatternMatchSequence, List<Decision>>();
for (TokenPatternMatchSequence matchSequence : matchingSequences) {
TokenPatternMatch match = primaryMatchMap.get(matchSequence);
LOG.debug("next pattern match: " + match.toString());
List<FeatureResult<?>> tokenFeatureResults = new ArrayList<FeatureResult<?>>();
for (TokenPatternMatchFeature<?> feature : features) {
RuntimeEnvironment env = new RuntimeEnvironment();
FeatureResult<?> featureResult = feature.check(match, env);
if (featureResult != null) {
tokenFeatureResults.add(featureResult);
}
}
if (LOG.isTraceEnabled()) {
SortedSet<String> featureResultSet = tokenFeatureResults.stream().map(f -> f.toString()).collect(Collectors.toCollection(() -> new TreeSet<String>()));
for (String featureResultString : featureResultSet) {
LOG.trace(featureResultString);
}
}
List<Decision> decisions = this.decisionMaker.decide(tokenFeatureResults);
for (ClassificationObserver observer : this.observers) observer.onAnalyse(match.getToken(), tokenFeatureResults, decisions);
for (Decision decision : decisions) {
decision.addAuthority("_" + this.getClass().getSimpleName());
decision.addAuthority("_" + "Patterns");
decision.addAuthority(match.getPattern().getName());
}
matchSequenceDecisionMap.put(matchSequence, decisions);
}
// initially create a heap with a single, empty sequence
PriorityQueue<TokenisedAtomicTokenSequence> heap = new PriorityQueue<TokenisedAtomicTokenSequence>();
TokenisedAtomicTokenSequence emptySequence = new TokenisedAtomicTokenSequence(sentence, 0, this.getSessionId());
heap.add(emptySequence);
for (int i = 0; i < initialSequence.listWithWhiteSpace().size(); i++) {
Token token = initialSequence.listWithWhiteSpace().get(i);
if (LOG.isTraceEnabled()) {
LOG.trace("Token : \"" + token.getAnalyisText() + "\"");
}
// build a new heap for this iteration
PriorityQueue<TokenisedAtomicTokenSequence> previousHeap = heap;
heap = new PriorityQueue<TokenisedAtomicTokenSequence>();
if (i == 0) {
// first token is always "separate" from the outside world
Decision decision = new Decision(TokeniserOutcome.SEPARATE.name());
decision.addAuthority("_" + this.getClass().getSimpleName());
decision.addAuthority("_" + "DefaultDecision");
TaggedToken<TokeniserOutcome> taggedToken = new TaggedToken<>(token, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
TokenisedAtomicTokenSequence newSequence = new TokenisedAtomicTokenSequence(emptySequence);
newSequence.add(taggedToken);
heap.add(newSequence);
continue;
}
// limit the heap breadth to K
int maxSequences = previousHeap.size() > this.getBeamWidth() ? this.getBeamWidth() : previousHeap.size();
for (int j = 0; j < maxSequences; j++) {
TokenisedAtomicTokenSequence history = previousHeap.poll();
// Find the separating & non-separating decisions
if (history.size() > i) {
// token already added as part of a sequence introduced by another token
heap.add(history);
} else if (tokenMatchSequenceMap.containsKey(token)) {
// token begins one or more match sequences
// these are ordered from shortest to longest (via TreeSet)
List<TokenPatternMatchSequence> matchSequences = new ArrayList<TokenPatternMatchSequence>(tokenMatchSequenceMap.get(token));
// Since sequences P1..Pn contain each other, there are exactly
// matchSequences.size() + 1 consistent solutions.
// Assume the default is separate:
// 0: all separate
// 1: join P1, separate the rest
// 2: join P2, separate the rest
// ...
// n: join Pn
// We need to add each of these to the heap, taking the product of all
// probabilities consistent with each solution.
// The probabilities for each solution are (j = join, s = separate):
// All separate: s1 x s2 x ... x sn
// P1: j1 x s2 x ... x sn
// P2: j1 x j2 x ... x sn
// ...
// Pn: j1 x j2 x ... x jn
// Any solution of the form s1 x j2 would be inconsistent, and is not
// considered.
// If Pi and Pj start and end on the exact same token, then the solution
// for both is:
// Pi: j1 x ... x ji x jj x sj+1 x ... x sn
// Pj: j1 x ... x ji x jj x sj+1 x ... x sn
// Of course we're unlikely ever to have more than two patterns here,
// but we need a solution for the general case just to be safe.
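// Worked example (hypothetical numbers, not from the original code):
// for two nested patterns P1 and P2 (P1 contained in P2) with
// j1 = 0.8 / s1 = 0.2 and j2 = 0.6 / s2 = 0.4, the three consistent
// solutions score: all separate = 0.2 x 0.4 = 0.08; join P1 = 0.8 x 0.4
// = 0.32; join P2 = 0.8 x 0.6 = 0.48, which normalise below to roughly
// 0.09, 0.36 and 0.55.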
TokeniserOutcome defaultOutcome = TokeniserOutcome.valueOf(defaultDecisions.get(token.getIndexWithWhiteSpace()).getOutcome());
TokeniserOutcome otherOutcome = null;
if (defaultOutcome == TokeniserOutcome.SEPARATE)
otherOutcome = TokeniserOutcome.JOIN;
else
otherOutcome = TokeniserOutcome.SEPARATE;
double[] decisionProbs = new double[matchSequences.size() + 1];
for (int k = 0; k < decisionProbs.length; k++) decisionProbs[k] = 1;
// Note: k = 0 is the default decision (e.g. separate all);
// k = 1 and p = 1 both refer to the first pattern
int p = 1;
int prevEndIndex = -1;
for (TokenPatternMatchSequence matchSequence : matchSequences) {
int endIndex = matchSequence.getTokensToCheck().get(matchSequence.getTokensToCheck().size() - 1).getEndIndex();
List<Decision> decisions = matchSequenceDecisionMap.get(matchSequence);
for (Decision decision : decisions) {
for (int k = 0; k < decisionProbs.length; k++) {
if (decision.getOutcome().equals(defaultOutcome.name())) {
// e.g. separate in most cases
if (k < p && endIndex > prevEndIndex)
decisionProbs[k] *= decision.getProbability();
else if (k + 1 < p && endIndex <= prevEndIndex)
decisionProbs[k] *= decision.getProbability();
} else {
// e.g. join in most cases
if (k >= p && endIndex > prevEndIndex)
decisionProbs[k] *= decision.getProbability();
else if (k + 1 >= p && endIndex <= prevEndIndex)
decisionProbs[k] *= decision.getProbability();
}
}
// next k
}
// next decision (only 2 of these)
prevEndIndex = endIndex;
p++;
}
// transform to probability distribution
double sumProbs = 0;
for (int k = 0; k < decisionProbs.length; k++) sumProbs += decisionProbs[k];
if (sumProbs > 0)
for (int k = 0; k < decisionProbs.length; k++) decisionProbs[k] /= sumProbs;
// Apply the default decision.
// Since this is the default decision for all tokens in the sequence,
// we don't add the other tokens for now, so as to allow them to get
// examined one at a time, in case one of them starts its own separate
// sequence.
Decision defaultDecision = new Decision(defaultOutcome.name(), decisionProbs[0]);
defaultDecision.addAuthority("_" + this.getClass().getSimpleName());
defaultDecision.addAuthority("_" + "Patterns");
for (TokenPatternMatchSequence matchSequence : matchSequences) {
defaultDecision.addAuthority(matchSequence.getTokenPattern().getName());
}
TaggedToken<TokeniserOutcome> defaultTaggedToken = new TaggedToken<>(token, defaultDecision, TokeniserOutcome.valueOf(defaultDecision.getOutcome()));
TokenisedAtomicTokenSequence defaultSequence = new TokenisedAtomicTokenSequence(history);
defaultSequence.add(defaultTaggedToken);
defaultSequence.addDecision(defaultDecision);
heap.add(defaultSequence);
// Apply one non-default decision per match sequence
for (int k = 0; k < matchSequences.size(); k++) {
TokenPatternMatchSequence matchSequence = matchSequences.get(k);
double prob = decisionProbs[k + 1];
Decision decision = new Decision(otherOutcome.name(), prob);
decision.addAuthority("_" + this.getClass().getSimpleName());
decision.addAuthority("_" + "Patterns");
decision.addAuthority(matchSequence.getTokenPattern().getName());
TaggedToken<TokeniserOutcome> taggedToken = new TaggedToken<>(token, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
TokenisedAtomicTokenSequence newSequence = new TokenisedAtomicTokenSequence(history);
newSequence.add(taggedToken);
newSequence.addDecision(decision);
// add all the other tokens in this match sequence to the solution
for (Token tokenInSequence : matchSequence.getTokensToCheck()) {
if (tokenInSequence.equals(token)) {
continue;
}
Decision decisionInSequence = new Decision(decision.getOutcome());
decisionInSequence.addAuthority("_" + this.getClass().getSimpleName());
decisionInSequence.addAuthority("_" + "DecisionInSequence");
decisionInSequence.addAuthority("_" + "DecisionInSequence_non_default");
decisionInSequence.addAuthority("_" + "Patterns");
TaggedToken<TokeniserOutcome> taggedTokenInSequence = new TaggedToken<>(tokenInSequence, decisionInSequence, TokeniserOutcome.valueOf(decisionInSequence.getOutcome()));
newSequence.add(taggedTokenInSequence);
}
heap.add(newSequence);
}
// next sequence
} else {
// token doesn't start a match sequence, and hasn't already
// been added to the current sequence
Decision decision = defaultDecisions.get(i);
if (matchedTokens.contains(token)) {
decision = new Decision(decision.getOutcome());
decision.addAuthority("_" + this.getClass().getSimpleName());
decision.addAuthority("_" + "DecisionInSequence");
decision.addAuthority("_" + "DecisionInSequence_default");
decision.addAuthority("_" + "Patterns");
}
TaggedToken<TokeniserOutcome> taggedToken = new TaggedToken<>(token, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
TokenisedAtomicTokenSequence newSequence = new TokenisedAtomicTokenSequence(history);
newSequence.add(taggedToken);
heap.add(newSequence);
}
}
// next sequence in the old heap
}
// next token
sequences = new ArrayList<TokenisedAtomicTokenSequence>();
int k = 0;
while (!heap.isEmpty()) {
sequences.add(heap.poll());
k++;
if (k >= this.getBeamWidth())
break;
}
} else {
sequences = new ArrayList<TokenisedAtomicTokenSequence>();
TokenisedAtomicTokenSequence defaultSequence = new TokenisedAtomicTokenSequence(sentence, 0, this.getSessionId());
int i = 0;
for (Token token : initialSequence.listWithWhiteSpace()) {
Decision decision = defaultDecisions.get(i++);
TaggedToken<TokeniserOutcome> taggedToken = new TaggedToken<>(token, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
defaultSequence.add(taggedToken);
}
sequences.add(defaultSequence);
}
// have decision maker?
return sequences;
}
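The heap handling above implements a beam search. A self-contained sketch of the pruning idea in plain Java (not the Talismane API; the scores reuse the hypothetical worked example above):

import java.util.Comparator;
import java.util.PriorityQueue;

public class BeamSketch {
    public static void main(String[] args) {
        int beamWidth = 2; // hypothetical beam width
        // hypothetical scores of the candidate sequences built so far
        PriorityQueue<Double> previousHeap = new PriorityQueue<>(Comparator.reverseOrder());
        previousHeap.add(0.08);
        previousHeap.add(0.36);
        previousHeap.add(0.55);
        // limit the heap breadth to the beam width, as in tokeniseInternal
        int maxSequences = Math.min(previousHeap.size(), beamWidth);
        for (int j = 0; j < maxSequences; j++) {
            // only the best hypotheses are expanded; 0.08 falls off the beam
            System.out.println("expanding hypothesis with score " + previousHeap.poll());
        }
    }
}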
use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
the class CombinedLexicalAttributesTest method testCheckInternalMultipleEntries.
@Test
public void testCheckInternalMultipleEntries() throws Exception {
System.setProperty("config.file", "src/test/resources/testWithLex.conf");
ConfigFactory.invalidateCaches();
final Config config = ConfigFactory.load();
final String sessionId = "test";
Sentence sentence = new Sentence("je demande", sessionId);
TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
Token token = new Token("demande", tokenSequence, 1, "je ".length(), "je demande".length(), sessionId);
Decision decision = new Decision("V", 1.0);
final PosTaggedToken posTaggedToken = new PosTaggedToken(token, decision, sessionId);
PosTaggedTokenAddressFunction<PosTaggerContext> addressFunction = new AbstractPosTaggedTokenAddressFunction() {
@Override
protected FeatureResult<PosTaggedTokenWrapper> checkInternal(PosTaggerContext context, RuntimeEnvironment env) {
return this.generateResult(posTaggedToken);
}
};
StringLiteralFeature<PosTaggedTokenWrapper> person = new StringLiteralFeature<>(LexicalAttribute.Person.name());
CombinedLexicalAttributesFeature<PosTaggerContext> feature = new CombinedLexicalAttributesFeature<>(addressFunction, person);
PosTagSequence history = new PosTagSequence(tokenSequence);
PosTaggerContext context = new PosTaggerContextImpl(token, history);
RuntimeEnvironment env = new RuntimeEnvironment();
FeatureResult<String> featureResult = feature.checkInternal(context, env);
String outcome = featureResult.getOutcome();
System.out.println(outcome);
assertEquals("1;3", outcome);
}
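Here the test lexicon loaded via testWithLex.conf (an assumption about the test fixture) gives the verb "demande" two entries differing in person, and CombinedLexicalAttributesFeature concatenates the distinct Person values with ";", hence the expected "1;3".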
use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
the class CombinedLexicalAttributesTest method testCheckInternalMultipleAttributes.
@Test
public void testCheckInternalMultipleAttributes() throws Exception {
System.setProperty("config.file", "src/test/resources/testWithLex.conf");
ConfigFactory.invalidateCaches();
final Config config = ConfigFactory.load();
final String sessionId = "test";
Sentence sentence = new Sentence("blah", sessionId);
TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
Token token = new Token("blah", tokenSequence, 1, "".length(), "blah".length(), sessionId);
Decision decision = new Decision("V", 1.0);
final PosTaggedToken posTaggedToken = new PosTaggedToken(token, decision, sessionId);
PosTaggedTokenAddressFunction<PosTaggerContext> addressFunction = new AbstractPosTaggedTokenAddressFunction() {
@Override
protected FeatureResult<PosTaggedTokenWrapper> checkInternal(PosTaggerContext context, RuntimeEnvironment env) {
return this.generateResult(posTaggedToken);
}
};
StringLiteralFeature<PosTaggedTokenWrapper> person = new StringLiteralFeature<>(LexicalAttribute.Person.name());
StringLiteralFeature<PosTaggedTokenWrapper> number = new StringLiteralFeature<>(LexicalAttribute.Number.name());
CombinedLexicalAttributesFeature<PosTaggerContext> feature = new CombinedLexicalAttributesFeature<>(addressFunction, person, number);
PosTagSequence history = new PosTagSequence(tokenSequence);
PosTaggerContext context = new PosTaggerContextImpl(token, history);
RuntimeEnvironment env = new RuntimeEnvironment();
FeatureResult<String> featureResult = feature.checkInternal(context, env);
String outcome = featureResult.getOutcome();
System.out.println(outcome);
assertEquals("1;3|p;s", outcome);
}
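Judging by the expected outcome, with several attributes the combined feature first merges the values of each attribute across entries (Person "1;3", Number "p;s") and then joins the attributes with "|", giving "1;3|p;s".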
use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
the class LexicalAttributeFeatureTest method testCheckInternalMultipleAttributes.
@Test
public void testCheckInternalMultipleAttributes() throws Exception {
System.setProperty("config.file", "src/test/resources/testWithLex.conf");
ConfigFactory.invalidateCaches();
final Config config = ConfigFactory.load();
final String sessionId = "test";
Sentence sentence = new Sentence("blah", sessionId);
TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
Token token = new Token("blah", tokenSequence, 1, "".length(), "blah".length(), sessionId);
Decision decision = new Decision("V", 1.0);
final PosTaggedToken posTaggedToken = new PosTaggedToken(token, decision, sessionId);
PosTaggedTokenAddressFunction<PosTaggerContext> addressFunction = new AbstractPosTaggedTokenAddressFunction() {
@Override
protected FeatureResult<PosTaggedTokenWrapper> checkInternal(PosTaggerContext context, RuntimeEnvironment env) {
return this.generateResult(posTaggedToken);
}
};
StringLiteralFeature<PosTaggedTokenWrapper> person = new StringLiteralFeature<>(LexicalAttribute.Person.name());
StringLiteralFeature<PosTaggedTokenWrapper> number = new StringLiteralFeature<>(LexicalAttribute.Number.name());
LexicalAttributeFeature<PosTaggerContext> feature = new LexicalAttributeFeature<>(addressFunction, person, number);
PosTagSequence history = new PosTagSequence(tokenSequence);
PosTaggerContext context = new PosTaggerContextImpl(token, history);
RuntimeEnvironment env = new RuntimeEnvironment();
FeatureResult<List<WeightedOutcome<String>>> featureResult = feature.checkInternal(context, env);
List<WeightedOutcome<String>> outcomes = featureResult.getOutcome();
System.out.println(outcomes);
for (WeightedOutcome<String> outcome : outcomes) {
assertTrue("3|p".equals(outcome.getOutcome()) || "1|s".equals(outcome.getOutcome()) || "3|s".equals(outcome.getOutcome()));
}
assertEquals(3, outcomes.size());
}
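By contrast with CombinedLexicalAttributesFeature above, LexicalAttributeFeature returns one weighted outcome per lexical entry, each combining that entry's own attribute values with "|" ("3|p", "1|s", "3|s"), rather than merging values across entries.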