Search in sources :

Example 46 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

the class TokenComparator method compare.

/**
 * Evaluate the evaluation corpus against the reference corpus.
 * <br>
 * For each reference sentence, the next evaluation sentence is read, both
 * sentences are split into default (atomic) tokens, and each reference atomic
 * token is tagged with a {@link TokeniserOutcome} derived from the evaluation
 * corpus. The resulting sequence is handed to every registered observer, and
 * all observers are notified once the reference corpus is exhausted.
 *
 * @throws TalismaneException
 *           if the evaluation corpus has fewer sentences than the reference
 *           corpus, or if a single sentence has more than
 *           {@link #MAX_MISMATCHES_PER_SENTENCE} atomic token mismatches
 * @throws IOException
 *           declared by the signature; presumably raised by the corpus
 *           readers (to be confirmed against their declarations)
 */
public void compare() throws TalismaneException, IOException {
    while (referenceCorpusReader.hasNextSentence()) {
        TokenSequence realSequence = referenceCorpusReader.nextTokenSequence();
        if (!evaluationCorpusReader.hasNextSentence()) {
            throw new TalismaneException("Wrong number of sentences in eval corpus: " + realSequence.getSentence().getText());
        }
        TokenSequence guessedSequence = evaluationCorpusReader.nextTokenSequence();
        Sentence sentence = realSequence.getSentence();

        // Initially, separate both sentences into tokens using the separators
        // provided
        TokenSequence realAtomicSequence = new TokenSequence(sentence, sessionId);
        realAtomicSequence.findDefaultTokens();
        TokenSequence guessedAtomicSequence = new TokenSequence(guessedSequence.getSentence(), sessionId);
        guessedAtomicSequence.findDefaultTokens();

        // Index every pattern match by its anchor token: the first
        // non-whitespace token to check, or the last token to check when they
        // are all whitespace (null when there are none).
        Map<Token, Set<TokenPatternMatchSequence>> tokenMatchSequenceMap = new HashMap<Token, Set<TokenPatternMatchSequence>>();
        for (TokenPattern parsedPattern : tokeniserPatternManager.getParsedTestPatterns()) {
            for (TokenPatternMatchSequence matchSequence : parsedPattern.match(realAtomicSequence)) {
                Token anchorToken = null;
                for (Token candidate : matchSequence.getTokensToCheck()) {
                    anchorToken = candidate;
                    if (!candidate.isWhiteSpace()) {
                        break;
                    }
                }
                Set<TokenPatternMatchSequence> sequencesForAnchor = tokenMatchSequenceMap.get(anchorToken);
                if (sequencesForAnchor == null) {
                    sequencesForAnchor = new TreeSet<TokenPatternMatchSequence>();
                    tokenMatchSequenceMap.put(anchorToken, sequencesForAnchor);
                }
                sequencesForAnchor.add(matchSequence);
            }
        }

        TokenisedAtomicTokenSequence guess = new TokenisedAtomicTokenSequence(realSequence.getSentence(), 0, sessionId);
        // i indexes the guessed atomic sequence; it only advances when the
        // real and guessed atomic tokens have the same text.
        int i = 0;
        int mismatches = 0;
        for (Token token : realAtomicSequence) {
            // NOTE(review): get(i) is not bounds-checked; if the guessed atomic
            // sequence is exhausted before the real one this presumably throws
            // an unchecked exception - confirm against TokenSequence.get.
            if (!token.getText().equals(guessedAtomicSequence.get(i).getToken().getText())) {
                // skipped stuff at start of sentence on guess, if it's been
                // through the parser
                TokeniserOutcome mismatchOutcome = TokeniserOutcome.SEPARATE;
                guess.addTaggedToken(token, this.buildDecision(mismatchOutcome, tokenMatchSequenceMap.get(token)), mismatchOutcome);
                mismatches++;
                LOG.debug("Mismatch: '" + token.getText() + "', '" + guessedAtomicSequence.get(i).getToken().getText() + "'");
                if (mismatches > MAX_MISMATCHES_PER_SENTENCE) {
                    LOG.info("Real sequence: " + realSequence.getSentence().getText());
                    LOG.info("Guessed sequence: " + guessedSequence.getSentence().getText());
                    throw new TalismaneException("Too many mismatches for sentence: " + realSequence.getSentence().getText());
                }
                continue;
            }

            // Texts agree: the outcome is SEPARATE when the guessed sequence
            // records a split at this atomic token's start index, else JOIN.
            TokeniserOutcome outcome = TokeniserOutcome.JOIN;
            if (guessedSequence.getTokenSplits().contains(guessedAtomicSequence.get(i).getToken().getStartIndex())) {
                outcome = TokeniserOutcome.SEPARATE;
            }
            guess.addTaggedToken(token, this.buildDecision(outcome, tokenMatchSequenceMap.get(token)), outcome);
            i++;
        }

        List<TokenisedAtomicTokenSequence> guessedAtomicSequences = new ArrayList<TokenisedAtomicTokenSequence>();
        guessedAtomicSequences.add(guess);
        for (TokenEvaluationObserver observer : observers) {
            observer.onNextTokenSequence(realSequence, guessedAtomicSequences);
        }
    }
    for (TokenEvaluationObserver observer : observers) {
        observer.onEvaluationComplete();
    }
}

/**
 * Maximum number of atomic token mismatches tolerated within one sentence
 * before {@link #compare()} gives up with a {@link TalismaneException}.
 */
private static final int MAX_MISMATCHES_PER_SENTENCE = 6;

/**
 * Build the decision for a single atomic token: the outcome name, this
 * class's simple name (prefixed by "_") as first authority, and, when
 * pattern matches are supplied, "_Patterns" followed by the name of each
 * matching pattern.
 *
 * @param outcome
 *          the outcome to record in the decision
 * @param matchSequences
 *          the pattern matches anchored on the token, or null if there are
 *          none
 * @return the populated decision
 */
private Decision buildDecision(TokeniserOutcome outcome, Set<TokenPatternMatchSequence> matchSequences) {
    Decision decision = new Decision(outcome.name());
    decision.addAuthority("_" + this.getClass().getSimpleName());
    if (matchSequences != null) {
        decision.addAuthority("_Patterns");
        for (TokenPatternMatchSequence matchSequence : matchSequences) {
            decision.addAuthority(matchSequence.getTokenPattern().getName());
        }
    }
    return decision;
}
Also used : TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) Set(java.util.Set) TalismaneException(com.joliciel.talismane.TalismaneException) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Token(com.joliciel.talismane.tokeniser.Token) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome) Decision(com.joliciel.talismane.machineLearning.Decision) TokenPattern(com.joliciel.talismane.tokeniser.patterns.TokenPattern) TokenPatternMatchSequence(com.joliciel.talismane.tokeniser.patterns.TokenPatternMatchSequence) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) Sentence(com.joliciel.talismane.rawText.Sentence) HashSet(java.util.HashSet) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence)

Example 47 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

the class SentenceProcessor method getProcessors.

/**
 * Collect the processors specified in the configuration key
 * talismane.core.[sessionId].sentence-detector.output.processors.<br>
 * <br>
 * Each processor must implement this interface and must have a constructor
 * matching one of the following signatures:<br>
 * - ( {@link File} outputDir, {@link String} sessionId)<br>
 * - ( {@link String} sessionId)<br>
 * <br>
 * Optionally, it can have a constructor with the following signature:<br>
 * - ( {@link Writer} writer, {@link String} sessionId)<br>
 * If a writer is provided here, then the first processor with the above
 * constructor will be given the writer.<br>
 * <br>
 * Constructors are tried in this order: (Writer, String) while the writer
 * has not yet been handed out, then (File, String), then (String).
 *
 * @param writer
 *          if specified, will be used for the first processor in the list
 *          with a writer in the constructor
 * @param outDir
 *          directory in which to write the various outputs; if not null, it
 *          is created (best effort) before any processor is instantiated,
 *          and it is passed as-is, possibly null, to (File, String)
 *          constructors
 * @param sessionId
 *          the session id, used both to locate the configuration section and
 *          as a constructor argument for each processor
 * @return the instantiated processors, in the order in which their class
 *         names are listed in the configuration
 * @throws IOException
 *           declared by the signature; no statement in this method visibly
 *           raises it
 * @throws ReflectiveOperationException
 *           if a configured class cannot be loaded
 *           ({@link ClassNotFoundException}) or a matching constructor
 *           cannot be invoked
 * @throws TalismaneException
 *           if a processor does not implement this interface, or if no
 *           constructor is found with the correct signature
 */
public static List<SentenceProcessor> getProcessors(Writer writer, File outDir, String sessionId) throws IOException, ReflectiveOperationException, ClassNotFoundException, TalismaneException {
    Config config = ConfigFactory.load();
    Config myConfig = config.getConfig("talismane.core." + sessionId + ".sentence-detector");
    List<SentenceProcessor> processors = new ArrayList<>();
    List<String> classes = myConfig.getStringList("output.processors");
    if (outDir != null) {
        // Best effort: the result is deliberately not checked.
        outDir.mkdirs();
    }
    // Set to null once a processor has taken the writer, so that at most one
    // processor receives it.
    Writer firstProcessorWriter = writer;
    for (String className : classes) {
        Class<?> untypedClass = Class.forName(className);
        if (!SentenceProcessor.class.isAssignableFrom(untypedClass)) {
            throw new TalismaneException("Class " + className + " does not implement interface " + SentenceProcessor.class.getSimpleName());
        }
        // Safe: assignability was verified just above, so asSubclass cannot
        // throw ClassCastException here.
        Class<? extends SentenceProcessor> clazz = untypedClass.asSubclass(SentenceProcessor.class);
        Constructor<? extends SentenceProcessor> cons = null;
        SentenceProcessor processor = null;
        if (firstProcessorWriter != null) {
            try {
                cons = clazz.getConstructor(Writer.class, String.class);
            } catch (NoSuchMethodException ignored) {
                // the (Writer, String) constructor is optional: fall through
                // to the next signature
            }
            if (cons != null) {
                processor = cons.newInstance(firstProcessorWriter, sessionId);
                firstProcessorWriter = null;
            }
        }
        if (cons == null) {
            try {
                cons = clazz.getConstructor(File.class, String.class);
            } catch (NoSuchMethodException ignored) {
                // no (File, String) constructor: fall through to the last
                // signature
            }
            if (cons != null) {
                processor = cons.newInstance(outDir, sessionId);
            }
        }
        if (cons == null) {
            try {
                cons = clazz.getConstructor(String.class);
            } catch (NoSuchMethodException ignored) {
                // handled below: cons stays null and an exception is thrown
            }
            if (cons != null) {
                processor = cons.newInstance(sessionId);
            } else {
                throw new TalismaneException("No constructor found with correct signature for: " + className);
            }
        }
        processors.add(processor);
    }
    return processors;
}
Also used : TalismaneException(com.joliciel.talismane.TalismaneException) Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) File(java.io.File) Writer(java.io.Writer)

Aggregations

TalismaneException (com.joliciel.talismane.TalismaneException)47 ArrayList (java.util.ArrayList)27 Config (com.typesafe.config.Config)14 File (java.io.File)11 List (java.util.List)10 TreeSet (java.util.TreeSet)10 FeatureResult (com.joliciel.talismane.machineLearning.features.FeatureResult)9 IOException (java.io.IOException)9 HashMap (java.util.HashMap)9 Set (java.util.Set)9 Decision (com.joliciel.talismane.machineLearning.Decision)8 RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment)8 PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken)8 Token (com.joliciel.talismane.tokeniser.Token)8 Map (java.util.Map)8 SortedSet (java.util.SortedSet)8 Collectors (java.util.stream.Collectors)8 Logger (org.slf4j.Logger)8 LoggerFactory (org.slf4j.LoggerFactory)8 Sentence (com.joliciel.talismane.rawText.Sentence)7