Search in sources :

Example 1 with PosTagAnnotatedCorpusReader

use of com.joliciel.talismane.posTagger.PosTagAnnotatedCorpusReader in project talismane by joliciel-informatique.

the class Talismane method analyse.

/**
 * Analyse the data provided by this reader, as specified by the
 * configuration.
 * <p>
 * How the reader is consumed depends on the start module:
 * <ul>
 * <li>sentenceDetector or tokeniser: characters are read one at a time and
 * grouped into text segments, which are rolled through a
 * {@link RollingTextBlock} to obtain sentences.</li>
 * <li>posTagger: the reader is wrapped in a
 * {@link TokeniserAnnotatedCorpusReader} and token sequences are read from
 * it.</li>
 * <li>parser: the reader is wrapped in a {@link PosTagAnnotatedCorpusReader}
 * and pos-tag sequences are read from it.</li>
 * </ul>
 * Each unit read is then passed through the tokeniser, pos-tagger and parser
 * stages that are required (see needsTokeniser(), needsPosTagger() and
 * needsParser()), notifying the registered processors after each stage.
 * <p>
 * Whether or not the analysis succeeds, the reader, the writer and all
 * processors are closed before this method returns.
 *
 * @param reader
 *          the source of raw text or of an annotated corpus; it is closed by
 *          this method
 * @throws IOException
 *           if an I/O error propagates from the analysis, or if flushing or
 *           closing the reader, the writer or a processor fails (when
 *           several close operations fail, the last exception caught is the
 *           one thrown). Note that an IOException raised while reading a raw
 *           character is logged and treated as end of input rather than
 *           thrown.
 * @throws ReflectiveOperationException
 *           not thrown directly by the statements of this method; presumably
 *           propagated from a called component - TODO confirm which one
 * @throws TalismaneException
 *           if it's impossible to read a sentence from an annotated corpus
 */
public void analyse(Reader reader) throws IOException, ReflectiveOperationException, TalismaneException {
    long startTime = System.currentTimeMillis();
    try {
        // Annotated corpus readers are only created when the analysis starts
        // at the pos-tagger or the parser; otherwise both remain null and raw
        // characters are read directly from the reader.
        TokeniserAnnotatedCorpusReader tokenCorpusReader = null;
        PosTagAnnotatedCorpusReader posTagCorpusReader = null;
        if (this.startModule.equals(Module.posTagger)) {
            tokenCorpusReader = TokeniserAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".tokeniser.input"), sessionId);
        }
        if (this.startModule.equals(Module.parser)) {
            posTagCorpusReader = PosTagAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".pos-tagger.input"), sessionId);
        }
        // textSegments: raw text segments waiting to be rolled into the
        // rolling text block.
        // sentences: sentences detected so far and not yet processed.
        LinkedList<String> textSegments = new LinkedList<String>();
        LinkedList<Sentence> sentences = new LinkedList<Sentence>();
        TokenSequence tokenSequence = null;
        PosTagSequence posTagSequence = null;
        // accumulates the characters of the text segment currently being read
        StringBuilder stringBuilder = new StringBuilder();
        boolean finished = false;
        // local count of detected sentences, compared below against the
        // this.sentenceCount field (which it shadows)
        int sentenceCount = 0;
        CurrentFileProvider currentFileProvider = reader instanceof CurrentFileProvider ? (CurrentFileProvider) reader : null;
        RollingTextBlock rollingTextBlock = new RollingTextBlock(this.processByDefault, currentFileProvider, sessionId);
        // number of end-block characters read in a row
        int endBlockCharacterCount = 0;
        // URI and file of the most recently processed sentence, used to
        // detect when the input moves on to a new file
        URI currentURI = null;
        File currentFile = null;
        // Each iteration reads one unit of input (a single character in raw
        // text mode, a whole sentence in corpus mode) and then processes
        // whatever has become available.
        while (!finished) {
            if (this.startModule.equals(Module.sentenceDetector) || this.startModule.equals(Module.tokeniser)) {
                // Note SentenceDetector and Tokeniser start modules treated
                // identically,
                // except that for SentenceDetector we apply a probabilistic
                // sentence detector
                // whereas for Tokeniser we assume all sentence breaks are
                // marked by filters
                //
                // read characters from the reader, one at a time
                char c;
                int r = -1;
                try {
                    r = reader.read();
                } catch (IOException e) {
                    // The error is only logged: r keeps its initial value of
                    // -1, so a read failure is handled below exactly like the
                    // end of the input.
                    LogUtils.logError(LOG, e);
                }
                if (r == -1) {
                    // end of input: use a newline as the final character
                    finished = true;
                    c = '\n';
                } else {
                    c = (char) r;
                }
                // Jump out if we have 3 consecutive end-block characters.
                if (c == TalismaneSession.get(sessionId).getEndBlockCharacter()) {
                    endBlockCharacterCount++;
                    if (endBlockCharacterCount == 3) {
                        LOG.info("Three consecutive end-block characters. Exiting.");
                        finished = true;
                    }
                } else {
                    endBlockCharacterCount = 0;
                }
                // Close the current text segment when the input is finished,
                // when a whitespace character other than CR or LF is read
                // after the segment has grown beyond the session's block
                // size, or when an end-block character is read.
                if (finished || (Character.isWhitespace(c) && c != '\r' && c != '\n' && stringBuilder.length() > TalismaneSession.get(sessionId).getBlockSize()) || c == TalismaneSession.get(sessionId).getEndBlockCharacter()) {
                    // an end-block character is kept at the end of the
                    // segment it closes
                    if (c == TalismaneSession.get(sessionId).getEndBlockCharacter())
                        stringBuilder.append(c);
                    // is the current block > 0 characters?
                    if (stringBuilder.length() > 0) {
                        String textSegment = stringBuilder.toString();
                        stringBuilder = new StringBuilder();
                        textSegments.add(textSegment);
                    }
                    // an end-block character is additionally followed by an
                    // empty text segment
                    if (c == TalismaneSession.get(sessionId).getEndBlockCharacter()) {
                        textSegments.addLast("");
                    }
                }
                if (finished) {
                    // NOTE(review): whenever finished is true the block above
                    // has already emptied stringBuilder, so this branch looks
                    // unreachable - confirm before relying on it.
                    if (stringBuilder.length() > 0) {
                        textSegments.addLast(stringBuilder.toString());
                        stringBuilder = new StringBuilder();
                    }
                    // add three final text segments to roll everything
                    // through processing
                    textSegments.addLast("");
                    textSegments.addLast("");
                    textSegments.addLast("");
                }
                // Any character other than the end-block character starts or
                // continues the next segment; this includes the whitespace
                // character that triggered a split above.
                if (c != TalismaneSession.get(sessionId).getEndBlockCharacter())
                    stringBuilder.append(c);
                // we have at least one text segment to process
                while (textSegments.size() > 0) {
                    // roll in a new block 4, and roll the other blocks
                    // leftwards
                    String nextText = textSegments.removeFirst();
                    rollingTextBlock = rollingTextBlock.roll(nextText);
                    // annotate block 3 with raw text filters
                    AnnotatedText rawTextBlock = rollingTextBlock.getRawTextBlock();
                    for (RawTextAnnotator textAnnotator : TalismaneSession.get(sessionId).getTextAnnotators()) {
                        textAnnotator.annotate(rawTextBlock);
                    }
                    // detect sentences in block 2 using the sentence
                    // detector
                    AnnotatedText processedText = rollingTextBlock.getProcessedText();
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("processedText: " + processedText.getText().toString().replace('\n', '¶').replace('\r', '¶'));
                    }
                    // the sentence detector is only applied when it is the
                    // start module
                    if (this.startModule.equals(Module.sentenceDetector)) {
                        sentenceDetector.detectSentences(processedText);
                    }
                    // get the sentences detected in block 2
                    List<Sentence> theSentences = rollingTextBlock.getDetectedSentences();
                    for (Sentence sentence : theSentences) {
                        sentences.add(sentence);
                        sentenceCount++;
                    }
                    // Stop reading once the configured sentence limit is
                    // reached (a limit of 0 or less disables the check). The
                    // segments still queued continue to be rolled by this
                    // loop.
                    if (this.sentenceCount > 0 && sentenceCount >= this.sentenceCount) {
                        finished = true;
                    }
                }
            } else if (this.startModule.equals(Module.posTagger)) {
                // read the next token sequence from the annotated corpus, or
                // stop when there are no more sentences
                if (tokenCorpusReader.hasNextSentence()) {
                    tokenSequence = tokenCorpusReader.nextTokenSequence();
                } else {
                    tokenSequence = null;
                    finished = true;
                }
            } else if (this.startModule.equals(Module.parser)) {
                // read the next pos-tag sequence from the annotated corpus,
                // or stop when there are no more sentences
                if (posTagCorpusReader.hasNextSentence()) {
                    posTagSequence = posTagCorpusReader.nextPosTagSequence();
                } else {
                    posTagSequence = null;
                    finished = true;
                }
            }
            // Is there anything to process? What counts as pending input
            // depends on the start module.
            boolean needToProcess = false;
            if (this.startModule.equals(Module.sentenceDetector) || this.startModule.equals(Module.tokeniser))
                needToProcess = !sentences.isEmpty();
            else if (this.startModule.equals(Module.posTagger))
                needToProcess = tokenSequence != null;
            else if (this.startModule.equals(Module.parser))
                needToProcess = posTagSequence != null;
            while (needToProcess) {
                Sentence sentence = null;
                // need to read next sentence: only when the start module is
                // ordered at or before the tokeniser and the end module at or
                // after the sentence detector (presumably Module constants
                // are declared in pipeline order - TODO confirm)
                if (this.startModule.compareTo(Module.tokeniser) <= 0 && this.endModule.compareTo(Module.sentenceDetector) >= 0) {
                    sentence = sentences.poll();
                    LOG.debug("Sentence: " + sentence);
                    for (SentenceAnnotator annotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) annotator.annotate(sentence);
                    // When the sentence comes from a different URI than the
                    // previous one, tell the writer and every processor that
                    // implements CurrentFileObserver about the new file.
                    if (sentence.getFileURI() != null && !sentence.getFileURI().equals(currentURI)) {
                        currentURI = sentence.getFileURI();
                        currentFile = sentence.getFile();
                        LOG.debug("Setting current file to " + currentFile.getPath());
                        if (writer instanceof CurrentFileObserver)
                            ((CurrentFileObserver) writer).onNextFile(currentFile);
                        for (SentenceProcessor processor : sentenceProcessors) if (processor instanceof CurrentFileObserver)
                            ((CurrentFileObserver) processor).onNextFile(currentFile);
                        for (TokenSequenceProcessor processor : tokenSequenceProcessors) if (processor instanceof CurrentFileObserver)
                            ((CurrentFileObserver) processor).onNextFile(currentFile);
                        for (PosTagSequenceProcessor processor : posTagSequenceProcessors) if (processor instanceof CurrentFileObserver)
                            ((CurrentFileObserver) processor).onNextFile(currentFile);
                        for (ParseConfigurationProcessor processor : parseConfigurationProcessors) if (processor instanceof CurrentFileObserver)
                            ((CurrentFileObserver) processor).onNextFile(currentFile);
                    }
                    // write out the sentence's leftover original text, if
                    // any, followed by a newline
                    if (sentence.getLeftoverOriginalText().length() > 0) {
                        writer.append(sentence.getLeftoverOriginalText() + "\n");
                    }
                    for (SentenceProcessor sentenceProcessor : sentenceProcessors) {
                        sentenceProcessor.onNextSentence(sentence);
                    }
                }
                // need to tokenise ?
                List<TokenSequence> tokenSequences = null;
                if (this.needsTokeniser()) {
                    tokenSequences = tokeniser.tokenise(sentence);
                    // only the first token sequence returned is passed to the
                    // processors
                    tokenSequence = tokenSequences.get(0);
                    for (TokenSequenceProcessor tokenSequenceProcessor : tokenSequenceProcessors) {
                        tokenSequenceProcessor.onNextTokenSequence(tokenSequence);
                    }
                }
                // need to pos-tag ?
                List<PosTagSequence> posTagSequences = null;
                if (this.needsPosTagger()) {
                    posTagSequence = null;
                    // if the tokeniser did not run, wrap the single token
                    // sequence in a list for the non-deterministic pos-tagger
                    if (tokenSequences == null) {
                        tokenSequences = new ArrayListNoNulls<>();
                        tokenSequences.add(tokenSequence);
                    }
                    if (posTagger instanceof NonDeterministicPosTagger) {
                        // a non-deterministic pos-tagger receives all token
                        // sequences; the first pos-tag sequence it returns is
                        // the one passed to the processors
                        NonDeterministicPosTagger nonDeterministicPosTagger = (NonDeterministicPosTagger) posTagger;
                        posTagSequences = nonDeterministicPosTagger.tagSentence(tokenSequences);
                        posTagSequence = posTagSequences.get(0);
                    } else {
                        posTagSequence = posTagger.tagSentence(tokenSequence);
                    }
                    for (PosTagSequenceProcessor posTagSequenceProcessor : this.posTagSequenceProcessors) {
                        posTagSequenceProcessor.onNextPosTagSequence(posTagSequence);
                    }
                    // cleared so that the posTagger start-module check at the
                    // end of this loop sees no pending token sequence
                    tokenSequence = null;
                }
                // need to parse ?
                if (this.needsParser()) {
                    // if the pos-tagger did not run, wrap the single pos-tag
                    // sequence in a list for the non-deterministic parser
                    if (posTagSequences == null) {
                        posTagSequences = new ArrayListNoNulls<>();
                        posTagSequences.add(posTagSequence);
                    }
                    ParseConfiguration parseConfiguration = null;
                    List<ParseConfiguration> parseConfigurations = null;
                    try {
                        if (parser instanceof NonDeterministicParser) {
                            // a non-deterministic parser receives all pos-tag
                            // sequences; the first configuration it returns
                            // is the one passed to the processors
                            NonDeterministicParser nonDeterministicParser = (NonDeterministicParser) parser;
                            parseConfigurations = nonDeterministicParser.parseSentence(posTagSequences);
                            parseConfiguration = parseConfigurations.get(0);
                        } else {
                            parseConfiguration = parser.parseSentence(posTagSequence);
                        }
                        for (ParseConfigurationProcessor parseConfigurationProcessor : this.parseConfigurationProcessors) {
                            parseConfigurationProcessor.onNextParseConfiguration(parseConfiguration);
                        }
                    } catch (Exception e) {
                        // Any failure while parsing or while notifying the
                        // processors is logged; the analysis continues with
                        // the next sentence unless stopOnError is set, in
                        // which case the error is rethrown unchecked.
                        LogUtils.logError(LOG, e);
                        if (stopOnError)
                            throw new RuntimeException(e);
                    }
                    // cleared so that the parser start-module check below
                    // sees no pending pos-tag sequence
                    posTagSequence = null;
                }
                // anything left to process before reading more input?
                if (this.startModule.equals(Module.sentenceDetector) || this.startModule.equals(Module.tokeniser))
                    needToProcess = !sentences.isEmpty();
                else if (this.startModule.equals(Module.posTagger))
                    needToProcess = tokenSequence != null;
                else if (this.startModule.equals(Module.parser))
                    needToProcess = posTagSequence != null;
            }
        // back to the top of the main loop to read more input
        }
        // Check if there's any leftover output to output!
        if (rollingTextBlock.getLeftoverOriginalText().length() > 0)
            writer.append(rollingTextBlock.getLeftoverOriginalText());
    } finally {
        // Clean-up runs whether or not the analysis succeeded. Each
        // IOException is logged and remembered so that the remaining
        // resources are still closed; only the last one caught is rethrown at
        // the end. Because that throw happens inside the finally block, it
        // replaces any exception that was propagating out of the try block.
        IOException exception = null;
        try {
            // if closing the reader fails, the flush on the next line is
            // skipped
            reader.close();
            writer.flush();
        } catch (IOException e) {
            LogUtils.logError(LOG, e);
            exception = e;
        }
        for (SentenceProcessor processor : this.sentenceProcessors) try {
            processor.close();
        } catch (IOException e) {
            LogUtils.logError(LOG, e);
            exception = e;
        }
        for (TokenSequenceProcessor processor : this.tokenSequenceProcessors) try {
            processor.close();
        } catch (IOException e) {
            LogUtils.logError(LOG, e);
            exception = e;
        }
        // pos-tag sequence processors are told the analysis is complete
        // before being closed
        for (PosTagSequenceProcessor processor : this.posTagSequenceProcessors) {
            try {
                processor.onCompleteAnalysis();
                processor.close();
            } catch (IOException e) {
                LogUtils.logError(LOG, e);
                exception = e;
            }
        }
        // parse configuration processors are told the parse is complete
        // before being closed
        for (ParseConfigurationProcessor processor : this.parseConfigurationProcessors) {
            try {
                processor.onCompleteParse();
                processor.close();
            } catch (IOException e) {
                LogUtils.logError(LOG, e);
                exception = e;
            }
        }
        // elapsed wall-clock time in milliseconds, including the clean-up
        // performed so far
        long endTime = System.currentTimeMillis();
        long totalTime = endTime - startTime;
        LOG.debug("Total time for Talismane.process(): " + totalTime);
        // the writer is closed last, after all processors
        try {
            writer.close();
        } catch (IOException e) {
            LogUtils.logError(LOG, e);
            exception = e;
        }
        if (exception != null)
            throw exception;
    }
}
Also used : TokenSequenceProcessor(com.joliciel.talismane.tokeniser.output.TokenSequenceProcessor) SentenceProcessor(com.joliciel.talismane.sentenceDetector.SentenceProcessor) RawTextAnnotator(com.joliciel.talismane.rawText.RawTextAnnotator) URI(java.net.URI) NonDeterministicParser(com.joliciel.talismane.parser.NonDeterministicParser) NonDeterministicPosTagger(com.joliciel.talismane.posTagger.NonDeterministicPosTagger) Sentence(com.joliciel.talismane.rawText.Sentence) ParseConfigurationProcessor(com.joliciel.talismane.parser.output.ParseConfigurationProcessor) PosTagAnnotatedCorpusReader(com.joliciel.talismane.posTagger.PosTagAnnotatedCorpusReader) RollingTextBlock(com.joliciel.talismane.rawText.RollingTextBlock) IOException(java.io.IOException) PosTagSequenceProcessor(com.joliciel.talismane.posTagger.output.PosTagSequenceProcessor) LinkedList(java.util.LinkedList) IOException(java.io.IOException) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration) CurrentFileProvider(com.joliciel.talismane.utils.io.CurrentFileProvider) TokeniserAnnotatedCorpusReader(com.joliciel.talismane.tokeniser.TokeniserAnnotatedCorpusReader) SentenceAnnotator(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) CurrentFileObserver(com.joliciel.talismane.utils.io.CurrentFileObserver) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) File(java.io.File)

Example 2 with PosTagAnnotatedCorpusReader

use of com.joliciel.talismane.posTagger.PosTagAnnotatedCorpusReader in project talismane by joliciel-informatique.

the class TalismaneMain method execute.

/**
 * Execute Talismane based on the configuration provided.
 *
 * @param sessionId
 *          The current session's id
 * @param inFile
 *          The file or directory to analyse
 * @param outFile
 *          The file or directory to write the analysis.
 * @param outDir
 *          The directory for writing additional output files (other than the
 *          main analysis).
 * @param keepDirectoryStructure
 *          For analyse and process: if true, and inFile is a directory,
 *          outFile will be interpreted as a directory and the inFile
 *          directory structure will be maintained
 * @param evalFile
 * @throws IOException
 * @throws ReflectiveOperationException
 * @throws TalismaneException
 *           if attempt is made to start and end on two unsupported modules.
 * @throws SentenceAnnotatorLoadException
 */
public void execute(String sessionId, File inFile, File outFile, File outDir, File evalFile, boolean keepDirectoryStructure) throws IOException, ReflectiveOperationException, TalismaneException, SentenceAnnotatorLoadException {
    long startTime = System.currentTimeMillis();
    TalismaneSession session = TalismaneSession.get(sessionId);
    session.setFileForBasename(inFile);
    Config config = ConfigFactory.load();
    try {
        switch(session.getCommand()) {
            case analyse:
                {
                    Module startModule = Module.valueOf(config.getString("talismane.core." + sessionId + ".analysis.start-module"));
                    Module endModule = Module.valueOf(config.getString("talismane.core." + sessionId + ".analysis.end-module"));
                    Reader reader = getReader(inFile, true, sessionId);
                    Writer writer = getWriter(outFile, inFile, keepDirectoryStructure, reader, sessionId);
                    if (startModule == Module.languageDetector) {
                        if (endModule != Module.languageDetector)
                            throw new TalismaneException("Talismane does not currently support analysis starting with " + startModule.name() + " and ending with another module.");
                        LanguageDetector languageDetector = LanguageDetector.getInstance(sessionId);
                        LanguageDetectorProcessor processor = LanguageDetectorProcessor.getProcessor(writer, sessionId);
                        SentenceDetectorAnnotatedCorpusReader corpusReader = SentenceDetectorAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".language-detector.input"), sessionId);
                        while (corpusReader.hasNextSentence()) {
                            String sentence = corpusReader.nextSentence().getText().toString();
                            List<WeightedOutcome<Locale>> results = languageDetector.detectLanguages(sentence);
                            processor.onNextText(sentence, results);
                        }
                    } else {
                        Mode mode = Mode.valueOf(config.getString("talismane.core." + sessionId + ".mode"));
                        switch(mode) {
                            case normal:
                                Talismane talismane = new Talismane(writer, outDir, sessionId);
                                talismane.analyse(reader);
                                break;
                            case server:
                                TalismaneServer talismaneServer = new TalismaneServer(sessionId);
                                talismaneServer.analyse();
                                break;
                        }
                    }
                    break;
                }
            case train:
                {
                    Reader reader = getReader(inFile, false, sessionId);
                    switch(session.getModule()) {
                        case languageDetector:
                            {
                                LanguageDetectorTrainer trainer = new LanguageDetectorTrainer(sessionId);
                                trainer.train();
                                break;
                            }
                        case sentenceDetector:
                            {
                                SentenceDetectorTrainer trainer = new SentenceDetectorTrainer(reader, sessionId);
                                trainer.train();
                                break;
                            }
                        case tokeniser:
                            {
                                PatternTokeniserTrainer trainer = new PatternTokeniserTrainer(reader, sessionId);
                                trainer.train();
                                break;
                            }
                        case posTagger:
                            {
                                PosTaggerTrainer trainer = new PosTaggerTrainer(reader, sessionId);
                                trainer.train();
                                break;
                            }
                        case parser:
                            {
                                ParserTrainer trainer = new ParserTrainer(reader, sessionId);
                                trainer.train();
                                break;
                            }
                    }
                    break;
                }
            case evaluate:
                {
                    Reader reader = getReader(inFile, false, sessionId);
                    switch(session.getModule()) {
                        case sentenceDetector:
                            {
                                SentenceDetectorEvaluator evaluator = new SentenceDetectorEvaluator(reader, outDir, sessionId);
                                evaluator.evaluate();
                                break;
                            }
                        case tokeniser:
                            {
                                TokeniserEvaluator evaluator = new TokeniserEvaluator(reader, outDir, sessionId);
                                evaluator.evaluate();
                                break;
                            }
                        case posTagger:
                            {
                                PosTaggerEvaluator evaluator = new PosTaggerEvaluator(reader, outDir, sessionId);
                                evaluator.evaluate();
                                break;
                            }
                        case parser:
                            {
                                ParserEvaluator evaluator = new ParserEvaluator(reader, outDir, sessionId);
                                evaluator.evaluate();
                                break;
                            }
                        default:
                            throw new TalismaneException("Command '" + session.getCommand() + "' does not yet support module: " + session.getModule());
                    }
                    break;
                }
            case compare:
                {
                    Reader reader = getReader(inFile, false, sessionId);
                    Reader evalReader = getReader(evalFile, false, sessionId);
                    switch(session.getModule()) {
                        case tokeniser:
                            {
                                TokenComparator comparator = new TokenComparator(reader, evalReader, outDir, sessionId);
                                comparator.compare();
                                break;
                            }
                        case posTagger:
                            {
                                PosTagComparator comparator = new PosTagComparator(reader, evalReader, outDir, sessionId);
                                comparator.evaluate();
                                break;
                            }
                        case parser:
                            {
                                ParseComparator comparator = new ParseComparator(reader, evalReader, outDir, sessionId);
                                comparator.evaluate();
                                break;
                            }
                        default:
                            throw new TalismaneException("Command '" + session.getCommand() + "' does not yet support module: " + session.getModule());
                    }
                    break;
                }
            case process:
                {
                    Reader reader = getReader(inFile, false, sessionId);
                    Writer writer = getWriter(outFile, inFile, keepDirectoryStructure, reader, sessionId);
                    File currentFile = null;
                    URI currentURI = null;
                    IOException ioException = null;
                    switch(session.getModule()) {
                        case sentenceDetector:
                            {
                                List<SentenceProcessor> processors = SentenceProcessor.getProcessors(writer, outDir, sessionId);
                                try {
                                    SentenceDetectorAnnotatedCorpusReader corpusReader = SentenceDetectorAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".sentence-detector.input"), sessionId);
                                    while (corpusReader.hasNextSentence()) {
                                        Sentence sentence = corpusReader.nextSentence();
                                        if (sentence.getFileURI() != null && !sentence.getFileURI().equals(currentURI)) {
                                            currentURI = sentence.getFileURI();
                                            currentFile = sentence.getFile();
                                            if (writer instanceof CurrentFileObserver)
                                                ((CurrentFileObserver) writer).onNextFile(currentFile);
                                            for (SentenceProcessor processor : processors) if (processor instanceof CurrentFileObserver)
                                                ((CurrentFileObserver) processor).onNextFile(currentFile);
                                        }
                                        for (SentenceProcessor processor : processors) processor.onNextSentence(sentence);
                                    }
                                } finally {
                                    for (SentenceProcessor processor : processors) {
                                        try {
                                            processor.close();
                                        } catch (IOException e) {
                                            LogUtils.logError(LOG, e);
                                            ioException = e;
                                        }
                                    }
                                }
                                break;
                            }
                        case tokeniser:
                            {
                                List<TokenSequenceProcessor> processors = TokenSequenceProcessor.getProcessors(writer, outDir, sessionId);
                                try {
                                    TokeniserAnnotatedCorpusReader corpusReader = TokeniserAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".tokeniser.input"), sessionId);
                                    while (corpusReader.hasNextSentence()) {
                                        TokenSequence tokenSequence = corpusReader.nextTokenSequence();
                                        Sentence sentence = tokenSequence.getSentence();
                                        if (sentence.getFileURI() != null && !sentence.getFileURI().equals(currentFile)) {
                                            currentFile = sentence.getFile();
                                            if (writer instanceof CurrentFileObserver)
                                                ((CurrentFileObserver) writer).onNextFile(currentFile);
                                            for (TokenSequenceProcessor processor : processors) if (processor instanceof CurrentFileObserver)
                                                ((CurrentFileObserver) processor).onNextFile(currentFile);
                                        }
                                        for (TokenSequenceProcessor processor : processors) processor.onNextTokenSequence(tokenSequence);
                                    }
                                } finally {
                                    for (TokenSequenceProcessor processor : processors) {
                                        try {
                                            processor.close();
                                        } catch (IOException e) {
                                            LogUtils.logError(LOG, e);
                                            ioException = e;
                                        }
                                    }
                                }
                                break;
                            }
                        case posTagger:
                            {
                                List<PosTagSequenceProcessor> processors = PosTagSequenceProcessor.getProcessors(writer, outDir, sessionId);
                                try {
                                    PosTagAnnotatedCorpusReader corpusReader = PosTagAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".pos-tagger.input"), sessionId);
                                    while (corpusReader.hasNextSentence()) {
                                        PosTagSequence posTagSequence = corpusReader.nextPosTagSequence();
                                        Sentence sentence = posTagSequence.getTokenSequence().getSentence();
                                        if (sentence.getFile() != null && !sentence.getFile().equals(currentFile)) {
                                            currentFile = sentence.getFile();
                                            if (writer instanceof CurrentFileObserver)
                                                ((CurrentFileObserver) writer).onNextFile(currentFile);
                                            for (PosTagSequenceProcessor processor : processors) if (processor instanceof CurrentFileObserver)
                                                ((CurrentFileObserver) processor).onNextFile(currentFile);
                                        }
                                        for (PosTagSequenceProcessor processor : processors) processor.onNextPosTagSequence(posTagSequence);
                                    }
                                } finally {
                                    for (PosTagSequenceProcessor processor : processors) {
                                        try {
                                            processor.onCompleteAnalysis();
                                            processor.close();
                                        } catch (IOException e) {
                                            LogUtils.logError(LOG, e);
                                            ioException = e;
                                        }
                                    }
                                }
                                break;
                            }
                        case parser:
                            {
                                List<ParseConfigurationProcessor> processors = ParseConfigurationProcessor.getProcessors(writer, outDir, sessionId);
                                try {
                                    ParserAnnotatedCorpusReader corpusReader = ParserAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".parser.input"), sessionId);
                                    while (corpusReader.hasNextSentence()) {
                                        ParseConfiguration configuration = corpusReader.nextConfiguration();
                                        Sentence sentence = configuration.getSentence();
                                        if (sentence.getFile() != null && !sentence.getFile().equals(currentFile)) {
                                            currentFile = sentence.getFile();
                                            if (writer instanceof CurrentFileObserver)
                                                ((CurrentFileObserver) writer).onNextFile(currentFile);
                                            for (ParseConfigurationProcessor processor : processors) if (processor instanceof CurrentFileObserver)
                                                ((CurrentFileObserver) processor).onNextFile(currentFile);
                                        }
                                        for (ParseConfigurationProcessor processor : processors) processor.onNextParseConfiguration(configuration);
                                    }
                                } finally {
                                    for (ParseConfigurationProcessor processor : processors) {
                                        try {
                                            processor.onCompleteParse();
                                            processor.close();
                                        } catch (IOException e) {
                                            LogUtils.logError(LOG, e);
                                            ioException = e;
                                        }
                                    }
                                }
                                break;
                            }
                        default:
                            throw new TalismaneException("Command '" + session.getCommand() + "' does not yet support module: " + session.getModule());
                    }
                    if (ioException != null)
                        throw ioException;
                    break;
                }
        }
    } finally {
        long endTime = System.currentTimeMillis();
        long totalTime = endTime - startTime;
        LOG.debug("Total time for Talismane.process(): " + totalTime);
        if (config.getBoolean("talismane.core." + sessionId + ".output.log-execution-time")) {
            try {
                CSVFormatter CSV = new CSVFormatter();
                Writer csvFileWriter = null;
                File csvFile = new File(outDir, session.getBaseName() + ".stats.csv");
                csvFile.delete();
                csvFile.createNewFile();
                csvFileWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(csvFile, false), "UTF8"));
                csvFileWriter.write(CSV.format("total time") + CSV.format(totalTime) + "\n");
                csvFileWriter.flush();
                csvFileWriter.close();
            } catch (Exception e) {
                LogUtils.logError(LOG, e);
            }
        }
    }
}
Also used : Locale(java.util.Locale) SentenceDetectorAnnotatedCorpusReader(com.joliciel.talismane.sentenceDetector.SentenceDetectorAnnotatedCorpusReader) SentenceDetectorTrainer(com.joliciel.talismane.sentenceDetector.SentenceDetectorTrainer) ParserTrainer(com.joliciel.talismane.parser.ParserTrainer) PosTagComparator(com.joliciel.talismane.posTagger.evaluate.PosTagComparator) List(java.util.List) ParserAnnotatedCorpusReader(com.joliciel.talismane.parser.ParserAnnotatedCorpusReader) PosTaggerTrainer(com.joliciel.talismane.posTagger.PosTaggerTrainer) Sentence(com.joliciel.talismane.rawText.Sentence) ParserEvaluator(com.joliciel.talismane.parser.evaluate.ParserEvaluator) PosTagSequenceProcessor(com.joliciel.talismane.posTagger.output.PosTagSequenceProcessor) TokeniserAnnotatedCorpusReader(com.joliciel.talismane.tokeniser.TokeniserAnnotatedCorpusReader) PatternTokeniserTrainer(com.joliciel.talismane.tokeniser.patterns.PatternTokeniserTrainer) FileOutputStream(java.io.FileOutputStream) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) CSVFormatter(com.joliciel.talismane.utils.CSVFormatter) Module(com.joliciel.talismane.Talismane.Module) LanguageDetectorTrainer(com.joliciel.talismane.languageDetector.LanguageDetectorTrainer) ParseComparator(com.joliciel.talismane.parser.evaluate.ParseComparator) File(java.io.File) SentenceDetectorEvaluator(com.joliciel.talismane.sentenceDetector.SentenceDetectorEvaluator) TokenSequenceProcessor(com.joliciel.talismane.tokeniser.output.TokenSequenceProcessor) Config(com.typesafe.config.Config) SentenceProcessor(com.joliciel.talismane.sentenceDetector.SentenceProcessor) LexiconReader(com.joliciel.talismane.lexicon.LexiconReader) PosTagAnnotatedCorpusReader(com.joliciel.talismane.posTagger.PosTagAnnotatedCorpusReader) SingleFileReader(com.joliciel.talismane.utils.io.SingleFileReader) ParserAnnotatedCorpusReader(com.joliciel.talismane.parser.ParserAnnotatedCorpusReader) Reader(java.io.Reader) 
SentenceDetectorAnnotatedCorpusReader(com.joliciel.talismane.sentenceDetector.SentenceDetectorAnnotatedCorpusReader) TokeniserAnnotatedCorpusReader(com.joliciel.talismane.tokeniser.TokeniserAnnotatedCorpusReader) DirectoryReader(com.joliciel.talismane.utils.io.DirectoryReader) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) URI(java.net.URI) PosTaggerEvaluator(com.joliciel.talismane.posTagger.evaluate.PosTaggerEvaluator) TokenComparator(com.joliciel.talismane.tokeniser.evaluate.TokenComparator) BufferedWriter(java.io.BufferedWriter) ParseConfigurationProcessor(com.joliciel.talismane.parser.output.ParseConfigurationProcessor) PosTagAnnotatedCorpusReader(com.joliciel.talismane.posTagger.PosTagAnnotatedCorpusReader) LanguageDetectorProcessor(com.joliciel.talismane.languageDetector.LanguageDetectorProcessor) Mode(com.joliciel.talismane.Talismane.Mode) TokeniserEvaluator(com.joliciel.talismane.tokeniser.evaluate.TokeniserEvaluator) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) SentenceAnnotatorLoadException(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotatorLoadException) IOException(java.io.IOException) JoranException(ch.qos.logback.core.joran.spi.JoranException) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration) LanguageDetector(com.joliciel.talismane.languageDetector.LanguageDetector) OutputStreamWriter(java.io.OutputStreamWriter) CurrentFileObserver(com.joliciel.talismane.utils.io.CurrentFileObserver) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter) BufferedWriter(java.io.BufferedWriter) DirectoryWriter(com.joliciel.talismane.utils.io.DirectoryWriter)

Aggregations

ParseConfiguration (com.joliciel.talismane.parser.ParseConfiguration) 2, ParseConfigurationProcessor (com.joliciel.talismane.parser.output.ParseConfigurationProcessor) 2, PosTagAnnotatedCorpusReader (com.joliciel.talismane.posTagger.PosTagAnnotatedCorpusReader) 2, PosTagSequence (com.joliciel.talismane.posTagger.PosTagSequence) 2, PosTagSequenceProcessor (com.joliciel.talismane.posTagger.output.PosTagSequenceProcessor) 2, Sentence (com.joliciel.talismane.rawText.Sentence) 2, SentenceProcessor (com.joliciel.talismane.sentenceDetector.SentenceProcessor) 2, TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence) 2, TokeniserAnnotatedCorpusReader (com.joliciel.talismane.tokeniser.TokeniserAnnotatedCorpusReader) 2, TokenSequenceProcessor (com.joliciel.talismane.tokeniser.output.TokenSequenceProcessor) 2, CurrentFileObserver (com.joliciel.talismane.utils.io.CurrentFileObserver) 2, File (java.io.File) 2, JoranException (ch.qos.logback.core.joran.spi.JoranException) 1, Mode (com.joliciel.talismane.Talismane.Mode) 1, Module (com.joliciel.talismane.Talismane.Module) 1, LanguageDetector (com.joliciel.talismane.languageDetector.LanguageDetector) 1, LanguageDetectorProcessor (com.joliciel.talismane.languageDetector.LanguageDetectorProcessor) 1, LanguageDetectorTrainer (com.joliciel.talismane.languageDetector.LanguageDetectorTrainer) 1, LexiconReader (com.joliciel.talismane.lexicon.LexiconReader) 1, NonDeterministicParser (com.joliciel.talismane.parser.NonDeterministicParser) 1