Search in sources:

Example 1 with Mode

use of com.joliciel.talismane.Talismane.Mode in project talismane by joliciel-informatique.

the class TalismaneMain method execute.

/**
 * Execute by processing command line options with a given default config.
 * <p>
 * If one of the utility flags ({@code --serializeLexicon},
 * {@code --testLexicon}, {@code --serializeDiacriticizer},
 * {@code --testDiacriticizer}) is present, the whole argument array is handed
 * to the corresponding {@code main} method and nothing else happens. If no
 * arguments are given, or {@code help} is requested, usage is printed to
 * standard output. Otherwise every recognised option is translated into a
 * configuration key (mostly under {@code talismane.core.<sessionId>}), the
 * keys are published as system properties, and
 * {@link #execute(String, File, File, File, File, boolean)} is invoked.
 *
 * @param args
 *          the command-line options
 * @throws IOException
 *           if a pattern file cannot be read, or thrown by a delegate
 * @throws ReflectiveOperationException
 *           propagated from the delegated execution
 * @throws TalismaneException
 *           if a pattern file was specified but contains no line, or
 *           propagated from the delegated execution
 * @throws JoranException
 *           presumably raised while configuring logging from the log config
 *           file
 */
public void execute(String[] args) throws IOException, ReflectiveOperationException, TalismaneException, JoranException {
    if (args.length > 0) {
        // Utility commands bypass option parsing entirely: the raw arguments
        // are forwarded to the utility's own main method.
        Set<String> argSet = new HashSet<>(Arrays.asList(args));
        if (argSet.contains("--serializeLexicon")) {
            LexiconReader.main(args);
            return;
        }
        if (argSet.contains("--testLexicon")) {
            LexiconDeserializer.main(args);
            return;
        }
        if (argSet.contains("--serializeDiacriticizer")) {
            Diacriticizer.main(args);
            return;
        }
        // NOTE(review): same target as --serializeDiacriticizer; presumably
        // Diacriticizer.main distinguishes the two flags itself - confirm.
        if (argSet.contains("--testDiacriticizer")) {
            Diacriticizer.main(args);
            return;
        }
    }
    OptionSet options = parser.parse(args);
    if (args.length == 0 || options.has("help")) {
        parser.printHelpOn(System.out);
        return;
    }
    String sessionId = options.valueOf(sessionIdOption);
    // Root of every session-scoped configuration key.
    String prefix = "talismane.core." + sessionId;
    // Sub-modules which accept training settings, in configuration-key form.
    List<String> trainers = Arrays.asList("language-detector", "sentence-detector", "tokeniser", "pos-tagger", "parser");
    Map<String, Object> values = new HashMap<>();

    // Commands: if several are given, the last one checked here wins.
    if (options.has("analyse")) {
        values.put(prefix + ".command", Command.analyse.name());
    }
    if (options.has("train")) {
        values.put(prefix + ".command", Command.train.name());
    }
    if (options.has("evaluate")) {
        values.put(prefix + ".command", Command.evaluate.name());
    }
    if (options.has("compare")) {
        values.put(prefix + ".command", Command.compare.name());
    }
    if (options.has("process")) {
        values.put(prefix + ".command", Command.process.name());
    }

    if (options.has(moduleOption)) {
        values.put(prefix + ".module", options.valueOf(moduleOption).name());
    }
    if (options.has(startModuleOption)) {
        String startModule = options.valueOf(startModuleOption).name();
        values.put(prefix + ".analysis.start-module", startModule);
        values.put(prefix + ".pos-tagger.evaluate.start-module", startModule);
        values.put(prefix + ".parser.evaluate.start-module", startModule);
    }
    if (options.has(endModuleOption)) {
        values.put(prefix + ".analysis.end-module", options.valueOf(endModuleOption).name());
    }
    if (options.has(modeOption)) {
        values.put(prefix + ".mode", options.valueOf(modeOption).name());
    }
    if (options.has(portOption)) {
        values.put(prefix + ".port", options.valueOf(portOption));
    }
    if (options.has(localeOption)) {
        values.put(prefix + ".locale", options.valueOf(localeOption));
    }
    if (options.has(encodingOption)) {
        values.put(prefix + ".encoding", options.valueOf(encodingOption));
    }
    if (options.has(inputEncodingOption)) {
        values.put(prefix + ".input-encoding", options.valueOf(inputEncodingOption));
    }
    if (options.has(outputEncodingOption)) {
        values.put(prefix + ".output-encoding", options.valueOf(outputEncodingOption));
    }

    // Model locations.
    if (options.has(languageModelOption)) {
        values.put(prefix + ".language-detector.model", options.valueOf(languageModelOption).getPath());
    }
    if (options.has(sentenceModelOption)) {
        values.put(prefix + ".sentence-detector.model", options.valueOf(sentenceModelOption).getPath());
    }
    if (options.has(tokeniserModelOption)) {
        values.put(prefix + ".tokeniser.model", options.valueOf(tokeniserModelOption).getPath());
    }
    if (options.has(posTaggerModelOption)) {
        values.put(prefix + ".pos-tagger.model", options.valueOf(posTaggerModelOption).getPath());
    }
    if (options.has(parserModelOption)) {
        values.put(prefix + ".parser.model", options.valueOf(parserModelOption).getPath());
    }

    // Lexicons and annotators.
    if (options.has(lexiconOption)) {
        List<String> lexiconPaths = options.valuesOf(lexiconOption).stream().map(f -> f.getPath()).collect(Collectors.toList());
        values.put(prefix + ".lexicons", lexiconPaths);
    }
    if (options.has(textAnnotatorsOption)) {
        List<String> textAnnotatorPaths = options.valuesOf(textAnnotatorsOption).stream().map(f -> f.getPath()).collect(Collectors.toList());
        values.put(prefix + ".annotators.text-annotators", textAnnotatorPaths);
    }
    if (options.has(sentenceAnnotatorsOption)) {
        List<String> sentenceAnnotatorPaths = options.valuesOf(sentenceAnnotatorsOption).stream().map(f -> f.getPath()).collect(Collectors.toList());
        values.put(prefix + ".annotators.sentence-annotators", sentenceAnnotatorPaths);
    }

    // Options below that concern "any input" or "any output" are replicated
    // under each of these configuration locations.
    List<String> inputLocations = Arrays.asList(
        prefix + ".input",
        prefix + ".language-detector.input",
        prefix + ".language-detector.train",
        prefix + ".language-detector.evaluate",
        prefix + ".sentence-detector.input",
        prefix + ".sentence-detector.train",
        prefix + ".sentence-detector.evaluate",
        prefix + ".tokeniser.input",
        prefix + ".tokeniser.train",
        prefix + ".tokeniser.evaluate",
        prefix + ".pos-tagger.input",
        prefix + ".pos-tagger.train",
        prefix + ".pos-tagger.evaluate",
        prefix + ".parser.input",
        prefix + ".parser.train",
        prefix + ".parser.evaluate");
    List<String> outputLocations = Arrays.asList(
        prefix + ".output",
        prefix + ".language-detector.output",
        prefix + ".sentence-detector.output",
        prefix + ".tokeniser.output",
        prefix + ".pos-tagger.output",
        prefix + ".parser.output");

    if (options.has(newlineOption)) {
        values.put(prefix + ".newline", options.valueOf(newlineOption));
    }
    if (options.has(processByDefaultOption)) {
        values.put(prefix + ".analysis.process-by-default", options.valueOf(processByDefaultOption));
    }
    if (options.has(blockSizeOption)) {
        values.put(prefix + ".block-size", options.valueOf(blockSizeOption));
    }
    if (options.has(sentenceCountOption)) {
        for (String inputLocation : inputLocations) {
            values.put(inputLocation + ".sentence-count", options.valueOf(sentenceCountOption));
        }
    }
    if (options.has(startSentenceOption)) {
        for (String inputLocation : inputLocations) {
            values.put(inputLocation + ".start-sentence", options.valueOf(startSentenceOption));
        }
    }
    if (options.has(crossValidationSizeOption)) {
        for (String inputLocation : inputLocations) {
            values.put(inputLocation + ".cross-validation.fold-count", options.valueOf(crossValidationSizeOption));
        }
    }
    if (options.has(includeIndexOption)) {
        for (String inputLocation : inputLocations) {
            values.put(inputLocation + ".cross-validation.include-index", options.valueOf(includeIndexOption));
        }
    }
    if (options.has(excludeIndexOption)) {
        for (String inputLocation : inputLocations) {
            values.put(inputLocation + ".cross-validation.exclude-index", options.valueOf(excludeIndexOption));
        }
    }
    if (options.has(builtInTemplateOption)) {
        String builtInTemplate = options.valueOf(builtInTemplateOption).name();
        for (String outputLocation : outputLocations) {
            values.put(outputLocation + ".built-in-template", builtInTemplate);
        }
    }
    if (options.has(templateOption)) {
        String templatePath = options.valueOf(templateOption).getPath();
        for (String outputLocation : outputLocations) {
            values.put(outputLocation + ".template", templatePath);
        }
    }
    if (options.has(posTaggerRulesOption)) {
        List<String> posTaggerRulePaths = options.valuesOf(posTaggerRulesOption).stream().map(f -> f.getPath()).collect(Collectors.toList());
        values.put(prefix + ".pos-tagger.rules", posTaggerRulePaths);
    }
    if (options.has(parserRulesOption)) {
        List<String> parserRulePaths = options.valuesOf(parserRulesOption).stream().map(f -> f.getPath()).collect(Collectors.toList());
        values.put(prefix + ".parser.rules", parserRulePaths);
    }
    if (options.has(suffixOption)) {
        values.put(prefix + ".suffix", options.valueOf(suffixOption));
    }
    if (options.has(outputDividerOption)) {
        for (String outputLocation : outputLocations) {
            values.put(outputLocation + ".output-divider", options.valueOf(outputDividerOption));
        }
    }

    // Beam search and parser resource limits.
    if (options.has(beamWidthOption)) {
        values.put(prefix + ".pos-tagger.beam-width", options.valueOf(beamWidthOption));
        values.put(prefix + ".parser.beam-width", options.valueOf(beamWidthOption));
    }
    if (options.has(tokeniserBeamWidthOption)) {
        values.put(prefix + ".tokeniser.beam-width", options.valueOf(tokeniserBeamWidthOption));
    }
    if (options.has(propagateBeamOption)) {
        values.put(prefix + ".parser.propagate-pos-tagger-beam", options.valueOf(propagateBeamOption));
    }
    if (options.has(maxParseAnalysisTimeOption)) {
        values.put(prefix + ".parser.max-analysis-time", options.valueOf(maxParseAnalysisTimeOption));
    }
    if (options.has(minFreeMemoryOption)) {
        values.put(prefix + ".parser.min-free-memory", options.valueOf(minFreeMemoryOption));
    }
    if (options.has(earlyStopOption)) {
        values.put(prefix + ".parser.early-stop", options.valueOf(earlyStopOption));
    }

    // Input pattern: a pattern file takes precedence over an inline pattern.
    if (options.has(inputPatternFileOption) || options.has(inputPatternOption)) {
        String inputRegex;
        if (options.has(inputPatternFileOption)) {
            File inputPatternFile = options.valueOf(inputPatternFileOption);
            inputRegex = readFirstPatternLine(inputPatternFile);
            if (inputRegex == null) {
                throw new TalismaneException("No input pattern found in " + inputPatternFile.getPath());
            }
        } else {
            inputRegex = options.valueOf(inputPatternOption);
        }
        for (String inputLocation : inputLocations) {
            values.put(inputLocation + ".input-pattern", inputRegex);
        }
    }
    // Evaluation pattern: applied after the input pattern so that it
    // overrides it for the evaluate locations.
    if (options.has(evalPatternFileOption) || options.has(evalPatternOption)) {
        String evalRegex;
        if (options.has(evalPatternFileOption)) {
            File evalPatternFile = options.valueOf(evalPatternFileOption);
            evalRegex = readFirstPatternLine(evalPatternFile);
            if (evalRegex == null) {
                throw new TalismaneException("No eval pattern found in " + evalPatternFile.getPath());
            }
        } else {
            evalRegex = options.valueOf(evalPatternOption);
        }
        values.put(prefix + ".sentence-detector.evaluate.input-pattern", evalRegex);
        values.put(prefix + ".tokeniser.evaluate.input-pattern", evalRegex);
        values.put(prefix + ".pos-tagger.evaluate.input-pattern", evalRegex);
        values.put(prefix + ".parser.evaluate.input-pattern", evalRegex);
    }

    // CSV output and evaluation details.
    if (options.has(csvSeparatorOption)) {
        values.put(prefix + ".csv.separator", options.valueOf(csvSeparatorOption));
    }
    if (options.has(csvEncodingOption)) {
        values.put(prefix + ".csv.encoding", options.valueOf(csvEncodingOption));
    }
    if (options.has(csvLocaleOption)) {
        values.put(prefix + ".csv.locale", options.valueOf(csvLocaleOption));
    }
    if (options.has(includeUnknownWordResultsOption)) {
        values.put(prefix + ".pos-tagger.evaluate.include-unknown-word-results", options.valueOf(includeUnknownWordResultsOption));
    }
    if (options.has(includeLexiconCoverageOption)) {
        values.put(prefix + ".pos-tagger.evaluate.include-lexicon-coverage", options.valueOf(includeLexiconCoverageOption));
    }
    if (options.has(labeledEvaluationOption)) {
        values.put(prefix + ".parser.evaluate.labeled-evaluation", options.valueOf(labeledEvaluationOption));
    }
    if (options.has(processingOption)) {
        values.put(prefix + ".output.option", options.valueOf(processingOption).name());
    }
    if (options.has(lexicalEntryRegexOption)) {
        String lexicalEntryRegexPath = options.valueOf(lexicalEntryRegexOption).getPath();
        values.put(prefix + ".pos-tagger.input.corpus-lexical-entry-regex", lexicalEntryRegexPath);
        values.put(prefix + ".parser.input.corpus-lexical-entry-regex", lexicalEntryRegexPath);
    }

    // Training resources.
    if (options.has(featuresOption)) {
        String featuresPath = options.valueOf(featuresOption).getPath();
        for (String trainer : trainers) {
            values.put(prefix + "." + trainer + ".train.features", featuresPath);
        }
    }
    if (options.has(tokeniserPatternsOption)) {
        values.put(prefix + ".tokeniser.train.patterns", options.valueOf(tokeniserPatternsOption).getPath());
    }
    if (options.has(sentenceFileOption)) {
        String sentenceFilePath = options.valueOf(sentenceFileOption).getPath();
        values.put(prefix + ".tokeniser.input.sentence-file", sentenceFilePath);
        values.put(prefix + ".pos-tagger.input.sentence-file", sentenceFilePath);
        values.put(prefix + ".parser.input.sentence-file", sentenceFilePath);
    }
    if (options.has(languageCorpusMapOption)) {
        values.put(prefix + ".language-detector.train.language-corpus-map", options.valueOf(languageCorpusMapOption).getPath());
    }
    if (options.has(predictTransitionsOption)) {
        values.put(prefix + ".parser.input.predict-transitions", options.valueOf(predictTransitionsOption));
    }
    if (options.has(testWordsOption)) {
        values.put(prefix + ".pos-tagger.output.test-words", options.valuesOf(testWordsOption));
    }

    // Machine-learning settings are written both to the global key and to
    // the training section of every sub-module.
    if (options.has(algorithmOption)) {
        String algorithm = options.valueOf(algorithmOption).name();
        values.put("talismane.machine-learning.algorithm", algorithm);
        for (String trainer : trainers) {
            values.put(prefix + "." + trainer + ".train.machine-learning.algorithm", algorithm);
        }
    }
    if (options.has(cutoffOption)) {
        values.put("talismane.machine-learning.cutoff", options.valueOf(cutoffOption));
        for (String trainer : trainers) {
            values.put(prefix + "." + trainer + ".train.machine-learning.cutoff", options.valueOf(cutoffOption));
        }
    }
    if (options.has(linearSVMEpsilonOption)) {
        values.put("talismane.machine-learning.LinearSVM.epsilon", options.valueOf(linearSVMEpsilonOption));
        for (String trainer : trainers) {
            values.put(prefix + "." + trainer + ".train.machine-learning.LinearSVM.epsilon", options.valueOf(linearSVMEpsilonOption));
        }
    }
    if (options.has(linearSVMCostOption)) {
        values.put("talismane.machine-learning.LinearSVM.cost", options.valueOf(linearSVMCostOption));
        for (String trainer : trainers) {
            values.put(prefix + "." + trainer + ".train.machine-learning.LinearSVM.cost", options.valueOf(linearSVMCostOption));
        }
    }
    if (options.has(oneVsRestOption)) {
        values.put("talismane.machine-learning.LinearSVM.one-vs-rest", options.valueOf(oneVsRestOption));
        for (String trainer : trainers) {
            values.put(prefix + "." + trainer + ".train.machine-learning.LinearSVM.one-vs-rest", options.valueOf(oneVsRestOption));
        }
    }
    if (options.has(iterationsOption)) {
        values.put("talismane.machine-learning.iterations", options.valueOf(iterationsOption));
        for (String trainer : trainers) {
            values.put(prefix + "." + trainer + ".train.machine-learning.iterations", options.valueOf(iterationsOption));
        }
    }

    if (options.has(logConfigFileSpec)) {
        LogUtils.configureLogging(options.valueOf(logConfigFileSpec));
    }

    File inFile = null;
    File outFile = null;
    File outDir = null;
    if (options.has(inFileOption)) {
        inFile = options.valueOf(inFileOption);
    }
    if (options.has(outFileOption)) {
        outFile = options.valueOf(outFileOption);
    }
    if (options.has(outDirOption)) {
        outDir = options.valueOf(outDirOption);
    }
    // The evaluation file defaults to the input file.
    File evalFile = inFile;
    if (options.has(evalFileOption)) {
        evalFile = options.valueOf(evalFileOption);
    }
    // By default, an output name without any '.' is taken to be a directory,
    // in which case the input directory structure is kept.
    boolean keepDirectoryStructure = outFile != null && !outFile.getName().contains(".");
    if (options.has(keepDirStructureOption)) {
        keepDirectoryStructure = options.valueOf(keepDirStructureOption);
    }
    // System properties override configuration file keys when ConfigFactory.load() is called.
    values.forEach((k, v) -> System.setProperty(k, v.toString()));
    ConfigFactory.invalidateCaches();
    this.execute(sessionId, inFile, outFile, outDir, evalFile, keepDirectoryStructure);
}

/**
 * Read the first line of a UTF-8 encoded pattern file. The underlying file
 * stream is opened as a managed resource, so it is closed even if building
 * the reader on top of it fails.
 *
 * @param patternFile
 *          the file containing the pattern on its first line
 * @return the first line of the file, or null if the file contains no line
 * @throws IOException
 *           if the file cannot be opened
 */
private static String readFirstPatternLine(File patternFile) throws IOException {
    try (InputStream patternStream = new FileInputStream(patternFile);
        Scanner patternScanner = new Scanner(new BufferedReader(new InputStreamReader(patternStream, "UTF-8")))) {
        return patternScanner.hasNextLine() ? patternScanner.nextLine() : null;
    }
}
Also used : Arrays(java.util.Arrays) LanguageDetectorTrainer(com.joliciel.talismane.languageDetector.LanguageDetectorTrainer) LoggerFactory(org.slf4j.LoggerFactory) PosTaggerTrainer(com.joliciel.talismane.posTagger.PosTaggerTrainer) Scanner(java.util.Scanner) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) CSVFormatter(com.joliciel.talismane.utils.CSVFormatter) Module(com.joliciel.talismane.Talismane.Module) Locale(java.util.Locale) Map(java.util.Map) LexiconReader(com.joliciel.talismane.lexicon.LexiconReader) URI(java.net.URI) LexiconDeserializer(com.joliciel.talismane.lexicon.LexiconDeserializer) Diacriticizer(com.joliciel.talismane.lexicon.Diacriticizer) PosTagAnnotatedCorpusReader(com.joliciel.talismane.posTagger.PosTagAnnotatedCorpusReader) TokenSequenceProcessor(com.joliciel.talismane.tokeniser.output.TokenSequenceProcessor) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) SentenceProcessor(com.joliciel.talismane.sentenceDetector.SentenceProcessor) SingleFileReader(com.joliciel.talismane.utils.io.SingleFileReader) Set(java.util.Set) ParserAnnotatedCorpusReader(com.joliciel.talismane.parser.ParserAnnotatedCorpusReader) SentenceDetectorEvaluator(com.joliciel.talismane.sentenceDetector.SentenceDetectorEvaluator) Reader(java.io.Reader) LanguageDetector(com.joliciel.talismane.languageDetector.LanguageDetector) Collectors(java.util.stream.Collectors) Mode(com.joliciel.talismane.Talismane.Mode) FileNotFoundException(java.io.FileNotFoundException) CurrentFileObserver(com.joliciel.talismane.utils.io.CurrentFileObserver) List(java.util.List) SentenceDetectorAnnotatedCorpusReader(com.joliciel.talismane.sentenceDetector.SentenceDetectorAnnotatedCorpusReader) ProcessingOption(com.joliciel.talismane.Talismane.ProcessingOption) PosTagSequenceProcessor(com.joliciel.talismane.posTagger.output.PosTagSequenceProcessor) ParseConfigurationProcessor(com.joliciel.talismane.parser.output.ParseConfigurationProcessor) 
SentenceDetectorTrainer(com.joliciel.talismane.sentenceDetector.SentenceDetectorTrainer) Writer(java.io.Writer) SentenceAnnotatorLoadException(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotatorLoadException) TokenComparator(com.joliciel.talismane.tokeniser.evaluate.TokenComparator) Sentence(com.joliciel.talismane.rawText.Sentence) MachineLearningAlgorithm(com.joliciel.talismane.machineLearning.MachineLearningAlgorithm) PatternTokeniserTrainer(com.joliciel.talismane.tokeniser.patterns.PatternTokeniserTrainer) PredictTransitions(com.joliciel.talismane.parser.Parser.PredictTransitions) PosTagComparator(com.joliciel.talismane.posTagger.evaluate.PosTagComparator) TokeniserAnnotatedCorpusReader(com.joliciel.talismane.tokeniser.TokeniserAnnotatedCorpusReader) WeightedOutcome(com.joliciel.talismane.utils.WeightedOutcome) HashMap(java.util.HashMap) ParserTrainer(com.joliciel.talismane.parser.ParserTrainer) HashSet(java.util.HashSet) LogUtils(com.joliciel.talismane.utils.LogUtils) joptsimple(joptsimple) OutputStreamWriter(java.io.OutputStreamWriter) ConfigFactory(com.typesafe.config.ConfigFactory) TokeniserEvaluator(com.joliciel.talismane.tokeniser.evaluate.TokeniserEvaluator) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration) Logger(org.slf4j.Logger) Config(com.typesafe.config.Config) BufferedWriter(java.io.BufferedWriter) DirectoryReader(com.joliciel.talismane.utils.io.DirectoryReader) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) BuiltInTemplate(com.joliciel.talismane.Talismane.BuiltInTemplate) FileInputStream(java.io.FileInputStream) InputStreamReader(java.io.InputStreamReader) File(java.io.File) LanguageDetectorProcessor(com.joliciel.talismane.languageDetector.LanguageDetectorProcessor) PosTaggerEvaluator(com.joliciel.talismane.posTagger.evaluate.PosTaggerEvaluator) ParserEvaluator(com.joliciel.talismane.parser.evaluate.ParserEvaluator) JoranException(ch.qos.logback.core.joran.spi.JoranException) 
DirectoryWriter(com.joliciel.talismane.utils.io.DirectoryWriter) BufferedReader(java.io.BufferedReader) Command(com.joliciel.talismane.Talismane.Command) InputStream(java.io.InputStream) ParseComparator(com.joliciel.talismane.parser.evaluate.ParseComparator) Scanner(java.util.Scanner) InputStreamReader(java.io.InputStreamReader) HashMap(java.util.HashMap) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) FileInputStream(java.io.FileInputStream) BufferedReader(java.io.BufferedReader) File(java.io.File) HashSet(java.util.HashSet)

Example 2 with Mode

use of com.joliciel.talismane.Talismane.Mode in project talismane by joliciel-informatique.

the class TalismaneMain method execute.

/**
 * Execute Talismane based on the configuration provided.
 *
 * @param sessionId
 *          The current session's id
 * @param inFile
 *          The file or directory to analyse
 * @param outFile
 *          The file or directory to write the analysis.
 * @param outDir
 *          The directory for writing additional output files (other than the
 *          main analysis).
 * @param keepDirectoryStructure
 *          For analyse and process: if true, and inFile is a directory,
 *          outFile will be interpreted as a directory and the inFile
 *          directory struture will be maintained
 * @param evalFile
 * @throws IOException
 * @throws ReflectiveOperationException
 * @throws TalismaneException
 *           if attempt is made to start and end on two unsupported modules.
 * @throws SentenceAnnotatorLoadException
 */
public void execute(String sessionId, File inFile, File outFile, File outDir, File evalFile, boolean keepDirectoryStructure) throws IOException, ReflectiveOperationException, TalismaneException, SentenceAnnotatorLoadException {
    long startTime = System.currentTimeMillis();
    TalismaneSession session = TalismaneSession.get(sessionId);
    session.setFileForBasename(inFile);
    Config config = ConfigFactory.load();
    try {
        switch(session.getCommand()) {
            case analyse:
                {
                    Module startModule = Module.valueOf(config.getString("talismane.core." + sessionId + ".analysis.start-module"));
                    Module endModule = Module.valueOf(config.getString("talismane.core." + sessionId + ".analysis.end-module"));
                    Reader reader = getReader(inFile, true, sessionId);
                    Writer writer = getWriter(outFile, inFile, keepDirectoryStructure, reader, sessionId);
                    if (startModule == Module.languageDetector) {
                        if (endModule != Module.languageDetector)
                            throw new TalismaneException("Talismane does not currently support analysis starting with " + startModule.name() + " and ending with another module.");
                        LanguageDetector languageDetector = LanguageDetector.getInstance(sessionId);
                        LanguageDetectorProcessor processor = LanguageDetectorProcessor.getProcessor(writer, sessionId);
                        SentenceDetectorAnnotatedCorpusReader corpusReader = SentenceDetectorAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".language-detector.input"), sessionId);
                        while (corpusReader.hasNextSentence()) {
                            String sentence = corpusReader.nextSentence().getText().toString();
                            List<WeightedOutcome<Locale>> results = languageDetector.detectLanguages(sentence);
                            processor.onNextText(sentence, results);
                        }
                    } else {
                        Mode mode = Mode.valueOf(config.getString("talismane.core." + sessionId + ".mode"));
                        switch(mode) {
                            case normal:
                                Talismane talismane = new Talismane(writer, outDir, sessionId);
                                talismane.analyse(reader);
                                break;
                            case server:
                                TalismaneServer talismaneServer = new TalismaneServer(sessionId);
                                talismaneServer.analyse();
                                break;
                        }
                    }
                    break;
                }
            case train:
                {
                    Reader reader = getReader(inFile, false, sessionId);
                    switch(session.getModule()) {
                        case languageDetector:
                            {
                                LanguageDetectorTrainer trainer = new LanguageDetectorTrainer(sessionId);
                                trainer.train();
                                break;
                            }
                        case sentenceDetector:
                            {
                                SentenceDetectorTrainer trainer = new SentenceDetectorTrainer(reader, sessionId);
                                trainer.train();
                                break;
                            }
                        case tokeniser:
                            {
                                PatternTokeniserTrainer trainer = new PatternTokeniserTrainer(reader, sessionId);
                                trainer.train();
                                break;
                            }
                        case posTagger:
                            {
                                PosTaggerTrainer trainer = new PosTaggerTrainer(reader, sessionId);
                                trainer.train();
                                break;
                            }
                        case parser:
                            {
                                ParserTrainer trainer = new ParserTrainer(reader, sessionId);
                                trainer.train();
                                break;
                            }
                    }
                    break;
                }
            case evaluate:
                {
                    Reader reader = getReader(inFile, false, sessionId);
                    switch(session.getModule()) {
                        case sentenceDetector:
                            {
                                SentenceDetectorEvaluator evaluator = new SentenceDetectorEvaluator(reader, outDir, sessionId);
                                evaluator.evaluate();
                                break;
                            }
                        case tokeniser:
                            {
                                TokeniserEvaluator evaluator = new TokeniserEvaluator(reader, outDir, sessionId);
                                evaluator.evaluate();
                                break;
                            }
                        case posTagger:
                            {
                                PosTaggerEvaluator evaluator = new PosTaggerEvaluator(reader, outDir, sessionId);
                                evaluator.evaluate();
                                break;
                            }
                        case parser:
                            {
                                ParserEvaluator evaluator = new ParserEvaluator(reader, outDir, sessionId);
                                evaluator.evaluate();
                                break;
                            }
                        default:
                            throw new TalismaneException("Command '" + session.getCommand() + "' does not yet support module: " + session.getModule());
                    }
                    break;
                }
            case compare:
                {
                    Reader reader = getReader(inFile, false, sessionId);
                    Reader evalReader = getReader(evalFile, false, sessionId);
                    switch(session.getModule()) {
                        case tokeniser:
                            {
                                TokenComparator comparator = new TokenComparator(reader, evalReader, outDir, sessionId);
                                comparator.compare();
                                break;
                            }
                        case posTagger:
                            {
                                PosTagComparator comparator = new PosTagComparator(reader, evalReader, outDir, sessionId);
                                comparator.evaluate();
                                break;
                            }
                        case parser:
                            {
                                ParseComparator comparator = new ParseComparator(reader, evalReader, outDir, sessionId);
                                comparator.evaluate();
                                break;
                            }
                        default:
                            throw new TalismaneException("Command '" + session.getCommand() + "' does not yet support module: " + session.getModule());
                    }
                    break;
                }
            case process:
                {
                    Reader reader = getReader(inFile, false, sessionId);
                    Writer writer = getWriter(outFile, inFile, keepDirectoryStructure, reader, sessionId);
                    File currentFile = null;
                    URI currentURI = null;
                    IOException ioException = null;
                    switch(session.getModule()) {
                        case sentenceDetector:
                            {
                                List<SentenceProcessor> processors = SentenceProcessor.getProcessors(writer, outDir, sessionId);
                                try {
                                    SentenceDetectorAnnotatedCorpusReader corpusReader = SentenceDetectorAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".sentence-detector.input"), sessionId);
                                    while (corpusReader.hasNextSentence()) {
                                        Sentence sentence = corpusReader.nextSentence();
                                        if (sentence.getFileURI() != null && !sentence.getFileURI().equals(currentURI)) {
                                            currentURI = sentence.getFileURI();
                                            currentFile = sentence.getFile();
                                            if (writer instanceof CurrentFileObserver)
                                                ((CurrentFileObserver) writer).onNextFile(currentFile);
                                            for (SentenceProcessor processor : processors) if (processor instanceof CurrentFileObserver)
                                                ((CurrentFileObserver) processor).onNextFile(currentFile);
                                        }
                                        for (SentenceProcessor processor : processors) processor.onNextSentence(sentence);
                                    }
                                } finally {
                                    for (SentenceProcessor processor : processors) {
                                        try {
                                            processor.close();
                                        } catch (IOException e) {
                                            LogUtils.logError(LOG, e);
                                            ioException = e;
                                        }
                                    }
                                }
                                break;
                            }
                        case tokeniser:
                            {
                                List<TokenSequenceProcessor> processors = TokenSequenceProcessor.getProcessors(writer, outDir, sessionId);
                                try {
                                    TokeniserAnnotatedCorpusReader corpusReader = TokeniserAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".tokeniser.input"), sessionId);
                                    while (corpusReader.hasNextSentence()) {
                                        TokenSequence tokenSequence = corpusReader.nextTokenSequence();
                                        Sentence sentence = tokenSequence.getSentence();
                                        if (sentence.getFileURI() != null && !sentence.getFileURI().equals(currentFile)) {
                                            currentFile = sentence.getFile();
                                            if (writer instanceof CurrentFileObserver)
                                                ((CurrentFileObserver) writer).onNextFile(currentFile);
                                            for (TokenSequenceProcessor processor : processors) if (processor instanceof CurrentFileObserver)
                                                ((CurrentFileObserver) processor).onNextFile(currentFile);
                                        }
                                        for (TokenSequenceProcessor processor : processors) processor.onNextTokenSequence(tokenSequence);
                                    }
                                } finally {
                                    for (TokenSequenceProcessor processor : processors) {
                                        try {
                                            processor.close();
                                        } catch (IOException e) {
                                            LogUtils.logError(LOG, e);
                                            ioException = e;
                                        }
                                    }
                                }
                                break;
                            }
                        case posTagger:
                            {
                                List<PosTagSequenceProcessor> processors = PosTagSequenceProcessor.getProcessors(writer, outDir, sessionId);
                                try {
                                    PosTagAnnotatedCorpusReader corpusReader = PosTagAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".pos-tagger.input"), sessionId);
                                    while (corpusReader.hasNextSentence()) {
                                        PosTagSequence posTagSequence = corpusReader.nextPosTagSequence();
                                        Sentence sentence = posTagSequence.getTokenSequence().getSentence();
                                        if (sentence.getFile() != null && !sentence.getFile().equals(currentFile)) {
                                            currentFile = sentence.getFile();
                                            if (writer instanceof CurrentFileObserver)
                                                ((CurrentFileObserver) writer).onNextFile(currentFile);
                                            for (PosTagSequenceProcessor processor : processors) if (processor instanceof CurrentFileObserver)
                                                ((CurrentFileObserver) processor).onNextFile(currentFile);
                                        }
                                        for (PosTagSequenceProcessor processor : processors) processor.onNextPosTagSequence(posTagSequence);
                                    }
                                } finally {
                                    for (PosTagSequenceProcessor processor : processors) {
                                        try {
                                            processor.onCompleteAnalysis();
                                            processor.close();
                                        } catch (IOException e) {
                                            LogUtils.logError(LOG, e);
                                            ioException = e;
                                        }
                                    }
                                }
                                break;
                            }
                        case parser:
                            {
                                List<ParseConfigurationProcessor> processors = ParseConfigurationProcessor.getProcessors(writer, outDir, sessionId);
                                try {
                                    ParserAnnotatedCorpusReader corpusReader = ParserAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".parser.input"), sessionId);
                                    while (corpusReader.hasNextSentence()) {
                                        ParseConfiguration configuration = corpusReader.nextConfiguration();
                                        Sentence sentence = configuration.getSentence();
                                        if (sentence.getFile() != null && !sentence.getFile().equals(currentFile)) {
                                            currentFile = sentence.getFile();
                                            if (writer instanceof CurrentFileObserver)
                                                ((CurrentFileObserver) writer).onNextFile(currentFile);
                                            for (ParseConfigurationProcessor processor : processors) if (processor instanceof CurrentFileObserver)
                                                ((CurrentFileObserver) processor).onNextFile(currentFile);
                                        }
                                        for (ParseConfigurationProcessor processor : processors) processor.onNextParseConfiguration(configuration);
                                    }
                                } finally {
                                    for (ParseConfigurationProcessor processor : processors) {
                                        try {
                                            processor.onCompleteParse();
                                            processor.close();
                                        } catch (IOException e) {
                                            LogUtils.logError(LOG, e);
                                            ioException = e;
                                        }
                                    }
                                }
                                break;
                            }
                        default:
                            throw new TalismaneException("Command '" + session.getCommand() + "' does not yet support module: " + session.getModule());
                    }
                    if (ioException != null)
                        throw ioException;
                    break;
                }
        }
    } finally {
        long endTime = System.currentTimeMillis();
        long totalTime = endTime - startTime;
        LOG.debug("Total time for Talismane.process(): " + totalTime);
        if (config.getBoolean("talismane.core." + sessionId + ".output.log-execution-time")) {
            try {
                CSVFormatter CSV = new CSVFormatter();
                Writer csvFileWriter = null;
                File csvFile = new File(outDir, session.getBaseName() + ".stats.csv");
                csvFile.delete();
                csvFile.createNewFile();
                csvFileWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(csvFile, false), "UTF8"));
                csvFileWriter.write(CSV.format("total time") + CSV.format(totalTime) + "\n");
                csvFileWriter.flush();
                csvFileWriter.close();
            } catch (Exception e) {
                LogUtils.logError(LOG, e);
            }
        }
    }
}
Also used : Locale(java.util.Locale) SentenceDetectorAnnotatedCorpusReader(com.joliciel.talismane.sentenceDetector.SentenceDetectorAnnotatedCorpusReader) SentenceDetectorTrainer(com.joliciel.talismane.sentenceDetector.SentenceDetectorTrainer) ParserTrainer(com.joliciel.talismane.parser.ParserTrainer) PosTagComparator(com.joliciel.talismane.posTagger.evaluate.PosTagComparator) List(java.util.List) ParserAnnotatedCorpusReader(com.joliciel.talismane.parser.ParserAnnotatedCorpusReader) PosTaggerTrainer(com.joliciel.talismane.posTagger.PosTaggerTrainer) Sentence(com.joliciel.talismane.rawText.Sentence) ParserEvaluator(com.joliciel.talismane.parser.evaluate.ParserEvaluator) PosTagSequenceProcessor(com.joliciel.talismane.posTagger.output.PosTagSequenceProcessor) TokeniserAnnotatedCorpusReader(com.joliciel.talismane.tokeniser.TokeniserAnnotatedCorpusReader) PatternTokeniserTrainer(com.joliciel.talismane.tokeniser.patterns.PatternTokeniserTrainer) FileOutputStream(java.io.FileOutputStream) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) CSVFormatter(com.joliciel.talismane.utils.CSVFormatter) Module(com.joliciel.talismane.Talismane.Module) LanguageDetectorTrainer(com.joliciel.talismane.languageDetector.LanguageDetectorTrainer) ParseComparator(com.joliciel.talismane.parser.evaluate.ParseComparator) File(java.io.File) SentenceDetectorEvaluator(com.joliciel.talismane.sentenceDetector.SentenceDetectorEvaluator) TokenSequenceProcessor(com.joliciel.talismane.tokeniser.output.TokenSequenceProcessor) Config(com.typesafe.config.Config) SentenceProcessor(com.joliciel.talismane.sentenceDetector.SentenceProcessor) LexiconReader(com.joliciel.talismane.lexicon.LexiconReader) PosTagAnnotatedCorpusReader(com.joliciel.talismane.posTagger.PosTagAnnotatedCorpusReader) SingleFileReader(com.joliciel.talismane.utils.io.SingleFileReader) ParserAnnotatedCorpusReader(com.joliciel.talismane.parser.ParserAnnotatedCorpusReader) Reader(java.io.Reader) 
SentenceDetectorAnnotatedCorpusReader(com.joliciel.talismane.sentenceDetector.SentenceDetectorAnnotatedCorpusReader) TokeniserAnnotatedCorpusReader(com.joliciel.talismane.tokeniser.TokeniserAnnotatedCorpusReader) DirectoryReader(com.joliciel.talismane.utils.io.DirectoryReader) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) URI(java.net.URI) PosTaggerEvaluator(com.joliciel.talismane.posTagger.evaluate.PosTaggerEvaluator) TokenComparator(com.joliciel.talismane.tokeniser.evaluate.TokenComparator) BufferedWriter(java.io.BufferedWriter) ParseConfigurationProcessor(com.joliciel.talismane.parser.output.ParseConfigurationProcessor) PosTagAnnotatedCorpusReader(com.joliciel.talismane.posTagger.PosTagAnnotatedCorpusReader) LanguageDetectorProcessor(com.joliciel.talismane.languageDetector.LanguageDetectorProcessor) Mode(com.joliciel.talismane.Talismane.Mode) TokeniserEvaluator(com.joliciel.talismane.tokeniser.evaluate.TokeniserEvaluator) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) SentenceAnnotatorLoadException(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotatorLoadException) IOException(java.io.IOException) JoranException(ch.qos.logback.core.joran.spi.JoranException) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration) LanguageDetector(com.joliciel.talismane.languageDetector.LanguageDetector) OutputStreamWriter(java.io.OutputStreamWriter) CurrentFileObserver(com.joliciel.talismane.utils.io.CurrentFileObserver) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter) BufferedWriter(java.io.BufferedWriter) DirectoryWriter(com.joliciel.talismane.utils.io.DirectoryWriter)

Aggregations

JoranException (ch.qos.logback.core.joran.spi.JoranException)2 Mode (com.joliciel.talismane.Talismane.Mode)2 Module (com.joliciel.talismane.Talismane.Module)2 LanguageDetector (com.joliciel.talismane.languageDetector.LanguageDetector)2 LanguageDetectorProcessor (com.joliciel.talismane.languageDetector.LanguageDetectorProcessor)2 LanguageDetectorTrainer (com.joliciel.talismane.languageDetector.LanguageDetectorTrainer)2 LexiconReader (com.joliciel.talismane.lexicon.LexiconReader)2 ParseConfiguration (com.joliciel.talismane.parser.ParseConfiguration)2 ParserAnnotatedCorpusReader (com.joliciel.talismane.parser.ParserAnnotatedCorpusReader)2 ParserTrainer (com.joliciel.talismane.parser.ParserTrainer)2 ParseComparator (com.joliciel.talismane.parser.evaluate.ParseComparator)2 ParserEvaluator (com.joliciel.talismane.parser.evaluate.ParserEvaluator)2 ParseConfigurationProcessor (com.joliciel.talismane.parser.output.ParseConfigurationProcessor)2 PosTagAnnotatedCorpusReader (com.joliciel.talismane.posTagger.PosTagAnnotatedCorpusReader)2 PosTagSequence (com.joliciel.talismane.posTagger.PosTagSequence)2 PosTaggerTrainer (com.joliciel.talismane.posTagger.PosTaggerTrainer)2 PosTagComparator (com.joliciel.talismane.posTagger.evaluate.PosTagComparator)2 PosTaggerEvaluator (com.joliciel.talismane.posTagger.evaluate.PosTaggerEvaluator)2 PosTagSequenceProcessor (com.joliciel.talismane.posTagger.output.PosTagSequenceProcessor)2 Sentence (com.joliciel.talismane.rawText.Sentence)2