Search in sources :

Example 1 with Sentence

use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.

the class TalismaneMain method execute.

/**
 * Execute by processing command line options with a given default config.
 * <p>
 * A handful of stand-alone utilities (lexicon and diacriticizer
 * serialisation/testing) are dispatched directly when their flag is present.
 * Otherwise every recognised command-line option is translated into one or
 * more configuration keys (mostly under <code>talismane.core.&lt;sessionId&gt;</code>),
 * the keys are published as system properties, the configuration caches are
 * invalidated, and processing is delegated to
 * <code>execute(sessionId, inFile, outFile, outDir, evalFile, keepDirectoryStructure)</code>.
 * With no arguments, or with <code>help</code>, usage is printed to standard
 * output and nothing else happens.
 *
 * @param args
 *          the command-line options
 * @throws IOException
 *           if a pattern file given on the command line cannot be opened
 * @throws TalismaneException
 *           if a pattern file given on the command line contains no line at
 *           all
 */
public void execute(String[] args) throws IOException, ReflectiveOperationException, TalismaneException, JoranException {
    if (args.length > 0) {
        Set<String> argSet = new HashSet<>(Arrays.asList(args));
        if (argSet.contains("--serializeLexicon")) {
            LexiconReader.main(args);
            return;
        }
        if (argSet.contains("--testLexicon")) {
            LexiconDeserializer.main(args);
            return;
        }
        // Both diacriticizer flags are handed to the same entry point.
        if (argSet.contains("--serializeDiacriticizer")) {
            Diacriticizer.main(args);
            return;
        }
        if (argSet.contains("--testDiacriticizer")) {
            Diacriticizer.main(args);
            return;
        }
    }

    OptionSet options = parser.parse(args);
    if (args.length == 0 || options.has("help")) {
        parser.printHelpOn(System.out);
        return;
    }

    String sessionId = options.valueOf(sessionIdOption);
    // Common prefix of every session-scoped configuration key.
    final String prefix = "talismane.core." + sessionId;
    Map<String, Object> values = new HashMap<>();

    // Command selection: if several command flags are given, the last one
    // tested below wins because they all write the same key.
    if (options.has("analyse")) {
        values.put(prefix + ".command", Command.analyse.name());
    }
    if (options.has("train")) {
        values.put(prefix + ".command", Command.train.name());
    }
    if (options.has("evaluate")) {
        values.put(prefix + ".command", Command.evaluate.name());
    }
    if (options.has("compare")) {
        values.put(prefix + ".command", Command.compare.name());
    }
    if (options.has("process")) {
        values.put(prefix + ".command", Command.process.name());
    }

    if (options.has(moduleOption)) {
        values.put(prefix + ".module", options.valueOf(moduleOption).name());
    }
    if (options.has(startModuleOption)) {
        String startModule = options.valueOf(startModuleOption).name();
        values.put(prefix + ".analysis.start-module", startModule);
        values.put(prefix + ".pos-tagger.evaluate.start-module", startModule);
        values.put(prefix + ".parser.evaluate.start-module", startModule);
    }
    if (options.has(endModuleOption)) {
        values.put(prefix + ".analysis.end-module", options.valueOf(endModuleOption).name());
    }
    if (options.has(modeOption)) {
        values.put(prefix + ".mode", options.valueOf(modeOption).name());
    }
    if (options.has(portOption)) {
        values.put(prefix + ".port", options.valueOf(portOption));
    }
    if (options.has(localeOption)) {
        values.put(prefix + ".locale", options.valueOf(localeOption));
    }
    if (options.has(encodingOption)) {
        values.put(prefix + ".encoding", options.valueOf(encodingOption));
    }
    if (options.has(inputEncodingOption)) {
        values.put(prefix + ".input-encoding", options.valueOf(inputEncodingOption));
    }
    if (options.has(outputEncodingOption)) {
        values.put(prefix + ".output-encoding", options.valueOf(outputEncodingOption));
    }

    // Model locations, one per module.
    if (options.has(languageModelOption)) {
        values.put(prefix + ".language-detector.model", options.valueOf(languageModelOption).getPath());
    }
    if (options.has(sentenceModelOption)) {
        values.put(prefix + ".sentence-detector.model", options.valueOf(sentenceModelOption).getPath());
    }
    if (options.has(tokeniserModelOption)) {
        values.put(prefix + ".tokeniser.model", options.valueOf(tokeniserModelOption).getPath());
    }
    if (options.has(posTaggerModelOption)) {
        values.put(prefix + ".pos-tagger.model", options.valueOf(posTaggerModelOption).getPath());
    }
    if (options.has(parserModelOption)) {
        values.put(prefix + ".parser.model", options.valueOf(parserModelOption).getPath());
    }

    if (options.has(lexiconOption)) {
        List<String> lexiconPaths = options.valuesOf(lexiconOption).stream().map(f -> f.getPath()).collect(Collectors.toList());
        values.put(prefix + ".lexicons", lexiconPaths);
    }
    if (options.has(textAnnotatorsOption)) {
        List<String> textAnnotatorPaths = options.valuesOf(textAnnotatorsOption).stream().map(f -> f.getPath()).collect(Collectors.toList());
        values.put(prefix + ".annotators.text-annotators", textAnnotatorPaths);
    }
    if (options.has(sentenceAnnotatorsOption)) {
        List<String> sentenceAnnotatorPaths = options.valuesOf(sentenceAnnotatorsOption).stream().map(f -> f.getPath()).collect(Collectors.toList());
        values.put(prefix + ".annotators.sentence-annotators", sentenceAnnotatorPaths);
    }

    // Modules that have a "train" configuration section.
    List<String> trainingModules = Arrays.asList("language-detector", "sentence-detector", "tokeniser", "pos-tagger", "parser");

    // Configuration sections describing an input source; input-related
    // options are replicated into each of them.
    List<String> inputLocations = Arrays.asList(
        prefix + ".input",
        prefix + ".language-detector.input", prefix + ".language-detector.train", prefix + ".language-detector.evaluate",
        prefix + ".sentence-detector.input", prefix + ".sentence-detector.train", prefix + ".sentence-detector.evaluate",
        prefix + ".tokeniser.input", prefix + ".tokeniser.train", prefix + ".tokeniser.evaluate",
        prefix + ".pos-tagger.input", prefix + ".pos-tagger.train", prefix + ".pos-tagger.evaluate",
        prefix + ".parser.input", prefix + ".parser.train", prefix + ".parser.evaluate");

    // Configuration sections describing an output; output-related options
    // are replicated into each of them.
    List<String> outputLocations = Arrays.asList(
        prefix + ".output",
        prefix + ".language-detector.output",
        prefix + ".sentence-detector.output",
        prefix + ".tokeniser.output",
        prefix + ".pos-tagger.output",
        prefix + ".parser.output");

    if (options.has(newlineOption)) {
        values.put(prefix + ".newline", options.valueOf(newlineOption));
    }
    if (options.has(processByDefaultOption)) {
        values.put(prefix + ".analysis.process-by-default", options.valueOf(processByDefaultOption));
    }
    if (options.has(blockSizeOption)) {
        values.put(prefix + ".block-size", options.valueOf(blockSizeOption));
    }
    if (options.has(sentenceCountOption)) {
        Object sentenceCount = options.valueOf(sentenceCountOption);
        for (String inputLocation : inputLocations) {
            values.put(inputLocation + ".sentence-count", sentenceCount);
        }
    }
    if (options.has(startSentenceOption)) {
        Object startSentence = options.valueOf(startSentenceOption);
        for (String inputLocation : inputLocations) {
            values.put(inputLocation + ".start-sentence", startSentence);
        }
    }
    if (options.has(crossValidationSizeOption)) {
        Object foldCount = options.valueOf(crossValidationSizeOption);
        for (String inputLocation : inputLocations) {
            values.put(inputLocation + ".cross-validation.fold-count", foldCount);
        }
    }
    if (options.has(includeIndexOption)) {
        Object includeIndex = options.valueOf(includeIndexOption);
        for (String inputLocation : inputLocations) {
            values.put(inputLocation + ".cross-validation.include-index", includeIndex);
        }
    }
    if (options.has(excludeIndexOption)) {
        Object excludeIndex = options.valueOf(excludeIndexOption);
        for (String inputLocation : inputLocations) {
            values.put(inputLocation + ".cross-validation.exclude-index", excludeIndex);
        }
    }
    if (options.has(builtInTemplateOption)) {
        String builtInTemplate = options.valueOf(builtInTemplateOption).name();
        for (String outputLocation : outputLocations) {
            values.put(outputLocation + ".built-in-template", builtInTemplate);
        }
    }
    if (options.has(templateOption)) {
        String templatePath = options.valueOf(templateOption).getPath();
        for (String outputLocation : outputLocations) {
            values.put(outputLocation + ".template", templatePath);
        }
    }
    if (options.has(posTaggerRulesOption)) {
        List<String> posTaggerRulePaths = options.valuesOf(posTaggerRulesOption).stream().map(f -> f.getPath()).collect(Collectors.toList());
        values.put(prefix + ".pos-tagger.rules", posTaggerRulePaths);
    }
    if (options.has(parserRulesOption)) {
        List<String> parserRulePaths = options.valuesOf(parserRulesOption).stream().map(f -> f.getPath()).collect(Collectors.toList());
        values.put(prefix + ".parser.rules", parserRulePaths);
    }
    if (options.has(suffixOption)) {
        values.put(prefix + ".suffix", options.valueOf(suffixOption));
    }
    if (options.has(outputDividerOption)) {
        Object outputDivider = options.valueOf(outputDividerOption);
        for (String outputLocation : outputLocations) {
            values.put(outputLocation + ".output-divider", outputDivider);
        }
    }
    if (options.has(beamWidthOption)) {
        Object beamWidth = options.valueOf(beamWidthOption);
        values.put(prefix + ".pos-tagger.beam-width", beamWidth);
        values.put(prefix + ".parser.beam-width", beamWidth);
    }
    if (options.has(tokeniserBeamWidthOption)) {
        values.put(prefix + ".tokeniser.beam-width", options.valueOf(tokeniserBeamWidthOption));
    }
    if (options.has(propagateBeamOption)) {
        values.put(prefix + ".parser.propagate-pos-tagger-beam", options.valueOf(propagateBeamOption));
    }
    if (options.has(maxParseAnalysisTimeOption)) {
        values.put(prefix + ".parser.max-analysis-time", options.valueOf(maxParseAnalysisTimeOption));
    }
    if (options.has(minFreeMemoryOption)) {
        values.put(prefix + ".parser.min-free-memory", options.valueOf(minFreeMemoryOption));
    }
    if (options.has(earlyStopOption)) {
        values.put(prefix + ".parser.early-stop", options.valueOf(earlyStopOption));
    }

    // Input pattern: a pattern file takes precedence over an inline pattern.
    if (options.has(inputPatternFileOption) || options.has(inputPatternOption)) {
        String inputRegex;
        if (options.has(inputPatternFileOption)) {
            File inputPatternFile = options.valueOf(inputPatternFileOption);
            inputRegex = readFirstLineOfPatternFile(inputPatternFile);
            if (inputRegex == null) {
                throw new TalismaneException("No input pattern found in " + inputPatternFile.getPath());
            }
        } else {
            inputRegex = options.valueOf(inputPatternOption);
        }
        for (String inputLocation : inputLocations) {
            values.put(inputLocation + ".input-pattern", inputRegex);
        }
    }

    // Evaluation pattern: written after the input pattern so that it
    // replaces the input pattern for the four evaluate sections below.
    if (options.has(evalPatternFileOption) || options.has(evalPatternOption)) {
        String evalRegex;
        if (options.has(evalPatternFileOption)) {
            File evalPatternFile = options.valueOf(evalPatternFileOption);
            evalRegex = readFirstLineOfPatternFile(evalPatternFile);
            if (evalRegex == null) {
                throw new TalismaneException("No eval pattern found in " + evalPatternFile.getPath());
            }
        } else {
            evalRegex = options.valueOf(evalPatternOption);
        }
        values.put(prefix + ".sentence-detector.evaluate.input-pattern", evalRegex);
        values.put(prefix + ".tokeniser.evaluate.input-pattern", evalRegex);
        values.put(prefix + ".pos-tagger.evaluate.input-pattern", evalRegex);
        values.put(prefix + ".parser.evaluate.input-pattern", evalRegex);
    }

    if (options.has(csvSeparatorOption)) {
        values.put(prefix + ".csv.separator", options.valueOf(csvSeparatorOption));
    }
    if (options.has(csvEncodingOption)) {
        values.put(prefix + ".csv.encoding", options.valueOf(csvEncodingOption));
    }
    if (options.has(csvLocaleOption)) {
        values.put(prefix + ".csv.locale", options.valueOf(csvLocaleOption));
    }
    if (options.has(includeUnknownWordResultsOption)) {
        values.put(prefix + ".pos-tagger.evaluate.include-unknown-word-results", options.valueOf(includeUnknownWordResultsOption));
    }
    if (options.has(includeLexiconCoverageOption)) {
        values.put(prefix + ".pos-tagger.evaluate.include-lexicon-coverage", options.valueOf(includeLexiconCoverageOption));
    }
    if (options.has(labeledEvaluationOption)) {
        values.put(prefix + ".parser.evaluate.labeled-evaluation", options.valueOf(labeledEvaluationOption));
    }
    if (options.has(processingOption)) {
        values.put(prefix + ".output.option", options.valueOf(processingOption).name());
    }
    if (options.has(lexicalEntryRegexOption)) {
        String lexicalEntryRegexPath = options.valueOf(lexicalEntryRegexOption).getPath();
        values.put(prefix + ".pos-tagger.input.corpus-lexical-entry-regex", lexicalEntryRegexPath);
        values.put(prefix + ".parser.input.corpus-lexical-entry-regex", lexicalEntryRegexPath);
    }
    if (options.has(featuresOption)) {
        String featuresPath = options.valueOf(featuresOption).getPath();
        for (String module : trainingModules) {
            values.put(prefix + "." + module + ".train.features", featuresPath);
        }
    }
    if (options.has(tokeniserPatternsOption)) {
        values.put(prefix + ".tokeniser.train.patterns", options.valueOf(tokeniserPatternsOption).getPath());
    }
    if (options.has(sentenceFileOption)) {
        String sentenceFilePath = options.valueOf(sentenceFileOption).getPath();
        values.put(prefix + ".tokeniser.input.sentence-file", sentenceFilePath);
        values.put(prefix + ".pos-tagger.input.sentence-file", sentenceFilePath);
        values.put(prefix + ".parser.input.sentence-file", sentenceFilePath);
    }
    if (options.has(languageCorpusMapOption)) {
        values.put(prefix + ".language-detector.train.language-corpus-map", options.valueOf(languageCorpusMapOption).getPath());
    }
    if (options.has(predictTransitionsOption)) {
        values.put(prefix + ".parser.input.predict-transitions", options.valueOf(predictTransitionsOption));
    }
    if (options.has(testWordsOption)) {
        values.put(prefix + ".pos-tagger.output.test-words", options.valuesOf(testWordsOption));
    }

    // Machine-learning settings are written both to the global section and
    // to the training section of every module.
    if (options.has(algorithmOption)) {
        putMachineLearningSetting(values, prefix, trainingModules, "algorithm", options.valueOf(algorithmOption).name());
    }
    if (options.has(cutoffOption)) {
        putMachineLearningSetting(values, prefix, trainingModules, "cutoff", options.valueOf(cutoffOption));
    }
    if (options.has(linearSVMEpsilonOption)) {
        putMachineLearningSetting(values, prefix, trainingModules, "LinearSVM.epsilon", options.valueOf(linearSVMEpsilonOption));
    }
    if (options.has(linearSVMCostOption)) {
        putMachineLearningSetting(values, prefix, trainingModules, "LinearSVM.cost", options.valueOf(linearSVMCostOption));
    }
    if (options.has(oneVsRestOption)) {
        putMachineLearningSetting(values, prefix, trainingModules, "LinearSVM.one-vs-rest", options.valueOf(oneVsRestOption));
    }
    if (options.has(iterationsOption)) {
        putMachineLearningSetting(values, prefix, trainingModules, "iterations", options.valueOf(iterationsOption));
    }

    if (options.has(logConfigFileSpec)) {
        LogUtils.configureLogging(options.valueOf(logConfigFileSpec));
    }

    File inFile = options.has(inFileOption) ? options.valueOf(inFileOption) : null;
    File outFile = options.has(outFileOption) ? options.valueOf(outFileOption) : null;
    File outDir = options.has(outDirOption) ? options.valueOf(outDirOption) : null;

    // Unless an evaluation file is given explicitly, evaluate against the
    // input file.
    File evalFile = inFile;
    if (options.has(evalFileOption)) {
        evalFile = options.valueOf(evalFileOption);
    }

    // Default: keep the directory structure when the output file name has
    // no "." in it; an explicit option overrides the default.
    boolean keepDirectoryStructure = outFile != null && !outFile.getName().contains(".");
    if (options.has(keepDirStructureOption)) {
        keepDirectoryStructure = options.valueOf(keepDirStructureOption);
    }

    // System properties override configuration file keys when ConfigFactory.load() is called.
    values.forEach((k, v) -> System.setProperty(k, v.toString()));
    ConfigFactory.invalidateCaches();
    this.execute(sessionId, inFile, outFile, outDir, evalFile, keepDirectoryStructure);
}

/**
 * Read the first line of a UTF-8 encoded pattern file.
 *
 * @param patternFile
 *          the file to read
 * @return the first line, or <code>null</code> if the file contains no line
 * @throws IOException
 *           if the file cannot be opened or closed
 */
private static String readFirstLineOfPatternFile(File patternFile) throws IOException {
    // The stream is declared as a resource of its own so that it is closed
    // even if wrapping it in a reader or scanner fails.
    try (InputStream patternStream = new FileInputStream(patternFile);
        Scanner patternScanner = new Scanner(new BufferedReader(new InputStreamReader(patternStream, java.nio.charset.StandardCharsets.UTF_8)))) {
        return patternScanner.hasNextLine() ? patternScanner.nextLine() : null;
    }
}

/**
 * Store a machine-learning setting under the global
 * <code>talismane.machine-learning</code> section and under the
 * <code>train.machine-learning</code> section of each given module.
 *
 * @param values
 *          the map of configuration keys being built
 * @param prefix
 *          the session-scoped key prefix
 * @param modules
 *          the module section names to write the setting to
 * @param setting
 *          the setting name relative to the machine-learning section
 * @param value
 *          the value to store
 */
private static void putMachineLearningSetting(Map<String, Object> values, String prefix, List<String> modules, String setting, Object value) {
    values.put("talismane.machine-learning." + setting, value);
    for (String module : modules) {
        values.put(prefix + "." + module + ".train.machine-learning." + setting, value);
    }
}
Also used : Arrays(java.util.Arrays) LanguageDetectorTrainer(com.joliciel.talismane.languageDetector.LanguageDetectorTrainer) LoggerFactory(org.slf4j.LoggerFactory) PosTaggerTrainer(com.joliciel.talismane.posTagger.PosTaggerTrainer) Scanner(java.util.Scanner) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) CSVFormatter(com.joliciel.talismane.utils.CSVFormatter) Module(com.joliciel.talismane.Talismane.Module) Locale(java.util.Locale) Map(java.util.Map) LexiconReader(com.joliciel.talismane.lexicon.LexiconReader) URI(java.net.URI) LexiconDeserializer(com.joliciel.talismane.lexicon.LexiconDeserializer) Diacriticizer(com.joliciel.talismane.lexicon.Diacriticizer) PosTagAnnotatedCorpusReader(com.joliciel.talismane.posTagger.PosTagAnnotatedCorpusReader) TokenSequenceProcessor(com.joliciel.talismane.tokeniser.output.TokenSequenceProcessor) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) SentenceProcessor(com.joliciel.talismane.sentenceDetector.SentenceProcessor) SingleFileReader(com.joliciel.talismane.utils.io.SingleFileReader) Set(java.util.Set) ParserAnnotatedCorpusReader(com.joliciel.talismane.parser.ParserAnnotatedCorpusReader) SentenceDetectorEvaluator(com.joliciel.talismane.sentenceDetector.SentenceDetectorEvaluator) Reader(java.io.Reader) LanguageDetector(com.joliciel.talismane.languageDetector.LanguageDetector) Collectors(java.util.stream.Collectors) Mode(com.joliciel.talismane.Talismane.Mode) FileNotFoundException(java.io.FileNotFoundException) CurrentFileObserver(com.joliciel.talismane.utils.io.CurrentFileObserver) List(java.util.List) SentenceDetectorAnnotatedCorpusReader(com.joliciel.talismane.sentenceDetector.SentenceDetectorAnnotatedCorpusReader) ProcessingOption(com.joliciel.talismane.Talismane.ProcessingOption) PosTagSequenceProcessor(com.joliciel.talismane.posTagger.output.PosTagSequenceProcessor) ParseConfigurationProcessor(com.joliciel.talismane.parser.output.ParseConfigurationProcessor) 
SentenceDetectorTrainer(com.joliciel.talismane.sentenceDetector.SentenceDetectorTrainer) Writer(java.io.Writer) SentenceAnnotatorLoadException(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotatorLoadException) TokenComparator(com.joliciel.talismane.tokeniser.evaluate.TokenComparator) Sentence(com.joliciel.talismane.rawText.Sentence) MachineLearningAlgorithm(com.joliciel.talismane.machineLearning.MachineLearningAlgorithm) PatternTokeniserTrainer(com.joliciel.talismane.tokeniser.patterns.PatternTokeniserTrainer) PredictTransitions(com.joliciel.talismane.parser.Parser.PredictTransitions) PosTagComparator(com.joliciel.talismane.posTagger.evaluate.PosTagComparator) TokeniserAnnotatedCorpusReader(com.joliciel.talismane.tokeniser.TokeniserAnnotatedCorpusReader) WeightedOutcome(com.joliciel.talismane.utils.WeightedOutcome) HashMap(java.util.HashMap) ParserTrainer(com.joliciel.talismane.parser.ParserTrainer) HashSet(java.util.HashSet) LogUtils(com.joliciel.talismane.utils.LogUtils) joptsimple(joptsimple) OutputStreamWriter(java.io.OutputStreamWriter) ConfigFactory(com.typesafe.config.ConfigFactory) TokeniserEvaluator(com.joliciel.talismane.tokeniser.evaluate.TokeniserEvaluator) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration) Logger(org.slf4j.Logger) Config(com.typesafe.config.Config) BufferedWriter(java.io.BufferedWriter) DirectoryReader(com.joliciel.talismane.utils.io.DirectoryReader) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) BuiltInTemplate(com.joliciel.talismane.Talismane.BuiltInTemplate) FileInputStream(java.io.FileInputStream) InputStreamReader(java.io.InputStreamReader) File(java.io.File) LanguageDetectorProcessor(com.joliciel.talismane.languageDetector.LanguageDetectorProcessor) PosTaggerEvaluator(com.joliciel.talismane.posTagger.evaluate.PosTaggerEvaluator) ParserEvaluator(com.joliciel.talismane.parser.evaluate.ParserEvaluator) JoranException(ch.qos.logback.core.joran.spi.JoranException) 
DirectoryWriter(com.joliciel.talismane.utils.io.DirectoryWriter) BufferedReader(java.io.BufferedReader) Command(com.joliciel.talismane.Talismane.Command) InputStream(java.io.InputStream) ParseComparator(com.joliciel.talismane.parser.evaluate.ParseComparator) Scanner(java.util.Scanner) InputStreamReader(java.io.InputStreamReader) HashMap(java.util.HashMap) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) FileInputStream(java.io.FileInputStream) BufferedReader(java.io.BufferedReader) File(java.io.File) HashSet(java.util.HashSet)

Example 2 with Sentence

use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.

the class TokenPerLineCorpusReader method hasNextSentence.

@Override
public boolean hasNextSentence() throws TalismaneException, IOException {
    if (this.getMaxSentenceCount() > 0 && sentenceCount >= this.getMaxSentenceCount()) {
    // we've reached the end, do nothing
    } else {
        while (sentenceLines == null) {
            List<UnprocessedLine> lines = new ArrayList<>();
            int skippedLineCount = 0;
            if (!this.hasNextLine())
                break;
            while ((this.hasNextLine() || lines.size() > 0) && sentenceLines == null) {
                String line = "";
                if (this.hasNextLine())
                    line = this.nextLine().replace("\r", "");
                lineNumber++;
                if (LOG.isTraceEnabled())
                    LOG.trace("Line " + lineNumber + ": " + line);
                if (line.length() > 0) {
                    boolean skip = false;
                    for (Pattern skipLinePattern : skipLinePatterns) {
                        if (skipLinePattern.matcher(line).matches()) {
                            if (LOG.isTraceEnabled())
                                LOG.trace("Skipping by pattern: " + skipLinePattern.pattern());
                            skip = true;
                            skippedLineCount++;
                            break;
                        }
                    }
                    List<CorpusSentenceRule> myRules = new ArrayList<>();
                    List<Matcher> myMatchers = new ArrayList<>();
                    for (CorpusSentenceRule sentenceRule : sentenceRules) {
                        Matcher matcher = sentenceRule.getPattern().matcher(line);
                        if (matcher.matches()) {
                            if (LOG.isTraceEnabled())
                                LOG.trace("Matched rule: " + sentenceRule);
                            myRules.add(sentenceRule);
                            myMatchers.add(matcher);
                        }
                    }
                    UnprocessedLine unprocessedLine = new UnprocessedLine(line, lineNumber, skip, myRules, myMatchers);
                    lines.add(unprocessedLine);
                } else {
                    if (lines.size() == 0 || lines.size() == skippedLineCount) {
                        lines = new ArrayList<>();
                        skippedLineCount = 0;
                        continue;
                    }
                    // end of sentence
                    boolean includeMe = true;
                    // check cross-validation
                    if (this.getCrossValidationSize() > 0) {
                        if (this.getIncludeIndex() >= 0) {
                            if (sentenceCount % this.getCrossValidationSize() != this.getIncludeIndex()) {
                                includeMe = false;
                            }
                        } else if (this.getExcludeIndex() >= 0) {
                            if (sentenceCount % this.getCrossValidationSize() == this.getExcludeIndex()) {
                                includeMe = false;
                            }
                        }
                    }
                    if (this.getStartSentence() > sentenceCount) {
                        includeMe = false;
                    }
                    sentenceCount++;
                    LOG.debug("sentenceCount: " + sentenceCount);
                    if (!includeMe) {
                        lines = new ArrayList<>();
                        skippedLineCount = 0;
                        continue;
                    }
                    sentenceLines = new ArrayList<>();
                    for (UnprocessedLine unprocessedLine : lines) {
                        if (!unprocessedLine.skip) {
                            CorpusLine corpusLine = corpusLineReader.read(unprocessedLine.line, unprocessedLine.lineNumber);
                            sentenceLines.add(corpusLine);
                            if (this.lexicalEntryReader != null) {
                                WritableLexicalEntry lexicalEntry = new CompactLexicalEntry(lexicalEntrySupport);
                                this.lexicalEntryReader.readEntry(unprocessedLine.line, lexicalEntry);
                                corpusLine.setLexicalEntry(lexicalEntry);
                            }
                        }
                    }
                    List<CorpusSentenceRule.MergeAction> mergeActions = new ArrayList<>();
                    for (UnprocessedLine unprocessedLine : lines) {
                        if (LOG.isTraceEnabled())
                            LOG.trace("Line " + unprocessedLine);
                        for (int i = 0; i < unprocessedLine.sentenceRules.size(); i++) {
                            CorpusSentenceRule sentenceRule = unprocessedLine.sentenceRules.get(i);
                            Matcher matcher = unprocessedLine.matchers.get(i);
                            if (LOG.isTraceEnabled())
                                LOG.trace("Testing rule " + sentenceRule);
                            CorpusSentenceRule.Action action = sentenceRule.apply(unprocessedLine.line, unprocessedLine.lineNumber, matcher, sentenceLines);
                            if (LOG.isTraceEnabled())
                                LOG.trace("Result: " + action);
                            if (action != null) {
                                if (action instanceof MergeAction)
                                    mergeActions.add((MergeAction) action);
                                break;
                            }
                        }
                    }
                    if (mergeActions.size() > 0) {
                        List<CorpusLine> newSentenceLines = new ArrayList<>();
                        Map<Integer, MergeAction> indexesToMerge = new TreeMap<>();
                        for (CorpusSentenceRule.MergeAction mergeAction : mergeActions) {
                            for (CorpusLine lineToMerge : mergeAction.getLinesToMerge()) {
                                indexesToMerge.put(lineToMerge.getIndex(), mergeAction);
                            }
                        }
                        int i = 1;
                        Iterator<Integer> iIndexToMerge = indexesToMerge.keySet().iterator();
                        int nextIndexToMerge = iIndexToMerge.next();
                        int linesRemoved = 0;
                        Map<Integer, Integer> indexChangeMap = new HashMap<>();
                        indexChangeMap.put(0, 0);
                        for (CorpusLine corpusLine : sentenceLines) {
                            if (i == nextIndexToMerge) {
                                MergeAction mergeAction = indexesToMerge.get(i);
                                if (i == mergeAction.getFirstIndex()) {
                                    newSentenceLines.add(mergeAction.getMergedLine());
                                    linesRemoved -= 1;
                                }
                                linesRemoved += 1;
                                if (iIndexToMerge.hasNext())
                                    nextIndexToMerge = iIndexToMerge.next();
                                else
                                    nextIndexToMerge = -1;
                            } else {
                                newSentenceLines.add(corpusLine);
                            }
                            indexChangeMap.put(i, i - linesRemoved);
                            i++;
                        }
                        for (CorpusLine corpusLine : newSentenceLines) {
                            corpusLine.setElement(CorpusElement.INDEX, "" + indexChangeMap.get(corpusLine.getIndex()));
                            int governorIndex = corpusLine.getGovernorIndex();
                            if (governorIndex >= 0)
                                corpusLine.setElement(CorpusElement.GOVERNOR, "" + indexChangeMap.get(corpusLine.getGovernorIndex()));
                            int nonProjGovernorIndex = corpusLine.getNonProjGovernorIndex();
                            if (nonProjGovernorIndex >= 0)
                                corpusLine.setElement(CorpusElement.NON_PROJ_GOVERNOR, "" + indexChangeMap.get(corpusLine.getNonProjGovernorIndex()));
                        }
                        sentenceLines = newSentenceLines;
                    }
                    Sentence sentence = null;
                    if (sentenceReader != null && sentenceReader.hasNextSentence()) {
                        sentence = sentenceReader.nextSentence();
                    } else {
                        LinguisticRules rules = TalismaneSession.get(sessionId).getLinguisticRules();
                        if (rules == null)
                            throw new TalismaneException("Linguistic rules have not been set.");
                        String text = "";
                        for (CorpusLine corpusLine : sentenceLines) {
                            String word = corpusLine.getElement(CorpusElement.TOKEN);
                            if (rules.shouldAddSpace(text, word))
                                text += " ";
                            text += word;
                        }
                        sentence = new Sentence(text, currentFile, sessionId);
                    }
                    for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
                        sentenceAnnotator.annotate(sentence);
                    }
                    this.processSentence(sentence, sentenceLines);
                }
            }
        }
    }
    return (sentenceLines != null);
}
Also used : Matcher(java.util.regex.Matcher) HashMap(java.util.HashMap) TalismaneException(com.joliciel.talismane.TalismaneException) CompactLexicalEntry(com.joliciel.talismane.lexicon.CompactLexicalEntry) ArrayList(java.util.ArrayList) WritableLexicalEntry(com.joliciel.talismane.lexicon.WritableLexicalEntry) MergeAction(com.joliciel.talismane.corpus.CorpusSentenceRule.MergeAction) Sentence(com.joliciel.talismane.rawText.Sentence) Pattern(java.util.regex.Pattern) MergeAction(com.joliciel.talismane.corpus.CorpusSentenceRule.MergeAction) TreeMap(java.util.TreeMap) LinguisticRules(com.joliciel.talismane.LinguisticRules) SentenceAnnotator(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator)

Example 3 with Sentence

use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.

the class SentenceDetector method detectSentences.

/**
 * Detect sentences within an annotated text. Sentences are added in the form
 * of an Annotation around a {@link SentenceBoundary}, with the start position
 * (relative to the start of the annotated text) at the start of the sentence
 * and the end position immediately after the end of the sentence. <br>
 * <br>
 * Sentence boundaries will not be detected within any annotation of type
 * {@link RawTextNoSentenceBreakMarker}, nor will they be detected before or
 * after the {@link AnnotatedText#getAnalysisStart()} and
 * {@link AnnotatedText#getAnalysisEnd()} respectively. <br>
 * <br>
 * If the text contained existing {@link SentenceBoundary} annotations before
 * analysis start, the first sentence will begin where the last existing
 * annotation ended. Otherwise, the first boundary will begin at position 0.
 * In both cases the start is then advanced past any leading whitespace. <br>
 * <br>
 * If the text's analysis end is equal to the text length, it is assumed that
 * the text end is a sentence boundary. In this case, an additional sentence
 * is added starting at the final detected boundary and ending at text end.
 *
 * @param text
 *          the annotated text in which we need to detect sentences.
 * @param labels
 *          labels handed to every {@link SentenceBoundary} annotation created
 *          by this call.
 * @return in addition to the annotations added, we return a List of integers
 *         marking the end position of each sentence boundary, in ascending
 *         order.
 * @throws TalismaneException
 *           if raised by one of the Talismane components invoked during
 *           detection.
 */
public List<Integer> detectSentences(AnnotatedText text, String... labels) throws TalismaneException {
    LOG.debug("detectSentences");
    List<Annotation<RawTextNoSentenceBreakMarker>> noSentenceBreakMarkers = text.getAnnotations(RawTextNoSentenceBreakMarker.class);
    // collect every candidate position inside the analysis window which is not
    // covered by a "no sentence break" marker
    Matcher matcher = possibleBoundaryPattern.matcher(text.getText());
    List<Integer> possibleBoundaries = new ArrayList<>();
    while (matcher.find()) {
        if (matcher.start() >= text.getAnalysisStart() && matcher.start() < text.getAnalysisEnd()) {
            boolean noSentences = false;
            int position = matcher.start();
            for (Annotation<RawTextNoSentenceBreakMarker> noSentenceBreakMarker : noSentenceBreakMarkers) {
                if (noSentenceBreakMarker.getStart() <= position && position < noSentenceBreakMarker.getEnd()) {
                    noSentences = true;
                    break;
                }
            }
            if (!noSentences)
                possibleBoundaries.add(position);
        }
    }
    // collect all deterministic sentence boundaries
    List<Annotation<RawTextSentenceBreakMarker>> sentenceBreakMarkers = text.getAnnotations(RawTextSentenceBreakMarker.class);
    // a TreeSet keeps the boundaries sorted and free of duplicates
    Set<Integer> guessedBoundaries = new TreeSet<>(sentenceBreakMarkers.stream().filter(f -> f.getEnd() >= text.getAnalysisStart()).map(f -> f.getEnd()).collect(Collectors.toList()));
    // Share one token sequence for all possible boundaries, to avoid tokenising
    // multiple times
    Sentence sentence = new Sentence(text.getText(), sessionId);
    TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
    List<PossibleSentenceBoundary> boundaries = new ArrayList<>();
    for (int possibleBoundary : possibleBoundaries) {
        PossibleSentenceBoundary boundary = new PossibleSentenceBoundary(tokenSequence, possibleBoundary);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Testing boundary: " + boundary);
            LOG.trace(" at position: " + possibleBoundary);
        }
        List<FeatureResult<?>> featureResults = new ArrayList<>();
        for (SentenceDetectorFeature<?> feature : features) {
            // a fresh environment is used for each feature check
            RuntimeEnvironment env = new RuntimeEnvironment();
            FeatureResult<?> featureResult = feature.check(boundary, env);
            if (featureResult != null)
                featureResults.add(featureResult);
        }
        if (LOG.isTraceEnabled()) {
            SortedSet<String> featureResultSet = featureResults.stream().map(f -> f.toString()).collect(Collectors.toCollection(() -> new TreeSet<String>()));
            for (String featureResultString : featureResultSet) {
                LOG.trace(featureResultString);
            }
        }
        List<Decision> decisions = this.decisionMaker.decide(featureResults);
        if (LOG.isTraceEnabled()) {
            for (Decision decision : decisions) {
                LOG.trace(decision.getOutcome() + ": " + decision.getProbability());
            }
        }
        // only the first decision returned by the decision maker is considered
        if (decisions.get(0).getOutcome().equals(SentenceDetectorOutcome.IS_BOUNDARY.name())) {
            if (LOG.isTraceEnabled()) {
                // parentheses are required: without them the "1" is appended to
                // the string instead of being added to the position
                LOG.trace("Adding boundary: " + (possibleBoundary + 1));
            }
            // the sentence ends immediately after the boundary character
            guessedBoundaries.add(possibleBoundary + 1);
            boundaries.add(boundary);
        }
    }
    if (LOG.isTraceEnabled()) {
        LOG.trace("context: " + text.getText().toString().replace('\n', '¶').replace('\r', '¶'));
        for (PossibleSentenceBoundary boundary : boundaries) LOG.trace("boundary: " + boundary.toString());
    }
    if (LOG.isDebugEnabled())
        LOG.debug("guessedBoundaries : " + guessedBoundaries.toString());
    List<Annotation<SentenceBoundary>> newBoundaries = new ArrayList<>();
    // the first new sentence starts where the last existing one ended, or at 0
    int lastBoundary = 0;
    List<Annotation<SentenceBoundary>> existingBoundaries = text.getAnnotations(SentenceBoundary.class);
    if (existingBoundaries.size() > 0) {
        lastBoundary = existingBoundaries.get(existingBoundaries.size() - 1).getEnd();
    }
    // advance boundary start until a non space character is encountered
    while (lastBoundary < text.getAnalysisEnd() && Character.isWhitespace(text.getText().charAt(lastBoundary))) {
        lastBoundary++;
    }
    // each boundary beyond the current start closes one sentence and opens the
    // next one
    for (int guessedBoundary : guessedBoundaries) {
        if (guessedBoundary > lastBoundary) {
            Annotation<SentenceBoundary> sentenceBoundary = new Annotation<>(lastBoundary, guessedBoundary, new SentenceBoundary(), labels);
            newBoundaries.add(sentenceBoundary);
            if (LOG.isTraceEnabled()) {
                LOG.trace("Added boundary: " + sentenceBoundary);
            }
            lastBoundary = guessedBoundary;
        }
    }
    // when the analysis window reaches the end of the text, the remaining text
    // is treated as a final sentence
    if (text.getAnalysisEnd() == text.getText().length()) {
        if (text.getAnalysisEnd() > lastBoundary) {
            Annotation<SentenceBoundary> sentenceBoundary = new Annotation<>(lastBoundary, text.getAnalysisEnd(), new SentenceBoundary(), labels);
            newBoundaries.add(sentenceBoundary);
            if (LOG.isTraceEnabled()) {
                LOG.trace("Added final boundary: " + sentenceBoundary);
            }
        }
    }
    text.addAnnotations(newBoundaries);
    return new ArrayList<>(guessedBoundaries);
}
Also used : ZipInputStream(java.util.zip.ZipInputStream) SortedSet(java.util.SortedSet) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) MachineLearningModelFactory(com.joliciel.talismane.machineLearning.MachineLearningModelFactory) TreeSet(java.util.TreeSet) TalismaneException(com.joliciel.talismane.TalismaneException) RawTextNoSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker) ArrayList(java.util.ArrayList) ClassificationModel(com.joliciel.talismane.machineLearning.ClassificationModel) HashSet(java.util.HashSet) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) SentenceDetectorFeatureParser(com.joliciel.talismane.sentenceDetector.features.SentenceDetectorFeatureParser) Matcher(java.util.regex.Matcher) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult) Map(java.util.Map) ConfigUtils(com.joliciel.talismane.utils.ConfigUtils) ConfigFactory(com.typesafe.config.ConfigFactory) ExternalResourceFinder(com.joliciel.talismane.machineLearning.ExternalResourceFinder) AnnotatedText(com.joliciel.talismane.AnnotatedText) ExternalResource(com.joliciel.talismane.machineLearning.ExternalResource) SentenceDetectorFeature(com.joliciel.talismane.sentenceDetector.features.SentenceDetectorFeature) DecisionMaker(com.joliciel.talismane.machineLearning.DecisionMaker) Logger(org.slf4j.Logger) Config(com.typesafe.config.Config) Collection(java.util.Collection) Set(java.util.Set) IOException(java.io.IOException) Decision(com.joliciel.talismane.machineLearning.Decision) Collectors(java.util.stream.Collectors) RawTextSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextSentenceBreakMarker) List(java.util.List) Annotation(com.joliciel.talismane.Annotation) Annotator(com.joliciel.talismane.Annotator) Pattern(java.util.regex.Pattern) Sentence(com.joliciel.talismane.rawText.Sentence) 
InputStream(java.io.InputStream) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) RawTextNoSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker) TreeSet(java.util.TreeSet) Sentence(com.joliciel.talismane.rawText.Sentence) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) Annotation(com.joliciel.talismane.Annotation) Decision(com.joliciel.talismane.machineLearning.Decision) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult)

Example 4 with Sentence

use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.

the class ParserEvaluator method evaluate.

/**
 * Evaluate the parser against every sentence supplied by the corpus reader.
 * For each sentence, the token sequences and pos-tag sequences are either
 * recomputed (when a tokeniser and/or pos-tagger is available) or taken from
 * the reference configuration; the parser is then run and all observers are
 * notified before and after parsing. Once the corpus is exhausted, the
 * observers are told that evaluation is complete.
 *
 * @throws TalismaneException
 *           if an attempt is made to evaluate with a tokeniser but no
 *           pos-tagger
 * @throws IOException
 */
public void evaluate() throws TalismaneException, IOException {
    while (corpusReader.hasNextSentence()) {
        ParseConfiguration referenceConfiguration = corpusReader.nextConfiguration();

        // step 1: obtain the token sequences to tag
        List<TokenSequence> candidateTokenSequences;
        if (tokeniser == null) {
            // no tokeniser: reuse the reference tokens, minus the artificial root
            PosTagSequence rootlessCopy = referenceConfiguration.getPosTagSequence().clonePosTagSequence();
            rootlessCopy.removeRoot();
            candidateTokenSequences = new ArrayList<TokenSequence>();
            candidateTokenSequences.add(rootlessCopy.getTokenSequence());
        } else {
            if (posTagger == null) {
                throw new TalismaneException("Cannot evaluate with tokeniser but no pos-tagger");
            }
            Sentence sentence = referenceConfiguration.getPosTagSequence().getTokenSequence().getSentence();
            // annotate the sentence for pre token filters
            for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
                sentenceAnnotator.annotate(sentence);
                if (LOG.isTraceEnabled()) {
                    LOG.trace("TokenFilter: " + sentenceAnnotator);
                    LOG.trace("annotations: " + sentence.getAnnotations());
                }
            }
            candidateTokenSequences = tokeniser.tokenise(sentence);
        }

        // step 2: obtain the pos-tag sequences to parse
        List<PosTagSequence> candidatePosTagSequences;
        if (posTagger == null) {
            // no pos-tagger: parse the reference pos-tag sequence directly
            candidatePosTagSequences = new ArrayList<PosTagSequence>();
            candidatePosTagSequences.add(referenceConfiguration.getPosTagSequence());
        } else if (posTagger instanceof NonDeterministicPosTagger) {
            candidatePosTagSequences = ((NonDeterministicPosTagger) posTagger).tagSentence(candidateTokenSequences);
        } else {
            // deterministic tagger: only the first token sequence is tagged
            candidatePosTagSequences = new ArrayList<PosTagSequence>();
            PosTagSequence singleResult = posTagger.tagSentence(candidateTokenSequences.get(0));
            candidatePosTagSequences.add(singleResult);
        }

        for (ParseEvaluationObserver startObserver : this.observers) {
            startObserver.onParseStart(referenceConfiguration, candidatePosTagSequences);
        }

        // step 3: parse
        List<ParseConfiguration> guessedConfigurations;
        if (parser instanceof NonDeterministicParser) {
            guessedConfigurations = ((NonDeterministicParser) parser).parseSentence(candidatePosTagSequences);
        } else {
            // deterministic parser: only the first pos-tag sequence is parsed
            ParseConfiguration singleGuess = parser.parseSentence(candidatePosTagSequences.get(0));
            guessedConfigurations = new ArrayList<ParseConfiguration>();
            guessedConfigurations.add(singleGuess);
        }

        for (ParseEvaluationObserver endObserver : this.observers) {
            endObserver.onParseEnd(referenceConfiguration, guessedConfigurations);
        }
    }

    for (ParseEvaluationObserver completionObserver : this.observers) {
        completionObserver.onEvaluationComplete();
    }
}
Also used : TalismaneException(com.joliciel.talismane.TalismaneException) NonDeterministicParser(com.joliciel.talismane.parser.NonDeterministicParser) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration) NonDeterministicPosTagger(com.joliciel.talismane.posTagger.NonDeterministicPosTagger) SentenceAnnotator(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) Sentence(com.joliciel.talismane.rawText.Sentence)

Example 5 with Sentence

use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.

the class PosTaggerEvaluator method evaluate.

/**
 * Evaluate a given pos tagger against every sentence supplied by the corpus
 * reader. For each sentence the reference token sequence is either re-used
 * or, when a tokeniser is available, replaced by the tokeniser's output; the
 * pos-tagger is then run and every observer is notified with the reference
 * sequence and the list of guessed sequences. Once the corpus is exhausted,
 * the observers are told that evaluation is complete.
 * <br>
 * <br>
 * The list handed to the observers is never null: a deterministic pos-tagger
 * yields a list containing its single guess.
 *
 * @throws TalismaneException
 *           if raised by one of the Talismane components invoked during
 *           evaluation.
 * @throws IOException
 *           if raised by one of the components invoked during evaluation.
 */
public void evaluate() throws TalismaneException, IOException {
    while (corpusReader.hasNextSentence()) {
        PosTagSequence realPosTagSequence = corpusReader.nextPosTagSequence();
        List<TokenSequence> tokenSequences = null;
        List<PosTagSequence> guessedSequences = null;
        TokenSequence tokenSequence = realPosTagSequence.getTokenSequence();
        PosTagSequence guessedSequence = null;
        if (this.tokeniser != null) {
            // re-tokenise the reference sentence and tag the first result
            Sentence sentence = tokenSequence.getSentence();
            tokenSequences = tokeniser.tokenise(sentence);
            tokenSequence = tokenSequences.get(0);
        } else {
            tokenSequences = new ArrayList<TokenSequence>();
            tokenSequences.add(tokenSequence);
        }
        if (posTagger instanceof NonDeterministicPosTagger) {
            NonDeterministicPosTagger nonDeterministicPosTagger = (NonDeterministicPosTagger) posTagger;
            guessedSequences = nonDeterministicPosTagger.tagSentence(tokenSequences);
            guessedSequence = guessedSequences.get(0);
        } else {
            guessedSequence = posTagger.tagSentence(tokenSequence);
            // wrap the single guess so that observers never receive a null list
            guessedSequences = new ArrayList<PosTagSequence>();
            guessedSequences.add(guessedSequence);
        }
        if (LOG.isDebugEnabled()) {
            // one line per sentence: token[tag|lemma/morphology|...]
            StringBuilder stringBuilder = new StringBuilder();
            for (PosTaggedToken posTaggedToken : guessedSequence) {
                Set<String> lemmas = new TreeSet<String>();
                stringBuilder.append(posTaggedToken.getToken().getOriginalText());
                stringBuilder.append("[").append(posTaggedToken.getTag());
                List<LexicalEntry> entries = posTaggedToken.getLexicalEntries();
                // when there are several entries, the first one whose lemma is
                // identical to the token text is left out
                boolean dropCurrentWord = entries.size() > 1;
                for (LexicalEntry entry : entries) {
                    if (!lemmas.contains(entry.getLemma())) {
                        if (dropCurrentWord && posTaggedToken.getToken().getText().equals(entry.getLemma())) {
                            dropCurrentWord = false;
                            continue;
                        }
                        stringBuilder.append("|").append(entry.getLemma());
                        stringBuilder.append("/").append(entry.getMorphology());
                        lemmas.add(entry.getLemma());
                    }
                }
                stringBuilder.append("] ");
            }
            LOG.debug(stringBuilder.toString());
        }
        for (PosTagEvaluationObserver observer : this.observers) {
            observer.onNextPosTagSequence(realPosTagSequence, guessedSequences);
        }
    }
    for (PosTagEvaluationObserver observer : this.observers) {
        observer.onEvaluationComplete();
    }
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) NonDeterministicPosTagger(com.joliciel.talismane.posTagger.NonDeterministicPosTagger) TreeSet(java.util.TreeSet) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) LexicalEntry(com.joliciel.talismane.lexicon.LexicalEntry) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) Sentence(com.joliciel.talismane.rawText.Sentence)

Aggregations

Sentence (com.joliciel.talismane.rawText.Sentence)43 Config (com.typesafe.config.Config)31 TalismaneTest (com.joliciel.talismane.TalismaneTest)28 Test (org.junit.Test)28 TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence)25 Token (com.joliciel.talismane.tokeniser.Token)14 PosTagSequence (com.joliciel.talismane.posTagger.PosTagSequence)13 Annotation (com.joliciel.talismane.Annotation)12 Decision (com.joliciel.talismane.machineLearning.Decision)11 ArrayList (java.util.ArrayList)9 PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken)8 RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment)7 TalismaneException (com.joliciel.talismane.TalismaneException)6 HashMap (java.util.HashMap)6 List (java.util.List)6 StringLiteralFeature (com.joliciel.talismane.machineLearning.features.StringLiteralFeature)5 ParseConfiguration (com.joliciel.talismane.parser.ParseConfiguration)5 PosTaggerContext (com.joliciel.talismane.posTagger.PosTaggerContext)5 PosTaggerContextImpl (com.joliciel.talismane.posTagger.PosTaggerContextImpl)5 SentenceAnnotator (com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator)5