Use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
The class TalismaneMain, method execute.
/**
* Execute Talismane by processing the command-line options against a given default configuration.
*
* @param args
* the command-line options
*/
public void execute(String[] args) throws IOException, ReflectiveOperationException, TalismaneException, JoranException {
if (args.length > 0) {
Set<String> argSet = new HashSet<>(Arrays.asList(args));
if (argSet.contains("--serializeLexicon")) {
LexiconReader.main(args);
return;
}
if (argSet.contains("--testLexicon")) {
LexiconDeserializer.main(args);
return;
}
if (argSet.contains("--serializeDiacriticizer")) {
Diacriticizer.main(args);
return;
}
if (argSet.contains("--testDiacriticizer")) {
Diacriticizer.main(args);
return;
}
}
OptionSet options = parser.parse(args);
if (args.length == 0 || options.has("help")) {
parser.printHelpOn(System.out);
return;
}
String sessionId = options.valueOf(sessionIdOption);
Map<String, Object> values = new HashMap<>();
if (options.has("analyse"))
values.put("talismane.core." + sessionId + ".command", Command.analyse.name());
if (options.has("train"))
values.put("talismane.core." + sessionId + ".command", Command.train.name());
if (options.has("evaluate"))
values.put("talismane.core." + sessionId + ".command", Command.evaluate.name());
if (options.has("compare"))
values.put("talismane.core." + sessionId + ".command", Command.compare.name());
if (options.has("process"))
values.put("talismane.core." + sessionId + ".command", Command.process.name());
if (options.has(moduleOption))
values.put("talismane.core." + sessionId + ".module", options.valueOf(moduleOption).name());
if (options.has(startModuleOption)) {
values.put("talismane.core." + sessionId + ".analysis.start-module", options.valueOf(startModuleOption).name());
values.put("talismane.core." + sessionId + ".pos-tagger.evaluate.start-module", options.valueOf(startModuleOption).name());
values.put("talismane.core." + sessionId + ".parser.evaluate.start-module", options.valueOf(startModuleOption).name());
}
if (options.has(endModuleOption))
values.put("talismane.core." + sessionId + ".analysis.end-module", options.valueOf(endModuleOption).name());
if (options.has(modeOption))
values.put("talismane.core." + sessionId + ".mode", options.valueOf(modeOption).name());
if (options.has(portOption))
values.put("talismane.core." + sessionId + ".port", options.valueOf(portOption));
if (options.has(localeOption))
values.put("talismane.core." + sessionId + ".locale", options.valueOf(localeOption));
if (options.has(encodingOption))
values.put("talismane.core." + sessionId + ".encoding", options.valueOf(encodingOption));
if (options.has(inputEncodingOption))
values.put("talismane.core." + sessionId + ".input-encoding", options.valueOf(inputEncodingOption));
if (options.has(outputEncodingOption))
values.put("talismane.core." + sessionId + ".output-encoding", options.valueOf(outputEncodingOption));
if (options.has(languageModelOption))
values.put("talismane.core." + sessionId + ".language-detector.model", options.valueOf(languageModelOption).getPath());
if (options.has(sentenceModelOption))
values.put("talismane.core." + sessionId + ".sentence-detector.model", options.valueOf(sentenceModelOption).getPath());
if (options.has(tokeniserModelOption))
values.put("talismane.core." + sessionId + ".tokeniser.model", options.valueOf(tokeniserModelOption).getPath());
if (options.has(posTaggerModelOption))
values.put("talismane.core." + sessionId + ".pos-tagger.model", options.valueOf(posTaggerModelOption).getPath());
if (options.has(parserModelOption))
values.put("talismane.core." + sessionId + ".parser.model", options.valueOf(parserModelOption).getPath());
if (options.has(lexiconOption)) {
List<String> lexiconPaths = options.valuesOf(lexiconOption).stream().map(f -> f.getPath()).collect(Collectors.toList());
values.put("talismane.core." + sessionId + ".lexicons", lexiconPaths);
}
if (options.has(textAnnotatorsOption)) {
List<String> textAnnotatorPaths = options.valuesOf(textAnnotatorsOption).stream().map(f -> f.getPath()).collect(Collectors.toList());
values.put("talismane.core." + sessionId + ".annotators.text-annotators", textAnnotatorPaths);
}
if (options.has(sentenceAnnotatorsOption)) {
List<String> sentenceAnnotatorPaths = options.valuesOf(sentenceAnnotatorsOption).stream().map(f -> f.getPath()).collect(Collectors.toList());
values.put("talismane.core." + sessionId + ".annotators.sentence-annotators", sentenceAnnotatorPaths);
}
List<String> inputLocations = Arrays.asList(
    "talismane.core." + sessionId + ".input",
    "talismane.core." + sessionId + ".language-detector.input",
    "talismane.core." + sessionId + ".language-detector.train",
    "talismane.core." + sessionId + ".language-detector.evaluate",
    "talismane.core." + sessionId + ".sentence-detector.input",
    "talismane.core." + sessionId + ".sentence-detector.train",
    "talismane.core." + sessionId + ".sentence-detector.evaluate",
    "talismane.core." + sessionId + ".tokeniser.input",
    "talismane.core." + sessionId + ".tokeniser.train",
    "talismane.core." + sessionId + ".tokeniser.evaluate",
    "talismane.core." + sessionId + ".pos-tagger.input",
    "talismane.core." + sessionId + ".pos-tagger.train",
    "talismane.core." + sessionId + ".pos-tagger.evaluate",
    "talismane.core." + sessionId + ".parser.input",
    "talismane.core." + sessionId + ".parser.train",
    "talismane.core." + sessionId + ".parser.evaluate");
List<String> outputLocations = Arrays.asList(
    "talismane.core." + sessionId + ".output",
    "talismane.core." + sessionId + ".language-detector.output",
    "talismane.core." + sessionId + ".sentence-detector.output",
    "talismane.core." + sessionId + ".tokeniser.output",
    "talismane.core." + sessionId + ".pos-tagger.output",
    "talismane.core." + sessionId + ".parser.output");
if (options.has(newlineOption))
values.put("talismane.core." + sessionId + ".newline", options.valueOf(newlineOption));
if (options.has(processByDefaultOption))
values.put("talismane.core." + sessionId + ".analysis.process-by-default", options.valueOf(processByDefaultOption));
if (options.has(blockSizeOption))
values.put("talismane.core." + sessionId + ".block-size", options.valueOf(blockSizeOption));
if (options.has(sentenceCountOption))
for (String inputLocation : inputLocations) values.put(inputLocation + ".sentence-count", options.valueOf(sentenceCountOption));
if (options.has(startSentenceOption))
for (String inputLocation : inputLocations) values.put(inputLocation + ".start-sentence", options.valueOf(startSentenceOption));
if (options.has(crossValidationSizeOption))
for (String inputLocation : inputLocations) values.put(inputLocation + ".cross-validation.fold-count", options.valueOf(crossValidationSizeOption));
if (options.has(includeIndexOption))
for (String inputLocation : inputLocations) values.put(inputLocation + ".cross-validation.include-index", options.valueOf(includeIndexOption));
if (options.has(excludeIndexOption))
for (String inputLocation : inputLocations) values.put(inputLocation + ".cross-validation.exclude-index", options.valueOf(excludeIndexOption));
if (options.has(builtInTemplateOption))
for (String outputLocation : outputLocations) values.put(outputLocation + ".built-in-template", options.valueOf(builtInTemplateOption).name());
if (options.has(templateOption))
for (String outputLocation : outputLocations) values.put(outputLocation + ".template", options.valueOf(templateOption).getPath());
if (options.has(posTaggerRulesOption)) {
List<String> posTaggerRulePaths = options.valuesOf(posTaggerRulesOption).stream().map(f -> f.getPath()).collect(Collectors.toList());
values.put("talismane.core." + sessionId + ".pos-tagger.rules", posTaggerRulePaths);
}
if (options.has(parserRulesOption)) {
List<String> parserRulePaths = options.valuesOf(parserRulesOption).stream().map(f -> f.getPath()).collect(Collectors.toList());
values.put("talismane.core." + sessionId + ".parser.rules", parserRulePaths);
}
if (options.has(suffixOption))
values.put("talismane.core." + sessionId + ".suffix", options.valueOf(suffixOption));
if (options.has(outputDividerOption))
for (String outputLocation : outputLocations) values.put(outputLocation + ".output-divider", options.valueOf(outputDividerOption));
if (options.has(beamWidthOption)) {
values.put("talismane.core." + sessionId + ".pos-tagger.beam-width", options.valueOf(beamWidthOption));
values.put("talismane.core." + sessionId + ".parser.beam-width", options.valueOf(beamWidthOption));
}
if (options.has(tokeniserBeamWidthOption))
values.put("talismane.core." + sessionId + ".tokeniser.beam-width", options.valueOf(tokeniserBeamWidthOption));
if (options.has(propagateBeamOption))
values.put("talismane.core." + sessionId + ".parser.propagate-pos-tagger-beam", options.valueOf(propagateBeamOption));
if (options.has(maxParseAnalysisTimeOption))
values.put("talismane.core." + sessionId + ".parser.max-analysis-time", options.valueOf(maxParseAnalysisTimeOption));
if (options.has(minFreeMemoryOption))
values.put("talismane.core." + sessionId + ".parser.min-free-memory", options.valueOf(minFreeMemoryOption));
if (options.has(earlyStopOption))
values.put("talismane.core." + sessionId + ".parser.early-stop", options.valueOf(earlyStopOption));
if (options.has(inputPatternFileOption) || options.has(inputPatternOption)) {
String inputRegex = null;
if (options.has(inputPatternFileOption)) {
InputStream inputPatternFile = new FileInputStream(options.valueOf(inputPatternFileOption));
try (Scanner inputPatternScanner = new Scanner(new BufferedReader(new InputStreamReader(inputPatternFile, "UTF-8")))) {
if (inputPatternScanner.hasNextLine()) {
inputRegex = inputPatternScanner.nextLine();
}
}
if (inputRegex == null)
throw new TalismaneException("No input pattern found in " + options.valueOf(inputPatternFileOption).getPath());
} else {
inputRegex = options.valueOf(inputPatternOption);
}
for (String inputLocation : inputLocations) values.put(inputLocation + ".input-pattern", inputRegex);
}
if (options.has(evalPatternFileOption) || options.has(evalPatternOption)) {
String evalRegex = null;
if (options.has(evalPatternFileOption)) {
InputStream evalPatternFile = new FileInputStream(options.valueOf(evalPatternFileOption));
try (Scanner evalPatternScanner = new Scanner(new BufferedReader(new InputStreamReader(evalPatternFile, "UTF-8")))) {
if (evalPatternScanner.hasNextLine()) {
evalRegex = evalPatternScanner.nextLine();
}
}
if (evalRegex == null)
throw new TalismaneException("No eval pattern found in " + options.valueOf(evalPatternFileOption).getPath());
} else {
evalRegex = options.valueOf(evalPatternOption);
}
values.put("talismane.core." + sessionId + ".sentence-detector.evaluate.input-pattern", evalRegex);
values.put("talismane.core." + sessionId + ".tokeniser.evaluate.input-pattern", evalRegex);
values.put("talismane.core." + sessionId + ".pos-tagger.evaluate.input-pattern", evalRegex);
values.put("talismane.core." + sessionId + ".parser.evaluate.input-pattern", evalRegex);
}
if (options.has(csvSeparatorOption))
values.put("talismane.core." + sessionId + ".csv.separator", options.valueOf(csvSeparatorOption));
if (options.has(csvEncodingOption))
values.put("talismane.core." + sessionId + ".csv.encoding", options.valueOf(csvEncodingOption));
if (options.has(csvLocaleOption))
values.put("talismane.core." + sessionId + ".csv.locale", options.valueOf(csvLocaleOption));
if (options.has(includeUnknownWordResultsOption))
values.put("talismane.core." + sessionId + ".pos-tagger.evaluate.include-unknown-word-results", options.valueOf(includeUnknownWordResultsOption));
if (options.has(includeLexiconCoverageOption))
values.put("talismane.core." + sessionId + ".pos-tagger.evaluate.include-lexicon-coverage", options.valueOf(includeLexiconCoverageOption));
if (options.has(labeledEvaluationOption))
values.put("talismane.core." + sessionId + ".parser.evaluate.labeled-evaluation", options.valueOf(labeledEvaluationOption));
if (options.has(processingOption))
values.put("talismane.core." + sessionId + ".output.option", options.valueOf(processingOption).name());
if (options.has(lexicalEntryRegexOption)) {
values.put("talismane.core." + sessionId + ".pos-tagger.input.corpus-lexical-entry-regex", options.valueOf(lexicalEntryRegexOption).getPath());
values.put("talismane.core." + sessionId + ".parser.input.corpus-lexical-entry-regex", options.valueOf(lexicalEntryRegexOption).getPath());
}
if (options.has(featuresOption)) {
values.put("talismane.core." + sessionId + ".language-detector.train.features", options.valueOf(featuresOption).getPath());
values.put("talismane.core." + sessionId + ".sentence-detector.train.features", options.valueOf(featuresOption).getPath());
values.put("talismane.core." + sessionId + ".tokeniser.train.features", options.valueOf(featuresOption).getPath());
values.put("talismane.core." + sessionId + ".pos-tagger.train.features", options.valueOf(featuresOption).getPath());
values.put("talismane.core." + sessionId + ".parser.train.features", options.valueOf(featuresOption).getPath());
}
if (options.has(tokeniserPatternsOption))
values.put("talismane.core." + sessionId + ".tokeniser.train.patterns", options.valueOf(tokeniserPatternsOption).getPath());
if (options.has(sentenceFileOption)) {
values.put("talismane.core." + sessionId + ".tokeniser.input.sentence-file", options.valueOf(sentenceFileOption).getPath());
values.put("talismane.core." + sessionId + ".pos-tagger.input.sentence-file", options.valueOf(sentenceFileOption).getPath());
values.put("talismane.core." + sessionId + ".parser.input.sentence-file", options.valueOf(sentenceFileOption).getPath());
}
if (options.has(languageCorpusMapOption))
values.put("talismane.core." + sessionId + ".language-detector.train.language-corpus-map", options.valueOf(languageCorpusMapOption).getPath());
if (options.has(predictTransitionsOption))
values.put("talismane.core." + sessionId + ".parser.input.predict-transitions", options.valueOf(predictTransitionsOption));
if (options.has(testWordsOption))
values.put("talismane.core." + sessionId + ".pos-tagger.output.test-words", options.valuesOf(testWordsOption));
if (options.has(algorithmOption)) {
values.put("talismane.machine-learning.algorithm", options.valueOf(algorithmOption).name());
values.put("talismane.core." + sessionId + ".language-detector.train.machine-learning.algorithm", options.valueOf(algorithmOption).name());
values.put("talismane.core." + sessionId + ".sentence-detector.train.machine-learning.algorithm", options.valueOf(algorithmOption).name());
values.put("talismane.core." + sessionId + ".tokeniser.train.machine-learning.algorithm", options.valueOf(algorithmOption).name());
values.put("talismane.core." + sessionId + ".pos-tagger.train.machine-learning.algorithm", options.valueOf(algorithmOption).name());
values.put("talismane.core." + sessionId + ".parser.train.machine-learning.algorithm", options.valueOf(algorithmOption).name());
}
if (options.has(cutoffOption)) {
values.put("talismane.machine-learning.cutoff", options.valueOf(cutoffOption));
values.put("talismane.core." + sessionId + ".language-detector.train.machine-learning.cutoff", options.valueOf(cutoffOption));
values.put("talismane.core." + sessionId + ".sentence-detector.train.machine-learning.cutoff", options.valueOf(cutoffOption));
values.put("talismane.core." + sessionId + ".tokeniser.train.machine-learning.cutoff", options.valueOf(cutoffOption));
values.put("talismane.core." + sessionId + ".pos-tagger.train.machine-learning.cutoff", options.valueOf(cutoffOption));
values.put("talismane.core." + sessionId + ".parser.train.machine-learning.cutoff", options.valueOf(cutoffOption));
}
if (options.has(linearSVMEpsilonOption)) {
values.put("talismane.machine-learning.LinearSVM.epsilon", options.valueOf(linearSVMEpsilonOption));
values.put("talismane.core." + sessionId + ".language-detector.train.machine-learning.LinearSVM.epsilon", options.valueOf(linearSVMEpsilonOption));
values.put("talismane.core." + sessionId + ".sentence-detector.train.machine-learning.LinearSVM.epsilon", options.valueOf(linearSVMEpsilonOption));
values.put("talismane.core." + sessionId + ".tokeniser.train.machine-learning.LinearSVM.epsilon", options.valueOf(linearSVMEpsilonOption));
values.put("talismane.core." + sessionId + ".pos-tagger.train.machine-learning.LinearSVM.epsilon", options.valueOf(linearSVMEpsilonOption));
values.put("talismane.core." + sessionId + ".parser.train.machine-learning.LinearSVM.epsilon", options.valueOf(linearSVMEpsilonOption));
}
if (options.has(linearSVMCostOption)) {
values.put("talismane.machine-learning.LinearSVM.cost", options.valueOf(linearSVMCostOption));
values.put("talismane.core." + sessionId + ".language-detector.train.machine-learning.LinearSVM.cost", options.valueOf(linearSVMCostOption));
values.put("talismane.core." + sessionId + ".sentence-detector.train.machine-learning.LinearSVM.cost", options.valueOf(linearSVMCostOption));
values.put("talismane.core." + sessionId + ".tokeniser.train.machine-learning.LinearSVM.cost", options.valueOf(linearSVMCostOption));
values.put("talismane.core." + sessionId + ".pos-tagger.train.machine-learning.LinearSVM.cost", options.valueOf(linearSVMCostOption));
values.put("talismane.core." + sessionId + ".parser.train.machine-learning.LinearSVM.cost", options.valueOf(linearSVMCostOption));
}
if (options.has(oneVsRestOption)) {
values.put("talismane.machine-learning.LinearSVM.one-vs-rest", options.valueOf(oneVsRestOption));
values.put("talismane.core." + sessionId + ".language-detector.train.machine-learning.LinearSVM.one-vs-rest", options.valueOf(oneVsRestOption));
values.put("talismane.core." + sessionId + ".sentence-detector.train.machine-learning.LinearSVM.one-vs-rest", options.valueOf(oneVsRestOption));
values.put("talismane.core." + sessionId + ".tokeniser.train.machine-learning.LinearSVM.one-vs-rest", options.valueOf(oneVsRestOption));
values.put("talismane.core." + sessionId + ".pos-tagger.train.machine-learning.LinearSVM.one-vs-rest", options.valueOf(oneVsRestOption));
values.put("talismane.core." + sessionId + ".parser.train.machine-learning.LinearSVM.one-vs-rest", options.valueOf(oneVsRestOption));
}
if (options.has(iterationsOption)) {
values.put("talismane.machine-learning.iterations", options.valueOf(iterationsOption));
values.put("talismane.core." + sessionId + ".language-detector.train.machine-learning.iterations", options.valueOf(iterationsOption));
values.put("talismane.core." + sessionId + ".sentence-detector.train.machine-learning.iterations", options.valueOf(iterationsOption));
values.put("talismane.core." + sessionId + ".tokeniser.train.machine-learning.iterations", options.valueOf(iterationsOption));
values.put("talismane.core." + sessionId + ".pos-tagger.train.machine-learning.iterations", options.valueOf(iterationsOption));
values.put("talismane.core." + sessionId + ".parser.train.machine-learning.iterations", options.valueOf(iterationsOption));
}
if (options.has(logConfigFileSpec))
LogUtils.configureLogging(options.valueOf(logConfigFileSpec));
File inFile = null;
File outFile = null;
File outDir = null;
if (options.has(inFileOption))
inFile = options.valueOf(inFileOption);
if (options.has(outFileOption))
outFile = options.valueOf(outFileOption);
if (options.has(outDirOption))
outDir = options.valueOf(outDirOption);
File evalFile = inFile;
if (options.has(evalFileOption))
evalFile = options.valueOf(evalFileOption);
boolean keepDirectoryStructure = outFile != null && !outFile.getName().contains(".");
if (options.has(keepDirStructureOption))
keepDirectoryStructure = options.valueOf(keepDirStructureOption);
// System properties override configuration file keys when ConfigFactory.load() is called.
values.forEach((k, v) -> System.setProperty(k, v.toString()));
ConfigFactory.invalidateCaches();
this.execute(sessionId, inFile, outFile, outDir, evalFile, keepDirectoryStructure);
}
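Everything execute() collects ends up as a system property under a session-scoped path such as talismane.core.<sessionId>.command, and the Typesafe Config cache is then invalidated so the next load picks the overrides up. The following is a minimal sketch of that override mechanism in isolation; the "demo" session id is illustrative, while the configuration key and the "analyse" value mirror what the method above actually sets.

import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

public class ConfigOverrideSketch {
    public static void main(String[] args) {
        String sessionId = "demo"; // illustrative session id
        // Mirror what execute() does for a single option: store the value
        // as a system property under the session-scoped configuration path.
        System.setProperty("talismane.core." + sessionId + ".command", "analyse");
        // Typesafe Config caches loaded configurations; invalidate the cache
        // so the freshly set system property is visible to the next load().
        ConfigFactory.invalidateCaches();
        Config config = ConfigFactory.load();
        // System properties take precedence over file-based configuration.
        System.out.println(config.getString("talismane.core." + sessionId + ".command"));
    }
}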
Use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
The class TokenPerLineCorpusReader, method hasNextSentence.
@Override
public boolean hasNextSentence() throws TalismaneException, IOException {
if (this.getMaxSentenceCount() > 0 && sentenceCount >= this.getMaxSentenceCount()) {
// we've reached the end, do nothing
} else {
while (sentenceLines == null) {
List<UnprocessedLine> lines = new ArrayList<>();
int skippedLineCount = 0;
if (!this.hasNextLine())
break;
while ((this.hasNextLine() || lines.size() > 0) && sentenceLines == null) {
String line = "";
if (this.hasNextLine())
line = this.nextLine().replace("\r", "");
lineNumber++;
if (LOG.isTraceEnabled())
LOG.trace("Line " + lineNumber + ": " + line);
if (line.length() > 0) {
boolean skip = false;
for (Pattern skipLinePattern : skipLinePatterns) {
if (skipLinePattern.matcher(line).matches()) {
if (LOG.isTraceEnabled())
LOG.trace("Skipping by pattern: " + skipLinePattern.pattern());
skip = true;
skippedLineCount++;
break;
}
}
List<CorpusSentenceRule> myRules = new ArrayList<>();
List<Matcher> myMatchers = new ArrayList<>();
for (CorpusSentenceRule sentenceRule : sentenceRules) {
Matcher matcher = sentenceRule.getPattern().matcher(line);
if (matcher.matches()) {
if (LOG.isTraceEnabled())
LOG.trace("Matched rule: " + sentenceRule);
myRules.add(sentenceRule);
myMatchers.add(matcher);
}
}
UnprocessedLine unprocessedLine = new UnprocessedLine(line, lineNumber, skip, myRules, myMatchers);
lines.add(unprocessedLine);
} else {
if (lines.size() == 0 || lines.size() == skippedLineCount) {
lines = new ArrayList<>();
skippedLineCount = 0;
continue;
}
// end of sentence
boolean includeMe = true;
// check cross-validation
if (this.getCrossValidationSize() > 0) {
if (this.getIncludeIndex() >= 0) {
if (sentenceCount % this.getCrossValidationSize() != this.getIncludeIndex()) {
includeMe = false;
}
} else if (this.getExcludeIndex() >= 0) {
if (sentenceCount % this.getCrossValidationSize() == this.getExcludeIndex()) {
includeMe = false;
}
}
}
if (this.getStartSentence() > sentenceCount) {
includeMe = false;
}
sentenceCount++;
LOG.debug("sentenceCount: " + sentenceCount);
if (!includeMe) {
lines = new ArrayList<>();
skippedLineCount = 0;
continue;
}
sentenceLines = new ArrayList<>();
for (UnprocessedLine unprocessedLine : lines) {
if (!unprocessedLine.skip) {
CorpusLine corpusLine = corpusLineReader.read(unprocessedLine.line, unprocessedLine.lineNumber);
sentenceLines.add(corpusLine);
if (this.lexicalEntryReader != null) {
WritableLexicalEntry lexicalEntry = new CompactLexicalEntry(lexicalEntrySupport);
this.lexicalEntryReader.readEntry(unprocessedLine.line, lexicalEntry);
corpusLine.setLexicalEntry(lexicalEntry);
}
}
}
List<CorpusSentenceRule.MergeAction> mergeActions = new ArrayList<>();
for (UnprocessedLine unprocessedLine : lines) {
if (LOG.isTraceEnabled())
LOG.trace("Line " + unprocessedLine);
for (int i = 0; i < unprocessedLine.sentenceRules.size(); i++) {
CorpusSentenceRule sentenceRule = unprocessedLine.sentenceRules.get(i);
Matcher matcher = unprocessedLine.matchers.get(i);
if (LOG.isTraceEnabled())
LOG.trace("Testing rule " + sentenceRule);
CorpusSentenceRule.Action action = sentenceRule.apply(unprocessedLine.line, unprocessedLine.lineNumber, matcher, sentenceLines);
if (LOG.isTraceEnabled())
LOG.trace("Result: " + action);
if (action != null) {
if (action instanceof MergeAction)
mergeActions.add((MergeAction) action);
break;
}
}
}
if (mergeActions.size() > 0) {
List<CorpusLine> newSentenceLines = new ArrayList<>();
Map<Integer, MergeAction> indexesToMerge = new TreeMap<>();
for (CorpusSentenceRule.MergeAction mergeAction : mergeActions) {
for (CorpusLine lineToMerge : mergeAction.getLinesToMerge()) {
indexesToMerge.put(lineToMerge.getIndex(), mergeAction);
}
}
int i = 1;
Iterator<Integer> iIndexToMerge = indexesToMerge.keySet().iterator();
int nextIndexToMerge = iIndexToMerge.next();
int linesRemoved = 0;
Map<Integer, Integer> indexChangeMap = new HashMap<>();
indexChangeMap.put(0, 0);
for (CorpusLine corpusLine : sentenceLines) {
if (i == nextIndexToMerge) {
MergeAction mergeAction = indexesToMerge.get(i);
if (i == mergeAction.getFirstIndex()) {
newSentenceLines.add(mergeAction.getMergedLine());
linesRemoved -= 1;
}
linesRemoved += 1;
if (iIndexToMerge.hasNext())
nextIndexToMerge = iIndexToMerge.next();
else
nextIndexToMerge = -1;
} else {
newSentenceLines.add(corpusLine);
}
indexChangeMap.put(i, i - linesRemoved);
i++;
}
for (CorpusLine corpusLine : newSentenceLines) {
corpusLine.setElement(CorpusElement.INDEX, "" + indexChangeMap.get(corpusLine.getIndex()));
int governorIndex = corpusLine.getGovernorIndex();
if (governorIndex >= 0)
corpusLine.setElement(CorpusElement.GOVERNOR, "" + indexChangeMap.get(corpusLine.getGovernorIndex()));
int nonProjGovernorIndex = corpusLine.getNonProjGovernorIndex();
if (nonProjGovernorIndex >= 0)
corpusLine.setElement(CorpusElement.NON_PROJ_GOVERNOR, "" + indexChangeMap.get(corpusLine.getNonProjGovernorIndex()));
}
sentenceLines = newSentenceLines;
}
Sentence sentence = null;
if (sentenceReader != null && sentenceReader.hasNextSentence()) {
sentence = sentenceReader.nextSentence();
} else {
LinguisticRules rules = TalismaneSession.get(sessionId).getLinguisticRules();
if (rules == null)
throw new TalismaneException("Linguistic rules have not been set.");
String text = "";
for (CorpusLine corpusLine : sentenceLines) {
String word = corpusLine.getElement(CorpusElement.TOKEN);
if (rules.shouldAddSpace(text, word))
text += " ";
text += word;
}
sentence = new Sentence(text, currentFile, sessionId);
}
for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
sentenceAnnotator.annotate(sentence);
}
this.processSentence(sentence, sentenceLines);
}
}
}
}
return (sentenceLines != null);
}
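The cross-validation block above keeps a sentence only when its index falls in the requested fold (include mode) or outside it (exclude mode), and additionally skips everything before the configured start sentence. Below is a standalone sketch of that selection logic with illustrative method and parameter names; the fold arithmetic is the same as in hasNextSentence() above.

public class FoldSelectionSketch {
    // Mirrors the includeMe decision in hasNextSentence(): foldCount is the
    // cross-validation size, includeIndex/excludeIndex select a fold (-1 = unused),
    // startSentence skips the beginning of the corpus.
    static boolean includeSentence(int sentenceCount, int foldCount, int includeIndex, int excludeIndex, int startSentence) {
        if (foldCount > 0) {
            if (includeIndex >= 0) {
                if (sentenceCount % foldCount != includeIndex)
                    return false; // keep only the evaluation fold
            } else if (excludeIndex >= 0) {
                if (sentenceCount % foldCount == excludeIndex)
                    return false; // drop the evaluation fold while training
            }
        }
        return sentenceCount >= startSentence;
    }

    public static void main(String[] args) {
        // With 10 folds and includeIndex 3, only sentences 3, 13, 23, ... are kept.
        for (int i = 0; i < 30; i++)
            if (includeSentence(i, 10, 3, -1, 0))
                System.out.println("keeping sentence " + i);
    }
}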
Use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
The class SentenceDetector, method detectSentences.
/**
* Detect sentences within an annotated text. Sentences are added in the form
* of an Annotation around a {@link SentenceBoundary}, with the start position
* (relative to the start of the annotated text) at the start of the sentence
* and the end position immediately after the end of the sentence. <br>
* <br>
* Sentence boundaries will not be detected within any annotation of type
* {@link RawTextNoSentenceBreakMarker}, nor will they be detected before or
* after the {@link AnnotatedText#getAnalysisStart()} and
* {@link AnnotatedText#getAnalysisEnd()} respectively. <br>
* <br>
* If the text contained existing {@link SentenceBoundary} annotations before
* analysis start, the first sentence will begin where the last existing
* annotation ended. Otherwise, the first boundary will begin at position 0.
* <br>
* <br>
* If the text's analysis end is equal to the text length, it is assumed that
* the text end is a sentence boundary. In this case, an additional sentence
* is added starting at the final detected boundary and ending at text end.
*
* @param text
* the annotated text in which we need to detect sentences.
* @return a List of integers marking the end position of each detected
* sentence boundary, in addition to the annotations added to the text.
*/
public List<Integer> detectSentences(AnnotatedText text, String... labels) throws TalismaneException {
LOG.debug("detectSentences");
List<Annotation<RawTextNoSentenceBreakMarker>> noSentenceBreakMarkers = text.getAnnotations(RawTextNoSentenceBreakMarker.class);
Matcher matcher = possibleBoundaryPattern.matcher(text.getText());
List<Integer> possibleBoundaries = new ArrayList<>();
while (matcher.find()) {
if (matcher.start() >= text.getAnalysisStart() && matcher.start() < text.getAnalysisEnd()) {
boolean noSentences = false;
int position = matcher.start();
for (Annotation<RawTextNoSentenceBreakMarker> noSentenceBreakMarker : noSentenceBreakMarkers) {
if (noSentenceBreakMarker.getStart() <= position && position < noSentenceBreakMarker.getEnd()) {
noSentences = true;
break;
}
}
if (!noSentences)
possibleBoundaries.add(position);
}
}
// collect all deterministic sentence boundaries
List<Annotation<RawTextSentenceBreakMarker>> sentenceBreakMarkers = text.getAnnotations(RawTextSentenceBreakMarker.class);
Set<Integer> guessedBoundaries = new TreeSet<>(sentenceBreakMarkers.stream().filter(f -> f.getEnd() >= text.getAnalysisStart()).map(f -> f.getEnd()).collect(Collectors.toList()));
// Share one token sequence for all possible boundaries, to avoid tokenising
// multiple times
Sentence sentence = new Sentence(text.getText(), sessionId);
TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
List<PossibleSentenceBoundary> boundaries = new ArrayList<>();
for (int possibleBoundary : possibleBoundaries) {
PossibleSentenceBoundary boundary = new PossibleSentenceBoundary(tokenSequence, possibleBoundary);
if (LOG.isTraceEnabled()) {
LOG.trace("Testing boundary: " + boundary);
LOG.trace(" at position: " + possibleBoundary);
}
List<FeatureResult<?>> featureResults = new ArrayList<>();
for (SentenceDetectorFeature<?> feature : features) {
RuntimeEnvironment env = new RuntimeEnvironment();
FeatureResult<?> featureResult = feature.check(boundary, env);
if (featureResult != null)
featureResults.add(featureResult);
}
if (LOG.isTraceEnabled()) {
SortedSet<String> featureResultSet = featureResults.stream().map(f -> f.toString()).collect(Collectors.toCollection(() -> new TreeSet<String>()));
for (String featureResultString : featureResultSet) {
LOG.trace(featureResultString);
}
}
List<Decision> decisions = this.decisionMaker.decide(featureResults);
if (LOG.isTraceEnabled()) {
for (Decision decision : decisions) {
LOG.trace(decision.getOutcome() + ": " + decision.getProbability());
}
}
if (decisions.get(0).getOutcome().equals(SentenceDetectorOutcome.IS_BOUNDARY.name())) {
if (LOG.isTraceEnabled()) {
LOG.trace("Adding boundary: " + possibleBoundary + 1);
}
guessedBoundaries.add(possibleBoundary + 1);
boundaries.add(boundary);
}
}
if (LOG.isTraceEnabled()) {
LOG.trace("context: " + text.getText().toString().replace('\n', '¶').replace('\r', '¶'));
for (PossibleSentenceBoundary boundary : boundaries) LOG.trace("boundary: " + boundary.toString());
}
if (LOG.isDebugEnabled())
LOG.debug("guessedBoundaries : " + guessedBoundaries.toString());
List<Annotation<SentenceBoundary>> newBoundaries = new ArrayList<>();
int lastBoundary = 0;
List<Annotation<SentenceBoundary>> existingBoundaries = text.getAnnotations(SentenceBoundary.class);
if (existingBoundaries.size() > 0) {
lastBoundary = existingBoundaries.get(existingBoundaries.size() - 1).getEnd();
}
// advance the boundary start until a non-space character is encountered
while (lastBoundary < text.getAnalysisEnd() && Character.isWhitespace(text.getText().charAt(lastBoundary))) {
lastBoundary++;
}
for (int guessedBoundary : guessedBoundaries) {
if (guessedBoundary > lastBoundary) {
Annotation<SentenceBoundary> sentenceBoundary = new Annotation<>(lastBoundary, guessedBoundary, new SentenceBoundary(), labels);
newBoundaries.add(sentenceBoundary);
if (LOG.isTraceEnabled()) {
LOG.trace("Added boundary: " + sentenceBoundary);
}
lastBoundary = guessedBoundary;
}
}
if (text.getAnalysisEnd() == text.getText().length()) {
if (text.getAnalysisEnd() > lastBoundary) {
Annotation<SentenceBoundary> sentenceBoundary = new Annotation<>(lastBoundary, text.getAnalysisEnd(), new SentenceBoundary(), labels);
newBoundaries.add(sentenceBoundary);
if (LOG.isTraceEnabled()) {
LOG.trace("Added final boundary: " + sentenceBoundary);
}
}
}
text.addAnnotations(newBoundaries);
return new ArrayList<>(guessedBoundaries);
}
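The returned list contains end offsets, each pointing immediately after the last character of a detected sentence. A caller holding the original text can slice it into sentence strings as sketched below; this assumes the offsets are sorted ascending (they come from a TreeSet above) and that the analysed span covers the whole text. The class and helper names are illustrative.

import java.util.ArrayList;
import java.util.List;

public class BoundarySplitSketch {
    // Splits text into sentences using the end offsets returned by detectSentences().
    static List<String> splitByBoundaries(String text, List<Integer> boundaryEnds) {
        List<String> sentences = new ArrayList<>();
        int start = 0;
        for (int end : boundaryEnds) {
            // each offset points just past the final character of a sentence
            sentences.add(text.substring(start, end).trim());
            start = end;
        }
        if (start < text.length())
            sentences.add(text.substring(start).trim()); // trailing text without a detected boundary
        return sentences;
    }

    public static void main(String[] args) {
        String text = "First sentence. Second sentence. And a third one.";
        // Offsets as detectSentences() would report them: just after each full stop.
        List<Integer> ends = List.of(15, 32, 49);
        splitByBoundaries(text, ends).forEach(System.out::println);
    }
}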
Use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
The class ParserEvaluator, method evaluate.
/**
* @throws TalismaneException
* if an attempt is made to evaluate with a tokeniser but no
* pos-tagger
* @throws IOException
*/
public void evaluate() throws TalismaneException, IOException {
while (corpusReader.hasNextSentence()) {
ParseConfiguration realConfiguration = corpusReader.nextConfiguration();
List<PosTagSequence> posTagSequences = null;
List<TokenSequence> tokenSequences = null;
if (tokeniser != null) {
if (posTagger == null)
throw new TalismaneException("Cannot evaluate with tokeniser but no pos-tagger");
Sentence sentence = realConfiguration.getPosTagSequence().getTokenSequence().getSentence();
// apply the session's sentence annotators before tokenising
for (SentenceAnnotator annotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
annotator.annotate(sentence);
if (LOG.isTraceEnabled()) {
LOG.trace("TokenFilter: " + annotator);
LOG.trace("annotations: " + sentence.getAnnotations());
}
}
tokenSequences = tokeniser.tokenise(sentence);
} else {
tokenSequences = new ArrayList<TokenSequence>();
PosTagSequence posTagSequence = realConfiguration.getPosTagSequence().clonePosTagSequence();
posTagSequence.removeRoot();
tokenSequences.add(posTagSequence.getTokenSequence());
}
if (posTagger != null) {
if (posTagger instanceof NonDeterministicPosTagger) {
NonDeterministicPosTagger nonDeterministicPosTagger = (NonDeterministicPosTagger) posTagger;
posTagSequences = nonDeterministicPosTagger.tagSentence(tokenSequences);
} else {
posTagSequences = new ArrayList<PosTagSequence>();
PosTagSequence posTagSequence = null;
posTagSequence = posTagger.tagSentence(tokenSequences.get(0));
posTagSequences.add(posTagSequence);
}
} else {
PosTagSequence posTagSequence = realConfiguration.getPosTagSequence();
posTagSequences = new ArrayList<PosTagSequence>();
posTagSequences.add(posTagSequence);
}
for (ParseEvaluationObserver observer : this.observers) {
observer.onParseStart(realConfiguration, posTagSequences);
}
List<ParseConfiguration> guessedConfigurations = null;
if (parser instanceof NonDeterministicParser) {
NonDeterministicParser nonDeterministicParser = (NonDeterministicParser) parser;
guessedConfigurations = nonDeterministicParser.parseSentence(posTagSequences);
} else {
ParseConfiguration bestGuess = parser.parseSentence(posTagSequences.get(0));
guessedConfigurations = new ArrayList<ParseConfiguration>();
guessedConfigurations.add(bestGuess);
}
for (ParseEvaluationObserver observer : this.observers) {
observer.onParseEnd(realConfiguration, guessedConfigurations);
}
}
for (ParseEvaluationObserver observer : this.observers) {
observer.onEvaluationComplete();
}
}
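All reporting is delegated to observers: onParseStart fires before each sentence is parsed, onParseEnd receives the gold configuration together with the guessed ones, and onEvaluationComplete fires once at the end. The sketch below illustrates that callback pattern with hypothetical stand-in types; it is not Talismane's actual ParseEvaluationObserver interface, which may declare additional methods.

import java.util.List;

public class ObserverPatternSketch {
    // Hypothetical observer mirroring the three callbacks invoked by evaluate() above.
    interface EvaluationObserver<GOLD, GUESS> {
        default void onStart(GOLD gold) {}
        default void onEnd(GOLD gold, List<GUESS> guesses) {}
        default void onComplete() {}
    }

    // A trivial observer that simply counts how many sentences were evaluated.
    static class SentenceCounter implements EvaluationObserver<String, String> {
        private int count = 0;
        @Override
        public void onEnd(String gold, List<String> guesses) {
            count++;
        }
        @Override
        public void onComplete() {
            System.out.println("Sentences evaluated: " + count);
        }
    }

    public static void main(String[] args) {
        SentenceCounter counter = new SentenceCounter();
        // Simulate the evaluation loop's notification sequence.
        for (String gold : List.of("sent1", "sent2", "sent3")) {
            counter.onStart(gold);
            counter.onEnd(gold, List.of(gold));
        }
        counter.onComplete();
    }
}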
Use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.
The class PosTaggerEvaluator, method evaluate.
/**
* Evaluate a given pos tagger.
*
* @throws TalismaneException
* @throws IOException
*/
public void evaluate() throws TalismaneException, IOException {
while (corpusReader.hasNextSentence()) {
PosTagSequence realPosTagSequence = corpusReader.nextPosTagSequence();
List<TokenSequence> tokenSequences = null;
List<PosTagSequence> guessedSequences = null;
TokenSequence tokenSequence = realPosTagSequence.getTokenSequence();
PosTagSequence guessedSequence = null;
if (this.tokeniser != null) {
Sentence sentence = tokenSequence.getSentence();
tokenSequences = tokeniser.tokenise(sentence);
tokenSequence = tokenSequences.get(0);
} else {
tokenSequences = new ArrayList<TokenSequence>();
tokenSequences.add(tokenSequence);
}
if (posTagger instanceof NonDeterministicPosTagger) {
NonDeterministicPosTagger nonDeterministicPosTagger = (NonDeterministicPosTagger) posTagger;
guessedSequences = nonDeterministicPosTagger.tagSentence(tokenSequences);
guessedSequence = guessedSequences.get(0);
} else {
guessedSequence = posTagger.tagSentence(tokenSequence);
}
if (LOG.isDebugEnabled()) {
StringBuilder stringBuilder = new StringBuilder();
for (PosTaggedToken posTaggedToken : guessedSequence) {
Set<String> lemmas = new TreeSet<String>();
stringBuilder.append(posTaggedToken.getToken().getOriginalText());
stringBuilder.append("[" + posTaggedToken.getTag());
List<LexicalEntry> entries = posTaggedToken.getLexicalEntries();
boolean dropCurrentWord = false;
if (entries.size() > 1)
dropCurrentWord = true;
for (LexicalEntry entry : posTaggedToken.getLexicalEntries()) {
if (!lemmas.contains(entry.getLemma())) {
if (dropCurrentWord && posTaggedToken.getToken().getText().equals(entry.getLemma())) {
dropCurrentWord = false;
continue;
}
stringBuilder.append("|" + entry.getLemma());
// stringBuilder.append("/" + entry.getCategory());
stringBuilder.append("/" + entry.getMorphology());
lemmas.add(entry.getLemma());
}
}
stringBuilder.append("] ");
}
LOG.debug(stringBuilder.toString());
}
for (PosTagEvaluationObserver observer : this.observers) {
observer.onNextPosTagSequence(realPosTagSequence, guessedSequences);
}
}
for (PosTagEvaluationObserver observer : this.observers) {
observer.onEvaluationComplete();
}
}
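The debug block above renders each guessed token as text[TAG|lemma/morphology|...], skipping lemmas already seen and, when a token has several lexical entries, dropping the lemma identical to the surface form. Here is a simplified standalone sketch of that formatting rule, using plain strings in place of Talismane's PosTaggedToken and LexicalEntry types; the class and helper names, as well as the French example, are illustrative.

import java.util.List;
import java.util.Set;
import java.util.TreeSet;

public class DebugFormatSketch {
    // entries: each element is {lemma, morphology}, in lexicon order.
    static String formatToken(String text, String tag, List<String[]> entries) {
        StringBuilder sb = new StringBuilder(text).append("[").append(tag);
        Set<String> seenLemmas = new TreeSet<>();
        // when several entries exist, skip the first lemma equal to the surface form
        boolean dropSurfaceLemma = entries.size() > 1;
        for (String[] entry : entries) {
            String lemma = entry[0];
            if (seenLemmas.contains(lemma))
                continue;
            if (dropSurfaceLemma && text.equals(lemma)) {
                dropSurfaceLemma = false;
                continue;
            }
            sb.append("|").append(lemma).append("/").append(entry[1]);
            seenLemmas.add(lemma);
        }
        return sb.append("] ").toString();
    }

    public static void main(String[] args) {
        // e.g. a verb form with two candidate lexical entries sharing one lemma
        System.out.println(formatToken("mange", "V",
                List.of(new String[] { "manger", "ind.pres.3s" }, new String[] { "manger", "ind.pres.1s" })));
    }
}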