Use of com.joliciel.talismane.sentenceDetector.SentenceDetectorTrainer in project talismane by joliciel-informatique.
The execute method of the class TalismaneMain.
/**
 * Execute Talismane based on the configuration provided.
 * <p>
 * The command of the session identified by {@code sessionId} selects one of:
 * analyse, train, evaluate, compare or process. Within each command, the
 * session's module (or, for analyse, the configured start and end modules)
 * selects the concrete component that is run.
 *
 * @param sessionId
 *          The current session's id
 * @param inFile
 *          The file or directory to analyse
 * @param outFile
 *          The file or directory to write the analysis.
 * @param outDir
 *          The directory for writing additional output files (other than the
 *          main analysis).
 * @param evalFile
 *          For compare: the file read as the second input, which is handed to
 *          the comparator together with the reader on inFile.
 * @param keepDirectoryStructure
 *          For analyse and process: if true, and inFile is a directory,
 *          outFile will be interpreted as a directory and the inFile
 *          directory structure will be maintained
 * @throws IOException
 *           for the process command, if a processor failed while being
 *           closed (the last such failure is rethrown once all processors
 *           have been handled); may also come from the underlying readers,
 *           writers and components
 * @throws ReflectiveOperationException
 * @throws TalismaneException
 *           if attempt is made to start and end on two unsupported modules,
 *           or if the evaluate, compare or process command is used with a
 *           module it does not support.
 * @throws SentenceAnnotatorLoadException
 */
public void execute(String sessionId, File inFile, File outFile, File outDir, File evalFile, boolean keepDirectoryStructure)
    throws IOException, ReflectiveOperationException, TalismaneException, SentenceAnnotatorLoadException {
  long startTime = System.currentTimeMillis();
  TalismaneSession session = TalismaneSession.get(sessionId);
  session.setFileForBasename(inFile);
  Config config = ConfigFactory.load();
  try {
    switch (session.getCommand()) {
    case analyse: {
      Module startModule = Module.valueOf(config.getString("talismane.core." + sessionId + ".analysis.start-module"));
      Module endModule = Module.valueOf(config.getString("talismane.core." + sessionId + ".analysis.end-module"));
      Reader reader = getReader(inFile, true, sessionId);
      Writer writer = getWriter(outFile, inFile, keepDirectoryStructure, reader, sessionId);
      if (startModule == Module.languageDetector) {
        // Language detection can only be run on its own.
        if (endModule != Module.languageDetector) {
          throw new TalismaneException(
              "Talismane does not currently support analysis starting with " + startModule.name() + " and ending with another module.");
        }
        LanguageDetector languageDetector = LanguageDetector.getInstance(sessionId);
        LanguageDetectorProcessor processor = LanguageDetectorProcessor.getProcessor(writer, sessionId);
        SentenceDetectorAnnotatedCorpusReader corpusReader = SentenceDetectorAnnotatedCorpusReader.getCorpusReader(reader,
            config.getConfig("talismane.core." + sessionId + ".language-detector.input"), sessionId);
        while (corpusReader.hasNextSentence()) {
          String sentence = corpusReader.nextSentence().getText().toString();
          List<WeightedOutcome<Locale>> results = languageDetector.detectLanguages(sentence);
          processor.onNextText(sentence, results);
        }
      } else {
        Mode mode = Mode.valueOf(config.getString("talismane.core." + sessionId + ".mode"));
        switch (mode) {
        case normal:
          Talismane talismane = new Talismane(writer, outDir, sessionId);
          talismane.analyse(reader);
          break;
        case server:
          TalismaneServer talismaneServer = new TalismaneServer(sessionId);
          talismaneServer.analyse();
          break;
        }
      }
      break;
    }
    case train: {
      Reader reader = getReader(inFile, false, sessionId);
      switch (session.getModule()) {
      case languageDetector: {
        // The language detector trainer is built from the session alone.
        LanguageDetectorTrainer trainer = new LanguageDetectorTrainer(sessionId);
        trainer.train();
        break;
      }
      case sentenceDetector: {
        SentenceDetectorTrainer trainer = new SentenceDetectorTrainer(reader, sessionId);
        trainer.train();
        break;
      }
      case tokeniser: {
        PatternTokeniserTrainer trainer = new PatternTokeniserTrainer(reader, sessionId);
        trainer.train();
        break;
      }
      case posTagger: {
        PosTaggerTrainer trainer = new PosTaggerTrainer(reader, sessionId);
        trainer.train();
        break;
      }
      case parser: {
        ParserTrainer trainer = new ParserTrainer(reader, sessionId);
        trainer.train();
        break;
      }
      }
      break;
    }
    case evaluate: {
      Reader reader = getReader(inFile, false, sessionId);
      switch (session.getModule()) {
      case sentenceDetector: {
        SentenceDetectorEvaluator evaluator = new SentenceDetectorEvaluator(reader, outDir, sessionId);
        evaluator.evaluate();
        break;
      }
      case tokeniser: {
        TokeniserEvaluator evaluator = new TokeniserEvaluator(reader, outDir, sessionId);
        evaluator.evaluate();
        break;
      }
      case posTagger: {
        PosTaggerEvaluator evaluator = new PosTaggerEvaluator(reader, outDir, sessionId);
        evaluator.evaluate();
        break;
      }
      case parser: {
        ParserEvaluator evaluator = new ParserEvaluator(reader, outDir, sessionId);
        evaluator.evaluate();
        break;
      }
      default:
        throw new TalismaneException("Command '" + session.getCommand() + "' does not yet support module: " + session.getModule());
      }
      break;
    }
    case compare: {
      Reader reader = getReader(inFile, false, sessionId);
      Reader evalReader = getReader(evalFile, false, sessionId);
      switch (session.getModule()) {
      case tokeniser: {
        TokenComparator comparator = new TokenComparator(reader, evalReader, outDir, sessionId);
        comparator.compare();
        break;
      }
      case posTagger: {
        PosTagComparator comparator = new PosTagComparator(reader, evalReader, outDir, sessionId);
        comparator.evaluate();
        break;
      }
      case parser: {
        ParseComparator comparator = new ParseComparator(reader, evalReader, outDir, sessionId);
        comparator.evaluate();
        break;
      }
      default:
        throw new TalismaneException("Command '" + session.getCommand() + "' does not yet support module: " + session.getModule());
      }
      break;
    }
    case process: {
      Reader reader = getReader(inFile, false, sessionId);
      Writer writer = getWriter(outFile, inFile, keepDirectoryStructure, reader, sessionId);
      // Track the file the current sentence came from, so that observers are
      // only notified when the corpus reader moves on to a different file.
      File currentFile = null;
      URI currentURI = null;
      // Holds the last failure seen while closing processors; it is rethrown
      // after the module switch so that every processor gets a close attempt.
      IOException ioException = null;
      switch (session.getModule()) {
      case sentenceDetector: {
        List<SentenceProcessor> processors = SentenceProcessor.getProcessors(writer, outDir, sessionId);
        try {
          SentenceDetectorAnnotatedCorpusReader corpusReader = SentenceDetectorAnnotatedCorpusReader.getCorpusReader(reader,
              config.getConfig("talismane.core." + sessionId + ".sentence-detector.input"), sessionId);
          while (corpusReader.hasNextSentence()) {
            Sentence sentence = corpusReader.nextSentence();
            if (sentence.getFileURI() != null && !sentence.getFileURI().equals(currentURI)) {
              currentURI = sentence.getFileURI();
              currentFile = sentence.getFile();
              if (writer instanceof CurrentFileObserver) {
                ((CurrentFileObserver) writer).onNextFile(currentFile);
              }
              for (SentenceProcessor processor : processors) {
                if (processor instanceof CurrentFileObserver) {
                  ((CurrentFileObserver) processor).onNextFile(currentFile);
                }
              }
            }
            for (SentenceProcessor processor : processors) {
              processor.onNextSentence(sentence);
            }
          }
        } finally {
          for (SentenceProcessor processor : processors) {
            try {
              processor.close();
            } catch (IOException e) {
              LogUtils.logError(LOG, e);
              ioException = e;
            }
          }
        }
        break;
      }
      case tokeniser: {
        List<TokenSequenceProcessor> processors = TokenSequenceProcessor.getProcessors(writer, outDir, sessionId);
        try {
          TokeniserAnnotatedCorpusReader corpusReader = TokeniserAnnotatedCorpusReader.getCorpusReader(reader,
              config.getConfig("talismane.core." + sessionId + ".tokeniser.input"), sessionId);
          while (corpusReader.hasNextSentence()) {
            TokenSequence tokenSequence = corpusReader.nextTokenSequence();
            Sentence sentence = tokenSequence.getSentence();
            // Compare the URI against the previously seen URI. Comparing it
            // against currentFile (a File) can never be equal, which would
            // notify the observers on every single token sequence.
            if (sentence.getFileURI() != null && !sentence.getFileURI().equals(currentURI)) {
              currentURI = sentence.getFileURI();
              currentFile = sentence.getFile();
              if (writer instanceof CurrentFileObserver) {
                ((CurrentFileObserver) writer).onNextFile(currentFile);
              }
              for (TokenSequenceProcessor processor : processors) {
                if (processor instanceof CurrentFileObserver) {
                  ((CurrentFileObserver) processor).onNextFile(currentFile);
                }
              }
            }
            for (TokenSequenceProcessor processor : processors) {
              processor.onNextTokenSequence(tokenSequence);
            }
          }
        } finally {
          for (TokenSequenceProcessor processor : processors) {
            try {
              processor.close();
            } catch (IOException e) {
              LogUtils.logError(LOG, e);
              ioException = e;
            }
          }
        }
        break;
      }
      case posTagger: {
        List<PosTagSequenceProcessor> processors = PosTagSequenceProcessor.getProcessors(writer, outDir, sessionId);
        try {
          PosTagAnnotatedCorpusReader corpusReader = PosTagAnnotatedCorpusReader.getCorpusReader(reader,
              config.getConfig("talismane.core." + sessionId + ".pos-tagger.input"), sessionId);
          while (corpusReader.hasNextSentence()) {
            PosTagSequence posTagSequence = corpusReader.nextPosTagSequence();
            Sentence sentence = posTagSequence.getTokenSequence().getSentence();
            if (sentence.getFile() != null && !sentence.getFile().equals(currentFile)) {
              currentFile = sentence.getFile();
              if (writer instanceof CurrentFileObserver) {
                ((CurrentFileObserver) writer).onNextFile(currentFile);
              }
              for (PosTagSequenceProcessor processor : processors) {
                if (processor instanceof CurrentFileObserver) {
                  ((CurrentFileObserver) processor).onNextFile(currentFile);
                }
              }
            }
            for (PosTagSequenceProcessor processor : processors) {
              processor.onNextPosTagSequence(posTagSequence);
            }
          }
        } finally {
          for (PosTagSequenceProcessor processor : processors) {
            try {
              processor.onCompleteAnalysis();
              processor.close();
            } catch (IOException e) {
              LogUtils.logError(LOG, e);
              ioException = e;
            }
          }
        }
        break;
      }
      case parser: {
        List<ParseConfigurationProcessor> processors = ParseConfigurationProcessor.getProcessors(writer, outDir, sessionId);
        try {
          ParserAnnotatedCorpusReader corpusReader = ParserAnnotatedCorpusReader.getCorpusReader(reader,
              config.getConfig("talismane.core." + sessionId + ".parser.input"), sessionId);
          while (corpusReader.hasNextSentence()) {
            ParseConfiguration configuration = corpusReader.nextConfiguration();
            Sentence sentence = configuration.getSentence();
            if (sentence.getFile() != null && !sentence.getFile().equals(currentFile)) {
              currentFile = sentence.getFile();
              if (writer instanceof CurrentFileObserver) {
                ((CurrentFileObserver) writer).onNextFile(currentFile);
              }
              for (ParseConfigurationProcessor processor : processors) {
                if (processor instanceof CurrentFileObserver) {
                  ((CurrentFileObserver) processor).onNextFile(currentFile);
                }
              }
            }
            for (ParseConfigurationProcessor processor : processors) {
              processor.onNextParseConfiguration(configuration);
            }
          }
        } finally {
          for (ParseConfigurationProcessor processor : processors) {
            try {
              processor.onCompleteParse();
              processor.close();
            } catch (IOException e) {
              LogUtils.logError(LOG, e);
              ioException = e;
            }
          }
        }
        break;
      }
      default:
        throw new TalismaneException("Command '" + session.getCommand() + "' does not yet support module: " + session.getModule());
      }
      if (ioException != null) {
        throw ioException;
      }
      break;
    }
    }
  } finally {
    long endTime = System.currentTimeMillis();
    long totalTime = endTime - startTime;
    LOG.debug("Total time for Talismane.process(): " + totalTime);
    if (config.getBoolean("talismane.core." + sessionId + ".output.log-execution-time")) {
      // Writing the statistics is best-effort: any failure is logged and
      // never replaces the outcome of the command itself.
      try {
        CSVFormatter csvFormatter = new CSVFormatter();
        File csvFile = new File(outDir, session.getBaseName() + ".stats.csv");
        csvFile.delete();
        csvFile.createNewFile();
        // try-with-resources closes the writer even if write or flush fails.
        try (Writer csvFileWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(csvFile, false), "UTF8"))) {
          csvFileWriter.write(csvFormatter.format("total time") + csvFormatter.format(totalTime) + "\n");
          csvFileWriter.flush();
        }
      } catch (Exception e) {
        LogUtils.logError(LOG, e);
      }
    }
  }
}
Aggregations