Example use of com.joliciel.talismane.rawText.Sentence in the talismane project by joliciel-informatique,
taken from the class RegexTokenAnnotatorTest, method testApplyWithConsecutiveDollars.
@Test
public void testApplyWithConsecutiveDollars() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();
    final String sessionId = "test";
    // Email pattern: group 1 captures the local part, group 2 the "@domain" suffix.
    final String emailRegex = "\\b([\\w.%-]+)(@[-.\\w]+\\.[A-Za-z]{2,4})\\b";
    // Replacement mixes an escaped literal "$" ("\\$Email") with the two
    // capture groups in swapped order, so "$Email", "$2" and "$1" sit side by side.
    final String emailReplacement = "\\$Email$2$1";
    final RegexTokenAnnotator annotator = new RegexTokenAnnotator(emailRegex, emailReplacement, null, sessionId);
    final Sentence sentence = new Sentence("My address is joe.schmoe@test.com.", sessionId);
    annotator.annotate(sentence);
    final List<Annotation<TokenPlaceholder>> emailPlaceholders = sentence.getAnnotations(TokenPlaceholder.class);
    LOG.debug(emailPlaceholders.toString());
    // Exactly one placeholder, covering "joe.schmoe@test.com" (offsets 14..33).
    assertEquals(1, emailPlaceholders.size());
    final Annotation<TokenPlaceholder> emailPlaceholder = emailPlaceholders.get(0);
    assertEquals(14, emailPlaceholder.getStart());
    assertEquals(33, emailPlaceholder.getEnd());
    // The escaped "$" survives as a literal and the groups appear swapped.
    assertEquals("$Email@test.comjoe.schmoe", emailPlaceholder.getData().getReplacement());
}
Example use of com.joliciel.talismane.rawText.Sentence in the talismane project by joliciel-informatique,
taken from the class RegexTokenAnnotatorTest, method testStartOfInput.
@Test
public void testStartOfInput() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();
    final String sessionId = "test";
    // Anchored pattern: should only match "Résumé." at the very start of the text.
    final String anchoredRegex = "^Résumé\\.";
    final String replacement = null;
    final RegexTokenAnnotator annotator = new RegexTokenAnnotator(anchoredRegex, replacement, null, sessionId);
    annotator.addAttribute("TAG", new StringAttribute("TAG", "skip"));
    final Sentence sentence = new Sentence("Résumé. Résumé des attaques", sessionId);
    annotator.annotate(sentence);
    @SuppressWarnings("rawtypes")
    final List<Annotation<TokenAttribute>> tagAnnotations = sentence.getAnnotations(TokenAttribute.class);
    LOG.debug(tagAnnotations.toString());
    // Thanks to the ^ anchor, only the first "Résumé." is annotated.
    assertEquals(1, tagAnnotations.size());
    @SuppressWarnings("rawtypes")
    final Annotation<TokenAttribute> tagAnnotation = tagAnnotations.get(0);
    assertEquals(0, tagAnnotation.getStart());
    assertEquals(7, tagAnnotation.getEnd());
    assertEquals("TAG", tagAnnotation.getData().getKey());
}
Example use of com.joliciel.talismane.rawText.Sentence in the talismane project by joliciel-informatique,
taken from the class PretokenisedSequenceTest, method testAddTokenString.
@Test
public void testAddTokenString() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();
    final String sessionId = "test";
    // Sentence containing French guillemets with attached spaces: each quote
    // plus its adjoining space is added below as a single token, so the
    // sequence must line the tokens up against the original sentence text.
    final Sentence sentence = new Sentence("« Il est là. »", sessionId);
    PretokenisedSequence sequence = new PretokenisedSequence(sentence, sessionId);
    sequence.addToken("« ");
    sequence.addToken("Il");
    sequence.addToken("est");
    sequence.addToken("là");
    sequence.addToken(".");
    sequence.addToken(" »");
    // Log via LOG.debug rather than System.out.println, for consistency with
    // the other tests in this file and so output obeys the logging config.
    LOG.debug(sequence.toString());
    assertEquals(6, sequence.size());
}
Example use of com.joliciel.talismane.rawText.Sentence in the talismane project by joliciel-informatique,
taken from the class TalismaneMain, method execute.
/**
 * Execute Talismane based on the configuration provided.
 *
 * @param sessionId
 *          The current session's id
 * @param inFile
 *          The file or directory to analyse
 * @param outFile
 *          The file or directory to write the analysis.
 * @param outDir
 *          The directory for writing additional output files (other than the
 *          main analysis).
 * @param evalFile
 *          The reference file to compare against (used by the compare command).
 * @param keepDirectoryStructure
 *          For analyse and process: if true, and inFile is a directory,
 *          outFile will be interpreted as a directory and the inFile
 *          directory structure will be maintained
 * @throws IOException
 * @throws ReflectiveOperationException
 * @throws TalismaneException
 *           if attempt is made to start and end on two unsupported modules.
 * @throws SentenceAnnotatorLoadException
 */
public void execute(String sessionId, File inFile, File outFile, File outDir, File evalFile, boolean keepDirectoryStructure) throws IOException, ReflectiveOperationException, TalismaneException, SentenceAnnotatorLoadException {
    long startTime = System.currentTimeMillis();
    TalismaneSession session = TalismaneSession.get(sessionId);
    session.setFileForBasename(inFile);
    Config config = ConfigFactory.load();
    try {
        // Dispatch on the top-level command: analyse, train, evaluate, compare or process.
        switch (session.getCommand()) {
        case analyse: {
            Module startModule = Module.valueOf(config.getString("talismane.core." + sessionId + ".analysis.start-module"));
            Module endModule = Module.valueOf(config.getString("talismane.core." + sessionId + ".analysis.end-module"));
            Reader reader = getReader(inFile, true, sessionId);
            Writer writer = getWriter(outFile, inFile, keepDirectoryStructure, reader, sessionId);
            if (startModule == Module.languageDetector) {
                // Language detection cannot be chained into the rest of the pipeline.
                if (endModule != Module.languageDetector)
                    throw new TalismaneException("Talismane does not currently support analysis starting with " + startModule.name() + " and ending with another module.");
                LanguageDetector languageDetector = LanguageDetector.getInstance(sessionId);
                LanguageDetectorProcessor processor = LanguageDetectorProcessor.getProcessor(writer, sessionId);
                SentenceDetectorAnnotatedCorpusReader corpusReader = SentenceDetectorAnnotatedCorpusReader.getCorpusReader(reader,
                        config.getConfig("talismane.core." + sessionId + ".language-detector.input"), sessionId);
                while (corpusReader.hasNextSentence()) {
                    String sentence = corpusReader.nextSentence().getText().toString();
                    List<WeightedOutcome<Locale>> results = languageDetector.detectLanguages(sentence);
                    processor.onNextText(sentence, results);
                }
            } else {
                Mode mode = Mode.valueOf(config.getString("talismane.core." + sessionId + ".mode"));
                switch (mode) {
                case normal:
                    Talismane talismane = new Talismane(writer, outDir, sessionId);
                    talismane.analyse(reader);
                    break;
                case server:
                    TalismaneServer talismaneServer = new TalismaneServer(sessionId);
                    talismaneServer.analyse();
                    break;
                }
            }
            break;
        }
        case train: {
            Reader reader = getReader(inFile, false, sessionId);
            switch (session.getModule()) {
            case languageDetector: {
                // The language detector trainer reads its corpus from configuration.
                LanguageDetectorTrainer trainer = new LanguageDetectorTrainer(sessionId);
                trainer.train();
                break;
            }
            case sentenceDetector: {
                SentenceDetectorTrainer trainer = new SentenceDetectorTrainer(reader, sessionId);
                trainer.train();
                break;
            }
            case tokeniser: {
                PatternTokeniserTrainer trainer = new PatternTokeniserTrainer(reader, sessionId);
                trainer.train();
                break;
            }
            case posTagger: {
                PosTaggerTrainer trainer = new PosTaggerTrainer(reader, sessionId);
                trainer.train();
                break;
            }
            case parser: {
                ParserTrainer trainer = new ParserTrainer(reader, sessionId);
                trainer.train();
                break;
            }
            }
            break;
        }
        case evaluate: {
            Reader reader = getReader(inFile, false, sessionId);
            switch (session.getModule()) {
            case sentenceDetector: {
                SentenceDetectorEvaluator evaluator = new SentenceDetectorEvaluator(reader, outDir, sessionId);
                evaluator.evaluate();
                break;
            }
            case tokeniser: {
                TokeniserEvaluator evaluator = new TokeniserEvaluator(reader, outDir, sessionId);
                evaluator.evaluate();
                break;
            }
            case posTagger: {
                PosTaggerEvaluator evaluator = new PosTaggerEvaluator(reader, outDir, sessionId);
                evaluator.evaluate();
                break;
            }
            case parser: {
                ParserEvaluator evaluator = new ParserEvaluator(reader, outDir, sessionId);
                evaluator.evaluate();
                break;
            }
            default:
                throw new TalismaneException("Command '" + session.getCommand() + "' does not yet support module: " + session.getModule());
            }
            break;
        }
        case compare: {
            Reader reader = getReader(inFile, false, sessionId);
            Reader evalReader = getReader(evalFile, false, sessionId);
            switch (session.getModule()) {
            case tokeniser: {
                TokenComparator comparator = new TokenComparator(reader, evalReader, outDir, sessionId);
                comparator.compare();
                break;
            }
            case posTagger: {
                PosTagComparator comparator = new PosTagComparator(reader, evalReader, outDir, sessionId);
                comparator.evaluate();
                break;
            }
            case parser: {
                ParseComparator comparator = new ParseComparator(reader, evalReader, outDir, sessionId);
                comparator.evaluate();
                break;
            }
            default:
                throw new TalismaneException("Command '" + session.getCommand() + "' does not yet support module: " + session.getModule());
            }
            break;
        }
        case process: {
            Reader reader = getReader(inFile, false, sessionId);
            Writer writer = getWriter(outFile, inFile, keepDirectoryStructure, reader, sessionId);
            // Track the file currently being processed so that CurrentFileObserver
            // implementations can be notified whenever the corpus moves to a new file.
            File currentFile = null;
            URI currentURI = null;
            // Remember the last close() failure so it can be rethrown after all
            // processors have had a chance to close.
            IOException ioException = null;
            switch (session.getModule()) {
            case sentenceDetector: {
                List<SentenceProcessor> processors = SentenceProcessor.getProcessors(writer, outDir, sessionId);
                try {
                    SentenceDetectorAnnotatedCorpusReader corpusReader = SentenceDetectorAnnotatedCorpusReader.getCorpusReader(reader,
                            config.getConfig("talismane.core." + sessionId + ".sentence-detector.input"), sessionId);
                    while (corpusReader.hasNextSentence()) {
                        Sentence sentence = corpusReader.nextSentence();
                        if (sentence.getFileURI() != null && !sentence.getFileURI().equals(currentURI)) {
                            currentURI = sentence.getFileURI();
                            currentFile = sentence.getFile();
                            if (writer instanceof CurrentFileObserver)
                                ((CurrentFileObserver) writer).onNextFile(currentFile);
                            for (SentenceProcessor processor : processors)
                                if (processor instanceof CurrentFileObserver)
                                    ((CurrentFileObserver) processor).onNextFile(currentFile);
                        }
                        for (SentenceProcessor processor : processors)
                            processor.onNextSentence(sentence);
                    }
                } finally {
                    for (SentenceProcessor processor : processors) {
                        try {
                            processor.close();
                        } catch (IOException e) {
                            LogUtils.logError(LOG, e);
                            ioException = e;
                        }
                    }
                }
                break;
            }
            case tokeniser: {
                List<TokenSequenceProcessor> processors = TokenSequenceProcessor.getProcessors(writer, outDir, sessionId);
                try {
                    TokeniserAnnotatedCorpusReader corpusReader = TokeniserAnnotatedCorpusReader.getCorpusReader(reader,
                            config.getConfig("talismane.core." + sessionId + ".tokeniser.input"), sessionId);
                    while (corpusReader.hasNextSentence()) {
                        TokenSequence tokenSequence = corpusReader.nextTokenSequence();
                        Sentence sentence = tokenSequence.getSentence();
                        // BUG FIX: previously this compared the sentence's URI against
                        // currentFile (a File), which is never equal, so per-file
                        // observers were never notified here. Compare File to File,
                        // as the posTagger and parser cases below do.
                        if (sentence.getFile() != null && !sentence.getFile().equals(currentFile)) {
                            currentFile = sentence.getFile();
                            if (writer instanceof CurrentFileObserver)
                                ((CurrentFileObserver) writer).onNextFile(currentFile);
                            for (TokenSequenceProcessor processor : processors)
                                if (processor instanceof CurrentFileObserver)
                                    ((CurrentFileObserver) processor).onNextFile(currentFile);
                        }
                        for (TokenSequenceProcessor processor : processors)
                            processor.onNextTokenSequence(tokenSequence);
                    }
                } finally {
                    for (TokenSequenceProcessor processor : processors) {
                        try {
                            processor.close();
                        } catch (IOException e) {
                            LogUtils.logError(LOG, e);
                            ioException = e;
                        }
                    }
                }
                break;
            }
            case posTagger: {
                List<PosTagSequenceProcessor> processors = PosTagSequenceProcessor.getProcessors(writer, outDir, sessionId);
                try {
                    PosTagAnnotatedCorpusReader corpusReader = PosTagAnnotatedCorpusReader.getCorpusReader(reader,
                            config.getConfig("talismane.core." + sessionId + ".pos-tagger.input"), sessionId);
                    while (corpusReader.hasNextSentence()) {
                        PosTagSequence posTagSequence = corpusReader.nextPosTagSequence();
                        Sentence sentence = posTagSequence.getTokenSequence().getSentence();
                        if (sentence.getFile() != null && !sentence.getFile().equals(currentFile)) {
                            currentFile = sentence.getFile();
                            if (writer instanceof CurrentFileObserver)
                                ((CurrentFileObserver) writer).onNextFile(currentFile);
                            for (PosTagSequenceProcessor processor : processors)
                                if (processor instanceof CurrentFileObserver)
                                    ((CurrentFileObserver) processor).onNextFile(currentFile);
                        }
                        for (PosTagSequenceProcessor processor : processors)
                            processor.onNextPosTagSequence(posTagSequence);
                    }
                } finally {
                    for (PosTagSequenceProcessor processor : processors) {
                        try {
                            processor.onCompleteAnalysis();
                            processor.close();
                        } catch (IOException e) {
                            LogUtils.logError(LOG, e);
                            ioException = e;
                        }
                    }
                }
                break;
            }
            case parser: {
                List<ParseConfigurationProcessor> processors = ParseConfigurationProcessor.getProcessors(writer, outDir, sessionId);
                try {
                    ParserAnnotatedCorpusReader corpusReader = ParserAnnotatedCorpusReader.getCorpusReader(reader,
                            config.getConfig("talismane.core." + sessionId + ".parser.input"), sessionId);
                    while (corpusReader.hasNextSentence()) {
                        ParseConfiguration configuration = corpusReader.nextConfiguration();
                        Sentence sentence = configuration.getSentence();
                        if (sentence.getFile() != null && !sentence.getFile().equals(currentFile)) {
                            currentFile = sentence.getFile();
                            if (writer instanceof CurrentFileObserver)
                                ((CurrentFileObserver) writer).onNextFile(currentFile);
                            for (ParseConfigurationProcessor processor : processors)
                                if (processor instanceof CurrentFileObserver)
                                    ((CurrentFileObserver) processor).onNextFile(currentFile);
                        }
                        for (ParseConfigurationProcessor processor : processors)
                            processor.onNextParseConfiguration(configuration);
                    }
                } finally {
                    for (ParseConfigurationProcessor processor : processors) {
                        try {
                            processor.onCompleteParse();
                            processor.close();
                        } catch (IOException e) {
                            LogUtils.logError(LOG, e);
                            ioException = e;
                        }
                    }
                }
                break;
            }
            default:
                throw new TalismaneException("Command '" + session.getCommand() + "' does not yet support module: " + session.getModule());
            }
            if (ioException != null)
                throw ioException;
            break;
        }
        }
    } finally {
        // Always log the total execution time, even on failure.
        long endTime = System.currentTimeMillis();
        long totalTime = endTime - startTime;
        LOG.debug("Total time for Talismane.process(): " + totalTime);
        if (config.getBoolean("talismane.core." + sessionId + ".output.log-execution-time")) {
            try {
                CSVFormatter CSV = new CSVFormatter();
                Writer csvFileWriter = null;
                File csvFile = new File(outDir, session.getBaseName() + ".stats.csv");
                csvFile.delete();
                csvFile.createNewFile();
                csvFileWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(csvFile, false), "UTF8"));
                csvFileWriter.write(CSV.format("total time") + CSV.format(totalTime) + "\n");
                csvFileWriter.flush();
                csvFileWriter.close();
            } catch (Exception e) {
                // Failure to write the stats file should never abort the run.
                LogUtils.logError(LOG, e);
            }
        }
    }
}
Example use of com.joliciel.talismane.rawText.Sentence in the talismane project by joliciel-informatique,
taken from the class TokenPatternTest, method testMatch4.
@Test
public void testMatch4() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();
    final String sessionId = "test";
    final Sentence sentence = new Sentence("Aix-les-Bains", sessionId);
    final TokenSequence tokens = new TokenSequence(sentence, sessionId);
    tokens.findDefaultTokens();
    // Clitic-pronoun pattern: a word followed by "-les" etc., where the [^-]
    // guard requires the pronoun NOT to be followed by another hyphen.
    final TokenPattern cliticPattern = new TokenPattern(".+-{(ce|je|la|le|les|leur|lui|moi|nous|toi|tu)[^-]}", Tokeniser.getTokenSeparators(sessionId));
    final List<TokenPatternMatchSequence> matches = cliticPattern.match(tokens);
    // "Aix-les-Bains" is a place name: "les" is followed by "-", so no match.
    assertEquals(0, matches.size());
}
Aggregations