use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.
the class LexicalAttributeFeatureTest method testCheckInternalMultipleEntries.
/**
 * Checks the outcomes produced by a {@link LexicalAttributeFeature} asking for
 * the {@code Person} attribute of the verb "demande" in the sentence
 * "je demande".
 * <p>
 * The test expects exactly two weighted outcomes, each of which must be either
 * "1" or "3". The lexicon is presumably supplied by the
 * {@code testWithLex.conf} configuration loaded at the start of the test.
 */
@Test
public void testCheckInternalMultipleEntries() throws Exception {
  // Select the test configuration and force the config library to re-read it.
  System.setProperty("config.file", "src/test/resources/testWithLex.conf");
  ConfigFactory.invalidateCaches();
  final Config config = ConfigFactory.load();

  final String sessionId = "test";
  final String text = "je demande";

  // Build a single pos-tagged token for "demande", tagged as a verb.
  Sentence sentence = new Sentence(text, sessionId);
  TokenSequence tokens = new TokenSequence(sentence, sessionId);
  Token verbToken = new Token("demande", tokens, 1, "je ".length(), text.length(), sessionId);
  Decision verbDecision = new Decision("V", 1.0);
  final PosTaggedToken taggedVerb = new PosTaggedToken(verbToken, verbDecision, sessionId);

  // Address function that always resolves to the token built above.
  PosTaggedTokenAddressFunction<PosTaggerContext> fixedAddress = new AbstractPosTaggedTokenAddressFunction() {
    @Override
    protected FeatureResult<PosTaggedTokenWrapper> checkInternal(PosTaggerContext context, RuntimeEnvironment env) {
      return this.generateResult(taggedVerb);
    }
  };

  StringLiteralFeature<PosTaggedTokenWrapper> attributeName = new StringLiteralFeature<>(LexicalAttribute.Person.name());
  LexicalAttributeFeature<PosTaggerContext> personFeature = new LexicalAttributeFeature<>(fixedAddress, attributeName);

  PosTagSequence emptyHistory = new PosTagSequence(tokens);
  PosTaggerContext taggerContext = new PosTaggerContextImpl(verbToken, emptyHistory);
  RuntimeEnvironment environment = new RuntimeEnvironment();

  FeatureResult<List<WeightedOutcome<String>>> result = personFeature.checkInternal(taggerContext, environment);
  List<WeightedOutcome<String>> personValues = result.getOutcome();
  System.out.println(personValues);

  // Every returned value must be first or third person ...
  for (WeightedOutcome<String> weighted : personValues) {
    String value = weighted.getOutcome();
    boolean firstOrThirdPerson = "1".equals(value) || "3".equals(value);
    assertTrue(firstOrThirdPerson);
  }
  // ... and there must be exactly two of them.
  assertEquals(2, personValues.size());
}
use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.
the class TalismaneAPIExamples method example1.
/**
 * Demonstrates the full analysis chain on a hard-coded sentence: the text is
 * tokenised, the resulting tokens are pos-tagged, and the pos-tagged sequence
 * is parsed. The pos-tag sequence and the parse tree are printed to standard
 * output.
 *
 * @param sessionId
 *          the session id used to obtain the tokeniser, pos-tagger and parser
 */
public static void example1(String sessionId) throws Exception {
  final String sentenceText = "Les amoureux qui se bécotent sur les bancs publics ont des petites gueules bien sympathiques.";

  // Step 1: tokenisation
  TokenSequence tokens = Tokeniser.getInstance(sessionId).tokeniseText(sentenceText);

  // Step 2: pos-tagging
  PosTagSequence taggedTokens = PosTaggers.getPosTagger(sessionId).tagSentence(tokens);
  System.out.println(taggedTokens);

  // Step 3: parsing, then wrap the resulting configuration in a tree for display
  ParseConfiguration parse = Parsers.getParser(sessionId).parseSentence(taggedTokens);
  System.out.println(new ParseTree(parse, true));
}
use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.
the class PosTagEvaluationSentenceWriter method onNextPosTagSequence.
/**
 * Writes one block of rows describing a sentence to {@code writer}:
 * <ol>
 * <li>the analysis text of each token in the real sequence;</li>
 * <li>the pos-tag code of each token in the real sequence;</li>
 * <li>for each of the first {@code guessCount} guesses, a row of guessed
 * pos-tag codes followed by a row of the corresponding decision
 * probabilities. A guess slot with no sequence produces two empty rows.</li>
 * </ol>
 * Guessed tokens are aligned to real tokens by start and end index. A real
 * token with no identically-bounded guessed token is reported as
 * {@code BAD_TOKEN}.
 *
 * @param realSequence
 *          the reference pos-tag sequence
 * @param guessedSequences
 *          the guessed sequences, read in list order
 * @throws IOException
 *           if writing to the underlying writer fails
 */
@Override
public void onNextPosTagSequence(PosTagSequence realSequence, List<PosTagSequence> guessedSequences) throws IOException {
  for (int i = 0; i < realSequence.size(); i++) {
    String token = realSequence.get(i).getToken().getAnalyisText();
    writer.write(CSV.format(token));
  }
  writer.write("\n");

  for (int i = 0; i < realSequence.size(); i++) {
    writer.write(CSV.format(realSequence.get(i).getTag().getCode()));
  }
  writer.write("\n");

  for (int k = 0; k < guessCount; k++) {
    if (k >= guessedSequences.size()) {
      // Fewer guesses than requested: keep the row layout by emitting two empty rows.
      writer.write("\n");
      writer.write("\n");
      continue;
    }

    PosTagSequence posTagSequence = guessedSequences.get(k);
    // j walks the guessed sequence while i walks the real one.
    int j = 0;
    // Accumulated in a builder rather than by repeated String concatenation.
    StringBuilder probs = new StringBuilder();
    for (int i = 0; i < realSequence.size(); i++) {
      TaggedToken<PosTag> realToken = realSequence.get(i);
      TaggedToken<PosTag> testToken = posTagSequence.get(j);
      boolean tokenError = false;
      if (realToken.getToken().getStartIndex() == testToken.getToken().getStartIndex()
          && realToken.getToken().getEndIndex() == testToken.getToken().getEndIndex()) {
        // Same boundaries: no token error. Advance, but never past the last guessed token.
        j++;
        if (j == posTagSequence.size()) {
          j--;
        }
      } else {
        tokenError = true;
        // Skip guessed tokens until one ends after the real token (or the guesses run out).
        while (realToken.getToken().getEndIndex() >= testToken.getToken().getEndIndex()) {
          j++;
          if (j == posTagSequence.size()) {
            j--;
            break;
          }
          testToken = posTagSequence.get(j);
        }
      }

      if (tokenError) {
        writer.write(CSV.format("BAD_TOKEN"));
      } else {
        writer.write(CSV.format(testToken.getTag().getCode()));
      }
      probs.append(CSV.format(testToken.getDecision().getProbability()));
    }
    writer.write("\n");
    writer.write(probs.append("\n").toString());
  }
  writer.flush();
}
use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.
the class ParseEvaluationSentenceWriter method onParseEnd.
/**
 * Writes one block of CSV rows comparing the real parse with the guessed
 * parses.
 * <p>
 * Columns are keyed by token start index: every distinct start index among the
 * non-root tokens of the real configuration (and, when a tokeniser or
 * pos-tagger is being evaluated, of the first {@code guessCount} guessed
 * configurations) gets one column, in ascending order. A configuration with no
 * token at a given start index gets an empty cell in that column.
 * <p>
 * Rows written for the real configuration: token text, pos-tag code,
 * dependency label, governor column. Rows written for each guess slot: token
 * text (only if {@code hasTokeniser}), pos-tag code (only if
 * {@code hasPosTagger}), dependency label, governor column, transition
 * probability. A guess slot beyond the end of {@code guessedConfigurations}
 * produces two empty rows.
 *
 * @param realConfiguration
 *          the reference parse
 * @param guessedConfigurations
 *          the guessed parses, read in list order
 * @throws IOException
 *           if writing to the CSV writer fails
 */
@Override
public void onParseEnd(ParseConfiguration realConfiguration, List<ParseConfiguration> guessedConfigurations) throws IOException {
  // Sorted set of every start index which needs its own column.
  TreeSet<Integer> startIndexes = new TreeSet<Integer>();
  this.addTokenStartIndexes(realConfiguration, startIndexes);
  if (hasTokeniser || hasPosTagger) {
    // Guessed tokenisation may differ from the real one, so its start indexes need columns too.
    int i = 0;
    for (ParseConfiguration guessedConfiguration : guessedConfigurations) {
      this.addTokenStartIndexes(guessedConfiguration, startIndexes);
      i++;
      if (i == guessCount)
        break;
    }
  }

  // Map each start index to its column position.
  Map<Integer, Integer> startIndexMap = new HashMap<Integer, Integer>();
  int j = 0;
  for (int startIndex : startIndexes) {
    startIndexMap.put(startIndex, j++);
  }

  PosTaggedToken[] realTokens = this.alignTokensToColumns(realConfiguration, startIndexMap, startIndexes.size());
  this.writeTokenTextRow(realTokens);
  this.writePosTagRow(realTokens);
  this.writeDependencyLabelRow(realConfiguration, realTokens);
  this.writeGovernorRow(realConfiguration, realTokens, startIndexMap);

  for (int i = 0; i < guessCount; i++) {
    if (i < guessedConfigurations.size()) {
      ParseConfiguration guessedConfiguration = guessedConfigurations.get(i);
      PosTaggedToken[] guessedTokens = this.alignTokensToColumns(guessedConfiguration, startIndexMap, startIndexes.size());
      if (hasTokeniser) {
        this.writeTokenTextRow(guessedTokens);
      }
      if (hasPosTagger) {
        this.writePosTagRow(guessedTokens);
      }
      this.writeDependencyLabelRow(guessedConfiguration, guessedTokens);
      this.writeGovernorRow(guessedConfiguration, guessedTokens, startIndexMap);
      this.writeTransitionProbabilityRow(guessedConfiguration, guessedTokens);
    } else {
      // No guess available for this slot.
      csvFileWriter.write("\n");
      csvFileWriter.write("\n");
    }
  }
  csvFileWriter.flush();
}

/** Adds the start index of every non-root token of the configuration to the set. */
private void addTokenStartIndexes(ParseConfiguration configuration, TreeSet<Integer> startIndexes) {
  for (PosTaggedToken posTaggedToken : configuration.getPosTagSequence()) {
    if (!posTaggedToken.getTag().equals(PosTag.ROOT_POS_TAG)) {
      startIndexes.add(posTaggedToken.getToken().getStartIndex());
    }
  }
}

/** Places each non-root token in the column assigned to its start index; unused columns stay null. */
private PosTaggedToken[] alignTokensToColumns(ParseConfiguration configuration, Map<Integer, Integer> startIndexMap, int columnCount) {
  PosTaggedToken[] tokens = new PosTaggedToken[columnCount];
  for (PosTaggedToken posTaggedToken : configuration.getPosTagSequence()) {
    if (!posTaggedToken.getTag().equals(PosTag.ROOT_POS_TAG)) {
      tokens[startIndexMap.get(posTaggedToken.getToken().getStartIndex())] = posTaggedToken;
    }
  }
  return tokens;
}

/** Writes a row containing the original text of each token, with an empty cell for each null entry. */
private void writeTokenTextRow(PosTaggedToken[] tokens) throws IOException {
  for (PosTaggedToken posTaggedToken : tokens) {
    if (posTaggedToken != null) {
      csvFileWriter.write(CSV.format(posTaggedToken.getToken().getOriginalText()));
    } else {
      csvFileWriter.write(CSV.getCsvSeparator());
    }
  }
  csvFileWriter.write("\n");
}

/** Writes a row containing the pos-tag code of each token, with an empty cell for each null entry. */
private void writePosTagRow(PosTaggedToken[] tokens) throws IOException {
  for (PosTaggedToken posTaggedToken : tokens) {
    if (posTaggedToken != null) {
      csvFileWriter.write(CSV.format(posTaggedToken.getTag().getCode()));
    } else {
      csvFileWriter.write(CSV.getCsvSeparator());
    }
  }
  csvFileWriter.write("\n");
}

/**
 * Writes a row containing the label of each token's governing dependency. A
 * token with no governing dependency gets an empty label; an arc whose label
 * is null is written as the text "null".
 */
private void writeDependencyLabelRow(ParseConfiguration configuration, PosTaggedToken[] tokens) throws IOException {
  for (PosTaggedToken posTaggedToken : tokens) {
    if (posTaggedToken != null) {
      DependencyArc arc = configuration.getGoverningDependency(posTaggedToken);
      String label = "";
      // The arc may be null (the governor row already allows for this), so guard before reading its label.
      if (arc != null) {
        label = arc.getLabel() == null ? "null" : arc.getLabel();
      }
      csvFileWriter.write(CSV.format(label));
    } else {
      csvFileWriter.write(CSV.getCsvSeparator());
    }
  }
  csvFileWriter.write("\n");
}

/**
 * Writes a row containing, for each token, the column label of its governor,
 * or "ROOT" when the token has no governing dependency or is governed by the
 * root token.
 */
private void writeGovernorRow(ParseConfiguration configuration, PosTaggedToken[] tokens, Map<Integer, Integer> startIndexMap) throws IOException {
  for (PosTaggedToken posTaggedToken : tokens) {
    if (posTaggedToken != null) {
      DependencyArc arc = configuration.getGoverningDependency(posTaggedToken);
      int startIndex = -1;
      if (arc != null) {
        PosTaggedToken head = arc.getHead();
        if (!head.getTag().equals(PosTag.ROOT_POS_TAG)) {
          startIndex = head.getToken().getStartIndex();
        }
      }
      if (startIndex < 0)
        csvFileWriter.write(CSV.format("ROOT"));
      else
        csvFileWriter.write(CSV.getColumnLabel(startIndexMap.get(startIndex)) + CSV.getCsvSeparator());
    } else {
      csvFileWriter.write(CSV.getCsvSeparator());
    }
  }
  csvFileWriter.write("\n");
}

/**
 * Writes a row containing, for each token, the probability of the transition
 * which created its governing dependency. The value is 1.0 when there is no
 * governing dependency or no transition recorded for it.
 */
private void writeTransitionProbabilityRow(ParseConfiguration configuration, PosTaggedToken[] tokens) throws IOException {
  for (PosTaggedToken posTaggedToken : tokens) {
    if (posTaggedToken != null) {
      DependencyArc arc = configuration.getGoverningDependency(posTaggedToken);
      double prob = 1.0;
      if (arc != null) {
        Transition transition = configuration.getTransition(arc);
        if (transition != null)
          prob = transition.getDecision().getProbability();
      }
      csvFileWriter.write(CSV.format(prob));
    } else {
      csvFileWriter.write(CSV.getCsvSeparator());
    }
  }
  csvFileWriter.write("\n");
}
use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.
the class TalismaneMain method execute.
/**
 * Execute Talismane based on the configuration provided.
 * <p>
 * The command to run (analyse, train, evaluate, compare or process) is read
 * from the session, and the module it applies to is read from the session or,
 * for analyse, from the configuration. Whatever the outcome, the total
 * execution time is logged, and is additionally written to
 * {@code <baseName>.stats.csv} in {@code outDir} when the
 * {@code output.log-execution-time} setting is true.
 *
 * @param sessionId
 *          The current session's id
 * @param inFile
 *          The file or directory to analyse
 * @param outFile
 *          The file or directory to write the analysis.
 * @param outDir
 *          The directory for writing additional output files (other than the
 *          main analysis).
 * @param evalFile
 *          For compare: the file read as the second input, alongside inFile
 * @param keepDirectoryStructure
 *          For analyse and process: if true, and inFile is a directory,
 *          outFile will be interpreted as a directory and the inFile
 *          directory structure will be maintained
 * @throws IOException
 *           if reading or writing fails, including a failure while closing a
 *           processor during the process command
 * @throws ReflectiveOperationException
 *           propagated from the underlying components
 * @throws TalismaneException
 *           if attempt is made to start and end on two unsupported modules,
 *           or if the command does not support the requested module.
 * @throws SentenceAnnotatorLoadException
 *           propagated from the underlying components
 */
public void execute(String sessionId, File inFile, File outFile, File outDir, File evalFile, boolean keepDirectoryStructure) throws IOException, ReflectiveOperationException, TalismaneException, SentenceAnnotatorLoadException {
  long startTime = System.currentTimeMillis();
  TalismaneSession session = TalismaneSession.get(sessionId);
  session.setFileForBasename(inFile);
  Config config = ConfigFactory.load();
  try {
    switch (session.getCommand()) {
    case analyse: {
      Module startModule = Module.valueOf(config.getString("talismane.core." + sessionId + ".analysis.start-module"));
      Module endModule = Module.valueOf(config.getString("talismane.core." + sessionId + ".analysis.end-module"));
      Reader reader = getReader(inFile, true, sessionId);
      Writer writer = getWriter(outFile, inFile, keepDirectoryStructure, reader, sessionId);
      if (startModule == Module.languageDetector) {
        if (endModule != Module.languageDetector)
          throw new TalismaneException("Talismane does not currently support analysis starting with " + startModule.name() + " and ending with another module.");
        LanguageDetector languageDetector = LanguageDetector.getInstance(sessionId);
        LanguageDetectorProcessor processor = LanguageDetectorProcessor.getProcessor(writer, sessionId);
        SentenceDetectorAnnotatedCorpusReader corpusReader = SentenceDetectorAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".language-detector.input"), sessionId);
        while (corpusReader.hasNextSentence()) {
          String sentence = corpusReader.nextSentence().getText().toString();
          List<WeightedOutcome<Locale>> results = languageDetector.detectLanguages(sentence);
          processor.onNextText(sentence, results);
        }
      } else {
        Mode mode = Mode.valueOf(config.getString("talismane.core." + sessionId + ".mode"));
        switch (mode) {
        case normal:
          Talismane talismane = new Talismane(writer, outDir, sessionId);
          talismane.analyse(reader);
          break;
        case server:
          TalismaneServer talismaneServer = new TalismaneServer(sessionId);
          talismaneServer.analyse();
          break;
        }
      }
      break;
    }
    case train: {
      Reader reader = getReader(inFile, false, sessionId);
      switch (session.getModule()) {
      case languageDetector: {
        LanguageDetectorTrainer trainer = new LanguageDetectorTrainer(sessionId);
        trainer.train();
        break;
      }
      case sentenceDetector: {
        SentenceDetectorTrainer trainer = new SentenceDetectorTrainer(reader, sessionId);
        trainer.train();
        break;
      }
      case tokeniser: {
        PatternTokeniserTrainer trainer = new PatternTokeniserTrainer(reader, sessionId);
        trainer.train();
        break;
      }
      case posTagger: {
        PosTaggerTrainer trainer = new PosTaggerTrainer(reader, sessionId);
        trainer.train();
        break;
      }
      case parser: {
        ParserTrainer trainer = new ParserTrainer(reader, sessionId);
        trainer.train();
        break;
      }
      }
      break;
    }
    case evaluate: {
      Reader reader = getReader(inFile, false, sessionId);
      switch (session.getModule()) {
      case sentenceDetector: {
        SentenceDetectorEvaluator evaluator = new SentenceDetectorEvaluator(reader, outDir, sessionId);
        evaluator.evaluate();
        break;
      }
      case tokeniser: {
        TokeniserEvaluator evaluator = new TokeniserEvaluator(reader, outDir, sessionId);
        evaluator.evaluate();
        break;
      }
      case posTagger: {
        PosTaggerEvaluator evaluator = new PosTaggerEvaluator(reader, outDir, sessionId);
        evaluator.evaluate();
        break;
      }
      case parser: {
        ParserEvaluator evaluator = new ParserEvaluator(reader, outDir, sessionId);
        evaluator.evaluate();
        break;
      }
      default:
        throw new TalismaneException("Command '" + session.getCommand() + "' does not yet support module: " + session.getModule());
      }
      break;
    }
    case compare: {
      Reader reader = getReader(inFile, false, sessionId);
      Reader evalReader = getReader(evalFile, false, sessionId);
      switch (session.getModule()) {
      case tokeniser: {
        TokenComparator comparator = new TokenComparator(reader, evalReader, outDir, sessionId);
        comparator.compare();
        break;
      }
      case posTagger: {
        PosTagComparator comparator = new PosTagComparator(reader, evalReader, outDir, sessionId);
        comparator.evaluate();
        break;
      }
      case parser: {
        ParseComparator comparator = new ParseComparator(reader, evalReader, outDir, sessionId);
        comparator.evaluate();
        break;
      }
      default:
        throw new TalismaneException("Command '" + session.getCommand() + "' does not yet support module: " + session.getModule());
      }
      break;
    }
    case process: {
      Reader reader = getReader(inFile, false, sessionId);
      Writer writer = getWriter(outFile, inFile, keepDirectoryStructure, reader, sessionId);
      // Track the file currently being read so that observers are told when it changes.
      File currentFile = null;
      URI currentURI = null;
      // Remembers the last failure while closing processors, so that it can be rethrown once all have been attempted.
      IOException ioException = null;
      switch (session.getModule()) {
      case sentenceDetector: {
        List<SentenceProcessor> processors = SentenceProcessor.getProcessors(writer, outDir, sessionId);
        try {
          SentenceDetectorAnnotatedCorpusReader corpusReader = SentenceDetectorAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".sentence-detector.input"), sessionId);
          while (corpusReader.hasNextSentence()) {
            Sentence sentence = corpusReader.nextSentence();
            if (sentence.getFileURI() != null && !sentence.getFileURI().equals(currentURI)) {
              currentURI = sentence.getFileURI();
              currentFile = sentence.getFile();
              if (writer instanceof CurrentFileObserver)
                ((CurrentFileObserver) writer).onNextFile(currentFile);
              for (SentenceProcessor processor : processors)
                if (processor instanceof CurrentFileObserver)
                  ((CurrentFileObserver) processor).onNextFile(currentFile);
            }
            for (SentenceProcessor processor : processors)
              processor.onNextSentence(sentence);
          }
        } finally {
          for (SentenceProcessor processor : processors) {
            try {
              processor.close();
            } catch (IOException e) {
              LogUtils.logError(LOG, e);
              ioException = e;
            }
          }
        }
        break;
      }
      case tokeniser: {
        List<TokenSequenceProcessor> processors = TokenSequenceProcessor.getProcessors(writer, outDir, sessionId);
        try {
          TokeniserAnnotatedCorpusReader corpusReader = TokeniserAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".tokeniser.input"), sessionId);
          while (corpusReader.hasNextSentence()) {
            TokenSequence tokenSequence = corpusReader.nextTokenSequence();
            Sentence sentence = tokenSequence.getSentence();
            // Compare URI with URI: comparing the URI to a File is never equal,
            // which would notify observers of a "new" file on every single sentence.
            if (sentence.getFileURI() != null && !sentence.getFileURI().equals(currentURI)) {
              currentURI = sentence.getFileURI();
              currentFile = sentence.getFile();
              if (writer instanceof CurrentFileObserver)
                ((CurrentFileObserver) writer).onNextFile(currentFile);
              for (TokenSequenceProcessor processor : processors)
                if (processor instanceof CurrentFileObserver)
                  ((CurrentFileObserver) processor).onNextFile(currentFile);
            }
            for (TokenSequenceProcessor processor : processors)
              processor.onNextTokenSequence(tokenSequence);
          }
        } finally {
          for (TokenSequenceProcessor processor : processors) {
            try {
              processor.close();
            } catch (IOException e) {
              LogUtils.logError(LOG, e);
              ioException = e;
            }
          }
        }
        break;
      }
      case posTagger: {
        List<PosTagSequenceProcessor> processors = PosTagSequenceProcessor.getProcessors(writer, outDir, sessionId);
        try {
          PosTagAnnotatedCorpusReader corpusReader = PosTagAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".pos-tagger.input"), sessionId);
          while (corpusReader.hasNextSentence()) {
            PosTagSequence posTagSequence = corpusReader.nextPosTagSequence();
            Sentence sentence = posTagSequence.getTokenSequence().getSentence();
            if (sentence.getFile() != null && !sentence.getFile().equals(currentFile)) {
              currentFile = sentence.getFile();
              if (writer instanceof CurrentFileObserver)
                ((CurrentFileObserver) writer).onNextFile(currentFile);
              for (PosTagSequenceProcessor processor : processors)
                if (processor instanceof CurrentFileObserver)
                  ((CurrentFileObserver) processor).onNextFile(currentFile);
            }
            for (PosTagSequenceProcessor processor : processors)
              processor.onNextPosTagSequence(posTagSequence);
          }
        } finally {
          for (PosTagSequenceProcessor processor : processors) {
            try {
              processor.onCompleteAnalysis();
              processor.close();
            } catch (IOException e) {
              LogUtils.logError(LOG, e);
              ioException = e;
            }
          }
        }
        break;
      }
      case parser: {
        List<ParseConfigurationProcessor> processors = ParseConfigurationProcessor.getProcessors(writer, outDir, sessionId);
        try {
          ParserAnnotatedCorpusReader corpusReader = ParserAnnotatedCorpusReader.getCorpusReader(reader, config.getConfig("talismane.core." + sessionId + ".parser.input"), sessionId);
          while (corpusReader.hasNextSentence()) {
            ParseConfiguration configuration = corpusReader.nextConfiguration();
            Sentence sentence = configuration.getSentence();
            if (sentence.getFile() != null && !sentence.getFile().equals(currentFile)) {
              currentFile = sentence.getFile();
              if (writer instanceof CurrentFileObserver)
                ((CurrentFileObserver) writer).onNextFile(currentFile);
              for (ParseConfigurationProcessor processor : processors)
                if (processor instanceof CurrentFileObserver)
                  ((CurrentFileObserver) processor).onNextFile(currentFile);
            }
            for (ParseConfigurationProcessor processor : processors)
              processor.onNextParseConfiguration(configuration);
          }
        } finally {
          for (ParseConfigurationProcessor processor : processors) {
            try {
              processor.onCompleteParse();
              processor.close();
            } catch (IOException e) {
              LogUtils.logError(LOG, e);
              ioException = e;
            }
          }
        }
        break;
      }
      default:
        throw new TalismaneException("Command '" + session.getCommand() + "' does not yet support module: " + session.getModule());
      }
      if (ioException != null)
        throw ioException;
      break;
    }
    }
  } finally {
    long endTime = System.currentTimeMillis();
    long totalTime = endTime - startTime;
    LOG.debug("Total time for Talismane.process(): " + totalTime);
    if (config.getBoolean("talismane.core." + sessionId + ".output.log-execution-time")) {
      // Best effort only: a failure to write the stats file is logged, never propagated.
      try {
        CSVFormatter CSV = new CSVFormatter();
        File csvFile = new File(outDir, session.getBaseName() + ".stats.csv");
        csvFile.delete();
        csvFile.createNewFile();
        // try-with-resources so the file handle is released even if the write fails.
        try (Writer csvFileWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(csvFile, false), "UTF8"))) {
          csvFileWriter.write(CSV.format("total time") + CSV.format(totalTime) + "\n");
          csvFileWriter.flush();
        }
      } catch (Exception e) {
        LogUtils.logError(LOG, e);
      }
    }
  }
}
Aggregations