use of com.joliciel.talismane.machineLearning.Decision in project talismane by joliciel-informatique.
the class SentenceDetectorTest method testDetectSentences.
/**
 * Runs the sentence detector over the middle portion of a text, using a stub
 * decision maker that answers IS_BOUNDARY for every candidate, and checks
 * that the two "Mr." spans annotated with a no-sentence-break marker do not
 * produce sentence breaks.
 */
@Test
public void testDetectSentences() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    // The loaded config is not referenced below; the call is retained as-is.
    final Config config = ConfigFactory.load();
    final String sessionId = "test";

    // Stub: every call to decide() yields a single IS_BOUNDARY decision with value 1.0.
    DecisionMaker alwaysBoundary = new DecisionMaker() {
        @Override
        public ScoringStrategy<ClassificationSolution> getDefaultScoringStrategy() {
            return new GeometricMeanScoringStrategy();
        }

        @Override
        public List<Decision> decide(List<FeatureResult<?>> featureResults) {
            List<Decision> singleDecision = new ArrayList<>();
            singleDecision.add(new Decision(SentenceDetectorOutcome.IS_BOUNDARY.name(), 1.0));
            return singleDecision;
        }
    };

    String[] labels = new String[0];
    Set<SentenceDetectorFeature<?>> features = new HashSet<>();
    SentenceDetector sentenceDetector = new SentenceDetector(alwaysBoundary, features, sessionId);

    // Offsets are expressed as prefix lengths of the full text.
    String text = "Before analysis. Hello Mr. Jones. How are you, Mr. Jones? After analysis.";
    int analysisStart = "Before analysis. ".length();
    int firstBreak = "Before analysis. Hello Mr. Jones.".length();
    int analysisEnd = "Before analysis. Hello Mr. Jones. How are you, Mr. Jones?".length();
    AnnotatedText annotatedText = new AnnotatedText(text, analysisStart, analysisEnd);

    // Mark both occurrences of "Mr." as spans carrying a no-sentence-break marker.
    int firstMrStart = "Before analysis. Hello ".length();
    int firstMrEnd = "Before analysis. Hello Mr.".length();
    int secondMrStart = "Before analysis. Hello Mr. Jones. How are you, ".length();
    int secondMrEnd = "Before analysis. Hello Mr. Jones. How are you, Mr.".length();
    List<Annotation<RawTextNoSentenceBreakMarker>> noBreakMarkers = new ArrayList<>();
    noBreakMarkers.add(new Annotation<>(firstMrStart, firstMrEnd, new RawTextNoSentenceBreakMarker("me"), labels));
    noBreakMarkers.add(new Annotation<>(secondMrStart, secondMrEnd, new RawTextNoSentenceBreakMarker("me"), labels));
    annotatedText.addAnnotations(noBreakMarkers);

    // The returned break positions.
    List<Integer> sentenceBreaks = sentenceDetector.detectSentences(annotatedText);
    assertEquals(2, sentenceBreaks.size());
    assertEquals(firstBreak, sentenceBreaks.get(0).intValue());
    assertEquals(analysisEnd, sentenceBreaks.get(1).intValue());

    // The sentence boundary annotations added to the annotated text.
    List<Annotation<SentenceBoundary>> sentenceBoundaries = annotatedText.getAnnotations(SentenceBoundary.class);
    assertEquals(2, sentenceBoundaries.size());

    Annotation<SentenceBoundary> firstBoundary = sentenceBoundaries.get(0);
    assertEquals(0, firstBoundary.getStart());
    assertEquals(firstBreak, firstBoundary.getEnd());

    Annotation<SentenceBoundary> secondBoundary = sentenceBoundaries.get(1);
    assertEquals(firstBreak, secondBoundary.getStart());
    assertEquals(analysisEnd, secondBoundary.getEnd());
}
use of com.joliciel.talismane.machineLearning.Decision in project talismane by joliciel-informatique.
the class SerializationTest method testSerialize.
/**
 * Builds a sentence, its token sequence, pos-tag sequence, parse
 * configuration and parse tree for "Il aime les pommes", writes all five
 * objects through Java serialization, reads them back and checks that each
 * deserialized object equals the one that was written.
 */
@Test
public void testSerialize() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    String sessionId = "test";
    Sentence sentence = new Sentence("Il aime les pommes", sessionId);
    TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
    tokenSequence.addToken("".length(), "Il".length());
    tokenSequence.addToken("Il ".length(), "Il aime".length());
    tokenSequence.addToken("Il aime ".length(), "Il aime les".length());
    tokenSequence.addToken("Il aime les ".length(), "Il aime les pommes".length());
    PosTagSequence posTagSequence = new PosTagSequence(tokenSequence);
    posTagSequence.addPosTaggedToken(new PosTaggedToken(posTagSequence.getNextToken(), new Decision("CLS", 0.90), sessionId));
    posTagSequence.addPosTaggedToken(new PosTaggedToken(posTagSequence.getNextToken(), new Decision("V", 0.70), sessionId));
    posTagSequence.addPosTaggedToken(new PosTaggedToken(posTagSequence.getNextToken(), new Decision("DET", 0.60), sessionId));
    posTagSequence.addPosTaggedToken(new PosTaggedToken(posTagSequence.getNextToken(), new Decision("NC", 0.80), sessionId));
    posTagSequence.prependRoot();
    ParseConfiguration configuration = new ParseConfiguration(posTagSequence);
    LOG.debug(configuration.toString());
    // ROOT ... il
    new ShiftTransition().apply(configuration);
    LOG.debug("Shift -> " + configuration.toString());
    // ROOT il <- aime
    new LeftArcEagerTransition("suj").apply(configuration);
    LOG.debug("Left -> " + configuration.toString());
    // ROOT -> aime
    new RightArcEagerTransition("root").apply(configuration);
    LOG.debug("Right -> " + configuration.toString());
    // ROOT aime ... les
    new ShiftTransition().apply(configuration);
    LOG.debug("Shift -> " + configuration.toString());
    // ROOT aime les <- pommes
    new LeftArcEagerTransition("det").apply(configuration);
    LOG.debug("Left -> " + configuration.toString());
    // ROOT aime -> pommes
    new RightArcEagerTransition("obj").apply(configuration);
    LOG.debug("Right -> " + configuration.toString());
    ParseTree parseTree = new ParseTree(configuration, true);
    LOG.debug(parseTree.toString());

    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    // Closing the object stream flushes any data it still buffers, so the
    // byte array taken afterwards is guaranteed to hold the complete stream.
    try (ObjectOutputStream oos = new ObjectOutputStream(bos)) {
        oos.writeObject(sentence);
        oos.writeObject(tokenSequence);
        oos.writeObject(posTagSequence);
        oos.writeObject(configuration);
        oos.writeObject(parseTree);
    }
    byte[] bytes = bos.toByteArray();

    // Objects are read back in the same order in which they were written.
    try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bytes))) {
        Sentence sentence2 = (Sentence) ois.readObject();
        TokenSequence tokenSequence2 = (TokenSequence) ois.readObject();
        PosTagSequence posTagSequence2 = (PosTagSequence) ois.readObject();
        ParseConfiguration configuration2 = (ParseConfiguration) ois.readObject();
        ParseTree parseTree2 = (ParseTree) ois.readObject();
        assertEquals(sentence, sentence2);
        assertEquals(tokenSequence, tokenSequence2);
        assertEquals(posTagSequence, posTagSequence2);
        assertEquals(configuration, configuration2);
        assertEquals(parseTree, parseTree2);
    }
}
use of com.joliciel.talismane.machineLearning.Decision in project talismane by joliciel-informatique.
the class PerceptronDetailedAnalysisWriter method onAnalyse.
/**
 * Writes a detailed trace of a single analysis event to {@code writer}: the
 * individual feature results, a table of per-outcome totals alongside the
 * values predicted by {@code decisionMaker}, and the list of outcomes with
 * the weight taken from the supplied decisions (0.0 for an outcome that has
 * no decision). The writer is flushed before returning.
 *
 * @param event
 *            the event being analysed; only its {@code toString()} is written
 * @param featureResults
 *            the feature results computed for this event
 * @param decisions
 *            the decisions taken for this event
 * @throws IOException
 *             if writing to or flushing the writer fails
 */
@Override
public void onAnalyse(Object event, List<FeatureResult<?>> featureResults, Collection<Decision> decisions) throws IOException {
    // Column layouts: names are left-justified in 30 characters, numbers are
    // right-justified in 15. The '#' flag must not be used with %s here:
    // java.util.Formatter throws FormatFlagsConversionMismatchException when
    // '#' is applied to an argument that is not a Formattable.
    final String nameColumn = "%1$-30s";
    final String numberColumn = "%1$15s";

    // Start every known outcome at zero; presumably writeFeatureResult
    // accumulates into this map - confirm against that method.
    Map<String, Double> outcomeTotals = new TreeMap<String, Double>();
    for (String outcome : modelParams.getOutcomes()) {
        outcomeTotals.put(outcome, 0.0);
    }

    writer.append("####### Event: " + event.toString() + "\n");
    writer.append("### Feature results:\n");
    for (FeatureResult<?> featureResult : featureResults) {
        if (featureResult.getOutcome() instanceof List) {
            // List-valued result: one line per weighted string outcome.
            @SuppressWarnings("unchecked") FeatureResult<List<WeightedOutcome<String>>> stringCollectionResult = (FeatureResult<List<WeightedOutcome<String>>>) featureResult;
            for (WeightedOutcome<String> stringOutcome : stringCollectionResult.getOutcome()) {
                String featureName = featureResult.getTrainingName() + "|" + featureResult.getTrainingOutcome(stringOutcome.getOutcome());
                String featureOutcome = stringOutcome.getOutcome();
                double value = stringOutcome.getWeight();
                this.writeFeatureResult(featureName, featureOutcome, value, outcomeTotals);
            }
        } else {
            // Single-valued result: the value is 1.0 unless the feature is a DoubleFeature.
            double value = 1.0;
            if (featureResult.getFeature() instanceof DoubleFeature) {
                value = (Double) featureResult.getOutcome();
            }
            this.writeFeatureResult(featureResult.getTrainingName(), featureResult.getOutcome().toString(), value, outcomeTotals);
        }
    }

    List<Integer> featureIndexList = new ArrayList<Integer>();
    List<Double> featureValueList = new ArrayList<Double>();
    modelParams.prepareData(featureResults, featureIndexList, featureValueList);
    double[] results = decisionMaker.predict(featureIndexList, featureValueList);

    writer.append("### Outcome totals:\n");
    writer.append(String.format(nameColumn, "outcome") + String.format(numberColumn, "total") + String.format(numberColumn, "normalised") + "\n");
    // NOTE(review): results is read in the iteration order of
    // modelParams.getOutcomes() - confirm predict() uses the same ordering.
    int j = 0;
    for (String outcome : modelParams.getOutcomes()) {
        double total = outcomeTotals.get(outcome);
        double normalised = results[j++];
        writer.append(String.format(nameColumn, outcome) + String.format(numberColumn, decFormat.format(total)) + String.format(numberColumn, decFormat.format(normalised)) + "\n");
    }
    writer.append("\n");

    Map<String, Double> outcomeWeights = new TreeMap<String, Double>();
    for (Decision decision : decisions) {
        outcomeWeights.put(decision.getOutcome(), decision.getProbability());
    }

    writer.append("### Outcome list:\n");
    // Iteration order below is whatever ordering WeightedOutcome defines for the TreeSet.
    Set<WeightedOutcome<String>> weightedOutcomes = new TreeSet<WeightedOutcome<String>>();
    for (String outcome : modelParams.getOutcomes()) {
        Double weightObj = outcomeWeights.get(outcome);
        double weight = (weightObj == null ? 0.0 : weightObj.doubleValue());
        WeightedOutcome<String> weightedOutcome = new WeightedOutcome<String>(outcome, weight);
        weightedOutcomes.add(weightedOutcome);
    }
    for (WeightedOutcome<String> weightedOutcome : weightedOutcomes) {
        writer.append(String.format(nameColumn, weightedOutcome.getOutcome()) + String.format(numberColumn, decFormat.format(weightedOutcome.getWeight())) + "\n");
    }
    writer.append("\n");
    writer.flush();
}
use of com.joliciel.talismane.machineLearning.Decision in project talismane by joliciel-informatique.
the class TokenComparator method compare.
/**
 * Evaluate the evaluation corpus against the reference corpus.
 * <p>
 * For each reference sentence, the matching evaluation sentence is read,
 * both are split into default (atomic) tokens, and each reference atomic
 * token is tagged JOIN or SEPARATE according to the token splits of the
 * evaluation sequence. Every observer is then notified with the reference
 * sequence and the resulting guess, and finally told that the evaluation is
 * complete.
 *
 * @throws TalismaneException
 *             if the evaluation corpus runs out of sentences before the
 *             reference corpus does, or if more than 6 atomic tokens of a
 *             sentence fail to match their evaluation counterpart
 * @throws IOException
 *             presumably propagated from the corpus readers or observers
 */
public void compare() throws TalismaneException, IOException {
    while (referenceCorpusReader.hasNextSentence()) {
        TokenSequence realSequence = referenceCorpusReader.nextTokenSequence();
        if (!evaluationCorpusReader.hasNextSentence()) {
            throw new TalismaneException("Wrong number of sentences in eval corpus: " + realSequence.getSentence().getText());
        }
        TokenSequence guessedSequence = evaluationCorpusReader.nextTokenSequence();

        Sentence sentence = realSequence.getSentence();
        // Initially, separate the sentence into tokens using the separators
        // provided
        TokenSequence realAtomicSequence = new TokenSequence(sentence, sessionId);
        realAtomicSequence.findDefaultTokens();
        TokenSequence guessedAtomicSequence = new TokenSequence(guessedSequence.getSentence(), sessionId);
        guessedAtomicSequence.findDefaultTokens();

        // Index each pattern match under the first non-whitespace token it
        // checks (or the last token checked, if all of them are whitespace).
        Map<Token, Set<TokenPatternMatchSequence>> tokenMatchSequenceMap = new HashMap<Token, Set<TokenPatternMatchSequence>>();
        for (TokenPattern parsedPattern : tokeniserPatternManager.getParsedTestPatterns()) {
            List<TokenPatternMatchSequence> matchesForThisPattern = parsedPattern.match(realAtomicSequence);
            for (TokenPatternMatchSequence matchSequence : matchesForThisPattern) {
                Token token = null;
                for (Token aToken : matchSequence.getTokensToCheck()) {
                    token = aToken;
                    if (!aToken.isWhiteSpace()) {
                        break;
                    }
                }
                Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token);
                if (matchSequences == null) {
                    matchSequences = new TreeSet<TokenPatternMatchSequence>();
                    tokenMatchSequenceMap.put(token, matchSequences);
                }
                matchSequences.add(matchSequence);
            }
        }

        TokenisedAtomicTokenSequence guess = new TokenisedAtomicTokenSequence(realSequence.getSentence(), 0, sessionId);
        int i = 0;
        int mismatches = 0;
        for (Token token : realAtomicSequence) {
            Token guessedToken = guessedAtomicSequence.get(i).getToken();
            Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token);

            if (!token.getText().equals(guessedToken.getText())) {
                // skipped stuff at start of sentence on guess, if it's been
                // through the parser
                TokeniserOutcome outcome = TokeniserOutcome.SEPARATE;
                guess.addTaggedToken(token, this.newDecision(outcome, matchSequences), outcome);
                mismatches++;
                LOG.debug("Mismatch: '" + token.getText() + "', '" + guessedToken.getText() + "'");
                if (mismatches > 6) {
                    LOG.info("Real sequence: " + realSequence.getSentence().getText());
                    LOG.info("Guessed sequence: " + guessedSequence.getSentence().getText());
                    throw new TalismaneException("Too many mismatches for sentence: " + realSequence.getSentence().getText());
                }
                // The guessed index is deliberately not advanced on a mismatch.
                continue;
            }

            // Texts match: SEPARATE if the evaluation sequence has a split
            // at this token's start index, JOIN otherwise.
            TokeniserOutcome outcome = TokeniserOutcome.JOIN;
            if (guessedSequence.getTokenSplits().contains(guessedToken.getStartIndex())) {
                outcome = TokeniserOutcome.SEPARATE;
            }
            guess.addTaggedToken(token, this.newDecision(outcome, matchSequences), outcome);
            i++;
        }

        List<TokenisedAtomicTokenSequence> guessedAtomicSequences = new ArrayList<TokenisedAtomicTokenSequence>();
        guessedAtomicSequences.add(guess);
        for (TokenEvaluationObserver observer : observers) {
            observer.onNextTokenSequence(realSequence, guessedAtomicSequences);
        }
    }
    for (TokenEvaluationObserver observer : observers) {
        observer.onEvaluationComplete();
    }
}

/**
 * Builds the decision for one atomic token: the outcome name, this class as
 * an authority, and - when pattern matches are indexed under the token -
 * the "_Patterns" authority followed by the name of each matching pattern.
 *
 * @param outcome
 *            the tokeniser outcome assigned to the token
 * @param matchSequences
 *            the pattern matches indexed under the token, or null if none
 * @return the newly created decision
 */
private Decision newDecision(TokeniserOutcome outcome, Set<TokenPatternMatchSequence> matchSequences) {
    Decision decision = new Decision(outcome.name());
    decision.addAuthority("_" + this.getClass().getSimpleName());
    if (matchSequences != null) {
        decision.addAuthority("_Patterns");
        for (TokenPatternMatchSequence matchSequence : matchSequences) {
            decision.addAuthority(matchSequence.getTokenPattern().getName());
        }
    }
    return decision;
}
use of com.joliciel.talismane.machineLearning.Decision in project talismane by joliciel-informatique.
the class SimpleTokeniser method tokeniseInternal.
/**
 * Produces a single tokenisation in which every token of the initial
 * sequence, whitespace included, is tagged with the SEPARATE outcome.
 *
 * @param initialSequence
 *            the sequence whose tokens (with whitespace) are tagged
 * @param sentence
 *            the sentence the resulting sequence is built for
 * @return a list holding exactly one tokenised atomic token sequence
 */
@Override
protected List<TokenisedAtomicTokenSequence> tokeniseInternal(TokenSequence initialSequence, Sentence sentence) {
    TokenisedAtomicTokenSequence separateAll = new TokenisedAtomicTokenSequence(sentence, 0, this.getSessionId());

    for (Token atomicToken : initialSequence.listWithWhiteSpace()) {
        // A fresh decision per token; the tag is read back from the decision's outcome name.
        Decision separateDecision = new Decision(TokeniserOutcome.SEPARATE.name());
        TokeniserOutcome tag = TokeniserOutcome.valueOf(separateDecision.getOutcome());
        separateAll.add(new TaggedToken<TokeniserOutcome>(atomicToken, separateDecision, tag));
    }

    List<TokenisedAtomicTokenSequence> result = new ArrayList<TokenisedAtomicTokenSequence>();
    result.add(separateAll);
    return result;
}
Aggregations