Search in sources :

Example 26 with Decision

use of com.joliciel.talismane.machineLearning.Decision in project talismane by joliciel-informatique.

the class SentenceDetectorTest method testDetectSentences.

/**
 * Runs the sentence detector over an analysis window in the middle of a
 * longer text, using a stub decision maker that always answers
 * IS_BOUNDARY, and with "no sentence break" markers placed over both
 * occurrences of "Mr.". Expects exactly two sentence breaks, and two
 * matching SentenceBoundary annotations.
 */
@Test
public void testDetectSentences() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();
    final String sessionId = "test";

    // Stub: every candidate is reported as a boundary with probability 1.0.
    DecisionMaker alwaysBoundary = new DecisionMaker() {

        @Override
        public List<Decision> decide(List<FeatureResult<?>> featureResults) {
            List<Decision> single = new ArrayList<>();
            single.add(new Decision(SentenceDetectorOutcome.IS_BOUNDARY.name(), 1.0));
            return single;
        }

        @Override
        public ScoringStrategy<ClassificationSolution> getDefaultScoringStrategy() {
            return new GeometricMeanScoringStrategy();
        }
    };

    // Character offsets into the text, expressed as prefix lengths.
    final int analysisStart = "Before analysis. ".length();
    final int firstMrStart = "Before analysis. Hello ".length();
    final int firstMrEnd = "Before analysis. Hello Mr.".length();
    final int firstSentenceEnd = "Before analysis. Hello Mr. Jones.".length();
    final int secondMrStart = "Before analysis. Hello Mr. Jones. How are you, ".length();
    final int secondMrEnd = "Before analysis. Hello Mr. Jones. How are you, Mr.".length();
    final int analysisEnd = "Before analysis. Hello Mr. Jones. How are you, Mr. Jones?".length();

    String text = "Before analysis. Hello Mr. Jones. How are you, Mr. Jones? After analysis.";
    AnnotatedText annotatedText = new AnnotatedText(text, analysisStart, analysisEnd);

    // Mark both "Mr." spans as places where no sentence break is allowed.
    String[] labels = new String[0];
    List<Annotation<RawTextNoSentenceBreakMarker>> noBreakMarkers = new ArrayList<>();
    noBreakMarkers.add(new Annotation<>(firstMrStart, firstMrEnd, new RawTextNoSentenceBreakMarker("me"), labels));
    noBreakMarkers.add(new Annotation<>(secondMrStart, secondMrEnd, new RawTextNoSentenceBreakMarker("me"), labels));
    annotatedText.addAnnotations(noBreakMarkers);

    Set<SentenceDetectorFeature<?>> features = new HashSet<>();
    SentenceDetector sentenceDetector = new SentenceDetector(alwaysBoundary, features, sessionId);
    List<Integer> sentenceBreaks = sentenceDetector.detectSentences(annotatedText);

    // The returned break positions.
    assertEquals(2, sentenceBreaks.size());
    assertEquals(firstSentenceEnd, sentenceBreaks.get(0).intValue());
    assertEquals(analysisEnd, sentenceBreaks.get(1).intValue());

    // The boundary annotations added to the annotated text.
    List<Annotation<SentenceBoundary>> sentenceBoundaries = annotatedText.getAnnotations(SentenceBoundary.class);
    assertEquals(2, sentenceBoundaries.size());

    Annotation<SentenceBoundary> firstBoundary = sentenceBoundaries.get(0);
    assertEquals(0, firstBoundary.getStart());
    assertEquals(firstSentenceEnd, firstBoundary.getEnd());

    Annotation<SentenceBoundary> secondBoundary = sentenceBoundaries.get(1);
    assertEquals(firstSentenceEnd, secondBoundary.getStart());
    assertEquals(analysisEnd, secondBoundary.getEnd());
}
Also used : SentenceDetectorFeature(com.joliciel.talismane.sentenceDetector.features.SentenceDetectorFeature) AnnotatedText(com.joliciel.talismane.AnnotatedText) Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) DecisionMaker(com.joliciel.talismane.machineLearning.DecisionMaker) ClassificationSolution(com.joliciel.talismane.machineLearning.ClassificationSolution) GeometricMeanScoringStrategy(com.joliciel.talismane.machineLearning.GeometricMeanScoringStrategy) Decision(com.joliciel.talismane.machineLearning.Decision) Annotation(com.joliciel.talismane.Annotation) RawTextNoSentenceBreakMarker(com.joliciel.talismane.rawText.RawTextMarker.RawTextNoSentenceBreakMarker) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 27 with Decision

use of com.joliciel.talismane.machineLearning.Decision in project talismane by joliciel-informatique.

the class SerializationTest method testSerialize.

/**
 * Builds a sentence, its token sequence, pos-tag sequence, parse
 * configuration and parse tree for "Il aime les pommes", writes all five
 * objects through Java serialization into an in-memory buffer, reads them
 * back, and checks that each deserialized object equals the original.
 */
@Test
public void testSerialize() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    String sessionId = "test";
    Sentence sentence = new Sentence("Il aime les pommes", sessionId);
    TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
    tokenSequence.addToken("".length(), "Il".length());
    tokenSequence.addToken("Il ".length(), "Il aime".length());
    tokenSequence.addToken("Il aime ".length(), "Il aime les".length());
    tokenSequence.addToken("Il aime les ".length(), "Il aime les pommes".length());
    PosTagSequence posTagSequence = new PosTagSequence(tokenSequence);
    posTagSequence.addPosTaggedToken(new PosTaggedToken(posTagSequence.getNextToken(), new Decision("CLS", 0.90), sessionId));
    posTagSequence.addPosTaggedToken(new PosTaggedToken(posTagSequence.getNextToken(), new Decision("V", 0.70), sessionId));
    posTagSequence.addPosTaggedToken(new PosTaggedToken(posTagSequence.getNextToken(), new Decision("DET", 0.60), sessionId));
    posTagSequence.addPosTaggedToken(new PosTaggedToken(posTagSequence.getNextToken(), new Decision("NC", 0.80), sessionId));
    posTagSequence.prependRoot();
    ParseConfiguration configuration = new ParseConfiguration(posTagSequence);
    LOG.debug(configuration.toString());
    // ROOT ... il
    new ShiftTransition().apply(configuration);
    LOG.debug("Shift -> " + configuration.toString());
    // ROOT il <- aime
    new LeftArcEagerTransition("suj").apply(configuration);
    LOG.debug("Left -> " + configuration.toString());
    // ROOT -> aime
    new RightArcEagerTransition("root").apply(configuration);
    LOG.debug("Right -> " + configuration.toString());
    // ROOT aime ... les
    new ShiftTransition().apply(configuration);
    LOG.debug("Shift -> " + configuration.toString());
    // ROOT aime les <- pommes
    new LeftArcEagerTransition("det").apply(configuration);
    LOG.debug("Left -> " + configuration.toString());
    // ROOT aime -> pommes
    new RightArcEagerTransition("obj").apply(configuration);
    LOG.debug("Right -> " + configuration.toString());
    ParseTree parseTree = new ParseTree(configuration, true);
    LOG.debug(parseTree.toString());

    // Serialize. The object stream is closed (and therefore flushed) before
    // the byte array is taken, so the buffer is guaranteed to be complete.
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    try (ObjectOutputStream oos = new ObjectOutputStream(bos)) {
        oos.writeObject(sentence);
        oos.writeObject(tokenSequence);
        oos.writeObject(posTagSequence);
        oos.writeObject(configuration);
        oos.writeObject(parseTree);
    }
    byte[] bytes = bos.toByteArray();

    // Deserialize in the same order the objects were written, and compare.
    try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bytes))) {
        Sentence sentence2 = (Sentence) ois.readObject();
        TokenSequence tokenSequence2 = (TokenSequence) ois.readObject();
        PosTagSequence posTagSequence2 = (PosTagSequence) ois.readObject();
        ParseConfiguration configuration2 = (ParseConfiguration) ois.readObject();
        ParseTree parseTree2 = (ParseTree) ois.readObject();
        assertEquals(sentence, sentence2);
        assertEquals(tokenSequence, tokenSequence2);
        assertEquals(posTagSequence, posTagSequence2);
        assertEquals(configuration, configuration2);
        assertEquals(parseTree, parseTree2);
    }
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) ByteArrayOutputStream(java.io.ByteArrayOutputStream) ObjectOutputStream(java.io.ObjectOutputStream) Decision(com.joliciel.talismane.machineLearning.Decision) ByteArrayInputStream(java.io.ByteArrayInputStream) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) Sentence(com.joliciel.talismane.rawText.Sentence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) ObjectInputStream(java.io.ObjectInputStream) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 28 with Decision

use of com.joliciel.talismane.machineLearning.Decision in project talismane by joliciel-informatique.

the class PerceptronDetailedAnalysisWriter method onAnalyse.

/**
 * Appends a detailed trace of one analysis event to {@code writer} and
 * flushes it. The trace has four parts:
 * <ol>
 * <li>a header line containing {@code event.toString()};</li>
 * <li>one call to {@code writeFeatureResult} per feature result (one per
 * weighted outcome when the feature's outcome is a list);</li>
 * <li>a table with, for each model outcome, the value held in the totals
 * map and the corresponding entry of {@code decisionMaker.predict(...)};</li>
 * <li>the model outcomes with the probability found in {@code decisions}
 * (0.0 when an outcome has no decision).</li>
 * </ol>
 *
 * @param event
 *            the event being analysed; only its {@code toString()} is used
 * @param featureResults
 *            the feature results computed for this event
 * @param decisions
 *            the decisions produced for this event
 * @throws IOException
 *             if writing to the underlying writer fails
 */
@Override
public void onAnalyse(Object event, List<FeatureResult<?>> featureResults, Collection<Decision> decisions) throws IOException {
    // Start every known outcome at zero; the map is handed to
    // writeFeatureResult below (presumably accumulated there - confirm in
    // that method).
    Map<String, Double> outcomeTotals = new TreeMap<String, Double>();
    for (String outcome : modelParams.getOutcomes()) {
        outcomeTotals.put(outcome, 0.0);
    }
    writer.append("####### Event: " + event.toString() + "\n");
    writer.append("### Feature results:\n");
    for (FeatureResult<?> featureResult : featureResults) {
        if (featureResult.getOutcome() instanceof List) {
            // List-valued feature: one line per weighted string outcome.
            @SuppressWarnings("unchecked") FeatureResult<List<WeightedOutcome<String>>> stringCollectionResult = (FeatureResult<List<WeightedOutcome<String>>>) featureResult;
            for (WeightedOutcome<String> stringOutcome : stringCollectionResult.getOutcome()) {
                String featureName = featureResult.getTrainingName() + "|" + featureResult.getTrainingOutcome(stringOutcome.getOutcome());
                String featureOutcome = stringOutcome.getOutcome();
                double value = stringOutcome.getWeight();
                this.writeFeatureResult(featureName, featureOutcome, value, outcomeTotals);
            }
        } else {
            // Single-valued feature: value is 1.0 unless the feature is a
            // DoubleFeature, in which case the outcome itself is the value.
            double value = 1.0;
            if (featureResult.getFeature() instanceof DoubleFeature) {
                value = (Double) featureResult.getOutcome();
            }
            this.writeFeatureResult(featureResult.getTrainingName(), featureResult.getOutcome().toString(), value, outcomeTotals);
        }
    }
    List<Integer> featureIndexList = new ArrayList<Integer>();
    List<Double> featureValueList = new ArrayList<Double>();
    modelParams.prepareData(featureResults, featureIndexList, featureValueList);
    double[] results = decisionMaker.predict(featureIndexList, featureValueList);
    writer.append("### Outcome totals:\n");
    // Note: the numeric columns use "%1$15s" (right-justified, width 15).
    // The '#' flag must not be used with %s on plain strings:
    // java.util.Formatter throws FormatFlagsConversionMismatchException
    // when the argument is not Formattable.
    writer.append(String.format("%1$-30s", "outcome") + String.format("%1$15s", "total") + String.format("%1$15s", "normalised") + "\n");
    // results is read positionally, in the iteration order of
    // modelParams.getOutcomes().
    int j = 0;
    for (String outcome : modelParams.getOutcomes()) {
        double total = outcomeTotals.get(outcome);
        double normalised = results[j++];
        writer.append(String.format("%1$-30s", outcome) + String.format("%1$15s", decFormat.format(total)) + String.format("%1$15s", decFormat.format(normalised)) + "\n");
    }
    writer.append("\n");
    Map<String, Double> outcomeWeights = new TreeMap<String, Double>();
    for (Decision decision : decisions) {
        outcomeWeights.put(decision.getOutcome(), decision.getProbability());
    }
    writer.append("### Outcome list:\n");
    // The TreeSet orders (and de-duplicates) entries according to
    // WeightedOutcome's own comparison.
    Set<WeightedOutcome<String>> weightedOutcomes = new TreeSet<WeightedOutcome<String>>();
    for (String outcome : modelParams.getOutcomes()) {
        Double weightObj = outcomeWeights.get(outcome);
        double weight = (weightObj == null ? 0.0 : weightObj.doubleValue());
        WeightedOutcome<String> weightedOutcome = new WeightedOutcome<String>(outcome, weight);
        weightedOutcomes.add(weightedOutcome);
    }
    for (WeightedOutcome<String> weightedOutcome : weightedOutcomes) {
        writer.append(String.format("%1$-30s", weightedOutcome.getOutcome()) + String.format("%1$15s", decFormat.format(weightedOutcome.getWeight())) + "\n");
    }
    writer.append("\n");
    writer.flush();
}
Also used : ArrayList(java.util.ArrayList) WeightedOutcome(com.joliciel.talismane.utils.WeightedOutcome) TreeMap(java.util.TreeMap) DoubleFeature(com.joliciel.talismane.machineLearning.features.DoubleFeature) Decision(com.joliciel.talismane.machineLearning.Decision) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) List(java.util.List) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult)

Example 29 with Decision

use of com.joliciel.talismane.machineLearning.Decision in project talismane by joliciel-informatique.

the class TokenComparator method compare.

/**
 * Evaluate the evaluation corpus against the reference corpus.
 * <p>
 * For each sentence of the reference corpus, the next sentence of the
 * evaluation corpus is read, both sentences are split into default
 * (atomic) tokens, and each reference atomic token is tagged with a
 * {@link TokeniserOutcome} derived from the evaluation corpus. The
 * resulting sequence is passed to every observer, and all observers are
 * notified once the reference corpus is exhausted.
 *
 * @throws TalismaneException
 *             if the evaluation corpus has fewer sentences than the
 *             reference corpus, or if a sentence has more than 6 token
 *             mismatches
 * @throws IOException
 */
public void compare() throws TalismaneException, IOException {
    while (referenceCorpusReader.hasNextSentence()) {
        TokenSequence realSequence = referenceCorpusReader.nextTokenSequence();
        if (!evaluationCorpusReader.hasNextSentence()) {
            throw new TalismaneException("Wrong number of sentences in eval corpus: " + realSequence.getSentence().getText());
        }
        TokenSequence guessedSequence = evaluationCorpusReader.nextTokenSequence();
        Sentence sentence = realSequence.getSentence();
        // Initially, separate the sentence into tokens using the separators
        // provided
        TokenSequence realAtomicSequence = new TokenSequence(sentence, sessionId);
        realAtomicSequence.findDefaultTokens();
        TokenSequence guessedAtomicSequence = new TokenSequence(guessedSequence.getSentence(), sessionId);
        guessedAtomicSequence.findDefaultTokens();

        // Index the pattern matches found in the reference sentence by a
        // representative token, so they can be recorded as authorities.
        Map<Token, Set<TokenPatternMatchSequence>> tokenMatchSequenceMap = new HashMap<Token, Set<TokenPatternMatchSequence>>();
        for (TokenPattern parsedPattern : tokeniserPatternManager.getParsedTestPatterns()) {
            List<TokenPatternMatchSequence> matchesForThisPattern = parsedPattern.match(realAtomicSequence);
            for (TokenPatternMatchSequence matchSequence : matchesForThisPattern) {
                // Key: the first non-whitespace token to check (the last
                // token if all are whitespace, null if there are none).
                Token token = null;
                for (Token aToken : matchSequence.getTokensToCheck()) {
                    token = aToken;
                    if (!aToken.isWhiteSpace()) {
                        break;
                    }
                }
                Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token);
                if (matchSequences == null) {
                    matchSequences = new TreeSet<TokenPatternMatchSequence>();
                    tokenMatchSequenceMap.put(token, matchSequences);
                }
                matchSequences.add(matchSequence);
            }
        }

        TokenisedAtomicTokenSequence guess = new TokenisedAtomicTokenSequence(realSequence.getSentence(), 0, sessionId);
        // i indexes the guessed atomic sequence; it only advances when the
        // current reference token matches the guessed token at i.
        int i = 0;
        int mismatches = 0;
        for (Token token : realAtomicSequence) {
            boolean mismatch = !token.getText().equals(guessedAtomicSequence.get(i).getToken().getText());
            TokeniserOutcome outcome;
            if (mismatch) {
                // skipped stuff at start of sentence on guess, if it's been
                // through the parser
                outcome = TokeniserOutcome.SEPARATE;
            } else if (guessedSequence.getTokenSplits().contains(guessedAtomicSequence.get(i).getToken().getStartIndex())) {
                outcome = TokeniserOutcome.SEPARATE;
            } else {
                outcome = TokeniserOutcome.JOIN;
            }
            guess.addTaggedToken(token, this.buildDecision(outcome, tokenMatchSequenceMap.get(token)), outcome);

            if (mismatch) {
                mismatches++;
                LOG.debug("Mismatch: '" + token.getText() + "', '" + guessedAtomicSequence.get(i).getToken().getText() + "'");
                if (mismatches > 6) {
                    LOG.info("Real sequence: " + realSequence.getSentence().getText());
                    LOG.info("Guessed sequence: " + guessedSequence.getSentence().getText());
                    throw new TalismaneException("Too many mismatches for sentence: " + realSequence.getSentence().getText());
                }
            } else {
                i++;
            }
        }

        List<TokenisedAtomicTokenSequence> guessedAtomicSequences = new ArrayList<TokenisedAtomicTokenSequence>();
        guessedAtomicSequences.add(guess);
        for (TokenEvaluationObserver observer : observers) {
            observer.onNextTokenSequence(realSequence, guessedAtomicSequences);
        }
    }
    for (TokenEvaluationObserver observer : observers) {
        observer.onEvaluationComplete();
    }
}

/**
 * Builds the decision for one atomic token: the outcome name, with this
 * class and, when any exist, the names of the matching token patterns
 * recorded as authorities.
 *
 * @param outcome
 *            the outcome assigned to the token
 * @param matchSequences
 *            the pattern matches indexed under the token, or null if none
 */
private Decision buildDecision(TokeniserOutcome outcome, Set<TokenPatternMatchSequence> matchSequences) {
    Decision decision = new Decision(outcome.name());
    decision.addAuthority("_" + this.getClass().getSimpleName());
    if (matchSequences != null) {
        decision.addAuthority("_Patterns");
        for (TokenPatternMatchSequence matchSequence : matchSequences) {
            decision.addAuthority(matchSequence.getTokenPattern().getName());
        }
    }
    return decision;
}
Also used : TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) Set(java.util.Set) TalismaneException(com.joliciel.talismane.TalismaneException) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Token(com.joliciel.talismane.tokeniser.Token) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome) Decision(com.joliciel.talismane.machineLearning.Decision) TokenPattern(com.joliciel.talismane.tokeniser.patterns.TokenPattern) TokenPatternMatchSequence(com.joliciel.talismane.tokeniser.patterns.TokenPatternMatchSequence) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) Sentence(com.joliciel.talismane.rawText.Sentence) HashSet(java.util.HashSet) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence)

Example 30 with Decision

use of com.joliciel.talismane.machineLearning.Decision in project talismane by joliciel-informatique.

the class SimpleTokeniser method tokeniseInternal.

/**
 * Produces a single tokenisation in which every token of the initial
 * sequence (whitespace included) is tagged with the SEPARATE outcome.
 *
 * @param initialSequence
 *            the sequence whose tokens, including whitespace, are tagged
 * @param sentence
 *            the sentence the resulting sequence is attached to
 * @return a list containing exactly one tokenised atomic sequence
 */
@Override
protected List<TokenisedAtomicTokenSequence> tokeniseInternal(TokenSequence initialSequence, Sentence sentence) {
    TokenisedAtomicTokenSequence allSeparated = new TokenisedAtomicTokenSequence(sentence, 0, this.getSessionId());
    for (Token atomicToken : initialSequence.listWithWhiteSpace()) {
        Decision separateDecision = new Decision(TokeniserOutcome.SEPARATE.name());
        // The tag is read back from the decision's outcome name.
        TokeniserOutcome tag = TokeniserOutcome.valueOf(separateDecision.getOutcome());
        allSeparated.add(new TaggedToken<TokeniserOutcome>(atomicToken, separateDecision, tag));
    }

    List<TokenisedAtomicTokenSequence> result = new ArrayList<TokenisedAtomicTokenSequence>();
    result.add(allSeparated);
    return result;
}
Also used : Decision(com.joliciel.talismane.machineLearning.Decision)

Aggregations

Decision (com.joliciel.talismane.machineLearning.Decision)37 ArrayList (java.util.ArrayList)24 Config (com.typesafe.config.Config)15 TreeSet (java.util.TreeSet)15 RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment)13 Token (com.joliciel.talismane.tokeniser.Token)12 Test (org.junit.Test)12 Sentence (com.joliciel.talismane.rawText.Sentence)11 TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence)11 List (java.util.List)11 FeatureResult (com.joliciel.talismane.machineLearning.features.FeatureResult)10 TalismaneTest (com.joliciel.talismane.TalismaneTest)9 DecisionMaker (com.joliciel.talismane.machineLearning.DecisionMaker)9 PosTagSequence (com.joliciel.talismane.posTagger.PosTagSequence)8 PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken)8 TalismaneException (com.joliciel.talismane.TalismaneException)7 WeightedOutcome (com.joliciel.talismane.utils.WeightedOutcome)7 HashSet (java.util.HashSet)7 Shape (com.joliciel.jochre.graphics.Shape)6 HashMap (java.util.HashMap)6