Search in sources :

Example 76 with PosTaggedToken

use of com.joliciel.talismane.posTagger.PosTaggedToken in project talismane by joliciel-informatique.

the class PosTaggerStatisticsWriter method onNextPosTagSequence.

/**
 * Updates the running statistics with the tokens of one pos-tagged sentence.
 * <p>
 * The sentence counters are updated first, then every token except the one
 * tagged {@code PosTag.ROOT_POS_TAG} contributes to the word sets, the
 * open/closed class counters, the "unknown" counters (relative to the lexicon
 * and to the optional reference statistics), the alphanumeric counters and the
 * per-tag-code counts.
 *
 * @param posTagSequence the pos-tagged sentence to add to the statistics
 * @throws TalismaneException declared by the overridden method
 */
@Override
public void onNextPosTagSequence(PosTagSequence posTagSequence) throws TalismaneException {
    stats.sentenceCount++;
    stats.sentenceLengthStats.addValue(posTagSequence.size());

    for (PosTaggedToken taggedToken : posTagSequence) {
        PosTag tag = taggedToken.getTag();
        // The artificial root token is not counted.
        if (tag.equals(PosTag.ROOT_POS_TAG)) {
            continue;
        }

        Token token = taggedToken.getToken();
        // A token counts as known in the lexicon when it has at least one possible pos-tag.
        final boolean inLexicon = token.getPossiblePosTags().size() > 0;

        String text = token.getOriginalText();
        stats.words.add(text);

        // Without reference statistics, every token counts as unknown in the reference corpus.
        final boolean inRefCorpus = referenceStats != null && referenceStats.words.contains(text);

        if (!inLexicon) {
            stats.unknownInLexiconCount++;
        }

        PosTagOpenClassIndicator indicator = tag.getOpenClassIndicator();
        if (indicator == PosTagOpenClassIndicator.CLOSED) {
            stats.closedClassCount++;
            if (!inRefCorpus) {
                stats.closedClassUnknownInRefCorpus++;
            }
            if (!inLexicon) {
                stats.closedClassUnknownInLexicon++;
            }
        } else if (indicator == PosTagOpenClassIndicator.OPEN) {
            stats.openClassCount++;
            if (!inRefCorpus) {
                stats.openClassUnknownInRefCorpus++;
            }
            if (!inLexicon) {
                stats.openClassUnknownInLexicon++;
            }
        }

        if (!inRefCorpus) {
            stats.unknownTokenCount++;
        }

        if (alphanumeric.matcher(text).find()) {
            // Lower-casing uses the locale of the current session.
            stats.lowerCaseWords.add(text.toLowerCase(TalismaneSession.get(sessionId).getLocale()));
            stats.alphanumericCount++;
            if (!inRefCorpus) {
                stats.unknownAlphanumericCount++;
            }
            if (!inLexicon) {
                stats.unknownAlphaInLexiconCount++;
            }
        }

        stats.tokenCount++;

        // Increment the occurrence count for this tag code, starting from zero when absent.
        String tagCode = tag.getCode();
        Integer previous = stats.posTagCounts.get(tagCode);
        int updated = (previous == null) ? 1 : previous.intValue() + 1;
        stats.posTagCounts.put(tagCode, updated);
    }
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) Token(com.joliciel.talismane.tokeniser.Token)

Example 77 with PosTaggedToken

use of com.joliciel.talismane.posTagger.PosTaggedToken in project talismane by joliciel-informatique.

the class SerializationTest method testSerialize.

/**
 * Checks that a sentence, its token sequence, its pos-tag sequence, a parse
 * configuration and a parse tree survive a round trip through Java
 * serialization: each object read back must be equal to the one written.
 *
 * @throws Exception if building the analysis or (de)serializing it fails
 */
@Test
public void testSerialize() throws Exception {
    System.setProperty("config.file", "src/test/resources/test.conf");
    ConfigFactory.invalidateCaches();
    String sessionId = "test";
    Sentence sentence = new Sentence("Il aime les pommes", sessionId);
    TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
    // Token boundaries are written as prefix lengths so the offsets stay readable.
    tokenSequence.addToken("".length(), "Il".length());
    tokenSequence.addToken("Il ".length(), "Il aime".length());
    tokenSequence.addToken("Il aime ".length(), "Il aime les".length());
    tokenSequence.addToken("Il aime les ".length(), "Il aime les pommes".length());
    PosTagSequence posTagSequence = new PosTagSequence(tokenSequence);
    posTagSequence.addPosTaggedToken(new PosTaggedToken(posTagSequence.getNextToken(), new Decision("CLS", 0.90), sessionId));
    posTagSequence.addPosTaggedToken(new PosTaggedToken(posTagSequence.getNextToken(), new Decision("V", 0.70), sessionId));
    posTagSequence.addPosTaggedToken(new PosTaggedToken(posTagSequence.getNextToken(), new Decision("DET", 0.60), sessionId));
    posTagSequence.addPosTaggedToken(new PosTaggedToken(posTagSequence.getNextToken(), new Decision("NC", 0.80), sessionId));
    posTagSequence.prependRoot();
    ParseConfiguration configuration = new ParseConfiguration(posTagSequence);
    LOG.debug(configuration.toString());
    // ROOT ... il
    new ShiftTransition().apply(configuration);
    LOG.debug("Shift -> " + configuration.toString());
    // ROOT il <- aime
    new LeftArcEagerTransition("suj").apply(configuration);
    LOG.debug("Left -> " + configuration.toString());
    // ROOT -> aime
    new RightArcEagerTransition("root").apply(configuration);
    LOG.debug("Right -> " + configuration.toString());
    // ROOT aime ... les
    new ShiftTransition().apply(configuration);
    LOG.debug("Shift -> " + configuration.toString());
    // ROOT aime les <- pommes
    new LeftArcEagerTransition("det").apply(configuration);
    LOG.debug("Left -> " + configuration.toString());
    // ROOT aime -> pommes
    new RightArcEagerTransition("obj").apply(configuration);
    LOG.debug("Right -> " + configuration.toString());
    ParseTree parseTree = new ParseTree(configuration, true);
    LOG.debug(parseTree.toString());

    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    // Closing the object stream flushes its internal buffer, so the byte array
    // captured below is guaranteed to hold everything that was written.
    try (ObjectOutputStream oos = new ObjectOutputStream(bos)) {
        oos.writeObject(sentence);
        oos.writeObject(tokenSequence);
        oos.writeObject(posTagSequence);
        oos.writeObject(configuration);
        oos.writeObject(parseTree);
    }
    byte[] bytes = bos.toByteArray();

    // Objects are read back in the same order in which they were written.
    try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bytes))) {
        Sentence sentence2 = (Sentence) ois.readObject();
        TokenSequence tokenSequence2 = (TokenSequence) ois.readObject();
        PosTagSequence posTagSequence2 = (PosTagSequence) ois.readObject();
        ParseConfiguration configuration2 = (ParseConfiguration) ois.readObject();
        ParseTree parseTree2 = (ParseTree) ois.readObject();
        assertEquals(sentence, sentence2);
        assertEquals(tokenSequence, tokenSequence2);
        assertEquals(posTagSequence, posTagSequence2);
        assertEquals(configuration, configuration2);
        assertEquals(parseTree, parseTree2);
    }
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) ByteArrayOutputStream(java.io.ByteArrayOutputStream) ObjectOutputStream(java.io.ObjectOutputStream) Decision(com.joliciel.talismane.machineLearning.Decision) ByteArrayInputStream(java.io.ByteArrayInputStream) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) Sentence(com.joliciel.talismane.rawText.Sentence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) ObjectInputStream(java.io.ObjectInputStream) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Aggregations

PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken)77 ParseConfiguration (com.joliciel.talismane.parser.ParseConfiguration)24 PosTaggedTokenWrapper (com.joliciel.talismane.posTagger.features.PosTaggedTokenWrapper)20 PosTagSequence (com.joliciel.talismane.posTagger.PosTagSequence)14 Token (com.joliciel.talismane.tokeniser.Token)11 DependencyArc (com.joliciel.talismane.parser.DependencyArc)9 TalismaneException (com.joliciel.talismane.TalismaneException)8 Decision (com.joliciel.talismane.machineLearning.Decision)8 RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment)8 Sentence (com.joliciel.talismane.rawText.Sentence)8 TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence)8 HashMap (java.util.HashMap)7 List (java.util.List)7 TalismaneTest (com.joliciel.talismane.TalismaneTest)6 PosTaggerContext (com.joliciel.talismane.posTagger.PosTaggerContext)6 PosTaggerContextImpl (com.joliciel.talismane.posTagger.PosTaggerContextImpl)6 Config (com.typesafe.config.Config)6 ArrayList (java.util.ArrayList)6 Test (org.junit.Test)6 StringLiteralFeature (com.joliciel.talismane.machineLearning.features.StringLiteralFeature)5