Examples with TokenizerTextAnnotationBuilder - edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder

Example 16 with TokenizerTextAnnotationBuilder

use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class StatefullTokenizerTest method testDateTokenization.

/**
 * Verifies that a slash-separated date embedded in running text is kept as a single token.
 *
 * <p>The sample text surrounds the date with punctuation, hyphenated words and a clock time;
 * the test expects the date to appear, unsplit, at token index 8.
 */
@Test
public void testDateTokenization() {
    TokenizerTextAnnotationBuilder bldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, true));
    String tmp = "One two, three-four-five 10/23/2018 at 5:20pm one? Of course not! Be well, stranger. Bye-bye!";
    TextAnnotation taA = bldr.createTextAnnotation("test", "test", tmp);
    String[] toks = taA.getTokens();
    // assertEquals takes (expected, actual); this order makes a failure report the literal
    // as "expected" and the tokenizer output as "actual".
    assertEquals("10/23/2018", toks[8]);
}

Also used : TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Test(org.junit.Test)

Example 17 with TokenizerTextAnnotationBuilder

use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class StatefullTokenizerTest method testPeriodContract.

/**
 * Verifies how periods that do not end a sentence are handled.
 *
 * <p>Three scenarios are checked:
 * <ul>
 *   <li>file names with extensions ({@code tokenizer.pdf}, {@code palatar.MOV}) stay single tokens;</li>
 *   <li>dotted acronyms ({@code U.N.C.L.E.}, {@code U.N.}) stay single tokens, trailing period included;</li>
 *   <li>a text containing the abbreviation {@code Co.} followed by a capitalized name is still
 *       reported as one sentence.</li>
 * </ul>
 */
@Test
public void testPeriodContract() {
    TokenizerTextAnnotationBuilder bldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, true));

    // File names with extensions.
    String tmp = "Info is in tokenizer.pdf or the palatar.MOV file. The next sentence is a structure unto itself.";
    TextAnnotation taA = bldr.createTextAnnotation("test", "test", tmp);
    String[] toks = taA.getTokens();
    // assertEquals takes (expected, actual); keep the literal first so failure messages read correctly.
    assertEquals("tokenizer.pdf", toks[3]);
    assertEquals("palatar.MOV", toks[6]);

    // Dotted acronyms, one followed by a comma and one followed by another word.
    tmp = "I am the man from U.N.C.L.E., but you are not at the U.N. now.";
    taA = bldr.createTextAnnotation("test", "test", tmp);
    toks = taA.getTokens();
    assertEquals("U.N.C.L.E.", toks[5]);
    assertEquals("U.N.", toks[13]);

    // Abbreviation followed by a capitalized word: only the sentence count is checked here,
    // so the token array is not fetched.
    tmp = "The head of Inefficient Machine Co. Edward Doolally later relented.";
    taA = bldr.createTextAnnotation("test", "test", tmp);
    assertEquals(1, taA.getNumberOfSentences());
}

Example 18 with TokenizerTextAnnotationBuilder

use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class StatefullTokenizerTest method testDecimalNotation.

/**
 * Verifies tokenization of decimal and currency notation.
 *
 * <p>Checks that amounts such as {@code $1.09}, {@code $.10} and {@code $10B}, and a
 * leading-decimal quantity with a unit suffix ({@code .34km}), are each kept as one token, and
 * that the decimal point in the currency examples does not cause the text to be reported as
 * more than one sentence.
 */
@Test
public void testDecimalNotation() {
    TokenizerTextAnnotationBuilder bldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, true));

    // Amount with dollars and cents at the start of the text.
    String text = "$1.09 percent like me.";
    TextAnnotation taA = bldr.createTextAnnotation("test", "test", text);
    // assertEquals takes (expected, actual); keep the literal first so failure messages read correctly.
    assertEquals(1, taA.getNumberOfSentences());
    String[] toks = taA.getTokens();
    assertEquals("$1.09", toks[0]);

    // Amount with no digits before the decimal point.
    text = "Take the $.10 tour.";
    taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(1, taA.getNumberOfSentences());
    toks = taA.getTokens();
    assertEquals("$.10", toks[2]);

    // Amount with a magnitude suffix.
    text = "Take the $10B tour.";
    taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(1, taA.getNumberOfSentences());
    toks = taA.getTokens();
    assertEquals("$10B", toks[2]);

    // Parenthesized amount broken up by newlines; the sentence count is asserted once
    // (the original repeated the same assertion after the token checks).
    text = "\n(\n$.10)";
    taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(1, taA.getNumberOfSentences());
    toks = taA.getTokens();
    assertEquals("(", toks[0]);
    assertEquals("$.10", toks[1]);
    assertEquals(")", toks[2]);

    // Leading-decimal quantity with a unit suffix.
    taA = bldr.createTextAnnotation("test", "test", "Bill was traveling at .34km an hour.");
    toks = taA.getTokens();
    assertEquals(".34km", toks[4]);
}

Example 19 with TokenizerTextAnnotationBuilder

use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class StatefullTokenizerTest method testCharacterOffsetToTokenIndex.

/**
 * Tests whether the mapping between character offset and token index is correct.
 *
 * <p>A plain text is tokenized first. A second {@link TextAnnotation} is then built by hand over
 * the same tokens, embedded in a longer text with leading and trailing markup, with every token
 * offset shifted by the length of the leading markup. The test checks that the shifted offsets
 * are reported back unchanged and that character offsets falling in the leading markup, after
 * the last token, or between two sentences map to token id {@code -1}.
 */
@Test
public void testCharacterOffsetToTokenIndex() {
    String normal = "The ordinary sample.\n\nDon't mess things up.";
    String leadingWaste = "<ignoreme>wastedspace</ignoreme>";
    String postWaste = "   \n<ignoremetoo>aaaargh</ignoremetoo>";
    String other = leadingWaste + normal + postWaste;

    TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
    List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();
    assertEquals(13, normalToks.get(2).getStartCharOffset());
    assertEquals(24, normalToks.get(5).getStartCharOffset());

    // Shift every token span by the length of the leading markup. The array is sized from the
    // actual token count rather than a hard-coded 10, so a change in tokenizer output cannot
    // overflow the array or leave null entries behind.
    int ignoreUpToOffset = leadingWaste.length();
    IntPair[] characterOffsets = new IntPair[normalToks.size()];
    String[] tokens = taNormal.getTokens();
    for (int i = 0; i < normalToks.size(); ++i) {
        Constituent t = normalToks.get(i);
        characterOffsets[i] = new IntPair(ignoreUpToOffset + t.getStartCharOffset(), ignoreUpToOffset + t.getEndCharOffset());
    }

    // Collect the end span of each sentence as reported by the SENTENCE view.
    List<Constituent> sentences = taNormal.getView(ViewNames.SENTENCE).getConstituents();
    int[] sentenceEndPositions = new int[sentences.size()];
    for (int i = 0; i < sentences.size(); ++i) {
        Constituent s = sentences.get(i);
        sentenceEndPositions[i] = s.getEndSpan();
    }

    // all info should be same except initial char offsets of tokens ignore spans of text
    TextAnnotation taOther = new TextAnnotation("test", "other", other, characterOffsets, tokens, sentenceEndPositions);
    List<Constituent> otherToks = taOther.getView(ViewNames.TOKENS).getConstituents();

    // assertEquals takes (expected, actual): the expected value is the original offset plus the shift.
    int thirdTokNormalStart = normalToks.get(2).getStartCharOffset();
    int thirdTokOtherStart = otherToks.get(2).getStartCharOffset();
    assertEquals(thirdTokNormalStart + ignoreUpToOffset, thirdTokOtherStart);
    int eighthTokNormalStart = normalToks.get(8).getStartCharOffset();
    int eighthTokOtherStart = otherToks.get(8).getStartCharOffset();
    assertEquals(eighthTokNormalStart + ignoreUpToOffset, eighthTokOtherStart);

    // Offset inside the leading markup: no token there.
    int meaninglessStartOffset = taOther.getTokenIdFromCharacterOffset(2);
    assertEquals(-1, meaninglessStartOffset);
    // Offset after the last token, inside the trailing markup.
    int meaninglessPastEndOffset = taOther.getTokenIdFromCharacterOffset(ignoreUpToOffset + normal.length() + 5);
    assertEquals(-1, meaninglessPastEndOffset);
    // Offset 20 is the first newline between the two sentences of the plain text.
    int meaninglessInBetweenToksOffset = taNormal.getTokenIdFromCharacterOffset(20);
    assertEquals(-1, meaninglessInBetweenToksOffset);
}

Also used : TextAnnotationBuilder(edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) Test(org.junit.Test)

Example 20 with TokenizerTextAnnotationBuilder

use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class TextAnnotationTest method testCharacterOffsetToTokenIndex.

/**
 * Tests whether the mapping between character offset and token index is correct.
 *
 * <p>A plain text is tokenized, then a second {@link TextAnnotation} is constructed directly
 * from those tokens placed inside a longer text that has markup before and after it; each token
 * span is shifted by the length of the leading markup. The assertions confirm that the shifted
 * start offsets come back as supplied, and that offsets in the leading markup, beyond the final
 * token, or in the gap between sentences yield token id {@code -1}.
 */
@Test
public void testCharacterOffsetToTokenIndex() {
    String normal = "The ordinary sample.\n\nDon't mess things up.";
    String leadingWaste = "<ignoreme>wastedspace</ignoreme>";
    String postWaste = "   \n<ignoremetoo>aaaargh</ignoremetoo>";
    String other = leadingWaste + normal + postWaste;

    TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
    List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();
    assertEquals(13, normalToks.get(2).getStartCharOffset());
    assertEquals(24, normalToks.get(5).getStartCharOffset());

    // Build the shifted token spans. Size the array from the token list instead of the
    // literal 10 so the test does not silently depend on the exact token count.
    int ignoreUpToOffset = leadingWaste.length();
    IntPair[] characterOffsets = new IntPair[normalToks.size()];
    String[] tokens = taNormal.getTokens();
    for (int i = 0; i < normalToks.size(); ++i) {
        Constituent t = normalToks.get(i);
        characterOffsets[i] = new IntPair(ignoreUpToOffset + t.getStartCharOffset(), ignoreUpToOffset + t.getEndCharOffset());
    }

    // Sentence boundaries are carried over unchanged from the SENTENCE view.
    List<Constituent> sentences = taNormal.getView(ViewNames.SENTENCE).getConstituents();
    int[] sentenceEndPositions = new int[sentences.size()];
    for (int i = 0; i < sentences.size(); ++i) {
        Constituent s = sentences.get(i);
        sentenceEndPositions[i] = s.getEndSpan();
    }

    // all info should be same except initial char offsets of tokens ignore spans of text
    TextAnnotation taOther = new TextAnnotation("test", "other", other, characterOffsets, tokens, sentenceEndPositions);
    List<Constituent> otherToks = taOther.getView(ViewNames.TOKENS).getConstituents();

    // Expected value first, per assertEquals(expected, actual).
    int thirdTokNormalStart = normalToks.get(2).getStartCharOffset();
    int thirdTokOtherStart = otherToks.get(2).getStartCharOffset();
    assertEquals(thirdTokNormalStart + ignoreUpToOffset, thirdTokOtherStart);
    int eighthTokNormalStart = normalToks.get(8).getStartCharOffset();
    int eighthTokOtherStart = otherToks.get(8).getStartCharOffset();
    assertEquals(eighthTokNormalStart + ignoreUpToOffset, eighthTokOtherStart);

    // An offset within the leading markup has no token.
    int meaninglessStartOffset = taOther.getTokenIdFromCharacterOffset(2);
    assertEquals(-1, meaninglessStartOffset);
    // An offset past the last token (within the trailing markup) has no token.
    int meaninglessPastEndOffset = taOther.getTokenIdFromCharacterOffset(ignoreUpToOffset + normal.length() + 5);
    assertEquals(-1, meaninglessPastEndOffset);
    // Offset 20 of the plain text is the newline separating the two sentences.
    int meaninglessInBetweenToksOffset = taNormal.getTokenIdFromCharacterOffset(20);
    assertEquals(-1, meaninglessInBetweenToksOffset);
}

Aggregations

TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder): 42; TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation): 31; StatefulTokenizer (edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer): 29; TextAnnotationBuilder (edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder): 19; Test (org.junit.Test): 16; ResourceManager (edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager): 12; Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent): 11; AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException): 9; POSAnnotator (edu.illinois.cs.cogcomp.pos.POSAnnotator): 9; Properties (java.util.Properties): 7; IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair): 6; ChunkerAnnotator (edu.illinois.cs.cogcomp.chunker.main.ChunkerAnnotator): 5; View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View): 5; POSTaggerAnnotator (edu.stanford.nlp.pipeline.POSTaggerAnnotator): 5; ParserAnnotator (edu.stanford.nlp.pipeline.ParserAnnotator): 5; XmlTextAnnotationMaker (edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker): 4; XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor): 4; ChunkerConfigurator (edu.illinois.cs.cogcomp.chunker.main.ChunkerConfigurator): 3; XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation): 3; SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView): 2