use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class StatefullTokenizerTest method testDateTokenization.
/**
 * Verifies that a slash-delimited date embedded in ordinary text comes out of
 * tokenization as one token rather than being split at the slashes.
 */
@Test
public void testDateTokenization() {
    TokenizerTextAnnotationBuilder bldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, true));
    String tmp = "One two, three-four-five 10/23/2018 at 5:20pm one? Of course not! Be well, stranger. Bye-bye!";
    TextAnnotation taA = bldr.createTextAnnotation("test", "test", tmp);
    String[] toks = taA.getTokens();
    // Expected value first, so a failure message reports expected/actual the right way round.
    assertEquals("10/23/2018", toks[8]);
}
use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class StatefullTokenizerTest method testPeriodContract.
/**
 * Checks tokenization of periods that do not end a sentence: file names with
 * extensions, dotted acronyms, and an abbreviation ("Co.") in the middle of a sentence.
 */
@Test
public void testPeriodContract() {
    TokenizerTextAnnotationBuilder bldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, true));

    // File names keep their extension as part of the same token.
    String tmp = "Info is in tokenizer.pdf or the palatar.MOV file. The next sentence is a structure unto itself.";
    TextAnnotation taA = bldr.createTextAnnotation("test", "test", tmp);
    String[] toks = taA.getTokens();
    assertEquals("tokenizer.pdf", toks[3]);
    assertEquals("palatar.MOV", toks[6]);

    // Dotted acronyms stay whole, including their trailing period.
    tmp = "I am the man from U.N.C.L.E., but you are not at the U.N. now.";
    taA = bldr.createTextAnnotation("test", "test", tmp);
    toks = taA.getTokens();
    assertEquals("U.N.C.L.E.", toks[5]);
    assertEquals("U.N.", toks[13]);

    // The period after "Co" must not be treated as a sentence boundary.
    tmp = "The head of Inefficient Machine Co. Edward Doolally later relented.";
    taA = bldr.createTextAnnotation("test", "test", tmp);
    assertEquals(1, taA.getNumberOfSentences());
}
use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class StatefullTokenizerTest method testDecimalNotation.
/**
 * Checks that numbers written with a decimal point, optionally with a leading
 * currency sign or a trailing unit/suffix, are kept as single tokens and that the
 * decimal point does not introduce a sentence break.
 */
@Test
public void testDecimalNotation() {
    TokenizerTextAnnotationBuilder bldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, true));

    // Currency amount with an integer part and a fraction.
    String text = "$1.09 percent like me.";
    TextAnnotation taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(1, taA.getNumberOfSentences());
    String[] toks = taA.getTokens();
    assertEquals("$1.09", toks[0]);

    // Currency amount that starts directly with the decimal point.
    text = "Take the $.10 tour.";
    taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(1, taA.getNumberOfSentences());
    toks = taA.getTokens();
    assertEquals("$.10", toks[2]);

    // Currency amount followed by a letter suffix.
    text = "Take the $10B tour.";
    taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(1, taA.getNumberOfSentences());
    toks = taA.getTokens();
    assertEquals("$10B", toks[2]);

    // Amount wrapped in parentheses with newlines around the opening parenthesis.
    text = "\n(\n$.10)";
    taA = bldr.createTextAnnotation("test", "test", text);
    assertEquals(1, taA.getNumberOfSentences());
    toks = taA.getTokens();
    assertEquals("(", toks[0]);
    assertEquals("$.10", toks[1]);
    assertEquals(")", toks[2]);

    // Leading-decimal number with a unit attached.
    taA = bldr.createTextAnnotation("test", "test", "Bill was traveling at .34km an hour.");
    toks = taA.getTokens();
    assertEquals(".34km", toks[4]);
}
use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class StatefullTokenizerTest method testCharacterOffsetToTokenIndex.
/**
 * Tests the mapping between character offsets and token indexes, both for a
 * TextAnnotation produced by the tokenizer and for one built by hand whose tokens
 * are shifted right by a span of leading text that belongs to no token.
 */
@Test
public void testCharacterOffsetToTokenIndex() {
    String normal = "The ordinary sample.\n\nDon't mess things up.";
    String leadingWaste = "<ignoreme>wastedspace</ignoreme>";
    String postWaste = " \n<ignoremetoo>aaaargh</ignoremetoo>";
    String other = leadingWaste + normal + postWaste;

    TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
    List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();
    assertEquals(13, normalToks.get(2).getStartCharOffset());
    assertEquals(24, normalToks.get(5).getStartCharOffset());

    // Shift every token's character span right by the length of the leading waste.
    int ignoreUpToOffset = leadingWaste.length();
    // Size the array from the token list instead of a hard-coded count, so the test
    // neither overflows nor leaves null entries if the tokenization ever changes.
    IntPair[] characterOffsets = new IntPair[normalToks.size()];
    String[] tokens = taNormal.getTokens();
    for (int i = 0; i < normalToks.size(); ++i) {
        Constituent t = normalToks.get(i);
        characterOffsets[i] = new IntPair(ignoreUpToOffset + t.getStartCharOffset(), ignoreUpToOffset + t.getEndCharOffset());
    }

    // Reuse the sentence boundaries (token-index end spans) of the tokenized annotation.
    List<Constituent> sentences = taNormal.getView(ViewNames.SENTENCE).getConstituents();
    int[] sentenceEndPositions = new int[sentences.size()];
    for (int i = 0; i < sentences.size(); ++i) {
        Constituent s = sentences.get(i);
        sentenceEndPositions[i] = s.getEndSpan();
    }

    // Same tokens and sentence ends as taNormal; only the character offsets differ,
    // because the surrounding waste text is part of the annotation's text.
    TextAnnotation taOther = new TextAnnotation("test", "other", other, characterOffsets, tokens, sentenceEndPositions);
    List<Constituent> otherToks = taOther.getView(ViewNames.TOKENS).getConstituents();

    int thirdTokNormalStart = normalToks.get(2).getStartCharOffset();
    int thirdTokOtherStart = otherToks.get(2).getStartCharOffset();
    assertEquals(thirdTokNormalStart + ignoreUpToOffset, thirdTokOtherStart);

    int ninthTokNormalStart = normalToks.get(8).getStartCharOffset();
    int ninthTokOtherStart = otherToks.get(8).getStartCharOffset();
    assertEquals(ninthTokNormalStart + ignoreUpToOffset, ninthTokOtherStart);

    // Offsets that fall outside every token are expected to map to -1:
    // inside the leading waste ...
    int meaninglessStartOffset = taOther.getTokenIdFromCharacterOffset(2);
    assertEquals(-1, meaninglessStartOffset);
    // ... beyond the last real token, inside the trailing waste ...
    int meaninglessPastEndOffset = taOther.getTokenIdFromCharacterOffset(leadingWaste.length() + normal.length() + 5);
    assertEquals(-1, meaninglessPastEndOffset);
    // ... and on the newline that separates the two sentences of the plain text.
    int meaninglessInBetweenToksOffset = taNormal.getTokenIdFromCharacterOffset(20);
    assertEquals(-1, meaninglessInBetweenToksOffset);
}
use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class TextAnnotationTest method testCharacterOffsetToTokenIndex.
/**
 * Tests the mapping between character offsets and token indexes, both for a
 * TextAnnotation produced by the tokenizer and for one built by hand whose tokens
 * are shifted right by a span of leading text that belongs to no token.
 */
@Test
public void testCharacterOffsetToTokenIndex() {
    String normal = "The ordinary sample.\n\nDon't mess things up.";
    String leadingWaste = "<ignoreme>wastedspace</ignoreme>";
    String postWaste = " \n<ignoremetoo>aaaargh</ignoremetoo>";
    String other = leadingWaste + normal + postWaste;

    TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
    List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();
    assertEquals(13, normalToks.get(2).getStartCharOffset());
    assertEquals(24, normalToks.get(5).getStartCharOffset());

    // Shift every token's character span right by the length of the leading waste.
    int ignoreUpToOffset = leadingWaste.length();
    // Size the array from the token list instead of a hard-coded count, so the test
    // neither overflows nor leaves null entries if the tokenization ever changes.
    IntPair[] characterOffsets = new IntPair[normalToks.size()];
    String[] tokens = taNormal.getTokens();
    for (int i = 0; i < normalToks.size(); ++i) {
        Constituent t = normalToks.get(i);
        characterOffsets[i] = new IntPair(ignoreUpToOffset + t.getStartCharOffset(), ignoreUpToOffset + t.getEndCharOffset());
    }

    // Reuse the sentence boundaries (token-index end spans) of the tokenized annotation.
    List<Constituent> sentences = taNormal.getView(ViewNames.SENTENCE).getConstituents();
    int[] sentenceEndPositions = new int[sentences.size()];
    for (int i = 0; i < sentences.size(); ++i) {
        Constituent s = sentences.get(i);
        sentenceEndPositions[i] = s.getEndSpan();
    }

    // Same tokens and sentence ends as taNormal; only the character offsets differ,
    // because the surrounding waste text is part of the annotation's text.
    TextAnnotation taOther = new TextAnnotation("test", "other", other, characterOffsets, tokens, sentenceEndPositions);
    List<Constituent> otherToks = taOther.getView(ViewNames.TOKENS).getConstituents();

    int thirdTokNormalStart = normalToks.get(2).getStartCharOffset();
    int thirdTokOtherStart = otherToks.get(2).getStartCharOffset();
    assertEquals(thirdTokNormalStart + ignoreUpToOffset, thirdTokOtherStart);

    int ninthTokNormalStart = normalToks.get(8).getStartCharOffset();
    int ninthTokOtherStart = otherToks.get(8).getStartCharOffset();
    assertEquals(ninthTokNormalStart + ignoreUpToOffset, ninthTokOtherStart);

    // Offsets that fall outside every token are expected to map to -1:
    // inside the leading waste ...
    int meaninglessStartOffset = taOther.getTokenIdFromCharacterOffset(2);
    assertEquals(-1, meaninglessStartOffset);
    // ... beyond the last real token, inside the trailing waste ...
    int meaninglessPastEndOffset = taOther.getTokenIdFromCharacterOffset(leadingWaste.length() + normal.length() + 5);
    assertEquals(-1, meaninglessPastEndOffset);
    // ... and on the newline that separates the two sentences of the plain text.
    int meaninglessInBetweenToksOffset = taNormal.getTokenIdFromCharacterOffset(20);
    assertEquals(-1, meaninglessInBetweenToksOffset);
}
Aggregations