Example usage of edu.illinois.cs.cogcomp.core.datastructures.IntPair from the CogComp project cogcomp-nlp: class IllinoisTokenizerTest, method testIllinoisTokenizerMultiline.
/**
 * Verifies multi-sentence tokenization: sentence-end token indexes, individual
 * token strings, and character offsets across a line break.
 */
@Test
public void testIllinoisTokenizerMultiline() {
    Tokenizer tkr = new IllinoisTokenizer();
    // Use a literal "\n" rather than System.lineSeparator(): the character
    // offsets asserted below assume a one-character separator, so the original
    // form of this test would fail on platforms where the separator is "\r\n".
    String text = "Mr. Dawkins -- a liberal professor -- doesn't like fundamentalists. "
            + "\n" + "He is intolerant of intolerance!";
    Tokenizer.Tokenization tknzn = tkr.tokenizeTextSpan(text);
    // Two sentences: the first ends after token index 12, the second after 18.
    int[] sentEndOffsets = tknzn.getSentenceEndTokenIndexes();
    assertEquals(2, sentEndOffsets.length);
    assertEquals(12, sentEndOffsets[0]);
    assertEquals(18, sentEndOffsets[1]);
    String[] tokens = tknzn.getTokens();
    assertEquals("--", tokens[6]);
    assertEquals("of", tokens[15]);
    // Spot-check character offsets: token 8 is "n't" (split out of "doesn't"),
    // token 14 is "intolerant" in the second sentence.
    IntPair[] tokenOffsets = tknzn.getCharacterOffsets();
    int notIndex = 8;
    IntPair notOffsets = new IntPair(42, 45);
    assertEquals(notOffsets, tokenOffsets[notIndex]);
    int intolerantIndex = 14;
    IntPair intolerantOffsets = new IntPair(77, 87);
    assertEquals(intolerantOffsets, tokenOffsets[intolerantIndex]);
}
Example usage of edu.illinois.cs.cogcomp.core.datastructures.IntPair from the CogComp project cogcomp-nlp: class IllinoisTokenizerTest, method testIllinoisTokenizer.
/**
 * Test method for {@link IllinoisTokenizer}: checks the token strings and
 * (start, end) character offsets produced for two short sentences.
 */
@Test
public void testIllinoisTokenizer() {
    Tokenizer tokenizer = new IllinoisTokenizer();

    // Sentence 1: expected tokens with their character-offset bounds.
    String sentence = "This is a test.";
    String[] tokens = { "This", "is", "a", "test", "." };
    int[][] bounds = { { 0, 4 }, { 5, 7 }, { 8, 9 }, { 12, 16 }, { 16, 17 } };
    IntPair[] offsets = new IntPair[bounds.length];
    for (int i = 0; i < bounds.length; ++i)
        offsets[i] = new IntPair(bounds[i][0], bounds[i][1]);
    doTokenizerTest(tokenizer, sentence, tokens, offsets);

    // Sentence 2: punctuation attached to words must be split off.
    sentence = "Hello, world! I am at UIUC.";
    tokens = new String[] { "Hello", ",", "world", "!", "I", "am", "at", "UIUC", "." };
    bounds = new int[][] { { 0, 5 }, { 5, 6 }, { 7, 12 }, { 12, 13 }, { 14, 15 },
            { 16, 18 }, { 19, 21 }, { 22, 26 }, { 26, 27 } };
    offsets = new IntPair[bounds.length];
    for (int i = 0; i < bounds.length; ++i)
        offsets[i] = new IntPair(bounds[i][0], bounds[i][1]);
    doTokenizerTest(tokenizer, sentence, tokens, offsets);
}
Example usage of edu.illinois.cs.cogcomp.core.datastructures.IntPair from the CogComp project cogcomp-nlp: class StatefullTokenizerTest, method testCharacterOffsetToTokenIndex.
/**
 * Tests whether the mapping between character offset and token index is correct,
 * including when the annotated text is embedded in a larger document with
 * leading/trailing "waste" spans that carry no tokens.
 */
@Test
public void testCharacterOffsetToTokenIndex() {
    String normal = "The ordinary sample.\n\nDon't mess things up.";
    String leadingWaste = "<ignoreme>wastedspace</ignoreme>";
    String postWaste = " \n<ignoremetoo>aaaargh</ignoremetoo>";
    String other = leadingWaste + normal + postWaste;
    TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
    List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();
    assertEquals(13, normalToks.get(2).getStartCharOffset());
    assertEquals(24, normalToks.get(5).getStartCharOffset());
    int ignoreUpToOffset = leadingWaste.length();
    // Size from the actual token count instead of the former hard-coded 10:
    // a tokenizer change would otherwise throw ArrayIndexOutOfBoundsException
    // (or leave null slots) in the fill loop below.
    IntPair[] characterOffsets = new IntPair[normalToks.size()];
    String[] tokens = taNormal.getTokens();
    // Shift every token's offsets by the length of the leading waste span.
    for (int i = 0; i < normalToks.size(); ++i) {
        Constituent t = normalToks.get(i);
        characterOffsets[i] = new IntPair(ignoreUpToOffset + t.getStartCharOffset(),
                ignoreUpToOffset + t.getEndCharOffset());
    }
    List<Constituent> sentences = taNormal.getView(ViewNames.SENTENCE).getConstituents();
    int[] sentenceEndPositions = new int[sentences.size()];
    for (int i = 0; i < sentences.size(); ++i) {
        Constituent s = sentences.get(i);
        sentenceEndPositions[i] = s.getEndSpan();
    }
    // all info should be same except initial char offsets of tokens ignore spans of text
    TextAnnotation taOther = new TextAnnotation("test", "other", other, characterOffsets,
            tokens, sentenceEndPositions);
    List<Constituent> otherToks = taOther.getView(ViewNames.TOKENS).getConstituents();
    int thirdTokNormalStart = normalToks.get(2).getStartCharOffset();
    int thirdTokOtherStart = otherToks.get(2).getStartCharOffset();
    assertEquals(thirdTokOtherStart, (thirdTokNormalStart + leadingWaste.length()));
    int eighthTokNormalStart = normalToks.get(8).getStartCharOffset();
    int eighthTokOtherStart = otherToks.get(8).getStartCharOffset();
    assertEquals(eighthTokOtherStart, (eighthTokNormalStart + leadingWaste.length()));
    // Offsets inside the waste spans, past the end, or between tokens must map to -1.
    int meaninglessStartOffset = taOther.getTokenIdFromCharacterOffset(2);
    assertEquals(-1, meaninglessStartOffset);
    int meaninglessPastEndOffset =
            taOther.getTokenIdFromCharacterOffset(leadingWaste.length() + normal.length() + 5);
    assertEquals(-1, meaninglessPastEndOffset);
    int meaninglessInBetweenToksOffset = taNormal.getTokenIdFromCharacterOffset(20);
    assertEquals(-1, meaninglessInBetweenToksOffset);
}
Example usage of edu.illinois.cs.cogcomp.core.datastructures.IntPair from the CogComp project cogcomp-nlp: class StatefullTokenizerTest, method testWhitespaceBehavior.
/**
 * Test Splitter behavior on text with leading/trailing whitespace. Example is use case where
 * xml markup has been replaced with whitespace of equal span, so token character
 * offsets into the cleaned text must still be valid against the original layout.
 */
@Test
public void testWhitespaceBehavior() {
    String origText = null;
    try {
        origText = LineIO.slurp(INFILE);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    // Blank out every xml tag with a run of spaces of identical length so that
    // character offsets in the cleaned text line up with the original.
    Pattern xmlTagPattern = Pattern.compile("(<[^>\\r\\n]+>)");
    Matcher xmlMatcher = xmlTagPattern.matcher(origText);
    StringBuilder cleanTextBldr = new StringBuilder();
    int lastAppendedCharOffset = 0;
    while (xmlMatcher.find()) {
        int start = xmlMatcher.start();
        int end = xmlMatcher.end();
        cleanTextBldr.append(origText.substring(lastAppendedCharOffset, start));
        for (int i = start; i < end; ++i) cleanTextBldr.append(" ");
        lastAppendedCharOffset = end;
    }
    cleanTextBldr.append(origText.substring(lastAppendedCharOffset));
    String cleanText = cleanTextBldr.toString();
    // Collect the character spans of every "Sun"-containing word as ground truth.
    Pattern sun = Pattern.compile("\\w*Sun\\w*");
    Matcher sunMatcher = sun.matcher(cleanText);
    Set<IntPair> sunSpans = new HashSet<>();
    while (sunMatcher.find()) sunSpans.add(new IntPair(sunMatcher.start(), sunMatcher.end()));
    SentenceSplitter splitter = new SentenceSplitter(new String[] { cleanText });
    Sentence[] sents = splitter.splitAll();
    Sentence s = sents[0];
    LinkedVector words = s.wordSplit();
    for (int i = 0; i < words.size(); ++i) {
        // BUG FIX: the original indexed words.get(0) here, so only the first
        // word of the sentence was ever checked; index with the loop variable.
        Word word = (Word) words.get(i);
        if ("Sun".equals(word.form)) {
            IntPair tokenCharOffsets = new IntPair(word.start, word.end);
            assertTrue(sunSpans.contains(tokenCharOffsets));
        }
    }
    StatefulTokenizer statefulTokenizer = new StatefulTokenizer();
    Tokenizer.Tokenization tokenInfo = statefulTokenizer.tokenizeTextSpan(cleanText);
    assertEquals(tokenInfo.getCharacterOffsets().length, tokenInfo.getTokens().length);
    for (int i = 0; i < tokenInfo.getTokens().length; ++i) {
        String tok = tokenInfo.getTokens()[i];
        if (tok.equals("Sun")) {
            IntPair tokCharOffsets = tokenInfo.getCharacterOffsets()[i];
            if (!sunSpans.contains(tokCharOffsets)) {
                // Print diagnostic context before the assertion fails below.
                String origTextSubstring =
                        cleanText.substring(tokCharOffsets.getFirst(), tokCharOffsets.getSecond());
                System.err.println("ERROR: tokenizer has form '" + tok
                        + "', but offsets refer to substring '" + origTextSubstring + "'.");
            }
            assertTrue(sunSpans.contains(tokCharOffsets));
        }
    }
    // Building a TextAnnotation from the tokenization must succeed end-to-end.
    TextAnnotation statefulTa = new TextAnnotation("test", "test", cleanText,
            tokenInfo.getCharacterOffsets(), tokenInfo.getTokens(),
            tokenInfo.getSentenceEndTokenIndexes());
    assertNotNull(statefulTa);
}
Example usage of edu.illinois.cs.cogcomp.core.datastructures.IntPair from the CogComp project cogcomp-nlp: class TextAnnotationTest, method testCharacterOffsetToTokenIndex.
/**
 * Tests whether the mapping between character offset and token index is correct,
 * including when the annotated text is embedded in a larger document with
 * leading/trailing "waste" spans that carry no tokens.
 */
@Test
public void testCharacterOffsetToTokenIndex() {
    String normal = "The ordinary sample.\n\nDon't mess things up.";
    String leadingWaste = "<ignoreme>wastedspace</ignoreme>";
    String postWaste = " \n<ignoremetoo>aaaargh</ignoremetoo>";
    String other = leadingWaste + normal + postWaste;
    TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
    List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();
    assertEquals(13, normalToks.get(2).getStartCharOffset());
    assertEquals(24, normalToks.get(5).getStartCharOffset());
    int ignoreUpToOffset = leadingWaste.length();
    // Size from the actual token count instead of the former hard-coded 10:
    // a tokenizer change would otherwise throw ArrayIndexOutOfBoundsException
    // (or leave null slots) in the fill loop below.
    IntPair[] characterOffsets = new IntPair[normalToks.size()];
    String[] tokens = taNormal.getTokens();
    // Shift every token's offsets by the length of the leading waste span.
    for (int i = 0; i < normalToks.size(); ++i) {
        Constituent t = normalToks.get(i);
        characterOffsets[i] = new IntPair(ignoreUpToOffset + t.getStartCharOffset(),
                ignoreUpToOffset + t.getEndCharOffset());
    }
    List<Constituent> sentences = taNormal.getView(ViewNames.SENTENCE).getConstituents();
    int[] sentenceEndPositions = new int[sentences.size()];
    for (int i = 0; i < sentences.size(); ++i) {
        Constituent s = sentences.get(i);
        sentenceEndPositions[i] = s.getEndSpan();
    }
    // all info should be same except initial char offsets of tokens ignore spans of text
    TextAnnotation taOther = new TextAnnotation("test", "other", other, characterOffsets,
            tokens, sentenceEndPositions);
    List<Constituent> otherToks = taOther.getView(ViewNames.TOKENS).getConstituents();
    int thirdTokNormalStart = normalToks.get(2).getStartCharOffset();
    int thirdTokOtherStart = otherToks.get(2).getStartCharOffset();
    assertEquals(thirdTokOtherStart, (thirdTokNormalStart + leadingWaste.length()));
    int eighthTokNormalStart = normalToks.get(8).getStartCharOffset();
    int eighthTokOtherStart = otherToks.get(8).getStartCharOffset();
    assertEquals(eighthTokOtherStart, (eighthTokNormalStart + leadingWaste.length()));
    // Offsets inside the waste spans, past the end, or between tokens must map to -1.
    int meaninglessStartOffset = taOther.getTokenIdFromCharacterOffset(2);
    assertEquals(-1, meaninglessStartOffset);
    int meaninglessPastEndOffset =
            taOther.getTokenIdFromCharacterOffset(leadingWaste.length() + normal.length() + 5);
    assertEquals(-1, meaninglessPastEndOffset);
    int meaninglessInBetweenToksOffset = taNormal.getTokenIdFromCharacterOffset(20);
    assertEquals(-1, meaninglessInBetweenToksOffset);
}
End of aggregated IntPair usage examples.