Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
Class StatefullTokenizerTest, method testWhitespaceBehavior.
/**
 * Test Splitter behavior on text with leading/trailing whitespace. The example is a use case
 * where xml markup has been replaced with whitespace of equal span.
 */
@Test
public void testWhitespaceBehavior() {
    String origText = null;
    try {
        origText = LineIO.slurp(INFILE);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    Pattern xmlTagPattern = Pattern.compile("(<[^>\\r\\n]+>)");
    Matcher xmlMatcher = xmlTagPattern.matcher(origText);
    StringBuilder cleanTextBldr = new StringBuilder();
    int lastAppendedCharOffset = 0;
    while (xmlMatcher.find()) {
        int start = xmlMatcher.start();
        int end = xmlMatcher.end();
        cleanTextBldr.append(origText.substring(lastAppendedCharOffset, start));
        for (int i = start; i < end; ++i)
            cleanTextBldr.append(" ");
        lastAppendedCharOffset = end;
    }
    cleanTextBldr.append(origText.substring(lastAppendedCharOffset));
    String cleanText = cleanTextBldr.toString();
    // collect the character spans of all tokens containing "Sun" in the cleaned text
    Pattern sun = Pattern.compile("\\w*Sun\\w*");
    Matcher sunMatcher = sun.matcher(cleanText);
    Set<IntPair> sunSpans = new HashSet<>();
    while (sunMatcher.find())
        sunSpans.add(new IntPair(sunMatcher.start(), sunMatcher.end()));
    // check token offsets in tokens returned by SentenceSplitter
    SentenceSplitter splitter = new SentenceSplitter(new String[] { cleanText });
    Sentence[] sents = splitter.splitAll();
    Sentence s = sents[0];
    LinkedVector words = s.wordSplit();
    for (int i = 0; i < words.size(); ++i) {
        Word word = (Word) words.get(i);
        if ("Sun".equals(word.form)) {
            IntPair tokenCharOffsets = new IntPair(word.start, word.end);
            assertTrue(sunSpans.contains(tokenCharOffsets));
        }
    }
    StatefulTokenizer statefulTokenizer = new StatefulTokenizer();
    Tokenizer.Tokenization tokenInfo = statefulTokenizer.tokenizeTextSpan(cleanText);
    assertEquals(tokenInfo.getCharacterOffsets().length, tokenInfo.getTokens().length);
    for (int i = 0; i < tokenInfo.getTokens().length; ++i) {
        String tok = tokenInfo.getTokens()[i];
        if (tok.equals("Sun")) {
            IntPair tokCharOffsets = tokenInfo.getCharacterOffsets()[i];
            if (!sunSpans.contains(tokCharOffsets)) {
                String origTextSubstring =
                        cleanText.substring(tokCharOffsets.getFirst(), tokCharOffsets.getSecond());
                System.err.println("ERROR: tokenizer has form '" + tok
                        + "', but offsets refer to substring '" + origTextSubstring + "'.");
            }
            assertTrue(sunSpans.contains(tokCharOffsets));
        }
    }
    TextAnnotation statefulTa = new TextAnnotation("test", "test", cleanText,
            tokenInfo.getCharacterOffsets(), tokenInfo.getTokens(), tokenInfo.getSentenceEndTokenIndexes());
    assertNotNull(statefulTa);
}
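The assertions above rely on IntPair behaving as a value type: tokCharOffsets is a different object from the pairs stored in sunSpans, yet the HashSet lookup succeeds. A minimal standalone sketch of that property (not taken from the test; the class name is made up for illustration):

import java.util.HashSet;
import java.util.Set;
import edu.illinois.cs.cogcomp.core.datastructures.IntPair;

public class IntPairEqualitySketch {
    public static void main(String[] args) {
        Set<IntPair> spans = new HashSet<>();
        spans.add(new IntPair(10, 13));
        // a freshly constructed pair with the same offsets is found in the set,
        // which is exactly what sunSpans.contains(tokCharOffsets) depends on above
        System.out.println(spans.contains(new IntPair(10, 13)));  // true
    }
}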
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
Class StatefullTokenizerTest, method testCharacterOffsetToTokenIndex.
/**
 * Test whether the mapping between character offsets and token indexes is correct.
 */
@Test
public void testCharacterOffsetToTokenIndex() {
    String normal = "The ordinary sample.\n\nDon't mess things up.";
    String leadingWaste = "<ignoreme>wastedspace</ignoreme>";
    String postWaste = " \n<ignoremetoo>aaaargh</ignoremetoo>";
    String other = leadingWaste + normal + postWaste;
    TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
    List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();
    assertEquals(13, normalToks.get(2).getStartCharOffset());
    assertEquals(24, normalToks.get(5).getStartCharOffset());
    int ignoreUpToOffset = leadingWaste.length();
    IntPair[] characterOffsets = new IntPair[10];
    String[] tokens = taNormal.getTokens();
    for (int i = 0; i < normalToks.size(); ++i) {
        Constituent t = normalToks.get(i);
        characterOffsets[i] = new IntPair(ignoreUpToOffset + t.getStartCharOffset(),
                ignoreUpToOffset + t.getEndCharOffset());
    }
    List<Constituent> sentences = taNormal.getView(ViewNames.SENTENCE).getConstituents();
    int[] sentenceEndPositions = new int[sentences.size()];
    for (int i = 0; i < sentences.size(); ++i) {
        Constituent s = sentences.get(i);
        sentenceEndPositions[i] = s.getEndSpan();
    }
    // all info should be the same, except the tokens' character offsets are shifted past the ignored spans of text
    TextAnnotation taOther =
            new TextAnnotation("test", "other", other, characterOffsets, tokens, sentenceEndPositions);
    List<Constituent> otherToks = taOther.getView(ViewNames.TOKENS).getConstituents();
    int thirdTokNormalStart = normalToks.get(2).getStartCharOffset();
    int thirdTokOtherStart = otherToks.get(2).getStartCharOffset();
    assertEquals(thirdTokOtherStart, (thirdTokNormalStart + leadingWaste.length()));
    int eighthTokNormalStart = normalToks.get(8).getStartCharOffset();
    int eighthTokOtherStart = otherToks.get(8).getStartCharOffset();
    assertEquals(eighthTokOtherStart, (eighthTokNormalStart + leadingWaste.length()));
    int meaninglessStartOffset = taOther.getTokenIdFromCharacterOffset(2);
    assertEquals(-1, meaninglessStartOffset);
    int meaninglessPastEndOffset =
            taOther.getTokenIdFromCharacterOffset(leadingWaste.length() + normal.length() + 5);
    assertEquals(-1, meaninglessPastEndOffset);
    int meaninglessInBetweenToksOffset = taNormal.getTokenIdFromCharacterOffset(20);
    assertEquals(-1, meaninglessInBetweenToksOffset);
}
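The bookkeeping this test exercises is a constant shift: each token span in the clean ("normal") text maps to a span in the raw ("other") text by adding the length of the leading markup. A small sketch of that shift, assuming nothing beyond IntPair itself (the helper and class names are hypothetical):

import edu.illinois.cs.cogcomp.core.datastructures.IntPair;

public class OffsetShiftSketch {
    // hypothetical helper, not part of cogcomp-nlp: shift a clean-text span by the
    // number of characters that precede the clean text in the raw source
    static IntPair shift(IntPair cleanSpan, int leadingWasteLength) {
        return new IntPair(cleanSpan.getFirst() + leadingWasteLength,
                cleanSpan.getSecond() + leadingWasteLength);
    }

    public static void main(String[] args) {
        // "sample" spans (13, 19) in the normal text; with the 32-character
        // <ignoreme>...</ignoreme> prefix it spans (45, 51) in the raw text,
        // mirroring the thirdTokOtherStart assertion above
        IntPair shifted = shift(new IntPair(13, 19), 32);
        System.out.println(shifted.getFirst() + ", " + shifted.getSecond());  // 45, 51
    }
}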
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
Class EREDocumentReader, method createAndAddXmlMarkupAnnotations.
/**
 * Create a view with constituents representing post boundaries and quotations.
 * For each constituent, the label is the span type; attribute AUTHOR specifies the post or quote author name,
 * and attributes NAME_START and NAME_END specify the name offsets in the original xml text.
 *
 * @param xmlTa an XmlTextAnnotation containing the information to use for a POST_ERE view.
 */
private void createAndAddXmlMarkupAnnotations(XmlTextAnnotation xmlTa) {
    List<XmlDocumentProcessor.SpanInfo> markup = xmlTa.getXmlMarkup();
    TextAnnotation ta = xmlTa.getTextAnnotation();
    View postView = new View(getPostViewName(), NAME, ta, 1.0);
    for (XmlDocumentProcessor.SpanInfo spanInfo : markup) {
        String label = spanInfo.label;
        Pair<String, IntPair> authorInfo = null;
        boolean isPost = false;
        if (POST.equals(label)) {
            isPost = true;
            authorInfo = spanInfo.attributes.get(AUTHOR);
        } else if (QUOTE.equals(label)) {
            isPost = true;
            authorInfo = spanInfo.attributes.get(ORIG_AUTHOR);
        }
        if (isPost) {
            IntPair cleanTextOffsets = new IntPair(
                    xmlTa.getXmlSt().computeModifiedOffsetFromOriginal(spanInfo.spanOffsets.getFirst()),
                    xmlTa.getXmlSt().computeModifiedOffsetFromOriginal(spanInfo.spanOffsets.getSecond()));
            if (-1 == cleanTextOffsets.getFirst() || -1 == cleanTextOffsets.getSecond())
                throw new IllegalStateException("could not compute cleanText offsets for " + label
                        + " span with offsets " + spanInfo.spanOffsets.getFirst() + ", "
                        + spanInfo.spanOffsets.getSecond());
            int tokStart = ta.getTokenIdFromCharacterOffset(cleanTextOffsets.getFirst());
            int tokEnd = ta.getTokenIdFromCharacterOffset(cleanTextOffsets.getSecond());
            assert (tokStart >= 0 && tokEnd >= 0 && tokEnd > tokStart);
            Constituent c = new Constituent(label, getPostViewName(), ta, tokStart, tokEnd);
            if (null != authorInfo) {
                c.addAttribute(AUTHOR, authorInfo.getFirst());
                c.addAttribute(NAME_START, Integer.toString(authorInfo.getSecond().getFirst()));
                c.addAttribute(NAME_END, Integer.toString(authorInfo.getSecond().getSecond()));
                postView.addConstituent(c);
            }
        }
    }
    if (!postView.getConstituents().isEmpty())
        ta.addView(getPostViewName(), postView);
}
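Once the view has been added, downstream code can read the post/quote structure back out of the TextAnnotation. A hedged sketch, not part of EREDocumentReader; it assumes the view was registered under the name returned by getPostViewName() and that the same attribute keys (AUTHOR, NAME_START, NAME_END) are passed in:

// hedged sketch: read back the view built above
static void printPostStructure(TextAnnotation ta, String postViewName,
                               String authorKey, String nameStartKey, String nameEndKey) {
    if (!ta.hasView(postViewName))
        return;  // no POST or QUOTE span carried author information, so no view was added
    for (Constituent c : ta.getView(postViewName).getConstituents()) {
        System.out.println(c.getLabel()                       // POST or QUOTE
                + " author=" + c.getAttribute(authorKey)
                + " nameSpan=[" + c.getAttribute(nameStartKey)
                + ", " + c.getAttribute(nameEndKey) + ")");
    }
}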
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
Class OracleTokenizer, method tokenize.
/**
 * Produces a Tokenization with character offsets based on:
 * 1. a raw text;
 * 2. a given set of tokens, which must be a sequence of substrings of the raw text, in order;
 * 3. a given set of sentence spans, which will be normalized to cover all tokens.
 */
public Tokenizer.Tokenization tokenize(String rawText, List<String> tokens, List<IntPair> sentences) {
    List<IntPair> charOffsets = new ArrayList<>();
    // scan left to right: locate each token in the raw text, starting from the end of the previous match
    for (int i = 0, rawTextOffset = 0; i < tokens.size(); ++i) {
        String token = tokens.get(i);
        int tokenStartOffset = rawText.indexOf(token, rawTextOffset);
        if (tokenStartOffset == -1) {
            throw new IllegalTokenException(tokens, i, rawText, rawTextOffset, exceptionDisplayLength);
        }
        int tokenEndOffset = tokenStartOffset + token.length();
        charOffsets.add(new IntPair(tokenStartOffset, tokenEndOffset));
        rawTextOffset = tokenEndOffset;
    }
    int[] sentenceEndTokenIndexes = normalizeSentences(sentences, tokens.size());
    return new Tokenizer.Tokenization(tokens.toArray(new String[0]), charOffsets.toArray(new IntPair[0]),
            sentenceEndTokenIndexes);
}
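A usage sketch for tokenize, under the assumption (not shown in this excerpt) that OracleTokenizer has a no-argument constructor; the token and sentence lists are hypothetical sample data, and java.util imports are omitted:

// hypothetical sample input: sentence spans are token-index spans, end-exclusive
String raw = "Hello world. Bye.";
List<String> toks = Arrays.asList("Hello", "world", ".", "Bye", ".");
List<IntPair> sents = Arrays.asList(new IntPair(0, 3), new IntPair(3, 5));
// assumes a no-argument constructor for OracleTokenizer
Tokenizer.Tokenization result = new OracleTokenizer().tokenize(raw, toks, sents);
// character offsets are recovered by scanning left to right, e.g. "world" -> (6, 11)
IntPair world = result.getCharacterOffsets()[1];
System.out.println(world.getFirst() + ", " + world.getSecond());  // 6, 11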
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
Class OracleTokenizer, method normalizeSentences.
/**
 * Helper for normalizing a sentence span annotation by creating a sentence for each uncovered token span.
 * Input sentence spans must be in order and non-overlapping.
 */
public static int[] normalizeSentences(List<IntPair> sentences, int numberOfTokens) {
    List<Integer> sentenceEndTokenIndexes = new ArrayList<>();
    int lastTokenId = 0;
    for (IntPair sentence : sentences) {
        // tokens between the previous sentence and this one form their own sentence
        if (sentence.getFirst() > lastTokenId) {
            sentenceEndTokenIndexes.add(sentence.getFirst());
        }
        lastTokenId = sentence.getSecond();
        sentenceEndTokenIndexes.add(sentence.getSecond());
    }
    // trailing tokens not covered by any sentence form a final sentence
    if (lastTokenId < numberOfTokens) {
        sentenceEndTokenIndexes.add(numberOfTokens);
    }
    return sentenceEndTokenIndexes.stream().mapToInt(i -> i).toArray();
}
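A quick worked example of the normalization (a fragment; java.util imports are assumed): any run of tokens left uncovered by the input spans becomes its own sentence, and a final sentence is added for trailing uncovered tokens.

// 10 tokens, annotated sentences covering token spans [2, 5) and [6, 8)
List<IntPair> annotated = Arrays.asList(new IntPair(2, 5), new IntPair(6, 8));
int[] ends = OracleTokenizer.normalizeSentences(annotated, 10);
// uncovered runs [0, 2), [5, 6) and [8, 10) each become a sentence,
// so the sentence-end token indexes are 2, 5, 6, 8, 10
System.out.println(Arrays.toString(ends));  // [2, 5, 6, 8, 10]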