Search in sources :

Example 6 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class StatefullTokenizerTest method testWhitespaceBehavior.

/**
 * Test Splitter behavior on text with leading/trailing whitespace. Example is use case where
 * xml markup has been replaced with whitespace of equal span.
 *
 * <p>The test blanks out every xml tag in the input file with the same number of spaces, records
 * the character spans of every word containing "Sun" in the resulting text, and then checks that
 * tokens with form "Sun" produced by the first SentenceSplitter sentence and by StatefulTokenizer
 * carry character offsets that are among the recorded spans.
 */
@Test
public void testWhitespaceBehavior() {
    String origText = null;
    try {
        origText = LineIO.slurp(INFILE);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    // Replace each xml tag with an equal-length run of spaces so offsets into the clean text
    // coincide with offsets into the original text.
    Pattern xmlTagPattern = Pattern.compile("(<[^>\\r\\n]+>)");
    Matcher xmlMatcher = xmlTagPattern.matcher(origText);
    StringBuilder cleanTextBldr = new StringBuilder();
    int lastAppendedCharOffset = 0;
    while (xmlMatcher.find()) {
        int start = xmlMatcher.start();
        int end = xmlMatcher.end();
        cleanTextBldr.append(origText.substring(lastAppendedCharOffset, start));
        for (int i = start; i < end; ++i) {
            cleanTextBldr.append(" ");
        }
        lastAppendedCharOffset = end;
    }
    cleanTextBldr.append(origText.substring(lastAppendedCharOffset));
    String cleanText = cleanTextBldr.toString();
    // Reference spans: [start, end) offsets of every word containing "Sun" in the clean text.
    Pattern sun = Pattern.compile("\\w*Sun\\w*");
    Matcher sunMatcher = sun.matcher(cleanText);
    Set<IntPair> sunSpans = new HashSet<>();
    while (sunMatcher.find()) {
        sunSpans.add(new IntPair(sunMatcher.start(), sunMatcher.end()));
    }
    // check token offsets in tokens returned by SentenceSplitter (first sentence only)
    SentenceSplitter splitter = new SentenceSplitter(new String[] { cleanText });
    Sentence[] sents = splitter.splitAll();
    Sentence s = sents[0];
    LinkedVector words = s.wordSplit();
    for (int i = 0; i < words.size(); ++i) {
        // Index by i: the previous code always read element 0, so no word after the first
        // was ever checked.
        Word word = (Word) words.get(i);
        if ("Sun".equals(word.form)) {
            // NOTE(review): this compares against regex spans whose end is exclusive; confirm
            // that Word.end is exclusive too (if it is inclusive this needs word.end + 1).
            IntPair tokenCharOffsets = new IntPair(word.start, word.end);
            assertTrue("SentenceSplitter word 'Sun' at index " + i + " has unexpected offsets",
                    sunSpans.contains(tokenCharOffsets));
        }
    }
    // check token offsets in tokens returned by StatefulTokenizer
    StatefulTokenizer statefulTokenizer = new StatefulTokenizer();
    Tokenizer.Tokenization tokenInfo = statefulTokenizer.tokenizeTextSpan(cleanText);
    assertEquals(tokenInfo.getCharacterOffsets().length, tokenInfo.getTokens().length);
    for (int i = 0; i < tokenInfo.getTokens().length; ++i) {
        String tok = tokenInfo.getTokens()[i];
        if (tok.equals("Sun")) {
            IntPair tokCharOffsets = tokenInfo.getCharacterOffsets()[i];
            if (!sunSpans.contains(tokCharOffsets)) {
                String origTextSubstring = cleanText.substring(tokCharOffsets.getFirst(), tokCharOffsets.getSecond());
                System.err.println("ERROR: tokenizer has form '" + tok + "', but offsets refer to substring '" + origTextSubstring + "'.");
            }
            assertTrue(sunSpans.contains(tokCharOffsets));
        }
    }
    // A TextAnnotation must be constructible from the tokenizer output.
    TextAnnotation statefulTa = new TextAnnotation("test", "test", cleanText, tokenInfo.getCharacterOffsets(), tokenInfo.getTokens(), tokenInfo.getSentenceEndTokenIndexes());
    assertNotNull(statefulTa);
}
Also used : Pattern(java.util.regex.Pattern) Word(edu.illinois.cs.cogcomp.lbjava.nlp.Word) SentenceSplitter(edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) Matcher(java.util.regex.Matcher) FileNotFoundException(java.io.FileNotFoundException) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Sentence(edu.illinois.cs.cogcomp.lbjava.nlp.Sentence) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 7 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class StatefullTokenizerTest method testCharacterOffsetToTokenIndex.

/**
 * test whether the mapping between character offset and token index is correct.
 *
 * <p>A TextAnnotation is built for a plain text, and a second one is constructed by hand for the
 * same text wrapped in leading and trailing markup, with every token offset shifted by the length
 * of the leading markup. The test checks that token start offsets differ by exactly that shift
 * and that character offsets outside any token map to token id -1.
 */
@Test
public void testCharacterOffsetToTokenIndex() {
    String normal = "The ordinary sample.\n\nDon't mess things up.";
    String leadingWaste = "<ignoreme>wastedspace</ignoreme>";
    String postWaste = "   \n<ignoremetoo>aaaargh</ignoremetoo>";
    String other = leadingWaste + normal + postWaste;
    TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
    List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();
    assertEquals(13, normalToks.get(2).getStartCharOffset());
    assertEquals(24, normalToks.get(5).getStartCharOffset());
    int ignoreUpToOffset = leadingWaste.length();
    // Size the offset array from the actual token count rather than a hard-coded 10, so a
    // different tokenization cannot overflow the array or leave null entries behind.
    IntPair[] characterOffsets = new IntPair[normalToks.size()];
    String[] tokens = taNormal.getTokens();
    for (int i = 0; i < normalToks.size(); ++i) {
        Constituent t = normalToks.get(i);
        characterOffsets[i] = new IntPair(ignoreUpToOffset + t.getStartCharOffset(), ignoreUpToOffset + t.getEndCharOffset());
    }
    List<Constituent> sentences = taNormal.getView(ViewNames.SENTENCE).getConstituents();
    int[] sentenceEndPositions = new int[sentences.size()];
    for (int i = 0; i < sentences.size(); ++i) {
        Constituent s = sentences.get(i);
        sentenceEndPositions[i] = s.getEndSpan();
    }
    // all info should be same except initial char offsets of tokens ignore spans of text
    TextAnnotation taOther = new TextAnnotation("test", "other", other, characterOffsets, tokens, sentenceEndPositions);
    List<Constituent> otherToks = taOther.getView(ViewNames.TOKENS).getConstituents();
    // assertEquals takes (expected, actual): the expected value is the shifted normal offset.
    int thirdTokNormalStart = normalToks.get(2).getStartCharOffset();
    int thirdTokOtherStart = otherToks.get(2).getStartCharOffset();
    assertEquals(thirdTokNormalStart + leadingWaste.length(), thirdTokOtherStart);
    int eighthTokNormalStart = normalToks.get(8).getStartCharOffset();
    int eighthTokOtherStart = otherToks.get(8).getStartCharOffset();
    assertEquals(eighthTokNormalStart + leadingWaste.length(), eighthTokOtherStart);
    // Offsets inside the leading markup, past the last token, and between tokens must all
    // yield -1.
    int meaninglessStartOffset = taOther.getTokenIdFromCharacterOffset(2);
    assertEquals(-1, meaninglessStartOffset);
    int meaninglessPastEndOffset = taOther.getTokenIdFromCharacterOffset(leadingWaste.length() + normal.length() + 5);
    assertEquals(-1, meaninglessPastEndOffset);
    int meaninglessInBetweenToksOffset = taNormal.getTokenIdFromCharacterOffset(20);
    assertEquals(-1, meaninglessInBetweenToksOffset);
}
Also used : TextAnnotationBuilder(edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) Test(org.junit.Test)

Example 8 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class EREDocumentReader method createAndAddXmlMarkupAnnotations.

/**
 * Builds the post view from the xml markup of a document and attaches it to the document's
 * TextAnnotation.
 * Only markup spans labelled POST or QUOTE are considered. For each of them a constituent is
 * created whose label is the span type; attribute AUTHOR holds the post or quote author name,
 *    and attributes NAME_START and NAME_END hold the name offsets taken from the markup.
 * A constituent is added to the view only when the span carries author information, and the
 * view is added to the TextAnnotation only when it ends up non-empty.
 *
 * @param xmlTa an XmlTextAnnotation containing information to use for an POST_ERE view.
 * @throws IllegalStateException if a span's offsets cannot be mapped into the clean text.
 */
private void createAndAddXmlMarkupAnnotations(XmlTextAnnotation xmlTa) {
    List<XmlDocumentProcessor.SpanInfo> spans = xmlTa.getXmlMarkup();
    TextAnnotation ta = xmlTa.getTextAnnotation();
    View postView = new View(getPostViewName(), NAME, ta, 1.0);

    for (XmlDocumentProcessor.SpanInfo span : spans) {
        String spanType = span.label;

        // Posts and quotes keep their author under different attribute keys;
        // every other span type is ignored.
        Pair<String, IntPair> author;
        if (POST.equals(spanType)) {
            author = span.attributes.get(AUTHOR);
        } else if (QUOTE.equals(spanType)) {
            author = span.attributes.get(ORIG_AUTHOR);
        } else {
            continue;
        }

        // Map the span boundaries from the original xml text into the clean text.
        int cleanStart = xmlTa.getXmlSt().computeModifiedOffsetFromOriginal(span.spanOffsets.getFirst());
        int cleanEnd = xmlTa.getXmlSt().computeModifiedOffsetFromOriginal(span.spanOffsets.getSecond());
        if (cleanStart == -1 || cleanEnd == -1) {
            throw new IllegalStateException("could not compute cleanText offsets for " + spanType + " span with offsets " + span.spanOffsets.getFirst() + ", " + span.spanOffsets.getSecond());
        }

        int firstToken = ta.getTokenIdFromCharacterOffset(cleanStart);
        int lastToken = ta.getTokenIdFromCharacterOffset(cleanEnd);
        assert firstToken >= 0 && lastToken >= 0 && lastToken > firstToken;

        // The constituent is built before the author check, exactly as before, so that any
        // validation done by its constructor still runs for author-less spans.
        Constituent postConstituent = new Constituent(spanType, getPostViewName(), ta, firstToken, lastToken);
        if (author == null) {
            continue;
        }
        IntPair nameOffsets = author.getSecond();
        postConstituent.addAttribute(AUTHOR, author.getFirst());
        postConstituent.addAttribute(NAME_START, Integer.toString(nameOffsets.getFirst()));
        postConstituent.addAttribute(NAME_END, Integer.toString(nameOffsets.getSecond()));
        postView.addConstituent(postConstituent);
    }

    if (postView.getConstituents().isEmpty()) {
        return;
    }
    ta.addView(getPostViewName(), postView);
}
Also used : XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 9 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class OracleTokenizer method tokenize.

/**
 * Builds a Tokenization, including character offsets, from
 * 1. a raw text
 * 2. a given set of tokens, which must be a sequence of substrings of the raw text in order
 * 3. a given set of sentence spans, which will be normalized to cover all tokens
 *
 * Each token is located by searching the raw text forward from the end of the previous token.
 *
 * @param rawText the text the tokens were taken from
 * @param tokens the token strings, in the order they occur in the raw text
 * @param sentences the sentence spans handed to normalizeSentences
 * @return the tokens, their character offsets and the sentence end token indexes
 * @throws IllegalTokenException if a token is not found at or after the current search position
 */
public Tokenizer.Tokenization tokenize(String rawText, List<String> tokens, List<IntPair> sentences) {
    IntPair[] spans = new IntPair[tokens.size()];
    int searchFrom = 0;
    int tokenIndex = 0;
    for (String token : tokens) {
        int begin = rawText.indexOf(token, searchFrom);
        if (begin < 0) {
            throw new IllegalTokenException(tokens, tokenIndex, rawText, searchFrom, exceptionDisplayLength);
        }
        // The next search resumes right after this token.
        searchFrom = begin + token.length();
        spans[tokenIndex] = new IntPair(begin, searchFrom);
        tokenIndex++;
    }

    int[] sentenceEnds = normalizeSentences(sentences, tokens.size());
    String[] tokenArray = tokens.toArray(new String[0]);
    return new Tokenizer.Tokenization(tokenArray, spans, sentenceEnds);
}
Also used : ArrayList(java.util.ArrayList) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 10 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class OracleTokenizer method normalizeSentences.

/**
 * Normalizes a sentence span annotation into a list of sentence end token indexes, inserting an
 * extra sentence boundary wherever tokens are left uncovered: before a span that starts after
 * the previous span's end, and after the last span if it ends before the final token.
 * Input sentence spans must be in order and non-overlapping
 *
 * @param sentences the sentence spans as (first, second) token index pairs
 * @param numberOfTokens the total number of tokens
 * @return the sentence end token indexes, in order
 */
public static int[] normalizeSentences(List<IntPair> sentences, int numberOfTokens) {
    List<Integer> ends = new ArrayList<>();
    int coveredUpTo = 0;
    for (IntPair span : sentences) {
        int spanStart = span.getFirst();
        int spanEnd = span.getSecond();
        // A gap before this span becomes its own sentence, ending where the span begins.
        if (coveredUpTo < spanStart) {
            ends.add(spanStart);
        }
        ends.add(spanEnd);
        coveredUpTo = spanEnd;
    }
    // Tokens left over after the last span form a final sentence.
    if (coveredUpTo < numberOfTokens) {
        ends.add(numberOfTokens);
    }

    int[] result = new int[ends.size()];
    for (int k = 0; k < result.length; k++) {
        result[k] = ends.get(k);
    }
    return result;
}
Also used : StringUtils(org.apache.commons.lang.StringUtils) List(java.util.List) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Tokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer) ArrayList(java.util.ArrayList)

Aggregations

IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)129 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)39 ArrayList (java.util.ArrayList)27 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)26 Test (org.junit.Test)21 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)18 StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation)14 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)8 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)7 Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree)6 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)5 Matcher (java.util.regex.Matcher)5 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)4 XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)4 Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)4 FileNotFoundException (java.io.FileNotFoundException)4 IOException (java.io.IOException)4 JsonObject (com.google.gson.JsonObject)3 TextAnnotationBuilder (edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder)3 PredicateArgumentView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.PredicateArgumentView)3