Search in sources :

Example 46 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class IllinoisTokenizerTest method testIllinoisTokenizerMultiline.

/**
 * Checks {@link IllinoisTokenizer} on text that spans two lines: the sentence-end token
 * indexes, two token forms, and the character offsets of two tokens.
 */
@Test
public void testIllinoisTokenizerMultiline() {
    Tokenizer tkr = new IllinoisTokenizer();
    String text = "Mr. Dawkins -- a liberal professor -- doesn't like fundamentalists.   " + System.lineSeparator() + "He is intolerant of intolerance!";
    Tokenizer.Tokenization tknzn = tkr.tokenizeTextSpan(text);

    // Two sentences are expected, ending at token indexes 12 and 18.
    int[] sentEndOffsets = tknzn.getSentenceEndTokenIndexes();
    assertEquals(2, sentEndOffsets.length);
    assertEquals(12, sentEndOffsets[0]);
    assertEquals(18, sentEndOffsets[1]);

    String[] tokens = tknzn.getTokens();
    assertEquals("--", tokens[6]);
    assertEquals("of", tokens[15]);

    IntPair[] tokenOffsets = tknzn.getCharacterOffsets();

    // "n't" sits on the first line, so its span does not depend on the line separator.
    int notIndex = 8;
    IntPair notOffsets = new IntPair(42, 45);
    assertEquals(notOffsets, tokenOffsets[notIndex]);

    // "intolerant" sits after the platform line separator, whose length differs between
    // platforms ("\n" vs "\r\n"). Derive the expected span from the text itself rather than
    // hard-coding (77, 87), which only holds for a one-character separator.
    int intolerantIndex = 14;
    String intolerant = "intolerant";
    int intolerantStart = text.indexOf(intolerant);
    IntPair intolerantOffsets = new IntPair(intolerantStart, intolerantStart + intolerant.length());
    assertEquals(intolerantOffsets, tokenOffsets[intolerantIndex]);
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Test(org.junit.Test)

Example 47 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class IllinoisTokenizerTest method testIllinoisTokenizer.

/**
 * Runs {@link IllinoisTokenizer} through {@code doTokenizerTest} on two short inputs, each
 * paired with its expected token forms and character offsets.
 */
@Test
public void testIllinoisTokenizer() {
    Tokenizer tokenizer = new IllinoisTokenizer();

    // Case 1: a run of three spaces precedes "test", so that token is expected at 12-16.
    String spacedSentence = "This is a   test.";
    String[] spacedTokens = { "This", "is", "a", "test", "." };
    IntPair[] spacedOffsets = {
        new IntPair(0, 4),
        new IntPair(5, 7),
        new IntPair(8, 9),
        new IntPair(12, 16),
        new IntPair(16, 17)
    };
    doTokenizerTest(tokenizer, spacedSentence, spacedTokens, spacedOffsets);

    // Case 2: two sentences with punctuation attached to the preceding word.
    String twoSentences = "Hello, world! I am at UIUC.";
    String[] twoSentenceTokens = { "Hello", ",", "world", "!", "I", "am", "at", "UIUC", "." };
    IntPair[] twoSentenceOffsets = {
        new IntPair(0, 5),
        new IntPair(5, 6),
        new IntPair(7, 12),
        new IntPair(12, 13),
        new IntPair(14, 15),
        new IntPair(16, 18),
        new IntPair(19, 21),
        new IntPair(22, 26),
        new IntPair(26, 27)
    };
    doTokenizerTest(tokenizer, twoSentences, twoSentenceTokens, twoSentenceOffsets);
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Test(org.junit.Test)

Example 48 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class StatefullTokenizerTest method testCharacterOffsetToTokenIndex.

/**
 * Test whether the mapping between character offset and token index is correct.
 *
 * <p>Tokenizes a plain text, then builds a second {@link TextAnnotation} over the same text
 * wrapped in leading and trailing markup, reusing the tokens and sentence ends but shifting
 * every token's character offsets by the length of the leading markup. The test then checks
 * the shifted offsets and that character positions outside any token map to -1.
 */
@Test
public void testCharacterOffsetToTokenIndex() {
    String normal = "The ordinary sample.\n\nDon't mess things up.";
    String leadingWaste = "<ignoreme>wastedspace</ignoreme>";
    String postWaste = "   \n<ignoremetoo>aaaargh</ignoremetoo>";
    String other = leadingWaste + normal + postWaste;
    TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
    List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();
    assertEquals(13, normalToks.get(2).getStartCharOffset());
    assertEquals(24, normalToks.get(5).getStartCharOffset());

    // Shift every token span by the length of the leading markup. The array is sized from the
    // token list (not a hard-coded count) so it always matches the loop that fills it.
    int ignoreUpToOffset = leadingWaste.length();
    IntPair[] characterOffsets = new IntPair[normalToks.size()];
    String[] tokens = taNormal.getTokens();
    for (int i = 0; i < normalToks.size(); ++i) {
        Constituent t = normalToks.get(i);
        characterOffsets[i] = new IntPair(ignoreUpToOffset + t.getStartCharOffset(), ignoreUpToOffset + t.getEndCharOffset());
    }

    List<Constituent> sentences = taNormal.getView(ViewNames.SENTENCE).getConstituents();
    int[] sentenceEndPositions = new int[sentences.size()];
    for (int i = 0; i < sentences.size(); ++i) {
        Constituent s = sentences.get(i);
        sentenceEndPositions[i] = s.getEndSpan();
    }

    // all info should be same except initial char offsets of tokens ignore spans of text
    TextAnnotation taOther = new TextAnnotation("test", "other", other, characterOffsets, tokens, sentenceEndPositions);
    List<Constituent> otherToks = taOther.getView(ViewNames.TOKENS).getConstituents();

    // assertEquals takes (expected, actual): the expected value is the shifted original offset.
    int thirdTokNormalStart = normalToks.get(2).getStartCharOffset();
    int thirdTokOtherStart = otherToks.get(2).getStartCharOffset();
    assertEquals(thirdTokNormalStart + ignoreUpToOffset, thirdTokOtherStart);
    int eighthTokNormalStart = normalToks.get(8).getStartCharOffset();
    int eighthTokOtherStart = otherToks.get(8).getStartCharOffset();
    assertEquals(eighthTokNormalStart + ignoreUpToOffset, eighthTokOtherStart);

    // Offset 2 lies inside the leading markup, before the first token.
    int meaninglessStartOffset = taOther.getTokenIdFromCharacterOffset(2);
    assertEquals(-1, meaninglessStartOffset);
    // This offset lies inside the trailing markup, after the last token.
    int meaninglessPastEndOffset = taOther.getTokenIdFromCharacterOffset(ignoreUpToOffset + normal.length() + 5);
    assertEquals(-1, meaninglessPastEndOffset);
    // Offset 20 in the plain text is the first "\n" between the two sentences.
    int meaninglessInBetweenToksOffset = taNormal.getTokenIdFromCharacterOffset(20);
    assertEquals(-1, meaninglessInBetweenToksOffset);
}
Also used : TextAnnotationBuilder(edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) Test(org.junit.Test)

Example 49 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class StatefullTokenizerTest method testWhitespaceBehavior.

/**
 * Test Splitter behavior on text with leading/trailing whitespace. Example is use case where
 * xml markup has been replaced with whitespace of equal span.
 *
 * <p>Steps: read the input file, blank out every XML tag with the same number of spaces,
 * record the character spans of every word containing "Sun" via regex, then check that "Sun"
 * tokens reported by {@link SentenceSplitter} (first sentence only) and by
 * {@link StatefulTokenizer} carry spans found in that set.
 */
@Test
public void testWhitespaceBehavior() {
    String origText = null;
    try {
        origText = LineIO.slurp(INFILE);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }

    // Replace each XML tag with an equal-length run of spaces so character offsets are kept.
    Pattern xmlTagPattern = Pattern.compile("(<[^>\\r\\n]+>)");
    Matcher xmlMatcher = xmlTagPattern.matcher(origText);
    StringBuilder cleanTextBldr = new StringBuilder();
    int lastAppendedCharOffset = 0;
    while (xmlMatcher.find()) {
        int start = xmlMatcher.start();
        int end = xmlMatcher.end();
        cleanTextBldr.append(origText.substring(lastAppendedCharOffset, start));
        for (int i = start; i < end; ++i) {
            cleanTextBldr.append(" ");
        }
        lastAppendedCharOffset = end;
    }
    cleanTextBldr.append(origText.substring(lastAppendedCharOffset));
    String cleanText = cleanTextBldr.toString();

    // Collect the [start, end) spans of every word containing "Sun" in the cleaned text.
    Pattern sun = Pattern.compile("\\w*Sun\\w*");
    Matcher sunMatcher = sun.matcher(cleanText);
    Set<IntPair> sunSpans = new HashSet<>();
    while (sunMatcher.find()) {
        sunSpans.add(new IntPair(sunMatcher.start(), sunMatcher.end()));
    }

    // check token offsets in tokens returned by SentenceSplitter
    SentenceSplitter splitter = new SentenceSplitter(new String[] { cleanText });
    Sentence[] sents = splitter.splitAll();
    Sentence s = sents[0];
    LinkedVector words = s.wordSplit();
    for (int i = 0; i < words.size(); ++i) {
        // Inspect the i-th word; previously index 0 was read on every iteration, so only the
        // first word of the sentence was ever checked.
        // NOTE(review): assumes Word.end follows the same exclusive-end convention as
        // Matcher.end() -- confirm against the Word class before relying on this assertion.
        Word word = (Word) words.get(i);
        if ("Sun".equals(word.form)) {
            IntPair tokenCharOffsets = new IntPair(word.start, word.end);
            assertTrue(sunSpans.contains(tokenCharOffsets));
        }
    }

    // Same check against the StatefulTokenizer output over the whole cleaned text.
    StatefulTokenizer statefulTokenizer = new StatefulTokenizer();
    Tokenizer.Tokenization tokenInfo = statefulTokenizer.tokenizeTextSpan(cleanText);
    assertEquals(tokenInfo.getCharacterOffsets().length, tokenInfo.getTokens().length);
    for (int i = 0; i < tokenInfo.getTokens().length; ++i) {
        String tok = tokenInfo.getTokens()[i];
        if (tok.equals("Sun")) {
            IntPair tokCharOffsets = tokenInfo.getCharacterOffsets()[i];
            if (!sunSpans.contains(tokCharOffsets)) {
                // Print what the reported offsets actually cover before the assertion fails.
                String origTextSubstring = cleanText.substring(tokCharOffsets.getFirst(), tokCharOffsets.getSecond());
                System.err.println("ERROR: tokenizer has form '" + tok + "', but offsets refer to substring '" + origTextSubstring + "'.");
            }
            assertTrue(sunSpans.contains(tokCharOffsets));
        }
    }

    TextAnnotation statefulTa = new TextAnnotation("test", "test", cleanText, tokenInfo.getCharacterOffsets(), tokenInfo.getTokens(), tokenInfo.getSentenceEndTokenIndexes());
    assertNotNull(statefulTa);
}
Also used : Pattern(java.util.regex.Pattern) Word(edu.illinois.cs.cogcomp.lbjava.nlp.Word) SentenceSplitter(edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) Matcher(java.util.regex.Matcher) FileNotFoundException(java.io.FileNotFoundException) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Sentence(edu.illinois.cs.cogcomp.lbjava.nlp.Sentence) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 50 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class TextAnnotationTest method testCharacterOffsetToTokenIndex.

/**
 * Test whether the mapping between character offset and token index is correct.
 *
 * <p>Tokenizes a plain text, then builds a second {@link TextAnnotation} over the same text
 * wrapped in leading and trailing markup, reusing the tokens and sentence ends but shifting
 * every token's character offsets by the length of the leading markup. The test then checks
 * the shifted offsets and that character positions outside any token map to -1.
 */
@Test
public void testCharacterOffsetToTokenIndex() {
    String normal = "The ordinary sample.\n\nDon't mess things up.";
    String leadingWaste = "<ignoreme>wastedspace</ignoreme>";
    String postWaste = "   \n<ignoremetoo>aaaargh</ignoremetoo>";
    String other = leadingWaste + normal + postWaste;
    TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
    List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();
    assertEquals(13, normalToks.get(2).getStartCharOffset());
    assertEquals(24, normalToks.get(5).getStartCharOffset());

    // Shift every token span by the length of the leading markup. The array is sized from the
    // token list (not a hard-coded count) so it always matches the loop that fills it.
    int ignoreUpToOffset = leadingWaste.length();
    IntPair[] characterOffsets = new IntPair[normalToks.size()];
    String[] tokens = taNormal.getTokens();
    for (int i = 0; i < normalToks.size(); ++i) {
        Constituent t = normalToks.get(i);
        characterOffsets[i] = new IntPair(ignoreUpToOffset + t.getStartCharOffset(), ignoreUpToOffset + t.getEndCharOffset());
    }

    List<Constituent> sentences = taNormal.getView(ViewNames.SENTENCE).getConstituents();
    int[] sentenceEndPositions = new int[sentences.size()];
    for (int i = 0; i < sentences.size(); ++i) {
        Constituent s = sentences.get(i);
        sentenceEndPositions[i] = s.getEndSpan();
    }

    // all info should be same except initial char offsets of tokens ignore spans of text
    TextAnnotation taOther = new TextAnnotation("test", "other", other, characterOffsets, tokens, sentenceEndPositions);
    List<Constituent> otherToks = taOther.getView(ViewNames.TOKENS).getConstituents();

    // assertEquals takes (expected, actual): the expected value is the shifted original offset.
    int thirdTokNormalStart = normalToks.get(2).getStartCharOffset();
    int thirdTokOtherStart = otherToks.get(2).getStartCharOffset();
    assertEquals(thirdTokNormalStart + ignoreUpToOffset, thirdTokOtherStart);
    int eighthTokNormalStart = normalToks.get(8).getStartCharOffset();
    int eighthTokOtherStart = otherToks.get(8).getStartCharOffset();
    assertEquals(eighthTokNormalStart + ignoreUpToOffset, eighthTokOtherStart);

    // Offset 2 lies inside the leading markup, before the first token.
    int meaninglessStartOffset = taOther.getTokenIdFromCharacterOffset(2);
    assertEquals(-1, meaninglessStartOffset);
    // This offset lies inside the trailing markup, after the last token.
    int meaninglessPastEndOffset = taOther.getTokenIdFromCharacterOffset(ignoreUpToOffset + normal.length() + 5);
    assertEquals(-1, meaninglessPastEndOffset);
    // Offset 20 in the plain text is the first "\n" between the two sentences.
    int meaninglessInBetweenToksOffset = taNormal.getTokenIdFromCharacterOffset(20);
    assertEquals(-1, meaninglessInBetweenToksOffset);
}
Also used : TextAnnotationBuilder(edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) Test(org.junit.Test)

Aggregations

IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)103 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)32 Test (org.junit.Test)20 ArrayList (java.util.ArrayList)19 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)18 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)14 StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation)13 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)6 Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree)5 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)4 Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)4 FileNotFoundException (java.io.FileNotFoundException)4 Matcher (java.util.regex.Matcher)4 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)3 SentenceSplitter (edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter)3 LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)3 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)3 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)3 CoreLabel (edu.stanford.nlp.ling.CoreLabel)3 Annotation (edu.stanford.nlp.pipeline.Annotation)3