Search in sources :

Example 56 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class StatefullTokenizerTest method testIllinoisTokenizer.

/**
 * Checks the tokens and character offsets expected from {@link StatefulTokenizer} for two
 * short inputs: one containing a run of consecutive spaces and one with punctuation attached
 * to words. The comparison itself is delegated to {@code doTokenizerTest}.
 */
@Test
public void testIllinoisTokenizer() {
    Tokenizer tokenizer = new StatefulTokenizer();

    // Case 1: three spaces separate "a" and "test". Expected offsets are (start, end) pairs
    // into the raw sentence, with end one past the token's last character.
    String spacedSentence = "This is a   test.";
    String[] spacedTokens = { "This", "is", "a", "test", "." };
    IntPair[] spacedOffsets = {
        new IntPair(0, 4),
        new IntPair(5, 7),
        new IntPair(8, 9),
        new IntPair(12, 16),
        new IntPair(16, 17)
    };
    doTokenizerTest(tokenizer, spacedSentence, spacedTokens, spacedOffsets);

    // Case 2: punctuation marks are expected as separate tokens directly after the word.
    String punctuatedSentence = "Hello, world! I am at UIUC.";
    String[] punctuatedTokens = { "Hello", ",", "world", "!", "I", "am", "at", "UIUC", "." };
    IntPair[] punctuatedOffsets = {
        new IntPair(0, 5),
        new IntPair(5, 6),
        new IntPair(7, 12),
        new IntPair(12, 13),
        new IntPair(14, 15),
        new IntPair(16, 18),
        new IntPair(19, 21),
        new IntPair(22, 26),
        new IntPair(26, 27)
    };
    doTokenizerTest(tokenizer, punctuatedSentence, punctuatedTokens, punctuatedOffsets);
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Test(org.junit.Test)

Example 57 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class StatefullTokenizerTest method testStatefulTokenizerMultiline.

/**
 * Tokenizes a two-line text with the stateful tokenizer and checks the sentence end token
 * indexes, two selected tokens and two selected character offsets.
 */
@Test
public void testStatefulTokenizerMultiline() {
    Tokenizer tkr = new StatefulTokenizer();
    // The line break is pinned to a single '\n'. The character offsets asserted below count
    // the break as exactly one character, so a platform separator such as "\r\n" would shift
    // every expected offset on the second line by one.
    String text = "Mr. Dawkins -- a liberal professor -- doesn't like fundamentalists.   " + "\n"
            + "He is intolerant of intolerance!";
    Tokenizer.Tokenization tknzn = tkr.tokenizeTextSpan(text);

    // Two sentences are expected: the first ends after 12 tokens, the second after 18.
    int[] sentEndOffsets = tknzn.getSentenceEndTokenIndexes();
    assertEquals(2, sentEndOffsets.length);
    assertEquals(12, sentEndOffsets[0]);
    assertEquals(18, sentEndOffsets[1]);

    String[] tokens = tknzn.getTokens();
    assertEquals("--", tokens[6]);
    assertEquals("of", tokens[15]);

    IntPair[] tokenOffsets = tknzn.getCharacterOffsets();
    // Token 8 is the "n't" split off from "doesn't" (characters 42..45 of the text).
    int notIndex = 8;
    IntPair notOffsets = new IntPair(42, 45);
    assertEquals(notOffsets, tokenOffsets[notIndex]);
    // Token 14 is "intolerant" on the second line; 77 = 70 first-line chars + 1 break + "He is ".
    int intolerantIndex = 14;
    IntPair intolerantOffsets = new IntPair(77, 87);
    assertEquals(intolerantOffsets, tokenOffsets[intolerantIndex]);
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Test(org.junit.Test)

Example 58 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class PredicateArgumentEvaluator method getArgumentMap.

/**
 * Groups the arguments of {@code predicate} in {@code view} into {@link Record}s and returns
 * them keyed by their (start, end) span.
 *
 * <p>A record labelled "V" is always created for the predicate itself. Every argument label has
 * "Support" rewritten to "SUP". An argument whose label starts with "C-" is folded into the most
 * recently created record carrying the same base label (widening that record's span and adding
 * the argument to its components); if no such record exists, it is treated as an ordinary
 * argument with the base label.
 *
 * @param view the predicate-argument view to read the relations from
 * @param predicate the predicate whose arguments are collected
 * @return a map from each record's final span to the record; records sharing a span overwrite
 *         one another, the later one winning
 */
private Map<IntPair, Record> getArgumentMap(PredicateArgumentView view, Constituent predicate) {
    Set<IntPair> seenSpans = new HashSet<>();
    List<Pair<String, Constituent>> labeledArgs = new ArrayList<>();
    for (Relation relation : view.getArguments(predicate)) {
        Constituent argument = relation.getTarget();
        labeledArgs.add(new Pair<>(relation.getRelationName(), argument));
        // Set.add returns false when the span was already present, i.e. a repeated span.
        if (!seenSpans.add(argument.getSpan()))
            logger.error("Error! Overlapping spans in " + view.getViewName() + "\n" + view.getTextAnnotation() + "\n" + view);
    }

    // Order the arguments by their constituents so that a "C-" continuation is seen after the
    // argument it continues (ordering is whatever constituentStartComparator defines).
    Comparator<Pair<String, Constituent>> byConstituent = new Comparator<Pair<String, Constituent>>() {

        @Override
        public int compare(Pair<String, Constituent> left, Pair<String, Constituent> right) {
            return TextAnnotationUtilities.constituentStartComparator.compare(left.getSecond(), right.getSecond());
        }
    };
    Collections.sort(labeledArgs, byConstituent);

    // The verb gets its own record before any argument is processed.
    Record verbRecord = new Record(predicate.getStartSpan(), predicate.getEndSpan(), "V");
    List<Record> allRecords = new ArrayList<>();
    allRecords.add(verbRecord);
    Map<String, Record> latestByLabel = new HashMap<>();
    latestByLabel.put("V", verbRecord);

    for (Pair<String, Constituent> labeledArg : labeledArgs) {
        Constituent argument = labeledArg.getSecond();
        String label = labeledArg.getFirst().replaceAll("Support", "SUP");
        boolean isContinuation = label.startsWith("C-");
        String baseLabel = isContinuation ? label.replaceAll("C-", "") : label;

        // Only continuation arguments look for an earlier record to extend.
        Record continued = isContinuation ? latestByLabel.get(baseLabel) : null;
        if (continued != null) {
            continued.start = Math.min(argument.getStartSpan(), continued.start);
            continued.end = Math.max(argument.getEndSpan(), continued.end);
            assert continued.baseLabel.equals(baseLabel);
            continued.components.put(argument.getSpan(), label);
            continue;
        }

        // Either a plain argument, or a dangling C-arg with nothing to attach to; both start a
        // new record under the base label and become the target for later continuations.
        Record fresh = new Record(argument.getStartSpan(), argument.getEndSpan(), baseLabel);
        latestByLabel.put(baseLabel, fresh);
        allRecords.add(fresh);
    }

    // Index by span only now, because continuation arguments may have widened earlier records.
    Map<IntPair, Record> recordsBySpan = new HashMap<>();
    for (Record record : allRecords) {
        recordsBySpan.put(new IntPair(record.start, record.end), record);
    }
    return recordsBySpan;
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 59 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class PredicateArgumentEvaluator method evaluate.

/**
 * This function emulates the standard SRL evaluation script. The treatment of C-Args in the
 * original script is non-intuitive, but has been replicated here.
 *
 * <p>For every gold predicate, the gold and predicted arguments are compared span by span and
 * the outcome is reported to {@code tester} as a match, a prediction-only or a gold-only event.
 *
 * @param tester The multi-class {@link ClassificationTester} for the argument labels
 * @param goldView the gold annotation; cast to {@link PredicateArgumentView}
 * @param predictionView the predicted annotation; cast to {@link PredicateArgumentView}
 */
public void evaluate(ClassificationTester tester, View goldView, View predictionView) {
    gold = (PredicateArgumentView) goldView;
    prediction = (PredicateArgumentView) predictionView;
    goldToPredictionPredicateMapping = getGoldToPredictionPredicateMapping();

    for (Constituent goldPredicate : gold.getPredicates()) {
        if (!goldToPredictionPredicateMapping.containsKey(goldPredicate)) {
            // No predicted predicate corresponds to this gold one: count a miss for "V".
            // The arguments of the unmatched gold predicate are not scored at all.
            tester.recordGoldOnly("V");
            continue;
        }

        Constituent predictedPredicate = goldToPredictionPredicateMapping.get(goldPredicate);
        Map<IntPair, Record> goldBySpan = getArgumentMap(gold, goldPredicate);
        Map<IntPair, Record> predictedBySpan = getArgumentMap(prediction, predictedPredicate);
        // Gold spans that have been accounted for by some prediction.
        Set<IntPair> matchedGoldSpans = new HashSet<>();

        for (Map.Entry<IntPair, Record> predictedEntry : predictedBySpan.entrySet()) {
            IntPair span = predictedEntry.getKey();
            Record p = predictedEntry.getValue();
            Record g = goldBySpan.get(span);

            // Nothing in gold covers exactly this span.
            if (g == null) {
                tester.recordPredictionOnly(p.baseLabel);
                continue;
            }

            Map<IntPair, String> goldParts = g.components;
            Map<IntPair, String> predictedParts = p.components;
            assert goldParts != null;
            assert predictedParts != null;
            int goldPartCount = goldParts.size();
            int predictedPartCount = predictedParts.size();

            // Simple case: neither side has C-args.
            if (goldPartCount == 1 && predictedPartCount == 1) {
                tester.record(g.baseLabel, p.baseLabel);
                matchedGoldSpans.add(span);
                continue;
            }

            // A quirk of the standard evaluation script: when exactly one side contains a
            // C-arg, it counts ONE over-prediction, even if the pieces together cover the same
            // span. The gold span stays unmatched and is reported as gold-only further down.
            boolean onlyGoldSplit = goldPartCount > 1 && predictedPartCount == 1;
            boolean onlyPredictionSplit = goldPartCount == 1 && predictedPartCount > 1;
            if (onlyGoldSplit || onlyPredictionSplit) {
                tester.recordPredictionOnly(p.baseLabel);
                continue;
            }

            if (p.baseLabel.startsWith("AM")) {
                // Score each component span on its own, over the union of both sides' pieces.
                Set<IntPair> allParts = new HashSet<>();
                allParts.addAll(goldParts.keySet());
                allParts.addAll(predictedParts.keySet());
                for (IntPair part : allParts) {
                    String goldPartLabel = goldParts.get(part);
                    String predictedPartLabel = predictedParts.get(part);
                    if (goldPartLabel == null)
                        tester.recordPredictionOnly(predictedPartLabel);
                    else if (predictedPartLabel == null)
                        tester.recordGoldOnly(goldPartLabel);
                    else
                        tester.record(goldPartLabel, predictedPartLabel);
                }
                matchedGoldSpans.add(span);
                continue;
            }

            // Otherwise the labels must agree and every predicted component span must also be a
            // gold component span. Gold components without a predicted counterpart do not
            // affect this check.
            boolean allPartsMatch = p.baseLabel.equals(g.baseLabel)
                    && goldParts.keySet().containsAll(predictedParts.keySet());
            if (allPartsMatch) {
                tester.record(g.baseLabel, p.baseLabel);
                matchedGoldSpans.add(span);
            } else {
                tester.recordPredictionOnly(p.baseLabel);
            }
        }

        // Whatever gold argument was never matched counts against recall.
        for (Map.Entry<IntPair, Record> goldEntry : goldBySpan.entrySet()) {
            if (!matchedGoldSpans.contains(goldEntry.getKey()))
                tester.recordGoldOnly(goldEntry.getValue().baseLabel);
        }
    }
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 60 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class BasicTextAnnotationBuilder method tokenizeTextSpan.

/**
 * Builds a {@link Tokenization} from already-tokenized sentences.
 *
 * <p>Character offsets are synthesized as if all tokens were laid out in order with exactly one
 * separator character between consecutive tokens, including across sentence boundaries. The
 * first token starts at offset 0.
 *
 * <p>A sentence with no tokens contributes no tokens and does not move the character offset; its
 * sentence end index equals that of the preceding sentence.
 *
 * @param tokenizedSentences the sentences, each given as an array of its tokens
 * @return the tokens, their (start, end) character offsets, and for each sentence the index one
 *         past its last token
 */
private static Tokenization tokenizeTextSpan(List<String[]> tokenizedSentences) {
    List<String> tokensList = new ArrayList<>();
    List<IntPair> charOffsetsList = new ArrayList<>();
    int[] sentenceEndIndexes = new int[tokenizedSentences.size()];
    int sentIndex = 0;
    // Start offset of the next token. A single running counter is carried across sentences so
    // that an empty sentence cannot reset it (which would yield offsets overlapping earlier
    // tokens).
    int nextTokenCharStart = 0;
    for (String[] sentenceTokens : tokenizedSentences) {
        for (String sentenceToken : sentenceTokens) {
            int tokenCharEnd = nextTokenCharStart + sentenceToken.length();
            tokensList.add(sentenceToken);
            charOffsetsList.add(new IntPair(nextTokenCharStart, tokenCharEnd));
            // The next token (or the next sentence) starts after a single space.
            nextTokenCharStart = tokenCharEnd + 1;
        }
        // Number of tokens seen so far == index one past this sentence's last token.
        sentenceEndIndexes[sentIndex++] = tokensList.size();
    }
    assert tokensList.size() == charOffsetsList.size();
    String[] tokens = tokensList.toArray(new String[0]);
    IntPair[] charOffsets = charOffsetsList.toArray(new IntPair[0]);
    return new Tokenization(tokens, charOffsets, sentenceEndIndexes);
}
Also used : ArrayList(java.util.ArrayList) Tokenization(edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer.Tokenization) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Aggregations

IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair) 103, Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair) 32, Test (org.junit.Test) 20, ArrayList (java.util.ArrayList) 19, TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) 18, Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) 14, StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation) 13, XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) 6, Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree) 5, SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) 4, Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence) 4, FileNotFoundException (java.io.FileNotFoundException) 4, Matcher (java.util.regex.Matcher) 4, View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) 3, SentenceSplitter (edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter) 3, LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) 3, TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) 3, CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations) 3, CoreLabel (edu.stanford.nlp.ling.CoreLabel) 3, Annotation (edu.stanford.nlp.pipeline.Annotation) 3