Example usage of edu.illinois.cs.cogcomp.core.datastructures.IntPair in the cogcomp-nlp project (by CogComp): the method testIllinoisTokenizer of the class StatefullTokenizerTest.
/**
 * Test method for {@link IllinoisTokenizer}: checks that {@link StatefulTokenizer}
 * produces the expected token strings and character offsets for two simple sentences.
 */
@Test
public void testIllinoisTokenizer() {
    Tokenizer tokenizer = new StatefulTokenizer();
    String sentence = "This is a test.";
    String[] tokens = { "This", "is", "a", "test", "." };
    IntPair[] offsets = new IntPair[tokens.length];
    offsets[0] = new IntPair(0, 4);
    offsets[1] = new IntPair(5, 7);
    offsets[2] = new IntPair(8, 9);
    // BUG FIX: the previous expected offsets (12,16) and (16,17) do not match the
    // input — "test" occupies chars 10-14 and "." chars 14-15 of the 15-character
    // sentence; an end offset of 17 is past the end of the string.
    offsets[3] = new IntPair(10, 14);
    offsets[4] = new IntPair(14, 15);
    doTokenizerTest(tokenizer, sentence, tokens, offsets);
    sentence = "Hello, world! I am at UIUC.";
    tokens = new String[] { "Hello", ",", "world", "!", "I", "am", "at", "UIUC", "." };
    offsets = new IntPair[tokens.length];
    offsets[0] = new IntPair(0, 5);
    offsets[1] = new IntPair(5, 6);
    offsets[2] = new IntPair(7, 12);
    offsets[3] = new IntPair(12, 13);
    offsets[4] = new IntPair(14, 15);
    offsets[5] = new IntPair(16, 18);
    offsets[6] = new IntPair(19, 21);
    offsets[7] = new IntPair(22, 26);
    offsets[8] = new IntPair(26, 27);
    doTokenizerTest(tokenizer, sentence, tokens, offsets);
}
Example usage of edu.illinois.cs.cogcomp.core.datastructures.IntPair in the cogcomp-nlp project (by CogComp): the method testStatefulTokenizerMultiline of the class StatefullTokenizerTest.
/**
 * Runs the stateful tokenizer over a two-sentence, multi-line input and verifies
 * the sentence boundary indexes, a couple of individual tokens, and the character
 * offsets of two specific tokens.
 */
@Test
public void testStatefulTokenizerMultiline() {
    Tokenizer tokenizer = new StatefulTokenizer();
    String input = "Mr. Dawkins -- a liberal professor -- doesn't like fundamentalists. "
            + System.lineSeparator() + "He is intolerant of intolerance!";
    Tokenizer.Tokenization result = tokenizer.tokenizeTextSpan(input);

    // two sentences: 12 tokens in the first, 6 more in the second
    int[] sentenceEnds = result.getSentenceEndTokenIndexes();
    assertEquals(2, sentenceEnds.length);
    assertEquals(12, sentenceEnds[0]);
    assertEquals(18, sentenceEnds[1]);

    // spot-check individual tokens: the second "--" and the "of" in sentence two
    String[] resultTokens = result.getTokens();
    assertEquals("--", resultTokens[6]);
    assertEquals("of", resultTokens[15]);

    // spot-check character offsets: token 8 is "n't" (split from "doesn't"),
    // token 14 is "intolerant" in the second sentence
    IntPair[] charOffsets = result.getCharacterOffsets();
    assertEquals(new IntPair(42, 45), charOffsets[8]);
    assertEquals(new IntPair(77, 87), charOffsets[14]);
}
Example usage of edu.illinois.cs.cogcomp.core.datastructures.IntPair in the cogcomp-nlp project (by CogComp): the method getArgumentMap of the class PredicateArgumentEvaluator.
/**
 * Builds a map from argument span to {@link Record} for every argument of the given
 * predicate, folding each C-arg into the record of its base label (or promoting a
 * dangling C-arg to a plain argument). This is an annoying function to write. It is
 * probably VERY inefficient too...
 */
private Map<IntPair, Record> getArgumentMap(PredicateArgumentView view, Constituent predicate) {
    Set<IntPair> seenSpans = new HashSet<>();
    List<Pair<String, Constituent>> labeledArguments = new ArrayList<>();
    for (Relation relation : view.getArguments(predicate)) {
        Constituent argument = relation.getTarget();
        labeledArguments.add(new Pair<>(relation.getRelationName(), argument));
        // Set.add returns false when the span was already seen, i.e. an overlap.
        if (!seenSpans.add(argument.getSpan()))
            logger.error("Error! Overlapping spans in " + view.getViewName() + "\n"
                    + view.getTextAnnotation() + "\n" + view);
    }
    // order the arguments by the start position of their constituents
    labeledArguments.sort((left, right) -> TextAnnotationUtilities.constituentStartComparator
            .compare(left.getSecond(), right.getSecond()));
    List<Record> records = new ArrayList<>();
    // the predicate itself is recorded first, under the label "V"
    Record verbRecord = new Record(predicate.getStartSpan(), predicate.getEndSpan(), "V");
    records.add(verbRecord);
    Map<String, Record> recordsByLabel = new HashMap<>();
    recordsByLabel.put("V", verbRecord);
    for (Pair<String, Constituent> labeledArg : labeledArguments) {
        Constituent constituent = labeledArg.getSecond();
        String label = labeledArg.getFirst().replaceAll("Support", "SUP");
        if (label.startsWith("C-")) {
            // NOTE(review): replaceAll strips every "C-" occurrence, not just the
            // prefix — presumably labels contain it only once; confirm.
            String baseLabel = label.replaceAll("C-", "");
            Record existing = recordsByLabel.get(baseLabel);
            if (existing != null) {
                // grow the base record's span to cover this continuation
                existing.start = Math.min(constituent.getStartSpan(), existing.start);
                existing.end = Math.max(constituent.getEndSpan(), existing.end);
                assert existing.baseLabel.equals(baseLabel);
                existing.components.put(constituent.getSpan(), label);
            } else {
                // a dangling C-arg. This should never happen, but one never knows.
                // Simply treat this C-arg as arg.
                Record fresh =
                        new Record(constituent.getStartSpan(), constituent.getEndSpan(), baseLabel);
                recordsByLabel.put(baseLabel, fresh);
                records.add(fresh);
            }
        } else {
            Record fresh = new Record(constituent.getStartSpan(), constituent.getEndSpan(), label);
            recordsByLabel.put(label, fresh);
            records.add(fresh);
        }
    }
    // index the (possibly merged) records by their final span
    Map<IntPair, Record> spanToRecord = new HashMap<>();
    for (Record record : records)
        spanToRecord.put(new IntPair(record.start, record.end), record);
    return spanToRecord;
}
Example usage of edu.illinois.cs.cogcomp.core.datastructures.IntPair in the cogcomp-nlp project (by CogComp): the method evaluate of the class PredicateArgumentEvaluator.
/**
 * This function emulates the standard SRL evaluation script. The treatment of C-Args in the
 * original script is non-intuitive, but has been replicated here.
 *
 * @param tester The multi-class {@link ClassificationTester} for the argument labels
 * @param goldView the gold-standard view; must be a {@code PredicateArgumentView}
 * @param predictionView the predicted view; must be a {@code PredicateArgumentView}
 */
public void evaluate(ClassificationTester tester, View goldView, View predictionView) {
gold = (PredicateArgumentView) goldView;
prediction = (PredicateArgumentView) predictionView;
goldToPredictionPredicateMapping = getGoldToPredictionPredicateMapping();
for (Constituent gp : gold.getPredicates()) {
if (!goldToPredictionPredicateMapping.containsKey(gp)) {
// if there is no matching prediction, then, we have a recall
// problem for the label "V".
tester.recordGoldOnly("V");
// NOTE(review): the arguments of an unmatched gold predicate are not counted
// as misses here — presumably mirroring the original script; confirm.
// to decide on the arguments of this predicate.
continue;
}
Constituent pp = goldToPredictionPredicateMapping.get(gp);
// span -> merged argument record for the gold and predicted argument sets
Map<IntPair, Record> goldLabels = getArgumentMap(gold, gp);
Map<IntPair, Record> predictedLabels = getArgumentMap(prediction, pp);
// gold spans that have been matched against some prediction
Set<IntPair> goldDone = new HashSet<>();
for (IntPair predictedSpan : predictedLabels.keySet()) {
Record p = predictedLabels.get(predictedSpan);
Record g = goldLabels.get(predictedSpan);
if (g == null) {
// no gold argument with exactly this span: an over-prediction
tester.recordPredictionOnly(p.baseLabel);
continue;
}
// component spans of each record: more than one entry means the record
// was merged from C-args in getArgumentMap
Map<IntPair, String> gComponents = g.components;
Map<IntPair, String> pComponents = p.components;
assert gComponents != null;
assert pComponents != null;
if (gComponents.size() == 1 && pComponents.size() == 1) {
// simple case: single-span gold vs. single-span prediction
tester.record(g.baseLabel, p.baseLabel);
goldDone.add(predictedSpan);
} else if (gComponents.size() > 1 && pComponents.size() == 1) {
// this is a strange thing about the standard evaluation
// script. If the gold label contains a C-arg and the
// predicted label doesn't, then the script counts ONE
// over-prediction (Even if the C-args and the arg of the
// gold label together form the same span as the prediction.)
tester.recordPredictionOnly(p.baseLabel);
} else if (gComponents.size() == 1 && pComponents.size() > 1) {
// same as above!
tester.recordPredictionOnly(p.baseLabel);
} else {
// both records were merged from multiple component spans
if (p.baseLabel.startsWith("AM")) {
// AM-* adjuncts: score each component span independently
Set<IntPair> set = new HashSet<>();
set.addAll(gComponents.keySet());
set.addAll(pComponents.keySet());
for (IntPair s : set) {
String gLabel = gComponents.get(s);
String pLabel = pComponents.get(s);
if (gLabel != null && pLabel != null)
tester.record(gLabel, pLabel);
else if (gLabel == null)
tester.recordPredictionOnly(pLabel);
else
tester.recordGoldOnly(gLabel);
}
goldDone.add(predictedSpan);
} else {
// all spans should be correct!
boolean allOK = p.baseLabel.equals(g.baseLabel);
Set<IntPair> goldSpansLeft = new HashSet<>(gComponents.keySet());
for (IntPair pSpan : pComponents.keySet()) {
if (gComponents.containsKey(pSpan))
goldSpansLeft.remove(pSpan);
else {
allOK = false;
break;
}
}
// NOTE(review): a gold record scores as correct even if goldSpansLeft is
// non-empty (prediction covered a subset) — presumably intentional per the
// original script; confirm.
if (allOK) {
tester.record(g.baseLabel, p.baseLabel);
goldDone.add(predictedSpan);
} else {
tester.recordPredictionOnly(p.baseLabel);
}
}
}
}
// any gold span never matched by a prediction is a recall error
for (IntPair gSpan : goldLabels.keySet()) {
if (!goldDone.contains(gSpan))
tester.recordGoldOnly(goldLabels.get(gSpan).baseLabel);
}
}
}
Example usage of edu.illinois.cs.cogcomp.core.datastructures.IntPair in the cogcomp-nlp project (by CogComp): the method tokenizeTextSpan of the class BasicTextAnnotationBuilder.
/**
 * Converts pre-tokenized sentences into a {@link Tokenization}, assigning character
 * offsets as if the tokens were joined by single spaces (both between tokens and
 * between sentences).
 *
 * @param tokenizedSentences one String[] of tokens per sentence
 * @return a Tokenization holding the flattened tokens, their synthetic character
 *         offsets, and the sentence-end token indexes
 */
private static Tokenization tokenizeTextSpan(List<String[]> tokenizedSentences) {
    List<String> tokensList = new ArrayList<>();
    List<IntPair> charOffsetsList = new ArrayList<>();
    int[] sentenceEndIndexes = new int[tokenizedSentences.size()];
    int sentIndex = 0;
    int sentStartTokOffset = 0;
    int sentStartCharOffset = 0;
    for (String[] sentenceTokens : tokenizedSentences) {
        sentenceEndIndexes[sentIndex++] = sentStartTokOffset + sentenceTokens.length;
        int tokenStartOffset = 0;
        // BUG FIX: previously initialized to 0, so a sentence with zero tokens would
        // reset sentStartCharOffset to 0 and corrupt all subsequent offsets; starting
        // from sentStartCharOffset leaves the offset unchanged for an empty sentence.
        int nextSentStartCharOffset = sentStartCharOffset;
        for (String sentenceToken : sentenceTokens) {
            tokensList.add(sentenceToken);
            int tokenCharStart = sentStartCharOffset + tokenStartOffset;
            int tokenCharEnd = tokenCharStart + sentenceToken.length();
            charOffsetsList.add(new IntPair(tokenCharStart, tokenCharEnd));
            // The next token should start after a single space
            tokenStartOffset += sentenceToken.length() + 1;
            // by end of loop, this should match the start of the next sentence
            nextSentStartCharOffset = tokenCharEnd + 1;
        }
        sentStartTokOffset += sentenceTokens.length;
        sentStartCharOffset = nextSentStartCharOffset;
    }
    assert tokensList.size() == charOffsetsList.size();
    // idiomatic array conversion instead of manual copy loops
    String[] tokens = tokensList.toArray(new String[0]);
    IntPair[] charOffsets = charOffsetsList.toArray(new IntPair[0]);
    return new Tokenization(tokens, charOffsets, sentenceEndIndexes);
}
Aggregations