Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
Class JsonSerializerTest, method verifyDeserializedJsonString.
/**
 * Deserializes {@code json} and asserts that the result agrees with {@code ta} on a fixed
 * set of accessors. Behavior specific to unit tests only. Use with caution.
 *
 * <p>The probes use hard-coded positions (sentence 1, token 4, tokens 1-3, character offset 5
 * and the seventh constituent of the TOKENS view), so {@code ta} must be large enough for
 * all of them to be valid.
 *
 * @param json the JSON string to deserialize
 * @param ta the annotation supplying the expected values
 * @throws Exception if thrown while deserializing or probing the annotations
 */
public static void verifyDeserializedJsonString(String json, TextAnnotation ta) throws Exception {
    TextAnnotation ta2 = SerializationHelper.deserializeFromJson(json);

    // assertEquals takes (expected, actual): the original annotation is the expectation and
    // the deserialized copy is the value under test, so failure messages label them correctly.
    assertEquals(ta.getCorpusId(), ta2.getCorpusId());
    assertEquals(ta.getId(), ta2.getId());
    assertEquals(ta.getNumberOfSentences(), ta2.getNumberOfSentences());
    assertEquals(ta.getSentence(1), ta2.getSentence(1));
    assertEquals(ta.getSentenceFromToken(2), ta2.getSentenceFromToken(2));
    assertEquals(ta.getTokenIdFromCharacterOffset(5), ta2.getTokenIdFromCharacterOffset(5));
    assertEquals(ta.getToken(4), ta2.getToken(4));
    assertEquals(ta.getAvailableViews(), ta2.getAvailableViews());
    assertEquals(Arrays.toString(ta.getTokensInSpan(1, 3)), Arrays.toString(ta2.getTokensInSpan(1, 3)));
    assertEquals(ta.getText(), ta2.getText());

    // Compare surface form and character offsets of the seventh token in both annotations.
    Constituent seventhToken = ta.getView(ViewNames.TOKENS).getConstituents().get(6);
    IntPair tokCharOffsets = new IntPair(seventhToken.getStartCharOffset(), seventhToken.getEndCharOffset());
    String seventhTokenForm = seventhToken.getSurfaceForm();

    Constituent seventhTokenCopy = ta2.getView(ViewNames.TOKENS).getConstituents().get(6);
    IntPair tokCharOffsets2 = new IntPair(seventhTokenCopy.getStartCharOffset(), seventhTokenCopy.getEndCharOffset());
    String seventhTokenForm2 = seventhTokenCopy.getSurfaceForm();

    assertEquals(seventhTokenForm, seventhTokenForm2);
    assertEquals(tokCharOffsets, tokCharOffsets2);
}
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
Class GoldLabel, method getArgument.
/**
 * Converts the span pointers stored in {@code propSpanInfo} into labeled constituents.
 *
 * <p>The pointer string is split on {@code *} and then on {@code ,}; every non-empty piece is
 * resolved with {@code getSpan}. Spans with a negative start, or whose start is not smaller
 * than their end, are dropped. A span whose non-terminal starts with "WH" is flagged; flagged
 * spans receive an "R-" label prefix when there is more than one span. Of the remaining spans
 * the first keeps the plain label and every later one receives a "C-" prefix. Spans are
 * ordered by start position before labels are assigned.
 *
 * @param ta the text annotation the constituents are attached to
 * @param viewName the view name given to every created constituent
 * @param yield handed to {@code getSpan} to resolve each pointer
 * @param mergeContiguousCArgs if true and no span was flagged, the spans are first passed
 *        through {@code mergeCArgs}
 * @return the created constituents, in order of span start
 */
List<Constituent> getArgument(TextAnnotation ta, String viewName, List<Tree<Pair<String, IntPair>>> yield, boolean mergeContiguousCArgs) {
    List<Pair<IntPair, Boolean>> spans = new ArrayList<>();
    boolean sawFlagged = false;

    for (String chunk : propSpanInfo.split("\\*")) {
        if (chunk.isEmpty()) {
            continue;
        }
        for (String pointer : chunk.split(",")) {
            if (pointer.isEmpty()) {
                continue;
            }
            Pair<String, IntPair> resolved = getSpan(pointer, yield);
            String nonTerminal = resolved.getFirst();
            IntPair span = resolved.getSecond();

            boolean usable = span.getFirst() >= 0 && span.getFirst() < span.getSecond();
            if (!usable) {
                continue;
            }

            boolean flagged = nonTerminal.startsWith("WH");
            sawFlagged |= flagged;
            spans.add(new Pair<>(span, flagged));
        }
    }

    // Order by span start only; the sort is stable, so ties keep their insertion order.
    Collections.sort(spans, new Comparator<Pair<IntPair, Boolean>>() {
        @Override
        public int compare(Pair<IntPair, Boolean> left, Pair<IntPair, Boolean> right) {
            return Integer.compare(left.getFirst().getFirst(), right.getFirst().getFirst());
        }
    });

    if (mergeContiguousCArgs && !sawFlagged) {
        spans = mergeCArgs(spans);
    }

    List<Constituent> result = new ArrayList<>();
    boolean plainLabelAvailable = true;
    for (Pair<IntPair, Boolean> entry : spans) {
        String spanLabel = this.label;
        if (entry.getSecond() && spans.size() > 1) {
            spanLabel = "R-" + spanLabel;
        } else if (plainLabelAvailable) {
            // The first non-"R-" span keeps the bare label.
            plainLabelAvailable = false;
        } else {
            spanLabel = "C-" + spanLabel;
        }

        IntPair bounds = entry.getFirst();
        Constituent constituent = new Constituent(spanLabel, viewName, ta, bounds.getFirst(), bounds.getSecond());
        if (h != null) {
            constituent.addAttribute(AbstractSRLAnnotationReader.HyphenTagInfo, h);
        }
        result.add(constituent);
    }
    return result;
}
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
Class GoldLabel, method addAnnotation.
/**
 * Builds a predicate-argument view named {@code srlViewName} for {@code ta} from the gold
 * fields registered under the annotation's id, and adds the view to {@code ta} when it
 * contains at least one predicate.
 *
 * <p>Only the first predicate seen at a given start span is kept, and within one predicate
 * each argument span is used at most once.
 *
 * @param ta the text annotation to annotate
 */
private void addAnnotation(TextAnnotation ta) {
    Tree<String> goldParse = ParseUtils.getParseTree(ViewNames.PARSE_GOLD, ta, 0);
    Tree<Pair<String, IntPair>> spanTree = ParseUtils.getSpanLabeledTree(goldParse);
    List<Tree<Pair<String, IntPair>>> leaves = spanTree.getYield();

    PredicateArgumentView view = new PredicateArgumentView(srlViewName, "AnnotatedTreebank", ta, 1.0);
    Set<Integer> predicateStarts = new HashSet<>();

    for (Fields fields : goldFields.get(ta.getId())) {
        Constituent predicate = fields.createPredicate(ta, srlViewName, leaves);
        // Set.add returns false when the start span was already recorded.
        if (!predicateStarts.add(predicate.getStartSpan())) {
            continue;
        }

        List<Constituent> args = new ArrayList<>();
        List<String> labels = new ArrayList<>();
        List<Double> scores = new ArrayList<>();

        // We need to make sure that the One-Argument-Per-Span constraint is
        // respected. Yes sir, we do, even if the data says otherwise!
        Set<IntPair> usedSpans = new HashSet<>();
        for (GoldLabel goldLabel : fields.getGoldLabels()) {
            List<Constituent> candidates = goldLabel.getArgument(ta, srlViewName, leaves, mergeContiguousCArgs);
            List<Constituent> unique = new ArrayList<>();
            for (Constituent candidate : candidates) {
                if (usedSpans.add(candidate.getSpan())) {
                    unique.add(candidate);
                }
            }
            addArguments(ta, predicate, args, labels, scores, goldLabel, unique);
        }

        String[] labelArray = labels.toArray(new String[labels.size()]);
        view.addPredicateArguments(predicate, args, labelArray, ArrayUtilities.asDoubleArray(scores));
    }

    if (view.getPredicates().size() > 0) {
        ta.addView(srlViewName, view);
    }
}
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
Class GoldLabel, method mergeCArgs.
/**
 * Collapses runs of adjacent spans into single spans.
 *
 * <p>Walking the list in order, a span whose start equals the end of the span accumulated so
 * far is folded into it; the flag of a merged span is the logical AND of the flags of its
 * parts. A list with fewer than two entries is returned as is.
 *
 * @param spans the (span, flag) pairs to merge, in the order they should be examined
 * @return {@code spans} itself when it has at most one entry, otherwise a new list of merged pairs
 */
protected List<Pair<IntPair, Boolean>> mergeCArgs(List<Pair<IntPair, Boolean>> spans) {
    if (spans.size() < 2) {
        return spans;
    }

    List<Pair<IntPair, Boolean>> merged = new ArrayList<>();
    IntPair current = null;
    boolean currentFlag = true;

    for (Pair<IntPair, Boolean> entry : spans) {
        IntPair next = entry.getFirst();
        boolean contiguous = current != null && next.getFirst() == current.getSecond();

        if (contiguous) {
            // Extend the running span to cover this one.
            current = new IntPair(current.getFirst(), next.getSecond());
            currentFlag &= entry.getSecond();
            continue;
        }

        // Gap (or very first entry): emit the finished span, if any, and start a new one.
        if (current != null) {
            merged.add(new Pair<>(current, currentFlag));
        }
        current = next;
        currentFlag = entry.getSecond();
    }

    // The last running span is still pending.
    merged.add(new Pair<>(current, currentFlag));

    assert merged.size() <= spans.size();
    assert spans.isEmpty() || !merged.isEmpty();
    return merged;
}
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
Class CoNLLNerReader, method loadCoNLLfile.
/**
 * Loads a CoNLL-style NER file into a single TextAnnotation.
 *
 * <p>Every line is split on tabs. A line starting with "B-" opens an entity whose label is the
 * part of the first column after the first hyphen; a line starting with "I-" continues the
 * current entity; any other line closes it. Whitespace-only lines additionally mark sentence
 * boundaries. The word form is read from the sixth column (index 5); empty forms and
 * "-DOCSTART-" are not added as tokens.
 *
 * <p>The returned annotation carries a SENTENCE view built from the sentence boundaries and
 * an NER_CONLL view with one constituent per entity.
 *
 * @param filename path of the file to read; its file name is passed on as the annotation id
 * @return the annotation built from the file's tokens, sentences and entities
 * @throws FileNotFoundException if thrown while reading {@code filename}
 */
public static TextAnnotation loadCoNLLfile(String filename) throws FileNotFoundException {
    logger.info("Reading: " + filename);
    List<String> lines = LineIO.read(filename);

    List<IntPair> spans = new ArrayList<>();
    List<String> labels = new ArrayList<>();
    List<Integer> sentenceEndPositions = new ArrayList<>();
    StringBuilder text = new StringBuilder();

    // Token index at which the open entity started, or -1 when no entity is open.
    int start = -1;
    String label = "";
    // Number of tokens consumed so far, i.e. the index of the next token.
    int i = 0;
    // Boundaries are recorded in increasing order, so remembering the last one is enough to
    // detect duplicates without scanning the whole list.
    int lastSentenceEnd = -1;

    for (String line : lines) {
        String[] sline = line.split("\t");
        if (line.startsWith("B-")) {
            // two consecutive entities.
            if (start > -1) {
                // peel off a constituent if it exists.
                spans.add(new IntPair(start, i));
                labels.add(label);
            }
            start = i;
            // The line starts with a non-tab character here, so sline[0] exists.
            label = sline[0].split("-")[1];
        } else if (line.startsWith("I-")) {
            // Inside the current entity: nothing to do. The test is made on the line itself
            // because a line consisting only of tabs splits into an empty array, which would
            // make sline[0] throw.
        } else {
            // this is a sentence boundary.
            if (line.trim().length() == 0) {
                // in case there are multiple empty lines at the end.
                if (i > 0 && lastSentenceEnd != i) {
                    sentenceEndPositions.add(i);
                    lastSentenceEnd = i;
                }
            }
            // it's O or it's empty
            if (start > -1) {
                // peel off a constituent if it exists.
                spans.add(new IntPair(start, i));
                labels.add(label);
            }
            label = "";
            start = -1;
        }

        // add the word form to the sentence.
        if (sline.length > 5 && !sline[5].equals("-DOCSTART-") && sline[5].trim().length() > 0) {
            text.append(sline[5]).append(' ');
            i++;
        }
    }

    // in case the very last line is an NE.
    if (start > -1) {
        spans.add(new IntPair(start, i));
        labels.add(label);
    }

    // in case there are no empty lines.
    if (lastSentenceEnd != i) {
        sentenceEndPositions.add(i);
    }

    // we jump through these hoops so we can give the TA an id.
    String filenameonly = IOUtils.getFileName(filename);
    List<String[]> tokenizedSentences = Collections.singletonList(text.toString().split(" "));
    TextAnnotation ta = BasicTextAnnotationBuilder.createTextAnnotationFromTokens("", filenameonly, tokenizedSentences);

    // Replace the sentence segmentation with the boundaries read from the file.
    SpanLabelView sentview = new SpanLabelView(ViewNames.SENTENCE, "UserSpecified", ta, 1d);
    ta.addView(ViewNames.SENTENCE, sentview);
    int sentstart = 0;
    for (int s : sentenceEndPositions) {
        sentview.addSpanLabel(sentstart, s, ViewNames.SENTENCE, 1d);
        sentstart = s;
    }

    // One NER_CONLL constituent per collected entity span.
    SpanLabelView nerView = new SpanLabelView(ViewNames.NER_CONLL, "UserSpecified", ta, 1d);
    ta.addView(ViewNames.NER_CONLL, nerView);
    for (int k = 0; k < labels.size(); k++) {
        IntPair span = spans.get(k);
        Constituent c = new Constituent(labels.get(k), ViewNames.NER_CONLL, ta, span.getFirst(), span.getSecond());
        nerView.addConstituent(c);
    }
    return ta;
}
Aggregations