Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the project cogcomp-nlp by CogComp: the method readSentences of the class JsonSerializer.
/**
 * Reads sentence metadata from the serialized record: the name of the sentence
 * generator, its score, and the token offsets marking each sentence end.
 */
private static Pair<Pair<String, Double>, int[]> readSentences(JsonObject json) {
    JsonObject sentenceInfo = json.getAsJsonObject("sentences");
    Pair<String, Double> generatorAndScore =
            new Pair<>(readString("generator", sentenceInfo), readDouble("score", sentenceInfo));
    return new Pair<>(generatorAndScore, readIntArray("sentenceEndPositions", sentenceInfo));
}
Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the project cogcomp-nlp by CogComp: the method rationalizeBoundaryAnnotations of the class AnnotationFixer.
/**
 * Correct automated annotations (tokenization, sentence splitting) based on gold annotations of
 * entity mentions: any sentence boundary that falls strictly inside a gold mention span is
 * treated as a sentence-splitter error, and the two sentences it separates are merged.
 *
 * @param ta TextAnnotation with annotated clean text
 * @param viewName name of the view containing the gold mention constituents
 * @return a map of view names to indexes indicating where violations were found/corrected
 * @throws IllegalStateException if more than two consecutive sentences have boundary errors
 */
public static Map<String, Set<Integer>> rationalizeBoundaryAnnotations(TextAnnotation ta, String viewName) {
    Map<String, Set<Integer>> violations = new HashMap<>();
    Set<Integer> badSentenceStartIndexes = new HashSet<>();
    violations.put(ViewNames.SENTENCE, badSentenceStartIndexes);
    View sentences = ta.getView(ViewNames.SENTENCE);
    // index sentences by start token so they can be scanned in document order
    TreeMap<Integer, Constituent> sentenceStarts = new TreeMap<>();
    for (Constituent s : sentences)
        sentenceStarts.put(s.getStartSpan(), s);
    Set<Pair<Constituent, Constituent>> sentencesToMerge = new HashSet<>();
    View nerMention = ta.getView(viewName);
    for (Constituent m : nerMention.getConstituents()) {
        // hoisted out of the inner loop: the mention span does not change per sentence
        int mentStart = m.getStartSpan();
        int mentEnd = m.getEndSpan();
        Constituent lastSent = null;
        for (int sentStart : sentenceStarts.keySet()) {
            if (sentStart > mentEnd)
                break; // ordered sentence list, so stop after passing the mention
            Constituent currentSent = sentenceStarts.get(sentStart);
            // a sentence start strictly inside the mention span is a splitter error
            if (sentStart > mentStart && sentStart < mentEnd) {
                sentencesToMerge.add(new Pair<>(lastSent, currentSent));
                badSentenceStartIndexes.add(sentStart);
            }
            lastSent = currentSent;
        }
    }
    Set<Integer> sentStartsProcessed = new HashSet<>();
    for (Pair<Constituent, Constituent> sentPair : sentencesToMerge) {
        Constituent first = sentPair.getFirst();
        Constituent second = sentPair.getSecond();
        int firstStart = first.getStartSpan();
        int secondStart = second.getStartSpan();
        if (sentStartsProcessed.contains(firstStart) || sentStartsProcessed.contains(secondStart)) {
            throw new IllegalStateException("more complex boundary constraints than I can currently handle -- "
                    + "more than two consecutive sentences with boundary errors.");
        }
        // FIX: record both starts so a later pair touching either sentence trips the guard
        // above; previously this set was never populated, making the guard dead code.
        sentStartsProcessed.add(firstStart);
        sentStartsProcessed.add(secondStart);
        // build the merged sentence, preserving label scores when present
        Constituent combinedSent;
        if (null == first.getLabelsToScores())
            combinedSent = new Constituent(first.getLabel(), first.getConstituentScore(), ViewNames.SENTENCE,
                    first.getTextAnnotation(), first.getStartSpan(), second.getEndSpan());
        else
            combinedSent = new Constituent(first.getLabelsToScores(), ViewNames.SENTENCE,
                    first.getTextAnnotation(), first.getStartSpan(), second.getEndSpan());
        for (String k : first.getAttributeKeys()) {
            combinedSent.addAttribute(k, first.getAttribute(k));
        }
        for (String k : second.getAttributeKeys()) {
            // FIX: copy the second sentence's attribute values; previously read from 'first'
            combinedSent.addAttribute(k, second.getAttribute(k));
        }
        sentences.removeConstituent(first);
        sentences.removeConstituent(second);
        sentences.addConstituent(combinedSent);
    }
    // rebuild sentence-level structures from the updated SENTENCE view
    ta.setSentences();
    return violations;
}
Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the project cogcomp-nlp by CogComp: the method applyPendingEdits of the class StringTransformation.
/**
 * Apply any pending edits to the transformed text, then recompute the bookkeeping
 * that maps character offsets between the original and transformed strings: the
 * forward offset-modification map ({@code recordedOffsetModifications}) and its
 * inverse ({@code recordedInverseModifications}). No-op when nothing is modified.
 */
public void applyPendingEdits() {
    String currentStr = transformedText;
    if (isModified) {
        /*
         * immediately set flag, as we may call other methods that check this condition, which could call this
         * method
         */
        isModified = false;
        /*
         * it's OK for edits to be unsorted: all edit offsets are computed relative to the previous edits
         * in the sequence
         */
        for (Edit edit : edits) {
            IntPair editOffsets = edit.offsets;
            // splice the replacement string into the edit window [first, second)
            String before = currentStr.substring(0, editOffsets.getFirst());
            String after = currentStr.substring(editOffsets.getSecond());
            currentStr = before + edit.newString + after;
        }
        transformedText = currentStr;
        /*
         * store pending recorded offsets while computing absolute offsets for all current edits
         */
        Map<Integer, Pair<Integer, EditType>> toAdd = new TreeMap();
        for (Integer modOffset : currentOffsetModifications.keySet()) {
            Integer currentMod = currentOffsetModifications.get(modOffset).getFirst();
            EditType currentEditType = currentOffsetModifications.get(modOffset).getSecond();
            /*
             * recorded offset mods MUST be made with respect to ORIGINAL offsets -- not the current transformed
             * string.
             */
            Integer absoluteModOffset = computeOriginalOffset(modOffset);
            // two mods landing on the same original offset are combined by summing them
            // TODO: verify that it's OK to just keep the original edit type
            if (toAdd.containsKey(absoluteModOffset))
                currentMod += toAdd.get(absoluteModOffset).getFirst();
            toAdd.put(absoluteModOffset, new Pair<>(currentMod, currentEditType));
        }
        /*
         * The entries in toAdd *cannot* conflict, because they come from a single pass.
         * Now we need to merge them with previously recorded offset mods.
         */
        if (recordedOffsetModifications.isEmpty())
            recordedOffsetModifications.putAll(toAdd);
        else {
            TreeMap<Integer, Pair<Integer, EditType>> safeAdds = new TreeMap<>();
            // stores position of greatest of last key, or the last key's effective edit position
            int lastKeyPos = 0;
            for (int key : toAdd.keySet()) {
                int mod = toAdd.get(key).getFirst();
                EditType editType = toAdd.get(key).getSecond();
                if (key < lastKeyPos)
                    // move to after last entry key + edit
                    key = lastKeyPos;
                /*
                 * it gets a bit tricky if a new deletion overlaps older edits: you need to split up the new edit.
                 * TODO: merge edits instead
                 */
                for (int oldKey : recordedOffsetModifications.keySet()) {
                    if (mod == 0)
                        break; // this edit has been fully accounted for
                    // am I at the same index?
                    if (oldKey == key) {
                        // if edit is an expansion, still advance one position
                        // move on...
                        key = Math.max(key + 1, key - recordedOffsetModifications.get(oldKey).getFirst());
                    } else // am I within the window of a prior edit?
                    if (oldKey < key) {
                        int oldMod = recordedOffsetModifications.get(oldKey).getFirst();
                        // negative, to compare with negative mod
                        int diff = oldKey - key;
                        if (diff > oldMod) {
                            // edits interfere; can't happen if oldMod is positive (insertion)
                            // modifier doesn't change: edit not applied yet; update edit
                            key = oldKey - oldMod;
                            // position to just past old edit
                        }
                    } else if (oldKey > key) {
                        // Is next edit within window of my edit?
                        // negative, to compare with -ve mod
                        int diff = key - oldKey;
                        if (diff > mod) {
                            // if diff > mod, mod is negative and edits interfere:
                            // delete up to current edit
                            safeAdds.put(key, new Pair<>(diff, editType));
                            // part of modification not accounted for; again, recall both negative
                            mod = mod - diff;
                            // move to index after old edit
                            key = oldKey - recordedOffsetModifications.get(oldKey).getFirst();
                        } else {
                            // either mod is positive, or next edit does not interfere
                            safeAdds.put(key, new Pair<>(mod, editType));
                            // update if -ve mod
                            lastKeyPos = Math.max(key, key - mod);
                            // break from the loop
                            mod = 0;
                        }
                    }
                }
                if (mod != 0) // past all old edits, haven't added it yet...
                    safeAdds.put(key, new Pair<>(mod, editType));
            }
            recordedOffsetModifications.putAll(safeAdds);
        }
        /*
         * compute inverse mapping (from transformed text offsets to original offsets)
         * using the complete set of transformations to date: store as offset modifiers
         * at transform string indexes where changes occur, such that adding the offset modifier
         * to the current transform index yields the corresponding offset in the original string.
         */
        recordedInverseModifications.clear();
        /*
         * recordedOffsetModifications: at char index X, modify running offset modifier by Y
         */
        int cumulativeOffset = 0;
        for (Integer transformModIndex : recordedOffsetModifications.keySet()) {
            int baseIndex = transformModIndex;
            int transformMod = recordedOffsetModifications.get(transformModIndex).getFirst();
            EditType editType = recordedOffsetModifications.get(transformModIndex).getSecond();
            /*
             * suppose transform offset is 33, and modifier is -33 (delete the first 33 chars of the orig string).
             * Therefore we want index 0 of the transformed string to map to offset 33 of the orig string.
             * So we update the cumulative offset *after* adding the current mod.
             * Subsequent edits to orig string increase the total difference between the transformed string
             * base index and the corresponding orig string index, hence the need for cumulative offset to
             * be subtracted from the orig index. (mod is -ve, therefore subtraction even though it's added
             * to the offset from the perspective of the original string)
             */
            int effectiveIndex = baseIndex - cumulativeOffset;
            int effectiveMod = transformMod;
            // combine with an inverse mod already recorded at this transformed index
            if (recordedInverseModifications.containsKey(effectiveIndex))
                effectiveMod -= recordedInverseModifications.get(effectiveIndex).getFirst();
            // TODO: verify that using most recent transform type is correct if there was already an edit in RIM
            recordedInverseModifications.put(effectiveIndex, new Pair<>(-effectiveMod, editType));
            cumulativeOffset -= transformMod;
        }
        if (DEBUG) {
            // dump each transformed-index range alongside the original substring it maps to
            int lastIndex = 0;
            int lastOrigOffset = 0;
            for (int revInd : recordedInverseModifications.keySet()) {
                int diff = revInd - lastIndex;
                String origSub = origText.substring(lastOrigOffset, lastOrigOffset + diff);
                System.err.println(lastIndex + "-" + revInd + ": " + origSub);
                lastOrigOffset = lastOrigOffset + diff + recordedInverseModifications.get(revInd).getFirst();
                lastIndex = revInd;
            }
        }
        /*
         * cleanup: remove temporary state that has now been resolved
         */
        currentOffsetModifications.clear();
        edits.clear();
    }
}
Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the project cogcomp-nlp by CogComp: the method addParseTree of the class TreeView.
/**
 * Transforms an unscored input tree into the constituent-relation graph: each child of
 * {@code spanLabeledTree} becomes a constituent linked to {@code root} by a PARENT_OF
 * relation, and the method recurses into each subtree.
 */
protected void addParseTree(Tree<Pair<String, IntPair>> spanLabeledTree, Constituent root, int sentenceStartPosition) {
    for (int i = 0; i < spanLabeledTree.getNumberOfChildren(); i++) {
        Tree<Pair<String, IntPair>> subtree = spanLabeledTree.getChild(i);
        String edgeLabel = PARENT_OF_STRING;
        Pair<String, IntPair> subtreeLabel = subtree.getLabel();
        IntPair span = subtreeLabel.getSecond();
        // shift the sentence-relative span into absolute token positions
        int spanStart = span.getFirst() + sentenceStartPosition;
        int spanEnd = span.getSecond() + sentenceStartPosition;
        String constituentLabel = subtreeLabel.getFirst();
        if (spanStart >= spanEnd) {
            // Ignore constituents with incorrect span bounds
            logger.debug("Constituent with incorrect span found in " + root.getViewName());
        } else {
            Constituent node = createNewConstituent(spanStart, spanEnd, constituentLabel, 1.0);
            if (spanEnd == spanStart + 1 && subtree.getNumberOfChildren() == 0) {
                // a single-token childless node is a leaf: its label must match the sentence token
                String expected = treebankTokenHacks(this.getTextAnnotation().getToken(spanStart));
                String actual = treebankTokenHacks(constituentLabel);
                if (!actual.equals(expected)) {
                    assert false : "Expecting token: " + expected + ", found " + actual + " instead.";
                }
            }
            this.addConstituent(node);
            this.addRelation(new Relation(edgeLabel, root, node, 1.0));
            this.addParseTree(subtree, node, sentenceStartPosition);
        }
    }
}
Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the project cogcomp-nlp by CogComp: the method setParseTree of the class TreeView.
/**
 * Set the parse tree of the {@code sentenceId}<sup>th</sup> sentence.
 * <p>
 * <b>Note</b>: The same TreeView cannot contain both the parse tree and the dependency tree and
 * will throw an exception if an attempt is made to set the parse tree in a view that has a
 * dependency tree.
 */
public void setParseTree(int sentenceId, Tree<String> tree) {
    safeInitializeTrees();
    if (firstTree) {
        // the first tree added decides that this view holds phrase-structure trees
        firstTree = false;
        this.isDependencyTree = false;
    }
    if (this.isDependencyTree) {
        throw new IllegalArgumentException("Not expecting a dependency tree, but found " + tree);
    }
    this.trees.set(sentenceId, tree);
    int tokenOffset = getSentenceStart(sentenceId);
    // annotate every node with its sentence-relative token span, then build the root
    Tree<Pair<String, IntPair>> labeledTree = ParseUtils.getSpanLabeledTree(tree);
    Pair<String, IntPair> topLabel = labeledTree.getLabel();
    IntPair topSpan = topLabel.getSecond();
    Constituent treeRoot = createNewConstituent(topSpan.getFirst() + tokenOffset,
            topSpan.getSecond() + tokenOffset, topLabel.getFirst(), 1.0);
    this.addConstituent(treeRoot);
    addParseTree(labeledTree, treeRoot, tokenOffset);
}
Aggregations