Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in the cogcomp-nlp project by CogComp:
the PropbankFields class, method createPredicate.
/**
 * Builds the single-token predicate {@link Constituent} for this Propbank entry and attaches
 * its lemma, sense, inflection (form/tense/aspect/person/voice) and tagger attributes.
 *
 * @param ta the TextAnnotation the predicate belongs to
 * @param viewName the view in which the new constituent is created
 * @param yield the terminal nodes of the parse tree, each labeled with its token span
 * @return a one-token "Predicate" constituent carrying the Propbank attributes
 */
public Constituent createPredicate(TextAnnotation ta, String viewName, List<Tree<Pair<String, IntPair>>> yield) {
    Tree<Pair<String, IntPair>> predicateLeaf = yield.get(predicateTerminal);
    int tokenStart = predicateLeaf.getLabel().getSecond().getFirst();
    Constituent predicate = new Constituent("Predicate", viewName, ta, tokenStart, tokenStart + 1);
    predicate.addAttribute(PropbankReader.LemmaIdentifier, lemma);
    predicate.addAttribute(PropbankReader.SenseIdentifier, sense);
    // the five-character inflection field encodes form, tense, aspect, person and voice, in that order
    char form = inflection.charAt(0);
    char tense = inflection.charAt(1);
    char aspect = inflection.charAt(2);
    char person = inflection.charAt(3);
    char voice = inflection.charAt(4);
    predicate.addAttribute(PropbankReader.FormIdentifier, PropbankReader.Forms.getForm(form).name());
    predicate.addAttribute(PropbankReader.TenseIdentifier, PropbankReader.Tenses.getTense(tense).name());
    predicate.addAttribute(PropbankReader.AspectIdentifier, PropbankReader.Aspects.getAspect(aspect).name());
    predicate.addAttribute(PropbankReader.PersonIdentifier, PropbankReader.Person.getPerson(person).name());
    predicate.addAttribute(PropbankReader.VoiceIdentifier, PropbankReader.Voices.getVoice(voice).name());
    predicate.addAttribute(PropbankReader.Tagger, tagger);
    return predicate;
}
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in the cogcomp-nlp project by CogComp:
the XmlDocumentProcessor class, method updateAttributeInfo.
/**
 * Records a retained tag span, first translating its offsets from the transformed
 * text back into the original xml source text.
 *
 * @param attributesRetained accumulator for span info returned to the caller
 * @param tagName the xml tag the span belongs to
 * @param startOffset span start, in the current (transformed) text
 * @param endOffset span end, in the current (transformed) text
 * @param spanAtts map from attribute name to (value, value offsets) for this tag
 * @param xmlTextSt transformation used to map transformed offsets to original offsets
 */
private void updateAttributeInfo(List<SpanInfo> attributesRetained, String tagName, int startOffset, int endOffset, Map<String, Pair<String, IntPair>> spanAtts, StringTransformation xmlTextSt) {
    IntPair sourceOffsets = xmlTextSt.getOriginalOffsets(startOffset, endOffset);
    SpanInfo retainedSpan = new SpanInfo(tagName, sourceOffsets, spanAtts);
    attributesRetained.add(retainedSpan);
}
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in the cogcomp-nlp project by CogComp:
the XmlDocumentProcessor class, method processXml.
/**
 * Removes XML markup, for the most part. For specified tags that denote spans of text other than
 * body text (e.g. quotes, headlines), the text value and offsets are reported. For specified tags and
 * attributes, the attribute values and their offsets are reported. Content within <code>quote</code>
 * tags is left in place (though quote tags are removed) and the offsets are reported with the
 * other specified attributes.
 * This method has some facility for handling nested tags. Opens without closes are checked against
 * tags to ignore (provided at construction) and if found are ignored (deleted). Otherwise, an
 * exception is thrown.
 *
 * @param xmlText the original xml text
 * @return a pair of the StringTransformation (whose basis is the original xml text) and the
 *         retained span information, with offsets into the original xml source
 */
public Pair<StringTransformation, List<SpanInfo>> processXml(String xmlText) {
    StringTransformation xmlTextSt = new StringTransformation(xmlText);
    xmlTextSt = StringTransformationCleanup.normalizeToEncoding(xmlTextSt, Charset.forName("UTF-8"));
    // there are embedded xml tags in body text. Unescape them so we can process them easily.
    xmlTextSt = replaceXmlEscapedChars(xmlTextSt);
    xmlTextSt.applyPendingEdits();
    // singleton tags can be nested in deletable spans, creating major headaches: remove them first.
    xmlTextSt = deleteSingletons(xmlTextSt);
    String xmlCurrentStr = xmlTextSt.getTransformedText();
    // don't call getTransformedText() or applyPendingEdits() in the body of the loop using
    // xmlMatcher: the matcher's offsets are relative to the snapshot taken here.
    Matcher xmlMatcher = xmlTagPattern.matcher(xmlCurrentStr);
    // span offsets, label, attName, attVal, attOffsets
    List<SpanInfo> attributesRetained = new ArrayList<>();
    // track open/close tags, to record spans for later use (e.g. quoted blocks that aren't annotated).
    // Each entry retains tagname, open tag offsets, attributes. Note that open tag offsets are NOT
    // the same as the (complete span) offsets returned by this method.
    // IMPORTANT: offsets are computed from the modified xml string (initial normalization steps clean
    // up the original), so code must adjust them when storing offsets for return.
    Stack<SpanInfo> tagStack = new Stack<>();
    // track whether or not a tag is nested within something marked for deletion
    int deletableNestingLevel = 0;
    // match mark-up: xml open or close tag
    while (xmlMatcher.find()) {
        String substr = xmlMatcher.group(0);
        boolean isClose = false;
        if (substr.charAt(1) == '/') {
            isClose = true;
        } else if (substr.endsWith("/>") || substr.startsWith("<?xml")) {
            // this is an empty tag (or the xml declaration): just delete it
            xmlTextSt.transformString(xmlMatcher.start(0), xmlMatcher.end(0), "");
            continue;
        }
        String lcsubstr = substr.toLowerCase();
        // get the tag name
        Matcher tagMatcher = xmlTagNamePattern.matcher(lcsubstr);
        if (tagMatcher.find()) {
            // identify the tag
            String tagName = tagMatcher.group(1);
            if (isClose) {
                SpanInfo openTagAndAtts = tagStack.pop();
                // strip leading "/"
                tagName = tagName.substring(1);
                String openTagName = openTagAndAtts.label;
                // check for lone tags (open without close or vice versa)
                boolean isLoneClose = false;
                while (!openTagName.equals(tagName) && !isLoneClose) {
                    if (throwExceptionOnUnrecognizedTag)
                        throw new IllegalStateException("Mismatched open and close tags. Expected '" + openTagName + "', found '" + tagName + "'");
                    else {
                        // someone used xml special chars in body text
                        logger.warn("WARNING: found close tag '{}' after open tag '{}', and (obviously) they don't match.", tagName, openTagName);
                        if (!tagStack.isEmpty()) {
                            // if lone tag is a close tag, hope that the open stack is empty
                            openTagAndAtts = tagStack.peek();
                            openTagName = openTagAndAtts.label;
                            // BUGFIX: compare the open tag's *name* with the close tag name; the
                            // previous code compared the SpanInfo object itself against a String,
                            // which can never be equal, so the match/pop branch was unreachable.
                            if (!openTagName.equals(tagName))
                                isLoneClose = true;
                            else
                                // it matched, so we're good now
                                openTagAndAtts = tagStack.pop();
                        } else {
                            // unmatched lone close
                            isLoneClose = true;
                        }
                    }
                }
                if (isLoneClose) {
                    // revert to previous state, and resume parsing
                    tagStack.push(openTagAndAtts);
                } else {
                    // now we have open tag and matching close tag; record span and label
                    IntPair startTagOffsets = openTagAndAtts.spanOffsets;
                    Map<String, Pair<String, IntPair>> spanAtts = openTagAndAtts.attributes;
                    int startTagStart = startTagOffsets.getFirst();
                    int startTagEnd = startTagOffsets.getSecond();
                    int endTagStart = xmlMatcher.start();
                    int endTagEnd = xmlMatcher.end();
                    // record the span *between* the tags, mapped back to original xml offsets
                    updateAttributeInfo(attributesRetained, tagName, startTagEnd, endTagStart, spanAtts, xmlTextSt);
                    boolean isDeletable = false;
                    if (deletableSpanTags.contains(tagName)) {
                        // deletable span
                        isDeletable = true;
                        deletableNestingLevel--;
                    }
                    /*
                     * Only edit the text when we are NOT nested inside another deletable tag:
                     * the enclosing deletable span will subsume this one, and deleting here
                     * as well would create overlapping-edit problems.
                     */
                    if (deletableNestingLevel == 0) {
                        if (isDeletable)
                            // delete the whole span, tags included
                            xmlTextSt.transformString(startTagStart, endTagEnd, "");
                        else {
                            // we should retain text between open and close, but delete the tags
                            xmlTextSt.transformString(startTagStart, startTagEnd, "");
                            xmlTextSt.transformString(endTagStart, endTagEnd, "");
                        }
                    }
                }
            } else {
                // tag must be open
                IntPair tagSpan = new IntPair(xmlMatcher.start(), xmlMatcher.end());
                Map<String, Pair<String, IntPair>> spanAtts = new HashMap<>();
                tagStack.push(new SpanInfo(tagName, tagSpan, spanAtts));
                if (deletableSpanTags.contains(tagName))
                    deletableNestingLevel++;
                // within an xml open tag: identify any attribute values we need to retain.
                if (tagsWithAtts.containsKey(tagName)) {
                    Set<String> attributeNames = tagsWithAtts.get(tagName);
                    // parse the substring beyond the tag name.
                    lcsubstr = lcsubstr.substring(tagMatcher.end());
                    substr = substr.substring(tagMatcher.end());
                    Matcher attrMatcher = tagAttributePattern.matcher(lcsubstr);
                    while (attrMatcher.find()) {
                        String attrName = attrMatcher.group(1);
                        // read the value from the original-case substring to avoid lowercasing attribute values
                        String attrVal = substr.substring(attrMatcher.start(2), attrMatcher.end(2));
                        if (attributeNames.contains(attrName)) {
                            // substring starts at index of start of (open) xml tag + length of tag name + left angle bracket
                            // note that we are using a transformed text, so need original offsets
                            int attrValOffset = tagMatcher.end() + xmlMatcher.start();
                            int attrValStart = attrMatcher.start(2) + attrValOffset;
                            int attrValEnd = attrMatcher.end(2) + attrValOffset;
                            // use adjusted offsets to get char offsets in original xml source text
                            IntPair attrValSpan = xmlTextSt.getOriginalOffsets(attrValStart, attrValEnd);
                            spanAtts.put(attrName, new Pair<>(attrVal, attrValSpan));
                        }
                    }
                    // we now have an open tag name, its offsets, and any retained attributes on the tag stack
                }
            }
        }
    }
    xmlTextSt = cleanupWhitespace(xmlTextSt);
    return new Pair<>(xmlTextSt, attributesRetained);
}
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in the cogcomp-nlp project by CogComp:
the StringTransformation class, method applyPendingEdits.
/**
 * Applies any pending edits to produce the updated transformed string, then folds the
 * pending offset modifications into the recorded offset maps and rebuilds the inverse
 * mapping (transformed-text offsets back to original-text offsets).
 * No-op when nothing is pending (isModified is false).
 */
public void applyPendingEdits() {
    String currentStr = transformedText;
    if (isModified) {
        /*
         * immediately set flag, as we may call other methods that check this condition, which could
         * call this method
         */
        isModified = false;
        /*
         * it's OK for edits to be unsorted: all edit offsets are computed relative to the previous edits
         * in the sequence
         */
        for (Edit edit : edits) {
            IntPair editOffsets = edit.offsets;
            // splice edit.newString in place of the [first, second) span of the current string
            String before = currentStr.substring(0, editOffsets.getFirst());
            String after = currentStr.substring(editOffsets.getSecond());
            currentStr = before + edit.newString + after;
        }
        transformedText = currentStr;
        /*
         * store pending recorded offsets while computing absolute offsets for all current edits
         */
        Map<Integer, Pair<Integer, EditType>> toAdd = new TreeMap();
        for (Integer modOffset : currentOffsetModifications.keySet()) {
            Integer currentMod = currentOffsetModifications.get(modOffset).getFirst();
            EditType currentEditType = currentOffsetModifications.get(modOffset).getSecond();
            /*
             * recorded offset mods MUST be made with respect to ORIGINAL offsets -- not the current
             * transformed string.
             */
            Integer absoluteModOffset = computeOriginalOffset(modOffset);
            // TODO: verify that it's OK to just keep the original edit type
            // if two pending mods land on the same original offset, sum their deltas
            if (toAdd.containsKey(absoluteModOffset))
                currentMod += toAdd.get(absoluteModOffset).getFirst();
            toAdd.put(absoluteModOffset, new Pair<>(currentMod, currentEditType));
        }
        /*
         * The entries in toAdd *cannot* conflict, because they come from a single pass.
         * Now we need to merge them with previously recorded offset mods.
         */
        if (recordedOffsetModifications.isEmpty())
            recordedOffsetModifications.putAll(toAdd);
        else {
            TreeMap<Integer, Pair<Integer, EditType>> safeAdds = new TreeMap<>();
            // stores position of greatest of last key, or the last key's effective edit position
            int lastKeyPos = 0;
            for (int key : toAdd.keySet()) {
                int mod = toAdd.get(key).getFirst();
                EditType editType = toAdd.get(key).getSecond();
                if (key < lastKeyPos)
                    // move to after last entry key + edit
                    key = lastKeyPos;
                /*
                 * it gets a bit tricky if a new deletion overlaps older edits: you need to split up
                 * the new edit.
                 * TODO: merge edits instead
                 */
                for (int oldKey : recordedOffsetModifications.keySet()) {
                    // mod == 0 is the sentinel meaning this edit is fully accounted for
                    if (mod == 0)
                        break;
                    // am I at the same index?
                    if (oldKey == key) {
                        // if edit is an expansion, still advance one position
                        // move on...
                        key = Math.max(key + 1, key - recordedOffsetModifications.get(oldKey).getFirst());
                    } else // am I within the window of a prior edit?
                    if (oldKey < key) {
                        int oldMod = recordedOffsetModifications.get(oldKey).getFirst();
                        // negative, to compare with negative mod
                        int diff = oldKey - key;
                        if (diff > oldMod) {
                            // edits interfere; can't happen if oldMod is positive (insertion)
                            // modifier doesn't change: edit not applied yet; update edit
                            key = oldKey - oldMod;
                            // position to just past old edit
                        }
                    } else if (oldKey > key) {
                        // Is next edit within window of my edit?
                        // negative, to compare with -ve mod
                        int diff = key - oldKey;
                        if (diff > mod) {
                            // if diff > mod, mod is negative and edits interfere.
                            // delete up to current edit
                            safeAdds.put(key, new Pair<>(diff, editType));
                            // part of modification not accounted for; again, recall both negative
                            mod = mod - diff;
                            // move to index after old edit
                            key = oldKey - recordedOffsetModifications.get(oldKey).getFirst();
                        } else {
                            // either mod is positive, or next edit does not interfere
                            safeAdds.put(key, new Pair<>(mod, editType));
                            // update if -ve mod
                            lastKeyPos = Math.max(key, key - mod);
                            // break from the loop (via the mod == 0 sentinel check above)
                            mod = 0;
                        }
                    }
                }
                if (// past all old edits, haven't added it yet...
                mod != 0)
                    safeAdds.put(key, new Pair<>(mod, editType));
            }
            recordedOffsetModifications.putAll(safeAdds);
        }
        /*
         * compute inverse mapping (from transformed text offsets to original offsets)
         * using the complete set of transformations to date: store as offset modifiers
         * at transform string indexes where changes occur, such that adding the offset modifier
         * to the current transform index yields the corresponding offset in the original string.
         */
        recordedInverseModifications.clear();
        /*
         * recordedOffsetModifications: at char index X, modify running offset modifier by Y
         */
        int cumulativeOffset = 0;
        for (Integer transformModIndex : recordedOffsetModifications.keySet()) {
            int baseIndex = transformModIndex;
            int transformMod = recordedOffsetModifications.get(transformModIndex).getFirst();
            EditType editType = recordedOffsetModifications.get(transformModIndex).getSecond();
            /*
             * suppose transform offset is 33, and modifier is -33 (delete the first 33 chars of the
             * orig string). Therefore we want index 0 of the transformed string to map to offset 33
             * of the orig string. So we update the cumulative offset *after* adding the current mod.
             * Subsequent edits to orig string increase the total difference between the transformed
             * string base index and the corresponding orig string index, hence the need for the
             * cumulative offset to be subtracted from the orig index. (mod is -ve, therefore
             * subtraction even though it's added to the offset from the perspective of the original
             * string.)
             */
            int effectiveIndex = baseIndex - cumulativeOffset;
            int effectiveMod = transformMod;
            // merge with any inverse modifier already recorded at this transformed-string index
            if (recordedInverseModifications.containsKey(effectiveIndex))
                effectiveMod -= recordedInverseModifications.get(effectiveIndex).getFirst();
            // TODO: verify that using most recent transform type is correct if there was already an edit in RIM
            recordedInverseModifications.put(effectiveIndex, new Pair<>(-effectiveMod, editType));
            cumulativeOffset -= transformMod;
        }
        if (DEBUG) {
            // dump the original-text segment implied by each inverse-map entry, for inspection
            int lastIndex = 0;
            int lastOrigOffset = 0;
            for (int revInd : recordedInverseModifications.keySet()) {
                int diff = revInd - lastIndex;
                String origSub = origText.substring(lastOrigOffset, lastOrigOffset + diff);
                System.err.println(lastIndex + "-" + revInd + ": " + origSub);
                lastOrigOffset = lastOrigOffset + diff + recordedInverseModifications.get(revInd).getFirst();
                lastIndex = revInd;
            }
        }
        /*
         * cleanup: remove temporary state that has now been resolved
         */
        currentOffsetModifications.clear();
        edits.clear();
    }
}
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in the cogcomp-nlp project by CogComp:
the TokenUtils class, method getTokenOffsets.
/**
 * Takes a string and its tokenized form, and returns an array of character-span index pairs,
 * one per token, by walking the sentence and the token list in lockstep.
 *
 * @param sentence raw input text
 * @param tokens the tokenized form of the sentence
 * @return array of span index pairs, parallel to {@code tokens}
 */
public static IntPair[] getTokenOffsets(String sentence, String[] tokens) {
    List<IntPair> spans = new ArrayList<>();
    int pos = 0;
    int tokenIndex = 0;
    int matchedChars = 0; // characters of tokens[tokenIndex] matched so far
    // skip any leading whitespace so the first token starts after it
    while (pos < sentence.length() && Character.isWhitespace(sentence.charAt(pos)))
        pos++;
    int tokenStart = pos;
    while (pos < sentence.length()) {
        if (matchedChars == tokens[tokenIndex].length()) {
            // current token fully matched: record its span, then skip inter-token whitespace
            spans.add(new IntPair(tokenStart, pos));
            while (pos < sentence.length() && Character.isWhitespace(sentence.charAt(pos)))
                pos++;
            tokenStart = pos;
            matchedChars = 0;
            tokenIndex++;
        } else {
            // sentence and token characters must agree position by position
            assert sentence.charAt(pos) == tokens[tokenIndex].charAt(matchedChars) : sentence.charAt(pos) + " expected, found " + tokens[tokenIndex].charAt(matchedChars) + " instead in sentence: " + sentence;
            matchedChars++;
            pos++;
        }
    }
    // the final token ends at end-of-string, so the loop above never records it; do so here
    if (pos == sentence.length() && spans.size() == tokens.length - 1) {
        spans.add(new IntPair(tokenStart, sentence.length()));
    }
    assert spans.size() == tokens.length : spans;
    return spans.toArray(new IntPair[spans.size()]);
}
End of aggregated IntPair usage examples.