use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.
the class XmlDocumentProcessor method processXml.
/**
 * Removes XML markup from the given text, for the most part. For specified tags that denote spans of
 * text other than body text (e.g. quotes, headlines), the text value and offsets are reported. For
 * specified tags and attributes, the attribute values and their offsets are reported. Content within
 * <code>quote</code> tags is left in place (though quote tags are removed) and the offsets are
 * reported with the other specified attributes.
 * <p>
 * There is some facility for handling nested tags and unbalanced markup. When a close tag does not
 * match the most recent open tag (or there is no open tag at all), the behavior depends on
 * {@code throwExceptionOnUnrecognizedTag}: if set, an exception is thrown; otherwise a warning is
 * logged, the close tag is left in the text, and parsing resumes with the open-tag stack unchanged.
 * The one exception is a close tag that matches the open tag directly enclosing the most recent
 * one: the unclosed inner open tag is then dropped and the enclosing span is processed normally.
 *
 * @param xmlText the original xml text.
 * @return a Pair whose first element is the StringTransformation (based on the original xml text)
 *         holding the cleaned-up text, and whose second element is the list of SpanInfo populated by
 *         updateAttributeInfo for each matched open/close tag pair.
 * @throws IllegalStateException if {@code throwExceptionOnUnrecognizedTag} is set and a close tag
 *         does not match the most recent open tag, or has no open tag at all.
 */
public Pair<StringTransformation, List<SpanInfo>> processXml(String xmlText) {
    StringTransformation xmlTextSt = new StringTransformation(xmlText);
    xmlTextSt = StringTransformationCleanup.normalizeToEncoding(xmlTextSt, Charset.forName("UTF-8"));
    // there are embedded xml tags in body text. Unescape them so we can process them easily.
    xmlTextSt = replaceXmlEscapedChars(xmlTextSt);
    xmlTextSt.applyPendingEdits();
    // singletons can be nested in deletable spans, creating major headaches.
    xmlTextSt = deleteSingletons(xmlTextSt);

    String xmlCurrentStr = xmlTextSt.getTransformedText();
    // don't call getTransformedText() or applyPendingEdits() in the body of the loop using xmlMatcher:
    // the matcher offsets refer to xmlCurrentStr as it is right now.
    Matcher xmlMatcher = xmlTagPattern.matcher(xmlCurrentStr);

    // span offsets, label, attName, attVal, attOffsets
    List<SpanInfo> attributesRetained = new ArrayList<>();

    // track open/close tags, to record spans for later use (e.g. quoted blocks that aren't annotated)
    // each entry retains tagname, open tag offsets, attributes
    // note that open tag offsets are NOT the same as the (complete span) offsets returned by this method
    // IMPORTANT: offsets are computed from modified xml string (initial normalization steps clean up original)
    // so code must adjust them for storing offsets for return.
    Stack<SpanInfo> tagStack = new Stack<>();

    // track whether or not a tag is nested within something marked for deletion
    int deletableNestingLevel = 0;

    // match mark-up: xml open or close tag
    while (xmlMatcher.find()) {
        String substr = xmlMatcher.group(0);
        boolean isClose = false;
        if (substr.charAt(1) == '/') {
            isClose = true;
        } else if (substr.endsWith("/>") || substr.startsWith("<?xml")) {
            // this is an empty tag (or the xml declaration): just remove it
            xmlTextSt.transformString(xmlMatcher.start(0), xmlMatcher.end(0), "");
            continue;
        }
        String lcsubstr = substr.toLowerCase();

        // get the tag name
        Matcher tagMatcher = xmlTagNamePattern.matcher(lcsubstr);
        if (!tagMatcher.find())
            continue;

        // identify the tag
        String tagName = tagMatcher.group(1);

        if (isClose) {
            // strip leading "/"
            tagName = tagName.substring(1);

            if (tagStack.isEmpty()) {
                // close tag with no open tag at all: popping here would raise EmptyStackException
                if (throwExceptionOnUnrecognizedTag)
                    throw new IllegalStateException("Mismatched open and close tags. Found close tag '" + tagName + "' but no open tag.");
                logger.warn("WARNING: found close tag '{}' but there is no open tag.", tagName);
                continue;
            }

            SpanInfo openTagAndAtts = tagStack.pop();
            String openTagName = openTagAndAtts.label;

            // check for lone tags (open without close or vice versa )
            boolean isLoneClose = false;
            if (!openTagName.equals(tagName)) {
                if (throwExceptionOnUnrecognizedTag)
                    throw new IllegalStateException("Mismatched open and close tags. Expected '" + openTagAndAtts + "', found '" + tagName + "'");

                // someone used xml special chars in body text
                logger.warn("WARNING: found close tag '{}' after open tag '{}', and (obviously) they don't match.", tagName, openTagName);

                // compare tag NAMES (the label), not the SpanInfo object, with the close tag name
                if (!tagStack.isEmpty() && tagStack.peek().label.equals(tagName)) {
                    // the open tag we popped was never closed; drop it. If it was counted as a
                    // deletable open, undo that so the nesting level stays balanced.
                    if (deletableSpanTags.contains(openTagName))
                        deletableNestingLevel--;
                    // the enclosing open tag matches, so we're good now
                    openTagAndAtts = tagStack.pop();
                } else {
                    // unmatched lone close
                    isLoneClose = true;
                }
            }

            if (isLoneClose) {
                // revert to previous state (put back the tag we popped), and resume parsing
                tagStack.push(openTagAndAtts);
            } else {
                // now we have open tag and matching close tag; record span and label
                IntPair startTagOffsets = openTagAndAtts.spanOffsets;
                Map<String, Pair<String, IntPair>> spanAtts = openTagAndAtts.attributes;
                int startTagStart = startTagOffsets.getFirst();
                int startTagEnd = startTagOffsets.getSecond();
                int endTagStart = xmlMatcher.start();
                int endTagEnd = xmlMatcher.end();
                updateAttributeInfo(attributesRetained, tagName, startTagEnd, endTagStart, spanAtts, xmlTextSt);

                boolean isDeletable = false;
                if (deletableSpanTags.contains(tagName)) {
                    // deletable span
                    isDeletable = true;
                    deletableNestingLevel--;
                }
                /*
                 * if we are within another deletable tag
                 *     DON'T DELETE or it will create problems.
                 * else
                 *     deletable span: delete the whole span
                 *     otherwise: delete open and close tags only.
                 */
                if (deletableNestingLevel == 0) {
                    if (isDeletable)
                        xmlTextSt.transformString(startTagStart, endTagEnd, "");
                    else {
                        // we should retain text between open and close, but delete the tags
                        xmlTextSt.transformString(startTagStart, startTagEnd, "");
                        xmlTextSt.transformString(endTagStart, endTagEnd, "");
                    }
                }
            }
        } else {
            // tag must be open
            IntPair tagSpan = new IntPair(xmlMatcher.start(), xmlMatcher.end());
            Map<String, Pair<String, IntPair>> spanAtts = new HashMap<>();
            tagStack.push(new SpanInfo(tagName, tagSpan, spanAtts));
            if (deletableSpanTags.contains(tagName))
                deletableNestingLevel++;

            // within an xml open tag: identify any attribute values we need to retain.
            if (tagsWithAtts.containsKey(tagName)) {
                Set<String> attributeNames = tagsWithAtts.get(tagName);
                // parse the substring beyond the tag name.
                lcsubstr = lcsubstr.substring(tagMatcher.end());
                substr = substr.substring(tagMatcher.end());
                Matcher attrMatcher = tagAttributePattern.matcher(lcsubstr);
                while (attrMatcher.find()) {
                    String attrName = attrMatcher.group(1);
                    // avoid lowercasing attribute values: read them from the original-case substring
                    String attrVal = substr.substring(attrMatcher.start(2), attrMatcher.end(2));
                    if (attributeNames.contains(attrName)) {
                        // substring starts at index of start of (open) xml tag + length of tag name + left angle bracket
                        // note that we are using a transformed text, so need original offsets
                        int attrValOffset = tagMatcher.end() + xmlMatcher.start();
                        int attrValStart = attrMatcher.start(2) + attrValOffset;
                        int attrValEnd = attrMatcher.end(2) + attrValOffset;
                        // use adjusted offsets to get char offsets in original xml source text
                        IntPair attrValSpan = xmlTextSt.getOriginalOffsets(attrValStart, attrValEnd);
                        spanAtts.put(attrName, new Pair<>(attrVal, attrValSpan));
                    }
                }
                // we now have an open tag name, its offsets, and any retained attributes on the tag stack
            }
        }
    }
    xmlTextSt = cleanupWhitespace(xmlTextSt);
    return new Pair<>(xmlTextSt, attributesRetained);
}
use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.
the class ParseUtils method getTokenIndexedParseTreeNodeCovering.
/**
 * Finds the node of the sentence parse tree that covers the given constituent and returns it with
 * every node labeled by a (label, span) pair.
 * <p>
 * The covering node is looked up with spans relative to the start of the constituent's sentence;
 * the spans in the returned tree are shifted back by the sentence start span.
 *
 * @param parseViewName name of the parse view passed to getParseTree
 * @param c the constituent whose start/end spans should be covered
 * @return the covering tree, with each span offset by the sentence's start span
 */
public static Tree<Pair<String, IntPair>> getTokenIndexedParseTreeNodeCovering(String parseViewName, Constituent c) {
    TextAnnotation textAnnotation = c.getTextAnnotation();
    int sentId = textAnnotation.getSentenceId(c);
    Tree<String> sentenceTree = getParseTree(parseViewName, textAnnotation, sentId);

    // offset of the sentence; the lookup below works on sentence-relative spans
    final int sentenceOffset = textAnnotation.getSentence(sentId).getStartSpan();
    int relativeStart = c.getStartSpan() - sentenceOffset;
    int relativeEnd = c.getEndSpan() - sentenceOffset;

    Tree<Pair<String, IntPair>> coveringTree = getTokenIndexedTreeCovering(sentenceTree, relativeStart, relativeEnd);

    // shift every node's span back by the sentence offset
    return Mappers.mapTree(coveringTree, new ITransformer<Tree<Pair<String, IntPair>>, Pair<String, IntPair>>() {
        @Override
        public Pair<String, IntPair> transform(Tree<Pair<String, IntPair>> input) {
            Pair<String, IntPair> nodeLabel = input.getLabel();
            IntPair relativeSpan = nodeLabel.getSecond();
            int shiftedStart = relativeSpan.getFirst() + sentenceOffset;
            int shiftedEnd = relativeSpan.getSecond() + sentenceOffset;
            return new Pair<>(nodeLabel.getFirst(), new IntPair(shiftedStart, shiftedEnd));
        }
    });
}
use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.
the class ParseUtils method getSpanLabeledTree.
/**
 * Recursively rebuilds the parse tree so that every node carries its label together with the span
 * of leaf ids it covers.
 * <p>
 * A leaf whose parent label satisfies ParseTreeProperties.isNullLabel gets an empty span
 * (currentLeafId, currentLeafId) and does not advance the leaf counter; any other leaf gets
 * (currentLeafId, currentLeafId + 1) and advances it by one. An internal node spans from the leaf
 * id at entry to the leaf id after its last child.
 *
 * @param parseTree the (sub)tree to convert
 * @param currentLeafId the id to assign to the next leaf
 * @return the span-labeled tree and the leaf id to use after this subtree
 */
private static Pair<Tree<Pair<String, IntPair>>, Integer> getSpanLabeledTree(Tree<String> parseTree, int currentLeafId) {
    if (!parseTree.isLeaf()) {
        final int firstLeafId = currentLeafId;
        int nextLeafId = currentLeafId;

        // convert the children first: the node's own span end is only known afterwards
        List<Tree<Pair<String, IntPair>>> convertedChildren = new ArrayList<>();
        for (Tree<String> subtree : parseTree.getChildren()) {
            Pair<Tree<Pair<String, IntPair>>, Integer> converted = getSpanLabeledTree(subtree, nextLeafId);
            nextLeafId = converted.getSecond();
            convertedChildren.add(converted.getFirst());
        }

        Pair<String, IntPair> nodeLabel = new Pair<>(parseTree.getLabel(), new IntPair(firstLeafId, nextLeafId));
        Tree<Pair<String, IntPair>> node = new Tree<>(nodeLabel);
        for (Tree<Pair<String, IntPair>> convertedChild : convertedChildren) {
            node.addSubtree(convertedChild);
        }
        return new Pair<>(node, nextLeafId);
    }

    // leaf: a leaf under a null-labeled parent covers no leaf id
    boolean parentHasNullLabel = ParseTreeProperties.isNullLabel(parseTree.getParent().getLabel());
    int leafIdAfter = parentHasNullLabel ? currentLeafId : currentLeafId + 1;
    IntPair leafSpan = new IntPair(currentLeafId, leafIdAfter);
    Pair<String, IntPair> leafLabel = new Pair<>(parseTree.getLabel(), leafSpan);
    Tree<Pair<String, IntPair>> leaf = new Tree<>(leafLabel);
    return new Pair<>(leaf, leafIdAfter);
}
use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.
the class EREEventReader method getArguments.
/**
 * Collects the arguments of an event mention. Arguments are expected to be mention nodes already
 * created by {@link ERENerReader}.
 * <p>
 * Each argument element is resolved to a Constituent via its entity mention id attribute or, when
 * that is absent, its filler id attribute. Arguments that carry neither attribute, or whose id has
 * no known Constituent, are logged as errors and skipped.
 *
 * @param eventMentionNode an xml node representing an event mention
 * @return a list of pairs, one per resolved argument; the first element is a three-element list
 *         (role, realis, origin -- origin defaults to "ldc" when the attribute is absent) and the
 *         second is the corresponding argument Constituent.
 */
private List<Pair<List<String>, Constituent>> getArguments(Node eventMentionNode) {
    /*
    <em_arg entity_id="ent-m.0d04z6" entity_mention_id="m-56bd16d7_2_75" role="giver" realis="true">cuba</em_arg>
    <em_arg entity_id="ent-m.09c7w0" entity_mention_id="m-56bd16d7_2_135" role="recipient" realis="true">US</em_arg>
    <!-- from Event Argument Linking augmented corpus: -->
    <em_arg entity_id="ent-243" entity_mention_id="m-237" role="victim" realis="true" origin="ldc">Four deaths</em_arg>
    */
    NodeList nl = ((Element) eventMentionNode).getElementsByTagName(EVENT_ARGUMENT);
    List<Pair<List<String>, Constituent>> arguments = new ArrayList<>();
    for (int argIndex = 0; argIndex < nl.getLength(); ++argIndex) {
        numEventRolesInSource++;
        Node argNode = nl.item(argIndex);
        NamedNodeMap nnMap = argNode.getAttributes();

        // the argument is identified either by an entity mention id or by a filler id
        Node att = nnMap.getNamedItem(ENTITY_MENTION_ID);
        if (null == att)
            att = nnMap.getNamedItem(FILLER_ID);
        if (null == att) {
            // getNamedItem returns null for a missing attribute; don't dereference it
            logger.error("Event argument has neither '{}' nor '{}' attribute; skipping it.", ENTITY_MENTION_ID, FILLER_ID);
            continue;
        }

        String entityMentionId = att.getNodeValue();
        Constituent ac = getMentionConstituent(entityMentionId);
        if (null == ac) {
            logger.error("Could not find mention Constituent for mentionId '{}'", entityMentionId);
            continue;
        }

        numEventRolesGenerated++;
        String role = nnMap.getNamedItem(ROLE).getNodeValue();
        String realis = nnMap.getNamedItem(REALIS).getNodeValue();
        // origin is optional in the source; fall back to "ldc"
        Node originAtt = nnMap.getNamedItem(ORIGIN);
        String origin = (null != originAtt) ? originAtt.getNodeValue() : "ldc";

        List<String> argAtts = new LinkedList<>();
        argAtts.add(role);
        argAtts.add(realis);
        argAtts.add(origin);
        arguments.add(new Pair<>(argAtts, ac));
    }
    return arguments;
}
use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.
the class ERENerReader method findAndUpdateMentionInfo.
/**
 * Looks up the SpanInfo registered for the given offsets and, if found, attaches the mention
 * attributes to it.
 * <p>
 * Mention length should be the same, but in at least one file many offsets are shifted by +1, so
 * when there is no entry for the exact offsets the lookup is retried with both offsets shifted by
 * -1 and then by +1. The attribute values are always stored with the offsets that were passed in
 * (origOffsets), even when the entry was found under shifted offsets.
 *
 * @param origOffsets offsets of the mention to look up
 * @param nounType value stored under NOUN_TYPE
 * @param label value stored under TYPE; skipped when null
 * @param eId value stored under ENTITY_ID
 * @param mId value stored under ENTITY_MENTION_ID
 * @param specificity value stored under SPECIFICITY
 * @return the updated SpanInfo, or null if none was found for the exact or shifted offsets
 */
private XmlDocumentProcessor.SpanInfo findAndUpdateMentionInfo(IntPair origOffsets, String nounType, String label, String eId, String mId, String specificity) {
    XmlDocumentProcessor.SpanInfo mentionInfo = offsetToSpanInfo.get(origOffsets);
    if (null == mentionInfo)
        mentionInfo = offsetToSpanInfo.get(new IntPair(origOffsets.getFirst() - 1, origOffsets.getSecond() - 1));
    if (null == mentionInfo)
        mentionInfo = offsetToSpanInfo.get(new IntPair(origOffsets.getFirst() + 1, origOffsets.getSecond() + 1));
    if (null == mentionInfo)
        return null;

    mentionInfo.attributes.put(ENTITY_ID, new Pair<>(eId, origOffsets));
    mentionInfo.attributes.put(ENTITY_MENTION_ID, new Pair<>(mId, origOffsets));
    mentionInfo.attributes.put(SPECIFICITY, new Pair<>(specificity, origOffsets));
    mentionInfo.attributes.put(NOUN_TYPE, new Pair<>(nounType, origOffsets));
    // we only get here when the mention was found, so the flag is always "true"
    mentionInfo.attributes.put(IS_FOUND, new Pair<>(Boolean.toString(true), origOffsets));
    if (null != label)
        mentionInfo.attributes.put(TYPE, new Pair<>(label, origOffsets));
    return mentionInfo;
}
Aggregations