use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class EREEventReader method createTrigger.
/**
 * Builds the trigger constituent for one event mention.
 *
 * Expected markup, e.g.:
 * {@code <trigger source="ENG_DF_001241_20150407_F0000007T" offset="179" length="5">trade</trigger>}
 *
 * @param eventMentionNode event mention element that must contain exactly one trigger child element
 * @param xmlTa XmlTextAnnotation holding the source xml and the corresponding TextAnnotation
 * @param eventId id of the enclosing event; used in error messages and stored on the constituent
 * @return the trigger constituent in the event view, or null when {@code getTokenOffsets} returns
 *         null or the pair (-1, -1) (the latter is handled by the next layer up)
 * @throws XMLException declared by the signature; presumably raised while reading the trigger text
 * @throws IllegalStateException if the mention has no trigger element or more than one
 */
private Constituent createTrigger(Node eventMentionNode, XmlTextAnnotation xmlTa, String eventId) throws XMLException {
    NodeList triggerNodes = ((Element) eventMentionNode).getElementsByTagName(TRIGGER);
    int triggerCount = triggerNodes.getLength();
    if (triggerCount == 0) {
        throw new IllegalStateException("Event " + eventId + " has no trigger element.");
    }
    if (triggerCount > 1) {
        throw new IllegalStateException("Event " + eventId + " has multiple trigger elements.");
    }

    // Exactly one trigger element remains at this point.
    Node triggerNode = triggerNodes.item(0);
    String surfaceForm = SimpleXMLParser.getContentString((Element) triggerNode);
    NamedNodeMap attrs = triggerNode.getAttributes();
    String source = attrs.getNamedItem(SOURCE).getNodeValue();
    int charStart = Integer.parseInt(attrs.getNamedItem(OFFSET).getNodeValue());
    int charLength = Integer.parseInt(attrs.getNamedItem(LENGTH).getNodeValue());

    IntPair tokenSpan = getTokenOffsets(charStart, charStart + charLength, surfaceForm, xmlTa);
    if (null == tokenSpan) {
        return null;
    }
    boolean mapsToDeletedSpan = (-1 == tokenSpan.getFirst()) && (-1 == tokenSpan.getSecond());
    if (mapsToDeletedSpan) {
        // handled by next layer up, which records the info separately
        return null;
    }

    // The second element of the returned pair is incremented to form the constituent end index.
    Constituent trigger = new Constituent(TRIGGER, getEventViewName(), xmlTa.getTextAnnotation(),
            tokenSpan.getFirst(), tokenSpan.getSecond() + 1);
    trigger.addAttribute(EventIdAttribute, eventId);
    trigger.addAttribute(SOURCE, source);
    return trigger;
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class ERENerReader method findAndUpdateMentionInfo.
/**
 * Looks up the span info recorded for a mention's original character offsets and, when found,
 * stores the mention's entity information in that span's attributes.
 *
 * Mention length should be the same, but in at least one file many offsets are shifted by +1;
 * a -1 shift is allowed as well. Lookup order is: exact offsets, then both offsets - 1, then
 * both offsets + 1.
 *
 * @param origOffsets character offsets of the mention in the source xml
 * @param nounType value stored under NOUN_TYPE
 * @param label value stored under TYPE; skipped when null
 * @param eId value stored under ENTITY_ID
 * @param mId value stored under ENTITY_MENTION_ID
 * @param specificity value stored under SPECIFICITY
 * @return the updated span info, or null if none of the three offset pairs is known
 */
private XmlDocumentProcessor.SpanInfo findAndUpdateMentionInfo(IntPair origOffsets, String nounType, String label, String eId, String mId, String specificity) {
    XmlDocumentProcessor.SpanInfo match = offsetToSpanInfo.get(origOffsets);

    // Fall back to the -1 shift first, then the +1 shift, stopping at the first hit.
    for (int shift = -1; null == match && shift <= 1; shift += 2) {
        IntPair shifted = new IntPair(origOffsets.getFirst() + shift, origOffsets.getSecond() + shift);
        match = offsetToSpanInfo.get(shifted);
    }

    if (null == match) {
        return null;
    }

    // Each attribute value is paired with the offsets given in the markup, not the shifted ones.
    match.attributes.put(ENTITY_ID, new Pair(eId, origOffsets));
    match.attributes.put(ENTITY_MENTION_ID, new Pair(mId, origOffsets));
    match.attributes.put(SPECIFICITY, new Pair(specificity, origOffsets));
    match.attributes.put(NOUN_TYPE, new Pair(nounType, origOffsets));
    // A span was located, so the flag is always "true" here.
    match.attributes.put(IS_FOUND, new Pair(Boolean.toString(true), origOffsets));
    if (null != label) {
        match.attributes.put(TYPE, new Pair(label, origOffsets));
    }
    return match;
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class ERENerReader method getTokenOffsets.
/**
 * find the token offsets in the TextAnnotation that correspond to the source character offsets for the given
 * mention
 *
 * NOTE(review): an IllegalArgumentException from the token index lookup is logged and then terminates the
 * JVM via System.exit(1); any other RuntimeException is logged, counted in numOffsetErrors, and yields null.
 *
 * @param origStartOffset start character offset from xml markup
 * @param origEndOffset end character offset from xml markup
 * @param mentionForm mention form from xml markup (used only in log messages here)
 * @param xmlTa XmlTextAnnotation object storing original xml, transformed text, extracted xml markup,
 * and corresponding TextAnnotation
 * @return IntPair(-1, -1) if the specified offsets correspond to deleted span (and hence likely a name mention
 * in xml metadata, e.g. post author); null if no mapped tokens could be found (possibly, indexes refer
 * to the middle of a single token because tokenizer can't segment some strings); or the corresponding
 * token indexes
 */
protected IntPair getTokenOffsets(int origStartOffset, int origEndOffset, String mentionForm, XmlTextAnnotation xmlTa) {
StringTransformation st = xmlTa.getXmlSt();
String origStr = st.getOrigText().substring(origStartOffset, origEndOffset);
// If the span starts with a space or newline, move BOTH offsets right by one character.
// origStr is not recomputed after the shift; it is only used in the debug message below.
if (origStr.startsWith(" ") || origStr.startsWith("\n")) {
origStartOffset += 1;
origEndOffset += 1;
}
// Translate offsets in the original xml text into offsets in the transformed text.
int adjStart = st.computeModifiedOffsetFromOriginal(origStartOffset);
int adjEnd = st.computeModifiedOffsetFromOriginal(origEndOffset);
if (adjStart == adjEnd) {
// probably, maps to span deleted when creating cleaned-up text
return new IntPair(-1, -1);
}
// Stays null unless both token index lookups in the try block succeed.
IntPair returnOffset = null;
int si = 0, ei = 0;
TextAnnotation ta = xmlTa.getTextAnnotation();
String rawText = ta.getText();
String rawStr = rawText.substring(adjStart, adjEnd);
logger.debug("source xml str: '" + origStr + "' (" + origStartOffset + "," + origEndOffset + ")");
try {
// si is assigned before ei is computed, so if findEndIndex throws, si already holds the start index.
si = findStartIndex(adjStart);
ei = findEndIndex(adjEnd, rawText);
returnOffset = new IntPair(si, ei);
} catch (IllegalArgumentException iae) {
// NOTE(review): this path exits the whole process rather than returning null.
logger.error("could not find token offsets for mention form '" + mentionForm + ", start, end orig: (" + origStartOffset + "," + origEndOffset + "); adjusted: (" + adjStart + "," + adjEnd + ").");
System.exit(1);
} catch (RuntimeException re) {
numOffsetErrors++;
logger.error("Error finding text for '{}' at offsets {}:", rawStr, (adjStart + "-" + adjEnd));
// Heuristic for which lookup failed: si == 0 means findStartIndex threw (si kept its initial value)
// or it returned 0 before findEndIndex threw; the two cases cannot be told apart here.
boolean siwaszero = false;
if (si == 0) {
siwaszero = true;
}
// Recompute both indexes with the error-tolerant variants; they are used only for the diagnostic below.
si = findStartIndexIgnoreError(adjStart);
ei = findEndIndexIgnoreError(adjEnd);
if (siwaszero)
logger.error("Could not find start token : text='" + mentionForm + "' at adjusted offsets " + adjStart + " to " + adjEnd);
else
logger.error("Could not find end token : text='" + mentionForm + "' at adjusted offsets " + adjStart + " to " + adjEnd);
// Log a window of tokens from index max(si - 2, 0) up to (but excluding) min(ei + 2, token count).
int max = ta.getTokens().length;
int start = si >= 2 ? si - 2 : 0;
int end = (ei + 2) < max ? ei + 2 : max;
StringBuilder bldr = new StringBuilder();
for (int jj = start; jj < end; jj++) {
bldr.append(" ");
// A ':' is written before the token at index si and after the token at index ei.
if (jj == si)
bldr.append(":");
bldr.append(ta.getToken(jj));
if (jj == ei)
bldr.append(":");
bldr.append(" ");
}
bldr.append("\n");
logger.error(bldr.toString());
}
return returnOffset;
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class ERENerReader method recordNullMentionInfo.
/**
 * for a mention that could not be mapped to a set of tokens in the cleaned text, record the information
 * to allow use of information by downstream systems in the XmlTextAnnotation object associated with the
 * source xml. This means finding the original span that contains the mention (expected to be an attribute)
 * and updating its attributes with additional mention information.
 *
 * The lookup is delegated to findAndUpdateMentionInfo, which already tries the exact offsets and the
 * -1 / +1 shifted offsets, so a single warning is logged when all of them fail.
 *
 * @param label label for entity
 * @param eId entity id
 * @param specificity entity specificity
 * @param mentionNode mention markup; must carry the ID, OFFSET and LENGTH attributes, and NOUN_TYPE
 *        when it is not a filler
 * @param isFiller true if the node is a filler: its text content is used as the surface form and
 *        its noun type is recorded as "NONE"
 * @return true if a matching span was found in the xml markup info and updated, false otherwise
 * @throws XMLException declared by the signature; presumably raised while reading the mention text
 */
private boolean recordNullMentionInfo(String label, String eId, String specificity, Node mentionNode, boolean isFiller) throws XMLException {
    NamedNodeMap nnMap = mentionNode.getAttributes();
    String mId = nnMap.getNamedItem(ID).getNodeValue();
    String nounType = "NONE";
    if (!isFiller)
        nounType = nnMap.getNamedItem(NOUN_TYPE).getNodeValue();

    // Surface form is only used for diagnostics below; it may remain null for a non-filler
    // mention that has no MENTION_TEXT child.
    String mentionForm = null;
    if (isFiller)
        mentionForm = mentionNode.getTextContent();
    else {
        // expect one child
        NodeList mnl = ((Element) mentionNode).getElementsByTagName(MENTION_TEXT);
        if (mnl.getLength() > 0) {
            mentionForm = SimpleXMLParser.getContentString((Element) mnl.item(0));
        } else {
            logger.error("No surface form found for mention with id {}.", mId);
        }
    }

    int offset = Integer.parseInt(nnMap.getNamedItem(OFFSET).getNodeValue());
    int length = Integer.parseInt(nnMap.getNamedItem(LENGTH).getNodeValue());
    IntPair origOffsets = new IntPair(offset, offset + length);

    XmlDocumentProcessor.SpanInfo mentionInfo = findAndUpdateMentionInfo(origOffsets, nounType, label, eId, mId, specificity);
    if (null == mentionInfo) {
        // Exact and shifted lookups have all failed at this point; there is nothing left to try.
        logger.warn("even with shifted indexes, could not find offset pair (" + origOffsets.getFirst() + "," + (origOffsets.getSecond()) + ") in xml markup info " + "in XmlTextAnnotation. Entity id, mention id, label, form are: " + eId + "," + mId + ", " + label + "," + mentionForm + ".");
        return false;
    }

    // Mention was found in the xml markup info (and so is excluded from the cleaned-up text).
    numXmlMarkupMentionsGenerated++;
    return true;
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class NombankFields method createPredicate.
/**
 * Creates a one-token "Predicate" constituent at the position of this instance's predicate terminal.
 *
 * @param ta text annotation the constituent is attached to
 * @param viewName name of the view the constituent belongs to
 * @param yield list of tree nodes, indexed by {@code predicateTerminal}; each label pairs a string
 *        with an IntPair whose first element is used as the start token index
 * @return the predicate constituent carrying the lemma and sense attributes
 */
@Override
public Constituent createPredicate(TextAnnotation ta, String viewName, List<Tree<Pair<String, IntPair>>> yield) {
    IntPair terminalSpan = yield.get(predicateTerminal).getLabel().getSecond();
    int tokenIndex = terminalSpan.getFirst();

    // The predicate always covers exactly one token: [tokenIndex, tokenIndex + 1).
    Constituent result = new Constituent("Predicate", viewName, ta, tokenIndex, tokenIndex + 1);
    result.addAttribute(PropbankReader.LemmaIdentifier, lemma);
    result.addAttribute(PropbankReader.SenseIdentifier, sense);
    return result;
}
Aggregations