Search in sources :

Example 6 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class XmlDocumentProcessor method processXml.

/**
     * Removes XML markup from the given text, for the most part. For specified tags that denote spans of text
     *    other than body text (e.g. quotes, headlines), the text value and offsets are reported. For specified
     *    tags and attributes, the attribute values and their offsets are reported. Content within
     *    <code>quote</code> tags is left in place (though quote tags are removed) and the offsets are reported
     *    with the other specified attributes.
     * There is some facility for handling nested tags and unbalanced tags:
     * <ul>
     *   <li>if {@code throwExceptionOnUnrecognizedTag} is set, any close tag that does not match the most
     *       recent open tag (or has no open tag at all) raises an exception;</li>
     *   <li>otherwise a warning is logged. If the close tag matches the next open tag on the stack, the
     *       most recent open tag is treated as a lone open and dropped; if not, the close tag is treated
     *       as a lone close and the stack is left as it was.</li>
     * </ul>
     * @param xmlText the original xml text.
     * @return a Pair whose first element is the StringTransformation holding the edits applied to the
     *         xml text, and whose second element is the list of retained span/attribute information.
     * @throws IllegalStateException if {@code throwExceptionOnUnrecognizedTag} is set and a close tag
     *         does not match the most recent open tag, or has no open tag.
     */
public Pair<StringTransformation, List<SpanInfo>> processXml(String xmlText) {
    StringTransformation xmlTextSt = new StringTransformation(xmlText);
    xmlTextSt = StringTransformationCleanup.normalizeToEncoding(xmlTextSt, Charset.forName("UTF-8"));
    // there are embedded xml tags in body text. Unescape them so we can process them easily.
    xmlTextSt = replaceXmlEscapedChars(xmlTextSt);
    xmlTextSt.applyPendingEdits();
    // singletons can be nested in deletable spans, creating major headaches.
    xmlTextSt = deleteSingletons(xmlTextSt);
    String xmlCurrentStr = xmlTextSt.getTransformedText();
    // don't call getTransformedText() or applyPendingEdits() in the body of the loop using xmlMatcher
    Matcher xmlMatcher = xmlTagPattern.matcher(xmlCurrentStr);
    // span offsets, label, attName, attVal, attOffsets
    List<SpanInfo> attributesRetained = new ArrayList<>();
    // track open/close tags, to record spans for later use (e.g. quoted blocks that aren't annotated)
    // each entry retains tagname, open tag offsets, attributes
    // note that open tag offsets are NOT the same as the (complete span) offsets returned by this method
    // IMPORTANT: offsets are computed from modified xml string (initial normalization steps clean up original)
    //    so code must adjust them for storing offsets for return.
    Stack<SpanInfo> tagStack = new Stack<>();
    // number of open tags currently on tagStack that are marked for deletion; used to tell whether
    // a tag is nested within something that will be deleted as a whole
    int deletableNestingLevel = 0;
    // match mark-up: xml open or close tag
    while (xmlMatcher.find()) {
        String substr = xmlMatcher.group(0);
        boolean isClose = false;
        if (substr.charAt(1) == '/') {
            isClose = true;
        } else if (substr.endsWith("/>") || substr.startsWith("<?xml")) {
            //this is an empty tag
            xmlTextSt.transformString(xmlMatcher.start(0), xmlMatcher.end(0), "");
            continue;
        }
        String lcsubstr = substr.toLowerCase();
        // get the tag name
        Matcher tagMatcher = xmlTagNamePattern.matcher(lcsubstr);
        if (tagMatcher.find()) {
            // identify the tag
            String tagName = tagMatcher.group(1);
            if (isClose) {
                // strip leading "/"
                tagName = tagName.substring(1);
                if (tagStack.isEmpty()) {
                    // close tag with nothing open: previously this surfaced as an EmptyStackException
                    if (throwExceptionOnUnrecognizedTag)
                        throw new IllegalStateException("Found close tag '" + tagName + "' with no open tag.");
                    logger.warn("WARNING: found close tag '{}' with no open tag; ignoring it.", tagName);
                    continue;
                }
                // remember exactly what was popped so the stack can be restored for a lone close
                SpanInfo poppedOpenTag = tagStack.pop();
                SpanInfo openTagAndAtts = poppedOpenTag;
                String openTagName = openTagAndAtts.label;
                // check for lone tags (open without close or vice versa )
                boolean isLoneClose = false;
                if (!openTagName.equals(tagName)) {
                    if (throwExceptionOnUnrecognizedTag)
                        throw new IllegalStateException("Mismatched open and close tags. Expected '" + openTagAndAtts + "', found '" + tagName + "'");
                    //someone used xml special chars in body text
                    logger.warn("WARNING: found close tag '{}' after open tag '{}', and (obviously) they don't match.", tagName, openTagName);
                    // compare tag NAMES (comparing the SpanInfo itself to a String can never match)
                    if (!tagStack.isEmpty() && tagStack.peek().label.equals(tagName)) {
                        // the next open tag matches, so the popped tag was a lone open: drop it.
                        // it no longer sits on the stack, so it must not count towards deletable nesting.
                        if (deletableSpanTags.contains(poppedOpenTag.label))
                            deletableNestingLevel--;
                        openTagAndAtts = tagStack.pop();
                    } else {
                        //unmatched lone close
                        isLoneClose = true;
                    }
                }
                if (isLoneClose) {
                    //revert to previous state, and resume parsing
                    tagStack.push(poppedOpenTag);
                } else {
                    // now we have open tag and matching close tag; record span and label
                    IntPair startTagOffsets = openTagAndAtts.spanOffsets;
                    Map<String, Pair<String, IntPair>> spanAtts = openTagAndAtts.attributes;
                    int startTagStart = startTagOffsets.getFirst();
                    int startTagEnd = startTagOffsets.getSecond();
                    int endTagStart = xmlMatcher.start();
                    int endTagEnd = xmlMatcher.end();
                    updateAttributeInfo(attributesRetained, tagName, startTagEnd, endTagStart, spanAtts, xmlTextSt);
                    boolean isDeletable = false;
                    if (deletableSpanTags.contains(tagName)) {
                        // deletable span
                        isDeletable = true;
                        deletableNestingLevel--;
                    }
                    /*
                     * if we are within another deletable tag
                     *    DON'T DELETE or it will create problems.
                     * else if this span is itself deletable
                     *    delete the whole span (tags and content)
                     * else we are NOT in deletable and NOT nested:
                     *    delete open and close tags.
                     */
                    if (deletableNestingLevel == 0) {
                        if (isDeletable)
                            xmlTextSt.transformString(startTagStart, endTagEnd, "");
                        else {
                            // we should retain text between open and close, but delete the tags
                            xmlTextSt.transformString(startTagStart, startTagEnd, "");
                            xmlTextSt.transformString(endTagStart, endTagEnd, "");
                        }
                    }
                }
            } else {
                // tag must be open
                IntPair tagSpan = new IntPair(xmlMatcher.start(), xmlMatcher.end());
                Map<String, Pair<String, IntPair>> spanAtts = new HashMap<>();
                tagStack.push(new SpanInfo(tagName, tagSpan, spanAtts));
                if (deletableSpanTags.contains(tagName))
                    deletableNestingLevel++;
                // within an xml open tag: identify any attribute values we need to retain.
                if (tagsWithAtts.containsKey(tagName)) {
                    Set<String> attributeNames = tagsWithAtts.get(tagName);
                    // parse the substring beyond the tag name.
                    lcsubstr = lcsubstr.substring(tagMatcher.end());
                    substr = substr.substring(tagMatcher.end());
                    Matcher attrMatcher = tagAttributePattern.matcher(lcsubstr);
                    while (attrMatcher.find()) {
                        String attrName = attrMatcher.group(1);
                        // avoid lowercasing attribute values: read the value from the original-case string
                        String attrVal = substr.substring(attrMatcher.start(2), attrMatcher.end(2));
                        if (attributeNames.contains(attrName)) {
                            // substring starts at index of start of (open) xml tag + length of tag name + left angle bracket
                            // note that we are using a transformed text, so need original offsets
                            int attrValOffset = tagMatcher.end() + xmlMatcher.start();
                            int attrValStart = attrMatcher.start(2) + attrValOffset;
                            int attrValEnd = attrMatcher.end(2) + attrValOffset;
                            // use adjusted offsets to get char offsets in original xml source text
                            IntPair attrValSpan = xmlTextSt.getOriginalOffsets(attrValStart, attrValEnd);
                            spanAtts.put(attrName, new Pair<>(attrVal, attrValSpan));
                        }
                    }
                    // we now have an open tag name, its offsets, and any retained attributes on the tag stack
                }
            }
        }
    }
    xmlTextSt = cleanupWhitespace(xmlTextSt);
    return new Pair<>(xmlTextSt, attributesRetained);
}
Also used : Matcher(java.util.regex.Matcher) TextCleanerStringTransformation(edu.illinois.cs.cogcomp.core.utilities.TextCleanerStringTransformation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 7 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class ParseUtils method getTokenIndexedParseTreeNodeCovering.

/**
 * Finds the parse tree node covering the given constituent and returns it with every node labeled
 * by a (label, span) pair, where the span is shifted by the start span of the constituent's sentence.
 *
 * @param parseViewName name of the parse view passed on to {@code getParseTree}
 * @param c the constituent whose start/end spans select the covering node
 * @return the covering tree, with each node's span offset by the sentence start span
 */
public static Tree<Pair<String, IntPair>> getTokenIndexedParseTreeNodeCovering(String parseViewName, Constituent c) {
    TextAnnotation annotation = c.getTextAnnotation();
    int sentenceIndex = annotation.getSentenceId(c);
    Tree<String> sentenceTree = getParseTree(parseViewName, annotation, sentenceIndex);
    // start span of the sentence containing the constituent
    final int sentenceOffset = annotation.getSentence(sentenceIndex).getStartSpan();
    // the covering-node lookup works with spans relative to the sentence start
    int relativeStart = c.getStartSpan() - sentenceOffset;
    int relativeEnd = c.getEndSpan() - sentenceOffset;
    Tree<Pair<String, IntPair>> coveringTree = getTokenIndexedTreeCovering(sentenceTree, relativeStart, relativeEnd);
    // the spans in coveringTree are sentence-relative; this mapper shifts each one back by the sentence offset
    ITransformer<Tree<Pair<String, IntPair>>, Pair<String, IntPair>> shiftBySentenceOffset = new ITransformer<Tree<Pair<String, IntPair>>, Pair<String, IntPair>>() {

        @Override
        public Pair<String, IntPair> transform(Tree<Pair<String, IntPair>> node) {
            Pair<String, IntPair> nodeLabel = node.getLabel();
            IntPair relativeSpan = nodeLabel.getSecond();
            int shiftedStart = relativeSpan.getFirst() + sentenceOffset;
            int shiftedEnd = relativeSpan.getSecond() + sentenceOffset;
            return new Pair<>(nodeLabel.getFirst(), new IntPair(shiftedStart, shiftedEnd));
        }
    };
    return Mappers.mapTree(coveringTree, shiftBySentenceOffset);
}
Also used : ITransformer(edu.illinois.cs.cogcomp.core.transformers.ITransformer) Tree(edu.illinois.cs.cogcomp.core.datastructures.trees.Tree) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 8 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class ParseUtils method getSpanLabeledTree.

/**
 * Recursively converts a string-labeled tree into a tree whose labels are (label, span) pairs,
 * numbering leaves from {@code currentLeafId}.
 * A leaf whose parent label is a null label (per {@code ParseTreeProperties.isNullLabel}) gets an
 * empty span and does not advance the leaf counter; any other leaf gets a span of width one.
 * An internal node spans from the leaf id at entry to the leaf id reached after all its children.
 *
 * @param parseTree the (sub)tree to convert
 * @param currentLeafId the id to assign to the next leaf encountered
 * @return a Pair of the converted tree and the leaf id to use after this subtree
 */
private static Pair<Tree<Pair<String, IntPair>>, Integer> getSpanLabeledTree(Tree<String> parseTree, int currentLeafId) {
    if (!parseTree.isLeaf()) {
        final int firstLeafId = currentLeafId;
        int runningLeafId = currentLeafId;
        // convert the children first: the parent's end offset is only known afterwards
        List<Tree<Pair<String, IntPair>>> convertedChildren = new ArrayList<>();
        for (Tree<String> child : parseTree.getChildren()) {
            Pair<Tree<Pair<String, IntPair>>, Integer> converted = getSpanLabeledTree(child, runningLeafId);
            runningLeafId = converted.getSecond();
            convertedChildren.add(converted.getFirst());
        }
        Pair<String, IntPair> nodeLabel = new Pair<>(parseTree.getLabel(), new IntPair(firstLeafId, runningLeafId));
        Tree<Pair<String, IntPair>> node = new Tree<>(nodeLabel);
        for (Tree<Pair<String, IntPair>> convertedChild : convertedChildren) {
            node.addSubtree(convertedChild);
        }
        return new Pair<>(node, runningLeafId);
    }

    // leaf: a leaf under a null-labeled parent gets an empty span and leaves the counter unchanged
    boolean parentIsNull = ParseTreeProperties.isNullLabel(parseTree.getParent().getLabel());
    int leafIdAfter = parentIsNull ? currentLeafId : currentLeafId + 1;
    IntPair leafSpan = new IntPair(currentLeafId, leafIdAfter);
    Pair<String, IntPair> leafLabel = new Pair<>(parseTree.getLabel(), leafSpan);
    Tree<Pair<String, IntPair>> leaf = new Tree<>(leafLabel);
    return new Pair<>(leaf, leafIdAfter);
}
Also used : ArrayList(java.util.ArrayList) Tree(edu.illinois.cs.cogcomp.core.datastructures.trees.Tree) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 9 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class EREEventReader method getArguments.

/**
     * Collects the arguments of an event mention. Expects arguments to be mention nodes already
     * created by {@link ERENerReader}.
     * Each argument element is looked up by its entity mention id attribute, falling back to its
     * filler id attribute. Arguments with neither attribute, or whose mention Constituent cannot be
     * found, are logged as errors and skipped.
     * @param eventMentionNode an xml node representing an event mention
     * @return a list of pairs containing an index-matched list of lists of three elements
     *         (role, realis, and origin) and corresponding argument Constituent. Origin defaults
     *         to "ldc" when the argument has no origin attribute.
     */
private List<Pair<List<String>, Constituent>> getArguments(Node eventMentionNode) {
    /*
        <em_arg entity_id="ent-m.0d04z6" entity_mention_id="m-56bd16d7_2_75" role="giver" realis="true">cuba</em_arg>
        <em_arg entity_id="ent-m.09c7w0" entity_mention_id="m-56bd16d7_2_135" role="recipient" realis="true">US</em_arg>
        <!-- from Event Argument Linking augmented corpus: -->
        <em_arg entity_id="ent-243" entity_mention_id="m-237" role="victim" realis="true" origin="ldc">Four deaths</em_arg>

     */
    NodeList nl = ((Element) eventMentionNode).getElementsByTagName(EVENT_ARGUMENT);
    List<Pair<List<String>, Constituent>> arguments = new ArrayList<>();
    for (int argIndex = 0; argIndex < nl.getLength(); ++argIndex) {
        numEventRolesInSource++;
        Node argNode = nl.item(argIndex);
        NamedNodeMap nnMap = argNode.getAttributes();
        Node att = nnMap.getNamedItem(ENTITY_MENTION_ID);
        if (null == att)
            att = nnMap.getNamedItem(FILLER_ID);
        if (null == att) {
            // getNamedItem returns null for a missing attribute; without an id there is nothing to look up
            logger.error("Event argument has neither '{}' nor '{}' attribute; skipping it.", ENTITY_MENTION_ID, FILLER_ID);
            continue;
        }
        String entityMentionId = att.getNodeValue();
        Constituent ac = getMentionConstituent(entityMentionId);
        if (null == ac)
            logger.error("Could not find mention Constituent for mentionId '{}'", entityMentionId);
        else {
            numEventRolesGenerated++;
            String role = nnMap.getNamedItem(ROLE).getNodeValue();
            String realis = nnMap.getNamedItem(REALIS).getNodeValue();
            // origin is only present in some corpora; default to "ldc" when absent
            String origin = "ldc";
            if (nnMap.getNamedItem(ORIGIN) != null)
                origin = nnMap.getNamedItem(ORIGIN).getNodeValue();
            List<String> argAtts = new LinkedList<>();
            argAtts.add(role);
            argAtts.add(realis);
            argAtts.add(origin);
            arguments.add(new Pair<>(argAtts, ac));
        }
    }
    return arguments;
}
Also used : ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 10 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class ERENerReader method findAndUpdateMentionInfo.

/**
     * Looks up the span info registered for the given offsets and, if found, records the mention's
     * attributes on it.
     * Mention length should be the same, but in at least one file many offsets are shifted by +1;
     * allow also -1 shift. The lookup therefore tries the exact offsets first, then both offsets
     * shifted by -1, then both shifted by +1.
     * Every attribute value is stored paired with {@code origOffsets} (the offsets as passed in,
     * not the shifted offsets that matched).
     * @param origOffsets offsets of the mention to look up
     * @param nounType value stored under the noun type attribute
     * @param label value stored under the type attribute; not stored when null
     * @param eId value stored under the entity id attribute
     * @param mId value stored under the entity mention id attribute
     * @param specificity value stored under the specificity attribute
     * @return the updated span info, or null if no span info matched any of the three offset variants
     */
private XmlDocumentProcessor.SpanInfo findAndUpdateMentionInfo(IntPair origOffsets, String nounType, String label, String eId, String mId, String specificity) {
    XmlDocumentProcessor.SpanInfo mentionInfo = offsetToSpanInfo.get(origOffsets);
    if (null == mentionInfo)
        mentionInfo = offsetToSpanInfo.get(new IntPair(origOffsets.getFirst() - 1, origOffsets.getSecond() - 1));
    if (null == mentionInfo)
        mentionInfo = offsetToSpanInfo.get(new IntPair(origOffsets.getFirst() + 1, origOffsets.getSecond() + 1));
    if (null != mentionInfo) {
        mentionInfo.attributes.put(ENTITY_ID, new Pair<>(eId, origOffsets));
        mentionInfo.attributes.put(ENTITY_MENTION_ID, new Pair<>(mId, origOffsets));
        mentionInfo.attributes.put(SPECIFICITY, new Pair<>(specificity, origOffsets));
        mentionInfo.attributes.put(NOUN_TYPE, new Pair<>(nounType, origOffsets));
        // mentionInfo is known to be non-null in this branch, so the flag is always "true"
        mentionInfo.attributes.put(IS_FOUND, new Pair<>(Boolean.toString(true), origOffsets));
        if (null != label)
            mentionInfo.attributes.put(TYPE, new Pair<>(label, origOffsets));
    }
    return mentionInfo;
}
Also used : XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Aggregations

Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)59 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)35 ArrayList (java.util.ArrayList)17 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)10 Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree)10 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)7 Matcher (java.util.regex.Matcher)7 Paragraph (edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph)6 HashMap (java.util.HashMap)6 Pattern (java.util.regex.Pattern)6 TreeView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TreeView)3 SenseInstance (edu.illinois.cs.cogcomp.verbsense.jlis.SenseInstance)3 SenseStructure (edu.illinois.cs.cogcomp.verbsense.jlis.SenseStructure)3 JsonObject (com.google.gson.JsonObject)2 AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException)2 ITransformer (edu.illinois.cs.cogcomp.core.transformers.ITransformer)2 IndexedWord (edu.stanford.nlp.ling.IndexedWord)2 Annotation (edu.stanford.nlp.pipeline.Annotation)2 CoreMap (edu.stanford.nlp.util.CoreMap)2 LinkedHashSet (java.util.LinkedHashSet)2