Search in sources :

Example 66 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class XmlDocumentProcessor method deleteSingletons.

/**
 * Delete all spans that correspond to singleton tags (i.e. a self-contained span presented as an
 * open tag with attributes, but no corresponding close tag). Relies on the user specifying these
 * tag names ahead of time in {@code singletonTags}.
 *
 * @param xmlTextSt StringTransformation containing text to be searched.
 * @return the same StringTransformation instance, after the queued deletions have been applied.
 */
private StringTransformation deleteSingletons(StringTransformation xmlTextSt) {
    // don't call getTransformedText() or applyPendingEdits() in the body of the loop using xmlMatcher:
    // the matcher's offsets refer to the text snapshot taken here (presumably applying edits
    // mid-loop would invalidate those offsets).
    Matcher xmlMatcher = xmlTagPattern.matcher(xmlTextSt.getTransformedText());
    // match mark-up: xml open or close tag
    while (xmlMatcher.find()) {
        String substr = xmlMatcher.group(0);
        if (substr.charAt(1) == '/') {
            // close tag: irrelevant to singletons by definition
            continue;
        }
        // tag names are compared in lower case
        Matcher tagMatcher = xmlTagNamePattern.matcher(substr.toLowerCase());
        if (tagMatcher.find() && singletonTags.contains(tagMatcher.group(1))) {
            // queue deletion of the whole tag; edits are applied once after the scan
            xmlTextSt.transformString(xmlMatcher.start(), xmlMatcher.end(), "");
        }
    }
    xmlTextSt.applyPendingEdits();
    return xmlTextSt;
}
Also used : Matcher(java.util.regex.Matcher) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 67 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class ParseUtils method getTokenIndexedTreeCovering.

/**
 * Given a parse tree and a span specified by its start and end (exclusive), returns the subtree
 * that covers the span. Every node of the returned tree corresponds to a node of the input tree
 * and is labeled with the original node's label together with the span that node covered in the
 * original tree.
 * <p>
 * The search walks down from the root: at each node it descends into the first child whose span
 * contains {@code [start, end)}. It stops at a node whose span equals the requested span
 * exactly, or at the deepest node that still has no child containing the span.
 *
 * @param parse The parse tree to search.
 * @param start The first token of the span.
 * @param end The end of the span (exclusive).
 * @return A new tree that covers the specified span and each node specifies the label and the
 *         span of the original tree that it covers.
 */
public static Tree<Pair<String, IntPair>> getTokenIndexedTreeCovering(Tree<String> parse, int start, int end) {
    Tree<Pair<String, IntPair>> node = ParseUtils.getSpanLabeledTree(parse);
    while (node != null) {
        IntPair nodeSpan = node.getLabel().getSecond();
        boolean exactMatch = nodeSpan.getFirst() == start && nodeSpan.getSecond() == end;
        if (exactMatch) {
            return node;
        }

        // Pick the first child whose span encloses the requested span, if there is one.
        Tree<Pair<String, IntPair>> next = null;
        for (Tree<Pair<String, IntPair>> candidate : node.getChildren()) {
            IntPair candidateSpan = candidate.getLabel().getSecond();
            if (candidateSpan.getFirst() <= start && candidateSpan.getSecond() >= end) {
                next = candidate;
                break;
            }
        }

        if (next == null) {
            // No child covers the span, so this node is the tightest cover available.
            return node;
        }
        node = next;
    }
    return node;
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 68 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class ParseUtils method getPhraseFromHead.

/**
 * Primarily a fix for prepSRL objects; converts them from single head words to constituents.
 * E.g. for the sentence "the man with the telescope", the object of the preposition will be
 * "the telescope" instead of just "telescope".
 * <p>
 * The returned constituent starts at the token immediately after the predicate's start token and
 * spans as many tokens as the yield of the argument phrase found in the parse tree.
 *
 * @param predicate The predicate of the construction (e.g. "with")
 * @param argHead The head-word of the argument of the construction (e.g. "telescope")
 * @param parseViewName The name of the parse view used to extract the phrase-structure tree
 * @return The full constituent phrase containing the argument head, or {@code null} if the
 *         predicate's covering node has no parent, if no leaf under the predicate's parent starts
 *         at the argument head, or if the walk up from the argument reaches the root of the tree.
 */
public static Constituent getPhraseFromHead(Constituent predicate, Constituent argHead, String parseViewName) {
    // Get the path from the argument to the preposition
    // but only if the predicate node "m-commands" the arg
    TextAnnotation ta = argHead.getTextAnnotation();
    int sentenceOffset = ta.getSentence(ta.getSentenceId(argHead)).getStartSpan();
    int argStart = argHead.getStartSpan() - sentenceOffset;
    Tree<Pair<String, IntPair>> predParentTree = getTokenIndexedTreeCovering(predicate, parseViewName).getParent();
    if (predParentTree == null) {
        // The predicate's covering node is the root, so there is no parent to search under.
        return null;
    }
    boolean found = false;
    for (Tree<Pair<String, IntPair>> s : predParentTree.getYield()) {
        if (s.getLabel().getSecond().getFirst() == argStart) {
            found = true;
            break;
        }
    }
    if (!found) {
        return null;
    }
    // Now follow the path from the argument node to get to the preposition
    Tree<Pair<String, IntPair>> argPhrase = getTokenIndexedTreeCovering(argHead, parseViewName);
    int predicateStart = predicate.getStartSpan() - sentenceOffset;
    // Test the parent for null before handing it to checkForPredicate.
    while (argPhrase.getParent() != null && !checkForPredicate(argPhrase.getParent(), predicateStart)) {
        argPhrase = argPhrase.getParent();
    }
    // If the phrase covering the constituent is the whole sentence then the annotation is wrong
    if (argPhrase.getParent() == null) {
        return null;
    }
    int start = predicate.getStartSpan() + 1;
    int end = start + argPhrase.getYield().size();
    return new Constituent(argHead.getLabel(), argHead.getViewName(), argHead.getTextAnnotation(), start, end);
}
Also used : TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 69 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class JsonSerializer method readTextAnnotation.

/**
 * Builds a TextAnnotation from its JSON string form.
 * <p>
 * Reads the "corpusId", "id", "text" and "tokens" fields plus the sentence information, computes
 * token offsets from the text and tokens, then attaches every entry of the "views" array as a
 * top-K view (one View per element of that entry's "viewData" array). Finally the attributes
 * are read from the root object.
 *
 * @param string the JSON document to parse
 * @return the reconstructed TextAnnotation
 * @throws Exception if any of the helper readers fails
 */
TextAnnotation readTextAnnotation(String string) throws Exception {
    JsonObject root = (JsonObject) new JsonParser().parse(string);

    // Scalar fields and the token array.
    String corpusId = readString("corpusId", root);
    String id = readString("id", root);
    String text = readString("text", root);
    String[] tokens = readStringArray("tokens", root);

    Pair<Pair<String, Double>, int[]> sentenceInfo = readSentences(root);
    IntPair[] tokenOffsets = TokenUtils.getTokenOffsets(text, tokens);

    TextAnnotation annotation =
            new TextAnnotation(corpusId, id, text, tokenOffsets, tokens, sentenceInfo.getSecond());

    // Each entry of "views" carries a name and a list of alternative (top-K) views.
    JsonArray viewEntries = root.getAsJsonArray("views");
    int viewIndex = 0;
    while (viewIndex < viewEntries.size()) {
        JsonObject viewEntry = (JsonObject) viewEntries.get(viewIndex);
        String viewName = readString("viewName", viewEntry);
        JsonArray alternatives = viewEntry.getAsJsonArray("viewData");

        List<View> topK = new ArrayList<>();
        int altIndex = 0;
        while (altIndex < alternatives.size()) {
            topK.add(readView((JsonObject) alternatives.get(altIndex), annotation));
            altIndex++;
        }
        annotation.addTopKView(viewName, topK);
        viewIndex++;
    }

    readAttributes(annotation, root);
    return annotation;
}
Also used : JsonObject(com.google.gson.JsonObject) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) JsonArray(com.google.gson.JsonArray) JsonParser(com.google.gson.JsonParser) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 70 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class VerbSensePropbankReader method addAnnotation.

/**
 * Adds the gold sense view to the given TextAnnotation.
 * <p>
 * For every PropbankFields entry recorded for this annotation's id, the predicate's start token
 * is labeled with the entry's sense and given the lemma as a token attribute. A start token is
 * labeled at most once; later entries with the same start are skipped. The view is attached to
 * the TextAnnotation only if it ends up with at least one constituent.
 *
 * @param ta the TextAnnotation to annotate; must carry a gold parse view
 */
private void addAnnotation(TextAnnotation ta) {
    String goldViewName = SenseManager.getGoldViewName();

    // Leaves of the span-labeled gold parse, used to locate each predicate's token.
    Tree<String> goldParse = ParseHelper.getParseTree(ViewNames.PARSE_GOLD, ta, 0);
    List<Tree<Pair<String, IntPair>>> leaves = ParseUtils.getSpanLabeledTree(goldParse).getYield();

    TokenLabelView senseView = new TokenLabelView(goldViewName, "AnnotatedTreebank", ta, 1.0);
    Set<Integer> seenPredicates = new HashSet<>();

    for (PropbankFields entry : goldFields.get(ta.getId())) {
        int predicateStart = entry.getPredicateStart(leaves);
        // Set.add returns false when the start was already recorded.
        if (!seenPredicates.add(predicateStart)) {
            continue;
        }
        senseView.addTokenLabel(predicateStart, entry.getSense(), 1.0);
        try {
            senseView.addTokenAttribute(predicateStart, LemmaIdentifier, entry.getLemma());
        } catch (Exception e) {
            // XXX Maybe log the exception?
            e.printStackTrace();
        }
    }

    if (senseView.getConstituents().size() > 0) {
        ta.addView(goldViewName, senseView);
    }
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) FileNotFoundException(java.io.FileNotFoundException) TokenLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TokenLabelView) Tree(edu.illinois.cs.cogcomp.core.datastructures.trees.Tree) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Aggregations

IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)103 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)32 Test (org.junit.Test)20 ArrayList (java.util.ArrayList)19 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)18 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)14 StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation)13 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)6 Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree)5 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)4 Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)4 FileNotFoundException (java.io.FileNotFoundException)4 Matcher (java.util.regex.Matcher)4 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)3 SentenceSplitter (edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter)3 LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)3 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)3 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)3 CoreLabel (edu.stanford.nlp.ling.CoreLabel)3 Annotation (edu.stanford.nlp.pipeline.Annotation)3