Search in sources :

Example 1 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class JsonSerializer method readSentences.

/**
 * Reads the sentence-splitting information stored under the {@code "sentences"} member of
 * {@code json}.
 *
 * @param json object that holds a {@code "sentences"} member
 * @return a pair whose first element is (generator, score) and whose second element is the
 *         array read from {@code "sentenceEndPositions"}
 */
private static Pair<Pair<String, Double>, int[]> readSentences(JsonObject json) {
    final JsonObject sentencesNode = json.getAsJsonObject("sentences");

    // Fields are read in a fixed order: generator, score, then the end positions.
    final String generatorName = readString("generator", sentencesNode);
    final double generatorScore = readDouble("score", sentencesNode);
    final int[] sentenceEnds = readIntArray("sentenceEndPositions", sentencesNode);

    final Pair<String, Double> generatorAndScore = new Pair<>(generatorName, generatorScore);
    return new Pair<>(generatorAndScore, sentenceEnds);
}
Also used : JsonObject(com.google.gson.JsonObject) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 2 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class AnnotationFixer method rationalizeBoundaryAnnotations.

/**
 * Correct automated annotations (tokenization, sentence splitting) based on gold annotations of
 * entity mentions.
 * <p>
 * Whenever a sentence start falls strictly inside a mention of the view {@code viewName}, the
 * sentence that starts there is merged with the sentence preceding it in the
 * {@link ViewNames#SENTENCE} view. The merged sentence takes its label/score information from the
 * first sentence and receives the attributes of both; for a key present on both sentences the value
 * of the second sentence is the one that is kept.
 *
 * @param ta TextAnnotation with annotated clean text
 * @param viewName name of the view whose constituents (mentions) must not be split by a sentence
 *        boundary
 * @return a map of view names to indexes indicating where violations were found/corrected
 * @throws IllegalStateException if a sentence takes part in more than one merge, i.e. more than two
 *         consecutive sentences have boundary errors
 */
public static Map<String, Set<Integer>> rationalizeBoundaryAnnotations(TextAnnotation ta, String viewName) {
    Map<String, Set<Integer>> violations = new HashMap<>();
    Set<Integer> badSentenceStartIndexes = new HashSet<>();
    violations.put(ViewNames.SENTENCE, badSentenceStartIndexes);
    View sentences = ta.getView(ViewNames.SENTENCE);
    // Sentences keyed by start span, so that they can be visited in text order.
    TreeMap<Integer, Constituent> sentenceStarts = new TreeMap<>();
    for (Constituent s : sentences) {
        sentenceStarts.put(s.getStartSpan(), s);
    }
    Set<Pair<Constituent, Constituent>> sentencesToMerge = new HashSet<>();
    View nerMention = ta.getView(viewName);
    for (Constituent m : nerMention.getConstituents()) {
        // The mention span does not change while scanning the sentences.
        int mentStart = m.getStartSpan();
        int mentEnd = m.getEndSpan();
        Constituent lastSent = null;
        for (Map.Entry<Integer, Constituent> sentEntry : sentenceStarts.entrySet()) {
            int sentStart = sentEntry.getKey();
            // ordered sentence list, so stop after the mention end has been passed
            if (sentStart > mentEnd) {
                break;
            }
            Constituent currentSent = sentEntry.getValue();
            if (sentStart > mentStart && sentStart < mentEnd) {
                // A sentence starts inside the mention: merge it with its predecessor.
                sentencesToMerge.add(new Pair<>(lastSent, currentSent));
                badSentenceStartIndexes.add(sentStart);
            }
            lastSent = currentSent;
        }
    }
    Set<Integer> sentStartsProcessed = new HashSet<>();
    for (Pair<Constituent, Constituent> sentPair : sentencesToMerge) {
        Constituent first = sentPair.getFirst();
        Constituent second = sentPair.getSecond();
        int firstStart = first.getStartSpan();
        int secondStart = second.getStartSpan();
        if (sentStartsProcessed.contains(firstStart) || sentStartsProcessed.contains(secondStart)) {
            throw new IllegalStateException("more complex boundary constraints than I can currently handle -- " + "more than two consecutive sentences with boundary errors.");
        }
        // Remember both sentences so that a later pair reusing either of them is detected above.
        sentStartsProcessed.add(firstStart);
        sentStartsProcessed.add(secondStart);
        Constituent combinedSent;
        if (null == first.getLabelsToScores()) {
            combinedSent = new Constituent(first.getLabel(), first.getConstituentScore(), ViewNames.SENTENCE, first.getTextAnnotation(), first.getStartSpan(), second.getEndSpan());
        } else {
            combinedSent = new Constituent(first.getLabelsToScores(), ViewNames.SENTENCE, first.getTextAnnotation(), first.getStartSpan(), second.getEndSpan());
        }
        for (String k : first.getAttributeKeys()) {
            combinedSent.addAttribute(k, first.getAttribute(k));
        }
        // Values for the second sentence's keys must be read from the second sentence itself.
        for (String k : second.getAttributeKeys()) {
            combinedSent.addAttribute(k, second.getAttribute(k));
        }
        sentences.removeConstituent(first);
        sentences.removeConstituent(second);
        sentences.addConstituent(combinedSent);
    }
    ta.setSentences();
    return violations;
}
Also used : View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 3 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class StringTransformation method applyPendingEdits.

/**
 * Apply any pending edits and update the modified string.
 * <p>
 * When {@code isModified} is set, this method:
 * <ol>
 * <li>applies every pending {@code Edit} to {@code transformedText}, in sequence;</li>
 * <li>converts the keys of {@code currentOffsetModifications} with
 * {@code computeOriginalOffset} and merges the result into
 * {@code recordedOffsetModifications};</li>
 * <li>rebuilds {@code recordedInverseModifications} from the complete set of recorded
 * modifications;</li>
 * <li>clears the pending state ({@code currentOffsetModifications} and {@code edits}).</li>
 * </ol>
 * When {@code isModified} is not set, nothing is changed.
 */
public void applyPendingEdits() {
    String currentStr = transformedText;
    if (isModified) {
        /*
         * immediately set flag, as we may call other methods that check this condition, which could
         * call this method
         */
        isModified = false;
        /*
         * it's OK for edits to be unsorted: all edit offsets are computed relative to the previous edits
         *    in the sequence
         */
        for (Edit edit : edits) {
            IntPair editOffsets = edit.offsets;
            String before = currentStr.substring(0, editOffsets.getFirst());
            String after = currentStr.substring(editOffsets.getSecond());
            currentStr = before + edit.newString + after;
        }
        transformedText = currentStr;
        /*
         * store pending recorded offsets while computing absolute offsets for all current edits
         */
        Map<Integer, Pair<Integer, EditType>> toAdd = new TreeMap<>();
        for (Integer modOffset : currentOffsetModifications.keySet()) {
            Integer currentMod = currentOffsetModifications.get(modOffset).getFirst();
            EditType currentEditType = currentOffsetModifications.get(modOffset).getSecond();
            /*
             * recorded offset mods MUST be made with respect to ORIGINAL offsets -- not the current
             *     transformed string.
             */
            Integer absoluteModOffset = computeOriginalOffset(modOffset);
            // Several pending offsets may map to the same original offset: accumulate their modifiers.
            // TODO: verify that it's OK to just keep the original edit type
            if (toAdd.containsKey(absoluteModOffset))
                currentMod += toAdd.get(absoluteModOffset).getFirst();
            toAdd.put(absoluteModOffset, new Pair<>(currentMod, currentEditType));
        }
        /*
         * The entries in toAdd *cannot* conflict, because they come from a single pass
         * Now we need to merge them with previously recorded offset mods
         */
        if (recordedOffsetModifications.isEmpty())
            recordedOffsetModifications.putAll(toAdd);
        else {
            // New entries are collected here and only added to the recorded mods after the scan below.
            TreeMap<Integer, Pair<Integer, EditType>> safeAdds = new TreeMap<>();
            // stores position of greatest of last key, or the last key's effective edit position
            int lastKeyPos = 0;
            for (int key : toAdd.keySet()) {
                int mod = toAdd.get(key).getFirst();
                EditType editType = toAdd.get(key).getSecond();
                if (key < lastKeyPos)
                    // move to after last entry key + edit
                    key = lastKeyPos;
                /*
                 * it gets a bit tricky if a new deletion overlaps older edits: you need to split up the
                 * new edit.
                 * TODO: merge edits instead
                 */
                for (int oldKey : recordedOffsetModifications.keySet()) {
                    // mod == 0 means the new edit has been stored completely
                    if (mod == 0)
                        break;
                    // am I at the same index?
                    if (oldKey == key) {
                        // if edit is an expansion, still advance one position
                        // move on...
                        key = Math.max(key + 1, key - recordedOffsetModifications.get(oldKey).getFirst());
                    } else if (oldKey < key) {
                        // am I within the window of a prior edit?
                        int oldMod = recordedOffsetModifications.get(oldKey).getFirst();
                        // negative, to compare with negative mod
                        int diff = oldKey - key;
                        if (diff > oldMod) {
                            // edits interfere; can't happen if oldMod is positive (insertion)
                            // modifier doesn't change: edit not applied yet; update edit
                            // position to just past old edit
                            key = oldKey - oldMod;
                        }
                    } else if (oldKey > key) {
                        // Is next edit within window of my edit?
                        // negative, to compare with -ve mod
                        int diff = key - oldKey;
                        if (diff > mod) {
                            // if diff > mod, mod is negative and edits interfere.
                            // delete up to current edit
                            safeAdds.put(key, new Pair<>(diff, editType));
                            // part of modification not accounted for; again, recall both negative
                            mod = mod - diff;
                            // move to index after old edit
                            key = oldKey - recordedOffsetModifications.get(oldKey).getFirst();
                        } else {
                            // either mod is positive, or next edit does not interfere
                            safeAdds.put(key, new Pair<>(mod, editType));
                            // update if -ve mod
                            lastKeyPos = Math.max(key, key - mod);
                            // break from the loop
                            mod = 0;
                        }
                    }
                }
                // past all old edits, haven't added it yet...
                if (mod != 0)
                    safeAdds.put(key, new Pair<>(mod, editType));
            }
            recordedOffsetModifications.putAll(safeAdds);
        }
        /*
         * compute inverse mapping (from transformed text offsets to original offsets)
         *    using the complete set of transformations to date: store as offset modifiers
         *    at transform string indexes where changes occur, such that adding the offset modifier
         *    to the current transform index yields the corresponding offset in the original string.
         */
        recordedInverseModifications.clear();
        /*
         * recordedOffsetModifications: at char index X, modify running offset modifier by Y
         */
        int cumulativeOffset = 0;
        for (Integer transformModIndex : recordedOffsetModifications.keySet()) {
            int baseIndex = transformModIndex;
            int transformMod = recordedOffsetModifications.get(transformModIndex).getFirst();
            EditType editType = recordedOffsetModifications.get(transformModIndex).getSecond();
            /*
             * suppose transform offset is 33, and modifier is -33 (delete the first 33 chars of the orig
             * string). Therefore we want index 0 of the transformed string to map to offset 33 of the
             * orig string. So we update the cumulative offset *after* adding the current mod.
             * Subsequent edits to orig string increase the total difference between the transformed
             *    string base index and the corresponding orig string index, hence the need for
             *    cumulative offset to be subtracted from the orig index. (mod is -ve, therefore
             *    subtraction even though it's added to the offset from the perspective of the original
             *    string)
             */
            int effectiveIndex = baseIndex - cumulativeOffset;
            int effectiveMod = transformMod;
            if (recordedInverseModifications.containsKey(effectiveIndex))
                effectiveMod -= recordedInverseModifications.get(effectiveIndex).getFirst();
            // TODO: verify that using most recent transform type is correct if there was already an edit in RIM
            recordedInverseModifications.put(effectiveIndex, new Pair<>(-effectiveMod, editType));
            cumulativeOffset -= transformMod;
        }
        if (DEBUG) {
            // Print, for each stretch between consecutive inverse-modification indexes, the
            // substring of the original text selected by the running original offset.
            int lastIndex = 0;
            int lastOrigOffset = 0;
            for (int revInd : recordedInverseModifications.keySet()) {
                int diff = revInd - lastIndex;
                String origSub = origText.substring(lastOrigOffset, lastOrigOffset + diff);
                System.err.println(lastIndex + "-" + revInd + ": " + origSub);
                lastOrigOffset = lastOrigOffset + diff + recordedInverseModifications.get(revInd).getFirst();
                lastIndex = revInd;
            }
        }
        /*
         * cleanup: remove temporary state that has now been resolved
         */
        currentOffsetModifications.clear();
        edits.clear();
    }
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 4 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class TreeView method addParseTree.

/**
 * Transforms an unscored input tree into the constituent-relation graph.
 * <p>
 * Each child of {@code spanLabeledTree} whose span is non-empty becomes a constituent (score 1.0)
 * that is linked to {@code root} by a {@code PARENT_OF_STRING} relation, after which the child's own
 * subtree is processed recursively. Children whose start is not before their end are logged at
 * debug level and skipped together with their subtrees.
 *
 * @param spanLabeledTree tree whose node labels pair a constituent label with a span
 * @param root constituent already created for the root node of {@code spanLabeledTree}
 * @param sentenceStartPosition value added to both ends of every span in the tree
 */
protected void addParseTree(Tree<Pair<String, IntPair>> spanLabeledTree, Constituent root, int sentenceStartPosition) {
    for (int i = 0; i < spanLabeledTree.getNumberOfChildren(); i++) {
        Tree<Pair<String, IntPair>> subtree = spanLabeledTree.getChild(i);
        Pair<String, IntPair> labelAndSpan = subtree.getLabel();
        IntPair relativeSpan = labelAndSpan.getSecond();
        int start = relativeSpan.getFirst() + sentenceStartPosition;
        int end = relativeSpan.getSecond() + sentenceStartPosition;
        String label = labelAndSpan.getFirst();

        if (start >= end) {
            // Ignore constituents with incorrect span bounds
            logger.debug("Constituent with incorrect span found in " + root.getViewName());
            continue;
        }

        Constituent node = createNewConstituent(start, end, label, 1.0);

        boolean isLeaf = end == start + 1 && subtree.getNumberOfChildren() == 0;
        if (isLeaf) {
            // A leaf must be a token of the sentence: compare both after treebankTokenHacks.
            String expected = treebankTokenHacks(this.getTextAnnotation().getToken(start));
            String found = treebankTokenHacks(label);
            if (!expected.equals(found)) {
                assert false : "Expecting token: " + expected + ", found " + found + " instead.";
            }
        }

        addConstituent(node);
        addRelation(new Relation(PARENT_OF_STRING, root, node, 1.0));
        addParseTree(subtree, node, sentenceStartPosition);
    }
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 5 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class TreeView method setParseTree.

/**
 * Set the parse tree of the {@code sentenceId}<sup>th</sup> sentence.
 * <p>
 * The tree is stored at index {@code sentenceId}, a root constituent (score 1.0) is created from the
 * root label and span of the span-labeled version of the tree, and the rest of the tree is added
 * below that root.
 * <p>
 * <b>Note</b>: The same TreeView cannot contain both the parse tree and the dependency tree and
 * will throw an exception if an attempt is made to set the parse tree in a view that has a
 * dependency tree.
 *
 * @param sentenceId index of the sentence the tree belongs to
 * @param tree the parse tree of that sentence
 * @throws IllegalArgumentException if this view is flagged as holding dependency trees
 */
public void setParseTree(int sentenceId, Tree<String> tree) {
    safeInitializeTrees();

    // The first tree that is set marks this view as a non-dependency view.
    if (firstTree) {
        isDependencyTree = false;
        firstTree = false;
    }
    if (isDependencyTree) {
        throw new IllegalArgumentException("Not expecting a dependency tree, but found " + tree);
    }

    trees.set(sentenceId, tree);

    final int offset = getSentenceStart(sentenceId);
    final Tree<Pair<String, IntPair>> labeledTree = ParseUtils.getSpanLabeledTree(tree);
    final Pair<String, IntPair> rootInfo = labeledTree.getLabel();
    final IntPair span = rootInfo.getSecond();

    // Spans of the labeled tree are shifted by the sentence start.
    final Constituent root = createNewConstituent(span.getFirst() + offset, span.getSecond() + offset, rootInfo.getFirst(), 1.0);
    addConstituent(root);
    addParseTree(labeledTree, root, offset);
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Aggregations

Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)59 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)35 ArrayList (java.util.ArrayList)17 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)10 Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree)10 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)7 Matcher (java.util.regex.Matcher)7 Paragraph (edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph)6 HashMap (java.util.HashMap)6 Pattern (java.util.regex.Pattern)6 TreeView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TreeView)3 SenseInstance (edu.illinois.cs.cogcomp.verbsense.jlis.SenseInstance)3 SenseStructure (edu.illinois.cs.cogcomp.verbsense.jlis.SenseStructure)3 JsonObject (com.google.gson.JsonObject)2 AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException)2 ITransformer (edu.illinois.cs.cogcomp.core.transformers.ITransformer)2 IndexedWord (edu.stanford.nlp.ling.IndexedWord)2 Annotation (edu.stanford.nlp.pipeline.Annotation)2 CoreMap (edu.stanford.nlp.util.CoreMap)2 LinkedHashSet (java.util.LinkedHashSet)2