Search in sources :

Example 51 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class ACE_WL_Reader method parse.

/**
 * Extracts metadata and post paragraphs from the raw text of a document marked up with
 * {@code <DOCID>}, {@code <DATETIME>}, {@code <HEADLINE>} and {@code <POST>} tags.
 *
 * <p>For each metadata tag the trimmed body of its last occurrence is used (empty string when the
 * tag is absent). Every {@code <POST>} body becomes one paragraph labelled {@code "text"}; anything
 * up to and including a closing {@code </POSTDATE>} tag inside the post is dropped.
 *
 * <p>Note that the patterns are compiled without DOTALL, so a tag body only matches when it does
 * not span a line terminator.
 *
 * @param content the raw document text, including tags
 * @param contentRemovingTags the same document with tags removed; used to compute each paragraph's
 *        {@code offsetFilterTags} ({@code -1} when the paragraph text cannot be located in it)
 * @return a pair of (paragraphs in document order, metadata map)
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags) {
    List<Pair<String, Paragraph>> paragraphs = new ArrayList<>();
    Map<String, String> metadata = new HashMap<>();

    // NOTE(review): the document id is stored under DocumentCreationTime and is overwritten by the
    // DATETIME value right below, so it never reaches the caller. This looks like a copy-paste slip,
    // but the intended key is not visible from here -- confirm and fix against DocumentMetadata.
    metadata.put(DocumentMetadata.DocumentCreationTime, lastTagContent(content, "DOCID"));
    metadata.put(DocumentMetadata.DocumentCreationTime, lastTagContent(content, "DATETIME"));
    metadata.put(DocumentMetadata.HeadLine, lastTagContent(content, "HEADLINE"));

    final String postDateClose = "</POSTDATE>";
    Matcher postMatcher = Pattern.compile("<POST>(.*?)</POST>").matcher(content);
    while (postMatcher.find()) {
        String text = postMatcher.group(1).trim();
        // Only strip the post-date header when it is actually present; indexOf() == -1 would
        // otherwise chop (or overrun) the first characters of the post.
        int postDateEnd = text.indexOf(postDateClose);
        if (postDateEnd >= 0) {
            text = text.substring(postDateEnd + postDateClose.length()).trim();
        }
        // Search from the start of this match so that a repeated post body is not assigned the
        // offset of an earlier, identical post.
        int offset = content.indexOf(text, postMatcher.start(1));
        Paragraph paragraph = new Paragraph(offset, text);
        Pair<String, Paragraph> labelled = new Pair<String, Paragraph>("text", paragraph);
        paragraphs.add(labelled);
    }

    // Locate each paragraph in the tag-stripped text, moving the cursor past every match so
    // paragraphs are aligned in document order.
    int searchFrom = 0;
    for (Pair<String, Paragraph> entry : paragraphs) {
        Paragraph paragraph = entry.getSecond();
        int filteredOffset = contentRemovingTags.indexOf(paragraph.content, searchFrom);
        paragraph.offsetFilterTags = filteredOffset;
        if (filteredOffset >= 0) {
            searchFrom = filteredOffset + paragraph.content.length();
        }
    }

    if (isDebug) {
        for (Pair<String, Paragraph> entry : paragraphs) {
            Paragraph paragraph = entry.getSecond();
            int length = paragraph.content.length();
            logger.info(entry.getFirst() + "--> " + paragraph.content);
            // Guard against unresolved offsets so debug output cannot crash parsing.
            if (paragraph.offset >= 0) {
                logger.info(content.substring(paragraph.offset, paragraph.offset + length));
            }
            if (paragraph.offsetFilterTags >= 0) {
                logger.info(contentRemovingTags.substring(paragraph.offsetFilterTags, paragraph.offsetFilterTags + length));
            }
            logger.info("\n");
        }
    }
    return new Pair<>(paragraphs, metadata);
}

/**
 * Returns the trimmed body of the last {@code <tag>...</tag>} occurrence in {@code content}, or the
 * empty string when the tag does not occur.
 */
private static String lastTagContent(String content, String tag) {
    Matcher matcher = Pattern.compile("<" + tag + ">(.*?)</" + tag + ">").matcher(content);
    String value = "";
    while (matcher.find()) {
        value = matcher.group(1).trim();
    }
    return value;
}
Also used : Pattern(java.util.regex.Pattern) HashMap(java.util.HashMap) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair) Paragraph(edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph)

Example 52 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class CreateTrainDevTestSplit method getBestSplit.

/**
 * Searches for the set of document ids whose combined label counts are closest to the target
 * counts for the requested fraction of the corpus.
 *
 * <p>The search grows candidate sets one document per round. Each round extends the seed sets
 * carried over from the previous round (see {@code initializeCurrentRoundCounts}) with every
 * available document id, scores each new set with {@code computeCountDiff} against the target
 * counts, and keeps the trimmed best candidates (see {@code trimQueue}) both as seeds for the next
 * round and in an overall queue. When {@code stopEarly} is set, the search ends at the first round
 * that does not improve on the best difference seen so far.
 *
 * @param frac fraction of the data the split should contain; values below 0.01 yield an empty split
 * @param availIds ids of the documents that may be placed in the split
 * @return the chosen document ids with their summed label counts; an empty id set paired with a
 *         {@code null} counter when {@code frac < 0.01} or when no candidate was produced
 */
private Pair<Set<String>, Counter<String>> getBestSplit(double frac, Set<String> availIds) {
    Set<String> split = new HashSet<>();
    Counter<String> splitCount = null;
    if (frac < 0.01)
        return new Pair<Set<String>, Counter<String>>(split, splitCount);
    Map<String, Double> targetCounts = findTargetCounts(frac);
    double bestDiff = LARGE_DIFF;
    /*
     * Fill in a table of partial counts. Naive, so size is approx 2 * (n choose k)
     * as we keep the last row to save some computation.
     * Stop as soon as we have a round where we don't improve the best diff, as adding more
     * documents will not reduce the count differences.
     */
    PriorityQueue<QueueElement> oldBestSplitsOfSizeK = new PriorityQueue<>(BEAM_SIZE);
    PriorityQueue<QueueElement> bestSplits = new PriorityQueue<>(BEAM_SIZE);
    // num = number of documents in the sets considered this round
    for (int num = 1; num <= availIds.size(); ++num) {
        logger.info("Round {}...", num);
        double bestRoundDiff = LARGE_DIFF;
        boolean isBetterRound = false;
        // seed combinations (and their counts) carried over from the previous round
        Map<Set<String>, Counter<String>> oldCombCounts = initializeCurrentRoundCounts(oldBestSplitsOfSizeK);
        /*
         * compute NUM_DOCS * BEAM_SIZE possible splits.
         */
        Map<Set<String>, Counter<String>> docCombinationCounts = new HashMap<>();
        for (Map.Entry<Set<String>, Counter<String>> seed : oldCombCounts.entrySet()) {
            Set<String> keyComb = seed.getKey();
            Counter<String> keyCount = seed.getValue();
            for (String docId : availIds) {
                Set<String> newComb = new HashSet<>();
                newComb.addAll(keyComb);
                newComb.add(docId);
                // naive implementation does not consider order, so avoid duplication
                // (this also skips docIds already contained in the seed combination)
                if (!oldCombCounts.containsKey(newComb)) {
                    // the counts for the current docId
                    Counter<String> docLabelCount = labelCounts.get(docId);
                    // build a fresh counter so combinations never share (and corrupt) counts
                    Counter<String> newCombLabelCount = new Counter<>();
                    // initialize newCombLabelCount with count from base id combination
                    for (String label : keyCount.keySet()) {
                        newCombLabelCount.incrementCount(label, keyCount.getCount(label));
                    }
                    // add current docId label counts
                    for (String label : docLabelCount.items()) {
                        newCombLabelCount.incrementCount(label, docLabelCount.getCount(label));
                    }
                    docCombinationCounts.put(newComb, newCombLabelCount);
                }
            }
        }
        // want explicit generation because we will use these as seeds in the next round
        PriorityQueue<QueueElement> bestSplitsOfSizeK = new PriorityQueue<>();
        for (Map.Entry<Set<String>, Counter<String>> candidate : docCombinationCounts.entrySet()) {
            double diff = computeCountDiff(candidate.getValue(), targetCounts);
            bestSplitsOfSizeK.add(new QueueElement(diff, candidate.getKey(), candidate.getValue()));
            if (diff < bestRoundDiff) {
                bestRoundDiff = diff;
                if (bestRoundDiff < bestDiff) {
                    isBetterRound = true;
                    bestDiff = bestRoundDiff;
                }
            }
        }
        logger.info("current round best diff is {}", bestRoundDiff);
        if (stopEarly && !isBetterRound) {
            logger.warn("Stopping after round {}", num);
            logger.warn("current round best diff is {}", bestRoundDiff);
            break;
        }
        // store best fixed-size splits
        oldBestSplitsOfSizeK = bestSplitsOfSizeK;
        // track best splits overall
        bestSplits.addAll(bestSplitsOfSizeK);
        oldBestSplitsOfSizeK = trimQueue(oldBestSplitsOfSizeK);
        bestSplits = trimQueue(bestSplits);
    }
    QueueElement bestSplit = bestSplits.poll();
    if (bestSplit == null) {
        // No round produced a candidate (e.g. availIds is empty): return the same empty result as
        // the frac < 0.01 case instead of failing with a NullPointerException.
        return new Pair<Set<String>, Counter<String>>(split, splitCount);
    }
    return new Pair<Set<String>, Counter<String>>(bestSplit.docIdSet, bestSplit.labelCounter);
}
Also used : Counter(edu.illinois.cs.cogcomp.core.stats.Counter) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 53 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class GoldLabel method getArgument.

/**
 * Builds the constituents for this label from {@code propSpanInfo}.
 *
 * <p>{@code propSpanInfo} is split on {@code '*'} and then on {@code ','}; every non-empty piece is
 * resolved to a (non-terminal, span) pair via {@code getSpan}. Spans with a negative start or with
 * start &gt;= end are discarded. The remaining spans are ordered by start position and, when none of
 * them came from a non-terminal starting with {@code "WH"} and {@code mergeContiguousCArgs} is set,
 * passed through {@code mergeCArgs}.
 *
 * <p>Labelling: a span flagged as "WH" gets the {@code "R-"} prefix when there is more than one
 * span; otherwise the first such span gets the plain label and every later one the {@code "C-"}
 * prefix. If {@code h} is non-null it is attached to each constituent as the
 * {@code HyphenTagInfo} attribute.
 *
 * @param ta the text annotation the constituents belong to
 * @param viewName name of the view the constituents are created for
 * @param yield leaves of the span-labeled parse tree, handed to {@code getSpan}
 * @param mergeContiguousCArgs whether spans may be merged via {@code mergeCArgs}
 * @return the constituents, in order of span start
 */
List<Constituent> getArgument(TextAnnotation ta, String viewName, List<Tree<Pair<String, IntPair>>> yield, boolean mergeContiguousCArgs) {
    List<Pair<IntPair, Boolean>> spans = new ArrayList<>();
    boolean sawWhSpan = false;
    for (String chunk : propSpanInfo.split("\\*")) {
        if (chunk.isEmpty())
            continue;
        for (String pointer : chunk.split(",")) {
            if (pointer.isEmpty())
                continue;
            Pair<String, IntPair> resolved = getSpan(pointer, yield);
            String nonTerminal = resolved.getFirst();
            IntPair span = resolved.getSecond();
            boolean wellFormed = span.getFirst() >= 0 && span.getFirst() < span.getSecond();
            if (!wellFormed)
                continue;
            boolean isWh = nonTerminal.startsWith("WH");
            if (isWh) {
                sawWhSpan = true;
            }
            spans.add(new Pair<>(span, isWh));
        }
    }

    // Order by span start only; spans with equal starts compare as equal.
    Collections.sort(spans, new Comparator<Pair<IntPair, Boolean>>() {

        @Override
        public int compare(Pair<IntPair, Boolean> left, Pair<IntPair, Boolean> right) {
            return Integer.compare(left.getFirst().getFirst(), right.getFirst().getFirst());
        }
    });

    if (mergeContiguousCArgs && !sawWhSpan) {
        spans = mergeCArgs(spans);
    }

    List<Constituent> result = new ArrayList<>();
    boolean plainLabelUsed = false;
    for (Pair<IntPair, Boolean> entry : spans) {
        String spanLabel;
        if (entry.getSecond() && spans.size() > 1) {
            spanLabel = "R-" + this.label;
        } else if (!plainLabelUsed) {
            // first non-"R-" span keeps the bare label
            spanLabel = this.label;
            plainLabelUsed = true;
        } else {
            spanLabel = "C-" + this.label;
        }
        IntPair span = entry.getFirst();
        Constituent constituent = new Constituent(spanLabel, viewName, ta, span.getFirst(), span.getSecond());
        if (h != null) {
            constituent.addAttribute(AbstractSRLAnnotationReader.HyphenTagInfo, h);
        }
        result.add(constituent);
    }
    return result;
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 54 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class GoldLabel method addAnnotation.

/**
 * Creates a predicate-argument view for {@code ta} from the gold fields registered under its id
 * and attaches it as {@code srlViewName} when it contains at least one predicate.
 *
 * <p>Only the first predicate seen for a given start token is kept, and within one predicate each
 * span is used for at most one argument (later arguments on an already-used span are dropped).
 *
 * @param ta the text annotation to add the view to; its {@code PARSE_GOLD} view (tree 0) supplies
 *        the span-labeled yield used to resolve predicates and arguments
 */
private void addAnnotation(TextAnnotation ta) {
    Tree<String> goldParse = ParseUtils.getParseTree(ViewNames.PARSE_GOLD, ta, 0);
    Tree<Pair<String, IntPair>> labeledParse = ParseUtils.getSpanLabeledTree(goldParse);
    List<Tree<Pair<String, IntPair>>> yield = labeledParse.getYield();
    PredicateArgumentView view = new PredicateArgumentView(srlViewName, "AnnotatedTreebank", ta, 1.0);
    Set<Integer> predicateStarts = new HashSet<>();
    for (Fields fields : goldFields.get(ta.getId())) {
        Constituent predicate = fields.createPredicate(ta, srlViewName, yield);
        // add() returns false when a predicate starting at this token was already recorded
        if (!predicateStarts.add(predicate.getStartSpan()))
            continue;
        List<Constituent> args = new ArrayList<>();
        List<String> labels = new ArrayList<>();
        List<Double> scores = new ArrayList<>();
        // Enforce the One-Argument-Per-Span constraint, even if the data says otherwise.
        Set<IntPair> usedSpans = new HashSet<>();
        for (GoldLabel goldLabel : fields.getGoldLabels()) {
            List<Constituent> unseen = new ArrayList<>();
            for (Constituent candidate : goldLabel.getArgument(ta, srlViewName, yield, mergeContiguousCArgs)) {
                if (usedSpans.add(candidate.getSpan())) {
                    unseen.add(candidate);
                }
            }
            addArguments(ta, predicate, args, labels, scores, goldLabel, unseen);
        }
        String[] labelArray = labels.toArray(new String[labels.size()]);
        view.addPredicateArguments(predicate, args, labelArray, ArrayUtilities.asDoubleArray(scores));
    }
    if (view.getPredicates().size() > 0)
        ta.addView(srlViewName, view);
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) PredicateArgumentView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.PredicateArgumentView) Tree(edu.illinois.cs.cogcomp.core.datastructures.trees.Tree) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 55 with Pair

use of edu.illinois.cs.cogcomp.core.datastructures.Pair in project cogcomp-nlp by CogComp.

the class GoldLabel method mergeCArgs.

/**
 * Collapses runs of adjacent spans into single spans.
 *
 * <p>Walks {@code spans} in the given order; a span whose start equals the end of the span being
 * accumulated extends it, anything else closes the accumulated span and starts a new one. The
 * flag of a merged span is the logical AND of the flags of its parts. Lists with at most one
 * element are returned unchanged (same instance).
 *
 * @param spans (span, flag) pairs; adjacency is only detected between consecutive entries, so the
 *        caller is expected to pass them ordered by start
 * @return a new list with adjacent spans merged, never longer than {@code spans}
 */
protected List<Pair<IntPair, Boolean>> mergeCArgs(List<Pair<IntPair, Boolean>> spans) {
    if (spans.size() <= 1)
        return spans;
    List<Pair<IntPair, Boolean>> merged = new ArrayList<>();
    IntPair current = null;
    boolean currentFlag = true;
    for (Pair<IntPair, Boolean> candidate : spans) {
        IntPair span = candidate.getFirst();
        if (current != null && span.getFirst() == current.getSecond()) {
            // touches the accumulated span: extend it and combine the flags
            current = new IntPair(current.getFirst(), span.getSecond());
            currentFlag &= candidate.getSecond();
            continue;
        }
        if (current != null) {
            // gap (or overlap): flush what has been accumulated so far
            merged.add(new Pair<>(current, currentFlag));
        }
        current = span;
        currentFlag = candidate.getSecond();
    }
    // flush the final accumulated span
    merged.add(new Pair<>(current, currentFlag));
    assert merged.size() <= spans.size();
    assert spans.isEmpty() || merged.size() > 0;
    return merged;
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Aggregations

Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)59 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)35 ArrayList (java.util.ArrayList)17 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)10 Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree)10 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)7 Matcher (java.util.regex.Matcher)7 Paragraph (edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.Paragraph)6 HashMap (java.util.HashMap)6 Pattern (java.util.regex.Pattern)6 TreeView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TreeView)3 SenseInstance (edu.illinois.cs.cogcomp.verbsense.jlis.SenseInstance)3 SenseStructure (edu.illinois.cs.cogcomp.verbsense.jlis.SenseStructure)3 JsonObject (com.google.gson.JsonObject)2 AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException)2 ITransformer (edu.illinois.cs.cogcomp.core.transformers.ITransformer)2 IndexedWord (edu.stanford.nlp.ling.IndexedWord)2 Annotation (edu.stanford.nlp.pipeline.Annotation)2 CoreMap (edu.stanford.nlp.util.CoreMap)2 LinkedHashSet (java.util.LinkedHashSet)2