Search in sources :

Example 1 with Counter

use of edu.illinois.cs.cogcomp.core.stats.Counter in project cogcomp-nlp by CogComp.

the class VerbSenseClassifierMain method addRequiredViews.

/**
 * Runs the text pre-processor over every annotation in {@code dataset} and records which
 * views it added.
 *
 * <p>For each annotation, the set of available views is captured before and after
 * pre-processing. If new views appeared, the annotation is written back through
 * {@code SentenceDBHandler} and each new view name is counted. If pre-processing throws,
 * the annotation is removed through {@code SentenceDBHandler} and does not count towards
 * the progress total. A per-view summary is printed to standard output at the end.
 *
 * @param dataset iterator over the annotations to pre-process
 */
private static void addRequiredViews(IResetableIterator<TextAnnotation> dataset) {
    Counter<String> addedViews = new Counter<>();
    log.info("Initializing pre-processor");
    TextPreProcessor.initialize();
    int count = 0;
    while (dataset.hasNext()) {
        TextAnnotation ta = dataset.next();
        // Snapshot of the views present before pre-processing, used to detect additions.
        Set<String> views = new HashSet<>(ta.getAvailableViews());
        try {
            TextPreProcessor.getInstance().preProcessText(ta);
        } catch (Exception e) {
            // Remove from dataset; pass the exception along so the cause is not lost.
            log.error("Annotation failed, removing sentence from dataset", e);
            SentenceDBHandler.instance.removeTextAnnotation(ta);
            continue;
        }
        Set<String> newViews = new HashSet<>(ta.getAvailableViews());
        newViews.removeAll(views);
        if (!newViews.isEmpty()) {
            SentenceDBHandler.instance.updateTextAnnotation(ta);
            for (String s : newViews) {
                addedViews.incrementCount(s);
            }
        }
        count++;
        if (count % 1000 == 0) {
            System.out.println(count + " sentences done");
        }
    }
    System.out.println("New views: ");
    for (String s : addedViews.items()) {
        System.out.println(s + "\t" + addedViews.getCount(s));
    }
}
Also used : Counter(edu.illinois.cs.cogcomp.core.stats.Counter) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)

Example 2 with Counter

use of edu.illinois.cs.cogcomp.core.stats.Counter in project cogcomp-nlp by CogComp.

the class TextAnnotationLabelCounter method populateLabelCounts.

/**
 * Tally label occurrences per document and across all documents.
 *
 * <p>For every view of every document, each relation name and each constituent label is
 * counted, provided that {@code useAllLabels} is set or {@code labelsToConsider} contains
 * the label. A fresh per-document counter is registered in {@code labelCounts} under the
 * document id, and the same increments are applied to {@code labelTotals}.
 *
 * @param annotationViews map from doc id to set of views containing the annotations
 *                        (constituents, relations) that will be split.
 */
@Override
public void populateLabelCounts(Map<String, Set<View>> annotationViews) {
    for (Map.Entry<String, Set<View>> docEntry : annotationViews.entrySet()) {
        Counter<String> perDocCounts = new Counter<>();
        labelCounts.put(docEntry.getKey(), perDocCounts);
        for (View view : docEntry.getValue()) {
            // Relations first, then constituents -- same order as the counts are reported.
            for (Relation relation : view.getRelations()) {
                String relationName = relation.getRelationName();
                if (!useAllLabels && !labelsToConsider.contains(relationName)) {
                    continue;
                }
                perDocCounts.incrementCount(relationName);
                labelTotals.incrementCount(relationName);
            }
            for (Constituent constituent : view.getConstituents()) {
                String constituentLabel = constituent.getLabel();
                if (!useAllLabels && !labelsToConsider.contains(constituentLabel)) {
                    continue;
                }
                perDocCounts.incrementCount(constituentLabel);
                labelTotals.incrementCount(constituentLabel);
            }
        }
    }
}
Also used : Relation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Relation) Counter(edu.illinois.cs.cogcomp.core.stats.Counter) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 3 with Counter

use of edu.illinois.cs.cogcomp.core.stats.Counter in project cogcomp-nlp by CogComp.

the class CreateTrainDevTestSplit method main.

/**
 * Split an ERE corpus into train/dev/test sets, trying to balance
 *    all (or at least, lowest frequency) type count.
 *
 * <p>The train/dev/test fractions and the views to consider are read from the default
 * {@link CorpusSplitConfigurator} configuration. Two kinds of output are written under
 * {@code splitDir}: {@code countInfo.txt}, holding per-document, per-split and total label
 * counts, and one {@code <split name>.txt} file per split listing its document ids.
 * The process exits with status -1 on bad usage, reader construction failure, or a
 * write failure.
 *
 * @param args {@code EreCorpusType corpusDir splitDir}
 */
public static void main(String[] args) {
    if (args.length != 3) {
        System.err.println("Usage: " + NAME + " EreCorpusType corpusDir splitDir");
        System.exit(-1);
    }
    EREDocumentReader.EreCorpus ereCorpus = EREDocumentReader.EreCorpus.valueOf(args[0]);
    String corpusRoot = args[1];
    String outDir = args[2];
    ResourceManager fullRm = new CorpusSplitConfigurator().getDefaultConfig();
    boolean throwExceptionOnXmlParserFail = false;
    double trainFrac = fullRm.getDouble(CorpusSplitConfigurator.TRAIN_FRACTION.key);
    double devFrac = fullRm.getDouble(CorpusSplitConfigurator.DEV_FRACTION.key);
    double testFrac = fullRm.getDouble(CorpusSplitConfigurator.TEST_FRACTION.key);
    IOUtils.mkdir(outDir);
    String outFileStem = outDir + "/";
    String[] viewNames = fullRm.getCommaSeparatedValues(CorpusSplitConfigurator.VIEWS_TO_CONSIDER.key);
    // An empty list makes the label counter consider every label (see the constructor call below).
    String[] labelsToCount = {};
    EREMentionRelationReader reader = null;
    try {
        reader = new EREEventReader(ereCorpus, corpusRoot, throwExceptionOnXmlParserFail);
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(-1);
    }
    // Collect, per document id, the configured views that the document actually has.
    Map<String, Set<View>> ereViews = new HashMap<>();
    while (reader.hasNext()) {
        XmlTextAnnotation xmlTextAnnotation = reader.next();
        TextAnnotation ta = xmlTextAnnotation.getTextAnnotation();
        Set<View> views = new HashSet<>();
        for (String viewName : viewNames) {
            if (ta.hasView(viewName)) {
                views.add(ta.getView(viewName));
            }
        }
        ereViews.put(ta.getId(), views);
    }
    TextAnnotationLabelCounter lce = new TextAnnotationLabelCounter(labelsToCount.length == 0, labelsToCount, ereViews);
    CreateTrainDevTestSplit creator = new CreateTrainDevTestSplit(lce);
    Map<Split, Set<String>> splits = creator.getSplits(trainFrac, devFrac, testFrac);
    Map<Split, Counter<String>> splitCounts = creator.getBestRelSplitCounts();
    Map<String, Counter<String>> counts = creator.getExampleLabelCounts();
    // One line per document, one per split, plus the TOTALS line.
    List<String> outLines = new ArrayList<>(counts.size() + splitCounts.size() + 1);
    for (Map.Entry<String, Counter<String>> docCounts : counts.entrySet()) {
        outLines.add(docCounts.getKey() + ": " + printCounts(docCounts.getValue()));
    }
    for (Map.Entry<Split, Counter<String>> perSplit : splitCounts.entrySet()) {
        outLines.add(perSplit.getKey().name() + ": " + printCounts(perSplit.getValue()));
    }
    Counter<String> totalLabelCounts = creator.getLabelTotals();
    outLines.add("TOTALS: " + printCounts(totalLabelCounts));
    try {
        LineIO.write(outFileStem + "countInfo.txt", outLines);
    } catch (IOException e) {
        e.printStackTrace();
        System.exit(-1);
    }
    for (Map.Entry<Split, Set<String>> split : splits.entrySet()) {
        List<String> ids = new ArrayList<>(split.getValue());
        try {
            LineIO.write(outFileStem + split.getKey().name() + ".txt", ids);
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(-1);
        }
    }
}
Also used : XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) EREDocumentReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREDocumentReader) Counter(edu.illinois.cs.cogcomp.core.stats.Counter) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) ResourceManager(edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager) IOException(java.io.IOException) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) IOException(java.io.IOException) EREMentionRelationReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader) EREEventReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREEventReader)

Example 4 with Counter

use of edu.illinois.cs.cogcomp.core.stats.Counter in project cogcomp-nlp by CogComp.

the class CoNLLColumnFormatReader method main.

/**
 * Small driver that reads the column-format file {@code 02.feats}, prints the tokenized
 * text of every sentence, and reports how many sentences and predicates were seen.
 *
 * @param args unused
 * @throws Exception propagated from the reader
 */
public static void main(String[] args) throws Exception {
    final String featureFile = "02.feats";
    CoNLLColumnFormatReader reader = new CoNLLColumnFormatReader("PennTreebank-WSJ", "02", featureFile, ViewNames.SRL_VERB, new BasicTextAnnotationBuilder());
    Counter<String> tallies = new Counter<>();
    List<String> lemmas = new ArrayList<>();
    for (TextAnnotation sentence : reader) {
        tallies.incrementCount("Sentences");
        System.out.println(sentence.getTokenizedText());
        // Only sentences that carry the verb SRL view contribute predicates.
        if (sentence.hasView(ViewNames.SRL_VERB)) {
            PredicateArgumentView srlView = (PredicateArgumentView) sentence.getView(ViewNames.SRL_VERB);
            List<Constituent> sentencePredicates = srlView.getPredicates();
            tallies.incrementCount("Predicates", sentencePredicates.size());
            for (Constituent predicate : sentencePredicates) {
                lemmas.add(predicate.getAttribute(PredicateArgumentView.LemmaIdentifier));
            }
        }
    }
    int sentenceTotal = (int) tallies.getCount("Sentences");
    System.out.println(sentenceTotal + " sentences");
    int predicateTotal = (int) tallies.getCount("Predicates");
    System.out.println(predicateTotal + " predicates");
}
Also used : BasicTextAnnotationBuilder(edu.illinois.cs.cogcomp.annotation.BasicTextAnnotationBuilder) Counter(edu.illinois.cs.cogcomp.core.stats.Counter) ArrayList(java.util.ArrayList)

Example 5 with Counter

use of edu.illinois.cs.cogcomp.core.stats.Counter in project cogcomp-nlp by CogComp.

the class CreateTrainDevTestSplit method getBestSplit.

/**
 * Sample without replacement from the available ids a subset intended to hold {@code frac}
 *    of the data, trying to match the proportions of labels indicated by the target counts.
 *
 * <p>Runs up to {@code NUM_TRIALS} random candidate splits, scores each by its weighted
 * difference from the target label counts, and keeps the lowest-cost one. The search stops
 * early once a candidate reaches a cost of zero.
 *
 * @param frac fraction of the data the split should receive; values below 0.01 yield an
 *             empty split without running any trial
 * @param availIds ids to apportion
 * @return the chosen ids paired with their label counts. The id set is empty and the
 *         counter is {@code null} when {@code frac < 0.01} or when no trial scored below
 *         {@code LARGE_DIFF}.
 */
private Pair<Set<String>, Counter<String>> getBestSplit(double frac, Set<String> availIds) {
    Set<String> bestSplit = new HashSet<>();
    Counter<String> splitCount = null;
    if (frac < 0.01) {
        return new Pair<>(bestSplit, splitCount);
    }
    Counter<String> targetCounts = labelCountExtractor.findTargetCounts(frac);
    double bestDiff = LARGE_DIFF;
    // Pick a dimension to split on -- say, the one with the lowest count (and therefore the most
    // likely to be proportionally imbalanced in random split)
    List<String> targetSplitOrder = getTargetSplitOrder(targetCounts);
    // TODO: have cost weight infrequent labels more highly.
    Map<String, Double> weights = setTargetWeights(targetSplitOrder, targetCounts);
    for (int i = 0; i < NUM_TRIALS && bestDiff > 0; ++i) {
        Pair<Set<String>, Counter<String>> splitAndCount = getRandomSplit(availIds, targetCounts, targetSplitOrder);
        Set<String> splitIds = splitAndCount.getFirst();
        Counter<String> labelCount = splitAndCount.getSecond();
        double cost = computeCountDiff(labelCount, targetCounts, weights);
        logger.debug("best prior diff: {}; current diff: {}", bestDiff, cost);
        if (cost < bestDiff) {
            bestSplit = splitIds;
            splitCount = labelCount;
            bestDiff = cost;
        }
    }
    return new Pair<>(bestSplit, splitCount);
}
Also used : Counter(edu.illinois.cs.cogcomp.core.stats.Counter) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Aggregations

Counter (edu.illinois.cs.cogcomp.core.stats.Counter)7 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)4 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)3 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)3 BasicTextAnnotationBuilder (edu.illinois.cs.cogcomp.annotation.BasicTextAnnotationBuilder)1 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)1 QueryableList (edu.illinois.cs.cogcomp.core.datastructures.QueryableList)1 Relation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Relation)1 XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)1 ResourceManager (edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager)1 EREDocumentReader (edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREDocumentReader)1 EREEventReader (edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREEventReader)1 EREMentionRelationReader (edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader)1 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 LinkedHashSet (java.util.LinkedHashSet)1 Test (org.junit.Test)1