Use of edu.illinois.cs.cogcomp.core.stats.Counter in project cogcomp-nlp by CogComp:
class VerbSenseClassifierMain, method addRequiredViews.
/**
 * Runs the pre-processing pipeline over every {@link TextAnnotation} in the dataset,
 * persisting any newly added views back to the sentence database.
 * <p>
 * Sentences whose pre-processing fails are removed from the database. A per-view counter
 * of how many annotations each new view added is logged at the end.
 *
 * @param dataset iterator over the sentences to pre-process
 */
private static void addRequiredViews(IResetableIterator<TextAnnotation> dataset) {
    Counter<String> addedViews = new Counter<>();
    log.info("Initializing pre-processor");
    TextPreProcessor.initialize();
    int count = 0;
    while (dataset.hasNext()) {
        TextAnnotation ta = dataset.next();
        // Snapshot the views present before pre-processing so additions can be detected.
        Set<String> views = new HashSet<>(ta.getAvailableViews());
        try {
            TextPreProcessor.getInstance().preProcessText(ta);
        } catch (Exception e) {
            // Remove from dataset; pass the exception so the cause is not lost in the log.
            log.error("Annotation failed, removing sentence from dataset", e);
            SentenceDBHandler.instance.removeTextAnnotation(ta);
            continue;
        }
        Set<String> newViews = new HashSet<>(ta.getAvailableViews());
        newViews.removeAll(views);
        // Only write back to the database when pre-processing actually added a view.
        if (!newViews.isEmpty()) {
            SentenceDBHandler.instance.updateTextAnnotation(ta);
            for (String s : newViews)
                addedViews.incrementCount(s);
        }
        count++;
        if (count % 1000 == 0)
            log.info("{} sentences done", count);
    }
    // Summarize which views were added and how often, via the logger (not stdout),
    // consistent with the rest of this method's output.
    log.info("New views: ");
    for (String s : addedViews.items())
        log.info("{}\t{}", s, addedViews.getCount(s));
}
Use of edu.illinois.cs.cogcomp.core.stats.Counter in project cogcomp-nlp by CogComp:
class TextAnnotationLabelCounter, method populateLabelCounts.
/**
 * Generate the target label/feature counts: one per-document counter plus a running
 * total across all documents.
 *
 * @param annotationViews map from document id to the set of views containing the
 *        annotations (constituents, relations) that will be split
 */
@Override
public void populateLabelCounts(Map<String, Set<View>> annotationViews) {
    for (Map.Entry<String, Set<View>> docEntry : annotationViews.entrySet()) {
        Counter<String> perDocCounts = new Counter<>();
        labelCounts.put(docEntry.getKey(), perDocCounts);
        for (View view : docEntry.getValue()) {
            // Relations contribute their relation name as the label.
            for (Relation rel : view.getRelations()) {
                String relLabel = rel.getRelationName();
                if (useAllLabels || labelsToConsider.contains(relLabel)) {
                    perDocCounts.incrementCount(relLabel);
                    labelTotals.incrementCount(relLabel);
                }
            }
            // Constituents contribute their own label.
            for (Constituent constituent : view.getConstituents()) {
                String conLabel = constituent.getLabel();
                if (useAllLabels || labelsToConsider.contains(conLabel)) {
                    perDocCounts.incrementCount(conLabel);
                    labelTotals.incrementCount(conLabel);
                }
            }
        }
    }
}
Use of edu.illinois.cs.cogcomp.core.stats.Counter in project cogcomp-nlp by CogComp:
class CreateTrainDevTestSplit, method main.
/**
 * split an ERE corpus with 0.7/0.1/0.2 train/dev/test proportions, trying to balance
 * all (or at least, lowest frequency) type count.
 * <p>
 * Reads the corpus with an {@code EREEventReader}, counts annotation labels per document,
 * computes the split, then writes a count summary ("countInfo.txt") and one
 * document-id list file per split into the output directory.
 *
 * @param args EreCorpusType (enum constant name), corpus root directory, output directory
 */
public static void main(String[] args) {
    if (args.length != 3) {
        System.err.println("Usage: " + NAME + " EreCorpusType corpusDir splitDir");
        System.exit(-1);
    }
    EREDocumentReader.EreCorpus ereCorpus = EREDocumentReader.EreCorpus.valueOf(args[0]);
    String corpusRoot = args[1];
    String outDir = args[2];
    // Train/dev/test fractions come from the default split configuration.
    ResourceManager fullRm = new CorpusSplitConfigurator().getDefaultConfig();
    boolean throwExceptionOnXmlParserFail = false;
    double trainFrac = fullRm.getDouble(CorpusSplitConfigurator.TRAIN_FRACTION.key);
    double devFrac = fullRm.getDouble(CorpusSplitConfigurator.DEV_FRACTION.key);
    double testFrac = fullRm.getDouble(CorpusSplitConfigurator.TEST_FRACTION.key);
    // Path corpusPath = Paths.get(corpusRoot);
    // String corpusName = corpusPath.getName(corpusPath.getNameCount() - 2).toString();
    IOUtils.mkdir(outDir);
    String outFileStem = outDir + "/";
    // Views whose annotations are counted for balancing, e.g. {ViewNames.EVENT_ERE};
    String[] viewNames = fullRm.getCommaSeparatedValues(CorpusSplitConfigurator.VIEWS_TO_CONSIDER.key);
    // Empty array means "count all labels" (first ctor arg of TextAnnotationLabelCounter below).
    String[] labelsToCount = {};
    EREMentionRelationReader reader = null;
    try {
        reader = new EREEventReader(ereCorpus, corpusRoot, throwExceptionOnXmlParserFail);
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(-1);
    }
    // Collect, per document id, the set of views to consider when counting labels.
    Map<String, XmlTextAnnotation> ereTas = new HashMap<>();
    Map<String, Set<View>> ereViews = new HashMap<>();
    while (reader.hasNext()) {
        XmlTextAnnotation xmlTextAnnotation = reader.next();
        ereTas.put(xmlTextAnnotation.getTextAnnotation().getId(), xmlTextAnnotation);
        Set<View> views = new HashSet<>();
        TextAnnotation ta = xmlTextAnnotation.getTextAnnotation();
        for (String viewName : viewNames)
            if (ta.hasView(viewName))
                views.add(ta.getView(viewName));
        ereViews.put(ta.getId(), views);
    }
    TextAnnotationLabelCounter lce = new TextAnnotationLabelCounter(labelsToCount.length == 0, labelsToCount, ereViews);
    CreateTrainDevTestSplit creator = new CreateTrainDevTestSplit(lce);
    Map<Split, Set<String>> splits = creator.getSplits(trainFrac, devFrac, testFrac);
    Map<Split, Counter<String>> splitCounts = creator.getBestRelSplitCounts();
    Map<String, Counter<String>> counts = creator.getExampleLabelCounts();
    // Summary lines: per-document label counts, then per-split counts, then grand totals.
    List<String> outLines = new ArrayList<>(splitCounts.size() + 2);
    for (String docId : counts.keySet()) {
        outLines.add(docId + ": " + printCounts(counts.get(docId)));
    }
    for (Split s : splitCounts.keySet()) {
        outLines.add(s.name() + ": " + printCounts(splitCounts.get(s)));
    }
    Counter<String> totalLabelCounts = creator.getLabelTotals();
    outLines.add("TOTALS: " + printCounts(totalLabelCounts));
    try {
        LineIO.write(outFileStem + "countInfo.txt", outLines);
    } catch (IOException e) {
        e.printStackTrace();
        System.exit(-1);
    }
    // One output file per split, listing the document ids assigned to that split.
    for (Split s : splits.keySet()) {
        List<String> ids = new ArrayList<>(splits.get(s));
        try {
            LineIO.write(outFileStem + s.name() + ".txt", ids);
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(-1);
        }
    }
}
Use of edu.illinois.cs.cogcomp.core.stats.Counter in project cogcomp-nlp by CogComp:
class CoNLLColumnFormatReader, method main.
/**
 * Demo driver: reads a CoNLL column-format file, prints each sentence's tokenized text,
 * collects verb-SRL predicate lemmas, and reports sentence/predicate totals.
 */
public static void main(String[] args) throws Exception {
    String columnFile = "02.feats";
    CoNLLColumnFormatReader reader =
            new CoNLLColumnFormatReader("PennTreebank-WSJ", "02", columnFile,
                    ViewNames.SRL_VERB, new BasicTextAnnotationBuilder());
    Counter<String> stats = new Counter<>();
    List<String> predicateLemmas = new ArrayList<>();
    for (TextAnnotation ta : reader) {
        stats.incrementCount("Sentences");
        System.out.println(ta.getTokenizedText());
        // Skip sentences without a verb-SRL view.
        if (!ta.hasView(ViewNames.SRL_VERB))
            continue;
        PredicateArgumentView srlView = (PredicateArgumentView) ta.getView(ViewNames.SRL_VERB);
        List<Constituent> sentencePredicates = srlView.getPredicates();
        stats.incrementCount("Predicates", sentencePredicates.size());
        for (Constituent predicate : sentencePredicates)
            predicateLemmas.add(predicate.getAttribute(PredicateArgumentView.LemmaIdentifier));
    }
    System.out.println((int) stats.getCount("Sentences") + " sentences");
    System.out.println((int) stats.getCount("Predicates") + " predicates");
}
Use of edu.illinois.cs.cogcomp.core.stats.Counter in project cogcomp-nlp by CogComp:
class CreateTrainDevTestSplit, method getBestSplit.
/**
 * Sample without replacement the available ids into a (frac, 1-frac) split, trying to match
 * the proportions of labels indicated.
 * <p>
 * Iterates over candidate random splits of the documents and keeps the one whose label counts
 * have the smallest weighted difference from the target counts. For larger data sets the
 * caller splits the ids into blocks and concatenates the per-block results.
 *
 * @param frac fraction of the label mass the returned split should cover
 * @param availIds ids to apportion
 * @return the best split's ids paired with its label counts; for an effectively-zero frac,
 *         an empty id set paired with a null counter
 */
private Pair<Set<String>, Counter<String>> getBestSplit(double frac, Set<String> availIds) {
    Set<String> bestSplit = new HashSet<>();
    Counter<String> splitCount = null;
    // Degenerate request (e.g. a 0.0 dev fraction): nothing to sample.
    // Use the diamond operator rather than a raw Pair to avoid unchecked warnings.
    if (frac < 0.01)
        return new Pair<>(bestSplit, splitCount);
    Counter<String> targetCounts = labelCountExtractor.findTargetCounts(frac);
    double bestDiff = LARGE_DIFF;
    // Pick a dimension to split on -- say, the one with the lowest count (and therefore the most
    // likely to be proportionally imbalanced in random split)
    List<String> targetSplitOrder = getTargetSplitOrder(targetCounts);
    // TODO: have cost weight infrequent labels more highly.
    Map<String, Double> weights = setTargetWeights(targetSplitOrder, targetCounts);
    // Randomized search: stop early if an exact match (cost 0) is found.
    for (int i = 0; i < NUM_TRIALS && bestDiff > 0; ++i) {
        Pair<Set<String>, Counter<String>> splitAndCount = getRandomSplit(availIds, targetCounts, targetSplitOrder);
        Set<String> splitIds = splitAndCount.getFirst();
        Counter<String> labelCount = splitAndCount.getSecond();
        double cost = computeCountDiff(labelCount, targetCounts, weights);
        logger.debug("best prior diff: {}; current diff: {}", bestDiff, cost);
        if (cost < bestDiff) {
            bestSplit = splitIds;
            splitCount = labelCount;
            bestDiff = cost;
        }
    }
    return new Pair<>(bestSplit, splitCount);
}
Aggregations