Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the cogcomp-nlp project by CogComp.
From the class ACE_WL_Reader, method parse.
/**
 * Parses a raw ACE weblog (WL) SGML document into its paragraphs and document-level metadata.
 * <p>
 * The DOCID, DATETIME and HEADLINE fields are pulled out of the raw markup; each
 * {@code <POST>...</POST>} body (with its {@code <POSTDATE>} prefix stripped) becomes one
 * {@link Paragraph}, carrying both its character offset in the raw {@code content} and its
 * offset in the tag-stripped {@code contentRemovingTags}.
 *
 * @param content             the raw document text, tags included
 * @param contentRemovingTags the same document with SGML tags removed; used to compute
 *                            tag-free offsets for each paragraph
 * @return a pair of (labeled paragraphs, metadata map)
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags) {
    List<Pair<String, Paragraph>> paragraphs = new ArrayList<>();
    Map<String, String> metadata = new HashMap<>();

    String docID = lastTagValue(content, "DOCID");
    // BUG FIX: the document id was previously stored under DocumentCreationTime and then
    // immediately overwritten by the DATETIME value below, losing the id entirely.
    metadata.put(DocumentMetadata.DocumentID, docID);

    String dateTime = lastTagValue(content, "DATETIME");
    metadata.put(DocumentMetadata.DocumentCreationTime, dateTime);

    String headLine = lastTagValue(content, "HEADLINE");
    metadata.put(DocumentMetadata.HeadLine, headLine);

    // Each <POST> body is one paragraph; drop the leading <POSTDATE>...</POSTDATE> header.
    Pattern pattern = Pattern.compile("<POST>(.*?)</POST>");
    Matcher matcher = pattern.matcher(content);
    while (matcher.find()) {
        String text = matcher.group(1).trim();
        text = text.substring(text.indexOf("</POSTDATE>") + "</POSTDATE>".length()).trim();
        int offsetInRaw = content.indexOf(text);
        Paragraph para = new Paragraph(offsetInRaw, text);
        paragraphs.add(new Pair<String, Paragraph>("text", para));
    }

    // Compute each paragraph's offset in the tag-stripped text. The running 'index' ensures
    // repeated paragraph strings resolve to successive occurrences, not the first one.
    int index = 0;
    for (int i = 0; i < paragraphs.size(); ++i) {
        Paragraph para = paragraphs.get(i).getSecond();
        para.offsetFilterTags = contentRemovingTags.indexOf(para.content, index);
        index += para.content.length();
    }

    // Debug dump of every paragraph with its raw and tag-stripped slices.
    // (The original contained a second, fully redundant copy of this loop; removed.)
    if (isDebug) {
        for (int i = 0; i < paragraphs.size(); ++i) {
            Paragraph para = paragraphs.get(i).getSecond();
            logger.info(paragraphs.get(i).getFirst() + "--> " + para.content);
            logger.info(content.substring(para.offset, para.offset + para.content.length()));
            logger.info(contentRemovingTags.substring(para.offsetFilterTags, para.offsetFilterTags + para.content.length()));
            logger.info("\n");
        }
    }
    return new Pair<>(paragraphs, metadata);
}

/**
 * Returns the trimmed contents of the LAST occurrence of {@code <tag>...</tag>} in
 * {@code content}, or the empty string when the tag is absent (matches the original
 * scan-to-last-match behavior).
 */
private static String lastTagValue(String content, String tag) {
    Pattern pattern = Pattern.compile("<" + tag + ">(.*?)</" + tag + ">");
    Matcher matcher = pattern.matcher(content);
    String value = "";
    while (matcher.find()) {
        value = matcher.group(1).trim();
    }
    return value;
}
Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the cogcomp-nlp project by CogComp.
From the class CreateTrainDevTestSplit, method getBestSplit.
/** iterate over candidate sets of documents; find smallest diff of relation counts with target counts */
/**
 * Performs a beam search over document-id combinations to find the subset whose per-relation
 * label counts best match {@code frac} of the corpus totals.
 * <p>
 * Round {@code k} extends each of the best size-(k-1) combinations by one document; search
 * stops early (when {@code stopEarly} is set) once a round fails to improve the best diff,
 * since adding documents can only grow the counts further past the targets.
 *
 * @param frac     target fraction of total relation counts for this split; values below 0.01
 *                 short-circuit to an empty split
 * @param availIds document ids still available for assignment
 * @return the best-scoring id set and its label counter; (empty set, null counter) when
 *         {@code frac} is negligible or {@code availIds} is empty
 */
private Pair<Set<String>, Counter<String>> getBestSplit(double frac, Set<String> availIds) {
    Set<String> split = new HashSet<>();
    Counter<String> splitCount = null;
    if (frac < 0.01)
        return new Pair<>(split, splitCount); // was raw 'new Pair(...)': unchecked warning
    Map<String, Double> targetCounts = findTargetCounts(frac);
    double bestDiff = LARGE_DIFF;
    /*
     * fill in a table of partial counts. Naive, so size is approx 2 * (n choose k)
     * as we keep the last row to save some computation.
     * stop as soon as we have a round where we don't improve the bestRoundDiff, as adding more documents
     * will not reduce the count differences.
     */
    PriorityQueue<QueueElement> oldBestSplitsOfSizeK = new PriorityQueue<>(BEAM_SIZE);
    PriorityQueue<QueueElement> bestSplits = new PriorityQueue<>(BEAM_SIZE);
    // number of documents in the sets considered
    for (int num = 1; num <= availIds.size(); ++num) {
        logger.info("Round {}...", num);
        double bestRoundDiff = LARGE_DIFF;
        // store new combinations generated this round
        boolean isBetterRound = false;
        // each document to that of each previously existing id combination
        // make sure to copy counters to avoid shared references across combinations (will corrupt counts)
        Map<Set<String>, Counter<String>> oldCombCounts = initializeCurrentRoundCounts(oldBestSplitsOfSizeK);
        /*
         * compute NUM_DOCS * BEAM_SIZE possible splits.
         */
        Map<Set<String>, Counter<String>> docCombinationCounts = new HashMap<>();
        for (Set<String> keyComb : oldCombCounts.keySet()) {
            Counter<String> keyCount = oldCombCounts.get(keyComb);
            for (String docId : availIds) {
                Set<String> newComb = new HashSet<>();
                newComb.addAll(keyComb);
                newComb.add(docId);
                // naive implementation does not consider order, so avoid duplication
                if (!oldCombCounts.containsKey(newComb)) {
                    // the counts for the current docId
                    Counter<String> docLabelCount = labelCounts.get(docId);
                    Counter<String> newCombLabelCount = new Counter<>();
                    // initialize newCombLabelCount with count from base id combination
                    for (String label : keyCount.keySet())
                        newCombLabelCount.incrementCount(label, keyCount.getCount(label));
                    // add current docId label counts
                    for (String label : docLabelCount.items()) {
                        newCombLabelCount.incrementCount(label, docLabelCount.getCount(label));
                    }
                    docCombinationCounts.put(newComb, newCombLabelCount);
                }
            }
        }
        // capacity hint for consistency with the other queues; ordering is unaffected
        PriorityQueue<QueueElement> bestSplitsOfSizeK = new PriorityQueue<>(BEAM_SIZE);
        // want explicit generation because we will use these as seeds in the next round
        for (Set<String> docidComb : docCombinationCounts.keySet()) {
            double diff = computeCountDiff(docCombinationCounts.get(docidComb), targetCounts);
            bestSplitsOfSizeK.add(new QueueElement(diff, docidComb, docCombinationCounts.get(docidComb)));
            if (diff < bestRoundDiff) {
                bestRoundDiff = diff;
                if (bestRoundDiff < bestDiff) {
                    isBetterRound = true;
                    bestDiff = bestRoundDiff;
                }
            }
        }
        logger.info("current round best diff is {}", bestRoundDiff);
        if (stopEarly && !isBetterRound) {
            logger.warn("Stopping after round {}", num);
            logger.warn("current round best diff is {}", bestRoundDiff);
            break;
        }
        // store best fixed-size splits
        oldBestSplitsOfSizeK = bestSplitsOfSizeK;
        // track best splits overall
        bestSplits.addAll(bestSplitsOfSizeK);
        oldBestSplitsOfSizeK = trimQueue(oldBestSplitsOfSizeK);
        bestSplits = trimQueue(bestSplits);
    }
    QueueElement bestSplit = bestSplits.poll();
    // ROBUSTNESS FIX: poll() returns null when availIds is empty (no round ever ran);
    // previously this dereferenced null. Fall back to the same empty result as the
    // frac < 0.01 case.
    if (bestSplit == null)
        return new Pair<>(split, splitCount);
    return new Pair<>(bestSplit.docIdSet, bestSplit.labelCounter);
}
Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the cogcomp-nlp project by CogComp.
From the class GoldLabel, method getArgument.
/**
 * Builds the constituents for this gold argument from its PropBank span specification.
 * <p>
 * {@code propSpanInfo} is split on '*' (discontinuous pieces) and ',' (coreferent pieces);
 * each piece is resolved to a (non-terminal, token-span) pair against the parse-tree
 * {@code yield}. Spans rooted at WH-phrases are marked as reference ("R-") arguments;
 * otherwise, any piece after the first is labeled as a continuation ("C-") argument.
 *
 * @param ta                  the text annotation to attach constituents to
 * @param viewName            the SRL view the constituents belong to
 * @param yield               span-labeled leaves of the gold parse tree
 * @param mergeContiguousCArgs when true (and no R- spans were seen), adjacent spans are
 *                            merged via {@link #mergeCArgs} instead of producing C- args
 * @return the constituents for this argument, sorted by start token
 */
List<Constituent> getArgument(TextAnnotation ta, String viewName, List<Tree<Pair<String, IntPair>>> yield, boolean mergeContiguousCArgs) {
    String[] parts = propSpanInfo.split("\\*");
    List<Pair<IntPair, Boolean>> spans = new ArrayList<>();
    boolean someR = false;
    for (String part : parts) {
        if (part.length() == 0)
            continue;
        for (String s : part.split(",")) {
            if (s.length() == 0)
                continue;
            Pair<String, IntPair> info = getSpan(s, yield);
            String nonTerminal = info.getFirst();
            IntPair span = info.getSecond();
            // skip empty or inverted spans
            if (span.getFirst() < 0 || span.getFirst() >= span.getSecond())
                continue;
            boolean r = false;
            if (nonTerminal.startsWith("WH")) {
                r = true;
                someR = true;
            }
            spans.add(new Pair<>(span, r));
        }
    }
    // sort by start token of each span
    Collections.sort(spans, new Comparator<Pair<IntPair, Boolean>>() {
        @Override
        public int compare(Pair<IntPair, Boolean> arg0, Pair<IntPair, Boolean> arg1) {
            // Integer.compare replaces the manual -1/0/1 chain; it is also correct even
            // if the accessor ever returns boxed Integers (where '==' would compare identity).
            return Integer.compare(arg0.getFirst().getFirst(), arg1.getFirst().getFirst());
        }
    });
    if (!someR && mergeContiguousCArgs) {
        spans = mergeCArgs(spans);
    }
    boolean first = true;
    List<Constituent> arg = new ArrayList<>();
    for (Pair<IntPair, Boolean> item : spans) {
        String label = this.label;
        if (item.getSecond() && spans.size() > 1) {
            // WH-rooted span with siblings: reference argument
            label = "R-" + label;
        } else {
            if (first) {
                first = false;
            } else {
                // subsequent pieces of a discontinuous argument
                label = "C-" + label;
            }
        }
        Constituent constituent = new Constituent(label, viewName, ta, item.getFirst().getFirst(), item.getFirst().getSecond());
        if (h != null) {
            constituent.addAttribute(AbstractSRLAnnotationReader.HyphenTagInfo, h);
        }
        arg.add(constituent);
    }
    return arg;
}
Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the cogcomp-nlp project by CogComp.
From the class GoldLabel, method addAnnotation.
/**
 * Builds a gold SRL predicate-argument view for {@code ta} from the stored gold fields
 * and attaches it under {@code srlViewName} (only if at least one predicate was added).
 * <p>
 * Duplicate predicates (same start token) are skipped, and each argument span is used at
 * most once per predicate to enforce the one-argument-per-span constraint.
 */
private void addAnnotation(TextAnnotation ta) {
// resolve the gold parse and label its nodes with token spans for span lookups below
Tree<String> tree = ParseUtils.getParseTree(ViewNames.PARSE_GOLD, ta, 0);
Tree<Pair<String, IntPair>> spanLabeledTree = ParseUtils.getSpanLabeledTree(tree);
List<Tree<Pair<String, IntPair>>> yield = spanLabeledTree.getYield();
PredicateArgumentView pav = new PredicateArgumentView(srlViewName, "AnnotatedTreebank", ta, 1.0);
// start tokens of predicates already added, to drop duplicates
Set<Integer> predicates = new HashSet<>();
for (Fields fields : goldFields.get(ta.getId())) {
Constituent predicate = fields.createPredicate(ta, srlViewName, yield);
if (predicates.contains(predicate.getStartSpan()))
continue;
predicates.add(predicate.getStartSpan());
// args/labels/scores are populated in lockstep by addArguments for each gold label
List<Constituent> args = new ArrayList<>();
List<String> labels = new ArrayList<>();
List<Double> scores = new ArrayList<>();
// We need to make sure that the One-Argument-Per-Span constraint is
// respected. Yes sir, we do, even if the data says otherwise!
Set<IntPair> seenSpans = new HashSet<>();
for (GoldLabel arg : fields.getGoldLabels()) {
List<Constituent> aa = arg.getArgument(ta, srlViewName, yield, mergeContiguousCArgs);
// keep only the first constituent seen for each token span
List<Constituent> filtered = new ArrayList<>();
for (Constituent possibleArg : aa) {
if (seenSpans.contains(possibleArg.getSpan()))
continue;
seenSpans.add(possibleArg.getSpan());
filtered.add(possibleArg);
}
addArguments(ta, predicate, args, labels, scores, arg, filtered);
}
// for each arg
pav.addPredicateArguments(predicate, args, labels.toArray(new String[labels.size()]), ArrayUtilities.asDoubleArray(scores));
}
// only attach the view when it is non-empty
if (pav.getPredicates().size() > 0)
ta.addView(srlViewName, pav);
}
Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the cogcomp-nlp project by CogComp.
From the class GoldLabel, method mergeCArgs.
/**
 * Merges runs of contiguous spans (where one span ends exactly where the next begins)
 * into single spans. A merged span's boolean flag is the AND of its members' flags.
 * Input is assumed sorted by span start; lists of size zero or one are returned as-is.
 */
protected List<Pair<IntPair, Boolean>> mergeCArgs(List<Pair<IntPair, Boolean>> spans) {
    if (spans.size() <= 1)
        return spans;
    List<Pair<IntPair, Boolean>> merged = new ArrayList<>();
    // Seed the running span with the first element, then fold the rest in.
    IntPair current = spans.get(0).getFirst();
    boolean allR = spans.get(0).getSecond();
    for (int i = 1; i < spans.size(); ++i) {
        Pair<IntPair, Boolean> next = spans.get(i);
        if (next.getFirst().getFirst() == current.getSecond()) {
            // Contiguous with the running span: extend it and accumulate the flag.
            current = new IntPair(current.getFirst(), next.getFirst().getSecond());
            allR &= next.getSecond();
        } else {
            // Gap found: emit the running span and start a new one.
            merged.add(new Pair<>(current, allR));
            current = next.getFirst();
            allR = next.getSecond();
        }
    }
    merged.add(new Pair<>(current, allR));
    assert merged.size() <= spans.size();
    if (spans.size() > 0)
        assert merged.size() > 0;
    return merged;
}
Aggregations