use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class XmlDocumentProcessor method deleteSingletons.
/**
 * Deletes all spans that correspond to singleton tags (i.e. a self-contained span presented as an
 * open tag with attributes, but no corresponding close tag). Relies on the user specifying these
 * tag names ahead of time (they are looked up in {@code singletonTags}).
 *
 * @param xmlTextSt StringTransformation containing text to be searched.
 * @return the same StringTransformation instance, with the deletions applied.
 */
private StringTransformation deleteSingletons(StringTransformation xmlTextSt) {
    // don't call getTransformedText() or applyPendingEdits() in the body of the loop using
    // xmlMatcher: the matcher was created on the text as returned by getTransformedText() here,
    // and the edits below are queued using its offsets.
    Matcher xmlMatcher = xmlTagPattern.matcher(xmlTextSt.getTransformedText());
    // match mark-up: xml open or close tag
    while (xmlMatcher.find()) {
        String substr = xmlMatcher.group(0);
        if (substr.charAt(1) == '/') {
            // close tag: irrelevant to singletons by definition
            continue;
        }
        // Locale.ROOT keeps the lower-casing independent of the JVM default locale
        // (e.g. under a Turkish locale "I" would otherwise become a dotless i).
        String lcsubstr = substr.toLowerCase(java.util.Locale.ROOT);
        // get the tag name
        Matcher tagMatcher = xmlTagNamePattern.matcher(lcsubstr);
        if (tagMatcher.find()) {
            // identify the tag
            String tagName = tagMatcher.group(1);
            if (singletonTags.contains(tagName)) {
                // queue deletion of the whole matched tag
                xmlTextSt.transformString(xmlMatcher.start(), xmlMatcher.end(), "");
            }
        }
    }
    xmlTextSt.applyPendingEdits();
    return xmlTextSt;
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class ParseUtils method getTokenIndexedTreeCovering.
/**
 * Given a parse tree and a token span (start inclusive, end exclusive), finds the node of the
 * span-labeled version of the tree that covers the span. The search starts at the root and
 * repeatedly steps into the first child whose span contains {@code [start, end)}; it stops when
 * a node's span equals the requested span exactly, or when no child contains the span.
 * <p>
 * Every node of the returned tree is labeled with the original node's label paired with the
 * span that node covered in the original tree.
 *
 * @param parse the parse tree to search
 * @param start first token of the span
 * @param end token index one past the end of the span
 * @return the deepest node reached by the descent described above
 */
public static Tree<Pair<String, IntPair>> getTokenIndexedTreeCovering(Tree<String> parse, int start, int end) {
    Tree<Pair<String, IntPair>> node = ParseUtils.getSpanLabeledTree(parse);

    while (node != null) {
        IntPair nodeSpan = node.getLabel().getSecond();
        boolean exactMatch = nodeSpan.getFirst() == start && nodeSpan.getSecond() == end;
        if (exactMatch) {
            return node;
        }

        // Pick the first child that still contains the requested span, if any.
        Tree<Pair<String, IntPair>> next = null;
        for (Tree<Pair<String, IntPair>> candidate : node.getChildren()) {
            IntPair childSpan = candidate.getLabel().getSecond();
            if (childSpan.getFirst() <= start && childSpan.getSecond() >= end) {
                next = candidate;
                break;
            }
        }

        if (next == null) {
            // No child covers the span: this node is the tightest cover found.
            return node;
        }
        node = next;
    }

    return node;
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class ParseUtils method getPhraseFromHead.
/**
 * Primarily a fix for prepSRL objects; converts them from single head words to constituents.
 * E.g. for the sentence "the man with the telescope", the object of the preposition will be
 * "the telescope" instead of just "telescope".
 * <p>
 * The returned constituent starts at the token immediately after the predicate's start token
 * and is as long as the yield of the argument phrase found in the parse tree.
 *
 * @param predicate The predicate of the construction (e.g. "with")
 * @param argHead The head-word of the argument of the construction (e.g. "telescope")
 * @param parseViewName The name of the parse view used to extract the phrase-structure tree
 * @return The full constituent phrase containing the argument head, or {@code null} if the
 *         predicate's node has no parent, the argument head is not in the yield of that parent,
 *         or the phrase covering the argument reaches the root of the tree
 */
public static Constituent getPhraseFromHead(Constituent predicate, Constituent argHead, String parseViewName) {
    // Get the path from the argument to the preposition
    // but only if the predicate node "m-commands" the arg
    TextAnnotation ta = argHead.getTextAnnotation();
    int sentenceOffset = ta.getSentence(ta.getSentenceId(argHead)).getStartSpan();
    int argStart = argHead.getStartSpan() - sentenceOffset;

    Tree<Pair<String, IntPair>> predTree = getTokenIndexedTreeCovering(predicate, parseViewName);
    Tree<Pair<String, IntPair>> predParentTree = (predTree == null) ? null : predTree.getParent();
    // A predicate node without a parent cannot command anything; treat it like the other
    // "annotation is unusable" cases instead of failing with a NullPointerException.
    if (predParentTree == null)
        return null;

    boolean found = false;
    for (Tree<Pair<String, IntPair>> s : predParentTree.getYield()) {
        if (s.getLabel().getSecond().getFirst() == argStart) {
            found = true;
            break;
        }
    }
    if (!found)
        return null;

    // Now follow the path from the argument node to get to the preposition.
    // The parent is null-checked before it is handed to checkForPredicate.
    Tree<Pair<String, IntPair>> argPhrase = getTokenIndexedTreeCovering(argHead, parseViewName);
    int predStart = predicate.getStartSpan() - sentenceOffset;
    while (argPhrase.getParent() != null && !checkForPredicate(argPhrase.getParent(), predStart)) {
        argPhrase = argPhrase.getParent();
    }

    // If the phrase covering the constituent is the whole sentence then the annotation is wrong
    if (argPhrase.getParent() == null)
        return null;

    int start = predicate.getStartSpan() + 1;
    int end = start + argPhrase.getYield().size();
    return new Constituent(argHead.getLabel(), argHead.getViewName(), ta, start, end);
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class JsonSerializer method readTextAnnotation.
/**
 * Rebuilds a TextAnnotation from its JSON string form.
 * <p>
 * Reads the "corpusId", "id", "text" and "tokens" fields plus the sentence information,
 * derives token offsets from the text and tokens, and then attaches every entry of the
 * "views" array (each with a "viewName" and a "viewData" list) as a top-K view. Attributes
 * are read last via {@code readAttributes}.
 *
 * @param string the JSON document to read
 * @return the reconstructed TextAnnotation
 * @throws Exception if reading any part of the document fails
 */
TextAnnotation readTextAnnotation(String string) throws Exception {
    JsonObject root = (JsonObject) new JsonParser().parse(string);

    // Basic document fields.
    String corpusId = readString("corpusId", root);
    String id = readString("id", root);
    String text = readString("text", root);
    String[] tokens = readStringArray("tokens", root);
    Pair<Pair<String, Double>, int[]> sentences = readSentences(root);

    IntPair[] tokenOffsets = TokenUtils.getTokenOffsets(text, tokens);
    int[] sentenceEnds = sentences.getSecond();
    TextAnnotation annotation = new TextAnnotation(corpusId, id, text, tokenOffsets, tokens, sentenceEnds);

    // Each entry of "views" carries a name and a list of alternative views for that name.
    JsonArray viewArray = root.getAsJsonArray("views");
    int viewCount = 0;
    while (viewCount < viewArray.size()) {
        JsonObject viewJson = (JsonObject) viewArray.get(viewCount);
        String viewName = readString("viewName", viewJson);
        JsonArray alternatives = viewJson.getAsJsonArray("viewData");

        List<View> topK = new ArrayList<>();
        for (int alt = 0; alt < alternatives.size(); alt++) {
            JsonObject altJson = (JsonObject) alternatives.get(alt);
            View parsed = readView(altJson, annotation);
            topK.add(parsed);
        }

        annotation.addTopKView(viewName, topK);
        viewCount++;
    }

    readAttributes(annotation, root);
    return annotation;
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class VerbSensePropbankReader method addAnnotation.
/**
 * Builds the gold sense view for a TextAnnotation from the Propbank fields stored for its id.
 * <p>
 * For each Propbank entry the predicate's start token is looked up in the yield of the
 * span-labeled gold parse tree; the first entry seen for a given start token contributes a
 * token label (the sense) and a lemma attribute, later entries for the same token are skipped.
 * The view is added to the TextAnnotation only if it ends up with at least one constituent.
 *
 * @param ta the TextAnnotation to annotate
 */
private void addAnnotation(TextAnnotation ta) {
    String goldViewName = SenseManager.getGoldViewName();

    Tree<String> goldParse = ParseHelper.getParseTree(ViewNames.PARSE_GOLD, ta, 0);
    List<Tree<Pair<String, IntPair>>> leaves = ParseUtils.getSpanLabeledTree(goldParse).getYield();

    TokenLabelView senseView = new TokenLabelView(goldViewName, "AnnotatedTreebank", ta, 1.0);
    Set<Integer> seenPredicates = new HashSet<>();

    for (PropbankFields entry : goldFields.get(ta.getId())) {
        int predicateStart = entry.getPredicateStart(leaves);
        // Set.add returns false when the token was already recorded: skip duplicates.
        if (!seenPredicates.add(predicateStart)) {
            continue;
        }

        senseView.addTokenLabel(predicateStart, entry.getSense(), 1.0);
        try {
            senseView.addTokenAttribute(predicateStart, LemmaIdentifier, entry.getLemma());
        } catch (Exception e) {
            // XXX Maybe log the exception?
            e.printStackTrace();
        }
    }

    if (!senseView.getConstituents().isEmpty()) {
        ta.addView(goldViewName, senseView);
    }
}
Aggregations