Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the cogcomp-nlp project by CogComp: class ParseHelper, method getTokenIndexedTreeCovering.
/**
 * From a parse tree and a span specified by a start and an (exclusive) end token index, returns
 * the subtree that covers that span. Each node of the returned tree mirrors a node of the input
 * tree and is labeled with the original label paired with the token span that node covered.
 *
 * @return A new tree covering the specified span, whose node labels pair the original label with
 *         the covered token span; {@code null} only if the span-labeled root itself is null.
 */
public static Tree<Pair<String, IntPair>> getTokenIndexedTreeCovering(Tree<String> parse, int start, int end) {
    Tree<Pair<String, IntPair>> node = ParseUtils.getSpanLabeledTree(parse);
    // Walk downward, at each step moving into the unique child that still covers [start, end).
    while (node != null) {
        IntPair covered = node.getLabel().getSecond();
        if (covered.getFirst() == start && covered.getSecond() == end) {
            // Exact match: this node covers precisely the requested span.
            return node;
        }
        // Look for a child whose span still contains the requested one.
        Tree<Pair<String, IntPair>> next = null;
        for (Tree<Pair<String, IntPair>> child : node.getChildren()) {
            IntPair childSpan = child.getLabel().getSecond();
            if (childSpan.getFirst() <= start && end <= childSpan.getSecond()) {
                next = child;
                break;
            }
        }
        if (next == null) {
            // No child covers the span: the current node is the tightest cover.
            break;
        }
        node = next;
    }
    return node;
}
Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the cogcomp-nlp project by CogComp: class PathFeatureHelper, method getPathsToCommonAncestor.
/**
 * Get the paths from two constituents to their common ancestor. Each path is truncated to a
 * length of maxDepth if it is longer than that. Both returned paths include the common ancestor
 * as their last element.
 * <p>
 * <b>Note:</b> This function requires the two constituents to be from the same {@link View}.
 *
 * @throws IllegalArgumentException If no common ancestor is found.
 * @see edu.illinois.cs.cogcomp.edison.features.helpers.PathFeatureHelper#getPathToRoot(Constituent,
 *      int)
 */
public static Pair<List<Constituent>, List<Constituent>> getPathsToCommonAncestor(Constituent start, Constituent end, int maxDepth) {
    assert start.getView() == end.getView() : "Cannot find paths across different views. " + "The start and end constituents should be from the same view.";
    List<Constituent> p1 = getPathToRoot(start, maxDepth);
    List<Constituent> p2 = getPathToRoot(end, maxDepth);
    // Sets for O(1) membership checks against the opposite path.
    Set<Constituent> s1 = new LinkedHashSet<>(p1);
    Set<Constituent> s2 = new LinkedHashSet<>(p2);
    // NOTE(review): the original implementation had the "up"/"down" direction labels swapped in
    // the two exception messages; they are corrected here.
    List<Constituent> pathUp = truncateAtCommonAncestor(p1, s2, "up");
    List<Constituent> pathDown = truncateAtCommonAncestor(p2, s1, "down");
    return new Pair<>(pathUp, pathDown);
}

/**
 * Copies elements of {@code path} (in order, ancestor included) up to and including the first
 * element that also occurs in {@code otherPathNodes} — i.e., the common ancestor.
 *
 * @param path the root-ward path to truncate
 * @param otherPathNodes the nodes of the opposite path, used to detect the common ancestor
 * @param direction "up" or "down", used only in the error message
 * @throws IllegalArgumentException if no element of {@code path} occurs in {@code otherPathNodes}
 */
private static List<Constituent> truncateAtCommonAncestor(List<Constituent> path, Set<Constituent> otherPathNodes, String direction) {
    List<Constituent> result = new ArrayList<>();
    for (Constituent c : path) {
        result.add(c);
        if (otherPathNodes.contains(c)) {
            return result;
        }
    }
    throw new IllegalArgumentException("Common ancestor not found in path " + direction + ".");
}
Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the cogcomp-nlp project by CogComp: class CharacterTokenizer, method tokenizeSentence.
/**
 * Given a sentence, return each non-whitespace character as a single-character token together
 * with its character offsets.
 *
 * @param sentence The sentence string
 * @return A {@link Pair} containing the array of tokens and their character offsets
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String sentence) {
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    for (int i = 0; i < sentence.length(); i++) {
        char ch = sentence.charAt(i);
        // `ch > ' '` is exactly the criterion String.trim() uses: trim() strips all characters
        // with code points <= U+0020, so a one-character substring trims to empty iff ch <= ' '.
        // (The original also tested `!c.equals(" ")`, which was dead code after trim().)
        if (ch > ' ') {
            surfaces.add(String.valueOf(ch));
            offsets.add(new IntPair(i, i + 1));
        }
    }
    return new Pair<>(surfaces.toArray(new String[0]), offsets.toArray(new IntPair[0]));
}
Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the cogcomp-nlp project by CogComp: class StanfordAnalyzer, method tokenizeSentence.
/**
 * Given a sentence, runs the Stanford pipeline on it and returns the tokens and their character
 * offsets (begin/end positions as reported by CoreNLP).
 *
 * @param sentenceText The sentence string
 * @return A {@link Pair} containing the array of tokens and their character offsets
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String sentenceText) {
    Annotation document = new Annotation(sentenceText);
    pipeline.annotate(document);
    // CoreNLP may split the input into several sentences; flatten all their tokens.
    // (The original also built a `sen_ends` array of per-sentence token counts, but it was
    // never read — removed as dead code.)
    List<CoreLabel> tokens = new ArrayList<>();
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
        tokens.addAll(sentence.get(CoreAnnotations.TokensAnnotation.class));
    }
    String[] surfaces = new String[tokens.size()];
    IntPair[] tokenCharOffsets = new IntPair[tokens.size()];
    for (int i = 0; i < tokens.size(); i++) {
        CoreLabel token = tokens.get(i);
        surfaces[i] = token.originalText();
        tokenCharOffsets[i] = new IntPair(token.beginPosition(), token.endPosition());
    }
    // Diamond operator instead of the original raw `new Pair(...)` (unchecked warning).
    return new Pair<>(surfaces, tokenCharOffsets);
}
Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the cogcomp-nlp project by CogComp: class StanfordDepHandler, method addView.
/**
 * Runs the Stanford POS tagger and dependency parser over {@code textAnnotation} and attaches
 * a {@link ViewNames#DEPENDENCY_STANFORD} {@link TreeView}, one dependency tree per sentence.
 *
 * @param textAnnotation the annotation to augment; its tokenization drives sentence building
 * @throws AnnotatorException if the parser appears to have timed out (root labeled "X"), or —
 *         when {@code throwExceptionOnSentenceLengthCheck} is set — if a sentence exceeds the
 *         configured length limit or the dependency graph has no retrievable root
 */
@Override
public void addView(TextAnnotation textAnnotation) throws AnnotatorException {
    // If the sentence is longer than STFRD_MAX_SENTENCE_LENGTH there is no point in trying to
    // parse
    StanfordParseHandler.checkLength(textAnnotation, throwExceptionOnSentenceLengthCheck, maxParseSentenceLength);
    TreeView treeView = new TreeView(ViewNames.DEPENDENCY_STANFORD, "StanfordDepHandler", textAnnotation, 1d);
    // The (tokenized) sentence offset in case we have more than one sentences in the record
    List<CoreMap> sentences = StanfordParseHandler.buildStanfordSentences(textAnnotation);
    Annotation document = new Annotation(sentences);
    // Order matters: the dependency parser requires POS tags, so tag before parsing.
    posAnnotator.annotate(document);
    parseAnnotator.annotate(document);
    sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    // An "X"-labeled root on the first sentence is Stanford's signal of a failed/aborted parse.
    if (sentences.get(0).get(TreeCoreAnnotations.TreeAnnotation.class).nodeString().equals("X")) {
        // This is most like because we ran out of time
        throw new AnnotatorException("Unable to parse TextAnnotation " + textAnnotation.getId() + ". " + "This is most likely due to a timeout.");
    }
    for (int sentenceId = 0; sentenceId < sentences.size(); sentenceId++) {
        // Tracks whether getFirstRoot() failed for this sentence when exceptions are suppressed,
        // so we can skip tree construction without aborting the remaining sentences.
        boolean runtimeExceptionWasThrown = false;
        CoreMap sentence = sentences.get(sentenceId);
        if (maxParseSentenceLength > 0 && sentence.size() > maxParseSentenceLength) {
            // Over-long sentence: log and leave this sentence without a dependency tree.
            logger.warn(HandlerUtils.getSentenceLengthError(textAnnotation.getId(), sentence.toString(), maxParseSentenceLength));
        } else {
            SemanticGraph depGraph = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
            IndexedWord root = null;
            try {
                // getFirstRoot() throws a RuntimeException when the graph has no root.
                root = depGraph.getFirstRoot();
            } catch (RuntimeException e) {
                String msg = "ERROR in getting root of dep graph for sentence. Sentence is:\n" + sentence.toString() + "'\nDependency graph is:\n" + depGraph.toCompactString() + "\nText is:\n" + textAnnotation.getText();
                logger.error(msg);
                System.err.println(msg);
                e.printStackTrace();
                // Re-throw or swallow based on the same flag used for length checks.
                if (throwExceptionOnSentenceLengthCheck)
                    throw e;
                else
                    runtimeExceptionWasThrown = true;
            }
            if (!runtimeExceptionWasThrown) {
                // Build the cogcomp Tree from the Stanford graph, rooted at the graph's root,
                // labeling each node with (surface form, token position).
                int tokenStart = getNodePosition(textAnnotation, root, sentenceId);
                Pair<String, Integer> nodePair = new Pair<>(root.originalText(), tokenStart);
                Tree<Pair<String, Integer>> tree = new Tree<>(nodePair);
                populateChildren(depGraph, root, tree, textAnnotation, sentenceId);
                treeView.setDependencyTree(sentenceId, tree);
            }
        }
    }
    textAnnotation.addView(getViewName(), treeView);
}
Aggregations