use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class IllinoisTokenizer method tokenizeTextSpan.
/**
 * Splits a span of text into sentences, splits each sentence into words, and returns the
 * result as a single {@link Tokenization}.
 * <p>
 * Each token string is taken as a substring of <b>the original text</b> using the word's
 * character offsets, stored as an {@link IntPair} of (start, one-past-the-end).
 *
 * @param text an arbitrary span of text.
 * @return a {@link Tokenization} object containing the ordered token strings, their character
 *         offsets, and sentence end positions (as one-past-the-end token offsets)
 * @throws IllegalArgumentException if a sentence reported by the splitter has an end index
 *         that is not strictly less than {@code text.length()}
 */
@Override
public Tokenization tokenizeTextSpan(String text) {
    SentenceSplitter splitter = new SentenceSplitter(new String[] {text});

    // Split exactly once: the same array both sizes the sentence-end table and drives the
    // loop, so the two can never disagree and the splitting work is not repeated.
    Sentence[] sentences = splitter.splitAll();

    List<IntPair> characterOffsets = new LinkedList<>();
    List<String> tokens = new LinkedList<>();
    int[] sentenceEndTokenOffsets = new int[sentences.length];
    // Running count of tokens seen so far; after each sentence it is that sentence's
    // one-past-the-end token offset.
    int sentenceEndTokenIndex = 0;

    for (int sentIndex = 0; sentIndex < sentences.length; sentIndex++) {
        Sentence s = sentences[sentIndex];
        LinkedVector words = s.wordSplit();
        if (s.end >= text.length()) {
            throw new IllegalArgumentException("Error in tokenizer, sentence end ( " + s.end + ") is greater than rawtext length (" + text.length() + ").");
        }
        for (int i = 0; i < words.size(); i++) {
            Word word = (Word) words.get(i);
            // word.end is used as an inclusive index here; +1 converts to one-past-the-end.
            IntPair wordOffsets = new IntPair(word.start, word.end + 1);
            characterOffsets.add(wordOffsets);
            tokens.add(text.substring(wordOffsets.getFirst(), wordOffsets.getSecond()));
        }
        sentenceEndTokenIndex += words.size();
        sentenceEndTokenOffsets[sentIndex] = sentenceEndTokenIndex;
    }

    String[] tokenArray = tokens.toArray(new String[0]);
    IntPair[] charOffsetArray = characterOffsets.toArray(new IntPair[0]);
    return new Tokenization(tokenArray, charOffsetArray, sentenceEndTokenOffsets);
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class IllinoisTokenizer method tokenizeSentence.
/**
 * Tokenizes a single sentence and reports each token together with its character span.
 *
 * @param sentence the plain text sentence to tokenize
 * @return a pair whose first element is the ordered token strings and whose second element
 *         is the matching ordered character offsets (start, one-past-the-end)
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String sentence) {
    LinkedVector words = new Sentence(sentence).wordSplit();
    final int count = words.size();

    String[] tokenStrings = new String[count];
    IntPair[] tokenSpans = new IntPair[count];

    int idx = 0;
    while (idx < count) {
        LinkedChild child = words.get(idx);
        tokenStrings[idx] = child.toString();
        // +1 turns the child's end index into a one-past-the-end offset.
        tokenSpans[idx] = new IntPair(child.start, child.end + 1);
        idx++;
    }

    return new Pair<>(tokenStrings, tokenSpans);
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class SyntacticFrame method addToFeature.
/**
 * Appends a tag for {@code target} to each of the three feature buffers, followed by "-".
 * Nothing is appended unless the target's label is nominal according to
 * {@link ParseTreeProperties#isNominal}.
 * <p>
 * When {@code target} and {@code arg} cover the same span, {@code sb1} receives the label
 * unchanged and {@code sb2}/{@code sb3} receive "CUR"; otherwise all three receive the
 * lower-cased label.
 *
 * @param target the constituent being described
 * @param arg the constituent whose span is compared against the target's span
 * @param sb1 first feature buffer
 * @param sb2 second feature buffer
 * @param sb3 third feature buffer
 */
private void addToFeature(Constituent target, Constituent arg, StringBuffer sb1, StringBuffer sb2, StringBuffer sb3) {
    final IntPair targetSpan = target.getSpan();
    final String label = target.getLabel();

    if (!ParseTreeProperties.isNominal(label)) {
        return;
    }

    final String first;
    final String second;
    final String third;
    if (targetSpan.equals(arg.getSpan())) {
        first = label;
        second = "CUR";
        third = "CUR";
    } else {
        final String lowered = label.toLowerCase();
        first = lowered;
        second = lowered;
        third = lowered;
    }

    sb1.append(first).append("-");
    sb2.append(second).append("-");
    sb3.append(third).append("-");
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class VerbVoiceIndicator method getWordFeatures.
/**
 * Produces a single discrete feature holding the value returned by {@code getVoice} for the
 * parse-tree parent of the word at {@code wordPosition}.
 *
 * @param ta the text annotation containing the word
 * @param wordPosition token index of the word within {@code ta}
 * @return a set containing exactly one feature, built from the {@code getVoice} result
 * @throws EdisonException declared by the overridden method; presumably raised by the
 *         parse lookup (confirm against {@code ParseHelper})
 */
@Override
public Set<Feature> getWordFeatures(TextAnnotation ta, int wordPosition) throws EdisonException {
    Sentence enclosing = ta.getSentenceFromToken(wordPosition);
    // Re-base the token index so it is relative to the start of its sentence.
    int offsetInSentence = wordPosition - enclosing.getStartSpan();

    Tree<Pair<String, IntPair>> labeledTree =
            ParseUtils.getSpanLabeledTree(ParseHelper.getParseTree(parseViewName, enclosing));
    Tree<Pair<String, IntPair>> parentOfWord =
            labeledTree.getYield().get(offsetInSentence).getParent();

    Set<Feature> features = new LinkedHashSet<>();
    features.add(DiscreteFeature.create(getVoice(parentOfWord)));
    return features;
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class ClauseViewGenerator method addView.
/**
 * Builds a span-label view from the parse view named {@code parseViewName} and attaches it
 * to {@code ta} under {@code getViewName()}.
 * <p>
 * Every parse constituent that is neither a leaf nor a pre-terminal, and whose label (after
 * stripping function tags and index references) starts with "S" but is not "S1", contributes
 * its span. Distinct non-empty spans are each added once with the label "S" and score 1.0.
 *
 * @param ta the text annotation to read the parse from and add the new view to
 */
@Override
public void addView(TextAnnotation ta) {
    SpanLabelView clauseView = new SpanLabelView(getViewName(), "From " + parseViewName, ta, 1.0, true);
    TreeView parseView = (TreeView) ta.getView(parseViewName);

    // Insertion-ordered set: duplicate spans collapse, first-seen order is kept.
    Set<IntPair> clauseSpans = new LinkedHashSet<>();
    for (Constituent node : parseView) {
        if (TreeView.isLeaf(node) || ParseTreeProperties.isPreTerminal(node)) {
            continue;
        }

        String cleaned = ParseUtils.stripIndexReferences(ParseUtils.stripFunctionTags(node.getLabel()));
        if (!cleaned.startsWith("S") || cleaned.equals("S1")) {
            continue;
        }

        int from = node.getStartSpan();
        int to = node.getEndSpan();
        if (from < 0 || to <= from) {
            continue;
        }
        clauseSpans.add(new IntPair(from, to));
    }

    for (IntPair clause : clauseSpans) {
        clauseView.addSpanLabel(clause.getFirst(), clause.getSecond(), "S", 1.0);
    }
    ta.addView(getViewName(), clauseView);
}
Aggregations