use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class StatefulTokenizer method tokenizeSentence.
/**
 * Tokenizes a single sentence by running a fresh {@link TokenizerStateMachine} over it.
 * Completed states whose index equals {@code TokenizerState.IN_SENTENCE} are skipped; every
 * other completed state yields one token.
 *
 * @param sentence the text to tokenize
 * @return a pair whose first element holds the token strings and whose second element holds,
 *         index-aligned with the tokens, the (start, end) values of the state that produced
 *         each token
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String sentence) {
    // Parse the text.
    TokenizerStateMachine machine = new TokenizerStateMachine(splitOnDash);
    machine.parseText(sentence);

    final int sentenceState = TokenizerState.IN_SENTENCE.ordinal();

    // First pass: count the non-sentence states so the result arrays can be sized exactly.
    int tokenCount = 0;
    for (State state : machine.completed) {
        if (state.stateIndex() != sentenceState) {
            tokenCount++;
        }
    }

    // Second pass: materialize each token and its offsets from the machine's text buffer.
    String[] tokens = new String[tokenCount];
    IntPair[] offsets = new IntPair[tokenCount];
    int next = 0;
    for (State state : machine.completed) {
        if (state.stateIndex() == sentenceState) {
            continue;
        }
        tokens[next] = new String(machine.text, state.start, state.end - state.start);
        offsets[next] = new IntPair(state.start, state.end);
        next++;
    }
    return new Pair<>(tokens, offsets);
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class StatefulTokenizer method tokenizeTextSpan.
/**
 * Tokenizes a span of text, which may contain several sentences, by running a fresh
 * {@link TokenizerStateMachine} over it. Completed states whose index equals
 * {@code TokenizerState.IN_SENTENCE} mark sentence ends; every other completed state yields
 * one token.
 *
 * @param textSpan the text to tokenize
 * @return a {@link Tokenization} built from the token strings, their (start, end) values
 *         and, for each sentence state, the number of tokens emitted before it was
 *         encountered
 */
@Override
public Tokenization tokenizeTextSpan(String textSpan) {
    // Parse the text.
    TokenizerStateMachine machine = new TokenizerStateMachine(splitOnDash);
    machine.parseText(textSpan);

    final int sentenceState = TokenizerState.IN_SENTENCE.ordinal();

    // First pass: count sentence states and word states to size the arrays.
    int sentenceCount = 0;
    int tokenCount = 0;
    for (State state : machine.completed) {
        if (state.stateIndex() == sentenceState) {
            sentenceCount++;
        } else {
            tokenCount++;
        }
    }

    // Second pass: fill in tokens, their offsets, and the token index at each sentence end.
    String[] tokens = new String[tokenCount];
    IntPair[] offsets = new IntPair[tokenCount];
    int[] sentenceEnds = new int[sentenceCount];
    int nextSentence = 0;
    int nextToken = 0;
    for (State state : machine.completed) {
        if (state.stateIndex() == sentenceState) {
            sentenceEnds[nextSentence++] = nextToken;
        } else {
            tokens[nextToken] = new String(machine.text, state.start, state.end - state.start);
            offsets[nextToken] = new IntPair(state.start, state.end);
            nextToken++;
        }
    }

    // Drop a trailing sentence that contains no words: its end equals the previous end.
    if (sentenceCount > 1
            && sentenceEnds[sentenceCount - 1] == sentenceEnds[sentenceCount - 2]) {
        sentenceEnds = java.util.Arrays.copyOf(sentenceEnds, sentenceCount - 1);
    }
    return new Tokenization(tokens, offsets, sentenceEnds);
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class TokenizerTextAnnotationBuilder method buildTextAnnotation.
/**
 * Instantiate a TextAnnotation using a SentenceViewGenerator to create an explicit Sentence
 * view.
 *
 * @param corpusId a field in TextAnnotation that can be used by the client for book-keeping
 *        (e.g. track texts from the same corpus)
 * @param textId a field in TextAnnotation that can be used by the client for book-keeping (e.g.
 *        identify a specific document by some reference string)
 * @param text the plain English text to process
 * @param tokens the token Strings, in order from original text
 * @param sentenceEndPositions token offsets of sentence ends (one-past-the-end indexing); must
 *        be non-empty and its last element must equal {@code tokens.length}
 * @param sentenceViewGenerator the name of the source of the sentence split; also used as the
 *        generator name of the token view
 * @param sentenceViewScore a score that may indicate how reliable the sentence split
 *        information is; also used as the score of the token view
 * @return a TextAnnotation object with {@link ViewNames#TOKENS} and {@link ViewNames#SENTENCE}
 *         views.
 * @throws IllegalArgumentException if {@code sentenceEndPositions} is empty or its last element
 *         differs from the number of tokens
 */
public static TextAnnotation buildTextAnnotation(String corpusId, String textId, String text,
        String[] tokens, int[] sentenceEndPositions, String sentenceViewGenerator,
        double sentenceViewScore) {
    // Guard against an empty boundary array so the caller gets a descriptive error rather
    // than an ArrayIndexOutOfBoundsException from the last-element lookup below.
    if (sentenceEndPositions.length == 0)
        throw new IllegalArgumentException("Invalid sentence boundary. sentenceEndPositions must not be empty");
    if (sentenceEndPositions[sentenceEndPositions.length - 1] != tokens.length)
        throw new IllegalArgumentException("Invalid sentence boundary. Last element should be the number of tokens");

    IntPair[] offsets = TokenUtils.getTokenOffsets(text, tokens);
    assert offsets.length == tokens.length;
    TextAnnotation ta = new TextAnnotation(corpusId, textId, text, offsets, tokens, sentenceEndPositions);

    // Sentence view: each sentence spans from the previous sentence end up to its own end.
    SpanLabelView view = new SpanLabelView(ViewNames.SENTENCE, sentenceViewGenerator, ta, sentenceViewScore);
    int start = 0;
    for (int s : sentenceEndPositions) {
        view.addSpanLabel(start, s, ViewNames.SENTENCE, 1d);
        start = s;
    }
    ta.addView(ViewNames.SENTENCE, view);

    // Token view: one single-token span per token, labelled with the token string.
    SpanLabelView tokView = new SpanLabelView(ViewNames.TOKENS, sentenceViewGenerator, ta, sentenceViewScore);
    for (int tokIndex = 0; tokIndex < tokens.length; ++tokIndex) {
        tokView.addSpanLabel(tokIndex, tokIndex + 1, tokens[tokIndex], 1d);
    }
    ta.addView(ViewNames.TOKENS, tokView);
    return ta;
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class TokenizerUtilities method addTokenView.
/**
 * Splits the text of {@code input} into sentences, tokenizes each sentence with
 * {@code tokenizer}, and builds a {@link ViewNames#TOKENS} view with one single-token span
 * per token.
 *
 * NOTE(review): a {@link ViewNames#SENTENCE} view is also populated here but is neither
 * returned nor passed to {@code input.addView}; likewise the returned token view is not
 * added to {@code input} by this method. Confirm whether callers rely on that.
 *
 * @param input the text annotation whose text is tokenized and which backs the created views
 * @param tokenizer the tokenizer applied to each sentence
 * @param source the generator name recorded on the created views
 * @return the populated token view
 * @throws IllegalArgumentException if the number of tokens and the number of character
 *         offsets collected differ
 */
public static SpanLabelView addTokenView(TextAnnotation input, Tokenizer tokenizer, String source) {
    SentenceSplitter sentenceSplitter = new SentenceSplitter(new String[] { input.getText() });
    Sentence[] splitSentences = sentenceSplitter.splitAll();

    List<String> allTokens = new ArrayList<>();
    List<IntPair> absoluteOffsets = new ArrayList<>();
    List<IntPair> sentenceTokenSpans = new ArrayList<>();

    int sentenceStart = 0;
    for (Sentence sentence : splitSentences) {
        Pair<String[], IntPair[]> tokenized = tokenizer.tokenizeSentence(sentence.text);
        String[] words = tokenized.getFirst();
        for (int k = 0; k < words.length; k++) {
            allTokens.add(words[k]);
            // Shift sentence-relative character offsets by the sentence's own start.
            IntPair local = tokenized.getSecond()[k];
            int shiftedStart = local.getFirst() + sentence.start;
            int shiftedEnd = local.getSecond() + sentence.start;
            absoluteOffsets.add(new IntPair(shiftedStart, shiftedEnd));
        }
        // The sentence covers the tokens added since the previous sentence ended.
        int sentenceEnd = allTokens.size();
        sentenceTokenSpans.add(new IntPair(sentenceStart, sentenceEnd));
        sentenceStart = sentenceEnd;
    }

    if (allTokens.size() != absoluteOffsets.size()) {
        throw new IllegalArgumentException("tokens (" + allTokens.size() + ") must equal charOffsets (" + absoluteOffsets.size() + "), but does not.");
    }

    SpanLabelView tokenView = new SpanLabelView(ViewNames.TOKENS, source, input, 1.0);
    SpanLabelView sentenceView = new SpanLabelView(ViewNames.SENTENCE, source, input, 1.0);

    int position = 0;
    for (String token : allTokens) {
        tokenView.addSpanLabel(position, position + 1, token, 1d);
        position++;
    }
    for (IntPair sentenceSpan : sentenceTokenSpans) {
        sentenceView.addSpanLabel(sentenceSpan.getFirst(), sentenceSpan.getSecond(), ViewNames.SENTENCE, 1d);
    }
    return tokenView;
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class SentenceStructure method getView.
/**
 * Builds a token-label view, named by {@code SenseManager.getPredictedViewName()}, that holds
 * one label per sense structure in this sentence structure. Each label is placed at the first
 * index of the span of the structure's constituent.
 *
 * @param manager used to turn each structure's label into a sense string
 * @param ta the text annotation the view is created over
 * @return the populated view
 */
public TokenLabelView getView(SenseManager manager, TextAnnotation ta) {
    TokenLabelView senseView = new TokenLabelView(
            SenseManager.getPredictedViewName(), VerbSenseConstants.systemIdentifier, ta, 1.0);
    for (SenseStructure structure : this.ys) {
        IntPair span = structure.getInstance().getConstituent().getSpan();
        String senseLabel = manager.getSense(structure.getLabel());
        senseView.addTokenLabel(span.getFirst(), senseLabel, 1.0);
    }
    return senseView;
}
Aggregations