Use of edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter in project cogcomp-nlp by CogComp.
From class IllinoisTokenizer, method tokenizeTextSpan.
* given a span of text, return a list of {@literal Pair< String[], IntPair[] >} corresponding
* to tokenized sentences, where the String[] is the ordered list of sentence tokens and the
* IntPair[] is the corresponding list of character offsets with respect to <b>the original
* text</b>.
* @param text an arbitrary span of text.
* @return a {@link Tokenization} object containing the ordered token strings, their character
* offsets, and sentence end positions (as one-past-the-end token offsets)
public Tokenization tokenizeTextSpan(String text) {
String[] splitterInput = new String[1];
splitterInput[0] = text;
SentenceSplitter splitter = new SentenceSplitter(splitterInput);
Sentence[] sentences = splitter.splitAll();
List<IntPair> characterOffsets = new LinkedList<>();
int[] sentenceEndTokenOffsets = new int[sentences.length];
int sentenceEndTokenIndex = 0;
int sentIndex = 0;
List<String> tokens = new LinkedList<>();
for (Sentence s : splitter.splitAll()) {
LinkedVector words = s.wordSplit();
if (s.end >= text.length()) {
throw new IllegalArgumentException("Error in tokenizer, sentence end ( " + s.end + ") is greater than rawtext length (" + text.length() + ").");
for (int i = 0; i < words.size(); i++) {
Word word = (Word) words.get(i);
IntPair wordOffsets = new IntPair(word.start, word.end + 1);
tokens.add(text.substring(wordOffsets.getFirst(), wordOffsets.getSecond()));
sentenceEndTokenIndex += words.size();
sentenceEndTokenOffsets[sentIndex++] = sentenceEndTokenIndex;
String[] tokenArray = tokens.toArray(new String[tokens.size()]);
IntPair[] charOffsetArray = characterOffsets.toArray(new IntPair[characterOffsets.size()]);
return new Tokenization(tokenArray, charOffsetArray, sentenceEndTokenOffsets);
Use of edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter in project cogcomp-nlp by CogComp.
From class TestDiff, method testDiff.
// Accuracy check for the POS tagger: each token's predicted tag is compared with the reference
// tag at the same index in refTags, and the test fails when the ratio of correct tags to tokens
// falls below thresholdAcc.
// NOTE(review): this listing looks truncated by extraction — closing braces, any counter
// updates, and the operands of the (Token) casts are not visible here; confirm against the
// upstream file before relying on it.
public void testDiff() {
POSTagger tagger = new POSTagger();
// Parser chain over testFile: SentenceSplitter wrapped by WordSplitter wrapped by
// PlainToTokenParser.
Parser parser = new PlainToTokenParser(new WordSplitter(new SentenceSplitter(testFile)));
String sentence = "";
int sentenceCounter = 0;
int tokenCounter = 0;
int correctCounter = 0;
// NOTE(review): the (Token) casts have no operand; presumably each wrapped a call that pulls
// the next token from parser — TODO confirm. The loop ends when that value is null.
for (Token word = (Token); word != null; word = (Token) {
// Tag predicted by the tagger for the current token.
String tag = tagger.discreteValue(word);
// Compare against the reference tag stored at the current token index.
if (refTags.get(tokenCounter).equals(tag)) {
// Fraction of correctly tagged tokens; the cast forces floating-point division.
double result = ((double) correctCounter) / tokenCounter;
if (result < thresholdAcc) {
fail("Tagger performance is insufficient: " + "\nProduced: " + result + "\nExpected: " + thresholdAcc);
Use of edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter in project cogcomp-nlp by CogComp.
From class TestDiff, method testDiff.
// Output check for the chunker: builds a bracketed rendering of each sentence from the
// per-token predictions and fails when it differs from the corresponding entry of refSentences.
// NOTE(review): this listing looks truncated by extraction — closing braces and several
// operands (after the (Token) casts and before "== null") are missing; confirm against the
// upstream file.
public void testDiff() {
Chunker tagger = new Chunker();
// Parser chain over testFile: SentenceSplitter wrapped by WordSplitter wrapped by
// PlainToTokenParser.
Parser parser = new PlainToTokenParser(new WordSplitter(new SentenceSplitter(testFile)));
// Prediction made for the preceding token; starts empty.
String previous = "";
// Bracketed rendering of the sentence currently being assembled.
String sentence = "";
int sentenceCounter = 0;
// NOTE(review): the (Token) casts have no operand; presumably each wrapped a call that pulls
// the next token from parser — TODO confirm.
for (Token w = (Token); w != null; w = (Token) {
String prediction = tagger.discreteValue(w);
// && binds tighter than ||: a "B-" prediction always opens a bracket, while an "I-" prediction
// opens one only when the previous prediction does not end with the same suffix.
// substring(2) drops the two-character "B-"/"I-" prefix.
if (prediction.startsWith("B-") || prediction.startsWith("I-") && !previous.endsWith(prediction.substring(2)))
sentence += ("[" + prediction.substring(2) + " ");
// Every token is rendered as "(partOfSpeech form) ".
sentence += ("(" + w.partOfSpeech + " " + w.form + ") ");
// NOTE(review): this condition is garbled — the value compared with null and the arguments and
// comparisons around discreteValue are missing. It presumably decides whether the current
// chunk closes after this token — TODO confirm.
if (!prediction.equals("O") && ( == null || tagger.discreteValue("O") || tagger.discreteValue("B-") || !tagger.discreteValue(
sentence += ("] ");
// NOTE(review): the left operand of "== null" is missing; presumably an end-of-sentence test,
// given that the assembled sentence is compared and reset below — TODO confirm.
if ( == null) {
sentence = sentence.trim();
String refSentence = refSentences.get(sentenceCounter).trim();
if (!sentence.equals(refSentence))
fail("Produced output doesn't match reference: " + "\nProduced: " + sentence + "\nExpected: " + refSentence);
// Start assembling the next sentence from scratch.
sentence = "";
// Remember this prediction for the bracket-opening test on the next token.
previous = prediction;
Use of edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter in project cogcomp-nlp by CogComp.
From class ChunksAndPOSTags, method main.
// Command-line entry point: takes exactly one argument (an input file name), runs the chunker
// over the tokens parsed from that file, and builds bracketed chunk / part-of-speech output.
// NOTE(review): this listing looks truncated by extraction — closing braces, the output calls,
// and several operands are missing; confirm against the upstream file.
public static void main(String[] args) {
String filename = null;
try {
// Throws if args is empty; that exception is caught below and treated as a usage error.
filename = args[0];
// More than one argument is also a usage error; the exception is used only to reach the
// shared catch block.
if (args.length > 1)
throw new Exception();
} catch (Exception e) {
System.err.println("usage: java edu.illinois.cs.cogcomp.chunker.main.ChunksAndPOSTags <input file>");
// NOTE(review): whatever followed the usage message (e.g. an exit call and the closing brace)
// is not visible here — TODO confirm.
Chunker chunker = new Chunker();
// Parser chain over the input file: SentenceSplitter wrapped by WordSplitter wrapped by
// PlainToTokenParser.
Parser parser = new PlainToTokenParser(new WordSplitter(new SentenceSplitter(filename)));
// Prediction made for the preceding word; starts empty.
String previous = "";
// NOTE(review): the (Word) casts have no operand; presumably each wrapped a call that pulls the
// next word from parser — TODO confirm.
for (Word w = (Word); w != null; w = (Word) {
String prediction = chunker.discreteValue(w);
// && binds tighter than ||: a "B-" prediction always satisfies the test, an "I-" prediction
// only when the previous prediction does not end with the same suffix.
// NOTE(review): the statements after the condition are garbled — only their string arguments
// ("[" + chunk type, then "(" + partOfSpeech + form + ")") survive; presumably these were
// output calls — TODO confirm.
if (prediction.startsWith("B-") || prediction.startsWith("I-") && !previous.endsWith(prediction.substring(2)))"[" + prediction.substring(2) + " ");"(" + w.partOfSpeech + " " + w.form + ") ");
// NOTE(review): garbled condition — the value compared with null and the arguments around
// discreteValue are missing; only the "] " argument of the following statement survives.
if (!prediction.equals("O") && ( == null || chunker.discreteValue("O") || chunker.discreteValue("B-") || !chunker.discreteValue("] ");
// NOTE(review): the left operand of "== null" and the call taking "\n" are missing.
if ( == null)"\n");
// Remember this prediction for the test on the next word.
previous = prediction;
Use of edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter in project cogcomp-nlp by CogComp.
From class SegmentTagPlain, method main.
// Command-line entry point: takes a word classifier name, an input file, and an optional parser
// name; loads the classifier, builds a parser over the input file, and prints each word with
// square brackets around predicted segments.
// NOTE(review): this listing looks truncated by extraction — closing braces and several
// operands are missing; confirm against the upstream file.
public static void main(String[] args) {
String taggerName = null;
String inputFile = null;
String parserName = null;
try {
// Either access throws if too few arguments were given; the exception is caught below and
// treated as a usage error.
taggerName = args[0];
inputFile = args[1];
// The third argument is optional.
if (args.length > 2) {
parserName = args[2];
// Anything beyond three arguments is a usage error; the exception is used only to reach the
// shared catch block.
if (args.length > 3)
throw new Exception();
} catch (Exception e) {
System.err.println("usage: java <word classifier> " + "<input file> \\\n" + " [<parser>]");
// NOTE(review): whatever followed the usage message (e.g. an exit call and the closing brace)
// is not visible here — TODO confirm.
Classifier tagger = ClassUtils.getClassifier(taggerName);
Parser parser;
// Without a parser name, use PlainToTokenParser over the WordSplitter/SentenceSplitter chain.
if (parserName == null)
parser = new PlainToTokenParser(new WordSplitter(new SentenceSplitter(inputFile)));
// NOTE(review): as shown, this assignment is unconditional and would overwrite the one above;
// presumably an "else" was lost in extraction — TODO confirm. It asks ClassUtils for the named
// parser, passing a WordSplitter over the input file as its single Parser-typed argument.
parser = ClassUtils.getParser(parserName, new Class[] { Parser.class }, new Parser[] { new WordSplitter(new SentenceSplitter(inputFile)) });
// Prediction made for the preceding word; starts empty.
String previous = "";
// NOTE(review): the (Word) casts have no operand; presumably each wrapped a call that pulls the
// next word from parser — TODO confirm.
for (Word w = (Word); w != null; w = (Word) {
String prediction = tagger.discreteValue(w);
// && binds tighter than ||: a "B-" prediction always opens a bracket, an "I-" prediction only
// when the previous prediction does not end with the same suffix. substring(2) drops the
// two-character prefix.
if (prediction.startsWith("B-") || prediction.startsWith("I-") && !previous.endsWith(prediction.substring(2)))
System.out.print("[" + prediction.substring(2) + " ");
// Every word's form is printed followed by a space.
System.out.print(w.form + " ");
// NOTE(review): garbled condition — the value compared with null and the arguments around
// discreteValue are missing; it presumably decides whether to close the current bracket.
if (!prediction.equals("O") && ( == null || tagger.discreteValue("O") || tagger.discreteValue("B-") || !tagger.discreteValue(
System.out.print("] ");
// NOTE(review): the left operand of "== null" and the statement it guarded are missing.
if ( == null)
// Remember this prediction for the bracket-opening test on the next word.
previous = prediction;