use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
the class PlainTextReader method showSentenceVector.
/**
 * Renders every word of every sentence into a single string.
 *
 * <p>The result is the concatenation of {@code toString()} of each element, in sentence order and
 * then in word order. This method adds no separator between words or between sentences.
 *
 * @param sentences the sentences to render; every element of every {@code LinkedVector} is cast
 *        to {@code NEWord}
 * @return the concatenated word strings, or the empty string when there are no words
 */
public static String showSentenceVector(Vector<LinkedVector> sentences) {
    // A StringBuilder avoids re-copying the accumulated text on every append, which the
    // previous String "+=" in a nested loop did.
    StringBuilder display = new StringBuilder();
    for (LinkedVector v : sentences) {
        for (int i = 0; i < v.size(); ++i) {
            NEWord s = (NEWord) (v.get(i));
            display.append(s.toString());
        }
    }
    return display.toString();
}
use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
the class IllinoisTokenizer method tokenizeTextSpan.
/**
 * Splits a span of text into sentences and tokens.
 *
 * <p>The text is handed to a {@code SentenceSplitter}; each resulting sentence is word-split, and
 * for every word the token string is taken directly from {@code text} using the word's character
 * offsets, so tokens and offsets are always relative to <b>the original text</b>.
 *
 * @param text an arbitrary span of text.
 * @return a {@link Tokenization} object containing the ordered token strings, their character
 *         offsets, and sentence end positions (as one-past-the-end token offsets)
 * @throws IllegalArgumentException if a sentence reports an end offset that is not a valid
 *         character index of {@code text}
 */
@Override
public Tokenization tokenizeTextSpan(String text) {
    String[] splitterInput = new String[1];
    splitterInput[0] = text;
    SentenceSplitter splitter = new SentenceSplitter(splitterInput);
    Sentence[] sentences = splitter.splitAll();
    List<IntPair> characterOffsets = new LinkedList<>();
    int[] sentenceEndTokenOffsets = new int[sentences.length];
    int sentenceEndTokenIndex = 0;
    int sentIndex = 0;
    List<String> tokens = new LinkedList<>();
    // Iterate over the array that was used to size sentenceEndTokenOffsets instead of asking
    // the splitter a second time; this keeps the loop and the array length in agreement.
    for (Sentence s : sentences) {
        LinkedVector words = s.wordSplit();
        if (s.end >= text.length()) {
            throw new IllegalArgumentException("Error in tokenizer, sentence end ( " + s.end + ") is greater than rawtext length (" + text.length() + ").");
        }
        for (int i = 0; i < words.size(); i++) {
            Word word = (Word) words.get(i);
            // The +1 turns the word's end offset into a one-past-the-end offset, which is
            // what substring() below and the returned IntPair use.
            IntPair wordOffsets = new IntPair(word.start, word.end + 1);
            characterOffsets.add(wordOffsets);
            tokens.add(text.substring(wordOffsets.getFirst(), wordOffsets.getSecond()));
        }
        // Running token count: the value stored for a sentence is the index one past its
        // last token.
        sentenceEndTokenIndex += words.size();
        sentenceEndTokenOffsets[sentIndex++] = sentenceEndTokenIndex;
    }
    String[] tokenArray = tokens.toArray(new String[tokens.size()]);
    IntPair[] charOffsetArray = characterOffsets.toArray(new IntPair[characterOffsets.size()]);
    return new Tokenization(tokenArray, charOffsetArray, sentenceEndTokenOffsets);
}
use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
the class IllinoisTokenizer method tokenizeSentence.
/**
 * Tokenizes a single sentence.
 *
 * <p>The sentence is wrapped in a {@code Sentence} and word-split; for each resulting child the
 * token string is its {@code toString()} value and its span is {@code (start, end + 1)}.
 *
 * @param sentence the plain text sentence to tokenize
 * @return a pair holding the ordered token strings and, in the same order, their start and end
 *         character offsets (one-past-the-end indexing)
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String sentence) {
    LinkedVector words = new Sentence(sentence).wordSplit();
    int count = words.size();
    String[] tokenStrings = new String[count];
    IntPair[] spans = new IntPair[count];
    int idx = 0;
    while (idx < count) {
        LinkedChild child = words.get(idx);
        tokenStrings[idx] = child.toString();
        // end + 1 converts the child's end offset to one-past-the-end.
        spans[idx] = new IntPair(child.start, child.end + 1);
        idx++;
    }
    return new Pair<>(tokenStrings, spans);
}
use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
the class POSTag method main.
/**
 * Command-line entry point: tags a testing set and reports tagging accuracy.
 *
 * <p>Accepted invocations are {@code <testing set>} or {@code -q|--quiet <testing set>}; anything
 * else prints a usage message to standard error and exits with status 1. Each input line is
 * parsed with {@code POSBracketToVector.parsePOSBracketForm}. For every word the stored
 * part-of-speech label is set aside and cleared, the tagger's prediction is obtained, and the
 * two are compared. Unless quiet mode is on, each word is echoed prefixed with {@code +} for a
 * correct prediction or {@code -[label]} for an incorrect one, one sentence per output line.
 * The overall accuracy percentage is always printed last.
 *
 * @param args The command line parameters.
 **/
public static void main(String[] args) {
    // Parse the command line
    if (!(args.length == 1 && !args[0].startsWith("-") || args.length == 2 && (args[0].equals("-q") || args[0].equals("--quiet")) && !args[1].startsWith("-"))) {
        System.err.println("usage: java edu.illinois.cs.cogcomp.lbj.pos.POSTag [-q] <testing set>\n" + " If -q is specified, the only output is timing and accuracy\n" + " information. Otherwise, the testing set is output with\n" + " extra tags indicating whether each prediction was correct.");
        System.exit(1);
    }
    // Two arguments can only mean the quiet flag was given (validated above).
    boolean quiet = args.length == 2;
    testingFile = args[args.length - 1];
    POSTagger tagger = new POSTagger();
    int correct = 0, incorrect = 0;
    // try-with-resources makes sure the reader is closed even if tagging throws.
    try (BufferedReader in = open()) {
        for (String line = readLine(in); line != null; line = readLine(in)) {
            LinkedVector sentence = POSBracketToVector.parsePOSBracketForm(line);
            for (Word word = (Word) sentence.get(0); word != null; word = (Word) word.next) {
                // Keep the stored label and blank it on the word before asking the tagger.
                String label = word.partOfSpeech;
                word.partOfSpeech = null;
                String prediction = tagger.discreteValue(word);
                if (prediction.equals(label)) {
                    ++correct;
                    if (!quiet)
                        System.out.print("+");
                } else {
                    ++incorrect;
                    if (!quiet)
                        System.out.print("-[" + label + "]");
                }
                if (!quiet)
                    System.out.print(word + " ");
            }
            // Terminate the sentence's output line; the previous print("") emitted nothing,
            // so all sentences ran together on one line.
            if (!quiet)
                System.out.println();
        }
    } catch (java.io.IOException e) {
        // Only closing the reader can raise this here; the results are still reported below.
        System.err.println("Warning: failed to close " + testingFile + ": " + e);
    }
    // Note: if no words were read this is 0 / 0.0 and prints "NaN%".
    System.out.println("Accuracy: " + (100 * correct / (double) (correct + incorrect)) + "%");
}
use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
the class CoNLL2000Parser method next.
/**
 * Returns the next sentence from the input as a <code>LinkedVector</code> of
 * <code>Token</code>s, or <code>null</code> when the underlying parser has no more lines.
 *
 * <p>Leading empty lines are skipped. Every following non-empty line contributes one token built
 * from its first three fields: field 0 is the word form, field 1 the part of speech (a value of
 * <code>"-"</code> is stored as <code>null</code>), and field 2 is passed as the last argument of
 * the <code>Token</code> constructor. The sentence ends at the next empty line or at end of input.
 **/
public Object next() {
    String[] fields = (String[]) super.next();
    // Skip any blank lines that precede the sentence.
    while (fields != null && fields.length == 0) {
        fields = (String[]) super.next();
    }
    if (fields == null) {
        return null;
    }

    Token last = null;
    do {
        String tag = fields[1];
        if (tag.equals("-")) {
            tag = null;
        }
        // Link the new token behind the previous one (null for the first token).
        Token current = new Token(new Word(fields[0], tag), last, fields[2]);
        if (last != null) {
            last.next = current;
        }
        last = current;
        fields = (String[]) super.next();
    } while (fields != null && fields.length > 0);

    return new LinkedVector(last);
}
Aggregations