Search in sources :

Example 36 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class PlainTextReader method showSentenceVector.

/**
 * Concatenates the string form of every word in every sentence into one string.
 *
 * <p>Sentences are visited in the order given and words in index order; each element is cast to
 * {@code NEWord} and its {@code toString()} result is appended with no separator added between
 * words or between sentences.
 *
 * @param sentences the sentences to render; every element of each sentence must be an
 *        {@code NEWord}
 * @return the concatenation of all word strings, or the empty string when there are no words
 */
public static String showSentenceVector(Vector<LinkedVector> sentences) {
    // A builder avoids re-copying the accumulated text on every append.
    StringBuilder display = new StringBuilder();
    for (LinkedVector v : sentences) {
        for (int i = 0; i < v.size(); ++i) {
            NEWord s = (NEWord) (v.get(i));
            display.append(s.toString());
        }
    }
    return display.toString();
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Example 37 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class IllinoisTokenizer method tokenizeTextSpan.

/**
 * Given a span of text, split it into sentences and tokens and return the ordered tokens, their
 * character offsets with respect to <b>the original text</b>, and the token index at which each
 * sentence ends.
 *
 * <p>Character offsets use one-past-the-end indexing: each token string is
 * {@code text.substring(start, end)} for its recorded offset pair.
 *
 * @param text an arbitrary span of text.
 * @return a {@link Tokenization} object containing the ordered token strings, their character
 *         offsets, and sentence end positions (as one-past-the-end token offsets)
 * @throws IllegalArgumentException if a sentence reported by the splitter has an end offset that
 *         is not smaller than the length of {@code text}
 */
@Override
public Tokenization tokenizeTextSpan(String text) {
    String[] splitterInput = new String[1];
    splitterInput[0] = text;
    SentenceSplitter splitter = new SentenceSplitter(splitterInput);
    // Split exactly once: this array both sizes sentenceEndTokenOffsets and drives the loop
    // below, so the two can never disagree.
    Sentence[] sentences = splitter.splitAll();
    List<IntPair> characterOffsets = new LinkedList<>();
    int[] sentenceEndTokenOffsets = new int[sentences.length];
    int sentenceEndTokenIndex = 0;
    int sentIndex = 0;
    List<String> tokens = new LinkedList<>();
    for (Sentence s : sentences) {
        LinkedVector words = s.wordSplit();
        if (s.end >= text.length()) {
            throw new IllegalArgumentException("Error in tokenizer, sentence end ( " + s.end + ") is greater than rawtext length (" + text.length() + ").");
        }
        for (int i = 0; i < words.size(); i++) {
            Word word = (Word) words.get(i);
            // word.end + 1 converts the word's end offset to one-past-the-end form.
            IntPair wordOffsets = new IntPair(word.start, word.end + 1);
            characterOffsets.add(wordOffsets);
            tokens.add(text.substring(wordOffsets.getFirst(), wordOffsets.getSecond()));
        }
        // Running token count = one-past-the-end token index of the sentence just processed.
        sentenceEndTokenIndex += words.size();
        sentenceEndTokenOffsets[sentIndex++] = sentenceEndTokenIndex;
    }
    String[] tokenArray = tokens.toArray(new String[tokens.size()]);
    IntPair[] charOffsetArray = characterOffsets.toArray(new IntPair[characterOffsets.size()]);
    return new Tokenization(tokenArray, charOffsetArray, sentenceEndTokenOffsets);
}
Also used : Word(edu.illinois.cs.cogcomp.lbjava.nlp.Word) SentenceSplitter(edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Sentence(edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)

Example 38 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class IllinoisTokenizer method tokenizeSentence.

/**
 * Tokenizes a single sentence and reports where each token sits in it.
 *
 * @param sentence the plain text sentence to tokenize
 * @return a pair whose first element is the ordered token strings and whose second element is
 *         the matching start/end character offsets (one-past-the-end indexing)
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String sentence) {
    LinkedVector words = new Sentence(sentence).wordSplit();
    String[] tokenStrings = new String[words.size()];
    IntPair[] tokenSpans = new IntPair[words.size()];
    int idx = 0;
    while (idx < tokenStrings.length) {
        LinkedChild child = words.get(idx);
        tokenStrings[idx] = child.toString();
        // child.end + 1 turns the stored end offset into a one-past-the-end offset.
        tokenSpans[idx] = new IntPair(child.start, child.end + 1);
        idx++;
    }
    return new Pair<>(tokenStrings, tokenSpans);
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) LinkedChild(edu.illinois.cs.cogcomp.lbjava.parse.LinkedChild) Sentence(edu.illinois.cs.cogcomp.lbjava.nlp.Sentence) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 39 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class POSTag method main.

/**
 * Command line entry point: tags every word of a testing set and reports the tagger's accuracy.
 *
 * <p>Accepted argument lists are {@code <testing set>} or {@code -q|--quiet <testing set>},
 * where the testing set name must not start with {@code -}. Anything else prints the usage
 * message to standard error and exits with status 1.
 *
 * <p>Each input line is parsed into a sentence. For every word the gold part-of-speech label is
 * saved and cleared before the tagger is asked for a prediction. Unless quiet mode is on, each
 * word is echoed prefixed with {@code +} when the prediction matches the label, or with
 * {@code -[label]} when it does not, one sentence per output line. The accuracy percentage is
 * always printed last; when no words were read the ratio is 0/0 and prints as {@code NaN%}.
 *
 * @param args The command line parameters.
 **/
public static void main(String[] args) {
    // Parse the command line
    boolean fileOnly = args.length == 1 && !args[0].startsWith("-");
    boolean quietThenFile = args.length == 2 && (args[0].equals("-q") || args[0].equals("--quiet")) && !args[1].startsWith("-");
    if (!(fileOnly || quietThenFile)) {
        System.err.println("usage: java edu.illinois.cs.cogcomp.lbj.pos.POSTag [-q] <testing set>\n" + "       If -q is specified, the only output is timing and accuracy\n" + "       information.  Otherwise, the testing set is output with\n" + "       extra tags indicating whether each prediction was correct.");
        System.exit(1);
    }
    boolean quiet = args.length == 2;
    testingFile = args[args.length - 1];
    POSTagger tagger = new POSTagger();
    BufferedReader in = open();
    int correct = 0, incorrect = 0;
    for (String line = readLine(in); line != null; line = readLine(in)) {
        LinkedVector sentence = POSBracketToVector.parsePOSBracketForm(line);
        for (Word word = (Word) sentence.get(0); word != null; word = (Word) word.next) {
            // Remember the gold label, then hide it from the word before predicting.
            String label = word.partOfSpeech;
            word.partOfSpeech = null;
            String prediction = tagger.discreteValue(word);
            if (prediction.equals(label)) {
                ++correct;
                if (!quiet)
                    System.out.print("+");
            } else {
                ++incorrect;
                if (!quiet)
                    System.out.print("-[" + label + "]");
            }
            if (!quiet)
                System.out.print(word + " ");
        }
        // Terminate the sentence's output line; printing an empty string here emitted nothing,
        // so all sentences and the accuracy report ran together on one line.
        if (!quiet)
            System.out.println();
    }
    System.out.println("Accuracy: " + (100 * correct / (double) (correct + incorrect)) + "%");
}
Also used : Word(edu.illinois.cs.cogcomp.lbjava.nlp.Word) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) BufferedReader(java.io.BufferedReader) POSTagger(edu.illinois.cs.cogcomp.pos.lbjava.POSTagger)

Example 40 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class CoNLL2000Parser method next.

/**
 * Reads the next sentence from the input. Empty rows are skipped first; then consecutive
 * non-empty rows are each turned into a <code>Token</code> and chained together until an empty
 * row or the end of the input is reached. A part-of-speech column equal to <code>"-"</code> is
 * stored as <code>null</code>.
 *
 * @return a <code>LinkedVector</code> populated by the <code>Token</code>s of one sentence, or
 *         <code>null</code> when the input is exhausted.
 **/
public Object next() {
    String[] fields = (String[]) super.next();
    // Skip over any empty rows that precede the sentence.
    while (fields != null && fields.length == 0) {
        fields = (String[]) super.next();
    }
    if (fields == null) {
        return null;
    }

    String tag = fields[1].equals("-") ? null : fields[1];
    Token tail = new Token(new Word(fields[0], tag), null, fields[2]);

    fields = (String[]) super.next();
    while (fields != null && fields.length > 0) {
        tag = fields[1].equals("-") ? null : fields[1];
        // Link the new token after the current tail, then advance the tail.
        Token appended = new Token(new Word(fields[0], tag), tail, fields[2]);
        tail.next = appended;
        tail = appended;
        fields = (String[]) super.next();
    }

    // The vector is built from the last token of the chain, exactly as before.
    return new LinkedVector(tail);
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) Token(edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token)

Aggregations

LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)46 Word (edu.illinois.cs.cogcomp.lbjava.nlp.Word)9 NEWord (edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)9 ArrayList (java.util.ArrayList)8 Vector (java.util.Vector)8 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)3 Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)3 SentenceSplitter (edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter)3 NERDocument (edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument)3 File (java.io.File)3 HashMap (java.util.HashMap)3 Sentence (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence)2 Token (edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token)2 OutFile (edu.illinois.cs.cogcomp.ner.IO.OutFile)2 Matcher (java.util.regex.Matcher)2 ChunkLabel (edu.illinois.cs.cogcomp.chunker.main.lbjava.ChunkLabel)1 Chunker (edu.illinois.cs.cogcomp.chunker.main.lbjava.Chunker)1 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)1 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)1 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)1