Search in sources :

Example 1 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class ChunkerTrain method trainModelsWithParser.

/**
 * Runs the chunker's learner over every example produced by the parser, once per training
 * round, for {@code iter} rounds.
 *
 * @param parser Parser for the training data. Initialized in trainModels(String trainingData)
 */
public void trainModelsWithParser(Parser parser) {
    Chunker.isTraining = true;

    int round = 1;
    while (round <= iter) {
        // One full pass over the training data; the parser signals exhaustion with null.
        for (Object item = parser.next(); item != null; item = parser.next()) {
            LinkedVector sentence = (LinkedVector) item;
            for (int pos = 0; pos < sentence.size(); pos++) {
                chunker.learn(sentence.get(pos));
            }
        }

        // Rewind the data before closing the round so the next pass starts from the beginning.
        parser.reset();
        chunker.doneWithRound();
        logger.info("Iteration number : " + round);
        round++;
    }

    chunker.doneLearning();
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)

Example 2 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class ChunkerTrain method trainModelsWithParser.

/**
 * Trains the chunker while holding out the tail of the data as a development set, records the
 * dev-set F1 after every round, and then runs the learner over all of the data for the number
 * of rounds that scored best.
 *
 * <p>The first {@code round(total * (1 - dev_ratio))} examples produced by the parser are used
 * for training; whatever the parser has left after them is handed to the tester as dev set.
 *
 * @param parser Parser for the data; it is read and reset several times
 * @param modeldir directory part of the model file paths
 * @param modelname base name of the model files ({@code .lc} and {@code .lex} are appended)
 * @param dev_ratio fraction of the examples to hold out; the resulting split point is clamped
 *        to {@code [0, total]}
 */
public void trainModelsWithParser(Parser parser, String modeldir, String modelname, double dev_ratio) {
    Chunker.isTraining = true;
    double[] F1array = new double[iter];
    String lcpath = modeldir + File.separator + modelname + ".lc";
    String lexpath = modeldir + File.separator + modelname + ".lex";

    // Get the total number of examples in the data
    int total = 0;
    while (parser.next() != null) {
        total++;
    }
    parser.reset();

    // Get the boundary between train and dev: the first trainSize examples are for training
    long trainSize = Math.round(total * (1 - dev_ratio));
    trainSize = Math.max(0, Math.min(total, trainSize));

    // Run the learner and save F1 for each iteration
    for (int i = 1; i <= iter; i++) {
        LinkedVector ex;
        // Pull exactly trainSize examples. The bound is checked BEFORE parser.next() so that
        // no dev example is consumed (or learned from) by the training pass.
        for (long seen = 0; seen < trainSize && (ex = (LinkedVector) parser.next()) != null; seen++) {
            for (int j = 0; j < ex.size(); j++) {
                chunker.learn(ex.get(j));
            }
        }
        chunker.doneWithRound();
        // NOTE(review): presumably this writes the files at lcpath/lexpath that the fresh
        // Chunker below is constructed from - confirm against writeModelsToDisk.
        writeModelsToDisk(modeldir, modelname);

        // Test on dev set: the parser has not been reset, so the tester reads what is left
        BIOTester tester = new BIOTester(new Chunker(lcpath, lexpath), new ChunkLabel(),
                new ChildrenFromVectors(parser));
        double[] result = tester.test().getOverallStats();
        double devF1 = result[2];
        F1array[i - 1] = devF1;
        System.out.println("Iteration number : " + i + ". F1 score on devset: " + devF1);
        parser.reset();
    }

    // Get the best F1 score and corresponding iter (earliest round wins ties)
    double bestF1 = 0;
    int bestIter = 0;
    for (int i = 0; i < iter; i++) {
        if (F1array[i] > bestF1) {
            bestF1 = F1array[i];
            bestIter = i + 1;
        }
    }
    System.out.println("Best #Iter = " + bestIter + " (F1=" + bestF1 + ")");
    System.out.println("Rerun the learner using best #Iter...");

    // Rerun the learner on the full data.
    // NOTE(review): the learner is not reset here, so these rounds continue from the state
    // reached during the dev-set rounds above - confirm this is intended.
    for (int i = 1; i <= bestIter; i++) {
        LinkedVector ex;
        while ((ex = (LinkedVector) parser.next()) != null) {
            for (int j = 0; j < ex.size(); j++) {
                chunker.learn(ex.get(j));
            }
        }
        parser.reset();
        chunker.doneWithRound();
        System.out.println("Iteration number : " + i);
    }
    chunker.doneLearning();
}
Also used : ChildrenFromVectors(edu.illinois.cs.cogcomp.lbjava.parse.ChildrenFromVectors) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) BIOTester(edu.illinois.cs.cogcomp.lbjava.nlp.seg.BIOTester) Chunker(edu.illinois.cs.cogcomp.chunker.main.lbjava.Chunker) ChunkLabel(edu.illinois.cs.cogcomp.chunker.main.lbjava.ChunkLabel)

Example 3 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class Reuters2003Parser method next.

/**
 * Produces the next object parsed from the input file; in this case, that object is a
 * <code>LinkedVector</code> built from a chain of <code>Token</code>s, one per input row, up
 * to the next empty or missing row.
 *
 * <p>Rows with fewer than two columns, and rows whose column 4 is <code>"-X-"</code>, are
 * skipped before the chain starts. Column 3 is used as the token label: an <code>I</code>
 * label on the first row is rewritten to start with <code>B</code>, and so is a later
 * <code>I</code> label when the previous row's label does not end with the same suffix.
 *
 * @return the vector for the next run of rows, or <code>null</code> when the input is exhausted
 **/
public Object next() {
    // Advance to the first row that can start a sentence.
    String[] fields = (String[]) super.next();
    while (fields != null) {
        if (fields.length >= 2 && !fields[4].equals("-X-")) {
            break;
        }
        fields = (String[]) super.next();
    }
    if (fields == null) {
        return null;
    }

    // The first label of a sentence can never continue anything, so I-... becomes B-...
    if (fields[3].charAt(0) == 'I') {
        fields[3] = "B" + fields[3].substring(1);
    }
    Token tail = new Token(new Word(fields[5], fields[4]), null, fields[3]);
    String lastLabel = fields[3];

    fields = (String[]) super.next();
    while (fields != null && fields.length > 0) {
        boolean inside = fields[3].charAt(0) == 'I';
        // An I-... label whose suffix differs from the previous label starts a new chunk.
        if (inside && !lastLabel.endsWith(fields[3].substring(2))) {
            fields[3] = "B" + fields[3].substring(1);
        }
        Token appended = new Token(new Word(fields[5], fields[4]), tail, fields[3]);
        tail.next = appended;
        tail = appended;
        lastLabel = fields[3];
        fields = (String[]) super.next();
    }

    // The vector is constructed from the LAST token of the chain, as in the original code.
    return new LinkedVector(tail);
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) Token(edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token)

Example 4 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class POSBracketToToken method next.

/**
 * Returns the next labeled word in the data.
 *
 * <p>When the current sentence is used up, the next non-empty <code>LinkedVector</code> is
 * fetched from the superclass and its words are converted into a chain of <code>Token</code>s.
 * Each token is created with the word's part of speech as its third constructor argument, and
 * the token's own <code>partOfSpeech</code> field is then cleared.
 *
 * @return the next <code>Token</code>, or <code>null</code> when the superclass has no more
 *         sentences
 */
public Object next() {
    if (currentWord == null) {
        // Skip sentences without any words.
        LinkedVector vector = (LinkedVector) super.next();
        while (vector != null && vector.size() == 0) {
            vector = (LinkedVector) super.next();
        }
        if (vector == null) {
            return null;
        }

        Word w = (Word) vector.get(0);
        Token t = currentWord = new Token(w, null, w.partOfSpeech);
        t.partOfSpeech = null;
        while (w.next != null) {
            w = (Word) w.next;
            t.next = new Token(w, t, w.partOfSpeech);
            t = (Token) t.next;
            // Clear the field on the token that was just created. Clearing before advancing
            // would leave the last token of every multi-word sentence with its part of
            // speech still set, unlike all the other tokens.
            t.partOfSpeech = null;
        }
    }

    Token result = currentWord;
    currentWord = (Token) currentWord.next;
    return result;
}
Also used : Word(edu.illinois.cs.cogcomp.lbjava.nlp.Word) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)

Example 5 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class PlainToTokenParser method next.

/**
 * Hands out {@link Token}s one at a time until the underlying parser is exhausted, at
 * which point it returns <code>null</code>. Whenever no token is pending, the next
 * <code>LinkedVector</code> of <code>Word</code>s is pulled from the wrapped parser and
 * converted into a linked chain of tokens.
 **/
public Object next() {
    while (next == null) {
        LinkedVector sentence = (LinkedVector) parser.next();
        if (sentence == null) {
            return null;
        }

        // Mirror the word chain as a token chain, linking each token to its predecessor.
        Word word = (Word) sentence.get(0);
        Token tail = new Token(word, null, null);
        word = (Word) word.next;
        while (word != null) {
            Token appended = new Token(word, tail, null);
            tail.next = appended;
            tail = appended;
            word = (Word) word.next;
        }

        // Wrap the chain in a vector and resume from its first element.
        LinkedVector tokenVector = new LinkedVector(tail);
        next = (Token) tokenVector.get(0);
    }

    Token current = next;
    next = (Token) current.next;
    return current;
}
Also used : Word(edu.illinois.cs.cogcomp.lbjava.nlp.Word) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)

Aggregations

LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)46 Word (edu.illinois.cs.cogcomp.lbjava.nlp.Word)9 NEWord (edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)9 ArrayList (java.util.ArrayList)8 Vector (java.util.Vector)8 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)3 Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)3 SentenceSplitter (edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter)3 NERDocument (edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument)3 File (java.io.File)3 HashMap (java.util.HashMap)3 Sentence (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence)2 Token (edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token)2 OutFile (edu.illinois.cs.cogcomp.ner.IO.OutFile)2 Matcher (java.util.regex.Matcher)2 ChunkLabel (edu.illinois.cs.cogcomp.chunker.main.lbjava.ChunkLabel)1 Chunker (edu.illinois.cs.cogcomp.chunker.main.lbjava.Chunker)1 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)1 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)1 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)1