Use of edu.illinois.cs.cogcomp.chunker.main.lbjava.Chunker in project cogcomp-nlp by CogComp.
Class ChunkerTrain, method init.
/**
 * Prepares this trainer's chunker from the default configuration.
 *
 * Loads the default config produced by {@code ChunkerConfigurator}, keeps it in
 * {@code rm}, and builds a new {@code Chunker} from the values stored under the
 * "modelPath" and "modelLexPath" keys.
 */
public void init() {
    rm = new ChunkerConfigurator().getDefaultConfig();
    // Both lookups are passed straight to the constructor, model path first.
    chunker = new Chunker(rm.getString("modelPath"), rm.getString("modelLexPath"));
}
Use of edu.illinois.cs.cogcomp.chunker.main.lbjava.Chunker in project cogcomp-nlp by CogComp.
Class ChunkerTrain, method trainModelsWithParser.
/**
 * Trains the chunker with a held-out development split used to pick the number
 * of training rounds.
 *
 * The examples supplied by {@code parser} are counted once; the first
 * {@code round(count * (1 - dev_ratio))} of them form the training part and the
 * remainder the development part. For each of {@code iter} rounds the chunker
 * learns from the training part, the model is written via
 * {@code writeModelsToDisk}, and a chunker loaded from
 * {@code modeldir/modelname.lc} and {@code .lex} is scored on the development
 * part. The round with the highest F1 determines how many rounds are then run
 * over the complete data before {@code doneLearning()} is called.
 *
 * @param parser    source of training examples; each item is cast to {@code LinkedVector}
 * @param modeldir  directory used to build the model file paths
 * @param modelname base name of the ".lc" and ".lex" model files
 * @param dev_ratio fraction of the examples held out for development; the
 *                  resulting boundary is clamped to {@code [0, count]}
 */
public void trainModelsWithParser(Parser parser, String modeldir, String modelname, double dev_ratio) {
    Chunker.isTraining = true;
    double bestF1 = 0;
    int bestIter = 0;
    double[] F1array = new double[iter];
    String lcpath = modeldir + File.separator + modelname + ".lc";
    String lexpath = modeldir + File.separator + modelname + ".lex";

    // Count the examples the parser provides, then rewind it.
    int total = 0;
    LinkedVector ex;
    while ((ex = (LinkedVector) parser.next()) != null) {
        total++;
    }
    parser.reset();

    // Number of leading examples that belong to the training part.
    long idx = Math.round(total * (1 - dev_ratio));
    if (idx < 0) {
        idx = 0;
    }
    if (idx > total) {
        idx = total;
    }

    // Run the learner and record the development F1 after every round.
    for (int i = 1; i <= iter; i++) {
        long trained = 0;
        // The bound is tested BEFORE the next example is pulled, so exactly idx
        // examples are learned and the parser is left positioned on the first
        // development example. (Learning first and testing afterwards would
        // consume one example too many.)
        while (trained < idx && (ex = (LinkedVector) parser.next()) != null) {
            for (int j = 0; j < ex.size(); j++) {
                chunker.learn(ex.get(j));
            }
            trained++;
        }
        chunker.doneWithRound();
        // Presumably persists the model to lcpath/lexpath, which the evaluation
        // chunker below loads - verify against writeModelsToDisk.
        writeModelsToDisk(modeldir, modelname);

        // Score on whatever the parser has not yet returned, i.e. the development part.
        BIOTester tester = new BIOTester(new Chunker(lcpath, lexpath), new ChunkLabel(), new ChildrenFromVectors(parser));
        double[] result = tester.test().getOverallStats();
        double f1 = result[2]; // index 2 is the value reported as F1 below
        F1array[i - 1] = f1;
        System.out.println("Iteration number : " + i + ". F1 score on devset: " + f1);
        parser.reset();
    }

    // Earliest round with the strictly highest F1 wins; stays 0 if no score exceeds 0.
    for (int i = 0; i < iter; i++) {
        if (F1array[i] > bestF1) {
            bestF1 = F1array[i];
            bestIter = i + 1;
        }
    }
    System.out.println("Best #Iter = " + bestIter + " (F1=" + bestF1 + ")");
    System.out.println("Rerun the learner using best #Iter...");

    // Rerun the learner over every example for the chosen number of rounds.
    // NOTE(review): nothing visible here resets the chunker first, so these rounds
    // appear to continue from the state reached above - confirm that is intended.
    for (int i = 1; i <= bestIter; i++) {
        while ((ex = (LinkedVector) parser.next()) != null) {
            for (int j = 0; j < ex.size(); j++) {
                chunker.learn(ex.get(j));
            }
        }
        parser.reset();
        chunker.doneWithRound();
        System.out.println("Iteration number : " + i);
    }
    chunker.doneLearning();
}
Use of edu.illinois.cs.cogcomp.chunker.main.lbjava.Chunker in project cogcomp-nlp by CogComp.
Class ChunkTester, method chunkTester.
/**
 * Evaluates a stored chunker model on a test file and prints the performance
 * report to standard output.
 *
 * @param testFile  file handed to {@code CoNLL2000Parser}
 * @param modeldir  directory used to build the model file paths
 * @param modelname base name; ".lc" and ".lex" are appended to form the two model paths
 */
public static void chunkTester(String testFile, String modeldir, String modelname) {
    String modelPrefix = modeldir + File.separator + modelname;
    Parser testData = new CoNLL2000Parser(testFile);

    Chunker model = new Chunker(modelPrefix + ".lc", modelPrefix + ".lex");
    ChunkLabel goldLabel = new ChunkLabel();
    ChildrenFromVectors tokens = new ChildrenFromVectors(testData);

    new BIOTester(model, goldLabel, tokens).test().printPerformance(System.out);
}
Use of edu.illinois.cs.cogcomp.chunker.main.lbjava.Chunker in project cogcomp-nlp by CogComp.
Class TestDiff, method testDiff.
/**
 * Chunks the tokens read from {@code testFile} and checks the bracketed
 * rendering of each sentence against {@code refSentences}.
 *
 * Rendering per token: "[TYPE " when a chunk opens, then "(POS form) ", then
 * "] " when the chunk closes. Whenever a token has no successor the text built
 * so far is trimmed and compared with the trimmed reference at the current
 * sentence index; a mismatch fails the test.
 */
@Test
public void testDiff() {
    Chunker tagger = new Chunker();
    Parser tokens = new PlainToTokenParser(new WordSplitter(new SentenceSplitter(testFile)));
    String previousTag = "";
    StringBuilder rendered = new StringBuilder();
    int sentenceIndex = 0;

    Token word = (Token) tokens.next();
    while (word != null) {
        String tag = tagger.discreteValue(word);

        // A chunk opens on any B- tag, or on an I- tag whose type differs from
        // the suffix of the previous tag.
        boolean opensChunk = tag.startsWith("B-");
        if (!opensChunk && tag.startsWith("I-")) {
            opensChunk = !previousTag.endsWith(tag.substring(2));
        }
        if (opensChunk) {
            rendered.append("[" + tag.substring(2) + " ");
        }

        rendered.append("(" + word.partOfSpeech + " " + word.form + ") ");

        // Inside a chunk: close it when there is no next token, or the next
        // token is outside, begins a chunk, or has a different type.
        if (!tag.equals("O")) {
            boolean closesChunk = word.next == null
                    || tagger.discreteValue(word.next).equals("O")
                    || tagger.discreteValue(word.next).startsWith("B-")
                    || !tagger.discreteValue(word.next).endsWith(tag.substring(2));
            if (closesChunk) {
                rendered.append("] ");
            }
        }

        // No successor: compare what was produced with the reference and start over.
        if (word.next == null) {
            String produced = rendered.toString().trim();
            String refSentence = refSentences.get(sentenceIndex).trim();
            if (!produced.equals(refSentence)) {
                fail("Produced output doesn't match reference: " + "\nProduced: " + produced + "\nExpected: " + refSentence);
            }
            rendered.setLength(0);
            sentenceIndex++;
        }

        previousTag = tag;
        word = (Token) tokens.next();
    }
}
Use of edu.illinois.cs.cogcomp.chunker.main.lbjava.Chunker in project cogcomp-nlp by CogComp.
Class ChunksAndPOSTags, method main.
/**
 * Command-line entry point: chunks the text of one input file and logs it in
 * bracketed form.
 *
 * For every word the output consists of "[TYPE " when a chunk opens,
 * "(POS form) " for the word itself, "] " when the chunk closes, and "\n" after
 * a word that has no successor. Each fragment is sent through
 * {@code logger.info} separately.
 *
 * @param args exactly one element, the input file name; any other argument
 *             count prints the usage message to stderr and exits with status 1
 */
public static void main(String[] args) {
    // Validate the argument count directly instead of relying on a thrown
    // exception (index out of bounds / explicit throw) to reach the usage message.
    if (args.length != 1) {
        System.err.println("usage: java edu.illinois.cs.cogcomp.chunker.main.ChunksAndPOSTags <input file>");
        System.exit(1);
    }
    String filename = args[0];

    Chunker chunker = new Chunker();
    Parser parser = new PlainToTokenParser(new WordSplitter(new SentenceSplitter(filename)));
    String previous = "";
    for (Word w = (Word) parser.next(); w != null; w = (Word) parser.next()) {
        String prediction = chunker.discreteValue(w);

        // Open a chunk on B-, or on I- whose type differs from the previous tag's suffix.
        if (prediction.startsWith("B-") || (prediction.startsWith("I-") && !previous.endsWith(prediction.substring(2)))) {
            logger.info("[" + prediction.substring(2) + " ");
        }

        logger.info("(" + w.partOfSpeech + " " + w.form + ") ");

        // Close the chunk when this word is inside one and the next word is
        // missing, outside, starts a chunk, or carries a different type.
        if (!prediction.equals("O") && (w.next == null || chunker.discreteValue(w.next).equals("O") || chunker.discreteValue(w.next).startsWith("B-") || !chunker.discreteValue(w.next).endsWith(prediction.substring(2)))) {
            logger.info("] ");
        }

        // A word without a successor ends the current line of output.
        if (w.next == null) {
            logger.info("\n");
        }
        previous = prediction;
    }
}
Aggregations