Use of edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter in project CoreNLP by stanfordnlp.
From the class NERBenchmarkSlowITest, method evalConll:
/**
 * The main engine that does the heavy lifting for evaluating a dataset. We perform
 * 4-way classification on: ORG, PER, LOC, MISC.
 * @param dataset Dataset prefix to evaluate. Must be one of "train", "dev", "test"
 * @return F1 scores computed for the given dataset by the model
 * @throws IOException if the dataset file cannot be read
 */
// NOTE that CoNLL tests assume a 4-class classification scheme: ORG, PER, LOC, MISC
public HashMap<String, Double> evalConll(String dataset) throws IOException {
    SeqClassifierFlags flags = new SeqClassifierFlags();
    flags.entitySubclassification = "noprefix";
    CoNLLDocumentReaderAndWriter rw = new CoNLLDocumentReaderAndWriter();
    rw.init(flags);
    String inputFile;
    File resultsFile;
    switch (dataset) {
        case "train":
            // NOTE: as written, the "train" case also evaluates the dev file (CONLL_DEV)
            resultsFile = File.createTempFile("conlldev", null);
            inputFile = CONLL_DEV;
            break;
        case "dev":
            resultsFile = File.createTempFile("conlldev", null);
            inputFile = CONLL_DEV;
            break;
        case "test":
            resultsFile = File.createTempFile("conlltest", null);
            inputFile = CONLL_TEST;
            break;
        default:
            throw new RuntimeException("Not a valid dataset name provided!");
    }
    resultsFile.deleteOnExit();
    PrintWriter writer = new PrintWriter(resultsFile);
    for (Iterator<List<CoreLabel>> itr = rw.getIterator(IOUtils.readerFromString(inputFile)); itr.hasNext(); ) {
        List<CoreLabel> goldLabels = itr.next();
        // Rebuild the raw document text from the gold tokens
        StringBuilder docString = new StringBuilder();
        for (CoreLabel f1 : goldLabels) {
            docString.append(' ').append(f1.word());
        }
        Annotation docAnnotation = new Annotation(docString.toString());
        conllNERAnnotationPipeline.annotate(docAnnotation);
        List<CoreLabel> predictLabels = new ArrayList<>();
        for (CoreLabel l : docAnnotation.get(TokensAnnotation.class)) {
            predictLabels.add(l);
        }
        assertEquals("# gold outputs not same as # predicted!\n", goldLabels.size(), predictLabels.size());
        int numLabels = goldLabels.size();
        // Write gold and predicted labels, token by token, to the output file
        for (int i = 0; i < numLabels; i++) {
            CoreLabel gold = goldLabels.get(i);
            // TODO(meric): What is difference between GoldAnswer and Answer annotation?
            String goldToken = gold.get(AnswerAnnotation.class);
            CoreLabel predict = predictLabels.get(i);
            String predictStr = predict.get(NamedEntityTagAnnotation.class);
            String predictPrefix = convert(predictStr);
            assertEquals("Gold and Predict words don't match!\n",
                gold.get(TextAnnotation.class), predict.get(TextAnnotation.class));
            writer.println(gold.get(TextAnnotation.class) + "\t" + "_" + "\t" + goldToken + "\t" + predictPrefix);
        }
    }
    writer.close();
    // Run CoNLL eval script and extract F1 score
    String result = runEvalScript(resultsFile);
    return parseResults(result);
}
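For reference, a minimal sketch of how evalConll might be driven from this test class. The helper name printDevF1 and the printed output format are assumptions for illustration; the keys of the returned map are produced by parseResults, which is not shown in this excerpt.

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

// Hypothetical helper inside NERBenchmarkSlowITest: evaluate the dev split and
// print each F1 score that parseResults extracted from the eval script output.
public void printDevF1() throws IOException {
    HashMap<String, Double> f1Scores = evalConll("dev");
    for (Map.Entry<String, Double> entry : f1Scores.entrySet()) {
        System.out.println(entry.getKey() + "\tF1 = " + entry.getValue());
    }
}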
Use of edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter in project CoreNLP by stanfordnlp.
From the class AnnotatedTextReader, method parseColumnFile:
public static Map<String, DataInstance> parseColumnFile(BufferedReader reader, Set<String> categoriesAllowed,
        Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels,
        boolean setGoldClass, String sentIDprefix) {
    CoNLLDocumentReaderAndWriter conllreader = new CoNLLDocumentReaderAndWriter();
    Properties props = new Properties();
    SeqClassifierFlags flags = new SeqClassifierFlags(props);
    flags.entitySubclassification = "noprefix";
    flags.retainEntitySubclassification = false;
    conllreader.init(flags);
    Iterator<List<CoreLabel>> dociter = conllreader.getIterator(reader);
    int num = -1;
    Map<String, DataInstance> sents = new HashMap<>();
    while (dociter.hasNext()) {
        List<CoreLabel> doc = dociter.next();
        List<String> words = new ArrayList<>();
        List<CoreLabel> sentcore = new ArrayList<>();
        int tokenindex = 0;
        for (CoreLabel l : doc) {
            // A boundary token ends the current sentence: flush it and start a new one
            if (l.word().equals(CoNLLDocumentReaderAndWriter.BOUNDARY) || l.word().equals("-DOCSTART-")) {
                if (words.size() > 0) {
                    num++;
                    String docid = sentIDprefix + "-" + num;
                    DataInstance sentInst = DataInstance.getNewSurfaceInstance(sentcore);
                    sents.put(docid, sentInst);
                    words = new ArrayList<>();
                    sentcore = new ArrayList<>();
                    tokenindex = 0;
                }
                continue;
            }
            tokenindex++;
            words.add(l.word());
            l.set(CoreAnnotations.IndexAnnotation.class, tokenindex);
            l.set(CoreAnnotations.ValueAnnotation.class, l.word());
            String label = l.get(CoreAnnotations.AnswerAnnotation.class);
            assert label != null : "label cannot be null";
            l.set(CoreAnnotations.TextAnnotation.class, l.word());
            l.set(CoreAnnotations.OriginalTextAnnotation.class, l.word());
            if (setGoldClass) {
                l.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
            }
            if (setClassForTheseLabels != null && setClassForTheseLabels.containsKey(label)) {
                l.set(setClassForTheseLabels.get(label), label);
            }
            sentcore.add(l);
        }
        // Flush the final sentence if the document did not end with a boundary token
        if (words.size() > 0) {
            num++;
            String docid = sentIDprefix + "-" + num;
            DataInstance sentInst = DataInstance.getNewSurfaceInstance(sentcore);
            sents.put(docid, sentInst);
        }
    }
    return sents;
}
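For reference, a minimal sketch of calling parseColumnFile on an in-memory document. The two-column word/label layout, the demo class name, and the import package paths are assumptions for illustration; the column layout CoNLLDocumentReaderAndWriter actually expects depends on its flags, and real CoNLL 2003 files carry additional columns.

import java.io.BufferedReader;
import java.io.StringReader;
import java.util.Collections;
import java.util.Map;
// NOTE: package paths for DataInstance and AnnotatedTextReader are assumed here
import edu.stanford.nlp.patterns.DataInstance;
import edu.stanford.nlp.patterns.surface.AnnotatedTextReader;

public class ParseColumnFileDemo {
    public static void main(String[] args) {
        // Two sentences separated by a blank line; each line is word<TAB>label.
        // This two-column layout is an assumption for illustration.
        String doc = "Stanford\tORG\n"
                   + "University\tORG\n"
                   + "\n"
                   + "Paris\tLOC\n";
        BufferedReader reader = new BufferedReader(new StringReader(doc));
        Map<String, DataInstance> sents = AnnotatedTextReader.parseColumnFile(
            reader,
            Collections.singleton("ORG"),  // categoriesAllowed
            null,                          // setClassForTheseLabels: no extra label classes
            true,                          // setGoldClass: also store GoldAnswerAnnotation
            "docA");                       // sentence ids become "docA-0", "docA-1", ...
        System.out.println(sents.keySet());
    }
}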