Search in sources :

Example 1 with CoNLLDocumentReaderAndWriter

use of edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter in project CoreNLP by stanfordnlp.

the class NERBenchmarkSlowITest method evalConll.

/**
 * Evaluates the CoNLL NER pipeline on one dataset split.
 *
 * <p>For every gold document read from the selected input file, the words are joined with
 * single spaces, re-annotated with {@code conllNERAnnotationPipeline}, and compared token by
 * token against the gold labels. One tab-separated line per token (word, {@code "_"}, gold
 * answer, converted prediction) is written to a temporary results file, which is then passed
 * to {@code runEvalScript}; its output is parsed by {@code parseResults}.
 *
 * <p>The CoNLL tests assume a 4-class scheme: ORG, PER, LOC, MISC.
 *
 * @param dataset dataset name to evaluate; must be one of "train", "dev", "test"
 * @return the map produced by {@code parseResults} from the eval script output
 *     (presumably label to F1 score -- confirm against {@code parseResults})
 * @throws IOException if the temporary results file cannot be created or written, or the
 *     input cannot be opened
 * @throws RuntimeException if {@code dataset} is not one of the accepted names
 */
// NOTE that CoNLL tests assume a 4-class classification scheme: ORG, PER, LOC, MISC
public HashMap<String, Double> evalConll(String dataset) throws IOException {
    SeqClassifierFlags flags = new SeqClassifierFlags();
    flags.entitySubclassification = "noprefix";
    CoNLLDocumentReaderAndWriter rw = new CoNLLDocumentReaderAndWriter();
    rw.init(flags);
    String inputFile;
    File resultsFile;
    switch(dataset) {
        case "train":
            // NOTE(review): "train" currently evaluates the dev file (same prefix and input
            // as "dev"). This looks like a copy-paste slip, but no training-path constant is
            // visible from this method, so the behaviour is left unchanged -- confirm.
            resultsFile = File.createTempFile("conlldev", null);
            inputFile = CONLL_DEV;
            break;
        case "dev":
            resultsFile = File.createTempFile("conlldev", null);
            inputFile = CONLL_DEV;
            break;
        case "test":
            resultsFile = File.createTempFile("conlltest", null);
            inputFile = CONLL_TEST;
            break;
        default:
            throw new RuntimeException("Not a valid dataset name provided!");
    }
    resultsFile.deleteOnExit();
    // try-with-resources: a failing assertion or pipeline error must not leak the writer or
    // the input reader, and the writer must be flushed and closed before the eval script runs.
    try (PrintWriter writer = new PrintWriter(resultsFile);
            java.io.Reader input = IOUtils.readerFromString(inputFile)) {
        for (Iterator<List<CoreLabel>> itr = rw.getIterator(input); itr.hasNext(); ) {
            List<CoreLabel> goldLabels = itr.next();
            // Rebuild the raw text as " w1 w2 ..." (leading space kept, as before) without
            // the quadratic cost of repeated String concatenation.
            StringBuilder docString = new StringBuilder();
            for (CoreLabel goldLabel : goldLabels) {
                docString.append(' ').append(goldLabel.word());
            }
            Annotation docAnnotation = new Annotation(docString.toString());
            conllNERAnnotationPipeline.annotate(docAnnotation);
            List<CoreLabel> predictLabels = new ArrayList<>(docAnnotation.get(TokensAnnotation.class));
            assertEquals("# gold outputs not same as # predicted!\n", goldLabels.size(), predictLabels.size());
            int numLabels = goldLabels.size();
            // Write to output file
            for (int i = 0; i < numLabels; i++) {
                CoreLabel gold = goldLabels.get(i);
                // TODO(meric): What is difference between GoldAnswer and Answer annotation?
                String goldToken = gold.get(AnswerAnnotation.class);
                CoreLabel predict = predictLabels.get(i);
                String predictPrefix = convert(predict.get(NamedEntityTagAnnotation.class));
                String goldWord = gold.get(TextAnnotation.class);
                assertEquals("Gold and Predict words don't match!\n", goldWord, predict.get(TextAnnotation.class));
                writer.println(goldWord + "\t" + "_" + "\t" + goldToken + "\t" + predictPrefix);
            }
        }
    }
    // Run CoNLL eval script and extract F1 score
    String result = runEvalScript(resultsFile);
    return parseResults(result);
}
Also used : AnswerAnnotation(edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation) ArrayList(java.util.ArrayList) SeqClassifierFlags(edu.stanford.nlp.sequences.SeqClassifierFlags) TextAnnotation(edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation) AnswerAnnotation(edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation) TokensAnnotation(edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation) Annotation(edu.stanford.nlp.pipeline.Annotation) NamedEntityTagAnnotation(edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation) CoreLabel(edu.stanford.nlp.ling.CoreLabel) ArrayList(java.util.ArrayList) List(java.util.List) TextAnnotation(edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation) File(java.io.File) CoNLLDocumentReaderAndWriter(edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter) PrintWriter(java.io.PrintWriter)

Example 2 with CoNLLDocumentReaderAndWriter

use of edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter in project CoreNLP by stanfordnlp.

the class AnnotatedTextReader method parseColumnFile.

/**
 * Reads a CoNLL-style column file and splits it into sentence-level instances.
 *
 * <p>Tokens are read document by document through a {@code CoNLLDocumentReaderAndWriter}
 * configured with {@code entitySubclassification = "noprefix"}. A token whose word equals
 * {@code CoNLLDocumentReaderAndWriter.BOUNDARY} or {@code "-DOCSTART-"} ends the current
 * sentence and is itself dropped; the end of each document ends the sentence as well. Every
 * kept token gets its 1-based position within the sentence as {@code IndexAnnotation} and its
 * word copied into the value, text and original-text annotations.
 *
 * @param reader source of the column-formatted data
 * @param categoriesAllowed not used by this method
 * @param setClassForTheseLabels optional map from answer label to the annotation key that
 *     should additionally be set (to the label) on tokens carrying that label; may be null
 * @param setGoldClass whether to copy each token's answer label into
 *     {@code GoldAnswerAnnotation}
 * @param sentIDprefix prefix of the generated ids, which have the form
 *     {@code sentIDprefix + "-" + n} with {@code n} counting non-empty sentences from 0
 * @return map from generated sentence id to the instance built from that sentence's tokens
 */
public static Map<String, DataInstance> parseColumnFile(BufferedReader reader, Set<String> categoriesAllowed, Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels, boolean setGoldClass, String sentIDprefix) {
    CoNLLDocumentReaderAndWriter columnReader = new CoNLLDocumentReaderAndWriter();
    SeqClassifierFlags readerFlags = new SeqClassifierFlags(new Properties());
    readerFlags.entitySubclassification = "noprefix";
    readerFlags.retainEntitySubclassification = false;
    columnReader.init(readerFlags);

    Map<String, DataInstance> sentences = new HashMap<>();
    // Number to use for the next non-empty sentence that gets stored.
    int nextSentenceNumber = 0;
    for (Iterator<List<CoreLabel>> documents = columnReader.getIterator(reader); documents.hasNext(); ) {
        List<CoreLabel> document = documents.next();
        List<CoreLabel> sentenceTokens = new ArrayList<>();
        int position = 0;
        for (CoreLabel token : document) {
            String word = token.word();
            boolean isBoundary = word.equals(CoNLLDocumentReaderAndWriter.BOUNDARY) || word.equals("-DOCSTART-");
            if (isBoundary) {
                // Boundary markers only close a sentence; empty sentences are not stored.
                if (!sentenceTokens.isEmpty()) {
                    String sentenceId = sentIDprefix + "-" + nextSentenceNumber;
                    nextSentenceNumber++;
                    sentences.put(sentenceId, DataInstance.getNewSurfaceInstance(sentenceTokens));
                    sentenceTokens = new ArrayList<>();
                    position = 0;
                }
            } else {
                position++;
                token.set(CoreAnnotations.IndexAnnotation.class, position);
                token.set(CoreAnnotations.ValueAnnotation.class, word);
                String label = token.get(CoreAnnotations.AnswerAnnotation.class);
                assert label != null : "label cannot be null";
                token.set(CoreAnnotations.TextAnnotation.class, word);
                token.set(CoreAnnotations.OriginalTextAnnotation.class, word);
                if (setGoldClass) {
                    token.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
                }
                if (setClassForTheseLabels != null && setClassForTheseLabels.containsKey(label)) {
                    token.set(setClassForTheseLabels.get(label), label);
                }
                sentenceTokens.add(token);
            }
        }
        // Store whatever is left once the document ends without a trailing boundary.
        if (!sentenceTokens.isEmpty()) {
            String sentenceId = sentIDprefix + "-" + nextSentenceNumber;
            nextSentenceNumber++;
            sentences.put(sentenceId, DataInstance.getNewSurfaceInstance(sentenceTokens));
        }
    }
    return sentences;
}
Also used : DataInstance(edu.stanford.nlp.patterns.DataInstance) SeqClassifierFlags(edu.stanford.nlp.sequences.SeqClassifierFlags) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoNLLDocumentReaderAndWriter(edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter)

Aggregations

CoreLabel (edu.stanford.nlp.ling.CoreLabel)2 CoNLLDocumentReaderAndWriter (edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter)2 SeqClassifierFlags (edu.stanford.nlp.sequences.SeqClassifierFlags)2 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)1 AnswerAnnotation (edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation)1 NamedEntityTagAnnotation (edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation)1 TextAnnotation (edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation)1 TokensAnnotation (edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation)1 DataInstance (edu.stanford.nlp.patterns.DataInstance)1 Annotation (edu.stanford.nlp.pipeline.Annotation)1 File (java.io.File)1 PrintWriter (java.io.PrintWriter)1 ArrayList (java.util.ArrayList)1 List (java.util.List)1