Search in sources :

Example 11 with RVFDatum

Use of edu.stanford.nlp.ling.RVFDatum in the project CoreNLP by stanfordnlp.

The method main of the class KBPStatisticalExtractor.

/**
 * Command-line entry point: loads the test data, trains and saves a model if none
 * exists at {@code MODEL_FILE}, then loads the model and evaluates it on the test data.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Fill the class's command-line options from {@code args}.</li>
 *   <li>Read the test examples from {@code TEST_FILE}.</li>
 *   <li>If {@code MODEL_FILE} cannot be found on the classpath or file system, read the
 *       training examples from {@code TRAIN_FILE}, featurize them in parallel into an
 *       {@link RVFDataset}, train a classifier and write it to {@code MODEL_FILE}.</li>
 *   <li>Read the model back from {@code MODEL_FILE}; it may be either a bare
 *       {@link Classifier} or a {@link KBPStatisticalExtractor}.</li>
 *   <li>Compute accuracy over the test examples, optionally sending predictions to the
 *       destination named by {@code PREDICTIONS} ("stdout", case-insensitive, selects
 *       {@code System.out}; anything else is used as a file name).</li>
 * </ol>
 *
 * @param args command-line arguments, passed to {@code ArgumentParser.fillOptions}
 * @throws IOException declared by this method; presumably raised by the dataset/model I/O helpers
 * @throws ClassNotFoundException declared by this method; presumably raised when deserializing the model
 */
public static void main(String[] args) throws IOException, ClassNotFoundException {
    // Apply the standard Redwood logging configuration (original note: meant to silence SLF4J output).
    RedwoodConfiguration.standard().apply();
    // Fill command-line options
    ArgumentParser.fillOptions(KBPStatisticalExtractor.class, args);
    // Load the test (or dev) data
    forceTrack("Test data");
    List<Pair<KBPInput, String>> testExamples = KBPRelationExtractor.readDataset(TEST_FILE);
    log.info("Read " + testExamples.size() + " examples");
    endTrack("Test data");
    // If we can't find an existing model, train one
    if (!IOUtils.existsInClasspathOrFileSystem(MODEL_FILE)) {
        forceTrack("Training data");
        List<Pair<KBPInput, String>> trainExamples = KBPRelationExtractor.readDataset(TRAIN_FILE);
        log.info("Read " + trainExamples.size() + " examples");
        log.info("" + trainExamples.stream().map(Pair::second).filter(NO_RELATION::equals).count() + " are " + NO_RELATION);
        endTrack("Training data");
        // Featurize + create the dataset
        forceTrack("Creating dataset");
        RVFDataset<String, String> dataset = new RVFDataset<>();
        final AtomicInteger i = new AtomicInteger(0);
        long beginTime = System.currentTimeMillis();
        trainExamples.stream().parallel().forEach(example -> {
            // Use the value returned by the atomic increment for both the check and the
            // message. Re-reading the counter afterwards could observe increments made by
            // other worker threads and report a count that did not trigger this log line.
            final int featurized = i.incrementAndGet();
            if (featurized % 1000 == 0) {
                log.info("[" + Redwood.formatTimeDifference(System.currentTimeMillis() - beginTime) + "] Featurized " + featurized + " / " + trainExamples.size() + " examples");
            }
            // Featurization runs outside the lock so it can proceed in parallel.
            Counter<String> features = features(example.first);
            // Only the insertion into the shared dataset is serialized.
            synchronized (dataset) {
                dataset.add(new RVFDatum<>(features, example.second));
            }
        });
        // Free up some memory
        trainExamples.clear();
        endTrack("Creating dataset");
        // Train the classifier
        log.info("Training classifier:");
        Classifier<String, String> classifier = trainMultinomialClassifier(dataset, FEATURE_THRESHOLD, SIGMA);
        // Free up some memory
        dataset.clear();
        // Save the classifier
        IOUtils.writeObjectToFile(new KBPStatisticalExtractor(classifier), MODEL_FILE);
    }
    // Read either a newly-trained or pre-trained model
    Object model = IOUtils.readObjectFromURLOrClasspathOrFileSystem(MODEL_FILE);
    KBPStatisticalExtractor classifier;
    if (model instanceof Classifier) {
        // The stored object is a bare classifier: wrap it in an extractor.
        //noinspection unchecked
        classifier = new KBPStatisticalExtractor((Classifier<String, String>) model);
    } else {
        // Otherwise the stored object is treated as an already-wrapped extractor.
        classifier = ((KBPStatisticalExtractor) model);
    }
    // Evaluate the model
    // NOTE(review): a PrintStream opened on a file below is not closed in this method;
    // confirm whether computeAccuracy closes it or whether it is left to JVM exit.
    classifier.computeAccuracy(testExamples.stream(), PREDICTIONS.map(x -> {
        try {
            return "stdout".equalsIgnoreCase(x) ? System.out : new PrintStream(new FileOutputStream(x));
        } catch (IOException e) {
            // The lambda cannot throw a checked exception, so rethrow unchecked with the cause preserved.
            throw new RuntimeIOException(e);
        }
    }));
}
Also used : edu.stanford.nlp.optimization(edu.stanford.nlp.optimization) CoreLabel(edu.stanford.nlp.ling.CoreLabel) java.util(java.util) Counters(edu.stanford.nlp.stats.Counters) IOUtils(edu.stanford.nlp.io.IOUtils) DefaultPaths(edu.stanford.nlp.pipeline.DefaultPaths) edu.stanford.nlp.util(edu.stanford.nlp.util) Redwood(edu.stanford.nlp.util.logging.Redwood) Util(edu.stanford.nlp.util.logging.Redwood.Util) Datum(edu.stanford.nlp.ling.Datum) Function(java.util.function.Function) Collectors(java.util.stream.Collectors) Span(edu.stanford.nlp.ie.machinereading.structure.Span) Counter(edu.stanford.nlp.stats.Counter) java.io(java.io) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) edu.stanford.nlp.classify(edu.stanford.nlp.classify) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) Sentence(edu.stanford.nlp.simple.Sentence) RedwoodConfiguration(edu.stanford.nlp.util.logging.RedwoodConfiguration) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) RVFDatum(edu.stanford.nlp.ling.RVFDatum) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) AtomicInteger(java.util.concurrent.atomic.AtomicInteger)

Aggregations

RVFDatum (edu.stanford.nlp.ling.RVFDatum)11 CoreLabel (edu.stanford.nlp.ling.CoreLabel)5 ClassicCounter (edu.stanford.nlp.stats.ClassicCounter)5 edu.stanford.nlp.classify (edu.stanford.nlp.classify)4 IOUtils (edu.stanford.nlp.io.IOUtils)3 RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException)3 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)3 Counter (edu.stanford.nlp.stats.Counter)3 Redwood (edu.stanford.nlp.util.logging.Redwood)3 Util (edu.stanford.nlp.util.logging.Redwood.Util)3 File (java.io.File)3 ArrayList (java.util.ArrayList)3 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)3 Span (edu.stanford.nlp.ie.machinereading.structure.Span)2 ScorePhraseMeasures (edu.stanford.nlp.patterns.ConstantsAndVariables.ScorePhraseMeasures)2 Annotation (edu.stanford.nlp.pipeline.Annotation)2 SentimentClass (edu.stanford.nlp.simple.SentimentClass)2 edu.stanford.nlp.util (edu.stanford.nlp.util)2 CoreMap (edu.stanford.nlp.util.CoreMap)2 RedwoodConfiguration (edu.stanford.nlp.util.logging.RedwoodConfiguration)2