Example 71 with RuntimeIOException

Use of edu.stanford.nlp.io.RuntimeIOException in the CoreNLP project by stanfordnlp.

From the class Dictionaries, the method loadGenderLists:

private void loadGenderLists(String maleWordsFile, String neutralWordsFile, String femaleWordsFile) {
    try {
        getWordsFromFile(maleWordsFile, maleWords, false);
        getWordsFromFile(neutralWordsFile, neutralWords, false);
        getWordsFromFile(femaleWordsFile, femaleWords, false);
    } catch (IOException e) {
        throw new RuntimeIOException(e);
    }
}
Also used: RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) IOException(java.io.IOException)
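
The getWordsFromFile helper is not shown in this example. As a minimal sketch only — the signature and the meaning of the boolean flag are assumptions, not the actual CoreNLP implementation — a one-word-per-line reader might look like:

private static void getWordsFromFile(String filename, Set<String> words, boolean lowercase) throws IOException {
    // Hypothetical sketch; the real CoreNLP helper may differ.
    // Requires java.io.*, java.nio.charset.StandardCharsets, and java.util.Set.
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(filename), StandardCharsets.UTF_8))) {
        for (String line; (line = reader.readLine()) != null; ) {
            String word = line.trim();
            if (!word.isEmpty()) {
                words.add(lowercase ? word.toLowerCase() : word);
            }
        }
    }
}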

Example 72 with RuntimeIOException

Use of edu.stanford.nlp.io.RuntimeIOException in the CoreNLP project by stanfordnlp.

From the class ACEMentionExtractor, the method nextDoc:

public Document nextDoc() throws Exception {
    List<List<CoreLabel>> allWords = new ArrayList<>();
    List<List<Mention>> allGoldMentions = new ArrayList<>();
    List<List<Mention>> allPredictedMentions;
    List<Tree> allTrees = new ArrayList<>();
    Annotation anno;
    try {
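        // Scan forward to the next ACE annotation file (*.apf.xml), skipping other files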
        String filename = "";
        while (files.length > fileIndex) {
            if (files[fileIndex].contains("apf.xml")) {
                filename = files[fileIndex];
                fileIndex++;
                break;
            } else {
                fileIndex++;
                filename = "";
            }
        }
        if (files.length <= fileIndex && filename.equals(""))
            return null;
        anno = aceReader.parse(corpusPath + filename);
        stanfordProcessor.annotate(anno);
        List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class);
        for (CoreMap s : sentences) {
            int i = 1;
            for (CoreLabel w : s.get(CoreAnnotations.TokensAnnotation.class)) {
                w.set(CoreAnnotations.IndexAnnotation.class, i++);
                if (!w.containsKey(CoreAnnotations.UtteranceAnnotation.class)) {
                    w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
                }
            }
            allTrees.add(s.get(TreeCoreAnnotations.TreeAnnotation.class));
            allWords.add(s.get(CoreAnnotations.TokensAnnotation.class));
            EntityComparator comparator = new EntityComparator();
            extractGoldMentions(s, allGoldMentions, comparator);
        }
        if (Constants.USE_GOLD_MENTIONS)
            allPredictedMentions = allGoldMentions;
        else
            allPredictedMentions = mentionFinder.extractPredictedMentions(anno, maxID, dictionaries);
        printRawDoc(sentences, allGoldMentions, filename, true);
        printRawDoc(sentences, allPredictedMentions, filename, false);
    } catch (IOException e) {
        throw new RuntimeIOException(e);
    }
    return arrange(anno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
}
Also used: RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Annotation(edu.stanford.nlp.pipeline.Annotation) CoreLabel(edu.stanford.nlp.ling.CoreLabel) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) TreeCoreAnnotations(edu.stanford.nlp.trees.TreeCoreAnnotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) Tree(edu.stanford.nlp.trees.Tree) List(java.util.List) CoreMap(edu.stanford.nlp.util.CoreMap)
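
Because nextDoc() returns null once the file list is exhausted, callers typically drain the extractor in a loop. A hedged driver sketch — the extractor is assumed to be constructed elsewhere, and since nextDoc() declares throws Exception the caller must handle or propagate it:

static int countDocuments(ACEMentionExtractor extractor) throws Exception {
    // Hypothetical driver: drains the extractor until it signals end of corpus.
    int numDocs = 0;
    while (extractor.nextDoc() != null) {
        numDocs++;
    }
    return numDocs;
}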

Example 73 with RuntimeIOException

Use of edu.stanford.nlp.io.RuntimeIOException in the CoreNLP project by stanfordnlp.

From the class ClauseSplitter, the method train:

/**
 * Train a clause searcher factory. That is, train a classifier for which arcs should be
 * new clauses.
 *
 * @param trainingData The training data. This is a stream of triples of:
 *                     <ol>
 *                       <li>The sentence containing a known extraction.</li>
 *                       <li>The span of the subject in the sentence, as a token span.</li>
 *                       <li>The span of the object in the sentence, as a token span.</li>
 *                     </ol>
 * @param modelPath The path to save the model to. This is useful for {@link ClauseSplitter#load(String)}.
 * @param trainingDataDump The path to save the training data, as a set of labeled featurized datums.
 * @param featurizer The featurizer to use for this classifier.
 *
 * @return A factory for creating searchers from a given dependency tree.
 */
static ClauseSplitter train(Stream<Pair<CoreMap, Collection<Pair<Span, Span>>>> trainingData, Optional<File> modelPath, Optional<File> trainingDataDump, Featurizer featurizer) {
    // Parse options
    LinearClassifierFactory<ClauseClassifierLabel, String> factory = new LinearClassifierFactory<>();
    // Generally useful objects
    OpenIE openie = new OpenIE(PropertiesUtils.asProperties("splitter.nomodel", "true", "optimizefor", "GENERAL"));
    WeightedDataset<ClauseClassifierLabel, String> dataset = new WeightedDataset<>();
    AtomicInteger numExamplesProcessed = new AtomicInteger(0);
    final Optional<PrintWriter> datasetDumpWriter = trainingDataDump.map(file -> {
        try {
            // use the lambda parameter 'file' directly rather than re-reading the Optional
            return new PrintWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(file))));
        } catch (IOException e) {
            throw new RuntimeIOException(e);
        }
    });
    // Step 1: Loop over data
    forceTrack("Training inference");
    trainingData.forEach(rawExample -> {
        // Parse training datum
        CoreMap sentence = rawExample.first;
        Collection<Pair<Span, Span>> spans = rawExample.second;
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        SemanticGraph tree = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
        // Create raw clause searcher (no classifier)
        ClauseSplitterSearchProblem problem = new ClauseSplitterSearchProblem(tree, true);
        // Run search
        problem.search(fragmentAndScore -> {
            // Parse the search callback
            List<Counter<String>> features = fragmentAndScore.second;
            SentenceFragment fragment = fragmentAndScore.third.get();
            // Search for extractions
            Set<RelationTriple> extractions = new HashSet<>(openie.relationsInFragments(openie.entailmentsFromClause(fragment)));
            Trilean correct = Trilean.FALSE;
            RELATION_TRIPLE_LOOP: for (RelationTriple extraction : extractions) {
                // Clean up the guesses
                Span subjectGuess = Span.fromValues(extraction.subject.get(0).index() - 1, extraction.subject.get(extraction.subject.size() - 1).index());
                Span objectGuess = Span.fromValues(extraction.object.get(0).index() - 1, extraction.object.get(extraction.object.size() - 1).index());
                for (Pair<Span, Span> candidateGold : spans) {
                    Span subjectSpan = candidateGold.first;
                    Span objectSpan = candidateGold.second;
                    // Check if it matches
                    if ((subjectGuess.equals(subjectSpan) && objectGuess.equals(objectSpan)) || (subjectGuess.equals(objectSpan) && objectGuess.equals(subjectSpan))) {
                        correct = Trilean.TRUE;
                        break RELATION_TRIPLE_LOOP;
                    } else if (Util.nerOverlap(tokens, subjectSpan, subjectGuess) && Util.nerOverlap(tokens, objectSpan, objectGuess) || Util.nerOverlap(tokens, subjectSpan, objectGuess) && Util.nerOverlap(tokens, objectSpan, subjectGuess)) {
                        if (!correct.isTrue()) {
                            correct = Trilean.TRUE;
                            break RELATION_TRIPLE_LOOP;
                        }
                    } else {
                        if (!correct.isTrue()) {
                            correct = Trilean.UNKNOWN;
                            break RELATION_TRIPLE_LOOP;
                        }
                    }
                }
            }
            // Process the datum
            if (!features.isEmpty()) {
                // Convert the path to datums
                List<Pair<Counter<String>, ClauseClassifierLabel>> decisionsToAddAsDatums = new ArrayList<>();
                if (correct.isTrue()) {
                    // If this is a "true" path, add the k-1 decisions as INTERM and the last decision as a SPLIT
                    for (int i = 0; i < features.size(); ++i) {
                        if (i == features.size() - 1) {
                            decisionsToAddAsDatums.add(Pair.makePair(features.get(i), ClauseClassifierLabel.CLAUSE_SPLIT));
                        } else {
                            decisionsToAddAsDatums.add(Pair.makePair(features.get(i), ClauseClassifierLabel.CLAUSE_INTERM));
                        }
                    }
                } else if (correct.isFalse()) {
                    // If this is a "false" path, then we know at least the last decision was bad.
                    decisionsToAddAsDatums.add(Pair.makePair(features.get(features.size() - 1), ClauseClassifierLabel.NOT_A_CLAUSE));
                } else if (correct.isUnknown()) {
                    // If this is an "unknown" path, only add it if it was the result of vanilla splits
                    // (check if it is a sequence of simple splits)
                    boolean isSimpleSplit = false;
                    for (Counter<String> feats : features) {
                        if (featurizer.isSimpleSplit(feats)) {
                            isSimpleSplit = true;
                            break;
                        }
                    }
                    // (if so, add it as if it were a True example)
                    if (isSimpleSplit) {
                        for (int i = 0; i < features.size(); ++i) {
                            if (i == features.size() - 1) {
                                decisionsToAddAsDatums.add(Pair.makePair(features.get(i), ClauseClassifierLabel.CLAUSE_SPLIT));
                            } else {
                                decisionsToAddAsDatums.add(Pair.makePair(features.get(i), ClauseClassifierLabel.CLAUSE_INTERM));
                            }
                        }
                    }
                }
                // Add the datums
                for (Pair<Counter<String>, ClauseClassifierLabel> decision : decisionsToAddAsDatums) {
                    // (create datum)
                    RVFDatum<ClauseClassifierLabel, String> datum = new RVFDatum<>(decision.first);
                    datum.setLabel(decision.second);
                    // (dump datum to debug log)
                    if (datasetDumpWriter.isPresent()) {
                        datasetDumpWriter.get().println(decision.second + "\t" + StringUtils.join(decision.first.entrySet().stream().map(entry -> entry.getKey() + "->" + entry.getValue()), ";"));
                    }
                    // (add datum to dataset)
                    dataset.add(datum);
                }
            }
            return true;
        }, new LinearClassifier<>(new ClassicCounter<>()), Collections.emptyMap(), featurizer, 10000);
        // Debug info
        if (numExamplesProcessed.incrementAndGet() % 100 == 0) {
            log("processed " + numExamplesProcessed + " training sentences: " + dataset.size() + " datums");
        }
    });
    endTrack("Training inference");
    // Close the file
    if (datasetDumpWriter.isPresent()) {
        datasetDumpWriter.get().close();
    }
    // Step 2: Train classifier
    forceTrack("Training");
    Classifier<ClauseClassifierLabel, String> fullClassifier = factory.trainClassifier(dataset);
    endTrack("Training");
    if (modelPath.isPresent()) {
        Pair<Classifier<ClauseClassifierLabel, String>, Featurizer> toSave = Pair.makePair(fullClassifier, featurizer);
        try {
            IOUtils.writeObjectToFile(toSave, modelPath.get());
            log("SUCCESS: wrote model to " + modelPath.get().getPath());
        } catch (IOException e) {
            log("ERROR: failed to save model to path: " + modelPath.get().getPath());
            err(e);
        }
    }
    // Step 3: Check accuracy of classifier
    forceTrack("Training accuracy");
    dataset.randomize(42L);
    Util.dumpAccuracy(fullClassifier, dataset);
    endTrack("Training accuracy");
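    // Step 4: n-fold cross-validation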
    int numFolds = 5;
    forceTrack(numFolds + " fold cross-validation");
    for (int fold = 0; fold < numFolds; ++fold) {
        forceTrack("Fold " + (fold + 1));
        forceTrack("Training");
        Pair<GeneralDataset<ClauseClassifierLabel, String>, GeneralDataset<ClauseClassifierLabel, String>> foldData = dataset.splitOutFold(fold, numFolds);
        Classifier<ClauseClassifierLabel, String> classifier = factory.trainClassifier(foldData.first);
        endTrack("Training");
        forceTrack("Test");
        Util.dumpAccuracy(classifier, foldData.second);
        endTrack("Test");
        endTrack("Fold " + (fold + 1));
    }
    endTrack(numFolds + " fold cross-validation");
    // Step 5: return factory
    return (tree, truth) -> new ClauseSplitterSearchProblem(tree, truth, Optional.of(fullClassifier), Optional.of(featurizer));
}
Also used: CoreLabel(edu.stanford.nlp.ling.CoreLabel) java.util(java.util) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) IOUtils(edu.stanford.nlp.io.IOUtils) edu.stanford.nlp.util(edu.stanford.nlp.util) BiFunction(java.util.function.BiFunction) Redwood(edu.stanford.nlp.util.logging.Redwood) Util(edu.stanford.nlp.util.logging.Redwood.Util) Span(edu.stanford.nlp.ie.machinereading.structure.Span) Counter(edu.stanford.nlp.stats.Counter) Stream(java.util.stream.Stream) java.io(java.io) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) edu.stanford.nlp.classify(edu.stanford.nlp.classify) RelationTriple(edu.stanford.nlp.ie.util.RelationTriple) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) GZIPOutputStream(java.util.zip.GZIPOutputStream) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) RVFDatum(edu.stanford.nlp.ling.RVFDatum) ClauseSplitterSearchProblem(edu.stanford.nlp.naturalli.ClauseSplitterSearchProblem)
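
A hedged call-site sketch for train. The example loader and the featurizer below are hypothetical placeholders; only the train signature itself comes from the code above:

// Illustrative only: loadExamples() and myFeaturizer are hypothetical stand-ins.
List<Pair<CoreMap, Collection<Pair<Span, Span>>>> examples = loadExamples();
ClauseSplitter splitter = ClauseSplitter.train(
        examples.stream(),
        Optional.of(new File("clauseSplitter.ser.gz")),  // save the trained model here
        Optional.empty(),                                // skip the featurized-datum dump
        myFeaturizer);                                   // some Featurizer implementation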

Example 74 with RuntimeIOException

Use of edu.stanford.nlp.io.RuntimeIOException in the CoreNLP project by stanfordnlp.

From the class CustomAnnotationSerializer, the method read:

@Override
public Pair<Annotation, InputStream> read(InputStream is) throws IOException {
    if (compress && !(is instanceof GZIPInputStream))
        is = new GZIPInputStream(is);
    BufferedReader reader = new BufferedReader(new InputStreamReader(is));
    Annotation doc = new Annotation("");
    String line;
    // read the coref graph (new format)
    Map<Integer, CorefChain> chains = loadCorefChains(reader);
    if (chains != null)
        doc.set(CorefCoreAnnotations.CorefChainAnnotation.class, chains);
    // read the coref graph (old format)
    line = reader.readLine().trim();
    if (line.length() > 0) {
        String[] bits = line.split(" ");
        if (bits.length % 4 != 0) {
            throw new RuntimeIOException("ERROR: Incorrect format for the serialized coref graph: " + line);
        }
        List<Pair<IntTuple, IntTuple>> corefGraph = new ArrayList<>();
        for (int i = 0; i < bits.length; i += 4) {
            IntTuple src = new IntTuple(2);
            IntTuple dst = new IntTuple(2);
            src.set(0, Integer.parseInt(bits[i]));
            src.set(1, Integer.parseInt(bits[i + 1]));
            dst.set(0, Integer.parseInt(bits[i + 2]));
            dst.set(1, Integer.parseInt(bits[i + 3]));
            corefGraph.add(new Pair<>(src, dst));
        }
        doc.set(CorefCoreAnnotations.CorefGraphAnnotation.class, corefGraph);
    }
    // read individual sentences
    List<CoreMap> sentences = new ArrayList<>();
    while ((line = reader.readLine()) != null) {
        CoreMap sentence = new Annotation("");
        // first line is the parse tree. construct it with CoreLabels in Tree nodes
        Tree tree = new PennTreeReader(new StringReader(line), new LabeledScoredTreeFactory(CoreLabel.factory())).readTree();
        sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
        // read the dependency graphs
        IntermediateSemanticGraph intermCollapsedDeps = loadDependencyGraph(reader);
        IntermediateSemanticGraph intermUncollapsedDeps = loadDependencyGraph(reader);
        IntermediateSemanticGraph intermCcDeps = loadDependencyGraph(reader);
        // the remaining lines until empty line are tokens
        List<CoreLabel> tokens = new ArrayList<>();
        while ((line = reader.readLine()) != null) {
            if (line.length() == 0)
                break;
            CoreLabel token = loadToken(line, haveExplicitAntecedent);
            tokens.add(token);
        }
        sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
        // convert the intermediate graph to an actual SemanticGraph
        SemanticGraph collapsedDeps = intermCollapsedDeps.convertIntermediateGraph(tokens);
        sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, collapsedDeps);
        SemanticGraph uncollapsedDeps = intermUncollapsedDeps.convertIntermediateGraph(tokens);
        sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, uncollapsedDeps);
        SemanticGraph ccDeps = intermCcDeps.convertIntermediateGraph(tokens);
        sentence.set(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, ccDeps);
        sentences.add(sentence);
    }
    doc.set(CoreAnnotations.SentencesAnnotation.class, sentences);
    return Pair.makePair(doc, is);
}
Also used: CorefCoreAnnotations(edu.stanford.nlp.coref.CorefCoreAnnotations) GZIPInputStream(java.util.zip.GZIPInputStream) CorefChain(edu.stanford.nlp.coref.data.CorefChain) Tree(edu.stanford.nlp.trees.Tree) LabeledScoredTreeFactory(edu.stanford.nlp.trees.LabeledScoredTreeFactory) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) TreeCoreAnnotations(edu.stanford.nlp.trees.TreeCoreAnnotations) CoreLabel(edu.stanford.nlp.ling.CoreLabel) PennTreeReader(edu.stanford.nlp.trees.PennTreeReader) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph)
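
A hedged round-trip sketch for this serializer. AnnotationSerializer implementations pair read with a matching write; the constructor flags (compression, explicit antecedents) and the pre-annotated document below are assumptions:

// Hedged sketch; annotatedDoc is a hypothetical, already-annotated Annotation,
// and the (compress, haveExplicitAntecedent) constructor flags are assumptions.
CustomAnnotationSerializer serializer = new CustomAnnotationSerializer(true, false);
OutputStream os = serializer.write(annotatedDoc, new FileOutputStream("doc.ser.gz"));
os.close();  // closing flushes the (possibly gzip-compressed) stream
Pair<Annotation, InputStream> restored = serializer.read(new FileInputStream("doc.ser.gz"));
Annotation roundTripped = restored.first;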

Example 75 with RuntimeIOException

Use of edu.stanford.nlp.io.RuntimeIOException in the CoreNLP project by stanfordnlp.

From the class MaxentTagger, the method readModelAndInit:

/**
 * This reads the complete tagger from a single model provided as an InputStream,
 *  and initializes the tagger using a
 *  combination of the properties passed in and parameters from the file.
 *  <br>
 *  <i>Note for the future:</i> This assumes that the TaggerConfig in the file
 *  has already been read and used.  This work is done inside the
 *  constructor of TaggerConfig.  It might be better to refactor
 *  things so that is all done inside this method, but for the moment
 *  it seemed better to leave working code alone [cdm 2008].
 *
 *  @param config The tagger config
 *  @param modelStream The model provided as an InputStream
 *  @param printLoading Whether to print a message saying what model file is being loaded and how long it took when finished.
 *  @throws RuntimeIOException if there are I/O or serialization errors
 */
protected void readModelAndInit(Properties config, InputStream modelStream, boolean printLoading) {
    try {
        // first check that we can open the stream, or else leave via an exception
        DataInputStream rf = new DataInputStream(modelStream);
        readModelAndInit(config, rf, printLoading);
        rf.close();
    } catch (IOException e) {
        throw new RuntimeIOException("Error while loading a tagger model (probably missing model file)", e);
    }
}
Also used: RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException)
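
readModelAndInit is protected; in practice it is reached through MaxentTagger's public constructors. A hedged loading sketch — the model resource path is an assumption and requires the matching models jar on the classpath:

// Hedged sketch; the model path below is an assumption.
try {
    MaxentTagger tagger = new MaxentTagger(
            "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger");
    System.err.println(tagger.tagString("The quick brown fox jumped over the lazy dog ."));
} catch (RuntimeIOException e) {
    System.err.println("Could not load tagger model: " + e.getMessage());
}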

Aggregations

RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException) 114
IOException (java.io.IOException) 61
BufferedReader (java.io.BufferedReader) 22
CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations) 12
CoreLabel (edu.stanford.nlp.ling.CoreLabel) 11
File (java.io.File) 9
ArrayList (java.util.ArrayList) 7
Tree (edu.stanford.nlp.trees.Tree) 6
CoreMap (edu.stanford.nlp.util.CoreMap) 5
BufferedWriter (java.io.BufferedWriter) 5
Properties (java.util.Properties) 5
Timing (edu.stanford.nlp.util.Timing) 4
FileNotFoundException (java.io.FileNotFoundException) 4
FileOutputStream (java.io.FileOutputStream) 4
ObjectOutputStream (java.io.ObjectOutputStream) 4
PrintWriter (java.io.PrintWriter) 4
CorefCoreAnnotations (edu.stanford.nlp.coref.CorefCoreAnnotations) 3
Annotation (edu.stanford.nlp.pipeline.Annotation) 3
SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) 3
ClassicCounter (edu.stanford.nlp.stats.ClassicCounter) 3