Example use in project CoreNLP by stanfordnlp.
From the class Dictionaries, method loadGenderLists.
private void loadGenderLists(String maleWordsFile, String neutralWordsFile, String femaleWordsFile) {
try {
getWordsFromFile(maleWordsFile, maleWords, false);
getWordsFromFile(neutralWordsFile, neutralWords, false);
getWordsFromFile(femaleWordsFile, femaleWords, false);
} catch (IOException e) {
throw new RuntimeIOException(e);
Example use in project CoreNLP by stanfordnlp.
From the class ACEMentionExtractor, method nextDoc.
/**
 * Reads the next ACE document and arranges it into a {@code Document}.
 *
 * <p>As visible in this excerpt: scans {@code files} from {@code fileIndex} for a
 * name containing "apf.xml", parses {@code corpusPath + filename} with
 * {@code aceReader}, numbers the tokens of every sentence, extracts gold and
 * predicted mentions, prints both, and hands everything to {@code arrange}.
 *
 * <p>NOTE(review): this excerpt is not compilable as shown. Closing braces are
 * absent, nothing inside the file-scan loop advances {@code fileIndex} or leaves
 * the loop, there is no {@code else} between the two assignments to
 * {@code allPredictedMentions}, and {@code allWords} / {@code allTrees} are never
 * filled. Confirm against the full source before relying on the control flow.
 *
 * @return the value of {@code arrange(...)}, or {@code null} when the file list
 *         is exhausted and no file name was selected
 * @throws RuntimeIOException wrapping any {@link IOException} raised in the try block
 * @throws Exception as declared by the signature
 */
public Document nextDoc() throws Exception {
// Per-sentence accumulators handed to arrange() at the end.
List<List<CoreLabel>> allWords = new ArrayList<>();
List<List<Mention>> allGoldMentions = new ArrayList<>();
List<List<Mention>> allPredictedMentions;
List<Tree> allTrees = new ArrayList<>();
Annotation anno;
try {
// Look for the next file whose name contains "apf.xml"; an empty string means "none chosen".
String filename = "";
while (files.length > fileIndex) {
if (files[fileIndex].contains("apf.xml")) {
filename = files[fileIndex];
} else {
filename = "";
// No files left and nothing selected: signal end of corpus with null.
if (files.length <= fileIndex && filename.equals(""))
return null;
anno = aceReader.parse(corpusPath + filename);
List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class);
for (CoreMap s : sentences) {
// Token indices restart at 1 for every sentence.
int i = 1;
for (CoreLabel w : s.get(CoreAnnotations.TokensAnnotation.class)) {
w.set(CoreAnnotations.IndexAnnotation.class, i++);
// Tokens without an utterance annotation get utterance 0.
if (!w.containsKey(CoreAnnotations.UtteranceAnnotation.class)) {
w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
EntityComparator comparator = new EntityComparator();
extractGoldMentions(s, allGoldMentions, comparator);
// With USE_GOLD_MENTIONS the gold mentions double as the predicted ones.
// NOTE(review): without an else, the next assignment overwrites this one — presumably an "else" line was lost; confirm.
if (Constants.USE_GOLD_MENTIONS)
allPredictedMentions = allGoldMentions;
allPredictedMentions = mentionFinder.extractPredictedMentions(anno, maxID, dictionaries);
// Print the document twice: with gold mentions (flag true) and with predicted mentions (flag false).
printRawDoc(sentences, allGoldMentions, filename, true);
printRawDoc(sentences, allPredictedMentions, filename, false);
} catch (IOException e) {
// Rethrow unchecked, keeping the original exception as the cause.
throw new RuntimeIOException(e);
return arrange(anno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
Example use in project CoreNLP by stanfordnlp.
From the class ClauseSplitter, method train.
/**
 * Train a clause searcher factory. That is, train a classifier for which arcs should be
 * new clauses.
 *
 * @param trainingData The training data. This is a stream of triples of:
 *                     <ol>
 *                       <li>The sentence containing a known extraction.</li>
 *                       <li>The span of the subject in the sentence, as a token span.</li>
 *                       <li>The span of the object in the sentence, as a token span.</li>
 *                     </ol>
 * @param modelPath The path to save the model to. This is useful for {@link ClauseSplitter#load(String)}.
 * @param trainingDataDump The path to save the training data, as a set of labeled featurized datums.
 * @param featurizer The featurizer to use for this classifier.
 * @return A factory for creating searchers from a given dependency tree.
 */
// NOTE(review): this excerpt is not compilable as shown. Closing braces are absent throughout,
// and the initializer of datasetDumpWriter and the "Run search" line both look truncated.
// Confirm against the full source before editing the logic.
static ClauseSplitter train(Stream<Pair<CoreMap, Collection<Pair<Span, Span>>>> trainingData, Optional<File> modelPath, Optional<File> trainingDataDump, Featurizer featurizer) {
// Parse options
LinearClassifierFactory<ClauseClassifierLabel, String> factory = new LinearClassifierFactory<>();
// Generally useful objects
OpenIE openie = new OpenIE(PropertiesUtils.asProperties("splitter.nomodel", "true", "optimizefor", "GENERAL"));
WeightedDataset<ClauseClassifierLabel, String> dataset = new WeightedDataset<>();
// Counts processed training sentences; used for the progress log every 100 examples.
AtomicInteger numExamplesProcessed = new AtomicInteger(0);
// Optional writer for the datum dump: gzip-compressed text written to the trainingDataDump file.
// NOTE(review): the expression before "->" is missing (presumably a map over trainingDataDump) — confirm.
final Optional<PrintWriter> datasetDumpWriter = -> {
try {
return new PrintWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(trainingDataDump.get()))));
} catch (IOException e) {
throw new RuntimeIOException(e);
// Step 1: Loop over data
forceTrack("Training inference");
trainingData.forEach(rawExample -> {
// Parse training datum
CoreMap sentence = rawExample.first;
Collection<Pair<Span, Span>> spans = rawExample.second;
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
SemanticGraph tree = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
// Create raw clause searcher (no classifier)
ClauseSplitterSearchProblem problem = new ClauseSplitterSearchProblem(tree, true);
// NOTE(review): the next line looks truncated — a call on `problem` taking a callback whose
// parameter is `fragmentAndScore` was presumably here; confirm.
// Run search -> {
// Parse the search callback
List<Counter<String>> features = fragmentAndScore.second;
SentenceFragment fragment = fragmentAndScore.third.get();
// Search for extractions
Set<RelationTriple> extractions = new HashSet<>(openie.relationsInFragments(openie.entailmentsFromClause(fragment)));
// Label for this fragment: starts FALSE and is upgraded below to TRUE or UNKNOWN.
Trilean correct = Trilean.FALSE;
RELATION_TRIPLE_LOOP: for (RelationTriple extraction : extractions) {
// Clean up the guesses
// Span start is the first token's index() minus one; span end is the last token's index().
// (assumes index() is 1-based, giving a 0-based end-exclusive span — TODO confirm)
Span subjectGuess = Span.fromValues(extraction.subject.get(0).index() - 1, extraction.subject.get(extraction.subject.size() - 1).index());
Span objectGuess = Span.fromValues(extraction.object.get(0).index() - 1, extraction.object.get(extraction.object.size() - 1).index());
for (Pair<Span, Span> candidateGold : spans) {
Span subjectSpan = candidateGold.first;
Span objectSpan = candidateGold.second;
// Check if it matches
// Exact span match counts in either subject/object order.
if ((subjectGuess.equals(subjectSpan) && objectGuess.equals(objectSpan)) || (subjectGuess.equals(objectSpan) && objectGuess.equals(subjectSpan))) {
correct = Trilean.TRUE;
// Otherwise an NER overlap (again in either order) also counts as TRUE.
} else if (Util.nerOverlap(tokens, subjectSpan, subjectGuess) && Util.nerOverlap(tokens, objectSpan, objectGuess) || Util.nerOverlap(tokens, subjectSpan, objectGuess) && Util.nerOverlap(tokens, objectSpan, subjectGuess)) {
if (!correct.isTrue()) {
correct = Trilean.TRUE;
} else {
// No match for this gold pair: mark UNKNOWN unless a TRUE was already recorded.
if (!correct.isTrue()) {
correct = Trilean.UNKNOWN;
// Process the datum
if (!features.isEmpty()) {
// Convert the path to datums
List<Pair<Counter<String>, ClauseClassifierLabel>> decisionsToAddAsDatums = new ArrayList<>();
if (correct.isTrue()) {
// If this is a "true" path, add the k-1 decisions as INTERM and the last decision as a SPLIT
for (int i = 0; i < features.size(); ++i) {
if (i == features.size() - 1) {
decisionsToAddAsDatums.add(Pair.makePair(features.get(i), ClauseClassifierLabel.CLAUSE_SPLIT));
} else {
decisionsToAddAsDatums.add(Pair.makePair(features.get(i), ClauseClassifierLabel.CLAUSE_INTERM));
} else if (correct.isFalse()) {
// If this is a "false" path, then we know at least the last decision was bad.
decisionsToAddAsDatums.add(Pair.makePair(features.get(features.size() - 1), ClauseClassifierLabel.NOT_A_CLAUSE));
} else if (correct.isUnknown()) {
// If this is an "unknown" path, only add it if it was the result of vanilla splits
// (check if it is a sequence of simple splits)
// As shown, the flag is set once any feature counter passes featurizer.isSimpleSplit.
boolean isSimpleSplit = false;
for (Counter<String> feats : features) {
if (featurizer.isSimpleSplit(feats)) {
isSimpleSplit = true;
// (if so, add it as if it were a True example)
if (isSimpleSplit) {
for (int i = 0; i < features.size(); ++i) {
if (i == features.size() - 1) {
decisionsToAddAsDatums.add(Pair.makePair(features.get(i), ClauseClassifierLabel.CLAUSE_SPLIT));
} else {
decisionsToAddAsDatums.add(Pair.makePair(features.get(i), ClauseClassifierLabel.CLAUSE_INTERM));
// Add the datums
for (Pair<Counter<String>, ClauseClassifierLabel> decision : decisionsToAddAsDatums) {
// (create datum)
RVFDatum<ClauseClassifierLabel, String> datum = new RVFDatum<>(decision.first);
// (dump datum to debug log)
// Dump format: label, a tab, then "feature->value" pairs joined with ';'.
if (datasetDumpWriter.isPresent()) {
datasetDumpWriter.get().println(decision.second + "\t" + StringUtils.join(decision.first.entrySet().stream().map(entry -> entry.getKey() + "->" + entry.getValue()), ";"));
// NOTE(review): no statement adding `datum` to `dataset` is visible after the next comment — confirm.
// (add datum to dataset)
// Return value of the search callback (presumably "keep searching" — confirm).
return true;
// Remaining search arguments: a classifier built from an empty counter, an empty map, the featurizer, and 10000
// (presumably a search limit — confirm).
}, new LinearClassifier<>(new ClassicCounter<>()), Collections.emptyMap(), featurizer, 10000);
// Debug info
if (numExamplesProcessed.incrementAndGet() % 100 == 0) {
log("processed " + numExamplesProcessed + " training sentences: " + dataset.size() + " datums");
endTrack("Training inference");
// Close the file
// NOTE(review): the body of this if is not visible in the excerpt — presumably it closes the writer; confirm.
if (datasetDumpWriter.isPresent()) {
// Step 2: Train classifier
Classifier<ClauseClassifierLabel, String> fullClassifier = factory.trainClassifier(dataset);
if (modelPath.isPresent()) {
// The classifier and featurizer are saved together as one pair.
Pair<Classifier<ClauseClassifierLabel, String>, Featurizer> toSave = Pair.makePair(fullClassifier, featurizer);
try {
IOUtils.writeObjectToFile(toSave, modelPath.get());
log("SUCCESS: wrote model to " + modelPath.get().getPath());
} catch (IOException e) {
// As shown, a failed save is only logged; no rethrow is visible.
log("ERROR: failed to save model to path: " + modelPath.get().getPath());
// Step 3: Check accuracy of classifier
forceTrack("Training accuracy");
Util.dumpAccuracy(fullClassifier, dataset);
endTrack("Training accuracy");
// Cross-validation: for each fold, train on the first part of the split and report accuracy on the second.
int numFolds = 5;
forceTrack(numFolds + " fold cross-validation");
for (int fold = 0; fold < numFolds; ++fold) {
forceTrack("Fold " + (fold + 1));
Pair<GeneralDataset<ClauseClassifierLabel, String>, GeneralDataset<ClauseClassifierLabel, String>> foldData = dataset.splitOutFold(fold, numFolds);
Classifier<ClauseClassifierLabel, String> classifier = factory.trainClassifier(foldData.first);
Util.dumpAccuracy(classifier, foldData.second);
endTrack("Fold " + (fold + 1));
endTrack(numFolds + " fold cross-validation");
// Step 5: return factory
// The returned factory builds search problems backed by the classifier trained on the full dataset.
return (tree, truth) -> new ClauseSplitterSearchProblem(tree, truth, Optional.of(fullClassifier), Optional.of(featurizer));
Example use in project CoreNLP by stanfordnlp.
From the class CustomAnnotationSerializer, method read.
public Pair<Annotation, InputStream> read(InputStream is) throws IOException {
if (compress && !(is instanceof GZIPInputStream))
is = new GZIPInputStream(is);
BufferedReader reader = new BufferedReader(new InputStreamReader(is));
Annotation doc = new Annotation("");
String line;
// read the coref graph (new format)
Map<Integer, CorefChain> chains = loadCorefChains(reader);
if (chains != null)
doc.set(CorefCoreAnnotations.CorefChainAnnotation.class, chains);
// read the coref graph (old format)
line = reader.readLine().trim();
if (line.length() > 0) {
String[] bits = line.split(" ");
if (bits.length % 4 != 0) {
throw new RuntimeIOException("ERROR: Incorrect format for the serialized coref graph: " + line);
List<Pair<IntTuple, IntTuple>> corefGraph = new ArrayList<>();
for (int i = 0; i < bits.length; i += 4) {
IntTuple src = new IntTuple(2);
IntTuple dst = new IntTuple(2);
src.set(0, Integer.parseInt(bits[i]));
src.set(1, Integer.parseInt(bits[i + 1]));
dst.set(0, Integer.parseInt(bits[i + 2]));
dst.set(1, Integer.parseInt(bits[i + 3]));
corefGraph.add(new Pair<>(src, dst));
doc.set(CorefCoreAnnotations.CorefGraphAnnotation.class, corefGraph);
// read individual sentences
List<CoreMap> sentences = new ArrayList<>();
while ((line = reader.readLine()) != null) {
CoreMap sentence = new Annotation("");
// first line is the parse tree. construct it with CoreLabels in Tree nodes
Tree tree = new PennTreeReader(new StringReader(line), new LabeledScoredTreeFactory(CoreLabel.factory())).readTree();
sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
// read the dependency graphs
IntermediateSemanticGraph intermCollapsedDeps = loadDependencyGraph(reader);
IntermediateSemanticGraph intermUncollapsedDeps = loadDependencyGraph(reader);
IntermediateSemanticGraph intermCcDeps = loadDependencyGraph(reader);
// the remaining lines until empty line are tokens
List<CoreLabel> tokens = new ArrayList<>();
while ((line = reader.readLine()) != null) {
if (line.length() == 0)
CoreLabel token = loadToken(line, haveExplicitAntecedent);
sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
// convert the intermediate graph to an actual SemanticGraph
SemanticGraph collapsedDeps = intermCollapsedDeps.convertIntermediateGraph(tokens);
sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, collapsedDeps);
SemanticGraph uncollapsedDeps = intermUncollapsedDeps.convertIntermediateGraph(tokens);
sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, uncollapsedDeps);
SemanticGraph ccDeps = intermCcDeps.convertIntermediateGraph(tokens);
sentence.set(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, ccDeps);
doc.set(CoreAnnotations.SentencesAnnotation.class, sentences);
return Pair.makePair(doc, is);
Example use in project CoreNLP by stanfordnlp.
From the class MaxentTagger, method readModelAndInit.
* This reads the complete tagger from a single model provided as an InputStream,
* and initializes the tagger using a
* combination of the properties passed in and parameters from the file.
* <br>
* <i>Note for the future:</i> This assumes that the TaggerConfig in the file
* has already been read and used. This work is done inside the
* constructor of TaggerConfig. It might be better to refactor
* things so that is all done inside this method, but for the moment
* it seemed better to leave working code alone [cdm 2008].
* @param config The tagger config
* @param modelStream The model provided as an InputStream
* @param printLoading Whether to print a message saying what model file is being loaded and how long it took when finished.
* @throws RuntimeIOException if I/O errors or serialization errors
protected void readModelAndInit(Properties config, InputStream modelStream, boolean printLoading) {
try {
// first check can open file ... or else leave with exception
DataInputStream rf = new DataInputStream(modelStream);
readModelAndInit(config, rf, printLoading);
} catch (IOException e) {
throw new RuntimeIOException("Error while loading a tagger model (probably missing model file)", e);