Use of edu.stanford.nlp.util.concurrent.MulticoreWrapper in project CoreNLP by stanfordnlp.
The class ParseFiles, method parseFiles:
public void parseFiles(String[] args, int argIndex, boolean tokenized, TokenizerFactory<? extends HasWord> tokenizerFactory, String elementDelimiter, String sentenceDelimiter, Function<List<HasWord>, List<HasWord>> escaper, String tagDelimiter) {
  final DocType docType = (elementDelimiter == null) ? DocType.Plain : DocType.XML;
  if (op.testOptions.verbose) {
    if (tokenizerFactory != null)
      pwErr.println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
  }
  final Timing timer = new Timing();
  //Loop over the files
  for (int i = argIndex; i < args.length; i++) {
    final String filename = args[i];
    final DocumentPreprocessor documentPreprocessor;
    if (filename.equals("-")) {
      try {
        documentPreprocessor = new DocumentPreprocessor(IOUtils.readerFromStdin(op.tlpParams.getInputEncoding()), docType);
      } catch (IOException e) {
        throw new RuntimeIOException(e);
      }
    } else {
      documentPreprocessor = new DocumentPreprocessor(filename, docType, op.tlpParams.getInputEncoding());
    }
    //Unused values are null per the main() method invocation below
    //null is the default for these properties
    documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
    documentPreprocessor.setEscaper(escaper);
    documentPreprocessor.setSentenceDelimiter(sentenceDelimiter);
    documentPreprocessor.setTagDelimiter(tagDelimiter);
    documentPreprocessor.setElementDelimiter(elementDelimiter);
    if (tokenizerFactory == null)
      documentPreprocessor.setTokenizerFactory((tokenized) ? null : tlp.getTokenizerFactory());
    else
      documentPreprocessor.setTokenizerFactory(tokenizerFactory);
    //Setup the output
    PrintWriter pwo = pwOut;
    if (op.testOptions.writeOutputFiles) {
      String normalizedName = filename;
      try {
        // this will throw an exception if the name is not a URL
        new URL(normalizedName);
        normalizedName = normalizedName.replaceAll("/", "_");
      } catch (MalformedURLException e) {
        //It isn't a URL, so silently ignore
      }
      String ext = (op.testOptions.outputFilesExtension == null) ? "stp" : op.testOptions.outputFilesExtension;
      String fname = normalizedName + '.' + ext;
      if (op.testOptions.outputFilesDirectory != null && !op.testOptions.outputFilesDirectory.isEmpty()) {
        String fseparator = System.getProperty("file.separator");
        if (fseparator == null || fseparator.isEmpty()) {
          fseparator = "/";
        }
        File fnameFile = new File(fname);
        fname = op.testOptions.outputFilesDirectory + fseparator + fnameFile.getName();
      }
      try {
        pwo = op.tlpParams.pw(new FileOutputStream(fname));
      } catch (IOException ioe) {
        throw new RuntimeIOException(ioe);
      }
    }
    treePrint.printHeader(pwo, op.tlpParams.getOutputEncoding());
    pwErr.println("Parsing file: " + filename);
    int num = 0;
    int numProcessed = 0;
    if (op.testOptions.testingThreads != 1) {
      MulticoreWrapper<List<? extends HasWord>, ParserQuery> wrapper = new MulticoreWrapper<>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
      for (List<HasWord> sentence : documentPreprocessor) {
        num++;
        numSents++;
        int len = sentence.size();
        numWords += len;
        pwErr.println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.listToString(sentence, true));
        wrapper.put(sentence);
        while (wrapper.peek()) {
          ParserQuery pq = wrapper.poll();
          processResults(pq, numProcessed++, pwo);
        }
      }
      wrapper.join();
      while (wrapper.peek()) {
        ParserQuery pq = wrapper.poll();
        processResults(pq, numProcessed++, pwo);
      }
    } else {
      ParserQuery pq = pqFactory.parserQuery();
      for (List<HasWord> sentence : documentPreprocessor) {
        num++;
        numSents++;
        int len = sentence.size();
        numWords += len;
        pwErr.println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.listToString(sentence, true));
        pq.parseAndReport(sentence, pwErr);
        processResults(pq, numProcessed++, pwo);
      }
    }
    treePrint.printFooter(pwo);
    if (op.testOptions.writeOutputFiles)
      pwo.close();
    pwErr.println("Parsed file: " + filename + " [" + num + " sentences].");
  }
  long millis = timer.stop();
  if (summary) {
    if (pcfgLL != null)
      pcfgLL.display(false, pwErr);
    if (depLL != null)
      depLL.display(false, pwErr);
    if (factLL != null)
      factLL.display(false, pwErr);
  }
  if (saidMemMessage) {
    ParserUtils.printOutOfMemory(pwErr);
  }
  double wordspersec = numWords / (((double) millis) / 1000);
  double sentspersec = numSents / (((double) millis) / 1000);
  // easier way!
  NumberFormat nf = new DecimalFormat("0.00");
  pwErr.println("Parsed " + numWords + " words in " + numSents + " sentences (" + nf.format(wordspersec) + " wds/sec; " + nf.format(sentspersec) + " sents/sec).");
  if (numFallback > 0) {
    pwErr.println(" " + numFallback + " sentences were parsed by fallback to PCFG.");
  }
  if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0) {
    pwErr.println(" " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
    if (numUnparsable > 0) {
      pwErr.println(" " + numUnparsable + " were not parsable with non-zero probability.");
    }
    if (numNoMemory > 0) {
      pwErr.println(" " + numNoMemory + " were skipped because of insufficient memory.");
    }
    if (numSkipped > 0) {
      pwErr.println(" " + numSkipped + " were skipped as length 0 or greater than " + op.testOptions.maxLength);
    }
  }
}
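The threaded branch above follows the standard MulticoreWrapper idiom: feed inputs with put(), drain whatever is ready with peek()/poll() as you go, then join() and drain the remainder. Below is a minimal, self-contained sketch of that idiom; the word-counting processor and the example sentences are invented for illustration and are not part of CoreNLP.

import java.util.Arrays;
import java.util.List;
import edu.stanford.nlp.util.concurrent.MulticoreWrapper;
import edu.stanford.nlp.util.concurrent.ThreadsafeProcessor;

public class MulticoreWrapperSketch {
  public static void main(String[] args) {
    // A trivial, stateless processor: count the tokens in a whitespace-separated sentence.
    ThreadsafeProcessor<String, Integer> counter = new ThreadsafeProcessor<String, Integer>() {
      @Override
      public Integer process(String sentence) {
        return sentence.split("\\s+").length;
      }

      @Override
      public ThreadsafeProcessor<String, Integer> newInstance() {
        return this; // stateless, so one instance can serve every worker thread
      }
    };
    MulticoreWrapper<String, Integer> wrapper = new MulticoreWrapper<>(4, counter);
    List<String> sentences = Arrays.asList("This is a short sentence .", "Another one .");
    for (String sentence : sentences) {
      wrapper.put(sentence);
      // Drain any results that have already finished, so output is produced incrementally.
      while (wrapper.peek()) {
        System.out.println(wrapper.poll());
      }
    }
    // No more input: wait for the outstanding jobs, then drain the remainder.
    wrapper.join();
    while (wrapper.peek()) {
      System.out.println(wrapper.poll());
    }
  }
}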
Use of edu.stanford.nlp.util.concurrent.MulticoreWrapper in project CoreNLP by stanfordnlp.
The class HybridCorefSystem, method runCoref:
public static void runCoref(Properties props) throws Exception {
  /*
   * property, environment setting
   */
  Redwood.hideChannelsEverywhere("debug-cluster", "debug-mention", "debug-preprocessor", "debug-docreader", "debug-mergethres", "debug-featureselection", "debug-md");
  int nThreads = HybridCorefProperties.getThreadCounts(props);
  String timeStamp = Calendar.getInstance().getTime().toString().replaceAll("\\s", "-").replaceAll(":", "-");
  Logger logger = Logger.getLogger(HybridCorefSystem.class.getName());
  // set log file path
  if (props.containsKey(HybridCorefProperties.LOG_PROP)) {
    File logFile = new File(props.getProperty(HybridCorefProperties.LOG_PROP));
    RedwoodConfiguration.current().handlers(RedwoodConfiguration.Handlers.file(logFile)).apply();
    Redwood.log("Starting coref log");
  }
  log.info(props.toString());
  if (HybridCorefProperties.checkMemory(props))
    checkMemoryUsage();
  HybridCorefSystem cs = new HybridCorefSystem(props);
  /*
   * output setting
   */
  // prepare conll output
  String goldOutput = null;
  String beforeCorefOutput = null;
  String afterCorefOutput = null;
  PrintWriter writerGold = null;
  PrintWriter writerBeforeCoref = null;
  PrintWriter writerAfterCoref = null;
  if (HybridCorefProperties.doScore(props)) {
    String pathOutput = CorefProperties.conllOutputPath(props);
    (new File(pathOutput)).mkdir();
    goldOutput = pathOutput + "output-" + timeStamp + ".gold.txt";
    beforeCorefOutput = pathOutput + "output-" + timeStamp + ".predicted.txt";
    afterCorefOutput = pathOutput + "output-" + timeStamp + ".coref.predicted.txt";
    writerGold = new PrintWriter(new FileOutputStream(goldOutput));
    writerBeforeCoref = new PrintWriter(new FileOutputStream(beforeCorefOutput));
    writerAfterCoref = new PrintWriter(new FileOutputStream(afterCorefOutput));
  }
  // run coref
  MulticoreWrapper<Pair<Document, HybridCorefSystem>, StringBuilder[]> wrapper = new MulticoreWrapper<>(nThreads, new ThreadsafeProcessor<Pair<Document, HybridCorefSystem>, StringBuilder[]>() {

    @Override
    public StringBuilder[] process(Pair<Document, HybridCorefSystem> input) {
      try {
        Document document = input.first;
        HybridCorefSystem cs = input.second;
        // conll output and logs
        StringBuilder[] outputs = new StringBuilder[4];
        cs.coref(document, outputs);
        return outputs;
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    }

    @Override
    public ThreadsafeProcessor<Pair<Document, HybridCorefSystem>, StringBuilder[]> newInstance() {
      return this;
    }
  });
  Date startTime = null;
  if (HybridCorefProperties.checkTime(props)) {
    startTime = new Date();
    System.err.printf("END-TO-END COREF Start time: %s\n", startTime);
  }
  // run processes
  int docCnt = 0;
  while (true) {
    Document document = cs.docMaker.nextDoc();
    if (document == null)
      break;
    wrapper.put(Pair.makePair(document, cs));
    docCnt = logOutput(wrapper, writerGold, writerBeforeCoref, writerAfterCoref, docCnt);
  }
  // Finished reading the input. Wait for jobs to finish
  wrapper.join();
  docCnt = logOutput(wrapper, writerGold, writerBeforeCoref, writerAfterCoref, docCnt);
  IOUtils.closeIgnoringExceptions(writerGold);
  IOUtils.closeIgnoringExceptions(writerBeforeCoref);
  IOUtils.closeIgnoringExceptions(writerAfterCoref);
  if (HybridCorefProperties.checkTime(props)) {
    System.err.printf("END-TO-END COREF Elapsed time: %.3f seconds\n", (((new Date()).getTime() - startTime.getTime()) / 1000F));
    // System.err.printf("CORENLP PROCESS TIME TOTAL: %.3f seconds\n", cs.mentionExtractor.corenlpProcessTime);
  }
  if (HybridCorefProperties.checkMemory(props))
    checkMemoryUsage();
  // scoring
  if (HybridCorefProperties.doScore(props)) {
    String summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, beforeCorefOutput);
    CorefScorer.printScoreSummary(summary, logger, false);
    summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, afterCorefOutput);
    CorefScorer.printScoreSummary(summary, logger, true);
    CorefScorer.printFinalConllScore(summary);
  }
}
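Note that the anonymous processor above returns this from newInstance(), so a single instance is shared by every worker thread; that only works if the code it delegates to (cs.coref(...) here) is safe to call concurrently. When a processor holds per-thread mutable state, newInstance() should return a fresh copy instead. A hedged sketch of that variant follows; BufferingProcessor and its scratch buffer are made up for illustration and are not CoreNLP classes.

import edu.stanford.nlp.util.concurrent.ThreadsafeProcessor;

// Hypothetical processor with per-thread mutable state: each worker gets its own copy.
class BufferingProcessor implements ThreadsafeProcessor<String, String> {
  private final StringBuilder scratch = new StringBuilder(); // not safe to share across threads

  @Override
  public String process(String input) {
    scratch.setLength(0);
    scratch.append(input.trim().toLowerCase());
    return scratch.toString();
  }

  @Override
  public ThreadsafeProcessor<String, String> newInstance() {
    return new BufferingProcessor(); // fresh instance, so every worker owns its own buffer
  }
}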
Use of edu.stanford.nlp.util.concurrent.MulticoreWrapper in project CoreNLP by stanfordnlp.
The class CacheParseHypotheses, method convertToTrees:
public static IdentityHashMap<Tree, List<Tree>> convertToTrees(Collection<Tree> keys, IdentityHashMap<Tree, byte[]> compressed, int numThreads) {
  IdentityHashMap<Tree, List<Tree>> uncompressed = Generics.newIdentityHashMap();
  MulticoreWrapper<byte[], List<Tree>> wrapper = new MulticoreWrapper<>(numThreads, new DecompressionProcessor());
  for (Tree tree : keys) {
    wrapper.put(compressed.get(tree));
  }
  for (Tree tree : keys) {
    if (!wrapper.peek()) {
      wrapper.join();
    }
    uncompressed.put(tree, wrapper.poll());
  }
  return uncompressed;
}
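This method relies on the wrapper handing results back in the same order the inputs were submitted: both loops iterate over keys identically, so each poll() is paired with the tree whose compressed bytes were put() at the same position, and the join() inside the second loop simply blocks when a result has not arrived yet. If results could come back out of order, the tree-to-parse pairing built here would be wrong; the submission-order behavior is what this CoreNLP usage depends on.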
Use of edu.stanford.nlp.util.concurrent.MulticoreWrapper in project CoreNLP by stanfordnlp.
The class DVParserCostAndGradient, method calculate:
// fill value & derivative
public void calculate(double[] theta) {
  dvModel.vectorToParams(theta);
  double localValue = 0.0;
  double[] localDerivative = new double[theta.length];
  TwoDimensionalMap<String, String, SimpleMatrix> binaryW_dfsG, binaryW_dfsB;
  binaryW_dfsG = TwoDimensionalMap.treeMap();
  binaryW_dfsB = TwoDimensionalMap.treeMap();
  TwoDimensionalMap<String, String, SimpleMatrix> binaryScoreDerivativesG, binaryScoreDerivativesB;
  binaryScoreDerivativesG = TwoDimensionalMap.treeMap();
  binaryScoreDerivativesB = TwoDimensionalMap.treeMap();
  Map<String, SimpleMatrix> unaryW_dfsG, unaryW_dfsB;
  unaryW_dfsG = new TreeMap<>();
  unaryW_dfsB = new TreeMap<>();
  Map<String, SimpleMatrix> unaryScoreDerivativesG, unaryScoreDerivativesB;
  unaryScoreDerivativesG = new TreeMap<>();
  unaryScoreDerivativesB = new TreeMap<>();
  Map<String, SimpleMatrix> wordVectorDerivativesG = new TreeMap<>();
  Map<String, SimpleMatrix> wordVectorDerivativesB = new TreeMap<>();
  for (TwoDimensionalMap.Entry<String, String, SimpleMatrix> entry : dvModel.binaryTransform) {
    int numRows = entry.getValue().numRows();
    int numCols = entry.getValue().numCols();
    binaryW_dfsG.put(entry.getFirstKey(), entry.getSecondKey(), new SimpleMatrix(numRows, numCols));
    binaryW_dfsB.put(entry.getFirstKey(), entry.getSecondKey(), new SimpleMatrix(numRows, numCols));
    binaryScoreDerivativesG.put(entry.getFirstKey(), entry.getSecondKey(), new SimpleMatrix(1, numRows));
    binaryScoreDerivativesB.put(entry.getFirstKey(), entry.getSecondKey(), new SimpleMatrix(1, numRows));
  }
  for (Map.Entry<String, SimpleMatrix> entry : dvModel.unaryTransform.entrySet()) {
    int numRows = entry.getValue().numRows();
    int numCols = entry.getValue().numCols();
    unaryW_dfsG.put(entry.getKey(), new SimpleMatrix(numRows, numCols));
    unaryW_dfsB.put(entry.getKey(), new SimpleMatrix(numRows, numCols));
    unaryScoreDerivativesG.put(entry.getKey(), new SimpleMatrix(1, numRows));
    unaryScoreDerivativesB.put(entry.getKey(), new SimpleMatrix(1, numRows));
  }
  if (op.trainOptions.trainWordVectors) {
    for (Map.Entry<String, SimpleMatrix> entry : dvModel.wordVectors.entrySet()) {
      int numRows = entry.getValue().numRows();
      int numCols = entry.getValue().numCols();
      wordVectorDerivativesG.put(entry.getKey(), new SimpleMatrix(numRows, numCols));
      wordVectorDerivativesB.put(entry.getKey(), new SimpleMatrix(numRows, numCols));
    }
  }
  // Some optimization methods print out a line without an end, so our
  // debugging statements would otherwise be misaligned
  Timing scoreTiming = new Timing();
  scoreTiming.doing("Scoring trees");
  int treeNum = 0;
  MulticoreWrapper<Tree, Pair<DeepTree, DeepTree>> wrapper = new MulticoreWrapper<>(op.trainOptions.trainingThreads, new ScoringProcessor());
  for (Tree tree : trainingBatch) {
    wrapper.put(tree);
  }
  wrapper.join();
  scoreTiming.done();
  while (wrapper.peek()) {
    Pair<DeepTree, DeepTree> result = wrapper.poll();
    DeepTree goldTree = result.first;
    DeepTree bestTree = result.second;
    StringBuilder treeDebugLine = new StringBuilder();
    Formatter formatter = new Formatter(treeDebugLine);
    boolean isDone = (Math.abs(bestTree.getScore() - goldTree.getScore()) <= 0.00001 || goldTree.getScore() > bestTree.getScore());
    String done = isDone ? "done" : "";
    formatter.format("Tree %6d Highest tree: %12.4f Correct tree: %12.4f %s", treeNum, bestTree.getScore(), goldTree.getScore(), done);
    log.info(treeDebugLine.toString());
    if (!isDone) {
      // if the gold tree is better than the best hypothesis tree by
      // a large enough margin, then the score difference will be 0
      // and we ignore the tree
      double valueDelta = bestTree.getScore() - goldTree.getScore();
      //double valueDelta = Math.max(0.0, - scoreGold + bestScore);
      localValue += valueDelta;
      // get the context words for this tree - should be the same
      // for either goldTree or bestTree
      List<String> words = getContextWords(goldTree.getTree());
      // The derivatives affected by this tree are only based on the
      // nodes present in this tree, eg not all matrix derivatives
      // will be affected by this tree
      backpropDerivative(goldTree.getTree(), words, goldTree.getVectors(), binaryW_dfsG, unaryW_dfsG, binaryScoreDerivativesG, unaryScoreDerivativesG, wordVectorDerivativesG);
      backpropDerivative(bestTree.getTree(), words, bestTree.getVectors(), binaryW_dfsB, unaryW_dfsB, binaryScoreDerivativesB, unaryScoreDerivativesB, wordVectorDerivativesB);
    }
    ++treeNum;
  }
  double[] localDerivativeGood;
  double[] localDerivativeB;
  if (op.trainOptions.trainWordVectors) {
    localDerivativeGood = NeuralUtils.paramsToVector(theta.length, binaryW_dfsG.valueIterator(), unaryW_dfsG.values().iterator(), binaryScoreDerivativesG.valueIterator(), unaryScoreDerivativesG.values().iterator(), wordVectorDerivativesG.values().iterator());
    localDerivativeB = NeuralUtils.paramsToVector(theta.length, binaryW_dfsB.valueIterator(), unaryW_dfsB.values().iterator(), binaryScoreDerivativesB.valueIterator(), unaryScoreDerivativesB.values().iterator(), wordVectorDerivativesB.values().iterator());
  } else {
    localDerivativeGood = NeuralUtils.paramsToVector(theta.length, binaryW_dfsG.valueIterator(), unaryW_dfsG.values().iterator(), binaryScoreDerivativesG.valueIterator(), unaryScoreDerivativesG.values().iterator());
    localDerivativeB = NeuralUtils.paramsToVector(theta.length, binaryW_dfsB.valueIterator(), unaryW_dfsB.values().iterator(), binaryScoreDerivativesB.valueIterator(), unaryScoreDerivativesB.values().iterator());
  }
  // correct - highest
  for (int i = 0; i < localDerivativeGood.length; i++) {
    localDerivative[i] = localDerivativeB[i] - localDerivativeGood[i];
  }
  // TODO: this is where we would combine multiple costs if we had parallelized the calculation
  value = localValue;
  derivative = localDerivative;
  // normalizing by training batch size
  value = (1.0 / trainingBatch.size()) * value;
  ArrayMath.multiplyInPlace(derivative, (1.0 / trainingBatch.size()));
  // add regularization to cost:
  double[] currentParams = dvModel.paramsToVector();
  double regCost = 0;
  for (double currentParam : currentParams) {
    regCost += currentParam * currentParam;
  }
  regCost = op.trainOptions.regCost * 0.5 * regCost;
  value += regCost;
  // add regularization to gradient
  ArrayMath.multiplyInPlace(currentParams, op.trainOptions.regCost);
  ArrayMath.pairwiseAddInPlace(derivative, currentParams);
}
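Read as formulas, the quantities assembled above are (this is a summary of the code, not separate documentation):

  value = (1 / |B|) * sum over trees t in B of max(0, score(best_t) - score(gold_t)) + (regCost / 2) * ||theta||^2
  derivative = (1 / |B|) * sum over the same trees of (d score(best_t)/d theta - d score(gold_t)/d theta) + regCost * theta

where B is trainingBatch, a tree contributes only when its highest-scoring hypothesis actually beats the gold tree (the isDone check above), and the per-tree derivative terms are accumulated by the two backpropDerivative calls.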
Use of edu.stanford.nlp.util.concurrent.MulticoreWrapper in project CoreNLP by stanfordnlp.
The class TestClassifier, method test:
/**
 * Test on a file containing correct tags already, when initializing from trees.
 * TODO: Add the ability to have a second transformer to transform output back; possibly combine this method
 * with the method below.
 */
private void test() throws IOException {
  numSentences = 0;
  confusionMatrix = new ConfusionMatrix<>();
  PrintFile pf = null;
  PrintFile pf1 = null;
  PrintFile pf3 = null;
  if (writeWords)
    pf = new PrintFile(saveRoot + ".words");
  if (writeUnknDict)
    pf1 = new PrintFile(saveRoot + ".un.dict");
  if (writeTopWords)
    pf3 = new PrintFile(saveRoot + ".words.top");
  boolean verboseResults = config.getVerboseResults();
  if (config.getNThreads() != 1) {
    MulticoreWrapper<List<TaggedWord>, TestSentence> wrapper = new MulticoreWrapper<>(config.getNThreads(), new TestSentenceProcessor(maxentTagger));
    for (List<TaggedWord> taggedSentence : fileRecord.reader()) {
      wrapper.put(taggedSentence);
      while (wrapper.peek()) {
        processResults(wrapper.poll(), pf, pf1, pf3, verboseResults);
      }
    }
    wrapper.join();
    while (wrapper.peek()) {
      processResults(wrapper.poll(), pf, pf1, pf3, verboseResults);
    }
  } else {
    for (List<TaggedWord> taggedSentence : fileRecord.reader()) {
      TestSentence testS = new TestSentence(maxentTagger);
      testS.setCorrectTags(taggedSentence);
      testS.tagSentence(taggedSentence, false);
      processResults(testS, pf, pf1, pf3, verboseResults);
    }
  }
  if (pf != null)
    pf.close();
  if (pf1 != null)
    pf1.close();
  if (pf3 != null)
    pf3.close();
}