Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class ParseFiles, method parseFiles.
public void parseFiles(String[] args, int argIndex, boolean tokenized, TokenizerFactory<? extends HasWord> tokenizerFactory, String elementDelimiter, String sentenceDelimiter, Function<List<HasWord>, List<HasWord>> escaper, String tagDelimiter) {
final DocType docType = (elementDelimiter == null) ? DocType.Plain : DocType.XML;
if (op.testOptions.verbose) {
if (tokenizerFactory != null)
pwErr.println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
}
final Timing timer = new Timing();
// Loop over the files
for (int i = argIndex; i < args.length; i++) {
final String filename = args[i];
final DocumentPreprocessor documentPreprocessor;
if (filename.equals("-")) {
try {
documentPreprocessor = new DocumentPreprocessor(IOUtils.readerFromStdin(op.tlpParams.getInputEncoding()), docType);
} catch (IOException e) {
throw new RuntimeIOException(e);
}
} else {
documentPreprocessor = new DocumentPreprocessor(filename, docType, op.tlpParams.getInputEncoding());
}
// Unused values are null per the main() method invocation below
// null is the default for these properties
documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
documentPreprocessor.setEscaper(escaper);
documentPreprocessor.setSentenceDelimiter(sentenceDelimiter);
documentPreprocessor.setTagDelimiter(tagDelimiter);
documentPreprocessor.setElementDelimiter(elementDelimiter);
if (tokenizerFactory == null)
documentPreprocessor.setTokenizerFactory((tokenized) ? null : tlp.getTokenizerFactory());
else
documentPreprocessor.setTokenizerFactory(tokenizerFactory);
// Setup the output
PrintWriter pwo = pwOut;
if (op.testOptions.writeOutputFiles) {
String normalizedName = filename;
try {
// this will throw MalformedURLException if the name is not a URL
new URL(normalizedName);
normalizedName = normalizedName.replaceAll("/", "_");
} catch (MalformedURLException e) {
// It isn't a URL, so silently ignore
}
String ext = (op.testOptions.outputFilesExtension == null) ? "stp" : op.testOptions.outputFilesExtension;
String fname = normalizedName + '.' + ext;
if (op.testOptions.outputFilesDirectory != null && !op.testOptions.outputFilesDirectory.isEmpty()) {
String fseparator = System.getProperty("file.separator");
if (fseparator == null || fseparator.isEmpty()) {
fseparator = "/";
}
File fnameFile = new File(fname);
fname = op.testOptions.outputFilesDirectory + fseparator + fnameFile.getName();
}
try {
pwo = op.tlpParams.pw(new FileOutputStream(fname));
} catch (IOException ioe) {
throw new RuntimeIOException(ioe);
}
}
treePrint.printHeader(pwo, op.tlpParams.getOutputEncoding());
pwErr.println("Parsing file: " + filename);
int num = 0;
int numProcessed = 0;
if (op.testOptions.testingThreads != 1) {
MulticoreWrapper<List<? extends HasWord>, ParserQuery> wrapper = new MulticoreWrapper<>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
for (List<HasWord> sentence : documentPreprocessor) {
num++;
numSents++;
int len = sentence.size();
numWords += len;
pwErr.println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.listToString(sentence, true));
wrapper.put(sentence);
while (wrapper.peek()) {
ParserQuery pq = wrapper.poll();
processResults(pq, numProcessed++, pwo);
}
}
wrapper.join();
while (wrapper.peek()) {
ParserQuery pq = wrapper.poll();
processResults(pq, numProcessed++, pwo);
}
} else {
ParserQuery pq = pqFactory.parserQuery();
for (List<HasWord> sentence : documentPreprocessor) {
num++;
numSents++;
int len = sentence.size();
numWords += len;
pwErr.println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.listToString(sentence, true));
pq.parseAndReport(sentence, pwErr);
processResults(pq, numProcessed++, pwo);
}
}
treePrint.printFooter(pwo);
if (op.testOptions.writeOutputFiles)
pwo.close();
pwErr.println("Parsed file: " + filename + " [" + num + " sentences].");
}
long millis = timer.stop();
if (summary) {
if (pcfgLL != null)
pcfgLL.display(false, pwErr);
if (depLL != null)
depLL.display(false, pwErr);
if (factLL != null)
factLL.display(false, pwErr);
}
if (saidMemMessage) {
ParserUtils.printOutOfMemory(pwErr);
}
double wordspersec = numWords / (((double) millis) / 1000);
double sentspersec = numSents / (((double) millis) / 1000);
// format the throughput figures to two decimal places
NumberFormat nf = new DecimalFormat("0.00");
pwErr.println("Parsed " + numWords + " words in " + numSents + " sentences (" + nf.format(wordspersec) + " wds/sec; " + nf.format(sentspersec) + " sents/sec).");
if (numFallback > 0) {
pwErr.println(" " + numFallback + " sentences were parsed by fallback to PCFG.");
}
if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0) {
pwErr.println(" " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
if (numUnparsable > 0) {
pwErr.println(" " + numUnparsable + " were not parsable with non-zero probability.");
}
if (numNoMemory > 0) {
pwErr.println(" " + numNoMemory + " were skipped because of insufficient memory.");
}
if (numSkipped > 0) {
pwErr.println(" " + numSkipped + " were skipped as length 0 or greater than " + op.testOptions.maxLength);
}
}
}
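For comparison, here is a minimal standalone sketch of the DocumentPreprocessor setup that parseFiles performs above. The file name and encoding are placeholders, and with no explicit tokenizer factory the preprocessor falls back to its default PTB-style tokenizer; for XML input you would pass DocType.XML and set an element delimiter instead.

import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.process.DocumentPreprocessor;

public class DocumentPreprocessorSketch {
  public static void main(String[] args) {
    // Plain-text mode; parseFiles switches to DocType.XML when an element delimiter is given.
    DocumentPreprocessor dp =
        new DocumentPreprocessor("input.txt", DocumentPreprocessor.DocType.Plain, "UTF-8");
    for (List<HasWord> sentence : dp) {
      // Each iteration yields one tokenized sentence as a List<HasWord>.
      System.out.println(SentenceUtils.listToString(sentence, true));
    }
  }
}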
Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class ParseAndPrintMatrices, method main.
public static void main(String[] args) throws IOException {
String modelPath = null;
String outputPath = null;
String inputPath = null;
String testTreebankPath = null;
FileFilter testTreebankFilter = null;
List<String> unusedArgs = Generics.newArrayList();
for (int argIndex = 0; argIndex < args.length; ) {
if (args[argIndex].equalsIgnoreCase("-model")) {
modelPath = args[argIndex + 1];
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-output")) {
outputPath = args[argIndex + 1];
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-input")) {
inputPath = args[argIndex + 1];
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-testTreebank")) {
Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testTreebank");
argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
testTreebankPath = treebankDescription.first();
testTreebankFilter = treebankDescription.second();
} else {
unusedArgs.add(args[argIndex++]);
}
}
String[] newArgs = unusedArgs.toArray(new String[unusedArgs.size()]);
LexicalizedParser parser = LexicalizedParser.loadModel(modelPath, newArgs);
DVModel model = DVParser.getModelFromLexicalizedParser(parser);
File outputFile = new File(outputPath);
FileSystem.checkNotExistsOrFail(outputFile);
FileSystem.mkdirOrFail(outputFile);
int count = 0;
if (inputPath != null) {
Reader input = new BufferedReader(new FileReader(inputPath));
DocumentPreprocessor processor = new DocumentPreprocessor(input);
for (List<HasWord> sentence : processor) {
// index from 1
count++;
ParserQuery pq = parser.parserQuery();
if (!(pq instanceof RerankingParserQuery)) {
throw new IllegalArgumentException("Expected a RerankingParserQuery");
}
RerankingParserQuery rpq = (RerankingParserQuery) pq;
if (!rpq.parse(sentence)) {
throw new RuntimeException("Unparsable sentence: " + sentence);
}
RerankerQuery reranker = rpq.rerankerQuery();
if (!(reranker instanceof DVModelReranker.Query)) {
throw new IllegalArgumentException("Expected a DVModelReranker");
}
DeepTree deepTree = ((DVModelReranker.Query) reranker).getDeepTrees().get(0);
IdentityHashMap<Tree, SimpleMatrix> vectors = deepTree.getVectors();
for (Map.Entry<Tree, SimpleMatrix> entry : vectors.entrySet()) {
log.info(entry.getKey() + " " + entry.getValue());
}
FileWriter fout = new FileWriter(outputPath + File.separator + "sentence" + count + ".txt");
BufferedWriter bout = new BufferedWriter(fout);
bout.write(SentenceUtils.listToString(sentence));
bout.newLine();
bout.write(deepTree.getTree().toString());
bout.newLine();
for (HasWord word : sentence) {
outputMatrix(bout, model.getWordVector(word.word()));
}
Tree rootTree = findRootTree(vectors);
outputTreeMatrices(bout, rootTree, vectors);
bout.flush();
fout.close();
}
}
}
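The DocumentPreprocessor part of main above reduces to a small pattern: wrap a Reader, iterate sentences, and hand each one to the parser. A minimal sketch of that pattern, with a made-up input string and the englishPCFG model standing in for the DVModel-specific machinery:

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.trees.Tree;

public class ReaderParseSketch {
  public static void main(String[] args) {
    LexicalizedParser parser =
        LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    DocumentPreprocessor processor =
        new DocumentPreprocessor(new StringReader("This is the first sentence. This is the second."));
    int count = 0;
    for (List<HasWord> sentence : processor) {
      // Sentences are counted from 1, matching the sentenceN.txt file names written above.
      count++;
      Tree tree = parser.apply(sentence);
      System.out.println("sentence" + count + ": " + tree);
    }
  }
}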
Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class DependencyParser, method parseTextFile.
private void parseTextFile(BufferedReader input, PrintWriter output) {
DocumentPreprocessor preprocessor = new DocumentPreprocessor(input);
preprocessor.setSentenceFinalPuncWords(config.tlp.sentenceFinalPunctuationWords());
preprocessor.setEscaper(config.escaper);
preprocessor.setSentenceDelimiter(config.sentenceDelimiter);
if (config.preTokenized) {
preprocessor.setTokenizerFactory(edu.stanford.nlp.process.WhitespaceTokenizer.factory());
} else {
preprocessor.setTokenizerFactory(config.tlp.getTokenizerFactory());
}
Timing timer = new Timing();
MaxentTagger tagger = new MaxentTagger(config.tagger);
List<List<TaggedWord>> tagged = new ArrayList<>();
for (List<HasWord> sentence : preprocessor) {
tagged.add(tagger.tagSentence(sentence));
}
log.info(String.format("Tagging completed in %.2f sec.%n", timer.stop() / 1000.0));
timer.start();
int numSentences = 0;
for (List<TaggedWord> taggedSentence : tagged) {
GrammaticalStructure parse = predict(taggedSentence);
Collection<TypedDependency> deps = parse.typedDependencies();
for (TypedDependency dep : deps) output.println(dep);
output.println();
numSentences++;
}
long millis = timer.stop();
double seconds = millis / 1000.0;
log.info(String.format("Parsed %d sentences in %.2f seconds (%.2f sents/sec).%n", numSentences, seconds, numSentences / seconds));
}
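When the input is already tokenized (the config.preTokenized branch above), the preprocessor only needs to split tokens on whitespace. Below is a minimal sketch of that configuration with made-up sample text; the default sentence-final punctuation (. ? !) still ends each sentence.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.WhitespaceTokenizer;

public class PreTokenizedSketch {
  public static void main(String[] args) {
    DocumentPreprocessor preprocessor =
        new DocumentPreprocessor(new StringReader("This is already tokenized . So is this line ."));
    // Whitespace tokenization keeps the existing tokens intact instead of re-tokenizing them.
    preprocessor.setTokenizerFactory(WhitespaceTokenizer.factory());
    for (List<HasWord> sentence : preprocessor) {
      System.out.println(sentence);
    }
  }
}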
Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class DependencyParserDemo, method main.
public static void main(String[] args) {
String modelPath = DependencyParser.DEFAULT_MODEL;
String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger";
for (int argIndex = 0; argIndex < args.length; ) {
switch(args[argIndex]) {
case "-tagger":
taggerPath = args[argIndex + 1];
argIndex += 2;
break;
case "-model":
modelPath = args[argIndex + 1];
argIndex += 2;
break;
default:
throw new RuntimeException("Unknown argument " + args[argIndex]);
}
}
String text = "I can almost always tell when movies use fake dinosaurs.";
MaxentTagger tagger = new MaxentTagger(taggerPath);
DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);
DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
for (List<HasWord> sentence : tokenizer) {
List<TaggedWord> tagged = tagger.tagSentence(sentence);
GrammaticalStructure gs = parser.predict(tagged);
// Print typed dependencies
log.info(gs);
}
}
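log.info(gs) above prints the whole GrammaticalStructure at once; if individual dependencies are wanted instead, the same pipeline can iterate typedDependencies(), as parseTextFile does earlier. A sketch of that variation, reusing the model paths and sample sentence from the demo:

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.parser.nndep.DependencyParser;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.TypedDependency;

public class DependencyListSketch {
  public static void main(String[] args) {
    MaxentTagger tagger =
        new MaxentTagger("edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger");
    DependencyParser parser = DependencyParser.loadFromModelFile(DependencyParser.DEFAULT_MODEL);
    DocumentPreprocessor tokenizer =
        new DocumentPreprocessor(new StringReader("I can almost always tell when movies use fake dinosaurs."));
    for (List<HasWord> sentence : tokenizer) {
      List<TaggedWord> tagged = tagger.tagSentence(sentence);
      GrammaticalStructure gs = parser.predict(tagged);
      // Print each typed dependency on its own line instead of the whole structure.
      for (TypedDependency dep : gs.typedDependencies()) {
        System.out.println(dep);
      }
    }
  }
}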
Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class BuildBinarizedDataset, method main.
/**
* Turns a text file into trees for use in an RNTN classifier, in the
* same format as the treebank used in the Sentiment project.
* <br>
* The expected input file is one sentence per line, with sentences
* separated by blank lines. The first line has the main label of the sentence together with the full sentence.
* Lines after the first sentence line but before
* the blank line will be treated as labeled sub-phrases. Each
* labeled sub-phrase line should start with its label and then list the
* tokens the label applies to. All phrases that do not have their own label will take on the main sentence label!
* For example:
* <br>
* <code>
* 1 Today is not a good day.<br>
* 3 good<br>
* 3 good day <br>
* 3 a good day <br>
* <br>
* (next block starts here) <br>
* </code>
* <br>
* If you have an example sentence you want to label, you will need
* to manually label the subtrees from there. For example, to build
* a 5 class dataset which matches the existing datasets, you would
* label the very negative phrases with 0, neutral phrases with 2,
* very positive phrases with 4. The binary label dataset uses 0
* for negative, 1 for positive, and -1 for unlabeled (which can
* mean neutral, although the binary model will not predict
* neutral).
* <br>
* In order to determine which sub-phrases would need labeling, you
* can run the sentences through the same parser used to turn the
* sentences into trees. For example, in the case of using the
* englishPCFG model, you can look at the main class of
* edu.stanford.nlp.parser.lexparser.LexicalizedParser . You will
* definitely want to provide a label for the entire sentence. Any
* subphrases which have a significantly different sentiment should
* be labeled, such as the previous example of "not a good day" vs
* "a good day".
* <br>
* Although it would be excessive to do so, a list of ALL of the
* subphrases contained in a parsed tree can be produced by first
* running the parser, then using the tool
* edu.stanford.nlp.trees.OutputSubtrees
* <br>
* By default the englishPCFG parser is used. This can be changed
* with the {@code -parserModel} flag. Specify an input file
* with {@code -input}.
* <br>
* If a sentiment model is provided with -sentimentModel, that model
* will be used to prelabel the sentences. Any spans with given
* labels will then be used to adjust those labels.
*/
public static void main(String[] args) {
CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
String inputPath = null;
String sentimentModelPath = null;
SentimentModel sentimentModel = null;
for (int argIndex = 0; argIndex < args.length; ) {
if (args[argIndex].equalsIgnoreCase("-input")) {
inputPath = args[argIndex + 1];
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-parserModel")) {
parserModel = args[argIndex + 1];
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-sentimentModel")) {
sentimentModelPath = args[argIndex + 1];
argIndex += 2;
} else {
log.info("Unknown argument " + args[argIndex]);
System.exit(2);
}
}
if (inputPath == null) {
throw new IllegalArgumentException("Must specify input file with -input");
}
LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
if (sentimentModelPath != null) {
sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);
}
String text = IOUtils.slurpFileNoExceptions(inputPath);
// need blank line to make a new chunk
String[] chunks = text.split("\\n\\s*\\n+");
for (String chunk : chunks) {
if (chunk.trim().isEmpty()) {
continue;
}
// The expected format is that line 0 will be the text of the
// sentence, and each subsequent line, if any, will be a value
// followed by the sequence of tokens that get that value.
// Here we take the first line and tokenize it as one sentence.
String[] lines = chunk.trim().split("\\n");
String sentence = lines[0];
StringReader sin = new StringReader(sentence);
DocumentPreprocessor document = new DocumentPreprocessor(sin);
document.setSentenceFinalPuncWords(new String[] { "\n" });
List<HasWord> tokens = document.iterator().next();
Integer mainLabel = Integer.valueOf(tokens.get(0).word());
// System.out.print("Main Sentence Label: " + mainLabel.toString() + "; ");
tokens = tokens.subList(1, tokens.size());
// log.info(tokens);
Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
for (int i = 1; i < lines.length; ++i) {
extractLabels(spanToLabels, tokens, lines[i]);
}
// TODO: add an option which treats the spans as constraints when parsing
Tree tree = parser.apply(tokens);
Tree binarized = binarizer.transformTree(tree);
Tree collapsedUnary = transformer.transformTree(binarized);
// label here and then use the user given labels to adjust
if (sentimentModel != null) {
Trees.convertToCoreLabels(collapsedUnary);
SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
scorer.forwardPropagateTree(collapsedUnary);
setPredictedLabels(collapsedUnary);
} else {
setUnknownLabels(collapsedUnary, mainLabel);
}
Trees.convertToCoreLabels(collapsedUnary);
collapsedUnary.indexSpans();
for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) {
setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue());
}
System.out.println(collapsedUnary);
// System.out.println();
}
}
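The one-sentence-per-line trick above hinges on setSentenceFinalPuncWords: making newline the only sentence-final token keeps each input line together as a single sentence whose first token is the numeric label. A minimal sketch of just that step, using the sample line from the javadoc example:

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.DocumentPreprocessor;

public class OneSentencePerLineSketch {
  public static void main(String[] args) {
    String line = "1 Today is not a good day.";
    DocumentPreprocessor document = new DocumentPreprocessor(new StringReader(line));
    // With newline as the only sentence-final word, the period no longer splits the line.
    document.setSentenceFinalPuncWords(new String[] { "\n" });
    List<HasWord> tokens = document.iterator().next();
    // The first token is the label; the remaining tokens are the sentence itself.
    Integer mainLabel = Integer.valueOf(tokens.get(0).word());
    List<HasWord> words = tokens.subList(1, tokens.size());
    System.out.println("label=" + mainLabel + " words=" + words);
  }
}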