Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class ParseFiles, method parseFiles.
public void parseFiles(String[] args, int argIndex, boolean tokenized, TokenizerFactory<? extends HasWord> tokenizerFactory, String elementDelimiter, String sentenceDelimiter, Function<List<HasWord>, List<HasWord>> escaper, String tagDelimiter) {
final DocType docType = (elementDelimiter == null) ? DocType.Plain : DocType.XML;
if (op.testOptions.verbose) {
if (tokenizerFactory != null)
pwErr.println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
}
final Timing timer = new Timing();
// Loop over the files
for (int i = argIndex; i < args.length; i++) {
final String filename = args[i];
final DocumentPreprocessor documentPreprocessor;
if (filename.equals("-")) {
try {
documentPreprocessor = new DocumentPreprocessor(IOUtils.readerFromStdin(op.tlpParams.getInputEncoding()), docType);
} catch (IOException e) {
throw new RuntimeIOException(e);
}
} else {
documentPreprocessor = new DocumentPreprocessor(filename, docType, op.tlpParams.getInputEncoding());
}
// Unused values are null per the main() method invocation below
// null is the default for these properties
documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
documentPreprocessor.setEscaper(escaper);
documentPreprocessor.setSentenceDelimiter(sentenceDelimiter);
documentPreprocessor.setTagDelimiter(tagDelimiter);
documentPreprocessor.setElementDelimiter(elementDelimiter);
if (tokenizerFactory == null)
documentPreprocessor.setTokenizerFactory((tokenized) ? null : tlp.getTokenizerFactory());
else
documentPreprocessor.setTokenizerFactory(tokenizerFactory);
// Setup the output
PrintWriter pwo = pwOut;
if (op.testOptions.writeOutputFiles) {
String normalizedName = filename;
try {
// this will throw MalformedURLException if the name is not a URL
new URL(normalizedName);
normalizedName = normalizedName.replaceAll("/", "_");
} catch (MalformedURLException e) {
// It isn't a URL, so silently ignore
}
String ext = (op.testOptions.outputFilesExtension == null) ? "stp" : op.testOptions.outputFilesExtension;
String fname = normalizedName + '.' + ext;
if (op.testOptions.outputFilesDirectory != null && !op.testOptions.outputFilesDirectory.isEmpty()) {
String fseparator = System.getProperty("file.separator");
if (fseparator == null || fseparator.isEmpty()) {
fseparator = "/";
}
File fnameFile = new File(fname);
fname = op.testOptions.outputFilesDirectory + fseparator + fnameFile.getName();
}
try {
pwo = op.tlpParams.pw(new FileOutputStream(fname));
} catch (IOException ioe) {
throw new RuntimeIOException(ioe);
}
}
treePrint.printHeader(pwo, op.tlpParams.getOutputEncoding());
pwErr.println("Parsing file: " + filename);
int num = 0;
int numProcessed = 0;
if (op.testOptions.testingThreads != 1) {
MulticoreWrapper<List<? extends HasWord>, ParserQuery> wrapper = new MulticoreWrapper<>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
for (List<HasWord> sentence : documentPreprocessor) {
num++;
numSents++;
int len = sentence.size();
numWords += len;
pwErr.println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.listToString(sentence, true));
wrapper.put(sentence);
while (wrapper.peek()) {
ParserQuery pq = wrapper.poll();
processResults(pq, numProcessed++, pwo);
}
}
wrapper.join();
while (wrapper.peek()) {
ParserQuery pq = wrapper.poll();
processResults(pq, numProcessed++, pwo);
}
} else {
ParserQuery pq = pqFactory.parserQuery();
for (List<HasWord> sentence : documentPreprocessor) {
num++;
numSents++;
int len = sentence.size();
numWords += len;
pwErr.println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.listToString(sentence, true));
pq.parseAndReport(sentence, pwErr);
processResults(pq, numProcessed++, pwo);
}
}
treePrint.printFooter(pwo);
if (op.testOptions.writeOutputFiles)
pwo.close();
pwErr.println("Parsed file: " + filename + " [" + num + " sentences].");
}
long millis = timer.stop();
if (summary) {
if (pcfgLL != null)
pcfgLL.display(false, pwErr);
if (depLL != null)
depLL.display(false, pwErr);
if (factLL != null)
factLL.display(false, pwErr);
}
if (saidMemMessage) {
ParserUtils.printOutOfMemory(pwErr);
}
double wordspersec = numWords / (((double) millis) / 1000);
double sentspersec = numSents / (((double) millis) / 1000);
// format the throughput figures to two decimal places
NumberFormat nf = new DecimalFormat("0.00");
pwErr.println("Parsed " + numWords + " words in " + numSents + " sentences (" + nf.format(wordspersec) + " wds/sec; " + nf.format(sentspersec) + " sents/sec).");
if (numFallback > 0) {
pwErr.println(" " + numFallback + " sentences were parsed by fallback to PCFG.");
}
if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0) {
pwErr.println(" " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
if (numUnparsable > 0) {
pwErr.println(" " + numUnparsable + " were not parsable with non-zero probability.");
}
if (numNoMemory > 0) {
pwErr.println(" " + numNoMemory + " were skipped because of insufficient memory.");
}
if (numSkipped > 0) {
pwErr.println(" " + numSkipped + " were skipped as length 0 or greater than " + op.testOptions.maxLength);
}
}
}
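For comparison, here is a minimal standalone sketch of the DocumentPreprocessor setup that parseFiles performs above. The file name and encoding are placeholders, and with no explicit tokenizer factory the preprocessor falls back to its default PTB-style tokenizer; for XML input you would pass DocType.XML and set an element delimiter instead.

import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.process.DocumentPreprocessor;

public class DocumentPreprocessorSketch {
  public static void main(String[] args) {
    // Plain-text mode; parseFiles switches to DocType.XML when an element delimiter is given.
    DocumentPreprocessor dp =
        new DocumentPreprocessor("input.txt", DocumentPreprocessor.DocType.Plain, "UTF-8");
    for (List<HasWord> sentence : dp) {
      // Each iteration yields one tokenized sentence as a List<HasWord>.
      System.out.println(SentenceUtils.listToString(sentence, true));
    }
  }
}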
Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class ParseAndPrintMatrices, method main.
public static void main(String[] args) throws IOException {
String modelPath = null;
String outputPath = null;
String inputPath = null;
String testTreebankPath = null;
FileFilter testTreebankFilter = null;
List<String> unusedArgs = Generics.newArrayList();
for (int argIndex = 0; argIndex < args.length; ) {
if (args[argIndex].equalsIgnoreCase("-model")) {
modelPath = args[argIndex + 1];
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-output")) {
outputPath = args[argIndex + 1];
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-input")) {
inputPath = args[argIndex + 1];
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-testTreebank")) {
Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testTreebank");
argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
testTreebankPath = treebankDescription.first();
testTreebankFilter = treebankDescription.second();
} else {
unusedArgs.add(args[argIndex++]);
}
}
String[] newArgs = unusedArgs.toArray(new String[unusedArgs.size()]);
LexicalizedParser parser = LexicalizedParser.loadModel(modelPath, newArgs);
DVModel model = DVParser.getModelFromLexicalizedParser(parser);
File outputFile = new File(outputPath);
FileSystem.checkNotExistsOrFail(outputFile);
FileSystem.mkdirOrFail(outputFile);
int count = 0;
if (inputPath != null) {
Reader input = new BufferedReader(new FileReader(inputPath));
DocumentPreprocessor processor = new DocumentPreprocessor(input);
for (List<HasWord> sentence : processor) {
// index from 1
count++;
ParserQuery pq = parser.parserQuery();
if (!(pq instanceof RerankingParserQuery)) {
throw new IllegalArgumentException("Expected a RerankingParserQuery");
}
RerankingParserQuery rpq = (RerankingParserQuery) pq;
if (!rpq.parse(sentence)) {
throw new RuntimeException("Unparsable sentence: " + sentence);
}
RerankerQuery reranker = rpq.rerankerQuery();
if (!(reranker instanceof DVModelReranker.Query)) {
throw new IllegalArgumentException("Expected a DVModelReranker");
}
DeepTree deepTree = ((DVModelReranker.Query) reranker).getDeepTrees().get(0);
IdentityHashMap<Tree, SimpleMatrix> vectors = deepTree.getVectors();
for (Map.Entry<Tree, SimpleMatrix> entry : vectors.entrySet()) {
log.info(entry.getKey() + " " + entry.getValue());
}
FileWriter fout = new FileWriter(outputPath + File.separator + "sentence" + count + ".txt");
BufferedWriter bout = new BufferedWriter(fout);
bout.write(SentenceUtils.listToString(sentence));
bout.newLine();
bout.write(deepTree.getTree().toString());
bout.newLine();
for (HasWord word : sentence) {
outputMatrix(bout, model.getWordVector(word.word()));
}
Tree rootTree = findRootTree(vectors);
outputTreeMatrices(bout, rootTree, vectors);
bout.flush();
fout.close();
}
}
}
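The DocumentPreprocessor part of main above reduces to a small pattern: wrap a Reader, iterate sentences, and hand each one to the parser. A minimal sketch of that pattern, with a made-up input string and the englishPCFG model standing in for the DVModel-specific machinery:

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.trees.Tree;

public class ReaderParseSketch {
  public static void main(String[] args) {
    LexicalizedParser parser =
        LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    DocumentPreprocessor processor =
        new DocumentPreprocessor(new StringReader("This is the first sentence. This is the second."));
    int count = 0;
    for (List<HasWord> sentence : processor) {
      // Sentences are counted from 1, matching the sentenceN.txt file names written above.
      count++;
      Tree tree = parser.apply(sentence);
      System.out.println("sentence" + count + ": " + tree);
    }
  }
}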
Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class DependencyParser, method parseTextFile.
private void parseTextFile(BufferedReader input, PrintWriter output) {
DocumentPreprocessor preprocessor = new DocumentPreprocessor(input);
preprocessor.setSentenceFinalPuncWords(config.tlp.sentenceFinalPunctuationWords());
preprocessor.setEscaper(config.escaper);
preprocessor.setSentenceDelimiter(config.sentenceDelimiter);
if (config.preTokenized) {
preprocessor.setTokenizerFactory(edu.stanford.nlp.process.WhitespaceTokenizer.factory());
} else {
preprocessor.setTokenizerFactory(config.tlp.getTokenizerFactory());
}
Timing timer = new Timing();
MaxentTagger tagger = new MaxentTagger(config.tagger);
List<List<TaggedWord>> tagged = new ArrayList<>();
for (List<HasWord> sentence : preprocessor) {
tagged.add(tagger.tagSentence(sentence));
}
log.info(String.format("Tagging completed in %.2f sec.%n", timer.stop() / 1000.0));
timer.start();
int numSentences = 0;
for (List<TaggedWord> taggedSentence : tagged) {
GrammaticalStructure parse = predict(taggedSentence);
Collection<TypedDependency> deps = parse.typedDependencies();
for (TypedDependency dep : deps) output.println(dep);
output.println();
numSentences++;
}
long millis = timer.stop();
double seconds = millis / 1000.0;
log.info(String.format("Parsed %d sentences in %.2f seconds (%.2f sents/sec).%n", numSentences, seconds, numSentences / seconds));
}
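When the input is already tokenized (the config.preTokenized branch above), the preprocessor only needs to split tokens on whitespace. Below is a minimal sketch of that configuration with made-up sample text; the default sentence-final punctuation (. ? !) still ends each sentence.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.WhitespaceTokenizer;

public class PreTokenizedSketch {
  public static void main(String[] args) {
    DocumentPreprocessor preprocessor =
        new DocumentPreprocessor(new StringReader("This is already tokenized . So is this line ."));
    // Whitespace tokenization keeps the existing tokens intact instead of re-tokenizing them.
    preprocessor.setTokenizerFactory(WhitespaceTokenizer.factory());
    for (List<HasWord> sentence : preprocessor) {
      System.out.println(sentence);
    }
  }
}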
Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class DependencyParserDemo, method main.
public static void main(String[] args) {
String modelPath = DependencyParser.DEFAULT_MODEL;
String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger";
for (int argIndex = 0; argIndex < args.length; ) {
switch(args[argIndex]) {
case "-tagger":
taggerPath = args[argIndex + 1];
argIndex += 2;
break;
case "-model":
modelPath = args[argIndex + 1];
argIndex += 2;
break;
default:
throw new RuntimeException("Unknown argument " + args[argIndex]);
}
}
String text = "I can almost always tell when movies use fake dinosaurs.";
MaxentTagger tagger = new MaxentTagger(taggerPath);
DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);
DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
for (List<HasWord> sentence : tokenizer) {
List<TaggedWord> tagged = tagger.tagSentence(sentence);
GrammaticalStructure gs = parser.predict(tagged);
// Print typed dependencies
log.info(gs);
}
}
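log.info(gs) above prints the whole GrammaticalStructure at once; if individual dependencies are wanted instead, the same pipeline can iterate typedDependencies(), as parseTextFile does earlier. A sketch of that variation, reusing the model paths and sample sentence from the demo:

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.parser.nndep.DependencyParser;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.TypedDependency;

public class DependencyListSketch {
  public static void main(String[] args) {
    MaxentTagger tagger =
        new MaxentTagger("edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger");
    DependencyParser parser = DependencyParser.loadFromModelFile(DependencyParser.DEFAULT_MODEL);
    DocumentPreprocessor tokenizer =
        new DocumentPreprocessor(new StringReader("I can almost always tell when movies use fake dinosaurs."));
    for (List<HasWord> sentence : tokenizer) {
      List<TaggedWord> tagged = tagger.tagSentence(sentence);
      GrammaticalStructure gs = parser.predict(tagged);
      // Print each typed dependency on its own line instead of the whole structure.
      for (TypedDependency dep : gs.typedDependencies()) {
        System.out.println(dep);
      }
    }
  }
}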
Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class BuildBinarizedDataset, method main.
/**
* Turns a text file into trees for use in an RNTN classifier, in the
* same format as the treebank used in the Sentiment project.
* <br>
* The expected input file is one sentence per line, with sentences
* separated by blank lines. The first line has the main label of the sentence together with the full sentence.
* Lines after the first sentence line but before
* the blank line will be treated as labeled sub-phrases. Each
* labeled sub-phrase line should start with its label and then list the
* tokens the label applies to. All phrases that do not have their own label will take on the main sentence label!
* For example:
* <br>
* <code>
* 1 Today is not a good day.<br>
* 3 good<br>
* 3 good day <br>
* 3 a good day <br>
* <br>
* (next block starts here) <br>
* </code>
* <br>
* If you have an example sentence you want to label, you will need
* to manually label the subtrees from there. For example, to build
* a 5 class dataset which matches the existing datasets, you would
* label the very negative phrases with 0, neutral phrases with 2,
* very positive phrases with 4. The binary label dataset uses 0
* for negative, 1 for positive, and -1 for unlabeled (which can
* mean neutral, although the binary model will not predict
* neutral).
* <br>
* In order to determine which sub-phrases would need labeling, you
* can run the sentences through the same parser used to turn the
* sentences into trees. For example, in the case of using the
* englishPCFG model, you can look at the main class of
* edu.stanford.nlp.parser.lexparser.LexicalizedParser . You will
* definitely want to provide a label for the entire sentence. Any
* subphrases which have a significantly different sentiment should
* be labeled, such as the previous example of "not a good day" vs
* "a good day".
* <br>
* Although it would be excessive to do so, a list of ALL of the
* subphrases contained in a parsed tree can be produced by first
* running the parser, then using the tool
* edu.stanford.nlp.trees.OutputSubtrees
* <br>
* By default the englishPCFG parser is used. This can be changed
* with the {@code -parserModel} flag. Specify an input file
* with {@code -input}.
* <br>
* If a sentiment model is provided with -sentimentModel, that model
* will be used to prelabel the sentences. Any spans with given
* labels will then be used to adjust those labels.
*/
public static void main(String[] args) {
CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
String inputPath = null;
String sentimentModelPath = null;
SentimentModel sentimentModel = null;
for (int argIndex = 0; argIndex < args.length; ) {
if (args[argIndex].equalsIgnoreCase("-input")) {
inputPath = args[argIndex + 1];
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-parserModel")) {
parserModel = args[argIndex + 1];
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-sentimentModel")) {
sentimentModelPath = args[argIndex + 1];
argIndex += 2;
} else {
log.info("Unknown argument " + args[argIndex]);
System.exit(2);
}
}
if (inputPath == null) {
throw new IllegalArgumentException("Must specify input file with -input");
}
LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
if (sentimentModelPath != null) {
sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);
}
String text = IOUtils.slurpFileNoExceptions(inputPath);
// need blank line to make a new chunk
String[] chunks = text.split("\\n\\s*\\n+");
for (String chunk : chunks) {
if (chunk.trim().isEmpty()) {
continue;
}
// The expected format is that line 0 will be the text of the
// sentence, and each subsequent line, if any, will be a value
// followed by the sequence of tokens that get that value.
// Here we take the first line and tokenize it as one sentence.
String[] lines = chunk.trim().split("\\n");
String sentence = lines[0];
StringReader sin = new StringReader(sentence);
DocumentPreprocessor document = new DocumentPreprocessor(sin);
document.setSentenceFinalPuncWords(new String[] { "\n" });
List<HasWord> tokens = document.iterator().next();
Integer mainLabel = Integer.valueOf(tokens.get(0).word());
// System.out.print("Main Sentence Label: " + mainLabel.toString() + "; ");
tokens = tokens.subList(1, tokens.size());
// log.info(tokens);
Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
for (int i = 1; i < lines.length; ++i) {
extractLabels(spanToLabels, tokens, lines[i]);
}
// TODO: add an option which treats the spans as constraints when parsing
Tree tree = parser.apply(tokens);
Tree binarized = binarizer.transformTree(tree);
Tree collapsedUnary = transformer.transformTree(binarized);
// label here and then use the user given labels to adjust
if (sentimentModel != null) {
Trees.convertToCoreLabels(collapsedUnary);
SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
scorer.forwardPropagateTree(collapsedUnary);
setPredictedLabels(collapsedUnary);
} else {
setUnknownLabels(collapsedUnary, mainLabel);
}
Trees.convertToCoreLabels(collapsedUnary);
collapsedUnary.indexSpans();
for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) {
setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue());
}
System.out.println(collapsedUnary);
// System.out.println();
}
}
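The one-sentence-per-line trick above hinges on setSentenceFinalPuncWords: making newline the only sentence-final token keeps each input line together as a single sentence whose first token is the numeric label. A minimal sketch of just that step, using the sample line from the javadoc example:

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.DocumentPreprocessor;

public class OneSentencePerLineSketch {
  public static void main(String[] args) {
    String line = "1 Today is not a good day.";
    DocumentPreprocessor document = new DocumentPreprocessor(new StringReader(line));
    // With newline as the only sentence-final word, the period no longer splits the line.
    document.setSentenceFinalPuncWords(new String[] { "\n" });
    List<HasWord> tokens = document.iterator().next();
    // The first token is the label; the remaining tokens are the sentence itself.
    Integer mainLabel = Integer.valueOf(tokens.get(0).word());
    List<HasWord> words = tokens.subList(1, tokens.size());
    System.out.println("label=" + mainLabel + " words=" + words);
  }
}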