Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class MaxentTagger, method runTaggerStdin.
public void runTaggerStdin(BufferedReader reader, BufferedWriter writer, OutputStyle outputStyle) throws IOException {
  final TokenizerFactory<? extends HasWord> tokenizerFactory = chooseTokenizerFactory();
  // Counts
  long totalMillis = 0;
  int numWords = 0;
  int numSentences = 0;
  boolean outputVerbosity = config.getOutputVerbosity();
  boolean outputLemmas = config.getOutputLemmas();
  Morphology morpha = (outputLemmas) ? new Morphology() : null;
  if (outputStyle == OutputStyle.XML || outputStyle == OutputStyle.INLINE_XML) {
    writer.write("<?xml version=\"1.0\" encoding=\"" + config.getEncoding() + "\"?>\n");
    writer.write("<pos>\n");
  }
  String sentenceDelimiter = config.getSentenceDelimiter();
  if (sentenceDelimiter != null && sentenceDelimiter.equals("newline")) {
    sentenceDelimiter = "\n";
  }
  while (true) {
    // Now we do everything through the doc preprocessor
    final DocumentPreprocessor docProcessor;
    String line = reader.readLine();
    // this happens when we reach end of file
    if (line == null)
      break;
    docProcessor = new DocumentPreprocessor(new StringReader(line));
    docProcessor.setTokenizerFactory(tokenizerFactory);
    docProcessor.setSentenceDelimiter(sentenceDelimiter);
    if (config.keepEmptySentences()) {
      docProcessor.setKeepEmptySentences(true);
    }
    for (List<HasWord> sentence : docProcessor) {
      numWords += sentence.size();
      Timing t = new Timing();
      tagAndOutputSentence(sentence, outputLemmas, morpha, outputStyle, outputVerbosity, numSentences, "", writer);
      totalMillis += t.stop();
      writer.newLine();
      writer.flush();
      numSentences++;
    }
  }
  if (outputStyle == OutputStyle.XML || outputStyle == OutputStyle.INLINE_XML) {
    writer.write("</pos>\n");
  }
  writer.flush();
  printErrWordsPerSec(totalMillis, numWords);
}
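A minimal sketch of how runTaggerStdin might be wired to standard input and output. The class name StdinTaggerSketch is hypothetical; the model path is the one used by ShiftReduceDemo below, and OutputStyle is assumed to be the PlainTextDocumentReaderAndWriter.OutputStyle enum that the XML checks above refer to.

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;

import edu.stanford.nlp.sequences.PlainTextDocumentReaderAndWriter.OutputStyle;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

public class StdinTaggerSketch {
  public static void main(String[] args) throws Exception {
    // Illustrative model path; any compatible tagger model would do.
    MaxentTagger tagger = new MaxentTagger("edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger");
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(System.out));
    // XML output triggers the <pos> wrapper seen in the method above.
    tagger.runTaggerStdin(in, out, OutputStyle.XML);
  }
}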
Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class ShiftReduceDemo, method main.
public static void main(String[] args) {
  String modelPath = "edu/stanford/nlp/models/srparser/englishSR.ser.gz";
  String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger";
  for (int argIndex = 0; argIndex < args.length; ) {
    switch (args[argIndex]) {
      case "-tagger":
        taggerPath = args[argIndex + 1];
        argIndex += 2;
        break;
      case "-model":
        modelPath = args[argIndex + 1];
        argIndex += 2;
        break;
      default:
        throw new RuntimeException("Unknown argument " + args[argIndex]);
    }
  }
  String text = "My dog likes to shake his stuffed chickadee toy.";
  MaxentTagger tagger = new MaxentTagger(taggerPath);
  ShiftReduceParser model = ShiftReduceParser.loadModel(modelPath);
  DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
  for (List<HasWord> sentence : tokenizer) {
    List<TaggedWord> tagged = tagger.tagSentence(sentence);
    Tree tree = model.apply(tagged);
    log.info(tree);
  }
}
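Since the demo relies on DocumentPreprocessor's default sentence splitting, input that already has one sentence per line can instead be handled with the setSentenceDelimiter call seen in runTaggerStdin above. A sketch under that assumption; the class and helper names are hypothetical.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import edu.stanford.nlp.trees.Tree;

class PerLineParsingSketch {
  // Treat each input line as exactly one sentence instead of relying
  // on the default sentence splitter.
  static void parsePerLine(MaxentTagger tagger, ShiftReduceParser model, String lines) {
    DocumentPreprocessor perLine = new DocumentPreprocessor(new StringReader(lines));
    perLine.setSentenceDelimiter("\n");
    for (List<HasWord> sentence : perLine) {
      List<TaggedWord> tagged = tagger.tagSentence(sentence);
      Tree tree = model.apply(tagged);
      System.out.println(tree);
    }
  }
}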
Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class ParserDemo2, method main.
/**
 * This example shows a few more ways of providing input to a parser.
 *
 * Usage: ParserDemo2 [grammar [textFile]]
 */
public static void main(String[] args) throws IOException {
  String grammar = args.length > 0 ? args[0] : "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
  String[] options = { "-maxLength", "80", "-retainTmpSubcategories" };
  LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
  TreebankLanguagePack tlp = lp.getOp().langpack();
  GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
  Iterable<List<? extends HasWord>> sentences;
  if (args.length > 1) {
    DocumentPreprocessor dp = new DocumentPreprocessor(args[1]);
    List<List<? extends HasWord>> tmp = new ArrayList<>();
    for (List<HasWord> sentence : dp) {
      tmp.add(sentence);
    }
    sentences = tmp;
  } else {
    // Showing tokenization and parsing in code a couple of different ways.
    String[] sent = { "This", "is", "an", "easy", "sentence", "." };
    List<HasWord> sentence = new ArrayList<>();
    for (String word : sent) {
      sentence.add(new Word(word));
    }
    String sent2 = ("This is a slightly longer and more complex " + "sentence requiring tokenization.");
    // Use the default tokenizer for this TreebankLanguagePack
    Tokenizer<? extends HasWord> toke = tlp.getTokenizerFactory().getTokenizer(new StringReader(sent2));
    List<? extends HasWord> sentence2 = toke.tokenize();
    String[] sent3 = { "It", "can", "can", "it", "." };
    // Parser gets second "can" wrong without help
    String[] tag3 = { "PRP", "MD", "VB", "PRP", "." };
    List<TaggedWord> sentence3 = new ArrayList<>();
    for (int i = 0; i < sent3.length; i++) {
      sentence3.add(new TaggedWord(sent3[i], tag3[i]));
    }
    Tree parse = lp.parse(sentence3);
    parse.pennPrint();
    List<List<? extends HasWord>> tmp = new ArrayList<>();
    tmp.add(sentence);
    tmp.add(sentence2);
    tmp.add(sentence3);
    sentences = tmp;
  }
  for (List<? extends HasWord> sentence : sentences) {
    Tree parse = lp.parse(sentence);
    parse.pennPrint();
    System.out.println();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);
    System.out.println();
    System.out.println("The words of the sentence:");
    for (Label lab : parse.yield()) {
      if (lab instanceof CoreLabel) {
        System.out.println(((CoreLabel) lab).toString(CoreLabel.OutputFormat.VALUE_MAP));
      } else {
        System.out.println(lab);
      }
    }
    System.out.println();
    System.out.println(parse.taggedYield());
    System.out.println();
  }
  // This method turns the String into a single sentence using the
  // default tokenizer for the TreebankLanguagePack.
  String sent3 = "This is one last test!";
  lp.parse(sent3).pennPrint();
}
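The CoreLabel branch in the yield loop above only fires when the parser was fed CoreLabel tokens. A sketch of one way to arrange that, reusing the newCoreLabelTokenizerFactory call from the AnnotatedTextReader example below; the empty options string and the class name are illustrative assumptions.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory;
import edu.stanford.nlp.trees.Tree;

class CoreLabelYieldSketch {
  // Tokenize into CoreLabels so that parse.yield() returns CoreLabels
  // and the VALUE_MAP branch above is taken.
  static void parseWithCoreLabels(LexicalizedParser lp, String text) {
    DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
    dp.setTokenizerFactory(PTBTokenizerFactory.newCoreLabelTokenizerFactory(""));
    for (List<HasWord> sentence : dp) {
      Tree parse = lp.parse(sentence);
      parse.pennPrint();
    }
  }
}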
Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class ParserDemo, method demoDP.
/**
 * demoDP demonstrates turning a file into tokens and then parse
 * trees. Note that the trees are printed by calling pennPrint on
 * the Tree object. It is also possible to pass a PrintWriter to
 * pennPrint if you want to capture the output.
 * This code will work with any supported language.
 */
public static void demoDP(LexicalizedParser lp, String filename) {
  // This option shows loading, sentence-segmenting and tokenizing
  // a file using DocumentPreprocessor.
  // A PennTreebankLanguagePack for English
  TreebankLanguagePack tlp = lp.treebankLanguagePack();
  GrammaticalStructureFactory gsf = null;
  if (tlp.supportsGrammaticalStructures()) {
    gsf = tlp.grammaticalStructureFactory();
  }
  // You could also create a tokenizer yourself and pass it
  // to DocumentPreprocessor
  for (List<HasWord> sentence : new DocumentPreprocessor(filename)) {
    Tree parse = lp.apply(sentence);
    parse.pennPrint();
    System.out.println();
    if (gsf != null) {
      GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
      Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
      System.out.println(tdl);
      System.out.println();
    }
  }
}
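The javadoc above notes that pennPrint can take a PrintWriter to capture output rather than printing it. A short sketch of that idea; the class and helper names are hypothetical.

import java.io.PrintWriter;
import java.io.StringWriter;

import edu.stanford.nlp.trees.Tree;

class TreeCaptureSketch {
  // Render a parse tree to a String via a PrintWriter instead of stdout.
  static String treeToPennString(Tree parse) {
    StringWriter sw = new StringWriter();
    PrintWriter pw = new PrintWriter(sw);
    parse.pennPrint(pw);
    pw.flush();
    return sw.toString();
  }
}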
Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class AnnotatedTextReader, method parseFile.
public static List<CoreMap> parseFile(BufferedReader reader, Set<String> categoriesAllowed, Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels, boolean setGoldClass, String sentIDprefix) throws IOException {
  Pattern startingLabelToken = Pattern.compile("<(" + StringUtils.join(categoriesAllowed, "|") + ")>");
  Pattern endLabelToken = Pattern.compile("</(" + StringUtils.join(categoriesAllowed, "|") + ")>");
  String backgroundSymbol = "O";
  List<CoreMap> sentences = new ArrayList<>();
  int lineNum = -1;
  String l = null;
  while ((l = reader.readLine()) != null) {
    lineNum++;
    String[] t = l.split("\t", 2);
    String id = null;
    String text = null;
    if (t.length == 2) {
      id = t[0];
      text = t[1];
    } else if (t.length == 1) {
      text = t[0];
      id = String.valueOf(lineNum);
    }
    id = sentIDprefix + id;
    DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
    PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizerFactory.newCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
    dp.setTokenizerFactory(tokenizerFactory);
    String label = backgroundSymbol;
    int sentNum = -1;
    for (List<HasWord> sentence : dp) {
      sentNum++;
      String sentStr = "";
      List<CoreLabel> sent = new ArrayList<>();
      for (HasWord tokw : sentence) {
        String tok = tokw.word();
        Matcher startingMatcher = startingLabelToken.matcher(tok);
        Matcher endMatcher = endLabelToken.matcher(tok);
        if (startingMatcher.matches()) {
          // System.out.println("matched starting");
          label = startingMatcher.group(1);
        } else if (endMatcher.matches()) {
          // System.out.println("matched end");
          label = backgroundSymbol;
        } else {
          CoreLabel c = new CoreLabel();
          List<String> toks = new ArrayList<>();
          toks.add(tok);
          for (String toksplit : toks) {
            sentStr += " " + toksplit;
            c.setWord(toksplit);
            c.setLemma(toksplit);
            c.setValue(toksplit);
            c.set(CoreAnnotations.TextAnnotation.class, toksplit);
            c.set(CoreAnnotations.OriginalTextAnnotation.class, tok);
            if (setGoldClass) {
              c.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
            }
            if (setClassForTheseLabels != null && setClassForTheseLabels.containsKey(label)) {
              c.set(setClassForTheseLabels.get(label), label);
            }
            sent.add(c);
          }
        }
      }
      CoreMap sentcm = new ArrayCoreMap();
      sentcm.set(CoreAnnotations.TextAnnotation.class, sentStr.trim());
      sentcm.set(CoreAnnotations.TokensAnnotation.class, sent);
      sentcm.set(CoreAnnotations.DocIDAnnotation.class, id + "-" + sentNum);
      sentences.add(sentcm);
    }
  }
  return sentences;
}
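A sketch of how parseFile might be invoked on a small in-memory document. The label name DRUG, the input line, and the sentence-ID prefix are illustrative assumptions; passing null for setClassForTheseLabels is safe because the method null-checks it before use.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.util.CoreMap;

class ParseFileSketch {
  public static void main(String[] args) throws IOException {
    // One tab-separated "id<TAB>text" line with an inline <DRUG> tag.
    String input = "sent1\tTake <DRUG>aspirin</DRUG> for the pain.";
    Set<String> categories = new HashSet<>();
    categories.add("DRUG");
    // Assumes AnnotatedTextReader is imported from its CoreNLP package.
    List<CoreMap> sentences = AnnotatedTextReader.parseFile(
        new BufferedReader(new StringReader(input)), categories, null, true, "doc1-");
    for (CoreMap sentence : sentences) {
      System.out.println(sentence.get(CoreAnnotations.DocIDAnnotation.class)
          + ": " + sentence.get(CoreAnnotations.TextAnnotation.class));
    }
  }
}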