Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class MaxentTagger, method runTaggerStdin.
public void runTaggerStdin(BufferedReader reader, BufferedWriter writer, OutputStyle outputStyle) throws IOException {
  final TokenizerFactory<? extends HasWord> tokenizerFactory = chooseTokenizerFactory();
  // Counts
  long totalMillis = 0;
  int numWords = 0;
  int numSentences = 0;
  boolean outputVerbosity = config.getOutputVerbosity();
  boolean outputLemmas = config.getOutputLemmas();
  Morphology morpha = (outputLemmas) ? new Morphology() : null;
  if (outputStyle == OutputStyle.XML || outputStyle == OutputStyle.INLINE_XML) {
    writer.write("<?xml version=\"1.0\" encoding=\"" + config.getEncoding() + "\"?>\n");
    writer.write("<pos>\n");
  }
  String sentenceDelimiter = config.getSentenceDelimiter();
  if (sentenceDelimiter != null && sentenceDelimiter.equals("newline")) {
    sentenceDelimiter = "\n";
  }
  while (true) {
    // Now we do everything through the doc preprocessor
    final DocumentPreprocessor docProcessor;
    String line = reader.readLine();
    // this happens when we reach end of file
    if (line == null)
      break;
    docProcessor = new DocumentPreprocessor(new StringReader(line));
    docProcessor.setTokenizerFactory(tokenizerFactory);
    docProcessor.setSentenceDelimiter(sentenceDelimiter);
    if (config.keepEmptySentences()) {
      docProcessor.setKeepEmptySentences(true);
    }
    for (List<HasWord> sentence : docProcessor) {
      numWords += sentence.size();
      Timing t = new Timing();
      tagAndOutputSentence(sentence, outputLemmas, morpha, outputStyle, outputVerbosity, numSentences, "", writer);
      totalMillis += t.stop();
      writer.newLine();
      writer.flush();
      numSentences++;
    }
  }
  if (outputStyle == OutputStyle.XML || outputStyle == OutputStyle.INLINE_XML) {
    writer.write("</pos>\n");
  }
  writer.flush();
  printErrWordsPerSec(totalMillis, numWords);
}
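A minimal sketch of how runTaggerStdin might be wired to standard input and output. The class name StdinTaggerSketch is hypothetical; the model path is the one used by ShiftReduceDemo below, and OutputStyle is assumed to be the PlainTextDocumentReaderAndWriter.OutputStyle enum that the XML checks above refer to.

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;

import edu.stanford.nlp.sequences.PlainTextDocumentReaderAndWriter.OutputStyle;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

public class StdinTaggerSketch {
  public static void main(String[] args) throws Exception {
    // Illustrative model path; any compatible tagger model would do.
    MaxentTagger tagger = new MaxentTagger("edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger");
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(System.out));
    // XML output triggers the <pos> wrapper seen in the method above.
    tagger.runTaggerStdin(in, out, OutputStyle.XML);
  }
}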
Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class ShiftReduceDemo, method main.
public static void main(String[] args) {
  String modelPath = "edu/stanford/nlp/models/srparser/englishSR.ser.gz";
  String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger";
  for (int argIndex = 0; argIndex < args.length; ) {
    switch (args[argIndex]) {
      case "-tagger":
        taggerPath = args[argIndex + 1];
        argIndex += 2;
        break;
      case "-model":
        modelPath = args[argIndex + 1];
        argIndex += 2;
        break;
      default:
        throw new RuntimeException("Unknown argument " + args[argIndex]);
    }
  }
  String text = "My dog likes to shake his stuffed chickadee toy.";
  MaxentTagger tagger = new MaxentTagger(taggerPath);
  ShiftReduceParser model = ShiftReduceParser.loadModel(modelPath);
  DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
  for (List<HasWord> sentence : tokenizer) {
    List<TaggedWord> tagged = tagger.tagSentence(sentence);
    Tree tree = model.apply(tagged);
    log.info(tree);
  }
}
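Since the demo relies on DocumentPreprocessor's default sentence splitting, input that already has one sentence per line can instead be handled with the setSentenceDelimiter call seen in runTaggerStdin above. A sketch under that assumption; the class and helper names are hypothetical.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import edu.stanford.nlp.trees.Tree;

class PerLineParsingSketch {
  // Treat each input line as exactly one sentence instead of relying
  // on the default sentence splitter.
  static void parsePerLine(MaxentTagger tagger, ShiftReduceParser model, String lines) {
    DocumentPreprocessor perLine = new DocumentPreprocessor(new StringReader(lines));
    perLine.setSentenceDelimiter("\n");
    for (List<HasWord> sentence : perLine) {
      List<TaggedWord> tagged = tagger.tagSentence(sentence);
      Tree tree = model.apply(tagged);
      System.out.println(tree);
    }
  }
}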
Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class ParserDemo2, method main.
/**
 * This example shows a few more ways of providing input to a parser.
 *
 * Usage: ParserDemo2 [grammar [textFile]]
 */
public static void main(String[] args) throws IOException {
  String grammar = args.length > 0 ? args[0] : "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
  String[] options = { "-maxLength", "80", "-retainTmpSubcategories" };
  LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
  TreebankLanguagePack tlp = lp.getOp().langpack();
  GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
  Iterable<List<? extends HasWord>> sentences;
  if (args.length > 1) {
    DocumentPreprocessor dp = new DocumentPreprocessor(args[1]);
    List<List<? extends HasWord>> tmp = new ArrayList<>();
    for (List<HasWord> sentence : dp) {
      tmp.add(sentence);
    }
    sentences = tmp;
  } else {
    // Showing tokenization and parsing in code a couple of different ways.
    String[] sent = { "This", "is", "an", "easy", "sentence", "." };
    List<HasWord> sentence = new ArrayList<>();
    for (String word : sent) {
      sentence.add(new Word(word));
    }
    String sent2 = ("This is a slightly longer and more complex " + "sentence requiring tokenization.");
    // Use the default tokenizer for this TreebankLanguagePack
    Tokenizer<? extends HasWord> toke = tlp.getTokenizerFactory().getTokenizer(new StringReader(sent2));
    List<? extends HasWord> sentence2 = toke.tokenize();
    String[] sent3 = { "It", "can", "can", "it", "." };
    // Parser gets second "can" wrong without help
    String[] tag3 = { "PRP", "MD", "VB", "PRP", "." };
    List<TaggedWord> sentence3 = new ArrayList<>();
    for (int i = 0; i < sent3.length; i++) {
      sentence3.add(new TaggedWord(sent3[i], tag3[i]));
    }
    Tree parse = lp.parse(sentence3);
    parse.pennPrint();
    List<List<? extends HasWord>> tmp = new ArrayList<>();
    tmp.add(sentence);
    tmp.add(sentence2);
    tmp.add(sentence3);
    sentences = tmp;
  }
  for (List<? extends HasWord> sentence : sentences) {
    Tree parse = lp.parse(sentence);
    parse.pennPrint();
    System.out.println();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);
    System.out.println();
    System.out.println("The words of the sentence:");
    for (Label lab : parse.yield()) {
      if (lab instanceof CoreLabel) {
        System.out.println(((CoreLabel) lab).toString(CoreLabel.OutputFormat.VALUE_MAP));
      } else {
        System.out.println(lab);
      }
    }
    System.out.println();
    System.out.println(parse.taggedYield());
    System.out.println();
  }
  // This method turns the String into a single sentence using the
  // default tokenizer for the TreebankLanguagePack.
  String sent3 = "This is one last test!";
  lp.parse(sent3).pennPrint();
}
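The CoreLabel branch in the yield loop above only fires when the parser was fed CoreLabel tokens. A sketch of one way to arrange that, reusing the newCoreLabelTokenizerFactory call from the AnnotatedTextReader example below; the empty options string and the class name are illustrative assumptions.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory;
import edu.stanford.nlp.trees.Tree;

class CoreLabelYieldSketch {
  // Tokenize into CoreLabels so that parse.yield() returns CoreLabels
  // and the VALUE_MAP branch above is taken.
  static void parseWithCoreLabels(LexicalizedParser lp, String text) {
    DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
    dp.setTokenizerFactory(PTBTokenizerFactory.newCoreLabelTokenizerFactory(""));
    for (List<HasWord> sentence : dp) {
      Tree parse = lp.parse(sentence);
      parse.pennPrint();
    }
  }
}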
Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class ParserDemo, method demoDP.
/**
 * demoDP demonstrates turning a file into tokens and then parse
 * trees. Note that the trees are printed by calling pennPrint on
 * the Tree object. It is also possible to pass a PrintWriter to
 * pennPrint if you want to capture the output.
 * This code will work with any supported language.
 */
public static void demoDP(LexicalizedParser lp, String filename) {
  // This option shows loading, sentence-segmenting and tokenizing
  // a file using DocumentPreprocessor.
  // A PennTreebankLanguagePack for English
  TreebankLanguagePack tlp = lp.treebankLanguagePack();
  GrammaticalStructureFactory gsf = null;
  if (tlp.supportsGrammaticalStructures()) {
    gsf = tlp.grammaticalStructureFactory();
  }
  // You could also create a tokenizer yourself and pass it
  // to DocumentPreprocessor
  for (List<HasWord> sentence : new DocumentPreprocessor(filename)) {
    Tree parse = lp.apply(sentence);
    parse.pennPrint();
    System.out.println();
    if (gsf != null) {
      GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
      Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
      System.out.println(tdl);
      System.out.println();
    }
  }
}
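The javadoc above notes that pennPrint can take a PrintWriter to capture output rather than printing it. A short sketch of that idea; the class and helper names are hypothetical.

import java.io.PrintWriter;
import java.io.StringWriter;

import edu.stanford.nlp.trees.Tree;

class TreeCaptureSketch {
  // Render a parse tree to a String via a PrintWriter instead of stdout.
  static String treeToPennString(Tree parse) {
    StringWriter sw = new StringWriter();
    PrintWriter pw = new PrintWriter(sw);
    parse.pennPrint(pw);
    pw.flush();
    return sw.toString();
  }
}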
Use of edu.stanford.nlp.process.DocumentPreprocessor in project CoreNLP by stanfordnlp.
The class AnnotatedTextReader, method parseFile.
public static List<CoreMap> parseFile(BufferedReader reader, Set<String> categoriesAllowed, Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels, boolean setGoldClass, String sentIDprefix) throws IOException {
  Pattern startingLabelToken = Pattern.compile("<(" + StringUtils.join(categoriesAllowed, "|") + ")>");
  Pattern endLabelToken = Pattern.compile("</(" + StringUtils.join(categoriesAllowed, "|") + ")>");
  String backgroundSymbol = "O";
  List<CoreMap> sentences = new ArrayList<>();
  int lineNum = -1;
  String l = null;
  while ((l = reader.readLine()) != null) {
    lineNum++;
    String[] t = l.split("\t", 2);
    String id = null;
    String text = null;
    if (t.length == 2) {
      id = t[0];
      text = t[1];
    } else if (t.length == 1) {
      text = t[0];
      id = String.valueOf(lineNum);
    }
    id = sentIDprefix + id;
    DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
    PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizerFactory.newCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
    dp.setTokenizerFactory(tokenizerFactory);
    String label = backgroundSymbol;
    int sentNum = -1;
    for (List<HasWord> sentence : dp) {
      sentNum++;
      String sentStr = "";
      List<CoreLabel> sent = new ArrayList<>();
      for (HasWord tokw : sentence) {
        String tok = tokw.word();
        Matcher startingMatcher = startingLabelToken.matcher(tok);
        Matcher endMatcher = endLabelToken.matcher(tok);
        if (startingMatcher.matches()) {
          // System.out.println("matched starting");
          label = startingMatcher.group(1);
        } else if (endMatcher.matches()) {
          // System.out.println("matched end");
          label = backgroundSymbol;
        } else {
          CoreLabel c = new CoreLabel();
          List<String> toks = new ArrayList<>();
          toks.add(tok);
          for (String toksplit : toks) {
            sentStr += " " + toksplit;
            c.setWord(toksplit);
            c.setLemma(toksplit);
            c.setValue(toksplit);
            c.set(CoreAnnotations.TextAnnotation.class, toksplit);
            c.set(CoreAnnotations.OriginalTextAnnotation.class, tok);
            if (setGoldClass) {
              c.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
            }
            if (setClassForTheseLabels != null && setClassForTheseLabels.containsKey(label)) {
              c.set(setClassForTheseLabels.get(label), label);
            }
            sent.add(c);
          }
        }
      }
      CoreMap sentcm = new ArrayCoreMap();
      sentcm.set(CoreAnnotations.TextAnnotation.class, sentStr.trim());
      sentcm.set(CoreAnnotations.TokensAnnotation.class, sent);
      sentcm.set(CoreAnnotations.DocIDAnnotation.class, id + "-" + sentNum);
      sentences.add(sentcm);
    }
  }
  return sentences;
}
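A sketch of how parseFile might be invoked on a small in-memory document. The label name DRUG, the input line, and the sentence-ID prefix are illustrative assumptions; passing null for setClassForTheseLabels is safe because the method null-checks it before use.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.util.CoreMap;

class ParseFileSketch {
  public static void main(String[] args) throws IOException {
    // One tab-separated "id<TAB>text" line with an inline <DRUG> tag.
    String input = "sent1\tTake <DRUG>aspirin</DRUG> for the pain.";
    Set<String> categories = new HashSet<>();
    categories.add("DRUG");
    // Assumes AnnotatedTextReader is imported from its CoreNLP package.
    List<CoreMap> sentences = AnnotatedTextReader.parseFile(
        new BufferedReader(new StringReader(input)), categories, null, true, "doc1-");
    for (CoreMap sentence : sentences) {
      System.out.println(sentence.get(CoreAnnotations.DocIDAnnotation.class)
          + ": " + sentence.get(CoreAnnotations.TextAnnotation.class));
    }
  }
}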