Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
The class ThreadedParserSlowITest, method processFile:
public static List<Tree> processFile(LexicalizedParser parser, List<Tree> input) {
  List<Tree> results = new ArrayList<Tree>();
  for (Tree tree : input) {
    // Re-parse the word yield of each gold tree.
    List<HasWord> sentence = tree.yieldHasWord();
    Tree output = parser.parseTree(sentence);
    results.add(output);
    // Report progress every 10 trees and once at the end.
    if (results.size() % 10 == 0 || results.size() == input.size()) {
      System.out.println("Processed " + results.size() + " trees");
    }
  }
  return results;
}
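A minimal way to exercise this method, assuming the standard englishPCFG model and the test class are on the classpath; the demo class name and the single tree literal are only illustrations:

import java.util.Arrays;
import java.util.List;

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Tree;

public class ProcessFileDemo {
  public static void main(String[] args) {
    LexicalizedParser parser = LexicalizedParser.loadModel(
        "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    // In the integration test the input trees come from a treebank; here we build one by hand.
    List<Tree> input = Arrays.asList(
        Tree.valueOf("(ROOT (S (NP (DT The) (NN dog)) (VP (VBZ barks)) (. .)))"));
    for (Tree t : ThreadedParserSlowITest.processFile(parser, input)) {
      t.pennPrint();
    }
  }
}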
Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
The class WordToTaggedWordProcessor, method main:
/**
* This will print out some text, recognizing tags. It can be used to
* test tag breaking. <br> Usage: <code>
* java edu.stanford.nlp.process.WordToTaggedWordProcessor fileOrUrl
* </code>
*
* @param args Command line argument: a file or URL
*/
public static void main(String[] args) {
  if (args.length != 1) {
    System.out.println("usage: java edu.stanford.nlp.process.WordToTaggedWordProcessor fileOrUrl");
    System.exit(0);
  }
  String filename = args[0];
  try {
    Document<HasWord, Word, Word> d;
    if (filename.startsWith("http://")) {
      Document<HasWord, Word, Word> dpre = new BasicDocument<HasWord>().init(new URL(filename));
      DocumentProcessor<Word, Word, HasWord, Word> notags = new StripTagsProcessor<>();
      d = notags.processDocument(dpre);
    } else {
      d = new BasicDocument<HasWord>().init(new File(filename));
    }
    DocumentProcessor<Word, HasWord, HasWord, Word> proc = new WordToTaggedWordProcessor<>();
    Document<HasWord, Word, HasWord> sentd = proc.processDocument(d);
    // System.out.println(sentd);
    int i = 0;
    for (HasWord w : sentd) {
      System.out.println(i + ": " + w);
      i++;
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
}
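The same processor can also be driven in memory through its ListProcessor interface. A minimal sketch, assuming '/' is the default tag divider; the class name TagSplitDemo is illustrative:

import java.util.Arrays;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.WordToTaggedWordProcessor;

public class TagSplitDemo {
  public static void main(String[] args) {
    // Tokens carrying an in-line tag after the divider character (assumed '/').
    List<Word> words = Arrays.asList(new Word("The/DT"), new Word("dog/NN"), new Word("barks/VBZ"));
    // Type parameters mirror the main method above: input Word, label HasWord, feature Word.
    WordToTaggedWordProcessor<Word, HasWord, Word> proc = new WordToTaggedWordProcessor<>();
    for (HasWord w : proc.process(words)) {
      System.out.println(w); // each token should come back with word and tag split apart
    }
  }
}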
Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
The class DocumentPreprocessor, method main:
/**
* A simple, deterministic sentence-splitter. This method only supports the English
* tokenizer, so for other languages you should run the tokenizer first and then
* run this sentence splitter with the "-whitespaceTokenization" option.
*
* @param args Command-line arguments
*/
public static void main(String[] args) throws IOException {
  final Properties options = StringUtils.argsToProperties(args, argOptionDefs());
  if (options.containsKey("help")) {
    log.info(usage());
    return;
  }

  // Command-line flags
  String encoding = options.getProperty("encoding", "utf-8");
  boolean printSentenceLengths = PropertiesUtils.getBool(options, "printSentenceLengths", false);
  String xmlElementDelimiter = options.getProperty("xml", null);
  DocType docType = xmlElementDelimiter == null ? DocType.Plain : DocType.XML;
  String sentenceDelimiter = options.containsKey("noTokenization") ? System.getProperty("line.separator") : null;
  String tagDelimiter = options.getProperty("tag", null);
  String[] sentenceDelims = null;

  // Setup the TokenizerFactory
  int numFactoryFlags = 0;
  boolean suppressEscaping = options.containsKey("suppressEscaping");
  if (suppressEscaping) numFactoryFlags += 1;
  boolean customTokenizer = options.containsKey("tokenizerOptions");
  if (customTokenizer) numFactoryFlags += 1;
  boolean printOriginalText = options.containsKey("printOriginalText");
  if (printOriginalText) numFactoryFlags += 1;
  boolean whitespaceTokenization = options.containsKey("whitespaceTokenization");
  if (whitespaceTokenization) numFactoryFlags += 1;
  if (numFactoryFlags > 1) {
    log.info("Only one tokenizer flag allowed at a time: ");
    log.info("  -suppressEscaping, -tokenizerOptions, -printOriginalText, -whitespaceTokenization");
    return;
  }

  TokenizerFactory<? extends HasWord> tf = null;
  if (suppressEscaping) {
    tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "ptb3Escaping=false");
  } else if (customTokenizer) {
    tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), options.getProperty("tokenizerOptions"));
  } else if (printOriginalText) {
    tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true");
  } else if (whitespaceTokenization) {
    List<String> whitespaceDelims = new ArrayList<>(Arrays.asList(DocumentPreprocessor.DEFAULT_SENTENCE_DELIMS));
    whitespaceDelims.add(WhitespaceLexer.NEWLINE);
    sentenceDelims = whitespaceDelims.toArray(new String[whitespaceDelims.size()]);
  } else {
    tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
  }

  String fileList = options.getProperty("", null);
  String[] files = fileList == null ? new String[1] : fileList.split("\\s+");
  int numSents = 0;
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, encoding), true);
  for (String file : files) {
    DocumentPreprocessor docPreprocessor;
    if (file == null || file.isEmpty()) {
      docPreprocessor = new DocumentPreprocessor(new InputStreamReader(System.in, encoding));
    } else {
      docPreprocessor = new DocumentPreprocessor(file, docType, encoding);
    }
    if (docType == DocType.XML) {
      docPreprocessor.setElementDelimiter(xmlElementDelimiter);
    }
    docPreprocessor.setTokenizerFactory(tf);
    if (sentenceDelimiter != null) {
      docPreprocessor.setSentenceDelimiter(sentenceDelimiter);
    }
    if (tagDelimiter != null) {
      docPreprocessor.setTagDelimiter(tagDelimiter);
    }
    if (sentenceDelims != null) {
      docPreprocessor.setSentenceFinalPuncWords(sentenceDelims);
    }

    for (List<HasWord> sentence : docPreprocessor) {
      numSents++;
      if (printSentenceLengths) {
        System.err.printf("Length: %d%n", sentence.size());
      }
      boolean printSpace = false;
      for (HasWord word : sentence) {
        if (printOriginalText) {
          CoreLabel cl = (CoreLabel) word;
          if (!printSpace) {
            pw.print(cl.get(CoreAnnotations.BeforeAnnotation.class));
            printSpace = true;
          }
          pw.print(cl.get(CoreAnnotations.OriginalTextAnnotation.class));
          pw.print(cl.get(CoreAnnotations.AfterAnnotation.class));
        } else {
          if (printSpace) pw.print(" ");
          printSpace = true;
          pw.print(word.word());
        }
      }
      pw.println();
    }
  }
  pw.close();
  System.err.printf("Read in %d sentences.%n", numSents);
}
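For programmatic use, the same splitter can be pointed at any Reader; iterating then yields one tokenized List<HasWord> per sentence. A minimal sketch (the class name SplitDemo is illustrative; SentenceUtils.listToString was Sentence.listToString in older releases):

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.process.DocumentPreprocessor;

public class SplitDemo {
  public static void main(String[] args) {
    DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader("The dog barks. The cat meows."));
    int numSents = 0;
    // Each iteration produces one sentence, tokenized with the default PTB tokenizer.
    for (List<HasWord> sentence : dp) {
      numSents++;
      System.out.println(numSents + ": " + SentenceUtils.listToString(sentence));
    }
  }
}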
Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
The class PTBEscapingProcessor, method main:
/**
* This will do the escaping on an input file. Input file should already be tokenized,
* with tokens separated by whitespace. <br>
* Usage: java edu.stanford.nlp.process.PTBEscapingProcessor fileOrUrl
*
* @param args Command line argument: a file or URL
*/
public static void main(String[] args) {
  if (args.length != 1) {
    System.out.println("usage: java edu.stanford.nlp.process.PTBEscapingProcessor fileOrUrl");
    return;
  }
  String filename = args[0];
  try {
    // initialized below
    Document<String, Word, Word> d;
    if (filename.startsWith("http://")) {
      Document<String, Word, Word> dpre = new BasicDocument<String>(WhitespaceTokenizer.factory()).init(new URL(filename));
      DocumentProcessor<Word, Word, String, Word> notags = new StripTagsProcessor<>();
      d = notags.processDocument(dpre);
    } else {
      d = new BasicDocument<String>(WhitespaceTokenizer.factory()).init(new File(filename));
    }
    DocumentProcessor<Word, HasWord, String, Word> proc = new PTBEscapingProcessor<>();
    Document<String, Word, HasWord> newD = proc.processDocument(d);
    for (HasWord word : newD) {
      System.out.println(word);
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
}
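The escaping is also usable in memory via the processor's list interface. A minimal sketch with an illustrative class name EscapeDemo; under PTB conventions the parentheses should come back as -LRB- and -RRB-:

import java.util.Arrays;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBEscapingProcessor;

public class EscapeDemo {
  public static void main(String[] args) {
    // Pre-tokenized input containing characters that PTB escaping rewrites.
    List<Word> tokens = Arrays.asList(new Word("("), new Word("hello"), new Word(")"));
    // Type parameters mirror the main method above.
    PTBEscapingProcessor<Word, String, Word> escaper = new PTBEscapingProcessor<>();
    for (HasWord w : escaper.process(tokens)) {
      System.out.println(w);
    }
  }
}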
Use of edu.stanford.nlp.ling.HasWord in project CoreNLP by stanfordnlp.
The class SemanticHeadFinder, method isVerbalAuxiliary:
private boolean isVerbalAuxiliary(Tree preterminal, Set<String> verbalSet, boolean allowJustTagMatch) {
  if (preterminal.isPreTerminal()) {
    // Pull the POS tag off the preterminal label, falling back to its value.
    Label kidLabel = preterminal.label();
    String tag = null;
    if (kidLabel instanceof HasTag) {
      tag = ((HasTag) kidLabel).tag();
    }
    if (tag == null) {
      tag = preterminal.value();
    }
    // Pull the word off the leaf under the preterminal, again with a fallback.
    Label wordLabel = preterminal.firstChild().label();
    String word = null;
    if (wordLabel instanceof HasWord) {
      word = ((HasWord) wordLabel).word();
    }
    if (word == null) {
      word = wordLabel.value();
    }
    if (DEBUG) {
      log.info("Checking " + preterminal.value() + " head is " + word + '/' + tag);
    }
    String lcWord = word.toLowerCase();
    // An unambiguous auxiliary tag suffices when allowJustTagMatch is set;
    // otherwise both the tag and the lowercased word must be in the verbal sets.
    if (allowJustTagMatch && unambiguousAuxiliaryTags.contains(tag) || verbalTags.contains(tag) && verbalSet.contains(lcWord)) {
      if (DEBUG) {
        log.info("isAuxiliary found desired type of aux");
      }
      return true;
    }
  }
  return false;
}
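isVerbalAuxiliary is private, but its effect is visible through the public HeadFinder interface: on a VP headed by an auxiliary, SemanticHeadFinder should descend to the content verb rather than the auxiliary. A minimal sketch (HeadDemo is an illustrative name and the tree literal is hand-built):

import edu.stanford.nlp.trees.SemanticHeadFinder;
import edu.stanford.nlp.trees.Tree;

public class HeadDemo {
  public static void main(String[] args) {
    Tree tree = Tree.valueOf("(ROOT (S (NP (DT The) (NN dog)) (VP (VBZ has) (VP (VBN barked))) (. .)))");
    SemanticHeadFinder hf = new SemanticHeadFinder();
    Tree vp = tree.firstChild().getChild(1); // the VP over "has barked"
    // determineHead returns the head child; semantically this should be the
    // inner VP around "barked", not the auxiliary "has".
    Tree head = hf.determineHead(vp);
    System.out.println("Head of VP: " + head);
  }
}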