Usage of edu.stanford.nlp.ling.HasWord in the CoreNLP project (stanfordnlp): class Tree, method yieldHasWord.
/**
 * Gathers the leaves of this tree into {@code y}, guaranteeing every added
 * element implements {@link HasWord}. Leaf labels that are not {@code HasWord}
 * are wrapped in a {@link Word}; {@link CoreLabel} leaves with a null word get
 * their word backfilled from their value.
 *
 * @param y the list to append leaf words to (also returned for chaining)
 * @return the same list {@code y}, with this tree's leaves appended in order
 */
@SuppressWarnings("unchecked")
public <X extends HasWord> ArrayList<X> yieldHasWord(ArrayList<X> y) {
  if (isLeaf()) {
    Label lab = label();
    // The leaf label may not implement HasWord, e.g. when a tree is built by a
    // LabeledScoredTreeFactory but passes in a StringLabel to e.g. newLeaf().
    if (lab instanceof HasWord) {
      if (lab instanceof CoreLabel) {
        CoreLabel cl = (CoreLabel) lab;
        // CoreLabels read from trees often carry only a value; promote it.
        if (cl.word() == null)
          cl.setWord(cl.value());
        y.add((X) cl);
      } else {
        y.add((X) lab);
      }
    } else {
      y.add((X) new Word(lab));
    }
  } else {
    Tree[] kids = children();
    for (Tree kid : kids) {
      // Fix: recurse via yieldHasWord (was kid.yield(y)), so that nested
      // non-HasWord leaves are also converted/wrapped as documented above.
      kid.yieldHasWord(y);
    }
  }
  return y;
}
Usage of edu.stanford.nlp.ling.HasWord in the CoreNLP project (stanfordnlp): class Tree, method makeDependencyLabel.
/**
 * Converts a constituency label into a dependency label, optionally carrying
 * over the word form, the POS tag, and the token index.
 *
 * @param oldLabel   the constituency label to convert
 * @param copyLabel  if false, {@code oldLabel} is returned unchanged
 * @param copyIndex  copy the index when both labels implement {@code HasIndex}
 * @param copyPosTag copy the tag when both labels implement {@code HasTag}
 * @return the new dependency label, or {@code oldLabel} if {@code copyLabel} is false
 */
private static Label makeDependencyLabel(Label oldLabel, boolean copyLabel, boolean copyIndex, boolean copyPosTag) {
  if (!copyLabel) {
    return oldLabel;
  }
  final String word;
  if (oldLabel instanceof HasWord) {
    word = ((HasWord) oldLabel).word();
  } else {
    word = oldLabel.value();
  }
  Label result = oldLabel.labelFactory().newLabel(word);
  if (result instanceof HasWord) {
    ((HasWord) result).setWord(word);
  }
  if (copyPosTag && result instanceof HasTag && oldLabel instanceof HasTag) {
    ((HasTag) result).setTag(((HasTag) oldLabel).tag());
  }
  if (copyIndex && result instanceof HasIndex && oldLabel instanceof HasIndex) {
    ((HasIndex) result).setIndex(((HasIndex) oldLabel).index());
  }
  return result;
}
Usage of edu.stanford.nlp.ling.HasWord in the CoreNLP project (stanfordnlp): class BuildBinarizedDataset, method main.
/**
 * Turns a text file into trees for use in a RNTN classifier such as
 * the treebank used in the Sentiment project.
 * <br>
 * The expected input file is one sentence per line, with sentences
 * separated by blank lines. The first line has the main label of the sentence together with the full sentence.
 * Lines after the first sentence line but before
 * the blank line will be treated as labeled sub-phrases. The
 * labels should start with the label and then contain a list of
 * tokens the label applies to. All phrases that do not have their own label will take on the main sentence label!
 * For example:
 * <br>
 * <code>
 * 1 Today is not a good day.<br>
 * 3 good<br>
 * 3 good day <br>
 * 3 a good day <br>
 * <br>
 * (next block starts here) <br>
 * </code>
 * By default the englishPCFG parser is used. This can be changed
 * with the <code>-parserModel</code> flag. Specify an input file
 * with <code>-input</code>.
 * <br>
 * If a sentiment model is provided with -sentimentModel, that model
 * will be used to prelabel the sentences. Any spans with given
 * labels will then be used to adjust those labels.
 */
public static void main(String[] args) {
  CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
  String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
  String inputPath = null;
  String sentimentModelPath = null;
  SentimentModel sentimentModel = null;
  // Parse command-line flags; each recognized flag consumes its value too.
  for (int argIndex = 0; argIndex < args.length; ) {
    if (args[argIndex].equalsIgnoreCase("-input")) {
      inputPath = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-parserModel")) {
      parserModel = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-sentimentModel")) {
      sentimentModelPath = args[argIndex + 1];
      argIndex += 2;
    } else {
      log.info("Unknown argument " + args[argIndex]);
      System.exit(2);
    }
  }
  if (inputPath == null) {
    throw new IllegalArgumentException("Must specify input file with -input");
  }
  LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
  TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
  if (sentimentModelPath != null) {
    sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);
  }
  String text = IOUtils.slurpFileNoExceptions(inputPath);
  // need blank line to make a new chunk
  String[] chunks = text.split("\\n\\s*\\n+");
  for (String chunk : chunks) {
    // Hoisted: trim once per chunk instead of twice.
    String trimmedChunk = chunk.trim();
    if (trimmedChunk.isEmpty()) {
      continue;
    }
    // The expected format is that line 0 will be the text of the
    // sentence, and each subsequence line, if any, will be a value
    // followed by the sequence of tokens that get that value.
    // Here we take the first line and tokenize it as one sentence.
    String[] lines = trimmedChunk.split("\\n");
    String sentence = lines[0];
    StringReader sin = new StringReader(sentence);
    DocumentPreprocessor document = new DocumentPreprocessor(sin);
    document.setSentenceFinalPuncWords(new String[] { "\n" });
    List<HasWord> tokens = document.iterator().next();
    // Fix: Integer.valueOf instead of the deprecated new Integer(String) constructor.
    Integer mainLabel = Integer.valueOf(tokens.get(0).word());
    tokens = tokens.subList(1, tokens.size());
    Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
    for (int i = 1; i < lines.length; ++i) {
      extractLabels(spanToLabels, tokens, lines[i]);
    }
    // TODO: add an option which treats the spans as constraints when parsing
    Tree tree = parser.apply(tokens);
    Tree binarized = binarizer.transformTree(tree);
    Tree collapsedUnary = transformer.transformTree(binarized);
    // label here and then use the user given labels to adjust
    if (sentimentModel != null) {
      Trees.convertToCoreLabels(collapsedUnary);
      SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
      scorer.forwardPropagateTree(collapsedUnary);
      setPredictedLabels(collapsedUnary);
    } else {
      setUnknownLabels(collapsedUnary, mainLabel);
    }
    Trees.convertToCoreLabels(collapsedUnary);
    collapsedUnary.indexSpans();
    for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) {
      setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue());
    }
    System.out.println(collapsedUnary);
  }
}
Usage of edu.stanford.nlp.ling.HasWord in the CoreNLP project (stanfordnlp): class SequenceGibbsSampler, method printSamples.
/**
 * Prints one row per document position: the token's word (padded/trimmed to
 * 10 characters, or "null" when the token is absent) followed by the sampled
 * state at that position from each sample sequence.
 *
 * @param samples list of {@code int[]} state sequences, one per sample
 * @param out     stream to print the table to
 */
public void printSamples(List samples, PrintStream out) {
  for (int pos = 0; pos < document.size(); pos++) {
    HasWord token = (HasWord) document.get(pos);
    String text = (token == null) ? "null" : token.word();
    out.print(StringUtils.padOrTrim(text, 10));
    for (Object sample : samples) {
      int[] states = (int[]) sample;
      out.print(" " + StringUtils.padLeft(states[pos], 2));
    }
    out.println();
  }
}
Usage of edu.stanford.nlp.ling.HasWord in the CoreNLP project (stanfordnlp): class AbstractSequenceClassifier, method preprocessTokens.
/**
 * Converts a raw token sequence into this classifier's internal document
 * representation: each token becomes an {@code IN} carrying its text, its
 * position in the sequence, and the background answer symbol, after which the
 * whole document is run through the ObjectBankWrapper preprocessing.
 *
 * @param tokenSequence the input tokens to wrap
 * @return the preprocessed document of classifier tokens
 */
private List<IN> preprocessTokens(List<? extends HasWord> tokenSequence) {
  List<IN> document = new ArrayList<>();
  int position = 0;
  for (HasWord token : tokenSequence) {
    IN labeledToken;
    if (token instanceof CoreMap) {
      // Copy all annotations! Some are required later in
      // AbstractSequenceClassifier.classifyWithInlineXML.
      labeledToken = tokenFactory.makeToken((IN) token);
    } else {
      labeledToken = tokenFactory.makeToken();
      labeledToken.set(CoreAnnotations.TextAnnotation.class, token.word());
    }
    labeledToken.set(CoreAnnotations.PositionAnnotation.class, Integer.toString(position));
    labeledToken.set(CoreAnnotations.AnswerAnnotation.class, backgroundSymbol());
    document.add(labeledToken);
    position++;
  }
  // TODO get rid of ObjectBankWrapper
  ObjectBankWrapper<IN> wrapper = new ObjectBankWrapper<>(flags, null, knownLCWords);
  wrapper.processDocument(document);
  return document;
}
Aggregations