use of edu.stanford.nlp.parser.lexparser.TreeBinarizer in project CoreNLP by stanfordnlp.
the class ShiftReduceParser method binarizeTreebank.
/**
 * Runs every tree of the given treebank through a binarizer followed by a
 * {@code BasicCategoryTreeTransformer}, then converts the labels to CoreLabels,
 * percolates head annotations and indexes the leaves.
 *
 * @param treebank the trees to transform
 * @param op options supplying the head finder and treebank language pack
 * @return the transformed trees, in the iteration order of the transformed treebank
 */
public static List<Tree> binarizeTreebank(Treebank treebank, Options op) {
  // Compose the two transformations so each tree is rewritten in a single pass:
  // binarization first, then the basic-category transformer.
  CompositeTreeTransformer pipeline = new CompositeTreeTransformer();
  pipeline.addTransformer(TreeBinarizer.simpleTreeBinarizer(op.tlpParams.headFinder(), op.tlpParams.treebankLanguagePack()));
  pipeline.addTransformer(new BasicCategoryTreeTransformer(op.langpack()));
  Treebank transformed = treebank.transform(pipeline);

  HeadFinder headFinder = new BinaryHeadFinder(op.tlpParams.headFinder());
  List<Tree> result = Generics.newArrayList();
  for (Tree current : transformed) {
    Trees.convertToCoreLabels(current);
    current.percolateHeadAnnotations(headFinder);
    // Leaves are numbered starting at 1 because downstream tools expect
    // 1-based indices; code internal to the srparser renormalizes them.
    current.indexLeaves(1, true);
    result.add(current);
  }
  return result;
}
use of edu.stanford.nlp.parser.lexparser.TreeBinarizer in project CoreNLP by stanfordnlp.
the class BuildBinarizedDataset method main.
/**
 * Turns a text file into trees for use in a RNTN classifier such as
 * the treebank used in the Sentiment project.
 * <br>
 * The expected input file consists of blocks separated by blank
 * lines, one block per sentence. The first line of a block has the
 * main label of the sentence followed by the full sentence.
 * Lines after the first line but before the blank line will be
 * treated as labeled sub-phrases. Each of those lines should start
 * with the label and then contain the list of tokens the label
 * applies to. When no sentiment model is given, all phrases that do
 * not have their own label will take on the main sentence label!
 * For example:
 * <br>
 * <code>
 * 1 Today is not a good day.<br>
 * 3 good<br>
 * 3 good day <br>
 * 3 a good day <br>
 * <br>
 * (next block starts here) <br>
 * </code>
 * <br>
 * If you have an example sentence you want to label, you will need
 * to manually label the subtrees from there. For example, to build
 * a 5 class dataset which matches the existing datasets, you would
 * label the very negative phrases with 0, neutral phrases with 2,
 * very positive phrases with 4. The binary label dataset uses 0
 * for negative, 1 for positive, and -1 for unlabeled (which can
 * mean neutral, although the binary model will not predict
 * neutral).
 * <br>
 * In order to determine which sub-phrases would need labeling, you
 * can run the sentences through the same parser used to turn the
 * sentences into trees. For example, in the case of using the
 * englishPCFG model, you can look at the main class of
 * edu.stanford.nlp.parser.lexparser.LexicalizedParser . You will
 * definitely want to provide a label for the entire sentence. Any
 * subphrases which have a significantly different sentiment should
 * be labeled, such as the previous example of "not a good day" vs
 * "a good day".
 * <br>
 * Although it would be excessive to do so, a list of ALL of the
 * subphrases contained in a parsed tree can be produced by first
 * running the parser, then using the tool
 * edu.stanford.nlp.trees.OutputSubtrees
 * <br>
 * By default the englishPCFG parser is used. This can be changed
 * with the {@code -parserModel} flag. Specify an input file
 * with {@code -input}.
 * <br>
 * If a sentiment model is provided with {@code -sentimentModel}, that
 * model will be used to prelabel the sentences. Any spans with given
 * labels will then be used to adjust those labels.
 *
 * @param args command line flags: {@code -input <file>} (required),
 *        {@code -parserModel <path>} and {@code -sentimentModel <path>}
 * @throws IllegalArgumentException if {@code -input} is not given, if a
 *         flag is missing its value, or if the input file yields no text
 */
public static void main(String[] args) {
  CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
  String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
  String inputPath = null;
  String sentimentModelPath = null;
  SentimentModel sentimentModel = null;
  for (int argIndex = 0; argIndex < args.length; ) {
    String flag = args[argIndex];
    boolean isInput = flag.equalsIgnoreCase("-input");
    boolean isParserModel = flag.equalsIgnoreCase("-parserModel");
    boolean isSentimentModel = flag.equalsIgnoreCase("-sentimentModel");
    if (!(isInput || isParserModel || isSentimentModel)) {
      log.info("Unknown argument " + flag);
      System.exit(2);
    }
    // Every recognized flag takes a value; fail with a readable message
    // instead of an ArrayIndexOutOfBoundsException when it is missing.
    if (argIndex + 1 >= args.length) {
      throw new IllegalArgumentException("Missing value for argument " + flag);
    }
    String value = args[argIndex + 1];
    if (isInput) {
      inputPath = value;
    } else if (isParserModel) {
      parserModel = value;
    } else if (isSentimentModel) {
      sentimentModelPath = value;
    }
    argIndex += 2;
  }
  if (inputPath == null) {
    throw new IllegalArgumentException("Must specify input file with -input");
  }
  LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
  TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
  if (sentimentModelPath != null) {
    sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);
  }
  String text = IOUtils.slurpFileNoExceptions(inputPath);
  // NOTE(review): defensive check; the helper's name suggests it may report
  // failure without a checked exception, possibly by returning null - confirm.
  if (text == null) {
    throw new IllegalArgumentException("Could not read input file " + inputPath);
  }
  // need blank line to make a new chunk
  String[] chunks = text.split("\\n\\s*\\n+");
  for (String chunk : chunks) {
    if (chunk.trim().isEmpty()) {
      continue;
    }
    // The expected format is that line 0 will be the main label followed
    // by the text of the sentence, and each subsequent line, if any, will
    // be a value followed by the sequence of tokens that get that value.
    // Here we take the first line and tokenize it as one sentence.
    String[] lines = chunk.trim().split("\\n");
    String sentence = lines[0];
    StringReader sin = new StringReader(sentence);
    DocumentPreprocessor document = new DocumentPreprocessor(sin);
    document.setSentenceFinalPuncWords(new String[] { "\n" });
    List<HasWord> tokens = document.iterator().next();
    // The first token is the sentence-level label; the rest is the sentence.
    Integer mainLabel = Integer.valueOf(tokens.get(0).word());
    tokens = tokens.subList(1, tokens.size());
    Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
    for (int i = 1; i < lines.length; ++i) {
      extractLabels(spanToLabels, tokens, lines[i]);
    }
    // TODO: add an option which treats the spans as constraints when parsing
    Tree tree = parser.apply(tokens);
    Tree binarized = binarizer.transformTree(tree);
    Tree collapsedUnary = transformer.transformTree(binarized);
    // label here and then use the user given labels to adjust
    if (sentimentModel != null) {
      Trees.convertToCoreLabels(collapsedUnary);
      SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
      scorer.forwardPropagateTree(collapsedUnary);
      setPredictedLabels(collapsedUnary);
    } else {
      setUnknownLabels(collapsedUnary, mainLabel);
    }
    // Needed for the no-model branch above, which has not converted the labels yet.
    Trees.convertToCoreLabels(collapsedUnary);
    collapsedUnary.indexSpans();
    // Apply the user-supplied span labels on top of the default/predicted ones.
    for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) {
      setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue());
    }
    System.out.println(collapsedUnary);
  }
}
use of edu.stanford.nlp.parser.lexparser.TreeBinarizer in project CoreNLP by stanfordnlp.
the class ShiftReduceParser method binarizeTreebank.
/**
 * Transforms each given tree with a binarizer followed by a
 * {@code BasicCategoryTreeTransformer}, then converts the labels to CoreLabels,
 * percolates head annotations and indexes the leaves. Trees which do not
 * report themselves as binarized after the transformation are logged
 * and left out of the result.
 *
 * @param treebank the trees to transform
 * @param op options supplying the head finder and treebank language pack
 * @return the successfully binarized trees, in input order
 */
public static List<Tree> binarizeTreebank(Iterable<Tree> treebank, Options op) {
  CompositeTreeTransformer pipeline = new CompositeTreeTransformer();
  pipeline.addTransformer(TreeBinarizer.simpleTreeBinarizer(op.tlpParams.headFinder(), op.tlpParams.treebankLanguagePack()));
  pipeline.addTransformer(new BasicCategoryTreeTransformer(op.langpack()));

  // Phase 1: transform every input tree.
  List<Tree> candidates = new ArrayList<>();
  for (Tree original : treebank) {
    candidates.add(pipeline.transformTree(original));
  }

  // Phase 2: keep only the trees that really came out binarized.
  HeadFinder headFinder = new BinaryHeadFinder(op.tlpParams.headFinder());
  List<Tree> result = new ArrayList<>();
  for (Tree candidate : candidates) {
    if (candidate.isBinarized()) {
      Trees.convertToCoreLabels(candidate);
      candidate.percolateHeadAnnotations(headFinder);
      // Leaves are numbered starting at 1 because downstream tools expect
      // 1-based indices; code internal to the srparser renormalizes them.
      candidate.indexLeaves(1, true);
      result.add(candidate);
    } else {
      log.warn("Found a tree which was not properly binarized. So-called binarized tree is as follows:\n" + candidate.pennString());
    }
  }
  return result;
}
use of edu.stanford.nlp.parser.lexparser.TreeBinarizer in project CoreNLP by stanfordnlp.
the class ParseAndSetLabels method main.
/**
 * Command line entry point: loads a parser, reads a label map and the
 * sentences to process, parses the sentences (binarizing the trees unless
 * {@code -nobinarize} is given), sets labels on the trees and writes them
 * to the output file.
 * <br>
 * Flags taking a value: {@code -output} (required), {@code -labels}
 * (required), {@code -sentences}, {@code -parser}, {@code -tagger},
 * {@code -missing}, {@code -separator}, {@code -default},
 * {@code -saveUnknowns}, {@code -remapLabels}.
 * Boolean flags: {@code -binarize} / {@code -nobinarize},
 * {@code -useLabelKeys} / {@code -nouseLabelKeys}.
 * Exactly one of {@code -sentences} and {@code -useLabelKeys} must be used;
 * with {@code -useLabelKeys} the keys of the label map are the sentences.
 *
 * @param args the command line flags described above
 * @throws IllegalArgumentException on an unknown flag, a flag missing its
 *         value, or a missing or conflicting required flag
 */
public static void main(String[] args) {
  // TODO: rather than always rolling our own arg parser, we should
  // find a library which does it for us nicely
  String outputFile = null;
  String sentencesFile = null;
  String labelsFile = null;
  String parserFile = LexicalizedParser.DEFAULT_PARSER_LOC;
  String taggerFile = null;
  MissingLabels missing = MissingLabels.DEFAULT;
  String defaultLabel = "-1";
  String separator = "\\t+";
  String saveUnknownsFile = null;
  String remapLabels = null;
  int argIndex = 0;
  boolean binarize = true;
  boolean useLabelKeys = false;
  while (argIndex < args.length) {
    if (args[argIndex].equalsIgnoreCase("-output")) {
      outputFile = requireFlagValue(args, argIndex);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-sentences")) {
      sentencesFile = requireFlagValue(args, argIndex);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-labels")) {
      labelsFile = requireFlagValue(args, argIndex);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-parser")) {
      parserFile = requireFlagValue(args, argIndex);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-tagger")) {
      taggerFile = requireFlagValue(args, argIndex);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-missing")) {
      missing = MissingLabels.valueOf(requireFlagValue(args, argIndex));
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-separator")) {
      separator = requireFlagValue(args, argIndex);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-default")) {
      defaultLabel = requireFlagValue(args, argIndex);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-saveUnknowns")) {
      saveUnknownsFile = requireFlagValue(args, argIndex);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-remapLabels")) {
      remapLabels = requireFlagValue(args, argIndex);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-binarize")) {
      binarize = true;
      argIndex += 1;
    } else if (args[argIndex].equalsIgnoreCase("-nobinarize")) {
      binarize = false;
      argIndex += 1;
    } else if (args[argIndex].equalsIgnoreCase("-useLabelKeys")) {
      useLabelKeys = true;
      argIndex += 1;
    } else if (args[argIndex].equalsIgnoreCase("-nouseLabelKeys")) {
      useLabelKeys = false;
      argIndex += 1;
    } else {
      throw new IllegalArgumentException("Unknown argument " + args[argIndex]);
    }
  }
  if (outputFile == null) {
    throw new IllegalArgumentException("-output is required");
  }
  if (sentencesFile == null && !useLabelKeys) {
    throw new IllegalArgumentException("-sentences or -useLabelKeys is required");
  }
  if (sentencesFile != null && useLabelKeys) {
    throw new IllegalArgumentException("Use only one of -sentences or -useLabelKeys");
  }
  if (labelsFile == null) {
    throw new IllegalArgumentException("-labels is required");
  }
  ParserGrammar parser = loadParser(parserFile, taggerFile);
  // A null binarizer is handed to parseSentences when -nobinarize was given.
  TreeBinarizer binarizer = null;
  if (binarize) {
    binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
  }
  Map<String, String> labelMap = readLabelMap(labelsFile, separator, remapLabels);
  List<String> sentences;
  if (sentencesFile != null) {
    sentences = readSentences(sentencesFile);
  } else {
    // -useLabelKeys: the keys of the label map double as the sentences.
    sentences = new ArrayList<>(labelMap.keySet());
  }
  List<Tree> trees = parseSentences(sentences, parser, binarizer);
  // NOTE(review): neither saveUnknownsFile nor unknowns is consumed in this
  // method, so -saveUnknowns currently has no visible effect here - confirm
  // whether the unknowns were meant to be written to that file.
  Set<String> unknowns = setLabels(trees, labelMap, missing, defaultLabel);
  writeTrees(trees, outputFile);
}

/** Returns the value following the flag at {@code flagIndex}, or throws if the flag is the last argument. */
private static String requireFlagValue(String[] args, int flagIndex) {
  if (flagIndex + 1 >= args.length) {
    throw new IllegalArgumentException("Missing value for argument " + args[flagIndex]);
  }
  return args[flagIndex + 1];
}
use of edu.stanford.nlp.parser.lexparser.TreeBinarizer in project CoreNLP by stanfordnlp.
the class ParserAnnotator method finishSentence.
/**
 * Post-processes the parse trees of one sentence and stores the results on it:
 * applies the optional tree mapping, flattens tall trees when a maximum
 * height is configured, fills in the parse annotations, optionally saves a
 * binarized copy of the first tree, and finally makes sure the vertices of
 * the collapsed dependency graph carry the sentence index.
 *
 * @param sentence the sentence to annotate
 * @param trees the parse trees produced for the sentence
 */
private void finishSentence(CoreMap sentence, List<Tree> trees) {
  List<Tree> processed = trees;

  if (treeMap != null) {
    List<Tree> remapped = Generics.newLinkedList();
    for (Tree original : processed) {
      remapped.add(treeMap.apply(original));
    }
    processed = remapped;
  }

  if (maxHeight > 0) {
    processed = ParserUtils.flattenTallTrees(maxHeight, processed);
  }

  ParserAnnotatorUtils.fillInParseAnnotations(VERBOSE, BUILD_GRAPHS, gsf, sentence, processed, extraDependencies);

  if (saveBinaryTrees) {
    // Only the first tree gets a binarized copy stored on the sentence.
    TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
    Tree binaryTree = binarizer.transformTree(processed.get(0));
    Trees.convertToCoreLabels(binaryTree);
    sentence.set(TreeCoreAnnotations.BinarizedTreeAnnotation.class, binaryTree);
  }

  // In some corner cases nodes end up without a sentenceIndex, so make one
  // more pass over the dependency graph and fill in any that are missing.
  SemanticGraph dependencies = sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
  if (dependencies == null) {
    return;
  }
  for (IndexedWord vertex : dependencies.vertexSet()) {
    if (vertex.get(CoreAnnotations.SentenceIndexAnnotation.class) != null) {
      continue;
    }
    Integer sentenceIndex = sentence.get(CoreAnnotations.SentenceIndexAnnotation.class);
    if (sentenceIndex != null) {
      vertex.setSentIndex(sentenceIndex);
    }
  }
}
Aggregations