Use of edu.stanford.nlp.ling.StringLabel in project CoreNLP by stanfordnlp.
The transformTree method of the class NegraPennCollinizer.
public Tree transformTree(Tree tree) {
  Label l = tree.label();
  if (tree.isLeaf()) {
    return tf.newLeaf(l);
  }
  String s = l.value();
  s = tlpp.treebankLanguagePack().basicCategory(s);
  if (deletePunct) {
    // unnecessary for EVALB scoring, since it ignores punctuation anyway
    if (tree.isPreTerminal() && tlpp.treebankLanguagePack().isEvalBIgnoredPunctuationTag(s)) {
      return null;
    }
  }
  // TEMPORARY: eliminate the TOPP constituent
  if (tree.children()[0].label().value().equals("TOPP")) {
    log.info("Found a TOPP");
    tree.setChildren(tree.children()[0].children());
  }
  // Negra has lots of non-unary roots; delete unary roots
  if (tlpp.treebankLanguagePack().isStartSymbol(s) && tree.numChildren() == 1) {
    // NB: This deletes the boundary symbol, which is in the tree!
    return transformTree(tree.getChild(0));
  }
  List<Tree> children = new ArrayList<>();
  for (int cNum = 0, numC = tree.numChildren(); cNum < numC; cNum++) {
    Tree child = tree.getChild(cNum);
    Tree newChild = transformTree(child);
    if (newChild != null) {
      children.add(newChild);
    }
  }
  if (children.isEmpty()) {
    return null;
  }
  return tf.newTreeNode(new StringLabel(s), children);
}
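Collinizers like this one are normally applied symmetrically to the gold tree and the parser's output before bracket scoring, so both sides get the same normalization and punctuation deletion. A minimal sketch of that pattern (the tree variables are hypothetical; tlpp is a TreebankLangParserParams as in the method above):

// Hypothetical usage: normalize both trees identically before EVALB-style scoring.
TreeTransformer collinizer = tlpp.collinizer();
Tree goldNorm = collinizer.transformTree(goldTree);
Tree guessNorm = collinizer.transformTree(guessTree);
// transformTree can return null (e.g., a tree that is all punctuation), so check before scoring.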
Use of edu.stanford.nlp.ling.StringLabel in project CoreNLP by stanfordnlp.
The transformTree method of the class ChineseCollinizer.
private Tree transformTree(Tree tree, boolean isRoot) {
  String label = tree.label().value();
  if (tree.isLeaf()) {
    if (deletePunct && ctlp.isPunctuationWord(label)) {
      return null;
    } else {
      return tf.newLeaf(new StringLabel(label));
    }
  }
  if (tree.isPreTerminal() && deletePunct && ctlp.isPunctuationTag(label)) {
    // System.out.println("Deleting punctuation");
    return null;
  }
  List<Tree> children = new ArrayList<>();
  if (label.matches("ROOT.*") && tree.numChildren() == 1) {
    // keep non-unary roots for now
    return transformTree(tree.children()[0], true);
  }
  //System.out.println("Enhanced label is " + label);
  // remove all functional and machine-generated annotations
  label = label.replaceFirst("[^A-Z].*$", "");
  // merge parentheticals with adverb phrases
  label = label.replaceFirst("PRN", "ADVP");
  for (int cNum = 0; cNum < tree.children().length; cNum++) {
    Tree child = tree.children()[cNum];
    Tree newChild = transformTree(child, false);
    if (newChild != null) {
      children.add(newChild);
    }
  }
  // We don't delete the root, because there are trees in the
  // Chinese treebank that only have punctuation in them!!!
  if (children.isEmpty() && !isRoot) {
    if (VERBOSE) {
      log.info("ChineseCollinizer: all children of " + label + " deleted; returning null");
    }
    return null;
  }
  return tf.newTreeNode(new StringLabel(label), children);
}
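The two replaceFirst calls above carry the actual label normalization: the first regex truncates the label at the first character that is not an uppercase letter, stripping functional tags and indices, and the second folds parentheticals into adverb phrases. A standalone illustration of what they do (the example labels are made up):

// Standalone illustration of the label rewriting above.
String a = "NP-SBJ-1".replaceFirst("[^A-Z].*$", "");  // -> "NP"
String b = "VP=2".replaceFirst("[^A-Z].*$", "");      // -> "VP"
String c = "PRN".replaceFirst("PRN", "ADVP");         // -> "ADVP"
System.out.println(a + " " + b + " " + c);            // prints: NP VP ADVP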
Use of edu.stanford.nlp.ling.StringLabel in project CoreNLP by stanfordnlp.
The textualSummary method of the class Treebank.
/**
 * Return various statistics about the treebank (number of sentences,
 * words, tag set, etc.).
 *
 * @param tlp The TreebankLanguagePack used to determine punctuation and an
 *            appropriate character encoding
 * @return A big string for human consumption describing the treebank
 */
public String textualSummary(TreebankLanguagePack tlp) {
  int numTrees = 0;
  int numTreesLE40 = 0;
  int numNonUnaryRoots = 0;
  Tree nonUnaryEg = null;
  ClassicCounter<Tree> nonUnaries = new ClassicCounter<>();
  ClassicCounter<String> roots = new ClassicCounter<>();
  ClassicCounter<String> starts = new ClassicCounter<>();
  ClassicCounter<String> puncts = new ClassicCounter<>();
  int numUnenclosedLeaves = 0;
  int numLeaves = 0;
  int numNonPhrasal = 0;
  int numPreTerminalWithMultipleChildren = 0;
  int numWords = 0;
  int numTags = 0;
  int shortestSentence = Integer.MAX_VALUE;
  int longestSentence = 0;
  int numNullLabel = 0;
  Set<String> words = Generics.newHashSet();
  ClassicCounter<String> tags = new ClassicCounter<>();
  ClassicCounter<String> cats = new ClassicCounter<>();
  Tree leafEg = null;
  Tree preTerminalMultipleChildrenEg = null;
  Tree nullLabelEg = null;
  Tree rootRewritesAsTaggedWordEg = null;
  for (Tree t : this) {
    roots.incrementCount(t.value());
    numTrees++;
    int leng = t.yield().size();
    if (leng <= 40) {
      numTreesLE40++;
    }
    if (leng < shortestSentence) {
      shortestSentence = leng;
    }
    if (leng > longestSentence) {
      longestSentence = leng;
    }
    if (t.numChildren() > 1) {
      if (numNonUnaryRoots == 0) {
        nonUnaryEg = t;
      }
      if (numNonUnaryRoots < 100) {
        nonUnaries.incrementCount(t.localTree());
      }
      numNonUnaryRoots++;
    } else if (t.isLeaf()) {
      numUnenclosedLeaves++;
    } else {
      Tree t2 = t.firstChild();
      if (t2.isLeaf()) {
        numLeaves++;
        leafEg = t;
      } else if (t2.isPreTerminal()) {
        if (numNonPhrasal == 0) {
          rootRewritesAsTaggedWordEg = t;
        }
        numNonPhrasal++;
      }
      starts.incrementCount(t2.value());
    }
    for (Tree subtree : t) {
      Label lab = subtree.label();
      if (lab == null || lab.value() == null || "".equals(lab.value())) {
        if (numNullLabel == 0) {
          nullLabelEg = subtree;
        }
        numNullLabel++;
        if (lab == null) {
          subtree.setLabel(new StringLabel(""));
        } else if (lab.value() == null) {
          subtree.label().setValue("");
        }
      }
      if (subtree.isLeaf()) {
        numWords++;
        words.add(subtree.value());
      } else if (subtree.isPreTerminal()) {
        numTags++;
        tags.incrementCount(subtree.value());
        if (tlp != null && tlp.isPunctuationTag(subtree.value())) {
          puncts.incrementCount(subtree.firstChild().value());
        }
      } else if (subtree.isPhrasal()) {
        boolean hasLeafChild = false;
        for (Tree kt : subtree.children()) {
          if (kt.isLeaf()) {
            hasLeafChild = true;
          }
        }
        if (hasLeafChild) {
          numPreTerminalWithMultipleChildren++;
          if (preTerminalMultipleChildrenEg == null) {
            preTerminalMultipleChildrenEg = subtree;
          }
        }
        cats.incrementCount(subtree.value());
      } else {
        throw new IllegalStateException("Treebank: Bad tree in treebank!: " + subtree);
      }
    }
  }
  StringWriter sw = new StringWriter(2000);
  PrintWriter pw = new PrintWriter(sw);
  NumberFormat nf = NumberFormat.getNumberInstance();
  nf.setMaximumFractionDigits(0);
  pw.println("Treebank has " + numTrees + " trees (" + numTreesLE40 + " of length <= 40) and " + numWords + " words (tokens)");
  if (numTrees > 0) {
    if (numTags != numWords) {
      pw.println(" Warning! numTags differs and is " + numTags);
    }
    if (roots.size() == 1) {
      String root = (String) roots.keySet().toArray()[0];
      pw.println(" The root category is: " + root);
    } else {
      pw.println(" Warning! " + roots.size() + " different roots in treebank: " + Counters.toString(roots, nf));
    }
    if (numNonUnaryRoots > 0) {
      pw.print(" Warning! " + numNonUnaryRoots + " trees without unary initial rewrite. ");
      if (numNonUnaryRoots > 100) {
        pw.print("First 100 ");
      }
      pw.println("Rewrites: " + Counters.toString(nonUnaries, nf));
      pw.println(" Example: " + nonUnaryEg);
    }
    if (numUnenclosedLeaves > 0 || numLeaves > 0 || numNonPhrasal > 0) {
      pw.println(" Warning! Non-phrasal trees: " + numUnenclosedLeaves + " bare leaves; " + numLeaves + " root rewrites as leaf; and " + numNonPhrasal + " root rewrites as tagged word");
      if (numLeaves > 0) {
        pw.println(" Example bad root rewrites as leaf: " + leafEg);
      }
      if (numNonPhrasal > 0) {
        pw.println(" Example bad root rewrites as tagged word: " + rootRewritesAsTaggedWordEg);
      }
    }
    if (numNullLabel > 0) {
      pw.println(" Warning! " + numNullLabel + " tree nodes with null or empty string labels, e.g.:");
      pw.println(" " + nullLabelEg);
    }
    if (numPreTerminalWithMultipleChildren > 0) {
      pw.println(" Warning! " + numPreTerminalWithMultipleChildren + " preterminal nodes with multiple children.");
      pw.println(" Example: " + preTerminalMultipleChildrenEg);
    }
    pw.println(" Sentences range from " + shortestSentence + " to " + longestSentence + " words, with an average length of " + (((numWords * 100) / numTrees) / 100.0) + " words.");
    pw.println(" " + cats.size() + " phrasal category types, " + tags.size() + " tag types, and " + words.size() + " word types");
    String[] empties = { "*", "0", "*T*", "*RNR*", "*U*", "*?*", "*EXP*", "*ICH*", "*NOT*", "*PPA*", "*OP*", "*pro*", "*PRO*" };
    // What a dopey choice using 0 as an empty element name!!
    // The problem with the below is that words aren't turned into a basic
    // category, but empties commonly are indexed.... Would need to look
    // for them with a suffix of -[0-9]+
    Set<String> knownEmpties = Generics.newHashSet(Arrays.asList(empties));
    Set<String> emptiesIntersection = Sets.intersection(words, knownEmpties);
    if (!emptiesIntersection.isEmpty()) {
      pw.println(" Caution! " + emptiesIntersection.size() + " word types are known empty elements: " + emptiesIntersection);
    }
    Set<String> joint = Sets.intersection(cats.keySet(), tags.keySet());
    if (!joint.isEmpty()) {
      pw.println(" Warning! " + joint.size() + " items are tags and categories: " + joint);
    }
    for (String cat : cats.keySet()) {
      if (cat != null && cat.contains("@")) {
        pw.println(" Warning!! Stanford Parser does not work with categories containing '@' like: " + cat);
        break;
      }
    }
    for (String cat : tags.keySet()) {
      if (cat != null && cat.contains("@")) {
        pw.println(" Warning!! Stanford Parser does not work with tags containing '@' like: " + cat);
        break;
      }
    }
    pw.println(" Cats: " + Counters.toString(cats, nf));
    pw.println(" Tags: " + Counters.toString(tags, nf));
    pw.println(" " + starts.size() + " start categories: " + Counters.toString(starts, nf));
    if (!puncts.isEmpty()) {
      pw.println(" Puncts: " + Counters.toString(puncts, nf));
    }
  }
  return sw.toString();
}
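A typical way to obtain this summary is to load a treebank from disk and print the returned string. A minimal sketch, with an illustrative path and an English language pack standing in for whatever treebank is actually being inspected:

// Minimal sketch: load a treebank and print its textual summary.
TreebankLanguagePack tlp = new PennTreebankLanguagePack();
Treebank treebank = new DiskTreebank();   // or a MemoryTreebank
treebank.loadPath("/path/to/treebank");   // illustrative path
System.out.println(treebank.textualSummary(tlp));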
Use of edu.stanford.nlp.ling.StringLabel in project CoreNLP by stanfordnlp.
The main method of the class ChineseCharacterBasedLexiconTraining.
public static void main(String[] args) throws IOException {
  Map<String, Integer> flagsToNumArgs = Generics.newHashMap();
  flagsToNumArgs.put("-parser", Integer.valueOf(3));
  flagsToNumArgs.put("-lex", Integer.valueOf(3));
  flagsToNumArgs.put("-test", Integer.valueOf(2));
  flagsToNumArgs.put("-out", Integer.valueOf(1));
  flagsToNumArgs.put("-lengthPenalty", Integer.valueOf(1));
  flagsToNumArgs.put("-penaltyType", Integer.valueOf(1));
  flagsToNumArgs.put("-maxLength", Integer.valueOf(1));
  flagsToNumArgs.put("-stats", Integer.valueOf(2));
  Map<String, String[]> argMap = StringUtils.argsToMap(args, flagsToNumArgs);
  boolean eval = argMap.containsKey("-eval");
  PrintWriter pw = null;
  if (argMap.containsKey("-out")) {
    pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream((argMap.get("-out"))[0]), "GB18030"), true);
  }
  log.info("ChineseCharacterBasedLexicon called with args:");
  ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
  for (int i = 0; i < args.length; i++) {
    ctpp.setOptionFlag(args, i);
    log.info(" " + args[i]);
  }
  log.info();
  Options op = new Options(ctpp);
  if (argMap.containsKey("-stats")) {
    String[] statArgs = (argMap.get("-stats"));
    MemoryTreebank rawTrainTreebank = op.tlpParams.memoryTreebank();
    FileFilter trainFilt = new NumberRangesFileFilter(statArgs[1], false);
    rawTrainTreebank.loadPath(new File(statArgs[0]), trainFilt);
    log.info("Done reading trees.");
    MemoryTreebank trainTreebank;
    if (argMap.containsKey("-annotate")) {
      trainTreebank = new MemoryTreebank();
      TreeAnnotator annotator = new TreeAnnotator(ctpp.headFinder(), ctpp, op);
      for (Tree tree : rawTrainTreebank) {
        trainTreebank.add(annotator.transformTree(tree));
      }
      log.info("Done annotating trees.");
    } else {
      trainTreebank = rawTrainTreebank;
    }
    printStats(trainTreebank, pw);
    System.exit(0);
  }
  int maxLength = 1000000;
  // Test.verbose = true;
  if (argMap.containsKey("-norm")) {
    op.testOptions.lengthNormalization = true;
  }
  if (argMap.containsKey("-maxLength")) {
    maxLength = Integer.parseInt((argMap.get("-maxLength"))[0]);
  }
  op.testOptions.maxLength = 120;
  boolean combo = argMap.containsKey("-combo");
  if (combo) {
    ctpp.useCharacterBasedLexicon = true;
    op.testOptions.maxSpanForTags = 10;
    op.doDep = false;
    op.dcTags = false;
  }
  LexicalizedParser lp = null;
  Lexicon lex = null;
  if (argMap.containsKey("-parser")) {
    String[] parserArgs = (argMap.get("-parser"));
    if (parserArgs.length > 1) {
      FileFilter trainFilt = new NumberRangesFileFilter(parserArgs[1], false);
      lp = LexicalizedParser.trainFromTreebank(parserArgs[0], trainFilt, op);
      if (parserArgs.length == 3) {
        String filename = parserArgs[2];
        log.info("Writing parser in serialized format to file " + filename + " ");
        System.err.flush();
        ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
        out.writeObject(lp);
        out.close();
        log.info("done.");
      }
    } else {
      String parserFile = parserArgs[0];
      lp = LexicalizedParser.loadModel(parserFile, op);
    }
    lex = lp.getLexicon();
    op = lp.getOp();
    ctpp = (ChineseTreebankParserParams) op.tlpParams;
  }
  if (argMap.containsKey("-rad")) {
    ctpp.useUnknownCharacterModel = true;
  }
  if (argMap.containsKey("-lengthPenalty")) {
    ctpp.lengthPenalty = Double.parseDouble((argMap.get("-lengthPenalty"))[0]);
  }
  if (argMap.containsKey("-penaltyType")) {
    ctpp.penaltyType = Integer.parseInt((argMap.get("-penaltyType"))[0]);
  }
  if (argMap.containsKey("-lex")) {
    String[] lexArgs = (argMap.get("-lex"));
    if (lexArgs.length > 1) {
      Index<String> wordIndex = new HashIndex<>();
      Index<String> tagIndex = new HashIndex<>();
      lex = ctpp.lex(op, wordIndex, tagIndex);
      MemoryTreebank rawTrainTreebank = op.tlpParams.memoryTreebank();
      FileFilter trainFilt = new NumberRangesFileFilter(lexArgs[1], false);
      rawTrainTreebank.loadPath(new File(lexArgs[0]), trainFilt);
      log.info("Done reading trees.");
      MemoryTreebank trainTreebank;
      if (argMap.containsKey("-annotate")) {
        trainTreebank = new MemoryTreebank();
        TreeAnnotator annotator = new TreeAnnotator(ctpp.headFinder(), ctpp, op);
        for (Tree tree : rawTrainTreebank) {
          tree = annotator.transformTree(tree);
          trainTreebank.add(tree);
        }
        log.info("Done annotating trees.");
      } else {
        trainTreebank = rawTrainTreebank;
      }
      lex.initializeTraining(trainTreebank.size());
      lex.train(trainTreebank);
      lex.finishTraining();
      log.info("Done training lexicon.");
      if (lexArgs.length == 3) {
        // lexArgs.length is already known to be 3 here, so just take the filename argument
        String filename = lexArgs[2];
        log.info("Writing lexicon in serialized format to file " + filename + " ");
        System.err.flush();
        ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
        out.writeObject(lex);
        out.close();
        log.info("done.");
      }
    } else {
      String lexFile = lexArgs.length == 1 ? lexArgs[0] : "parsers/chineseCharLex.ser.gz";
      log.info("Reading Lexicon from file " + lexFile);
      ObjectInputStream in = IOUtils.readStreamFromString(lexFile);
      try {
        lex = (Lexicon) in.readObject();
      } catch (ClassNotFoundException e) {
        throw new RuntimeException("Bad serialized file: " + lexFile);
      }
      in.close();
    }
  }
  if (argMap.containsKey("-test")) {
    boolean segmentWords = ctpp.segment;
    boolean parse = lp != null;
    assert (parse || segmentWords);
    // WordCatConstituent.collinizeWords = argMap.containsKey("-collinizeWords");
    // WordCatConstituent.collinizeTags = argMap.containsKey("-collinizeTags");
    WordSegmenter seg = null;
    if (segmentWords) {
      seg = (WordSegmenter) lex;
    }
    String[] testArgs = (argMap.get("-test"));
    MemoryTreebank testTreebank = op.tlpParams.memoryTreebank();
    FileFilter testFilt = new NumberRangesFileFilter(testArgs[1], false);
    testTreebank.loadPath(new File(testArgs[0]), testFilt);
    TreeTransformer subcategoryStripper = op.tlpParams.subcategoryStripper();
    TreeTransformer collinizer = ctpp.collinizer();
    WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
    WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
    EquivalenceClassEval basicEval = new EquivalenceClassEval(eqclass, eqcheck, "basic");
    EquivalenceClassEval collinsEval = new EquivalenceClassEval(eqclass, eqcheck, "collinized");
    List<String> evalTypes = new ArrayList<>(3);
    boolean goodPOS = false;
    if (segmentWords) {
      evalTypes.add(WordCatConstituent.wordType);
      if (ctpp.segmentMarkov && !parse) {
        evalTypes.add(WordCatConstituent.tagType);
        goodPOS = true;
      }
    }
    if (parse) {
      evalTypes.add(WordCatConstituent.tagType);
      evalTypes.add(WordCatConstituent.catType);
      if (combo) {
        evalTypes.add(WordCatConstituent.wordType);
        goodPOS = true;
      }
    }
    TreeToBracketProcessor proc = new TreeToBracketProcessor(evalTypes);
    log.info("Testing...");
    for (Tree goldTop : testTreebank) {
      Tree gold = goldTop.firstChild();
      List<HasWord> goldSentence = gold.yieldHasWord();
      if (goldSentence.size() > maxLength) {
        log.info("Skipping sentence; too long: " + goldSentence.size());
        continue;
      } else {
        log.info("Processing sentence; length: " + goldSentence.size());
      }
      List<HasWord> s;
      if (segmentWords) {
        StringBuilder goldCharBuf = new StringBuilder();
        for (HasWord aGoldSentence : goldSentence) {
          StringLabel word = (StringLabel) aGoldSentence;
          goldCharBuf.append(word.value());
        }
        String goldChars = goldCharBuf.toString();
        s = seg.segment(goldChars);
      } else {
        s = goldSentence;
      }
      Tree tree;
      if (parse) {
        tree = lp.parseTree(s);
        if (tree == null) {
          throw new RuntimeException("PARSER RETURNED NULL!!!");
        }
      } else {
        tree = Trees.toFlatTree(s);
        tree = subcategoryStripper.transformTree(tree);
      }
      if (pw != null) {
        if (parse) {
          tree.pennPrint(pw);
        } else {
          Iterator sentIter = s.iterator();
          for (; ; ) {
            Word word = (Word) sentIter.next();
            pw.print(word.word());
            if (sentIter.hasNext()) {
              pw.print(" ");
            } else {
              break;
            }
          }
        }
        pw.println();
      }
      if (eval) {
        Collection ourBrackets, goldBrackets;
        ourBrackets = proc.allBrackets(tree);
        goldBrackets = proc.allBrackets(gold);
        if (goodPOS) {
          ourBrackets.addAll(proc.commonWordTagTypeBrackets(tree, gold));
          goldBrackets.addAll(proc.commonWordTagTypeBrackets(gold, tree));
        }
        basicEval.eval(ourBrackets, goldBrackets);
        System.out.println("\nScores:");
        basicEval.displayLast();
        Tree collinsTree = collinizer.transformTree(tree);
        Tree collinsGold = collinizer.transformTree(gold);
        ourBrackets = proc.allBrackets(collinsTree);
        goldBrackets = proc.allBrackets(collinsGold);
        if (goodPOS) {
          ourBrackets.addAll(proc.commonWordTagTypeBrackets(collinsTree, collinsGold));
          goldBrackets.addAll(proc.commonWordTagTypeBrackets(collinsGold, collinsTree));
        }
        collinsEval.eval(ourBrackets, goldBrackets);
        System.out.println("\nCollinized scores:");
        collinsEval.displayLast();
        System.out.println();
      }
    }
    if (eval) {
      basicEval.display();
      System.out.println();
      collinsEval.display();
    }
  }
}
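Given the flagsToNumArgs table at the top of main, -lex takes a treebank path, a file-number range, and a serialization output file; -test takes a path and a range; -out takes one output file. Hypothetical invocations to train a lexicon and then evaluate it (paths, number ranges, and the fully qualified class name are all illustrative placeholders):

java edu.stanford.nlp.parser.lexparser.ChineseCharacterBasedLexiconTraining \
    -lex /path/to/ctb 1-270 chineseCharLex.ser.gz -out seg.out
java edu.stanford.nlp.parser.lexparser.ChineseCharacterBasedLexiconTraining \
    -lex chineseCharLex.ser.gz -test /path/to/ctb 271-300 -eval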
Use of edu.stanford.nlp.ling.StringLabel in project CoreNLP by stanfordnlp.
The testConstituents method of the class ConstituentTest.
public void testConstituents() {
  Set<Constituent> set = new HashSet<>();
  Constituent c1 = new LabeledScoredConstituent(9, 15, new StringLabel("S"), 0);
  Constituent c2 = new LabeledScoredConstituent(9, 15, new StringLabel("VP"), 0);
  // System.err.println("c1 "+c1+" c2 "+c2+" equal? "+c1.equals(c2));
  assertNotSame(c1, c2);
  set.add(c1);
  // System.err.println("Set has c1? "+set.contains(c1));
  // System.err.println("Set has c2? "+set.contains(c2));
  assertTrue(set.contains(c1));
  assertFalse(set.contains(c2));
  set.add(c2);
  // System.err.println("Set has c1? "+set.contains(c1));
  // System.err.println("Set has c2? "+set.contains(c2));
  assertTrue(set.contains(c1));
  assertTrue(set.contains(c2));
  // System.err.println("Set size is " + set.size());
  assertEquals(2, set.size());
  for (Constituent c : set) {
    // System.err.println(" "+c+" is c1? "+c.equals(c1)+" or "+c1.equals(c)+" is c2? "+c.equals(c2)+" or "+c2.equals(c));
    assertTrue(c.equals(c1) || c.equals(c2));
  }
  // There used to be a parallel test for Constituents in TreeSets,
  // but given that Constituent does not implement Comparable,
  // that test always failed.
}
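The behavior being tested follows from Constituent equality taking the label into account as well as the span, so two constituents over the same character span but with different labels are distinct members of a HashSet. The same point as a standalone sketch outside JUnit:

// Same span [9, 15], different labels: distinct constituents.
Constituent s = new LabeledScoredConstituent(9, 15, new StringLabel("S"), 0);
Constituent vp = new LabeledScoredConstituent(9, 15, new StringLabel("VP"), 0);
System.out.println(s.equals(vp));               // false: the labels differ
System.out.println(s.start() + "-" + s.end());  // 9-15, same span as vp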