Use of edu.stanford.nlp.parser.lexparser.TreebankLangParserParams in project CoreNLP by stanfordnlp.
The class TsarfatyEval, method main.
/**
* Run the scoring metric on guess/gold input. This method performs "Collinization."
* The default language is English.
*
* @param args Gold and guess treebank paths, preceded by any of the options listed in usage.
*/
public static void main(String[] args) {
if (args.length < minArgs) {
System.out.println(usage.toString());
System.exit(-1);
}
TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
int maxGoldYield = Integer.MAX_VALUE;
int maxGuessYield = Integer.MAX_VALUE;
boolean VERBOSE = false;
boolean skipGuess = false;
boolean tagMode = false;
String guessFile = null;
String goldFile = null;
for (int i = 0; i < args.length; i++) {
if (args[i].startsWith("-")) {
switch(args[i]) {
case "-l":
Language lang = Language.valueOf(args[++i].trim());
tlpp = lang.params;
break;
case "-y":
maxGoldYield = Integer.parseInt(args[++i].trim());
break;
case "-t":
tagMode = true;
break;
case "-v":
VERBOSE = true;
break;
case "-g":
maxGuessYield = Integer.parseInt(args[++i].trim());
skipGuess = true;
break;
default:
System.out.println(usage.toString());
System.exit(-1);
}
} else {
//Required parameters
goldFile = args[i++];
guessFile = args[i];
break;
}
}
final PrintWriter pwOut = tlpp.pw();
final Treebank guessTreebank = tlpp.diskTreebank();
guessTreebank.loadPath(guessFile);
pwOut.println("GUESS TREEBANK:");
pwOut.println(guessTreebank.textualSummary());
final Treebank goldTreebank = tlpp.diskTreebank();
goldTreebank.loadPath(goldFile);
pwOut.println("GOLD TREEBANK:");
pwOut.println(goldTreebank.textualSummary());
final String evalName = (tagMode) ? "TsarfatyTAG" : "TsarfatySEG";
final TsarfatyEval eval = new TsarfatyEval(evalName, tagMode);
final TreeTransformer tc = tlpp.collinizer();
//PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
//don't match, we need to keep looking for the next gold tree that matches.
//The evalb ref implementation differs slightly as it expects one tree per line. It assigns
//status as follows:
//
// 0 - Ok (yields match)
// 1 - length mismatch
// 2 - null parse e.g. (()).
//
//In cases 1 and 2, evalb does not include the tree pair in the LP/LR computation.
final Iterator<Tree> goldItr = goldTreebank.iterator();
int goldLineId = 0;
int skippedGuessTrees = 0;
for (final Tree guess : guessTreebank) {
final Tree evalGuess = tc.transformTree(guess);
final ArrayList<Label> guessSent = guess.yield();
final String guessChars = SentenceUtils.listToString(guessSent).replaceAll("\\s+", "");
if (guessSent.size() > maxGuessYield) {
skippedGuessTrees++;
continue;
}
boolean doneEval = false;
while (goldItr.hasNext() && !doneEval) {
final Tree gold = goldItr.next();
final Tree evalGold = tc.transformTree(gold);
goldLineId++;
final ArrayList<Label> goldSent = gold.yield();
final String goldChars = SentenceUtils.listToString(goldSent).replaceAll("\\s+", "");
if (goldSent.size() > maxGoldYield) {
continue;
} else if (goldChars.length() != guessChars.length()) {
pwOut.printf("Char level yield mismatch at line %d (guess: %d gold: %d)\n", goldLineId, guessChars.length(), goldChars.length());
skippedGuessTrees++;
//Default evalb behavior -- skip this guess tree
break;
}
eval.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
//Move to the next guess parse
doneEval = true;
}
}
pwOut.println("================================================================================");
if (skippedGuessTrees != 0)
pwOut.printf("%s %d guess trees\n", ((skipGuess) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
eval.display(true, pwOut);
pwOut.println();
pwOut.close();
}
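The alignment above hinges on comparing whitespace-stripped character yields: two trees are comparable for segmentation scoring only if they cover the same character string, even when gold and guess split those characters into different tokens. Below is a minimal, self-contained sketch of that check (plain JDK, no CoreNLP; the strings are made-up stand-ins for a segmented and an unsegmented form).

public class CharYieldCheck {

  // Collapse a tokenized string to its raw character yield.
  static String charYield(String tokenized) {
    return tokenized.replaceAll("\\s+", "");
  }

  public static void main(String[] args) {
    String gold = "ha bayit";  // hypothetical gold segmentation: two tokens
    String guess = "habayit";  // hypothetical guess: one token
    // Same characters, different segmentation: the pair is still evaluable.
    boolean comparable = charYield(gold).length() == charYield(guess).length();
    System.out.println("comparable: " + comparable);  // prints "comparable: true"
  }
}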
Use of edu.stanford.nlp.parser.lexparser.TreebankLangParserParams in project CoreNLP by stanfordnlp.
The class CollinsDepEval, method main.
/**
* Run the Collins-style dependency evaluation on guess/gold input.
*
* @param args Gold and guess treebank paths, preceded by any of the options listed in usage().
*/
public static void main(String[] args) {
if (args.length < MIN_ARGS) {
log.info(usage());
System.exit(-1);
}
Properties options = StringUtils.argsToProperties(args, optionArgDefs());
boolean VERBOSE = PropertiesUtils.getBool(options, "v", false);
Language LANGUAGE = PropertiesUtils.get(options, "l", Language.English, Language.class);
int MAX_GOLD_YIELD = PropertiesUtils.getInt(options, "g", Integer.MAX_VALUE);
int MAX_GUESS_YIELD = PropertiesUtils.getInt(options, "y", Integer.MAX_VALUE);
String[] parsedArgs = options.getProperty("", "").split("\\s+");
if (parsedArgs.length != MIN_ARGS) {
log.info(usage());
System.exit(-1);
}
File goldFile = new File(parsedArgs[0]);
File guessFile = new File(parsedArgs[1]);
final TreebankLangParserParams tlpp = LANGUAGE.params;
final PrintWriter pwOut = tlpp.pw();
final Treebank guessTreebank = tlpp.diskTreebank();
guessTreebank.loadPath(guessFile);
pwOut.println("GUESS TREEBANK:");
pwOut.println(guessTreebank.textualSummary());
final Treebank goldTreebank = tlpp.diskTreebank();
goldTreebank.loadPath(goldFile);
pwOut.println("GOLD TREEBANK:");
pwOut.println(goldTreebank.textualSummary());
final CollinsDepEval depEval = new CollinsDepEval("CollinsDep", true, tlpp.headFinder(), tlpp.treebankLanguagePack().startSymbol());
final TreeTransformer tc = tlpp.collinizer();
//PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
//don't match, we need to keep looking for the next gold tree that matches.
//The evalb ref implementation differs slightly as it expects one tree per line. It assigns
//status as follows:
//
// 0 - Ok (yields match)
// 1 - length mismatch
// 2 - null parse e.g. (()).
//
//In cases 1 and 2, evalb does not include the tree pair in the LP/LR computation.
final Iterator<Tree> goldItr = goldTreebank.iterator();
int goldLineId = 0;
int skippedGuessTrees = 0;
for (final Tree guess : guessTreebank) {
final Tree evalGuess = tc.transformTree(guess);
if (guess.yield().size() > MAX_GUESS_YIELD) {
skippedGuessTrees++;
continue;
}
boolean doneEval = false;
while (goldItr.hasNext() && !doneEval) {
final Tree gold = goldItr.next();
final Tree evalGold = tc.transformTree(gold);
goldLineId++;
if (gold.yield().size() > MAX_GOLD_YIELD) {
continue;
} else if (evalGold.yield().size() != evalGuess.yield().size()) {
pwOut.println("Yield mismatch at gold line " + goldLineId);
skippedGuessTrees++;
//Default evalb behavior -- skip this guess tree
break;
}
depEval.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
//Move to the next guess parse
doneEval = true;
}
}
pwOut.println("================================================================================");
if (skippedGuessTrees != 0)
pwOut.printf("%s %d guess trees\n", ((MAX_GUESS_YIELD < Integer.MAX_VALUE) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
depEval.display(true, pwOut);
pwOut.close();
}
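This method and TsarfatyEval.main above share the same alignment pattern: for each guess tree, advance through the gold iterator until a yield-matched partner appears, skipping oversized gold trees and abandoning the guess tree at the first genuine mismatch. A hedged sketch of that control flow, with plain integers standing in for tree yields (all values here are made up):

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class AlignByYield {

  public static void main(String[] args) {
    List<Integer> guessYields = Arrays.asList(3, 5, 2);
    List<Integer> goldYields = Arrays.asList(3, 9, 5, 2);  // 9 exceeds the cutoff below
    int maxGoldYield = 8;
    Iterator<Integer> goldItr = goldYields.iterator();
    for (int guess : guessYields) {
      boolean doneEval = false;
      while (goldItr.hasNext() && !doneEval) {
        int gold = goldItr.next();
        if (gold > maxGoldYield) {
          continue;  // skip oversized gold trees, as the loops above do
        } else if (gold != guess) {
          break;     // yield mismatch: skip this guess tree (default evalb behavior)
        }
        System.out.println("evaluating pair with yield " + guess);
        doneEval = true;  // move to the next guess tree
      }
    }
  }
}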
Use of edu.stanford.nlp.parser.lexparser.TreebankLangParserParams in project CoreNLP by stanfordnlp.
The class LeafAncestorEval, method main.
/**
* Execute with no arguments for usage.
*/
public static void main(String[] args) {
if (!validateCommandLine(args)) {
log.info(USAGE);
System.exit(-1);
}
final TreebankLangParserParams tlpp = LANGUAGE.params;
final PrintWriter pwOut = tlpp.pw();
final Treebank guessTreebank = tlpp.diskTreebank();
guessTreebank.loadPath(guessFile);
pwOut.println("GUESS TREEBANK:");
pwOut.println(guessTreebank.textualSummary());
final Treebank goldTreebank = tlpp.diskTreebank();
goldTreebank.loadPath(goldFile);
pwOut.println("GOLD TREEBANK:");
pwOut.println(goldTreebank.textualSummary());
final LeafAncestorEval metric = new LeafAncestorEval("LeafAncestor");
final TreeTransformer tc = tlpp.collinizer();
//The evalb ref implementation assigns status for each tree pair as follows:
//
// 0 - Ok (yields match)
// 1 - length mismatch
// 2 - null parse e.g. (()).
//
//In cases 1 and 2, evalb does not include the tree pair in the LP/LR computation.
final Iterator<Tree> goldItr = goldTreebank.iterator();
final Iterator<Tree> guessItr = guessTreebank.iterator();
int goldLineId = 0;
int guessLineId = 0;
int skippedGuessTrees = 0;
while (guessItr.hasNext() && goldItr.hasNext()) {
Tree guessTree = guessItr.next();
List<Label> guessYield = guessTree.yield();
guessLineId++;
Tree goldTree = goldItr.next();
List<Label> goldYield = goldTree.yield();
goldLineId++;
// Check that we should evaluate this tree
if (goldYield.size() > MAX_GOLD_YIELD) {
skippedGuessTrees++;
continue;
}
// Only trees with equal yields can be evaluated
if (goldYield.size() != guessYield.size()) {
pwOut.printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.size(), guessYield.size(), goldLineId, guessLineId);
skippedGuessTrees++;
continue;
}
final Tree evalGuess = tc.transformTree(guessTree);
final Tree evalGold = tc.transformTree(goldTree);
metric.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
}
if (guessItr.hasNext() || goldItr.hasNext()) {
System.err.printf("Guess/gold files do not have equal lengths (guess: %d gold: %d)%n.", guessLineId, goldLineId);
}
pwOut.println("================================================================================");
if (skippedGuessTrees != 0)
pwOut.printf("%s %d guess trees%n", "Unable to evaluate", skippedGuessTrees);
metric.display(true, pwOut);
pwOut.close();
}
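For context, the leaf-ancestor metric represents each leaf by the sequence of category labels on the path from the root down to that leaf, and scores gold against guess per leaf. The following is an illustrative sketch of the path-extraction idea only, not CoreNLP's implementation (it includes the leaf itself in the path and omits the metric's actual per-path scoring):

import java.util.ArrayList;
import java.util.List;

public class LeafAncestorPaths {

  // A tiny stand-in tree node, defined here so the sketch is self-contained.
  static class Node {
    final String label;
    final List<Node> children = new ArrayList<>();
    Node(String label, Node... kids) {
      this.label = label;
      for (Node kid : kids) children.add(kid);
    }
    boolean isLeaf() { return children.isEmpty(); }
  }

  // Depth-first walk that records the root-to-leaf label path for every leaf.
  static void collectPaths(Node node, List<String> prefix, List<List<String>> out) {
    List<String> path = new ArrayList<>(prefix);
    path.add(node.label);
    if (node.isLeaf()) {
      out.add(path);
      return;
    }
    for (Node child : node.children) collectPaths(child, path, out);
  }

  public static void main(String[] args) {
    // (S (NP the cat) (VP sat))
    Node tree = new Node("S",
        new Node("NP", new Node("the"), new Node("cat")),
        new Node("VP", new Node("sat")));
    List<List<String>> paths = new ArrayList<>();
    collectPaths(tree, new ArrayList<>(), paths);
    for (List<String> path : paths) System.out.println(path);  // e.g. [S, NP, the]
  }
}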
Use of edu.stanford.nlp.parser.lexparser.TreebankLangParserParams in project CoreNLP by stanfordnlp.
The class GrammaticalStructureConversionUtils, method convertTrees.
/**
* Given sentences or trees, output the typed dependencies.
* <p>
* By default, the method outputs the collapsed typed dependencies with
* processing of conjuncts. The input can be given as plain text (one sentence
* per line) using the option -sentFile, or as trees using the option
* -treeFile. For -sentFile, the input has to be strictly one sentence per
* line. You can specify where to find a parser with -parserFile
* serializedParserPath. See LexicalizedParser for more flexible processing of
* text files (including with Stanford Dependencies output). The above options
* assume a file as input. You can also feed trees (only) via stdin by using
* the option -filter. If one does not specify a -parserFile, one
* can specify which language pack to use with -tLPP. This option
* specifies a class which determines which GrammaticalStructure to
* use, which HeadFinder to use, etc. It will default to
* edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams,
* but any TreebankLangParserParams can be specified.
* <p>
* If no method of producing trees is given other than to use the
* LexicalizedParser, and no parser is specified, the default English
* parser is used. You can specify options to load
* with the parser using the -parserOpts flag. If the default
* parser is used, and no options are provided, the option
* -retainTmpSubcategories is used.
* <p>
* The following options can be used to specify the types of dependencies
* wanted: </p>
* <ul>
* <li> -collapsed collapsed dependencies
* <li> -basic non-collapsed dependencies that preserve a tree structure
* <li> -nonCollapsed non-collapsed dependencies that do not preserve a tree
* structure (the basic dependencies plus the extra ones)
* <li> -CCprocessed
* collapsed dependencies and conjunctions processed (dependencies are added
* for each conjunct) -- this is the default if no options are passed
* <li> -collapsedTree collapsed dependencies retaining a tree structure
* <li> -makeCopulaHead Contrary to the approach argued for in the SD papers,
* nevertheless make the verb 'to be' the head, not the predicate noun, adjective,
* etc. (However, when the verb 'to be' is used as an auxiliary verb, the main
* verb is still treated as the head.)
* <li> -originalDependencies generate the dependencies using the original converter
* instead of the Universal Dependencies converter.
* </ul>
* <p>
* The {@code -conllx} option will output the dependencies in the CoNLL format,
* instead of in the standard Stanford format (relation(governor,dependent))
* and will retain punctuation by default.
* When used in the "collapsed" format, words such as prepositions, conjunctions
* which get collapsed into the grammatical relations and are not part of the
* sentence per se anymore will be annotated with "erased" as grammatical relation
* and attached to the fake "ROOT" node with index 0.
* <p/><p>
* There is also an option to retain dependencies involving punctuation:
* {@code -keepPunct}
* </p><p>
* The {@code -extraSep} option used with -nonCollapsed will print the basic
* dependencies first, then a separator ======, and then the extra
* dependencies that do not preserve the tree structure. The -test option is
* used for debugging: it prints the grammatical structure, as well as the
* basic, collapsed and CCprocessed dependencies. It also checks the
* connectivity of the collapsed dependencies. If the collapsed dependencies
* list doesn't constitute a connected graph, it prints the possible offending
* nodes (one of them is the real root of the graph).
* </p><p>
* Using the -conllxFile option, you can pass a file containing Stanford dependencies
* in the CoNLL format (e.g., the basic dependencies), and obtain another
* representation using one of the representation options.
* </p><p>
* Usage: <br>
* <code>java edu.stanford.nlp.trees.GrammaticalStructure [-treeFile FILE | -sentFile FILE | -conllxFile FILE | -filter] <br>
* [-collapsed -basic -CCprocessed -test -generateOriginalDependencies]</code>
*
* @param args Command-line arguments, as above
*/
@SuppressWarnings("unchecked")
public static void convertTrees(String[] args, String defaultLang) {
/* Use a tree normalizer that removes all empty nodes.
This prevents wrong indexing of the nodes in the dependency relations. */
Iterable<GrammaticalStructure> gsBank = null;
Properties props = StringUtils.argsToProperties(args);
String language = props.getProperty("language", defaultLang);
ConverterOptions opts = ConverterOptions.getConverterOptions(language);
MemoryTreebank tb = new MemoryTreebank(opts.treeNormalizer);
Iterable<Tree> trees = tb;
String encoding = props.getProperty("encoding", "utf-8");
try {
System.setOut(new PrintStream(System.out, true, encoding));
} catch (IOException e) {
throw new RuntimeException(e);
}
String treeFileName = props.getProperty("treeFile");
String sentFileName = props.getProperty("sentFile");
String conllXFileName = props.getProperty("conllxFile");
String altDepPrinterName = props.getProperty("altprinter");
String altDepReaderName = props.getProperty("altreader");
String altDepReaderFilename = props.getProperty("altreaderfile");
String filter = props.getProperty("filter");
boolean makeCopulaHead = props.getProperty("makeCopulaHead") != null;
boolean generateOriginalDependencies = props.getProperty("originalDependencies") != null || opts.stanfordDependencies;
// TODO: if a parser is specified, load this from the parser
// instead of ever loading it from this way
String tLPP = props.getProperty("tLPP", opts.tlPPClassName);
TreebankLangParserParams params = ReflectionLoading.loadByReflection(tLPP);
params.setGenerateOriginalDependencies(generateOriginalDependencies);
if (makeCopulaHead) {
// TODO: generalize and allow for more options
String[] options = { "-makeCopulaHead" };
params.setOptionFlag(options, 0);
}
if (sentFileName == null && (altDepReaderName == null || altDepReaderFilename == null) && treeFileName == null && conllXFileName == null && filter == null) {
try {
System.err.printf("Usage: java %s%n", GrammaticalStructure.class.getCanonicalName());
System.err.println("Options:");
System.err.println(" Dependency representation:");
System.err.println(" -basic:\t\tGenerate basic dependencies.");
System.err.println(" -enhanced:\t\tGenerate enhanced dependencies, currently only implemented for English UD.");
System.err.println(" -enhanced++:\tGenerate enhanced++ dependencies (default), currently only implemented for English UD.");
System.err.println(" -collapsed:\t\tGenerate collapsed dependencies, deprecated.");
System.err.println(" -CCprocessed:\tGenerate CC-processed dependencies, deprecated.");
System.err.println(" -collapsedTree:\tGenerate collapsed-tree dependencies, deprecated.");
System.err.println("");
System.err.println(" Input:");
System.err.println(" -treeFile <FILE>:\tConvert from constituency trees in <FILE>");
System.err.println(" -sentFile <FILE>:\tParse and convert sentences from <FILE>. Only implemented for English.");
System.err.println("");
System.err.println(" Output:");
System.err.println(" -conllx:\t\tOutput dependencies in CoNLL format.");
System.err.println("");
System.err.println(" Language:");
System.err.println(" -language [en|zh|en-sd|zh-sd]:\t (Universal English Dependencies, Universal Chinese Dependencies, English Stanford Dependencies, Chinese Stanford Dependencies)");
System.err.println("");
System.err.println("");
System.err.println("");
System.err.println("Example:");
TreeReader tr = new PennTreeReader(new StringReader("((S (NP (NNP Sam)) (VP (VBD died) (NP-TMP (NN today)))))"));
tb.add(tr.readTree());
} catch (Exception e) {
log.info("Horrible error: " + e);
e.printStackTrace();
}
} else if (altDepReaderName != null && altDepReaderFilename != null) {
DependencyReader altDepReader = loadAlternateDependencyReader(altDepReaderName);
try {
gsBank = altDepReader.readDependencies(altDepReaderFilename);
} catch (IOException e) {
log.info("Error reading " + altDepReaderFilename);
return;
}
} else if (treeFileName != null) {
tb.loadPath(treeFileName);
} else if (filter != null) {
tb.load(IOUtils.readerFromStdin());
} else if (conllXFileName != null) {
try {
gsBank = params.readGrammaticalStructureFromFile(conllXFileName);
} catch (RuntimeIOException e) {
log.info("Error reading " + conllXFileName);
return;
}
} else {
String parserFile = props.getProperty("parserFile");
String parserOpts = props.getProperty("parserOpts");
boolean tokenized = props.getProperty("tokenized") != null;
Function<List<? extends HasWord>, Tree> lp = loadParser(parserFile, parserOpts, makeCopulaHead);
trees = new LazyLoadTreesByParsing(sentFileName, encoding, tokenized, lp);
// Get the language pack parameters via reflection so that this class doesn't
// necessarily have to use LexicalizedParser
try {
Method method = lp.getClass().getMethod("getTLPParams");
params = (TreebankLangParserParams) method.invoke(lp);
params.setGenerateOriginalDependencies(generateOriginalDependencies);
} catch (Exception cnfe) {
throw new RuntimeException(cnfe);
}
}
// treats the output according to the options passed
boolean basic = props.getProperty("basic") != null;
boolean collapsed = props.getProperty("collapsed") != null;
boolean CCprocessed = props.getProperty("CCprocessed") != null;
boolean collapsedTree = props.getProperty("collapsedTree") != null;
boolean nonCollapsed = props.getProperty("nonCollapsed") != null;
boolean extraSep = props.getProperty("extraSep") != null;
boolean parseTree = props.getProperty("parseTree") != null;
boolean test = props.getProperty("test") != null;
//always keep punctuation marks
boolean keepPunct = true;
boolean conllx = props.getProperty("conllx") != null;
// todo: Support checkConnected on more options (including basic)
boolean checkConnected = props.getProperty("checkConnected") != null;
boolean portray = props.getProperty("portray") != null;
boolean enhanced = props.getProperty("enhanced") != null;
boolean enhancedPlusPlus = props.getProperty("enhanced++") != null;
// If requested load alternative printer
DependencyPrinter altDepPrinter = null;
if (altDepPrinterName != null) {
altDepPrinter = loadAlternateDependencyPrinter(altDepPrinterName);
}
// log.info("First tree in tb is");
// log.info(((MemoryTreebank) tb).get(0));
Method m = null;
if (test) {
// Do this by reflection to avoid this becoming a dependency when we distribute the parser
try {
Class sgf = Class.forName("edu.stanford.nlp.semgraph.SemanticGraphFactory");
m = sgf.getDeclaredMethod("makeFromTree", GrammaticalStructure.class, boolean.class, boolean.class, boolean.class, boolean.class, boolean.class, boolean.class, Predicate.class, String.class, int.class);
} catch (Exception e) {
log.info("Test cannot check for cycles in tree format (classes not available)");
}
}
if (gsBank == null) {
gsBank = new TreeBankGrammaticalStructureWrapper(trees, keepPunct, params);
}
for (GrammaticalStructure gs : gsBank) {
Tree tree;
if (gsBank instanceof TreeBankGrammaticalStructureWrapper) {
// log.info("Using TreeBankGrammaticalStructureWrapper branch");
tree = ((TreeBankGrammaticalStructureWrapper) gsBank).getOriginalTree(gs);
// log.info("Tree is: ");
// log.info(t);
} else {
// log.info("Using gs.root() branch");
// recover tree
tree = gs.root();
// log.info("Tree from gs is");
// log.info(t);
}
if (test) {
// print the grammatical structure, the basic, collapsed and CCprocessed
System.out.println("============= parse tree =======================");
tree.pennPrint();
System.out.println();
System.out.println("------------- GrammaticalStructure -------------");
System.out.println(gs);
boolean allConnected = true;
boolean connected;
Collection<TypedDependency> bungRoots = null;
System.out.println("------------- basic dependencies ---------------");
List<TypedDependency> gsb = gs.typedDependencies(GrammaticalStructure.Extras.NONE);
System.out.println(StringUtils.join(gsb, "\n"));
connected = GrammaticalStructure.isConnected(gsb);
if (!connected && bungRoots == null) {
bungRoots = GrammaticalStructure.getRoots(gsb);
}
allConnected = connected && allConnected;
System.out.println("------------- non-collapsed dependencies (basic + extra) ---------------");
List<TypedDependency> gse = gs.typedDependencies(GrammaticalStructure.Extras.MAXIMAL);
System.out.println(StringUtils.join(gse, "\n"));
connected = GrammaticalStructure.isConnected(gse);
if (!connected && bungRoots == null) {
bungRoots = GrammaticalStructure.getRoots(gse);
}
allConnected = connected && allConnected;
System.out.println("------------- collapsed dependencies -----------");
System.out.println(StringUtils.join(gs.typedDependenciesCollapsed(GrammaticalStructure.Extras.MAXIMAL), "\n"));
System.out.println("------------- collapsed dependencies tree -----------");
System.out.println(StringUtils.join(gs.typedDependenciesCollapsedTree(), "\n"));
System.out.println("------------- CCprocessed dependencies --------");
List<TypedDependency> gscc = gs.typedDependenciesCollapsed(GrammaticalStructure.Extras.MAXIMAL);
System.out.println(StringUtils.join(gscc, "\n"));
System.out.println("-----------------------------------------------");
// connectivity tests
connected = GrammaticalStructure.isConnected(gscc);
if (!connected && bungRoots == null) {
bungRoots = GrammaticalStructure.getRoots(gscc);
}
allConnected = connected && allConnected;
if (allConnected) {
System.out.println("dependencies form connected graphs.");
} else {
System.out.println("dependency graph NOT connected! possible offending nodes: " + bungRoots);
}
// libraries
if (m != null) {
try {
// the first arg is null because it's a static method....
Object semGraph = m.invoke(null, gs, false, true, false, false, false, false, null, null, 0);
Class sg = Class.forName("edu.stanford.nlp.semgraph.SemanticGraph");
Method mDag = sg.getDeclaredMethod("isDag");
boolean isDag = (Boolean) mDag.invoke(semGraph);
System.out.println("tree dependencies form a DAG: " + isDag);
} catch (Exception e) {
e.printStackTrace();
}
}
} else { // end of "test" output
if (parseTree) {
System.out.println("============= parse tree =======================");
tree.pennPrint();
System.out.println();
}
if (basic) {
if (collapsed || CCprocessed || collapsedTree || nonCollapsed || enhanced || enhancedPlusPlus) {
System.out.println("------------- basic dependencies ---------------");
}
if (altDepPrinter == null) {
printDependencies(gs, gs.typedDependencies(GrammaticalStructure.Extras.NONE), tree, conllx, false, opts.convertToUPOS);
} else {
System.out.println(altDepPrinter.dependenciesToString(gs, gs.typedDependencies(GrammaticalStructure.Extras.NONE), tree));
}
}
if (nonCollapsed) {
if (basic || CCprocessed || collapsed || collapsedTree) {
System.out.println("----------- non-collapsed dependencies (basic + extra) -----------");
}
printDependencies(gs, gs.allTypedDependencies(), tree, conllx, extraSep, opts.convertToUPOS);
}
if (collapsed) {
if (basic || CCprocessed || collapsedTree || nonCollapsed) {
System.out.println("----------- collapsed dependencies -----------");
}
printDependencies(gs, gs.typedDependenciesCollapsed(GrammaticalStructure.Extras.MAXIMAL), tree, conllx, false, opts.convertToUPOS);
}
if (CCprocessed) {
if (basic || collapsed || collapsedTree || nonCollapsed) {
System.out.println("---------- CCprocessed dependencies ----------");
}
List<TypedDependency> deps = gs.typedDependenciesCCprocessed(GrammaticalStructure.Extras.MAXIMAL);
if (checkConnected) {
if (!GrammaticalStructure.isConnected(deps)) {
log.info("Graph is not connected for:");
log.info(tree);
log.info("possible offending nodes: " + GrammaticalStructure.getRoots(deps));
}
}
printDependencies(gs, deps, tree, conllx, false, opts.convertToUPOS);
}
if (collapsedTree) {
if (basic || CCprocessed || collapsed || nonCollapsed) {
System.out.println("----------- collapsed dependencies tree -----------");
}
printDependencies(gs, gs.typedDependenciesCollapsedTree(), tree, conllx, false, opts.convertToUPOS);
}
if (enhanced) {
if (basic || enhancedPlusPlus) {
System.out.println("----------- enhanced dependencies tree -----------");
}
printDependencies(gs, gs.typedDependenciesEnhanced(), tree, conllx, false, opts.convertToUPOS);
}
if (enhancedPlusPlus) {
if (basic || enhanced) {
System.out.println("----------- enhanced++ dependencies tree -----------");
}
printDependencies(gs, gs.typedDependenciesEnhancedPlusPlus(), tree, conllx, false, opts.convertToUPOS);
}
// default use: enhanced++ for UD, CCprocessed for SD (to parallel what happens within the parser)
if (!basic && !collapsed && !CCprocessed && !collapsedTree && !nonCollapsed && !enhanced && !enhancedPlusPlus) {
if (generateOriginalDependencies) {
printDependencies(gs, gs.typedDependenciesCCprocessed(GrammaticalStructure.Extras.MAXIMAL), tree, conllx, false, opts.convertToUPOS);
} else {
printDependencies(gs, gs.typedDependenciesEnhancedPlusPlus(), tree, conllx, false, opts.convertToUPOS);
}
}
}
if (portray) {
try {
// put up a window showing it
Class sgu = Class.forName("edu.stanford.nlp.semgraph.SemanticGraphUtils");
Method mRender = sgu.getDeclaredMethod("render", GrammaticalStructure.class, String.class);
// the first arg is null because it's a static method....
mRender.invoke(null, gs, "Collapsed, CC processed deps");
} catch (Exception e) {
throw new RuntimeException("Couldn't use swing to portray semantic graph", e);
}
}
}
// end for
}
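Note how convertTrees resolves SemanticGraphFactory and SemanticGraphUtils through reflection so that the semgraph classes remain an optional, runtime-only dependency. A minimal sketch of that pattern, using java.lang.Math as a stand-in for the optional class so the example runs anywhere:

import java.lang.reflect.Method;

public class OptionalDependencyCall {

  public static void main(String[] args) {
    try {
      // Resolve the class and method by name instead of importing them.
      Class<?> cls = Class.forName("java.lang.Math");
      Method max = cls.getDeclaredMethod("max", int.class, int.class);
      // The first argument is null because it's a static method.
      Object result = max.invoke(null, 3, 7);
      System.out.println("result = " + result);  // prints "result = 7"
    } catch (ReflectiveOperationException e) {
      // With a genuinely optional dependency, degrade gracefully here.
      System.out.println("optional dependency not available: " + e);
    }
  }
}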
Use of edu.stanford.nlp.parser.lexparser.TreebankLangParserParams in project CoreNLP by stanfordnlp.
The class TreebankFactoredLexiconStats, method main.
// private static String stripTag(String tag) {
// if (tag.startsWith("DT")) {
// String newTag = tag.substring(2, tag.length());
// return newTag.length() > 0 ? newTag : tag;
// }
// return tag;
// }
/**
* @param args Language name, treebank path, and a comma-separated list of morphological features.
*/
public static void main(String[] args) {
if (args.length != 3) {
System.err.printf("Usage: java %s language filename features%n", TreebankFactoredLexiconStats.class.getName());
System.exit(-1);
}
Language language = Language.valueOf(args[0]);
TreebankLangParserParams tlpp = language.params;
if (language.equals(Language.Arabic)) {
String[] options = { "-arabicFactored" };
tlpp.setOptionFlag(options, 0);
} else {
String[] options = { "-frenchFactored" };
tlpp.setOptionFlag(options, 0);
}
Treebank tb = tlpp.diskTreebank();
tb.loadPath(args[1]);
MorphoFeatureSpecification morphoSpec = language.equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();
String[] features = args[2].trim().split(",");
for (String feature : features) {
morphoSpec.activate(MorphoFeatureType.valueOf(feature));
}
// Counters
Counter<String> wordTagCounter = new ClassicCounter<>(30000);
Counter<String> morphTagCounter = new ClassicCounter<>(500);
// Counter<String> signatureTagCounter = new ClassicCounter<String>();
Counter<String> morphCounter = new ClassicCounter<>(500);
Counter<String> wordCounter = new ClassicCounter<>(30000);
Counter<String> tagCounter = new ClassicCounter<>(300);
Counter<String> lemmaCounter = new ClassicCounter<>(25000);
Counter<String> lemmaTagCounter = new ClassicCounter<>(25000);
Counter<String> richTagCounter = new ClassicCounter<>(1000);
Counter<String> reducedTagCounter = new ClassicCounter<>(500);
Counter<String> reducedTagLemmaCounter = new ClassicCounter<>(500);
Map<String, Set<String>> wordLemmaMap = Generics.newHashMap();
TwoDimensionalIntCounter<String, String> lemmaReducedTagCounter = new TwoDimensionalIntCounter<>(30000);
TwoDimensionalIntCounter<String, String> reducedTagTagCounter = new TwoDimensionalIntCounter<>(500);
TwoDimensionalIntCounter<String, String> tagReducedTagCounter = new TwoDimensionalIntCounter<>(300);
int numTrees = 0;
for (Tree tree : tb) {
for (Tree subTree : tree) {
if (!subTree.isLeaf()) {
tlpp.transformTree(subTree, tree);
}
}
List<Label> pretermList = tree.preTerminalYield();
List<Label> yield = tree.yield();
assert yield.size() == pretermList.size();
int yieldLen = yield.size();
for (int i = 0; i < yieldLen; ++i) {
String tag = pretermList.get(i).value();
String word = yield.get(i).value();
String morph = ((CoreLabel) yield.get(i)).originalText();
// Note: if there is no lemma, then we use the surface form.
Pair<String, String> lemmaTag = MorphoFeatureSpecification.splitMorphString(word, morph);
String lemma = lemmaTag.first();
String richTag = lemmaTag.second();
// WSGDEBUG
if (tag.contains("MW"))
lemma += "-MWE";
lemmaCounter.incrementCount(lemma);
lemmaTagCounter.incrementCount(lemma + tag);
richTagCounter.incrementCount(richTag);
String reducedTag = morphoSpec.strToFeatures(richTag).toString();
reducedTagCounter.incrementCount(reducedTag);
reducedTagLemmaCounter.incrementCount(reducedTag + lemma);
wordTagCounter.incrementCount(word + tag);
morphTagCounter.incrementCount(morph + tag);
morphCounter.incrementCount(morph);
wordCounter.incrementCount(word);
tagCounter.incrementCount(tag);
reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
if (wordLemmaMap.containsKey(word)) {
wordLemmaMap.get(word).add(lemma);
} else {
Set<String> lemmas = Generics.newHashSet(1);
wordLemmaMap.put(word, lemmas);
}
lemmaReducedTagCounter.incrementCount(lemma, reducedTag);
reducedTagTagCounter.incrementCount(lemma + reducedTag, tag);
tagReducedTagCounter.incrementCount(tag, reducedTag);
}
++numTrees;
}
// Barf...
System.out.println("Language: " + language.toString());
System.out.printf("#trees:\t%d%n", numTrees);
System.out.printf("#tokens:\t%d%n", (int) wordCounter.totalCount());
System.out.printf("#words:\t%d%n", wordCounter.keySet().size());
System.out.printf("#tags:\t%d%n", tagCounter.keySet().size());
System.out.printf("#wordTagPairs:\t%d%n", wordTagCounter.keySet().size());
System.out.printf("#lemmas:\t%d%n", lemmaCounter.keySet().size());
System.out.printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.keySet().size());
System.out.printf("#feattags:\t%d%n", reducedTagCounter.keySet().size());
System.out.printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.keySet().size());
System.out.printf("#richtags:\t%d%n", richTagCounter.keySet().size());
System.out.printf("#richtag+lemma:\t%d%n", morphCounter.keySet().size());
System.out.printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.keySet().size());
// Extra
System.out.println("==================");
StringBuilder sbNoLemma = new StringBuilder();
StringBuilder sbMultLemmas = new StringBuilder();
for (Map.Entry<String, Set<String>> wordLemmas : wordLemmaMap.entrySet()) {
String word = wordLemmas.getKey();
Set<String> lemmas = wordLemmas.getValue();
if (lemmas.size() == 0) {
sbNoLemma.append("NO LEMMAS FOR WORD: " + word + "\n");
continue;
}
if (lemmas.size() > 1) {
sbMultLemmas.append("MULTIPLE LEMMAS: " + word + " " + setToString(lemmas) + "\n");
continue;
}
String lemma = lemmas.iterator().next();
Set<String> reducedTags = lemmaReducedTagCounter.getCounter(lemma).keySet();
if (reducedTags.size() > 1) {
System.out.printf("%s --> %s%n", word, lemma);
for (String reducedTag : reducedTags) {
int count = lemmaReducedTagCounter.getCount(lemma, reducedTag);
String posTags = setToString(reducedTagTagCounter.getCounter(lemma + reducedTag).keySet());
System.out.printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
}
System.out.println();
}
}
System.out.println("==================");
System.out.println(sbNoLemma.toString());
System.out.println(sbMultLemmas.toString());
System.out.println("==================");
List<String> tags = new ArrayList<>(tagReducedTagCounter.firstKeySet());
Collections.sort(tags);
for (String tag : tags) {
System.out.println(tag);
Set<String> reducedTags = tagReducedTagCounter.getCounter(tag).keySet();
for (String reducedTag : reducedTags) {
int count = tagReducedTagCounter.getCount(tag, reducedTag);
// reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
System.out.printf("\t%s\t%d%n", reducedTag, count);
}
System.out.println();
}
System.out.println("==================");
}
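Most of the bookkeeping above is two-dimensional counting (lemmaReducedTagCounter, reducedTagTagCounter, tagReducedTagCounter). A plain-JDK sketch of the same pattern using nested maps, with made-up (lemma, tag) observations in place of treebank data:

import java.util.HashMap;
import java.util.Map;

public class TwoDimCount {

  public static void main(String[] args) {
    // Outer key: lemma; inner key: reduced tag; value: count.
    Map<String, Map<String, Integer>> counts = new HashMap<>();
    String[][] observations = { { "walk", "V.PRES" }, { "walk", "V.PAST" }, { "walk", "V.PRES" } };
    for (String[] obs : observations) {
      counts.computeIfAbsent(obs[0], k -> new HashMap<>())
            .merge(obs[1], 1, Integer::sum);
    }
    System.out.println(counts);  // e.g. {walk={V.PRES=2, V.PAST=1}}
  }
}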