use of edu.stanford.nlp.trees.Tree in project lucida by claritylab.
In the class StanfordParser, the method mapOffsets.
/**
 * Maps Tree node offsets using provided mapping.
 *
 * <p>Each non-leaf node's BEGIN_KEY/END_KEY extents (stored in its
 * {@code MapLabel}) are translated back into source space using the given
 * list of {@code RangeMap}s. Nodes are visited in the Tree's iteration
 * order and the mapping list is consumed front-to-back, so the mapping is
 * assumed to be sorted by range — TODO confirm with callers.
 *
 * @param tree the Tree whose begin and end extents should be mapped.
 * @param mapping the list of RangeMap objects which defines the mapping.
 */
protected static void mapOffsets(Tree tree, List<RangeMap> mapping) {
    // if mapping is empty, then assume 1-to-1 mapping.
    if (mapping == null || mapping.size() == 0)
        return;
    int begin_map_index = 0;
    RangeMap begin_rmap = mapping.get(begin_map_index);
    TREE: for (Tree t : tree) {
        // Leaves carry no extents of their own; only interior nodes hold
        // a MapLabel with BEGIN_KEY/END_KEY.
        if (t.isLeaf())
            continue;
        MapLabel label = (MapLabel) t.label();
        int begin = (Integer) label.get(BEGIN_KEY);
        // "end" must be index of last char in range
        int end = (Integer) label.get(END_KEY) - 1;
        // Advance to the first RangeMap whose end lies beyond this node's
        // begin offset; if we run off the end of the mapping list, no
        // remaining node can be mapped, so stop the whole traversal.
        while (begin_rmap.end <= begin) {
            begin_map_index++;
            if (begin_map_index >= mapping.size())
                break TREE;
            begin_rmap = mapping.get(begin_map_index);
        }
        // The current range map starts after this node ends: the node lies
        // entirely in a gap between mapped ranges, so leave its offsets
        // untouched (i.e. treat the gap as a 1-to-1 mapping).
        if (begin_rmap.begin > end) {
            continue;
        }
        // if beginning of current annotation falls within current range
        // map, then map it back to source space.
        int new_begin = begin;
        if (begin_rmap.begin <= new_begin) {
            new_begin = begin_rmap.map(new_begin);
        }
        // find the first rangemap whose end is greater than the end of
        // current annotation.
        int end_map_index = begin_map_index;
        RangeMap end_rmap = begin_rmap;
        END_OFFSET: while (end_rmap.end <= end) {
            end_map_index++;
            if (end_map_index >= mapping.size())
                break END_OFFSET;
            end_rmap = mapping.get(end_map_index);
        }
        // if end of current annotation falls within "end" range map,
        // then map it back to source space.
        int new_end = end;
        if (end_rmap.begin <= end) {
            new_end = end_rmap.map(end);
        }
        // Store mapped extents; the +1 restores END_KEY to an exclusive
        // offset, undoing the last-char adjustment made above.
        label.put(BEGIN_KEY, new_begin);
        label.put(END_KEY, new_end + 1);
    }
}
use of edu.stanford.nlp.trees.Tree in project lucida by claritylab.
In the class StanfordParser, the method updateTreeLabels.
/**
 * Recursively annotates every interior node of {@code tree} with begin/end
 * character extents (relative to the source text) by wrapping its label in
 * a {@code MapLabel}.
 *
 * <p>The running {@code offset} accumulates corrections for tokens whose
 * surface form differs in length from the parser's rendering (re-escaped
 * quotes, bracket tokens such as -LRB-, and backslash-escaped characters).
 * A node's begin extent is shifted by the offset accumulated BEFORE its
 * subtree is processed; its end extent by the offset accumulated AFTER.
 *
 * @param root      tree root used to compute absolute character edges
 * @param tree      current subtree being annotated
 * @param offset    running character-offset correction (mutated in place)
 * @param leafIndex running count of leaves seen so far (mutated in place)
 */
protected static void updateTreeLabels(Tree root, Tree tree, MutableInteger offset, MutableInteger leafIndex) {
    // Leaves get no extents of their own; just advance the leaf counter.
    if (tree.isLeaf()) {
        leafIndex.value++;
        return;
    }
    String nodeTag = tree.label().value().toUpperCase();
    int spanStart = root.leftCharEdge(tree);
    int spanEnd = root.rightCharEdge(tree);
    int spanLength = spanEnd - spanStart;
    // Begin extent uses the correction accumulated so far.
    spanStart += offset.value;
    // Accumulate the offset delta implied by this node's label/token.
    if (double_quote_lable_pattern.matcher(nodeTag).matches() && spanLength > 1) {
        offset.value--;
        log.debug("Quotes label pattern fired: " + offset);
    } else if (bracket_label_pattern.matcher(nodeTag).matches()) {
        offset.value -= 4;
        log.debug("Bracket label pattern fired: " + offset);
    } else if (tree.isPreTerminal()) {
        Tree terminal = tree.firstChild();
        String terminalText = terminal.label().value();
        // Each escaped character shrinks the source span by one.
        Matcher escapes = escaped_char_pattern.matcher(terminalText);
        while (escapes.find()) {
            offset.value--;
        }
    }
    for (Tree kid : tree.children()) {
        updateTreeLabels(root, kid, offset, leafIndex);
    }
    // End extent uses the correction accumulated including this subtree.
    spanEnd += offset.value;
    MapLabel spanLabel = new MapLabel(tree.label());
    spanLabel.put(BEGIN_KEY, spanStart);
    spanLabel.put(END_KEY, spanEnd);
    spanLabel.put(MapLabel.INDEX_KEY, leafIndex.value);
    tree.setLabel(spanLabel);
}
use of edu.stanford.nlp.trees.Tree in project lucida by claritylab.
In the class StanfordParser, the method parse.
/**
 * Parses a sentence and returns a string representation of the parse tree.
 *
 * @param sentence a sentence
 * @return Tree whose Label is a MapLabel containing correct begin and end
 *         character offsets in keys BEGIN_KEY and END_KEY
 * @throws RuntimeException if the parser has not been initialized
 */
@SuppressWarnings("unchecked")
public static String parse(String sentence) {
    if (tlp == null || parser == null) {
        throw new RuntimeException("Parser has not been initialized");
    }
    log.debug("Parsing sentence");
    Tree bestParse;
    // Serialize access to the shared parser instance.
    synchronized (parser) {
        Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
        List<Word> tokens = tokenizer.tokenize();
        log.debug("Tokenization: " + tokens);
        parser.parse(new Sentence(tokens));
        bestParse = parser.getBestParse();
    }
    // Drop the bracketed score annotations from the serialized tree.
    return bestParse.toString().replaceAll(" \\[[\\S]+\\]", "");
}
use of edu.stanford.nlp.trees.Tree in project CoreNLP by stanfordnlp.
In the class TsarfatyEval, the method main.
/**
 * Run the scoring metric on guess/gold input. This method performs "Collinization."
 * The default language is English.
 *
 * @param args option flags (-l lang, -y maxGoldYield, -t, -v, -g maxGuessYield)
 *             followed by the two required positional arguments:
 *             goldFile guessFile
 */
public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    int maxGoldYield = Integer.MAX_VALUE;
    int maxGuessYield = Integer.MAX_VALUE;
    boolean VERBOSE = false;
    boolean skipGuess = false;
    boolean tagMode = false;
    String guessFile = null;
    String goldFile = null;
    // Parse option flags; the first non-flag argument and its successor are
    // taken as goldFile and guessFile, then parsing stops.
    for (int i = 0; i < args.length; i++) {
        if (args[i].startsWith("-")) {
            switch(args[i]) {
            case "-l":
                // Switch treebank parameters to the named language.
                Language lang = Language.valueOf(args[++i].trim());
                tlpp = lang.params;
                break;
            case "-y":
                // Skip gold trees whose yield exceeds this length.
                maxGoldYield = Integer.parseInt(args[++i].trim());
                break;
            case "-t":
                tagMode = true;
                break;
            case "-v":
                VERBOSE = true;
                break;
            case "-g":
                // Skip guess trees whose yield exceeds this length.
                maxGuessYield = Integer.parseInt(args[++i].trim());
                skipGuess = true;
                break;
            default:
                System.out.println(usage.toString());
                System.exit(-1);
            }
        } else {
            //Required parameters
            goldFile = args[i++];
            guessFile = args[i];
            break;
        }
    }
    final PrintWriter pwOut = tlpp.pw();
    final Treebank guessTreebank = tlpp.diskTreebank();
    guessTreebank.loadPath(guessFile);
    pwOut.println("GUESS TREEBANK:");
    pwOut.println(guessTreebank.textualSummary());
    final Treebank goldTreebank = tlpp.diskTreebank();
    goldTreebank.loadPath(goldFile);
    pwOut.println("GOLD TREEBANK:");
    pwOut.println(goldTreebank.textualSummary());
    final String evalName = (tagMode) ? "TsarfatyTAG" : "TsarfatySEG";
    final TsarfatyEval eval = new TsarfatyEval(evalName, tagMode);
    final TreeTransformer tc = tlpp.collinizer();
    //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
    //don't match, we need to keep looking for the next gold tree that matches.
    //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
    //status as follows:
    //
    //  0 - Ok (yields match)
    //  1 - length mismatch
    //  2 - null parse e.g. (()).
    //
    //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    final Iterator<Tree> goldItr = goldTreebank.iterator();
    int goldLineId = 0;
    int skippedGuessTrees = 0;
    // The gold iterator is shared across guess iterations: each guess tree
    // consumes gold trees until one is evaluated against it (or skipped).
    for (final Tree guess : guessTreebank) {
        final Tree evalGuess = tc.transformTree(guess);
        final ArrayList<Label> guessSent = guess.yield();
        // Whitespace-free character yield, used for yield comparison below.
        final String guessChars = SentenceUtils.listToString(guessSent).replaceAll("\\s+", "");
        if (guessSent.size() > maxGuessYield) {
            skippedGuessTrees++;
            continue;
        }
        boolean doneEval = false;
        while (goldItr.hasNext() && !doneEval) {
            final Tree gold = goldItr.next();
            final Tree evalGold = tc.transformTree(gold);
            goldLineId++;
            final ArrayList<Label> goldSent = gold.yield();
            final String goldChars = SentenceUtils.listToString(goldSent).replaceAll("\\s+", "");
            if (goldSent.size() > maxGoldYield) {
                // Over-length gold tree: try the next gold tree.
                continue;
            } else if (goldChars.length() != guessChars.length()) {
                pwOut.printf("Char level yield mismatch at line %d (guess: %d gold: %d)\n", goldLineId, guessChars.length(), goldChars.length());
                skippedGuessTrees++;
                //Default evalb behavior -- skip this guess tree
                break;
            }
            eval.evaluate(evalGuess, evalGold, ((VERBOSE) ? pwOut : null));
            //Move to the next guess parse
            doneEval = true;
        }
    }
    pwOut.println("================================================================================");
    if (skippedGuessTrees != 0)
        pwOut.printf("%s %d guess trees\n", ((skipGuess) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
    eval.display(true, pwOut);
    pwOut.println();
    pwOut.close();
}
use of edu.stanford.nlp.trees.Tree in project CoreNLP by stanfordnlp.
In the class MLEDependencyGrammar, the method tune.
/** Tune the smoothing and interpolation parameters of the dependency
 * grammar based on a tuning treebank.
 *
 * <p>Performs a multiplicative grid search: first {@code smooth_stop} is
 * tuned alone against the stop-dependency likelihood, then (with the stop
 * dependencies removed) the remaining smoothing and interpolation fields
 * are swept jointly and set to the best-scoring combination. The grammar's
 * smoothing fields are mutated in place during the search.
 *
 * @param trees A Collection of Trees for setting parameters
 */
@Override
public void tune(Collection<Tree> trees) {
    // Flatten the tuning treebank into a list of dependencies.
    List<IntDependency> deps = new ArrayList<>();
    for (Tree tree : trees) {
        deps.addAll(treeToDependencyList(tree, wordIndex, tagIndex));
    }
    double bestScore = Double.NEGATIVE_INFINITY;
    double bestSmooth_stop = 0.0;
    double bestSmooth_aTW_hTWd = 0.0;
    double bestSmooth_aT_hTWd = 0.0;
    double bestInterp = 0.0;
    log.info("Tuning smooth_stop...");
    // Sweep smooth_stop over a geometric grid, scoring the log-likelihood
    // of the observed stop/continue decisions.
    for (smooth_stop = 1.0 / 100.0; smooth_stop < 100.0; smooth_stop *= 1.25) {
        double totalScore = 0.0;
        for (IntDependency dep : deps) {
            if (!rootTW(dep.head)) {
                double stopProb = getStopProb(dep);
                // Non-stop dependencies contribute the complement probability.
                if (!dep.arg.equals(stopTW)) {
                    stopProb = 1.0 - stopProb;
                }
                if (stopProb > 0.0) {
                    totalScore += Math.log(stopProb);
                }
            }
        }
        if (totalScore > bestScore) {
            bestScore = totalScore;
            bestSmooth_stop = smooth_stop;
        }
    }
    smooth_stop = bestSmooth_stop;
    log.info("Tuning selected smooth_stop: " + smooth_stop);
    // Stop dependencies are now fixed; remove them before tuning the
    // attachment parameters.
    for (Iterator<IntDependency> iter = deps.iterator(); iter.hasNext(); ) {
        IntDependency dep = iter.next();
        if (dep.arg.equals(stopTW)) {
            iter.remove();
        }
    }
    log.info("Tuning other parameters...");
    if (!useSmoothTagProjection) {
        // 3-dimensional grid search over the two smoothing weights and the
        // interpolation coefficient.
        bestScore = Double.NEGATIVE_INFINITY;
        for (smooth_aTW_hTWd = 0.5; smooth_aTW_hTWd < 100.0; smooth_aTW_hTWd *= 1.25) {
            log.info(".");
            for (smooth_aT_hTWd = 0.5; smooth_aT_hTWd < 100.0; smooth_aT_hTWd *= 1.25) {
                for (interp = 0.02; interp < 1.0; interp += 0.02) {
                    double totalScore = 0.0;
                    for (IntDependency dep : deps) {
                        double score = score(dep);
                        if (score > Double.NEGATIVE_INFINITY) {
                            totalScore += score;
                        }
                    }
                    if (totalScore > bestScore) {
                        bestScore = totalScore;
                        bestInterp = interp;
                        bestSmooth_aTW_hTWd = smooth_aTW_hTWd;
                        bestSmooth_aT_hTWd = smooth_aT_hTWd;
                        log.info("Current best interp: " + interp + " with score " + totalScore);
                    }
                }
            }
        }
        smooth_aTW_hTWd = bestSmooth_aTW_hTWd;
        smooth_aT_hTWd = bestSmooth_aT_hTWd;
        interp = bestInterp;
    } else {
        // for useSmoothTagProjection
        // 6-dimensional grid search including the tag-projection smoothing
        // weights; coarser grid (factor 1.5) to keep the sweep tractable.
        double bestSmooth_aTW_aT = 0.0;
        double bestSmooth_aTW_hTd = 0.0;
        double bestSmooth_aT_hTd = 0.0;
        bestScore = Double.NEGATIVE_INFINITY;
        for (smooth_aTW_hTWd = 1.125; smooth_aTW_hTWd < 100.0; smooth_aTW_hTWd *= 1.5) {
            log.info("#");
            for (smooth_aT_hTWd = 1.125; smooth_aT_hTWd < 100.0; smooth_aT_hTWd *= 1.5) {
                log.info(":");
                for (smooth_aTW_aT = 1.125; smooth_aTW_aT < 200.0; smooth_aTW_aT *= 1.5) {
                    log.info(".");
                    for (smooth_aTW_hTd = 1.125; smooth_aTW_hTd < 100.0; smooth_aTW_hTd *= 1.5) {
                        for (smooth_aT_hTd = 1.125; smooth_aT_hTd < 100.0; smooth_aT_hTd *= 1.5) {
                            for (interp = 0.2; interp <= 0.8; interp += 0.02) {
                                double totalScore = 0.0;
                                for (IntDependency dep : deps) {
                                    double score = score(dep);
                                    if (score > Double.NEGATIVE_INFINITY) {
                                        totalScore += score;
                                    }
                                }
                                if (totalScore > bestScore) {
                                    bestScore = totalScore;
                                    bestInterp = interp;
                                    bestSmooth_aTW_hTWd = smooth_aTW_hTWd;
                                    bestSmooth_aT_hTWd = smooth_aT_hTWd;
                                    bestSmooth_aTW_aT = smooth_aTW_aT;
                                    bestSmooth_aTW_hTd = smooth_aTW_hTd;
                                    bestSmooth_aT_hTd = smooth_aT_hTd;
                                    log.info("Current best interp: " + interp + " with score " + totalScore);
                                }
                            }
                        }
                    }
                }
            }
            log.info();
        }
        smooth_aTW_hTWd = bestSmooth_aTW_hTWd;
        smooth_aT_hTWd = bestSmooth_aT_hTWd;
        smooth_aTW_aT = bestSmooth_aTW_aT;
        smooth_aTW_hTd = bestSmooth_aTW_hTd;
        smooth_aT_hTd = bestSmooth_aT_hTd;
        interp = bestInterp;
    }
    log.info("\nTuning selected smooth_aTW_hTWd: " + smooth_aTW_hTWd + " smooth_aT_hTWd: " + smooth_aT_hTWd + " interp: " + interp + " smooth_aTW_aT: " + smooth_aTW_aT + " smooth_aTW_hTd: " + smooth_aTW_hTd + " smooth_aT_hTd: " + smooth_aT_hTd);
}
Aggregations