Use of edu.stanford.nlp.trees.Tree in project CoreNLP by stanfordnlp: the class FrenchXMLTreeReader, method getTreeFromXML.
/**
 * Recursively converts a DOM (sub)tree of an FTB XML parse into a {@link Tree}.
 * Word elements with no nested word children become terminals (with POS, lemma,
 * morphology, and subcategorization attached to the leaf labels where the label
 * type supports them); all other elements become phrasal nodes over their
 * recursively-converted element children.
 *
 * @param root a DOM node; assumed to be an {@code Element} — TODO confirm callers never pass other node types
 * @return the converted tree, or {@code null} if the subtree yields no children
 */
private Tree getTreeFromXML(Node root) {
final Element eRoot = (Element) root;
// Terminal case: a word node that contains no nested word nodes.
if (eRoot.getNodeName().equals(NODE_WORD) && eRoot.getElementsByTagName(NODE_WORD).getLength() == 0) {
String posStr = getPOS(eRoot);
posStr = treeNormalizer.normalizeNonterminal(posStr);
List<String> lemmas = getLemma(eRoot);
String morph = getMorph(eRoot);
List<String> leafToks = getWordString(eRoot.getTextContent().trim());
String subcat = getSubcat(eRoot);
if (lemmas != null && lemmas.size() != leafToks.size()) {
// If this happens (and it does for a few poorly edited trees)
// we assume something has gone wrong and ignore the lemmas.
log.info("Lemmas don't match tokens, ignoring lemmas: " + "lemmas " + lemmas + ", tokens " + leafToks);
lemmas = null;
}
//Terminals can have multiple tokens (MWEs). Make these into a
//flat structure for now.
Tree t = null;
List<Tree> kids = new ArrayList<>();
if (leafToks.size() > 1) {
// MWE: one preterminal per token, each tagged MISSING_POS since the
// per-token POS is not available, all under a MISSING_PHRASAL node.
for (int i = 0; i < leafToks.size(); ++i) {
String tok = leafToks.get(i);
String s = treeNormalizer.normalizeTerminal(tok);
List<Tree> leafList = new ArrayList<>();
Tree leafNode = treeFactory.newLeaf(s);
if (leafNode.label() instanceof HasWord)
((HasWord) leafNode.label()).setWord(s);
if (leafNode.label() instanceof CoreLabel && lemmas != null) {
((CoreLabel) leafNode.label()).setLemma(lemmas.get(i));
}
// Morphology is stashed in the originalText slot; subcat in the category slot.
if (leafNode.label() instanceof HasContext) {
((HasContext) leafNode.label()).setOriginalText(morph);
}
if (leafNode.label() instanceof HasCategory) {
((HasCategory) leafNode.label()).setCategory(subcat);
}
leafList.add(leafNode);
Tree posNode = treeFactory.newTreeNode(MISSING_POS, leafList);
if (posNode.label() instanceof HasTag)
((HasTag) posNode.label()).setTag(MISSING_POS);
kids.add(posNode);
}
t = treeFactory.newTreeNode(MISSING_PHRASAL, kids);
} else {
// Single-token terminal: one leaf under a preterminal carrying the real POS.
// NOTE(review): assumes getWordString never returns an empty list — verify,
// else get(0) throws IndexOutOfBoundsException.
String leafStr = treeNormalizer.normalizeTerminal(leafToks.get(0));
Tree leafNode = treeFactory.newLeaf(leafStr);
if (leafNode.label() instanceof HasWord)
((HasWord) leafNode.label()).setWord(leafStr);
if (leafNode.label() instanceof CoreLabel && lemmas != null) {
((CoreLabel) leafNode.label()).setLemma(lemmas.get(0));
}
if (leafNode.label() instanceof HasContext) {
((HasContext) leafNode.label()).setOriginalText(morph);
}
if (leafNode.label() instanceof HasCategory) {
((HasCategory) leafNode.label()).setCategory(subcat);
}
kids.add(leafNode);
t = treeFactory.newTreeNode(posStr, kids);
if (t.label() instanceof HasTag)
((HasTag) t.label()).setTag(posStr);
}
return t;
}
// Phrasal case: recurse over element children only, discarding empty subtrees.
List<Tree> kids = new ArrayList<>();
for (Node childNode = eRoot.getFirstChild(); childNode != null; childNode = childNode.getNextSibling()) {
if (childNode.getNodeType() != Node.ELEMENT_NODE)
continue;
Tree t = getTreeFromXML(childNode);
if (t == null) {
System.err.printf("%s: Discarding empty tree (root: %s)%n", this.getClass().getName(), childNode.getNodeName());
} else {
kids.add(t);
}
}
// MWE parents are "w" elements that carry a POS attribute; use that POS as
// the node label instead of the element name.
String rootLabel = eRoot.getNodeName().trim();
boolean isMWE = rootLabel.equals("w") && eRoot.hasAttribute(ATTR_POS);
if (isMWE)
rootLabel = eRoot.getAttribute(ATTR_POS).trim();
Tree t = (kids.size() == 0) ? null : treeFactory.newTreeNode(treeNormalizer.normalizeNonterminal(rootLabel), kids);
if (t != null && isMWE)
t = postProcessMWE(t);
return t;
}
Use of edu.stanford.nlp.trees.Tree in project CoreNLP by stanfordnlp: the class FrenchXMLTreeReader, method main.
/**
* For debugging.
*
* @param args
*/
/**
 * For debugging: reads trees from each given file, prints every tree with a
 * sentence ID prefix to stdout, collects morphological analyses from the
 * leaves, and reports per-file and total tree counts on stderr.
 *
 * @param args one or more paths to FTB XML tree files
 */
public static void main(String[] args) {
  if (args.length < 1) {
    System.err.printf("Usage: java %s tree_file(s)%n%n", FrenchXMLTreeReader.class.getName());
    System.exit(-1);
  }
  List<File> fileList = new ArrayList<>();
  for (String arg : args) fileList.add(new File(arg));
  TreeReaderFactory trf = new FrenchXMLTreeReaderFactory(false);
  int totalTrees = 0;
  Set<String> morphAnalyses = Generics.newHashSet();
  try {
    for (File file : fileList) {
      int numTrees = 0;
      String canonicalFileName = file.getName().substring(0, file.getName().lastIndexOf('.'));
      // try-with-resources: the reader is closed even if readTree() throws,
      // fixing the leak in the original (close() was skipped on exception).
      try (TreeReader tr = trf.newTreeReader(
          new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8")))) {
        for (Tree t; (t = tr.readTree()) != null; numTrees++) {
          String ftbID = ((CoreLabel) t.label()).get(CoreAnnotations.SentenceIDAnnotation.class);
          System.out.printf("%s-%s\t%s%n", canonicalFileName, ftbID, t.toString());
          List<Label> leaves = t.yield();
          for (Label label : leaves) {
            if (label instanceof CoreLabel)
              morphAnalyses.add(((CoreLabel) label).originalText());
          }
        }
      }
      System.err.printf("%s: %d trees%n", file.getName(), numTrees);
      totalTrees += numTrees;
    }
    //wsg2011: Print out the observed morphological analyses
    // for(String analysis : morphAnalyses)
    //   log.info(analysis);
    System.err.printf("%nRead %d trees%n", totalTrees);
  } catch (IOException e) {
    // Single catch suffices: FileNotFoundException and
    // UnsupportedEncodingException are both IOException subclasses.
    e.printStackTrace();
  }
}
Use of edu.stanford.nlp.trees.Tree in project CoreNLP by stanfordnlp: the class SplitMaker, method main.
/**
* @param args
*/
/**
 * Splits a cleaned Hebrew treebank file into dev/train/test sections by tree
 * index and writes each section to {@code <input>.clean.{dev,train,test}}.
 *
 * @param args a single argument: the path of the tree file to split
 */
public static void main(String[] args) {
  if (args.length != 1) {
    System.err.printf("Usage: java %s tree_file%n", SplitMaker.class.getName());
    System.exit(-1);
  }
  // Split boundaries by tree index: [0,483) dev, [483,5724) train, rest test.
  final int devBoundary = 483;
  final int trainBoundary = 5724;
  TreebankLanguagePack tlp = new HebrewTreebankLanguagePack();
  String inputFile = args[0];
  File treeFile = new File(inputFile);
  try {
    TreeReaderFactory trf = new HebrewTreeReaderFactory();
    int numTrees = 0;
    // try-with-resources closes every stream even when readTree() throws,
    // fixing the leak in the original (all close() calls were skipped on exception).
    try (TreeReader tr = trf.newTreeReader(
             new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), tlp.getEncoding())));
         PrintWriter pwDev = new PrintWriter(new PrintStream(new FileOutputStream(inputFile + ".clean.dev"), false, tlp.getEncoding()));
         PrintWriter pwTrain = new PrintWriter(new PrintStream(new FileOutputStream(inputFile + ".clean.train"), false, tlp.getEncoding()));
         PrintWriter pwTest = new PrintWriter(new PrintStream(new FileOutputStream(inputFile + ".clean.test"), false, tlp.getEncoding()))) {
      for (Tree t; (t = tr.readTree()) != null; numTrees++) {
        if (numTrees < devBoundary) {
          pwDev.println(t.toString());
        } else if (numTrees < trainBoundary) {
          pwTrain.println(t.toString());
        } else {
          pwTest.println(t.toString());
        }
      }
    }
    System.err.printf("Processed %d trees.%n", numTrees);
  } catch (IOException e) {
    // Single catch suffices: UnsupportedEncodingException and
    // FileNotFoundException are both IOException subclasses.
    e.printStackTrace();
  }
}
Use of edu.stanford.nlp.trees.Tree in project CoreNLP by stanfordnlp: the class NegraHeadFinder, method determineNonTrivialHead.
/** Called by determineHead and may be overridden in subclasses
* if special treatment is necessary for particular categories.
*/
/** Called by determineHead and may be overridden in subclasses
 * if special treatment is necessary for particular categories.
 *
 * @param t the local subtree whose head daughter should be located
 * @param parent the parent of {@code t} (unused here; part of the contract)
 * @return the head daughter found by the matching rule (or the default rule),
 *         or {@code null} if no rule applies
 */
protected Tree determineNonTrivialHead(Tree t, Tree parent) {
  Tree theHead = null;
  String motherCat = basicCategory(t.label().value());
  // Strip the '@' prefix that marks intermediate (binarized) categories.
  if (motherCat.startsWith("@")) {
    motherCat = motherCat.substring(1);
  }
  if (DEBUG) {
    log.info("Looking for head of " + t.label() + "; value is |" + t.label().value() + "|, " + " baseCat is |" + motherCat + "|");
  }
  // We know we have nonterminals underneath
  // (a bit of a Penn Treebank assumption, but).
  // Look at label.
  String[][] how = nonTerminalInfo.get(motherCat);
  if (how == null) {
    // No head rule for this category: fall back to the default rule if present.
    if (DEBUG) {
      log.info("Warning: No rule found for " + motherCat + " (first char: " + motherCat.charAt(0) + ")");
      log.info("Known nonterms are: " + nonTerminalInfo.keySet());
    }
    if (defaultRule != null) {
      if (DEBUG) {
        log.info(" Using defaultRule");
      }
      return traverseLocate(t.children(), defaultRule, true);
    } else {
      return null;
    }
  }
  // Try each rule in order; only the last rule is applied as a default.
  for (int i = 0; i < how.length; i++) {
    boolean deflt = (i == how.length - 1);
    theHead = traverseLocate(t.children(), how[i], deflt);
    if (theHead != null) {
      break;
    }
  }
  if (DEBUG) {
    // Bug fix: theHead can still be null if every rule failed; the original
    // unconditionally called theHead.label() here and threw an NPE.
    if (theHead == null) {
      log.info(" Chose no head (all rules failed)");
    } else {
      log.info(" Chose " + theHead.label());
    }
  }
  return theHead;
}
Use of edu.stanford.nlp.trees.Tree in project CoreNLP by stanfordnlp: the class SpanishXMLTreeReader, method process.
/**
* Read trees from the given file and output their processed forms to
* standard output.
*/
public static void process(File file, TreeReader tr, Pattern posPattern, Pattern wordPattern, boolean plainPrint) throws IOException {
Tree t;
int numTrees = 0, numTreesRetained = 0;
String canonicalFileName = file.getName().substring(0, file.getName().lastIndexOf('.'));
while ((t = tr.readTree()) != null) {
numTrees++;
if (!shouldPrintTree(t, posPattern, wordPattern))
continue;
numTreesRetained++;
String ftbID = ((CoreLabel) t.label()).get(CoreAnnotations.SentenceIDAnnotation.class);
String output = toString(t, plainPrint);
System.out.printf("%s-%s\t%s%n", canonicalFileName, ftbID, output);
}
System.err.printf("%s: %d trees, %d matched and printed%n", file.getName(), numTrees, numTreesRetained);
}
Aggregations