Example usage of edu.stanford.nlp.trees.TreeReader from the CoreNLP project by stanfordnlp:
the main method of the class MungeTreesWithMorfetteAnalyses.
/**
 * Merges Morfette lemma/tag analyses into the leaves of a French treebank:
 * each leaf value is rewritten as word + MORPHO_MARK + lemma + LEMMA_MARK + tag,
 * and the munged trees are printed to stdout, one per line.
 *
 * @param args args[0] is the tree file; args[1] is the Morfette/TnT output file
 */
public static void main(String[] args) {
  if (args.length != 2) {
    System.err.printf("Usage: java %s tree_file morfette_tnt_file%n", MungeTreesWithMorfetteAnalyses.class.getName());
    System.exit(-1);
  }
  String treeFile = args[0];
  String morfetteFile = args[1];
  TreeReaderFactory trf = new FrenchTreeReaderFactory();
  // try-with-resources guarantees the reader is closed even when an exception
  // is thrown mid-stream (the original only closed it on the success path).
  try (TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")))) {
    Iterator<List<CoreLabel>> morfetteItr = new MorfetteFileIterator(morfetteFile);
    for (Tree tree; (tree = tr.readTree()) != null && morfetteItr.hasNext(); ) {
      List<CoreLabel> analysis = morfetteItr.next();
      List<Label> yield = tree.yield();
      // The Morfette file is expected to be token-aligned with the tree yield.
      assert analysis.size() == yield.size();
      int yieldLen = yield.size();
      for (int i = 0; i < yieldLen; ++i) {
        CoreLabel tokenAnalysis = analysis.get(i);
        Label token = yield.get(i);
        String lemma = getLemma(token.value(), tokenAnalysis.lemma());
        // Pack word, lemma, and tag into one leaf string using the
        // MorphoFeatureSpecification delimiters.
        String newLeaf = String.format("%s%s%s%s%s", token.value(), MorphoFeatureSpecification.MORPHO_MARK, lemma, MorphoFeatureSpecification.LEMMA_MARK, tokenAnalysis.tag());
        ((CoreLabel) token).setValue(newLeaf);
      }
      System.out.println(tree.toString());
    }
    // If either input still has items, the two files were misaligned.
    if (tr.readTree() != null || morfetteItr.hasNext()) {
      log.info("WARNING: Uneven input files!");
    }
  } catch (IOException e) {
    // FileNotFoundException and UnsupportedEncodingException are both
    // IOException subclasses, so one handler replaces the original three.
    e.printStackTrace();
  }
}
Example usage of edu.stanford.nlp.trees.TreeReader from the CoreNLP project by stanfordnlp:
the main method of the class TreeToMorfette.
/**
 * Converts a treebank file to Morfette training format: one
 * "word lemma morph-tag" line per token, with a blank line between
 * sentences, written to stdout.
 *
 * @param args args[0] is the tree file
 */
public static void main(String[] args) {
  if (args.length != 1) {
    System.err.printf("Usage: java %s tree_file%n", TreeToMorfette.class.getName());
    System.exit(-1);
  }
  String treeFile = args[0];
  TreeReaderFactory trf = new FrenchTreeReaderFactory();
  // try-with-resources guarantees the reader is closed even when an exception
  // is thrown mid-stream (the original only closed it on the success path).
  try (TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")))) {
    for (Tree tree1; (tree1 = tr.readTree()) != null; ) {
      List<Label> pretermYield = tree1.preTerminalYield();
      List<Label> yield = tree1.yield();
      int yieldLen = yield.size();
      for (int i = 0; i < yieldLen; ++i) {
        CoreLabel rawToken = (CoreLabel) yield.get(i);
        String word = rawToken.value();
        // The morphological annotation is stashed in the token's originalText.
        String morphStr = rawToken.originalText();
        Pair<String, String> lemmaMorph = MorphoFeatureSpecification.splitMorphString(word, morphStr);
        String lemma = lemmaMorph.first();
        String morph = lemmaMorph.second();
        // Fall back to the preterminal (POS) label when no usable
        // morphological tag is present ("XXX" is a placeholder value).
        if (morph == null || morph.equals("") || morph.equals("XXX")) {
          morph = ((CoreLabel) pretermYield.get(i)).value();
        }
        System.out.printf("%s %s %s%n", word, lemma, morph);
      }
      // Blank line separates sentences in the Morfette format.
      System.out.println();
    }
  } catch (IOException e) {
    // Covers FileNotFoundException and UnsupportedEncodingException too.
    e.printStackTrace();
  }
}
Example usage of edu.stanford.nlp.trees.TreeReader from the CoreNLP project by stanfordnlp:
the main method of the class FTBCorrector.
/**
 * Reads a French treebank file, discards malformed punctuation-only trees,
 * applies the FTBCorrector transformations to the rest, and prints the
 * corrected trees to stdout.
 *
 * @param args args[0] is the treebank filename
 */
public static void main(String[] args) {
  if (args.length != 1) {
    log.info("Usage: java " + FTBCorrector.class.getName() + " filename\n");
    System.exit(-1);
  }
  TreeTransformer tt = new FTBCorrector();
  File f = new File(args[0]);
  try {
    //These bad trees in the Candito training set should be thrown out:
    // (ROOT (SENT (" ") (. .)))
    // (ROOT (SENT (. .)))
    TregexPattern pBadTree = TregexPattern.compile("@SENT <: @PUNC");
    TregexPattern pBadTree2 = TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __");
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
    TreeReaderFactory trf = new FrenchTreeReaderFactory();
    int nTrees = 0;
    // try-with-resources guarantees the reader (and the wrapped stream) is
    // closed even if reading or transforming throws mid-file.
    try (TreeReader tr = trf.newTreeReader(br)) {
      for (Tree t; (t = tr.readTree()) != null; nTrees++) {
        TregexMatcher m = pBadTree.matcher(t);
        TregexMatcher m2 = pBadTree2.matcher(t);
        if (m.find() || m2.find()) {
          log.info("Discarding tree: " + t.toString());
        } else {
          Tree fixedT = tt.transformTree(t);
          System.out.println(fixedT.toString());
        }
      }
    }
    System.err.printf("Wrote %d trees%n", nTrees);
  } catch (IOException e) {
    // Covers FileNotFoundException and UnsupportedEncodingException too.
    e.printStackTrace();
  } catch (TregexParseException e) {
    e.printStackTrace();
  }
}
Example usage of edu.stanford.nlp.trees.TreeReader from the CoreNLP project by stanfordnlp:
the resolveDummyTags method of the class MWEPreprocessor.
/**
 * Rewrites a treebank file, resolving dummy POS tags in each tree via
 * {@code traverseAndFix}, and writes the result to {@code treeFile + ".fixed"}.
 *
 * @param treeFile      the treebank file to process
 * @param pretermLabel  preterminal-label counts consulted by traverseAndFix
 * @param unigramTagger unigram tag counts consulted by traverseAndFix
 */
private static void resolveDummyTags(File treeFile, TwoDimensionalCounter<String, String> pretermLabel, TwoDimensionalCounter<String, String> unigramTagger) {
  // try-with-resources closes both the reader and the writer even if an
  // exception is thrown mid-file (the original leaked both on error, and
  // an unflushed PrintWriter could silently drop output).
  try (TreeReader tr = new FrenchTreeReaderFactory().newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
       PrintWriter pw = new PrintWriter(new PrintStream(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"))) {
    int nTrees = 0;
    for (Tree t; (t = tr.readTree()) != null; nTrees++) {
      traverseAndFix(t, pretermLabel, unigramTagger);
      pw.println(t.toString());
    }
    System.out.println("Processed " + nTrees + " trees");
  } catch (IOException e) {
    // Covers FileNotFoundException and UnsupportedEncodingException too.
    e.printStackTrace();
  }
}
Example usage of edu.stanford.nlp.trees.TreeReader from the CoreNLP project by stanfordnlp:
the resolveDummyTags method of the class MultiWordPreprocessor.
/**
 * Rewrites a Spanish treebank file: resolves dummy POS tags, expands
 * multiword-token phrases, optionally re-normalizes each tree, and writes
 * the result to {@code treeFile + ".fixed"}.
 *
 * @param treeFile      the treebank file to process
 * @param unigramTagger unigram tag counts consulted by traverseAndFix
 * @param retainNER     whether to preserve NER annotations during fixing
 * @param tn            normalizer applied to each expanded tree; may be null
 */
private static void resolveDummyTags(File treeFile, TwoDimensionalCounter<String, String> unigramTagger, boolean retainNER, TreeNormalizer tn) {
  TreeFactory tf = new LabeledScoredTreeFactory();
  MultiWordTreeExpander expander = new MultiWordTreeExpander();
  // try-with-resources closes both the reader and the writer even if an
  // exception is thrown mid-file (the original leaked both on error, and
  // an unflushed PrintWriter could silently drop output).
  try (TreeReader tr = new SpanishTreeReaderFactory().newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
       PrintWriter pw = new PrintWriter(new PrintStream(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"))) {
    int nTrees = 0;
    for (Tree t; (t = tr.readTree()) != null; nTrees++) {
      traverseAndFix(t, null, unigramTagger, retainNER);
      // Now "decompress" further the expanded trees formed by
      // multiword token splitting
      t = expander.expandPhrases(t, tn, tf);
      if (tn != null)
        t = tn.normalizeWholeTree(t, tf);
      pw.println(t.toString());
    }
    System.out.println("Processed " + nTrees + " trees");
  } catch (IOException e) {
    // Covers FileNotFoundException and UnsupportedEncodingException too.
    e.printStackTrace();
  }
}
Aggregations