Use of edu.stanford.nlp.international.french.pipeline.FTBCorrector in project CoreNLP by stanfordnlp.
From the class SplitCanditoTrees, method outputSplits.
/**
 * Writes the trees named in {@code ids} to the configured split files,
 * one tree per line, in the order given by {@code ids}.
 * <br>
 * Output starts with the first entry of {@code fNames}. Once the number of
 * trees written to the current file reaches the matching entry of
 * {@code fSizes}, that file is closed and output moves on to the next one.
 * <br>
 * Ids that are absent from {@code treeMap} are logged and skipped. Trees
 * matching either of the two punctuation-only patterns below are logged and
 * discarded. Every other tree is run through {@link FTBCorrector}; if the
 * corrected tree's first child has no children, the uncorrected copy is
 * written instead.
 * <br>
 * Leaves are rewritten by {@code mungeLeaves} only when
 * {@code LEMMAS_AS_LEAVES} or {@code ADD_MORPHO_TO_LEAVES} is set, POS tags
 * are replaced only when {@code CC_TAGSET} is set, and each tree is printed
 * with {@code treeToMorfette} when {@code MORFETTE_OUTPUT} is set, otherwise
 * with {@code Tree.toString()}.
 *
 * @param ids ids of the trees to output, in output order
 * @param treeMap map from tree id to tree
 * @throws IOException if an output file cannot be opened
 * @throws java.util.NoSuchElementException if no output file is configured,
 *         or if a tree still has to be written after every configured file
 *         has been filled
 */
public static void outputSplits(List<String> ids, Map<String, Tree> treeMap) throws IOException {
  Queue<Integer> fSizeQueue = new LinkedList<>(Arrays.asList(fSizes));
  Queue<String> fNameQueue = new LinkedList<>(Arrays.asList(fNames));

  TregexPattern pBadTree = TregexPattern.compile("@SENT <: @PUNC");
  TregexPattern pBadTree2 = TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __");

  final TreeTransformer tt = new FTBCorrector();

  int size = fSizeQueue.remove();
  String filename = fNameQueue.remove();

  log.info("Outputing " + filename);

  // writer == null means every configured output file has already been filled.
  PrintWriter writer = openSplitWriter(filename);
  try {
    int outputCount = 0;
    for (String id : ids) {
      if (!treeMap.containsKey(id)) {
        log.info("Missing id: " + id);
        continue;
      }

      Tree tree = treeMap.get(id);
      TregexMatcher m = pBadTree.matcher(tree);
      TregexMatcher m2 = pBadTree2.matcher(tree);
      if (m.find() || m2.find()) {
        log.info("Discarding tree: " + tree.toString());
        continue;
      }

      // Fail only when a tree really needs a file and none is left; by this
      // point every earlier file has been closed, so nothing written is lost.
      if (writer == null) {
        throw new java.util.NoSuchElementException(
            "No output file left for tree " + id + ": all configured splits are full");
      }

      // Punctuation normalization, etc.
      Tree backupCopy = tree.deepCopy();
      tree = tt.transformTree(tree);
      if (tree.firstChild().children().length == 0) {
        // Some trees have only punctuation. Tregex will mangle these. Don't throw those away.
        log.info("Saving tree: " + tree.toString());
        log.info("Backup: " + backupCopy.toString());
        tree = backupCopy;
      }

      if (LEMMAS_AS_LEAVES || ADD_MORPHO_TO_LEAVES) {
        mungeLeaves(tree, LEMMAS_AS_LEAVES, ADD_MORPHO_TO_LEAVES);
      }

      if (CC_TAGSET) {
        replacePOSTags(tree);
      }

      if (MORFETTE_OUTPUT) {
        writer.println(treeToMorfette(tree));
      } else {
        writer.println(tree.toString());
      }

      ++outputCount;
      if (outputCount == size) {
        // Close (and flush) the finished file before touching the queues, so
        // running out of configured files can never drop buffered output.
        writer.close();
        writer = null;
        outputCount = 0;
        if (!fSizeQueue.isEmpty() && !fNameQueue.isEmpty()) {
          size = fSizeQueue.remove();
          filename = fNameQueue.remove();
          log.info("Outputing " + filename);
          writer = openSplitWriter(filename);
        }
      }
    }
  } finally {
    // Release the current file even if a tree operation above threw.
    if (writer != null) {
      writer.close();
    }
  }
}

/** Opens {@code filename} for buffered, UTF-8 encoded text output. */
private static PrintWriter openSplitWriter(String filename) throws IOException {
  return new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), "UTF-8")));
}
Aggregations