use of edu.stanford.nlp.trees.tregex.TregexPattern in project CoreNLP by stanfordnlp.
the class FTBCorrector method main.
/**
 * Command-line entry point: reads the tree file named by the single argument (decoded as
 * UTF-8), discards degenerate trees, runs every remaining tree through an
 * {@link FTBCorrector}, and prints each corrected tree on its own line to standard output.
 * A summary with the number of trees actually written goes to standard error.
 *
 * @param args exactly one element, the path of the tree file to correct; any other
 *             argument count logs a usage message and exits with status -1
 */
public static void main(String[] args) {
  if (args.length != 1) {
    log.info("Usage: java " + FTBCorrector.class.getName() + " filename\n");
    System.exit(-1);
  }

  TreeTransformer tt = new FTBCorrector();
  File f = new File(args[0]);
  TreeReaderFactory trf = new FrenchTreeReaderFactory();

  // try-with-resources guarantees the readers are closed even when reading or
  // transforming a tree throws part-way through the file.
  try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
       TreeReader tr = trf.newTreeReader(br)) {
    //These bad trees in the Candito training set should be thrown out:
    //  (ROOT (SENT (" ") (. .)))
    //  (ROOT (SENT (. .)))
    TregexPattern pBadTree = TregexPattern.compile("@SENT <: @PUNC");
    TregexPattern pBadTree2 = TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __");

    // Count only the trees that are printed, so the summary below is accurate
    // (discarded trees used to be counted as "written" too).
    int nTrees = 0;
    for (Tree t; (t = tr.readTree()) != null; ) {
      TregexMatcher m = pBadTree.matcher(t);
      TregexMatcher m2 = pBadTree2.matcher(t);
      if (m.find() || m2.find()) {
        log.info("Discarding tree: " + t.toString());
      } else {
        Tree fixedT = tt.transformTree(t);
        System.out.println(fixedT.toString());
        nTrees++;
      }
    }
    System.err.printf("Wrote %d trees%n", nTrees);

  } catch (IOException | TregexParseException e) {
    // IOException also covers UnsupportedEncodingException and FileNotFoundException,
    // which were previously caught separately but handled identically.
    e.printStackTrace();
  }
}
use of edu.stanford.nlp.trees.tregex.TregexPattern in project CoreNLP by stanfordnlp.
the class FTBDataset method build.
/**
 * Loads every path in {@code pathsToData} into the treebank, then writes the trees to
 * {@code outFileName} (and, when {@code makeFlatFile} is set, their flattened form to
 * {@code flatFileName}). Trees matching one of the known-bad Tregex patterns, or absent
 * from {@code splitSet} when a split is configured, are left out of the output.
 */
@Override
public void build() {
  for (File dataPath : pathsToData) {
    treebank.loadPath(dataPath, treeFileExtension, false);
  }

  PrintWriter outfile = null;
  PrintWriter flatFile = null;
  try {
    outfile = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFileName), "UTF-8")));
    if (makeFlatFile) {
      flatFile = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(flatFileName), "UTF-8")));
    }

    outputFileList.add(outFileName);
    if (makeFlatFile) {
      outputFileList.add(flatFileName);
      toStringBuffer.append(" Made flat files\n");
    }

    preprocessMWEs();

    List<TregexPattern> discardPatterns = new ArrayList<>();
    // Trees from the Candito training set that the TreeCorrector mangles;
    // they are dropped up front.
    discardPatterns.add(TregexPattern.compile("@SENT <: @PUNC"));
    discardPatterns.add(TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __"));
    // wsg2011: removes tree #552 of the Candito test set. It was kept for the EMNLP2011
    // paper, but it is all punctuation and would not be evaluated anyway. No split is
    // done for this data set, so the tree is simply removed.
    discardPatterns.add(TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC <3 @PUNC <4 @PUNC !<5 __"));

    nextTree:
    for (Tree tree : treebank) {
      // Drop the tree as soon as any bad-tree pattern matches.
      for (TregexPattern badPattern : discardPatterns) {
        if (badPattern.matcher(tree).find()) {
          log.info("Discarding tree: " + tree.toString());
          continue nextTree;
        }
      }

      // Skip trees that do not belong to the requested part of the split.
      if (splitSet != null && !splitSet.contains(getCanditoTreeID(tree))) {
        continue;
      }

      if (customTreeVisitor != null) {
        customTreeVisitor.visitTree(tree);
      }

      outfile.println(tree.toString());

      if (makeFlatFile) {
        String flatString = ATBTreeUtils.flattenTree(tree);
        if (removeEscapeTokens) {
          flatString = ATBTreeUtils.unEscape(flatString);
        }
        flatFile.println(flatString);
      }
    }

  } catch (UnsupportedEncodingException e) {
    System.err.printf("%s: Filesystem does not support UTF-8 output%n", this.getClass().getName());
    e.printStackTrace();

  } catch (FileNotFoundException e) {
    System.err.printf("%s: Could not open %s for writing%n", this.getClass().getName(), outFileName);

  } catch (TregexParseException e) {
    System.err.printf("%s: Could not compile Tregex expressions%n", this.getClass().getName());
    e.printStackTrace();

  } finally {
    // Close whichever writers were successfully opened.
    if (outfile != null) {
      outfile.close();
    }
    if (flatFile != null) {
      flatFile.close();
    }
  }
}
use of edu.stanford.nlp.trees.tregex.TregexPattern in project CoreNLP by stanfordnlp.
the class ATBCorrector method loadOps.
/**
 * Parses the edit script held in {@code editStr} into a list of
 * (Tregex match pattern, combined Tsurgeon operation) pairs.
 *
 * Each entry starts with one line that is compiled as a Tregex pattern, followed by
 * operation lines that are read for as long as {@code continuing(...)} accepts them.
 * A pattern that ends up with no operations is not added to the result.
 *
 * @return the parsed pairs, in script order; an {@link IOException} while reading is
 *         printed and whatever was parsed so far is returned
 */
private List<Pair<TregexPattern, TsurgeonPattern>> loadOps() {
  List<Pair<TregexPattern, TsurgeonPattern>> editOps = new ArrayList<>();

  try {
    BufferedReader reader = new BufferedReader(new StringReader(editStr));
    // Scratch list reused for the operations of each pattern in turn.
    List<TsurgeonPattern> pendingOps = new ArrayList<>();

    for (String patternLine = reader.readLine(); patternLine != null; patternLine = reader.readLine()) {
      if (DEBUG) {
        log.info("Pattern is " + patternLine);
      }
      TregexPattern matchPattern = TregexPattern.compile(patternLine);
      if (DEBUG) {
        log.info(" [" + matchPattern + "]");
      }

      pendingOps.clear();
      // The line that ends this run is consumed here and not re-read by the outer loop.
      for (String opLine = reader.readLine(); continuing(opLine); opLine = reader.readLine()) {
        TsurgeonPattern operation = Tsurgeon.parseOperation(opLine);
        if (DEBUG) {
          log.info("Operation is " + opLine + " [" + operation + "]");
        }
        pendingOps.add(operation);
      }

      if (pendingOps.isEmpty()) {
        continue;
      }
      editOps.add(new Pair<>(matchPattern, Tsurgeon.collectOperations(pendingOps)));
    }
  } catch (IOException ioe) {
    ioe.printStackTrace();
  }

  return editOps;
}
use of edu.stanford.nlp.trees.tregex.TregexPattern in project CoreNLP by stanfordnlp.
the class RHSFrequency method main.
/**
 * Command-line entry point: for every treebank node matched by the Tregex pattern
 * {@code "@" + category}, joins the values of the node's children with single spaces
 * and counts how often each such child sequence occurs. The sequences are then printed
 * with their counts, most frequent first, through the language pack's print writer.
 *
 * Arguments are options ({@code -l language}, {@code -e encoding}) and a positional
 * {@code category treebankPath} pair. Options must precede the positional pair to take
 * effect, because the treebank is created when the first pair is seen. Malformed
 * command lines print the usage text and exit with status -1.
 *
 * @param args the command-line arguments described above
 */
public static void main(String[] args) {
  if (args.length < minArgs) {
    System.out.println(usage.toString());
    System.exit(-1);
  }

  TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
  DiskTreebank tb = null;
  String encoding = "UTF-8";
  TregexPattern rootMatch = null;

  for (int i = 0; i < args.length; i++) {
    // Both the options and the positional category need one following argument;
    // report a usage error rather than running off the end of the array.
    if (i + 1 >= args.length) {
      System.out.println(usage.toString());
      System.exit(-1);
      return;
    }

    if (args[i].startsWith("-")) {
      switch (args[i]) {
        case "-l":
          Language lang = Language.valueOf(args[++i].trim());
          tlpp = lang.params;
          break;

        case "-e":
          encoding = args[++i];
          break;

        default:
          System.out.println(usage.toString());
          System.exit(-1);
      }

    } else {
      rootMatch = TregexPattern.compile("@" + args[i]);

      if (tb == null) {
        if (tlpp == null) {
          System.out.println(usage.toString());
          System.exit(-1);
          return;
        }
        tlpp.setInputEncoding(encoding);
        tlpp.setOutputEncoding(encoding);
        tb = tlpp.diskTreebank();
      }

      // Advance past the path exactly once; the loop header moves on to the next
      // argument. (Two post-increments here used to skip the argument after the path.)
      tb.loadPath(args[++i]);
    }
  }

  // Without a category/treebank pair there is nothing to count.
  if (rootMatch == null || tb == null) {
    System.out.println(usage.toString());
    System.exit(-1);
    return;
  }

  Counter<String> rhsCounter = new ClassicCounter<>();
  for (Tree t : tb) {
    TregexMatcher m = rootMatch.matcher(t);
    while (m.findNextMatchingNode()) {
      Tree match = m.getMatch();
      StringBuilder sb = new StringBuilder();
      for (Tree kid : match.children()) {
        sb.append(kid.value()).append(" ");
      }
      rhsCounter.incrementCount(sb.toString().trim());
    }
  }

  List<String> biggestKeys = new ArrayList<>(rhsCounter.keySet());
  Collections.sort(biggestKeys, Counters.toComparatorDescending(rhsCounter));

  PrintWriter pw = tlpp.pw();
  for (String rhs : biggestKeys) {
    pw.printf("%s\t%d%n", rhs, (int) rhsCounter.getCount(rhs));
  }
  pw.close();
}
use of edu.stanford.nlp.trees.tregex.TregexPattern in project CoreNLP by stanfordnlp.
the class MentionExtractor method findTreePattern.
/**
 * Compiles {@code tregex} into a {@link TregexPattern} and hands it to the
 * pattern-based {@code findTreePattern} overload together with {@code tree} and
 * {@code foundPairs}.
 *
 * @param tree       the sentence tree passed on to the overload
 * @param tregex     the Tregex expression to compile
 * @param foundPairs the set passed on to the overload
 * @throws RuntimeException wrapping any exception raised while compiling or matching
 */
private void findTreePattern(Tree tree, String tregex, Set<Pair<Integer, Integer>> foundPairs) {
  try {
    findTreePattern(tree, TregexPattern.compile(tregex), foundPairs);
  } catch (Exception cause) {
    // Not expected in practice; surfaced as an unchecked exception.
    throw new RuntimeException(cause);
  }
}
Aggregations