Use of edu.stanford.nlp.trees.TreeReaderFactory in the CoreNLP project by stanfordnlp:
the method readTrees of the class ThreadedParserSlowITest.
/**
 * Reads all parse trees from a file.
 *
 * @param filename path of the tree file to read
 * @param encoding character encoding of the file
 * @return the trees, in file order
 * @throws RuntimeException wrapping any {@link IOException} (unsupported
 *         encoding, missing file, or read error)
 */
public static List<Tree> readTrees(String filename, String encoding) {
  ArrayList<Tree> trees = new ArrayList<Tree>();
  try {
    TreeReaderFactory trf = new LabeledScoredTreeReaderFactory();
    // try-with-resources: the original never closed the reader, leaking the
    // underlying file handle (TreeReader declares close(); assumed Closeable).
    try (TreeReader tr = trf.newTreeReader(new InputStreamReader(new FileInputStream(filename), encoding))) {
      Tree next;
      while ((next = tr.readTree()) != null) {
        trees.add(next);
      }
    }
    System.out.println("Read " + trees.size() + " trees from " + filename);
    return trees;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
Use of edu.stanford.nlp.trees.TreeReaderFactory in the CoreNLP project by stanfordnlp:
the method main of the class ATBCorrector.
//For those trees that lack a sentence-final punc, add one.
// ("/^[^\\.!\\?]$/ >>- (__ > @ROOT <- __=loc) <: __\n"
// + "insert (PUNC .) $- loc\n"
// + "\n");
/**
 * Reads trees from the file given as the sole command-line argument, applies
 * the ATBCorrector transformation to each tree, and writes the corrected
 * trees to stdout; the tree count goes to stderr.
 *
 * @param args exactly one argument: the path of the tree file
 */
public static void main(String[] args) {
  if (args.length != 1) {
    log.info("Usage: java " + ATBCorrector.class.getName() + " filename\n");
    System.exit(-1);
  }
  TreeTransformer tt = new ATBCorrector();
  File f = new File(args[0]);
  TreeReaderFactory trf = new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory();
  // try-with-resources: the original only closed the reader on the success
  // path, leaking the file handle if readTree() threw mid-stream.
  try (TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8")))) {
    int nTrees = 0;
    for (Tree t; (t = tr.readTree()) != null; nTrees++) {
      Tree fixedT = tt.transformTree(t);
      System.out.println(fixedT.toString());
    }
    System.err.printf("Wrote %d trees%n", nTrees);
  } catch (IOException e) {
    // UnsupportedEncodingException and FileNotFoundException are both
    // subclasses of IOException, so this single catch replaces the three
    // identical handlers in the original.
    e.printStackTrace();
  }
}
Use of edu.stanford.nlp.trees.TreeReaderFactory in the CoreNLP project by stanfordnlp:
the method readTrees of the class SplitCanditoTrees.
/**
 * Reads the trees from each of the given Candito XML files and indexes them
 * by "&lt;file basename without extension&gt;-&lt;sentence id&gt;".
 * Per-file tree counts are reported on stderr.
 *
 * @param filenames paths of the XML tree files to read
 * @return map from tree id to tree
 * @throws IOException if any file cannot be opened or read
 */
static Map<String, Tree> readTrees(String[] filenames) throws IOException {
  // TODO: perhaps we can just pass in CC_TAGSET and get rid of replacePOSTags
  // need to test that
  final TreeReaderFactory trf = new FrenchXMLTreeReaderFactory(false);
  Map<String, Tree> treeMap = Generics.newHashMap();
  for (String filename : filenames) {
    File file = new File(filename);
    String canonicalFilename = file.getName().substring(0, file.getName().lastIndexOf('.'));
    // try-with-resources: the original leaked the reader if readTree() threw.
    // The cast to FrenchXMLTreeReader was dropped — no subclass-specific API
    // is used, so the TreeReader interface suffices.
    try (TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(file), "ISO8859_1")))) {
      int numTrees = 0;
      for (Tree t; (t = tr.readTree()) != null; numTrees++) {
        String id = canonicalFilename + "-" + ((CoreLabel) t.label()).get(CoreAnnotations.SentenceIDAnnotation.class);
        treeMap.put(id, t);
      }
      System.err.printf("%s: %d trees%n", file.getName(), numTrees);
    }
  }
  return treeMap;
}
Use of edu.stanford.nlp.trees.TreeReaderFactory in the CoreNLP project by stanfordnlp:
the method main of the class MultiWordPreprocessor.
/**
 * Entry point: builds a unigram tagger from the trees in the file named by
 * the bare command-line argument, then resolves DUMMY tags in that file and
 * prints fix-up statistics. Recognized options include "ner" (default
 * false), "normalize" (default true), and "help" — presumably declared in
 * argOptionDefs; confirm there.
 *
 * @param args command-line options; the bare argument is the tree file path
 */
public static void main(String[] args) {
  Properties options = StringUtils.argsToProperties(args, argOptionDefs);
  if (!options.containsKey("") || options.containsKey("help")) {
    log.info(usage());
    return;
  }
  boolean retainNER = PropertiesUtils.getBool(options, "ner", false);
  boolean normalize = PropertiesUtils.getBool(options, "normalize", true);
  final File treeFile = new File(options.getProperty(""));
  // The original also declared labelTerm/termLabel/labelPreterm/pretermLabel
  // counters here but never used them; they have been removed.
  TwoDimensionalCounter<String, String> unigramTagger = new TwoDimensionalCounter<>();
  try {
    TreeReaderFactory trf = new SpanishTreeReaderFactory();
    // try-with-resources prevents a leak if readTree() throws, and closes
    // the reader BEFORE resolveDummyTags rereads treeFile below.
    try (TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")))) {
      for (Tree t; (t = tr.readTree()) != null; ) {
        updateTagger(unigramTagger, t);
      }
    }
    System.out.println("Resolving DUMMY tags");
    resolveDummyTags(treeFile, unigramTagger, retainNER, normalize ? new SpanishTreeNormalizer(true, false, false) : null);
    System.out.println("#Unknown Word Types: " + ManualUWModel.nUnknownWordTypes);
    // Guard the percentages: the original printed "NaN%" when the missing
    // count was zero (0.0/0 in double arithmetic).
    double posPct = nMissingPOS == 0 ? 0.0 : (double) nFixedPOS / nMissingPOS * 100;
    double phrasalPct = nMissingPhrasal == 0 ? 0.0 : (double) nFixedPhrasal / nMissingPhrasal * 100;
    System.out.println(String.format("#Missing POS: %d (fixed: %d, %.2f%%)", nMissingPOS, nFixedPOS, posPct));
    System.out.println(String.format("#Missing Phrasal: %d (fixed: %d, %.2f%%)", nMissingPhrasal, nFixedPhrasal, phrasalPct));
    System.out.println("Done!");
  } catch (IOException e) {
    // Also covers UnsupportedEncodingException and FileNotFoundException,
    // which the original caught in separate identical handlers.
    e.printStackTrace();
  }
}
Use of edu.stanford.nlp.trees.TreeReaderFactory in the CoreNLP project by stanfordnlp:
the method main of the class MWEPreprocessor.
/**
 * Entry point: collects multi-word-expression (MWE) statistics from the
 * trees in the given file, writes four CSV co-occurrence tables, and — when
 * RESOLVE_DUMMY_TAGS is set — resolves DUMMY tags in the treebank.
 *
 * @param args exactly one argument: the path of the tree file
 */
public static void main(String[] args) {
  if (args.length != 1) {
    System.err.printf("Usage: java %s file%n", MWEPreprocessor.class.getName());
    System.exit(-1);
  }
  final File treeFile = new File(args[0]);
  TwoDimensionalCounter<String, String> labelTerm = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> termLabel = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> labelPreterm = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> pretermLabel = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> unigramTagger = new TwoDimensionalCounter<>();
  try {
    TreeReaderFactory trf = new FrenchTreeReaderFactory();
    // try-with-resources prevents a leak if readTree() throws, and closes
    // the reader BEFORE resolveDummyTags rereads treeFile below.
    try (TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")))) {
      for (Tree t; (t = tr.readTree()) != null; ) {
        countMWEStatistics(t, unigramTagger, labelPreterm, pretermLabel, labelTerm, termLabel);
      }
    }
    System.out.println("Generating {MWE Type -> Terminal}");
    printCounter(labelTerm, "label_term.csv");
    System.out.println("Generating {Terminal -> MWE Type}");
    printCounter(termLabel, "term_label.csv");
    System.out.println("Generating {MWE Type -> POS sequence}");
    printCounter(labelPreterm, "label_pos.csv");
    System.out.println("Generating {POS sequence -> MWE Type}");
    printCounter(pretermLabel, "pos_label.csv");
    if (RESOLVE_DUMMY_TAGS) {
      System.out.println("Resolving DUMMY tags");
      resolveDummyTags(treeFile, pretermLabel, unigramTagger);
    }
    System.out.println("#Unknown Word Types: " + ManualUWModel.nUnknownWordTypes);
    System.out.println("#Missing POS: " + nMissingPOS);
    System.out.println("#Missing Phrasal: " + nMissingPhrasal);
    System.out.println("Done!");
  } catch (IOException e) {
    // Also covers UnsupportedEncodingException and FileNotFoundException,
    // which the original caught in separate identical handlers.
    e.printStackTrace();
  }
}
Aggregations