Use of edu.stanford.nlp.trees.international.french.FrenchXMLTreeReaderFactory in project CoreNLP by stanfordnlp.
Class SplitCanditoTrees, method readTrees.
/**
 * Reads every tree from the given files into a single map.
 *
 * <p>Each tree is stored under the key {@code <base name>-<sentence id>}, where the base name
 * is the file's name with everything from its last {@code '.'} removed (or the whole name when
 * it contains no {@code '.'}) and the sentence id is the {@code SentenceIDAnnotation} value on
 * the tree's label. If two trees produce the same key, the later one replaces the earlier one.
 * A per-file tree count is printed to {@code System.err}.
 *
 * @param filenames paths of the files to read; each is decoded with the "ISO8859_1" charset
 * @return map from generated tree id to the tree read
 * @throws IOException if a file cannot be opened, read, or closed
 */
static Map<String, Tree> readTrees(String[] filenames) throws IOException {
  // TODO: perhaps we can just pass in CC_TAGSET and get rid of replacePOSTags
  // need to test that
  final TreeReaderFactory trf = new FrenchXMLTreeReaderFactory(false);
  Map<String, Tree> treeMap = Generics.newHashMap();
  for (String filename : filenames) {
    File file = new File(filename);
    String name = file.getName();
    int dot = name.lastIndexOf('.');
    // A name without an extension is used as-is; substring(0, -1) would throw.
    String canonicalFilename = (dot < 0) ? name : name.substring(0, dot);
    int numTrees = 0;
    // try-with-resources guarantees the reader is closed even when readTree() throws.
    try (FrenchXMLTreeReader tr = (FrenchXMLTreeReader) trf.newTreeReader(
        new BufferedReader(new InputStreamReader(new FileInputStream(file), "ISO8859_1")))) {
      Tree t;
      while ((t = tr.readTree()) != null) {
        String id = canonicalFilename + "-" + ((CoreLabel) t.label()).get(CoreAnnotations.SentenceIDAnnotation.class);
        treeMap.put(id, t);
        numTrees++;
      }
    }
    System.err.printf("%s: %d trees%n", file.getName(), numTrees);
  }
  return treeMap;
}
Use of edu.stanford.nlp.trees.international.french.FrenchXMLTreeReaderFactory in project CoreNLP by stanfordnlp.
Class FTBDataset, method setOptions.
/**
 * Applies the superclass options, then configures this dataset from {@code opts}.
 *
 * <p>In order: loads the split set when {@code ConfigParser.paramSplit} is present, reads the
 * {@code ConfigParser.paramCCTagset} flag (default {@code false}), builds a fresh
 * {@link MemoryTreebank} backed by a {@link FrenchXMLTreeReaderFactory} configured with that
 * flag, and creates default lexical / POS mappers if none have been set yet.
 *
 * @param opts the configuration properties for this dataset
 * @return whatever {@code super.setOptions(opts)} returned
 */
@Override
public boolean setOptions(Properties opts) {
  final boolean superResult = super.setOptions(opts);

  if (opts.containsKey(ConfigParser.paramSplit)) {
    splitSet = makeSplitSet(opts.getProperty(ConfigParser.paramSplit));
  }

  CC_TAGSET = PropertiesUtils.getBool(opts, ConfigParser.paramCCTagset, false);
  final FrenchXMLTreeReaderFactory readerFactory = new FrenchXMLTreeReaderFactory(CC_TAGSET);
  treebank = new MemoryTreebank(readerFactory, FrenchTreebankLanguagePack.FTB_ENCODING);

  // Only install a default lexical mapper when none is already present.
  if (lexMapper == null) {
    lexMapper = new DefaultMapper();
    String[] lexOptions = lexMapOptions.split(",");
    lexMapper.setup(null, lexOptions);
  }

  // The POS mapper is only touched when at least one mapping file was supplied.
  if (!pathsToMappings.isEmpty()) {
    if (posMapper == null) {
      posMapper = new DefaultMapper();
    }
    for (File mappingFile : pathsToMappings) {
      posMapper.setup(mappingFile);
    }
  }

  return superResult;
}
Aggregations