Use of edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory in project CoreNLP by stanfordnlp.
The class Preferences, method lookupTreeReaderFactory.
static TreeReaderFactory lookupTreeReaderFactory(String trfName) {
  if (trfName.equalsIgnoreCase("ArabicTreeReaderFactory")) {
    return new ArabicTreeReaderFactory();
  } else if (trfName.equalsIgnoreCase("ArabicTreeReaderFactory.ArabicRawTreeReaderFactory")) {
    return new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory();
  } else if (trfName.equalsIgnoreCase("CTBTreeReaderFactory")) {
    return new CTBTreeReaderFactory();
  } else if (trfName.equalsIgnoreCase("NoEmptiesCTBTreeReaderFactory")) {
    return new NoEmptiesCTBTreeReaderFactory();
  } else if (trfName.equalsIgnoreCase("Basic categories only (LabeledScoredTreeReaderFactory)")) {
    return new LabeledScoredTreeReaderFactory();
  } else if (trfName.equalsIgnoreCase("FrenchTreeReaderFactory")) {
    // PTB format
    return new FrenchTreeReaderFactory();
  } else if (trfName.equalsIgnoreCase("PennTreeReaderFactory")) {
    return new PennTreeReaderFactory();
  } else if (trfName.equalsIgnoreCase("StringLabeledScoredTreeReaderFactory")) {
    return new StringLabeledScoredTreeReaderFactory();
  } else if (trfName.equalsIgnoreCase("TregexTreeReaderFactory")) {
    return new TregexPattern.TRegexTreeReaderFactory();
  } else {
    // try to find the class
    try {
      Class<?> trfClass = Class.forName(trfName);
      TreeReaderFactory trf = (TreeReaderFactory) trfClass.newInstance();
      return trf;
    } catch (Exception e) {
      return new PennTreeReaderFactory();
    }
  }
}
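For the name "FrenchTreeReaderFactory" the lookup above returns a plain FrenchTreeReaderFactory; the "PTB format" comment refers to the fact that this factory reads French Treebank trees in Penn-Treebank-style bracketing. A minimal sketch of using the factory directly, relying only on the standard TreeReaderFactory/TreeReader API shown in the snippets below; the bracketed sentence is invented for illustration and is not data from the project.

import java.io.IOException;
import java.io.StringReader;

import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.trees.TreeReaderFactory;
import edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory;

public class FrenchTreeReaderSketch {
  public static void main(String[] args) throws IOException {
    // Invented FTB-style bracketing; real input would come from a treebank file.
    String bracketed = "( (SENT (NP (DET Le) (NC chat)) (VN (V dort))))";
    TreeReaderFactory trf = new FrenchTreeReaderFactory();
    TreeReader tr = trf.newTreeReader(new StringReader(bracketed));
    Tree t = tr.readTree();
    System.out.println(t.pennString());
    tr.close(); // also closes the underlying StringReader
  }
}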
Use of edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory in project CoreNLP by stanfordnlp.
The class MWEPreprocessor, method main.
/**
 *
 * @param args
 */
public static void main(String[] args) {
  if (args.length != 1) {
    System.err.printf("Usage: java %s file%n", MWEPreprocessor.class.getName());
    System.exit(-1);
  }
  final File treeFile = new File(args[0]);
  TwoDimensionalCounter<String, String> labelTerm = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> termLabel = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> labelPreterm = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> pretermLabel = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> unigramTagger = new TwoDimensionalCounter<>();
  try {
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
    TreeReaderFactory trf = new FrenchTreeReaderFactory();
    TreeReader tr = trf.newTreeReader(br);
    for (Tree t; (t = tr.readTree()) != null; ) {
      countMWEStatistics(t, unigramTagger, labelPreterm, pretermLabel, labelTerm, termLabel);
    }
    // Closes the underlying reader
    tr.close();
    System.out.println("Generating {MWE Type -> Terminal}");
    printCounter(labelTerm, "label_term.csv");
    System.out.println("Generating {Terminal -> MWE Type}");
    printCounter(termLabel, "term_label.csv");
    System.out.println("Generating {MWE Type -> POS sequence}");
    printCounter(labelPreterm, "label_pos.csv");
    System.out.println("Generating {POS sequence -> MWE Type}");
    printCounter(pretermLabel, "pos_label.csv");
    if (RESOLVE_DUMMY_TAGS) {
      System.out.println("Resolving DUMMY tags");
      resolveDummyTags(treeFile, pretermLabel, unigramTagger);
    }
    System.out.println("#Unknown Word Types: " + ManualUWModel.nUnknownWordTypes);
    System.out.println("#Missing POS: " + nMissingPOS);
    System.out.println("#Missing Phrasal: " + nMissingPhrasal);
    System.out.println("Done!");
  } catch (UnsupportedEncodingException e) {
    e.printStackTrace();
  } catch (FileNotFoundException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
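A note on resource handling in this main method (and the similar ones below): the BufferedReader is closed only through tr.close() on the success path, so an exception thrown from readTree() would leak the file handle. Below is a self-contained sketch of the same reading loop using try-with-resources; the class and helper names are invented, and this is an alternative idiom rather than the project's own code.

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory;

public class ReadFrenchTrees {
  // Hypothetical helper: reads every tree in a PTB-format French Treebank file.
  static List<Tree> readTrees(String path) throws IOException {
    List<Tree> trees = new ArrayList<>();
    try (BufferedReader br = new BufferedReader(
             new InputStreamReader(new FileInputStream(path), "UTF-8"))) {
      TreeReader tr = new FrenchTreeReaderFactory().newTreeReader(br);
      for (Tree t; (t = tr.readTree()) != null; ) {
        trees.add(t);
      }
    } // the underlying reader is closed here even if readTree() throws
    return trees;
  }
}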
Use of edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory in project CoreNLP by stanfordnlp.
The class MWEFrequencyDist, method main.
public static void main(String[] args) {
  if (args.length != 1) {
    System.err.printf("Usage: java %s file%n", MWEFrequencyDist.class.getName());
    System.exit(-1);
  }
  final File treeFile = new File(args[0]);
  TwoDimensionalCounter<String, String> mweLabelToString = new TwoDimensionalCounter<>();
  Set<String> uniquePOSSequences = Generics.newHashSet();
  try {
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
    TreeReaderFactory trf = new FrenchTreeReaderFactory();
    TreeReader tr = trf.newTreeReader(br);
    final TregexPattern pMWE = TregexPattern.compile("/^MW/");
    for (Tree t; (t = tr.readTree()) != null; ) {
      // Count MWE statistics
      TregexMatcher m = pMWE.matcher(t);
      while (m.findNextMatchingNode()) {
        Tree match = m.getMatch();
        String label = match.value();
        List<CoreLabel> yield = match.taggedLabeledYield();
        StringBuilder termYield = new StringBuilder();
        StringBuilder posYield = new StringBuilder();
        for (CoreLabel cl : yield) {
          termYield.append(cl.word()).append(" ");
          posYield.append(cl.tag()).append(" ");
        }
        mweLabelToString.incrementCount(label, termYield.toString().trim());
        uniquePOSSequences.add(posYield.toString().trim());
      }
    }
    // Closes the underlying reader
    tr.close();
    System.out.printf("Type\t#Type\t#Single\t%%Single\t%%Total%n");
    double nMWEs = mweLabelToString.totalCount();
    int nAllSingletons = 0;
    int nTokens = 0;
    for (String mweLabel : mweLabelToString.firstKeySet()) {
      int nSingletons = 0;
      double totalCount = mweLabelToString.totalCount(mweLabel);
      Counter<String> mc = mweLabelToString.getCounter(mweLabel);
      for (String term : mc.keySet()) {
        if (mc.getCount(term) == 1.0)
          nSingletons++;
        nTokens += term.split("\\s+").length * (int) mc.getCount(term);
      }
      nAllSingletons += nSingletons;
      System.out.printf("%s\t%d\t%d\t%.2f\t%.2f%n", mweLabel, (int) totalCount, nSingletons, 100.0 * nSingletons / totalCount, 100.0 * totalCount / nMWEs);
    }
    System.out.printf("TOTAL:\t%d\t%d\t%.2f%n", (int) nMWEs, nAllSingletons, 100.0 * nAllSingletons / nMWEs);
    System.out.println("#tokens = " + nTokens);
    System.out.println("#unique MWE POS sequences = " + uniquePOSSequences.size());
  } catch (UnsupportedEncodingException e) {
    e.printStackTrace();
  } catch (FileNotFoundException e) {
    e.printStackTrace();
  } catch (TregexParseException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
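The Tregex pattern /^MW/ compiled above matches any node whose label begins with MW, which in the French Treebank is the prefix of the multi-word expression categories (MWN, MWADV, MWP, and so on). A standalone sketch of the same matching loop on an invented toy tree; only the Tregex calls already used above are assumed.

import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;

public class MWEPatternSketch {
  public static void main(String[] args) {
    TregexPattern pMWE = TregexPattern.compile("/^MW/");
    // Invented toy tree; real input would come from a FrenchTreeReaderFactory reader.
    Tree t = Tree.valueOf("(SENT (MWADV (P en) (NC effet)) (NP (DET le) (NC chat)) (VN (V dort)))");
    TregexMatcher m = pMWE.matcher(t);
    while (m.findNextMatchingNode()) {
      Tree match = m.getMatch();
      // Prints e.g. "MWADV -> [en, effet]"
      System.out.println(match.value() + " -> " + match.yield());
    }
  }
}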
Use of edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory in project CoreNLP by stanfordnlp.
The class MungeTreesWithMorfetteAnalyses, method main.
/**
 * @param args
 */
public static void main(String[] args) {
  if (args.length != 2) {
    System.err.printf("Usage: java %s tree_file morfette_tnt_file%n", MungeTreesWithMorfetteAnalyses.class.getName());
    System.exit(-1);
  }
  String treeFile = args[0];
  String morfetteFile = args[1];
  TreeReaderFactory trf = new FrenchTreeReaderFactory();
  try {
    TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
    Iterator<List<CoreLabel>> morfetteItr = new MorfetteFileIterator(morfetteFile);
    for (Tree tree; (tree = tr.readTree()) != null && morfetteItr.hasNext(); ) {
      List<CoreLabel> analysis = morfetteItr.next();
      List<Label> yield = tree.yield();
      assert analysis.size() == yield.size();
      int yieldLen = yield.size();
      for (int i = 0; i < yieldLen; ++i) {
        CoreLabel tokenAnalysis = analysis.get(i);
        Label token = yield.get(i);
        String lemma = getLemma(token.value(), tokenAnalysis.lemma());
        String newLeaf = String.format("%s%s%s%s%s", token.value(), MorphoFeatureSpecification.MORPHO_MARK, lemma, MorphoFeatureSpecification.LEMMA_MARK, tokenAnalysis.tag());
        ((CoreLabel) token).setValue(newLeaf);
      }
      System.out.println(tree.toString());
    }
    if (tr.readTree() != null || morfetteItr.hasNext()) {
      log.info("WARNING: Uneven input files!");
    }
    tr.close();
  } catch (UnsupportedEncodingException e) {
    e.printStackTrace();
  } catch (FileNotFoundException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
Use of edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory in project CoreNLP by stanfordnlp.
The class TreeToMorfette, method main.
/**
 * @param args
 */
public static void main(String[] args) {
  if (args.length != 1) {
    System.err.printf("Usage: java %s tree_file%n", TreeToMorfette.class.getName());
    System.exit(-1);
  }
  String treeFile = args[0];
  TreeReaderFactory trf = new FrenchTreeReaderFactory();
  try {
    TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
    for (Tree tree1; (tree1 = tr.readTree()) != null; ) {
      List<Label> pretermYield = tree1.preTerminalYield();
      List<Label> yield = tree1.yield();
      int yieldLen = yield.size();
      for (int i = 0; i < yieldLen; ++i) {
        CoreLabel rawToken = (CoreLabel) yield.get(i);
        String word = rawToken.value();
        String morphStr = rawToken.originalText();
        Pair<String, String> lemmaMorph = MorphoFeatureSpecification.splitMorphString(word, morphStr);
        String lemma = lemmaMorph.first();
        String morph = lemmaMorph.second();
        if (morph == null || morph.equals("") || morph.equals("XXX")) {
          morph = ((CoreLabel) pretermYield.get(i)).value();
        }
        System.out.printf("%s %s %s%n", word, lemma, morph);
      }
      System.out.println();
    }
    tr.close();
  } catch (UnsupportedEncodingException e) {
    e.printStackTrace();
  } catch (FileNotFoundException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
}