Use of edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory in project CoreNLP by stanfordnlp.
From the class MultiWordPreprocessor, method main.
/**
 * Reads the given Spanish treebank file, collects unigram tag statistics from it,
 * and then rewrites trees whose multiword tokens carry placeholder (DUMMY) tags.
 *
 * @param args command-line options; run with -help for usage
 */
public static void main(String[] args) {
  Properties options = StringUtils.argsToProperties(args, argOptionDefs);
  if (!options.containsKey("") || options.containsKey("help")) {
    log.info(usage());
    return;
  }

  boolean retainNER = PropertiesUtils.getBool(options, "ner", false);
  boolean normalize = PropertiesUtils.getBool(options, "normalize", true);

  final File treeFile = new File(options.getProperty(""));
  TwoDimensionalCounter<String, String> labelTerm = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> termLabel = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> labelPreterm = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> pretermLabel = new TwoDimensionalCounter<>();
  TwoDimensionalCounter<String, String> unigramTagger = new TwoDimensionalCounter<>();

  try {
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
    TreeReaderFactory trf = new SpanishTreeReaderFactory();
    TreeReader tr = trf.newTreeReader(br);

    for (Tree t; (t = tr.readTree()) != null; ) {
      updateTagger(unigramTagger, t);
    }
    // Closes the underlying reader
    tr.close();

    System.out.println("Resolving DUMMY tags");
    resolveDummyTags(treeFile, unigramTagger, retainNER,
        normalize ? new SpanishTreeNormalizer(true, false, false) : null);

    System.out.println("#Unknown Word Types: " + ManualUWModel.nUnknownWordTypes);
    System.out.println(String.format("#Missing POS: %d (fixed: %d, %.2f%%)",
        nMissingPOS, nFixedPOS, (double) nFixedPOS / nMissingPOS * 100));
    System.out.println(String.format("#Missing Phrasal: %d (fixed: %d, %.2f%%)",
        nMissingPhrasal, nFixedPhrasal, (double) nFixedPhrasal / nMissingPhrasal * 100));

    System.out.println("Done!");
  } catch (UnsupportedEncodingException e) {
    e.printStackTrace();
  } catch (FileNotFoundException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
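The read loop above is the same SpanishTreeReaderFactory idiom used in every snippet on this page. Below is a minimal, self-contained sketch of just that idiom; the class name ReadSpanishTrees and the file name sample.trees are illustrative and not part of the original code.

import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.trees.TreeReaderFactory;
import edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

public class ReadSpanishTrees {
  public static void main(String[] args) throws IOException {
    // Build a reader for an AnCora-style Spanish treebank file (hypothetical name)
    TreeReaderFactory trf = new SpanishTreeReaderFactory();
    TreeReader tr = trf.newTreeReader(new BufferedReader(
        new InputStreamReader(new FileInputStream("sample.trees"), "UTF-8")));
    int nTrees = 0;
    for (Tree t; (t = tr.readTree()) != null; ) {
      nTrees++;  // a real preprocessor would inspect or rewrite t here
    }
    tr.close();  // also closes the underlying reader
    System.out.println("Read " + nTrees + " trees");
  }
}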
Use of edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory in project CoreNLP by stanfordnlp.
From the class TreeToTSV, method main.
public static void main(String[] args) {
  if (args.length < 1) {
    System.err.printf("Usage: java %s tree_file%n", TreeToTSV.class.getName());
    System.exit(-1);
  }
  String treeFile = args[0];

  try {
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
    TreeReaderFactory trf = new SpanishTreeReaderFactory();
    TreeReader tr = trf.newTreeReader(br);

    StringBuilder sb = new StringBuilder();
    String nl = System.getProperty("line.separator");

    Pattern nePattern = Pattern.compile("^grup\\.nom\\.");
    Pattern npPattern = Pattern.compile("^np0000.$");

    for (Tree tree; (tree = tr.readTree()) != null; ) {
      for (Tree t : tree) {
        if (!t.isPreTerminal())
          continue;

        char type = 'O';
        Tree grandma = t.ancestor(1, tree);
        String grandmaValue = ((CoreLabel) grandma.label()).value();

        // Take the named-entity type from a parent constituent labeled grup.nom.<x>
        if (nePattern.matcher(grandmaValue).find()) {
          type = grandmaValue.charAt(9);
        } else {
          // Otherwise check whether the POS tag itself is of the form np0000<x>
          String pos = ((CoreLabel) t.label()).value();
          if (npPattern.matcher(pos).find())
            type = pos.charAt(6);
        }

        Tree wordNode = t.firstChild();
        String word = ((CoreLabel) wordNode.label()).value();
        sb.append(word).append("\t");

        switch (type) {
          case 'p':
            sb.append("PERS");
            break;
          case 'l':
            sb.append("LUG");
            break;
          case 'o':
            sb.append("ORG");
            break;
          case '0':
            sb.append("OTROS");
            break;
          default:
            sb.append("O");
        }
        sb.append(nl);
      }
      sb.append(nl);
    }
    System.out.print(sb.toString());
    tr.close();
  } catch (UnsupportedEncodingException e) {
    e.printStackTrace();
  } catch (FileNotFoundException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
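The label written after each token comes down to a single character taken either from the parent constituent label (grup.nom.<x>) or from the POS tag (np0000<x>). A hypothetical helper that isolates that mapping might look like the sketch below; the names NeLabels and neLabel are illustrative and not part of TreeToTSV.

public class NeLabels {
  // Map an AnCora named-entity type character to the TSV label used above.
  static String neLabel(char type) {
    switch (type) {
      case 'p': return "PERS";   // person
      case 'l': return "LUG";    // location ("lugar")
      case 'o': return "ORG";    // organization
      case '0': return "OTROS";  // other named entity
      default:  return "O";      // not a named entity
    }
  }

  public static void main(String[] args) {
    // e.g. the 10th character of "grup.nom.p" or the 7th of "np0000l"
    System.out.println(neLabel("grup.nom.p".charAt(9)));  // PERS
    System.out.println(neLabel("np0000l".charAt(6)));     // LUG
  }
}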
Use of edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory in project CoreNLP by stanfordnlp.
From the class MultiWordPreprocessor, method resolveDummyTags.
private static void resolveDummyTags(File treeFile, TwoDimensionalCounter<String, String> unigramTagger, boolean retainNER, TreeNormalizer tn) {
  TreeFactory tf = new LabeledScoredTreeFactory();
  MultiWordTreeExpander expander = new MultiWordTreeExpander();

  try {
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
    TreeReaderFactory trf = new SpanishTreeReaderFactory();
    TreeReader tr = trf.newTreeReader(br);

    PrintWriter pw = new PrintWriter(new PrintStream(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"));

    int nTrees = 0;
    for (Tree t; (t = tr.readTree()) != null; nTrees++) {
      traverseAndFix(t, null, unigramTagger, retainNER);

      // Now "decompress" further the expanded trees formed by multiword token splitting
      t = expander.expandPhrases(t, tn, tf);

      if (tn != null)
        t = tn.normalizeWholeTree(t, tf);

      pw.println(t.toString());
    }

    pw.close();
    tr.close();

    System.out.println("Processed " + nTrees + " trees");
  } catch (UnsupportedEncodingException e) {
    e.printStackTrace();
  } catch (FileNotFoundException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
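Stripped of the unigram tagger and the MultiWordTreeExpander step, resolveDummyTags is a read–normalize–write round trip. The sketch below shows just that round trip under the assumption that the three-boolean SpanishTreeNormalizer constructor used in main applies; the class name RewriteSpanishTrees is illustrative, and this is a simplified example rather than the method's full behavior.

import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.TreeNormalizer;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.trees.international.spanish.SpanishTreeNormalizer;
import edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.PrintWriter;

public class RewriteSpanishTrees {
  public static void main(String[] args) throws IOException {
    File treeFile = new File(args[0]);
    TreeFactory tf = new LabeledScoredTreeFactory();
    TreeNormalizer tn = new SpanishTreeNormalizer(true, false, false);

    TreeReader tr = new SpanishTreeReaderFactory().newTreeReader(new BufferedReader(
        new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
    // Write normalized trees next to the input, mirroring the ".fixed" convention above
    PrintWriter pw = new PrintWriter(new PrintStream(
        new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"));

    int nTrees = 0;
    for (Tree t; (t = tr.readTree()) != null; nTrees++) {
      t = tn.normalizeWholeTree(t, tf);
      pw.println(t.toString());
    }
    pw.close();
    tr.close();
    System.out.println("Processed " + nTrees + " trees");
  }
}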