Use of edu.stanford.nlp.trees.TreeReaderFactory in the CoreNLP project by stanfordnlp: class AddMorphoAnnotations, method main.
/**
 * Reads one parse tree per line from stdin, annotates each leaf with its
 * morphological analysis and lemma (taken from two companion files), and
 * prints the augmented trees to stdout.
 *
 * @param args options plus two positional arguments: the morphological
 *             analysis file and the lemma file (see {@code usage()})
 */
public static void main(String[] args) {
  if (args.length < minArgs) {
    log.info(usage());
    System.exit(-1);
  }
  Properties options = StringUtils.argsToProperties(args, argSpec());
  String encoding = options.getProperty("e", "UTF-8");
  boolean isMorphTreeFile = PropertiesUtils.getBool(options, "g", false);
  String[] parsedArgs = options.getProperty("").split("\\s+");
  if (parsedArgs.length != 2) {
    log.info(usage());
    System.exit(-1);
  }
  YieldIterator morphIter = new YieldIterator(parsedArgs[0], isMorphTreeFile);
  YieldIterator lemmaIter = new YieldIterator(parsedArgs[1], false);
  final Pattern pParenStripper = Pattern.compile("[\\(\\)]");
  // try-with-resources: the stdin reader is closed on all exit paths
  try (BufferedReader brIn = new BufferedReader(new InputStreamReader(System.in, encoding))) {
    TreeReaderFactory trf = new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory(true);
    int nTrees = 0;
    for (String line; (line = brIn.readLine()) != null; ++nTrees) {
      Tree tree = trf.newTreeReader(new StringReader(line)).readTree();
      List<Tree> leaves = tree.getLeaves();
      if (!morphIter.hasNext()) {
        throw new RuntimeException("Mismatch between number of morpho analyses and number of input lines.");
      }
      List<String> morphTags = morphIter.next();
      if (!lemmaIter.hasNext()) {
        throw new RuntimeException("Mismatch between number of lemmas and number of input lines.");
      }
      List<String> lemmas = lemmaIter.next();
      // Sanity checks: tags, lemmas, and leaves must be aligned token-for-token
      assert morphTags.size() == lemmas.size();
      assert lemmas.size() == leaves.size();
      for (int i = 0; i < leaves.size(); ++i) {
        // replaceAll is a no-op when no parens are present, so the original
        // find() pre-check ran the regex twice for nothing
        String morphTag = pParenStripper.matcher(morphTags.get(i)).replaceAll("");
        String newLeaf = String.format("%s%s%s%s%s", leaves.get(i).value(), MorphoFeatureSpecification.MORPHO_MARK, lemmas.get(i), MorphoFeatureSpecification.LEMMA_MARK, morphTag);
        leaves.get(i).setValue(newLeaf);
      }
      System.out.println(tree.toString());
    }
    // All analyses and lemmas should have been consumed by now
    assert !morphIter.hasNext();
    assert !lemmaIter.hasNext();
    System.err.printf("Processed %d trees%n", nTrees);
  } catch (IOException e) {
    // UnsupportedEncodingException and FileNotFoundException are IOExceptions;
    // one handler replaces the original three identical catch blocks
    e.printStackTrace();
  }
}
Use of edu.stanford.nlp.trees.TreeReaderFactory in the CoreNLP project by stanfordnlp: class FTBCorrector, method main.
/**
 * Reads FTB trees from a file, discards two known-degenerate tree shapes from
 * the Candito training set, applies the FTBCorrector transformation to the
 * rest, and prints the corrected trees to stdout.
 *
 * @param args args[0] is the path to the treebank file (UTF-8)
 */
public static void main(String[] args) {
  if (args.length != 1) {
    log.info("Usage: java " + FTBCorrector.class.getName() + " filename\n");
    System.exit(-1);
  }
  TreeTransformer tt = new FTBCorrector();
  File f = new File(args[0]);
  try {
    // These bad trees in the Candito training set should be thrown out:
    //   (ROOT (SENT (" ") (. .)))
    //   (ROOT (SENT (. .)))
    TregexPattern pBadTree = TregexPattern.compile("@SENT <: @PUNC");
    TregexPattern pBadTree2 = TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __");
    TreeReaderFactory trf = new FrenchTreeReaderFactory();
    int nTrees = 0;
    // try-with-resources: reader and TreeReader are closed even if readTree throws
    try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
         TreeReader tr = trf.newTreeReader(br)) {
      for (Tree t; (t = tr.readTree()) != null; nTrees++) {
        TregexMatcher m = pBadTree.matcher(t);
        TregexMatcher m2 = pBadTree2.matcher(t);
        if (m.find() || m2.find()) {
          log.info("Discarding tree: " + t.toString());
        } else {
          Tree fixedT = tt.transformTree(t);
          System.out.println(fixedT.toString());
        }
      }
    }
    System.err.printf("Wrote %d trees%n", nTrees);
  } catch (IOException | TregexParseException e) {
    // IOException subsumes the original UnsupportedEncoding/FileNotFound catches
    e.printStackTrace();
  }
}
Use of edu.stanford.nlp.trees.TreeReaderFactory in the CoreNLP project by stanfordnlp: class MWEPreprocessor, method resolveDummyTags.
/**
 * Rewrites every tree in {@code treeFile}, repairing dummy POS tags via
 * {@code traverseAndFix}, and writes the result to {@code treeFile + ".fixed"}.
 *
 * @param treeFile      input treebank file, read as UTF-8
 * @param pretermLabel  preterminal-label counts consulted by traverseAndFix
 * @param unigramTagger unigram tag counts consulted by traverseAndFix
 */
private static void resolveDummyTags(File treeFile, TwoDimensionalCounter<String, String> pretermLabel, TwoDimensionalCounter<String, String> unigramTagger) {
  TreeReaderFactory trf = new FrenchTreeReaderFactory();
  // try-with-resources: reader, TreeReader, and writer are closed on all paths
  // (the originals leaked if readTree or println threw)
  try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
       TreeReader tr = trf.newTreeReader(br);
       PrintWriter pw = new PrintWriter(new PrintStream(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"))) {
    int nTrees = 0;
    for (Tree t; (t = tr.readTree()) != null; nTrees++) {
      traverseAndFix(t, pretermLabel, unigramTagger);
      pw.println(t.toString());
    }
    System.out.println("Processed " + nTrees + " trees");
  } catch (IOException e) {
    // IOException covers the UnsupportedEncoding/FileNotFound cases as well
    e.printStackTrace();
  }
}
Use of edu.stanford.nlp.trees.TreeReaderFactory in the CoreNLP project by stanfordnlp: class MultiWordPreprocessor, method resolveDummyTags.
/**
 * Rewrites every tree in {@code treeFile}: repairs dummy POS tags via
 * {@code traverseAndFix}, expands multiword-token phrases, optionally
 * re-normalizes each tree, and writes the result to {@code treeFile + ".fixed"}.
 *
 * @param treeFile      input treebank file, read as UTF-8
 * @param unigramTagger unigram tag counts consulted by traverseAndFix
 * @param retainNER     whether traverseAndFix keeps NER annotations
 * @param tn            normalizer applied after expansion; may be null to skip
 */
private static void resolveDummyTags(File treeFile, TwoDimensionalCounter<String, String> unigramTagger, boolean retainNER, TreeNormalizer tn) {
  TreeFactory tf = new LabeledScoredTreeFactory();
  MultiWordTreeExpander expander = new MultiWordTreeExpander();
  TreeReaderFactory trf = new SpanishTreeReaderFactory();
  // try-with-resources: reader, TreeReader, and writer are closed on all paths
  // (the originals leaked if readTree or any transform threw)
  try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
       TreeReader tr = trf.newTreeReader(br);
       PrintWriter pw = new PrintWriter(new PrintStream(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"))) {
    int nTrees = 0;
    for (Tree t; (t = tr.readTree()) != null; nTrees++) {
      traverseAndFix(t, null, unigramTagger, retainNER);
      // Now "decompress" further the expanded trees formed by
      // multiword token splitting
      t = expander.expandPhrases(t, tn, tf);
      if (tn != null) {
        t = tn.normalizeWholeTree(t, tf);
      }
      pw.println(t.toString());
    }
    System.out.println("Processed " + nTrees + " trees");
  } catch (IOException e) {
    // IOException covers the UnsupportedEncoding/FileNotFound cases as well
    e.printStackTrace();
  }
}
Use of edu.stanford.nlp.trees.TreeReaderFactory in the CoreNLP project by stanfordnlp: class TaggedFileRecord, method createRecord.
/**
 * Parses a comma-separated file description of the form
 * {@code key=value,...,filename} into a TaggedFileRecord. A description with
 * no commas is treated as a bare filename in TEXT format with defaults taken
 * from {@code config}.
 *
 * @param config      supplies default encoding and tag separator
 * @param description the file description to parse
 * @return the populated record
 * @throws IllegalArgumentException if an option is malformed or unknown
 */
public static TaggedFileRecord createRecord(Properties config, String description) {
  String[] pieces = description.split(",");
  if (pieces.length == 1) {
    // No options present: plain TEXT file with configured defaults
    return new TaggedFileRecord(description, Format.TEXT, getEncoding(config), getTagSeparator(config), null, null, null, null, null, null, null);
  }
  String[] args = new String[pieces.length - 1];
  System.arraycopy(pieces, 0, args, 0, pieces.length - 1);
  // The last comma-separated piece is the filename; the rest are key=value options
  String file = pieces[pieces.length - 1];
  Format format = Format.TEXT;
  String encoding = getEncoding(config);
  String tagSeparator = getTagSeparator(config);
  TreeTransformer treeTransformer = null;
  TreeNormalizer treeNormalizer = null;
  TreeReaderFactory trf = null;
  NumberRangesFileFilter treeRange = null;
  Predicate<Tree> treeFilter = null;
  Integer wordColumn = null, tagColumn = null;
  for (String arg : args) {
    String[] argPieces = arg.split("=", 2);
    if (argPieces.length != 2) {
      throw new IllegalArgumentException("TaggedFileRecord argument " + arg + " has an unexpected number of =s");
    }
    if (argPieces[0].equalsIgnoreCase(FORMAT)) {
      format = Format.valueOf(argPieces[1]);
    } else if (argPieces[0].equalsIgnoreCase(ENCODING)) {
      encoding = argPieces[1];
    } else if (argPieces[0].equalsIgnoreCase(TAG_SEPARATOR)) {
      tagSeparator = argPieces[1];
    } else if (argPieces[0].equalsIgnoreCase(TREE_TRANSFORMER)) {
      treeTransformer = ReflectionLoading.loadByReflection(argPieces[1]);
    } else if (argPieces[0].equalsIgnoreCase(TREE_NORMALIZER)) {
      treeNormalizer = ReflectionLoading.loadByReflection(argPieces[1]);
    } else if (argPieces[0].equalsIgnoreCase(TREE_READER)) {
      trf = ReflectionLoading.loadByReflection(argPieces[1]);
    } else if (argPieces[0].equalsIgnoreCase(TREE_RANGE)) {
      // Colons stand in for commas inside a range value (commas delimit the
      // description itself); replace(char, char) avoids the needless regex
      // engine invocation of replaceAll for a literal character swap
      String range = argPieces[1].replace(':', ',');
      treeRange = new NumberRangesFileFilter(range, true);
    } else if (argPieces[0].equalsIgnoreCase(TREE_FILTER)) {
      treeFilter = ReflectionLoading.loadByReflection(argPieces[1]);
    } else if (argPieces[0].equalsIgnoreCase(WORD_COLUMN)) {
      wordColumn = Integer.valueOf(argPieces[1]);
    } else if (argPieces[0].equalsIgnoreCase(TAG_COLUMN)) {
      tagColumn = Integer.valueOf(argPieces[1]);
    } else {
      throw new IllegalArgumentException("TaggedFileRecord argument " + argPieces[0] + " is unknown");
    }
  }
  return new TaggedFileRecord(file, format, encoding, tagSeparator, treeTransformer, treeNormalizer, trf, treeRange, treeFilter, wordColumn, tagColumn);
}
Aggregations