Use of edu.stanford.nlp.trees.PennTreeReader in project CoreNLP by stanfordnlp: the class CreateClauseDataset, method processDirectory.
/**
 * Process all the trees in the given directory. For example, the WSJ section of the Penn Treebank.
 *
 * @param name The name of the directory we are processing.
 * @param directory The directory we are processing.
 * @return A dataset of subject/object pairs in the trees in the directory.
 * This is a list of sentences, such that each sentence has a collection of pairs of spans.
 * Each pair of spans is a subject/object span pair that constitutes a valid extraction.
 * @throws IOException If a treebank file cannot be opened or read.
 */
private static List<Pair<CoreMap, Collection<Pair<Span, Span>>>> processDirectory(String name, File directory) throws IOException {
  forceTrack("Processing " + name);
  // Prepare the files to iterate over
  Iterable<File> files = IOUtils.iterFilesRecursive(directory, "mrg");
  int numTreesProcessed = 0;
  List<Pair<CoreMap, Collection<Pair<Span, Span>>>> trainingData = new ArrayList<>(1024);
  // Iterate over the files
  for (File file : files) {
    // log(file);
    TreeReader reader = new PennTreeReader(IOUtils.readerFromFile(file));
    try {
      Tree tree;
      while ((tree = reader.readTree()) != null) {
        try {
          // Prepare the tree: index tokens and compute constituent spans
          tree.indexSpans();
          tree.setSpans();
          // Get relevant information from sentence
          List<CoreLabel> tokens = tree.getLeaves().stream().map(leaf -> (CoreLabel) leaf.label()).collect(Collectors.toList());
          SemanticGraph graph = parse(tree);
          Map<Integer, Span> targets = findTraceTargets(tree);
          Map<Integer, Integer> sources = findTraceSources(tree);
          // Create a sentence object. Plain set() calls instead of the previous
          // double-brace (anonymous subclass) initialization, which needlessly
          // creates a class per use and pins the enclosing scope.
          CoreMap sentence = new ArrayCoreMap(4);
          sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
          sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, graph);
          sentence.set(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class, graph);
          sentence.set(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class, graph);
          natlog.doOneSentence(null, sentence);
          // Generate training data
          Collection<Pair<Span, Span>> trainingDataFromSentence = subjectObjectPairs(graph, tokens, targets, sources);
          trainingData.add(Pair.makePair(sentence, trainingDataFromSentence));
          // Debug print every 100 trees
          numTreesProcessed += 1;
          if (numTreesProcessed % 100 == 0) {
            log("[" + new DecimalFormat("00000").format(numTreesProcessed) + "] " + countDatums(trainingData) + " known extractions");
          }
        } catch (Throwable t) {
          // Best-effort: one malformed tree should not abort the whole directory
          t.printStackTrace();
        }
      }
    } finally {
      // Fix: the reader was previously never closed, leaking one file handle per .mrg file
      reader.close();
    }
  }
  // End
  log("" + numTreesProcessed + " trees processed yielding " + countDatums(trainingData) + " known extractions");
  endTrack("Processing " + name);
  return trainingData;
}
Use of edu.stanford.nlp.trees.PennTreeReader in project CoreNLP by stanfordnlp: the class MetaClass, method cast.
/**
 * Cast a String representation of an object into that object.
 * E.g. "5.4" will be cast to a Double; "[1,2,3]" will be cast
 * to an Integer[].
 *
 * NOTE: Date parses from a Long
 *
 * Branches are tried in order, so the most specific types must come
 * before the more general ones (e.g. PrintStream before OutputStream).
 *
 * @param <E> The type of the object returned (same as type)
 * @param value The string representation of the object
 * @param type The type (usually class) to be returned (same as E)
 * @return An object corresponding to the String value passed
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public static <E> E cast(String value, Type type) {
// --Get Type
// Reduce the reflective Type to a raw Class we can dispatch on.
Class<?> clazz;
if (type instanceof Class) {
clazz = (Class<?>) type;
} else if (type instanceof ParameterizedType) {
ParameterizedType pt = (ParameterizedType) type;
clazz = (Class<?>) pt.getRawType();
} else {
throw new IllegalArgumentException("Cannot cast to type (unhandled type): " + type);
}
// --Cast
if (String.class.isAssignableFrom(clazz)) {
// (case: String)
return (E) value;
} else if (Boolean.class.isAssignableFrom(clazz) || boolean.class.isAssignableFrom(clazz)) {
// (case: boolean) -- "1" is accepted as true in addition to Boolean.parseBoolean semantics
if ("1".equals(value)) {
return (E) Boolean.TRUE;
}
return (E) Boolean.valueOf(Boolean.parseBoolean(value));
} else if (Integer.class.isAssignableFrom(clazz) || int.class.isAssignableFrom(clazz)) {
// (case: integer) -- falls back to truncating a double (e.g. "5.0" -> 5)
try {
return (E) Integer.valueOf(Integer.parseInt(value));
} catch (NumberFormatException e) {
return (E) Integer.valueOf((int) Double.parseDouble(value));
}
} else if (BigInteger.class.isAssignableFrom(clazz)) {
// (case: biginteger) -- null defaults to ZERO
if (value == null) {
return (E) BigInteger.ZERO;
}
return (E) new BigInteger(value);
} else if (Long.class.isAssignableFrom(clazz) || long.class.isAssignableFrom(clazz)) {
// (case: long) -- same double fallback as the integer case
try {
return (E) Long.valueOf(Long.parseLong(value));
} catch (NumberFormatException e) {
return (E) Long.valueOf((long) Double.parseDouble(value));
}
} else if (Float.class.isAssignableFrom(clazz) || float.class.isAssignableFrom(clazz)) {
// (case: float) -- null defaults to NaN
if (value == null) {
return (E) Float.valueOf(Float.NaN);
}
return (E) Float.valueOf(Float.parseFloat(value));
} else if (Double.class.isAssignableFrom(clazz) || double.class.isAssignableFrom(clazz)) {
// (case: double) -- null defaults to NaN
if (value == null) {
return (E) Double.valueOf(Double.NaN);
}
return (E) Double.valueOf(Double.parseDouble(value));
} else if (BigDecimal.class.isAssignableFrom(clazz)) {
// (case: bigdecimal) -- null defaults to ZERO
if (value == null) {
return (E) BigDecimal.ZERO;
}
return (E) new BigDecimal(value);
} else if (Short.class.isAssignableFrom(clazz) || short.class.isAssignableFrom(clazz)) {
// (case: short) -- same double fallback as the integer case
try {
return (E) Short.valueOf(Short.parseShort(value));
} catch (NumberFormatException e) {
return (E) Short.valueOf((short) Double.parseDouble(value));
}
} else if (Byte.class.isAssignableFrom(clazz) || byte.class.isAssignableFrom(clazz)) {
// (case: byte) -- same double fallback as the integer case
try {
return (E) Byte.valueOf(Byte.parseByte(value));
} catch (NumberFormatException e) {
return (E) Byte.valueOf((byte) Double.parseDouble(value));
}
} else if (Character.class.isAssignableFrom(clazz) || char.class.isAssignableFrom(clazz)) {
// (case: char) -- parsed as a numeric code point, not as a literal character
return (E) Character.valueOf((char) Integer.parseInt(value));
} else if (Lazy.class.isAssignableFrom(clazz)) {
// (case: Lazy) -- defer the cast until the Lazy is forced
final String v = value;
return (E) Lazy.of(() -> MetaClass.castWithoutKnowingType(v));
} else if (Optional.class.isAssignableFrom(clazz)) {
// (case: Optional) -- null / "null" / "empty" / "none" all map to Optional.empty()
return (E) ((value == null || "null".equals(value.toLowerCase()) || "empty".equals(value.toLowerCase()) || "none".equals(value.toLowerCase())) ? Optional.empty() : Optional.of(value));
} else if (java.util.Date.class.isAssignableFrom(clazz)) {
// (case: date) -- parsed from epoch milliseconds; unparseable input yields null
try {
return (E) new Date(Long.parseLong(value));
} catch (NumberFormatException e) {
return null;
}
} else if (java.util.Calendar.class.isAssignableFrom(clazz)) {
// (case: calendar) -- also parsed from epoch milliseconds; unparseable input yields null
try {
Date d = new Date(Long.parseLong(value));
GregorianCalendar cal = new GregorianCalendar();
cal.setTime(d);
return (E) cal;
} catch (NumberFormatException e) {
return null;
}
} else if (FileWriter.class.isAssignableFrom(clazz)) {
// (case: file writer) -- value is interpreted as a file path
try {
return (E) new FileWriter(new File(value));
} catch (IOException e) {
throw new RuntimeIOException(e);
}
} else if (BufferedReader.class.isAssignableFrom(clazz)) {
// (case: buffered reader) -- value may be a path, URL, or classpath resource
try {
return (E) IOUtils.readerFromString(value);
} catch (IOException e) {
throw new RuntimeIOException(e);
}
} else if (FileReader.class.isAssignableFrom(clazz)) {
// (case: file reader) -- value is interpreted as a file path
try {
return (E) new FileReader(new File(value));
} catch (IOException e) {
throw new RuntimeIOException(e);
}
} else if (File.class.isAssignableFrom(clazz)) {
// (case: file)
return (E) new File(value);
} else if (Class.class.isAssignableFrom(clazz)) {
// (case: class) -- unresolvable class names yield null
try {
return (E) Class.forName(value);
} catch (ClassNotFoundException e) {
return null;
}
} else if (clazz.isArray()) {
if (value == null) {
return null;
}
Class<?> subType = clazz.getComponentType();
// (case: array) -- decode the string into elements and cast each recursively
String[] strings = StringUtils.decodeArray(value);
Object[] array = (Object[]) Array.newInstance(clazz.getComponentType(), strings.length);
for (int i = 0; i < strings.length; i++) {
array[i] = cast(strings[i], subType);
}
return (E) array;
} else if (Map.class.isAssignableFrom(clazz)) {
// (case: map)
return (E) StringUtils.decodeMap(value);
} else if (clazz.isEnum()) {
// (case: enumeration)
Class c = (Class) clazz;
if (value == null) {
return null;
}
// Strip surrounding double quotes, if present.
// NOTE(review): assumes value is non-empty here; an empty string would
// throw StringIndexOutOfBoundsException -- confirm callers never pass "".
if (value.charAt(0) == '"')
value = value.substring(1);
if (value.charAt(value.length() - 1) == '"')
value = value.substring(0, value.length() - 1);
// Try the name as-is, then lowercase, then uppercase, then with the
// first character's case flipped, before giving up.
try {
return (E) Enum.valueOf(c, value);
} catch (Exception e) {
try {
return (E) Enum.valueOf(c, value.toLowerCase(Locale.ROOT));
} catch (Exception e2) {
try {
return (E) Enum.valueOf(c, value.toUpperCase(Locale.ROOT));
} catch (Exception e3) {
return (E) Enum.valueOf(c, (Character.isUpperCase(value.charAt(0)) ? Character.toLowerCase(value.charAt(0)) : Character.toUpperCase(value.charAt(0))) + value.substring(1));
}
}
}
} else if (ObjectOutputStream.class.isAssignableFrom(clazz)) {
// (case: object output stream) -- wraps the OutputStream obtained by recursing
try {
return (E) new ObjectOutputStream((OutputStream) cast(value, OutputStream.class));
} catch (IOException e) {
throw new RuntimeException(e);
}
} else if (ObjectInputStream.class.isAssignableFrom(clazz)) {
// (case: object input stream) -- wraps the InputStream obtained by recursing
try {
return (E) new ObjectInputStream((InputStream) cast(value, InputStream.class));
} catch (IOException e) {
throw new RuntimeException(e);
}
} else if (PrintStream.class.isAssignableFrom(clazz)) {
// (case: print stream) -- "stdout"/"out" and "stderr"/"err" map to the standard streams
if (value.equalsIgnoreCase("stdout") || value.equalsIgnoreCase("out")) {
return (E) System.out;
}
if (value.equalsIgnoreCase("stderr") || value.equalsIgnoreCase("err")) {
return (E) System.err;
}
try {
return (E) new PrintStream(new FileOutputStream(value));
} catch (IOException e) {
throw new RuntimeException(e);
}
} else if (PrintWriter.class.isAssignableFrom(clazz)) {
// (case: print writer) -- same stdout/stderr aliases as the print stream case
if (value.equalsIgnoreCase("stdout") || value.equalsIgnoreCase("out")) {
return (E) new PrintWriter(System.out);
}
if (value.equalsIgnoreCase("stderr") || value.equalsIgnoreCase("err")) {
return (E) new PrintWriter(System.err);
}
try {
return (E) IOUtils.getPrintWriter(value);
} catch (IOException e) {
throw new RuntimeException(e);
}
} else if (OutputStream.class.isAssignableFrom(clazz)) {
// (case: output stream) -- must come after the more specific stream cases above
if (value.equalsIgnoreCase("stdout") || value.equalsIgnoreCase("out")) {
return (E) System.out;
}
if (value.equalsIgnoreCase("stderr") || value.equalsIgnoreCase("err")) {
return (E) System.err;
}
File toWriteTo = cast(value, File.class);
try {
if (toWriteTo == null || (!toWriteTo.exists() && !toWriteTo.createNewFile())) {
throw new IllegalStateException("Could not create output stream (cannot write file): " + value);
}
return (E) IOUtils.getFileOutputStream(value);
} catch (IOException e) {
throw new RuntimeException(e);
}
} else if (InputStream.class.isAssignableFrom(clazz)) {
// (case: input stream) -- "stdin"/"in" map to System.in
if (value.equalsIgnoreCase("stdin") || value.equalsIgnoreCase("in")) {
return (E) System.in;
}
try {
return (E) IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(value);
} catch (IOException e) {
throw new RuntimeException(e);
}
} else {
try {
// (case: can parse from string) -- look for a fromString(String) method on the target class
Method decode = clazz.getMethod("fromString", String.class);
return (E) decode.invoke(MetaClass.create(clazz), value);
} catch (NoSuchMethodException | InvocationTargetException | IllegalAccessException | ClassCastException e) {
// Silent errors for misc failures
}
// Pass 2: Guess what the object could be
if (Tree.class.isAssignableFrom(clazz)) {
// (case: reading a tree) -- parse a Penn Treebank bracketed string
try {
return (E) new PennTreeReader(new StringReader(value), new LabeledScoredTreeFactory(CoreLabel.factory())).readTree();
} catch (IOException e) {
throw new RuntimeException(e);
}
} else if (Collection.class.isAssignableFrom(clazz)) {
// (case: reading a collection)
Collection rtn;
if (Modifier.isAbstract(clazz.getModifiers())) {
rtn = abstractToConcreteCollectionMap.get(clazz).createInstance();
} else {
rtn = MetaClass.create(clazz).createInstance();
}
// NOTE(review): getComponentType() is null for any non-array class, so
// subType is always null here and every element goes through
// castWithoutKnowingType -- confirm this is intended.
Class<?> subType = clazz.getComponentType();
String[] strings = StringUtils.decodeArray(value);
for (String string : strings) {
if (subType == null) {
rtn.add(castWithoutKnowingType(string));
} else {
rtn.add(cast(string, subType));
}
}
return (E) rtn;
} else {
// We could not cast this object
return null;
}
}
}
Use of edu.stanford.nlp.trees.PennTreeReader in project CoreNLP by stanfordnlp: the class DependencyIndexITest, method testPositions.
@Test
public void testPositions() {
  // The same sentence is used for all three tree-construction routes.
  final String treeString = "(S (NP (NNP Mary)) (VP (VBD had) (NP (DT a) (JJ little) (NN lamb))) (. .))";
  try {
    // Route 1: PennTreeReader with StringLabel nodes.
    Tree stringLabelTree = new PennTreeReader(new StringReader(treeString), new LabeledScoredTreeFactory(new StringLabelFactory())).readTree();
    checkTree(stringLabelTree);
    // Route 2: Tree.valueOf().
    Tree valueOfTree = Tree.valueOf(treeString);
    checkTree(valueOfTree);
    // Route 3: PennTreeReader with CoreLabel nodes.
    Tree coreLabelTree = new PennTreeReader(new StringReader(treeString), new LabeledScoredTreeFactory(CoreLabel.factory())).readTree();
    checkTree(coreLabelTree);
    // Route 4: a tree produced by the parser itself.
    LexicalizedParser parser = LexicalizedParser.loadModel();
    Tree parsedTree = parser.parse("Mary had a little lamb .");
    parsedTree.indexLeaves();
    checkTree(parsedTree);
  } catch (IOException e) {
    // this should never happen
    fail("IOException shouldn't happen.");
  }
}
Use of edu.stanford.nlp.trees.PennTreeReader in project CoreNLP by stanfordnlp: the class CustomAnnotationSerializer, method read.
/**
 * Reads one serialized Annotation from the given stream: the coref information,
 * then one block per sentence (parse tree line, three dependency graphs, then
 * token lines up to a blank line).
 *
 * @param is The stream to read from; wrapped in a GZIPInputStream if this
 *           serializer was configured to compress and it is not one already.
 * @return A pair of the deserialized Annotation and the (possibly wrapped) stream.
 * @throws IOException If the stream cannot be read.
 * @throws RuntimeIOException If the serialized content is malformed or truncated.
 */
@Override
public Pair<Annotation, InputStream> read(InputStream is) throws IOException {
  if (compress && !(is instanceof GZIPInputStream))
    is = new GZIPInputStream(is);
  // NOTE(review): no explicit charset -- this uses the platform default, which must
  // match the writer's; confirm before changing.
  BufferedReader reader = new BufferedReader(new InputStreamReader(is));
  Annotation doc = new Annotation("");
  String line;
  // read the coref graph (new format)
  Map<Integer, CorefChain> chains = loadCorefChains(reader);
  if (chains != null)
    doc.set(CorefCoreAnnotations.CorefChainAnnotation.class, chains);
  // read the coref graph (old format)
  line = reader.readLine();
  if (line == null) {
    // Fix: readLine() returns null at EOF; previously this NPE'd on a truncated stream
    throw new RuntimeIOException("ERROR: Unexpected end of stream while reading the serialized coref graph");
  }
  line = line.trim();
  if (line.length() > 0) {
    // Old format: a flat list of (srcSent, srcIdx, dstSent, dstIdx) quadruples
    String[] bits = line.split(" ");
    if (bits.length % 4 != 0) {
      throw new RuntimeIOException("ERROR: Incorrect format for the serialized coref graph: " + line);
    }
    List<Pair<IntTuple, IntTuple>> corefGraph = new ArrayList<>();
    for (int i = 0; i < bits.length; i += 4) {
      IntTuple src = new IntTuple(2);
      IntTuple dst = new IntTuple(2);
      src.set(0, Integer.parseInt(bits[i]));
      src.set(1, Integer.parseInt(bits[i + 1]));
      dst.set(0, Integer.parseInt(bits[i + 2]));
      dst.set(1, Integer.parseInt(bits[i + 3]));
      corefGraph.add(new Pair<>(src, dst));
    }
    doc.set(CorefCoreAnnotations.CorefGraphAnnotation.class, corefGraph);
  }
  // read individual sentences
  List<CoreMap> sentences = new ArrayList<>();
  while ((line = reader.readLine()) != null) {
    CoreMap sentence = new Annotation("");
    // first line is the parse tree. construct it with CoreLabels in Tree nodes
    Tree tree = new PennTreeReader(new StringReader(line), new LabeledScoredTreeFactory(CoreLabel.factory())).readTree();
    sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
    // read the dependency graphs
    IntermediateSemanticGraph intermCollapsedDeps = loadDependencyGraph(reader);
    IntermediateSemanticGraph intermUncollapsedDeps = loadDependencyGraph(reader);
    IntermediateSemanticGraph intermCcDeps = loadDependencyGraph(reader);
    // the remaining lines until empty line are tokens
    List<CoreLabel> tokens = new ArrayList<>();
    while ((line = reader.readLine()) != null) {
      if (line.length() == 0)
        break;
      CoreLabel token = loadToken(line, haveExplicitAntecedent);
      tokens.add(token);
    }
    sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
    // convert the intermediate graphs to actual SemanticGraphs, now that tokens exist
    SemanticGraph collapsedDeps = intermCollapsedDeps.convertIntermediateGraph(tokens);
    sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, collapsedDeps);
    SemanticGraph uncollapsedDeps = intermUncollapsedDeps.convertIntermediateGraph(tokens);
    sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, uncollapsedDeps);
    SemanticGraph ccDeps = intermCcDeps.convertIntermediateGraph(tokens);
    sentence.set(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, ccDeps);
    sentences.add(sentence);
  }
  doc.set(CoreAnnotations.SentencesAnnotation.class, sentences);
  return Pair.makePair(doc, is);
}
Use of edu.stanford.nlp.trees.PennTreeReader in project CoreNLP by stanfordnlp: the class TreeJPanel, method main.
/**
 * Simple interactive demo: parses a Penn Treebank bracketed tree string
 * (the first command-line argument, or a built-in default) and displays it
 * in a Swing frame. Closing the window exits the JVM.
 *
 * @param args Optional; args[0] is a Penn Treebank tree string to display.
 * @throws IOException If the tree string cannot be read (should not happen
 *                     for an in-memory StringReader).
 */
public static void main(String[] args) throws IOException {
  TreeJPanel tjp = new TreeJPanel();
  // NOTE(review): the default string has an extra trailing ')' -- PennTreeReader
  // appears to tolerate it, but confirm before relying on this string elsewhere.
  String ptbTreeString = "(ROOT (S (NP (NNP Interactive_Tregex)) (VP (VBZ works)) (PP (IN for) (PRP me)) (. !))))";
  if (args.length > 0) {
    ptbTreeString = args[0];
  }
  Tree tree = (new PennTreeReader(new StringReader(ptbTreeString), new LabeledScoredTreeFactory(new StringLabelFactory()))).readTree();
  tjp.setTree(tree);
  tjp.setBackground(Color.white);
  JFrame frame = new JFrame();
  frame.getContentPane().add(tjp, BorderLayout.CENTER);
  frame.addWindowListener(new WindowAdapter() {
    @Override
    public void windowClosing(WindowEvent e) {
      System.exit(0);
    }
  });
  frame.pack();
  // Fix: setVisible(true) was previously called twice; once is sufficient
  frame.setVisible(true);
}
Aggregations