Use of edu.stanford.nlp.trees.TreebankLanguagePack in project CoreNLP by stanfordnlp.
From the class SceneGraphImagePCFGParser, the method main:
public static void main(String[] args) throws IOException {
  LexicalizedParser parser = LexicalizedParser.getParserFromSerializedFile(PCFG_MODEL);
  TreebankLanguagePack tlp = new PennTreebankLanguagePack();
  GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
  String filename = args[0];
  BufferedReader reader = IOUtils.readerFromString(filename);
  for (String line = reader.readLine(); line != null; line = reader.readLine()) {
    SceneGraphImage img = SceneGraphImage.readFromJSON(line);
    if (img == null) {
      continue;
    }
    for (SceneGraphImageRegion region : img.regions) {
      if (region.tokens != null) {
        // Drop any existing POS tags so the parser re-tags the tokens itself.
        for (CoreLabel token : region.tokens) {
          token.remove(CoreAnnotations.PartOfSpeechAnnotation.class);
        }
        // Parse the region's token sequence and attach the resulting
        // grammatical structure (typed dependencies) to the region.
        Tree t = parser.apply(region.tokens);
        region.gs = gsf.newGrammaticalStructure(t);
      }
    }
    // Print the augmented image record back out as JSON, one per line.
    System.out.println(img.toJSON());
  }
}
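The PennTreebankLanguagePack-to-GrammaticalStructureFactory pattern above is not specific to scene graphs; it is the standard route from a parse tree to typed dependencies. A minimal standalone sketch, assuming the stock English PCFG model from the CoreNLP models jar (the model path and example sentence are assumptions, not taken from the snippet):

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.*;

public class TypedDependenciesSketch {
  public static void main(String[] args) {
    // Assumed model path; shipped in the stanford-corenlp models jar.
    LexicalizedParser parser =
        LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    Tree tree = parser.parse("The quick brown fox jumps over the lazy dog.");
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
    // Each typed dependency is a relation such as nsubj(jumps, fox).
    System.out.println(gs.typedDependencies());
  }
}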
Use of edu.stanford.nlp.trees.TreebankLanguagePack in project CoreNLP by stanfordnlp.
From the class PTBTokenizerTest, the method testPTBTokenizerGerman:
@SuppressWarnings("unchecked")
@Test
public void testPTBTokenizerGerman() {
  String[] sample = {
      "Das TV-Duell von Kanzlerin Merkel und SPD-Herausforderer Steinbrück war eher lahm - können es die Spitzenleute der kleinen Parteien besser? ",
      "Die erquickende Sicherheit und Festigkeit in der Bewegung, den Vorrat von Kraft, kann ja die Versammlung nicht fühlen, hören will sie sie nicht, also muß sie sie sehen; und die sehe man einmal in einem Paar spitzen Schultern, zylindrischen Schenkeln, oder leeren Ärmeln, oder lattenförmigen Beinen." };
  String[][] tokenized = {
      { "Das", "TV", "-", "Duell", "von", "Kanzlerin", "Merkel", "und", "SPD", "-",
        "Herausforderer", "Steinbrück", "war", "eher", "lahm", "-", "können", "es",
        "die", "Spitzenleute", "der", "kleinen", "Parteien", "besser", "?" },
      { "Die", "erquickende", "Sicherheit", "und", "Festigkeit", "in", "der", "Bewegung",
        ",", "den", "Vorrat", "von", "Kraft", ",", "kann", "ja", "die", "Versammlung",
        "nicht", "fühlen", ",", "hören", "will", "sie", "sie", "nicht", ",", "also",
        "muß", "sie", "sie", "sehen", ";", "und", "die", "sehe", "man", "einmal", "in",
        "einem", "Paar", "spitzen", "Schultern", ",", "zylindrischen", "Schenkeln", ",",
        "oder", "leeren", "Ärmeln", ",", "oder", "lattenförmigen", "Beinen", "." } };
  // The NEGRA tokenizer splits hyphenated compounds such as "TV-Duell"
  // into separate tokens, unlike the default English PTB behavior.
  TreebankLanguagePack tlp = new NegraPennLanguagePack();
  TokenizerFactory tokFactory = tlp.getTokenizerFactory();
  runOnTwoArrays(tokFactory, sample, tokenized);
}
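Outside a test harness, the factory returned by getTokenizerFactory() can be applied directly to a Reader. A minimal sketch (the sample sentence is illustrative):

import java.io.StringReader;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.international.negra.NegraPennLanguagePack;

public class GermanTokenizerSketch {
  public static void main(String[] args) {
    TreebankLanguagePack tlp = new NegraPennLanguagePack();
    TokenizerFactory<? extends HasWord> tokFactory = tlp.getTokenizerFactory();
    Tokenizer<? extends HasWord> tokenizer =
        tokFactory.getTokenizer(new StringReader("Das TV-Duell war eher lahm."));
    // Prints one token per line; "TV-Duell" comes out as three tokens,
    // matching the expectations in the test above.
    while (tokenizer.hasNext()) {
      System.out.println(tokenizer.next().word());
    }
  }
}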
Use of edu.stanford.nlp.trees.TreebankLanguagePack in project CoreNLP by stanfordnlp.
From the class ChineseEnglishWordMap, the method main:
/**
 * The main method reads (segmented, whitespace-delimited) words from a file
 * and prints them with their English translation(s).
 *
 * The path and filename of the CEDict lexicon can be supplied via the
 * "-dictPath" flag; otherwise the default filename "cedict_ts.u8" in the
 * current directory is checked.
 *
 * By default, only the first translation is printed. If the "-all" flag
 * is given, all translations are printed.
 *
 * The input and output encoding can be specified using the "-encoding" flag;
 * otherwise UTF-8 is assumed.
 */
public static void main(String[] args) throws IOException {
  Map<String, Integer> flagsToNumArgs = Generics.newHashMap();
  flagsToNumArgs.put("-dictPath", 1);
  flagsToNumArgs.put("-encoding", 1);
  Map<String, String[]> argMap = StringUtils.argsToMap(args, flagsToNumArgs);
  String[] otherArgs = argMap.get(null);
  if (otherArgs.length < 1) {
    log.info("usage: ChineseEnglishWordMap [-all] [-dictPath path] [-encoding enc_string] inputFile");
    System.exit(1);
  }
  String filename = otherArgs[0];
  boolean allTranslations = argMap.containsKey("-all");
  String charset = defaultCharset;
  if (argMap.containsKey("-encoding")) {
    charset = argMap.get("-encoding")[0];
  }
  BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(filename), charset));
  // The language pack is used only to recognize punctuation "words",
  // which are echoed rather than looked up in the dictionary.
  TreebankLanguagePack tlp = new ChineseTreebankLanguagePack();
  String[] dpString = argMap.get("-dictPath");
  ChineseEnglishWordMap cewm = (dpString == null) ? new ChineseEnglishWordMap() : new ChineseEnglishWordMap(dpString[0]);
  int totalWords = 0, coveredWords = 0;
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, charset), true);
  for (String line = r.readLine(); line != null; line = r.readLine()) {
    // Split on single whitespace characters (limit 1000 tokens per line);
    // runs of whitespace yield empty strings, which are skipped below.
    String[] words = line.split("\\s", 1000);
    for (String word : words) {
      totalWords++;
      if (word.length() == 0) {
        continue;
      }
      pw.print(StringUtils.pad(word + ':', 8));
      if (tlp.isPunctuationWord(word)) {
        // Don't count punctuation toward the coverage statistics.
        totalWords--;
        pw.print(word);
      } else if (isDigits(word)) {
        pw.print(word + " [NUMBER]");
      } else if (cewm.containsKey(word)) {
        coveredWords++;
        if (allTranslations) {
          List<String> trans = new ArrayList<>(cewm.getAllTranslations(word));
          for (String s : trans) {
            pw.print((trans.indexOf(s) > 0 ? "|" : "") + s);
          }
        } else {
          pw.print(cewm.getFirstTranslation(word));
        }
      } else {
        pw.print("[UNK]");
      }
      pw.println();
    }
    pw.println();
  }
  r.close();
  log.info("Finished translating " + totalWords + " words (" + coveredWords + " were in dictionary).");
}
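The only work the ChineseTreebankLanguagePack does in this method is the punctuation check, and that check is reusable on its own. A minimal sketch (the sample tokens are illustrative; whether a given character counts depends on the pack's punctuation-word list):

import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.international.pennchinese.ChineseTreebankLanguagePack;

public class PunctuationCheckSketch {
  public static void main(String[] args) {
    TreebankLanguagePack tlp = new ChineseTreebankLanguagePack();
    for (String w : new String[] { "，", "。", "中国" }) {
      System.out.println(w + " punctuation? " + tlp.isPunctuationWord(w));
    }
  }
}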
Use of edu.stanford.nlp.trees.TreebankLanguagePack in project CoreNLP by stanfordnlp.
From the class TueBaDZLanguagePack, the method main:
/**
* Prints a few aspects of the TreebankLanguagePack, just for debugging.
*/
public static void main(String[] args) {
  TreebankLanguagePack tlp = new TueBaDZLanguagePack();
  String start = tlp.startSymbol();
  System.out.println("Start symbol: " + start);
  System.out.println("Should be true: " + tlp.isStartSymbol(start));
  String[] strs = new String[] { "-", "-LLB-", "NP-2", "NP=3", "NP-LGS", "NP-TMP=3", "CARD-HD" };
  for (String str : strs) {
    System.out.println("String: " + str + " basic: " + tlp.basicCategory(str) + " basicAndFunc: " + tlp.categoryAndFunction(str));
  }
}
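The distinction this debug output probes, basicCategory versus categoryAndFunction, is easiest to see with the English pack, whose label conventions are widely known. A minimal sketch (the expected values in the comments follow Penn Treebank conventions and are not verified output):

import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.TreebankLanguagePack;

public class CategorySketch {
  public static void main(String[] args) {
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    // basicCategory strips functional tags and indices: NP-TMP=3 -> NP
    System.out.println(tlp.basicCategory("NP-TMP=3"));
    // categoryAndFunction keeps the functional tag, drops the index: NP-TMP=3 -> NP-TMP
    System.out.println(tlp.categoryAndFunction("NP-TMP=3"));
  }
}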
Use of edu.stanford.nlp.trees.TreebankLanguagePack in project CoreNLP by stanfordnlp.
From the class SplitMaker, the method main:
/**
 * @param args a single argument, the path to the Hebrew treebank file to split
 */
public static void main(String[] args) {
  if (args.length != 1) {
    System.err.printf("Usage: java %s tree_file%n", SplitMaker.class.getName());
    System.exit(-1);
  }
  TreebankLanguagePack tlp = new HebrewTreebankLanguagePack();
  String inputFile = args[0];
  File treeFile = new File(inputFile);
  try {
    TreeReaderFactory trf = new HebrewTreeReaderFactory();
    // Read and write using the encoding declared by the language pack.
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), tlp.getEncoding()));
    TreeReader tr = trf.newTreeReader(br);
    PrintWriter pwDev = new PrintWriter(new PrintStream(new FileOutputStream(inputFile + ".clean.dev"), false, tlp.getEncoding()));
    PrintWriter pwTrain = new PrintWriter(new PrintStream(new FileOutputStream(inputFile + ".clean.train"), false, tlp.getEncoding()));
    PrintWriter pwTest = new PrintWriter(new PrintStream(new FileOutputStream(inputFile + ".clean.test"), false, tlp.getEncoding()));
    int numTrees = 0;
    // Fixed split boundaries: trees 0-482 go to dev, 483-5723 to train,
    // and the remainder to test.
    for (Tree t; (t = tr.readTree()) != null; numTrees++) {
      if (numTrees < 483) {
        pwDev.println(t.toString());
      } else if (numTrees < 5724) {
        pwTrain.println(t.toString());
      } else {
        pwTest.println(t.toString());
      }
    }
    tr.close();
    pwDev.close();
    pwTrain.close();
    pwTest.close();
    System.err.printf("Processed %d trees.%n", numTrees);
  } catch (IOException e) {
    // Also covers FileNotFoundException and UnsupportedEncodingException.
    e.printStackTrace();
  }
}
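The I/O pattern here, a TreeReaderFactory combined with the encoding declared by the language pack, carries over to any treebank. A minimal sketch for counting Penn-format English trees (the file name is hypothetical):

import java.io.*;
import edu.stanford.nlp.trees.*;

public class TreeCountSketch {
  public static void main(String[] args) throws IOException {
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    TreeReaderFactory trf = new PennTreeReaderFactory();
    BufferedReader br = new BufferedReader(new InputStreamReader(
        new FileInputStream("trees.mrg"), tlp.getEncoding()));
    TreeReader tr = trf.newTreeReader(br);
    int numTrees = 0;
    for (Tree t; (t = tr.readTree()) != null; ) {
      numTrees++;
    }
    tr.close();
    System.out.println("Read " + numTrees + " trees.");
  }
}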