use of edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification in project CoreNLP by stanfordnlp.
the class SplitCanditoTrees method replacePOSTags.
/**
 * Replaces the value and tag of each preterminal label in {@code tree} with the
 * alternate tag derived from the token's morphological analysis, when such a tag
 * is available.
 *
 * <p>For every leaf, the morphological string is read from the leaf's
 * {@code originalText()}. When that is null or empty, a fallback string is built
 * from the preterminal value plus the leaf's {@code category()} (if any), padded
 * with dashes. The string is then passed to
 * {@link FrenchMorphoFeatureSpecification#strToFeatures}; a non-empty alternate tag
 * overwrites both the value and the tag of the preterminal label.
 *
 * @param tree the tree to update in place; its leaf labels and the preterminal
 *             labels that get rewritten are cast to {@code CoreLabel}
 */
private static void replacePOSTags(Tree tree) {
  List<Label> yield = tree.yield();
  List<Label> preYield = tree.preTerminalYield();
  assert yield.size() == preYield.size();

  MorphoFeatureSpecification spec = new FrenchMorphoFeatureSpecification();
  for (int i = 0; i < yield.size(); i++) {
    CoreLabel leaf = (CoreLabel) yield.get(i);

    // Morphological Analysis
    String morphStr = leaf.originalText();
    if (morphStr == null || morphStr.isEmpty()) {
      morphStr = preYield.get(i).value();

      // POS subcategory
      String subCat = leaf.category();
      // Compare by content: a reference comparison against "" would treat an
      // empty, non-interned string as a real subcategory.
      if (subCat != null && !subCat.isEmpty()) {
        morphStr += "-" + subCat + "--";
      } else {
        morphStr += "---";
      }
    }

    MorphoFeatures feats = spec.strToFeatures(morphStr);
    String altTag = feats.getAltTag();
    if (altTag != null && !altTag.isEmpty()) {
      CoreLabel cl = (CoreLabel) preYield.get(i);
      cl.setValue(altTag);
      cl.setTag(altTag);
    }
  }
}
use of edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification in project CoreNLP by stanfordnlp.
the class TreebankFactoredLexiconStats method main.
// private static String stripTag(String tag) {
// if (tag.startsWith("DT")) {
// String newTag = tag.substring(2, tag.length());
// return newTag.length() > 0 ? newTag : tag;
// }
// return tag;
// }
/**
 * Prints corpus statistics about words, tags, lemmas and morphological feature
 * tags for a treebank.
 *
 * <p>Every tree is first transformed with the language's
 * {@code TreebankLangParserParams}; then, for each token, the word, POS tag,
 * lemma, rich morphological tag and reduced (active-feature) tag are counted.
 * Summary counts and several diagnostic listings are written to standard output.
 *
 * <p>Note that any language other than {@code Arabic} is configured with the
 * French factored options and the French morphological specification.
 *
 * @param args exactly three arguments: the {@code Language} constant name, the
 *             treebank path, and a comma-separated list of
 *             {@code MorphoFeatureType} names to activate; otherwise a usage
 *             message is printed and the JVM exits with status -1
 */
public static void main(String[] args) {
  if (args.length != 3) {
    System.err.printf("Usage: java %s language filename features%n", TreebankFactoredLexiconStats.class.getName());
    System.exit(-1);
  }
  Language language = Language.valueOf(args[0]);
  TreebankLangParserParams tlpp = language.params;
  if (language.equals(Language.Arabic)) {
    String[] options = { "-arabicFactored" };
    tlpp.setOptionFlag(options, 0);
  } else {
    String[] options = { "-frenchFactored" };
    tlpp.setOptionFlag(options, 0);
  }
  Treebank tb = tlpp.diskTreebank();
  tb.loadPath(args[1]);
  MorphoFeatureSpecification morphoSpec = language.equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();
  String[] features = args[2].trim().split(",");
  for (String feature : features) {
    morphoSpec.activate(MorphoFeatureType.valueOf(feature));
  }

  // Counters
  Counter<String> wordTagCounter = new ClassicCounter<>(30000);
  Counter<String> morphTagCounter = new ClassicCounter<>(500);
  // Counter<String> signatureTagCounter = new ClassicCounter<String>();
  Counter<String> morphCounter = new ClassicCounter<>(500);
  Counter<String> wordCounter = new ClassicCounter<>(30000);
  Counter<String> tagCounter = new ClassicCounter<>(300);
  Counter<String> lemmaCounter = new ClassicCounter<>(25000);
  Counter<String> lemmaTagCounter = new ClassicCounter<>(25000);
  Counter<String> richTagCounter = new ClassicCounter<>(1000);
  Counter<String> reducedTagCounter = new ClassicCounter<>(500);
  Counter<String> reducedTagLemmaCounter = new ClassicCounter<>(500);
  Map<String, Set<String>> wordLemmaMap = Generics.newHashMap();
  TwoDimensionalIntCounter<String, String> lemmaReducedTagCounter = new TwoDimensionalIntCounter<>(30000);
  TwoDimensionalIntCounter<String, String> reducedTagTagCounter = new TwoDimensionalIntCounter<>(500);
  TwoDimensionalIntCounter<String, String> tagReducedTagCounter = new TwoDimensionalIntCounter<>(300);

  int numTrees = 0;
  for (Tree tree : tb) {
    for (Tree subTree : tree) {
      if (!subTree.isLeaf()) {
        tlpp.transformTree(subTree, tree);
      }
    }
    List<Label> pretermList = tree.preTerminalYield();
    List<Label> yield = tree.yield();
    assert yield.size() == pretermList.size();

    int yieldLen = yield.size();
    for (int i = 0; i < yieldLen; ++i) {
      String tag = pretermList.get(i).value();
      String word = yield.get(i).value();
      String morph = ((CoreLabel) yield.get(i)).originalText();

      // Note: if there is no lemma, then we use the surface form.
      Pair<String, String> lemmaTag = MorphoFeatureSpecification.splitMorphString(word, morph);
      String lemma = lemmaTag.first();
      String richTag = lemmaTag.second();

      // WSGDEBUG
      if (tag.contains("MW")) {
        lemma += "-MWE";
      }

      lemmaCounter.incrementCount(lemma);
      lemmaTagCounter.incrementCount(lemma + tag);
      richTagCounter.incrementCount(richTag);

      String reducedTag = morphoSpec.strToFeatures(richTag).toString();
      reducedTagCounter.incrementCount(reducedTag);
      reducedTagLemmaCounter.incrementCount(reducedTag + lemma);

      wordTagCounter.incrementCount(word + tag);
      morphTagCounter.incrementCount(morph + tag);
      morphCounter.incrementCount(morph);
      wordCounter.incrementCount(word);
      tagCounter.incrementCount(tag);

      // From here on an empty reduced tag is reported under the name "NONE".
      reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;

      // Record the lemma on the first occurrence of a word as well; otherwise
      // words seen only once would be listed as having no lemmas at all.
      Set<String> lemmasForWord = wordLemmaMap.get(word);
      if (lemmasForWord == null) {
        lemmasForWord = Generics.newHashSet(1);
        wordLemmaMap.put(word, lemmasForWord);
      }
      lemmasForWord.add(lemma);

      lemmaReducedTagCounter.incrementCount(lemma, reducedTag);
      // Keyed by lemma + reducedTag; looked up with the same key when printing below.
      reducedTagTagCounter.incrementCount(lemma + reducedTag, tag);
      tagReducedTagCounter.incrementCount(tag, reducedTag);
    }
    ++numTrees;
  }

  // Barf...
  System.out.println("Language: " + language.toString());
  System.out.printf("#trees:\t%d%n", numTrees);
  System.out.printf("#tokens:\t%d%n", (int) wordCounter.totalCount());
  System.out.printf("#words:\t%d%n", wordCounter.keySet().size());
  System.out.printf("#tags:\t%d%n", tagCounter.keySet().size());
  System.out.printf("#wordTagPairs:\t%d%n", wordTagCounter.keySet().size());
  System.out.printf("#lemmas:\t%d%n", lemmaCounter.keySet().size());
  System.out.printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.keySet().size());
  System.out.printf("#feattags:\t%d%n", reducedTagCounter.keySet().size());
  System.out.printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.keySet().size());
  System.out.printf("#richtags:\t%d%n", richTagCounter.keySet().size());
  System.out.printf("#richtag+lemma:\t%d%n", morphCounter.keySet().size());
  System.out.printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.keySet().size());

  // Extra
  System.out.println("==================");
  StringBuilder sbNoLemma = new StringBuilder();
  StringBuilder sbMultLemmas = new StringBuilder();
  for (Map.Entry<String, Set<String>> wordLemmas : wordLemmaMap.entrySet()) {
    String word = wordLemmas.getKey();
    Set<String> lemmas = wordLemmas.getValue();
    if (lemmas.size() == 0) {
      sbNoLemma.append("NO LEMMAS FOR WORD: ").append(word).append("\n");
      continue;
    }
    if (lemmas.size() > 1) {
      sbMultLemmas.append("MULTIPLE LEMMAS: ").append(word).append(" ").append(setToString(lemmas)).append("\n");
      continue;
    }
    // Exactly one lemma: list it only if it occurs with more than one reduced tag.
    String lemma = lemmas.iterator().next();
    Set<String> reducedTags = lemmaReducedTagCounter.getCounter(lemma).keySet();
    if (reducedTags.size() > 1) {
      System.out.printf("%s --> %s%n", word, lemma);
      for (String reducedTag : reducedTags) {
        int count = lemmaReducedTagCounter.getCount(lemma, reducedTag);
        String posTags = setToString(reducedTagTagCounter.getCounter(lemma + reducedTag).keySet());
        System.out.printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
      }
      System.out.println();
    }
  }

  System.out.println("==================");
  System.out.println(sbNoLemma.toString());
  System.out.println(sbMultLemmas.toString());
  System.out.println("==================");

  // Per POS tag (sorted), the reduced tags it occurs with and their counts.
  List<String> tags = new ArrayList<>(tagReducedTagCounter.firstKeySet());
  Collections.sort(tags);
  for (String tag : tags) {
    System.out.println(tag);
    Set<String> reducedTags = tagReducedTagCounter.getCounter(tag).keySet();
    for (String reducedTag : reducedTags) {
      int count = tagReducedTagCounter.getCount(tag, reducedTag);
      // reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
      System.out.printf("\t%s\t%d%n", reducedTag, count);
    }
    System.out.println();
  }
  System.out.println("==================");
}
use of edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification in project CoreNLP by stanfordnlp.
the class FactoredLexicon method main.
/**
 * Trains a {@code FactoredLexicon} on a training treebank and reports its tagging
 * accuracy on a development treebank.
 *
 * <p>Only {@code Arabic} and {@code French} are supported; any other language
 * results in an {@link UnsupportedOperationException}. For each tuning event the
 * highest-scoring tag proposed by the lexicon is compared with the gold tag.
 * Accuracy is computed over all tuning events, including those for which the
 * lexicon proposes no tagging, and the distribution of errors by gold tag is
 * printed afterwards.
 *
 * @param args exactly four arguments: the {@code Language} constant name, a
 *             comma-separated list of {@code MorphoFeatureType} names, the
 *             training treebank path and the development treebank path;
 *             otherwise a usage message is printed and the JVM exits with
 *             status -1
 */
public static void main(String[] args) {
  if (args.length != 4) {
    System.err.printf("Usage: java %s language features train_file dev_file%n", FactoredLexicon.class.getName());
    System.exit(-1);
  }

  // Command line options
  Language language = Language.valueOf(args[0]);
  TreebankLangParserParams tlpp = language.params;
  Treebank trainTreebank = tlpp.diskTreebank();
  trainTreebank.loadPath(args[2]);
  Treebank devTreebank = tlpp.diskTreebank();
  devTreebank.loadPath(args[3]);

  Options options = getOptions(language);

  // Pick the morphological specification and the matching factored option flag.
  MorphoFeatureSpecification morphoSpec;
  String factoredFlag;
  if (language.equals(Language.Arabic)) {
    morphoSpec = new ArabicMorphoFeatureSpecification();
    factoredFlag = "-arabicFactored";
  } else if (language.equals(Language.French)) {
    morphoSpec = new FrenchMorphoFeatureSpecification();
    factoredFlag = "-frenchFactored";
  } else {
    throw new UnsupportedOperationException();
  }
  String[] languageOptions = { factoredFlag };
  tlpp.setOptionFlag(languageOptions, 0);

  // Activate each requested morphological feature.
  for (String featureName : args[1].trim().split(",")) {
    morphoSpec.activate(MorphoFeatureType.valueOf(featureName));
  }
  System.out.println("Language: " + language);
  System.out.println("Features: " + args[1]);

  // Save trees in a collection since the interface requires that....
  System.out.print("Loading training trees...");
  List<Tree> trainTrees = new ArrayList<>(19000);
  for (Tree tree : trainTreebank) {
    for (Tree node : tree) {
      if (node.isLeaf()) {
        continue;
      }
      tlpp.transformTree(node, tree);
    }
    trainTrees.add(tree);
  }
  System.out.printf("Done! (%d trees)%n", trainTrees.size());

  // Create word and tag indices, then set up and train the lexicon.
  System.out.print("Collecting sufficient statistics for lexicon...");
  Index<String> wordIndex = new HashIndex<>();
  Index<String> tagIndex = new HashIndex<>();
  FactoredLexicon lexicon = new FactoredLexicon(options, morphoSpec, wordIndex, tagIndex);
  lexicon.initializeTraining(trainTrees.size());
  lexicon.train(trainTrees, null);
  lexicon.finishTraining();
  System.out.println("Done!");
  // Drop the reference to the training trees; they are not used again.
  trainTrees = null;

  // Load the tuning set
  System.out.print("Loading tuning set...");
  List<FactoredLexiconEvent> tuningSet = getTuningSet(devTreebank, lexicon, tlpp);
  System.out.printf("...Done! (%d events)%n", tuningSet.size());

  // Print the probabilities that we obtain
  // TODO(spenceg): Implement tagging accuracy with FactLex
  int nCorrect = 0;
  Counter<String> errors = new ClassicCounter<>();
  for (FactoredLexiconEvent event : tuningSet) {
    Iterator<IntTaggedWord> ruleItr = lexicon.ruleIteratorByWord(event.word(), event.getLoc(), event.featureStr());
    Counter<Integer> tagScores = new ClassicCounter<>();
    boolean sawRule = false;
    int goldTagId = -1; // stays -1 when the gold tag is never proposed
    while (ruleItr.hasNext()) {
      sawRule = true;
      IntTaggedWord candidate = ruleItr.next();
      if (candidate.tag() == event.tagId()) {
        log.info("GOLD-");
        goldTagId = candidate.tag();
      }
      float tagScore = lexicon.score(candidate, event.getLoc(), event.word(), event.featureStr());
      tagScores.incrementCount(candidate.tag(), tagScore);
    }

    if (sawRule) {
      // Score the tagging
      int hypTagId = Counters.argmax(tagScores);
      if (hypTagId == goldTagId) {
        nCorrect++;
      } else {
        String goldTag;
        if (goldTagId < 0) {
          goldTag = "UNSEEN";
        } else {
          goldTag = lexicon.tagIndex.get(goldTagId);
        }
        errors.incrementCount(goldTag);
      }
    } else {
      System.err.printf("NO TAGGINGS: %s %s%n", event.word(), event.featureStr());
    }
    log.info();
  }

  // Output accuracy
  double accuracy = nCorrect / (double) tuningSet.size();
  System.err.printf("%n%nACCURACY: %.2f%n%n", accuracy * 100.0);

  log.info("% of errors by type:");
  // Order the keys by raw error count first, then normalize for printing.
  List<String> errorKeys = new ArrayList<>(errors.keySet());
  Collections.sort(errorKeys, Counters.toComparator(errors, false, true));
  Counters.normalize(errors);
  for (String errorKey : errorKeys) {
    System.err.printf("%s\t%.2f%n", errorKey, errors.getCount(errorKey) * 100.0);
  }
}
use of edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification in project CoreNLP by stanfordnlp.
the class FrenchTreebankParserParams method setOptionFlag.
/**
 * Processes the option at {@code args[i]} and returns the index of the next
 * unprocessed argument.
 *
 * <p>If {@code args[i]} is a key of {@code annotations}, it is enabled as a
 * feature. Otherwise it is matched against the French-specific options handled
 * below. Options that take a value ({@code -headFinder}, {@code -frenchMWMap},
 * {@code -factlex}) are only recognized when that value is present, and consume
 * two arguments. If nothing matches, {@code i} is returned unchanged.
 *
 * @param args the argument array
 * @param i    index of the option to process
 * @return the index following the consumed arguments, or {@code i} itself when
 *         the option was not recognized
 */
@Override
public int setOptionFlag(String[] args, int i) {
  if (annotations.containsKey(args[i])) {
    addFeature(args[i]);
    i++;

  } else if (args[i].equals("-collinizerRetainsPunctuation")) {
    optionsString.append("Collinizer retains punctuation.\n");
    collinizerRetainsPunctuation = true;
    i++;

  } else if (args[i].equalsIgnoreCase("-headFinder") && (i + 1 < args.length)) {
    try {
      // Instantiate through the no-arg constructor; Class.newInstance() is deprecated.
      HeadFinder hf = (HeadFinder) Class.forName(args[i + 1]).getDeclaredConstructor().newInstance();
      setHeadFinder(hf);
      optionsString.append("HeadFinder: " + args[i + 1] + "\n");
    } catch (Exception e) {
      // Best effort: a head finder that cannot be loaded is logged, and both
      // arguments are still consumed.
      log.info(e);
      log.info(this.getClass().getName() + ": Could not load head finder " + args[i + 1]);
    }
    i += 2;

  } else if (args[i].equals("-xmlFormat")) {
    optionsString.append("Reading trees in XML format.\n");
    readPennFormat = false;
    setInputEncoding(tlp.getEncoding());
    i++;

  } else if (args[i].equals("-frenchFactored")) {
    for (String feature : factoredFeatures) {
      addFeature(feature);
    }
    i++;

  } else if (args[i].equals("-frenchMWMap") && (i + 1 < args.length)) {
    // Guarded like -headFinder and -factlex: without a following value the
    // option is left unrecognized rather than indexing past the end of args.
    loadMWMap(args[i + 1]);
    i += 2;

  } else if (args[i].equals("-tsg")) {
    //wsg2011: These features should be removed for TSG extraction.
    //If they are retained, the resulting grammar seems to be too brittle....
    optionsString.append("Removing baseline features: -markVN, -coord1");
    removeFeature("-markVN");
    optionsString.append(" (removed -markVN)");
    removeFeature("-coord1");
    optionsString.append(" (removed -coord1)\n");
    i++;

  } else if (args[i].equals("-factlex") && (i + 1 < args.length)) {
    String activeFeats = setupMorphoFeatures(args[i + 1]);
    optionsString.append("Factored Lexicon: active features: ").append(activeFeats);
    // WSGDEBUG Maybe add -mweTag in place of -tagPAFr?
    removeFeature("-tagPAFr");
    optionsString.append(" (removed -tagPAFr)\n");

    // Add -mweTag
    String[] option = { "-mweTag" };
    setOptionFlag(option, 0);
    i += 2;

  } else if (args[i].equals("-noFeatures")) {
    for (String feature : annotations.keySet()) {
      removeFeature(feature);
    }
    optionsString.append("Removed all manual features.\n");
    i++;

  } else if (args[i].equals("-ccTagsetAnnotations")) {
    tagSpec = new FrenchMorphoFeatureSpecification();
    tagSpec.activate(MorphoFeatureType.OTHER);
    optionsString.append("Adding CC tagset as POS state splits.\n");
    ++i;
  }
  return i;
}
Aggregations