use of edu.stanford.nlp.international.morph.MorphoFeatures in project CoreNLP by stanfordnlp.
the class FrenchMorphoFeatureSpecification method main.
/**
 * For debugging. Reads the file named by the single command-line argument line by line,
 * converts each line to morphological features (with the GEN, NUM and PER feature types
 * activated) and prints the trimmed line and its features, tab-separated, to standard output.
 *
 * @param args exactly one element: the path of the file to read
 */
public static void main(String[] args) {
  if (args.length != 1) {
    System.err.printf("Usage: java %s file%n", FrenchMorphoFeatureSpecification.class.getName());
    System.exit(-1);
  }
  // try-with-resources guarantees the reader is closed even when reading or
  // feature conversion fails part-way through the file.
  try (BufferedReader br = new BufferedReader(new FileReader(args[0]))) {
    MorphoFeatureSpecification mfs = new FrenchMorphoFeatureSpecification();
    //Activate all features for debugging
    mfs.activate(MorphoFeatureType.GEN);
    mfs.activate(MorphoFeatureType.NUM);
    mfs.activate(MorphoFeatureType.PER);
    for (String line; (line = br.readLine()) != null; ) {
      MorphoFeatures feats = mfs.strToFeatures(line);
      System.out.printf("%s\t%s%n", line.trim(), feats.toString());
    }
  } catch (IOException e) {
    // Also covers FileNotFoundException (a subclass of IOException); both were
    // handled identically before.
    e.printStackTrace();
  }
}
use of edu.stanford.nlp.international.morph.MorphoFeatures in project CoreNLP by stanfordnlp.
the class IOBUtils method tokenToDatums.
/**
 * Convert token to a sequence of datums and add to iobList.
 * <p>
 * One datum is created per character of {@code token}. The first character is labeled
 * {@code BeginSymbol} (or {@code RewriteSymbol} when rule #2 fires), later characters are
 * labeled {@code ContinuationSymbol}, and characters affected by a rewrite are labeled
 * {@code RewriteSymbol}. If the rewrite rules reduce the token to the empty string, a message
 * is logged and nothing is added.
 *
 * @param iobList the list that receives the created datums
 * @param cl label supplying the begin/end positions and the optional rewritten-form annotation
 * @param token the token text to split into characters; nothing is done when it is empty
 * @param tokType token type, passed through to {@code stripSegmentationMarkers}
 * @param tokenLabel label supplying the raw word and the tag used by the rewrite rules
 * @param lastToken compared against the lam prefix in rule #2 (presumably the previous token)
 * @param applyRewriteRules whether to apply the Arabic-specific rewrite rules
 * @param stripRewrites when true, most rules do not mark their characters with
 *        {@code RewriteSymbol}
 * @param tf a TokenizerFactory returning ArabicTokenizers (for determining original segment boundaries)
 * @param origText the original string before tokenization (for determining original segment boundaries)
 */
private static void tokenToDatums(List<CoreLabel> iobList, CoreLabel cl, String token, TokenType tokType, CoreLabel tokenLabel, String lastToken, boolean applyRewriteRules, boolean stripRewrites, TokenizerFactory<CoreLabel> tf, String origText) {
  if (token.isEmpty()) {
    return;
  }
  String lastLabel = ContinuationSymbol;
  String firstLabel = BeginSymbol;
  String rewritten = cl.get(ArabicDocumentReaderAndWriter.RewrittenArabicAnnotation.class);
  boolean crossRefRewrites = true;
  if (rewritten == null) {
    // No rewritten form available: nothing to cross-reference against.
    rewritten = token;
    crossRefRewrites = false;
  } else {
    rewritten = stripSegmentationMarkers(rewritten, tokType);
  }
  if (applyRewriteRules) {
    // Apply Arabic-specific re-write rules
    String rawToken = tokenLabel.word();
    String tag = tokenLabel.tag();
    MorphoFeatureSpecification featureSpec = new ArabicMorphoFeatureSpecification();
    featureSpec.activate(MorphoFeatureType.NGEN);
    featureSpec.activate(MorphoFeatureType.NNUM);
    featureSpec.activate(MorphoFeatureType.DEF);
    featureSpec.activate(MorphoFeatureType.TENSE);
    MorphoFeatures features = featureSpec.strToFeatures(tag);
    // Rule #1 : ت --> ة
    // Constant-first equals() so that an absent feature value cannot cause an NPE.
    if ("F".equals(features.getValue(MorphoFeatureType.NGEN)) && "SG".equals(features.getValue(MorphoFeatureType.NNUM)) && rawToken.endsWith("ت-") && !stripRewrites) {
      lastLabel = RewriteSymbol;
    } else if (rawToken.endsWith("ة-")) {
      assert token.endsWith("ة");
      token = token.substring(0, token.length() - 1) + "ت";
      lastLabel = RewriteSymbol;
    }
    // Rule #2 : لل --> ل ال
    if ("ل".equals(lastToken) && "D".equals(features.getValue(MorphoFeatureType.DEF))) {
      if (rawToken.startsWith("-ال")) {
        if (!token.startsWith("ا")) {
          log.info("Bad REWAL: " + rawToken + " / " + token);
        }
        // Drop the leading character from both forms so they stay aligned.
        token = token.substring(1);
        rewritten = rewritten.substring(1);
        if (!stripRewrites) {
          firstLabel = RewriteSymbol;
        }
      } else if (rawToken.startsWith("-ل")) {
        if (!token.startsWith("ل")) {
          log.info("Bad REWAL: " + rawToken + " / " + token);
        }
        if (!stripRewrites) {
          firstLabel = RewriteSymbol;
        }
      } else {
        log.info("Ignoring REWAL: " + rawToken + " / " + token);
      }
    }
    // Rule #4 : ا --> ى
    if (rawToken.endsWith("ى-")) {
      // NOTE(review): this branch relies on getValue returning null when TENSE is absent;
      // confirm against MorphoFeatures.getValue.
      if (features.getValue(MorphoFeatureType.TENSE) != null) {
        // verb: ى becomes ا
        token = token.substring(0, token.length() - 1) + "ا";
      } else {
        // assume preposition:
        token = token.substring(0, token.length() - 1) + "ي";
      }
      if (!stripRewrites) {
        lastLabel = RewriteSymbol;
      }
    } else if (rawToken.equals("علي-") || rawToken.equals("-علي-")) {
      if (!stripRewrites) {
        lastLabel = RewriteSymbol;
      }
    }
  }
  String origWord;
  if (origText == null) {
    origWord = tokenLabel.word();
  } else {
    origWord = origText.substring(cl.beginPosition(), cl.endPosition());
  }
  // Skip leading characters of the original word that the tokenizer deletes.
  int origIndex = 0;
  while (origIndex < origWord.length() && isDeletedCharacter(origWord.charAt(origIndex), tf)) {
    ++origIndex;
  }
  // Create datums and add to iobList
  if (token.isEmpty()) {
    // Rule #2 can strip the only character of the token. There is no character left to
    // build a datum from, so stop here instead of failing in token.charAt(0) below.
    log.info("Rewriting resulted in empty token: " + tokenLabel.word());
    return;
  }
  String firstChar = String.valueOf(token.charAt(0));
  // Start at 0 to make sure we include the whole token according to the tokenizer
  iobList.add(createDatum(cl, firstChar, firstLabel, 0, origIndex + 1));
  final int numChars = token.length();
  if (crossRefRewrites && rewritten.length() != numChars) {
    System.err.printf("Rewritten annotation doesn't have correct length: %s>>>%s%n", token, rewritten);
    crossRefRewrites = false;
  }
  ++origIndex;
  for (int j = 1; j < numChars; ++j, ++origIndex) {
    while (origIndex < origWord.length() && isDeletedCharacter(origWord.charAt(origIndex), tf)) {
      ++origIndex;
    }
    // Clamp to the last character when the token is longer than the original word.
    if (origIndex >= origWord.length()) {
      origIndex = origWord.length() - 1;
    }
    String charLabel = (j == numChars - 1) ? lastLabel : ContinuationSymbol;
    String thisChar = String.valueOf(token.charAt(j));
    if (crossRefRewrites && !String.valueOf(rewritten.charAt(j)).equals(thisChar)) {
      charLabel = RewriteSymbol;
    }
    // Compare label contents with equals() rather than by String reference.
    if (ContinuationSymbol.equals(charLabel) && thisChar.equals("ى") && j != numChars - 1) {
      // Assume all mid-word alef maqsura are supposed to be yah
      charLabel = RewriteSymbol;
    }
    iobList.add(createDatum(cl, thisChar, charLabel, origIndex, origIndex + 1));
  }
  // End at endPosition to make sure we include the whole token according to the tokenizer
  if (!iobList.isEmpty()) {
    iobList.get(iobList.size() - 1).setEndPosition(cl.endPosition());
  }
}
use of edu.stanford.nlp.international.morph.MorphoFeatures in project CoreNLP by stanfordnlp.
the class ArabicMorphoFeatureSpecification method main.
/**
 * For debugging. Converts a set of long tags (BAMA analyses as in the ATB) to their morpho
 * feature specification. The input file should have one long tag per line. Each trimmed line
 * is printed with its features, tab-separated, followed by the number of lines read.
 *
 * @param args two elements: the input file name, and a comma-separated list of
 *        {@code MorphoFeatureType} constant names to activate
 */
public static void main(String[] args) {
  if (args.length != 2) {
    System.err.printf("Usage: java %s filename feats%n", ArabicMorphoFeatureSpecification.class.getName());
    System.exit(-1);
  }
  MorphoFeatureSpecification fSpec = new ArabicMorphoFeatureSpecification();
  String[] feats = args[1].split(",");
  for (String feat : feats) {
    // valueOf throws IllegalArgumentException for a name that is not an enum constant.
    MorphoFeatureType fType = MorphoFeatureType.valueOf(feat);
    fSpec.activate(fType);
  }
  File fName = new File(args[0]);
  // try-with-resources guarantees the reader is closed even when reading or
  // feature conversion fails part-way through the file.
  try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fName)))) {
    int nLine = 0;
    for (String line; (line = br.readLine()) != null; nLine++) {
      MorphoFeatures mFeats = fSpec.strToFeatures(line.trim());
      System.out.printf("%s\t%s%n", line.trim(), mFeats.toString());
    }
    System.out.printf("%nRead %d lines%n", nLine);
  } catch (IOException e) {
    // Also covers FileNotFoundException (a subclass of IOException); both were
    // handled identically before.
    e.printStackTrace();
  }
}
use of edu.stanford.nlp.international.morph.MorphoFeatures in project CoreNLP by stanfordnlp.
the class FrenchTreeNormalizer method replacePOSTag.
/**
 * Replaces the value and tag of a preterminal's label with the alternate tag obtained from
 * the morphological feature specification, when that alternate tag is non-empty.
 * <p>
 * The morphological analysis string is taken from the original text of the child (word)
 * label. When that is missing, a string is built from the preterminal's value and the
 * child's category, in the form {@code value-subCat--} or {@code value---}.
 *
 * @param t a preterminal tree whose label and first child's label are both CoreLabels
 * @param morpho the specification used to convert the analysis string to features
 * @throws IllegalArgumentException if {@code t} is not a preterminal, or if either label
 *         is not a CoreLabel
 */
private static void replacePOSTag(Tree t, MorphoFeatureSpecification morpho) {
  if (!t.isPreTerminal()) {
    throw new IllegalArgumentException("Can only operate on preterminals");
  }
  if (!(t.label() instanceof CoreLabel)) {
    throw new IllegalArgumentException("Only operates on CoreLabels");
  }
  CoreLabel label = (CoreLabel) t.label();
  Tree child = t.children()[0];
  if (!(child.label() instanceof CoreLabel)) {
    throw new IllegalArgumentException("Only operates on CoreLabels");
  }
  CoreLabel childLabel = (CoreLabel) child.label();
  // Morphological Analysis
  String morphStr = childLabel.originalText();
  if (morphStr == null || morphStr.isEmpty()) {
    morphStr = label.value();
    // POS subcategory
    String subCat = childLabel.category();
    // Test for emptiness by content; the previous `!= ""` compared String references.
    if (subCat != null && !subCat.isEmpty()) {
      morphStr += "-" + subCat + "--";
    } else {
      morphStr += "---";
    }
  }
  MorphoFeatures feats = morpho.strToFeatures(morphStr);
  String altTag = feats.getAltTag();
  if (altTag != null && !altTag.isEmpty()) {
    label.setValue(altTag);
    label.setTag(altTag);
  }
}
Aggregations