Search in sources :

Example 6 with MorphoFeatures

use of edu.stanford.nlp.international.morph.MorphoFeatures in project CoreNLP by stanfordnlp.

the class FrenchMorphoFeatureSpecification method main.

/**
   * For debugging. Reads the file named on the command line one line at a time,
   * converts each line to morphological features with the GEN, NUM and PER
   * feature types activated, and prints the trimmed line and its features
   * separated by a tab.
   *
   * @param args exactly one element: the path of the file to read
   */
public static void main(String[] args) {
    if (args.length != 1) {
        System.err.printf("Usage: java %s file%n", FrenchMorphoFeatureSpecification.class.getName());
        System.exit(-1);
    }
    // try-with-resources guarantees the reader is closed even when reading or
    // feature conversion throws part-way through the file.
    try (BufferedReader br = new BufferedReader(new FileReader(args[0]))) {
        MorphoFeatureSpecification mfs = new FrenchMorphoFeatureSpecification();
        //Activate all features for debugging
        mfs.activate(MorphoFeatureType.GEN);
        mfs.activate(MorphoFeatureType.NUM);
        mfs.activate(MorphoFeatureType.PER);
        for (String line; (line = br.readLine()) != null; ) {
            // The untrimmed line is what gets parsed; only the echoed copy is trimmed.
            MorphoFeatures feats = mfs.strToFeatures(line);
            System.out.printf("%s\t%s%n", line.trim(), feats.toString());
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Also used : BufferedReader(java.io.BufferedReader) MorphoFeatureSpecification(edu.stanford.nlp.international.morph.MorphoFeatureSpecification) FileNotFoundException(java.io.FileNotFoundException) FileReader(java.io.FileReader) IOException(java.io.IOException) MorphoFeatures(edu.stanford.nlp.international.morph.MorphoFeatures)

Example 7 with MorphoFeatures

use of edu.stanford.nlp.international.morph.MorphoFeatures in project CoreNLP by stanfordnlp.

the class IOBUtils method tokenToDatums.

/**
   * Convert token to a sequence of per-character datums and add them to iobList.
   * The first character gets the "begin" label, later characters get the
   * "continuation" label, and characters affected by a rewrite get the
   * "rewrite" label. If the token is empty, either on entry or after the
   * rewrite rules have been applied, nothing is added.
   *
   * @param iobList the list that the created datums are appended to
   * @param cl the label whose rewritten-Arabic annotation and begin/end positions are read,
   *     and which is passed on to each created datum
   * @param token the token text to split into characters
   * @param tokType the token type, used when stripping segmentation markers from the
   *     rewritten annotation
   * @param tokenLabel the label supplying the raw token ({@code word()}) and its tag
   * @param lastToken the token compared against the preposition lam in rule #2
   *     (presumably the preceding token; confirm against the caller)
   * @param applyRewriteRules whether to apply the Arabic-specific rewrite rules
   * @param stripRewrites when true, most rules leave the labels untouched instead of
   *     marking the affected character as a rewrite
   * @param tf a TokenizerFactory returning ArabicTokenizers (for determining original segment boundaries)
   * @param origText the original string before tokenization (for determining original segment boundaries)
   */
private static void tokenToDatums(List<CoreLabel> iobList, CoreLabel cl, String token, TokenType tokType, CoreLabel tokenLabel, String lastToken, boolean applyRewriteRules, boolean stripRewrites, TokenizerFactory<CoreLabel> tf, String origText) {
    if (token.isEmpty())
        return;
    String lastLabel = ContinuationSymbol;
    String firstLabel = BeginSymbol;
    String rewritten = cl.get(ArabicDocumentReaderAndWriter.RewrittenArabicAnnotation.class);
    // Only cross-check characters against the rewritten form when one was annotated.
    boolean crossRefRewrites = true;
    if (rewritten == null) {
        rewritten = token;
        crossRefRewrites = false;
    } else {
        rewritten = stripSegmentationMarkers(rewritten, tokType);
    }
    if (applyRewriteRules) {
        // Apply Arabic-specific re-write rules
        String rawToken = tokenLabel.word();
        String tag = tokenLabel.tag();
        MorphoFeatureSpecification featureSpec = new ArabicMorphoFeatureSpecification();
        featureSpec.activate(MorphoFeatureType.NGEN);
        featureSpec.activate(MorphoFeatureType.NNUM);
        featureSpec.activate(MorphoFeatureType.DEF);
        featureSpec.activate(MorphoFeatureType.TENSE);
        MorphoFeatures features = featureSpec.strToFeatures(tag);
        // Rule #1 : ت --> ة
        if (features.getValue(MorphoFeatureType.NGEN).equals("F") && features.getValue(MorphoFeatureType.NNUM).equals("SG") && rawToken.endsWith("ت-") && !stripRewrites) {
            lastLabel = RewriteSymbol;
        } else if (rawToken.endsWith("ة-")) {
            assert token.endsWith("ة");
            token = token.substring(0, token.length() - 1) + "ت";
            lastLabel = RewriteSymbol;
        }
        // Rule #2 : لل --> ل ال
        if (lastToken.equals("ل") && features.getValue(MorphoFeatureType.DEF).equals("D")) {
            if (rawToken.startsWith("-ال")) {
                if (!token.startsWith("ا"))
                    log.info("Bad REWAL: " + rawToken + " / " + token);
                // Drop the leading character from both forms so they stay aligned.
                token = token.substring(1);
                rewritten = rewritten.substring(1);
                if (!stripRewrites)
                    firstLabel = RewriteSymbol;
            } else if (rawToken.startsWith("-ل")) {
                if (!token.startsWith("ل"))
                    log.info("Bad REWAL: " + rawToken + " / " + token);
                if (!stripRewrites)
                    firstLabel = RewriteSymbol;
            } else {
                log.info("Ignoring REWAL: " + rawToken + " / " + token);
            }
        }
        // Rule #4 : ا --> ى
        if (rawToken.endsWith("ى-")) {
            if (features.getValue(MorphoFeatureType.TENSE) != null) {
                // verb: ى becomes ا
                token = token.substring(0, token.length() - 1) + "ا";
            } else {
                // assume preposition:
                token = token.substring(0, token.length() - 1) + "ي";
            }
            if (!stripRewrites)
                lastLabel = RewriteSymbol;
        } else if (rawToken.equals("علي-") || rawToken.equals("-علي-")) {
            if (!stripRewrites)
                lastLabel = RewriteSymbol;
        }
    }
    String origWord;
    if (origText == null) {
        origWord = tokenLabel.word();
    } else {
        origWord = origText.substring(cl.beginPosition(), cl.endPosition());
    }
    // Skip leading characters of the original word that the tokenizer deletes.
    int origIndex = 0;
    while (origIndex < origWord.length() && isDeletedCharacter(origWord.charAt(origIndex), tf)) {
        ++origIndex;
    }
    // Create datums and add to iobList
    if (token.isEmpty()) {
        // Rule #2 can strip the only character of the token. There is nothing to
        // emit in that case, and token.charAt(0) below would throw.
        log.info("Rewriting resulted in empty token: " + tokenLabel.word());
        return;
    }
    String firstChar = String.valueOf(token.charAt(0));
    // Start at 0 to make sure we include the whole token according to the tokenizer
    iobList.add(createDatum(cl, firstChar, firstLabel, 0, origIndex + 1));
    final int numChars = token.length();
    if (crossRefRewrites && rewritten.length() != numChars) {
        System.err.printf("Rewritten annotation doesn't have correct length: %s>>>%s%n", token, rewritten);
        crossRefRewrites = false;
    }
    ++origIndex;
    for (int j = 1; j < numChars; ++j, ++origIndex) {
        while (origIndex < origWord.length() && isDeletedCharacter(origWord.charAt(origIndex), tf)) {
            ++origIndex;
        }
        // Clamp to the last character if the token is longer than the original word.
        if (origIndex >= origWord.length()) {
            origIndex = origWord.length() - 1;
        }
        String charLabel = (j == numChars - 1) ? lastLabel : ContinuationSymbol;
        String thisChar = String.valueOf(token.charAt(j));
        if (crossRefRewrites && !String.valueOf(rewritten.charAt(j)).equals(thisChar))
            charLabel = RewriteSymbol;
        // Compare label contents rather than String references.
        if (ContinuationSymbol.equals(charLabel) && thisChar.equals("ى") && j != numChars - 1)
            // Assume all mid-word alef maqsura are supposed to be yah
            charLabel = RewriteSymbol;
        iobList.add(createDatum(cl, thisChar, charLabel, origIndex, origIndex + 1));
    }
    // End at endPosition to make sure we include the whole token according to the tokenizer
    if (!iobList.isEmpty()) {
        iobList.get(iobList.size() - 1).setEndPosition(cl.endPosition());
    }
}
Also used : MorphoFeatureSpecification(edu.stanford.nlp.international.morph.MorphoFeatureSpecification) ArabicMorphoFeatureSpecification(edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification) MorphoFeatures(edu.stanford.nlp.international.morph.MorphoFeatures) ArabicMorphoFeatureSpecification(edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification)

Example 8 with MorphoFeatures

use of edu.stanford.nlp.international.morph.MorphoFeatures in project CoreNLP by stanfordnlp.

the class ArabicMorphoFeatureSpecification method main.

/**
   * For debugging. Converts a set of long tags (BAMA analyses as in the ATB) to their morpho
   * feature specification. The input file should have one long tag per line. Each trimmed
   * line is printed with its features, followed by the number of lines read.
   *
   * @param args two elements: the input file name, then a comma-separated list of
   *     {@code MorphoFeatureType} names to activate
   */
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.printf("Usage: java %s filename feats%n", ArabicMorphoFeatureSpecification.class.getName());
        System.exit(-1);
    }
    MorphoFeatureSpecification fSpec = new ArabicMorphoFeatureSpecification();
    String[] feats = args[1].split(",");
    for (String feat : feats) {
        // valueOf throws IllegalArgumentException for a name that is not a MorphoFeatureType.
        MorphoFeatureType fType = MorphoFeatureType.valueOf(feat);
        fSpec.activate(fType);
    }
    File fName = new File(args[0]);
    // try-with-resources guarantees the reader is closed even when reading or
    // feature conversion throws part-way through the file.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fName)))) {
        int nLine = 0;
        for (String line; (line = br.readLine()) != null; nLine++) {
            String tag = line.trim();
            MorphoFeatures mFeats = fSpec.strToFeatures(tag);
            System.out.printf("%s\t%s%n", tag, mFeats.toString());
        }
        System.out.printf("%nRead %d lines%n", nLine);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Also used : MorphoFeatureSpecification(edu.stanford.nlp.international.morph.MorphoFeatureSpecification) MorphoFeatures(edu.stanford.nlp.international.morph.MorphoFeatures)

Example 9 with MorphoFeatures

use of edu.stanford.nlp.international.morph.MorphoFeatures in project CoreNLP by stanfordnlp.

the class FrenchTreeNormalizer method replacePOSTag.

/**
   * Replaces the value and tag of a preterminal's label with the alternate tag
   * produced by the morphological analysis of its child.
   * <p>
   * The analysis string is the child label's original text. When that is null or
   * empty, a string is built from the preterminal's value and the child's
   * category instead. The label is left untouched when the analysis yields no
   * alternate tag.
   *
   * @param t a preterminal tree whose label and child label are both CoreLabels
   * @param morpho the specification used to turn the analysis string into features
   * @throws IllegalArgumentException if {@code t} is not a preterminal, or if its
   *     label or its child's label is not a CoreLabel
   */
private static void replacePOSTag(Tree t, MorphoFeatureSpecification morpho) {
    if (!t.isPreTerminal()) {
        throw new IllegalArgumentException("Can only operate on preterminals");
    }
    if (!(t.label() instanceof CoreLabel)) {
        throw new IllegalArgumentException("Only operates on CoreLabels");
    }
    CoreLabel label = (CoreLabel) t.label();
    Tree child = t.children()[0];
    if (!(child.label() instanceof CoreLabel)) {
        throw new IllegalArgumentException("Only operates on CoreLabels");
    }
    CoreLabel childLabel = (CoreLabel) child.label();
    // Morphological Analysis
    String morphStr = childLabel.originalText();
    if (morphStr == null || morphStr.equals("")) {
        morphStr = label.value();
        // POS subcategory
        String subCat = childLabel.category();
        // Test the content of the string; a reference comparison with "" is unreliable.
        if (subCat != null && !subCat.isEmpty()) {
            morphStr += "-" + subCat + "--";
        } else {
            morphStr += "---";
        }
    }
    MorphoFeatures feats = morpho.strToFeatures(morphStr);
    String altTag = feats.getAltTag();
    if (altTag != null && !altTag.equals("")) {
        label.setValue(altTag);
        label.setTag(altTag);
    }
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) Tree(edu.stanford.nlp.trees.Tree) MorphoFeatures(edu.stanford.nlp.international.morph.MorphoFeatures)

Aggregations

MorphoFeatures (edu.stanford.nlp.international.morph.MorphoFeatures)9 MorphoFeatureSpecification (edu.stanford.nlp.international.morph.MorphoFeatureSpecification)4 CoreLabel (edu.stanford.nlp.ling.CoreLabel)3 ArabicMorphoFeatureSpecification (edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification)1 FrenchMorphoFeatureSpecification (edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification)1 HasTag (edu.stanford.nlp.ling.HasTag)1 Label (edu.stanford.nlp.ling.Label)1 SerializableFunction (edu.stanford.nlp.process.SerializableFunction)1 Tree (edu.stanford.nlp.trees.Tree)1 BufferedReader (java.io.BufferedReader)1 FileNotFoundException (java.io.FileNotFoundException)1 FileReader (java.io.FileReader)1 IOException (java.io.IOException)1 Function (java.util.function.Function)1