Search in sources :

Example 1 with Example

use of edu.illinois.cs.cogcomp.transliteration.Example in project cogcomp-nlp by CogComp.

the class Utils method main.

/**
 * Experiment driver: trains an {@code SPModel} on wikidata examples for one language and uses it to
 * transliterate a file of tab-separated {@code name<TAB>label} lines.
 *
 * <p>Each name is split on spaces, every token is lower-cased and passed to the model, and the top
 * candidate for each token is joined back with spaces. Names for which no token produced a candidate are
 * dropped. The result is written as {@code transliteration<TAB>label} lines.
 *
 * @param args unused
 * @throws Exception if reading the inputs, training the model, or writing the output fails
 */
public static void main(String[] args) throws Exception {
    String lang = "Arabic";
    String wikidata = "Data/wikidata." + lang;
    List<String> allnames = LineIO.read("/Users/stephen/Dropbox/papers/NAACL2016/data/all-names2.txt");
    List<Example> training = readWikiData(wikidata);
    // Train on at most 2000 examples. A plain subList(0, 2000) throws IndexOutOfBoundsException
    // whenever the data file yields fewer examples than that.
    training = training.subList(0, Math.min(2000, training.size()));
    SPModel m = new SPModel(training);
    m.Train(5);
    // Quick sanity check of the trained model on a single known name.
    TopList<Double, String> res = m.Generate("stephen");
    System.out.println(res);
    List<String> outlines = new ArrayList<>();
    int i = 0;
    for (String nameAndLabel : allnames) {
        // Progress indicator: print the running line count every 100 lines.
        if (i % 100 == 0) {
            System.out.println(i);
        }
        i++;
        String[] s = nameAndLabel.split("\t");
        if (s.length < 2) {
            // Blank line or line without a label column: nothing to transliterate.
            continue;
        }
        String name = s[0];
        String label = s[1];
        StringBuilder line = new StringBuilder();
        for (String tok : name.split(" ")) {
            // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
            res = m.Generate(tok.toLowerCase(java.util.Locale.ROOT));
            if (res.size() > 0) {
                String topcand = res.getFirst().getSecond();
                line.append(topcand).append(' ');
            }
        }
        String transliterated = line.toString().trim();
        if (transliterated.length() > 0) {
            outlines.add(transliterated + "\t" + label);
        }
    }
    LineIO.write("/Users/stephen/Dropbox/papers/NAACL2016/data/all-names-" + lang + "2.txt", outlines);
}
Also used : SPModel(edu.illinois.cs.cogcomp.transliteration.SPModel) MultiExample(edu.illinois.cs.cogcomp.transliteration.MultiExample) Example(edu.illinois.cs.cogcomp.transliteration.Example)

Example 2 with Example

use of edu.illinois.cs.cogcomp.transliteration.Example in project cogcomp-nlp by CogComp.

the class Utils method readWikiData.

/**
 * Reads name pairs in the format created by the wikipedia-api project, commonly named wikidata.Language.
 *
 * <p>Each usable line holds a foreign name and an English name separated by a tab (foreign first). Both
 * names are lower-cased and split on spaces; every foreign token is paired with one English token to form
 * an {@code Example}. Lines containing {@code #}, lines with fewer than two tab-separated fields, lines
 * whose two names have a different number of tokens, and lines matching a few clean-up heuristics
 * (e.g. "jr.", " of ", "(") are skipped.
 *
 * <p>When {@code fix} is true, each foreign token is romanized and matched with the English token that has
 * the smallest Levenshtein distance to the romanized form. Matched English tokens are not removed from the
 * candidate pool, so the same English token may be paired with several foreign tokens.
 *
 * @param file name of file
 * @param fix whether or not the names should be reordered according to edit distance; when false the
 *            i-th foreign token is paired with the i-th English token
 * @return list of distinct examples, in the iteration order of the underlying hash set
 * @throws FileNotFoundException if {@code LineIO.read} cannot find {@code file}
 */
public static List<Example> readWikiData(String file, boolean fix) throws FileNotFoundException {
    List<String> lines = LineIO.read(file);
    // Romanize, decompose, then drop everything that is not a letter or digit.
    String id = "Any-Latin; NFD; [^\\p{Alnum}] Remove";
    Transliterator t = Transliterator.getInstance(id);
    HashSet<Example> unique = new HashSet<>();
    int skipping = 0;
    for (String line : lines) {
        if (line.contains("#")) {
            continue;
        }
        String[] parts = line.split("\t");
        if (parts.length < 2) {
            continue;
        }
        // In wikipedia data, the foreign name comes first, English second.
        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
        String foreign = parts[0].toLowerCase(java.util.Locale.ROOT);
        String english = parts[1].toLowerCase(java.util.Locale.ROOT);
        String[] ftoks = foreign.split(" ");
        String[] etoks = english.split(" ");
        if (ftoks.length != etoks.length) {
            logger.error("Mismatching length of tokens: " + english);
            skipping++;
            continue;
        }
        // other heuristics to help clean data
        if (english.contains("jr.") || english.contains("sr.") || english.contains(" of ") || english.contains(" de ") || english.contains("(") || english.contains("pope ")) {
            skipping++;
            continue;
        }
        int numtoks = ftoks.length;
        for (int i = 0; i < numtoks; i++) {
            String bestmatch = null;
            if (fix) {
                // Only the reordering mode needs the romanized form and the edit-distance search.
                // The romanized form is computed before the trailing comma is stripped; the transform
                // itself removes non-alphanumeric characters.
                String ftrans = t.transform(ftoks[i]);
                int mindist = Integer.MAX_VALUE;
                for (int j = 0; j < numtoks; j++) {
                    int d = LevensteinDistance.getLevensteinDistance(ftrans, etoks[j]);
                    if (d < mindist) {
                        // first token with the smallest distance wins
                        bestmatch = etoks[j];
                        mindist = d;
                    }
                }
            }
            // strip those pesky commas.
            if (ftoks[i].endsWith(",")) {
                ftoks[i] = ftoks[i].substring(0, ftoks[i].length() - 1);
            }
            Example addme;
            if (fix) {
                // This uses the best aligned version (recommended)
                addme = new Example(bestmatch, ftoks[i]);
            } else {
                // This assumes the file ordering is correct
                addme = new Example(etoks[i], ftoks[i]);
            }
            unique.add(addme);
        }
    }
    logger.debug(String.format("Skipped %d lines", skipping));
    return new ArrayList<>(unique);
}
Also used : MultiExample(edu.illinois.cs.cogcomp.transliteration.MultiExample) Example(edu.illinois.cs.cogcomp.transliteration.Example) Transliterator(com.ibm.icu.text.Transliterator)

Example 3 with Example

use of edu.illinois.cs.cogcomp.transliteration.Example in project cogcomp-nlp by CogComp.

the class Utils method convertMulti.

/**
 * Flattens multi-examples into token-level examples.
 *
 * <p>Every example obtained from {@code toExampleList()} has its transliterated word and its source word
 * split on spaces; tokens at the same position are paired into a new {@code Example}. An example whose two
 * sides split into a different number of tokens is logged as an error and contributes nothing.
 *
 * @param lme the multi-examples to expand
 * @return the token-level examples, in encounter order
 */
public static List<Example> convertMulti(List<MultiExample> lme) {
    List<Example> flattened = new ArrayList<>();
    for (MultiExample multi : lme) {
        for (Example pair : multi.toExampleList()) {
            String[] targetTokens = pair.getTransliteratedWord().split(" ");
            String[] sourceTokens = pair.sourceWord.split(" ");
            if (targetTokens.length == sourceTokens.length) {
                // Token counts agree: pair tokens position by position.
                int pos = 0;
                while (pos < targetTokens.length) {
                    flattened.add(new Example(sourceTokens[pos], targetTokens[pos]));
                    pos++;
                }
            } else {
                logger.error("Mismatched length: " + pair.sourceWord);
            }
        }
    }
    return flattened;
}
Also used : MultiExample(edu.illinois.cs.cogcomp.transliteration.MultiExample) Example(edu.illinois.cs.cogcomp.transliteration.Example)

Example 4 with Example

use of edu.illinois.cs.cogcomp.transliteration.Example in project cogcomp-nlp by CogComp.

the class Utils method romanization.

/**
 * Exploratory routine: reads the fixed Russian wikidata file, runs every foreign token through the ICU
 * transform defined below, and pairs the transformed token with the English token on the same line that has
 * the smallest Levenshtein distance to it.
 *
 * <p>The resulting examples are only counted; the method prints a progress counter and the final number of
 * examples and returns nothing. Lines containing {@code #}, lines with fewer than two tab-separated fields,
 * and lines whose names have different token counts are skipped.
 *
 * @throws FileNotFoundException if {@code LineIO.read} cannot find the input file
 */
public static void romanization() throws FileNotFoundException {
    List<String> lines = LineIO.read("/shared/corpora/transliteration/wikidata/wikidata.Russian.fixed");
    String id = "Any-Arabic; NFD; [^\\p{Alnum}] Remove";
    Transliterator t = Transliterator.getInstance(id);
    // Counts non-comment lines; used only for the progress output.
    int jj = 0;
    List<Example> examples = new ArrayList<>();
    for (String line : lines) {
        if (line.contains("#")) {
            continue;
        }
        jj++;
        String[] parts = line.split("\t");
        if (parts.length < 2) {
            continue;
        }
        // In wikipedia data, the foreign name comes first, English second.
        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
        String foreign = parts[0].toLowerCase(java.util.Locale.ROOT);
        String english = parts[1].toLowerCase(java.util.Locale.ROOT);
        String[] ftoks = foreign.split(" ");
        String[] etoks = english.split(" ");
        if (ftoks.length != etoks.length) {
            logger.error("Mismatching length of tokens: " + english);
            continue;
        }
        int numtoks = ftoks.length;
        for (int i = 0; i < numtoks; i++) {
            String ftrans = t.transform(ftoks[i]);
            int mindist = Integer.MAX_VALUE;
            String bestmatch = null;
            for (int j = 0; j < numtoks; j++) {
                int d = LevensteinDistance.getLevensteinDistance(ftrans, etoks[j]);
                if (d < mindist) {
                    // first token with the smallest distance wins; matched tokens stay in the pool
                    bestmatch = etoks[j];
                    mindist = d;
                }
            }
            examples.add(new Example(bestmatch, ftrans));
        }
        // Only reached for lines that were actually processed, so skipped lines never report progress.
        if (jj % 1000 == 0) {
            System.out.println(jj);
        }
    }
    System.out.println(examples.size());
}
Also used : MultiExample(edu.illinois.cs.cogcomp.transliteration.MultiExample) Example(edu.illinois.cs.cogcomp.transliteration.Example) Transliterator(com.ibm.icu.text.Transliterator)

Example 5 with Example

use of edu.illinois.cs.cogcomp.transliteration.Example in project cogcomp-nlp by CogComp.

the class Utils method readCCBData.

/**
 * This reads data from the Anne Irvine, CCB paper called Transliterating from Any Language.
 *
 * <p>The input is a tab-separated table whose first line is a header of language codes. For every later,
 * non-blank row, the cells in the {@code srccode} and {@code targetcode} columns are trimmed and, when both
 * are non-empty, turned into an {@code Example(source, target)}. Rows too short to contain both columns are
 * skipped. The two resolved column indices are printed to standard output.
 *
 * @param srccode header entry naming the source-language column
 * @param targetcode header entry naming the target-language column
 * @return the examples found, in file order; empty if the file has no lines
 * @throws FileNotFoundException if {@code LineIO.read} cannot find the data file
 * @throws IllegalArgumentException if either code does not appear in the header line
 */
public static List<Example> readCCBData(String srccode, String targetcode) throws FileNotFoundException {
    List<Example> examples = new ArrayList<>();
    String fname = "/shared/corpora/transliteration/from_anne_irvine/wikipedia_names";
    List<String> lines = LineIO.read(fname);
    if (lines.isEmpty()) {
        // No header line, hence no columns to look up and no data rows.
        return examples;
    }
    List<String> key = Arrays.asList(lines.get(0).split("\t"));
    int srcind = key.indexOf(srccode);
    int tgtind = key.indexOf(targetcode);
    System.out.println(srcind + ", " + tgtind);
    if (srcind < 0 || tgtind < 0) {
        // indexOf returns -1 for an unknown code; fail clearly instead of indexing a row with -1.
        throw new IllegalArgumentException("Language code not found in header of " + fname + ": source=" + srccode + ", target=" + targetcode);
    }
    // A row must be long enough for whichever of the two columns lies further right.
    int maxind = Math.max(srcind, tgtind);
    for (String line : lines.subList(1, lines.size())) {
        if (line.trim().length() == 0) {
            continue;
        }
        String[] sline = line.split("\t");
        // String.split drops trailing empty fields, so rows with empty final cells come back shorter.
        if (maxind >= sline.length) {
            continue;
        }
        String src = sline[srcind].trim();
        String tgt = sline[tgtind].trim();
        if (src.length() > 0 && tgt.length() > 0) {
            examples.add(new Example(src, tgt));
        }
    }
    return examples;
}
Also used : MultiExample(edu.illinois.cs.cogcomp.transliteration.MultiExample) Example(edu.illinois.cs.cogcomp.transliteration.Example)

Aggregations

Example (edu.illinois.cs.cogcomp.transliteration.Example): 5, MultiExample (edu.illinois.cs.cogcomp.transliteration.MultiExample): 5, Transliterator (com.ibm.icu.text.Transliterator): 2, SPModel (edu.illinois.cs.cogcomp.transliteration.SPModel): 1