Use of edu.illinois.cs.cogcomp.transliteration.Example in project cogcomp-nlp by CogComp.
Class Utils, method main.
public static void main(String[] args) throws Exception {
    // romanization();
    String[] arabic_names = { "Urdu", "Arabic", "Egyptian_Arabic", "Mazandarani", "Pashto", "Persian", "Western_Punjabi" };
    String[] devanagari_names = { "Newar", "Hindi", "Marathi", "Nepali", "Sanskrit" };
    String[] cyrillic_names = { "Chuvash", "Bashkir", "Bulgarian", "Chechen", "Kirghiz", "Macedonian", "Russian", "Ukrainian" };
    // for(String name : arabic_names){
    // System.out.println(name + " : " + WAVE("models/probs-"+name+"-Urdu.txt"));
    // getSize(name);
    // }
    String lang = "Arabic";
    String wikidata = "Data/wikidata." + lang;
    List<String> allnames = LineIO.read("/Users/stephen/Dropbox/papers/NAACL2016/data/all-names2.txt");
    // Train a transliteration model on the first 2000 wikidata name pairs.
    List<Example> training = readWikiData(wikidata);
    training = training.subList(0, 2000);
    SPModel m = new SPModel(training);
    m.Train(5);
    TopList<Double, String> res = m.Generate("stephen");
    System.out.println(res);
    // Transliterate each token of each name and keep the top-scoring candidate.
    List<String> outlines = new ArrayList<>();
    int i = 0;
    for (String nameAndLabel : allnames) {
        if (i % 100 == 0) {
            System.out.println(i);
        }
        i++;
        String[] s = nameAndLabel.split("\t");
        String name = s[0];
        String label = s[1];
        String[] sname = name.split(" ");
        String line = "";
        for (String tok : sname) {
            res = m.Generate(tok.toLowerCase());
            if (res.size() > 0) {
                String topcand = res.getFirst().getSecond();
                line += topcand + " ";
            }
        }
        if (line.trim().length() > 0) {
            outlines.add(line.trim() + "\t" + label);
        }
    }
    LineIO.write("/Users/stephen/Dropbox/papers/NAACL2016/data/all-names-" + lang + "2.txt", outlines);
    // Transliterator t = Transliterator.getInstance("Any-am_FONIPA");
    //
    // String result = t.transform("Stephen");
    // System.out.println(result);
    //
    // Enumeration<String> tids = t.getAvailableIDs();
    //
    // while(tids.hasMoreElements()){
    // String e = tids.nextElement();
    // System.out.println(e);
    // }
}
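For reference, a minimal sketch of the SPModel workflow used in main, with a tiny in-memory training set in place of the wikidata file. The training pairs are made up for illustration, and the package locations of SPModel and TopList are assumptions (Example's package is given in the header above); adjust the imports to your tree.

import edu.illinois.cs.cogcomp.transliteration.Example;
import edu.illinois.cs.cogcomp.transliteration.SPModel;   // assumed package, alongside Example
import edu.illinois.cs.cogcomp.utils.TopList;             // assumed package

import java.util.ArrayList;
import java.util.List;

public class SPModelSketch {
    public static void main(String[] args) throws Exception {
        // Made-up (source, transliteration) pairs; a real model needs far more data.
        List<Example> training = new ArrayList<>();
        training.add(new Example("stephen", "стивен"));
        training.add(new Example("maria", "мария"));
        training.add(new Example("daniel", "даниил"));

        // Train for a few iterations, as main does with m.Train(5).
        SPModel model = new SPModel(training);
        model.Train(5);

        // Generate candidate transliterations, highest-scoring first.
        TopList<Double, String> candidates = model.Generate("stephen");
        if (candidates.size() > 0) {
            System.out.println("best: " + candidates.getFirst().getSecond());
        }
    }
}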
Use of edu.illinois.cs.cogcomp.transliteration.Example in project cogcomp-nlp by CogComp.
Class Utils, method readWikiData.
/**
 * This reads data in the format created by the wikipedia-api project, commonly named wikidata.Language.
 * @param file name of file
 * @param fix whether or not the names should be reordered according to edit distance.
 * @return list of examples
 * @throws FileNotFoundException
 */
public static List<Example> readWikiData(String file, boolean fix) throws FileNotFoundException {
    List<Example> examples = new ArrayList<>();
    List<String> lines = LineIO.read(file);
    String id = "Any-Latin; NFD; [^\\p{Alnum}] Remove";
    // id = "Any-Latin; NFD";
    Transliterator t = Transliterator.getInstance(id);
    HashSet<Example> unique = new HashSet<>();
    int skipping = 0;
    for (String line : lines) {
        if (line.contains("#")) {
            continue;
        }
        String[] parts = line.split("\t");
        if (parts.length < 2) {
            continue;
        }
        // In wikipedia data, the foreign name comes first, English second.
        String foreign = parts[0].toLowerCase();
        String english = parts[1].toLowerCase();
        String[] ftoks = foreign.split(" ");
        String[] etoks = english.split(" ");
        if (ftoks.length != etoks.length) {
            logger.error("Mismatching length of tokens: " + english);
            skipping++;
            continue;
        }
        // other heuristics to help clean data
        if (english.contains("jr.") || english.contains("sr.") || english.contains(" of ") || english.contains(" de ") || english.contains("(") || english.contains("pope ")) {
            skipping++;
            // logger.debug("Skipping: " + english);
            continue;
        }
        int numtoks = ftoks.length;
        for (int i = 0; i < numtoks; i++) {
            // Romanize the foreign token so it can be compared against the English tokens.
            String ftrans = t.transform(ftoks[i]);
            int mindist = Integer.MAX_VALUE;
            String bestmatch = null;
            // this is intended to help with ordering: pick the English token closest in edit distance.
            for (int j = 0; j < numtoks; j++) {
                int d = LevensteinDistance.getLevensteinDistance(ftrans, etoks[j]);
                if (d < mindist) {
                    // match etoks[j] with ftrans
                    bestmatch = etoks[j];
                    mindist = d;
                    // ideally etoks[j] would then be taken out of the running, but it is not removed here
                }
            }
            // strip those pesky commas.
            if (ftoks[i].endsWith(",")) {
                ftoks[i] = ftoks[i].substring(0, ftoks[i].length() - 1);
            }
            // This version uses transliterated words as the target (cheating)
            // examples.add(new Example(bestmatch, ftrans));
            Example addme;
            if (fix) {
                // This uses the best aligned version (recommended)
                addme = new Example(bestmatch, ftoks[i]);
            } else {
                // This assumes the file ordering is correct
                addme = new Example(etoks[i], ftoks[i]);
            }
            examples.add(addme);
            unique.add(addme);
        }
    }
    // System.out.println(file.split("\\.")[1] + " & " + numnames + " & " + examples.size() + " & " + unique.size() + " \\\\");
    logger.debug(String.format("Skipped %d lines", skipping));
    // return the deduplicated set of examples
    return new ArrayList<>(unique);
}
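A small usage sketch for readWikiData: it writes two made-up lines in the wikidata.Language layout (foreign name first, English second, tab-separated) to a temporary file and reads them back with fix=true. Utils is the class these snippets come from; its package is not shown here, so the import is an assumption, as are the sample names.

import edu.illinois.cs.cogcomp.transliteration.Example;

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.List;

public class ReadWikiDataSketch {
    public static void main(String[] args) throws Exception {
        // Two made-up lines in the expected layout: foreign<TAB>english; lines containing '#' are skipped.
        Path tmp = Files.createTempFile("wikidata.", ".Russian");
        Files.write(tmp, Arrays.asList(
                "стивен мэйхью\tstephen mayhew",
                "# this line is ignored"));
        // fix=true pairs each foreign token with the English token closest in edit distance to its
        // romanization; fix=false trusts the token order in the file.
        List<Example> examples = Utils.readWikiData(tmp.toString(), true);
        for (Example e : examples) {
            System.out.println(e.sourceWord + " -> " + e.getTransliteratedWord());
        }
    }
}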
Use of edu.illinois.cs.cogcomp.transliteration.Example in project cogcomp-nlp by CogComp.
Class Utils, method convertMulti.
public static List<Example> convertMulti(List<MultiExample> lme) {
    List<Example> training = new ArrayList<>();
    for (MultiExample me : lme) {
        for (Example e : me.toExampleList()) {
            String[] tls = e.getTransliteratedWord().split(" ");
            String[] ss = e.sourceWord.split(" ");
            if (tls.length != ss.length) {
                logger.error("Mismatched length: " + e.sourceWord);
                continue;
            }
            // Split multi-word examples into word-level training pairs.
            for (int i = 0; i < tls.length; i++) {
                training.add(new Example(ss[i], tls[i]));
            }
        }
    }
    return training;
}
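To illustrate just the word-splitting step without constructing MultiExample objects (their constructor is not shown in these snippets), here is a hypothetical standalone helper that performs the same per-token pairing on plain Examples; the sample pair is made up.

import edu.illinois.cs.cogcomp.transliteration.Example;

import java.util.ArrayList;
import java.util.List;

public class SplitExamplesSketch {

    /** Split whitespace-aligned multi-word examples into word-level pairs, skipping mismatched lengths. */
    public static List<Example> splitByToken(List<Example> multiWord) {
        List<Example> training = new ArrayList<>();
        for (Example e : multiWord) {
            String[] tls = e.getTransliteratedWord().split(" ");
            String[] ss = e.sourceWord.split(" ");
            if (tls.length != ss.length) {
                continue;  // token counts disagree, so there is no obvious alignment
            }
            for (int i = 0; i < tls.length; i++) {
                training.add(new Example(ss[i], tls[i]));
            }
        }
        return training;
    }

    public static void main(String[] args) {
        // Made-up two-token pair; both sides must have the same number of tokens.
        List<Example> in = new ArrayList<>();
        in.add(new Example("stephen mayhew", "стивен мэйхью"));
        for (Example e : splitByToken(in)) {
            System.out.println(e.sourceWord + " -> " + e.getTransliteratedWord());
        }
    }
}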
Use of edu.illinois.cs.cogcomp.transliteration.Example in project cogcomp-nlp by CogComp.
Class Utils, method romanization.
public static void romanization() throws FileNotFoundException {
    List<String> lines = LineIO.read("/shared/corpora/transliteration/wikidata/wikidata.Russian.fixed");
    // note: despite the method name, this id transliterates into Arabic script, not Latin.
    String id = "Any-Arabic; NFD; [^\\p{Alnum}] Remove";
    // id = "Any-Latin; NFD";
    Transliterator t = Transliterator.getInstance(id);
    int jj = 0;
    List<Example> examples = new ArrayList<>();
    for (String line : lines) {
        if (line.contains("#")) {
            continue;
        }
        jj++;
        String[] parts = line.split("\t");
        if (parts.length < 2) {
            continue;
        }
        // In wikipedia data, the foreign name comes first, English second.
        String foreign = parts[0].toLowerCase();
        String english = parts[1].toLowerCase();
        String[] ftoks = foreign.split(" ");
        String[] etoks = english.split(" ");
        if (ftoks.length != etoks.length) {
            logger.error("Mismatching length of tokens: " + english);
            continue;
        }
        int numtoks = ftoks.length;
        for (int i = 0; i < numtoks; i++) {
            String ftrans = t.transform(ftoks[i]);
            ftoks[i] = ftrans;
            int mindist = Integer.MAX_VALUE;
            String bestmatch = null;
            // pair each transformed token with the closest English token by edit distance.
            for (int j = 0; j < numtoks; j++) {
                int d = LevensteinDistance.getLevensteinDistance(ftrans, etoks[j]);
                if (d < mindist) {
                    // match etoks[j] with ftrans
                    bestmatch = etoks[j];
                    mindist = d;
                    // ideally etoks[j] would then be taken out of the running, but it is not removed here
                }
            }
            // System.out.print(ftrans + " : " + bestmatch + ", ");
            examples.add(new Example(bestmatch, ftrans));
        }
        if (jj % 1000 == 0) {
            System.out.println(jj);
        }
    }
    System.out.println(examples.size());
    Enumeration<String> tids = t.getAvailableIDs();
    while (tids.hasMoreElements()) {
        String e = tids.nextElement();
        // System.out.println(e);
    }
}
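The ICU4J Transliterator does the script conversion in romanization() and readWikiData(). Below is a self-contained sketch of that piece alone, using the same compound-id style; the sample string is illustrative and the printed form can vary with the ICU version.

import com.ibm.icu.text.Transliterator;

import java.util.Enumeration;

public class TransliteratorSketch {
    public static void main(String[] args) {
        // Romanize, decompose (NFD), then strip everything that is not alphanumeric.
        Transliterator toLatin = Transliterator.getInstance("Any-Latin; NFD; [^\\p{Alnum}] Remove");
        System.out.println(toLatin.transform("Стивен"));  // e.g. "Stiven", depending on the ICU version

        // List the transliterator ids available in this ICU build.
        Enumeration<String> ids = Transliterator.getAvailableIDs();
        while (ids.hasMoreElements()) {
            System.out.println(ids.nextElement());
        }
    }
}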
Use of edu.illinois.cs.cogcomp.transliteration.Example in project cogcomp-nlp by CogComp.
Class Utils, method readCCBData.
/**
 * This reads data from the Anne Irvine / CCB paper called Transliterating from Any Language.
 * @param srccode language code of the source column
 * @param targetcode language code of the target column
 * @return list of examples
 * @throws FileNotFoundException
 */
public static List<Example> readCCBData(String srccode, String targetcode) throws FileNotFoundException {
    List<Example> examples = new ArrayList<>();
    String fname = "/shared/corpora/transliteration/from_anne_irvine/wikipedia_names";
    List<String> lines = LineIO.read(fname);
    // The first line is a header that maps language codes to column indices.
    List<String> key = Arrays.asList(lines.get(0).split("\t"));
    int srcind = key.indexOf(srccode);
    int tgtind = key.indexOf(targetcode);
    System.out.println(srcind + ", " + tgtind);
    int i = 0;
    for (String line : lines) {
        if (i == 0 || line.trim().length() == 0) {
            i++;
            continue;
        }
        String[] sline = line.split("\t");
        // String.split drops trailing empty fields, so a line with an empty final column can be short.
        if (tgtind >= sline.length) {
            i++;
            continue;
        }
        String src = sline[srcind].trim();
        String tgt = sline[tgtind].trim();
        if (src.length() > 0 && tgt.length() > 0) {
            Example e = new Example(src, tgt);
            examples.add(e);
        }
        i++;
    }
    return examples;
}
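A minimal sketch of the column-lookup pattern readCCBData relies on, applied to an in-memory header plus one data row. The language codes and names here are made up; only the tab-separated header-then-rows layout matches the code above.

import edu.illinois.cs.cogcomp.transliteration.Example;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class CCBStyleParseSketch {
    public static void main(String[] args) {
        // First line is a header of language codes; each later line holds one name per column.
        List<String> lines = Arrays.asList(
                "en\tru\tde",
                "stephen\tстивен\tstefan");

        List<String> key = Arrays.asList(lines.get(0).split("\t"));
        int srcind = key.indexOf("en");
        int tgtind = key.indexOf("ru");

        List<Example> examples = new ArrayList<>();
        for (int i = 1; i < lines.size(); i++) {
            String[] cols = lines.get(i).split("\t");
            // split() drops trailing empty fields, so guard against short rows.
            if (srcind >= cols.length || tgtind >= cols.length) {
                continue;
            }
            String src = cols[srcind].trim();
            String tgt = cols[tgtind].trim();
            if (!src.isEmpty() && !tgt.isEmpty()) {
                examples.add(new Example(src, tgt));
            }
        }
        System.out.println(examples.size() + " example(s)");
    }
}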