Use of edu.umd.hooka.corpora.LanguagePair in project Cloud9 by lintool.
Class CreateWordAlignmentCorpus, method main.
/**
 * Command-line entry point: preprocesses a whitespace-tokenized text file for word alignment.
 *
 * <p>Expects exactly three arguments: a two-letter ISO 639-1 language code, the input file and
 * the output file. Each input line is read as UTF-8, split on runs of whitespace, passed through
 * an {@code AlignmentWordPreprocessor} created for the pair {@code <lang>-en}, and written to the
 * output file (also UTF-8) as one line with the resulting tokens separated by single spaces.
 *
 * <p>Prints a usage message and exits with status 1 when the argument count is wrong. I/O errors
 * are reported with a stack trace on standard error. Both files are closed on every exit path.
 *
 * @param args {@code <lang> <infile.txt> <outfile.txt>}
 */
public static void main(String[] args) {
    if (args.length != 3) {
        System.err.println("Usage: CreateWordAlignmentCorpus <lang> <infile.txt> <outfile.txt>");
        System.err.println(" (note: lang must be a two-letter ISO639 code)");
        System.exit(1);
    }
    // Declared outside the try so the finally block can release them even when setup fails.
    BufferedReader in = null;
    BufferedWriter out = null;
    try {
        in = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "UTF8"));
        Language fl = Language.languageForISO639_1(args[0]);
        // The other side of the pair is fixed to English.
        LanguagePair lp = LanguagePair.languageForISO639_1Pair(args[0] + "-en");
        AlignmentWordPreprocessor sawp = AlignmentWordPreprocessor.CreatePreprocessor(lp, fl, null);
        // The output file is opened only after the language lookups have succeeded, so a bad
        // language code does not create or truncate it.
        out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[2]), "UTF8"));
        String l;
        while ((l = in.readLine()) != null) {
            String[] res = sawp.preprocessWordsForAlignment(l.split("\\s+"));
            boolean first = true;
            for (String r : res) {
                if (first)
                    first = false;
                else
                    out.write(' ');
                out.write(r);
            }
            out.newLine();
        }
        // Flush and close explicitly on the success path so a failure here is reported below.
        out.flush();
        out.close();
    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        // Release both files on every path. Errors from these clean-up closes are ignored:
        // a failure on the success path was already reported by the explicit close above,
        // and on a failure path the original exception is the one worth surfacing.
        if (out != null) {
            try {
                out.close();
            } catch (IOException ignored) {
            }
        }
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
            }
        }
    }
}
Use of edu.umd.hooka.corpora.LanguagePair in project Cloud9 by lintool.
Class ParallelCorpusReader, method convertToXMLDocument.
/**
 * Converts two line-aligned text files (and optionally a Pharaoh-format alignment file) into a
 * single XML document.
 *
 * <p>The output starts with an XML declaration naming the writer's encoding, followed by a
 * {@code <pdoc name="...">} element whose name is the part of {@code ifile1} after its last
 * {@code '/'}. For every pair of lines a {@code ParallelChunk} named {@code label + lineNumber}
 * (1-based) is built and its {@code toXML()} output appended. Conversion stops at the end of
 * {@code ifile1}, or earlier with a warning if {@code ifile2} runs out first.
 *
 * <p>All exceptions, including the argument-consistency errors raised at the top, are caught and
 * reported with a stack trace on standard error; nothing propagates to the caller. All files are
 * closed on every exit path.
 *
 * @param label prefix for each chunk name; the 1-based sentence number is appended
 * @param ifile1 path of the first corpus file, read as UTF-8 and tagged with language {@code le}
 * @param ifile2 path of the second corpus file, read as UTF-8 and tagged with language {@code lf}
 * @param afile1_2 path of the alignment file; must be non-empty iff {@code readAlignments}
 * @param ofile path of the XML file to write
 * @param oenc character encoding name for the output file
 * @param le ISO 639-1 code of the language in {@code ifile1}
 * @param lf ISO 639-1 code of the language in {@code ifile2}
 * @param readAlignments whether to read alignments from {@code afile1_2}
 */
private static void convertToXMLDocument(String label, String ifile1, String ifile2, String afile1_2, String ofile, String oenc, String le, String lf, boolean readAlignments) {
    // Declared outside the try so the finally block can release whatever was opened.
    BufferedReader r1 = null;
    BufferedReader r2 = null;
    BufferedReader r1_2 = null;
    FileOutputStream os = null;
    BufferedWriter w = null;
    try {
        if (readAlignments) {
            if (afile1_2 == null || afile1_2.equals(""))
                throw new RuntimeException("I'm supposed to read alignments, but no alignment file is set!");
        } else if (afile1_2 != null && !afile1_2.equals(""))
            throw new RuntimeException("I'm not set to read alignments, but an alignment file is set!");
        r1 = new BufferedReader(new InputStreamReader(new FileInputStream(ifile1), "UTF8"));
        r2 = new BufferedReader(new InputStreamReader(new FileInputStream(ifile2), "UTF8"));
        if (readAlignments)
            r1_2 = new BufferedReader(new InputStreamReader(new FileInputStream(afile1_2), "UTF8"));
        // Keep a handle on the raw stream: if oenc is not a supported encoding the
        // OutputStreamWriter constructor throws and the stream would otherwise leak.
        os = new FileOutputStream(ofile);
        OutputStreamWriter w1 = new OutputStreamWriter(os, oenc);
        w = new BufferedWriter(w1);
        Language de = Language.languageForISO639_1(lf);
        Language en = Language.languageForISO639_1(le);
        LanguagePair ende = null;
        if (readAlignments)
            ende = LanguagePair.languageForISO639_1Pair(le + "-" + lf);
        System.err.println("Reading " + en + " from: " + ifile1);
        System.err.println("Reading " + de + " from: " + ifile2);
        if (readAlignments)
            System.err.println("Reading alignments (" + ende + ") from: " + afile1_2);
        w.write("<?xml version=\"1.0\" encoding=\"" + w1.getEncoding() + "\"?>");
        w.newLine();
        // lastIndexOf returns -1 when there is no '/', so "+ 1" yields the whole string.
        // (Clamping -1 to 0 first, as was done before, dropped the first character.)
        String docName = ifile1.substring(ifile1.lastIndexOf('/') + 1);
        w.write("<pdoc name=\"" + docName + "\">");
        w.newLine();
        String e;
        int lc = 0;
        boolean alignmentsExhausted = false;
        while ((e = r1.readLine()) != null) {
            String f = r2.readLine();
            if (f == null) {
                System.err.println("WARNING: " + ifile2 + " has fewer lines than " + ifile1);
                break;
            }
            // Count only sentence pairs that are actually converted.
            lc += 1;
            String a = null;
            if (readAlignments && !alignmentsExhausted) {
                a = r1_2.readLine();
                if (a == null) {
                    // Warn once; every later sentence is written without an alignment.
                    alignmentsExhausted = true;
                    System.err.println(afile1_2 + " has fewer lines than corpora files -- dropping alignments for remaining sentences");
                }
            }
            Chunk ec = new Chunk(e);
            Chunk fc = new Chunk(f);
            String name = label + lc;
            ParallelChunk p = new ParallelChunk();
            p.setName(name);
            p.addChunk(de, fc);
            p.addChunk(en, ec);
            if (a != null) {
                ReferenceAlignment ra = new ReferenceAlignment(ec.getLength(), fc.getLength());
                try {
                    ra.addAlignmentPointsPharaoh(a);
                    p.addReferenceAlignment(ende, ra);
                } catch (RuntimeException re) {
                    // A bad alignment line is reported and skipped; the sentence pair is
                    // still written, just without a reference alignment.
                    System.err.println("Couldn't set alignment points for sentence # " + lc);
                    System.err.println(" " + en + ": len=" + ec.getLength() + " words=" + ec);
                    System.err.println(" " + de + ": len=" + fc.getLength() + " words=" + fc);
                    System.err.println(" " + ende + ": " + a);
                }
            }
            w.write(p.toXML());
        }
        String t = r2.readLine();
        if (t != null)
            System.err.println("WARNING: " + ifile2 + " has more lines than " + ifile1);
        w.write("</pdoc>");
        System.out.println("Converted " + lc + " sentences");
        w.newLine();
        // Close explicitly on the success path so a flush failure reaches the catch below.
        w.close();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Release everything that was opened. Closing an already-closed stream is a no-op,
        // and errors here are ignored because any primary failure has already been reported.
        java.io.Closeable[] resources = { w, os, r1, r2, r1_2 };
        for (java.io.Closeable c : resources) {
            if (c == null)
                continue;
            try {
                c.close();
            } catch (java.io.IOException ignored) {
            }
        }
    }
}
Aggregations