use of edu.umd.hooka.alignment.aer.ReferenceAlignment in project Cloud9 by lintool.
the class PhrasePair method readFields.
/**
 * Restores this phrase pair from {@code in}.
 *
 * The fields are read in this order: the {@code f} side, the {@code e} side,
 * a one-byte alignment tag (0 = no alignment, 1 = {@code Alignment},
 * 2 = {@code ReferenceAlignment}) followed by the alignment data when the tag
 * is non-zero, and finally a boolean flag followed by the posterior grid data
 * when the flag is set.
 *
 * Both optional members ({@code a} and {@code g}) are cleared before they are
 * conditionally re-read, so an instance that is reused for several records
 * never keeps an alignment or grid that belonged to a previous record.
 *
 * @param in the stream to read the serialized phrase pair from
 * @throws IOException if reading fails or the alignment tag is not 0, 1 or 2
 */
public void readFields(DataInput in) throws IOException {
    f.readFields(in);
    e.readFields(in);
    byte at = in.readByte();
    // Drop any optional state left over from a previously read record.
    a = null;
    g = null;
    if (at != 0) {
        // The alignment is sized from the sentences that were just read.
        if (at == 1)
            a = new Alignment(f.size(), e.size());
        else if (at == 2)
            a = new ReferenceAlignment(f.size(), e.size());
        else
            throw new IOException("bad format! at=" + at);
        a.readFields(in);
        // Sanity check: the alignment dimensions must match the sentences.
        assert (a.getELength() == e.getWords().length);
        assert (a.getFLength() == f.getWords().length);
    }
    boolean hasg = in.readBoolean();
    if (hasg) {
        g = new AlignmentPosteriorGrid(this);
        g.readFiles(in);
    }
}
use of edu.umd.hooka.alignment.aer.ReferenceAlignment in project Cloud9 by lintool.
the class AlignmentTest method testReference.
/**
 * Checks that counting sure hits of a hypothesis alignment against a
 * reference alignment yields the expected value.
 */
public void testReference() {
    // Reference: two points added with align() and two with sureAlign().
    ReferenceAlignment gold = new ReferenceAlignment(4, 5);
    gold.align(0, 0);
    gold.sureAlign(1, 1);
    gold.sureAlign(3, 4);
    gold.align(2, 4);

    // Hypothesis with the same dimensions; of the points passed to
    // sureAlign() above, only (1, 1) is also aligned here.
    Alignment hypothesis = new Alignment(4, 5);
    hypothesis.align(0, 1);
    hypothesis.align(1, 1);
    hypothesis.align(2, 4);
    hypothesis.align(1, 4);

    int sureHits = gold.countSureHits(hypothesis);
    assertEquals(1, sureHits);

    // Disabled check, kept for reference: calling countProbableHits with an
    // alignment of different dimensions was expected to fail.
    // Alignment mismatched = new Alignment(3,4);
    // try {
    // gold.countProbableHits(mismatched);
    // fail("Should fail");
    // } catch (RuntimeException expected) {}
}
use of edu.umd.hooka.alignment.aer.ReferenceAlignment in project Cloud9 by lintool.
the class ParallelCorpusReader method endElement.
/**
 * Handles the end of an XML element, dispatching on {@code qName}
 * (compared case-insensitively):
 * <ul>
 * <li>{@code pchunk}: counts the chunk and passes it to the callback.</li>
 * <li>{@code s}: turns the trimmed buffered text into a {@code Chunk} for the
 * current language; an empty segment is reported on stderr and skipped.</li>
 * <li>{@code wordalignment}: builds a {@code ReferenceAlignment} sized from
 * the source and target chunks and fills it from the buffered text.</li>
 * <li>{@code pdoc}: prints summary counts to stderr.</li>
 * </ul>
 *
 * The text buffer is released after every {@code s} and
 * {@code wordalignment} element, whether or not the segment was empty.
 *
 * @param uri       namespace URI (unused)
 * @param localName local name (unused)
 * @param qName     qualified element name used for dispatch
 * @throws SAXException     if the element name is not one of the above
 * @throws RuntimeException if a word alignment ends before both the source
 *                          and the target chunk of the language pair exist
 */
public void endElement(String uri, String localName, String qName) throws SAXException {
    if (qName.equalsIgnoreCase("pchunk")) {
        pchunkCount++;
        cb_.handlePChunk(pchunk);
    } else if (qName.equalsIgnoreCase("s")) {
        String s = tempVal.toString().trim();
        if (s.length() == 0) {
            System.err.println(pchunk.getName() + ": Empty segment for lang=" + lang);
        } else {
            // Reuse the already trimmed text instead of trimming it again.
            pchunk.addChunk(lang, new Chunk(s));
            chunkCount++;
        }
        // Release the buffer on both paths, as the wordalignment branch does.
        tempVal = null;
    } else if (qName.equalsIgnoreCase("wordalignment")) {
        // The alignment dimensions come from the chunks, so both must already exist.
        Chunk sc = pchunk.getChunk(langpair.getSource());
        if (sc == null)
            throw new RuntimeException("PChunk doesn't contain data for lang: " + langpair.getSource() + ". Note: manual word alignment data must follow the chunk data.");
        Chunk tc = pchunk.getChunk(langpair.getTarget());
        if (tc == null)
            throw new RuntimeException("PChunk doesn't contain data for lang: " + langpair.getTarget() + ". Note: manual word alignment data must follow the chunk data.");
        ReferenceAlignment r = new ReferenceAlignment(sc.getLength(), tc.getLength());
        r.addAlignmentPointsPharaoh(tempVal.toString().trim());
        pchunk.addReferenceAlignment(langpair, r);
        refAlignCount++;
        tempVal = null;
    } else if (qName.equalsIgnoreCase("pdoc")) {
        System.err.println("Finished parsing document " + docName);
        System.err.println(" pchunks: " + pchunkCount);
        System.err.println(" chunks: " + chunkCount);
        System.err.println(" ref alignments: " + refAlignCount);
    } else {
        throw new SAXException("Unknown tag: " + qName);
    }
}
use of edu.umd.hooka.alignment.aer.ReferenceAlignment in project Cloud9 by lintool.
the class ParallelCorpusReader method convertToXMLDocument.
/**
 * Converts two line-parallel text files, and optionally a line-parallel
 * alignment file, into one {@code <pdoc>} XML document.
 *
 * Line {@code i} of {@code ifile1} and line {@code i} of {@code ifile2}
 * become one {@code ParallelChunk} named {@code label + i} (1-based). When
 * alignments are read, line {@code i} of {@code afile1_2} is parsed with
 * {@code addAlignmentPointsPharaoh} and attached to the chunk; a line that
 * cannot be applied is reported on stderr and the chunk is written without
 * an alignment. Conversion stops at the end of the shorter text file, with a
 * warning when the lengths differ.
 *
 * All input files are read as UTF8. Every exception is caught and its stack
 * trace printed; all streams are closed whether or not conversion succeeds.
 *
 * @param label          prefix of the generated chunk names
 * @param ifile1         path of the text file for language {@code le}
 * @param ifile2         path of the text file for language {@code lf}
 * @param afile1_2       path of the alignment file; must be set exactly
 *                       when {@code readAlignments} is true
 * @param ofile          path of the XML file to write
 * @param oenc           character encoding of the output file
 * @param le             language code of {@code ifile1}, resolved with
 *                       {@code Language.languageForISO639_1}
 * @param lf             language code of {@code ifile2}, resolved the same way
 * @param readAlignments whether to read alignments from {@code afile1_2}
 */
private static void convertToXMLDocument(String label, String ifile1, String ifile2, String afile1_2, String ofile, String oenc, String le, String lf, boolean readAlignments) {
    BufferedReader r1 = null;
    BufferedReader r2 = null;
    BufferedReader r1_2 = null;
    FileOutputStream out = null;
    BufferedWriter w = null;
    try {
        boolean haveAlignmentFile = afile1_2 != null && !afile1_2.equals("");
        if (readAlignments && !haveAlignmentFile)
            throw new RuntimeException("I'm supposed to read alignments, but no alignment file is set!");
        if (!readAlignments && haveAlignmentFile)
            throw new RuntimeException("I'm not set to read alignments, but an alignment file is set!");

        r1 = new BufferedReader(new InputStreamReader(new FileInputStream(ifile1), "UTF8"));
        r2 = new BufferedReader(new InputStreamReader(new FileInputStream(ifile2), "UTF8"));
        if (readAlignments)
            r1_2 = new BufferedReader(new InputStreamReader(new FileInputStream(afile1_2), "UTF8"));
        // Track the raw stream separately so it is closed even if the
        // requested output encoding is rejected.
        out = new FileOutputStream(ofile);
        OutputStreamWriter w1 = new OutputStreamWriter(out, oenc);
        w = new BufferedWriter(w1);

        Language de = Language.languageForISO639_1(lf);
        Language en = Language.languageForISO639_1(le);
        LanguagePair ende = null;
        if (readAlignments)
            ende = LanguagePair.languageForISO639_1Pair(le + "-" + lf);
        System.err.println("Reading " + en + " from: " + ifile1);
        System.err.println("Reading " + de + " from: " + ifile2);
        if (readAlignments)
            System.err.println("Reading alignments (" + ende + ") from: " + afile1_2);

        w.write("<?xml version=\"1.0\" encoding=\"" + w1.getEncoding() + "\"?>");
        w.newLine();
        // lastIndexOf returns -1 when there is no '/', so "+ 1" yields the
        // whole string in that case and the part after the last '/' otherwise.
        String docName = ifile1.substring(ifile1.lastIndexOf('/') + 1);
        w.write("<pdoc name=\"" + docName + "\">");
        w.newLine();

        String e;
        int lc = 0;
        while ((e = r1.readLine()) != null) {
            String f = r2.readLine();
            if (f == null) {
                System.err.println("WARNING: " + ifile2 + " has fewer lines than " + ifile1);
                break;
            }
            // Count only sentence pairs that are actually converted.
            lc += 1;
            String a = null;
            if (readAlignments) {
                a = r1_2.readLine();
                if (a == null)
                    System.err.println(afile1_2 + " has fewer lines than corpora files -- dropping alignments for remaining sentences");
            }
            Chunk ec = new Chunk(e);
            Chunk fc = new Chunk(f);
            String name = label + lc;
            ParallelChunk p = new ParallelChunk();
            p.setName(name);
            p.addChunk(de, fc);
            p.addChunk(en, ec);
            if (a != null) {
                ReferenceAlignment ra = new ReferenceAlignment(ec.getLength(), fc.getLength());
                try {
                    ra.addAlignmentPointsPharaoh(a);
                    p.addReferenceAlignment(ende, ra);
                } catch (RuntimeException re) {
                    // Best effort: report the bad line and keep the chunk without an alignment.
                    System.err.println("Couldn't set alignment points for sentence # " + lc);
                    System.err.println(" " + en + ": len=" + ec.getLength() + " words=" + ec);
                    System.err.println(" " + de + ": len=" + fc.getLength() + " words=" + fc);
                    System.err.println(" " + ende + ": " + a);
                }
            }
            w.write(p.toXML());
        }
        String t = r2.readLine();
        if (t != null)
            System.err.println("WARNING: " + ifile2 + " has more lines than " + ifile1);
        w.write("</pdoc>");
        System.out.println("Converted " + lc + " sentences");
        w.newLine();
        // Close explicitly on the success path so that a failing flush is
        // reported by the catch block instead of being lost during cleanup.
        w.close();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        closeConversionStream(w);
        closeConversionStream(out);
        closeConversionStream(r1);
        closeConversionStream(r2);
        closeConversionStream(r1_2);
    }
}

/** Closes {@code c} if it is non-null, reporting (not propagating) a failure to close. */
private static void closeConversionStream(java.io.Closeable c) {
    if (c == null)
        return;
    try {
        c.close();
    } catch (java.io.IOException e) {
        System.err.println("WARNING: failed to close stream: " + e);
    }
}
use of edu.umd.hooka.alignment.aer.ReferenceAlignment in project Cloud9 by lintool.
the class ParallelChunk method getReferenceAlignment.
/**
 * Looks up the reference alignment stored for {@code lp}.
 *
 * If nothing is stored for {@code lp} itself but an alignment exists for the
 * inverted pair (e.g. fr-en when en-fr was requested), that alignment is
 * transposed, stored under {@code lp}, and returned.
 *
 * @param lp the language pair to look up
 * @return the alignment for {@code lp}, or {@code null} when no alignments
 *         are stored at all or neither direction has one
 */
public ReferenceAlignment getReferenceAlignment(LanguagePair lp) {
    if (lp2ra == null) {
        return null;
    }

    ReferenceAlignment direct = lp2ra.get(lp);
    if (direct != null) {
        return direct;
    }

    // Fall back to the opposite translation direction.
    ReferenceAlignment reverse = lp2ra.get(lp.inverted());
    if (reverse == null) {
        return null;
    }

    // Store the transposed alignment under the requested pair.
    ReferenceAlignment transposed = (ReferenceAlignment) reverse.getTranspose();
    lp2ra.put(lp, transposed);
    return transposed;
}
Aggregations