Search in sources :

Example 1 with ReferenceAlignment

use of edu.umd.hooka.alignment.aer.ReferenceAlignment in project Cloud9 by lintool.

the class PhrasePair method readFields.

/**
 * Restores this phrase pair from {@code in}.
 *
 * <p>Read order: the {@code f} fields, the {@code e} fields, a one-byte
 * alignment tag, the alignment itself (only when the tag is non-zero), a
 * boolean grid flag, and the posterior grid (only when the flag is true).
 *
 * <p>Alignment tag values handled here: {@code 0} = no alignment,
 * {@code 1} = {@code Alignment}, {@code 2} = {@code ReferenceAlignment}.
 *
 * @param in the input to read from
 * @throws IOException if the alignment tag is not 0, 1 or 2, or if any
 *         underlying read fails
 */
public void readFields(DataInput in) throws IOException {
    f.readFields(in);
    e.readFields(in);
    byte at = in.readByte();
    a = null;
    if (at != 0) {
        if (at == 1) {
            a = new Alignment(f.size(), e.size());
        } else if (at == 2) {
            a = new ReferenceAlignment(f.size(), e.size());
        } else {
            throw new IOException("bad format! at=" + at);
        }
        a.readFields(in);
        assert (a.getELength() == e.getWords().length);
        assert (a.getFLength() == f.getWords().length);
    }
    boolean hasg = in.readBoolean();
    if (hasg) {
        g = new AlignmentPosteriorGrid(this);
        g.readFiles(in);
    } else {
        // Mirror the reset of 'a' above: if this instance is reused for
        // several records, a grid left over from an earlier record must
        // not survive into one that was serialized without a grid.
        g = null;
    }
}
Also used : ReferenceAlignment(edu.umd.hooka.alignment.aer.ReferenceAlignment) IOException(java.io.IOException)

Example 2 with ReferenceAlignment

use of edu.umd.hooka.alignment.aer.ReferenceAlignment in project Cloud9 by lintool.

the class AlignmentTest method testReference.

/**
 * Builds a 4x5 reference alignment and a 4x5 hypothesis alignment and
 * checks that {@code countSureHits} reports exactly one hit.
 *
 * <p>The reference gets two points via {@code align} and two via
 * {@code sureAlign}; of the latter, only (1, 1) is also present in the
 * hypothesis.
 */
public void testReference() {
    ReferenceAlignment reference = new ReferenceAlignment(4, 5);
    reference.align(0, 0);
    reference.sureAlign(1, 1);
    reference.sureAlign(3, 4);
    reference.align(2, 4);

    Alignment hypothesis = new Alignment(4, 5);
    hypothesis.align(0, 1);
    hypothesis.align(1, 1);
    hypothesis.align(2, 4);
    hypothesis.align(1, 4);

    assertEquals(1, reference.countSureHits(hypothesis));

    // Disabled check, kept for reference: comparing against an alignment
    // with different dimensions was expected to raise a RuntimeException.
    //     Alignment b = new Alignment(3, 4);
    //     try {
    //         reference.countProbableHits(b);
    //         fail("Should fail");
    //     } catch (RuntimeException f) {}
}
Also used : ReferenceAlignment(edu.umd.hooka.alignment.aer.ReferenceAlignment)

Example 3 with ReferenceAlignment

use of edu.umd.hooka.alignment.aer.ReferenceAlignment in project Cloud9 by lintool.

the class ParallelCorpusReader method endElement.

/**
 * Handles a closing XML tag, dispatching on {@code qName}
 * (case-insensitively):
 * <ul>
 *   <li>{@code pchunk}: counts the chunk pair and passes it to the callback;</li>
 *   <li>{@code s}: turns the trimmed buffered text into a {@code Chunk} for
 *       the current language, or logs a warning if the text is empty;</li>
 *   <li>{@code wordalignment}: parses the buffered text as alignment points
 *       and attaches the result to the current chunk pair;</li>
 *   <li>{@code pdoc}: prints summary counters to standard error.</li>
 * </ul>
 *
 * @param uri       unused here
 * @param localName unused here
 * @param qName     the qualified name of the element being closed
 * @throws SAXException     if {@code qName} is none of the tags above
 * @throws RuntimeException if a word alignment is closed before both the
 *                          source and target chunks are available
 */
public void endElement(String uri, String localName, String qName) throws SAXException {
    if (qName.equalsIgnoreCase("pchunk")) {
        pchunkCount++;
        cb_.handlePChunk(pchunk);
        return;
    }

    if (qName.equalsIgnoreCase("s")) {
        String segment = tempVal.toString().trim();
        if (segment.length() == 0) {
            // Note: the text buffer is left untouched on this path.
            System.err.println(pchunk.getName() + ": Empty segment for lang=" + lang);
            return;
        }
        pchunk.addChunk(lang, new Chunk(segment));
        chunkCount++;
        tempVal = null;
        return;
    }

    if (qName.equalsIgnoreCase("wordalignment")) {
        Chunk sourceChunk = pchunk.getChunk(langpair.getSource());
        if (sourceChunk == null) {
            throw new RuntimeException("PChunk doesn't contain data for lang: " + langpair.getSource() + ".  Note: manual word alignment data must follow the chunk data.");
        }
        Chunk targetChunk = pchunk.getChunk(langpair.getTarget());
        if (targetChunk == null) {
            throw new RuntimeException("PChunk doesn't contain data for lang: " + langpair.getTarget() + ".  Note: manual word alignment data must follow the chunk data.");
        }
        ReferenceAlignment reference = new ReferenceAlignment(sourceChunk.getLength(), targetChunk.getLength());
        reference.addAlignmentPointsPharaoh(tempVal.toString().trim());
        pchunk.addReferenceAlignment(langpair, reference);
        refAlignCount++;
        tempVal = null;
        return;
    }

    if (qName.equalsIgnoreCase("pdoc")) {
        System.err.println("Finished parsing document " + docName);
        System.err.println("  pchunks: " + pchunkCount);
        System.err.println("  chunks: " + chunkCount);
        System.err.println("  ref alignments: " + refAlignCount);
        return;
    }

    throw new SAXException("Unknown tag: " + qName);
}
Also used : Chunk(edu.umd.hooka.corpora.Chunk) ParallelChunk(edu.umd.hooka.corpora.ParallelChunk) ReferenceAlignment(edu.umd.hooka.alignment.aer.ReferenceAlignment) SAXException(org.xml.sax.SAXException)

Example 4 with ReferenceAlignment

use of edu.umd.hooka.alignment.aer.ReferenceAlignment in project Cloud9 by lintool.

the class ParallelCorpusReader method convertToXMLDocument.

/**
 * Converts two line-aligned text files (and optionally a line-aligned
 * alignment file) into a single {@code <pdoc>} XML document.
 *
 * <p>Line {@code i} of {@code ifile1} and line {@code i} of {@code ifile2}
 * become one {@code ParallelChunk} named {@code label + i}. Conversion stops
 * at the end of {@code ifile1}, or earlier (with a warning) if
 * {@code ifile2} runs out first. If the alignment file runs out first, the
 * remaining sentences are written without alignments. A sentence whose
 * alignment string is rejected with a {@code RuntimeException} is logged and
 * written without its alignment.
 *
 * <p>Any exception is caught and its stack trace printed; nothing is
 * propagated to the caller. All opened files are closed on every path.
 *
 * @param label          prefix for generated chunk names
 * @param ifile1         path of the text file for language {@code le} (read as UTF-8)
 * @param ifile2         path of the text file for language {@code lf} (read as UTF-8)
 * @param afile1_2       path of the alignment file; must be set if and only
 *                       if {@code readAlignments} is true
 * @param ofile          path of the XML file to write
 * @param oenc           character encoding of the output file
 * @param le             language code passed to {@code Language.languageForISO639_1} for {@code ifile1}
 * @param lf             language code passed to {@code Language.languageForISO639_1} for {@code ifile2}
 * @param readAlignments whether to read alignments from {@code afile1_2}
 */
private static void convertToXMLDocument(String label, String ifile1, String ifile2, String afile1_2, String ofile, String oenc, String le, String lf, boolean readAlignments) {
    // Declared outside the try so the finally block can release them.
    BufferedReader r1 = null;
    BufferedReader r2 = null;
    BufferedReader r1_2 = null;
    FileOutputStream out = null;
    BufferedWriter w = null;
    try {
        if (readAlignments) {
            if (afile1_2 == null || afile1_2.equals("")) {
                throw new RuntimeException("I'm supposed to read alignments, but no alignment file is set!");
            }
        } else if (afile1_2 != null && !afile1_2.equals("")) {
            throw new RuntimeException("I'm not set to read alignments, but an alignment file is set!");
        }
        r1 = new BufferedReader(new InputStreamReader(new FileInputStream(ifile1), "UTF8"));
        r2 = new BufferedReader(new InputStreamReader(new FileInputStream(ifile2), "UTF8"));
        if (readAlignments) {
            r1_2 = new BufferedReader(new InputStreamReader(new FileInputStream(afile1_2), "UTF8"));
        }
        // Keep a handle on the raw stream: if 'oenc' is rejected below, the
        // writer is never created and the stream must be closed directly.
        out = new FileOutputStream(ofile);
        OutputStreamWriter w1 = new OutputStreamWriter(out, oenc);
        Language de = Language.languageForISO639_1(lf);
        Language en = Language.languageForISO639_1(le);
        LanguagePair ende = null;
        if (readAlignments) {
            ende = LanguagePair.languageForISO639_1Pair(le + "-" + lf);
        }
        System.err.println("Reading " + en + " from: " + ifile1);
        System.err.println("Reading " + de + " from: " + ifile2);
        if (readAlignments) {
            System.err.println("Reading alignments (" + ende + ") from: " + afile1_2);
        }
        w = new BufferedWriter(w1);
        w.write("<?xml version=\"1.0\" encoding=\"" + w1.getEncoding() + "\"?>");
        w.newLine();
        // Document name = file name without its directory part.
        // lastIndexOf returns -1 when there is no '/', so "+ 1" yields 0 and
        // the whole name is kept (previously its first character was lost).
        String docName = ifile1.substring(ifile1.lastIndexOf('/') + 1);
        w.write("<pdoc name=\"" + docName + "\">");
        w.newLine();
        String e;
        int lc = 0;
        while ((e = r1.readLine()) != null) {
            lc += 1;
            String f = r2.readLine();
            if (f == null) {
                System.err.println("WARNING: " + ifile2 + " has fewer lines than " + ifile1);
                break;
            }
            String a = null;
            if (readAlignments) {
                a = r1_2.readLine();
                if (a == null) {
                    System.err.println(afile1_2 + " has fewer lines than corpora files -- dropping alignments for remaining sentences");
                }
            }
            Chunk ec = new Chunk(e);
            Chunk fc = new Chunk(f);
            String name = label + lc;
            ParallelChunk p = new ParallelChunk();
            p.setName(name);
            p.addChunk(de, fc);
            p.addChunk(en, ec);
            if (a != null) {
                ReferenceAlignment ra = new ReferenceAlignment(ec.getLength(), fc.getLength());
                try {
                    ra.addAlignmentPointsPharaoh(a);
                    p.addReferenceAlignment(ende, ra);
                } catch (RuntimeException re) {
                    // Best effort: report the bad sentence and still write
                    // the chunk pair, just without its alignment.
                    System.err.println("Couldn't set alignment points for sentence # " + lc);
                    System.err.println(" " + en + ": len=" + ec.getLength() + " words=" + ec);
                    System.err.println(" " + de + ": len=" + fc.getLength() + " words=" + fc);
                    System.err.println(" " + ende + ": " + a);
                }
            }
            w.write(p.toXML());
        }
        String t = r2.readLine();
        if (t != null) {
            System.err.println("WARNING: " + ifile2 + " has more lines than " + ifile1);
        }
        w.write("</pdoc>");
        System.out.println("Converted " + lc + " sentences");
        w.newLine();
        // Close the writer here so a failing final flush is reported through
        // the catch block below with a full stack trace.
        w.close();
    } catch (Exception ex) {
        ex.printStackTrace();
    } finally {
        // Closing an already-closed stream is a no-op, so this is safe on
        // the success path as well.
        if (w != null) {
            closeAndWarn(w);
        } else {
            closeAndWarn(out);
        }
        closeAndWarn(r1);
        closeAndWarn(r2);
        closeAndWarn(r1_2);
    }
}

/** Closes {@code c} if it is non-null, reporting (not propagating) a close failure. */
private static void closeAndWarn(java.io.Closeable c) {
    if (c == null) {
        return;
    }
    try {
        c.close();
    } catch (java.io.IOException closeFailure) {
        System.err.println("WARNING: failed to close resource: " + closeFailure);
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) LanguagePair(edu.umd.hooka.corpora.LanguagePair) ParallelChunk(edu.umd.hooka.corpora.ParallelChunk) Chunk(edu.umd.hooka.corpora.Chunk) ReferenceAlignment(edu.umd.hooka.alignment.aer.ReferenceAlignment) FileInputStream(java.io.FileInputStream) IOException(java.io.IOException) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) SAXException(org.xml.sax.SAXException) BufferedWriter(java.io.BufferedWriter) Language(edu.umd.hooka.corpora.Language) FileOutputStream(java.io.FileOutputStream) BufferedReader(java.io.BufferedReader) OutputStreamWriter(java.io.OutputStreamWriter)

Example 5 with ReferenceAlignment

use of edu.umd.hooka.alignment.aer.ReferenceAlignment in project Cloud9 by lintool.

the class ParallelChunk method getReferenceAlignment.

/**
 * Looks up the reference alignment stored for {@code lp}.
 *
 * <p>If none is stored for {@code lp} but one is stored for the inverted
 * pair, that alignment is transposed, stored under {@code lp}, and returned.
 *
 * @param lp the language pair to look up
 * @return the alignment for {@code lp}, or {@code null} if no alignments
 *         are stored at all or neither direction has an entry
 */
public ReferenceAlignment getReferenceAlignment(LanguagePair lp) {
    if (lp2ra == null) {
        return null;
    }

    ReferenceAlignment direct = lp2ra.get(lp);
    if (direct != null) {
        return direct;
    }

    // Nothing stored for this direction (e.g. en-fr): fall back to the
    // opposite direction (fr-en) and transpose what is found there.
    ReferenceAlignment opposite = lp2ra.get(lp.inverted());
    if (opposite == null) {
        return null;
    }

    ReferenceAlignment transposed = (ReferenceAlignment) opposite.getTranspose();
    lp2ra.put(lp, transposed);
    return transposed;
}
Also used : ReferenceAlignment(edu.umd.hooka.alignment.aer.ReferenceAlignment)

Aggregations

ReferenceAlignment (edu.umd.hooka.alignment.aer.ReferenceAlignment)5 Chunk (edu.umd.hooka.corpora.Chunk)2 ParallelChunk (edu.umd.hooka.corpora.ParallelChunk)2 IOException (java.io.IOException)2 SAXException (org.xml.sax.SAXException)2 Language (edu.umd.hooka.corpora.Language)1 LanguagePair (edu.umd.hooka.corpora.LanguagePair)1 BufferedReader (java.io.BufferedReader)1 BufferedWriter (java.io.BufferedWriter)1 FileInputStream (java.io.FileInputStream)1 FileOutputStream (java.io.FileOutputStream)1 InputStreamReader (java.io.InputStreamReader)1 OutputStreamWriter (java.io.OutputStreamWriter)1 ParserConfigurationException (javax.xml.parsers.ParserConfigurationException)1