Search in sources :

Example 1 with AnchorText

use of edu.umd.cloud9.webgraph.data.AnchorText in project Cloud9 by lintool.

the class ClueWeb09EN01WebgraphIT method verifyWebGraph.

/**
 * Spot-checks the generated web graph by reading records from two output
 * partitions ({@code part-00000} and {@code part-00010}) and handing selected
 * ones to {@code verifyURLs} and {@code verifyLinks}.
 *
 * <p>Records are consumed strictly in file order; records that are not
 * inspected are still read so that the reader advances past them.
 *
 * @throws Exception if the configuration, the file system, or a partition
 *     file cannot be obtained or read
 */
private void verifyWebGraph() throws Exception {
    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);
    IntWritable key = new IntWritable();
    ArrayListWritable<AnchorText> value = new ArrayListWritable<AnchorText>();

    SequenceFile.Reader reader = new SequenceFile.Reader(fs.getConf(), SequenceFile.Reader.file(new Path(collectionOutput + "/" + DriverUtil.OUTPUT_WEBGRAPH + "/part-00000")));
    // Close in finally so that a failed verification or a read error does not
    // leak the open reader.
    try {
        // read key 200
        reader.next(key, value);
        verifyURLs(200, urlMap, value);
        verifyLinks(200, AnchorTextConstants.Type.INTERNAL_OUT_LINK.val, internalLinkMap, value);
        // skip key 400
        reader.next(key, value);
        // read key 600
        reader.next(key, value);
        verifyURLs(600, urlMap, value);
        verifyLinks(600, AnchorTextConstants.Type.INTERNAL_OUT_LINK.val, internalLinkMap, value);
        verifyLinks(600, AnchorTextConstants.Type.EXTERNAL_OUT_LINK.val, externalLinkMap, value);
    } finally {
        reader.close();
    }

    reader = new SequenceFile.Reader(fs.getConf(), SequenceFile.Reader.file(new Path(collectionOutput + "/" + DriverUtil.OUTPUT_WEBGRAPH + "/part-00010")));
    try {
        // read key 10
        reader.next(key, value);
        verifyURLs(10, urlMap, value);
        verifyLinks(10, AnchorTextConstants.Type.INTERNAL_OUT_LINK.val, internalLinkMap, value);
        // skip key 210
        reader.next(key, value);
        // skip key 410
        reader.next(key, value);
        // read key 610
        reader.next(key, value);
        verifyURLs(610, urlMap, value);
        verifyLinks(610, AnchorTextConstants.Type.INTERNAL_OUT_LINK.val, internalLinkMap, value);
    } finally {
        reader.close();
    }
}
Also used : ArrayListWritable(tl.lin.data.array.ArrayListWritable) Path(org.apache.hadoop.fs.Path) AnchorText(edu.umd.cloud9.webgraph.data.AnchorText) Configuration(org.apache.hadoop.conf.Configuration) SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) IntWritable(org.apache.hadoop.io.IntWritable)

Example 2 with AnchorText

use of edu.umd.cloud9.webgraph.data.AnchorText in project Cloud9 by lintool.

the class Gov2WebgraphIT method verifyAnchors.

/**
 * Spot-checks the weighted reverse web graph: for each of two output
 * partitions ({@code part-00000} and {@code part-00010}) it reads two records
 * into the same {@code value} object and then passes that object to
 * {@code verifyWeights} and {@code verifySources}, so the checks run against
 * whatever the second read left in {@code value}.
 *
 * @throws Exception if the configuration, the file system, or a partition
 *     file cannot be obtained or read
 */
private void verifyAnchors() throws Exception {
    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);
    IntWritable key = new IntWritable();
    ArrayListWritable<AnchorText> value = new ArrayListWritable<AnchorText>();

    SequenceFile.Reader reader = new SequenceFile.Reader(fs.getConf(), SequenceFile.Reader.file(new Path(collectionOutput + "/" + DriverUtil.OUTPUT_WEGIHTED_REVERSE_WEBGRAPH + "/part-00000")));
    // Close in finally so that a failed verification or a read error does not
    // leak the open reader.
    try {
        // The first record is read only to advance the reader.
        reader.next(key, value);
        reader.next(key, value);
        verifyWeights(anchorList1, value);
        verifySources(anchorSources1, value);
    } finally {
        reader.close();
    }

    reader = new SequenceFile.Reader(fs.getConf(), SequenceFile.Reader.file(new Path(collectionOutput + "/" + DriverUtil.OUTPUT_WEGIHTED_REVERSE_WEBGRAPH + "/part-00010")));
    try {
        // The first record is read only to advance the reader.
        reader.next(key, value);
        reader.next(key, value);
        verifyWeights(anchorList2, value);
        verifySources(anchorSources2, value);
    } finally {
        reader.close();
    }
}
Also used : ArrayListWritable(tl.lin.data.array.ArrayListWritable) Path(org.apache.hadoop.fs.Path) AnchorText(edu.umd.cloud9.webgraph.data.AnchorText) Configuration(org.apache.hadoop.conf.Configuration) SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) IntWritable(org.apache.hadoop.io.IntWritable)

Example 3 with AnchorText

use of edu.umd.cloud9.webgraph.data.AnchorText in project Cloud9 by lintool.

the class AnchorTextTest method testConstructors.

/**
 * Checks the initial text, size, and weight reported by anchors built with
 * the no-argument, (type, text), and (type, text, document) constructors.
 */
@Test
public void testConstructors() {
    // assertEquals takes (expected, actual); keeping that order makes failure
    // messages report the right value as "expected".

    // The no-argument constructor is expected to yield an internal in-link
    // with empty text, no documents, and zero weight.
    AnchorText anchor = new AnchorText();
    assertTrue(anchor.isInternalInLink());
    assertEquals(AnchorTextConstants.EMPTY_STRING, anchor.getText());
    assertEquals(0, anchor.getSize());
    assertEquals(0, anchor.getWeight(), 1e-100);

    // An external in-link is expected to keep the text it was given.
    AnchorText anchor2 = new AnchorText(AnchorTextConstants.Type.EXTERNAL_IN_LINK.val, "text");
    assertEquals("text", anchor2.getText());
    assertEquals(0, anchor2.getSize());

    // An external out-link is expected to report null text even though text
    // was supplied.
    AnchorText anchor3 = new AnchorText(AnchorTextConstants.Type.EXTERNAL_OUT_LINK.val, "text");
    assertNull(anchor3.getText());
    assertEquals(0, anchor3.getSize());

    // A DOCNO field built with a document is expected to report null text and
    // a size of one.
    AnchorText anchor4 = new AnchorText(AnchorTextConstants.Type.DOCNO_FIELD.val, "text", 100);
    assertNull(anchor4.getText());
    assertEquals(1, anchor4.getSize());
}
Also used : AnchorText(edu.umd.cloud9.webgraph.data.AnchorText) Test(org.junit.Test)

Example 4 with AnchorText

use of edu.umd.cloud9.webgraph.data.AnchorText in project Cloud9 by lintool.

the class AnchorTextTest method testClone.

/**
 * Exercises cloning, document bookkeeping, weight handling, comparison,
 * write/readFields round-tripping, and type resetting of {@code AnchorText}.
 */
@Test
public void testClone() {
    // assertEquals takes (expected, actual); keeping that order makes failure
    // messages report the right value as "expected".

    AnchorText anchor1 = new AnchorText(AnchorTextConstants.Type.EXTERNAL_OUT_LINK.val, "text", 1);
    AnchorText anchor2 = anchor1.clone();

    // The clone is expected to still equal the original after setText, and to
    // keep reporting null text.
    anchor2.setText("some text");
    assertTrue(anchor2.equals(anchor1));
    anchor2.addDocument(2);
    assertNull(anchor2.getText());
    assertEquals(2, anchor2.getSize());
    assertTrue(anchor2.equalsIgnoreSources(anchor1));

    // Adding document 2 again after copying anchor2's documents is expected
    // to leave the size at two.
    AnchorText anchor3 = new AnchorText(AnchorTextConstants.Type.DOCNO_FIELD.val, "text");
    anchor3.addDocumentsFrom(anchor2);
    anchor3.addDocument(2);
    assertNull(anchor3.getText());
    assertEquals(2, anchor3.getSize());

    // The weight is expected to remain zero after setWeight on this type.
    anchor3.setWeight(1);
    assertEquals(0, anchor3.getWeight(), 1e-100);
    assertEquals(1, anchor3.compareTo(anchor2));

    // Round-trip anchor3 through write/readFields and expect an equal object.
    // Failures are rethrown rather than swallowed, so a broken round trip is
    // reported with its real cause instead of a later, unrelated-looking
    // assertion failure.
    ByteArrayOutputStream bstream = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(bstream);
    try {
        anchor3.write(out);
        out.close();
    } catch (Exception e) {
        throw new IllegalStateException("Failed to serialize anchor3", e);
    }
    DataInputStream in = new DataInputStream(new ByteArrayInputStream(bstream.toByteArray()));
    AnchorText readAnchor = new AnchorText();
    try {
        readAnchor.readFields(in);
        in.close();
    } catch (Exception e) {
        throw new IllegalStateException("Failed to deserialize anchor3", e);
    }
    assertEquals(anchor3, readAnchor);

    assertTrue(anchor3.intersects(anchor2));
    assertTrue(anchor3.containsDocument(2));

    // After resetting to another type the anchor is expected to look freshly
    // constructed for that type.
    anchor3.resetToType(AnchorTextConstants.Type.IN_DEGREE.val);
    assertNull(anchor3.getText());
    anchor3.resetToType(AnchorTextConstants.Type.INTERNAL_IN_LINK.val);
    assertEquals(AnchorTextConstants.EMPTY_STRING, anchor3.getText());
    assertTrue(anchor3.isInternalInLink());
    assertEquals(0, anchor3.getSize());
    assertEquals(0, anchor3.getWeight(), 1e-100);
    assertFalse(anchor3.containsDocument(3));
    assertFalse(anchor3.intersects(anchor2));
}
Also used : AnchorText(edu.umd.cloud9.webgraph.data.AnchorText) ByteArrayInputStream(java.io.ByteArrayInputStream) DataOutputStream(java.io.DataOutputStream) ByteArrayOutputStream(java.io.ByteArrayOutputStream) DataInputStream(java.io.DataInputStream) Test(org.junit.Test)

Example 5 with AnchorText

use of edu.umd.cloud9.webgraph.data.AnchorText in project Cloud9 by lintool.

the class AnchorTextTest method testIterable.

/**
 * Checks that the documents added to an anchor are returned by
 * {@code getDocuments()}, and that after {@code resetToType} the anchor
 * reports size zero and iterating over it yields nothing.
 */
@Test
public void testIterable() {
    AnchorText anchor = new AnchorText(AnchorTextConstants.Type.EXTERNAL_IN_LINK.val, "text");
    anchor.addDocument(1);
    anchor.addDocument(2);
    anchor.addDocument(3);

    // No ordering is assumed: any permutation of 1, 2, 3 in the first three
    // slots is accepted.
    int[] sources = anchor.getDocuments();
    assertTrue(
        (sources[0] == 1 && sources[1] == 2 && sources[2] == 3)
            || (sources[0] == 1 && sources[1] == 3 && sources[2] == 2)
            || (sources[0] == 2 && sources[1] == 1 && sources[2] == 3)
            || (sources[0] == 2 && sources[1] == 3 && sources[2] == 1)
            || (sources[0] == 3 && sources[1] == 1 && sources[2] == 2)
            || (sources[0] == 3 && sources[1] == 2 && sources[2] == 1));

    anchor.resetToType(AnchorTextConstants.Type.URL_FIELD.val);
    // assertEquals takes (expected, actual).
    assertEquals(0, anchor.getSize());

    // Reaching the loop body means the reset anchor still yields a document.
    for (@SuppressWarnings("unused") int s : anchor) {
        fail();
    }
}
Also used : AnchorText(edu.umd.cloud9.webgraph.data.AnchorText) Test(org.junit.Test)

Aggregations

AnchorText (edu.umd.cloud9.webgraph.data.AnchorText): 5, Test (org.junit.Test): 3, Configuration (org.apache.hadoop.conf.Configuration): 2, FileSystem (org.apache.hadoop.fs.FileSystem): 2, Path (org.apache.hadoop.fs.Path): 2, IntWritable (org.apache.hadoop.io.IntWritable): 2, SequenceFile (org.apache.hadoop.io.SequenceFile): 2, ArrayListWritable (tl.lin.data.array.ArrayListWritable): 2, ByteArrayInputStream (java.io.ByteArrayInputStream): 1, ByteArrayOutputStream (java.io.ByteArrayOutputStream): 1, DataInputStream (java.io.DataInputStream): 1, DataOutputStream (java.io.DataOutputStream): 1