use of edu.umd.cloud9.webgraph.data.AnchorText in project Cloud9 by lintool.
the class ClueWeb09EN01WebgraphIT method verifyWebGraph.
/**
 * Spot-checks the web graph output stored under
 * {@code collectionOutput + "/" + DriverUtil.OUTPUT_WEBGRAPH}.
 *
 * <p>Two part files are sampled ({@code part-00000} and {@code part-00010}). Selected
 * records are compared against the expected URL and link maps through
 * {@code verifyURLs} and {@code verifyLinks}; the records in between are read and
 * discarded.
 *
 * @throws Exception if the output cannot be read
 * @throws AssertionError if a part file holds fewer records than this check reads
 */
private void verifyWebGraph() throws Exception {
  Configuration conf = IntegrationUtils.getBespinConfiguration();
  FileSystem fs = FileSystem.get(conf);
  IntWritable key = new IntWritable();
  ArrayListWritable<AnchorText> value = new ArrayListWritable<AnchorText>();
  String webgraphDir = collectionOutput + "/" + DriverUtil.OUTPUT_WEBGRAPH;

  SequenceFile.Reader reader = new SequenceFile.Reader(fs.getConf(),
      SequenceFile.Reader.file(new Path(webgraphDir + "/part-00000")));
  // try/finally so the reader is released even when a verification step throws.
  try {
    // read key 200
    readNextWebGraphRecord(reader, key, value);
    verifyURLs(200, urlMap, value);
    verifyLinks(200, AnchorTextConstants.Type.INTERNAL_OUT_LINK.val, internalLinkMap, value);

    // skip key 400
    readNextWebGraphRecord(reader, key, value);

    // read key 600
    readNextWebGraphRecord(reader, key, value);
    verifyURLs(600, urlMap, value);
    verifyLinks(600, AnchorTextConstants.Type.INTERNAL_OUT_LINK.val, internalLinkMap, value);
    verifyLinks(600, AnchorTextConstants.Type.EXTERNAL_OUT_LINK.val, externalLinkMap, value);
  } finally {
    reader.close();
  }

  reader = new SequenceFile.Reader(fs.getConf(),
      SequenceFile.Reader.file(new Path(webgraphDir + "/part-00010")));
  try {
    // read key 10
    readNextWebGraphRecord(reader, key, value);
    verifyURLs(10, urlMap, value);
    verifyLinks(10, AnchorTextConstants.Type.INTERNAL_OUT_LINK.val, internalLinkMap, value);

    // skip key 210
    readNextWebGraphRecord(reader, key, value);

    // skip key 410
    readNextWebGraphRecord(reader, key, value);

    // read key 610
    readNextWebGraphRecord(reader, key, value);
    verifyURLs(610, urlMap, value);
    verifyLinks(610, AnchorTextConstants.Type.INTERNAL_OUT_LINK.val, internalLinkMap, value);
  } finally {
    reader.close();
  }
}

/**
 * Advances {@code reader} by one record, failing if the file has already ended so that
 * the caller never verifies a stale {@code value} left over from a previous read.
 */
private static void readNextWebGraphRecord(SequenceFile.Reader reader, IntWritable key,
    ArrayListWritable<AnchorText> value) throws Exception {
  if (!reader.next(key, value)) {
    throw new AssertionError("Unexpected end of web graph part file");
  }
}
use of edu.umd.cloud9.webgraph.data.AnchorText in project Cloud9 by lintool.
the class Gov2WebgraphIT method verifyAnchors.
/**
 * Spot-checks the weighted reverse web graph stored under
 * {@code collectionOutput + "/" + DriverUtil.OUTPUT_WEGIHTED_REVERSE_WEBGRAPH}.
 *
 * <p>For each of {@code part-00000} and {@code part-00010}, the first record is read and
 * discarded, and the second record is compared against the expected anchor weights and
 * sources through {@code verifyWeights} and {@code verifySources}.
 *
 * @throws Exception if the output cannot be read
 * @throws AssertionError if a part file holds fewer than two records
 */
private void verifyAnchors() throws Exception {
  Configuration conf = IntegrationUtils.getBespinConfiguration();
  FileSystem fs = FileSystem.get(conf);
  IntWritable key = new IntWritable();
  ArrayListWritable<AnchorText> value = new ArrayListWritable<AnchorText>();
  String reverseGraphDir = collectionOutput + "/" + DriverUtil.OUTPUT_WEGIHTED_REVERSE_WEBGRAPH;

  SequenceFile.Reader reader = new SequenceFile.Reader(fs.getConf(),
      SequenceFile.Reader.file(new Path(reverseGraphDir + "/part-00000")));
  // try/finally so the reader is released even when a verification step throws.
  try {
    // The first record is skipped; only the second one is verified.
    readNextAnchorRecord(reader, key, value);
    readNextAnchorRecord(reader, key, value);
    verifyWeights(anchorList1, value);
    verifySources(anchorSources1, value);
  } finally {
    reader.close();
  }

  reader = new SequenceFile.Reader(fs.getConf(),
      SequenceFile.Reader.file(new Path(reverseGraphDir + "/part-00010")));
  try {
    readNextAnchorRecord(reader, key, value);
    readNextAnchorRecord(reader, key, value);
    verifyWeights(anchorList2, value);
    verifySources(anchorSources2, value);
  } finally {
    reader.close();
  }
}

/**
 * Advances {@code reader} by one record, failing if the file has already ended so that
 * the caller never verifies a stale {@code value} left over from a previous read.
 */
private static void readNextAnchorRecord(SequenceFile.Reader reader, IntWritable key,
    ArrayListWritable<AnchorText> value) throws Exception {
  if (!reader.next(key, value)) {
    throw new AssertionError("Unexpected end of weighted reverse web graph part file");
  }
}
use of edu.umd.cloud9.webgraph.data.AnchorText in project Cloud9 by lintool.
the class AnchorTextTest method testConstructors.
/**
 * Checks the initial state of {@code AnchorText} objects built with the no-argument,
 * (type, text) and (type, text, docno) constructors.
 *
 * <p>Assertions follow JUnit's {@code (expected, actual)} argument order so that failure
 * messages report the values the right way round.
 */
@Test
public void testConstructors() {
  // No-argument constructor: internal in-link with empty text, no documents, zero weight.
  AnchorText anchor = new AnchorText();
  assertTrue(anchor.isInternalInLink());
  assertEquals(AnchorTextConstants.EMPTY_STRING, anchor.getText());
  assertEquals(0, anchor.getSize());
  assertEquals(0, anchor.getWeight(), 1e-100);

  // EXTERNAL_IN_LINK keeps the text it was given.
  AnchorText anchor2 = new AnchorText(AnchorTextConstants.Type.EXTERNAL_IN_LINK.val, "text");
  assertEquals("text", anchor2.getText());
  assertEquals(0, anchor2.getSize());

  // EXTERNAL_OUT_LINK is expected to report null text even though text was supplied.
  AnchorText anchor3 = new AnchorText(AnchorTextConstants.Type.EXTERNAL_OUT_LINK.val, "text");
  assertNull(anchor3.getText());
  assertEquals(0, anchor3.getSize());

  // DOCNO_FIELD is expected to report null text; the supplied docno counts as one document.
  AnchorText anchor4 = new AnchorText(AnchorTextConstants.Type.DOCNO_FIELD.val, "text", 100);
  assertNull(anchor4.getText());
  assertEquals(1, anchor4.getSize());
}
use of edu.umd.cloud9.webgraph.data.AnchorText in project Cloud9 by lintool.
the class AnchorTextTest method testClone.
/**
 * Exercises cloning, document bookkeeping, comparison, Writable round-tripping and
 * {@code resetToType} on {@code AnchorText}.
 *
 * <p>Serialization failures are rethrown as {@link AssertionError} instead of being
 * swallowed, so an I/O problem is reported as such rather than surfacing later as an
 * unrelated assertion failure. Assertions use JUnit's {@code (expected, actual)} order.
 */
@Test
public void testClone() {
  AnchorText anchor1 = new AnchorText(AnchorTextConstants.Type.EXTERNAL_OUT_LINK.val, "text", 1);
  AnchorText anchor2 = anchor1.clone();

  // The clone is expected to stay equal to the original after setText, and to keep null text.
  anchor2.setText("some text");
  assertTrue(anchor2.equals(anchor1));
  anchor2.addDocument(2);
  assertNull(anchor2.getText());
  assertEquals(2, anchor2.getSize());
  assertTrue(anchor2.equalsIgnoreSources(anchor1));

  // Adding document 2 again after copying anchor2's documents must not grow the size.
  AnchorText anchor3 = new AnchorText(AnchorTextConstants.Type.DOCNO_FIELD.val, "text");
  anchor3.addDocumentsFrom(anchor2);
  anchor3.addDocument(2);
  assertNull(anchor3.getText());
  assertEquals(2, anchor3.getSize());

  // The test expects the weight of a DOCNO_FIELD anchor to remain 0 after setWeight(1).
  anchor3.setWeight(1);
  assertEquals(0, anchor3.getWeight(), 1e-100);
  assertEquals(1, anchor3.compareTo(anchor2));

  // Round-trip anchor3 through write/readFields and expect an equal object back.
  ByteArrayOutputStream bstream = new ByteArrayOutputStream();
  DataOutputStream out = new DataOutputStream(bstream);
  try {
    anchor3.write(out);
    out.close();
  } catch (Exception e) {
    // AssertionError(Object) records a Throwable argument as the cause.
    throw new AssertionError(e);
  }

  DataInputStream in = new DataInputStream(new ByteArrayInputStream(bstream.toByteArray()));
  AnchorText readAnchor = new AnchorText();
  try {
    readAnchor.readFields(in);
    in.close();
  } catch (Exception e) {
    throw new AssertionError(e);
  }
  assertEquals(anchor3, readAnchor);

  assertTrue(anchor3.intersects(anchor2));
  assertTrue(anchor3.containsDocument(2));

  anchor3.resetToType(AnchorTextConstants.Type.IN_DEGREE.val);
  assertNull(anchor3.getText());

  // After resetting to INTERNAL_IN_LINK the anchor should look freshly constructed.
  anchor3.resetToType(AnchorTextConstants.Type.INTERNAL_IN_LINK.val);
  assertEquals(AnchorTextConstants.EMPTY_STRING, anchor3.getText());
  assertTrue(anchor3.isInternalInLink());
  assertEquals(0, anchor3.getSize());
  assertEquals(0, anchor3.getWeight(), 1e-100);
  assertFalse(anchor3.containsDocument(3));
  assertFalse(anchor3.intersects(anchor2));
}
use of edu.umd.cloud9.webgraph.data.AnchorText in project Cloud9 by lintool.
the class AnchorTextTest method testIterable.
/**
 * Checks that documents added to an {@code AnchorText} are returned by
 * {@code getDocuments()} in any order, and that after {@code resetToType} the anchor is
 * empty and iterating over it yields nothing.
 */
@Test
public void testIterable() {
  AnchorText anchor = new AnchorText(AnchorTextConstants.Type.EXTERNAL_IN_LINK.val, "text");
  anchor.addDocument(1);
  anchor.addDocument(2);
  anchor.addDocument(3);

  int[] sources = anchor.getDocuments();
  // Guard the indexing below so a short array is reported as a test failure, not an
  // ArrayIndexOutOfBoundsException.
  assertTrue(sources.length >= 3);

  // The first three entries must be some permutation of {1, 2, 3}: each value has to be
  // in range and must not have been seen before.
  boolean[] seen = new boolean[4];
  for (int i = 0; i < 3; i++) {
    int doc = sources[i];
    assertTrue(doc >= 1 && doc <= 3 && !seen[doc]);
    seen[doc] = true;
  }

  anchor.resetToType(AnchorTextConstants.Type.URL_FIELD.val);
  assertEquals(0, anchor.getSize());

  // An empty anchor must not produce any element when iterated.
  for (@SuppressWarnings("unused") int s : anchor) {
    fail();
  }
}
Aggregations