Search in sources :

Example 1 with ArrayListWritable

Use of tl.lin.data.array.ArrayListWritable in the Cloud9 project by lintool.

From class BooleanRetrieval, method fetchPostings.

/**
 * Looks up {@code term} in {@code index} and returns the right-hand element of the stored
 * pair, i.e. the list of postings.
 *
 * @param term the term to look up
 * @return the right element of the value container after the lookup; the container is freshly
 *         created, so this is whatever {@code PairOfWritables} holds when the lookup fills
 *         nothing in (presumably {@code null} -- confirm against {@code PairOfWritables})
 * @throws IOException if the index lookup fails
 */
private ArrayListWritable<PairOfInts> fetchPostings(String term) throws IOException {
    PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>> entry =
        new PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>>();
    // The lookup fills 'entry' in place; its return value is intentionally not inspected.
    index.get(new Text(term), entry);
    return entry.getRightElement();
}
Also used : ArrayListWritable(tl.lin.data.array.ArrayListWritable) PairOfWritables(tl.lin.data.pair.PairOfWritables) PairOfInts(tl.lin.data.pair.PairOfInts) Text(org.apache.hadoop.io.Text) IntWritable(org.apache.hadoop.io.IntWritable)

Example 2 with ArrayListWritable

Use of tl.lin.data.array.ArrayListWritable in the Cloud9 project by lintool.

From class InvertedIndexingIT, method testInvertedIndexing.

/**
 * Runs {@code BuildInvertedIndex} over the test collection through the {@code hadoop}
 * launcher with a single reducer, then opens the resulting MapFile and spot-checks the
 * entry stored for the term "gold".
 *
 * @throws Exception if the job cannot be launched or the index cannot be read
 */
@Test
public void testInvertedIndexing() throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    assertTrue(fs.exists(collectionPath));
    String[] args = new String[] { "hadoop --config src/test/resources/hadoop-local-conf/ jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.ir.BuildInvertedIndex.class.getCanonicalName(), "-input", collectionPath.toString(), "-output", tmpPrefix, "-numReducers", "1" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    MapFile.Reader reader = new MapFile.Reader(new Path(tmpPrefix + "/part-r-00000"), conf);
    // try/finally so the reader is released even when an assertion below fails.
    try {
        Text key = new Text();
        PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>> value = new PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>>();
        key.set("gold");
        // MapFile.Reader.get returns null when the key is absent; fail with an assertion
        // here instead of an opaque NullPointerException on the lines below.
        assertTrue(reader.get(key, value) != null);
        // Left element of the pair (presumably the term's document frequency -- confirm).
        assertEquals(584, value.getLeftElement().get());
        ArrayListWritable<PairOfInts> postings = value.getRightElement();
        // Sample the left element of the postings at positions 0, 100 and 200.
        assertEquals(5303, postings.get(0).getLeftElement());
        assertEquals(684030, postings.get(100).getLeftElement());
        assertEquals(1634312, postings.get(200).getLeftElement());
    } finally {
        reader.close();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayListWritable(tl.lin.data.array.ArrayListWritable) Configuration(org.apache.hadoop.conf.Configuration) PairOfInts(tl.lin.data.pair.PairOfInts) MapFile(org.apache.hadoop.io.MapFile) Text(org.apache.hadoop.io.Text) PairOfWritables(tl.lin.data.pair.PairOfWritables) FileSystem(org.apache.hadoop.fs.FileSystem) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.junit.Test)

Example 3 with ArrayListWritable

Use of tl.lin.data.array.ArrayListWritable in the Cloud9 project by lintool.

From class ClueWeb09EN01WebgraphIT, method verifyWebGraph.

/**
 * Spot-checks the generated web graph by reading the first few records of two output
 * partitions ({@code part-00000} and {@code part-00010}) and comparing selected records
 * against the expected URL and link maps.
 *
 * <p>Records are consumed strictly in file order; the key read from the file is not itself
 * checked, so the "key N" comments below describe the record the test expects at that
 * position.
 *
 * @throws Exception if a partition cannot be opened or read, or a verification fails
 */
private void verifyWebGraph() throws Exception {
    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);
    IntWritable key = new IntWritable();
    ArrayListWritable<AnchorText> value = new ArrayListWritable<AnchorText>();
    String webgraphDir = collectionOutput + "/" + DriverUtil.OUTPUT_WEBGRAPH;

    SequenceFile.Reader reader = new SequenceFile.Reader(fs.getConf(), SequenceFile.Reader.file(new Path(webgraphDir + "/part-00000")));
    // try/finally so the reader is released even when a verification below throws.
    try {
        //read key 200
        reader.next(key, value);
        verifyURLs(200, urlMap, value);
        verifyLinks(200, AnchorTextConstants.Type.INTERNAL_OUT_LINK.val, internalLinkMap, value);
        //skip key 400
        reader.next(key, value);
        //read key 600
        reader.next(key, value);
        verifyURLs(600, urlMap, value);
        verifyLinks(600, AnchorTextConstants.Type.INTERNAL_OUT_LINK.val, internalLinkMap, value);
        verifyLinks(600, AnchorTextConstants.Type.EXTERNAL_OUT_LINK.val, externalLinkMap, value);
    } finally {
        reader.close();
    }

    reader = new SequenceFile.Reader(fs.getConf(), SequenceFile.Reader.file(new Path(webgraphDir + "/part-00010")));
    try {
        //read key 10
        reader.next(key, value);
        verifyURLs(10, urlMap, value);
        verifyLinks(10, AnchorTextConstants.Type.INTERNAL_OUT_LINK.val, internalLinkMap, value);
        //skip key 210
        reader.next(key, value);
        //skip key 410
        reader.next(key, value);
        //read key 610
        reader.next(key, value);
        verifyURLs(610, urlMap, value);
        verifyLinks(610, AnchorTextConstants.Type.INTERNAL_OUT_LINK.val, internalLinkMap, value);
    } finally {
        reader.close();
    }
}
Also used : ArrayListWritable(tl.lin.data.array.ArrayListWritable) Path(org.apache.hadoop.fs.Path) AnchorText(edu.umd.cloud9.webgraph.data.AnchorText) Configuration(org.apache.hadoop.conf.Configuration) SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) IntWritable(org.apache.hadoop.io.IntWritable)

Example 4 with ArrayListWritable

Use of tl.lin.data.array.ArrayListWritable in the Cloud9 project by lintool.

From class Gov2WebgraphIT, method verifyAnchors.

/**
 * Spot-checks the weighted reverse web graph: for each of two output partitions
 * ({@code part-00000} and {@code part-00010}) the second record in the file is read and its
 * anchor weights and sources are compared with the expected lists.
 *
 * @throws Exception if a partition cannot be opened or read, or a verification fails
 */
private void verifyAnchors() throws Exception {
    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);
    IntWritable key = new IntWritable();
    ArrayListWritable<AnchorText> value = new ArrayListWritable<AnchorText>();
    String graphDir = collectionOutput + "/" + DriverUtil.OUTPUT_WEGIHTED_REVERSE_WEBGRAPH;

    SequenceFile.Reader reader = new SequenceFile.Reader(fs.getConf(), SequenceFile.Reader.file(new Path(graphDir + "/part-00000")));
    // try/finally so the reader is released even when a verification below throws.
    try {
        // Two consecutive reads: the first record is skipped, the second one is verified.
        reader.next(key, value);
        reader.next(key, value);
        verifyWeights(anchorList1, value);
        verifySources(anchorSources1, value);
    } finally {
        reader.close();
    }

    reader = new SequenceFile.Reader(fs.getConf(), SequenceFile.Reader.file(new Path(graphDir + "/part-00010")));
    try {
        // Same pattern: skip the first record, verify the second.
        reader.next(key, value);
        reader.next(key, value);
        verifyWeights(anchorList2, value);
        verifySources(anchorSources2, value);
    } finally {
        reader.close();
    }
}
Also used : ArrayListWritable(tl.lin.data.array.ArrayListWritable) Path(org.apache.hadoop.fs.Path) AnchorText(edu.umd.cloud9.webgraph.data.AnchorText) Configuration(org.apache.hadoop.conf.Configuration) SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) IntWritable(org.apache.hadoop.io.IntWritable)

Example 5 with ArrayListWritable

Use of tl.lin.data.array.ArrayListWritable in the Cloud9 project by lintool.

From class IndexableAnchorTextForwardIndex, method getDocument.

/**
 * Fetches the anchor-text record for {@code docno} and renders it into the shared
 * {@code indexableAnchorText} object.
 *
 * <p>The parallel arrays {@code docnos}, {@code filenos} and {@code offsets} act as a sparse
 * index: the entry at or immediately before {@code docno} names the partition file and the
 * position to seek to, and records are then scanned forward until the requested key is seen.
 * {@code docnos} must be sorted ascending for the binary search to be meaningful.
 *
 * <p>Note that the returned object is the same {@code indexableAnchorText} reference on every
 * call, so a later call overwrites what an earlier call produced.
 *
 * @param docno the document number to fetch
 * @return the populated document, or {@code null} if {@code docno} precedes every indexed
 *         entry, is not found in the partition file, or an I/O error occurs
 */
public IndexableAnchorText getDocument(int docno) {
    int idx = Arrays.binarySearch(docnos, docno);
    if (idx < 0) {
        // No exact hit: binarySearch returned -(insertionPoint) - 1, so this selects the
        // entry just before the insertion point.
        idx = -idx - 2;
    }
    if (idx < 0) {
        // docno is smaller than every indexed docno; there is no entry to start from.
        return null;
    }
    DecimalFormat df = new DecimalFormat("00000");
    String file = collectionPath + "/part-" + df.format(filenos[idx]);
    try {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(file), conf);
        // try/finally so the reader is released on every exit path, including I/O errors.
        try {
            IntWritable key = new IntWritable();
            ArrayListWritable<AnchorText> value = new ArrayListWritable<AnchorText>();
            reader.seek(offsets[idx]);
            boolean found = false;
            while (reader.next(key)) {
                if (key.get() == docno) {
                    found = true;
                    break;
                }
            }
            if (!found) {
                // Reached end of file without seeing docno; there is no current value to read.
                return null;
            }
            reader.getCurrentValue(value);
            indexableAnchorText.createHTML(value);
            return indexableAnchorText;
        } finally {
            reader.close();
        }
    } catch (IOException e) {
        // Best-effort lookup: report the failure and fall through to the null return.
        e.printStackTrace();
    }
    return null;
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayListWritable(tl.lin.data.array.ArrayListWritable) SequenceFile(org.apache.hadoop.io.SequenceFile) DecimalFormat(java.text.DecimalFormat) IOException(java.io.IOException) IntWritable(org.apache.hadoop.io.IntWritable)

Aggregations

IntWritable (org.apache.hadoop.io.IntWritable)6 ArrayListWritable (tl.lin.data.array.ArrayListWritable)6 Path (org.apache.hadoop.fs.Path)5 Configuration (org.apache.hadoop.conf.Configuration)3 FileSystem (org.apache.hadoop.fs.FileSystem)3 SequenceFile (org.apache.hadoop.io.SequenceFile)3 Text (org.apache.hadoop.io.Text)3 PairOfInts (tl.lin.data.pair.PairOfInts)3 PairOfWritables (tl.lin.data.pair.PairOfWritables)3 AnchorText (edu.umd.cloud9.webgraph.data.AnchorText)2 BufferedReader (java.io.BufferedReader)1 IOException (java.io.IOException)1 InputStreamReader (java.io.InputStreamReader)1 DecimalFormat (java.text.DecimalFormat)1 FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)1 MapFile (org.apache.hadoop.io.MapFile)1 Writable (org.apache.hadoop.io.Writable)1 Test (org.junit.Test)1 Int2IntFrequencyDistribution (tl.lin.data.fd.Int2IntFrequencyDistribution)1 Int2IntFrequencyDistributionEntry (tl.lin.data.fd.Int2IntFrequencyDistributionEntry)1